{ "best_metric": 0.5079935789108276, "best_model_checkpoint": "./gqa_1_ft_ft/checkpoint-853530", "epoch": 115.0, "eval_steps": 500, "global_step": 853530, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013473457289140393, "grad_norm": 5.050678730010986, "learning_rate": 3.3683643222850987e-07, "loss": 4.3102, "step": 100 }, { "epoch": 0.026946914578280787, "grad_norm": 4.312619686126709, "learning_rate": 6.736728644570197e-07, "loss": 4.2513, "step": 200 }, { "epoch": 0.04042037186742118, "grad_norm": 2.5650596618652344, "learning_rate": 1.0105092966855295e-06, "loss": 4.1388, "step": 300 }, { "epoch": 0.05389382915656157, "grad_norm": 1.7788532972335815, "learning_rate": 1.3473457289140395e-06, "loss": 4.02, "step": 400 }, { "epoch": 0.06736728644570197, "grad_norm": 1.6376471519470215, "learning_rate": 1.6841821611425492e-06, "loss": 3.9344, "step": 500 }, { "epoch": 0.08084074373484236, "grad_norm": 1.603734016418457, "learning_rate": 2.021018593371059e-06, "loss": 3.8594, "step": 600 }, { "epoch": 0.09431420102398276, "grad_norm": 1.6160398721694946, "learning_rate": 2.3578550255995686e-06, "loss": 3.7841, "step": 700 }, { "epoch": 0.10778765831312315, "grad_norm": 1.6393622159957886, "learning_rate": 2.694691457828079e-06, "loss": 3.7157, "step": 800 }, { "epoch": 0.12126111560226355, "grad_norm": 1.6680442094802856, "learning_rate": 3.0315278900565884e-06, "loss": 3.657, "step": 900 }, { "epoch": 0.13473457289140395, "grad_norm": 1.7192258834838867, "learning_rate": 3.3683643222850983e-06, "loss": 3.6023, "step": 1000 }, { "epoch": 0.14820803018054432, "grad_norm": 1.7522443532943726, "learning_rate": 3.7052007545136083e-06, "loss": 3.5483, "step": 1100 }, { "epoch": 0.16168148746968472, "grad_norm": 1.7803658246994019, "learning_rate": 4.042037186742118e-06, "loss": 3.4968, "step": 1200 }, { "epoch": 0.17515494475882512, "grad_norm": 1.7884995937347412, "learning_rate": 4.378873618970627e-06, "loss": 3.4465, "step": 1300 }, { "epoch": 0.18862840204796552, "grad_norm": 1.8088817596435547, "learning_rate": 4.715710051199137e-06, "loss": 3.397, "step": 1400 }, { "epoch": 0.2021018593371059, "grad_norm": 1.8150346279144287, "learning_rate": 5.052546483427648e-06, "loss": 3.3456, "step": 1500 }, { "epoch": 0.2155753166262463, "grad_norm": 1.787262201309204, "learning_rate": 5.389382915656158e-06, "loss": 3.2941, "step": 1600 }, { "epoch": 0.2290487739153867, "grad_norm": 1.7953282594680786, "learning_rate": 5.726219347884668e-06, "loss": 3.2412, "step": 1700 }, { "epoch": 0.2425222312045271, "grad_norm": 1.7741111516952515, "learning_rate": 6.063055780113177e-06, "loss": 3.1863, "step": 1800 }, { "epoch": 0.2559956884936675, "grad_norm": 1.7642431259155273, "learning_rate": 6.399892212341687e-06, "loss": 3.1272, "step": 1900 }, { "epoch": 0.2694691457828079, "grad_norm": 1.7536661624908447, "learning_rate": 6.736728644570197e-06, "loss": 3.0626, "step": 2000 }, { "epoch": 0.28294260307194824, "grad_norm": 1.734358787536621, "learning_rate": 7.073565076798707e-06, "loss": 3.0018, "step": 2100 }, { "epoch": 0.29641606036108864, "grad_norm": 1.7043741941452026, "learning_rate": 7.4104015090272165e-06, "loss": 2.9424, "step": 2200 }, { "epoch": 0.30988951765022904, "grad_norm": 1.6511465311050415, "learning_rate": 7.747237941255726e-06, "loss": 2.8826, "step": 2300 }, { "epoch": 0.32336297493936944, "grad_norm": 1.6271641254425049, "learning_rate": 8.084074373484236e-06, "loss": 2.8234, "step": 2400 }, { "epoch": 0.33683643222850984, "grad_norm": 1.565942645072937, "learning_rate": 8.420910805712747e-06, "loss": 2.7649, "step": 2500 }, { "epoch": 0.35030988951765024, "grad_norm": 1.5313780307769775, "learning_rate": 8.757747237941255e-06, "loss": 2.7075, "step": 2600 }, { "epoch": 0.36378334680679064, "grad_norm": 1.4747436046600342, "learning_rate": 9.094583670169765e-06, "loss": 2.6485, "step": 2700 }, { "epoch": 0.37725680409593104, "grad_norm": 1.4180428981781006, "learning_rate": 9.431420102398274e-06, "loss": 2.5908, "step": 2800 }, { "epoch": 0.3907302613850714, "grad_norm": 1.3671729564666748, "learning_rate": 9.768256534626785e-06, "loss": 2.5363, "step": 2900 }, { "epoch": 0.4042037186742118, "grad_norm": 1.3017171621322632, "learning_rate": 1.0105092966855296e-05, "loss": 2.48, "step": 3000 }, { "epoch": 0.4176771759633522, "grad_norm": 1.2297707796096802, "learning_rate": 1.0441929399083805e-05, "loss": 2.4246, "step": 3100 }, { "epoch": 0.4311506332524926, "grad_norm": 1.1738500595092773, "learning_rate": 1.0778765831312316e-05, "loss": 2.3716, "step": 3200 }, { "epoch": 0.444624090541633, "grad_norm": 1.1590039730072021, "learning_rate": 1.1115602263540825e-05, "loss": 2.3205, "step": 3300 }, { "epoch": 0.4580975478307734, "grad_norm": 1.0251466035842896, "learning_rate": 1.1452438695769336e-05, "loss": 2.2692, "step": 3400 }, { "epoch": 0.4715710051199138, "grad_norm": 0.9825196862220764, "learning_rate": 1.1789275127997845e-05, "loss": 2.2221, "step": 3500 }, { "epoch": 0.4850444624090542, "grad_norm": 0.9162874221801758, "learning_rate": 1.2126111560226354e-05, "loss": 2.1744, "step": 3600 }, { "epoch": 0.49851791969819453, "grad_norm": 0.8263767957687378, "learning_rate": 1.2462947992454863e-05, "loss": 2.1343, "step": 3700 }, { "epoch": 0.511991376987335, "grad_norm": 0.8382564187049866, "learning_rate": 1.2799784424683374e-05, "loss": 2.0955, "step": 3800 }, { "epoch": 0.5254648342764754, "grad_norm": 0.7861710786819458, "learning_rate": 1.3136620856911884e-05, "loss": 2.0591, "step": 3900 }, { "epoch": 0.5389382915656158, "grad_norm": 0.74692702293396, "learning_rate": 1.3473457289140393e-05, "loss": 2.0278, "step": 4000 }, { "epoch": 0.5524117488547561, "grad_norm": 0.6762208938598633, "learning_rate": 1.3810293721368904e-05, "loss": 1.9981, "step": 4100 }, { "epoch": 0.5658852061438965, "grad_norm": 0.8441681861877441, "learning_rate": 1.4147130153597413e-05, "loss": 1.9686, "step": 4200 }, { "epoch": 0.5793586634330369, "grad_norm": 0.7593749165534973, "learning_rate": 1.4483966585825924e-05, "loss": 1.9426, "step": 4300 }, { "epoch": 0.5928321207221773, "grad_norm": 0.824931800365448, "learning_rate": 1.4820803018054433e-05, "loss": 1.9175, "step": 4400 }, { "epoch": 0.6063055780113177, "grad_norm": 0.6178775429725647, "learning_rate": 1.5157639450282944e-05, "loss": 1.8941, "step": 4500 }, { "epoch": 0.6197790353004581, "grad_norm": 0.7404887676239014, "learning_rate": 1.549447588251145e-05, "loss": 1.8733, "step": 4600 }, { "epoch": 0.6332524925895985, "grad_norm": 0.8366172313690186, "learning_rate": 1.5831312314739965e-05, "loss": 1.8492, "step": 4700 }, { "epoch": 0.6467259498787389, "grad_norm": 0.8886280655860901, "learning_rate": 1.6168148746968473e-05, "loss": 1.8286, "step": 4800 }, { "epoch": 0.6601994071678793, "grad_norm": 0.7781857848167419, "learning_rate": 1.6504985179196984e-05, "loss": 1.8101, "step": 4900 }, { "epoch": 0.6736728644570197, "grad_norm": 0.7172331213951111, "learning_rate": 1.6841821611425494e-05, "loss": 1.7905, "step": 5000 }, { "epoch": 0.6871463217461601, "grad_norm": 0.7175844311714172, "learning_rate": 1.7178658043654e-05, "loss": 1.7713, "step": 5100 }, { "epoch": 0.7006197790353005, "grad_norm": 0.7535256147384644, "learning_rate": 1.751549447588251e-05, "loss": 1.7522, "step": 5200 }, { "epoch": 0.7140932363244409, "grad_norm": 0.8760775923728943, "learning_rate": 1.785233090811102e-05, "loss": 1.7362, "step": 5300 }, { "epoch": 0.7275666936135813, "grad_norm": 0.897461473941803, "learning_rate": 1.818916734033953e-05, "loss": 1.7199, "step": 5400 }, { "epoch": 0.7410401509027217, "grad_norm": 0.7702743411064148, "learning_rate": 1.852600377256804e-05, "loss": 1.7041, "step": 5500 }, { "epoch": 0.7545136081918621, "grad_norm": 0.9693697690963745, "learning_rate": 1.886284020479655e-05, "loss": 1.6882, "step": 5600 }, { "epoch": 0.7679870654810024, "grad_norm": 1.0300406217575073, "learning_rate": 1.919967663702506e-05, "loss": 1.6717, "step": 5700 }, { "epoch": 0.7814605227701428, "grad_norm": 1.251861572265625, "learning_rate": 1.953651306925357e-05, "loss": 1.6579, "step": 5800 }, { "epoch": 0.7949339800592832, "grad_norm": 1.1067745685577393, "learning_rate": 1.987334950148208e-05, "loss": 1.646, "step": 5900 }, { "epoch": 0.8084074373484236, "grad_norm": 1.0883229970932007, "learning_rate": 2.0210185933710592e-05, "loss": 1.6302, "step": 6000 }, { "epoch": 0.821880894637564, "grad_norm": 0.9809770584106445, "learning_rate": 2.05470223659391e-05, "loss": 1.6174, "step": 6100 }, { "epoch": 0.8353543519267044, "grad_norm": 1.0365121364593506, "learning_rate": 2.088385879816761e-05, "loss": 1.604, "step": 6200 }, { "epoch": 0.8488278092158448, "grad_norm": 0.7996377944946289, "learning_rate": 2.122069523039612e-05, "loss": 1.5926, "step": 6300 }, { "epoch": 0.8623012665049852, "grad_norm": 0.8180720806121826, "learning_rate": 2.155753166262463e-05, "loss": 1.5787, "step": 6400 }, { "epoch": 0.8757747237941256, "grad_norm": 1.176373839378357, "learning_rate": 2.1894368094853142e-05, "loss": 1.569, "step": 6500 }, { "epoch": 0.889248181083266, "grad_norm": 1.0989619493484497, "learning_rate": 2.223120452708165e-05, "loss": 1.5566, "step": 6600 }, { "epoch": 0.9027216383724064, "grad_norm": 1.2652089595794678, "learning_rate": 2.256804095931016e-05, "loss": 1.5465, "step": 6700 }, { "epoch": 0.9161950956615468, "grad_norm": 1.1882052421569824, "learning_rate": 2.290487739153867e-05, "loss": 1.5333, "step": 6800 }, { "epoch": 0.9296685529506872, "grad_norm": 1.1235698461532593, "learning_rate": 2.3241713823767182e-05, "loss": 1.5229, "step": 6900 }, { "epoch": 0.9431420102398276, "grad_norm": 0.9922112822532654, "learning_rate": 2.357855025599569e-05, "loss": 1.5108, "step": 7000 }, { "epoch": 0.956615467528968, "grad_norm": 1.1093932390213013, "learning_rate": 2.3915386688224197e-05, "loss": 1.5023, "step": 7100 }, { "epoch": 0.9700889248181084, "grad_norm": 1.166998267173767, "learning_rate": 2.4252223120452707e-05, "loss": 1.4923, "step": 7200 }, { "epoch": 0.9835623821072487, "grad_norm": 1.2979530096054077, "learning_rate": 2.4589059552681218e-05, "loss": 1.4831, "step": 7300 }, { "epoch": 0.9970358393963891, "grad_norm": 1.1854064464569092, "learning_rate": 2.4925895984909726e-05, "loss": 1.4726, "step": 7400 }, { "epoch": 1.0, "eval_loss": 1.4243167638778687, "eval_runtime": 4.9168, "eval_samples_per_second": 1016.916, "eval_steps_per_second": 16.067, "step": 7422 }, { "epoch": 1.0105092966855296, "grad_norm": 1.0570918321609497, "learning_rate": 2.5262732417138236e-05, "loss": 1.4615, "step": 7500 }, { "epoch": 1.02398275397467, "grad_norm": 1.4963526725769043, "learning_rate": 2.5599568849366747e-05, "loss": 1.4538, "step": 7600 }, { "epoch": 1.0374562112638104, "grad_norm": 1.2198303937911987, "learning_rate": 2.5936405281595258e-05, "loss": 1.4427, "step": 7700 }, { "epoch": 1.0509296685529508, "grad_norm": 1.3090357780456543, "learning_rate": 2.627324171382377e-05, "loss": 1.4302, "step": 7800 }, { "epoch": 1.0644031258420912, "grad_norm": 1.450103521347046, "learning_rate": 2.6610078146052276e-05, "loss": 1.4217, "step": 7900 }, { "epoch": 1.0778765831312316, "grad_norm": 1.1438026428222656, "learning_rate": 2.6946914578280787e-05, "loss": 1.4151, "step": 8000 }, { "epoch": 1.0913500404203718, "grad_norm": 1.1328705549240112, "learning_rate": 2.7283751010509298e-05, "loss": 1.4042, "step": 8100 }, { "epoch": 1.1048234977095122, "grad_norm": 1.4198311567306519, "learning_rate": 2.762058744273781e-05, "loss": 1.3973, "step": 8200 }, { "epoch": 1.1182969549986526, "grad_norm": 1.3783742189407349, "learning_rate": 2.7957423874966316e-05, "loss": 1.3842, "step": 8300 }, { "epoch": 1.131770412287793, "grad_norm": 1.453730821609497, "learning_rate": 2.8294260307194826e-05, "loss": 1.3775, "step": 8400 }, { "epoch": 1.1452438695769334, "grad_norm": 1.2896552085876465, "learning_rate": 2.8631096739423337e-05, "loss": 1.3671, "step": 8500 }, { "epoch": 1.1587173268660738, "grad_norm": 1.5998131036758423, "learning_rate": 2.8967933171651848e-05, "loss": 1.36, "step": 8600 }, { "epoch": 1.1721907841552142, "grad_norm": 1.5618042945861816, "learning_rate": 2.930476960388036e-05, "loss": 1.3501, "step": 8700 }, { "epoch": 1.1856642414443546, "grad_norm": 1.2911502122879028, "learning_rate": 2.9641606036108866e-05, "loss": 1.342, "step": 8800 }, { "epoch": 1.199137698733495, "grad_norm": 1.5170632600784302, "learning_rate": 2.9978442468337377e-05, "loss": 1.3328, "step": 8900 }, { "epoch": 1.2126111560226354, "grad_norm": 1.3672733306884766, "learning_rate": 3.0315278900565888e-05, "loss": 1.3267, "step": 9000 }, { "epoch": 1.2260846133117758, "grad_norm": 1.439145565032959, "learning_rate": 3.065211533279439e-05, "loss": 1.3181, "step": 9100 }, { "epoch": 1.2395580706009162, "grad_norm": 1.6262305974960327, "learning_rate": 3.09889517650229e-05, "loss": 1.3094, "step": 9200 }, { "epoch": 1.2530315278900566, "grad_norm": 1.6867625713348389, "learning_rate": 3.132578819725142e-05, "loss": 1.3014, "step": 9300 }, { "epoch": 1.266504985179197, "grad_norm": 1.4283783435821533, "learning_rate": 3.166262462947993e-05, "loss": 1.293, "step": 9400 }, { "epoch": 1.2799784424683374, "grad_norm": 1.639009714126587, "learning_rate": 3.1999461061708435e-05, "loss": 1.2864, "step": 9500 }, { "epoch": 1.2934518997574778, "grad_norm": 1.4431902170181274, "learning_rate": 3.2336297493936946e-05, "loss": 1.279, "step": 9600 }, { "epoch": 1.3069253570466182, "grad_norm": 1.5873929262161255, "learning_rate": 3.2673133926165456e-05, "loss": 1.2711, "step": 9700 }, { "epoch": 1.3203988143357586, "grad_norm": 1.525464653968811, "learning_rate": 3.300997035839397e-05, "loss": 1.2623, "step": 9800 }, { "epoch": 1.333872271624899, "grad_norm": 1.7639635801315308, "learning_rate": 3.334680679062248e-05, "loss": 1.2567, "step": 9900 }, { "epoch": 1.3473457289140394, "grad_norm": 1.5616530179977417, "learning_rate": 3.368364322285099e-05, "loss": 1.2497, "step": 10000 }, { "epoch": 1.3608191862031798, "grad_norm": 1.741506576538086, "learning_rate": 3.402047965507949e-05, "loss": 1.2449, "step": 10100 }, { "epoch": 1.3742926434923202, "grad_norm": 1.8986817598342896, "learning_rate": 3.4357316087308e-05, "loss": 1.2377, "step": 10200 }, { "epoch": 1.3877661007814606, "grad_norm": 1.5373413562774658, "learning_rate": 3.469415251953651e-05, "loss": 1.231, "step": 10300 }, { "epoch": 1.401239558070601, "grad_norm": 1.6659778356552124, "learning_rate": 3.503098895176502e-05, "loss": 1.2243, "step": 10400 }, { "epoch": 1.4147130153597414, "grad_norm": 1.7636853456497192, "learning_rate": 3.536782538399353e-05, "loss": 1.2181, "step": 10500 }, { "epoch": 1.4281864726488818, "grad_norm": 1.5813233852386475, "learning_rate": 3.570466181622204e-05, "loss": 1.2133, "step": 10600 }, { "epoch": 1.4416599299380222, "grad_norm": 1.6228280067443848, "learning_rate": 3.604149824845055e-05, "loss": 1.2063, "step": 10700 }, { "epoch": 1.4551333872271626, "grad_norm": 1.610800862312317, "learning_rate": 3.637833468067906e-05, "loss": 1.2011, "step": 10800 }, { "epoch": 1.468606844516303, "grad_norm": 1.81444251537323, "learning_rate": 3.671517111290757e-05, "loss": 1.1967, "step": 10900 }, { "epoch": 1.4820803018054431, "grad_norm": 1.9130659103393555, "learning_rate": 3.705200754513608e-05, "loss": 1.1879, "step": 11000 }, { "epoch": 1.4955537590945838, "grad_norm": 1.7679039239883423, "learning_rate": 3.7388843977364593e-05, "loss": 1.1851, "step": 11100 }, { "epoch": 1.509027216383724, "grad_norm": 1.7163736820220947, "learning_rate": 3.77256804095931e-05, "loss": 1.1792, "step": 11200 }, { "epoch": 1.5225006736728646, "grad_norm": 1.96596097946167, "learning_rate": 3.806251684182161e-05, "loss": 1.1737, "step": 11300 }, { "epoch": 1.5359741309620047, "grad_norm": 1.6757841110229492, "learning_rate": 3.839935327405012e-05, "loss": 1.1682, "step": 11400 }, { "epoch": 1.5494475882511454, "grad_norm": 1.5971980094909668, "learning_rate": 3.873618970627863e-05, "loss": 1.1643, "step": 11500 }, { "epoch": 1.5629210455402855, "grad_norm": 1.7918592691421509, "learning_rate": 3.907302613850714e-05, "loss": 1.1591, "step": 11600 }, { "epoch": 1.5763945028294262, "grad_norm": 1.5908026695251465, "learning_rate": 3.940986257073565e-05, "loss": 1.1526, "step": 11700 }, { "epoch": 1.5898679601185663, "grad_norm": 1.8521835803985596, "learning_rate": 3.974669900296416e-05, "loss": 1.1447, "step": 11800 }, { "epoch": 1.603341417407707, "grad_norm": 1.6523579359054565, "learning_rate": 4.008353543519267e-05, "loss": 1.1373, "step": 11900 }, { "epoch": 1.6168148746968471, "grad_norm": 1.7173532247543335, "learning_rate": 4.0420371867421184e-05, "loss": 1.1286, "step": 12000 }, { "epoch": 1.6302883319859878, "grad_norm": 1.6983517408370972, "learning_rate": 4.0757208299649694e-05, "loss": 1.1174, "step": 12100 }, { "epoch": 1.643761789275128, "grad_norm": 2.1942336559295654, "learning_rate": 4.10940447318782e-05, "loss": 1.1106, "step": 12200 }, { "epoch": 1.6572352465642683, "grad_norm": 1.9910157918930054, "learning_rate": 4.143088116410671e-05, "loss": 1.0988, "step": 12300 }, { "epoch": 1.6707087038534087, "grad_norm": 1.7737709283828735, "learning_rate": 4.176771759633522e-05, "loss": 1.0869, "step": 12400 }, { "epoch": 1.6841821611425492, "grad_norm": 1.816585659980774, "learning_rate": 4.210455402856373e-05, "loss": 1.0733, "step": 12500 }, { "epoch": 1.6976556184316896, "grad_norm": 2.0184226036071777, "learning_rate": 4.244139046079224e-05, "loss": 1.0642, "step": 12600 }, { "epoch": 1.71112907572083, "grad_norm": 1.868251085281372, "learning_rate": 4.277822689302075e-05, "loss": 1.0514, "step": 12700 }, { "epoch": 1.7246025330099704, "grad_norm": 2.1273250579833984, "learning_rate": 4.311506332524926e-05, "loss": 1.0412, "step": 12800 }, { "epoch": 1.7380759902991108, "grad_norm": 2.015742778778076, "learning_rate": 4.3451899757477774e-05, "loss": 1.0296, "step": 12900 }, { "epoch": 1.7515494475882512, "grad_norm": 2.0739262104034424, "learning_rate": 4.3788736189706284e-05, "loss": 1.0215, "step": 13000 }, { "epoch": 1.7650229048773916, "grad_norm": 1.9672932624816895, "learning_rate": 4.412557262193479e-05, "loss": 1.016, "step": 13100 }, { "epoch": 1.778496362166532, "grad_norm": 2.0514392852783203, "learning_rate": 4.44624090541633e-05, "loss": 1.0082, "step": 13200 }, { "epoch": 1.7919698194556724, "grad_norm": 1.8100783824920654, "learning_rate": 4.479924548639181e-05, "loss": 1.002, "step": 13300 }, { "epoch": 1.8054432767448128, "grad_norm": 1.7987364530563354, "learning_rate": 4.513608191862032e-05, "loss": 0.9984, "step": 13400 }, { "epoch": 1.8189167340339532, "grad_norm": 1.9320526123046875, "learning_rate": 4.547291835084883e-05, "loss": 0.9931, "step": 13500 }, { "epoch": 1.8323901913230936, "grad_norm": 1.731566309928894, "learning_rate": 4.580975478307734e-05, "loss": 0.9877, "step": 13600 }, { "epoch": 1.845863648612234, "grad_norm": 1.5202717781066895, "learning_rate": 4.614659121530585e-05, "loss": 0.9831, "step": 13700 }, { "epoch": 1.8593371059013744, "grad_norm": 2.0450828075408936, "learning_rate": 4.6483427647534364e-05, "loss": 0.9807, "step": 13800 }, { "epoch": 1.8728105631905145, "grad_norm": 1.4799247980117798, "learning_rate": 4.6820264079762875e-05, "loss": 0.9768, "step": 13900 }, { "epoch": 1.8862840204796552, "grad_norm": 1.5794858932495117, "learning_rate": 4.715710051199138e-05, "loss": 0.973, "step": 14000 }, { "epoch": 1.8997574777687953, "grad_norm": 1.7370023727416992, "learning_rate": 4.749393694421989e-05, "loss": 0.969, "step": 14100 }, { "epoch": 1.913230935057936, "grad_norm": 1.6848158836364746, "learning_rate": 4.783077337644839e-05, "loss": 0.9685, "step": 14200 }, { "epoch": 1.9267043923470761, "grad_norm": 1.5446182489395142, "learning_rate": 4.8167609808676904e-05, "loss": 0.963, "step": 14300 }, { "epoch": 1.9401778496362168, "grad_norm": 1.9139388799667358, "learning_rate": 4.8504446240905415e-05, "loss": 0.96, "step": 14400 }, { "epoch": 1.953651306925357, "grad_norm": 1.726624608039856, "learning_rate": 4.8841282673133926e-05, "loss": 0.9568, "step": 14500 }, { "epoch": 1.9671247642144976, "grad_norm": 1.702346920967102, "learning_rate": 4.9178119105362436e-05, "loss": 0.9537, "step": 14600 }, { "epoch": 1.9805982215036377, "grad_norm": 1.7672113180160522, "learning_rate": 4.951495553759095e-05, "loss": 0.9504, "step": 14700 }, { "epoch": 1.9940716787927784, "grad_norm": 2.078725576400757, "learning_rate": 4.985179196981945e-05, "loss": 0.9481, "step": 14800 }, { "epoch": 2.0, "eval_loss": 0.9009393453598022, "eval_runtime": 4.9357, "eval_samples_per_second": 1013.02, "eval_steps_per_second": 16.006, "step": 14844 }, { "epoch": 2.0075451360819185, "grad_norm": 1.779005765914917, "learning_rate": 5.018862840204796e-05, "loss": 0.945, "step": 14900 }, { "epoch": 2.021018593371059, "grad_norm": 1.6590195894241333, "learning_rate": 5.052546483427647e-05, "loss": 0.9429, "step": 15000 }, { "epoch": 2.0344920506601993, "grad_norm": 1.8849517107009888, "learning_rate": 5.0862301266504983e-05, "loss": 0.9413, "step": 15100 }, { "epoch": 2.04796550794934, "grad_norm": 1.7405115365982056, "learning_rate": 5.1199137698733494e-05, "loss": 0.936, "step": 15200 }, { "epoch": 2.06143896523848, "grad_norm": 1.7108243703842163, "learning_rate": 5.1535974130962005e-05, "loss": 0.9366, "step": 15300 }, { "epoch": 2.0749124225276208, "grad_norm": 1.6526060104370117, "learning_rate": 5.1872810563190516e-05, "loss": 0.9317, "step": 15400 }, { "epoch": 2.088385879816761, "grad_norm": 1.7151249647140503, "learning_rate": 5.2209646995419027e-05, "loss": 0.9316, "step": 15500 }, { "epoch": 2.1018593371059016, "grad_norm": 1.6582752466201782, "learning_rate": 5.254648342764754e-05, "loss": 0.9279, "step": 15600 }, { "epoch": 2.1153327943950417, "grad_norm": 1.7550755739212036, "learning_rate": 5.288331985987604e-05, "loss": 0.9268, "step": 15700 }, { "epoch": 2.1288062516841824, "grad_norm": 1.798999547958374, "learning_rate": 5.322015629210455e-05, "loss": 0.9241, "step": 15800 }, { "epoch": 2.1422797089733225, "grad_norm": 1.501911997795105, "learning_rate": 5.355699272433306e-05, "loss": 0.9236, "step": 15900 }, { "epoch": 2.155753166262463, "grad_norm": 1.754218578338623, "learning_rate": 5.3893829156561574e-05, "loss": 0.9193, "step": 16000 }, { "epoch": 2.1692266235516033, "grad_norm": 1.5622211694717407, "learning_rate": 5.4230665588790084e-05, "loss": 0.9194, "step": 16100 }, { "epoch": 2.1827000808407435, "grad_norm": 1.593868374824524, "learning_rate": 5.4567502021018595e-05, "loss": 0.9161, "step": 16200 }, { "epoch": 2.196173538129884, "grad_norm": 1.704789161682129, "learning_rate": 5.4904338453247106e-05, "loss": 0.9127, "step": 16300 }, { "epoch": 2.2096469954190243, "grad_norm": 1.7025717496871948, "learning_rate": 5.524117488547562e-05, "loss": 0.9119, "step": 16400 }, { "epoch": 2.223120452708165, "grad_norm": 1.603441596031189, "learning_rate": 5.557801131770413e-05, "loss": 0.9094, "step": 16500 }, { "epoch": 2.236593909997305, "grad_norm": 1.6574318408966064, "learning_rate": 5.591484774993263e-05, "loss": 0.9088, "step": 16600 }, { "epoch": 2.2500673672864457, "grad_norm": 1.4947423934936523, "learning_rate": 5.625168418216114e-05, "loss": 0.9064, "step": 16700 }, { "epoch": 2.263540824575586, "grad_norm": 1.8505243062973022, "learning_rate": 5.658852061438965e-05, "loss": 0.905, "step": 16800 }, { "epoch": 2.2770142818647265, "grad_norm": 1.6498006582260132, "learning_rate": 5.6925357046618164e-05, "loss": 0.9015, "step": 16900 }, { "epoch": 2.2904877391538667, "grad_norm": 1.472609519958496, "learning_rate": 5.7262193478846674e-05, "loss": 0.9013, "step": 17000 }, { "epoch": 2.3039611964430073, "grad_norm": 1.8265018463134766, "learning_rate": 5.7599029911075185e-05, "loss": 0.899, "step": 17100 }, { "epoch": 2.3174346537321475, "grad_norm": 1.691354513168335, "learning_rate": 5.7935866343303696e-05, "loss": 0.8975, "step": 17200 }, { "epoch": 2.330908111021288, "grad_norm": 1.5141873359680176, "learning_rate": 5.827270277553221e-05, "loss": 0.8951, "step": 17300 }, { "epoch": 2.3443815683104283, "grad_norm": 1.702109456062317, "learning_rate": 5.860953920776072e-05, "loss": 0.8943, "step": 17400 }, { "epoch": 2.357855025599569, "grad_norm": 1.689287781715393, "learning_rate": 5.894637563998922e-05, "loss": 0.8915, "step": 17500 }, { "epoch": 2.371328482888709, "grad_norm": 1.5354336500167847, "learning_rate": 5.928321207221773e-05, "loss": 0.8904, "step": 17600 }, { "epoch": 2.3848019401778497, "grad_norm": 1.480864405632019, "learning_rate": 5.962004850444624e-05, "loss": 0.8881, "step": 17700 }, { "epoch": 2.39827539746699, "grad_norm": 1.692920446395874, "learning_rate": 5.9956884936674754e-05, "loss": 0.8885, "step": 17800 }, { "epoch": 2.4117488547561305, "grad_norm": 1.7953569889068604, "learning_rate": 6.0293721368903265e-05, "loss": 0.8862, "step": 17900 }, { "epoch": 2.4252223120452707, "grad_norm": 1.5273362398147583, "learning_rate": 6.0630557801131775e-05, "loss": 0.8847, "step": 18000 }, { "epoch": 2.4386957693344113, "grad_norm": 1.7911235094070435, "learning_rate": 6.0967394233360286e-05, "loss": 0.8841, "step": 18100 }, { "epoch": 2.4521692266235515, "grad_norm": 1.5316576957702637, "learning_rate": 6.130423066558878e-05, "loss": 0.883, "step": 18200 }, { "epoch": 2.465642683912692, "grad_norm": 1.6153037548065186, "learning_rate": 6.16410670978173e-05, "loss": 0.8779, "step": 18300 }, { "epoch": 2.4791161412018323, "grad_norm": 1.5775448083877563, "learning_rate": 6.19779035300458e-05, "loss": 0.8777, "step": 18400 }, { "epoch": 2.492589598490973, "grad_norm": 1.6069939136505127, "learning_rate": 6.231473996227432e-05, "loss": 0.8763, "step": 18500 }, { "epoch": 2.506063055780113, "grad_norm": 1.6464314460754395, "learning_rate": 6.265157639450284e-05, "loss": 0.8746, "step": 18600 }, { "epoch": 2.5195365130692533, "grad_norm": 1.537147879600525, "learning_rate": 6.298841282673135e-05, "loss": 0.8742, "step": 18700 }, { "epoch": 2.533009970358394, "grad_norm": 1.690079689025879, "learning_rate": 6.332524925895986e-05, "loss": 0.8737, "step": 18800 }, { "epoch": 2.5464834276475345, "grad_norm": 1.5641427040100098, "learning_rate": 6.366208569118836e-05, "loss": 0.8717, "step": 18900 }, { "epoch": 2.5599568849366747, "grad_norm": 1.7134102582931519, "learning_rate": 6.399892212341687e-05, "loss": 0.8708, "step": 19000 }, { "epoch": 2.573430342225815, "grad_norm": 1.7088773250579834, "learning_rate": 6.433575855564538e-05, "loss": 0.8683, "step": 19100 }, { "epoch": 2.5869037995149555, "grad_norm": 1.5207405090332031, "learning_rate": 6.467259498787389e-05, "loss": 0.867, "step": 19200 }, { "epoch": 2.600377256804096, "grad_norm": 1.541784644126892, "learning_rate": 6.50094314201024e-05, "loss": 0.8655, "step": 19300 }, { "epoch": 2.6138507140932363, "grad_norm": 1.419013500213623, "learning_rate": 6.534626785233091e-05, "loss": 0.8654, "step": 19400 }, { "epoch": 2.6273241713823765, "grad_norm": 1.611279010772705, "learning_rate": 6.568310428455942e-05, "loss": 0.864, "step": 19500 }, { "epoch": 2.640797628671517, "grad_norm": 1.5877684354782104, "learning_rate": 6.601994071678793e-05, "loss": 0.8629, "step": 19600 }, { "epoch": 2.6542710859606578, "grad_norm": 1.458388328552246, "learning_rate": 6.635677714901644e-05, "loss": 0.8601, "step": 19700 }, { "epoch": 2.667744543249798, "grad_norm": 1.5770941972732544, "learning_rate": 6.669361358124496e-05, "loss": 0.8593, "step": 19800 }, { "epoch": 2.681218000538938, "grad_norm": 1.6758652925491333, "learning_rate": 6.703045001347347e-05, "loss": 0.8587, "step": 19900 }, { "epoch": 2.6946914578280787, "grad_norm": 1.550167202949524, "learning_rate": 6.736728644570198e-05, "loss": 0.8581, "step": 20000 }, { "epoch": 2.7081649151172194, "grad_norm": 1.5537160634994507, "learning_rate": 6.770412287793049e-05, "loss": 0.8568, "step": 20100 }, { "epoch": 2.7216383724063595, "grad_norm": 1.4086116552352905, "learning_rate": 6.804095931015899e-05, "loss": 0.8553, "step": 20200 }, { "epoch": 2.7351118296954997, "grad_norm": 1.5523277521133423, "learning_rate": 6.83777957423875e-05, "loss": 0.8545, "step": 20300 }, { "epoch": 2.7485852869846403, "grad_norm": 1.3250375986099243, "learning_rate": 6.8714632174616e-05, "loss": 0.8528, "step": 20400 }, { "epoch": 2.7620587442737805, "grad_norm": 1.4796454906463623, "learning_rate": 6.905146860684452e-05, "loss": 0.852, "step": 20500 }, { "epoch": 2.775532201562921, "grad_norm": 1.4476518630981445, "learning_rate": 6.938830503907301e-05, "loss": 0.8495, "step": 20600 }, { "epoch": 2.7890056588520613, "grad_norm": 1.5159904956817627, "learning_rate": 6.972514147130153e-05, "loss": 0.8486, "step": 20700 }, { "epoch": 2.802479116141202, "grad_norm": 1.6851344108581543, "learning_rate": 7.006197790353004e-05, "loss": 0.8491, "step": 20800 }, { "epoch": 2.815952573430342, "grad_norm": 1.6081255674362183, "learning_rate": 7.039881433575855e-05, "loss": 0.8466, "step": 20900 }, { "epoch": 2.8294260307194827, "grad_norm": 1.4334090948104858, "learning_rate": 7.073565076798706e-05, "loss": 0.8468, "step": 21000 }, { "epoch": 2.842899488008623, "grad_norm": 1.5177432298660278, "learning_rate": 7.107248720021557e-05, "loss": 0.8443, "step": 21100 }, { "epoch": 2.8563729452977635, "grad_norm": 1.8086514472961426, "learning_rate": 7.140932363244408e-05, "loss": 0.8433, "step": 21200 }, { "epoch": 2.8698464025869037, "grad_norm": 1.7057809829711914, "learning_rate": 7.174616006467259e-05, "loss": 0.841, "step": 21300 }, { "epoch": 2.8833198598760443, "grad_norm": 1.4802790880203247, "learning_rate": 7.20829964969011e-05, "loss": 0.8424, "step": 21400 }, { "epoch": 2.8967933171651845, "grad_norm": 1.5523607730865479, "learning_rate": 7.241983292912961e-05, "loss": 0.8418, "step": 21500 }, { "epoch": 2.910266774454325, "grad_norm": 1.5498450994491577, "learning_rate": 7.275666936135812e-05, "loss": 0.839, "step": 21600 }, { "epoch": 2.9237402317434653, "grad_norm": 1.5559934377670288, "learning_rate": 7.309350579358663e-05, "loss": 0.8372, "step": 21700 }, { "epoch": 2.937213689032606, "grad_norm": 1.4754034280776978, "learning_rate": 7.343034222581514e-05, "loss": 0.8359, "step": 21800 }, { "epoch": 2.950687146321746, "grad_norm": 1.4573789834976196, "learning_rate": 7.376717865804365e-05, "loss": 0.8359, "step": 21900 }, { "epoch": 2.9641606036108863, "grad_norm": 1.3782004117965698, "learning_rate": 7.410401509027217e-05, "loss": 0.8349, "step": 22000 }, { "epoch": 2.977634060900027, "grad_norm": 1.5825591087341309, "learning_rate": 7.444085152250068e-05, "loss": 0.8337, "step": 22100 }, { "epoch": 2.9911075181891675, "grad_norm": 1.38994562625885, "learning_rate": 7.477768795472919e-05, "loss": 0.8333, "step": 22200 }, { "epoch": 3.0, "eval_loss": 0.799736738204956, "eval_runtime": 4.9265, "eval_samples_per_second": 1014.929, "eval_steps_per_second": 16.036, "step": 22266 }, { "epoch": 3.0045809754783077, "grad_norm": 1.4885023832321167, "learning_rate": 7.51145243869577e-05, "loss": 0.8315, "step": 22300 }, { "epoch": 3.0180544327674483, "grad_norm": 1.4334232807159424, "learning_rate": 7.54513608191862e-05, "loss": 0.8312, "step": 22400 }, { "epoch": 3.0315278900565885, "grad_norm": 1.3167600631713867, "learning_rate": 7.57881972514147e-05, "loss": 0.8312, "step": 22500 }, { "epoch": 3.045001347345729, "grad_norm": 1.4370670318603516, "learning_rate": 7.612503368364322e-05, "loss": 0.8284, "step": 22600 }, { "epoch": 3.0584748046348693, "grad_norm": 1.5740081071853638, "learning_rate": 7.646187011587173e-05, "loss": 0.8279, "step": 22700 }, { "epoch": 3.0719482619240095, "grad_norm": 1.5045068264007568, "learning_rate": 7.679870654810024e-05, "loss": 0.8267, "step": 22800 }, { "epoch": 3.08542171921315, "grad_norm": 1.584980845451355, "learning_rate": 7.713554298032875e-05, "loss": 0.8249, "step": 22900 }, { "epoch": 3.0988951765022903, "grad_norm": 1.4468419551849365, "learning_rate": 7.747237941255726e-05, "loss": 0.8254, "step": 23000 }, { "epoch": 3.112368633791431, "grad_norm": 1.3465651273727417, "learning_rate": 7.780921584478577e-05, "loss": 0.8241, "step": 23100 }, { "epoch": 3.125842091080571, "grad_norm": 1.3511197566986084, "learning_rate": 7.814605227701428e-05, "loss": 0.8215, "step": 23200 }, { "epoch": 3.1393155483697117, "grad_norm": 1.5290985107421875, "learning_rate": 7.848288870924279e-05, "loss": 0.8217, "step": 23300 }, { "epoch": 3.152789005658852, "grad_norm": 1.5088189840316772, "learning_rate": 7.88197251414713e-05, "loss": 0.821, "step": 23400 }, { "epoch": 3.1662624629479925, "grad_norm": 1.3461265563964844, "learning_rate": 7.915656157369981e-05, "loss": 0.8205, "step": 23500 }, { "epoch": 3.1797359202371327, "grad_norm": 1.6933553218841553, "learning_rate": 7.949339800592832e-05, "loss": 0.8193, "step": 23600 }, { "epoch": 3.1932093775262733, "grad_norm": 1.6161068677902222, "learning_rate": 7.983023443815683e-05, "loss": 0.8171, "step": 23700 }, { "epoch": 3.2066828348154135, "grad_norm": 1.459757685661316, "learning_rate": 8.016707087038535e-05, "loss": 0.8166, "step": 23800 }, { "epoch": 3.220156292104554, "grad_norm": 1.4641468524932861, "learning_rate": 8.050390730261386e-05, "loss": 0.8151, "step": 23900 }, { "epoch": 3.2336297493936943, "grad_norm": 1.4080913066864014, "learning_rate": 8.084074373484237e-05, "loss": 0.8156, "step": 24000 }, { "epoch": 3.247103206682835, "grad_norm": 1.3389708995819092, "learning_rate": 8.117758016707088e-05, "loss": 0.814, "step": 24100 }, { "epoch": 3.260576663971975, "grad_norm": 1.326341986656189, "learning_rate": 8.151441659929939e-05, "loss": 0.8147, "step": 24200 }, { "epoch": 3.2740501212611157, "grad_norm": 1.291643738746643, "learning_rate": 8.185125303152789e-05, "loss": 0.813, "step": 24300 }, { "epoch": 3.287523578550256, "grad_norm": 1.2975057363510132, "learning_rate": 8.21880894637564e-05, "loss": 0.8118, "step": 24400 }, { "epoch": 3.3009970358393965, "grad_norm": 1.412612795829773, "learning_rate": 8.252492589598491e-05, "loss": 0.8108, "step": 24500 }, { "epoch": 3.3144704931285367, "grad_norm": 1.430005431175232, "learning_rate": 8.286176232821342e-05, "loss": 0.8101, "step": 24600 }, { "epoch": 3.3279439504176773, "grad_norm": 1.327523946762085, "learning_rate": 8.319859876044193e-05, "loss": 0.81, "step": 24700 }, { "epoch": 3.3414174077068175, "grad_norm": 1.3939589262008667, "learning_rate": 8.353543519267044e-05, "loss": 0.8081, "step": 24800 }, { "epoch": 3.354890864995958, "grad_norm": 1.337794303894043, "learning_rate": 8.387227162489895e-05, "loss": 0.8087, "step": 24900 }, { "epoch": 3.3683643222850983, "grad_norm": 1.387837529182434, "learning_rate": 8.420910805712746e-05, "loss": 0.8064, "step": 25000 }, { "epoch": 3.381837779574239, "grad_norm": 1.344419240951538, "learning_rate": 8.454594448935597e-05, "loss": 0.8073, "step": 25100 }, { "epoch": 3.395311236863379, "grad_norm": 1.318400502204895, "learning_rate": 8.488278092158448e-05, "loss": 0.8049, "step": 25200 }, { "epoch": 3.4087846941525197, "grad_norm": 1.4658153057098389, "learning_rate": 8.5219617353813e-05, "loss": 0.8032, "step": 25300 }, { "epoch": 3.42225815144166, "grad_norm": 1.3893009424209595, "learning_rate": 8.55564537860415e-05, "loss": 0.8028, "step": 25400 }, { "epoch": 3.4357316087308005, "grad_norm": 1.381014108657837, "learning_rate": 8.589329021827002e-05, "loss": 0.8035, "step": 25500 }, { "epoch": 3.4492050660199407, "grad_norm": 1.501815915107727, "learning_rate": 8.623012665049853e-05, "loss": 0.8008, "step": 25600 }, { "epoch": 3.462678523309081, "grad_norm": 1.3108172416687012, "learning_rate": 8.656696308272704e-05, "loss": 0.8014, "step": 25700 }, { "epoch": 3.4761519805982215, "grad_norm": 1.281020998954773, "learning_rate": 8.690379951495555e-05, "loss": 0.8, "step": 25800 }, { "epoch": 3.489625437887362, "grad_norm": 1.5074561834335327, "learning_rate": 8.724063594718406e-05, "loss": 0.7988, "step": 25900 }, { "epoch": 3.5030988951765023, "grad_norm": 1.3180803060531616, "learning_rate": 8.757747237941257e-05, "loss": 0.7993, "step": 26000 }, { "epoch": 3.5165723524656425, "grad_norm": 1.4048805236816406, "learning_rate": 8.791430881164107e-05, "loss": 0.7969, "step": 26100 }, { "epoch": 3.530045809754783, "grad_norm": 1.1951408386230469, "learning_rate": 8.825114524386958e-05, "loss": 0.7974, "step": 26200 }, { "epoch": 3.5435192670439237, "grad_norm": 1.2480727434158325, "learning_rate": 8.858798167609809e-05, "loss": 0.7956, "step": 26300 }, { "epoch": 3.556992724333064, "grad_norm": 1.258634090423584, "learning_rate": 8.89248181083266e-05, "loss": 0.7954, "step": 26400 }, { "epoch": 3.570466181622204, "grad_norm": 1.2156716585159302, "learning_rate": 8.926165454055511e-05, "loss": 0.7945, "step": 26500 }, { "epoch": 3.5839396389113447, "grad_norm": 1.2537472248077393, "learning_rate": 8.959849097278362e-05, "loss": 0.7937, "step": 26600 }, { "epoch": 3.597413096200485, "grad_norm": 1.3081258535385132, "learning_rate": 8.993532740501213e-05, "loss": 0.7916, "step": 26700 }, { "epoch": 3.6108865534896255, "grad_norm": 1.3487190008163452, "learning_rate": 9.027216383724064e-05, "loss": 0.7924, "step": 26800 }, { "epoch": 3.6243600107787657, "grad_norm": 1.3321914672851562, "learning_rate": 9.060900026946915e-05, "loss": 0.7915, "step": 26900 }, { "epoch": 3.6378334680679063, "grad_norm": 1.209767460823059, "learning_rate": 9.094583670169766e-05, "loss": 0.7911, "step": 27000 }, { "epoch": 3.6513069253570465, "grad_norm": 1.271689772605896, "learning_rate": 9.128267313392617e-05, "loss": 0.7892, "step": 27100 }, { "epoch": 3.664780382646187, "grad_norm": 1.1579375267028809, "learning_rate": 9.161950956615468e-05, "loss": 0.791, "step": 27200 }, { "epoch": 3.6782538399353273, "grad_norm": 1.251380443572998, "learning_rate": 9.19563459983832e-05, "loss": 0.7883, "step": 27300 }, { "epoch": 3.691727297224468, "grad_norm": 1.310412883758545, "learning_rate": 9.22931824306117e-05, "loss": 0.7871, "step": 27400 }, { "epoch": 3.705200754513608, "grad_norm": 1.2627147436141968, "learning_rate": 9.263001886284022e-05, "loss": 0.787, "step": 27500 }, { "epoch": 3.7186742118027487, "grad_norm": 1.1902483701705933, "learning_rate": 9.296685529506873e-05, "loss": 0.7855, "step": 27600 }, { "epoch": 3.732147669091889, "grad_norm": 1.4136449098587036, "learning_rate": 9.330369172729724e-05, "loss": 0.7856, "step": 27700 }, { "epoch": 3.7456211263810295, "grad_norm": 1.2495746612548828, "learning_rate": 9.364052815952575e-05, "loss": 0.785, "step": 27800 }, { "epoch": 3.7590945836701697, "grad_norm": 1.3513541221618652, "learning_rate": 9.397736459175425e-05, "loss": 0.7824, "step": 27900 }, { "epoch": 3.7725680409593103, "grad_norm": 1.2921991348266602, "learning_rate": 9.431420102398276e-05, "loss": 0.7844, "step": 28000 }, { "epoch": 3.7860414982484505, "grad_norm": 1.2404944896697998, "learning_rate": 9.465103745621127e-05, "loss": 0.7828, "step": 28100 }, { "epoch": 3.799514955537591, "grad_norm": 1.1229581832885742, "learning_rate": 9.498787388843978e-05, "loss": 0.7806, "step": 28200 }, { "epoch": 3.8129884128267313, "grad_norm": 1.112694263458252, "learning_rate": 9.532471032066828e-05, "loss": 0.7811, "step": 28300 }, { "epoch": 3.826461870115872, "grad_norm": 1.3388352394104004, "learning_rate": 9.566154675289679e-05, "loss": 0.7798, "step": 28400 }, { "epoch": 3.839935327405012, "grad_norm": 1.2395657300949097, "learning_rate": 9.59983831851253e-05, "loss": 0.7796, "step": 28500 }, { "epoch": 3.8534087846941523, "grad_norm": 1.2434518337249756, "learning_rate": 9.633521961735381e-05, "loss": 0.7776, "step": 28600 }, { "epoch": 3.866882241983293, "grad_norm": 1.2230993509292603, "learning_rate": 9.667205604958232e-05, "loss": 0.7773, "step": 28700 }, { "epoch": 3.8803556992724335, "grad_norm": 1.184511661529541, "learning_rate": 9.700889248181083e-05, "loss": 0.7776, "step": 28800 }, { "epoch": 3.8938291565615737, "grad_norm": 1.2041497230529785, "learning_rate": 9.734572891403934e-05, "loss": 0.7778, "step": 28900 }, { "epoch": 3.907302613850714, "grad_norm": 1.1748751401901245, "learning_rate": 9.768256534626785e-05, "loss": 0.7747, "step": 29000 }, { "epoch": 3.9207760711398545, "grad_norm": 1.1405631303787231, "learning_rate": 9.801940177849636e-05, "loss": 0.776, "step": 29100 }, { "epoch": 3.934249528428995, "grad_norm": 1.0897303819656372, "learning_rate": 9.835623821072487e-05, "loss": 0.7742, "step": 29200 }, { "epoch": 3.9477229857181353, "grad_norm": 1.1475759744644165, "learning_rate": 9.869307464295338e-05, "loss": 0.7736, "step": 29300 }, { "epoch": 3.9611964430072755, "grad_norm": 1.137139916419983, "learning_rate": 9.90299110751819e-05, "loss": 0.7726, "step": 29400 }, { "epoch": 3.974669900296416, "grad_norm": 1.153213381767273, "learning_rate": 9.93667475074104e-05, "loss": 0.7713, "step": 29500 }, { "epoch": 3.9881433575855567, "grad_norm": 1.2605184316635132, "learning_rate": 9.97035839396389e-05, "loss": 0.7713, "step": 29600 }, { "epoch": 4.0, "eval_loss": 0.745492160320282, "eval_runtime": 4.9507, "eval_samples_per_second": 1009.949, "eval_steps_per_second": 15.957, "step": 29688 }, { "epoch": 4.001616814874697, "grad_norm": 1.185999870300293, "learning_rate": 0.00010004042037186741, "loss": 0.772, "step": 29700 }, { "epoch": 4.015090272163837, "grad_norm": 1.1159234046936035, "learning_rate": 0.00010037725680409592, "loss": 0.7719, "step": 29800 }, { "epoch": 4.028563729452977, "grad_norm": 1.26315438747406, "learning_rate": 0.00010071409323632443, "loss": 0.7696, "step": 29900 }, { "epoch": 4.042037186742118, "grad_norm": 1.2203866243362427, "learning_rate": 0.00010105092966855295, "loss": 0.7678, "step": 30000 }, { "epoch": 4.0555106440312585, "grad_norm": 1.1392083168029785, "learning_rate": 0.00010138776610078146, "loss": 0.7698, "step": 30100 }, { "epoch": 4.068984101320399, "grad_norm": 1.1184396743774414, "learning_rate": 0.00010172460253300997, "loss": 0.7683, "step": 30200 }, { "epoch": 4.082457558609539, "grad_norm": 1.236886739730835, "learning_rate": 0.00010206143896523848, "loss": 0.7669, "step": 30300 }, { "epoch": 4.09593101589868, "grad_norm": 1.1693611145019531, "learning_rate": 0.00010239827539746699, "loss": 0.765, "step": 30400 }, { "epoch": 4.10940447318782, "grad_norm": 1.2848975658416748, "learning_rate": 0.0001027351118296955, "loss": 0.7643, "step": 30500 }, { "epoch": 4.12287793047696, "grad_norm": 1.124135136604309, "learning_rate": 0.00010307194826192401, "loss": 0.7657, "step": 30600 }, { "epoch": 4.1363513877661005, "grad_norm": 1.1294893026351929, "learning_rate": 0.00010340878469415252, "loss": 0.7652, "step": 30700 }, { "epoch": 4.1498248450552415, "grad_norm": 1.1813340187072754, "learning_rate": 0.00010374562112638103, "loss": 0.7644, "step": 30800 }, { "epoch": 4.163298302344382, "grad_norm": 1.2368444204330444, "learning_rate": 0.00010408245755860954, "loss": 0.7647, "step": 30900 }, { "epoch": 4.176771759633522, "grad_norm": 1.1886606216430664, "learning_rate": 0.00010441929399083805, "loss": 0.7621, "step": 31000 }, { "epoch": 4.190245216922662, "grad_norm": 1.121414303779602, "learning_rate": 0.00010475613042306656, "loss": 0.7634, "step": 31100 }, { "epoch": 4.203718674211803, "grad_norm": 1.1746305227279663, "learning_rate": 0.00010509296685529507, "loss": 0.7615, "step": 31200 }, { "epoch": 4.217192131500943, "grad_norm": 1.2289410829544067, "learning_rate": 0.00010542980328752359, "loss": 0.7608, "step": 31300 }, { "epoch": 4.2306655887900835, "grad_norm": 1.0735752582550049, "learning_rate": 0.00010576663971975208, "loss": 0.7623, "step": 31400 }, { "epoch": 4.244139046079224, "grad_norm": 1.1223698854446411, "learning_rate": 0.0001061034761519806, "loss": 0.7611, "step": 31500 }, { "epoch": 4.257612503368365, "grad_norm": 1.0524061918258667, "learning_rate": 0.0001064403125842091, "loss": 0.76, "step": 31600 }, { "epoch": 4.271085960657505, "grad_norm": 1.1129183769226074, "learning_rate": 0.00010677714901643761, "loss": 0.7603, "step": 31700 }, { "epoch": 4.284559417946645, "grad_norm": 1.1210554838180542, "learning_rate": 0.00010711398544866613, "loss": 0.7586, "step": 31800 }, { "epoch": 4.298032875235785, "grad_norm": 1.1126420497894287, "learning_rate": 0.00010745082188089464, "loss": 0.7574, "step": 31900 }, { "epoch": 4.311506332524926, "grad_norm": 1.059046745300293, "learning_rate": 0.00010778765831312315, "loss": 0.7567, "step": 32000 }, { "epoch": 4.3249797898140665, "grad_norm": 1.1068280935287476, "learning_rate": 0.00010812449474535166, "loss": 0.7569, "step": 32100 }, { "epoch": 4.338453247103207, "grad_norm": 1.056628942489624, "learning_rate": 0.00010846133117758017, "loss": 0.7568, "step": 32200 }, { "epoch": 4.351926704392347, "grad_norm": 1.1537504196166992, "learning_rate": 0.00010879816760980868, "loss": 0.7561, "step": 32300 }, { "epoch": 4.365400161681487, "grad_norm": 1.2492071390151978, "learning_rate": 0.00010913500404203719, "loss": 0.7558, "step": 32400 }, { "epoch": 4.378873618970628, "grad_norm": 1.1514471769332886, "learning_rate": 0.0001094718404742657, "loss": 0.7538, "step": 32500 }, { "epoch": 4.392347076259768, "grad_norm": 1.123184084892273, "learning_rate": 0.00010980867690649421, "loss": 0.7543, "step": 32600 }, { "epoch": 4.4058205335489085, "grad_norm": 1.1592423915863037, "learning_rate": 0.00011014551333872272, "loss": 0.7518, "step": 32700 }, { "epoch": 4.419293990838049, "grad_norm": 1.0830798149108887, "learning_rate": 0.00011048234977095123, "loss": 0.7514, "step": 32800 }, { "epoch": 4.43276744812719, "grad_norm": 1.110124945640564, "learning_rate": 0.00011081918620317974, "loss": 0.7532, "step": 32900 }, { "epoch": 4.44624090541633, "grad_norm": 1.0757759809494019, "learning_rate": 0.00011115602263540825, "loss": 0.7528, "step": 33000 }, { "epoch": 4.45971436270547, "grad_norm": 1.0417282581329346, "learning_rate": 0.00011149285906763677, "loss": 0.7518, "step": 33100 }, { "epoch": 4.47318781999461, "grad_norm": 0.9674953818321228, "learning_rate": 0.00011182969549986526, "loss": 0.75, "step": 33200 }, { "epoch": 4.486661277283751, "grad_norm": 1.11676824092865, "learning_rate": 0.00011216653193209377, "loss": 0.7489, "step": 33300 }, { "epoch": 4.5001347345728915, "grad_norm": 1.1305615901947021, "learning_rate": 0.00011250336836432228, "loss": 0.7501, "step": 33400 }, { "epoch": 4.513608191862032, "grad_norm": 1.054303526878357, "learning_rate": 0.0001128402047965508, "loss": 0.7503, "step": 33500 }, { "epoch": 4.527081649151172, "grad_norm": 0.9846808910369873, "learning_rate": 0.0001131770412287793, "loss": 0.7475, "step": 33600 }, { "epoch": 4.540555106440313, "grad_norm": 1.0176594257354736, "learning_rate": 0.00011351387766100782, "loss": 0.7471, "step": 33700 }, { "epoch": 4.554028563729453, "grad_norm": 1.0512880086898804, "learning_rate": 0.00011385071409323633, "loss": 0.7466, "step": 33800 }, { "epoch": 4.567502021018593, "grad_norm": 1.0515183210372925, "learning_rate": 0.00011418755052546484, "loss": 0.7474, "step": 33900 }, { "epoch": 4.580975478307733, "grad_norm": 1.052741527557373, "learning_rate": 0.00011452438695769335, "loss": 0.7456, "step": 34000 }, { "epoch": 4.5944489355968745, "grad_norm": 1.0624927282333374, "learning_rate": 0.00011486122338992186, "loss": 0.7467, "step": 34100 }, { "epoch": 4.607922392886015, "grad_norm": 1.020132303237915, "learning_rate": 0.00011519805982215037, "loss": 0.7454, "step": 34200 }, { "epoch": 4.621395850175155, "grad_norm": 0.9999297857284546, "learning_rate": 0.00011553489625437888, "loss": 0.7458, "step": 34300 }, { "epoch": 4.634869307464295, "grad_norm": 0.980705976486206, "learning_rate": 0.00011587173268660739, "loss": 0.746, "step": 34400 }, { "epoch": 4.648342764753436, "grad_norm": 1.1061065196990967, "learning_rate": 0.0001162085691188359, "loss": 0.7439, "step": 34500 }, { "epoch": 4.661816222042576, "grad_norm": 0.9089061617851257, "learning_rate": 0.00011654540555106441, "loss": 0.7449, "step": 34600 }, { "epoch": 4.6752896793317165, "grad_norm": 1.0703544616699219, "learning_rate": 0.00011688224198329292, "loss": 0.7428, "step": 34700 }, { "epoch": 4.688763136620857, "grad_norm": 0.9740894436836243, "learning_rate": 0.00011721907841552144, "loss": 0.7423, "step": 34800 }, { "epoch": 4.702236593909998, "grad_norm": 0.9512559175491333, "learning_rate": 0.00011755591484774995, "loss": 0.7403, "step": 34900 }, { "epoch": 4.715710051199138, "grad_norm": 0.9666496515274048, "learning_rate": 0.00011789275127997844, "loss": 0.7405, "step": 35000 }, { "epoch": 4.729183508488278, "grad_norm": 1.0284311771392822, "learning_rate": 0.00011822958771220695, "loss": 0.741, "step": 35100 }, { "epoch": 4.742656965777418, "grad_norm": 1.0507092475891113, "learning_rate": 0.00011856642414443546, "loss": 0.7401, "step": 35200 }, { "epoch": 4.756130423066558, "grad_norm": 0.9864184856414795, "learning_rate": 0.00011890326057666398, "loss": 0.7399, "step": 35300 }, { "epoch": 4.7696038803556995, "grad_norm": 0.9916123747825623, "learning_rate": 0.00011924009700889249, "loss": 0.7396, "step": 35400 }, { "epoch": 4.78307733764484, "grad_norm": 0.9854558706283569, "learning_rate": 0.000119576933441121, "loss": 0.7367, "step": 35500 }, { "epoch": 4.79655079493398, "grad_norm": 0.9764343500137329, "learning_rate": 0.00011991376987334951, "loss": 0.7372, "step": 35600 }, { "epoch": 4.810024252223121, "grad_norm": 0.9600427746772766, "learning_rate": 0.00012025060630557802, "loss": 0.7371, "step": 35700 }, { "epoch": 4.823497709512261, "grad_norm": 0.9664416909217834, "learning_rate": 0.00012058744273780653, "loss": 0.7369, "step": 35800 }, { "epoch": 4.836971166801401, "grad_norm": 1.0120084285736084, "learning_rate": 0.00012092427917003504, "loss": 0.7374, "step": 35900 }, { "epoch": 4.8504446240905414, "grad_norm": 0.9682172536849976, "learning_rate": 0.00012126111560226355, "loss": 0.7353, "step": 36000 }, { "epoch": 4.863918081379682, "grad_norm": 1.123008131980896, "learning_rate": 0.00012159795203449206, "loss": 0.7361, "step": 36100 }, { "epoch": 4.877391538668823, "grad_norm": 1.079131841659546, "learning_rate": 0.00012193478846672057, "loss": 0.7353, "step": 36200 }, { "epoch": 4.890864995957963, "grad_norm": 0.8529148101806641, "learning_rate": 0.00012227162489894907, "loss": 0.7364, "step": 36300 }, { "epoch": 4.904338453247103, "grad_norm": 0.9336905479431152, "learning_rate": 0.00012260846133117757, "loss": 0.7344, "step": 36400 }, { "epoch": 4.917811910536243, "grad_norm": 1.0440037250518799, "learning_rate": 0.0001229452977634061, "loss": 0.7336, "step": 36500 }, { "epoch": 4.931285367825384, "grad_norm": 0.9409441351890564, "learning_rate": 0.0001232821341956346, "loss": 0.733, "step": 36600 }, { "epoch": 4.9447588251145245, "grad_norm": 0.9291223883628845, "learning_rate": 0.0001236189706278631, "loss": 0.7324, "step": 36700 }, { "epoch": 4.958232282403665, "grad_norm": 0.9350892901420593, "learning_rate": 0.0001239558070600916, "loss": 0.732, "step": 36800 }, { "epoch": 4.971705739692805, "grad_norm": 0.9372636079788208, "learning_rate": 0.00012429264349232013, "loss": 0.731, "step": 36900 }, { "epoch": 4.985179196981946, "grad_norm": 0.9152778387069702, "learning_rate": 0.00012462947992454863, "loss": 0.731, "step": 37000 }, { "epoch": 4.998652654271086, "grad_norm": 0.8974970579147339, "learning_rate": 0.00012496631635677716, "loss": 0.7296, "step": 37100 }, { "epoch": 5.0, "eval_loss": 0.7076295614242554, "eval_runtime": 4.9543, "eval_samples_per_second": 1009.215, "eval_steps_per_second": 15.946, "step": 37110 }, { "epoch": 5.012126111560226, "grad_norm": 0.9408003687858582, "learning_rate": 0.00012530315278900568, "loss": 0.7292, "step": 37200 }, { "epoch": 5.025599568849366, "grad_norm": 0.8918381333351135, "learning_rate": 0.00012563998922123418, "loss": 0.7297, "step": 37300 }, { "epoch": 5.0390730261385075, "grad_norm": 0.976574182510376, "learning_rate": 0.0001259768256534627, "loss": 0.7293, "step": 37400 }, { "epoch": 5.052546483427648, "grad_norm": 0.8967993259429932, "learning_rate": 0.0001263136620856912, "loss": 0.73, "step": 37500 }, { "epoch": 5.066019940716788, "grad_norm": 0.9340730309486389, "learning_rate": 0.00012665049851791972, "loss": 0.7277, "step": 37600 }, { "epoch": 5.079493398005928, "grad_norm": 0.9547955393791199, "learning_rate": 0.00012698733495014822, "loss": 0.7268, "step": 37700 }, { "epoch": 5.092966855295069, "grad_norm": 1.0452197790145874, "learning_rate": 0.00012732417138237672, "loss": 0.7265, "step": 37800 }, { "epoch": 5.106440312584209, "grad_norm": 1.036320447921753, "learning_rate": 0.00012766100781460524, "loss": 0.7271, "step": 37900 }, { "epoch": 5.1199137698733495, "grad_norm": 0.8826032280921936, "learning_rate": 0.00012799784424683374, "loss": 0.7253, "step": 38000 }, { "epoch": 5.13338722716249, "grad_norm": 0.9047442078590393, "learning_rate": 0.00012833468067906226, "loss": 0.7242, "step": 38100 }, { "epoch": 5.146860684451631, "grad_norm": 0.9214485883712769, "learning_rate": 0.00012867151711129076, "loss": 0.7263, "step": 38200 }, { "epoch": 5.160334141740771, "grad_norm": 0.9749125838279724, "learning_rate": 0.00012900835354351928, "loss": 0.7249, "step": 38300 }, { "epoch": 5.173807599029911, "grad_norm": 0.9060753583908081, "learning_rate": 0.00012934518997574778, "loss": 0.7236, "step": 38400 }, { "epoch": 5.187281056319051, "grad_norm": 0.8124890923500061, "learning_rate": 0.0001296820264079763, "loss": 0.7237, "step": 38500 }, { "epoch": 5.200754513608192, "grad_norm": 0.8791264295578003, "learning_rate": 0.0001300188628402048, "loss": 0.7234, "step": 38600 }, { "epoch": 5.2142279708973325, "grad_norm": 0.9152727127075195, "learning_rate": 0.00013035569927243333, "loss": 0.7225, "step": 38700 }, { "epoch": 5.227701428186473, "grad_norm": 0.8851249814033508, "learning_rate": 0.00013069253570466183, "loss": 0.7236, "step": 38800 }, { "epoch": 5.241174885475613, "grad_norm": 0.8641852140426636, "learning_rate": 0.00013102937213689035, "loss": 0.7231, "step": 38900 }, { "epoch": 5.254648342764753, "grad_norm": 0.8725935816764832, "learning_rate": 0.00013136620856911885, "loss": 0.7212, "step": 39000 }, { "epoch": 5.268121800053894, "grad_norm": 0.8939389586448669, "learning_rate": 0.00013170304500134737, "loss": 0.7224, "step": 39100 }, { "epoch": 5.281595257343034, "grad_norm": 0.9639482498168945, "learning_rate": 0.00013203988143357587, "loss": 0.7221, "step": 39200 }, { "epoch": 5.295068714632174, "grad_norm": 0.9055954217910767, "learning_rate": 0.0001323767178658044, "loss": 0.7217, "step": 39300 }, { "epoch": 5.308542171921315, "grad_norm": 0.913792610168457, "learning_rate": 0.0001327135542980329, "loss": 0.72, "step": 39400 }, { "epoch": 5.322015629210456, "grad_norm": 0.8808144330978394, "learning_rate": 0.0001330503907302614, "loss": 0.7206, "step": 39500 }, { "epoch": 5.335489086499596, "grad_norm": 0.8109551072120667, "learning_rate": 0.0001333872271624899, "loss": 0.7188, "step": 39600 }, { "epoch": 5.348962543788736, "grad_norm": 0.8155485987663269, "learning_rate": 0.0001337240635947184, "loss": 0.7196, "step": 39700 }, { "epoch": 5.362436001077876, "grad_norm": 0.8922649025917053, "learning_rate": 0.00013406090002694693, "loss": 0.7194, "step": 39800 }, { "epoch": 5.375909458367017, "grad_norm": 0.8298534750938416, "learning_rate": 0.00013439773645917543, "loss": 0.7193, "step": 39900 }, { "epoch": 5.3893829156561575, "grad_norm": 0.8020102381706238, "learning_rate": 0.00013473457289140395, "loss": 0.7181, "step": 40000 }, { "epoch": 5.402856372945298, "grad_norm": 0.8372957706451416, "learning_rate": 0.00013507140932363245, "loss": 0.7167, "step": 40100 }, { "epoch": 5.416329830234438, "grad_norm": 0.8687503933906555, "learning_rate": 0.00013540824575586098, "loss": 0.7169, "step": 40200 }, { "epoch": 5.429803287523579, "grad_norm": 0.8092111349105835, "learning_rate": 0.00013574508218808947, "loss": 0.7167, "step": 40300 }, { "epoch": 5.443276744812719, "grad_norm": 0.8147724866867065, "learning_rate": 0.00013608191862031797, "loss": 0.7177, "step": 40400 }, { "epoch": 5.456750202101859, "grad_norm": 0.8693458437919617, "learning_rate": 0.00013641875505254647, "loss": 0.7158, "step": 40500 }, { "epoch": 5.470223659390999, "grad_norm": 0.9576795101165771, "learning_rate": 0.000136755591484775, "loss": 0.7151, "step": 40600 }, { "epoch": 5.4836971166801405, "grad_norm": 0.7444314360618591, "learning_rate": 0.0001370924279170035, "loss": 0.7144, "step": 40700 }, { "epoch": 5.497170573969281, "grad_norm": 0.7642235159873962, "learning_rate": 0.000137429264349232, "loss": 0.7137, "step": 40800 }, { "epoch": 5.510644031258421, "grad_norm": 0.8059471249580383, "learning_rate": 0.0001377661007814605, "loss": 0.7147, "step": 40900 }, { "epoch": 5.524117488547561, "grad_norm": 0.8565640449523926, "learning_rate": 0.00013810293721368903, "loss": 0.7132, "step": 41000 }, { "epoch": 5.537590945836702, "grad_norm": 0.9605796337127686, "learning_rate": 0.00013843977364591753, "loss": 0.713, "step": 41100 }, { "epoch": 5.551064403125842, "grad_norm": 0.8143794536590576, "learning_rate": 0.00013877661007814603, "loss": 0.7131, "step": 41200 }, { "epoch": 5.564537860414982, "grad_norm": 0.8640410900115967, "learning_rate": 0.00013911344651037455, "loss": 0.713, "step": 41300 }, { "epoch": 5.578011317704123, "grad_norm": 0.9741572141647339, "learning_rate": 0.00013945028294260305, "loss": 0.7124, "step": 41400 }, { "epoch": 5.591484774993264, "grad_norm": 0.8128600716590881, "learning_rate": 0.00013978711937483158, "loss": 0.7107, "step": 41500 }, { "epoch": 5.604958232282404, "grad_norm": 0.832990288734436, "learning_rate": 0.00014012395580706007, "loss": 0.7114, "step": 41600 }, { "epoch": 5.618431689571544, "grad_norm": 0.8145182728767395, "learning_rate": 0.0001404607922392886, "loss": 0.7112, "step": 41700 }, { "epoch": 5.631905146860684, "grad_norm": 0.7523565292358398, "learning_rate": 0.0001407976286715171, "loss": 0.7108, "step": 41800 }, { "epoch": 5.645378604149824, "grad_norm": 0.7929809093475342, "learning_rate": 0.00014113446510374562, "loss": 0.7103, "step": 41900 }, { "epoch": 5.6588520614389655, "grad_norm": 0.8976119756698608, "learning_rate": 0.00014147130153597412, "loss": 0.7093, "step": 42000 }, { "epoch": 5.672325518728106, "grad_norm": 0.8686313629150391, "learning_rate": 0.00014180813796820264, "loss": 0.7094, "step": 42100 }, { "epoch": 5.685798976017246, "grad_norm": 0.8027270436286926, "learning_rate": 0.00014214497440043114, "loss": 0.7086, "step": 42200 }, { "epoch": 5.699272433306387, "grad_norm": 0.8768233060836792, "learning_rate": 0.00014248181083265966, "loss": 0.7082, "step": 42300 }, { "epoch": 5.712745890595527, "grad_norm": 0.7889737486839294, "learning_rate": 0.00014281864726488816, "loss": 0.7078, "step": 42400 }, { "epoch": 5.726219347884667, "grad_norm": 0.8199788331985474, "learning_rate": 0.00014315548369711668, "loss": 0.7068, "step": 42500 }, { "epoch": 5.739692805173807, "grad_norm": 0.8263741731643677, "learning_rate": 0.00014349232012934518, "loss": 0.7082, "step": 42600 }, { "epoch": 5.753166262462948, "grad_norm": 0.7950279116630554, "learning_rate": 0.0001438291565615737, "loss": 0.7059, "step": 42700 }, { "epoch": 5.766639719752089, "grad_norm": 0.7786477208137512, "learning_rate": 0.0001441659929938022, "loss": 0.7055, "step": 42800 }, { "epoch": 5.780113177041229, "grad_norm": 0.7392706274986267, "learning_rate": 0.00014450282942603073, "loss": 0.7045, "step": 42900 }, { "epoch": 5.793586634330369, "grad_norm": 0.8650729656219482, "learning_rate": 0.00014483966585825922, "loss": 0.7038, "step": 43000 }, { "epoch": 5.807060091619509, "grad_norm": 0.7785390019416809, "learning_rate": 0.00014517650229048772, "loss": 0.7031, "step": 43100 }, { "epoch": 5.82053354890865, "grad_norm": 0.7475181818008423, "learning_rate": 0.00014551333872271624, "loss": 0.7047, "step": 43200 }, { "epoch": 5.8340070061977904, "grad_norm": 0.8173367381095886, "learning_rate": 0.00014585017515494474, "loss": 0.7045, "step": 43300 }, { "epoch": 5.847480463486931, "grad_norm": 0.7598131895065308, "learning_rate": 0.00014618701158717327, "loss": 0.7027, "step": 43400 }, { "epoch": 5.860953920776071, "grad_norm": 0.754601776599884, "learning_rate": 0.00014652384801940176, "loss": 0.7029, "step": 43500 }, { "epoch": 5.874427378065212, "grad_norm": 0.7667624354362488, "learning_rate": 0.0001468606844516303, "loss": 0.7021, "step": 43600 }, { "epoch": 5.887900835354352, "grad_norm": 0.7558141350746155, "learning_rate": 0.00014719752088385878, "loss": 0.7021, "step": 43700 }, { "epoch": 5.901374292643492, "grad_norm": 0.8691041469573975, "learning_rate": 0.0001475343573160873, "loss": 0.7014, "step": 43800 }, { "epoch": 5.914847749932632, "grad_norm": 0.8230217099189758, "learning_rate": 0.0001478711937483158, "loss": 0.7, "step": 43900 }, { "epoch": 5.9283212072217735, "grad_norm": 0.7664693593978882, "learning_rate": 0.00014820803018054433, "loss": 0.6995, "step": 44000 }, { "epoch": 5.941794664510914, "grad_norm": 0.7028418183326721, "learning_rate": 0.00014854486661277283, "loss": 0.699, "step": 44100 }, { "epoch": 5.955268121800054, "grad_norm": 0.7044640183448792, "learning_rate": 0.00014888170304500135, "loss": 0.6992, "step": 44200 }, { "epoch": 5.968741579089194, "grad_norm": 0.7698266506195068, "learning_rate": 0.00014921853947722985, "loss": 0.6973, "step": 44300 }, { "epoch": 5.982215036378335, "grad_norm": 0.7782478928565979, "learning_rate": 0.00014955537590945837, "loss": 0.6967, "step": 44400 }, { "epoch": 5.995688493667475, "grad_norm": 0.7759624719619751, "learning_rate": 0.00014989221234168687, "loss": 0.6984, "step": 44500 }, { "epoch": 6.0, "eval_loss": 0.6768075823783875, "eval_runtime": 4.9494, "eval_samples_per_second": 1010.215, "eval_steps_per_second": 15.961, "step": 44532 }, { "epoch": 6.009161950956615, "grad_norm": 0.8147016763687134, "learning_rate": 0.0001502290487739154, "loss": 0.6961, "step": 44600 }, { "epoch": 6.022635408245756, "grad_norm": 0.7870176434516907, "learning_rate": 0.0001505658852061439, "loss": 0.6962, "step": 44700 }, { "epoch": 6.036108865534897, "grad_norm": 0.8365423083305359, "learning_rate": 0.0001509027216383724, "loss": 0.6955, "step": 44800 }, { "epoch": 6.049582322824037, "grad_norm": 0.696557879447937, "learning_rate": 0.00015123955807060091, "loss": 0.6944, "step": 44900 }, { "epoch": 6.063055780113177, "grad_norm": 0.7576380372047424, "learning_rate": 0.0001515763945028294, "loss": 0.6948, "step": 45000 }, { "epoch": 6.076529237402317, "grad_norm": 0.7760909795761108, "learning_rate": 0.00015191323093505794, "loss": 0.695, "step": 45100 }, { "epoch": 6.090002694691458, "grad_norm": 0.8410043120384216, "learning_rate": 0.00015225006736728643, "loss": 0.693, "step": 45200 }, { "epoch": 6.1034761519805985, "grad_norm": 0.9241583347320557, "learning_rate": 0.00015258690379951496, "loss": 0.6927, "step": 45300 }, { "epoch": 6.116949609269739, "grad_norm": 0.7018818855285645, "learning_rate": 0.00015292374023174345, "loss": 0.6921, "step": 45400 }, { "epoch": 6.130423066558879, "grad_norm": 0.7426802515983582, "learning_rate": 0.00015326057666397198, "loss": 0.6928, "step": 45500 }, { "epoch": 6.143896523848019, "grad_norm": 0.710498571395874, "learning_rate": 0.00015359741309620048, "loss": 0.6925, "step": 45600 }, { "epoch": 6.15736998113716, "grad_norm": 0.6912098526954651, "learning_rate": 0.000153934249528429, "loss": 0.6925, "step": 45700 }, { "epoch": 6.1708434384263, "grad_norm": 0.7314077615737915, "learning_rate": 0.0001542710859606575, "loss": 0.6898, "step": 45800 }, { "epoch": 6.18431689571544, "grad_norm": 0.7480527758598328, "learning_rate": 0.00015460792239288602, "loss": 0.6905, "step": 45900 }, { "epoch": 6.197790353004581, "grad_norm": 0.8154057264328003, "learning_rate": 0.00015494475882511452, "loss": 0.6921, "step": 46000 }, { "epoch": 6.211263810293722, "grad_norm": 0.7706659436225891, "learning_rate": 0.00015528159525734304, "loss": 0.6908, "step": 46100 }, { "epoch": 6.224737267582862, "grad_norm": 0.7643518447875977, "learning_rate": 0.00015561843168957154, "loss": 0.6898, "step": 46200 }, { "epoch": 6.238210724872002, "grad_norm": 0.7190268635749817, "learning_rate": 0.00015595526812180006, "loss": 0.6885, "step": 46300 }, { "epoch": 6.251684182161142, "grad_norm": 0.8116057515144348, "learning_rate": 0.00015629210455402856, "loss": 0.6879, "step": 46400 }, { "epoch": 6.265157639450283, "grad_norm": 0.6882048845291138, "learning_rate": 0.00015662894098625709, "loss": 0.6886, "step": 46500 }, { "epoch": 6.278631096739423, "grad_norm": 0.8105291128158569, "learning_rate": 0.00015696577741848558, "loss": 0.689, "step": 46600 }, { "epoch": 6.292104554028564, "grad_norm": 0.7206712961196899, "learning_rate": 0.00015730261385071408, "loss": 0.6883, "step": 46700 }, { "epoch": 6.305578011317704, "grad_norm": 0.8075223565101624, "learning_rate": 0.0001576394502829426, "loss": 0.6886, "step": 46800 }, { "epoch": 6.319051468606845, "grad_norm": 0.7896627187728882, "learning_rate": 0.0001579762867151711, "loss": 0.686, "step": 46900 }, { "epoch": 6.332524925895985, "grad_norm": 0.7512733936309814, "learning_rate": 0.00015831312314739963, "loss": 0.6868, "step": 47000 }, { "epoch": 6.345998383185125, "grad_norm": 0.7456969022750854, "learning_rate": 0.00015864995957962812, "loss": 0.6859, "step": 47100 }, { "epoch": 6.359471840474265, "grad_norm": 0.7805710434913635, "learning_rate": 0.00015898679601185665, "loss": 0.6859, "step": 47200 }, { "epoch": 6.3729452977634065, "grad_norm": 0.7076267004013062, "learning_rate": 0.00015932363244408515, "loss": 0.6855, "step": 47300 }, { "epoch": 6.386418755052547, "grad_norm": 0.6913865804672241, "learning_rate": 0.00015966046887631367, "loss": 0.6835, "step": 47400 }, { "epoch": 6.399892212341687, "grad_norm": 0.7496386170387268, "learning_rate": 0.00015999730530854217, "loss": 0.6848, "step": 47500 }, { "epoch": 6.413365669630827, "grad_norm": 0.6829120516777039, "learning_rate": 0.0001603341417407707, "loss": 0.6846, "step": 47600 }, { "epoch": 6.426839126919968, "grad_norm": 0.6701867580413818, "learning_rate": 0.0001606709781729992, "loss": 0.6823, "step": 47700 }, { "epoch": 6.440312584209108, "grad_norm": 0.6849319338798523, "learning_rate": 0.0001610078146052277, "loss": 0.6831, "step": 47800 }, { "epoch": 6.453786041498248, "grad_norm": 0.6918026804924011, "learning_rate": 0.0001613446510374562, "loss": 0.6826, "step": 47900 }, { "epoch": 6.467259498787389, "grad_norm": 0.7458677291870117, "learning_rate": 0.00016168148746968473, "loss": 0.6827, "step": 48000 }, { "epoch": 6.48073295607653, "grad_norm": 0.7171884775161743, "learning_rate": 0.00016201832390191323, "loss": 0.6828, "step": 48100 }, { "epoch": 6.49420641336567, "grad_norm": 0.7932873368263245, "learning_rate": 0.00016235516033414176, "loss": 0.6816, "step": 48200 }, { "epoch": 6.50767987065481, "grad_norm": 0.7039599418640137, "learning_rate": 0.00016269199676637025, "loss": 0.6815, "step": 48300 }, { "epoch": 6.52115332794395, "grad_norm": 0.6816318035125732, "learning_rate": 0.00016302883319859878, "loss": 0.6811, "step": 48400 }, { "epoch": 6.53462678523309, "grad_norm": 0.6453189849853516, "learning_rate": 0.00016336566963082727, "loss": 0.6812, "step": 48500 }, { "epoch": 6.548100242522231, "grad_norm": 0.657359778881073, "learning_rate": 0.00016370250606305577, "loss": 0.6812, "step": 48600 }, { "epoch": 6.561573699811372, "grad_norm": 0.7323804497718811, "learning_rate": 0.0001640393424952843, "loss": 0.6807, "step": 48700 }, { "epoch": 6.575047157100512, "grad_norm": 0.6694400310516357, "learning_rate": 0.0001643761789275128, "loss": 0.6796, "step": 48800 }, { "epoch": 6.588520614389653, "grad_norm": 0.6849894523620605, "learning_rate": 0.00016471301535974132, "loss": 0.6781, "step": 48900 }, { "epoch": 6.601994071678793, "grad_norm": 0.7223933339118958, "learning_rate": 0.00016504985179196981, "loss": 0.6795, "step": 49000 }, { "epoch": 6.615467528967933, "grad_norm": 0.6698912978172302, "learning_rate": 0.00016538668822419834, "loss": 0.6797, "step": 49100 }, { "epoch": 6.628940986257073, "grad_norm": 0.725713849067688, "learning_rate": 0.00016572352465642684, "loss": 0.6783, "step": 49200 }, { "epoch": 6.642414443546214, "grad_norm": 0.6652211546897888, "learning_rate": 0.00016606036108865536, "loss": 0.6762, "step": 49300 }, { "epoch": 6.655887900835355, "grad_norm": 0.6882320642471313, "learning_rate": 0.00016639719752088386, "loss": 0.6762, "step": 49400 }, { "epoch": 6.669361358124495, "grad_norm": 0.6172476410865784, "learning_rate": 0.00016673403395311238, "loss": 0.6763, "step": 49500 }, { "epoch": 6.682834815413635, "grad_norm": 0.7008011341094971, "learning_rate": 0.00016707087038534088, "loss": 0.6763, "step": 49600 }, { "epoch": 6.696308272702775, "grad_norm": 0.6852391362190247, "learning_rate": 0.0001674077068175694, "loss": 0.6767, "step": 49700 }, { "epoch": 6.709781729991916, "grad_norm": 0.7000615000724792, "learning_rate": 0.0001677445432497979, "loss": 0.6769, "step": 49800 }, { "epoch": 6.723255187281056, "grad_norm": 0.6721837520599365, "learning_rate": 0.00016808137968202643, "loss": 0.6759, "step": 49900 }, { "epoch": 6.736728644570197, "grad_norm": 0.6796983480453491, "learning_rate": 0.00016841821611425492, "loss": 0.6762, "step": 50000 }, { "epoch": 6.750202101859337, "grad_norm": 0.6687979102134705, "learning_rate": 0.00016875505254648345, "loss": 0.6747, "step": 50100 }, { "epoch": 6.763675559148478, "grad_norm": 0.695571780204773, "learning_rate": 0.00016909188897871194, "loss": 0.674, "step": 50200 }, { "epoch": 6.777149016437618, "grad_norm": 0.6038774847984314, "learning_rate": 0.00016942872541094044, "loss": 0.6723, "step": 50300 }, { "epoch": 6.790622473726758, "grad_norm": 0.6579914689064026, "learning_rate": 0.00016976556184316897, "loss": 0.674, "step": 50400 }, { "epoch": 6.804095931015898, "grad_norm": 0.6430501341819763, "learning_rate": 0.00017010239827539746, "loss": 0.6731, "step": 50500 }, { "epoch": 6.8175693883050394, "grad_norm": 0.6879277229309082, "learning_rate": 0.000170439234707626, "loss": 0.6734, "step": 50600 }, { "epoch": 6.83104284559418, "grad_norm": 0.6557313203811646, "learning_rate": 0.00017077607113985448, "loss": 0.674, "step": 50700 }, { "epoch": 6.84451630288332, "grad_norm": 0.6209240555763245, "learning_rate": 0.000171112907572083, "loss": 0.6737, "step": 50800 }, { "epoch": 6.85798976017246, "grad_norm": 0.6733800768852234, "learning_rate": 0.0001714497440043115, "loss": 0.6729, "step": 50900 }, { "epoch": 6.871463217461601, "grad_norm": 0.6506111025810242, "learning_rate": 0.00017178658043654003, "loss": 0.6727, "step": 51000 }, { "epoch": 6.884936674750741, "grad_norm": 0.6569960117340088, "learning_rate": 0.00017212341686876853, "loss": 0.6719, "step": 51100 }, { "epoch": 6.898410132039881, "grad_norm": 0.6665491461753845, "learning_rate": 0.00017246025330099705, "loss": 0.6716, "step": 51200 }, { "epoch": 6.911883589329022, "grad_norm": 0.6506064534187317, "learning_rate": 0.00017279708973322555, "loss": 0.6701, "step": 51300 }, { "epoch": 6.925357046618162, "grad_norm": 0.6387166976928711, "learning_rate": 0.00017313392616545407, "loss": 0.6711, "step": 51400 }, { "epoch": 6.938830503907303, "grad_norm": 0.6348710060119629, "learning_rate": 0.00017347076259768257, "loss": 0.6707, "step": 51500 }, { "epoch": 6.952303961196443, "grad_norm": 0.6120295524597168, "learning_rate": 0.0001738075990299111, "loss": 0.6692, "step": 51600 }, { "epoch": 6.965777418485583, "grad_norm": 0.6019930839538574, "learning_rate": 0.0001741444354621396, "loss": 0.6695, "step": 51700 }, { "epoch": 6.979250875774724, "grad_norm": 0.585022509098053, "learning_rate": 0.00017448127189436812, "loss": 0.6705, "step": 51800 }, { "epoch": 6.992724333063864, "grad_norm": 0.6564197540283203, "learning_rate": 0.0001748181083265966, "loss": 0.6691, "step": 51900 }, { "epoch": 7.0, "eval_loss": 0.6510282158851624, "eval_runtime": 4.9606, "eval_samples_per_second": 1007.933, "eval_steps_per_second": 15.925, "step": 51954 }, { "epoch": 7.006197790353005, "grad_norm": 0.656204879283905, "learning_rate": 0.00017515494475882514, "loss": 0.6686, "step": 52000 }, { "epoch": 7.019671247642145, "grad_norm": 0.6207842230796814, "learning_rate": 0.00017549178119105364, "loss": 0.6682, "step": 52100 }, { "epoch": 7.033144704931285, "grad_norm": 0.652436375617981, "learning_rate": 0.00017582861762328213, "loss": 0.668, "step": 52200 }, { "epoch": 7.046618162220426, "grad_norm": 0.585910975933075, "learning_rate": 0.00017616545405551066, "loss": 0.6661, "step": 52300 }, { "epoch": 7.060091619509566, "grad_norm": 0.5869996547698975, "learning_rate": 0.00017650229048773915, "loss": 0.6673, "step": 52400 }, { "epoch": 7.073565076798706, "grad_norm": 0.5738584995269775, "learning_rate": 0.00017683912691996768, "loss": 0.6674, "step": 52500 }, { "epoch": 7.087038534087847, "grad_norm": 0.6454165577888489, "learning_rate": 0.00017717596335219618, "loss": 0.6662, "step": 52600 }, { "epoch": 7.100511991376988, "grad_norm": 0.581243634223938, "learning_rate": 0.0001775127997844247, "loss": 0.6669, "step": 52700 }, { "epoch": 7.113985448666128, "grad_norm": 0.6395180821418762, "learning_rate": 0.0001778496362166532, "loss": 0.6668, "step": 52800 }, { "epoch": 7.127458905955268, "grad_norm": 0.6187218427658081, "learning_rate": 0.00017818647264888172, "loss": 0.6661, "step": 52900 }, { "epoch": 7.140932363244408, "grad_norm": 0.6004672646522522, "learning_rate": 0.00017852330908111022, "loss": 0.6656, "step": 53000 }, { "epoch": 7.154405820533549, "grad_norm": 0.6465744972229004, "learning_rate": 0.00017886014551333874, "loss": 0.6662, "step": 53100 }, { "epoch": 7.167879277822689, "grad_norm": 0.5654754042625427, "learning_rate": 0.00017919698194556724, "loss": 0.6648, "step": 53200 }, { "epoch": 7.18135273511183, "grad_norm": 0.5796323418617249, "learning_rate": 0.00017953381837779576, "loss": 0.6651, "step": 53300 }, { "epoch": 7.19482619240097, "grad_norm": 0.6140373349189758, "learning_rate": 0.00017987065481002426, "loss": 0.6642, "step": 53400 }, { "epoch": 7.208299649690111, "grad_norm": 0.6188362240791321, "learning_rate": 0.00018020749124225279, "loss": 0.6638, "step": 53500 }, { "epoch": 7.221773106979251, "grad_norm": 0.6651822924613953, "learning_rate": 0.00018054432767448128, "loss": 0.6638, "step": 53600 }, { "epoch": 7.235246564268391, "grad_norm": 0.6173692941665649, "learning_rate": 0.0001808811641067098, "loss": 0.6635, "step": 53700 }, { "epoch": 7.248720021557531, "grad_norm": 0.6754614114761353, "learning_rate": 0.0001812180005389383, "loss": 0.6632, "step": 53800 }, { "epoch": 7.262193478846672, "grad_norm": 0.6550272703170776, "learning_rate": 0.0001815548369711668, "loss": 0.663, "step": 53900 }, { "epoch": 7.275666936135813, "grad_norm": 0.5743744969367981, "learning_rate": 0.00018189167340339533, "loss": 0.6628, "step": 54000 }, { "epoch": 7.289140393424953, "grad_norm": 0.6050006747245789, "learning_rate": 0.00018222850983562382, "loss": 0.662, "step": 54100 }, { "epoch": 7.302613850714093, "grad_norm": 0.650579035282135, "learning_rate": 0.00018256534626785235, "loss": 0.6625, "step": 54200 }, { "epoch": 7.316087308003234, "grad_norm": 0.6060268878936768, "learning_rate": 0.00018290218270008084, "loss": 0.6632, "step": 54300 }, { "epoch": 7.329560765292374, "grad_norm": 0.6309126615524292, "learning_rate": 0.00018323901913230937, "loss": 0.6621, "step": 54400 }, { "epoch": 7.343034222581514, "grad_norm": 0.6029520034790039, "learning_rate": 0.00018357585556453787, "loss": 0.6616, "step": 54500 }, { "epoch": 7.356507679870655, "grad_norm": 0.587424099445343, "learning_rate": 0.0001839126919967664, "loss": 0.6598, "step": 54600 }, { "epoch": 7.369981137159796, "grad_norm": 0.6058446764945984, "learning_rate": 0.0001842495284289949, "loss": 0.6607, "step": 54700 }, { "epoch": 7.383454594448936, "grad_norm": 0.581112265586853, "learning_rate": 0.0001845863648612234, "loss": 0.6595, "step": 54800 }, { "epoch": 7.396928051738076, "grad_norm": 0.5640543103218079, "learning_rate": 0.0001849232012934519, "loss": 0.6596, "step": 54900 }, { "epoch": 7.410401509027216, "grad_norm": 0.539020836353302, "learning_rate": 0.00018526003772568043, "loss": 0.6593, "step": 55000 }, { "epoch": 7.423874966316356, "grad_norm": 0.5788360238075256, "learning_rate": 0.00018559687415790893, "loss": 0.6599, "step": 55100 }, { "epoch": 7.437348423605497, "grad_norm": 0.602780282497406, "learning_rate": 0.00018593371059013746, "loss": 0.66, "step": 55200 }, { "epoch": 7.450821880894638, "grad_norm": 0.575023889541626, "learning_rate": 0.00018627054702236595, "loss": 0.6591, "step": 55300 }, { "epoch": 7.464295338183778, "grad_norm": 0.5739133358001709, "learning_rate": 0.00018660738345459448, "loss": 0.6593, "step": 55400 }, { "epoch": 7.477768795472918, "grad_norm": 0.5594607591629028, "learning_rate": 0.00018694421988682297, "loss": 0.6585, "step": 55500 }, { "epoch": 7.491242252762059, "grad_norm": 0.5811671614646912, "learning_rate": 0.0001872810563190515, "loss": 0.6592, "step": 55600 }, { "epoch": 7.504715710051199, "grad_norm": 0.5692673325538635, "learning_rate": 0.00018761789275128, "loss": 0.658, "step": 55700 }, { "epoch": 7.518189167340339, "grad_norm": 0.6082265973091125, "learning_rate": 0.0001879547291835085, "loss": 0.6579, "step": 55800 }, { "epoch": 7.5316626246294796, "grad_norm": 0.6443072557449341, "learning_rate": 0.00018829156561573702, "loss": 0.6561, "step": 55900 }, { "epoch": 7.545136081918621, "grad_norm": 0.6234450936317444, "learning_rate": 0.00018862840204796551, "loss": 0.6565, "step": 56000 }, { "epoch": 7.558609539207761, "grad_norm": 0.5859986543655396, "learning_rate": 0.00018896523848019404, "loss": 0.6577, "step": 56100 }, { "epoch": 7.572082996496901, "grad_norm": 0.569501519203186, "learning_rate": 0.00018930207491242254, "loss": 0.6568, "step": 56200 }, { "epoch": 7.585556453786041, "grad_norm": 0.5712246298789978, "learning_rate": 0.00018963891134465106, "loss": 0.6563, "step": 56300 }, { "epoch": 7.599029911075182, "grad_norm": 0.5680332183837891, "learning_rate": 0.00018997574777687956, "loss": 0.656, "step": 56400 }, { "epoch": 7.612503368364322, "grad_norm": 0.5614754557609558, "learning_rate": 0.00019031258420910805, "loss": 0.6569, "step": 56500 }, { "epoch": 7.625976825653463, "grad_norm": 0.5745932459831238, "learning_rate": 0.00019064942064133655, "loss": 0.6549, "step": 56600 }, { "epoch": 7.639450282942603, "grad_norm": 0.5797784328460693, "learning_rate": 0.00019098625707356508, "loss": 0.6554, "step": 56700 }, { "epoch": 7.652923740231744, "grad_norm": 0.581120491027832, "learning_rate": 0.00019132309350579357, "loss": 0.6564, "step": 56800 }, { "epoch": 7.666397197520884, "grad_norm": 0.6504297852516174, "learning_rate": 0.0001916599299380221, "loss": 0.6548, "step": 56900 }, { "epoch": 7.679870654810024, "grad_norm": 0.6237537860870361, "learning_rate": 0.0001919967663702506, "loss": 0.6553, "step": 57000 }, { "epoch": 7.693344112099164, "grad_norm": 0.5756508111953735, "learning_rate": 0.00019233360280247912, "loss": 0.6549, "step": 57100 }, { "epoch": 7.706817569388305, "grad_norm": 0.5695337057113647, "learning_rate": 0.00019267043923470762, "loss": 0.6546, "step": 57200 }, { "epoch": 7.720291026677446, "grad_norm": 0.5604103803634644, "learning_rate": 0.00019300727566693614, "loss": 0.654, "step": 57300 }, { "epoch": 7.733764483966586, "grad_norm": 0.651355504989624, "learning_rate": 0.00019334411209916464, "loss": 0.6543, "step": 57400 }, { "epoch": 7.747237941255726, "grad_norm": 0.5530677437782288, "learning_rate": 0.00019368094853139314, "loss": 0.6526, "step": 57500 }, { "epoch": 7.760711398544867, "grad_norm": 0.5971905589103699, "learning_rate": 0.00019401778496362166, "loss": 0.6539, "step": 57600 }, { "epoch": 7.774184855834007, "grad_norm": 0.5190899968147278, "learning_rate": 0.00019435462139585016, "loss": 0.6524, "step": 57700 }, { "epoch": 7.787658313123147, "grad_norm": 0.522835910320282, "learning_rate": 0.00019469145782807868, "loss": 0.6522, "step": 57800 }, { "epoch": 7.801131770412288, "grad_norm": 0.5731785297393799, "learning_rate": 0.00019502829426030718, "loss": 0.6523, "step": 57900 }, { "epoch": 7.814605227701428, "grad_norm": 0.5887252688407898, "learning_rate": 0.0001953651306925357, "loss": 0.6523, "step": 58000 }, { "epoch": 7.828078684990569, "grad_norm": 0.5819473266601562, "learning_rate": 0.0001957019671247642, "loss": 0.6523, "step": 58100 }, { "epoch": 7.841552142279709, "grad_norm": 0.6126819252967834, "learning_rate": 0.00019603880355699272, "loss": 0.6503, "step": 58200 }, { "epoch": 7.855025599568849, "grad_norm": 0.5561129450798035, "learning_rate": 0.00019637563998922122, "loss": 0.6517, "step": 58300 }, { "epoch": 7.86849905685799, "grad_norm": 0.5428786873817444, "learning_rate": 0.00019671247642144975, "loss": 0.6504, "step": 58400 }, { "epoch": 7.88197251414713, "grad_norm": 0.5454368591308594, "learning_rate": 0.00019704931285367824, "loss": 0.6503, "step": 58500 }, { "epoch": 7.895445971436271, "grad_norm": 0.5166246294975281, "learning_rate": 0.00019738614928590677, "loss": 0.6499, "step": 58600 }, { "epoch": 7.908919428725411, "grad_norm": 0.7034215331077576, "learning_rate": 0.00019772298571813526, "loss": 0.6504, "step": 58700 }, { "epoch": 7.922392886014551, "grad_norm": 0.5037605166435242, "learning_rate": 0.0001980598221503638, "loss": 0.6515, "step": 58800 }, { "epoch": 7.935866343303692, "grad_norm": 0.5653728246688843, "learning_rate": 0.00019839665858259229, "loss": 0.6504, "step": 58900 }, { "epoch": 7.949339800592832, "grad_norm": 0.5996730923652649, "learning_rate": 0.0001987334950148208, "loss": 0.6497, "step": 59000 }, { "epoch": 7.962813257881972, "grad_norm": 0.5440583229064941, "learning_rate": 0.0001990703314470493, "loss": 0.6497, "step": 59100 }, { "epoch": 7.9762867151711125, "grad_norm": 0.5644957423210144, "learning_rate": 0.0001994071678792778, "loss": 0.651, "step": 59200 }, { "epoch": 7.989760172460254, "grad_norm": 0.6108840703964233, "learning_rate": 0.00019974400431150633, "loss": 0.6496, "step": 59300 }, { "epoch": 8.0, "eval_loss": 0.6295620203018188, "eval_runtime": 4.9313, "eval_samples_per_second": 1013.925, "eval_steps_per_second": 16.02, "step": 59376 }, { "epoch": 8.003233629749394, "grad_norm": 0.5037506222724915, "learning_rate": 0.00020008084074373483, "loss": 0.648, "step": 59400 }, { "epoch": 8.016707087038535, "grad_norm": 0.5116773247718811, "learning_rate": 0.00020041767717596335, "loss": 0.6464, "step": 59500 }, { "epoch": 8.030180544327674, "grad_norm": 0.5819870829582214, "learning_rate": 0.00020075451360819185, "loss": 0.6479, "step": 59600 }, { "epoch": 8.043654001616815, "grad_norm": 0.4798097312450409, "learning_rate": 0.00020109135004042037, "loss": 0.6463, "step": 59700 }, { "epoch": 8.057127458905954, "grad_norm": 0.5755693912506104, "learning_rate": 0.00020142818647264887, "loss": 0.6466, "step": 59800 }, { "epoch": 8.070600916195096, "grad_norm": 0.4981718063354492, "learning_rate": 0.0002017650229048774, "loss": 0.6474, "step": 59900 }, { "epoch": 8.084074373484237, "grad_norm": 0.5067675113677979, "learning_rate": 0.0002021018593371059, "loss": 0.6457, "step": 60000 }, { "epoch": 8.097547830773376, "grad_norm": 0.5034761428833008, "learning_rate": 0.00020243869576933442, "loss": 0.6464, "step": 60100 }, { "epoch": 8.111021288062517, "grad_norm": 0.5190382599830627, "learning_rate": 0.0002027755322015629, "loss": 0.6452, "step": 60200 }, { "epoch": 8.124494745351658, "grad_norm": 0.4988050162792206, "learning_rate": 0.00020311236863379144, "loss": 0.6467, "step": 60300 }, { "epoch": 8.137968202640797, "grad_norm": 0.5221449136734009, "learning_rate": 0.00020344920506601993, "loss": 0.6466, "step": 60400 }, { "epoch": 8.151441659929938, "grad_norm": 0.5368093848228455, "learning_rate": 0.00020378604149824846, "loss": 0.6471, "step": 60500 }, { "epoch": 8.164915117219078, "grad_norm": 0.512754499912262, "learning_rate": 0.00020412287793047696, "loss": 0.6454, "step": 60600 }, { "epoch": 8.178388574508219, "grad_norm": 0.5929804444313049, "learning_rate": 0.00020445971436270548, "loss": 0.6459, "step": 60700 }, { "epoch": 8.19186203179736, "grad_norm": 0.5221571326255798, "learning_rate": 0.00020479655079493398, "loss": 0.6454, "step": 60800 }, { "epoch": 8.2053354890865, "grad_norm": 0.49967026710510254, "learning_rate": 0.0002051333872271625, "loss": 0.6453, "step": 60900 }, { "epoch": 8.21880894637564, "grad_norm": 0.5121259689331055, "learning_rate": 0.000205470223659391, "loss": 0.6457, "step": 61000 }, { "epoch": 8.232282403664781, "grad_norm": 0.4606134593486786, "learning_rate": 0.0002058070600916195, "loss": 0.6454, "step": 61100 }, { "epoch": 8.24575586095392, "grad_norm": 0.4797939360141754, "learning_rate": 0.00020614389652384802, "loss": 0.6452, "step": 61200 }, { "epoch": 8.259229318243062, "grad_norm": 0.5148810148239136, "learning_rate": 0.00020648073295607652, "loss": 0.6454, "step": 61300 }, { "epoch": 8.272702775532201, "grad_norm": 0.48415476083755493, "learning_rate": 0.00020681756938830504, "loss": 0.643, "step": 61400 }, { "epoch": 8.286176232821342, "grad_norm": 0.4854748249053955, "learning_rate": 0.00020715440582053354, "loss": 0.6435, "step": 61500 }, { "epoch": 8.299649690110483, "grad_norm": 0.5731302499771118, "learning_rate": 0.00020749124225276206, "loss": 0.6443, "step": 61600 }, { "epoch": 8.313123147399622, "grad_norm": 0.5286055207252502, "learning_rate": 0.00020782807868499056, "loss": 0.6428, "step": 61700 }, { "epoch": 8.326596604688763, "grad_norm": 0.45164167881011963, "learning_rate": 0.00020816491511721908, "loss": 0.6418, "step": 61800 }, { "epoch": 8.340070061977903, "grad_norm": 0.5031310319900513, "learning_rate": 0.00020850175154944758, "loss": 0.6414, "step": 61900 }, { "epoch": 8.353543519267044, "grad_norm": 0.469394326210022, "learning_rate": 0.0002088385879816761, "loss": 0.6435, "step": 62000 }, { "epoch": 8.367016976556185, "grad_norm": 0.48615971207618713, "learning_rate": 0.0002091754244139046, "loss": 0.6438, "step": 62100 }, { "epoch": 8.380490433845324, "grad_norm": 0.49986782670021057, "learning_rate": 0.00020951226084613313, "loss": 0.6425, "step": 62200 }, { "epoch": 8.393963891134465, "grad_norm": 0.5694671273231506, "learning_rate": 0.00020984909727836162, "loss": 0.642, "step": 62300 }, { "epoch": 8.407437348423606, "grad_norm": 0.5613475441932678, "learning_rate": 0.00021018593371059015, "loss": 0.6407, "step": 62400 }, { "epoch": 8.420910805712746, "grad_norm": 0.4828580617904663, "learning_rate": 0.00021052277014281865, "loss": 0.6408, "step": 62500 }, { "epoch": 8.434384263001887, "grad_norm": 0.4758937954902649, "learning_rate": 0.00021085960657504717, "loss": 0.6419, "step": 62600 }, { "epoch": 8.447857720291026, "grad_norm": 0.500779926776886, "learning_rate": 0.00021119644300727567, "loss": 0.6406, "step": 62700 }, { "epoch": 8.461331177580167, "grad_norm": 0.5079861283302307, "learning_rate": 0.00021153327943950417, "loss": 0.6416, "step": 62800 }, { "epoch": 8.474804634869308, "grad_norm": 0.5049075484275818, "learning_rate": 0.0002118701158717327, "loss": 0.6416, "step": 62900 }, { "epoch": 8.488278092158447, "grad_norm": 0.48694440722465515, "learning_rate": 0.0002122069523039612, "loss": 0.6399, "step": 63000 }, { "epoch": 8.501751549447588, "grad_norm": 0.4706982374191284, "learning_rate": 0.0002125437887361897, "loss": 0.6405, "step": 63100 }, { "epoch": 8.51522500673673, "grad_norm": 0.5294811129570007, "learning_rate": 0.0002128806251684182, "loss": 0.6408, "step": 63200 }, { "epoch": 8.528698464025869, "grad_norm": 0.5092609524726868, "learning_rate": 0.00021321746160064673, "loss": 0.6413, "step": 63300 }, { "epoch": 8.54217192131501, "grad_norm": 0.507790744304657, "learning_rate": 0.00021355429803287523, "loss": 0.6399, "step": 63400 }, { "epoch": 8.555645378604149, "grad_norm": 0.5049028396606445, "learning_rate": 0.00021389113446510375, "loss": 0.6407, "step": 63500 }, { "epoch": 8.56911883589329, "grad_norm": 0.5106599926948547, "learning_rate": 0.00021422797089733225, "loss": 0.6395, "step": 63600 }, { "epoch": 8.582592293182431, "grad_norm": 0.4770093262195587, "learning_rate": 0.00021456480732956078, "loss": 0.6388, "step": 63700 }, { "epoch": 8.59606575047157, "grad_norm": 0.47173407673835754, "learning_rate": 0.00021490164376178927, "loss": 0.639, "step": 63800 }, { "epoch": 8.609539207760712, "grad_norm": 0.49109339714050293, "learning_rate": 0.0002152384801940178, "loss": 0.6388, "step": 63900 }, { "epoch": 8.623012665049853, "grad_norm": 0.4819713234901428, "learning_rate": 0.0002155753166262463, "loss": 0.6394, "step": 64000 }, { "epoch": 8.636486122338992, "grad_norm": 0.5132026076316833, "learning_rate": 0.00021591215305847482, "loss": 0.6387, "step": 64100 }, { "epoch": 8.649959579628133, "grad_norm": 0.49125608801841736, "learning_rate": 0.00021624898949070332, "loss": 0.6371, "step": 64200 }, { "epoch": 8.663433036917272, "grad_norm": 0.4882797300815582, "learning_rate": 0.00021658582592293184, "loss": 0.6385, "step": 64300 }, { "epoch": 8.676906494206413, "grad_norm": 0.45046529173851013, "learning_rate": 0.00021692266235516034, "loss": 0.6379, "step": 64400 }, { "epoch": 8.690379951495554, "grad_norm": 0.44371822476387024, "learning_rate": 0.00021725949878738886, "loss": 0.6381, "step": 64500 }, { "epoch": 8.703853408784694, "grad_norm": 0.49564328789711, "learning_rate": 0.00021759633521961736, "loss": 0.6375, "step": 64600 }, { "epoch": 8.717326866073835, "grad_norm": 0.4777640700340271, "learning_rate": 0.00021793317165184586, "loss": 0.6369, "step": 64700 }, { "epoch": 8.730800323362974, "grad_norm": 0.487048864364624, "learning_rate": 0.00021827000808407438, "loss": 0.6374, "step": 64800 }, { "epoch": 8.744273780652115, "grad_norm": 0.486797958612442, "learning_rate": 0.00021860684451630288, "loss": 0.6368, "step": 64900 }, { "epoch": 8.757747237941256, "grad_norm": 0.46243101358413696, "learning_rate": 0.0002189436809485314, "loss": 0.636, "step": 65000 }, { "epoch": 8.771220695230395, "grad_norm": 0.45059797167778015, "learning_rate": 0.0002192805173807599, "loss": 0.6363, "step": 65100 }, { "epoch": 8.784694152519537, "grad_norm": 0.4282694458961487, "learning_rate": 0.00021961735381298842, "loss": 0.6364, "step": 65200 }, { "epoch": 8.798167609808678, "grad_norm": 0.4927918612957001, "learning_rate": 0.00021995419024521692, "loss": 0.6358, "step": 65300 }, { "epoch": 8.811641067097817, "grad_norm": 0.508073627948761, "learning_rate": 0.00022029102667744545, "loss": 0.6363, "step": 65400 }, { "epoch": 8.825114524386958, "grad_norm": 0.5196787714958191, "learning_rate": 0.00022062786310967394, "loss": 0.6353, "step": 65500 }, { "epoch": 8.838587981676097, "grad_norm": 0.4590548872947693, "learning_rate": 0.00022096469954190247, "loss": 0.6367, "step": 65600 }, { "epoch": 8.852061438965238, "grad_norm": 0.48109981417655945, "learning_rate": 0.00022130153597413096, "loss": 0.6352, "step": 65700 }, { "epoch": 8.86553489625438, "grad_norm": 0.45325857400894165, "learning_rate": 0.0002216383724063595, "loss": 0.6357, "step": 65800 }, { "epoch": 8.879008353543519, "grad_norm": 0.43372291326522827, "learning_rate": 0.00022197520883858799, "loss": 0.636, "step": 65900 }, { "epoch": 8.89248181083266, "grad_norm": 0.48235049843788147, "learning_rate": 0.0002223120452708165, "loss": 0.6347, "step": 66000 }, { "epoch": 8.9059552681218, "grad_norm": 0.4330347180366516, "learning_rate": 0.000222648881703045, "loss": 0.6343, "step": 66100 }, { "epoch": 8.91942872541094, "grad_norm": 0.439972460269928, "learning_rate": 0.00022298571813527353, "loss": 0.6352, "step": 66200 }, { "epoch": 8.932902182700081, "grad_norm": 0.4680459201335907, "learning_rate": 0.00022332255456750203, "loss": 0.6353, "step": 66300 }, { "epoch": 8.94637563998922, "grad_norm": 0.45074060559272766, "learning_rate": 0.00022365939099973053, "loss": 0.6346, "step": 66400 }, { "epoch": 8.959849097278362, "grad_norm": 0.47976189851760864, "learning_rate": 0.00022399622743195905, "loss": 0.6351, "step": 66500 }, { "epoch": 8.973322554567503, "grad_norm": 0.49115607142448425, "learning_rate": 0.00022433306386418755, "loss": 0.6342, "step": 66600 }, { "epoch": 8.986796011856642, "grad_norm": 0.42879152297973633, "learning_rate": 0.00022466990029641607, "loss": 0.6329, "step": 66700 }, { "epoch": 9.0, "eval_loss": 0.6160863041877747, "eval_runtime": 4.9383, "eval_samples_per_second": 1012.492, "eval_steps_per_second": 15.997, "step": 66798 }, { "epoch": 9.000269469145783, "grad_norm": 0.4679017663002014, "learning_rate": 0.00022500673672864457, "loss": 0.633, "step": 66800 }, { "epoch": 9.013742926434924, "grad_norm": 0.4536520838737488, "learning_rate": 0.0002253435731608731, "loss": 0.6335, "step": 66900 }, { "epoch": 9.027216383724063, "grad_norm": 0.4459541440010071, "learning_rate": 0.0002256804095931016, "loss": 0.6324, "step": 67000 }, { "epoch": 9.040689841013204, "grad_norm": 0.47634798288345337, "learning_rate": 0.00022601724602533011, "loss": 0.6329, "step": 67100 }, { "epoch": 9.054163298302344, "grad_norm": 0.4515073895454407, "learning_rate": 0.0002263540824575586, "loss": 0.6325, "step": 67200 }, { "epoch": 9.067636755591485, "grad_norm": 0.47574812173843384, "learning_rate": 0.00022669091888978714, "loss": 0.6322, "step": 67300 }, { "epoch": 9.081110212880626, "grad_norm": 0.4464322328567505, "learning_rate": 0.00022702775532201563, "loss": 0.6323, "step": 67400 }, { "epoch": 9.094583670169765, "grad_norm": 0.4448589086532593, "learning_rate": 0.00022736459175424416, "loss": 0.6327, "step": 67500 }, { "epoch": 9.108057127458906, "grad_norm": 0.461879163980484, "learning_rate": 0.00022770142818647265, "loss": 0.6319, "step": 67600 }, { "epoch": 9.121530584748047, "grad_norm": 0.4590347707271576, "learning_rate": 0.00022803826461870118, "loss": 0.6309, "step": 67700 }, { "epoch": 9.135004042037187, "grad_norm": 0.42103254795074463, "learning_rate": 0.00022837510105092968, "loss": 0.6319, "step": 67800 }, { "epoch": 9.148477499326328, "grad_norm": 0.45355117321014404, "learning_rate": 0.0002287119374831582, "loss": 0.6313, "step": 67900 }, { "epoch": 9.161950956615467, "grad_norm": 0.43211865425109863, "learning_rate": 0.0002290487739153867, "loss": 0.6309, "step": 68000 }, { "epoch": 9.175424413904608, "grad_norm": 0.450624018907547, "learning_rate": 0.00022938561034761522, "loss": 0.6311, "step": 68100 }, { "epoch": 9.188897871193749, "grad_norm": 0.41925156116485596, "learning_rate": 0.00022972244677984372, "loss": 0.6311, "step": 68200 }, { "epoch": 9.202371328482888, "grad_norm": 0.42716220021247864, "learning_rate": 0.00023005928321207222, "loss": 0.6303, "step": 68300 }, { "epoch": 9.21584478577203, "grad_norm": 0.45244672894477844, "learning_rate": 0.00023039611964430074, "loss": 0.631, "step": 68400 }, { "epoch": 9.229318243061169, "grad_norm": 0.42681312561035156, "learning_rate": 0.00023073295607652924, "loss": 0.6298, "step": 68500 }, { "epoch": 9.24279170035031, "grad_norm": 0.42173629999160767, "learning_rate": 0.00023106979250875776, "loss": 0.6302, "step": 68600 }, { "epoch": 9.25626515763945, "grad_norm": 0.41555267572402954, "learning_rate": 0.00023140662894098626, "loss": 0.6306, "step": 68700 }, { "epoch": 9.26973861492859, "grad_norm": 0.49909311532974243, "learning_rate": 0.00023174346537321478, "loss": 0.6289, "step": 68800 }, { "epoch": 9.283212072217731, "grad_norm": 0.4344586133956909, "learning_rate": 0.00023208030180544328, "loss": 0.6299, "step": 68900 }, { "epoch": 9.296685529506872, "grad_norm": 0.4465915262699127, "learning_rate": 0.0002324171382376718, "loss": 0.6292, "step": 69000 }, { "epoch": 9.310158986796012, "grad_norm": 0.44993075728416443, "learning_rate": 0.0002327539746699003, "loss": 0.6299, "step": 69100 }, { "epoch": 9.323632444085153, "grad_norm": 0.48325660824775696, "learning_rate": 0.00023309081110212883, "loss": 0.6283, "step": 69200 }, { "epoch": 9.337105901374292, "grad_norm": 0.4197756052017212, "learning_rate": 0.00023342764753435732, "loss": 0.6281, "step": 69300 }, { "epoch": 9.350579358663433, "grad_norm": 0.43382781744003296, "learning_rate": 0.00023376448396658585, "loss": 0.6279, "step": 69400 }, { "epoch": 9.364052815952574, "grad_norm": 0.45718780159950256, "learning_rate": 0.00023410132039881435, "loss": 0.6283, "step": 69500 }, { "epoch": 9.377526273241713, "grad_norm": 0.4167027771472931, "learning_rate": 0.00023443815683104287, "loss": 0.6293, "step": 69600 }, { "epoch": 9.390999730530854, "grad_norm": 0.43850982189178467, "learning_rate": 0.00023477499326327137, "loss": 0.6299, "step": 69700 }, { "epoch": 9.404473187819995, "grad_norm": 0.41316866874694824, "learning_rate": 0.0002351118296954999, "loss": 0.6277, "step": 69800 }, { "epoch": 9.417946645109135, "grad_norm": 0.42040154337882996, "learning_rate": 0.0002354486661277284, "loss": 0.6271, "step": 69900 }, { "epoch": 9.431420102398276, "grad_norm": 0.41585594415664673, "learning_rate": 0.00023578550255995689, "loss": 0.6276, "step": 70000 }, { "epoch": 9.444893559687415, "grad_norm": 0.4450344145298004, "learning_rate": 0.0002361223389921854, "loss": 0.6271, "step": 70100 }, { "epoch": 9.458367016976556, "grad_norm": 0.43497195839881897, "learning_rate": 0.0002364591754244139, "loss": 0.6266, "step": 70200 }, { "epoch": 9.471840474265697, "grad_norm": 0.4524376094341278, "learning_rate": 0.00023679601185664243, "loss": 0.628, "step": 70300 }, { "epoch": 9.485313931554836, "grad_norm": 0.37373456358909607, "learning_rate": 0.00023713284828887093, "loss": 0.6269, "step": 70400 }, { "epoch": 9.498787388843978, "grad_norm": 0.418072372674942, "learning_rate": 0.00023746968472109945, "loss": 0.6278, "step": 70500 }, { "epoch": 9.512260846133117, "grad_norm": 0.4512464702129364, "learning_rate": 0.00023780652115332795, "loss": 0.6284, "step": 70600 }, { "epoch": 9.525734303422258, "grad_norm": 0.42508795857429504, "learning_rate": 0.00023814335758555648, "loss": 0.6273, "step": 70700 }, { "epoch": 9.539207760711399, "grad_norm": 0.38395392894744873, "learning_rate": 0.00023848019401778497, "loss": 0.6267, "step": 70800 }, { "epoch": 9.552681218000538, "grad_norm": 0.411082923412323, "learning_rate": 0.0002388170304500135, "loss": 0.6257, "step": 70900 }, { "epoch": 9.56615467528968, "grad_norm": 0.4163338243961334, "learning_rate": 0.000239153866882242, "loss": 0.626, "step": 71000 }, { "epoch": 9.57962813257882, "grad_norm": 0.39214783906936646, "learning_rate": 0.00023949070331447052, "loss": 0.6256, "step": 71100 }, { "epoch": 9.59310158986796, "grad_norm": 0.46858879923820496, "learning_rate": 0.00023982753974669902, "loss": 0.6268, "step": 71200 }, { "epoch": 9.6065750471571, "grad_norm": 0.44397151470184326, "learning_rate": 0.00024016437617892754, "loss": 0.6258, "step": 71300 }, { "epoch": 9.620048504446242, "grad_norm": 0.47120410203933716, "learning_rate": 0.00024050121261115604, "loss": 0.6259, "step": 71400 }, { "epoch": 9.633521961735381, "grad_norm": 0.4274812340736389, "learning_rate": 0.00024083804904338456, "loss": 0.6254, "step": 71500 }, { "epoch": 9.646995419024522, "grad_norm": 0.4241272211074829, "learning_rate": 0.00024117488547561306, "loss": 0.6252, "step": 71600 }, { "epoch": 9.660468876313661, "grad_norm": 0.41357848048210144, "learning_rate": 0.00024151172190784158, "loss": 0.6259, "step": 71700 }, { "epoch": 9.673942333602803, "grad_norm": 0.4272490441799164, "learning_rate": 0.00024184855834007008, "loss": 0.6253, "step": 71800 }, { "epoch": 9.687415790891944, "grad_norm": 0.43570178747177124, "learning_rate": 0.00024218539477229858, "loss": 0.6256, "step": 71900 }, { "epoch": 9.700889248181083, "grad_norm": 0.3945384621620178, "learning_rate": 0.0002425222312045271, "loss": 0.6254, "step": 72000 }, { "epoch": 9.714362705470224, "grad_norm": 0.3915943205356598, "learning_rate": 0.0002428590676367556, "loss": 0.6254, "step": 72100 }, { "epoch": 9.727836162759363, "grad_norm": 0.3625129461288452, "learning_rate": 0.00024319590406898412, "loss": 0.6235, "step": 72200 }, { "epoch": 9.741309620048504, "grad_norm": 0.4140903055667877, "learning_rate": 0.00024353274050121262, "loss": 0.6241, "step": 72300 }, { "epoch": 9.754783077337645, "grad_norm": 0.4128807783126831, "learning_rate": 0.00024386957693344114, "loss": 0.6231, "step": 72400 }, { "epoch": 9.768256534626785, "grad_norm": 0.4089184105396271, "learning_rate": 0.00024420641336566967, "loss": 0.6236, "step": 72500 }, { "epoch": 9.781729991915926, "grad_norm": 0.4272575378417969, "learning_rate": 0.00024454324979789814, "loss": 0.6247, "step": 72600 }, { "epoch": 9.795203449205067, "grad_norm": 0.42029160261154175, "learning_rate": 0.00024488008623012666, "loss": 0.6242, "step": 72700 }, { "epoch": 9.808676906494206, "grad_norm": 0.400316059589386, "learning_rate": 0.00024521692266235513, "loss": 0.623, "step": 72800 }, { "epoch": 9.822150363783347, "grad_norm": 0.4069356620311737, "learning_rate": 0.00024555375909458366, "loss": 0.6247, "step": 72900 }, { "epoch": 9.835623821072486, "grad_norm": 0.3873896300792694, "learning_rate": 0.0002458905955268122, "loss": 0.6242, "step": 73000 }, { "epoch": 9.849097278361628, "grad_norm": 0.3801201283931732, "learning_rate": 0.0002462274319590407, "loss": 0.6232, "step": 73100 }, { "epoch": 9.862570735650769, "grad_norm": 0.4009399712085724, "learning_rate": 0.0002465642683912692, "loss": 0.6227, "step": 73200 }, { "epoch": 9.876044192939908, "grad_norm": 0.38212910294532776, "learning_rate": 0.0002469011048234977, "loss": 0.6223, "step": 73300 }, { "epoch": 9.889517650229049, "grad_norm": 0.40641260147094727, "learning_rate": 0.0002472379412557262, "loss": 0.6225, "step": 73400 }, { "epoch": 9.90299110751819, "grad_norm": 0.37616103887557983, "learning_rate": 0.0002475747776879547, "loss": 0.6225, "step": 73500 }, { "epoch": 9.91646456480733, "grad_norm": 0.37600022554397583, "learning_rate": 0.0002479116141201832, "loss": 0.6221, "step": 73600 }, { "epoch": 9.92993802209647, "grad_norm": 0.4082411825656891, "learning_rate": 0.00024824845055241174, "loss": 0.6217, "step": 73700 }, { "epoch": 9.94341147938561, "grad_norm": 0.4003206789493561, "learning_rate": 0.00024858528698464027, "loss": 0.6207, "step": 73800 }, { "epoch": 9.95688493667475, "grad_norm": 0.4000953733921051, "learning_rate": 0.00024892212341686874, "loss": 0.6214, "step": 73900 }, { "epoch": 9.970358393963892, "grad_norm": 0.3888005316257477, "learning_rate": 0.00024925895984909726, "loss": 0.6216, "step": 74000 }, { "epoch": 9.983831851253031, "grad_norm": 0.4130525588989258, "learning_rate": 0.0002495957962813258, "loss": 0.6223, "step": 74100 }, { "epoch": 9.997305308542172, "grad_norm": 0.39898404479026794, "learning_rate": 0.0002499326327135543, "loss": 0.6213, "step": 74200 }, { "epoch": 10.0, "eval_loss": 0.6044231057167053, "eval_runtime": 4.9617, "eval_samples_per_second": 1007.729, "eval_steps_per_second": 15.922, "step": 74220 }, { "epoch": 10.010778765831311, "grad_norm": 0.3769437074661255, "learning_rate": 0.0002502694691457828, "loss": 0.6209, "step": 74300 }, { "epoch": 10.024252223120452, "grad_norm": 0.4322029948234558, "learning_rate": 0.00025060630557801136, "loss": 0.6194, "step": 74400 }, { "epoch": 10.037725680409594, "grad_norm": 0.37410834431648254, "learning_rate": 0.00025094314201023983, "loss": 0.6207, "step": 74500 }, { "epoch": 10.051199137698733, "grad_norm": 0.4050537049770355, "learning_rate": 0.00025127997844246835, "loss": 0.6204, "step": 74600 }, { "epoch": 10.064672594987874, "grad_norm": 0.38566479086875916, "learning_rate": 0.0002516168148746968, "loss": 0.62, "step": 74700 }, { "epoch": 10.078146052277015, "grad_norm": 0.41513171792030334, "learning_rate": 0.0002519536513069254, "loss": 0.6198, "step": 74800 }, { "epoch": 10.091619509566154, "grad_norm": 0.363420695066452, "learning_rate": 0.0002522904877391539, "loss": 0.6198, "step": 74900 }, { "epoch": 10.105092966855295, "grad_norm": 0.3818880617618561, "learning_rate": 0.0002526273241713824, "loss": 0.6193, "step": 75000 }, { "epoch": 10.118566424144435, "grad_norm": 0.4052686095237732, "learning_rate": 0.00025296416060361087, "loss": 0.6198, "step": 75100 }, { "epoch": 10.132039881433576, "grad_norm": 0.396451473236084, "learning_rate": 0.00025330099703583945, "loss": 0.6194, "step": 75200 }, { "epoch": 10.145513338722717, "grad_norm": 0.3621233403682709, "learning_rate": 0.0002536378334680679, "loss": 0.6198, "step": 75300 }, { "epoch": 10.158986796011856, "grad_norm": 0.38790085911750793, "learning_rate": 0.00025397466990029644, "loss": 0.6193, "step": 75400 }, { "epoch": 10.172460253300997, "grad_norm": 0.39634206891059875, "learning_rate": 0.0002543115063325249, "loss": 0.6198, "step": 75500 }, { "epoch": 10.185933710590138, "grad_norm": 0.37623685598373413, "learning_rate": 0.00025464834276475343, "loss": 0.6186, "step": 75600 }, { "epoch": 10.199407167879277, "grad_norm": 0.3769906163215637, "learning_rate": 0.00025498517919698196, "loss": 0.6191, "step": 75700 }, { "epoch": 10.212880625168419, "grad_norm": 0.40708407759666443, "learning_rate": 0.0002553220156292105, "loss": 0.6189, "step": 75800 }, { "epoch": 10.226354082457558, "grad_norm": 0.37709224224090576, "learning_rate": 0.00025565885206143895, "loss": 0.6185, "step": 75900 }, { "epoch": 10.239827539746699, "grad_norm": 0.380486398935318, "learning_rate": 0.0002559956884936675, "loss": 0.6186, "step": 76000 }, { "epoch": 10.25330099703584, "grad_norm": 0.3903633654117584, "learning_rate": 0.000256332524925896, "loss": 0.6184, "step": 76100 }, { "epoch": 10.26677445432498, "grad_norm": 0.38153213262557983, "learning_rate": 0.0002566693613581245, "loss": 0.6186, "step": 76200 }, { "epoch": 10.28024791161412, "grad_norm": 0.3848488926887512, "learning_rate": 0.000257006197790353, "loss": 0.6184, "step": 76300 }, { "epoch": 10.293721368903261, "grad_norm": 0.41788655519485474, "learning_rate": 0.0002573430342225815, "loss": 0.6187, "step": 76400 }, { "epoch": 10.3071948261924, "grad_norm": 0.36196058988571167, "learning_rate": 0.00025767987065481005, "loss": 0.6187, "step": 76500 }, { "epoch": 10.320668283481542, "grad_norm": 0.3850809335708618, "learning_rate": 0.00025801670708703857, "loss": 0.6178, "step": 76600 }, { "epoch": 10.334141740770681, "grad_norm": 0.37415528297424316, "learning_rate": 0.00025835354351926704, "loss": 0.6185, "step": 76700 }, { "epoch": 10.347615198059822, "grad_norm": 0.3736751079559326, "learning_rate": 0.00025869037995149556, "loss": 0.6185, "step": 76800 }, { "epoch": 10.361088655348963, "grad_norm": 0.372004896402359, "learning_rate": 0.0002590272163837241, "loss": 0.6178, "step": 76900 }, { "epoch": 10.374562112638102, "grad_norm": 0.3581484258174896, "learning_rate": 0.0002593640528159526, "loss": 0.6173, "step": 77000 }, { "epoch": 10.388035569927244, "grad_norm": 0.46048250794410706, "learning_rate": 0.0002597008892481811, "loss": 0.618, "step": 77100 }, { "epoch": 10.401509027216385, "grad_norm": 0.34623953700065613, "learning_rate": 0.0002600377256804096, "loss": 0.6168, "step": 77200 }, { "epoch": 10.414982484505524, "grad_norm": 0.370850145816803, "learning_rate": 0.0002603745621126381, "loss": 0.6175, "step": 77300 }, { "epoch": 10.428455941794665, "grad_norm": 0.37101244926452637, "learning_rate": 0.00026071139854486666, "loss": 0.6173, "step": 77400 }, { "epoch": 10.441929399083804, "grad_norm": 0.3654983937740326, "learning_rate": 0.0002610482349770951, "loss": 0.617, "step": 77500 }, { "epoch": 10.455402856372945, "grad_norm": 0.33329614996910095, "learning_rate": 0.00026138507140932365, "loss": 0.6171, "step": 77600 }, { "epoch": 10.468876313662086, "grad_norm": 0.3734244108200073, "learning_rate": 0.0002617219078415521, "loss": 0.6154, "step": 77700 }, { "epoch": 10.482349770951226, "grad_norm": 0.3825925290584564, "learning_rate": 0.0002620587442737807, "loss": 0.6166, "step": 77800 }, { "epoch": 10.495823228240367, "grad_norm": 0.3585836887359619, "learning_rate": 0.00026239558070600917, "loss": 0.6156, "step": 77900 }, { "epoch": 10.509296685529506, "grad_norm": 0.36864978075027466, "learning_rate": 0.0002627324171382377, "loss": 0.6174, "step": 78000 }, { "epoch": 10.522770142818647, "grad_norm": 0.34770023822784424, "learning_rate": 0.00026306925357046616, "loss": 0.6151, "step": 78100 }, { "epoch": 10.536243600107788, "grad_norm": 0.3636642098426819, "learning_rate": 0.00026340609000269474, "loss": 0.6154, "step": 78200 }, { "epoch": 10.549717057396927, "grad_norm": 0.3664078414440155, "learning_rate": 0.0002637429264349232, "loss": 0.6159, "step": 78300 }, { "epoch": 10.563190514686069, "grad_norm": 0.34277668595314026, "learning_rate": 0.00026407976286715174, "loss": 0.6157, "step": 78400 }, { "epoch": 10.57666397197521, "grad_norm": 0.3562292158603668, "learning_rate": 0.0002644165992993802, "loss": 0.6149, "step": 78500 }, { "epoch": 10.590137429264349, "grad_norm": 0.37077033519744873, "learning_rate": 0.0002647534357316088, "loss": 0.6148, "step": 78600 }, { "epoch": 10.60361088655349, "grad_norm": 0.34755903482437134, "learning_rate": 0.00026509027216383726, "loss": 0.6151, "step": 78700 }, { "epoch": 10.61708434384263, "grad_norm": 0.37414422631263733, "learning_rate": 0.0002654271085960658, "loss": 0.6156, "step": 78800 }, { "epoch": 10.63055780113177, "grad_norm": 0.339845210313797, "learning_rate": 0.00026576394502829425, "loss": 0.6154, "step": 78900 }, { "epoch": 10.644031258420911, "grad_norm": 0.3617702126502991, "learning_rate": 0.0002661007814605228, "loss": 0.6142, "step": 79000 }, { "epoch": 10.65750471571005, "grad_norm": 0.36159253120422363, "learning_rate": 0.0002664376178927513, "loss": 0.6154, "step": 79100 }, { "epoch": 10.670978172999192, "grad_norm": 0.3266281485557556, "learning_rate": 0.0002667744543249798, "loss": 0.6138, "step": 79200 }, { "epoch": 10.684451630288333, "grad_norm": 0.34264957904815674, "learning_rate": 0.0002671112907572083, "loss": 0.6146, "step": 79300 }, { "epoch": 10.697925087577472, "grad_norm": 0.33766835927963257, "learning_rate": 0.0002674481271894368, "loss": 0.6134, "step": 79400 }, { "epoch": 10.711398544866613, "grad_norm": 0.33463722467422485, "learning_rate": 0.00026778496362166534, "loss": 0.615, "step": 79500 }, { "epoch": 10.724872002155752, "grad_norm": 0.35712766647338867, "learning_rate": 0.00026812180005389387, "loss": 0.6144, "step": 79600 }, { "epoch": 10.738345459444893, "grad_norm": 0.34668922424316406, "learning_rate": 0.00026845863648612234, "loss": 0.6135, "step": 79700 }, { "epoch": 10.751818916734035, "grad_norm": 0.3558567762374878, "learning_rate": 0.00026879547291835086, "loss": 0.6141, "step": 79800 }, { "epoch": 10.765292374023174, "grad_norm": 0.3451814651489258, "learning_rate": 0.0002691323093505794, "loss": 0.6131, "step": 79900 }, { "epoch": 10.778765831312315, "grad_norm": 0.3418267071247101, "learning_rate": 0.0002694691457828079, "loss": 0.6141, "step": 80000 }, { "epoch": 10.792239288601454, "grad_norm": 0.3826930522918701, "learning_rate": 0.0002698059822150364, "loss": 0.6144, "step": 80100 }, { "epoch": 10.805712745890595, "grad_norm": 0.3672187030315399, "learning_rate": 0.0002701428186472649, "loss": 0.6136, "step": 80200 }, { "epoch": 10.819186203179736, "grad_norm": 0.36896586418151855, "learning_rate": 0.00027047965507949343, "loss": 0.6133, "step": 80300 }, { "epoch": 10.832659660468876, "grad_norm": 0.3978537917137146, "learning_rate": 0.00027081649151172195, "loss": 0.6136, "step": 80400 }, { "epoch": 10.846133117758017, "grad_norm": 0.34167930483818054, "learning_rate": 0.0002711533279439504, "loss": 0.6129, "step": 80500 }, { "epoch": 10.859606575047158, "grad_norm": 0.35479867458343506, "learning_rate": 0.00027149016437617895, "loss": 0.6131, "step": 80600 }, { "epoch": 10.873080032336297, "grad_norm": 0.3537484407424927, "learning_rate": 0.0002718270008084074, "loss": 0.613, "step": 80700 }, { "epoch": 10.886553489625438, "grad_norm": 0.35546875, "learning_rate": 0.00027216383724063594, "loss": 0.6119, "step": 80800 }, { "epoch": 10.90002694691458, "grad_norm": 0.35735082626342773, "learning_rate": 0.00027250067367286446, "loss": 0.6123, "step": 80900 }, { "epoch": 10.913500404203718, "grad_norm": 0.3161241412162781, "learning_rate": 0.00027283751010509293, "loss": 0.6127, "step": 81000 }, { "epoch": 10.92697386149286, "grad_norm": 0.32185718417167664, "learning_rate": 0.00027317434653732146, "loss": 0.6124, "step": 81100 }, { "epoch": 10.940447318781999, "grad_norm": 0.3590146601200104, "learning_rate": 0.00027351118296955, "loss": 0.6115, "step": 81200 }, { "epoch": 10.95392077607114, "grad_norm": 0.4015113115310669, "learning_rate": 0.0002738480194017785, "loss": 0.612, "step": 81300 }, { "epoch": 10.967394233360281, "grad_norm": 0.3359557092189789, "learning_rate": 0.000274184855834007, "loss": 0.6122, "step": 81400 }, { "epoch": 10.98086769064942, "grad_norm": 0.35016316175460815, "learning_rate": 0.0002745216922662355, "loss": 0.6122, "step": 81500 }, { "epoch": 10.994341147938561, "grad_norm": 0.3620946705341339, "learning_rate": 0.000274858528698464, "loss": 0.613, "step": 81600 }, { "epoch": 11.0, "eval_loss": 0.5955621004104614, "eval_runtime": 4.9332, "eval_samples_per_second": 1013.551, "eval_steps_per_second": 16.014, "step": 81642 }, { "epoch": 11.0078146052277, "grad_norm": 0.36727264523506165, "learning_rate": 0.00027519536513069255, "loss": 0.6122, "step": 81700 }, { "epoch": 11.021288062516842, "grad_norm": 0.36711281538009644, "learning_rate": 0.000275532201562921, "loss": 0.61, "step": 81800 }, { "epoch": 11.034761519805983, "grad_norm": 0.35460513830184937, "learning_rate": 0.00027586903799514955, "loss": 0.611, "step": 81900 }, { "epoch": 11.048234977095122, "grad_norm": 0.331834614276886, "learning_rate": 0.00027620587442737807, "loss": 0.6104, "step": 82000 }, { "epoch": 11.061708434384263, "grad_norm": 0.3252831995487213, "learning_rate": 0.0002765427108596066, "loss": 0.6093, "step": 82100 }, { "epoch": 11.075181891673404, "grad_norm": 0.3264895975589752, "learning_rate": 0.00027687954729183506, "loss": 0.6112, "step": 82200 }, { "epoch": 11.088655348962543, "grad_norm": 0.3640132546424866, "learning_rate": 0.0002772163837240636, "loss": 0.611, "step": 82300 }, { "epoch": 11.102128806251685, "grad_norm": 0.32109305262565613, "learning_rate": 0.00027755322015629206, "loss": 0.6112, "step": 82400 }, { "epoch": 11.115602263540824, "grad_norm": 0.3509661853313446, "learning_rate": 0.00027789005658852064, "loss": 0.611, "step": 82500 }, { "epoch": 11.129075720829965, "grad_norm": 0.357479453086853, "learning_rate": 0.0002782268930207491, "loss": 0.609, "step": 82600 }, { "epoch": 11.142549178119106, "grad_norm": 0.3288052976131439, "learning_rate": 0.00027856372945297763, "loss": 0.6093, "step": 82700 }, { "epoch": 11.156022635408245, "grad_norm": 0.3224884271621704, "learning_rate": 0.0002789005658852061, "loss": 0.6101, "step": 82800 }, { "epoch": 11.169496092697386, "grad_norm": 0.36794313788414, "learning_rate": 0.0002792374023174347, "loss": 0.6095, "step": 82900 }, { "epoch": 11.182969549986527, "grad_norm": 0.35069936513900757, "learning_rate": 0.00027957423874966315, "loss": 0.61, "step": 83000 }, { "epoch": 11.196443007275667, "grad_norm": 0.32393842935562134, "learning_rate": 0.0002799110751818917, "loss": 0.6096, "step": 83100 }, { "epoch": 11.209916464564808, "grad_norm": 0.34140801429748535, "learning_rate": 0.00028024791161412014, "loss": 0.61, "step": 83200 }, { "epoch": 11.223389921853947, "grad_norm": 0.3485608994960785, "learning_rate": 0.0002805847480463487, "loss": 0.6088, "step": 83300 }, { "epoch": 11.236863379143088, "grad_norm": 0.3717021644115448, "learning_rate": 0.0002809215844785772, "loss": 0.6089, "step": 83400 }, { "epoch": 11.25033683643223, "grad_norm": 0.3605683147907257, "learning_rate": 0.0002812584209108057, "loss": 0.609, "step": 83500 }, { "epoch": 11.263810293721368, "grad_norm": 0.31585827469825745, "learning_rate": 0.0002815952573430342, "loss": 0.609, "step": 83600 }, { "epoch": 11.27728375101051, "grad_norm": 0.37110140919685364, "learning_rate": 0.00028193209377526277, "loss": 0.609, "step": 83700 }, { "epoch": 11.290757208299649, "grad_norm": 0.3301221430301666, "learning_rate": 0.00028226893020749124, "loss": 0.6096, "step": 83800 }, { "epoch": 11.30423066558879, "grad_norm": 0.3560285270214081, "learning_rate": 0.00028260576663971976, "loss": 0.6086, "step": 83900 }, { "epoch": 11.317704122877931, "grad_norm": 0.38138994574546814, "learning_rate": 0.00028294260307194823, "loss": 0.6093, "step": 84000 }, { "epoch": 11.33117758016707, "grad_norm": 0.31577613949775696, "learning_rate": 0.0002832794395041768, "loss": 0.6092, "step": 84100 }, { "epoch": 11.344651037456211, "grad_norm": 0.31748026609420776, "learning_rate": 0.0002836162759364053, "loss": 0.6081, "step": 84200 }, { "epoch": 11.358124494745352, "grad_norm": 0.33299046754837036, "learning_rate": 0.0002839531123686338, "loss": 0.6092, "step": 84300 }, { "epoch": 11.371597952034492, "grad_norm": 0.3281749188899994, "learning_rate": 0.0002842899488008623, "loss": 0.6084, "step": 84400 }, { "epoch": 11.385071409323633, "grad_norm": 0.3382653295993805, "learning_rate": 0.0002846267852330908, "loss": 0.6081, "step": 84500 }, { "epoch": 11.398544866612772, "grad_norm": 0.32810911536216736, "learning_rate": 0.0002849636216653193, "loss": 0.6085, "step": 84600 }, { "epoch": 11.412018323901913, "grad_norm": 0.34234029054641724, "learning_rate": 0.00028530045809754785, "loss": 0.608, "step": 84700 }, { "epoch": 11.425491781191054, "grad_norm": 0.3564986288547516, "learning_rate": 0.0002856372945297763, "loss": 0.6072, "step": 84800 }, { "epoch": 11.438965238480193, "grad_norm": 0.3930758833885193, "learning_rate": 0.00028597413096200484, "loss": 0.6081, "step": 84900 }, { "epoch": 11.452438695769334, "grad_norm": 0.34734079241752625, "learning_rate": 0.00028631096739423337, "loss": 0.6079, "step": 85000 }, { "epoch": 11.465912153058476, "grad_norm": 0.3172045052051544, "learning_rate": 0.0002866478038264619, "loss": 0.6075, "step": 85100 }, { "epoch": 11.479385610347615, "grad_norm": 0.2982329726219177, "learning_rate": 0.00028698464025869036, "loss": 0.6071, "step": 85200 }, { "epoch": 11.492859067636756, "grad_norm": 0.3098890781402588, "learning_rate": 0.0002873214766909189, "loss": 0.6063, "step": 85300 }, { "epoch": 11.506332524925895, "grad_norm": 0.3065027892589569, "learning_rate": 0.0002876583131231474, "loss": 0.6083, "step": 85400 }, { "epoch": 11.519805982215036, "grad_norm": 0.36928603053092957, "learning_rate": 0.00028799514955537593, "loss": 0.6069, "step": 85500 }, { "epoch": 11.533279439504177, "grad_norm": 0.2979229688644409, "learning_rate": 0.0002883319859876044, "loss": 0.6065, "step": 85600 }, { "epoch": 11.546752896793317, "grad_norm": 0.34474480152130127, "learning_rate": 0.00028866882241983293, "loss": 0.6071, "step": 85700 }, { "epoch": 11.560226354082458, "grad_norm": 0.30905717611312866, "learning_rate": 0.00028900565885206145, "loss": 0.607, "step": 85800 }, { "epoch": 11.573699811371599, "grad_norm": 0.33838242292404175, "learning_rate": 0.00028934249528429, "loss": 0.6059, "step": 85900 }, { "epoch": 11.587173268660738, "grad_norm": 0.3017084002494812, "learning_rate": 0.00028967933171651845, "loss": 0.6061, "step": 86000 }, { "epoch": 11.60064672594988, "grad_norm": 0.3642980754375458, "learning_rate": 0.00029001616814874697, "loss": 0.606, "step": 86100 }, { "epoch": 11.614120183239018, "grad_norm": 0.31039348244667053, "learning_rate": 0.00029035300458097544, "loss": 0.6065, "step": 86200 }, { "epoch": 11.62759364052816, "grad_norm": 0.3367164731025696, "learning_rate": 0.000290689841013204, "loss": 0.6058, "step": 86300 }, { "epoch": 11.6410670978173, "grad_norm": 0.31954225897789, "learning_rate": 0.0002910266774454325, "loss": 0.6058, "step": 86400 }, { "epoch": 11.65454055510644, "grad_norm": 0.32119229435920715, "learning_rate": 0.000291363513877661, "loss": 0.6056, "step": 86500 }, { "epoch": 11.668014012395581, "grad_norm": 0.31462904810905457, "learning_rate": 0.0002917003503098895, "loss": 0.6059, "step": 86600 }, { "epoch": 11.681487469684722, "grad_norm": 0.3244771361351013, "learning_rate": 0.00029203718674211806, "loss": 0.606, "step": 86700 }, { "epoch": 11.694960926973861, "grad_norm": 0.32869982719421387, "learning_rate": 0.00029237402317434653, "loss": 0.6059, "step": 86800 }, { "epoch": 11.708434384263002, "grad_norm": 0.30623605847358704, "learning_rate": 0.00029271085960657506, "loss": 0.606, "step": 86900 }, { "epoch": 11.721907841552142, "grad_norm": 0.3786109983921051, "learning_rate": 0.0002930476960388035, "loss": 0.6049, "step": 87000 }, { "epoch": 11.735381298841283, "grad_norm": 0.32856297492980957, "learning_rate": 0.0002933845324710321, "loss": 0.6053, "step": 87100 }, { "epoch": 11.748854756130424, "grad_norm": 0.32152584195137024, "learning_rate": 0.0002937213689032606, "loss": 0.6054, "step": 87200 }, { "epoch": 11.762328213419563, "grad_norm": 0.31926923990249634, "learning_rate": 0.0002940582053354891, "loss": 0.6061, "step": 87300 }, { "epoch": 11.775801670708704, "grad_norm": 0.3208412528038025, "learning_rate": 0.00029439504176771757, "loss": 0.6056, "step": 87400 }, { "epoch": 11.789275127997843, "grad_norm": 0.31100231409072876, "learning_rate": 0.00029473187819994615, "loss": 0.6051, "step": 87500 }, { "epoch": 11.802748585286984, "grad_norm": 0.3119351267814636, "learning_rate": 0.0002950687146321746, "loss": 0.6061, "step": 87600 }, { "epoch": 11.816222042576126, "grad_norm": 0.34856122732162476, "learning_rate": 0.00029540555106440314, "loss": 0.6051, "step": 87700 }, { "epoch": 11.829695499865265, "grad_norm": 0.3115238845348358, "learning_rate": 0.0002957423874966316, "loss": 0.6046, "step": 87800 }, { "epoch": 11.843168957154406, "grad_norm": 0.3181014657020569, "learning_rate": 0.00029607922392886014, "loss": 0.6038, "step": 87900 }, { "epoch": 11.856642414443547, "grad_norm": 0.29362770915031433, "learning_rate": 0.00029641606036108866, "loss": 0.6046, "step": 88000 }, { "epoch": 11.870115871732686, "grad_norm": 0.3429737985134125, "learning_rate": 0.0002967528967933172, "loss": 0.6039, "step": 88100 }, { "epoch": 11.883589329021827, "grad_norm": 0.30550846457481384, "learning_rate": 0.00029708973322554566, "loss": 0.6046, "step": 88200 }, { "epoch": 11.897062786310967, "grad_norm": 0.3173362910747528, "learning_rate": 0.0002974265696577742, "loss": 0.6055, "step": 88300 }, { "epoch": 11.910536243600108, "grad_norm": 0.30763953924179077, "learning_rate": 0.0002977634060900027, "loss": 0.6048, "step": 88400 }, { "epoch": 11.924009700889249, "grad_norm": 0.3321947753429413, "learning_rate": 0.00029810024252223123, "loss": 0.6037, "step": 88500 }, { "epoch": 11.937483158178388, "grad_norm": 0.32831940054893494, "learning_rate": 0.0002984370789544597, "loss": 0.6041, "step": 88600 }, { "epoch": 11.950956615467529, "grad_norm": 0.2870369851589203, "learning_rate": 0.0002987739153866882, "loss": 0.6032, "step": 88700 }, { "epoch": 11.96443007275667, "grad_norm": 0.30078959465026855, "learning_rate": 0.00029911075181891675, "loss": 0.6038, "step": 88800 }, { "epoch": 11.97790353004581, "grad_norm": 0.29838648438453674, "learning_rate": 0.00029944758825114527, "loss": 0.6034, "step": 88900 }, { "epoch": 11.99137698733495, "grad_norm": 0.33946481347084045, "learning_rate": 0.00029978442468337374, "loss": 0.6035, "step": 89000 }, { "epoch": 12.0, "eval_loss": 0.5889723300933838, "eval_runtime": 4.9314, "eval_samples_per_second": 1013.902, "eval_steps_per_second": 16.02, "step": 89064 }, { "epoch": 12.00485044462409, "grad_norm": 0.30453866720199585, "learning_rate": 0.00030012126111560227, "loss": 0.6048, "step": 89100 }, { "epoch": 12.01832390191323, "grad_norm": 0.31908926367759705, "learning_rate": 0.0003004580975478308, "loss": 0.6021, "step": 89200 }, { "epoch": 12.031797359202372, "grad_norm": 0.34583160281181335, "learning_rate": 0.0003007949339800593, "loss": 0.6031, "step": 89300 }, { "epoch": 12.045270816491511, "grad_norm": 0.335435152053833, "learning_rate": 0.0003011317704122878, "loss": 0.6023, "step": 89400 }, { "epoch": 12.058744273780652, "grad_norm": 0.31006506085395813, "learning_rate": 0.0003014686068445163, "loss": 0.6022, "step": 89500 }, { "epoch": 12.072217731069793, "grad_norm": 0.4115130603313446, "learning_rate": 0.0003018054432767448, "loss": 0.6035, "step": 89600 }, { "epoch": 12.085691188358933, "grad_norm": 0.3054640293121338, "learning_rate": 0.00030214227970897336, "loss": 0.6031, "step": 89700 }, { "epoch": 12.099164645648074, "grad_norm": 0.2981049716472626, "learning_rate": 0.00030247911614120183, "loss": 0.6013, "step": 89800 }, { "epoch": 12.112638102937213, "grad_norm": 0.33493950963020325, "learning_rate": 0.00030281595257343035, "loss": 0.6017, "step": 89900 }, { "epoch": 12.126111560226354, "grad_norm": 0.3019053637981415, "learning_rate": 0.0003031527890056588, "loss": 0.6021, "step": 90000 }, { "epoch": 12.139585017515495, "grad_norm": 0.28756940364837646, "learning_rate": 0.0003034896254378874, "loss": 0.6021, "step": 90100 }, { "epoch": 12.153058474804634, "grad_norm": 0.2904725670814514, "learning_rate": 0.00030382646187011587, "loss": 0.6018, "step": 90200 }, { "epoch": 12.166531932093775, "grad_norm": 0.3203810155391693, "learning_rate": 0.0003041632983023444, "loss": 0.6012, "step": 90300 }, { "epoch": 12.180005389382917, "grad_norm": 0.31928887963294983, "learning_rate": 0.00030450013473457287, "loss": 0.6025, "step": 90400 }, { "epoch": 12.193478846672056, "grad_norm": 0.33468613028526306, "learning_rate": 0.00030483697116680144, "loss": 0.6016, "step": 90500 }, { "epoch": 12.206952303961197, "grad_norm": 0.30417874455451965, "learning_rate": 0.0003051738075990299, "loss": 0.6019, "step": 90600 }, { "epoch": 12.220425761250336, "grad_norm": 0.30128952860832214, "learning_rate": 0.00030551064403125844, "loss": 0.6012, "step": 90700 }, { "epoch": 12.233899218539477, "grad_norm": 0.3198239803314209, "learning_rate": 0.0003058474804634869, "loss": 0.6015, "step": 90800 }, { "epoch": 12.247372675828618, "grad_norm": 0.31130683422088623, "learning_rate": 0.0003061843168957155, "loss": 0.6008, "step": 90900 }, { "epoch": 12.260846133117758, "grad_norm": 0.2999829351902008, "learning_rate": 0.00030652115332794396, "loss": 0.6016, "step": 91000 }, { "epoch": 12.274319590406899, "grad_norm": 0.2992890477180481, "learning_rate": 0.0003068579897601725, "loss": 0.6013, "step": 91100 }, { "epoch": 12.287793047696038, "grad_norm": 0.2909565567970276, "learning_rate": 0.00030719482619240095, "loss": 0.6023, "step": 91200 }, { "epoch": 12.301266504985179, "grad_norm": 0.2878779470920563, "learning_rate": 0.00030753166262462953, "loss": 0.6001, "step": 91300 }, { "epoch": 12.31473996227432, "grad_norm": 0.28951889276504517, "learning_rate": 0.000307868499056858, "loss": 0.5999, "step": 91400 }, { "epoch": 12.32821341956346, "grad_norm": 0.3279721140861511, "learning_rate": 0.0003082053354890865, "loss": 0.6014, "step": 91500 }, { "epoch": 12.3416868768526, "grad_norm": 0.3014054298400879, "learning_rate": 0.000308542171921315, "loss": 0.601, "step": 91600 }, { "epoch": 12.355160334141742, "grad_norm": 0.29033875465393066, "learning_rate": 0.0003088790083535435, "loss": 0.6013, "step": 91700 }, { "epoch": 12.36863379143088, "grad_norm": 0.28003042936325073, "learning_rate": 0.00030921584478577204, "loss": 0.6006, "step": 91800 }, { "epoch": 12.382107248720022, "grad_norm": 0.29978838562965393, "learning_rate": 0.00030955268121800057, "loss": 0.6009, "step": 91900 }, { "epoch": 12.395580706009161, "grad_norm": 0.2977657616138458, "learning_rate": 0.00030988951765022904, "loss": 0.5996, "step": 92000 }, { "epoch": 12.409054163298302, "grad_norm": 0.29652756452560425, "learning_rate": 0.00031022635408245756, "loss": 0.6001, "step": 92100 }, { "epoch": 12.422527620587443, "grad_norm": 0.29027998447418213, "learning_rate": 0.0003105631905146861, "loss": 0.6007, "step": 92200 }, { "epoch": 12.436001077876583, "grad_norm": 0.29691606760025024, "learning_rate": 0.0003109000269469146, "loss": 0.6008, "step": 92300 }, { "epoch": 12.449474535165724, "grad_norm": 0.30363813042640686, "learning_rate": 0.0003112368633791431, "loss": 0.6, "step": 92400 }, { "epoch": 12.462947992454865, "grad_norm": 0.2897365987300873, "learning_rate": 0.0003115736998113716, "loss": 0.5995, "step": 92500 }, { "epoch": 12.476421449744004, "grad_norm": 0.30836185812950134, "learning_rate": 0.00031191053624360013, "loss": 0.6002, "step": 92600 }, { "epoch": 12.489894907033145, "grad_norm": 0.2771493196487427, "learning_rate": 0.00031224737267582865, "loss": 0.5998, "step": 92700 }, { "epoch": 12.503368364322284, "grad_norm": 0.2775214612483978, "learning_rate": 0.0003125842091080571, "loss": 0.5998, "step": 92800 }, { "epoch": 12.516841821611425, "grad_norm": 0.2901945114135742, "learning_rate": 0.00031292104554028565, "loss": 0.6013, "step": 92900 }, { "epoch": 12.530315278900567, "grad_norm": 0.3013378083705902, "learning_rate": 0.00031325788197251417, "loss": 0.5994, "step": 93000 }, { "epoch": 12.543788736189706, "grad_norm": 0.2763359248638153, "learning_rate": 0.0003135947184047427, "loss": 0.5992, "step": 93100 }, { "epoch": 12.557262193478847, "grad_norm": 0.3039182126522064, "learning_rate": 0.00031393155483697117, "loss": 0.5984, "step": 93200 }, { "epoch": 12.570735650767986, "grad_norm": 0.3001886308193207, "learning_rate": 0.0003142683912691997, "loss": 0.6001, "step": 93300 }, { "epoch": 12.584209108057127, "grad_norm": 0.2788808345794678, "learning_rate": 0.00031460522770142816, "loss": 0.5992, "step": 93400 }, { "epoch": 12.597682565346268, "grad_norm": 0.2792122960090637, "learning_rate": 0.00031494206413365674, "loss": 0.6002, "step": 93500 }, { "epoch": 12.611156022635408, "grad_norm": 0.2758144736289978, "learning_rate": 0.0003152789005658852, "loss": 0.5991, "step": 93600 }, { "epoch": 12.624629479924549, "grad_norm": 0.2854858338832855, "learning_rate": 0.00031561573699811373, "loss": 0.5993, "step": 93700 }, { "epoch": 12.63810293721369, "grad_norm": 0.3688070476055145, "learning_rate": 0.0003159525734303422, "loss": 0.5982, "step": 93800 }, { "epoch": 12.651576394502829, "grad_norm": 0.29259446263313293, "learning_rate": 0.0003162894098625708, "loss": 0.598, "step": 93900 }, { "epoch": 12.66504985179197, "grad_norm": 0.31507542729377747, "learning_rate": 0.00031662624629479925, "loss": 0.5997, "step": 94000 }, { "epoch": 12.678523309081111, "grad_norm": 0.2972285747528076, "learning_rate": 0.0003169630827270278, "loss": 0.5989, "step": 94100 }, { "epoch": 12.69199676637025, "grad_norm": 0.29468822479248047, "learning_rate": 0.00031729991915925625, "loss": 0.5981, "step": 94200 }, { "epoch": 12.705470223659391, "grad_norm": 0.2802707552909851, "learning_rate": 0.0003176367555914848, "loss": 0.5984, "step": 94300 }, { "epoch": 12.71894368094853, "grad_norm": 0.290730744600296, "learning_rate": 0.0003179735920237133, "loss": 0.5979, "step": 94400 }, { "epoch": 12.732417138237672, "grad_norm": 0.30677369236946106, "learning_rate": 0.0003183104284559418, "loss": 0.5977, "step": 94500 }, { "epoch": 12.745890595526813, "grad_norm": 0.3296619951725006, "learning_rate": 0.0003186472648881703, "loss": 0.5978, "step": 94600 }, { "epoch": 12.759364052815952, "grad_norm": 0.2898562252521515, "learning_rate": 0.00031898410132039887, "loss": 0.5977, "step": 94700 }, { "epoch": 12.772837510105093, "grad_norm": 0.2900584638118744, "learning_rate": 0.00031932093775262734, "loss": 0.5983, "step": 94800 }, { "epoch": 12.786310967394233, "grad_norm": 0.28333956003189087, "learning_rate": 0.00031965777418485586, "loss": 0.5989, "step": 94900 }, { "epoch": 12.799784424683374, "grad_norm": 0.3064444065093994, "learning_rate": 0.00031999461061708433, "loss": 0.5988, "step": 95000 }, { "epoch": 12.813257881972515, "grad_norm": 0.290694922208786, "learning_rate": 0.00032033144704931286, "loss": 0.5995, "step": 95100 }, { "epoch": 12.826731339261654, "grad_norm": 0.2789267301559448, "learning_rate": 0.0003206682834815414, "loss": 0.598, "step": 95200 }, { "epoch": 12.840204796550795, "grad_norm": 0.27945077419281006, "learning_rate": 0.0003210051199137699, "loss": 0.5984, "step": 95300 }, { "epoch": 12.853678253839936, "grad_norm": 0.2967492640018463, "learning_rate": 0.0003213419563459984, "loss": 0.5973, "step": 95400 }, { "epoch": 12.867151711129075, "grad_norm": 0.28006336092948914, "learning_rate": 0.0003216787927782269, "loss": 0.598, "step": 95500 }, { "epoch": 12.880625168418216, "grad_norm": 0.2778092622756958, "learning_rate": 0.0003220156292104554, "loss": 0.5978, "step": 95600 }, { "epoch": 12.894098625707356, "grad_norm": 0.2718038856983185, "learning_rate": 0.00032235246564268395, "loss": 0.597, "step": 95700 }, { "epoch": 12.907572082996497, "grad_norm": 0.28917577862739563, "learning_rate": 0.0003226893020749124, "loss": 0.5974, "step": 95800 }, { "epoch": 12.921045540285638, "grad_norm": 0.29045143723487854, "learning_rate": 0.00032302613850714094, "loss": 0.5976, "step": 95900 }, { "epoch": 12.934518997574777, "grad_norm": 0.26743441820144653, "learning_rate": 0.00032336297493936947, "loss": 0.5964, "step": 96000 }, { "epoch": 12.947992454863918, "grad_norm": 0.29474613070487976, "learning_rate": 0.000323699811371598, "loss": 0.5975, "step": 96100 }, { "epoch": 12.96146591215306, "grad_norm": 0.30439284443855286, "learning_rate": 0.00032403664780382646, "loss": 0.5962, "step": 96200 }, { "epoch": 12.974939369442199, "grad_norm": 0.272291362285614, "learning_rate": 0.000324373484236055, "loss": 0.5971, "step": 96300 }, { "epoch": 12.98841282673134, "grad_norm": 0.2788887619972229, "learning_rate": 0.0003247103206682835, "loss": 0.597, "step": 96400 }, { "epoch": 13.0, "eval_loss": 0.5815761685371399, "eval_runtime": 4.9204, "eval_samples_per_second": 1016.184, "eval_steps_per_second": 16.056, "step": 96486 }, { "epoch": 13.001886284020479, "grad_norm": 0.2986142635345459, "learning_rate": 0.00032504715710051204, "loss": 0.5977, "step": 96500 }, { "epoch": 13.01535974130962, "grad_norm": 0.30748501420021057, "learning_rate": 0.0003253839935327405, "loss": 0.5963, "step": 96600 }, { "epoch": 13.028833198598761, "grad_norm": 0.2868187129497528, "learning_rate": 0.00032572082996496903, "loss": 0.5963, "step": 96700 }, { "epoch": 13.0423066558879, "grad_norm": 0.2851922810077667, "learning_rate": 0.00032605766639719755, "loss": 0.5956, "step": 96800 }, { "epoch": 13.055780113177041, "grad_norm": 0.29730185866355896, "learning_rate": 0.000326394502829426, "loss": 0.5963, "step": 96900 }, { "epoch": 13.06925357046618, "grad_norm": 0.2793130874633789, "learning_rate": 0.00032673133926165455, "loss": 0.5947, "step": 97000 }, { "epoch": 13.082727027755322, "grad_norm": 0.2747945487499237, "learning_rate": 0.000327068175693883, "loss": 0.596, "step": 97100 }, { "epoch": 13.096200485044463, "grad_norm": 0.2810925841331482, "learning_rate": 0.00032740501212611154, "loss": 0.5964, "step": 97200 }, { "epoch": 13.109673942333602, "grad_norm": 0.27531686425209045, "learning_rate": 0.00032774184855834007, "loss": 0.5963, "step": 97300 }, { "epoch": 13.123147399622743, "grad_norm": 0.3010106086730957, "learning_rate": 0.0003280786849905686, "loss": 0.596, "step": 97400 }, { "epoch": 13.136620856911884, "grad_norm": 0.2682129740715027, "learning_rate": 0.00032841552142279706, "loss": 0.5955, "step": 97500 }, { "epoch": 13.150094314201024, "grad_norm": 0.2802772521972656, "learning_rate": 0.0003287523578550256, "loss": 0.5953, "step": 97600 }, { "epoch": 13.163567771490165, "grad_norm": 0.2761726677417755, "learning_rate": 0.0003290891942872541, "loss": 0.5945, "step": 97700 }, { "epoch": 13.177041228779304, "grad_norm": 0.2829822301864624, "learning_rate": 0.00032942603071948264, "loss": 0.5959, "step": 97800 }, { "epoch": 13.190514686068445, "grad_norm": 0.28405165672302246, "learning_rate": 0.0003297628671517111, "loss": 0.5957, "step": 97900 }, { "epoch": 13.203988143357586, "grad_norm": 0.2933911085128784, "learning_rate": 0.00033009970358393963, "loss": 0.5948, "step": 98000 }, { "epoch": 13.217461600646725, "grad_norm": 0.26997318863868713, "learning_rate": 0.00033043654001616815, "loss": 0.5957, "step": 98100 }, { "epoch": 13.230935057935866, "grad_norm": 0.2741181254386902, "learning_rate": 0.0003307733764483967, "loss": 0.5941, "step": 98200 }, { "epoch": 13.244408515225008, "grad_norm": 0.2807440757751465, "learning_rate": 0.00033111021288062515, "loss": 0.5946, "step": 98300 }, { "epoch": 13.257881972514147, "grad_norm": 0.3012031614780426, "learning_rate": 0.00033144704931285367, "loss": 0.5948, "step": 98400 }, { "epoch": 13.271355429803288, "grad_norm": 0.28123989701271057, "learning_rate": 0.0003317838857450822, "loss": 0.595, "step": 98500 }, { "epoch": 13.284828887092427, "grad_norm": 0.26498404145240784, "learning_rate": 0.0003321207221773107, "loss": 0.5949, "step": 98600 }, { "epoch": 13.298302344381568, "grad_norm": 0.27566659450531006, "learning_rate": 0.0003324575586095392, "loss": 0.5946, "step": 98700 }, { "epoch": 13.31177580167071, "grad_norm": 0.2769579589366913, "learning_rate": 0.0003327943950417677, "loss": 0.5948, "step": 98800 }, { "epoch": 13.325249258959849, "grad_norm": 0.2723526656627655, "learning_rate": 0.0003331312314739962, "loss": 0.5946, "step": 98900 }, { "epoch": 13.33872271624899, "grad_norm": 0.30759960412979126, "learning_rate": 0.00033346806790622476, "loss": 0.5945, "step": 99000 }, { "epoch": 13.35219617353813, "grad_norm": 0.27853190898895264, "learning_rate": 0.00033380490433845323, "loss": 0.5949, "step": 99100 }, { "epoch": 13.36566963082727, "grad_norm": 0.2885538339614868, "learning_rate": 0.00033414174077068176, "loss": 0.595, "step": 99200 }, { "epoch": 13.379143088116411, "grad_norm": 0.28033170104026794, "learning_rate": 0.00033447857720291023, "loss": 0.5949, "step": 99300 }, { "epoch": 13.39261654540555, "grad_norm": 0.2706429660320282, "learning_rate": 0.0003348154136351388, "loss": 0.5946, "step": 99400 }, { "epoch": 13.406090002694691, "grad_norm": 0.29855814576148987, "learning_rate": 0.0003351522500673673, "loss": 0.5934, "step": 99500 }, { "epoch": 13.419563459983832, "grad_norm": 0.2529764473438263, "learning_rate": 0.0003354890864995958, "loss": 0.5943, "step": 99600 }, { "epoch": 13.433036917272972, "grad_norm": 0.2868923246860504, "learning_rate": 0.00033582592293182427, "loss": 0.5939, "step": 99700 }, { "epoch": 13.446510374562113, "grad_norm": 0.2690979242324829, "learning_rate": 0.00033616275936405285, "loss": 0.5935, "step": 99800 }, { "epoch": 13.459983831851254, "grad_norm": 0.2868712842464447, "learning_rate": 0.0003364995957962813, "loss": 0.5934, "step": 99900 }, { "epoch": 13.473457289140393, "grad_norm": 0.26075223088264465, "learning_rate": 0.00033683643222850985, "loss": 0.5947, "step": 100000 }, { "epoch": 13.486930746429534, "grad_norm": 0.2691114544868469, "learning_rate": 0.0003371732686607383, "loss": 0.5933, "step": 100100 }, { "epoch": 13.500404203718674, "grad_norm": 0.2856033742427826, "learning_rate": 0.0003375101050929669, "loss": 0.5935, "step": 100200 }, { "epoch": 13.513877661007815, "grad_norm": 0.276174396276474, "learning_rate": 0.00033784694152519536, "loss": 0.5933, "step": 100300 }, { "epoch": 13.527351118296956, "grad_norm": 0.29007798433303833, "learning_rate": 0.0003381837779574239, "loss": 0.5923, "step": 100400 }, { "epoch": 13.540824575586095, "grad_norm": 0.2762569785118103, "learning_rate": 0.00033852061438965236, "loss": 0.5934, "step": 100500 }, { "epoch": 13.554298032875236, "grad_norm": 0.26516565680503845, "learning_rate": 0.0003388574508218809, "loss": 0.593, "step": 100600 }, { "epoch": 13.567771490164375, "grad_norm": 0.306970477104187, "learning_rate": 0.0003391942872541094, "loss": 0.593, "step": 100700 }, { "epoch": 13.581244947453516, "grad_norm": 0.27424800395965576, "learning_rate": 0.00033953112368633793, "loss": 0.5924, "step": 100800 }, { "epoch": 13.594718404742657, "grad_norm": 0.25659918785095215, "learning_rate": 0.0003398679601185664, "loss": 0.593, "step": 100900 }, { "epoch": 13.608191862031797, "grad_norm": 0.28753799200057983, "learning_rate": 0.0003402047965507949, "loss": 0.5939, "step": 101000 }, { "epoch": 13.621665319320938, "grad_norm": 0.264877587556839, "learning_rate": 0.00034054163298302345, "loss": 0.5939, "step": 101100 }, { "epoch": 13.635138776610079, "grad_norm": 0.28194770216941833, "learning_rate": 0.000340878469415252, "loss": 0.593, "step": 101200 }, { "epoch": 13.648612233899218, "grad_norm": 0.26084813475608826, "learning_rate": 0.00034121530584748044, "loss": 0.5923, "step": 101300 }, { "epoch": 13.66208569118836, "grad_norm": 0.26193830370903015, "learning_rate": 0.00034155214227970897, "loss": 0.5932, "step": 101400 }, { "epoch": 13.675559148477499, "grad_norm": 0.27431565523147583, "learning_rate": 0.0003418889787119375, "loss": 0.5933, "step": 101500 }, { "epoch": 13.68903260576664, "grad_norm": 0.26302945613861084, "learning_rate": 0.000342225815144166, "loss": 0.5924, "step": 101600 }, { "epoch": 13.70250606305578, "grad_norm": 0.26278284192085266, "learning_rate": 0.0003425626515763945, "loss": 0.5921, "step": 101700 }, { "epoch": 13.71597952034492, "grad_norm": 0.26920434832572937, "learning_rate": 0.000342899488008623, "loss": 0.5934, "step": 101800 }, { "epoch": 13.729452977634061, "grad_norm": 0.2837969660758972, "learning_rate": 0.00034323632444085154, "loss": 0.5927, "step": 101900 }, { "epoch": 13.742926434923202, "grad_norm": 0.26164159178733826, "learning_rate": 0.00034357316087308006, "loss": 0.5921, "step": 102000 }, { "epoch": 13.756399892212341, "grad_norm": 0.274332195520401, "learning_rate": 0.00034390999730530853, "loss": 0.593, "step": 102100 }, { "epoch": 13.769873349501482, "grad_norm": 0.2672336995601654, "learning_rate": 0.00034424683373753705, "loss": 0.592, "step": 102200 }, { "epoch": 13.783346806790622, "grad_norm": 0.2606847584247589, "learning_rate": 0.0003445836701697655, "loss": 0.5927, "step": 102300 }, { "epoch": 13.796820264079763, "grad_norm": 0.30221912264823914, "learning_rate": 0.0003449205066019941, "loss": 0.5911, "step": 102400 }, { "epoch": 13.810293721368904, "grad_norm": 0.27577823400497437, "learning_rate": 0.0003452573430342226, "loss": 0.5927, "step": 102500 }, { "epoch": 13.823767178658043, "grad_norm": 0.2642453908920288, "learning_rate": 0.0003455941794664511, "loss": 0.5913, "step": 102600 }, { "epoch": 13.837240635947184, "grad_norm": 0.2853885591030121, "learning_rate": 0.00034593101589867957, "loss": 0.5914, "step": 102700 }, { "epoch": 13.850714093236324, "grad_norm": 0.27795684337615967, "learning_rate": 0.00034626785233090815, "loss": 0.5924, "step": 102800 }, { "epoch": 13.864187550525465, "grad_norm": 0.2626563012599945, "learning_rate": 0.0003466046887631366, "loss": 0.5912, "step": 102900 }, { "epoch": 13.877661007814606, "grad_norm": 0.3205104470252991, "learning_rate": 0.00034694152519536514, "loss": 0.5916, "step": 103000 }, { "epoch": 13.891134465103745, "grad_norm": 0.2630579471588135, "learning_rate": 0.0003472783616275936, "loss": 0.5917, "step": 103100 }, { "epoch": 13.904607922392886, "grad_norm": 0.2654939293861389, "learning_rate": 0.0003476151980598222, "loss": 0.5912, "step": 103200 }, { "epoch": 13.918081379682027, "grad_norm": 0.2711382210254669, "learning_rate": 0.00034795203449205066, "loss": 0.5913, "step": 103300 }, { "epoch": 13.931554836971166, "grad_norm": 0.25285041332244873, "learning_rate": 0.0003482888709242792, "loss": 0.5915, "step": 103400 }, { "epoch": 13.945028294260307, "grad_norm": 0.26497402787208557, "learning_rate": 0.00034862570735650765, "loss": 0.5905, "step": 103500 }, { "epoch": 13.958501751549448, "grad_norm": 0.30702584981918335, "learning_rate": 0.00034896254378873623, "loss": 0.5917, "step": 103600 }, { "epoch": 13.971975208838588, "grad_norm": 0.2628515064716339, "learning_rate": 0.0003492993802209647, "loss": 0.591, "step": 103700 }, { "epoch": 13.985448666127729, "grad_norm": 0.3147718906402588, "learning_rate": 0.0003496362166531932, "loss": 0.5902, "step": 103800 }, { "epoch": 13.998922123416868, "grad_norm": 0.2741641700267792, "learning_rate": 0.0003499730530854217, "loss": 0.5908, "step": 103900 }, { "epoch": 14.0, "eval_loss": 0.5761825442314148, "eval_runtime": 4.9364, "eval_samples_per_second": 1012.894, "eval_steps_per_second": 16.004, "step": 103908 }, { "epoch": 14.01239558070601, "grad_norm": 0.250044584274292, "learning_rate": 0.0003503098895176503, "loss": 0.5904, "step": 104000 }, { "epoch": 14.02586903799515, "grad_norm": 0.26357126235961914, "learning_rate": 0.00035064672594987875, "loss": 0.5892, "step": 104100 }, { "epoch": 14.03934249528429, "grad_norm": 0.25762152671813965, "learning_rate": 0.00035098356238210727, "loss": 0.5901, "step": 104200 }, { "epoch": 14.05281595257343, "grad_norm": 0.2631015181541443, "learning_rate": 0.00035132039881433574, "loss": 0.59, "step": 104300 }, { "epoch": 14.06628940986257, "grad_norm": 0.2520255446434021, "learning_rate": 0.00035165723524656426, "loss": 0.5897, "step": 104400 }, { "epoch": 14.079762867151711, "grad_norm": 0.2666439116001129, "learning_rate": 0.0003519940716787928, "loss": 0.5901, "step": 104500 }, { "epoch": 14.093236324440852, "grad_norm": 0.2874584197998047, "learning_rate": 0.0003523309081110213, "loss": 0.5903, "step": 104600 }, { "epoch": 14.106709781729991, "grad_norm": 0.266944020986557, "learning_rate": 0.0003526677445432498, "loss": 0.5891, "step": 104700 }, { "epoch": 14.120183239019132, "grad_norm": 0.2645256817340851, "learning_rate": 0.0003530045809754783, "loss": 0.5886, "step": 104800 }, { "epoch": 14.133656696308273, "grad_norm": 0.2487623542547226, "learning_rate": 0.00035334141740770683, "loss": 0.59, "step": 104900 }, { "epoch": 14.147130153597413, "grad_norm": 0.2520279288291931, "learning_rate": 0.00035367825383993536, "loss": 0.5891, "step": 105000 }, { "epoch": 14.160603610886554, "grad_norm": 0.26101595163345337, "learning_rate": 0.0003540150902721638, "loss": 0.5889, "step": 105100 }, { "epoch": 14.174077068175693, "grad_norm": 0.27361974120140076, "learning_rate": 0.00035435192670439235, "loss": 0.5891, "step": 105200 }, { "epoch": 14.187550525464834, "grad_norm": 0.2688561677932739, "learning_rate": 0.0003546887631366209, "loss": 0.5895, "step": 105300 }, { "epoch": 14.201023982753975, "grad_norm": 0.25963136553764343, "learning_rate": 0.0003550255995688494, "loss": 0.5887, "step": 105400 }, { "epoch": 14.214497440043115, "grad_norm": 0.24969345331192017, "learning_rate": 0.00035536243600107787, "loss": 0.5882, "step": 105500 }, { "epoch": 14.227970897332256, "grad_norm": 0.28187185525894165, "learning_rate": 0.0003556992724333064, "loss": 0.5896, "step": 105600 }, { "epoch": 14.241444354621397, "grad_norm": 0.25488176941871643, "learning_rate": 0.0003560361088655349, "loss": 0.5883, "step": 105700 }, { "epoch": 14.254917811910536, "grad_norm": 0.2506967782974243, "learning_rate": 0.00035637294529776344, "loss": 0.5891, "step": 105800 }, { "epoch": 14.268391269199677, "grad_norm": 0.25195610523223877, "learning_rate": 0.0003567097817299919, "loss": 0.5894, "step": 105900 }, { "epoch": 14.281864726488816, "grad_norm": 0.2671971917152405, "learning_rate": 0.00035704661816222044, "loss": 0.5887, "step": 106000 }, { "epoch": 14.295338183777957, "grad_norm": 0.2650025188922882, "learning_rate": 0.0003573834545944489, "loss": 0.5892, "step": 106100 }, { "epoch": 14.308811641067098, "grad_norm": 0.28284475207328796, "learning_rate": 0.0003577202910266775, "loss": 0.5885, "step": 106200 }, { "epoch": 14.322285098356238, "grad_norm": 0.33997413516044617, "learning_rate": 0.00035805712745890596, "loss": 0.5884, "step": 106300 }, { "epoch": 14.335758555645379, "grad_norm": 0.24823813140392303, "learning_rate": 0.0003583939638911345, "loss": 0.5887, "step": 106400 }, { "epoch": 14.349232012934518, "grad_norm": 0.2505185008049011, "learning_rate": 0.00035873080032336295, "loss": 0.588, "step": 106500 }, { "epoch": 14.36270547022366, "grad_norm": 0.25902891159057617, "learning_rate": 0.00035906763675559153, "loss": 0.5876, "step": 106600 }, { "epoch": 14.3761789275128, "grad_norm": 0.26897504925727844, "learning_rate": 0.00035940447318782, "loss": 0.59, "step": 106700 }, { "epoch": 14.38965238480194, "grad_norm": 0.25394201278686523, "learning_rate": 0.0003597413096200485, "loss": 0.5897, "step": 106800 }, { "epoch": 14.40312584209108, "grad_norm": 0.2601204812526703, "learning_rate": 0.000360078146052277, "loss": 0.588, "step": 106900 }, { "epoch": 14.416599299380222, "grad_norm": 0.24837781488895416, "learning_rate": 0.00036041498248450557, "loss": 0.5884, "step": 107000 }, { "epoch": 14.430072756669361, "grad_norm": 0.26543179154396057, "learning_rate": 0.00036075181891673404, "loss": 0.5879, "step": 107100 }, { "epoch": 14.443546213958502, "grad_norm": 0.24572435021400452, "learning_rate": 0.00036108865534896257, "loss": 0.5894, "step": 107200 }, { "epoch": 14.457019671247641, "grad_norm": 0.2695096731185913, "learning_rate": 0.00036142549178119104, "loss": 0.5891, "step": 107300 }, { "epoch": 14.470493128536782, "grad_norm": 0.2589077353477478, "learning_rate": 0.0003617623282134196, "loss": 0.5893, "step": 107400 }, { "epoch": 14.483966585825923, "grad_norm": 0.25971105694770813, "learning_rate": 0.0003620991646456481, "loss": 0.5874, "step": 107500 }, { "epoch": 14.497440043115063, "grad_norm": 0.2521766722202301, "learning_rate": 0.0003624360010778766, "loss": 0.5897, "step": 107600 }, { "epoch": 14.510913500404204, "grad_norm": 0.3031100630760193, "learning_rate": 0.0003627728375101051, "loss": 0.5892, "step": 107700 }, { "epoch": 14.524386957693345, "grad_norm": 0.2625836431980133, "learning_rate": 0.0003631096739423336, "loss": 0.5888, "step": 107800 }, { "epoch": 14.537860414982484, "grad_norm": 0.23873884975910187, "learning_rate": 0.00036344651037456213, "loss": 0.5879, "step": 107900 }, { "epoch": 14.551333872271625, "grad_norm": 0.25986090302467346, "learning_rate": 0.00036378334680679065, "loss": 0.5874, "step": 108000 }, { "epoch": 14.564807329560765, "grad_norm": 0.25480014085769653, "learning_rate": 0.0003641201832390191, "loss": 0.5875, "step": 108100 }, { "epoch": 14.578280786849906, "grad_norm": 0.2615572512149811, "learning_rate": 0.00036445701967124765, "loss": 0.5888, "step": 108200 }, { "epoch": 14.591754244139047, "grad_norm": 0.23539955914020538, "learning_rate": 0.00036479385610347617, "loss": 0.5871, "step": 108300 }, { "epoch": 14.605227701428186, "grad_norm": 0.2459517866373062, "learning_rate": 0.0003651306925357047, "loss": 0.5874, "step": 108400 }, { "epoch": 14.618701158717327, "grad_norm": 0.2499772608280182, "learning_rate": 0.00036546752896793317, "loss": 0.5873, "step": 108500 }, { "epoch": 14.632174616006468, "grad_norm": 0.25050538778305054, "learning_rate": 0.0003658043654001617, "loss": 0.5881, "step": 108600 }, { "epoch": 14.645648073295607, "grad_norm": 0.2646254003047943, "learning_rate": 0.0003661412018323902, "loss": 0.5881, "step": 108700 }, { "epoch": 14.659121530584748, "grad_norm": 0.2541857361793518, "learning_rate": 0.00036647803826461874, "loss": 0.5881, "step": 108800 }, { "epoch": 14.672594987873888, "grad_norm": 0.24874447286128998, "learning_rate": 0.0003668148746968472, "loss": 0.5875, "step": 108900 }, { "epoch": 14.686068445163029, "grad_norm": 0.24940191209316254, "learning_rate": 0.00036715171112907573, "loss": 0.5872, "step": 109000 }, { "epoch": 14.69954190245217, "grad_norm": 0.27157846093177795, "learning_rate": 0.00036748854756130426, "loss": 0.588, "step": 109100 }, { "epoch": 14.71301535974131, "grad_norm": 0.24967309832572937, "learning_rate": 0.0003678253839935328, "loss": 0.5871, "step": 109200 }, { "epoch": 14.72648881703045, "grad_norm": 0.25281238555908203, "learning_rate": 0.00036816222042576125, "loss": 0.5885, "step": 109300 }, { "epoch": 14.739962274319591, "grad_norm": 0.2552086412906647, "learning_rate": 0.0003684990568579898, "loss": 0.5865, "step": 109400 }, { "epoch": 14.75343573160873, "grad_norm": 0.24793165922164917, "learning_rate": 0.00036883589329021825, "loss": 0.5884, "step": 109500 }, { "epoch": 14.766909188897872, "grad_norm": 0.25716647505760193, "learning_rate": 0.0003691727297224468, "loss": 0.587, "step": 109600 }, { "epoch": 14.780382646187011, "grad_norm": 0.26977935433387756, "learning_rate": 0.0003695095661546753, "loss": 0.5875, "step": 109700 }, { "epoch": 14.793856103476152, "grad_norm": 0.26242998242378235, "learning_rate": 0.0003698464025869038, "loss": 0.5868, "step": 109800 }, { "epoch": 14.807329560765293, "grad_norm": 0.24154193699359894, "learning_rate": 0.0003701832390191323, "loss": 0.5877, "step": 109900 }, { "epoch": 14.820803018054432, "grad_norm": 0.2609753906726837, "learning_rate": 0.00037052007545136087, "loss": 0.5875, "step": 110000 }, { "epoch": 14.834276475343573, "grad_norm": 0.24594362080097198, "learning_rate": 0.00037085691188358934, "loss": 0.5872, "step": 110100 }, { "epoch": 14.847749932632713, "grad_norm": 0.25846558809280396, "learning_rate": 0.00037119374831581786, "loss": 0.586, "step": 110200 }, { "epoch": 14.861223389921854, "grad_norm": 0.2533414661884308, "learning_rate": 0.00037153058474804633, "loss": 0.5877, "step": 110300 }, { "epoch": 14.874696847210995, "grad_norm": 0.23614071309566498, "learning_rate": 0.0003718674211802749, "loss": 0.5851, "step": 110400 }, { "epoch": 14.888170304500134, "grad_norm": 0.2678276300430298, "learning_rate": 0.0003722042576125034, "loss": 0.5857, "step": 110500 }, { "epoch": 14.901643761789275, "grad_norm": 0.2512679398059845, "learning_rate": 0.0003725410940447319, "loss": 0.5859, "step": 110600 }, { "epoch": 14.915117219078416, "grad_norm": 0.23398956656455994, "learning_rate": 0.0003728779304769604, "loss": 0.5865, "step": 110700 }, { "epoch": 14.928590676367556, "grad_norm": 0.2608170807361603, "learning_rate": 0.00037321476690918895, "loss": 0.5857, "step": 110800 }, { "epoch": 14.942064133656697, "grad_norm": 0.259620726108551, "learning_rate": 0.0003735516033414174, "loss": 0.5868, "step": 110900 }, { "epoch": 14.955537590945836, "grad_norm": 0.24535582959651947, "learning_rate": 0.00037388843977364595, "loss": 0.5864, "step": 111000 }, { "epoch": 14.969011048234977, "grad_norm": 0.24334117770195007, "learning_rate": 0.0003742252762058744, "loss": 0.5867, "step": 111100 }, { "epoch": 14.982484505524118, "grad_norm": 0.23684737086296082, "learning_rate": 0.000374562112638103, "loss": 0.5861, "step": 111200 }, { "epoch": 14.995957962813257, "grad_norm": 0.2213577777147293, "learning_rate": 0.00037489894907033147, "loss": 0.5862, "step": 111300 }, { "epoch": 15.0, "eval_loss": 0.5725352168083191, "eval_runtime": 4.9493, "eval_samples_per_second": 1010.243, "eval_steps_per_second": 15.962, "step": 111330 }, { "epoch": 15.009431420102398, "grad_norm": 0.2673628032207489, "learning_rate": 0.00037523578550256, "loss": 0.585, "step": 111400 }, { "epoch": 15.02290487739154, "grad_norm": 0.27163419127464294, "learning_rate": 0.00037557262193478846, "loss": 0.5858, "step": 111500 }, { "epoch": 15.036378334680679, "grad_norm": 0.24414892494678497, "learning_rate": 0.000375909458367017, "loss": 0.5854, "step": 111600 }, { "epoch": 15.04985179196982, "grad_norm": 0.2457321286201477, "learning_rate": 0.0003762462947992455, "loss": 0.5851, "step": 111700 }, { "epoch": 15.063325249258959, "grad_norm": 0.22743983566761017, "learning_rate": 0.00037658313123147403, "loss": 0.5854, "step": 111800 }, { "epoch": 15.0767987065481, "grad_norm": 0.29738926887512207, "learning_rate": 0.0003769199676637025, "loss": 0.5862, "step": 111900 }, { "epoch": 15.090272163837241, "grad_norm": 0.24945063889026642, "learning_rate": 0.00037725680409593103, "loss": 0.5848, "step": 112000 }, { "epoch": 15.10374562112638, "grad_norm": 0.23613038659095764, "learning_rate": 0.00037759364052815955, "loss": 0.585, "step": 112100 }, { "epoch": 15.117219078415522, "grad_norm": 0.2557319700717926, "learning_rate": 0.0003779304769603881, "loss": 0.5857, "step": 112200 }, { "epoch": 15.130692535704663, "grad_norm": 0.23350803554058075, "learning_rate": 0.00037826731339261655, "loss": 0.5848, "step": 112300 }, { "epoch": 15.144165992993802, "grad_norm": 0.23981426656246185, "learning_rate": 0.00037860414982484507, "loss": 0.5845, "step": 112400 }, { "epoch": 15.157639450282943, "grad_norm": 0.24413146078586578, "learning_rate": 0.0003789409862570736, "loss": 0.5848, "step": 112500 }, { "epoch": 15.171112907572082, "grad_norm": 0.24592573940753937, "learning_rate": 0.0003792778226893021, "loss": 0.5856, "step": 112600 }, { "epoch": 15.184586364861223, "grad_norm": 0.2301064431667328, "learning_rate": 0.0003796146591215306, "loss": 0.5839, "step": 112700 }, { "epoch": 15.198059822150364, "grad_norm": 0.239373579621315, "learning_rate": 0.0003799514955537591, "loss": 0.5849, "step": 112800 }, { "epoch": 15.211533279439504, "grad_norm": 0.2549658417701721, "learning_rate": 0.00038028833198598764, "loss": 0.5856, "step": 112900 }, { "epoch": 15.225006736728645, "grad_norm": 0.2565166652202606, "learning_rate": 0.0003806251684182161, "loss": 0.584, "step": 113000 }, { "epoch": 15.238480194017786, "grad_norm": 0.29910749197006226, "learning_rate": 0.00038096200485044463, "loss": 0.5859, "step": 113100 }, { "epoch": 15.251953651306925, "grad_norm": 0.24575622379779816, "learning_rate": 0.0003812988412826731, "loss": 0.585, "step": 113200 }, { "epoch": 15.265427108596066, "grad_norm": 0.24341675639152527, "learning_rate": 0.00038163567771490163, "loss": 0.5841, "step": 113300 }, { "epoch": 15.278900565885206, "grad_norm": 0.23660878837108612, "learning_rate": 0.00038197251414713015, "loss": 0.5848, "step": 113400 }, { "epoch": 15.292374023174347, "grad_norm": 0.2506781220436096, "learning_rate": 0.0003823093505793587, "loss": 0.5852, "step": 113500 }, { "epoch": 15.305847480463488, "grad_norm": 0.2417128086090088, "learning_rate": 0.00038264618701158715, "loss": 0.5843, "step": 113600 }, { "epoch": 15.319320937752627, "grad_norm": 0.25124529004096985, "learning_rate": 0.00038298302344381567, "loss": 0.5846, "step": 113700 }, { "epoch": 15.332794395041768, "grad_norm": 0.23341356217861176, "learning_rate": 0.0003833198598760442, "loss": 0.5839, "step": 113800 }, { "epoch": 15.346267852330907, "grad_norm": 0.24085913598537445, "learning_rate": 0.0003836566963082727, "loss": 0.5831, "step": 113900 }, { "epoch": 15.359741309620048, "grad_norm": 0.24675828218460083, "learning_rate": 0.0003839935327405012, "loss": 0.5841, "step": 114000 }, { "epoch": 15.37321476690919, "grad_norm": 0.2510143518447876, "learning_rate": 0.0003843303691727297, "loss": 0.5844, "step": 114100 }, { "epoch": 15.386688224198329, "grad_norm": 0.2701342701911926, "learning_rate": 0.00038466720560495824, "loss": 0.5832, "step": 114200 }, { "epoch": 15.40016168148747, "grad_norm": 0.24650222063064575, "learning_rate": 0.00038500404203718676, "loss": 0.5843, "step": 114300 }, { "epoch": 15.41363513877661, "grad_norm": 0.2266974151134491, "learning_rate": 0.00038534087846941523, "loss": 0.5834, "step": 114400 }, { "epoch": 15.42710859606575, "grad_norm": 0.23567084968090057, "learning_rate": 0.00038567771490164376, "loss": 0.5833, "step": 114500 }, { "epoch": 15.440582053354891, "grad_norm": 0.2436644285917282, "learning_rate": 0.0003860145513338723, "loss": 0.584, "step": 114600 }, { "epoch": 15.45405551064403, "grad_norm": 0.2424221932888031, "learning_rate": 0.0003863513877661008, "loss": 0.5838, "step": 114700 }, { "epoch": 15.467528967933172, "grad_norm": 0.26299792528152466, "learning_rate": 0.0003866882241983293, "loss": 0.5835, "step": 114800 }, { "epoch": 15.481002425222313, "grad_norm": 0.28555402159690857, "learning_rate": 0.0003870250606305578, "loss": 0.5835, "step": 114900 }, { "epoch": 15.494475882511452, "grad_norm": 0.22111740708351135, "learning_rate": 0.00038736189706278627, "loss": 0.5821, "step": 115000 }, { "epoch": 15.507949339800593, "grad_norm": 0.23244677484035492, "learning_rate": 0.00038769873349501485, "loss": 0.5834, "step": 115100 }, { "epoch": 15.521422797089734, "grad_norm": 0.24694505333900452, "learning_rate": 0.0003880355699272433, "loss": 0.5828, "step": 115200 }, { "epoch": 15.534896254378873, "grad_norm": 0.23246483504772186, "learning_rate": 0.00038837240635947184, "loss": 0.5831, "step": 115300 }, { "epoch": 15.548369711668014, "grad_norm": 0.24365293979644775, "learning_rate": 0.0003887092427917003, "loss": 0.5829, "step": 115400 }, { "epoch": 15.561843168957154, "grad_norm": 0.23336605727672577, "learning_rate": 0.0003890460792239289, "loss": 0.5836, "step": 115500 }, { "epoch": 15.575316626246295, "grad_norm": 0.2605874240398407, "learning_rate": 0.00038938291565615736, "loss": 0.5832, "step": 115600 }, { "epoch": 15.588790083535436, "grad_norm": 0.2375386506319046, "learning_rate": 0.0003897197520883859, "loss": 0.5846, "step": 115700 }, { "epoch": 15.602263540824575, "grad_norm": 0.23535482585430145, "learning_rate": 0.00039005658852061436, "loss": 0.5825, "step": 115800 }, { "epoch": 15.615736998113716, "grad_norm": 0.24907973408699036, "learning_rate": 0.00039039342495284293, "loss": 0.5844, "step": 115900 }, { "epoch": 15.629210455402855, "grad_norm": 0.22645050287246704, "learning_rate": 0.0003907302613850714, "loss": 0.5838, "step": 116000 }, { "epoch": 15.642683912691997, "grad_norm": 0.22864055633544922, "learning_rate": 0.00039106709781729993, "loss": 0.5832, "step": 116100 }, { "epoch": 15.656157369981138, "grad_norm": 0.24937167763710022, "learning_rate": 0.0003914039342495284, "loss": 0.5829, "step": 116200 }, { "epoch": 15.669630827270277, "grad_norm": 0.23465536534786224, "learning_rate": 0.000391740770681757, "loss": 0.5818, "step": 116300 }, { "epoch": 15.683104284559418, "grad_norm": 0.234084814786911, "learning_rate": 0.00039207760711398545, "loss": 0.5815, "step": 116400 }, { "epoch": 15.696577741848559, "grad_norm": 0.2288799285888672, "learning_rate": 0.00039241444354621397, "loss": 0.583, "step": 116500 }, { "epoch": 15.710051199137698, "grad_norm": 0.2244740128517151, "learning_rate": 0.00039275127997844244, "loss": 0.5826, "step": 116600 }, { "epoch": 15.72352465642684, "grad_norm": 0.23420396447181702, "learning_rate": 0.00039308811641067097, "loss": 0.5823, "step": 116700 }, { "epoch": 15.73699811371598, "grad_norm": 0.2523967921733856, "learning_rate": 0.0003934249528428995, "loss": 0.5821, "step": 116800 }, { "epoch": 15.75047157100512, "grad_norm": 0.23723213374614716, "learning_rate": 0.000393761789275128, "loss": 0.583, "step": 116900 }, { "epoch": 15.76394502829426, "grad_norm": 0.23717987537384033, "learning_rate": 0.0003940986257073565, "loss": 0.5826, "step": 117000 }, { "epoch": 15.7774184855834, "grad_norm": 0.225616455078125, "learning_rate": 0.000394435462139585, "loss": 0.5821, "step": 117100 }, { "epoch": 15.790891942872541, "grad_norm": 0.2519909143447876, "learning_rate": 0.00039477229857181353, "loss": 0.582, "step": 117200 }, { "epoch": 15.804365400161682, "grad_norm": 0.22680556774139404, "learning_rate": 0.00039510913500404206, "loss": 0.5837, "step": 117300 }, { "epoch": 15.817838857450822, "grad_norm": 0.2766021192073822, "learning_rate": 0.00039544597143627053, "loss": 0.5828, "step": 117400 }, { "epoch": 15.831312314739963, "grad_norm": 0.22796252369880676, "learning_rate": 0.00039578280786849905, "loss": 0.5822, "step": 117500 }, { "epoch": 15.844785772029102, "grad_norm": 0.262277752161026, "learning_rate": 0.0003961196443007276, "loss": 0.5822, "step": 117600 }, { "epoch": 15.858259229318243, "grad_norm": 0.22592703998088837, "learning_rate": 0.0003964564807329561, "loss": 0.5816, "step": 117700 }, { "epoch": 15.871732686607384, "grad_norm": 0.2287895381450653, "learning_rate": 0.00039679331716518457, "loss": 0.5824, "step": 117800 }, { "epoch": 15.885206143896523, "grad_norm": 0.23615659773349762, "learning_rate": 0.0003971301535974131, "loss": 0.5829, "step": 117900 }, { "epoch": 15.898679601185664, "grad_norm": 0.2578393816947937, "learning_rate": 0.0003974669900296416, "loss": 0.5824, "step": 118000 }, { "epoch": 15.912153058474805, "grad_norm": 0.22683760523796082, "learning_rate": 0.00039780382646187014, "loss": 0.5825, "step": 118100 }, { "epoch": 15.925626515763945, "grad_norm": 0.21271440386772156, "learning_rate": 0.0003981406628940986, "loss": 0.582, "step": 118200 }, { "epoch": 15.939099973053086, "grad_norm": 0.23060791194438934, "learning_rate": 0.00039847749932632714, "loss": 0.5813, "step": 118300 }, { "epoch": 15.952573430342225, "grad_norm": 0.25328826904296875, "learning_rate": 0.0003988143357585556, "loss": 0.5822, "step": 118400 }, { "epoch": 15.966046887631366, "grad_norm": 0.22637681663036346, "learning_rate": 0.0003991511721907842, "loss": 0.5807, "step": 118500 }, { "epoch": 15.979520344920507, "grad_norm": 0.2183547168970108, "learning_rate": 0.00039948800862301266, "loss": 0.5819, "step": 118600 }, { "epoch": 15.992993802209647, "grad_norm": 0.23710979521274567, "learning_rate": 0.0003998248450552412, "loss": 0.5812, "step": 118700 }, { "epoch": 16.0, "eval_loss": 0.5677627921104431, "eval_runtime": 4.9322, "eval_samples_per_second": 1013.741, "eval_steps_per_second": 16.017, "step": 118752 }, { "epoch": 16.006467259498788, "grad_norm": 0.23591957986354828, "learning_rate": 0.00040016168148746965, "loss": 0.5817, "step": 118800 }, { "epoch": 16.01994071678793, "grad_norm": 0.2597757875919342, "learning_rate": 0.00040049851791969823, "loss": 0.5813, "step": 118900 }, { "epoch": 16.03341417407707, "grad_norm": 0.23238997161388397, "learning_rate": 0.0004008353543519267, "loss": 0.5795, "step": 119000 }, { "epoch": 16.046887631366207, "grad_norm": 0.25627148151397705, "learning_rate": 0.0004011721907841552, "loss": 0.5802, "step": 119100 }, { "epoch": 16.06036108865535, "grad_norm": 0.2721649706363678, "learning_rate": 0.0004015090272163837, "loss": 0.5807, "step": 119200 }, { "epoch": 16.07383454594449, "grad_norm": 0.2221726030111313, "learning_rate": 0.0004018458636486123, "loss": 0.5802, "step": 119300 }, { "epoch": 16.08730800323363, "grad_norm": 0.23102732002735138, "learning_rate": 0.00040218270008084074, "loss": 0.5812, "step": 119400 }, { "epoch": 16.10078146052277, "grad_norm": 0.2319263368844986, "learning_rate": 0.00040251953651306927, "loss": 0.5799, "step": 119500 }, { "epoch": 16.11425491781191, "grad_norm": 0.22918906807899475, "learning_rate": 0.00040285637294529774, "loss": 0.5804, "step": 119600 }, { "epoch": 16.12772837510105, "grad_norm": 0.23667050898075104, "learning_rate": 0.0004031932093775263, "loss": 0.5802, "step": 119700 }, { "epoch": 16.14120183239019, "grad_norm": 0.25111550092697144, "learning_rate": 0.0004035300458097548, "loss": 0.5811, "step": 119800 }, { "epoch": 16.154675289679332, "grad_norm": 0.22679661214351654, "learning_rate": 0.0004038668822419833, "loss": 0.5804, "step": 119900 }, { "epoch": 16.168148746968473, "grad_norm": 0.23671898245811462, "learning_rate": 0.0004042037186742118, "loss": 0.5809, "step": 120000 }, { "epoch": 16.18162220425761, "grad_norm": 0.24510857462882996, "learning_rate": 0.00040454055510644036, "loss": 0.5821, "step": 120100 }, { "epoch": 16.195095661546752, "grad_norm": 0.2380576729774475, "learning_rate": 0.00040487739153866883, "loss": 0.5804, "step": 120200 }, { "epoch": 16.208569118835893, "grad_norm": 0.23525765538215637, "learning_rate": 0.00040521422797089735, "loss": 0.5808, "step": 120300 }, { "epoch": 16.222042576125034, "grad_norm": 0.2348586469888687, "learning_rate": 0.0004055510644031258, "loss": 0.5799, "step": 120400 }, { "epoch": 16.235516033414175, "grad_norm": 0.22590427100658417, "learning_rate": 0.00040588790083535435, "loss": 0.5814, "step": 120500 }, { "epoch": 16.248989490703316, "grad_norm": 0.2082596868276596, "learning_rate": 0.0004062247372675829, "loss": 0.5804, "step": 120600 }, { "epoch": 16.262462947992454, "grad_norm": 0.2206827700138092, "learning_rate": 0.0004065615736998114, "loss": 0.5804, "step": 120700 }, { "epoch": 16.275936405281595, "grad_norm": 0.26377829909324646, "learning_rate": 0.00040689841013203987, "loss": 0.5792, "step": 120800 }, { "epoch": 16.289409862570736, "grad_norm": 0.2399434894323349, "learning_rate": 0.0004072352465642684, "loss": 0.5794, "step": 120900 }, { "epoch": 16.302883319859877, "grad_norm": 0.2233598381280899, "learning_rate": 0.0004075720829964969, "loss": 0.5799, "step": 121000 }, { "epoch": 16.316356777149018, "grad_norm": 0.2211935967206955, "learning_rate": 0.00040790891942872544, "loss": 0.5805, "step": 121100 }, { "epoch": 16.329830234438155, "grad_norm": 0.22849605977535248, "learning_rate": 0.0004082457558609539, "loss": 0.5799, "step": 121200 }, { "epoch": 16.343303691727296, "grad_norm": 0.2108621597290039, "learning_rate": 0.00040858259229318244, "loss": 0.581, "step": 121300 }, { "epoch": 16.356777149016438, "grad_norm": 0.2243906408548355, "learning_rate": 0.00040891942872541096, "loss": 0.5795, "step": 121400 }, { "epoch": 16.37025060630558, "grad_norm": 0.22302287817001343, "learning_rate": 0.0004092562651576395, "loss": 0.5793, "step": 121500 }, { "epoch": 16.38372406359472, "grad_norm": 0.23931078612804413, "learning_rate": 0.00040959310158986795, "loss": 0.5802, "step": 121600 }, { "epoch": 16.397197520883857, "grad_norm": 0.23512375354766846, "learning_rate": 0.0004099299380220965, "loss": 0.58, "step": 121700 }, { "epoch": 16.410670978173, "grad_norm": 0.22877946496009827, "learning_rate": 0.000410266774454325, "loss": 0.5785, "step": 121800 }, { "epoch": 16.42414443546214, "grad_norm": 0.22423981130123138, "learning_rate": 0.0004106036108865535, "loss": 0.5794, "step": 121900 }, { "epoch": 16.43761789275128, "grad_norm": 0.22294847667217255, "learning_rate": 0.000410940447318782, "loss": 0.5801, "step": 122000 }, { "epoch": 16.45109135004042, "grad_norm": 0.23378370702266693, "learning_rate": 0.0004112772837510105, "loss": 0.5814, "step": 122100 }, { "epoch": 16.464564807329563, "grad_norm": 0.24044610559940338, "learning_rate": 0.000411614120183239, "loss": 0.5798, "step": 122200 }, { "epoch": 16.4780382646187, "grad_norm": 0.2618865668773651, "learning_rate": 0.00041195095661546757, "loss": 0.5797, "step": 122300 }, { "epoch": 16.49151172190784, "grad_norm": 0.24262408912181854, "learning_rate": 0.00041228779304769604, "loss": 0.5807, "step": 122400 }, { "epoch": 16.504985179196982, "grad_norm": 0.2293367236852646, "learning_rate": 0.00041262462947992456, "loss": 0.5787, "step": 122500 }, { "epoch": 16.518458636486123, "grad_norm": 0.22175849974155426, "learning_rate": 0.00041296146591215303, "loss": 0.58, "step": 122600 }, { "epoch": 16.531932093775264, "grad_norm": 0.23920710384845734, "learning_rate": 0.0004132983023443816, "loss": 0.58, "step": 122700 }, { "epoch": 16.545405551064402, "grad_norm": 0.22499006986618042, "learning_rate": 0.0004136351387766101, "loss": 0.5796, "step": 122800 }, { "epoch": 16.558879008353543, "grad_norm": 0.22133475542068481, "learning_rate": 0.0004139719752088386, "loss": 0.5791, "step": 122900 }, { "epoch": 16.572352465642684, "grad_norm": 0.23046815395355225, "learning_rate": 0.0004143088116410671, "loss": 0.5782, "step": 123000 }, { "epoch": 16.585825922931825, "grad_norm": 0.21978214383125305, "learning_rate": 0.00041464564807329566, "loss": 0.579, "step": 123100 }, { "epoch": 16.599299380220966, "grad_norm": 0.2368471920490265, "learning_rate": 0.0004149824845055241, "loss": 0.58, "step": 123200 }, { "epoch": 16.612772837510104, "grad_norm": 0.2176971584558487, "learning_rate": 0.00041531932093775265, "loss": 0.5793, "step": 123300 }, { "epoch": 16.626246294799245, "grad_norm": 0.2417602688074112, "learning_rate": 0.0004156561573699811, "loss": 0.5794, "step": 123400 }, { "epoch": 16.639719752088386, "grad_norm": 0.21602439880371094, "learning_rate": 0.0004159929938022097, "loss": 0.5783, "step": 123500 }, { "epoch": 16.653193209377527, "grad_norm": 0.23156560957431793, "learning_rate": 0.00041632983023443817, "loss": 0.5798, "step": 123600 }, { "epoch": 16.666666666666668, "grad_norm": 0.23626449704170227, "learning_rate": 0.0004166666666666667, "loss": 0.5791, "step": 123700 }, { "epoch": 16.680140123955805, "grad_norm": 0.22528426349163055, "learning_rate": 0.00041700350309889516, "loss": 0.5796, "step": 123800 }, { "epoch": 16.693613581244946, "grad_norm": 0.2296755164861679, "learning_rate": 0.0004173403395311237, "loss": 0.5791, "step": 123900 }, { "epoch": 16.707087038534087, "grad_norm": 0.21463710069656372, "learning_rate": 0.0004176771759633522, "loss": 0.5792, "step": 124000 }, { "epoch": 16.72056049582323, "grad_norm": 0.2169266790151596, "learning_rate": 0.00041801401239558074, "loss": 0.5783, "step": 124100 }, { "epoch": 16.73403395311237, "grad_norm": 0.23226359486579895, "learning_rate": 0.0004183508488278092, "loss": 0.5786, "step": 124200 }, { "epoch": 16.74750741040151, "grad_norm": 0.21572047472000122, "learning_rate": 0.00041868768526003773, "loss": 0.5796, "step": 124300 }, { "epoch": 16.760980867690648, "grad_norm": 0.21549776196479797, "learning_rate": 0.00041902452169226626, "loss": 0.5786, "step": 124400 }, { "epoch": 16.77445432497979, "grad_norm": 0.2373117208480835, "learning_rate": 0.0004193613581244948, "loss": 0.5791, "step": 124500 }, { "epoch": 16.78792778226893, "grad_norm": 0.21942147612571716, "learning_rate": 0.00041969819455672325, "loss": 0.5773, "step": 124600 }, { "epoch": 16.80140123955807, "grad_norm": 0.21066327393054962, "learning_rate": 0.0004200350309889518, "loss": 0.5779, "step": 124700 }, { "epoch": 16.814874696847212, "grad_norm": 0.22337639331817627, "learning_rate": 0.0004203718674211803, "loss": 0.5793, "step": 124800 }, { "epoch": 16.82834815413635, "grad_norm": 0.23993045091629028, "learning_rate": 0.0004207087038534088, "loss": 0.579, "step": 124900 }, { "epoch": 16.84182161142549, "grad_norm": 0.21157868206501007, "learning_rate": 0.0004210455402856373, "loss": 0.5783, "step": 125000 }, { "epoch": 16.855295068714632, "grad_norm": 0.2286837249994278, "learning_rate": 0.0004213823767178658, "loss": 0.5798, "step": 125100 }, { "epoch": 16.868768526003773, "grad_norm": 0.23027680814266205, "learning_rate": 0.00042171921315009434, "loss": 0.5779, "step": 125200 }, { "epoch": 16.882241983292914, "grad_norm": 0.23066718876361847, "learning_rate": 0.00042205604958232287, "loss": 0.5782, "step": 125300 }, { "epoch": 16.89571544058205, "grad_norm": 0.2091311663389206, "learning_rate": 0.00042239288601455134, "loss": 0.5776, "step": 125400 }, { "epoch": 16.909188897871193, "grad_norm": 0.21950192749500275, "learning_rate": 0.00042272972244677986, "loss": 0.5781, "step": 125500 }, { "epoch": 16.922662355160334, "grad_norm": 0.23509205877780914, "learning_rate": 0.00042306655887900833, "loss": 0.5781, "step": 125600 }, { "epoch": 16.936135812449475, "grad_norm": 0.23926116526126862, "learning_rate": 0.0004234033953112369, "loss": 0.5782, "step": 125700 }, { "epoch": 16.949609269738616, "grad_norm": 0.21354526281356812, "learning_rate": 0.0004237402317434654, "loss": 0.5788, "step": 125800 }, { "epoch": 16.963082727027754, "grad_norm": 0.21780803799629211, "learning_rate": 0.0004240770681756939, "loss": 0.5771, "step": 125900 }, { "epoch": 16.976556184316895, "grad_norm": 0.21488866209983826, "learning_rate": 0.0004244139046079224, "loss": 0.5781, "step": 126000 }, { "epoch": 16.990029641606036, "grad_norm": 0.2582053542137146, "learning_rate": 0.00042475074104015095, "loss": 0.5782, "step": 126100 }, { "epoch": 17.0, "eval_loss": 0.5641703605651855, "eval_runtime": 4.9356, "eval_samples_per_second": 1013.038, "eval_steps_per_second": 16.006, "step": 126174 }, { "epoch": 17.003503098895177, "grad_norm": 0.2513197660446167, "learning_rate": 0.0004250875774723794, "loss": 0.5772, "step": 126200 }, { "epoch": 17.016976556184318, "grad_norm": 0.2111562341451645, "learning_rate": 0.00042542441390460795, "loss": 0.5762, "step": 126300 }, { "epoch": 17.03045001347346, "grad_norm": 0.2174200713634491, "learning_rate": 0.0004257612503368364, "loss": 0.577, "step": 126400 }, { "epoch": 17.043923470762596, "grad_norm": 0.22590124607086182, "learning_rate": 0.000426098086769065, "loss": 0.5783, "step": 126500 }, { "epoch": 17.057396928051737, "grad_norm": 0.21910308301448822, "learning_rate": 0.00042643492320129346, "loss": 0.5756, "step": 126600 }, { "epoch": 17.07087038534088, "grad_norm": 0.22757333517074585, "learning_rate": 0.000426771759633522, "loss": 0.5776, "step": 126700 }, { "epoch": 17.08434384263002, "grad_norm": 0.2316286265850067, "learning_rate": 0.00042710859606575046, "loss": 0.5765, "step": 126800 }, { "epoch": 17.09781729991916, "grad_norm": 0.22320769727230072, "learning_rate": 0.00042744543249797904, "loss": 0.5776, "step": 126900 }, { "epoch": 17.111290757208298, "grad_norm": 0.24563650786876678, "learning_rate": 0.0004277822689302075, "loss": 0.5768, "step": 127000 }, { "epoch": 17.12476421449744, "grad_norm": 0.2313052862882614, "learning_rate": 0.00042811910536243603, "loss": 0.5764, "step": 127100 }, { "epoch": 17.13823767178658, "grad_norm": 0.21952031552791595, "learning_rate": 0.0004284559417946645, "loss": 0.5776, "step": 127200 }, { "epoch": 17.15171112907572, "grad_norm": 0.2324984222650528, "learning_rate": 0.0004287927782268931, "loss": 0.5764, "step": 127300 }, { "epoch": 17.165184586364862, "grad_norm": 0.2190321534872055, "learning_rate": 0.00042912961465912155, "loss": 0.5777, "step": 127400 }, { "epoch": 17.178658043654, "grad_norm": 0.21209074556827545, "learning_rate": 0.0004294664510913501, "loss": 0.5768, "step": 127500 }, { "epoch": 17.19213150094314, "grad_norm": 0.21914049983024597, "learning_rate": 0.00042980328752357855, "loss": 0.5782, "step": 127600 }, { "epoch": 17.205604958232282, "grad_norm": 0.23630505800247192, "learning_rate": 0.00043014012395580707, "loss": 0.5774, "step": 127700 }, { "epoch": 17.219078415521423, "grad_norm": 0.20875222980976105, "learning_rate": 0.0004304769603880356, "loss": 0.5761, "step": 127800 }, { "epoch": 17.232551872810564, "grad_norm": 0.2224559783935547, "learning_rate": 0.0004308137968202641, "loss": 0.5772, "step": 127900 }, { "epoch": 17.246025330099705, "grad_norm": 0.241514652967453, "learning_rate": 0.0004311506332524926, "loss": 0.5777, "step": 128000 }, { "epoch": 17.259498787388843, "grad_norm": 0.23711393773555756, "learning_rate": 0.0004314874696847211, "loss": 0.5773, "step": 128100 }, { "epoch": 17.272972244677984, "grad_norm": 0.21567094326019287, "learning_rate": 0.00043182430611694964, "loss": 0.577, "step": 128200 }, { "epoch": 17.286445701967125, "grad_norm": 0.25360530614852905, "learning_rate": 0.00043216114254917816, "loss": 0.5761, "step": 128300 }, { "epoch": 17.299919159256266, "grad_norm": 0.2314491719007492, "learning_rate": 0.00043249797898140663, "loss": 0.5769, "step": 128400 }, { "epoch": 17.313392616545407, "grad_norm": 0.22653934359550476, "learning_rate": 0.00043283481541363516, "loss": 0.5762, "step": 128500 }, { "epoch": 17.326866073834545, "grad_norm": 0.2007305771112442, "learning_rate": 0.0004331716518458637, "loss": 0.577, "step": 128600 }, { "epoch": 17.340339531123686, "grad_norm": 0.227739155292511, "learning_rate": 0.0004335084882780922, "loss": 0.5762, "step": 128700 }, { "epoch": 17.353812988412827, "grad_norm": 0.2378777712583542, "learning_rate": 0.0004338453247103207, "loss": 0.5757, "step": 128800 }, { "epoch": 17.367286445701968, "grad_norm": 0.21473537385463715, "learning_rate": 0.0004341821611425492, "loss": 0.5761, "step": 128900 }, { "epoch": 17.38075990299111, "grad_norm": 0.2196899801492691, "learning_rate": 0.0004345189975747777, "loss": 0.5769, "step": 129000 }, { "epoch": 17.394233360280246, "grad_norm": 0.21520298719406128, "learning_rate": 0.0004348558340070062, "loss": 0.5765, "step": 129100 }, { "epoch": 17.407706817569387, "grad_norm": 0.2181941717863083, "learning_rate": 0.0004351926704392347, "loss": 0.5764, "step": 129200 }, { "epoch": 17.42118027485853, "grad_norm": 0.21209533512592316, "learning_rate": 0.0004355295068714632, "loss": 0.5773, "step": 129300 }, { "epoch": 17.43465373214767, "grad_norm": 0.23043391108512878, "learning_rate": 0.0004358663433036917, "loss": 0.5767, "step": 129400 }, { "epoch": 17.44812718943681, "grad_norm": 0.22494013607501984, "learning_rate": 0.00043620317973592024, "loss": 0.5764, "step": 129500 }, { "epoch": 17.461600646725948, "grad_norm": 0.22653637826442719, "learning_rate": 0.00043654001616814876, "loss": 0.5767, "step": 129600 }, { "epoch": 17.47507410401509, "grad_norm": 0.20449140667915344, "learning_rate": 0.00043687685260037723, "loss": 0.576, "step": 129700 }, { "epoch": 17.48854756130423, "grad_norm": 0.2127702236175537, "learning_rate": 0.00043721368903260576, "loss": 0.5762, "step": 129800 }, { "epoch": 17.50202101859337, "grad_norm": 0.21842697262763977, "learning_rate": 0.0004375505254648343, "loss": 0.5769, "step": 129900 }, { "epoch": 17.515494475882512, "grad_norm": 0.22426985204219818, "learning_rate": 0.0004378873618970628, "loss": 0.5752, "step": 130000 }, { "epoch": 17.528967933171653, "grad_norm": 0.21152335405349731, "learning_rate": 0.0004382241983292913, "loss": 0.5751, "step": 130100 }, { "epoch": 17.54244139046079, "grad_norm": 0.20793594419956207, "learning_rate": 0.0004385610347615198, "loss": 0.5758, "step": 130200 }, { "epoch": 17.555914847749932, "grad_norm": 0.21515581011772156, "learning_rate": 0.0004388978711937483, "loss": 0.5773, "step": 130300 }, { "epoch": 17.569388305039073, "grad_norm": 0.2147485613822937, "learning_rate": 0.00043923470762597685, "loss": 0.5748, "step": 130400 }, { "epoch": 17.582861762328214, "grad_norm": 0.21714787185192108, "learning_rate": 0.0004395715440582053, "loss": 0.5758, "step": 130500 }, { "epoch": 17.596335219617355, "grad_norm": 0.21718329191207886, "learning_rate": 0.00043990838049043384, "loss": 0.5762, "step": 130600 }, { "epoch": 17.609808676906493, "grad_norm": 0.2117299735546112, "learning_rate": 0.00044024521692266237, "loss": 0.5743, "step": 130700 }, { "epoch": 17.623282134195634, "grad_norm": 0.20716696977615356, "learning_rate": 0.0004405820533548909, "loss": 0.5751, "step": 130800 }, { "epoch": 17.636755591484775, "grad_norm": 0.21852104365825653, "learning_rate": 0.00044091888978711936, "loss": 0.575, "step": 130900 }, { "epoch": 17.650229048773916, "grad_norm": 0.21645832061767578, "learning_rate": 0.0004412557262193479, "loss": 0.5747, "step": 131000 }, { "epoch": 17.663702506063057, "grad_norm": 0.21344801783561707, "learning_rate": 0.00044159256265157635, "loss": 0.5762, "step": 131100 }, { "epoch": 17.677175963352195, "grad_norm": 0.2275877147912979, "learning_rate": 0.00044192939908380493, "loss": 0.5752, "step": 131200 }, { "epoch": 17.690649420641336, "grad_norm": 0.21926811337471008, "learning_rate": 0.0004422662355160334, "loss": 0.5756, "step": 131300 }, { "epoch": 17.704122877930477, "grad_norm": 0.22764620184898376, "learning_rate": 0.00044260307194826193, "loss": 0.576, "step": 131400 }, { "epoch": 17.717596335219618, "grad_norm": 0.21464844048023224, "learning_rate": 0.0004429399083804904, "loss": 0.5765, "step": 131500 }, { "epoch": 17.73106979250876, "grad_norm": 0.20943708717823029, "learning_rate": 0.000443276744812719, "loss": 0.5757, "step": 131600 }, { "epoch": 17.7445432497979, "grad_norm": 0.2095455825328827, "learning_rate": 0.00044361358124494745, "loss": 0.5752, "step": 131700 }, { "epoch": 17.758016707087037, "grad_norm": 0.21979288756847382, "learning_rate": 0.00044395041767717597, "loss": 0.5764, "step": 131800 }, { "epoch": 17.77149016437618, "grad_norm": 0.21057602763175964, "learning_rate": 0.00044428725410940444, "loss": 0.5745, "step": 131900 }, { "epoch": 17.78496362166532, "grad_norm": 0.21133022010326385, "learning_rate": 0.000444624090541633, "loss": 0.5753, "step": 132000 }, { "epoch": 17.79843707895446, "grad_norm": 0.22609752416610718, "learning_rate": 0.0004449609269738615, "loss": 0.574, "step": 132100 }, { "epoch": 17.8119105362436, "grad_norm": 0.20185767114162445, "learning_rate": 0.00044529776340609, "loss": 0.5753, "step": 132200 }, { "epoch": 17.82538399353274, "grad_norm": 0.22553706169128418, "learning_rate": 0.0004456345998383185, "loss": 0.5746, "step": 132300 }, { "epoch": 17.83885745082188, "grad_norm": 0.2162536233663559, "learning_rate": 0.00044597143627054706, "loss": 0.576, "step": 132400 }, { "epoch": 17.85233090811102, "grad_norm": 0.20499248802661896, "learning_rate": 0.00044630827270277553, "loss": 0.5761, "step": 132500 }, { "epoch": 17.865804365400162, "grad_norm": 0.2766418159008026, "learning_rate": 0.00044664510913500406, "loss": 0.575, "step": 132600 }, { "epoch": 17.879277822689303, "grad_norm": 0.2171289175748825, "learning_rate": 0.0004469819455672325, "loss": 0.5743, "step": 132700 }, { "epoch": 17.89275127997844, "grad_norm": 0.22398334741592407, "learning_rate": 0.00044731878199946105, "loss": 0.5755, "step": 132800 }, { "epoch": 17.906224737267582, "grad_norm": 0.21551981568336487, "learning_rate": 0.0004476556184316896, "loss": 0.575, "step": 132900 }, { "epoch": 17.919698194556723, "grad_norm": 0.2018967866897583, "learning_rate": 0.0004479924548639181, "loss": 0.5747, "step": 133000 }, { "epoch": 17.933171651845864, "grad_norm": 0.21150609850883484, "learning_rate": 0.00044832929129614657, "loss": 0.5749, "step": 133100 }, { "epoch": 17.946645109135005, "grad_norm": 0.20147375762462616, "learning_rate": 0.0004486661277283751, "loss": 0.5762, "step": 133200 }, { "epoch": 17.960118566424143, "grad_norm": 0.19710470736026764, "learning_rate": 0.0004490029641606036, "loss": 0.5744, "step": 133300 }, { "epoch": 17.973592023713284, "grad_norm": 0.19882291555404663, "learning_rate": 0.00044933980059283214, "loss": 0.576, "step": 133400 }, { "epoch": 17.987065481002425, "grad_norm": 0.22333897650241852, "learning_rate": 0.0004496766370250606, "loss": 0.5746, "step": 133500 }, { "epoch": 18.0, "eval_loss": 0.5610068440437317, "eval_runtime": 4.9314, "eval_samples_per_second": 1013.903, "eval_steps_per_second": 16.02, "step": 133596 }, { "epoch": 18.000538938291566, "grad_norm": 0.2170102447271347, "learning_rate": 0.00045001347345728914, "loss": 0.5741, "step": 133600 }, { "epoch": 18.014012395580707, "grad_norm": 0.20267732441425323, "learning_rate": 0.00045035030988951766, "loss": 0.5736, "step": 133700 }, { "epoch": 18.027485852869848, "grad_norm": 0.22220991551876068, "learning_rate": 0.0004506871463217462, "loss": 0.5749, "step": 133800 }, { "epoch": 18.040959310158986, "grad_norm": 0.21785970032215118, "learning_rate": 0.00045102398275397466, "loss": 0.5727, "step": 133900 }, { "epoch": 18.054432767448127, "grad_norm": 0.21595804393291473, "learning_rate": 0.0004513608191862032, "loss": 0.5739, "step": 134000 }, { "epoch": 18.067906224737268, "grad_norm": 0.20884045958518982, "learning_rate": 0.0004516976556184317, "loss": 0.5742, "step": 134100 }, { "epoch": 18.08137968202641, "grad_norm": 0.20324234664440155, "learning_rate": 0.00045203449205066023, "loss": 0.5733, "step": 134200 }, { "epoch": 18.09485313931555, "grad_norm": 0.2177741974592209, "learning_rate": 0.0004523713284828887, "loss": 0.5736, "step": 134300 }, { "epoch": 18.108326596604687, "grad_norm": 0.23607109487056732, "learning_rate": 0.0004527081649151172, "loss": 0.5733, "step": 134400 }, { "epoch": 18.12180005389383, "grad_norm": 0.213492751121521, "learning_rate": 0.0004530450013473457, "loss": 0.574, "step": 134500 }, { "epoch": 18.13527351118297, "grad_norm": 0.2042047679424286, "learning_rate": 0.00045338183777957427, "loss": 0.5733, "step": 134600 }, { "epoch": 18.14874696847211, "grad_norm": 0.21351635456085205, "learning_rate": 0.00045371867421180274, "loss": 0.5738, "step": 134700 }, { "epoch": 18.16222042576125, "grad_norm": 0.21454507112503052, "learning_rate": 0.00045405551064403127, "loss": 0.5738, "step": 134800 }, { "epoch": 18.17569388305039, "grad_norm": 0.21550975739955902, "learning_rate": 0.00045439234707625974, "loss": 0.5745, "step": 134900 }, { "epoch": 18.18916734033953, "grad_norm": 0.21730247139930725, "learning_rate": 0.0004547291835084883, "loss": 0.5733, "step": 135000 }, { "epoch": 18.20264079762867, "grad_norm": 0.207492858171463, "learning_rate": 0.0004550660199407168, "loss": 0.5747, "step": 135100 }, { "epoch": 18.216114254917812, "grad_norm": 0.22966423630714417, "learning_rate": 0.0004554028563729453, "loss": 0.5735, "step": 135200 }, { "epoch": 18.229587712206953, "grad_norm": 0.19933751225471497, "learning_rate": 0.0004557396928051738, "loss": 0.5737, "step": 135300 }, { "epoch": 18.243061169496094, "grad_norm": 0.1956418752670288, "learning_rate": 0.00045607652923740236, "loss": 0.5733, "step": 135400 }, { "epoch": 18.256534626785232, "grad_norm": 0.2122432291507721, "learning_rate": 0.00045641336566963083, "loss": 0.574, "step": 135500 }, { "epoch": 18.270008084074373, "grad_norm": 0.21102982759475708, "learning_rate": 0.00045675020210185935, "loss": 0.5727, "step": 135600 }, { "epoch": 18.283481541363514, "grad_norm": 0.2122666984796524, "learning_rate": 0.0004570870385340878, "loss": 0.5733, "step": 135700 }, { "epoch": 18.296954998652655, "grad_norm": 0.19378972053527832, "learning_rate": 0.0004574238749663164, "loss": 0.5736, "step": 135800 }, { "epoch": 18.310428455941796, "grad_norm": 0.21588648855686188, "learning_rate": 0.00045776071139854487, "loss": 0.5732, "step": 135900 }, { "epoch": 18.323901913230934, "grad_norm": 0.21509158611297607, "learning_rate": 0.0004580975478307734, "loss": 0.5732, "step": 136000 }, { "epoch": 18.337375370520075, "grad_norm": 0.21670109033584595, "learning_rate": 0.00045843438426300187, "loss": 0.5733, "step": 136100 }, { "epoch": 18.350848827809216, "grad_norm": 0.21019189059734344, "learning_rate": 0.00045877122069523044, "loss": 0.573, "step": 136200 }, { "epoch": 18.364322285098357, "grad_norm": 0.22340062260627747, "learning_rate": 0.0004591080571274589, "loss": 0.5727, "step": 136300 }, { "epoch": 18.377795742387498, "grad_norm": 0.20567823946475983, "learning_rate": 0.00045944489355968744, "loss": 0.5737, "step": 136400 }, { "epoch": 18.391269199676636, "grad_norm": 0.19434307515621185, "learning_rate": 0.0004597817299919159, "loss": 0.5737, "step": 136500 }, { "epoch": 18.404742656965777, "grad_norm": 0.20298829674720764, "learning_rate": 0.00046011856642414443, "loss": 0.5737, "step": 136600 }, { "epoch": 18.418216114254918, "grad_norm": 0.22724448144435883, "learning_rate": 0.00046045540285637296, "loss": 0.5754, "step": 136700 }, { "epoch": 18.43168957154406, "grad_norm": 0.227760910987854, "learning_rate": 0.0004607922392886015, "loss": 0.5729, "step": 136800 }, { "epoch": 18.4451630288332, "grad_norm": 0.2465094029903412, "learning_rate": 0.00046112907572082995, "loss": 0.5738, "step": 136900 }, { "epoch": 18.458636486122337, "grad_norm": 0.20205485820770264, "learning_rate": 0.0004614659121530585, "loss": 0.5727, "step": 137000 }, { "epoch": 18.47210994341148, "grad_norm": 0.23256918787956238, "learning_rate": 0.000461802748585287, "loss": 0.5732, "step": 137100 }, { "epoch": 18.48558340070062, "grad_norm": 0.20166447758674622, "learning_rate": 0.0004621395850175155, "loss": 0.573, "step": 137200 }, { "epoch": 18.49905685798976, "grad_norm": 0.21253959834575653, "learning_rate": 0.000462476421449744, "loss": 0.5739, "step": 137300 }, { "epoch": 18.5125303152789, "grad_norm": 0.22969654202461243, "learning_rate": 0.0004628132578819725, "loss": 0.5721, "step": 137400 }, { "epoch": 18.526003772568043, "grad_norm": 0.1984296441078186, "learning_rate": 0.00046315009431420104, "loss": 0.573, "step": 137500 }, { "epoch": 18.53947722985718, "grad_norm": 0.20650452375411987, "learning_rate": 0.00046348693074642957, "loss": 0.573, "step": 137600 }, { "epoch": 18.55295068714632, "grad_norm": 0.20221292972564697, "learning_rate": 0.00046382376717865804, "loss": 0.5734, "step": 137700 }, { "epoch": 18.566424144435462, "grad_norm": 0.2057451754808426, "learning_rate": 0.00046416060361088656, "loss": 0.5728, "step": 137800 }, { "epoch": 18.579897601724603, "grad_norm": 0.23063333332538605, "learning_rate": 0.0004644974400431151, "loss": 0.5733, "step": 137900 }, { "epoch": 18.593371059013744, "grad_norm": 0.20474247634410858, "learning_rate": 0.0004648342764753436, "loss": 0.5722, "step": 138000 }, { "epoch": 18.606844516302882, "grad_norm": 0.22316817939281464, "learning_rate": 0.0004651711129075721, "loss": 0.5718, "step": 138100 }, { "epoch": 18.620317973592023, "grad_norm": 0.20774810016155243, "learning_rate": 0.0004655079493398006, "loss": 0.5728, "step": 138200 }, { "epoch": 18.633791430881164, "grad_norm": 0.20184698700904846, "learning_rate": 0.0004658447857720291, "loss": 0.5732, "step": 138300 }, { "epoch": 18.647264888170305, "grad_norm": 0.19400109350681305, "learning_rate": 0.00046618162220425765, "loss": 0.5736, "step": 138400 }, { "epoch": 18.660738345459446, "grad_norm": 0.24667800962924957, "learning_rate": 0.0004665184586364861, "loss": 0.5732, "step": 138500 }, { "epoch": 18.674211802748584, "grad_norm": 0.21212635934352875, "learning_rate": 0.00046685529506871465, "loss": 0.5724, "step": 138600 }, { "epoch": 18.687685260037725, "grad_norm": 0.20222191512584686, "learning_rate": 0.0004671921315009431, "loss": 0.572, "step": 138700 }, { "epoch": 18.701158717326866, "grad_norm": 0.19624833762645721, "learning_rate": 0.0004675289679331717, "loss": 0.5718, "step": 138800 }, { "epoch": 18.714632174616007, "grad_norm": 0.21142026782035828, "learning_rate": 0.00046786580436540017, "loss": 0.5729, "step": 138900 }, { "epoch": 18.728105631905148, "grad_norm": 0.21044322848320007, "learning_rate": 0.0004682026407976287, "loss": 0.573, "step": 139000 }, { "epoch": 18.74157908919429, "grad_norm": 0.21348129212856293, "learning_rate": 0.00046853947722985716, "loss": 0.573, "step": 139100 }, { "epoch": 18.755052546483427, "grad_norm": 0.21009230613708496, "learning_rate": 0.00046887631366208574, "loss": 0.5717, "step": 139200 }, { "epoch": 18.768526003772568, "grad_norm": 0.19502140581607819, "learning_rate": 0.0004692131500943142, "loss": 0.571, "step": 139300 }, { "epoch": 18.78199946106171, "grad_norm": 0.21357713639736176, "learning_rate": 0.00046954998652654273, "loss": 0.5717, "step": 139400 }, { "epoch": 18.79547291835085, "grad_norm": 0.19493907690048218, "learning_rate": 0.0004698868229587712, "loss": 0.573, "step": 139500 }, { "epoch": 18.80894637563999, "grad_norm": 0.2143537551164627, "learning_rate": 0.0004702236593909998, "loss": 0.5718, "step": 139600 }, { "epoch": 18.82241983292913, "grad_norm": 0.2231684923171997, "learning_rate": 0.00047056049582322825, "loss": 0.5723, "step": 139700 }, { "epoch": 18.83589329021827, "grad_norm": 0.1970888376235962, "learning_rate": 0.0004708973322554568, "loss": 0.5722, "step": 139800 }, { "epoch": 18.84936674750741, "grad_norm": 0.185745969414711, "learning_rate": 0.00047123416868768525, "loss": 0.5718, "step": 139900 }, { "epoch": 18.86284020479655, "grad_norm": 0.2056904286146164, "learning_rate": 0.00047157100511991377, "loss": 0.5727, "step": 140000 }, { "epoch": 18.876313662085693, "grad_norm": 0.20419125258922577, "learning_rate": 0.0004719078415521423, "loss": 0.5714, "step": 140100 }, { "epoch": 18.88978711937483, "grad_norm": 0.22754254937171936, "learning_rate": 0.0004722446779843708, "loss": 0.5724, "step": 140200 }, { "epoch": 18.90326057666397, "grad_norm": 0.20089542865753174, "learning_rate": 0.0004725815144165993, "loss": 0.5717, "step": 140300 }, { "epoch": 18.916734033953112, "grad_norm": 0.18849755823612213, "learning_rate": 0.0004729183508488278, "loss": 0.571, "step": 140400 }, { "epoch": 18.930207491242253, "grad_norm": 0.20117299258708954, "learning_rate": 0.00047325518728105634, "loss": 0.5712, "step": 140500 }, { "epoch": 18.943680948531394, "grad_norm": 0.1966305673122406, "learning_rate": 0.00047359202371328486, "loss": 0.5714, "step": 140600 }, { "epoch": 18.957154405820532, "grad_norm": 0.22022128105163574, "learning_rate": 0.00047392886014551333, "loss": 0.5716, "step": 140700 }, { "epoch": 18.970627863109673, "grad_norm": 0.22829777002334595, "learning_rate": 0.00047426569657774186, "loss": 0.5726, "step": 140800 }, { "epoch": 18.984101320398814, "grad_norm": 0.2233799546957016, "learning_rate": 0.0004746025330099704, "loss": 0.5704, "step": 140900 }, { "epoch": 18.997574777687955, "grad_norm": 0.19816461205482483, "learning_rate": 0.0004749393694421989, "loss": 0.5705, "step": 141000 }, { "epoch": 19.0, "eval_loss": 0.5577976107597351, "eval_runtime": 4.9473, "eval_samples_per_second": 1010.651, "eval_steps_per_second": 15.968, "step": 141018 }, { "epoch": 19.011048234977096, "grad_norm": 0.19748270511627197, "learning_rate": 0.0004752762058744274, "loss": 0.5707, "step": 141100 }, { "epoch": 19.024521692266237, "grad_norm": 0.19564403593540192, "learning_rate": 0.0004756130423066559, "loss": 0.5714, "step": 141200 }, { "epoch": 19.037995149555375, "grad_norm": 0.22439762949943542, "learning_rate": 0.0004759498787388844, "loss": 0.5699, "step": 141300 }, { "epoch": 19.051468606844516, "grad_norm": 0.20442508161067963, "learning_rate": 0.00047628671517111295, "loss": 0.571, "step": 141400 }, { "epoch": 19.064942064133657, "grad_norm": 0.19861574470996857, "learning_rate": 0.0004766235516033414, "loss": 0.5709, "step": 141500 }, { "epoch": 19.078415521422798, "grad_norm": 0.21628820896148682, "learning_rate": 0.00047696038803556994, "loss": 0.5718, "step": 141600 }, { "epoch": 19.09188897871194, "grad_norm": 0.1935693770647049, "learning_rate": 0.0004772972244677984, "loss": 0.5703, "step": 141700 }, { "epoch": 19.105362436001077, "grad_norm": 0.22996366024017334, "learning_rate": 0.000477634060900027, "loss": 0.5714, "step": 141800 }, { "epoch": 19.118835893290218, "grad_norm": 0.21738868951797485, "learning_rate": 0.00047797089733225546, "loss": 0.5707, "step": 141900 }, { "epoch": 19.13230935057936, "grad_norm": 0.1965860277414322, "learning_rate": 0.000478307733764484, "loss": 0.5703, "step": 142000 }, { "epoch": 19.1457828078685, "grad_norm": 0.21287283301353455, "learning_rate": 0.00047864457019671246, "loss": 0.5703, "step": 142100 }, { "epoch": 19.15925626515764, "grad_norm": 0.19259221851825714, "learning_rate": 0.00047898140662894104, "loss": 0.5712, "step": 142200 }, { "epoch": 19.17272972244678, "grad_norm": 0.20922081172466278, "learning_rate": 0.0004793182430611695, "loss": 0.5706, "step": 142300 }, { "epoch": 19.18620317973592, "grad_norm": 0.23068641126155853, "learning_rate": 0.00047965507949339803, "loss": 0.5712, "step": 142400 }, { "epoch": 19.19967663702506, "grad_norm": 0.20895925164222717, "learning_rate": 0.0004799919159256265, "loss": 0.5711, "step": 142500 }, { "epoch": 19.2131500943142, "grad_norm": 0.2063889056444168, "learning_rate": 0.0004803287523578551, "loss": 0.5716, "step": 142600 }, { "epoch": 19.226623551603343, "grad_norm": 0.21252769231796265, "learning_rate": 0.00048066558879008355, "loss": 0.57, "step": 142700 }, { "epoch": 19.24009700889248, "grad_norm": 0.2139025777578354, "learning_rate": 0.0004810024252223121, "loss": 0.5701, "step": 142800 }, { "epoch": 19.25357046618162, "grad_norm": 0.2038293182849884, "learning_rate": 0.00048133926165454054, "loss": 0.5703, "step": 142900 }, { "epoch": 19.267043923470762, "grad_norm": 0.19565074145793915, "learning_rate": 0.0004816760980867691, "loss": 0.5706, "step": 143000 }, { "epoch": 19.280517380759903, "grad_norm": 0.20192784070968628, "learning_rate": 0.0004820129345189976, "loss": 0.5697, "step": 143100 }, { "epoch": 19.293990838049044, "grad_norm": 0.19317013025283813, "learning_rate": 0.0004823497709512261, "loss": 0.5705, "step": 143200 }, { "epoch": 19.307464295338185, "grad_norm": 0.20302468538284302, "learning_rate": 0.0004826866073834546, "loss": 0.5702, "step": 143300 }, { "epoch": 19.320937752627323, "grad_norm": 0.1936311572790146, "learning_rate": 0.00048302344381568317, "loss": 0.5716, "step": 143400 }, { "epoch": 19.334411209916464, "grad_norm": 0.22071565687656403, "learning_rate": 0.00048336028024791164, "loss": 0.5714, "step": 143500 }, { "epoch": 19.347884667205605, "grad_norm": 0.19254502654075623, "learning_rate": 0.00048369711668014016, "loss": 0.5713, "step": 143600 }, { "epoch": 19.361358124494746, "grad_norm": 0.2500135898590088, "learning_rate": 0.00048403395311236863, "loss": 0.57, "step": 143700 }, { "epoch": 19.374831581783887, "grad_norm": 0.2135043740272522, "learning_rate": 0.00048437078954459715, "loss": 0.5695, "step": 143800 }, { "epoch": 19.388305039073025, "grad_norm": 0.19196128845214844, "learning_rate": 0.0004847076259768257, "loss": 0.5706, "step": 143900 }, { "epoch": 19.401778496362166, "grad_norm": 0.20183637738227844, "learning_rate": 0.0004850444624090542, "loss": 0.5699, "step": 144000 }, { "epoch": 19.415251953651307, "grad_norm": 0.20724940299987793, "learning_rate": 0.0004853812988412827, "loss": 0.5694, "step": 144100 }, { "epoch": 19.428725410940448, "grad_norm": 0.18938279151916504, "learning_rate": 0.0004857181352735112, "loss": 0.5695, "step": 144200 }, { "epoch": 19.44219886822959, "grad_norm": 0.2089015543460846, "learning_rate": 0.0004860549717057397, "loss": 0.5704, "step": 144300 }, { "epoch": 19.455672325518726, "grad_norm": 0.22780568897724152, "learning_rate": 0.00048639180813796825, "loss": 0.5705, "step": 144400 }, { "epoch": 19.469145782807868, "grad_norm": 0.21498797833919525, "learning_rate": 0.0004867286445701967, "loss": 0.5708, "step": 144500 }, { "epoch": 19.48261924009701, "grad_norm": 0.19071120023727417, "learning_rate": 0.00048706548100242524, "loss": 0.5705, "step": 144600 }, { "epoch": 19.49609269738615, "grad_norm": 0.1934179812669754, "learning_rate": 0.00048740231743465376, "loss": 0.57, "step": 144700 }, { "epoch": 19.50956615467529, "grad_norm": 0.20666851103305817, "learning_rate": 0.0004877391538668823, "loss": 0.5713, "step": 144800 }, { "epoch": 19.52303961196443, "grad_norm": 0.190780907869339, "learning_rate": 0.00048807599029911076, "loss": 0.5706, "step": 144900 }, { "epoch": 19.53651306925357, "grad_norm": 0.20861583948135376, "learning_rate": 0.0004884128267313393, "loss": 0.5697, "step": 145000 }, { "epoch": 19.54998652654271, "grad_norm": 0.19210149347782135, "learning_rate": 0.0004887496631635678, "loss": 0.57, "step": 145100 }, { "epoch": 19.56345998383185, "grad_norm": 0.2013438194990158, "learning_rate": 0.0004890864995957963, "loss": 0.5702, "step": 145200 }, { "epoch": 19.576933441120993, "grad_norm": 0.19611620903015137, "learning_rate": 0.0004894233360280247, "loss": 0.5695, "step": 145300 }, { "epoch": 19.590406898410134, "grad_norm": 0.19775503873825073, "learning_rate": 0.0004897601724602533, "loss": 0.5699, "step": 145400 }, { "epoch": 19.60388035569927, "grad_norm": 0.1975874900817871, "learning_rate": 0.0004900970088924818, "loss": 0.5697, "step": 145500 }, { "epoch": 19.617353812988412, "grad_norm": 0.21658697724342346, "learning_rate": 0.0004904338453247103, "loss": 0.5712, "step": 145600 }, { "epoch": 19.630827270277553, "grad_norm": 0.20112954080104828, "learning_rate": 0.0004907706817569388, "loss": 0.5693, "step": 145700 }, { "epoch": 19.644300727566694, "grad_norm": 0.1982976198196411, "learning_rate": 0.0004911075181891673, "loss": 0.5692, "step": 145800 }, { "epoch": 19.657774184855835, "grad_norm": 0.18779286742210388, "learning_rate": 0.0004914443546213959, "loss": 0.5715, "step": 145900 }, { "epoch": 19.671247642144973, "grad_norm": 0.22301004827022552, "learning_rate": 0.0004917811910536244, "loss": 0.5711, "step": 146000 }, { "epoch": 19.684721099434114, "grad_norm": 0.21330326795578003, "learning_rate": 0.0004921180274858528, "loss": 0.5694, "step": 146100 }, { "epoch": 19.698194556723255, "grad_norm": 0.21213340759277344, "learning_rate": 0.0004924548639180814, "loss": 0.5705, "step": 146200 }, { "epoch": 19.711668014012396, "grad_norm": 0.19298018515110016, "learning_rate": 0.0004927917003503099, "loss": 0.5695, "step": 146300 }, { "epoch": 19.725141471301537, "grad_norm": 0.19282156229019165, "learning_rate": 0.0004931285367825384, "loss": 0.5695, "step": 146400 }, { "epoch": 19.738614928590675, "grad_norm": 0.21479977667331696, "learning_rate": 0.0004934653732147669, "loss": 0.5699, "step": 146500 }, { "epoch": 19.752088385879816, "grad_norm": 0.20075330138206482, "learning_rate": 0.0004938022096469954, "loss": 0.5694, "step": 146600 }, { "epoch": 19.765561843168957, "grad_norm": 0.2072782665491104, "learning_rate": 0.000494139046079224, "loss": 0.5703, "step": 146700 }, { "epoch": 19.779035300458098, "grad_norm": 0.21462304890155792, "learning_rate": 0.0004944758825114525, "loss": 0.5701, "step": 146800 }, { "epoch": 19.79250875774724, "grad_norm": 0.18543347716331482, "learning_rate": 0.0004948127189436809, "loss": 0.5695, "step": 146900 }, { "epoch": 19.80598221503638, "grad_norm": 0.21629741787910461, "learning_rate": 0.0004951495553759094, "loss": 0.5694, "step": 147000 }, { "epoch": 19.819455672325518, "grad_norm": 0.21711041033267975, "learning_rate": 0.000495486391808138, "loss": 0.5694, "step": 147100 }, { "epoch": 19.83292912961466, "grad_norm": 0.23246140778064728, "learning_rate": 0.0004958232282403664, "loss": 0.5683, "step": 147200 }, { "epoch": 19.8464025869038, "grad_norm": 0.20750293135643005, "learning_rate": 0.000496160064672595, "loss": 0.5702, "step": 147300 }, { "epoch": 19.85987604419294, "grad_norm": 0.19113647937774658, "learning_rate": 0.0004964969011048235, "loss": 0.569, "step": 147400 }, { "epoch": 19.873349501482082, "grad_norm": 0.19308046996593475, "learning_rate": 0.0004968337375370521, "loss": 0.569, "step": 147500 }, { "epoch": 19.88682295877122, "grad_norm": 0.23803123831748962, "learning_rate": 0.0004971705739692805, "loss": 0.5687, "step": 147600 }, { "epoch": 19.90029641606036, "grad_norm": 0.18461479246616364, "learning_rate": 0.000497507410401509, "loss": 0.5693, "step": 147700 }, { "epoch": 19.9137698733495, "grad_norm": 0.19458068907260895, "learning_rate": 0.0004978442468337375, "loss": 0.5685, "step": 147800 }, { "epoch": 19.927243330638643, "grad_norm": 0.20595388114452362, "learning_rate": 0.0004981810832659661, "loss": 0.5694, "step": 147900 }, { "epoch": 19.940716787927784, "grad_norm": 0.18838046491146088, "learning_rate": 0.0004985179196981945, "loss": 0.5698, "step": 148000 }, { "epoch": 19.95419024521692, "grad_norm": 0.18238422274589539, "learning_rate": 0.0004988547561304231, "loss": 0.5699, "step": 148100 }, { "epoch": 19.967663702506062, "grad_norm": 0.19157563149929047, "learning_rate": 0.0004991915925626516, "loss": 0.569, "step": 148200 }, { "epoch": 19.981137159795203, "grad_norm": 0.21493083238601685, "learning_rate": 0.0004995284289948802, "loss": 0.5687, "step": 148300 }, { "epoch": 19.994610617084344, "grad_norm": 0.1971777379512787, "learning_rate": 0.0004998652654271086, "loss": 0.5699, "step": 148400 }, { "epoch": 20.0, "eval_loss": 0.5554730296134949, "eval_runtime": 4.929, "eval_samples_per_second": 1014.411, "eval_steps_per_second": 16.028, "step": 148440 }, { "epoch": 20.008084074373485, "grad_norm": 0.19713926315307617, "learning_rate": 0.0004999775442378515, "loss": 0.568, "step": 148500 }, { "epoch": 20.021557531662623, "grad_norm": 0.20767615735530853, "learning_rate": 0.0004999401179676038, "loss": 0.5674, "step": 148600 }, { "epoch": 20.035030988951764, "grad_norm": 0.2104044407606125, "learning_rate": 0.0004999026916973562, "loss": 0.568, "step": 148700 }, { "epoch": 20.048504446240905, "grad_norm": 0.1936338096857071, "learning_rate": 0.0004998652654271086, "loss": 0.5673, "step": 148800 }, { "epoch": 20.061977903530046, "grad_norm": 0.19548213481903076, "learning_rate": 0.000499827839156861, "loss": 0.5692, "step": 148900 }, { "epoch": 20.075451360819187, "grad_norm": 0.18568992614746094, "learning_rate": 0.0004997904128866134, "loss": 0.5683, "step": 149000 }, { "epoch": 20.088924818108328, "grad_norm": 0.2137257158756256, "learning_rate": 0.0004997529866163658, "loss": 0.5692, "step": 149100 }, { "epoch": 20.102398275397466, "grad_norm": 0.18939432501792908, "learning_rate": 0.0004997155603461181, "loss": 0.569, "step": 149200 }, { "epoch": 20.115871732686607, "grad_norm": 0.2151089906692505, "learning_rate": 0.0004996781340758706, "loss": 0.5686, "step": 149300 }, { "epoch": 20.129345189975748, "grad_norm": 0.19952531158924103, "learning_rate": 0.0004996407078056229, "loss": 0.5675, "step": 149400 }, { "epoch": 20.14281864726489, "grad_norm": 0.18842704594135284, "learning_rate": 0.0004996032815353754, "loss": 0.568, "step": 149500 }, { "epoch": 20.15629210455403, "grad_norm": 0.20342102646827698, "learning_rate": 0.0004995658552651277, "loss": 0.5676, "step": 149600 }, { "epoch": 20.169765561843167, "grad_norm": 0.19327715039253235, "learning_rate": 0.0004995284289948802, "loss": 0.5676, "step": 149700 }, { "epoch": 20.18323901913231, "grad_norm": 0.2025727480649948, "learning_rate": 0.0004994910027246325, "loss": 0.5688, "step": 149800 }, { "epoch": 20.19671247642145, "grad_norm": 0.1828172355890274, "learning_rate": 0.0004994535764543848, "loss": 0.5684, "step": 149900 }, { "epoch": 20.21018593371059, "grad_norm": 0.18396034836769104, "learning_rate": 0.0004994161501841373, "loss": 0.568, "step": 150000 }, { "epoch": 20.22365939099973, "grad_norm": 0.2238343209028244, "learning_rate": 0.0004993787239138896, "loss": 0.5695, "step": 150100 }, { "epoch": 20.23713284828887, "grad_norm": 0.21190156042575836, "learning_rate": 0.0004993412976436421, "loss": 0.5679, "step": 150200 }, { "epoch": 20.25060630557801, "grad_norm": 0.2103496938943863, "learning_rate": 0.0004993038713733944, "loss": 0.5684, "step": 150300 }, { "epoch": 20.26407976286715, "grad_norm": 0.19340234994888306, "learning_rate": 0.0004992664451031469, "loss": 0.567, "step": 150400 }, { "epoch": 20.277553220156292, "grad_norm": 0.20674331486225128, "learning_rate": 0.0004992290188328991, "loss": 0.5679, "step": 150500 }, { "epoch": 20.291026677445434, "grad_norm": 0.21257059276103973, "learning_rate": 0.0004991915925626516, "loss": 0.567, "step": 150600 }, { "epoch": 20.304500134734575, "grad_norm": 0.1856154501438141, "learning_rate": 0.0004991541662924039, "loss": 0.5673, "step": 150700 }, { "epoch": 20.317973592023712, "grad_norm": 0.19510680437088013, "learning_rate": 0.0004991167400221564, "loss": 0.5682, "step": 150800 }, { "epoch": 20.331447049312853, "grad_norm": 0.19921459257602692, "learning_rate": 0.0004990793137519087, "loss": 0.5684, "step": 150900 }, { "epoch": 20.344920506601994, "grad_norm": 0.1902490258216858, "learning_rate": 0.0004990418874816612, "loss": 0.568, "step": 151000 }, { "epoch": 20.358393963891135, "grad_norm": 0.23372936248779297, "learning_rate": 0.0004990044612114135, "loss": 0.5671, "step": 151100 }, { "epoch": 20.371867421180276, "grad_norm": 0.19851042330265045, "learning_rate": 0.000498967034941166, "loss": 0.5677, "step": 151200 }, { "epoch": 20.385340878469414, "grad_norm": 0.18056745827198029, "learning_rate": 0.0004989296086709183, "loss": 0.5679, "step": 151300 }, { "epoch": 20.398814335758555, "grad_norm": 0.18214739859104156, "learning_rate": 0.0004988921824006707, "loss": 0.5663, "step": 151400 }, { "epoch": 20.412287793047696, "grad_norm": 0.19098100066184998, "learning_rate": 0.0004988547561304231, "loss": 0.5676, "step": 151500 }, { "epoch": 20.425761250336837, "grad_norm": 0.19894526898860931, "learning_rate": 0.0004988173298601754, "loss": 0.5677, "step": 151600 }, { "epoch": 20.439234707625978, "grad_norm": 0.21052078902721405, "learning_rate": 0.0004987799035899279, "loss": 0.5676, "step": 151700 }, { "epoch": 20.452708164915116, "grad_norm": 0.19846133887767792, "learning_rate": 0.0004987424773196802, "loss": 0.5693, "step": 151800 }, { "epoch": 20.466181622204257, "grad_norm": 0.1813189834356308, "learning_rate": 0.0004987050510494327, "loss": 0.5664, "step": 151900 }, { "epoch": 20.479655079493398, "grad_norm": 0.20844240486621857, "learning_rate": 0.000498667624779185, "loss": 0.5673, "step": 152000 }, { "epoch": 20.49312853678254, "grad_norm": 0.20188036561012268, "learning_rate": 0.0004986301985089374, "loss": 0.5673, "step": 152100 }, { "epoch": 20.50660199407168, "grad_norm": 0.20720399916172028, "learning_rate": 0.0004985927722386898, "loss": 0.5673, "step": 152200 }, { "epoch": 20.520075451360817, "grad_norm": 0.18666911125183105, "learning_rate": 0.0004985553459684422, "loss": 0.5679, "step": 152300 }, { "epoch": 20.53354890864996, "grad_norm": 0.20251156389713287, "learning_rate": 0.0004985179196981945, "loss": 0.5657, "step": 152400 }, { "epoch": 20.5470223659391, "grad_norm": 0.19666804373264313, "learning_rate": 0.000498480493427947, "loss": 0.5672, "step": 152500 }, { "epoch": 20.56049582322824, "grad_norm": 0.21131619811058044, "learning_rate": 0.0004984430671576993, "loss": 0.5673, "step": 152600 }, { "epoch": 20.57396928051738, "grad_norm": 0.20315764844417572, "learning_rate": 0.0004984056408874517, "loss": 0.5673, "step": 152700 }, { "epoch": 20.587442737806523, "grad_norm": 0.18456928431987762, "learning_rate": 0.0004983682146172041, "loss": 0.5681, "step": 152800 }, { "epoch": 20.60091619509566, "grad_norm": 0.1886671632528305, "learning_rate": 0.0004983307883469565, "loss": 0.5676, "step": 152900 }, { "epoch": 20.6143896523848, "grad_norm": 0.1883230209350586, "learning_rate": 0.0004982933620767089, "loss": 0.5669, "step": 153000 }, { "epoch": 20.627863109673942, "grad_norm": 0.19417184591293335, "learning_rate": 0.0004982559358064613, "loss": 0.5652, "step": 153100 }, { "epoch": 20.641336566963083, "grad_norm": 0.19092564284801483, "learning_rate": 0.0004982185095362137, "loss": 0.5677, "step": 153200 }, { "epoch": 20.654810024252225, "grad_norm": 0.20101222395896912, "learning_rate": 0.0004981810832659661, "loss": 0.5678, "step": 153300 }, { "epoch": 20.668283481541362, "grad_norm": 0.18150578439235687, "learning_rate": 0.0004981436569957184, "loss": 0.5673, "step": 153400 }, { "epoch": 20.681756938830503, "grad_norm": 0.19677764177322388, "learning_rate": 0.0004981062307254709, "loss": 0.5665, "step": 153500 }, { "epoch": 20.695230396119644, "grad_norm": 0.18622642755508423, "learning_rate": 0.0004980688044552232, "loss": 0.5674, "step": 153600 }, { "epoch": 20.708703853408785, "grad_norm": 0.19008390605449677, "learning_rate": 0.0004980313781849756, "loss": 0.5668, "step": 153700 }, { "epoch": 20.722177310697926, "grad_norm": 0.19192491471767426, "learning_rate": 0.000497993951914728, "loss": 0.5678, "step": 153800 }, { "epoch": 20.735650767987064, "grad_norm": 0.18889346718788147, "learning_rate": 0.0004979565256444804, "loss": 0.5673, "step": 153900 }, { "epoch": 20.749124225276205, "grad_norm": 0.21489392220973969, "learning_rate": 0.0004979190993742328, "loss": 0.5656, "step": 154000 }, { "epoch": 20.762597682565346, "grad_norm": 0.189887136220932, "learning_rate": 0.0004978816731039851, "loss": 0.5676, "step": 154100 }, { "epoch": 20.776071139854487, "grad_norm": 0.1887272596359253, "learning_rate": 0.0004978442468337375, "loss": 0.5668, "step": 154200 }, { "epoch": 20.789544597143628, "grad_norm": 0.1834978312253952, "learning_rate": 0.0004978068205634899, "loss": 0.5661, "step": 154300 }, { "epoch": 20.80301805443277, "grad_norm": 0.21438460052013397, "learning_rate": 0.0004977693942932423, "loss": 0.5668, "step": 154400 }, { "epoch": 20.816491511721907, "grad_norm": 0.2168339639902115, "learning_rate": 0.0004977319680229947, "loss": 0.5654, "step": 154500 }, { "epoch": 20.829964969011048, "grad_norm": 0.17964358627796173, "learning_rate": 0.0004976945417527471, "loss": 0.5662, "step": 154600 }, { "epoch": 20.84343842630019, "grad_norm": 0.23388200998306274, "learning_rate": 0.0004976571154824995, "loss": 0.5669, "step": 154700 }, { "epoch": 20.85691188358933, "grad_norm": 0.19635948538780212, "learning_rate": 0.0004976196892122519, "loss": 0.5667, "step": 154800 }, { "epoch": 20.87038534087847, "grad_norm": 0.20348678529262543, "learning_rate": 0.0004975822629420042, "loss": 0.5667, "step": 154900 }, { "epoch": 20.88385879816761, "grad_norm": 0.18654173612594604, "learning_rate": 0.0004975448366717567, "loss": 0.5661, "step": 155000 }, { "epoch": 20.89733225545675, "grad_norm": 0.19065189361572266, "learning_rate": 0.000497507410401509, "loss": 0.5655, "step": 155100 }, { "epoch": 20.91080571274589, "grad_norm": 0.18054015934467316, "learning_rate": 0.0004974699841312615, "loss": 0.5674, "step": 155200 }, { "epoch": 20.92427917003503, "grad_norm": 0.1820044070482254, "learning_rate": 0.0004974325578610138, "loss": 0.5659, "step": 155300 }, { "epoch": 20.937752627324173, "grad_norm": 0.18391361832618713, "learning_rate": 0.0004973951315907663, "loss": 0.5665, "step": 155400 }, { "epoch": 20.95122608461331, "grad_norm": 0.19623912870883942, "learning_rate": 0.0004973577053205186, "loss": 0.5658, "step": 155500 }, { "epoch": 20.96469954190245, "grad_norm": 0.20823346078395844, "learning_rate": 0.0004973202790502709, "loss": 0.5664, "step": 155600 }, { "epoch": 20.978172999191592, "grad_norm": 0.19277264177799225, "learning_rate": 0.0004972828527800234, "loss": 0.5668, "step": 155700 }, { "epoch": 20.991646456480733, "grad_norm": 0.20012864470481873, "learning_rate": 0.0004972454265097757, "loss": 0.5662, "step": 155800 }, { "epoch": 21.0, "eval_loss": 0.5525199174880981, "eval_runtime": 4.9247, "eval_samples_per_second": 1015.3, "eval_steps_per_second": 16.042, "step": 155862 }, { "epoch": 21.005119913769875, "grad_norm": 0.17597775161266327, "learning_rate": 0.0004972080002395281, "loss": 0.5643, "step": 155900 }, { "epoch": 21.018593371059012, "grad_norm": 0.19270636141300201, "learning_rate": 0.0004971705739692805, "loss": 0.565, "step": 156000 }, { "epoch": 21.032066828348153, "grad_norm": 0.19546855986118317, "learning_rate": 0.0004971331476990329, "loss": 0.5649, "step": 156100 }, { "epoch": 21.045540285637294, "grad_norm": 0.20594549179077148, "learning_rate": 0.0004970957214287853, "loss": 0.5643, "step": 156200 }, { "epoch": 21.059013742926435, "grad_norm": 0.19519783556461334, "learning_rate": 0.0004970582951585377, "loss": 0.5651, "step": 156300 }, { "epoch": 21.072487200215576, "grad_norm": 0.20319238305091858, "learning_rate": 0.00049702086888829, "loss": 0.5663, "step": 156400 }, { "epoch": 21.085960657504717, "grad_norm": 0.18416166305541992, "learning_rate": 0.0004969834426180425, "loss": 0.5654, "step": 156500 }, { "epoch": 21.099434114793855, "grad_norm": 0.18003292381763458, "learning_rate": 0.0004969460163477948, "loss": 0.5659, "step": 156600 }, { "epoch": 21.112907572082996, "grad_norm": 0.17357538640499115, "learning_rate": 0.0004969085900775473, "loss": 0.5651, "step": 156700 }, { "epoch": 21.126381029372137, "grad_norm": 0.20769518613815308, "learning_rate": 0.0004968711638072996, "loss": 0.5654, "step": 156800 }, { "epoch": 21.139854486661278, "grad_norm": 0.19435834884643555, "learning_rate": 0.0004968337375370521, "loss": 0.5653, "step": 156900 }, { "epoch": 21.15332794395042, "grad_norm": 0.19948576390743256, "learning_rate": 0.0004967963112668044, "loss": 0.5652, "step": 157000 }, { "epoch": 21.166801401239557, "grad_norm": 0.17948327958583832, "learning_rate": 0.0004967588849965568, "loss": 0.566, "step": 157100 }, { "epoch": 21.180274858528698, "grad_norm": 0.18726955354213715, "learning_rate": 0.0004967214587263092, "loss": 0.5646, "step": 157200 }, { "epoch": 21.19374831581784, "grad_norm": 0.1833568662405014, "learning_rate": 0.0004966840324560616, "loss": 0.5645, "step": 157300 }, { "epoch": 21.20722177310698, "grad_norm": 0.18384447693824768, "learning_rate": 0.000496646606185814, "loss": 0.5643, "step": 157400 }, { "epoch": 21.22069523039612, "grad_norm": 0.1933489739894867, "learning_rate": 0.0004966091799155663, "loss": 0.5656, "step": 157500 }, { "epoch": 21.23416868768526, "grad_norm": 0.19515042006969452, "learning_rate": 0.0004965717536453188, "loss": 0.5653, "step": 157600 }, { "epoch": 21.2476421449744, "grad_norm": 0.19678859412670135, "learning_rate": 0.0004965343273750711, "loss": 0.5646, "step": 157700 }, { "epoch": 21.26111560226354, "grad_norm": 0.19129018485546112, "learning_rate": 0.0004964969011048235, "loss": 0.5646, "step": 157800 }, { "epoch": 21.27458905955268, "grad_norm": 0.18177972733974457, "learning_rate": 0.0004964594748345758, "loss": 0.5648, "step": 157900 }, { "epoch": 21.288062516841823, "grad_norm": 0.19403231143951416, "learning_rate": 0.0004964220485643283, "loss": 0.5642, "step": 158000 }, { "epoch": 21.301535974130964, "grad_norm": 0.20224380493164062, "learning_rate": 0.0004963846222940806, "loss": 0.5645, "step": 158100 }, { "epoch": 21.3150094314201, "grad_norm": 0.19097447395324707, "learning_rate": 0.0004963471960238331, "loss": 0.5654, "step": 158200 }, { "epoch": 21.328482888709242, "grad_norm": 0.20119699835777283, "learning_rate": 0.0004963097697535854, "loss": 0.5643, "step": 158300 }, { "epoch": 21.341956345998383, "grad_norm": 0.1753535121679306, "learning_rate": 0.0004962723434833379, "loss": 0.5644, "step": 158400 }, { "epoch": 21.355429803287524, "grad_norm": 0.1786462664604187, "learning_rate": 0.0004962349172130902, "loss": 0.5648, "step": 158500 }, { "epoch": 21.368903260576666, "grad_norm": 0.18840648233890533, "learning_rate": 0.0004961974909428426, "loss": 0.5648, "step": 158600 }, { "epoch": 21.382376717865803, "grad_norm": 0.17723508179187775, "learning_rate": 0.000496160064672595, "loss": 0.565, "step": 158700 }, { "epoch": 21.395850175154944, "grad_norm": 0.183418869972229, "learning_rate": 0.0004961226384023474, "loss": 0.5648, "step": 158800 }, { "epoch": 21.409323632444085, "grad_norm": 0.20922081172466278, "learning_rate": 0.0004960852121320998, "loss": 0.5649, "step": 158900 }, { "epoch": 21.422797089733226, "grad_norm": 0.18486525118350983, "learning_rate": 0.0004960477858618522, "loss": 0.565, "step": 159000 }, { "epoch": 21.436270547022367, "grad_norm": 0.1886783391237259, "learning_rate": 0.0004960103595916046, "loss": 0.565, "step": 159100 }, { "epoch": 21.449744004311505, "grad_norm": 0.18292537331581116, "learning_rate": 0.000495972933321357, "loss": 0.5643, "step": 159200 }, { "epoch": 21.463217461600646, "grad_norm": 0.1852954477071762, "learning_rate": 0.0004959355070511093, "loss": 0.5638, "step": 159300 }, { "epoch": 21.476690918889787, "grad_norm": 0.1873287707567215, "learning_rate": 0.0004958980807808618, "loss": 0.5653, "step": 159400 }, { "epoch": 21.490164376178928, "grad_norm": 0.20763596892356873, "learning_rate": 0.0004958606545106141, "loss": 0.5646, "step": 159500 }, { "epoch": 21.50363783346807, "grad_norm": 0.1819988489151001, "learning_rate": 0.0004958232282403664, "loss": 0.565, "step": 159600 }, { "epoch": 21.517111290757207, "grad_norm": 0.2267882078886032, "learning_rate": 0.0004957858019701189, "loss": 0.5645, "step": 159700 }, { "epoch": 21.530584748046348, "grad_norm": 0.1964913308620453, "learning_rate": 0.0004957483756998712, "loss": 0.565, "step": 159800 }, { "epoch": 21.54405820533549, "grad_norm": 0.18867014348506927, "learning_rate": 0.0004957109494296237, "loss": 0.5657, "step": 159900 }, { "epoch": 21.55753166262463, "grad_norm": 0.20086899399757385, "learning_rate": 0.000495673523159376, "loss": 0.5648, "step": 160000 }, { "epoch": 21.57100511991377, "grad_norm": 0.20706753432750702, "learning_rate": 0.0004956360968891284, "loss": 0.5639, "step": 160100 }, { "epoch": 21.584478577202912, "grad_norm": 0.17812249064445496, "learning_rate": 0.0004955986706188808, "loss": 0.5642, "step": 160200 }, { "epoch": 21.59795203449205, "grad_norm": 0.19530659914016724, "learning_rate": 0.0004955612443486332, "loss": 0.5642, "step": 160300 }, { "epoch": 21.61142549178119, "grad_norm": 0.19237031042575836, "learning_rate": 0.0004955238180783856, "loss": 0.5655, "step": 160400 }, { "epoch": 21.62489894907033, "grad_norm": 0.1822291612625122, "learning_rate": 0.000495486391808138, "loss": 0.5649, "step": 160500 }, { "epoch": 21.638372406359473, "grad_norm": 0.18484021723270416, "learning_rate": 0.0004954489655378904, "loss": 0.5635, "step": 160600 }, { "epoch": 21.651845863648614, "grad_norm": 0.19156455993652344, "learning_rate": 0.0004954115392676428, "loss": 0.5628, "step": 160700 }, { "epoch": 21.66531932093775, "grad_norm": 0.21267811954021454, "learning_rate": 0.0004953741129973951, "loss": 0.5638, "step": 160800 }, { "epoch": 21.678792778226892, "grad_norm": 0.21014073491096497, "learning_rate": 0.0004953366867271476, "loss": 0.5655, "step": 160900 }, { "epoch": 21.692266235516033, "grad_norm": 0.19433985650539398, "learning_rate": 0.0004952992604568999, "loss": 0.5651, "step": 161000 }, { "epoch": 21.705739692805174, "grad_norm": 0.18715786933898926, "learning_rate": 0.0004952618341866524, "loss": 0.5641, "step": 161100 }, { "epoch": 21.719213150094316, "grad_norm": 0.18929694592952728, "learning_rate": 0.0004952244079164047, "loss": 0.564, "step": 161200 }, { "epoch": 21.732686607383453, "grad_norm": 0.1999722719192505, "learning_rate": 0.000495186981646157, "loss": 0.563, "step": 161300 }, { "epoch": 21.746160064672594, "grad_norm": 0.18524287641048431, "learning_rate": 0.0004951495553759094, "loss": 0.564, "step": 161400 }, { "epoch": 21.759633521961735, "grad_norm": 0.1873171627521515, "learning_rate": 0.0004951121291056618, "loss": 0.5648, "step": 161500 }, { "epoch": 21.773106979250876, "grad_norm": 0.17376992106437683, "learning_rate": 0.0004950747028354142, "loss": 0.5636, "step": 161600 }, { "epoch": 21.786580436540017, "grad_norm": 0.23027461767196655, "learning_rate": 0.0004950372765651666, "loss": 0.5634, "step": 161700 }, { "epoch": 21.80005389382916, "grad_norm": 0.18318326771259308, "learning_rate": 0.000494999850294919, "loss": 0.5638, "step": 161800 }, { "epoch": 21.813527351118296, "grad_norm": 0.18532977998256683, "learning_rate": 0.0004949624240246714, "loss": 0.5626, "step": 161900 }, { "epoch": 21.827000808407437, "grad_norm": 0.179155632853508, "learning_rate": 0.0004949249977544238, "loss": 0.564, "step": 162000 }, { "epoch": 21.840474265696578, "grad_norm": 0.18466445803642273, "learning_rate": 0.0004948875714841762, "loss": 0.5635, "step": 162100 }, { "epoch": 21.85394772298572, "grad_norm": 0.18539460003376007, "learning_rate": 0.0004948501452139286, "loss": 0.564, "step": 162200 }, { "epoch": 21.86742118027486, "grad_norm": 0.19619065523147583, "learning_rate": 0.0004948127189436809, "loss": 0.5642, "step": 162300 }, { "epoch": 21.880894637563998, "grad_norm": 0.184801384806633, "learning_rate": 0.0004947752926734334, "loss": 0.5636, "step": 162400 }, { "epoch": 21.89436809485314, "grad_norm": 0.17101196944713593, "learning_rate": 0.0004947378664031857, "loss": 0.5636, "step": 162500 }, { "epoch": 21.90784155214228, "grad_norm": 0.17424826323986053, "learning_rate": 0.0004947004401329382, "loss": 0.5634, "step": 162600 }, { "epoch": 21.92131500943142, "grad_norm": 0.17838972806930542, "learning_rate": 0.0004946630138626905, "loss": 0.5636, "step": 162700 }, { "epoch": 21.934788466720562, "grad_norm": 0.18775369226932526, "learning_rate": 0.000494625587592443, "loss": 0.5634, "step": 162800 }, { "epoch": 21.9482619240097, "grad_norm": 0.18728609383106232, "learning_rate": 0.0004945881613221953, "loss": 0.5626, "step": 162900 }, { "epoch": 21.96173538129884, "grad_norm": 0.1813744306564331, "learning_rate": 0.0004945507350519477, "loss": 0.5638, "step": 163000 }, { "epoch": 21.97520883858798, "grad_norm": 0.19766764342784882, "learning_rate": 0.0004945133087817001, "loss": 0.5635, "step": 163100 }, { "epoch": 21.988682295877123, "grad_norm": 0.1897711455821991, "learning_rate": 0.0004944758825114525, "loss": 0.5636, "step": 163200 }, { "epoch": 22.0, "eval_loss": 0.5499277114868164, "eval_runtime": 4.9204, "eval_samples_per_second": 1016.176, "eval_steps_per_second": 16.056, "step": 163284 }, { "epoch": 22.002155753166264, "grad_norm": 0.20440596342086792, "learning_rate": 0.0004944384562412048, "loss": 0.5632, "step": 163300 }, { "epoch": 22.0156292104554, "grad_norm": 0.1774771511554718, "learning_rate": 0.0004944010299709572, "loss": 0.5624, "step": 163400 }, { "epoch": 22.029102667744542, "grad_norm": 0.17878516018390656, "learning_rate": 0.0004943636037007096, "loss": 0.5619, "step": 163500 }, { "epoch": 22.042576125033683, "grad_norm": 0.18109266459941864, "learning_rate": 0.0004943261774304619, "loss": 0.5623, "step": 163600 }, { "epoch": 22.056049582322824, "grad_norm": 0.18651127815246582, "learning_rate": 0.0004942887511602144, "loss": 0.5631, "step": 163700 }, { "epoch": 22.069523039611965, "grad_norm": 0.17507034540176392, "learning_rate": 0.0004942513248899667, "loss": 0.5623, "step": 163800 }, { "epoch": 22.082996496901107, "grad_norm": 0.18630681931972504, "learning_rate": 0.0004942138986197192, "loss": 0.5628, "step": 163900 }, { "epoch": 22.096469954190244, "grad_norm": 0.19338850677013397, "learning_rate": 0.0004941764723494715, "loss": 0.5625, "step": 164000 }, { "epoch": 22.109943411479385, "grad_norm": 0.18330933153629303, "learning_rate": 0.000494139046079224, "loss": 0.563, "step": 164100 }, { "epoch": 22.123416868768526, "grad_norm": 0.21614223718643188, "learning_rate": 0.0004941016198089763, "loss": 0.5629, "step": 164200 }, { "epoch": 22.136890326057667, "grad_norm": 0.2100418508052826, "learning_rate": 0.0004940641935387288, "loss": 0.5632, "step": 164300 }, { "epoch": 22.15036378334681, "grad_norm": 0.1983720064163208, "learning_rate": 0.0004940267672684811, "loss": 0.5628, "step": 164400 }, { "epoch": 22.163837240635946, "grad_norm": 0.1846802532672882, "learning_rate": 0.0004939893409982335, "loss": 0.5616, "step": 164500 }, { "epoch": 22.177310697925087, "grad_norm": 0.20439152419567108, "learning_rate": 0.0004939519147279859, "loss": 0.5631, "step": 164600 }, { "epoch": 22.190784155214228, "grad_norm": 0.18316437304019928, "learning_rate": 0.0004939144884577383, "loss": 0.5626, "step": 164700 }, { "epoch": 22.20425761250337, "grad_norm": 0.18874777853488922, "learning_rate": 0.0004938770621874907, "loss": 0.5614, "step": 164800 }, { "epoch": 22.21773106979251, "grad_norm": 0.17709331214427948, "learning_rate": 0.0004938396359172431, "loss": 0.5618, "step": 164900 }, { "epoch": 22.231204527081648, "grad_norm": 0.20476138591766357, "learning_rate": 0.0004938022096469954, "loss": 0.5625, "step": 165000 }, { "epoch": 22.24467798437079, "grad_norm": 0.18826881051063538, "learning_rate": 0.0004937647833767477, "loss": 0.5621, "step": 165100 }, { "epoch": 22.25815144165993, "grad_norm": 0.18324942886829376, "learning_rate": 0.0004937273571065002, "loss": 0.5621, "step": 165200 }, { "epoch": 22.27162489894907, "grad_norm": 0.17678643763065338, "learning_rate": 0.0004936899308362525, "loss": 0.5616, "step": 165300 }, { "epoch": 22.285098356238212, "grad_norm": 0.18573962152004242, "learning_rate": 0.000493652504566005, "loss": 0.5627, "step": 165400 }, { "epoch": 22.29857181352735, "grad_norm": 0.17725896835327148, "learning_rate": 0.0004936150782957573, "loss": 0.5633, "step": 165500 }, { "epoch": 22.31204527081649, "grad_norm": 0.18259365856647491, "learning_rate": 0.0004935776520255098, "loss": 0.5636, "step": 165600 }, { "epoch": 22.32551872810563, "grad_norm": 0.2110816389322281, "learning_rate": 0.0004935402257552621, "loss": 0.5636, "step": 165700 }, { "epoch": 22.338992185394773, "grad_norm": 0.1805492788553238, "learning_rate": 0.0004935027994850145, "loss": 0.5619, "step": 165800 }, { "epoch": 22.352465642683914, "grad_norm": 0.1875033974647522, "learning_rate": 0.0004934653732147669, "loss": 0.5625, "step": 165900 }, { "epoch": 22.365939099973055, "grad_norm": 0.19798198342323303, "learning_rate": 0.0004934279469445193, "loss": 0.5619, "step": 166000 }, { "epoch": 22.379412557262192, "grad_norm": 0.17726217210292816, "learning_rate": 0.0004933905206742717, "loss": 0.5625, "step": 166100 }, { "epoch": 22.392886014551333, "grad_norm": 0.1942170262336731, "learning_rate": 0.0004933530944040241, "loss": 0.5626, "step": 166200 }, { "epoch": 22.406359471840474, "grad_norm": 0.1815304011106491, "learning_rate": 0.0004933156681337765, "loss": 0.5619, "step": 166300 }, { "epoch": 22.419832929129615, "grad_norm": 0.17774781584739685, "learning_rate": 0.0004932782418635289, "loss": 0.5617, "step": 166400 }, { "epoch": 22.433306386418757, "grad_norm": 0.18591001629829407, "learning_rate": 0.0004932408155932813, "loss": 0.5622, "step": 166500 }, { "epoch": 22.446779843707894, "grad_norm": 0.17605601251125336, "learning_rate": 0.0004932033893230337, "loss": 0.561, "step": 166600 }, { "epoch": 22.460253300997035, "grad_norm": 0.20248523354530334, "learning_rate": 0.000493165963052786, "loss": 0.5615, "step": 166700 }, { "epoch": 22.473726758286176, "grad_norm": 0.1809377819299698, "learning_rate": 0.0004931285367825384, "loss": 0.5619, "step": 166800 }, { "epoch": 22.487200215575317, "grad_norm": 0.22050178050994873, "learning_rate": 0.0004930911105122908, "loss": 0.5625, "step": 166900 }, { "epoch": 22.50067367286446, "grad_norm": 0.1799422651529312, "learning_rate": 0.0004930536842420431, "loss": 0.5623, "step": 167000 }, { "epoch": 22.514147130153596, "grad_norm": 0.17757967114448547, "learning_rate": 0.0004930162579717956, "loss": 0.5609, "step": 167100 }, { "epoch": 22.527620587442737, "grad_norm": 0.1912573277950287, "learning_rate": 0.0004929788317015479, "loss": 0.5623, "step": 167200 }, { "epoch": 22.541094044731878, "grad_norm": 0.19975891709327698, "learning_rate": 0.0004929414054313003, "loss": 0.5613, "step": 167300 }, { "epoch": 22.55456750202102, "grad_norm": 0.20884960889816284, "learning_rate": 0.0004929039791610527, "loss": 0.5624, "step": 167400 }, { "epoch": 22.56804095931016, "grad_norm": 0.18710607290267944, "learning_rate": 0.0004928665528908051, "loss": 0.562, "step": 167500 }, { "epoch": 22.581514416599298, "grad_norm": 0.18243646621704102, "learning_rate": 0.0004928291266205575, "loss": 0.5622, "step": 167600 }, { "epoch": 22.59498787388844, "grad_norm": 0.17454124987125397, "learning_rate": 0.0004927917003503099, "loss": 0.5607, "step": 167700 }, { "epoch": 22.60846133117758, "grad_norm": 0.1856917291879654, "learning_rate": 0.0004927542740800623, "loss": 0.5622, "step": 167800 }, { "epoch": 22.62193478846672, "grad_norm": 0.19023793935775757, "learning_rate": 0.0004927168478098147, "loss": 0.5612, "step": 167900 }, { "epoch": 22.635408245755862, "grad_norm": 0.17285673320293427, "learning_rate": 0.000492679421539567, "loss": 0.5623, "step": 168000 }, { "epoch": 22.648881703045003, "grad_norm": 0.20997704565525055, "learning_rate": 0.0004926419952693195, "loss": 0.5621, "step": 168100 }, { "epoch": 22.66235516033414, "grad_norm": 0.1777045726776123, "learning_rate": 0.0004926045689990718, "loss": 0.5622, "step": 168200 }, { "epoch": 22.67582861762328, "grad_norm": 0.20138612389564514, "learning_rate": 0.0004925671427288243, "loss": 0.5609, "step": 168300 }, { "epoch": 22.689302074912423, "grad_norm": 0.2206546813249588, "learning_rate": 0.0004925297164585766, "loss": 0.5618, "step": 168400 }, { "epoch": 22.702775532201564, "grad_norm": 0.17673003673553467, "learning_rate": 0.0004924922901883291, "loss": 0.5624, "step": 168500 }, { "epoch": 22.716248989490705, "grad_norm": 0.18426775932312012, "learning_rate": 0.0004924548639180814, "loss": 0.5621, "step": 168600 }, { "epoch": 22.729722446779842, "grad_norm": 0.1896793693304062, "learning_rate": 0.0004924174376478338, "loss": 0.5623, "step": 168700 }, { "epoch": 22.743195904068983, "grad_norm": 0.17833077907562256, "learning_rate": 0.0004923800113775861, "loss": 0.5601, "step": 168800 }, { "epoch": 22.756669361358124, "grad_norm": 0.17767682671546936, "learning_rate": 0.0004923425851073386, "loss": 0.5623, "step": 168900 }, { "epoch": 22.770142818647265, "grad_norm": 0.18551932275295258, "learning_rate": 0.0004923051588370909, "loss": 0.5615, "step": 169000 }, { "epoch": 22.783616275936406, "grad_norm": 0.17786213755607605, "learning_rate": 0.0004922677325668433, "loss": 0.5614, "step": 169100 }, { "epoch": 22.797089733225544, "grad_norm": 0.19128774106502533, "learning_rate": 0.0004922303062965957, "loss": 0.5617, "step": 169200 }, { "epoch": 22.810563190514685, "grad_norm": 0.201755091547966, "learning_rate": 0.0004921928800263481, "loss": 0.561, "step": 169300 }, { "epoch": 22.824036647803826, "grad_norm": 0.20227587223052979, "learning_rate": 0.0004921554537561005, "loss": 0.562, "step": 169400 }, { "epoch": 22.837510105092967, "grad_norm": 0.17342524230480194, "learning_rate": 0.0004921180274858528, "loss": 0.5621, "step": 169500 }, { "epoch": 22.85098356238211, "grad_norm": 0.18072417378425598, "learning_rate": 0.0004920806012156053, "loss": 0.5608, "step": 169600 }, { "epoch": 22.86445701967125, "grad_norm": 0.1695725917816162, "learning_rate": 0.0004920431749453576, "loss": 0.5612, "step": 169700 }, { "epoch": 22.877930476960387, "grad_norm": 0.1943567395210266, "learning_rate": 0.0004920057486751101, "loss": 0.5611, "step": 169800 }, { "epoch": 22.891403934249528, "grad_norm": 0.19336853921413422, "learning_rate": 0.0004919683224048624, "loss": 0.5609, "step": 169900 }, { "epoch": 22.90487739153867, "grad_norm": 0.1963842660188675, "learning_rate": 0.0004919308961346149, "loss": 0.5616, "step": 170000 }, { "epoch": 22.91835084882781, "grad_norm": 0.19471324980258942, "learning_rate": 0.0004918934698643672, "loss": 0.5623, "step": 170100 }, { "epoch": 22.93182430611695, "grad_norm": 0.18232223391532898, "learning_rate": 0.0004918560435941196, "loss": 0.5616, "step": 170200 }, { "epoch": 22.94529776340609, "grad_norm": 0.20877613127231598, "learning_rate": 0.000491818617323872, "loss": 0.5609, "step": 170300 }, { "epoch": 22.95877122069523, "grad_norm": 0.17349277436733246, "learning_rate": 0.0004917811910536244, "loss": 0.5612, "step": 170400 }, { "epoch": 22.97224467798437, "grad_norm": 0.1756744235754013, "learning_rate": 0.0004917437647833767, "loss": 0.5605, "step": 170500 }, { "epoch": 22.985718135273512, "grad_norm": 0.18866467475891113, "learning_rate": 0.0004917063385131292, "loss": 0.5602, "step": 170600 }, { "epoch": 22.999191592562653, "grad_norm": 0.17024242877960205, "learning_rate": 0.0004916689122428815, "loss": 0.5613, "step": 170700 }, { "epoch": 23.0, "eval_loss": 0.5484337210655212, "eval_runtime": 4.966, "eval_samples_per_second": 1006.852, "eval_steps_per_second": 15.908, "step": 170706 }, { "epoch": 23.01266504985179, "grad_norm": 0.1802133470773697, "learning_rate": 0.000491631485972634, "loss": 0.5599, "step": 170800 }, { "epoch": 23.02613850714093, "grad_norm": 0.17397505044937134, "learning_rate": 0.0004915940597023863, "loss": 0.5596, "step": 170900 }, { "epoch": 23.039611964430073, "grad_norm": 0.18454228341579437, "learning_rate": 0.0004915566334321386, "loss": 0.5599, "step": 171000 }, { "epoch": 23.053085421719214, "grad_norm": 0.17356672883033752, "learning_rate": 0.0004915192071618911, "loss": 0.561, "step": 171100 }, { "epoch": 23.066558879008355, "grad_norm": 0.17642800509929657, "learning_rate": 0.0004914817808916434, "loss": 0.5604, "step": 171200 }, { "epoch": 23.080032336297492, "grad_norm": 0.19329267740249634, "learning_rate": 0.0004914443546213959, "loss": 0.5605, "step": 171300 }, { "epoch": 23.093505793586633, "grad_norm": 0.16962982714176178, "learning_rate": 0.0004914069283511482, "loss": 0.5608, "step": 171400 }, { "epoch": 23.106979250875774, "grad_norm": 0.1678919792175293, "learning_rate": 0.0004913695020809007, "loss": 0.5599, "step": 171500 }, { "epoch": 23.120452708164915, "grad_norm": 0.18417988717556, "learning_rate": 0.000491332075810653, "loss": 0.5598, "step": 171600 }, { "epoch": 23.133926165454056, "grad_norm": 0.20306114852428436, "learning_rate": 0.0004912946495404054, "loss": 0.5589, "step": 171700 }, { "epoch": 23.147399622743198, "grad_norm": 0.18179674446582794, "learning_rate": 0.0004912572232701578, "loss": 0.5601, "step": 171800 }, { "epoch": 23.160873080032335, "grad_norm": 0.1733054518699646, "learning_rate": 0.0004912197969999102, "loss": 0.5601, "step": 171900 }, { "epoch": 23.174346537321476, "grad_norm": 0.17235663533210754, "learning_rate": 0.0004911823707296626, "loss": 0.5603, "step": 172000 }, { "epoch": 23.187819994610617, "grad_norm": 0.17395207285881042, "learning_rate": 0.000491144944459415, "loss": 0.5608, "step": 172100 }, { "epoch": 23.20129345189976, "grad_norm": 0.17744717001914978, "learning_rate": 0.0004911075181891673, "loss": 0.5595, "step": 172200 }, { "epoch": 23.2147669091889, "grad_norm": 0.19200854003429413, "learning_rate": 0.0004910700919189198, "loss": 0.5612, "step": 172300 }, { "epoch": 23.228240366478037, "grad_norm": 0.17331577837467194, "learning_rate": 0.0004910326656486721, "loss": 0.5609, "step": 172400 }, { "epoch": 23.241713823767178, "grad_norm": 0.1915639042854309, "learning_rate": 0.0004909952393784245, "loss": 0.561, "step": 172500 }, { "epoch": 23.25518728105632, "grad_norm": 0.18633948266506195, "learning_rate": 0.0004909578131081769, "loss": 0.5604, "step": 172600 }, { "epoch": 23.26866073834546, "grad_norm": 0.16804121434688568, "learning_rate": 0.0004909203868379293, "loss": 0.5612, "step": 172700 }, { "epoch": 23.2821341956346, "grad_norm": 0.18058809638023376, "learning_rate": 0.0004908829605676817, "loss": 0.5609, "step": 172800 }, { "epoch": 23.29560765292374, "grad_norm": 0.1912873089313507, "learning_rate": 0.000490845534297434, "loss": 0.5595, "step": 172900 }, { "epoch": 23.30908111021288, "grad_norm": 0.18396419286727905, "learning_rate": 0.0004908081080271865, "loss": 0.56, "step": 173000 }, { "epoch": 23.32255456750202, "grad_norm": 0.17948633432388306, "learning_rate": 0.0004907706817569388, "loss": 0.5592, "step": 173100 }, { "epoch": 23.336028024791162, "grad_norm": 0.18168383836746216, "learning_rate": 0.0004907332554866912, "loss": 0.5595, "step": 173200 }, { "epoch": 23.349501482080303, "grad_norm": 0.19642141461372375, "learning_rate": 0.0004906958292164436, "loss": 0.559, "step": 173300 }, { "epoch": 23.362974939369444, "grad_norm": 0.18583959341049194, "learning_rate": 0.000490658402946196, "loss": 0.561, "step": 173400 }, { "epoch": 23.37644839665858, "grad_norm": 0.18414270877838135, "learning_rate": 0.0004906209766759484, "loss": 0.5607, "step": 173500 }, { "epoch": 23.389921853947722, "grad_norm": 0.18130391836166382, "learning_rate": 0.0004905835504057008, "loss": 0.56, "step": 173600 }, { "epoch": 23.403395311236864, "grad_norm": 0.19362537562847137, "learning_rate": 0.0004905461241354532, "loss": 0.5605, "step": 173700 }, { "epoch": 23.416868768526005, "grad_norm": 0.16773472726345062, "learning_rate": 0.0004905086978652056, "loss": 0.5599, "step": 173800 }, { "epoch": 23.430342225815146, "grad_norm": 0.18674634397029877, "learning_rate": 0.0004904712715949579, "loss": 0.5604, "step": 173900 }, { "epoch": 23.443815683104283, "grad_norm": 0.18554353713989258, "learning_rate": 0.0004904338453247103, "loss": 0.56, "step": 174000 }, { "epoch": 23.457289140393424, "grad_norm": 0.17821581661701202, "learning_rate": 0.0004903964190544627, "loss": 0.5591, "step": 174100 }, { "epoch": 23.470762597682565, "grad_norm": 0.17980588972568512, "learning_rate": 0.0004903589927842151, "loss": 0.5592, "step": 174200 }, { "epoch": 23.484236054971706, "grad_norm": 0.18674099445343018, "learning_rate": 0.0004903215665139675, "loss": 0.5581, "step": 174300 }, { "epoch": 23.497709512260847, "grad_norm": 0.1934693157672882, "learning_rate": 0.0004902841402437199, "loss": 0.5596, "step": 174400 }, { "epoch": 23.511182969549985, "grad_norm": 0.17265217006206512, "learning_rate": 0.0004902467139734723, "loss": 0.5584, "step": 174500 }, { "epoch": 23.524656426839126, "grad_norm": 0.18116915225982666, "learning_rate": 0.0004902092877032247, "loss": 0.559, "step": 174600 }, { "epoch": 23.538129884128267, "grad_norm": 0.17647728323936462, "learning_rate": 0.000490171861432977, "loss": 0.5601, "step": 174700 }, { "epoch": 23.551603341417408, "grad_norm": 0.17223778367042542, "learning_rate": 0.0004901344351627295, "loss": 0.5599, "step": 174800 }, { "epoch": 23.56507679870655, "grad_norm": 0.17669333517551422, "learning_rate": 0.0004900970088924818, "loss": 0.5585, "step": 174900 }, { "epoch": 23.578550255995687, "grad_norm": 0.18985342979431152, "learning_rate": 0.0004900595826222342, "loss": 0.5593, "step": 175000 }, { "epoch": 23.592023713284828, "grad_norm": 0.1670546978712082, "learning_rate": 0.0004900221563519866, "loss": 0.5595, "step": 175100 }, { "epoch": 23.60549717057397, "grad_norm": 0.17411255836486816, "learning_rate": 0.000489984730081739, "loss": 0.5592, "step": 175200 }, { "epoch": 23.61897062786311, "grad_norm": 0.17771989107131958, "learning_rate": 0.0004899473038114914, "loss": 0.5605, "step": 175300 }, { "epoch": 23.63244408515225, "grad_norm": 0.17684194445610046, "learning_rate": 0.0004899098775412437, "loss": 0.5595, "step": 175400 }, { "epoch": 23.645917542441392, "grad_norm": 0.17193399369716644, "learning_rate": 0.0004898724512709962, "loss": 0.5607, "step": 175500 }, { "epoch": 23.65939099973053, "grad_norm": 0.17925529181957245, "learning_rate": 0.0004898350250007485, "loss": 0.5592, "step": 175600 }, { "epoch": 23.67286445701967, "grad_norm": 0.17730829119682312, "learning_rate": 0.000489797598730501, "loss": 0.5602, "step": 175700 }, { "epoch": 23.68633791430881, "grad_norm": 0.18404234945774078, "learning_rate": 0.0004897601724602533, "loss": 0.5595, "step": 175800 }, { "epoch": 23.699811371597953, "grad_norm": 0.18217375874519348, "learning_rate": 0.0004897227461900057, "loss": 0.5595, "step": 175900 }, { "epoch": 23.713284828887094, "grad_norm": 0.19194573163986206, "learning_rate": 0.000489685319919758, "loss": 0.5601, "step": 176000 }, { "epoch": 23.72675828617623, "grad_norm": 0.1775050312280655, "learning_rate": 0.0004896478936495105, "loss": 0.5589, "step": 176100 }, { "epoch": 23.740231743465372, "grad_norm": 0.1858302354812622, "learning_rate": 0.0004896104673792628, "loss": 0.5586, "step": 176200 }, { "epoch": 23.753705200754514, "grad_norm": 0.1939878612756729, "learning_rate": 0.0004895730411090153, "loss": 0.5602, "step": 176300 }, { "epoch": 23.767178658043655, "grad_norm": 0.16433894634246826, "learning_rate": 0.0004895356148387676, "loss": 0.5602, "step": 176400 }, { "epoch": 23.780652115332796, "grad_norm": 0.18081903457641602, "learning_rate": 0.0004894981885685201, "loss": 0.5587, "step": 176500 }, { "epoch": 23.794125572621933, "grad_norm": 0.18611791729927063, "learning_rate": 0.0004894607622982724, "loss": 0.5604, "step": 176600 }, { "epoch": 23.807599029911074, "grad_norm": 0.1630348265171051, "learning_rate": 0.0004894233360280247, "loss": 0.5596, "step": 176700 }, { "epoch": 23.821072487200215, "grad_norm": 0.17033906280994415, "learning_rate": 0.0004893859097577772, "loss": 0.5597, "step": 176800 }, { "epoch": 23.834545944489356, "grad_norm": 0.1804981678724289, "learning_rate": 0.0004893484834875295, "loss": 0.559, "step": 176900 }, { "epoch": 23.848019401778497, "grad_norm": 0.20591925084590912, "learning_rate": 0.000489311057217282, "loss": 0.5591, "step": 177000 }, { "epoch": 23.86149285906764, "grad_norm": 0.16729497909545898, "learning_rate": 0.0004892736309470343, "loss": 0.558, "step": 177100 }, { "epoch": 23.874966316356776, "grad_norm": 0.17591315507888794, "learning_rate": 0.0004892362046767868, "loss": 0.5586, "step": 177200 }, { "epoch": 23.888439773645917, "grad_norm": 0.17850565910339355, "learning_rate": 0.0004891987784065391, "loss": 0.5602, "step": 177300 }, { "epoch": 23.901913230935058, "grad_norm": 0.17866884171962738, "learning_rate": 0.0004891613521362916, "loss": 0.558, "step": 177400 }, { "epoch": 23.9153866882242, "grad_norm": 0.1906076818704605, "learning_rate": 0.0004891239258660439, "loss": 0.5578, "step": 177500 }, { "epoch": 23.92886014551334, "grad_norm": 0.1793789565563202, "learning_rate": 0.0004890864995957963, "loss": 0.56, "step": 177600 }, { "epoch": 23.942333602802478, "grad_norm": 0.1688150316476822, "learning_rate": 0.0004890490733255486, "loss": 0.5588, "step": 177700 }, { "epoch": 23.95580706009162, "grad_norm": 0.176810160279274, "learning_rate": 0.0004890116470553011, "loss": 0.5582, "step": 177800 }, { "epoch": 23.96928051738076, "grad_norm": 0.16668574512004852, "learning_rate": 0.0004889742207850534, "loss": 0.5589, "step": 177900 }, { "epoch": 23.9827539746699, "grad_norm": 0.18226394057273865, "learning_rate": 0.0004889367945148059, "loss": 0.559, "step": 178000 }, { "epoch": 23.996227431959042, "grad_norm": 0.1677313596010208, "learning_rate": 0.0004888993682445582, "loss": 0.5589, "step": 178100 }, { "epoch": 24.0, "eval_loss": 0.5462620258331299, "eval_runtime": 4.9473, "eval_samples_per_second": 1010.66, "eval_steps_per_second": 15.968, "step": 178128 }, { "epoch": 24.00970088924818, "grad_norm": 0.16527053713798523, "learning_rate": 0.0004888619419743106, "loss": 0.5579, "step": 178200 }, { "epoch": 24.02317434653732, "grad_norm": 0.16996057331562042, "learning_rate": 0.000488824515704063, "loss": 0.5585, "step": 178300 }, { "epoch": 24.03664780382646, "grad_norm": 0.20018506050109863, "learning_rate": 0.0004887870894338154, "loss": 0.559, "step": 178400 }, { "epoch": 24.050121261115603, "grad_norm": 0.186064213514328, "learning_rate": 0.0004887496631635678, "loss": 0.5586, "step": 178500 }, { "epoch": 24.063594718404744, "grad_norm": 0.1772656887769699, "learning_rate": 0.0004887122368933202, "loss": 0.5584, "step": 178600 }, { "epoch": 24.07706817569388, "grad_norm": 0.16770300269126892, "learning_rate": 0.0004886748106230726, "loss": 0.5589, "step": 178700 }, { "epoch": 24.090541632983022, "grad_norm": 0.1918691247701645, "learning_rate": 0.000488637384352825, "loss": 0.5577, "step": 178800 }, { "epoch": 24.104015090272163, "grad_norm": 0.17741252481937408, "learning_rate": 0.0004885999580825773, "loss": 0.559, "step": 178900 }, { "epoch": 24.117488547561305, "grad_norm": 0.1742209494113922, "learning_rate": 0.0004885625318123297, "loss": 0.5592, "step": 179000 }, { "epoch": 24.130962004850446, "grad_norm": 0.1680833250284195, "learning_rate": 0.0004885251055420821, "loss": 0.5585, "step": 179100 }, { "epoch": 24.144435462139587, "grad_norm": 0.18516331911087036, "learning_rate": 0.0004884876792718345, "loss": 0.558, "step": 179200 }, { "epoch": 24.157908919428724, "grad_norm": 0.1694164276123047, "learning_rate": 0.0004884502530015869, "loss": 0.558, "step": 179300 }, { "epoch": 24.171382376717865, "grad_norm": 0.17918741703033447, "learning_rate": 0.0004884128267313393, "loss": 0.5596, "step": 179400 }, { "epoch": 24.184855834007006, "grad_norm": 0.18341219425201416, "learning_rate": 0.0004883754004610917, "loss": 0.5591, "step": 179500 }, { "epoch": 24.198329291296147, "grad_norm": 0.178458109498024, "learning_rate": 0.000488337974190844, "loss": 0.5577, "step": 179600 }, { "epoch": 24.21180274858529, "grad_norm": 0.2020404189825058, "learning_rate": 0.0004883005479205964, "loss": 0.5582, "step": 179700 }, { "epoch": 24.225276205874426, "grad_norm": 0.1804492473602295, "learning_rate": 0.0004882631216503488, "loss": 0.56, "step": 179800 }, { "epoch": 24.238749663163567, "grad_norm": 0.18723396956920624, "learning_rate": 0.0004882256953801012, "loss": 0.5577, "step": 179900 }, { "epoch": 24.252223120452708, "grad_norm": 0.17179477214813232, "learning_rate": 0.00048818826910985356, "loss": 0.5592, "step": 180000 }, { "epoch": 24.26569657774185, "grad_norm": 0.18407440185546875, "learning_rate": 0.00048815084283960596, "loss": 0.5582, "step": 180100 }, { "epoch": 24.27917003503099, "grad_norm": 0.1960892528295517, "learning_rate": 0.00048811341656935836, "loss": 0.5579, "step": 180200 }, { "epoch": 24.292643492320128, "grad_norm": 0.1968892216682434, "learning_rate": 0.00048807599029911076, "loss": 0.5588, "step": 180300 }, { "epoch": 24.30611694960927, "grad_norm": 0.22432373464107513, "learning_rate": 0.00048803856402886316, "loss": 0.5581, "step": 180400 }, { "epoch": 24.31959040689841, "grad_norm": 0.1966257393360138, "learning_rate": 0.00048800113775861556, "loss": 0.5575, "step": 180500 }, { "epoch": 24.33306386418755, "grad_norm": 0.16865691542625427, "learning_rate": 0.00048796371148836795, "loss": 0.5579, "step": 180600 }, { "epoch": 24.346537321476692, "grad_norm": 0.18180525302886963, "learning_rate": 0.0004879262852181203, "loss": 0.5584, "step": 180700 }, { "epoch": 24.360010778765833, "grad_norm": 0.1840043067932129, "learning_rate": 0.0004878888589478727, "loss": 0.5596, "step": 180800 }, { "epoch": 24.37348423605497, "grad_norm": 0.17708534002304077, "learning_rate": 0.0004878514326776251, "loss": 0.5575, "step": 180900 }, { "epoch": 24.38695769334411, "grad_norm": 0.16541631519794464, "learning_rate": 0.0004878140064073775, "loss": 0.5591, "step": 181000 }, { "epoch": 24.400431150633253, "grad_norm": 0.16328050196170807, "learning_rate": 0.0004877765801371299, "loss": 0.5588, "step": 181100 }, { "epoch": 24.413904607922394, "grad_norm": 0.16280323266983032, "learning_rate": 0.0004877391538668823, "loss": 0.5578, "step": 181200 }, { "epoch": 24.427378065211535, "grad_norm": 0.1864689737558365, "learning_rate": 0.00048770172759663463, "loss": 0.5579, "step": 181300 }, { "epoch": 24.440851522500672, "grad_norm": 0.18588761985301971, "learning_rate": 0.00048766430132638703, "loss": 0.5574, "step": 181400 }, { "epoch": 24.454324979789813, "grad_norm": 0.18457630276679993, "learning_rate": 0.0004876268750561394, "loss": 0.5578, "step": 181500 }, { "epoch": 24.467798437078955, "grad_norm": 0.16667474806308746, "learning_rate": 0.0004875894487858918, "loss": 0.5573, "step": 181600 }, { "epoch": 24.481271894368096, "grad_norm": 0.18678000569343567, "learning_rate": 0.00048755202251564417, "loss": 0.557, "step": 181700 }, { "epoch": 24.494745351657237, "grad_norm": 0.16545584797859192, "learning_rate": 0.00048751459624539657, "loss": 0.5578, "step": 181800 }, { "epoch": 24.508218808946374, "grad_norm": 0.18002210557460785, "learning_rate": 0.00048747716997514897, "loss": 0.5578, "step": 181900 }, { "epoch": 24.521692266235515, "grad_norm": 0.16885274648666382, "learning_rate": 0.00048743974370490137, "loss": 0.5577, "step": 182000 }, { "epoch": 24.535165723524656, "grad_norm": 0.18239398300647736, "learning_rate": 0.00048740231743465376, "loss": 0.5583, "step": 182100 }, { "epoch": 24.548639180813797, "grad_norm": 0.18280941247940063, "learning_rate": 0.0004873648911644061, "loss": 0.557, "step": 182200 }, { "epoch": 24.56211263810294, "grad_norm": 0.16450199484825134, "learning_rate": 0.0004873274648941585, "loss": 0.557, "step": 182300 }, { "epoch": 24.575586095392076, "grad_norm": 0.1817501336336136, "learning_rate": 0.0004872900386239109, "loss": 0.5562, "step": 182400 }, { "epoch": 24.589059552681217, "grad_norm": 0.18709951639175415, "learning_rate": 0.0004872526123536633, "loss": 0.5581, "step": 182500 }, { "epoch": 24.602533009970358, "grad_norm": 0.18656493723392487, "learning_rate": 0.0004872151860834157, "loss": 0.5573, "step": 182600 }, { "epoch": 24.6160064672595, "grad_norm": 0.17221207916736603, "learning_rate": 0.0004871777598131681, "loss": 0.5582, "step": 182700 }, { "epoch": 24.62947992454864, "grad_norm": 0.19517621397972107, "learning_rate": 0.0004871403335429205, "loss": 0.5574, "step": 182800 }, { "epoch": 24.64295338183778, "grad_norm": 0.1882408708333969, "learning_rate": 0.00048710290727267284, "loss": 0.5573, "step": 182900 }, { "epoch": 24.65642683912692, "grad_norm": 0.17031066119670868, "learning_rate": 0.00048706548100242524, "loss": 0.5582, "step": 183000 }, { "epoch": 24.66990029641606, "grad_norm": 0.1830800622701645, "learning_rate": 0.0004870280547321776, "loss": 0.5578, "step": 183100 }, { "epoch": 24.6833737537052, "grad_norm": 0.18608978390693665, "learning_rate": 0.00048699062846193, "loss": 0.5575, "step": 183200 }, { "epoch": 24.696847210994342, "grad_norm": 0.17935633659362793, "learning_rate": 0.0004869532021916824, "loss": 0.5571, "step": 183300 }, { "epoch": 24.710320668283483, "grad_norm": 0.17336037755012512, "learning_rate": 0.0004869157759214348, "loss": 0.5583, "step": 183400 }, { "epoch": 24.72379412557262, "grad_norm": 0.16558220982551575, "learning_rate": 0.0004868783496511872, "loss": 0.5568, "step": 183500 }, { "epoch": 24.73726758286176, "grad_norm": 0.16553173959255219, "learning_rate": 0.0004868409233809396, "loss": 0.5558, "step": 183600 }, { "epoch": 24.750741040150903, "grad_norm": 0.1989096701145172, "learning_rate": 0.0004868034971106919, "loss": 0.5568, "step": 183700 }, { "epoch": 24.764214497440044, "grad_norm": 0.18104788661003113, "learning_rate": 0.0004867660708404443, "loss": 0.557, "step": 183800 }, { "epoch": 24.777687954729185, "grad_norm": 0.17270323634147644, "learning_rate": 0.0004867286445701967, "loss": 0.5574, "step": 183900 }, { "epoch": 24.791161412018322, "grad_norm": 0.16855423152446747, "learning_rate": 0.0004866912182999491, "loss": 0.558, "step": 184000 }, { "epoch": 24.804634869307463, "grad_norm": 0.16762346029281616, "learning_rate": 0.0004866537920297015, "loss": 0.5568, "step": 184100 }, { "epoch": 24.818108326596604, "grad_norm": 0.1727660447359085, "learning_rate": 0.0004866163657594539, "loss": 0.5562, "step": 184200 }, { "epoch": 24.831581783885746, "grad_norm": 0.17314627766609192, "learning_rate": 0.0004865789394892063, "loss": 0.5579, "step": 184300 }, { "epoch": 24.845055241174887, "grad_norm": 0.16971036791801453, "learning_rate": 0.00048654151321895865, "loss": 0.5578, "step": 184400 }, { "epoch": 24.858528698464028, "grad_norm": 0.1648082137107849, "learning_rate": 0.00048650408694871105, "loss": 0.5574, "step": 184500 }, { "epoch": 24.872002155753165, "grad_norm": 0.174635648727417, "learning_rate": 0.00048646666067846345, "loss": 0.5573, "step": 184600 }, { "epoch": 24.885475613042306, "grad_norm": 0.17371924221515656, "learning_rate": 0.00048642923440821585, "loss": 0.5578, "step": 184700 }, { "epoch": 24.898949070331447, "grad_norm": 0.17244970798492432, "learning_rate": 0.00048639180813796825, "loss": 0.5569, "step": 184800 }, { "epoch": 24.91242252762059, "grad_norm": 0.1755051463842392, "learning_rate": 0.0004863543818677206, "loss": 0.557, "step": 184900 }, { "epoch": 24.92589598490973, "grad_norm": 0.18390098214149475, "learning_rate": 0.000486316955597473, "loss": 0.5563, "step": 185000 }, { "epoch": 24.939369442198867, "grad_norm": 0.16900460422039032, "learning_rate": 0.00048627952932722533, "loss": 0.5565, "step": 185100 }, { "epoch": 24.952842899488008, "grad_norm": 0.17093566060066223, "learning_rate": 0.00048624210305697773, "loss": 0.5564, "step": 185200 }, { "epoch": 24.96631635677715, "grad_norm": 0.17355068027973175, "learning_rate": 0.00048620467678673013, "loss": 0.5566, "step": 185300 }, { "epoch": 24.97978981406629, "grad_norm": 0.17774474620819092, "learning_rate": 0.0004861672505164825, "loss": 0.557, "step": 185400 }, { "epoch": 24.99326327135543, "grad_norm": 0.1672983467578888, "learning_rate": 0.0004861298242462349, "loss": 0.5563, "step": 185500 }, { "epoch": 25.0, "eval_loss": 0.5447633862495422, "eval_runtime": 4.9642, "eval_samples_per_second": 1007.208, "eval_steps_per_second": 15.914, "step": 185550 }, { "epoch": 25.00673672864457, "grad_norm": 0.16515909135341644, "learning_rate": 0.0004860923979759873, "loss": 0.5563, "step": 185600 }, { "epoch": 25.02021018593371, "grad_norm": 0.1772710233926773, "learning_rate": 0.0004860549717057397, "loss": 0.5559, "step": 185700 }, { "epoch": 25.03368364322285, "grad_norm": 0.17029035091400146, "learning_rate": 0.00048601754543549207, "loss": 0.5556, "step": 185800 }, { "epoch": 25.047157100511992, "grad_norm": 0.17139209806919098, "learning_rate": 0.00048598011916524446, "loss": 0.5571, "step": 185900 }, { "epoch": 25.060630557801133, "grad_norm": 0.1827639490365982, "learning_rate": 0.00048594269289499686, "loss": 0.5565, "step": 186000 }, { "epoch": 25.07410401509027, "grad_norm": 0.168023020029068, "learning_rate": 0.00048590526662474926, "loss": 0.5566, "step": 186100 }, { "epoch": 25.08757747237941, "grad_norm": 0.1644616276025772, "learning_rate": 0.00048586784035450166, "loss": 0.5568, "step": 186200 }, { "epoch": 25.101050929668553, "grad_norm": 0.1688631772994995, "learning_rate": 0.00048583041408425406, "loss": 0.5567, "step": 186300 }, { "epoch": 25.114524386957694, "grad_norm": 0.16753003001213074, "learning_rate": 0.00048579298781400646, "loss": 0.556, "step": 186400 }, { "epoch": 25.127997844246835, "grad_norm": 0.17840027809143066, "learning_rate": 0.00048575556154375885, "loss": 0.5574, "step": 186500 }, { "epoch": 25.141471301535976, "grad_norm": 0.19566774368286133, "learning_rate": 0.0004857181352735112, "loss": 0.5577, "step": 186600 }, { "epoch": 25.154944758825113, "grad_norm": 0.17897406220436096, "learning_rate": 0.00048568070900326354, "loss": 0.5571, "step": 186700 }, { "epoch": 25.168418216114254, "grad_norm": 0.16674786806106567, "learning_rate": 0.00048564328273301594, "loss": 0.5553, "step": 186800 }, { "epoch": 25.181891673403396, "grad_norm": 0.18763217329978943, "learning_rate": 0.00048560585646276834, "loss": 0.5559, "step": 186900 }, { "epoch": 25.195365130692537, "grad_norm": 0.17448917031288147, "learning_rate": 0.00048556843019252074, "loss": 0.5575, "step": 187000 }, { "epoch": 25.208838587981678, "grad_norm": 0.19520021975040436, "learning_rate": 0.00048553100392227313, "loss": 0.5562, "step": 187100 }, { "epoch": 25.222312045270815, "grad_norm": 0.1974489688873291, "learning_rate": 0.00048549357765202553, "loss": 0.5564, "step": 187200 }, { "epoch": 25.235785502559956, "grad_norm": 0.17383214831352234, "learning_rate": 0.0004854561513817779, "loss": 0.5575, "step": 187300 }, { "epoch": 25.249258959849097, "grad_norm": 0.17503872513771057, "learning_rate": 0.0004854187251115303, "loss": 0.5563, "step": 187400 }, { "epoch": 25.26273241713824, "grad_norm": 0.18441222608089447, "learning_rate": 0.0004853812988412827, "loss": 0.5559, "step": 187500 }, { "epoch": 25.27620587442738, "grad_norm": 0.18047015368938446, "learning_rate": 0.00048534387257103507, "loss": 0.5554, "step": 187600 }, { "epoch": 25.289679331716517, "grad_norm": 0.1653052568435669, "learning_rate": 0.00048530644630078747, "loss": 0.557, "step": 187700 }, { "epoch": 25.303152789005658, "grad_norm": 0.16916614770889282, "learning_rate": 0.00048526902003053987, "loss": 0.5573, "step": 187800 }, { "epoch": 25.3166262462948, "grad_norm": 0.19409333169460297, "learning_rate": 0.00048523159376029227, "loss": 0.5563, "step": 187900 }, { "epoch": 25.33009970358394, "grad_norm": 0.1968545764684677, "learning_rate": 0.0004851941674900446, "loss": 0.5562, "step": 188000 }, { "epoch": 25.34357316087308, "grad_norm": 0.1728520691394806, "learning_rate": 0.000485156741219797, "loss": 0.5558, "step": 188100 }, { "epoch": 25.35704661816222, "grad_norm": 0.16767963767051697, "learning_rate": 0.0004851193149495494, "loss": 0.555, "step": 188200 }, { "epoch": 25.37052007545136, "grad_norm": 0.16956518590450287, "learning_rate": 0.0004850818886793018, "loss": 0.5554, "step": 188300 }, { "epoch": 25.3839935327405, "grad_norm": 0.19654439389705658, "learning_rate": 0.0004850444624090542, "loss": 0.5557, "step": 188400 }, { "epoch": 25.397466990029642, "grad_norm": 0.1714678853750229, "learning_rate": 0.00048500703613880655, "loss": 0.556, "step": 188500 }, { "epoch": 25.410940447318783, "grad_norm": 0.1996818333864212, "learning_rate": 0.00048496960986855895, "loss": 0.5566, "step": 188600 }, { "epoch": 25.424413904607924, "grad_norm": 0.16626828908920288, "learning_rate": 0.0004849321835983113, "loss": 0.5556, "step": 188700 }, { "epoch": 25.43788736189706, "grad_norm": 0.17432361841201782, "learning_rate": 0.0004848947573280637, "loss": 0.5572, "step": 188800 }, { "epoch": 25.451360819186203, "grad_norm": 0.17184635996818542, "learning_rate": 0.0004848573310578161, "loss": 0.5562, "step": 188900 }, { "epoch": 25.464834276475344, "grad_norm": 0.17919309437274933, "learning_rate": 0.0004848199047875685, "loss": 0.5556, "step": 189000 }, { "epoch": 25.478307733764485, "grad_norm": 0.1857006549835205, "learning_rate": 0.0004847824785173209, "loss": 0.5566, "step": 189100 }, { "epoch": 25.491781191053626, "grad_norm": 0.16586405038833618, "learning_rate": 0.0004847450522470733, "loss": 0.5567, "step": 189200 }, { "epoch": 25.505254648342763, "grad_norm": 0.17218855023384094, "learning_rate": 0.0004847076259768257, "loss": 0.5557, "step": 189300 }, { "epoch": 25.518728105631904, "grad_norm": 0.17484456300735474, "learning_rate": 0.0004846701997065781, "loss": 0.5554, "step": 189400 }, { "epoch": 25.532201562921045, "grad_norm": 0.17002251744270325, "learning_rate": 0.0004846327734363304, "loss": 0.5567, "step": 189500 }, { "epoch": 25.545675020210187, "grad_norm": 0.1763172298669815, "learning_rate": 0.0004845953471660828, "loss": 0.5565, "step": 189600 }, { "epoch": 25.559148477499328, "grad_norm": 0.1871870458126068, "learning_rate": 0.0004845579208958352, "loss": 0.5558, "step": 189700 }, { "epoch": 25.572621934788465, "grad_norm": 0.19438809156417847, "learning_rate": 0.0004845204946255876, "loss": 0.5556, "step": 189800 }, { "epoch": 25.586095392077606, "grad_norm": 0.17658594250679016, "learning_rate": 0.00048448306835534, "loss": 0.5561, "step": 189900 }, { "epoch": 25.599568849366747, "grad_norm": 0.17504793405532837, "learning_rate": 0.0004844456420850924, "loss": 0.5553, "step": 190000 }, { "epoch": 25.61304230665589, "grad_norm": 0.17590074241161346, "learning_rate": 0.0004844082158148448, "loss": 0.5558, "step": 190100 }, { "epoch": 25.62651576394503, "grad_norm": 0.17594696581363678, "learning_rate": 0.00048437078954459715, "loss": 0.5558, "step": 190200 }, { "epoch": 25.639989221234167, "grad_norm": 0.16775022447109222, "learning_rate": 0.0004843333632743495, "loss": 0.5555, "step": 190300 }, { "epoch": 25.653462678523308, "grad_norm": 0.17402395606040955, "learning_rate": 0.0004842959370041019, "loss": 0.5573, "step": 190400 }, { "epoch": 25.66693613581245, "grad_norm": 0.16879864037036896, "learning_rate": 0.0004842585107338543, "loss": 0.5561, "step": 190500 }, { "epoch": 25.68040959310159, "grad_norm": 0.17998504638671875, "learning_rate": 0.0004842210844636067, "loss": 0.5557, "step": 190600 }, { "epoch": 25.69388305039073, "grad_norm": 0.19109630584716797, "learning_rate": 0.0004841836581933591, "loss": 0.5549, "step": 190700 }, { "epoch": 25.707356507679872, "grad_norm": 0.17825141549110413, "learning_rate": 0.0004841462319231115, "loss": 0.5548, "step": 190800 }, { "epoch": 25.72082996496901, "grad_norm": 0.18689629435539246, "learning_rate": 0.00048410880565286383, "loss": 0.556, "step": 190900 }, { "epoch": 25.73430342225815, "grad_norm": 0.18513089418411255, "learning_rate": 0.00048407137938261623, "loss": 0.5558, "step": 191000 }, { "epoch": 25.747776879547292, "grad_norm": 0.16994662582874298, "learning_rate": 0.00048403395311236863, "loss": 0.5564, "step": 191100 }, { "epoch": 25.761250336836433, "grad_norm": 0.16910216212272644, "learning_rate": 0.00048399652684212103, "loss": 0.5551, "step": 191200 }, { "epoch": 25.774723794125574, "grad_norm": 0.17621973156929016, "learning_rate": 0.0004839591005718734, "loss": 0.5561, "step": 191300 }, { "epoch": 25.78819725141471, "grad_norm": 0.1738387644290924, "learning_rate": 0.0004839216743016258, "loss": 0.5556, "step": 191400 }, { "epoch": 25.801670708703853, "grad_norm": 0.1724604219198227, "learning_rate": 0.0004838842480313782, "loss": 0.5557, "step": 191500 }, { "epoch": 25.815144165992994, "grad_norm": 0.16843943297863007, "learning_rate": 0.0004838468217611306, "loss": 0.5562, "step": 191600 }, { "epoch": 25.828617623282135, "grad_norm": 0.19686436653137207, "learning_rate": 0.00048380939549088297, "loss": 0.5552, "step": 191700 }, { "epoch": 25.842091080571276, "grad_norm": 0.1900775134563446, "learning_rate": 0.00048377196922063536, "loss": 0.5567, "step": 191800 }, { "epoch": 25.855564537860413, "grad_norm": 0.16821780800819397, "learning_rate": 0.00048373454295038776, "loss": 0.5557, "step": 191900 }, { "epoch": 25.869037995149554, "grad_norm": 0.1868707239627838, "learning_rate": 0.00048369711668014016, "loss": 0.5548, "step": 192000 }, { "epoch": 25.882511452438695, "grad_norm": 0.17689774930477142, "learning_rate": 0.0004836596904098925, "loss": 0.5561, "step": 192100 }, { "epoch": 25.895984909727837, "grad_norm": 0.16608783602714539, "learning_rate": 0.0004836222641396449, "loss": 0.5565, "step": 192200 }, { "epoch": 25.909458367016978, "grad_norm": 0.17167040705680847, "learning_rate": 0.0004835848378693973, "loss": 0.5555, "step": 192300 }, { "epoch": 25.92293182430612, "grad_norm": 0.17542345821857452, "learning_rate": 0.00048354741159914964, "loss": 0.555, "step": 192400 }, { "epoch": 25.936405281595256, "grad_norm": 0.17327192425727844, "learning_rate": 0.00048350998532890204, "loss": 0.555, "step": 192500 }, { "epoch": 25.949878738884397, "grad_norm": 0.1684211939573288, "learning_rate": 0.00048347255905865444, "loss": 0.5556, "step": 192600 }, { "epoch": 25.96335219617354, "grad_norm": 0.18426665663719177, "learning_rate": 0.00048343513278840684, "loss": 0.5549, "step": 192700 }, { "epoch": 25.97682565346268, "grad_norm": 0.17690254747867584, "learning_rate": 0.00048339770651815924, "loss": 0.555, "step": 192800 }, { "epoch": 25.99029911075182, "grad_norm": 0.16608786582946777, "learning_rate": 0.00048336028024791164, "loss": 0.5549, "step": 192900 }, { "epoch": 26.0, "eval_loss": 0.5428470969200134, "eval_runtime": 4.9287, "eval_samples_per_second": 1014.461, "eval_steps_per_second": 16.028, "step": 192972 }, { "epoch": 26.003772568040958, "grad_norm": 0.169581800699234, "learning_rate": 0.00048332285397766403, "loss": 0.5555, "step": 193000 }, { "epoch": 26.0172460253301, "grad_norm": 0.16636668145656586, "learning_rate": 0.0004832854277074164, "loss": 0.5558, "step": 193100 }, { "epoch": 26.03071948261924, "grad_norm": 0.19539564847946167, "learning_rate": 0.0004832480014371688, "loss": 0.5534, "step": 193200 }, { "epoch": 26.04419293990838, "grad_norm": 0.17119282484054565, "learning_rate": 0.0004832105751669212, "loss": 0.5548, "step": 193300 }, { "epoch": 26.057666397197522, "grad_norm": 0.16502819955348969, "learning_rate": 0.00048317314889667357, "loss": 0.5536, "step": 193400 }, { "epoch": 26.07113985448666, "grad_norm": 0.1679784059524536, "learning_rate": 0.00048313572262642597, "loss": 0.5552, "step": 193500 }, { "epoch": 26.0846133117758, "grad_norm": 0.16771340370178223, "learning_rate": 0.00048309829635617837, "loss": 0.5547, "step": 193600 }, { "epoch": 26.098086769064942, "grad_norm": 0.19309301674365997, "learning_rate": 0.00048306087008593077, "loss": 0.5534, "step": 193700 }, { "epoch": 26.111560226354083, "grad_norm": 0.18809586763381958, "learning_rate": 0.00048302344381568317, "loss": 0.5535, "step": 193800 }, { "epoch": 26.125033683643224, "grad_norm": 0.17662109434604645, "learning_rate": 0.0004829860175454355, "loss": 0.5544, "step": 193900 }, { "epoch": 26.13850714093236, "grad_norm": 0.16999341547489166, "learning_rate": 0.00048294859127518785, "loss": 0.5539, "step": 194000 }, { "epoch": 26.151980598221503, "grad_norm": 0.16591422259807587, "learning_rate": 0.00048291116500494025, "loss": 0.5558, "step": 194100 }, { "epoch": 26.165454055510644, "grad_norm": 0.16650815308094025, "learning_rate": 0.00048287373873469265, "loss": 0.5537, "step": 194200 }, { "epoch": 26.178927512799785, "grad_norm": 0.18995587527751923, "learning_rate": 0.00048283631246444505, "loss": 0.5544, "step": 194300 }, { "epoch": 26.192400970088926, "grad_norm": 0.17678196728229523, "learning_rate": 0.00048279888619419745, "loss": 0.5549, "step": 194400 }, { "epoch": 26.205874427378067, "grad_norm": 0.17461295425891876, "learning_rate": 0.00048276145992394984, "loss": 0.5548, "step": 194500 }, { "epoch": 26.219347884667204, "grad_norm": 0.1864520013332367, "learning_rate": 0.0004827240336537022, "loss": 0.555, "step": 194600 }, { "epoch": 26.232821341956345, "grad_norm": 0.16985665261745453, "learning_rate": 0.0004826866073834546, "loss": 0.555, "step": 194700 }, { "epoch": 26.246294799245486, "grad_norm": 0.16438685357570648, "learning_rate": 0.000482649181113207, "loss": 0.5556, "step": 194800 }, { "epoch": 26.259768256534628, "grad_norm": 0.16159343719482422, "learning_rate": 0.0004826117548429594, "loss": 0.5549, "step": 194900 }, { "epoch": 26.27324171382377, "grad_norm": 0.16726051270961761, "learning_rate": 0.0004825743285727118, "loss": 0.5535, "step": 195000 }, { "epoch": 26.286715171112906, "grad_norm": 0.17885228991508484, "learning_rate": 0.0004825369023024642, "loss": 0.5549, "step": 195100 }, { "epoch": 26.300188628402047, "grad_norm": 0.1643173098564148, "learning_rate": 0.0004824994760322166, "loss": 0.5556, "step": 195200 }, { "epoch": 26.31366208569119, "grad_norm": 0.18751896917819977, "learning_rate": 0.0004824620497619689, "loss": 0.555, "step": 195300 }, { "epoch": 26.32713554298033, "grad_norm": 0.1645839959383011, "learning_rate": 0.0004824246234917213, "loss": 0.5543, "step": 195400 }, { "epoch": 26.34060900026947, "grad_norm": 0.16698573529720306, "learning_rate": 0.0004823871972214737, "loss": 0.5552, "step": 195500 }, { "epoch": 26.354082457558608, "grad_norm": 0.1576748490333557, "learning_rate": 0.0004823497709512261, "loss": 0.5549, "step": 195600 }, { "epoch": 26.36755591484775, "grad_norm": 0.16641351580619812, "learning_rate": 0.0004823123446809785, "loss": 0.5542, "step": 195700 }, { "epoch": 26.38102937213689, "grad_norm": 0.17299532890319824, "learning_rate": 0.00048227491841073086, "loss": 0.5548, "step": 195800 }, { "epoch": 26.39450282942603, "grad_norm": 0.183042511343956, "learning_rate": 0.00048223749214048326, "loss": 0.555, "step": 195900 }, { "epoch": 26.407976286715172, "grad_norm": 0.17843635380268097, "learning_rate": 0.0004822000658702356, "loss": 0.5552, "step": 196000 }, { "epoch": 26.421449744004313, "grad_norm": 0.17958395183086395, "learning_rate": 0.000482162639599988, "loss": 0.5554, "step": 196100 }, { "epoch": 26.43492320129345, "grad_norm": 0.21439175307750702, "learning_rate": 0.0004821252133297404, "loss": 0.5552, "step": 196200 }, { "epoch": 26.448396658582592, "grad_norm": 0.18584474921226501, "learning_rate": 0.0004820877870594928, "loss": 0.5548, "step": 196300 }, { "epoch": 26.461870115871733, "grad_norm": 0.16947327554225922, "learning_rate": 0.0004820503607892452, "loss": 0.5551, "step": 196400 }, { "epoch": 26.475343573160874, "grad_norm": 0.16985855996608734, "learning_rate": 0.0004820129345189976, "loss": 0.5541, "step": 196500 }, { "epoch": 26.488817030450015, "grad_norm": 0.16476982831954956, "learning_rate": 0.00048197550824875, "loss": 0.5545, "step": 196600 }, { "epoch": 26.502290487739153, "grad_norm": 0.18127785623073578, "learning_rate": 0.0004819380819785024, "loss": 0.5537, "step": 196700 }, { "epoch": 26.515763945028294, "grad_norm": 0.16864602267742157, "learning_rate": 0.00048190065570825473, "loss": 0.5537, "step": 196800 }, { "epoch": 26.529237402317435, "grad_norm": 0.16202248632907867, "learning_rate": 0.00048186322943800713, "loss": 0.5546, "step": 196900 }, { "epoch": 26.542710859606576, "grad_norm": 0.16158807277679443, "learning_rate": 0.00048182580316775953, "loss": 0.5546, "step": 197000 }, { "epoch": 26.556184316895717, "grad_norm": 0.20401926338672638, "learning_rate": 0.00048178837689751193, "loss": 0.5544, "step": 197100 }, { "epoch": 26.569657774184854, "grad_norm": 0.17504920065402985, "learning_rate": 0.0004817509506272643, "loss": 0.5557, "step": 197200 }, { "epoch": 26.583131231473995, "grad_norm": 0.175637349486351, "learning_rate": 0.0004817135243570167, "loss": 0.554, "step": 197300 }, { "epoch": 26.596604688763136, "grad_norm": 0.18107105791568756, "learning_rate": 0.0004816760980867691, "loss": 0.5544, "step": 197400 }, { "epoch": 26.610078146052278, "grad_norm": 0.16737321019172668, "learning_rate": 0.00048163867181652147, "loss": 0.5541, "step": 197500 }, { "epoch": 26.62355160334142, "grad_norm": 0.1712648868560791, "learning_rate": 0.0004816012455462738, "loss": 0.5541, "step": 197600 }, { "epoch": 26.637025060630556, "grad_norm": 0.21402546763420105, "learning_rate": 0.0004815638192760262, "loss": 0.5524, "step": 197700 }, { "epoch": 26.650498517919697, "grad_norm": 0.1805381029844284, "learning_rate": 0.0004815263930057786, "loss": 0.5548, "step": 197800 }, { "epoch": 26.663971975208838, "grad_norm": 0.16797001659870148, "learning_rate": 0.000481488966735531, "loss": 0.5547, "step": 197900 }, { "epoch": 26.67744543249798, "grad_norm": 0.19740000367164612, "learning_rate": 0.0004814515404652834, "loss": 0.5525, "step": 198000 }, { "epoch": 26.69091888978712, "grad_norm": 0.1707502007484436, "learning_rate": 0.0004814141141950358, "loss": 0.5545, "step": 198100 }, { "epoch": 26.70439234707626, "grad_norm": 0.1627354621887207, "learning_rate": 0.00048137668792478815, "loss": 0.5539, "step": 198200 }, { "epoch": 26.7178658043654, "grad_norm": 0.1740783303976059, "learning_rate": 0.00048133926165454054, "loss": 0.5533, "step": 198300 }, { "epoch": 26.73133926165454, "grad_norm": 0.16894122958183289, "learning_rate": 0.00048130183538429294, "loss": 0.5538, "step": 198400 }, { "epoch": 26.74481271894368, "grad_norm": 0.1879124641418457, "learning_rate": 0.00048126440911404534, "loss": 0.5556, "step": 198500 }, { "epoch": 26.758286176232822, "grad_norm": 0.16786183416843414, "learning_rate": 0.00048122698284379774, "loss": 0.554, "step": 198600 }, { "epoch": 26.771759633521963, "grad_norm": 0.1666710078716278, "learning_rate": 0.00048118955657355014, "loss": 0.5545, "step": 198700 }, { "epoch": 26.7852330908111, "grad_norm": 0.1605307161808014, "learning_rate": 0.00048115213030330253, "loss": 0.5545, "step": 198800 }, { "epoch": 26.79870654810024, "grad_norm": 0.17355607450008392, "learning_rate": 0.00048111470403305493, "loss": 0.5546, "step": 198900 }, { "epoch": 26.812180005389383, "grad_norm": 0.18175172805786133, "learning_rate": 0.0004810772777628073, "loss": 0.5554, "step": 199000 }, { "epoch": 26.825653462678524, "grad_norm": 0.16241642832756042, "learning_rate": 0.0004810398514925597, "loss": 0.5545, "step": 199100 }, { "epoch": 26.839126919967665, "grad_norm": 0.22217534482479095, "learning_rate": 0.0004810024252223121, "loss": 0.5529, "step": 199200 }, { "epoch": 26.852600377256802, "grad_norm": 0.1770889014005661, "learning_rate": 0.00048096499895206447, "loss": 0.5542, "step": 199300 }, { "epoch": 26.866073834545944, "grad_norm": 0.1703619509935379, "learning_rate": 0.0004809275726818168, "loss": 0.5546, "step": 199400 }, { "epoch": 26.879547291835085, "grad_norm": 0.1682852953672409, "learning_rate": 0.0004808901464115692, "loss": 0.5545, "step": 199500 }, { "epoch": 26.893020749124226, "grad_norm": 0.1797427386045456, "learning_rate": 0.0004808527201413216, "loss": 0.5546, "step": 199600 }, { "epoch": 26.906494206413367, "grad_norm": 0.2176501452922821, "learning_rate": 0.00048081529387107396, "loss": 0.5541, "step": 199700 }, { "epoch": 26.919967663702508, "grad_norm": 0.16063259541988373, "learning_rate": 0.00048077786760082635, "loss": 0.5541, "step": 199800 }, { "epoch": 26.933441120991645, "grad_norm": 0.16697664558887482, "learning_rate": 0.00048074044133057875, "loss": 0.5546, "step": 199900 }, { "epoch": 26.946914578280786, "grad_norm": 0.17556032538414001, "learning_rate": 0.00048070301506033115, "loss": 0.5542, "step": 200000 }, { "epoch": 26.960388035569927, "grad_norm": 0.17879202961921692, "learning_rate": 0.00048066558879008355, "loss": 0.5543, "step": 200100 }, { "epoch": 26.97386149285907, "grad_norm": 0.16107648611068726, "learning_rate": 0.00048062816251983595, "loss": 0.5542, "step": 200200 }, { "epoch": 26.98733495014821, "grad_norm": 0.16122806072235107, "learning_rate": 0.00048059073624958835, "loss": 0.5541, "step": 200300 }, { "epoch": 27.0, "eval_loss": 0.5411233901977539, "eval_runtime": 4.9226, "eval_samples_per_second": 1015.727, "eval_steps_per_second": 16.048, "step": 200394 }, { "epoch": 27.000808407437347, "grad_norm": 0.1810934841632843, "learning_rate": 0.0004805533099793407, "loss": 0.5545, "step": 200400 }, { "epoch": 27.014281864726488, "grad_norm": 0.16094711422920227, "learning_rate": 0.0004805158837090931, "loss": 0.5522, "step": 200500 }, { "epoch": 27.02775532201563, "grad_norm": 0.171471506357193, "learning_rate": 0.0004804784574388455, "loss": 0.5526, "step": 200600 }, { "epoch": 27.04122877930477, "grad_norm": 0.1732337474822998, "learning_rate": 0.0004804410311685979, "loss": 0.5533, "step": 200700 }, { "epoch": 27.05470223659391, "grad_norm": 0.18065223097801208, "learning_rate": 0.0004804036048983503, "loss": 0.5549, "step": 200800 }, { "epoch": 27.06817569388305, "grad_norm": 0.18907326459884644, "learning_rate": 0.0004803661786281027, "loss": 0.5524, "step": 200900 }, { "epoch": 27.08164915117219, "grad_norm": 0.16971969604492188, "learning_rate": 0.0004803287523578551, "loss": 0.5541, "step": 201000 }, { "epoch": 27.09512260846133, "grad_norm": 0.1660754531621933, "learning_rate": 0.0004802913260876074, "loss": 0.5524, "step": 201100 }, { "epoch": 27.108596065750472, "grad_norm": 0.17365925014019012, "learning_rate": 0.00048025389981735977, "loss": 0.553, "step": 201200 }, { "epoch": 27.122069523039613, "grad_norm": 0.1585741937160492, "learning_rate": 0.00048021647354711217, "loss": 0.5538, "step": 201300 }, { "epoch": 27.13554298032875, "grad_norm": 0.16388624906539917, "learning_rate": 0.00048017904727686456, "loss": 0.5534, "step": 201400 }, { "epoch": 27.14901643761789, "grad_norm": 0.17074500024318695, "learning_rate": 0.00048014162100661696, "loss": 0.5535, "step": 201500 }, { "epoch": 27.162489894907033, "grad_norm": 0.17675772309303284, "learning_rate": 0.00048010419473636936, "loss": 0.5531, "step": 201600 }, { "epoch": 27.175963352196174, "grad_norm": 0.159743994474411, "learning_rate": 0.00048006676846612176, "loss": 0.5542, "step": 201700 }, { "epoch": 27.189436809485315, "grad_norm": 0.178092822432518, "learning_rate": 0.00048002934219587416, "loss": 0.5535, "step": 201800 }, { "epoch": 27.202910266774456, "grad_norm": 0.1700279265642166, "learning_rate": 0.0004799919159256265, "loss": 0.5537, "step": 201900 }, { "epoch": 27.216383724063594, "grad_norm": 0.16312062740325928, "learning_rate": 0.0004799544896553789, "loss": 0.5536, "step": 202000 }, { "epoch": 27.229857181352735, "grad_norm": 0.16056568920612335, "learning_rate": 0.0004799170633851313, "loss": 0.5538, "step": 202100 }, { "epoch": 27.243330638641876, "grad_norm": 0.17838439345359802, "learning_rate": 0.0004798796371148837, "loss": 0.5531, "step": 202200 }, { "epoch": 27.256804095931017, "grad_norm": 0.15841805934906006, "learning_rate": 0.0004798422108446361, "loss": 0.553, "step": 202300 }, { "epoch": 27.270277553220158, "grad_norm": 0.1616373211145401, "learning_rate": 0.0004798047845743885, "loss": 0.5531, "step": 202400 }, { "epoch": 27.283751010509295, "grad_norm": 0.17436742782592773, "learning_rate": 0.0004797673583041409, "loss": 0.5534, "step": 202500 }, { "epoch": 27.297224467798436, "grad_norm": 0.16474924981594086, "learning_rate": 0.00047972993203389323, "loss": 0.5527, "step": 202600 }, { "epoch": 27.310697925087577, "grad_norm": 0.16113273799419403, "learning_rate": 0.00047969250576364563, "loss": 0.5527, "step": 202700 }, { "epoch": 27.32417138237672, "grad_norm": 0.1576123982667923, "learning_rate": 0.00047965507949339803, "loss": 0.5534, "step": 202800 }, { "epoch": 27.33764483966586, "grad_norm": 0.16274847090244293, "learning_rate": 0.00047961765322315043, "loss": 0.5528, "step": 202900 }, { "epoch": 27.351118296954997, "grad_norm": 0.18002042174339294, "learning_rate": 0.00047958022695290277, "loss": 0.5528, "step": 203000 }, { "epoch": 27.364591754244138, "grad_norm": 0.16836759448051453, "learning_rate": 0.00047954280068265517, "loss": 0.552, "step": 203100 }, { "epoch": 27.37806521153328, "grad_norm": 0.16938069462776184, "learning_rate": 0.00047950537441240757, "loss": 0.5538, "step": 203200 }, { "epoch": 27.39153866882242, "grad_norm": 0.18390922248363495, "learning_rate": 0.0004794679481421599, "loss": 0.5536, "step": 203300 }, { "epoch": 27.40501212611156, "grad_norm": 0.17742198705673218, "learning_rate": 0.0004794305218719123, "loss": 0.5535, "step": 203400 }, { "epoch": 27.418485583400702, "grad_norm": 0.17645947635173798, "learning_rate": 0.0004793930956016647, "loss": 0.5534, "step": 203500 }, { "epoch": 27.43195904068984, "grad_norm": 0.16929228603839874, "learning_rate": 0.0004793556693314171, "loss": 0.5528, "step": 203600 }, { "epoch": 27.44543249797898, "grad_norm": 0.17383050918579102, "learning_rate": 0.0004793182430611695, "loss": 0.5531, "step": 203700 }, { "epoch": 27.458905955268122, "grad_norm": 0.17566007375717163, "learning_rate": 0.0004792808167909219, "loss": 0.553, "step": 203800 }, { "epoch": 27.472379412557263, "grad_norm": 0.16999857127666473, "learning_rate": 0.0004792433905206743, "loss": 0.5518, "step": 203900 }, { "epoch": 27.485852869846404, "grad_norm": 0.17492017149925232, "learning_rate": 0.00047920596425042665, "loss": 0.5542, "step": 204000 }, { "epoch": 27.49932632713554, "grad_norm": 0.17160844802856445, "learning_rate": 0.00047916853798017904, "loss": 0.5533, "step": 204100 }, { "epoch": 27.512799784424683, "grad_norm": 0.1838904619216919, "learning_rate": 0.00047913111170993144, "loss": 0.5534, "step": 204200 }, { "epoch": 27.526273241713824, "grad_norm": 0.18080061674118042, "learning_rate": 0.00047909368543968384, "loss": 0.5527, "step": 204300 }, { "epoch": 27.539746699002965, "grad_norm": 0.17060256004333496, "learning_rate": 0.00047905625916943624, "loss": 0.5539, "step": 204400 }, { "epoch": 27.553220156292106, "grad_norm": 0.18229106068611145, "learning_rate": 0.00047901883289918864, "loss": 0.5533, "step": 204500 }, { "epoch": 27.566693613581243, "grad_norm": 0.1696474403142929, "learning_rate": 0.00047898140662894104, "loss": 0.5536, "step": 204600 }, { "epoch": 27.580167070870385, "grad_norm": 0.16129809617996216, "learning_rate": 0.00047894398035869343, "loss": 0.5529, "step": 204700 }, { "epoch": 27.593640528159526, "grad_norm": 0.18206185102462769, "learning_rate": 0.0004789065540884457, "loss": 0.5538, "step": 204800 }, { "epoch": 27.607113985448667, "grad_norm": 0.18331199884414673, "learning_rate": 0.0004788691278181981, "loss": 0.5534, "step": 204900 }, { "epoch": 27.620587442737808, "grad_norm": 0.1738140732049942, "learning_rate": 0.0004788317015479505, "loss": 0.5529, "step": 205000 }, { "epoch": 27.634060900026945, "grad_norm": 0.18666395545005798, "learning_rate": 0.0004787942752777029, "loss": 0.5521, "step": 205100 }, { "epoch": 27.647534357316086, "grad_norm": 0.1867625117301941, "learning_rate": 0.0004787568490074553, "loss": 0.5525, "step": 205200 }, { "epoch": 27.661007814605227, "grad_norm": 0.1967378705739975, "learning_rate": 0.0004787194227372077, "loss": 0.5526, "step": 205300 }, { "epoch": 27.67448127189437, "grad_norm": 0.18751944601535797, "learning_rate": 0.0004786819964669601, "loss": 0.5532, "step": 205400 }, { "epoch": 27.68795472918351, "grad_norm": 0.16310927271842957, "learning_rate": 0.00047864457019671246, "loss": 0.5523, "step": 205500 }, { "epoch": 27.70142818647265, "grad_norm": 0.17304202914237976, "learning_rate": 0.00047860714392646486, "loss": 0.5532, "step": 205600 }, { "epoch": 27.714901643761788, "grad_norm": 0.1646677553653717, "learning_rate": 0.00047856971765621725, "loss": 0.5536, "step": 205700 }, { "epoch": 27.72837510105093, "grad_norm": 0.17544162273406982, "learning_rate": 0.00047853229138596965, "loss": 0.5537, "step": 205800 }, { "epoch": 27.74184855834007, "grad_norm": 0.16086432337760925, "learning_rate": 0.00047849486511572205, "loss": 0.5514, "step": 205900 }, { "epoch": 27.75532201562921, "grad_norm": 0.1619129627943039, "learning_rate": 0.00047845743884547445, "loss": 0.5522, "step": 206000 }, { "epoch": 27.768795472918352, "grad_norm": 0.17424118518829346, "learning_rate": 0.00047842001257522685, "loss": 0.5532, "step": 206100 }, { "epoch": 27.78226893020749, "grad_norm": 0.18489615619182587, "learning_rate": 0.0004783825863049792, "loss": 0.5536, "step": 206200 }, { "epoch": 27.79574238749663, "grad_norm": 0.1803482621908188, "learning_rate": 0.0004783451600347316, "loss": 0.5528, "step": 206300 }, { "epoch": 27.809215844785772, "grad_norm": 0.18016378581523895, "learning_rate": 0.000478307733764484, "loss": 0.552, "step": 206400 }, { "epoch": 27.822689302074913, "grad_norm": 0.20381563901901245, "learning_rate": 0.0004782703074942364, "loss": 0.5523, "step": 206500 }, { "epoch": 27.836162759364054, "grad_norm": 0.21210765838623047, "learning_rate": 0.00047823288122398873, "loss": 0.553, "step": 206600 }, { "epoch": 27.84963621665319, "grad_norm": 0.15436844527721405, "learning_rate": 0.00047819545495374113, "loss": 0.5533, "step": 206700 }, { "epoch": 27.863109673942333, "grad_norm": 0.19507835805416107, "learning_rate": 0.0004781580286834935, "loss": 0.5518, "step": 206800 }, { "epoch": 27.876583131231474, "grad_norm": 0.16579952836036682, "learning_rate": 0.0004781206024132459, "loss": 0.5522, "step": 206900 }, { "epoch": 27.890056588520615, "grad_norm": 0.1635895073413849, "learning_rate": 0.00047808317614299827, "loss": 0.553, "step": 207000 }, { "epoch": 27.903530045809756, "grad_norm": 0.17085525393486023, "learning_rate": 0.00047804574987275067, "loss": 0.5517, "step": 207100 }, { "epoch": 27.917003503098897, "grad_norm": 0.16878004372119904, "learning_rate": 0.00047800832360250306, "loss": 0.553, "step": 207200 }, { "epoch": 27.930476960388035, "grad_norm": 0.1641501486301422, "learning_rate": 0.00047797089733225546, "loss": 0.5524, "step": 207300 }, { "epoch": 27.943950417677176, "grad_norm": 0.1686561554670334, "learning_rate": 0.00047793347106200786, "loss": 0.552, "step": 207400 }, { "epoch": 27.957423874966317, "grad_norm": 0.16937029361724854, "learning_rate": 0.00047789604479176026, "loss": 0.5537, "step": 207500 }, { "epoch": 27.970897332255458, "grad_norm": 0.1728837937116623, "learning_rate": 0.00047785861852151266, "loss": 0.554, "step": 207600 }, { "epoch": 27.9843707895446, "grad_norm": 0.18002097308635712, "learning_rate": 0.000477821192251265, "loss": 0.5518, "step": 207700 }, { "epoch": 27.997844246833736, "grad_norm": 0.18254461884498596, "learning_rate": 0.0004777837659810174, "loss": 0.5517, "step": 207800 }, { "epoch": 28.0, "eval_loss": 0.5407589077949524, "eval_runtime": 4.9869, "eval_samples_per_second": 1002.624, "eval_steps_per_second": 15.841, "step": 207816 }, { "epoch": 28.011317704122877, "grad_norm": 0.1825226992368698, "learning_rate": 0.0004777463397107698, "loss": 0.5527, "step": 207900 }, { "epoch": 28.02479116141202, "grad_norm": 0.1635829210281372, "learning_rate": 0.0004777089134405222, "loss": 0.5512, "step": 208000 }, { "epoch": 28.03826461870116, "grad_norm": 0.1915132999420166, "learning_rate": 0.0004776714871702746, "loss": 0.5523, "step": 208100 }, { "epoch": 28.0517380759903, "grad_norm": 0.16753949224948883, "learning_rate": 0.000477634060900027, "loss": 0.5515, "step": 208200 }, { "epoch": 28.065211533279438, "grad_norm": 0.16844476759433746, "learning_rate": 0.0004775966346297794, "loss": 0.5515, "step": 208300 }, { "epoch": 28.07868499056858, "grad_norm": 0.17073242366313934, "learning_rate": 0.0004775592083595317, "loss": 0.551, "step": 208400 }, { "epoch": 28.09215844785772, "grad_norm": 0.16804617643356323, "learning_rate": 0.0004775217820892841, "loss": 0.5523, "step": 208500 }, { "epoch": 28.10563190514686, "grad_norm": 0.16718262434005737, "learning_rate": 0.0004774843558190365, "loss": 0.5516, "step": 208600 }, { "epoch": 28.119105362436002, "grad_norm": 0.1659059375524521, "learning_rate": 0.0004774469295487889, "loss": 0.5507, "step": 208700 }, { "epoch": 28.13257881972514, "grad_norm": 0.15836142003536224, "learning_rate": 0.0004774095032785413, "loss": 0.5525, "step": 208800 }, { "epoch": 28.14605227701428, "grad_norm": 0.17845451831817627, "learning_rate": 0.00047737207700829367, "loss": 0.5518, "step": 208900 }, { "epoch": 28.159525734303422, "grad_norm": 0.16635914146900177, "learning_rate": 0.00047733465073804607, "loss": 0.5509, "step": 209000 }, { "epoch": 28.172999191592563, "grad_norm": 0.16745851933956146, "learning_rate": 0.0004772972244677984, "loss": 0.5531, "step": 209100 }, { "epoch": 28.186472648881704, "grad_norm": 0.15864455699920654, "learning_rate": 0.0004772597981975508, "loss": 0.5515, "step": 209200 }, { "epoch": 28.199946106170845, "grad_norm": 0.18058133125305176, "learning_rate": 0.0004772223719273032, "loss": 0.5517, "step": 209300 }, { "epoch": 28.213419563459983, "grad_norm": 0.15627892315387726, "learning_rate": 0.0004771849456570556, "loss": 0.5525, "step": 209400 }, { "epoch": 28.226893020749124, "grad_norm": 0.16540834307670593, "learning_rate": 0.000477147519386808, "loss": 0.5514, "step": 209500 }, { "epoch": 28.240366478038265, "grad_norm": 0.19688042998313904, "learning_rate": 0.0004771100931165604, "loss": 0.552, "step": 209600 }, { "epoch": 28.253839935327406, "grad_norm": 0.16836822032928467, "learning_rate": 0.0004770726668463128, "loss": 0.5523, "step": 209700 }, { "epoch": 28.267313392616547, "grad_norm": 0.16620029509067535, "learning_rate": 0.0004770352405760652, "loss": 0.551, "step": 209800 }, { "epoch": 28.280786849905684, "grad_norm": 0.17175555229187012, "learning_rate": 0.00047699781430581755, "loss": 0.5528, "step": 209900 }, { "epoch": 28.294260307194826, "grad_norm": 0.15188170969486237, "learning_rate": 0.00047696038803556994, "loss": 0.5519, "step": 210000 }, { "epoch": 28.307733764483967, "grad_norm": 0.1736784279346466, "learning_rate": 0.00047692296176532234, "loss": 0.5518, "step": 210100 }, { "epoch": 28.321207221773108, "grad_norm": 0.17000342905521393, "learning_rate": 0.00047688553549507474, "loss": 0.5534, "step": 210200 }, { "epoch": 28.33468067906225, "grad_norm": 0.1708294153213501, "learning_rate": 0.0004768481092248271, "loss": 0.5515, "step": 210300 }, { "epoch": 28.348154136351386, "grad_norm": 0.18074417114257812, "learning_rate": 0.0004768106829545795, "loss": 0.5524, "step": 210400 }, { "epoch": 28.361627593640527, "grad_norm": 0.18517650663852692, "learning_rate": 0.0004767732566843319, "loss": 0.5513, "step": 210500 }, { "epoch": 28.37510105092967, "grad_norm": 0.16862566769123077, "learning_rate": 0.0004767358304140842, "loss": 0.5515, "step": 210600 }, { "epoch": 28.38857450821881, "grad_norm": 0.18173977732658386, "learning_rate": 0.0004766984041438366, "loss": 0.5509, "step": 210700 }, { "epoch": 28.40204796550795, "grad_norm": 0.18544824421405792, "learning_rate": 0.000476660977873589, "loss": 0.5516, "step": 210800 }, { "epoch": 28.415521422797088, "grad_norm": 0.1815929263830185, "learning_rate": 0.0004766235516033414, "loss": 0.5531, "step": 210900 }, { "epoch": 28.42899488008623, "grad_norm": 0.1615036278963089, "learning_rate": 0.0004765861253330938, "loss": 0.5526, "step": 211000 }, { "epoch": 28.44246833737537, "grad_norm": 0.18315629661083221, "learning_rate": 0.0004765486990628462, "loss": 0.5523, "step": 211100 }, { "epoch": 28.45594179466451, "grad_norm": 0.16933447122573853, "learning_rate": 0.0004765112727925986, "loss": 0.5523, "step": 211200 }, { "epoch": 28.469415251953652, "grad_norm": 0.1655154973268509, "learning_rate": 0.00047647384652235096, "loss": 0.5512, "step": 211300 }, { "epoch": 28.482888709242793, "grad_norm": 0.1711708903312683, "learning_rate": 0.00047643642025210336, "loss": 0.5514, "step": 211400 }, { "epoch": 28.49636216653193, "grad_norm": 0.1662604808807373, "learning_rate": 0.00047639899398185576, "loss": 0.5515, "step": 211500 }, { "epoch": 28.509835623821072, "grad_norm": 0.1585167497396469, "learning_rate": 0.00047636156771160815, "loss": 0.5518, "step": 211600 }, { "epoch": 28.523309081110213, "grad_norm": 0.16718922555446625, "learning_rate": 0.00047632414144136055, "loss": 0.5506, "step": 211700 }, { "epoch": 28.536782538399354, "grad_norm": 0.15619760751724243, "learning_rate": 0.00047628671517111295, "loss": 0.5504, "step": 211800 }, { "epoch": 28.550255995688495, "grad_norm": 0.19027970731258392, "learning_rate": 0.00047624928890086535, "loss": 0.5524, "step": 211900 }, { "epoch": 28.563729452977633, "grad_norm": 0.1793949455022812, "learning_rate": 0.00047621186263061775, "loss": 0.5516, "step": 212000 }, { "epoch": 28.577202910266774, "grad_norm": 0.16827377676963806, "learning_rate": 0.00047617443636037004, "loss": 0.5525, "step": 212100 }, { "epoch": 28.590676367555915, "grad_norm": 0.1684994250535965, "learning_rate": 0.00047613701009012243, "loss": 0.5519, "step": 212200 }, { "epoch": 28.604149824845056, "grad_norm": 0.16916267573833466, "learning_rate": 0.00047609958381987483, "loss": 0.5521, "step": 212300 }, { "epoch": 28.617623282134197, "grad_norm": 0.15308240056037903, "learning_rate": 0.00047606215754962723, "loss": 0.5529, "step": 212400 }, { "epoch": 28.631096739423334, "grad_norm": 0.17225344479084015, "learning_rate": 0.00047602473127937963, "loss": 0.5517, "step": 212500 }, { "epoch": 28.644570196712476, "grad_norm": 0.176025852560997, "learning_rate": 0.00047598730500913203, "loss": 0.5527, "step": 212600 }, { "epoch": 28.658043654001617, "grad_norm": 0.17321188747882843, "learning_rate": 0.0004759498787388844, "loss": 0.5516, "step": 212700 }, { "epoch": 28.671517111290758, "grad_norm": 0.18300634622573853, "learning_rate": 0.00047591245246863677, "loss": 0.5515, "step": 212800 }, { "epoch": 28.6849905685799, "grad_norm": 0.163783460855484, "learning_rate": 0.00047587502619838917, "loss": 0.551, "step": 212900 }, { "epoch": 28.698464025869036, "grad_norm": 0.1867188811302185, "learning_rate": 0.00047583759992814157, "loss": 0.5519, "step": 213000 }, { "epoch": 28.711937483158177, "grad_norm": 0.17019601166248322, "learning_rate": 0.00047580017365789396, "loss": 0.5517, "step": 213100 }, { "epoch": 28.72541094044732, "grad_norm": 0.18527358770370483, "learning_rate": 0.00047576274738764636, "loss": 0.5517, "step": 213200 }, { "epoch": 28.73888439773646, "grad_norm": 0.17685630917549133, "learning_rate": 0.00047572532111739876, "loss": 0.5517, "step": 213300 }, { "epoch": 28.7523578550256, "grad_norm": 0.1795148402452469, "learning_rate": 0.00047568789484715116, "loss": 0.5517, "step": 213400 }, { "epoch": 28.76583131231474, "grad_norm": 0.173146590590477, "learning_rate": 0.0004756504685769035, "loss": 0.5515, "step": 213500 }, { "epoch": 28.77930476960388, "grad_norm": 0.17372459173202515, "learning_rate": 0.0004756130423066559, "loss": 0.5516, "step": 213600 }, { "epoch": 28.79277822689302, "grad_norm": 0.16768130660057068, "learning_rate": 0.0004755756160364083, "loss": 0.5515, "step": 213700 }, { "epoch": 28.80625168418216, "grad_norm": 0.1824701428413391, "learning_rate": 0.0004755381897661607, "loss": 0.5519, "step": 213800 }, { "epoch": 28.819725141471302, "grad_norm": 0.16179603338241577, "learning_rate": 0.00047550076349591304, "loss": 0.5511, "step": 213900 }, { "epoch": 28.833198598760443, "grad_norm": 0.15407036244869232, "learning_rate": 0.00047546333722566544, "loss": 0.5514, "step": 214000 }, { "epoch": 28.84667205604958, "grad_norm": 0.18005263805389404, "learning_rate": 0.00047542591095541784, "loss": 0.5512, "step": 214100 }, { "epoch": 28.860145513338722, "grad_norm": 0.16784028708934784, "learning_rate": 0.0004753884846851702, "loss": 0.5501, "step": 214200 }, { "epoch": 28.873618970627863, "grad_norm": 0.18165382742881775, "learning_rate": 0.0004753510584149226, "loss": 0.5515, "step": 214300 }, { "epoch": 28.887092427917004, "grad_norm": 0.17098532617092133, "learning_rate": 0.000475313632144675, "loss": 0.5516, "step": 214400 }, { "epoch": 28.900565885206145, "grad_norm": 0.18910369277000427, "learning_rate": 0.0004752762058744274, "loss": 0.5514, "step": 214500 }, { "epoch": 28.914039342495283, "grad_norm": 0.19360175728797913, "learning_rate": 0.0004752387796041798, "loss": 0.5505, "step": 214600 }, { "epoch": 28.927512799784424, "grad_norm": 0.16766591370105743, "learning_rate": 0.0004752013533339322, "loss": 0.5517, "step": 214700 }, { "epoch": 28.940986257073565, "grad_norm": 0.1622832864522934, "learning_rate": 0.00047516392706368457, "loss": 0.551, "step": 214800 }, { "epoch": 28.954459714362706, "grad_norm": 0.17327912151813507, "learning_rate": 0.00047512650079343697, "loss": 0.5517, "step": 214900 }, { "epoch": 28.967933171651847, "grad_norm": 0.18067151308059692, "learning_rate": 0.0004750890745231893, "loss": 0.5516, "step": 215000 }, { "epoch": 28.981406628940988, "grad_norm": 0.16132348775863647, "learning_rate": 0.0004750516482529417, "loss": 0.5516, "step": 215100 }, { "epoch": 28.994880086230125, "grad_norm": 0.16434352099895477, "learning_rate": 0.0004750142219826941, "loss": 0.5509, "step": 215200 }, { "epoch": 29.0, "eval_loss": 0.5393602848052979, "eval_runtime": 4.9577, "eval_samples_per_second": 1008.54, "eval_steps_per_second": 15.935, "step": 215238 }, { "epoch": 29.008353543519267, "grad_norm": 0.16175980865955353, "learning_rate": 0.0004749767957124465, "loss": 0.5501, "step": 215300 }, { "epoch": 29.021827000808408, "grad_norm": 0.1623729020357132, "learning_rate": 0.0004749393694421989, "loss": 0.5492, "step": 215400 }, { "epoch": 29.03530045809755, "grad_norm": 0.1680472046136856, "learning_rate": 0.0004749019431719513, "loss": 0.5506, "step": 215500 }, { "epoch": 29.04877391538669, "grad_norm": 0.1654801368713379, "learning_rate": 0.0004748645169017037, "loss": 0.5506, "step": 215600 }, { "epoch": 29.062247372675827, "grad_norm": 0.15809375047683716, "learning_rate": 0.000474827090631456, "loss": 0.5498, "step": 215700 }, { "epoch": 29.07572082996497, "grad_norm": 0.18189941346645355, "learning_rate": 0.0004747896643612084, "loss": 0.5507, "step": 215800 }, { "epoch": 29.08919428725411, "grad_norm": 0.17083661258220673, "learning_rate": 0.0004747522380909608, "loss": 0.5514, "step": 215900 }, { "epoch": 29.10266774454325, "grad_norm": 0.15544119477272034, "learning_rate": 0.0004747148118207132, "loss": 0.5518, "step": 216000 }, { "epoch": 29.11614120183239, "grad_norm": 0.15943457186222076, "learning_rate": 0.0004746773855504656, "loss": 0.5514, "step": 216100 }, { "epoch": 29.12961465912153, "grad_norm": 0.17103546857833862, "learning_rate": 0.000474639959280218, "loss": 0.5502, "step": 216200 }, { "epoch": 29.14308811641067, "grad_norm": 0.1664302498102188, "learning_rate": 0.0004746025330099704, "loss": 0.5519, "step": 216300 }, { "epoch": 29.15656157369981, "grad_norm": 0.16648606956005096, "learning_rate": 0.0004745651067397227, "loss": 0.5504, "step": 216400 }, { "epoch": 29.170035030988952, "grad_norm": 0.1804518848657608, "learning_rate": 0.0004745276804694751, "loss": 0.5507, "step": 216500 }, { "epoch": 29.183508488278093, "grad_norm": 0.1696319282054901, "learning_rate": 0.0004744902541992275, "loss": 0.5503, "step": 216600 }, { "epoch": 29.19698194556723, "grad_norm": 0.16104631125926971, "learning_rate": 0.0004744528279289799, "loss": 0.55, "step": 216700 }, { "epoch": 29.210455402856372, "grad_norm": 0.17928162217140198, "learning_rate": 0.0004744154016587323, "loss": 0.5509, "step": 216800 }, { "epoch": 29.223928860145513, "grad_norm": 0.16266348958015442, "learning_rate": 0.0004743779753884847, "loss": 0.5504, "step": 216900 }, { "epoch": 29.237402317434654, "grad_norm": 0.16288204491138458, "learning_rate": 0.0004743405491182371, "loss": 0.5511, "step": 217000 }, { "epoch": 29.250875774723795, "grad_norm": 0.1660935878753662, "learning_rate": 0.0004743031228479895, "loss": 0.5516, "step": 217100 }, { "epoch": 29.264349232012936, "grad_norm": 0.16910772025585175, "learning_rate": 0.00047426569657774186, "loss": 0.5507, "step": 217200 }, { "epoch": 29.277822689302074, "grad_norm": 0.18415872752666473, "learning_rate": 0.00047422827030749426, "loss": 0.5505, "step": 217300 }, { "epoch": 29.291296146591215, "grad_norm": 0.1650867760181427, "learning_rate": 0.00047419084403724665, "loss": 0.5502, "step": 217400 }, { "epoch": 29.304769603880356, "grad_norm": 0.1634923368692398, "learning_rate": 0.000474153417766999, "loss": 0.5512, "step": 217500 }, { "epoch": 29.318243061169497, "grad_norm": 0.15636798739433289, "learning_rate": 0.0004741159914967514, "loss": 0.5518, "step": 217600 }, { "epoch": 29.331716518458638, "grad_norm": 0.1665971875190735, "learning_rate": 0.0004740785652265038, "loss": 0.5506, "step": 217700 }, { "epoch": 29.345189975747775, "grad_norm": 0.16817963123321533, "learning_rate": 0.0004740411389562562, "loss": 0.551, "step": 217800 }, { "epoch": 29.358663433036917, "grad_norm": 0.16686372458934784, "learning_rate": 0.00047400371268600854, "loss": 0.5512, "step": 217900 }, { "epoch": 29.372136890326058, "grad_norm": 0.16501329839229584, "learning_rate": 0.00047396628641576094, "loss": 0.5503, "step": 218000 }, { "epoch": 29.3856103476152, "grad_norm": 0.18015408515930176, "learning_rate": 0.00047392886014551333, "loss": 0.5517, "step": 218100 }, { "epoch": 29.39908380490434, "grad_norm": 0.1723400503396988, "learning_rate": 0.00047389143387526573, "loss": 0.5494, "step": 218200 }, { "epoch": 29.412557262193477, "grad_norm": 0.18233159184455872, "learning_rate": 0.00047385400760501813, "loss": 0.5506, "step": 218300 }, { "epoch": 29.42603071948262, "grad_norm": 0.1736634075641632, "learning_rate": 0.00047381658133477053, "loss": 0.5505, "step": 218400 }, { "epoch": 29.43950417677176, "grad_norm": 0.16602951288223267, "learning_rate": 0.0004737791550645229, "loss": 0.551, "step": 218500 }, { "epoch": 29.4529776340609, "grad_norm": 0.18774311244487762, "learning_rate": 0.00047374172879427527, "loss": 0.5506, "step": 218600 }, { "epoch": 29.46645109135004, "grad_norm": 0.153908371925354, "learning_rate": 0.00047370430252402767, "loss": 0.551, "step": 218700 }, { "epoch": 29.479924548639183, "grad_norm": 0.17735613882541656, "learning_rate": 0.00047366687625378007, "loss": 0.5507, "step": 218800 }, { "epoch": 29.49339800592832, "grad_norm": 0.16342556476593018, "learning_rate": 0.00047362944998353247, "loss": 0.5508, "step": 218900 }, { "epoch": 29.50687146321746, "grad_norm": 0.171171635389328, "learning_rate": 0.00047359202371328486, "loss": 0.5508, "step": 219000 }, { "epoch": 29.520344920506602, "grad_norm": 0.15430380403995514, "learning_rate": 0.00047355459744303726, "loss": 0.5508, "step": 219100 }, { "epoch": 29.533818377795743, "grad_norm": 0.15906012058258057, "learning_rate": 0.00047351717117278966, "loss": 0.5516, "step": 219200 }, { "epoch": 29.547291835084884, "grad_norm": 0.18629109859466553, "learning_rate": 0.00047347974490254195, "loss": 0.5513, "step": 219300 }, { "epoch": 29.560765292374022, "grad_norm": 0.1627497375011444, "learning_rate": 0.00047344231863229435, "loss": 0.5509, "step": 219400 }, { "epoch": 29.574238749663163, "grad_norm": 0.15917737782001495, "learning_rate": 0.00047340489236204675, "loss": 0.5498, "step": 219500 }, { "epoch": 29.587712206952304, "grad_norm": 0.1725243180990219, "learning_rate": 0.00047336746609179914, "loss": 0.5499, "step": 219600 }, { "epoch": 29.601185664241445, "grad_norm": 0.3015861511230469, "learning_rate": 0.00047333003982155154, "loss": 0.5516, "step": 219700 }, { "epoch": 29.614659121530586, "grad_norm": 0.162586972117424, "learning_rate": 0.00047329261355130394, "loss": 0.5502, "step": 219800 }, { "epoch": 29.628132578819724, "grad_norm": 0.18504074215888977, "learning_rate": 0.00047325518728105634, "loss": 0.5504, "step": 219900 }, { "epoch": 29.641606036108865, "grad_norm": 0.17180770635604858, "learning_rate": 0.00047321776101080874, "loss": 0.55, "step": 220000 }, { "epoch": 29.655079493398006, "grad_norm": 0.17610548436641693, "learning_rate": 0.0004731803347405611, "loss": 0.5504, "step": 220100 }, { "epoch": 29.668552950687147, "grad_norm": 0.18704736232757568, "learning_rate": 0.0004731429084703135, "loss": 0.5497, "step": 220200 }, { "epoch": 29.682026407976288, "grad_norm": 0.17791172862052917, "learning_rate": 0.0004731054822000659, "loss": 0.5497, "step": 220300 }, { "epoch": 29.695499865265425, "grad_norm": 0.16314288973808289, "learning_rate": 0.0004730680559298183, "loss": 0.5505, "step": 220400 }, { "epoch": 29.708973322554566, "grad_norm": 0.17255868017673492, "learning_rate": 0.0004730306296595707, "loss": 0.5509, "step": 220500 }, { "epoch": 29.722446779843708, "grad_norm": 0.16257493197917938, "learning_rate": 0.0004729932033893231, "loss": 0.5501, "step": 220600 }, { "epoch": 29.73592023713285, "grad_norm": 0.16143719851970673, "learning_rate": 0.00047295577711907547, "loss": 0.5505, "step": 220700 }, { "epoch": 29.74939369442199, "grad_norm": 0.16721542179584503, "learning_rate": 0.0004729183508488278, "loss": 0.5509, "step": 220800 }, { "epoch": 29.76286715171113, "grad_norm": 0.16147081553936005, "learning_rate": 0.0004728809245785802, "loss": 0.5494, "step": 220900 }, { "epoch": 29.77634060900027, "grad_norm": 0.1667952835559845, "learning_rate": 0.0004728434983083326, "loss": 0.5499, "step": 221000 }, { "epoch": 29.78981406628941, "grad_norm": 0.15633098781108856, "learning_rate": 0.00047280607203808496, "loss": 0.5505, "step": 221100 }, { "epoch": 29.80328752357855, "grad_norm": 0.16815999150276184, "learning_rate": 0.00047276864576783735, "loss": 0.5501, "step": 221200 }, { "epoch": 29.81676098086769, "grad_norm": 0.1630353331565857, "learning_rate": 0.00047273121949758975, "loss": 0.5499, "step": 221300 }, { "epoch": 29.830234438156833, "grad_norm": 0.18476080894470215, "learning_rate": 0.00047269379322734215, "loss": 0.5503, "step": 221400 }, { "epoch": 29.84370789544597, "grad_norm": 0.17871052026748657, "learning_rate": 0.0004726563669570945, "loss": 0.55, "step": 221500 }, { "epoch": 29.85718135273511, "grad_norm": 0.16156533360481262, "learning_rate": 0.0004726189406868469, "loss": 0.5503, "step": 221600 }, { "epoch": 29.870654810024252, "grad_norm": 0.15738844871520996, "learning_rate": 0.0004725815144165993, "loss": 0.5499, "step": 221700 }, { "epoch": 29.884128267313393, "grad_norm": 0.18332011997699738, "learning_rate": 0.0004725440881463517, "loss": 0.5495, "step": 221800 }, { "epoch": 29.897601724602534, "grad_norm": 0.166537344455719, "learning_rate": 0.0004725066618761041, "loss": 0.5507, "step": 221900 }, { "epoch": 29.911075181891672, "grad_norm": 0.16585545241832733, "learning_rate": 0.0004724692356058565, "loss": 0.5501, "step": 222000 }, { "epoch": 29.924548639180813, "grad_norm": 0.17764592170715332, "learning_rate": 0.0004724318093356089, "loss": 0.55, "step": 222100 }, { "epoch": 29.938022096469954, "grad_norm": 0.1762271374464035, "learning_rate": 0.0004723943830653613, "loss": 0.5503, "step": 222200 }, { "epoch": 29.951495553759095, "grad_norm": 0.15029293298721313, "learning_rate": 0.0004723569567951136, "loss": 0.5506, "step": 222300 }, { "epoch": 29.964969011048236, "grad_norm": 0.15900087356567383, "learning_rate": 0.000472319530524866, "loss": 0.5503, "step": 222400 }, { "epoch": 29.978442468337377, "grad_norm": 0.16064637899398804, "learning_rate": 0.0004722821042546184, "loss": 0.5509, "step": 222500 }, { "epoch": 29.991915925626515, "grad_norm": 0.1706090271472931, "learning_rate": 0.0004722446779843708, "loss": 0.5491, "step": 222600 }, { "epoch": 30.0, "eval_loss": 0.5375624895095825, "eval_runtime": 4.9502, "eval_samples_per_second": 1010.065, "eval_steps_per_second": 15.959, "step": 222660 }, { "epoch": 30.005389382915656, "grad_norm": 0.16444061696529388, "learning_rate": 0.0004722072517141232, "loss": 0.5497, "step": 222700 }, { "epoch": 30.018862840204797, "grad_norm": 0.18261690437793732, "learning_rate": 0.0004721698254438756, "loss": 0.5485, "step": 222800 }, { "epoch": 30.032336297493938, "grad_norm": 0.16092993319034576, "learning_rate": 0.00047213239917362796, "loss": 0.5498, "step": 222900 }, { "epoch": 30.04580975478308, "grad_norm": 0.181465283036232, "learning_rate": 0.0004720949729033803, "loss": 0.548, "step": 223000 }, { "epoch": 30.059283212072216, "grad_norm": 0.17495670914649963, "learning_rate": 0.0004720575466331327, "loss": 0.549, "step": 223100 }, { "epoch": 30.072756669361357, "grad_norm": 0.16153579950332642, "learning_rate": 0.0004720201203628851, "loss": 0.5482, "step": 223200 }, { "epoch": 30.0862301266505, "grad_norm": 0.20532163977622986, "learning_rate": 0.0004719826940926375, "loss": 0.5493, "step": 223300 }, { "epoch": 30.09970358393964, "grad_norm": 0.15219807624816895, "learning_rate": 0.0004719452678223899, "loss": 0.5502, "step": 223400 }, { "epoch": 30.11317704122878, "grad_norm": 0.16211473941802979, "learning_rate": 0.0004719078415521423, "loss": 0.5493, "step": 223500 }, { "epoch": 30.126650498517918, "grad_norm": 0.15491171181201935, "learning_rate": 0.0004718704152818947, "loss": 0.5494, "step": 223600 }, { "epoch": 30.14012395580706, "grad_norm": 0.16542358696460724, "learning_rate": 0.00047183298901164704, "loss": 0.5501, "step": 223700 }, { "epoch": 30.1535974130962, "grad_norm": 0.17619509994983673, "learning_rate": 0.00047179556274139944, "loss": 0.5498, "step": 223800 }, { "epoch": 30.16707087038534, "grad_norm": 0.16047513484954834, "learning_rate": 0.00047175813647115184, "loss": 0.5494, "step": 223900 }, { "epoch": 30.180544327674482, "grad_norm": 0.1671617478132248, "learning_rate": 0.00047172071020090423, "loss": 0.5498, "step": 224000 }, { "epoch": 30.19401778496362, "grad_norm": 0.16569577157497406, "learning_rate": 0.00047168328393065663, "loss": 0.5497, "step": 224100 }, { "epoch": 30.20749124225276, "grad_norm": 0.1671258509159088, "learning_rate": 0.00047164585766040903, "loss": 0.5487, "step": 224200 }, { "epoch": 30.220964699541902, "grad_norm": 0.15857456624507904, "learning_rate": 0.00047160843139016143, "loss": 0.5509, "step": 224300 }, { "epoch": 30.234438156831043, "grad_norm": 0.1641431450843811, "learning_rate": 0.00047157100511991377, "loss": 0.55, "step": 224400 }, { "epoch": 30.247911614120184, "grad_norm": 0.17667804658412933, "learning_rate": 0.00047153357884966617, "loss": 0.55, "step": 224500 }, { "epoch": 30.261385071409325, "grad_norm": 0.19437497854232788, "learning_rate": 0.00047149615257941857, "loss": 0.5495, "step": 224600 }, { "epoch": 30.274858528698463, "grad_norm": 0.16525694727897644, "learning_rate": 0.0004714587263091709, "loss": 0.5501, "step": 224700 }, { "epoch": 30.288331985987604, "grad_norm": 0.15795183181762695, "learning_rate": 0.0004714213000389233, "loss": 0.5498, "step": 224800 }, { "epoch": 30.301805443276745, "grad_norm": 0.16996946930885315, "learning_rate": 0.0004713838737686757, "loss": 0.5493, "step": 224900 }, { "epoch": 30.315278900565886, "grad_norm": 0.17609617114067078, "learning_rate": 0.0004713464474984281, "loss": 0.5505, "step": 225000 }, { "epoch": 30.328752357855027, "grad_norm": 0.17066606879234314, "learning_rate": 0.0004713090212281805, "loss": 0.5494, "step": 225100 }, { "epoch": 30.342225815144165, "grad_norm": 0.1907203495502472, "learning_rate": 0.00047127159495793285, "loss": 0.5496, "step": 225200 }, { "epoch": 30.355699272433306, "grad_norm": 0.16222654283046722, "learning_rate": 0.00047123416868768525, "loss": 0.5493, "step": 225300 }, { "epoch": 30.369172729722447, "grad_norm": 0.164020374417305, "learning_rate": 0.00047119674241743765, "loss": 0.5502, "step": 225400 }, { "epoch": 30.382646187011588, "grad_norm": 0.16933782398700714, "learning_rate": 0.00047115931614719004, "loss": 0.5492, "step": 225500 }, { "epoch": 30.39611964430073, "grad_norm": 0.1742478311061859, "learning_rate": 0.00047112188987694244, "loss": 0.5492, "step": 225600 }, { "epoch": 30.409593101589866, "grad_norm": 0.17545448243618011, "learning_rate": 0.00047108446360669484, "loss": 0.5502, "step": 225700 }, { "epoch": 30.423066558879007, "grad_norm": 0.15988750755786896, "learning_rate": 0.00047104703733644724, "loss": 0.5494, "step": 225800 }, { "epoch": 30.43654001616815, "grad_norm": 0.16769777238368988, "learning_rate": 0.0004710096110661996, "loss": 0.5484, "step": 225900 }, { "epoch": 30.45001347345729, "grad_norm": 0.16294007003307343, "learning_rate": 0.000470972184795952, "loss": 0.5492, "step": 226000 }, { "epoch": 30.46348693074643, "grad_norm": 0.16126714646816254, "learning_rate": 0.0004709347585257044, "loss": 0.5499, "step": 226100 }, { "epoch": 30.47696038803557, "grad_norm": 0.15729458630084991, "learning_rate": 0.0004708973322554568, "loss": 0.5499, "step": 226200 }, { "epoch": 30.49043384532471, "grad_norm": 0.17493607103824615, "learning_rate": 0.0004708599059852092, "loss": 0.5498, "step": 226300 }, { "epoch": 30.50390730261385, "grad_norm": 0.16889838874340057, "learning_rate": 0.0004708224797149616, "loss": 0.5496, "step": 226400 }, { "epoch": 30.51738075990299, "grad_norm": 0.16203758120536804, "learning_rate": 0.00047078505344471397, "loss": 0.5491, "step": 226500 }, { "epoch": 30.530854217192132, "grad_norm": 0.1678483635187149, "learning_rate": 0.00047074762717446626, "loss": 0.5493, "step": 226600 }, { "epoch": 30.544327674481274, "grad_norm": 0.18642477691173553, "learning_rate": 0.00047071020090421866, "loss": 0.5483, "step": 226700 }, { "epoch": 30.55780113177041, "grad_norm": 0.16961412131786346, "learning_rate": 0.00047067277463397106, "loss": 0.5505, "step": 226800 }, { "epoch": 30.571274589059552, "grad_norm": 0.17467595636844635, "learning_rate": 0.00047063534836372346, "loss": 0.5483, "step": 226900 }, { "epoch": 30.584748046348693, "grad_norm": 0.15547186136245728, "learning_rate": 0.00047059792209347586, "loss": 0.5495, "step": 227000 }, { "epoch": 30.598221503637834, "grad_norm": 0.17581704258918762, "learning_rate": 0.00047056049582322825, "loss": 0.5489, "step": 227100 }, { "epoch": 30.611694960926975, "grad_norm": 0.17284882068634033, "learning_rate": 0.00047052306955298065, "loss": 0.5504, "step": 227200 }, { "epoch": 30.625168418216113, "grad_norm": 0.18454591929912567, "learning_rate": 0.000470485643282733, "loss": 0.5499, "step": 227300 }, { "epoch": 30.638641875505254, "grad_norm": 0.16541548073291779, "learning_rate": 0.0004704482170124854, "loss": 0.549, "step": 227400 }, { "epoch": 30.652115332794395, "grad_norm": 0.17093463242053986, "learning_rate": 0.0004704107907422378, "loss": 0.5496, "step": 227500 }, { "epoch": 30.665588790083536, "grad_norm": 0.17438755929470062, "learning_rate": 0.0004703733644719902, "loss": 0.5495, "step": 227600 }, { "epoch": 30.679062247372677, "grad_norm": 0.15828566253185272, "learning_rate": 0.0004703359382017426, "loss": 0.5498, "step": 227700 }, { "epoch": 30.692535704661815, "grad_norm": 0.1652129739522934, "learning_rate": 0.000470298511931495, "loss": 0.5499, "step": 227800 }, { "epoch": 30.706009161950956, "grad_norm": 0.16446566581726074, "learning_rate": 0.0004702610856612474, "loss": 0.5494, "step": 227900 }, { "epoch": 30.719482619240097, "grad_norm": 0.2109801173210144, "learning_rate": 0.0004702236593909998, "loss": 0.5491, "step": 228000 }, { "epoch": 30.732956076529238, "grad_norm": 0.17543745040893555, "learning_rate": 0.00047018623312075213, "loss": 0.5508, "step": 228100 }, { "epoch": 30.74642953381838, "grad_norm": 0.168788880109787, "learning_rate": 0.0004701488068505045, "loss": 0.5486, "step": 228200 }, { "epoch": 30.75990299110752, "grad_norm": 0.15469510853290558, "learning_rate": 0.0004701113805802569, "loss": 0.5495, "step": 228300 }, { "epoch": 30.773376448396657, "grad_norm": 0.18011906743049622, "learning_rate": 0.00047007395431000927, "loss": 0.5488, "step": 228400 }, { "epoch": 30.7868499056858, "grad_norm": 0.16626369953155518, "learning_rate": 0.00047003652803976167, "loss": 0.5491, "step": 228500 }, { "epoch": 30.80032336297494, "grad_norm": 0.17928315699100494, "learning_rate": 0.00046999910176951406, "loss": 0.5482, "step": 228600 }, { "epoch": 30.81379682026408, "grad_norm": 0.16480132937431335, "learning_rate": 0.00046996167549926646, "loss": 0.5489, "step": 228700 }, { "epoch": 30.82727027755322, "grad_norm": 0.1579809933900833, "learning_rate": 0.0004699242492290188, "loss": 0.5503, "step": 228800 }, { "epoch": 30.84074373484236, "grad_norm": 0.17569509148597717, "learning_rate": 0.0004698868229587712, "loss": 0.5494, "step": 228900 }, { "epoch": 30.8542171921315, "grad_norm": 0.16323085129261017, "learning_rate": 0.0004698493966885236, "loss": 0.5493, "step": 229000 }, { "epoch": 30.86769064942064, "grad_norm": 0.17210128903388977, "learning_rate": 0.000469811970418276, "loss": 0.549, "step": 229100 }, { "epoch": 30.881164106709782, "grad_norm": 0.17688266932964325, "learning_rate": 0.0004697745441480284, "loss": 0.5492, "step": 229200 }, { "epoch": 30.894637563998923, "grad_norm": 0.174661785364151, "learning_rate": 0.0004697371178777808, "loss": 0.5489, "step": 229300 }, { "epoch": 30.90811102128806, "grad_norm": 0.16810278594493866, "learning_rate": 0.0004696996916075332, "loss": 0.5494, "step": 229400 }, { "epoch": 30.921584478577202, "grad_norm": 0.1657787710428238, "learning_rate": 0.00046966226533728554, "loss": 0.5484, "step": 229500 }, { "epoch": 30.935057935866343, "grad_norm": 0.1619708389043808, "learning_rate": 0.00046962483906703794, "loss": 0.549, "step": 229600 }, { "epoch": 30.948531393155484, "grad_norm": 0.15108296275138855, "learning_rate": 0.00046958741279679034, "loss": 0.5485, "step": 229700 }, { "epoch": 30.962004850444625, "grad_norm": 0.16775868833065033, "learning_rate": 0.00046954998652654273, "loss": 0.5495, "step": 229800 }, { "epoch": 30.975478307733766, "grad_norm": 0.17151187360286713, "learning_rate": 0.00046951256025629513, "loss": 0.5489, "step": 229900 }, { "epoch": 30.988951765022904, "grad_norm": 0.16629044711589813, "learning_rate": 0.00046947513398604753, "loss": 0.5484, "step": 230000 }, { "epoch": 31.0, "eval_loss": 0.5365706086158752, "eval_runtime": 5.0034, "eval_samples_per_second": 999.314, "eval_steps_per_second": 15.789, "step": 230082 }, { "epoch": 31.002425222312045, "grad_norm": 0.16865189373493195, "learning_rate": 0.00046943770771579993, "loss": 0.5493, "step": 230100 }, { "epoch": 31.015898679601186, "grad_norm": 0.1639135330915451, "learning_rate": 0.0004694002814455523, "loss": 0.5481, "step": 230200 }, { "epoch": 31.029372136890327, "grad_norm": 0.1661965399980545, "learning_rate": 0.0004693628551753046, "loss": 0.5472, "step": 230300 }, { "epoch": 31.042845594179468, "grad_norm": 0.17717289924621582, "learning_rate": 0.000469325428905057, "loss": 0.5489, "step": 230400 }, { "epoch": 31.056319051468606, "grad_norm": 0.17398786544799805, "learning_rate": 0.0004692880026348094, "loss": 0.5496, "step": 230500 }, { "epoch": 31.069792508757747, "grad_norm": 0.16656455397605896, "learning_rate": 0.0004692505763645618, "loss": 0.5496, "step": 230600 }, { "epoch": 31.083265966046888, "grad_norm": 0.1619802862405777, "learning_rate": 0.0004692131500943142, "loss": 0.5478, "step": 230700 }, { "epoch": 31.09673942333603, "grad_norm": 0.17168349027633667, "learning_rate": 0.0004691757238240666, "loss": 0.5485, "step": 230800 }, { "epoch": 31.11021288062517, "grad_norm": 0.1796734780073166, "learning_rate": 0.000469138297553819, "loss": 0.5482, "step": 230900 }, { "epoch": 31.123686337914307, "grad_norm": 0.16125023365020752, "learning_rate": 0.00046910087128357135, "loss": 0.5483, "step": 231000 }, { "epoch": 31.13715979520345, "grad_norm": 0.1778833568096161, "learning_rate": 0.00046906344501332375, "loss": 0.5482, "step": 231100 }, { "epoch": 31.15063325249259, "grad_norm": 0.1623266637325287, "learning_rate": 0.00046902601874307615, "loss": 0.5491, "step": 231200 }, { "epoch": 31.16410670978173, "grad_norm": 0.18042749166488647, "learning_rate": 0.00046898859247282855, "loss": 0.5488, "step": 231300 }, { "epoch": 31.17758016707087, "grad_norm": 0.16434484720230103, "learning_rate": 0.00046895116620258094, "loss": 0.5476, "step": 231400 }, { "epoch": 31.19105362436001, "grad_norm": 0.1641870141029358, "learning_rate": 0.00046891373993233334, "loss": 0.5482, "step": 231500 }, { "epoch": 31.20452708164915, "grad_norm": 0.17183858156204224, "learning_rate": 0.00046887631366208574, "loss": 0.5487, "step": 231600 }, { "epoch": 31.21800053893829, "grad_norm": 0.1615336686372757, "learning_rate": 0.0004688388873918381, "loss": 0.5472, "step": 231700 }, { "epoch": 31.231473996227432, "grad_norm": 0.15530604124069214, "learning_rate": 0.0004688014611215905, "loss": 0.5472, "step": 231800 }, { "epoch": 31.244947453516573, "grad_norm": 0.16777341067790985, "learning_rate": 0.0004687640348513429, "loss": 0.5481, "step": 231900 }, { "epoch": 31.25842091080571, "grad_norm": 0.16296987235546112, "learning_rate": 0.0004687266085810952, "loss": 0.5485, "step": 232000 }, { "epoch": 31.271894368094852, "grad_norm": 0.17065976560115814, "learning_rate": 0.0004686891823108476, "loss": 0.5485, "step": 232100 }, { "epoch": 31.285367825383993, "grad_norm": 0.18319553136825562, "learning_rate": 0.0004686517560406, "loss": 0.5499, "step": 232200 }, { "epoch": 31.298841282673134, "grad_norm": 0.15881969034671783, "learning_rate": 0.0004686143297703524, "loss": 0.5485, "step": 232300 }, { "epoch": 31.312314739962275, "grad_norm": 0.17165528237819672, "learning_rate": 0.00046857690350010476, "loss": 0.5477, "step": 232400 }, { "epoch": 31.325788197251416, "grad_norm": 0.1648222655057907, "learning_rate": 0.00046853947722985716, "loss": 0.5473, "step": 232500 }, { "epoch": 31.339261654540554, "grad_norm": 0.1592414528131485, "learning_rate": 0.00046850205095960956, "loss": 0.549, "step": 232600 }, { "epoch": 31.352735111829695, "grad_norm": 0.1758939027786255, "learning_rate": 0.00046846462468936196, "loss": 0.5489, "step": 232700 }, { "epoch": 31.366208569118836, "grad_norm": 0.1721964031457901, "learning_rate": 0.00046842719841911436, "loss": 0.5477, "step": 232800 }, { "epoch": 31.379682026407977, "grad_norm": 0.1646258682012558, "learning_rate": 0.00046838977214886675, "loss": 0.5489, "step": 232900 }, { "epoch": 31.393155483697118, "grad_norm": 0.1792680323123932, "learning_rate": 0.00046835234587861915, "loss": 0.5481, "step": 233000 }, { "epoch": 31.406628940986256, "grad_norm": 0.16914886236190796, "learning_rate": 0.00046831491960837155, "loss": 0.549, "step": 233100 }, { "epoch": 31.420102398275397, "grad_norm": 0.1796351671218872, "learning_rate": 0.0004682774933381239, "loss": 0.5478, "step": 233200 }, { "epoch": 31.433575855564538, "grad_norm": 0.16763421893119812, "learning_rate": 0.0004682400670678763, "loss": 0.5484, "step": 233300 }, { "epoch": 31.44704931285368, "grad_norm": 0.17544065415859222, "learning_rate": 0.0004682026407976287, "loss": 0.5492, "step": 233400 }, { "epoch": 31.46052277014282, "grad_norm": 0.16714885830879211, "learning_rate": 0.0004681652145273811, "loss": 0.5499, "step": 233500 }, { "epoch": 31.473996227431957, "grad_norm": 0.16708113253116608, "learning_rate": 0.0004681277882571335, "loss": 0.5484, "step": 233600 }, { "epoch": 31.4874696847211, "grad_norm": 0.16788318753242493, "learning_rate": 0.0004680903619868859, "loss": 0.548, "step": 233700 }, { "epoch": 31.50094314201024, "grad_norm": 0.15791718661785126, "learning_rate": 0.00046805293571663823, "loss": 0.5497, "step": 233800 }, { "epoch": 31.51441659929938, "grad_norm": 0.16031372547149658, "learning_rate": 0.0004680155094463906, "loss": 0.5483, "step": 233900 }, { "epoch": 31.52789005658852, "grad_norm": 0.17643168568611145, "learning_rate": 0.00046797808317614297, "loss": 0.5485, "step": 234000 }, { "epoch": 31.541363513877663, "grad_norm": 0.1841372698545456, "learning_rate": 0.00046794065690589537, "loss": 0.5481, "step": 234100 }, { "epoch": 31.5548369711668, "grad_norm": 0.17814664542675018, "learning_rate": 0.00046790323063564777, "loss": 0.5488, "step": 234200 }, { "epoch": 31.56831042845594, "grad_norm": 0.17008495330810547, "learning_rate": 0.00046786580436540017, "loss": 0.5495, "step": 234300 }, { "epoch": 31.581783885745082, "grad_norm": 0.15965133905410767, "learning_rate": 0.00046782837809515257, "loss": 0.5475, "step": 234400 }, { "epoch": 31.595257343034223, "grad_norm": 0.17374013364315033, "learning_rate": 0.00046779095182490496, "loss": 0.5479, "step": 234500 }, { "epoch": 31.608730800323364, "grad_norm": 0.1739463359117508, "learning_rate": 0.0004677535255546573, "loss": 0.5475, "step": 234600 }, { "epoch": 31.622204257612502, "grad_norm": 0.175629660487175, "learning_rate": 0.0004677160992844097, "loss": 0.5473, "step": 234700 }, { "epoch": 31.635677714901643, "grad_norm": 0.1807900071144104, "learning_rate": 0.0004676786730141621, "loss": 0.5489, "step": 234800 }, { "epoch": 31.649151172190784, "grad_norm": 0.1669939160346985, "learning_rate": 0.0004676412467439145, "loss": 0.5466, "step": 234900 }, { "epoch": 31.662624629479925, "grad_norm": 0.17651617527008057, "learning_rate": 0.0004676038204736669, "loss": 0.5469, "step": 235000 }, { "epoch": 31.676098086769066, "grad_norm": 0.17230592668056488, "learning_rate": 0.0004675663942034193, "loss": 0.5483, "step": 235100 }, { "epoch": 31.689571544058204, "grad_norm": 0.1569424718618393, "learning_rate": 0.0004675289679331717, "loss": 0.5492, "step": 235200 }, { "epoch": 31.703045001347345, "grad_norm": 0.1603575199842453, "learning_rate": 0.0004674915416629241, "loss": 0.5475, "step": 235300 }, { "epoch": 31.716518458636486, "grad_norm": 0.15725374221801758, "learning_rate": 0.00046745411539267644, "loss": 0.5484, "step": 235400 }, { "epoch": 31.729991915925627, "grad_norm": 0.15798556804656982, "learning_rate": 0.00046741668912242884, "loss": 0.5474, "step": 235500 }, { "epoch": 31.743465373214768, "grad_norm": 0.18820776045322418, "learning_rate": 0.0004673792628521812, "loss": 0.5483, "step": 235600 }, { "epoch": 31.756938830503906, "grad_norm": 0.17847400903701782, "learning_rate": 0.0004673418365819336, "loss": 0.5478, "step": 235700 }, { "epoch": 31.770412287793047, "grad_norm": 0.17400044202804565, "learning_rate": 0.000467304410311686, "loss": 0.5484, "step": 235800 }, { "epoch": 31.783885745082188, "grad_norm": 0.1535678654909134, "learning_rate": 0.0004672669840414384, "loss": 0.5481, "step": 235900 }, { "epoch": 31.79735920237133, "grad_norm": 0.16782939434051514, "learning_rate": 0.0004672295577711908, "loss": 0.5487, "step": 236000 }, { "epoch": 31.81083265966047, "grad_norm": 0.162967249751091, "learning_rate": 0.0004671921315009431, "loss": 0.548, "step": 236100 }, { "epoch": 31.82430611694961, "grad_norm": 0.16107435524463654, "learning_rate": 0.0004671547052306955, "loss": 0.5478, "step": 236200 }, { "epoch": 31.83777957423875, "grad_norm": 0.16900591552257538, "learning_rate": 0.0004671172789604479, "loss": 0.5477, "step": 236300 }, { "epoch": 31.85125303152789, "grad_norm": 0.16693326830863953, "learning_rate": 0.0004670798526902003, "loss": 0.5488, "step": 236400 }, { "epoch": 31.86472648881703, "grad_norm": 0.1633964478969574, "learning_rate": 0.0004670424264199527, "loss": 0.5466, "step": 236500 }, { "epoch": 31.87819994610617, "grad_norm": 0.1512547731399536, "learning_rate": 0.0004670050001497051, "loss": 0.5482, "step": 236600 }, { "epoch": 31.891673403395313, "grad_norm": 0.1682012379169464, "learning_rate": 0.0004669675738794575, "loss": 0.5481, "step": 236700 }, { "epoch": 31.90514686068445, "grad_norm": 0.16442981362342834, "learning_rate": 0.00046693014760920985, "loss": 0.5469, "step": 236800 }, { "epoch": 31.91862031797359, "grad_norm": 0.1662144809961319, "learning_rate": 0.00046689272133896225, "loss": 0.549, "step": 236900 }, { "epoch": 31.932093775262732, "grad_norm": 0.17824286222457886, "learning_rate": 0.00046685529506871465, "loss": 0.5481, "step": 237000 }, { "epoch": 31.945567232551873, "grad_norm": 0.16644833981990814, "learning_rate": 0.00046681786879846705, "loss": 0.5485, "step": 237100 }, { "epoch": 31.959040689841014, "grad_norm": 0.16360092163085938, "learning_rate": 0.00046678044252821945, "loss": 0.5483, "step": 237200 }, { "epoch": 31.972514147130152, "grad_norm": 0.1604997217655182, "learning_rate": 0.00046674301625797184, "loss": 0.5497, "step": 237300 }, { "epoch": 31.985987604419293, "grad_norm": 0.16528329253196716, "learning_rate": 0.0004667055899877242, "loss": 0.5487, "step": 237400 }, { "epoch": 31.999461061708434, "grad_norm": 0.1689598560333252, "learning_rate": 0.00046666816371747653, "loss": 0.5474, "step": 237500 }, { "epoch": 32.0, "eval_loss": 0.5353801250457764, "eval_runtime": 4.9539, "eval_samples_per_second": 1009.312, "eval_steps_per_second": 15.947, "step": 237504 }, { "epoch": 32.012934518997575, "grad_norm": 0.16965165734291077, "learning_rate": 0.00046663073744722893, "loss": 0.547, "step": 237600 }, { "epoch": 32.02640797628671, "grad_norm": 0.16133156418800354, "learning_rate": 0.00046659331117698133, "loss": 0.5468, "step": 237700 }, { "epoch": 32.03988143357586, "grad_norm": 0.17452144622802734, "learning_rate": 0.0004665558849067337, "loss": 0.5479, "step": 237800 }, { "epoch": 32.053354890864995, "grad_norm": 0.1555728316307068, "learning_rate": 0.0004665184586364861, "loss": 0.5487, "step": 237900 }, { "epoch": 32.06682834815414, "grad_norm": 0.16087688505649567, "learning_rate": 0.0004664810323662385, "loss": 0.5483, "step": 238000 }, { "epoch": 32.08030180544328, "grad_norm": 0.16196173429489136, "learning_rate": 0.0004664436060959909, "loss": 0.5468, "step": 238100 }, { "epoch": 32.093775262732414, "grad_norm": 0.16502241790294647, "learning_rate": 0.0004664061798257433, "loss": 0.5474, "step": 238200 }, { "epoch": 32.10724872002156, "grad_norm": 0.18711768090724945, "learning_rate": 0.00046636875355549566, "loss": 0.547, "step": 238300 }, { "epoch": 32.1207221773107, "grad_norm": 0.1618199348449707, "learning_rate": 0.00046633132728524806, "loss": 0.5472, "step": 238400 }, { "epoch": 32.13419563459984, "grad_norm": 0.1667901873588562, "learning_rate": 0.00046629390101500046, "loss": 0.5464, "step": 238500 }, { "epoch": 32.14766909188898, "grad_norm": 0.16639821231365204, "learning_rate": 0.00046625647474475286, "loss": 0.5475, "step": 238600 }, { "epoch": 32.161142549178116, "grad_norm": 0.16041816771030426, "learning_rate": 0.00046621904847450526, "loss": 0.5477, "step": 238700 }, { "epoch": 32.17461600646726, "grad_norm": 0.1667042374610901, "learning_rate": 0.00046618162220425765, "loss": 0.5473, "step": 238800 }, { "epoch": 32.1880894637564, "grad_norm": 0.20161186158657074, "learning_rate": 0.00046614419593401005, "loss": 0.548, "step": 238900 }, { "epoch": 32.20156292104554, "grad_norm": 0.1545700877904892, "learning_rate": 0.0004661067696637624, "loss": 0.5487, "step": 239000 }, { "epoch": 32.21503637833468, "grad_norm": 0.15329113602638245, "learning_rate": 0.0004660693433935148, "loss": 0.5484, "step": 239100 }, { "epoch": 32.22850983562382, "grad_norm": 0.1586741805076599, "learning_rate": 0.00046603191712326714, "loss": 0.5478, "step": 239200 }, { "epoch": 32.24198329291296, "grad_norm": 0.1567579060792923, "learning_rate": 0.00046599449085301954, "loss": 0.5477, "step": 239300 }, { "epoch": 32.2554567502021, "grad_norm": 0.15090882778167725, "learning_rate": 0.00046595706458277194, "loss": 0.5466, "step": 239400 }, { "epoch": 32.268930207491245, "grad_norm": 0.16265642642974854, "learning_rate": 0.00046591963831252433, "loss": 0.5469, "step": 239500 }, { "epoch": 32.28240366478038, "grad_norm": 0.1644817441701889, "learning_rate": 0.00046588221204227673, "loss": 0.5469, "step": 239600 }, { "epoch": 32.29587712206952, "grad_norm": 0.17696227133274078, "learning_rate": 0.0004658447857720291, "loss": 0.5478, "step": 239700 }, { "epoch": 32.309350579358664, "grad_norm": 0.1697552353143692, "learning_rate": 0.0004658073595017815, "loss": 0.5475, "step": 239800 }, { "epoch": 32.3228240366478, "grad_norm": 0.1667381376028061, "learning_rate": 0.00046576993323153387, "loss": 0.5479, "step": 239900 }, { "epoch": 32.33629749393695, "grad_norm": 0.16961532831192017, "learning_rate": 0.00046573250696128627, "loss": 0.5481, "step": 240000 }, { "epoch": 32.349770951226084, "grad_norm": 0.18624578416347504, "learning_rate": 0.00046569508069103867, "loss": 0.5478, "step": 240100 }, { "epoch": 32.36324440851522, "grad_norm": 0.15623445808887482, "learning_rate": 0.00046565765442079107, "loss": 0.5472, "step": 240200 }, { "epoch": 32.376717865804366, "grad_norm": 0.15837538242340088, "learning_rate": 0.00046562022815054346, "loss": 0.5472, "step": 240300 }, { "epoch": 32.390191323093504, "grad_norm": 0.1676948517560959, "learning_rate": 0.00046558280188029586, "loss": 0.5461, "step": 240400 }, { "epoch": 32.40366478038265, "grad_norm": 0.16832207143306732, "learning_rate": 0.0004655453756100482, "loss": 0.5482, "step": 240500 }, { "epoch": 32.417138237671786, "grad_norm": 0.16490961611270905, "learning_rate": 0.0004655079493398006, "loss": 0.5473, "step": 240600 }, { "epoch": 32.43061169496093, "grad_norm": 0.16323786973953247, "learning_rate": 0.000465470523069553, "loss": 0.5483, "step": 240700 }, { "epoch": 32.44408515225007, "grad_norm": 0.15371820330619812, "learning_rate": 0.0004654330967993054, "loss": 0.5468, "step": 240800 }, { "epoch": 32.457558609539205, "grad_norm": 0.17800095677375793, "learning_rate": 0.0004653956705290578, "loss": 0.5478, "step": 240900 }, { "epoch": 32.47103206682835, "grad_norm": 0.1612703949213028, "learning_rate": 0.00046535824425881014, "loss": 0.5461, "step": 241000 }, { "epoch": 32.48450552411749, "grad_norm": 0.1545567661523819, "learning_rate": 0.00046532081798856254, "loss": 0.5468, "step": 241100 }, { "epoch": 32.49797898140663, "grad_norm": 0.18128305673599243, "learning_rate": 0.0004652833917183149, "loss": 0.5482, "step": 241200 }, { "epoch": 32.51145243869577, "grad_norm": 0.19356133043766022, "learning_rate": 0.0004652459654480673, "loss": 0.5469, "step": 241300 }, { "epoch": 32.52492589598491, "grad_norm": 0.16739876568317413, "learning_rate": 0.0004652085391778197, "loss": 0.5474, "step": 241400 }, { "epoch": 32.53839935327405, "grad_norm": 0.157633975148201, "learning_rate": 0.0004651711129075721, "loss": 0.5464, "step": 241500 }, { "epoch": 32.55187281056319, "grad_norm": 0.15462172031402588, "learning_rate": 0.0004651336866373245, "loss": 0.5489, "step": 241600 }, { "epoch": 32.565346267852334, "grad_norm": 0.15784238278865814, "learning_rate": 0.0004650962603670769, "loss": 0.5474, "step": 241700 }, { "epoch": 32.57881972514147, "grad_norm": 0.165744811296463, "learning_rate": 0.0004650588340968293, "loss": 0.5474, "step": 241800 }, { "epoch": 32.59229318243061, "grad_norm": 0.16672490537166595, "learning_rate": 0.0004650214078265816, "loss": 0.548, "step": 241900 }, { "epoch": 32.605766639719754, "grad_norm": 0.16079501807689667, "learning_rate": 0.000464983981556334, "loss": 0.5468, "step": 242000 }, { "epoch": 32.61924009700889, "grad_norm": 0.16080233454704285, "learning_rate": 0.0004649465552860864, "loss": 0.5457, "step": 242100 }, { "epoch": 32.632713554298036, "grad_norm": 0.17308412492275238, "learning_rate": 0.0004649091290158388, "loss": 0.5481, "step": 242200 }, { "epoch": 32.64618701158717, "grad_norm": 0.1571972817182541, "learning_rate": 0.0004648717027455912, "loss": 0.5464, "step": 242300 }, { "epoch": 32.65966046887631, "grad_norm": 0.16127263009548187, "learning_rate": 0.0004648342764753436, "loss": 0.547, "step": 242400 }, { "epoch": 32.673133926165455, "grad_norm": 0.1818430870771408, "learning_rate": 0.000464796850205096, "loss": 0.5469, "step": 242500 }, { "epoch": 32.68660738345459, "grad_norm": 0.1680542677640915, "learning_rate": 0.00046475942393484835, "loss": 0.547, "step": 242600 }, { "epoch": 32.70008084074374, "grad_norm": 0.1713476926088333, "learning_rate": 0.00046472199766460075, "loss": 0.5475, "step": 242700 }, { "epoch": 32.713554298032875, "grad_norm": 0.15737827122211456, "learning_rate": 0.00046468457139435315, "loss": 0.5474, "step": 242800 }, { "epoch": 32.72702775532201, "grad_norm": 0.15127331018447876, "learning_rate": 0.0004646471451241055, "loss": 0.5461, "step": 242900 }, { "epoch": 32.74050121261116, "grad_norm": 0.16529744863510132, "learning_rate": 0.0004646097188538579, "loss": 0.547, "step": 243000 }, { "epoch": 32.753974669900295, "grad_norm": 0.1634749323129654, "learning_rate": 0.0004645722925836103, "loss": 0.5471, "step": 243100 }, { "epoch": 32.76744812718944, "grad_norm": 0.1681445986032486, "learning_rate": 0.0004645348663133627, "loss": 0.5465, "step": 243200 }, { "epoch": 32.78092158447858, "grad_norm": 0.17729538679122925, "learning_rate": 0.0004644974400431151, "loss": 0.5462, "step": 243300 }, { "epoch": 32.794395041767714, "grad_norm": 0.1551133692264557, "learning_rate": 0.00046446001377286743, "loss": 0.5466, "step": 243400 }, { "epoch": 32.80786849905686, "grad_norm": 0.17416973412036896, "learning_rate": 0.00046442258750261983, "loss": 0.5467, "step": 243500 }, { "epoch": 32.821341956346, "grad_norm": 0.16529807448387146, "learning_rate": 0.00046438516123237223, "loss": 0.547, "step": 243600 }, { "epoch": 32.83481541363514, "grad_norm": 0.16037686169147491, "learning_rate": 0.0004643477349621246, "loss": 0.5469, "step": 243700 }, { "epoch": 32.84828887092428, "grad_norm": 0.17465688288211823, "learning_rate": 0.000464310308691877, "loss": 0.547, "step": 243800 }, { "epoch": 32.861762328213416, "grad_norm": 0.1692625731229782, "learning_rate": 0.0004642728824216294, "loss": 0.5471, "step": 243900 }, { "epoch": 32.87523578550256, "grad_norm": 0.17341122031211853, "learning_rate": 0.0004642354561513818, "loss": 0.5467, "step": 244000 }, { "epoch": 32.8887092427917, "grad_norm": 0.15253186225891113, "learning_rate": 0.00046419802988113416, "loss": 0.5466, "step": 244100 }, { "epoch": 32.90218270008084, "grad_norm": 0.16640067100524902, "learning_rate": 0.00046416060361088656, "loss": 0.5469, "step": 244200 }, { "epoch": 32.91565615736998, "grad_norm": 0.1569000482559204, "learning_rate": 0.00046412317734063896, "loss": 0.5467, "step": 244300 }, { "epoch": 32.929129614659125, "grad_norm": 0.1526719331741333, "learning_rate": 0.00046408575107039136, "loss": 0.5461, "step": 244400 }, { "epoch": 32.94260307194826, "grad_norm": 0.1689184606075287, "learning_rate": 0.00046404832480014376, "loss": 0.5475, "step": 244500 }, { "epoch": 32.9560765292374, "grad_norm": 0.161563903093338, "learning_rate": 0.00046401089852989616, "loss": 0.5471, "step": 244600 }, { "epoch": 32.969549986526545, "grad_norm": 0.17448903620243073, "learning_rate": 0.0004639734722596485, "loss": 0.5469, "step": 244700 }, { "epoch": 32.98302344381568, "grad_norm": 0.17175902426242828, "learning_rate": 0.00046393604598940084, "loss": 0.5471, "step": 244800 }, { "epoch": 32.99649690110483, "grad_norm": 0.17090977728366852, "learning_rate": 0.00046389861971915324, "loss": 0.5476, "step": 244900 }, { "epoch": 33.0, "eval_loss": 0.5345906615257263, "eval_runtime": 4.9532, "eval_samples_per_second": 1009.455, "eval_steps_per_second": 15.949, "step": 244926 }, { "epoch": 33.009970358393964, "grad_norm": 0.15591661632061005, "learning_rate": 0.00046386119344890564, "loss": 0.5467, "step": 245000 }, { "epoch": 33.0234438156831, "grad_norm": 0.1674133539199829, "learning_rate": 0.00046382376717865804, "loss": 0.5452, "step": 245100 }, { "epoch": 33.03691727297225, "grad_norm": 0.1739063411951065, "learning_rate": 0.00046378634090841044, "loss": 0.546, "step": 245200 }, { "epoch": 33.050390730261384, "grad_norm": 0.17226147651672363, "learning_rate": 0.00046374891463816283, "loss": 0.5463, "step": 245300 }, { "epoch": 33.06386418755053, "grad_norm": 0.1976121962070465, "learning_rate": 0.00046371148836791523, "loss": 0.5469, "step": 245400 }, { "epoch": 33.077337644839666, "grad_norm": 0.1635006219148636, "learning_rate": 0.0004636740620976676, "loss": 0.547, "step": 245500 }, { "epoch": 33.090811102128804, "grad_norm": 0.15853171050548553, "learning_rate": 0.00046363663582742, "loss": 0.5466, "step": 245600 }, { "epoch": 33.10428455941795, "grad_norm": 0.1683986485004425, "learning_rate": 0.0004635992095571724, "loss": 0.5465, "step": 245700 }, { "epoch": 33.117758016707086, "grad_norm": 0.1727258414030075, "learning_rate": 0.00046356178328692477, "loss": 0.547, "step": 245800 }, { "epoch": 33.13123147399623, "grad_norm": 0.180406391620636, "learning_rate": 0.00046352435701667717, "loss": 0.5464, "step": 245900 }, { "epoch": 33.14470493128537, "grad_norm": 0.16970470547676086, "learning_rate": 0.00046348693074642957, "loss": 0.5458, "step": 246000 }, { "epoch": 33.158178388574505, "grad_norm": 0.15612603724002838, "learning_rate": 0.00046344950447618197, "loss": 0.5471, "step": 246100 }, { "epoch": 33.17165184586365, "grad_norm": 0.16413600742816925, "learning_rate": 0.00046341207820593436, "loss": 0.5472, "step": 246200 }, { "epoch": 33.18512530315279, "grad_norm": 0.1565878540277481, "learning_rate": 0.0004633746519356867, "loss": 0.5452, "step": 246300 }, { "epoch": 33.19859876044193, "grad_norm": 0.16484445333480835, "learning_rate": 0.0004633372256654391, "loss": 0.5458, "step": 246400 }, { "epoch": 33.21207221773107, "grad_norm": 0.1883203685283661, "learning_rate": 0.00046329979939519145, "loss": 0.5455, "step": 246500 }, { "epoch": 33.22554567502021, "grad_norm": 0.17535391449928284, "learning_rate": 0.00046326237312494385, "loss": 0.5463, "step": 246600 }, { "epoch": 33.23901913230935, "grad_norm": 0.15356333553791046, "learning_rate": 0.00046322494685469625, "loss": 0.547, "step": 246700 }, { "epoch": 33.25249258959849, "grad_norm": 0.15874618291854858, "learning_rate": 0.00046318752058444865, "loss": 0.5451, "step": 246800 }, { "epoch": 33.265966046887634, "grad_norm": 0.16444894671440125, "learning_rate": 0.00046315009431420104, "loss": 0.5467, "step": 246900 }, { "epoch": 33.27943950417677, "grad_norm": 0.17313721776008606, "learning_rate": 0.0004631126680439534, "loss": 0.5466, "step": 247000 }, { "epoch": 33.29291296146591, "grad_norm": 0.1801762580871582, "learning_rate": 0.0004630752417737058, "loss": 0.5452, "step": 247100 }, { "epoch": 33.306386418755054, "grad_norm": 0.1771732121706009, "learning_rate": 0.0004630378155034582, "loss": 0.5473, "step": 247200 }, { "epoch": 33.31985987604419, "grad_norm": 0.17513054609298706, "learning_rate": 0.0004630003892332106, "loss": 0.5461, "step": 247300 }, { "epoch": 33.333333333333336, "grad_norm": 0.15581487119197845, "learning_rate": 0.000462962962962963, "loss": 0.5468, "step": 247400 }, { "epoch": 33.34680679062247, "grad_norm": 0.16661986708641052, "learning_rate": 0.0004629255366927154, "loss": 0.5451, "step": 247500 }, { "epoch": 33.36028024791161, "grad_norm": 0.1745883673429489, "learning_rate": 0.0004628881104224678, "loss": 0.5468, "step": 247600 }, { "epoch": 33.373753705200755, "grad_norm": 0.16820184886455536, "learning_rate": 0.0004628506841522201, "loss": 0.547, "step": 247700 }, { "epoch": 33.38722716248989, "grad_norm": 0.15906819701194763, "learning_rate": 0.0004628132578819725, "loss": 0.5465, "step": 247800 }, { "epoch": 33.40070061977904, "grad_norm": 0.16798458993434906, "learning_rate": 0.0004627758316117249, "loss": 0.5466, "step": 247900 }, { "epoch": 33.414174077068175, "grad_norm": 0.16156378388404846, "learning_rate": 0.0004627384053414773, "loss": 0.547, "step": 248000 }, { "epoch": 33.42764753435732, "grad_norm": 0.17751917243003845, "learning_rate": 0.0004627009790712297, "loss": 0.5465, "step": 248100 }, { "epoch": 33.44112099164646, "grad_norm": 0.15853700041770935, "learning_rate": 0.0004626635528009821, "loss": 0.5462, "step": 248200 }, { "epoch": 33.454594448935595, "grad_norm": 0.14387917518615723, "learning_rate": 0.00046262612653073446, "loss": 0.5481, "step": 248300 }, { "epoch": 33.46806790622474, "grad_norm": 0.1590173989534378, "learning_rate": 0.00046258870026048685, "loss": 0.5468, "step": 248400 }, { "epoch": 33.48154136351388, "grad_norm": 0.15296225249767303, "learning_rate": 0.0004625512739902392, "loss": 0.5464, "step": 248500 }, { "epoch": 33.49501482080302, "grad_norm": 0.15879538655281067, "learning_rate": 0.0004625138477199916, "loss": 0.5457, "step": 248600 }, { "epoch": 33.50848827809216, "grad_norm": 0.15713928639888763, "learning_rate": 0.000462476421449744, "loss": 0.5451, "step": 248700 }, { "epoch": 33.521961735381296, "grad_norm": 0.16491101682186127, "learning_rate": 0.0004624389951794964, "loss": 0.5465, "step": 248800 }, { "epoch": 33.53543519267044, "grad_norm": 0.16367357969284058, "learning_rate": 0.0004624015689092488, "loss": 0.5468, "step": 248900 }, { "epoch": 33.54890864995958, "grad_norm": 0.16818933188915253, "learning_rate": 0.0004623641426390012, "loss": 0.5461, "step": 249000 }, { "epoch": 33.56238210724872, "grad_norm": 0.16500000655651093, "learning_rate": 0.0004623267163687536, "loss": 0.5465, "step": 249100 }, { "epoch": 33.57585556453786, "grad_norm": 0.17025426030158997, "learning_rate": 0.00046228929009850593, "loss": 0.5462, "step": 249200 }, { "epoch": 33.589329021827, "grad_norm": 0.17564000189304352, "learning_rate": 0.00046225186382825833, "loss": 0.5457, "step": 249300 }, { "epoch": 33.60280247911614, "grad_norm": 0.1572171151638031, "learning_rate": 0.00046221443755801073, "loss": 0.5472, "step": 249400 }, { "epoch": 33.61627593640528, "grad_norm": 0.15571925044059753, "learning_rate": 0.0004621770112877631, "loss": 0.5454, "step": 249500 }, { "epoch": 33.629749393694425, "grad_norm": 0.17379796504974365, "learning_rate": 0.0004621395850175155, "loss": 0.5455, "step": 249600 }, { "epoch": 33.64322285098356, "grad_norm": 0.18281924724578857, "learning_rate": 0.0004621021587472679, "loss": 0.5462, "step": 249700 }, { "epoch": 33.6566963082727, "grad_norm": 0.1557118445634842, "learning_rate": 0.0004620647324770203, "loss": 0.5467, "step": 249800 }, { "epoch": 33.670169765561845, "grad_norm": 0.15771359205245972, "learning_rate": 0.00046202730620677267, "loss": 0.5465, "step": 249900 }, { "epoch": 33.68364322285098, "grad_norm": 0.16880641877651215, "learning_rate": 0.00046198987993652506, "loss": 0.5465, "step": 250000 }, { "epoch": 33.69711668014013, "grad_norm": 0.17630651593208313, "learning_rate": 0.0004619524536662774, "loss": 0.5464, "step": 250100 }, { "epoch": 33.710590137429264, "grad_norm": 0.16401535272598267, "learning_rate": 0.0004619150273960298, "loss": 0.5463, "step": 250200 }, { "epoch": 33.7240635947184, "grad_norm": 0.1665366291999817, "learning_rate": 0.0004618776011257822, "loss": 0.5462, "step": 250300 }, { "epoch": 33.737537052007546, "grad_norm": 0.15576423704624176, "learning_rate": 0.0004618401748555346, "loss": 0.546, "step": 250400 }, { "epoch": 33.751010509296684, "grad_norm": 0.15121960639953613, "learning_rate": 0.000461802748585287, "loss": 0.546, "step": 250500 }, { "epoch": 33.76448396658583, "grad_norm": 0.16248658299446106, "learning_rate": 0.00046176532231503934, "loss": 0.5461, "step": 250600 }, { "epoch": 33.777957423874966, "grad_norm": 0.1720743179321289, "learning_rate": 0.00046172789604479174, "loss": 0.5471, "step": 250700 }, { "epoch": 33.7914308811641, "grad_norm": 0.1534797102212906, "learning_rate": 0.00046169046977454414, "loss": 0.5463, "step": 250800 }, { "epoch": 33.80490433845325, "grad_norm": 0.166120707988739, "learning_rate": 0.00046165304350429654, "loss": 0.5465, "step": 250900 }, { "epoch": 33.818377795742386, "grad_norm": 0.1927204430103302, "learning_rate": 0.00046161561723404894, "loss": 0.546, "step": 251000 }, { "epoch": 33.83185125303153, "grad_norm": 0.16927194595336914, "learning_rate": 0.00046157819096380134, "loss": 0.5463, "step": 251100 }, { "epoch": 33.84532471032067, "grad_norm": 0.17176465690135956, "learning_rate": 0.00046154076469355373, "loss": 0.5467, "step": 251200 }, { "epoch": 33.858798167609805, "grad_norm": 0.17001786828041077, "learning_rate": 0.00046150333842330613, "loss": 0.5448, "step": 251300 }, { "epoch": 33.87227162489895, "grad_norm": 0.16636674106121063, "learning_rate": 0.0004614659121530585, "loss": 0.5465, "step": 251400 }, { "epoch": 33.88574508218809, "grad_norm": 0.16816960275173187, "learning_rate": 0.0004614284858828109, "loss": 0.5461, "step": 251500 }, { "epoch": 33.89921853947723, "grad_norm": 0.16133366525173187, "learning_rate": 0.00046139105961256327, "loss": 0.547, "step": 251600 }, { "epoch": 33.91269199676637, "grad_norm": 0.17868778109550476, "learning_rate": 0.00046135363334231567, "loss": 0.5457, "step": 251700 }, { "epoch": 33.92616545405551, "grad_norm": 0.17699669301509857, "learning_rate": 0.00046131620707206807, "loss": 0.5466, "step": 251800 }, { "epoch": 33.93963891134465, "grad_norm": 0.17374880611896515, "learning_rate": 0.0004612787808018204, "loss": 0.5455, "step": 251900 }, { "epoch": 33.95311236863379, "grad_norm": 0.1637049913406372, "learning_rate": 0.0004612413545315728, "loss": 0.5469, "step": 252000 }, { "epoch": 33.966585825922934, "grad_norm": 0.16768357157707214, "learning_rate": 0.00046120392826132516, "loss": 0.5462, "step": 252100 }, { "epoch": 33.98005928321207, "grad_norm": 0.17253845930099487, "learning_rate": 0.00046116650199107755, "loss": 0.5471, "step": 252200 }, { "epoch": 33.993532740501216, "grad_norm": 0.16446629166603088, "learning_rate": 0.00046112907572082995, "loss": 0.5457, "step": 252300 }, { "epoch": 34.0, "eval_loss": 0.5336807370185852, "eval_runtime": 4.9864, "eval_samples_per_second": 1002.736, "eval_steps_per_second": 15.843, "step": 252348 }, { "epoch": 34.00700619779035, "grad_norm": 0.1581709384918213, "learning_rate": 0.00046109164945058235, "loss": 0.5453, "step": 252400 }, { "epoch": 34.02047965507949, "grad_norm": 0.15291084349155426, "learning_rate": 0.00046105422318033475, "loss": 0.5447, "step": 252500 }, { "epoch": 34.033953112368636, "grad_norm": 0.16508027911186218, "learning_rate": 0.00046101679691008715, "loss": 0.5446, "step": 252600 }, { "epoch": 34.04742656965777, "grad_norm": 0.16377118229866028, "learning_rate": 0.00046097937063983954, "loss": 0.5438, "step": 252700 }, { "epoch": 34.06090002694692, "grad_norm": 0.169657900929451, "learning_rate": 0.0004609419443695919, "loss": 0.5459, "step": 252800 }, { "epoch": 34.074373484236055, "grad_norm": 0.16379189491271973, "learning_rate": 0.0004609045180993443, "loss": 0.5455, "step": 252900 }, { "epoch": 34.08784694152519, "grad_norm": 0.15668754279613495, "learning_rate": 0.0004608670918290967, "loss": 0.5467, "step": 253000 }, { "epoch": 34.10132039881434, "grad_norm": 0.17896074056625366, "learning_rate": 0.0004608296655588491, "loss": 0.5454, "step": 253100 }, { "epoch": 34.114793856103475, "grad_norm": 0.16657911241054535, "learning_rate": 0.0004607922392886015, "loss": 0.545, "step": 253200 }, { "epoch": 34.12826731339262, "grad_norm": 0.15629757940769196, "learning_rate": 0.0004607548130183539, "loss": 0.5458, "step": 253300 }, { "epoch": 34.14174077068176, "grad_norm": 0.15852873027324677, "learning_rate": 0.0004607173867481063, "loss": 0.5459, "step": 253400 }, { "epoch": 34.155214227970895, "grad_norm": 0.1594444215297699, "learning_rate": 0.0004606799604778587, "loss": 0.5451, "step": 253500 }, { "epoch": 34.16868768526004, "grad_norm": 0.18535706400871277, "learning_rate": 0.000460642534207611, "loss": 0.5443, "step": 253600 }, { "epoch": 34.18216114254918, "grad_norm": 0.17870935797691345, "learning_rate": 0.00046060510793736336, "loss": 0.5457, "step": 253700 }, { "epoch": 34.19563459983832, "grad_norm": 0.15746945142745972, "learning_rate": 0.00046056768166711576, "loss": 0.546, "step": 253800 }, { "epoch": 34.20910805712746, "grad_norm": 0.19258064031600952, "learning_rate": 0.00046053025539686816, "loss": 0.545, "step": 253900 }, { "epoch": 34.222581514416596, "grad_norm": 0.16267631947994232, "learning_rate": 0.00046049282912662056, "loss": 0.5463, "step": 254000 }, { "epoch": 34.23605497170574, "grad_norm": 0.18115182220935822, "learning_rate": 0.00046045540285637296, "loss": 0.5453, "step": 254100 }, { "epoch": 34.24952842899488, "grad_norm": 0.17353041470050812, "learning_rate": 0.00046041797658612536, "loss": 0.5451, "step": 254200 }, { "epoch": 34.26300188628402, "grad_norm": 0.16303282976150513, "learning_rate": 0.0004603805503158777, "loss": 0.5459, "step": 254300 }, { "epoch": 34.27647534357316, "grad_norm": 0.1636786162853241, "learning_rate": 0.0004603431240456301, "loss": 0.5448, "step": 254400 }, { "epoch": 34.2899488008623, "grad_norm": 0.16475607454776764, "learning_rate": 0.0004603056977753825, "loss": 0.5452, "step": 254500 }, { "epoch": 34.30342225815144, "grad_norm": 0.17420624196529388, "learning_rate": 0.0004602682715051349, "loss": 0.546, "step": 254600 }, { "epoch": 34.31689571544058, "grad_norm": 0.17619077861309052, "learning_rate": 0.0004602308452348873, "loss": 0.5455, "step": 254700 }, { "epoch": 34.330369172729725, "grad_norm": 0.15741872787475586, "learning_rate": 0.0004601934189646397, "loss": 0.5446, "step": 254800 }, { "epoch": 34.34384263001886, "grad_norm": 0.16757343709468842, "learning_rate": 0.0004601559926943921, "loss": 0.5445, "step": 254900 }, { "epoch": 34.357316087308, "grad_norm": 0.1567937284708023, "learning_rate": 0.00046011856642414443, "loss": 0.5459, "step": 255000 }, { "epoch": 34.370789544597145, "grad_norm": 0.17011265456676483, "learning_rate": 0.00046008114015389683, "loss": 0.5461, "step": 255100 }, { "epoch": 34.38426300188628, "grad_norm": 0.16556581854820251, "learning_rate": 0.00046004371388364923, "loss": 0.5456, "step": 255200 }, { "epoch": 34.39773645917543, "grad_norm": 0.163911372423172, "learning_rate": 0.00046000628761340163, "loss": 0.5447, "step": 255300 }, { "epoch": 34.411209916464564, "grad_norm": 0.161689892411232, "learning_rate": 0.000459968861343154, "loss": 0.5451, "step": 255400 }, { "epoch": 34.4246833737537, "grad_norm": 0.1613406091928482, "learning_rate": 0.00045993143507290637, "loss": 0.5454, "step": 255500 }, { "epoch": 34.438156831042846, "grad_norm": 0.15477506816387177, "learning_rate": 0.00045989400880265877, "loss": 0.5451, "step": 255600 }, { "epoch": 34.451630288331984, "grad_norm": 0.1620619148015976, "learning_rate": 0.0004598565825324111, "loss": 0.5447, "step": 255700 }, { "epoch": 34.46510374562113, "grad_norm": 0.15076418220996857, "learning_rate": 0.0004598191562621635, "loss": 0.5463, "step": 255800 }, { "epoch": 34.478577202910266, "grad_norm": 0.15809151530265808, "learning_rate": 0.0004597817299919159, "loss": 0.5453, "step": 255900 }, { "epoch": 34.49205066019941, "grad_norm": 0.16336578130722046, "learning_rate": 0.0004597443037216683, "loss": 0.5462, "step": 256000 }, { "epoch": 34.50552411748855, "grad_norm": 0.17799431085586548, "learning_rate": 0.0004597068774514207, "loss": 0.5455, "step": 256100 }, { "epoch": 34.518997574777686, "grad_norm": 0.1670408993959427, "learning_rate": 0.0004596694511811731, "loss": 0.5456, "step": 256200 }, { "epoch": 34.53247103206683, "grad_norm": 0.1626916229724884, "learning_rate": 0.0004596320249109255, "loss": 0.5461, "step": 256300 }, { "epoch": 34.54594448935597, "grad_norm": 0.1538504660129547, "learning_rate": 0.0004595945986406779, "loss": 0.5458, "step": 256400 }, { "epoch": 34.55941794664511, "grad_norm": 0.17899812757968903, "learning_rate": 0.00045955717237043024, "loss": 0.5452, "step": 256500 }, { "epoch": 34.57289140393425, "grad_norm": 0.17188848555088043, "learning_rate": 0.00045951974610018264, "loss": 0.5455, "step": 256600 }, { "epoch": 34.58636486122339, "grad_norm": 0.1601961851119995, "learning_rate": 0.00045948231982993504, "loss": 0.5464, "step": 256700 }, { "epoch": 34.59983831851253, "grad_norm": 0.15880750119686127, "learning_rate": 0.00045944489355968744, "loss": 0.5452, "step": 256800 }, { "epoch": 34.61331177580167, "grad_norm": 0.1584312617778778, "learning_rate": 0.00045940746728943984, "loss": 0.5448, "step": 256900 }, { "epoch": 34.626785233090814, "grad_norm": 0.16316135227680206, "learning_rate": 0.00045937004101919224, "loss": 0.5454, "step": 257000 }, { "epoch": 34.64025869037995, "grad_norm": 0.1670149266719818, "learning_rate": 0.00045933261474894463, "loss": 0.5462, "step": 257100 }, { "epoch": 34.65373214766909, "grad_norm": 0.1495470404624939, "learning_rate": 0.000459295188478697, "loss": 0.5447, "step": 257200 }, { "epoch": 34.667205604958234, "grad_norm": 0.16318975389003754, "learning_rate": 0.0004592577622084493, "loss": 0.5449, "step": 257300 }, { "epoch": 34.68067906224737, "grad_norm": 0.1668025106191635, "learning_rate": 0.0004592203359382017, "loss": 0.5451, "step": 257400 }, { "epoch": 34.694152519536516, "grad_norm": 0.16636627912521362, "learning_rate": 0.0004591829096679541, "loss": 0.5446, "step": 257500 }, { "epoch": 34.70762597682565, "grad_norm": 0.17989395558834076, "learning_rate": 0.0004591454833977065, "loss": 0.5465, "step": 257600 }, { "epoch": 34.72109943411479, "grad_norm": 0.16488872468471527, "learning_rate": 0.0004591080571274589, "loss": 0.5448, "step": 257700 }, { "epoch": 34.734572891403936, "grad_norm": 0.16724635660648346, "learning_rate": 0.0004590706308572113, "loss": 0.5446, "step": 257800 }, { "epoch": 34.74804634869307, "grad_norm": 0.1785886585712433, "learning_rate": 0.00045903320458696366, "loss": 0.5452, "step": 257900 }, { "epoch": 34.76151980598222, "grad_norm": 0.16649699211120605, "learning_rate": 0.00045899577831671605, "loss": 0.5448, "step": 258000 }, { "epoch": 34.774993263271355, "grad_norm": 0.15943209826946259, "learning_rate": 0.00045895835204646845, "loss": 0.5469, "step": 258100 }, { "epoch": 34.78846672056049, "grad_norm": 0.1747884899377823, "learning_rate": 0.00045892092577622085, "loss": 0.5447, "step": 258200 }, { "epoch": 34.80194017784964, "grad_norm": 0.15982168912887573, "learning_rate": 0.00045888349950597325, "loss": 0.5453, "step": 258300 }, { "epoch": 34.815413635138775, "grad_norm": 0.16442462801933289, "learning_rate": 0.00045884607323572565, "loss": 0.5446, "step": 258400 }, { "epoch": 34.82888709242792, "grad_norm": 0.1600576490163803, "learning_rate": 0.00045880864696547805, "loss": 0.5455, "step": 258500 }, { "epoch": 34.84236054971706, "grad_norm": 0.15996409952640533, "learning_rate": 0.00045877122069523044, "loss": 0.5453, "step": 258600 }, { "epoch": 34.855834007006194, "grad_norm": 0.16564905643463135, "learning_rate": 0.0004587337944249828, "loss": 0.5446, "step": 258700 }, { "epoch": 34.86930746429534, "grad_norm": 0.15942466259002686, "learning_rate": 0.0004586963681547352, "loss": 0.5457, "step": 258800 }, { "epoch": 34.88278092158448, "grad_norm": 0.1929023563861847, "learning_rate": 0.0004586589418844876, "loss": 0.545, "step": 258900 }, { "epoch": 34.89625437887362, "grad_norm": 0.16643261909484863, "learning_rate": 0.00045862151561424, "loss": 0.545, "step": 259000 }, { "epoch": 34.90972783616276, "grad_norm": 0.15944822132587433, "learning_rate": 0.0004585840893439924, "loss": 0.5466, "step": 259100 }, { "epoch": 34.923201293451896, "grad_norm": 0.1855081468820572, "learning_rate": 0.0004585466630737447, "loss": 0.5449, "step": 259200 }, { "epoch": 34.93667475074104, "grad_norm": 0.16678060591220856, "learning_rate": 0.0004585092368034971, "loss": 0.5444, "step": 259300 }, { "epoch": 34.95014820803018, "grad_norm": 0.16947638988494873, "learning_rate": 0.00045847181053324947, "loss": 0.5449, "step": 259400 }, { "epoch": 34.96362166531932, "grad_norm": 0.15968604385852814, "learning_rate": 0.00045843438426300187, "loss": 0.5451, "step": 259500 }, { "epoch": 34.97709512260846, "grad_norm": 0.16776520013809204, "learning_rate": 0.00045839695799275426, "loss": 0.5463, "step": 259600 }, { "epoch": 34.990568579897605, "grad_norm": 0.17682644724845886, "learning_rate": 0.00045835953172250666, "loss": 0.5462, "step": 259700 }, { "epoch": 35.0, "eval_loss": 0.5325470566749573, "eval_runtime": 4.9995, "eval_samples_per_second": 1000.09, "eval_steps_per_second": 15.801, "step": 259770 }, { "epoch": 35.00404203718674, "grad_norm": 0.17302462458610535, "learning_rate": 0.00045832210545225906, "loss": 0.5448, "step": 259800 }, { "epoch": 35.01751549447588, "grad_norm": 0.17950224876403809, "learning_rate": 0.00045828467918201146, "loss": 0.5446, "step": 259900 }, { "epoch": 35.030988951765025, "grad_norm": 0.16886471211910248, "learning_rate": 0.00045824725291176386, "loss": 0.5442, "step": 260000 }, { "epoch": 35.04446240905416, "grad_norm": 0.1688198298215866, "learning_rate": 0.0004582098266415162, "loss": 0.5448, "step": 260100 }, { "epoch": 35.05793586634331, "grad_norm": 0.15617217123508453, "learning_rate": 0.0004581724003712686, "loss": 0.5437, "step": 260200 }, { "epoch": 35.071409323632444, "grad_norm": 0.1677985042333603, "learning_rate": 0.000458134974101021, "loss": 0.5447, "step": 260300 }, { "epoch": 35.08488278092158, "grad_norm": 0.179569274187088, "learning_rate": 0.0004580975478307734, "loss": 0.5456, "step": 260400 }, { "epoch": 35.09835623821073, "grad_norm": 0.15810731053352356, "learning_rate": 0.0004580601215605258, "loss": 0.545, "step": 260500 }, { "epoch": 35.111829695499864, "grad_norm": 0.1582070291042328, "learning_rate": 0.0004580226952902782, "loss": 0.5441, "step": 260600 }, { "epoch": 35.12530315278901, "grad_norm": 0.16252188384532928, "learning_rate": 0.0004579852690200306, "loss": 0.5442, "step": 260700 }, { "epoch": 35.138776610078146, "grad_norm": 0.1629231572151184, "learning_rate": 0.00045794784274978293, "loss": 0.5448, "step": 260800 }, { "epoch": 35.152250067367284, "grad_norm": 0.1513308733701706, "learning_rate": 0.00045791041647953533, "loss": 0.5441, "step": 260900 }, { "epoch": 35.16572352465643, "grad_norm": 0.15816928446292877, "learning_rate": 0.0004578729902092877, "loss": 0.5445, "step": 261000 }, { "epoch": 35.179196981945566, "grad_norm": 0.16806752979755402, "learning_rate": 0.0004578355639390401, "loss": 0.5439, "step": 261100 }, { "epoch": 35.19267043923471, "grad_norm": 0.16244299709796906, "learning_rate": 0.0004577981376687925, "loss": 0.5447, "step": 261200 }, { "epoch": 35.20614389652385, "grad_norm": 0.16773243248462677, "learning_rate": 0.00045776071139854487, "loss": 0.5451, "step": 261300 }, { "epoch": 35.219617353812986, "grad_norm": 0.169888436794281, "learning_rate": 0.00045772328512829727, "loss": 0.5447, "step": 261400 }, { "epoch": 35.23309081110213, "grad_norm": 0.1681048423051834, "learning_rate": 0.00045768585885804967, "loss": 0.5438, "step": 261500 }, { "epoch": 35.24656426839127, "grad_norm": 0.16409838199615479, "learning_rate": 0.000457648432587802, "loss": 0.5445, "step": 261600 }, { "epoch": 35.26003772568041, "grad_norm": 0.1750326007604599, "learning_rate": 0.0004576110063175544, "loss": 0.5455, "step": 261700 }, { "epoch": 35.27351118296955, "grad_norm": 0.15907502174377441, "learning_rate": 0.0004575735800473068, "loss": 0.5447, "step": 261800 }, { "epoch": 35.28698464025869, "grad_norm": 0.15560781955718994, "learning_rate": 0.0004575361537770592, "loss": 0.545, "step": 261900 }, { "epoch": 35.30045809754783, "grad_norm": 0.19215363264083862, "learning_rate": 0.0004574987275068116, "loss": 0.5453, "step": 262000 }, { "epoch": 35.31393155483697, "grad_norm": 0.1850959062576294, "learning_rate": 0.000457461301236564, "loss": 0.5444, "step": 262100 }, { "epoch": 35.327405012126114, "grad_norm": 0.16394783556461334, "learning_rate": 0.0004574238749663164, "loss": 0.5443, "step": 262200 }, { "epoch": 35.34087846941525, "grad_norm": 0.1594417244195938, "learning_rate": 0.00045738644869606875, "loss": 0.5455, "step": 262300 }, { "epoch": 35.35435192670439, "grad_norm": 0.16522282361984253, "learning_rate": 0.00045734902242582114, "loss": 0.5437, "step": 262400 }, { "epoch": 35.367825383993534, "grad_norm": 0.18307393789291382, "learning_rate": 0.00045731159615557354, "loss": 0.5444, "step": 262500 }, { "epoch": 35.38129884128267, "grad_norm": 0.1568196713924408, "learning_rate": 0.00045727416988532594, "loss": 0.5447, "step": 262600 }, { "epoch": 35.394772298571816, "grad_norm": 0.18576055765151978, "learning_rate": 0.00045723674361507834, "loss": 0.544, "step": 262700 }, { "epoch": 35.40824575586095, "grad_norm": 0.15945306420326233, "learning_rate": 0.0004571993173448307, "loss": 0.5457, "step": 262800 }, { "epoch": 35.42171921315009, "grad_norm": 0.1566971093416214, "learning_rate": 0.0004571618910745831, "loss": 0.5443, "step": 262900 }, { "epoch": 35.435192670439235, "grad_norm": 0.16146595776081085, "learning_rate": 0.0004571244648043354, "loss": 0.5449, "step": 263000 }, { "epoch": 35.44866612772837, "grad_norm": 0.16817542910575867, "learning_rate": 0.0004570870385340878, "loss": 0.545, "step": 263100 }, { "epoch": 35.46213958501752, "grad_norm": 0.1624116599559784, "learning_rate": 0.0004570496122638402, "loss": 0.5442, "step": 263200 }, { "epoch": 35.475613042306655, "grad_norm": 0.1577225625514984, "learning_rate": 0.0004570121859935926, "loss": 0.5446, "step": 263300 }, { "epoch": 35.4890864995958, "grad_norm": 0.18481236696243286, "learning_rate": 0.000456974759723345, "loss": 0.5444, "step": 263400 }, { "epoch": 35.50255995688494, "grad_norm": 0.16082793474197388, "learning_rate": 0.0004569373334530974, "loss": 0.545, "step": 263500 }, { "epoch": 35.516033414174075, "grad_norm": 0.1536889374256134, "learning_rate": 0.0004568999071828498, "loss": 0.5451, "step": 263600 }, { "epoch": 35.52950687146322, "grad_norm": 0.1574636995792389, "learning_rate": 0.0004568624809126022, "loss": 0.5441, "step": 263700 }, { "epoch": 35.54298032875236, "grad_norm": 0.16182789206504822, "learning_rate": 0.00045682505464235456, "loss": 0.5434, "step": 263800 }, { "epoch": 35.5564537860415, "grad_norm": 0.16802279651165009, "learning_rate": 0.00045678762837210695, "loss": 0.5447, "step": 263900 }, { "epoch": 35.56992724333064, "grad_norm": 0.1689632385969162, "learning_rate": 0.00045675020210185935, "loss": 0.5438, "step": 264000 }, { "epoch": 35.58340070061978, "grad_norm": 0.15677805244922638, "learning_rate": 0.00045671277583161175, "loss": 0.5448, "step": 264100 }, { "epoch": 35.59687415790892, "grad_norm": 0.17647810280323029, "learning_rate": 0.00045667534956136415, "loss": 0.5438, "step": 264200 }, { "epoch": 35.61034761519806, "grad_norm": 0.16878104209899902, "learning_rate": 0.00045663792329111655, "loss": 0.5433, "step": 264300 }, { "epoch": 35.6238210724872, "grad_norm": 0.15627369284629822, "learning_rate": 0.00045660049702086895, "loss": 0.544, "step": 264400 }, { "epoch": 35.63729452977634, "grad_norm": 0.17160604894161224, "learning_rate": 0.0004565630707506213, "loss": 0.5437, "step": 264500 }, { "epoch": 35.65076798706548, "grad_norm": 0.15982720255851746, "learning_rate": 0.00045652564448037363, "loss": 0.5449, "step": 264600 }, { "epoch": 35.66424144435462, "grad_norm": 0.1602887660264969, "learning_rate": 0.00045648821821012603, "loss": 0.544, "step": 264700 }, { "epoch": 35.67771490164376, "grad_norm": 0.1654953956604004, "learning_rate": 0.00045645079193987843, "loss": 0.5441, "step": 264800 }, { "epoch": 35.691188358932905, "grad_norm": 0.17067058384418488, "learning_rate": 0.00045641336566963083, "loss": 0.5451, "step": 264900 }, { "epoch": 35.70466181622204, "grad_norm": 0.15982100367546082, "learning_rate": 0.0004563759393993832, "loss": 0.5436, "step": 265000 }, { "epoch": 35.71813527351118, "grad_norm": 0.1568395495414734, "learning_rate": 0.0004563385131291356, "loss": 0.5453, "step": 265100 }, { "epoch": 35.731608730800325, "grad_norm": 0.16553686559200287, "learning_rate": 0.00045630108685888797, "loss": 0.544, "step": 265200 }, { "epoch": 35.74508218808946, "grad_norm": 0.16671441495418549, "learning_rate": 0.00045626366058864037, "loss": 0.5457, "step": 265300 }, { "epoch": 35.75855564537861, "grad_norm": 0.16710083186626434, "learning_rate": 0.00045622623431839277, "loss": 0.5449, "step": 265400 }, { "epoch": 35.772029102667744, "grad_norm": 0.16081497073173523, "learning_rate": 0.00045618880804814516, "loss": 0.5438, "step": 265500 }, { "epoch": 35.78550255995688, "grad_norm": 0.16450169682502747, "learning_rate": 0.00045615138177789756, "loss": 0.5456, "step": 265600 }, { "epoch": 35.79897601724603, "grad_norm": 0.1567000150680542, "learning_rate": 0.00045611395550764996, "loss": 0.5449, "step": 265700 }, { "epoch": 35.812449474535164, "grad_norm": 0.17784637212753296, "learning_rate": 0.00045607652923740236, "loss": 0.5449, "step": 265800 }, { "epoch": 35.82592293182431, "grad_norm": 0.1597386598587036, "learning_rate": 0.0004560391029671547, "loss": 0.5434, "step": 265900 }, { "epoch": 35.839396389113446, "grad_norm": 0.15658296644687653, "learning_rate": 0.0004560016766969071, "loss": 0.5443, "step": 266000 }, { "epoch": 35.852869846402584, "grad_norm": 0.16270630061626434, "learning_rate": 0.0004559642504266595, "loss": 0.5442, "step": 266100 }, { "epoch": 35.86634330369173, "grad_norm": 0.17306478321552277, "learning_rate": 0.0004559268241564119, "loss": 0.5443, "step": 266200 }, { "epoch": 35.879816760980866, "grad_norm": 0.16425630450248718, "learning_rate": 0.0004558893978861643, "loss": 0.5441, "step": 266300 }, { "epoch": 35.89329021827001, "grad_norm": 0.1719721257686615, "learning_rate": 0.00045585197161591664, "loss": 0.5445, "step": 266400 }, { "epoch": 35.90676367555915, "grad_norm": 0.16039013862609863, "learning_rate": 0.00045581454534566904, "loss": 0.5451, "step": 266500 }, { "epoch": 35.920237132848285, "grad_norm": 0.17219646275043488, "learning_rate": 0.00045577711907542144, "loss": 0.5442, "step": 266600 }, { "epoch": 35.93371059013743, "grad_norm": 0.1508023589849472, "learning_rate": 0.0004557396928051738, "loss": 0.5448, "step": 266700 }, { "epoch": 35.94718404742657, "grad_norm": 0.16893737018108368, "learning_rate": 0.0004557022665349262, "loss": 0.5452, "step": 266800 }, { "epoch": 35.96065750471571, "grad_norm": 0.1754666268825531, "learning_rate": 0.0004556648402646786, "loss": 0.5448, "step": 266900 }, { "epoch": 35.97413096200485, "grad_norm": 0.15439988672733307, "learning_rate": 0.000455627413994431, "loss": 0.5437, "step": 267000 }, { "epoch": 35.98760441929399, "grad_norm": 0.1550523191690445, "learning_rate": 0.00045558998772418337, "loss": 0.5448, "step": 267100 }, { "epoch": 36.0, "eval_loss": 0.5314903259277344, "eval_runtime": 4.9793, "eval_samples_per_second": 1004.153, "eval_steps_per_second": 15.866, "step": 267192 }, { "epoch": 36.00107787658313, "grad_norm": 0.17049069702625275, "learning_rate": 0.00045555256145393577, "loss": 0.5436, "step": 267200 }, { "epoch": 36.01455133387227, "grad_norm": 0.17195719480514526, "learning_rate": 0.00045551513518368817, "loss": 0.5427, "step": 267300 }, { "epoch": 36.028024791161414, "grad_norm": 0.15903617441654205, "learning_rate": 0.0004554777089134405, "loss": 0.5434, "step": 267400 }, { "epoch": 36.04149824845055, "grad_norm": 0.15774424374103546, "learning_rate": 0.0004554402826431929, "loss": 0.5425, "step": 267500 }, { "epoch": 36.054971705739696, "grad_norm": 0.17111237347126007, "learning_rate": 0.0004554028563729453, "loss": 0.5437, "step": 267600 }, { "epoch": 36.068445163028834, "grad_norm": 0.1650141477584839, "learning_rate": 0.0004553654301026977, "loss": 0.5439, "step": 267700 }, { "epoch": 36.08191862031797, "grad_norm": 0.16102071106433868, "learning_rate": 0.0004553280038324501, "loss": 0.5435, "step": 267800 }, { "epoch": 36.095392077607116, "grad_norm": 0.15537242591381073, "learning_rate": 0.0004552905775622025, "loss": 0.5432, "step": 267900 }, { "epoch": 36.10886553489625, "grad_norm": 0.17212887108325958, "learning_rate": 0.0004552531512919549, "loss": 0.5446, "step": 268000 }, { "epoch": 36.1223389921854, "grad_norm": 0.16342471539974213, "learning_rate": 0.00045521572502170725, "loss": 0.5433, "step": 268100 }, { "epoch": 36.135812449474535, "grad_norm": 0.16164572536945343, "learning_rate": 0.0004551782987514596, "loss": 0.5437, "step": 268200 }, { "epoch": 36.14928590676367, "grad_norm": 0.15742848813533783, "learning_rate": 0.000455140872481212, "loss": 0.5445, "step": 268300 }, { "epoch": 36.16275936405282, "grad_norm": 0.16964998841285706, "learning_rate": 0.0004551034462109644, "loss": 0.5436, "step": 268400 }, { "epoch": 36.176232821341955, "grad_norm": 0.15878985822200775, "learning_rate": 0.0004550660199407168, "loss": 0.5442, "step": 268500 }, { "epoch": 36.1897062786311, "grad_norm": 0.16397181153297424, "learning_rate": 0.0004550285936704692, "loss": 0.5442, "step": 268600 }, { "epoch": 36.20317973592024, "grad_norm": 0.1682373434305191, "learning_rate": 0.0004549911674002216, "loss": 0.5435, "step": 268700 }, { "epoch": 36.216653193209375, "grad_norm": 0.1535012423992157, "learning_rate": 0.0004549537411299739, "loss": 0.5441, "step": 268800 }, { "epoch": 36.23012665049852, "grad_norm": 0.16794352233409882, "learning_rate": 0.0004549163148597263, "loss": 0.543, "step": 268900 }, { "epoch": 36.24360010778766, "grad_norm": 0.16108538210391998, "learning_rate": 0.0004548788885894787, "loss": 0.5439, "step": 269000 }, { "epoch": 36.2570735650768, "grad_norm": 0.17606915533542633, "learning_rate": 0.0004548414623192311, "loss": 0.5439, "step": 269100 }, { "epoch": 36.27054702236594, "grad_norm": 0.1563364565372467, "learning_rate": 0.0004548040360489835, "loss": 0.5441, "step": 269200 }, { "epoch": 36.28402047965508, "grad_norm": 0.15732832252979279, "learning_rate": 0.0004547666097787359, "loss": 0.543, "step": 269300 }, { "epoch": 36.29749393694422, "grad_norm": 0.16008606553077698, "learning_rate": 0.0004547291835084883, "loss": 0.5427, "step": 269400 }, { "epoch": 36.31096739423336, "grad_norm": 0.16641445457935333, "learning_rate": 0.0004546917572382407, "loss": 0.5435, "step": 269500 }, { "epoch": 36.3244408515225, "grad_norm": 0.16949276626110077, "learning_rate": 0.00045465433096799306, "loss": 0.5435, "step": 269600 }, { "epoch": 36.33791430881164, "grad_norm": 0.16279937326908112, "learning_rate": 0.00045461690469774546, "loss": 0.5443, "step": 269700 }, { "epoch": 36.35138776610078, "grad_norm": 0.1629323661327362, "learning_rate": 0.00045457947842749785, "loss": 0.5439, "step": 269800 }, { "epoch": 36.36486122338992, "grad_norm": 0.17128035426139832, "learning_rate": 0.00045454205215725025, "loss": 0.544, "step": 269900 }, { "epoch": 36.37833468067906, "grad_norm": 0.17181378602981567, "learning_rate": 0.0004545046258870026, "loss": 0.5438, "step": 270000 }, { "epoch": 36.391808137968205, "grad_norm": 0.16720552742481232, "learning_rate": 0.000454467199616755, "loss": 0.5443, "step": 270100 }, { "epoch": 36.40528159525734, "grad_norm": 0.17021512985229492, "learning_rate": 0.0004544297733465074, "loss": 0.5442, "step": 270200 }, { "epoch": 36.41875505254648, "grad_norm": 0.18953515589237213, "learning_rate": 0.00045439234707625974, "loss": 0.5432, "step": 270300 }, { "epoch": 36.432228509835625, "grad_norm": 0.16311123967170715, "learning_rate": 0.00045435492080601213, "loss": 0.5427, "step": 270400 }, { "epoch": 36.44570196712476, "grad_norm": 0.15885980427265167, "learning_rate": 0.00045431749453576453, "loss": 0.5439, "step": 270500 }, { "epoch": 36.45917542441391, "grad_norm": 0.14862027764320374, "learning_rate": 0.00045428006826551693, "loss": 0.5439, "step": 270600 }, { "epoch": 36.472648881703044, "grad_norm": 0.16232840716838837, "learning_rate": 0.00045424264199526933, "loss": 0.5437, "step": 270700 }, { "epoch": 36.48612233899219, "grad_norm": 0.1598924994468689, "learning_rate": 0.00045420521572502173, "loss": 0.5439, "step": 270800 }, { "epoch": 36.499595796281326, "grad_norm": 0.1493937075138092, "learning_rate": 0.0004541677894547741, "loss": 0.5433, "step": 270900 }, { "epoch": 36.513069253570464, "grad_norm": 0.16499727964401245, "learning_rate": 0.00045413036318452647, "loss": 0.545, "step": 271000 }, { "epoch": 36.52654271085961, "grad_norm": 0.15433113276958466, "learning_rate": 0.00045409293691427887, "loss": 0.5426, "step": 271100 }, { "epoch": 36.540016168148746, "grad_norm": 0.15469856560230255, "learning_rate": 0.00045405551064403127, "loss": 0.5438, "step": 271200 }, { "epoch": 36.55348962543789, "grad_norm": 0.15927809476852417, "learning_rate": 0.00045401808437378366, "loss": 0.5433, "step": 271300 }, { "epoch": 36.56696308272703, "grad_norm": 0.16612911224365234, "learning_rate": 0.00045398065810353606, "loss": 0.545, "step": 271400 }, { "epoch": 36.580436540016166, "grad_norm": 0.16425463557243347, "learning_rate": 0.00045394323183328846, "loss": 0.5441, "step": 271500 }, { "epoch": 36.59390999730531, "grad_norm": 0.15892231464385986, "learning_rate": 0.00045390580556304086, "loss": 0.5449, "step": 271600 }, { "epoch": 36.60738345459445, "grad_norm": 0.1520305871963501, "learning_rate": 0.00045386837929279326, "loss": 0.5443, "step": 271700 }, { "epoch": 36.62085691188359, "grad_norm": 0.1628778874874115, "learning_rate": 0.00045383095302254555, "loss": 0.5427, "step": 271800 }, { "epoch": 36.63433036917273, "grad_norm": 0.1642678827047348, "learning_rate": 0.00045379352675229795, "loss": 0.5445, "step": 271900 }, { "epoch": 36.64780382646187, "grad_norm": 0.16836661100387573, "learning_rate": 0.00045375610048205034, "loss": 0.5425, "step": 272000 }, { "epoch": 36.66127728375101, "grad_norm": 0.16396155953407288, "learning_rate": 0.00045371867421180274, "loss": 0.5438, "step": 272100 }, { "epoch": 36.67475074104015, "grad_norm": 0.17900796234607697, "learning_rate": 0.00045368124794155514, "loss": 0.5432, "step": 272200 }, { "epoch": 36.688224198329294, "grad_norm": 0.15117749571800232, "learning_rate": 0.00045364382167130754, "loss": 0.5448, "step": 272300 }, { "epoch": 36.70169765561843, "grad_norm": 0.16857248544692993, "learning_rate": 0.00045360639540105994, "loss": 0.5436, "step": 272400 }, { "epoch": 36.71517111290757, "grad_norm": 0.1666034460067749, "learning_rate": 0.0004535689691308123, "loss": 0.5433, "step": 272500 }, { "epoch": 36.728644570196714, "grad_norm": 0.17848099768161774, "learning_rate": 0.0004535315428605647, "loss": 0.5442, "step": 272600 }, { "epoch": 36.74211802748585, "grad_norm": 0.16065993905067444, "learning_rate": 0.0004534941165903171, "loss": 0.5435, "step": 272700 }, { "epoch": 36.755591484774996, "grad_norm": 0.15877331793308258, "learning_rate": 0.0004534566903200695, "loss": 0.5439, "step": 272800 }, { "epoch": 36.769064942064134, "grad_norm": 0.15406017005443573, "learning_rate": 0.0004534192640498219, "loss": 0.5441, "step": 272900 }, { "epoch": 36.78253839935327, "grad_norm": 0.15868352353572845, "learning_rate": 0.00045338183777957427, "loss": 0.5441, "step": 273000 }, { "epoch": 36.796011856642416, "grad_norm": 0.1670391857624054, "learning_rate": 0.00045334441150932667, "loss": 0.5439, "step": 273100 }, { "epoch": 36.80948531393155, "grad_norm": 0.16102512180805206, "learning_rate": 0.000453306985239079, "loss": 0.5436, "step": 273200 }, { "epoch": 36.8229587712207, "grad_norm": 0.16918231546878815, "learning_rate": 0.0004532695589688314, "loss": 0.5433, "step": 273300 }, { "epoch": 36.836432228509835, "grad_norm": 0.16319067776203156, "learning_rate": 0.0004532321326985838, "loss": 0.544, "step": 273400 }, { "epoch": 36.84990568579897, "grad_norm": 0.16784313321113586, "learning_rate": 0.0004531947064283362, "loss": 0.5427, "step": 273500 }, { "epoch": 36.86337914308812, "grad_norm": 0.1615971177816391, "learning_rate": 0.00045315728015808855, "loss": 0.5441, "step": 273600 }, { "epoch": 36.876852600377255, "grad_norm": 0.17246809601783752, "learning_rate": 0.00045311985388784095, "loss": 0.5437, "step": 273700 }, { "epoch": 36.8903260576664, "grad_norm": 0.1554945409297943, "learning_rate": 0.00045308242761759335, "loss": 0.5437, "step": 273800 }, { "epoch": 36.90379951495554, "grad_norm": 0.15430980920791626, "learning_rate": 0.0004530450013473457, "loss": 0.544, "step": 273900 }, { "epoch": 36.917272972244675, "grad_norm": 0.1756894886493683, "learning_rate": 0.0004530075750770981, "loss": 0.5447, "step": 274000 }, { "epoch": 36.93074642953382, "grad_norm": 0.1481265127658844, "learning_rate": 0.0004529701488068505, "loss": 0.5434, "step": 274100 }, { "epoch": 36.94421988682296, "grad_norm": 0.1601264476776123, "learning_rate": 0.0004529327225366029, "loss": 0.5441, "step": 274200 }, { "epoch": 36.9576933441121, "grad_norm": 0.15739302337169647, "learning_rate": 0.0004528952962663553, "loss": 0.5436, "step": 274300 }, { "epoch": 36.97116680140124, "grad_norm": 0.14660170674324036, "learning_rate": 0.0004528578699961077, "loss": 0.5432, "step": 274400 }, { "epoch": 36.984640258690376, "grad_norm": 0.16184915602207184, "learning_rate": 0.0004528204437258601, "loss": 0.5438, "step": 274500 }, { "epoch": 36.99811371597952, "grad_norm": 0.1689547896385193, "learning_rate": 0.0004527830174556125, "loss": 0.5429, "step": 274600 }, { "epoch": 37.0, "eval_loss": 0.531499981880188, "eval_runtime": 4.9679, "eval_samples_per_second": 1006.455, "eval_steps_per_second": 15.902, "step": 274614 }, { "epoch": 37.01158717326866, "grad_norm": 0.17609257996082306, "learning_rate": 0.0004527455911853648, "loss": 0.5422, "step": 274700 }, { "epoch": 37.0250606305578, "grad_norm": 0.15697357058525085, "learning_rate": 0.0004527081649151172, "loss": 0.5431, "step": 274800 }, { "epoch": 37.03853408784694, "grad_norm": 0.15780583024024963, "learning_rate": 0.0004526707386448696, "loss": 0.542, "step": 274900 }, { "epoch": 37.052007545136085, "grad_norm": 0.17487500607967377, "learning_rate": 0.000452633312374622, "loss": 0.5431, "step": 275000 }, { "epoch": 37.06548100242522, "grad_norm": 0.18406163156032562, "learning_rate": 0.0004525958861043744, "loss": 0.5428, "step": 275100 }, { "epoch": 37.07895445971436, "grad_norm": 0.16495969891548157, "learning_rate": 0.0004525584598341268, "loss": 0.5426, "step": 275200 }, { "epoch": 37.092427917003505, "grad_norm": 0.15935444831848145, "learning_rate": 0.0004525210335638792, "loss": 0.5425, "step": 275300 }, { "epoch": 37.10590137429264, "grad_norm": 0.15641099214553833, "learning_rate": 0.00045248360729363156, "loss": 0.544, "step": 275400 }, { "epoch": 37.11937483158179, "grad_norm": 0.16168805956840515, "learning_rate": 0.0004524461810233839, "loss": 0.543, "step": 275500 }, { "epoch": 37.132848288870925, "grad_norm": 0.16907022893428802, "learning_rate": 0.0004524087547531363, "loss": 0.5422, "step": 275600 }, { "epoch": 37.14632174616006, "grad_norm": 0.18252867460250854, "learning_rate": 0.0004523713284828887, "loss": 0.5421, "step": 275700 }, { "epoch": 37.15979520344921, "grad_norm": 0.155948668718338, "learning_rate": 0.0004523339022126411, "loss": 0.5433, "step": 275800 }, { "epoch": 37.173268660738344, "grad_norm": 0.164517343044281, "learning_rate": 0.0004522964759423935, "loss": 0.5432, "step": 275900 }, { "epoch": 37.18674211802749, "grad_norm": 0.15518097579479218, "learning_rate": 0.0004522590496721459, "loss": 0.5428, "step": 276000 }, { "epoch": 37.200215575316626, "grad_norm": 0.15603123605251312, "learning_rate": 0.00045222162340189824, "loss": 0.5422, "step": 276100 }, { "epoch": 37.213689032605764, "grad_norm": 0.16837798058986664, "learning_rate": 0.00045218419713165064, "loss": 0.5437, "step": 276200 }, { "epoch": 37.22716248989491, "grad_norm": 0.16123871505260468, "learning_rate": 0.00045214677086140303, "loss": 0.5436, "step": 276300 }, { "epoch": 37.240635947184046, "grad_norm": 0.16160422563552856, "learning_rate": 0.00045210934459115543, "loss": 0.5432, "step": 276400 }, { "epoch": 37.25410940447319, "grad_norm": 0.17926979064941406, "learning_rate": 0.00045207191832090783, "loss": 0.5428, "step": 276500 }, { "epoch": 37.26758286176233, "grad_norm": 0.15777526795864105, "learning_rate": 0.00045203449205066023, "loss": 0.5435, "step": 276600 }, { "epoch": 37.281056319051466, "grad_norm": 0.16009552776813507, "learning_rate": 0.00045199706578041263, "loss": 0.5435, "step": 276700 }, { "epoch": 37.29452977634061, "grad_norm": 0.15925903618335724, "learning_rate": 0.000451959639510165, "loss": 0.5435, "step": 276800 }, { "epoch": 37.30800323362975, "grad_norm": 0.16948650777339935, "learning_rate": 0.00045192221323991737, "loss": 0.5431, "step": 276900 }, { "epoch": 37.32147669091889, "grad_norm": 0.16710762679576874, "learning_rate": 0.00045188478696966977, "loss": 0.5437, "step": 277000 }, { "epoch": 37.33495014820803, "grad_norm": 0.1661403924226761, "learning_rate": 0.00045184736069942217, "loss": 0.5429, "step": 277100 }, { "epoch": 37.34842360549717, "grad_norm": 0.18031497299671173, "learning_rate": 0.00045180993442917456, "loss": 0.5437, "step": 277200 }, { "epoch": 37.36189706278631, "grad_norm": 0.16348904371261597, "learning_rate": 0.0004517725081589269, "loss": 0.5427, "step": 277300 }, { "epoch": 37.37537052007545, "grad_norm": 0.16712206602096558, "learning_rate": 0.0004517350818886793, "loss": 0.5428, "step": 277400 }, { "epoch": 37.388843977364594, "grad_norm": 0.1725291609764099, "learning_rate": 0.0004516976556184317, "loss": 0.5433, "step": 277500 }, { "epoch": 37.40231743465373, "grad_norm": 0.16580651700496674, "learning_rate": 0.00045166022934818405, "loss": 0.5423, "step": 277600 }, { "epoch": 37.41579089194287, "grad_norm": 0.16482336819171906, "learning_rate": 0.00045162280307793645, "loss": 0.5426, "step": 277700 }, { "epoch": 37.429264349232014, "grad_norm": 0.17494821548461914, "learning_rate": 0.00045158537680768885, "loss": 0.5429, "step": 277800 }, { "epoch": 37.44273780652115, "grad_norm": 0.1668137162923813, "learning_rate": 0.00045154795053744124, "loss": 0.5424, "step": 277900 }, { "epoch": 37.456211263810296, "grad_norm": 0.17254406213760376, "learning_rate": 0.00045151052426719364, "loss": 0.5425, "step": 278000 }, { "epoch": 37.46968472109943, "grad_norm": 0.1704644113779068, "learning_rate": 0.00045147309799694604, "loss": 0.5429, "step": 278100 }, { "epoch": 37.48315817838857, "grad_norm": 0.18198078870773315, "learning_rate": 0.00045143567172669844, "loss": 0.5441, "step": 278200 }, { "epoch": 37.496631635677716, "grad_norm": 0.16812030971050262, "learning_rate": 0.0004513982454564508, "loss": 0.5439, "step": 278300 }, { "epoch": 37.51010509296685, "grad_norm": 0.1561025232076645, "learning_rate": 0.0004513608191862032, "loss": 0.5434, "step": 278400 }, { "epoch": 37.523578550256, "grad_norm": 0.16183161735534668, "learning_rate": 0.0004513233929159556, "loss": 0.5432, "step": 278500 }, { "epoch": 37.537052007545135, "grad_norm": 0.15706105530261993, "learning_rate": 0.000451285966645708, "loss": 0.5416, "step": 278600 }, { "epoch": 37.55052546483428, "grad_norm": 0.15808263421058655, "learning_rate": 0.0004512485403754604, "loss": 0.543, "step": 278700 }, { "epoch": 37.56399892212342, "grad_norm": 0.16649234294891357, "learning_rate": 0.0004512111141052128, "loss": 0.5438, "step": 278800 }, { "epoch": 37.577472379412555, "grad_norm": 0.1591363251209259, "learning_rate": 0.00045117368783496517, "loss": 0.5432, "step": 278900 }, { "epoch": 37.5909458367017, "grad_norm": 0.17523792386054993, "learning_rate": 0.00045113626156471757, "loss": 0.5428, "step": 279000 }, { "epoch": 37.60441929399084, "grad_norm": 0.17907467484474182, "learning_rate": 0.00045109883529446986, "loss": 0.5424, "step": 279100 }, { "epoch": 37.61789275127998, "grad_norm": 0.15947535634040833, "learning_rate": 0.00045106140902422226, "loss": 0.5428, "step": 279200 }, { "epoch": 37.63136620856912, "grad_norm": 0.1895524561405182, "learning_rate": 0.00045102398275397466, "loss": 0.542, "step": 279300 }, { "epoch": 37.64483966585826, "grad_norm": 0.18081235885620117, "learning_rate": 0.00045098655648372705, "loss": 0.5427, "step": 279400 }, { "epoch": 37.6583131231474, "grad_norm": 0.16223911941051483, "learning_rate": 0.00045094913021347945, "loss": 0.543, "step": 279500 }, { "epoch": 37.67178658043654, "grad_norm": 0.16676202416419983, "learning_rate": 0.00045091170394323185, "loss": 0.5437, "step": 279600 }, { "epoch": 37.68526003772568, "grad_norm": 0.16352351009845734, "learning_rate": 0.00045087427767298425, "loss": 0.5428, "step": 279700 }, { "epoch": 37.69873349501482, "grad_norm": 0.16208629310131073, "learning_rate": 0.0004508368514027366, "loss": 0.5418, "step": 279800 }, { "epoch": 37.71220695230396, "grad_norm": 0.16636663675308228, "learning_rate": 0.000450799425132489, "loss": 0.5415, "step": 279900 }, { "epoch": 37.7256804095931, "grad_norm": 0.17347775399684906, "learning_rate": 0.0004507619988622414, "loss": 0.5435, "step": 280000 }, { "epoch": 37.73915386688224, "grad_norm": 0.16317208111286163, "learning_rate": 0.0004507245725919938, "loss": 0.5433, "step": 280100 }, { "epoch": 37.752627324171385, "grad_norm": 0.17293807864189148, "learning_rate": 0.0004506871463217462, "loss": 0.5417, "step": 280200 }, { "epoch": 37.76610078146052, "grad_norm": 0.1558123379945755, "learning_rate": 0.0004506497200514986, "loss": 0.5429, "step": 280300 }, { "epoch": 37.77957423874966, "grad_norm": 0.1655375063419342, "learning_rate": 0.000450612293781251, "loss": 0.5427, "step": 280400 }, { "epoch": 37.793047696038805, "grad_norm": 0.15640513598918915, "learning_rate": 0.0004505748675110033, "loss": 0.5425, "step": 280500 }, { "epoch": 37.80652115332794, "grad_norm": 0.15335996448993683, "learning_rate": 0.0004505374412407557, "loss": 0.5422, "step": 280600 }, { "epoch": 37.81999461061709, "grad_norm": 0.15112721920013428, "learning_rate": 0.0004505000149705081, "loss": 0.5429, "step": 280700 }, { "epoch": 37.833468067906225, "grad_norm": 0.16539216041564941, "learning_rate": 0.0004504625887002605, "loss": 0.5438, "step": 280800 }, { "epoch": 37.84694152519536, "grad_norm": 0.17252077162265778, "learning_rate": 0.00045042516243001287, "loss": 0.542, "step": 280900 }, { "epoch": 37.86041498248451, "grad_norm": 0.15131761133670807, "learning_rate": 0.00045038773615976526, "loss": 0.5431, "step": 281000 }, { "epoch": 37.873888439773644, "grad_norm": 0.16333800554275513, "learning_rate": 0.00045035030988951766, "loss": 0.5419, "step": 281100 }, { "epoch": 37.88736189706279, "grad_norm": 0.17525725066661835, "learning_rate": 0.00045031288361927, "loss": 0.5425, "step": 281200 }, { "epoch": 37.900835354351926, "grad_norm": 0.1738375574350357, "learning_rate": 0.0004502754573490224, "loss": 0.5434, "step": 281300 }, { "epoch": 37.914308811641064, "grad_norm": 0.16145378351211548, "learning_rate": 0.0004502380310787748, "loss": 0.5434, "step": 281400 }, { "epoch": 37.92778226893021, "grad_norm": 0.157108336687088, "learning_rate": 0.0004502006048085272, "loss": 0.5424, "step": 281500 }, { "epoch": 37.941255726219346, "grad_norm": 0.16549262404441833, "learning_rate": 0.0004501631785382796, "loss": 0.5432, "step": 281600 }, { "epoch": 37.95472918350849, "grad_norm": 0.15085510909557343, "learning_rate": 0.000450125752268032, "loss": 0.5434, "step": 281700 }, { "epoch": 37.96820264079763, "grad_norm": 0.1723518669605255, "learning_rate": 0.0004500883259977844, "loss": 0.5438, "step": 281800 }, { "epoch": 37.981676098086766, "grad_norm": 0.15057320892810822, "learning_rate": 0.0004500508997275368, "loss": 0.543, "step": 281900 }, { "epoch": 37.99514955537591, "grad_norm": 0.17607377469539642, "learning_rate": 0.00045001347345728914, "loss": 0.5425, "step": 282000 }, { "epoch": 38.0, "eval_loss": 0.5298476815223694, "eval_runtime": 4.9668, "eval_samples_per_second": 1006.676, "eval_steps_per_second": 15.905, "step": 282036 }, { "epoch": 38.00862301266505, "grad_norm": 0.15194258093833923, "learning_rate": 0.00044997604718704154, "loss": 0.5427, "step": 282100 }, { "epoch": 38.02209646995419, "grad_norm": 0.17406414449214935, "learning_rate": 0.00044993862091679393, "loss": 0.5421, "step": 282200 }, { "epoch": 38.03556992724333, "grad_norm": 0.15617522597312927, "learning_rate": 0.00044990119464654633, "loss": 0.5407, "step": 282300 }, { "epoch": 38.049043384532474, "grad_norm": 0.15990018844604492, "learning_rate": 0.00044986376837629873, "loss": 0.5416, "step": 282400 }, { "epoch": 38.06251684182161, "grad_norm": 0.15895090997219086, "learning_rate": 0.00044982634210605113, "loss": 0.5418, "step": 282500 }, { "epoch": 38.07599029911075, "grad_norm": 0.15680643916130066, "learning_rate": 0.0004497889158358035, "loss": 0.542, "step": 282600 }, { "epoch": 38.089463756399894, "grad_norm": 0.15957297384738922, "learning_rate": 0.0004497514895655558, "loss": 0.5422, "step": 282700 }, { "epoch": 38.10293721368903, "grad_norm": 0.1676464080810547, "learning_rate": 0.0004497140632953082, "loss": 0.5419, "step": 282800 }, { "epoch": 38.116410670978176, "grad_norm": 0.16603244841098785, "learning_rate": 0.0004496766370250606, "loss": 0.5429, "step": 282900 }, { "epoch": 38.129884128267314, "grad_norm": 0.16463272273540497, "learning_rate": 0.000449639210754813, "loss": 0.5416, "step": 283000 }, { "epoch": 38.14335758555645, "grad_norm": 0.15401971340179443, "learning_rate": 0.0004496017844845654, "loss": 0.5428, "step": 283100 }, { "epoch": 38.156831042845596, "grad_norm": 0.1593908816576004, "learning_rate": 0.0004495643582143178, "loss": 0.5424, "step": 283200 }, { "epoch": 38.17030450013473, "grad_norm": 0.17323292791843414, "learning_rate": 0.0004495269319440702, "loss": 0.5419, "step": 283300 }, { "epoch": 38.18377795742388, "grad_norm": 0.15941284596920013, "learning_rate": 0.00044948950567382255, "loss": 0.5417, "step": 283400 }, { "epoch": 38.197251414713016, "grad_norm": 0.15719394385814667, "learning_rate": 0.00044945207940357495, "loss": 0.5428, "step": 283500 }, { "epoch": 38.21072487200215, "grad_norm": 0.15818172693252563, "learning_rate": 0.00044941465313332735, "loss": 0.5427, "step": 283600 }, { "epoch": 38.2241983292913, "grad_norm": 0.1664736419916153, "learning_rate": 0.00044937722686307974, "loss": 0.5422, "step": 283700 }, { "epoch": 38.237671786580435, "grad_norm": 0.15800341963768005, "learning_rate": 0.00044933980059283214, "loss": 0.5424, "step": 283800 }, { "epoch": 38.25114524386958, "grad_norm": 0.1586909294128418, "learning_rate": 0.00044930237432258454, "loss": 0.5426, "step": 283900 }, { "epoch": 38.26461870115872, "grad_norm": 0.15848733484745026, "learning_rate": 0.00044926494805233694, "loss": 0.5423, "step": 284000 }, { "epoch": 38.278092158447855, "grad_norm": 0.1705896109342575, "learning_rate": 0.0004492275217820893, "loss": 0.5415, "step": 284100 }, { "epoch": 38.291565615737, "grad_norm": 0.15275321900844574, "learning_rate": 0.0004491900955118417, "loss": 0.5426, "step": 284200 }, { "epoch": 38.30503907302614, "grad_norm": 0.16694992780685425, "learning_rate": 0.0004491526692415941, "loss": 0.5413, "step": 284300 }, { "epoch": 38.31851253031528, "grad_norm": 0.1475980281829834, "learning_rate": 0.0004491152429713465, "loss": 0.5427, "step": 284400 }, { "epoch": 38.33198598760442, "grad_norm": 0.161117285490036, "learning_rate": 0.0004490778167010988, "loss": 0.543, "step": 284500 }, { "epoch": 38.34545944489356, "grad_norm": 0.14940394461154938, "learning_rate": 0.0004490403904308512, "loss": 0.5419, "step": 284600 }, { "epoch": 38.3589329021827, "grad_norm": 0.16546902060508728, "learning_rate": 0.0004490029641606036, "loss": 0.5418, "step": 284700 }, { "epoch": 38.37240635947184, "grad_norm": 0.17082440853118896, "learning_rate": 0.000448965537890356, "loss": 0.542, "step": 284800 }, { "epoch": 38.38587981676098, "grad_norm": 0.16399548947811127, "learning_rate": 0.00044892811162010836, "loss": 0.5426, "step": 284900 }, { "epoch": 38.39935327405012, "grad_norm": 0.14962071180343628, "learning_rate": 0.00044889068534986076, "loss": 0.543, "step": 285000 }, { "epoch": 38.41282673133926, "grad_norm": 0.1578522026538849, "learning_rate": 0.00044885325907961316, "loss": 0.5426, "step": 285100 }, { "epoch": 38.4263001886284, "grad_norm": 0.1841772496700287, "learning_rate": 0.00044881583280936556, "loss": 0.5427, "step": 285200 }, { "epoch": 38.43977364591754, "grad_norm": 0.15954339504241943, "learning_rate": 0.00044877840653911795, "loss": 0.542, "step": 285300 }, { "epoch": 38.453247103206685, "grad_norm": 0.1580885648727417, "learning_rate": 0.00044874098026887035, "loss": 0.5427, "step": 285400 }, { "epoch": 38.46672056049582, "grad_norm": 0.16671571135520935, "learning_rate": 0.00044870355399862275, "loss": 0.5426, "step": 285500 }, { "epoch": 38.48019401778496, "grad_norm": 0.15982593595981598, "learning_rate": 0.0004486661277283751, "loss": 0.5431, "step": 285600 }, { "epoch": 38.493667475074105, "grad_norm": 0.15899144113063812, "learning_rate": 0.0004486287014581275, "loss": 0.5418, "step": 285700 }, { "epoch": 38.50714093236324, "grad_norm": 0.1708635687828064, "learning_rate": 0.0004485912751878799, "loss": 0.5425, "step": 285800 }, { "epoch": 38.52061438965239, "grad_norm": 0.16765554249286652, "learning_rate": 0.0004485538489176323, "loss": 0.5425, "step": 285900 }, { "epoch": 38.534087846941524, "grad_norm": 0.17819105088710785, "learning_rate": 0.0004485164226473847, "loss": 0.5426, "step": 286000 }, { "epoch": 38.54756130423067, "grad_norm": 0.1599542498588562, "learning_rate": 0.0004484789963771371, "loss": 0.5416, "step": 286100 }, { "epoch": 38.56103476151981, "grad_norm": 0.1646592617034912, "learning_rate": 0.0004484415701068895, "loss": 0.5424, "step": 286200 }, { "epoch": 38.574508218808944, "grad_norm": 0.16109956800937653, "learning_rate": 0.0004484041438366418, "loss": 0.5425, "step": 286300 }, { "epoch": 38.58798167609809, "grad_norm": 0.1644330769777298, "learning_rate": 0.00044836671756639417, "loss": 0.5421, "step": 286400 }, { "epoch": 38.601455133387226, "grad_norm": 0.15936408936977386, "learning_rate": 0.00044832929129614657, "loss": 0.5418, "step": 286500 }, { "epoch": 38.61492859067637, "grad_norm": 0.1630745381116867, "learning_rate": 0.00044829186502589897, "loss": 0.5423, "step": 286600 }, { "epoch": 38.62840204796551, "grad_norm": 0.16643035411834717, "learning_rate": 0.00044825443875565137, "loss": 0.5426, "step": 286700 }, { "epoch": 38.641875505254646, "grad_norm": 0.18333446979522705, "learning_rate": 0.00044821701248540376, "loss": 0.5421, "step": 286800 }, { "epoch": 38.65534896254379, "grad_norm": 0.1610826998949051, "learning_rate": 0.00044817958621515616, "loss": 0.5419, "step": 286900 }, { "epoch": 38.66882241983293, "grad_norm": 0.16503150761127472, "learning_rate": 0.00044814215994490856, "loss": 0.5431, "step": 287000 }, { "epoch": 38.68229587712207, "grad_norm": 0.1593654751777649, "learning_rate": 0.0004481047336746609, "loss": 0.5428, "step": 287100 }, { "epoch": 38.69576933441121, "grad_norm": 0.16071946918964386, "learning_rate": 0.0004480673074044133, "loss": 0.5407, "step": 287200 }, { "epoch": 38.70924279170035, "grad_norm": 0.1463434100151062, "learning_rate": 0.0004480298811341657, "loss": 0.5427, "step": 287300 }, { "epoch": 38.72271624898949, "grad_norm": 0.1624186486005783, "learning_rate": 0.0004479924548639181, "loss": 0.5427, "step": 287400 }, { "epoch": 38.73618970627863, "grad_norm": 0.15581828355789185, "learning_rate": 0.0004479550285936705, "loss": 0.5422, "step": 287500 }, { "epoch": 38.749663163567774, "grad_norm": 0.15468882024288177, "learning_rate": 0.0004479176023234229, "loss": 0.5429, "step": 287600 }, { "epoch": 38.76313662085691, "grad_norm": 0.17080482840538025, "learning_rate": 0.0004478801760531753, "loss": 0.5421, "step": 287700 }, { "epoch": 38.77661007814605, "grad_norm": 0.16844020783901215, "learning_rate": 0.00044784274978292764, "loss": 0.5422, "step": 287800 }, { "epoch": 38.790083535435194, "grad_norm": 0.1610775738954544, "learning_rate": 0.00044780532351268004, "loss": 0.5424, "step": 287900 }, { "epoch": 38.80355699272433, "grad_norm": 0.17992375791072845, "learning_rate": 0.00044776789724243244, "loss": 0.5407, "step": 288000 }, { "epoch": 38.817030450013476, "grad_norm": 0.1703549176454544, "learning_rate": 0.0004477304709721848, "loss": 0.5418, "step": 288100 }, { "epoch": 38.830503907302614, "grad_norm": 0.16481734812259674, "learning_rate": 0.0004476930447019372, "loss": 0.5418, "step": 288200 }, { "epoch": 38.84397736459175, "grad_norm": 0.15937292575836182, "learning_rate": 0.0004476556184316896, "loss": 0.5413, "step": 288300 }, { "epoch": 38.857450821880896, "grad_norm": 0.1551876664161682, "learning_rate": 0.000447618192161442, "loss": 0.5409, "step": 288400 }, { "epoch": 38.87092427917003, "grad_norm": 0.1641976237297058, "learning_rate": 0.0004475807658911943, "loss": 0.542, "step": 288500 }, { "epoch": 38.88439773645918, "grad_norm": 0.186809703707695, "learning_rate": 0.0004475433396209467, "loss": 0.5415, "step": 288600 }, { "epoch": 38.897871193748315, "grad_norm": 0.1643601655960083, "learning_rate": 0.0004475059133506991, "loss": 0.5427, "step": 288700 }, { "epoch": 38.91134465103745, "grad_norm": 0.1724822074174881, "learning_rate": 0.0004474684870804515, "loss": 0.5422, "step": 288800 }, { "epoch": 38.9248181083266, "grad_norm": 0.17374828457832336, "learning_rate": 0.0004474310608102039, "loss": 0.5423, "step": 288900 }, { "epoch": 38.938291565615735, "grad_norm": 0.15004830062389374, "learning_rate": 0.0004473936345399563, "loss": 0.5423, "step": 289000 }, { "epoch": 38.95176502290488, "grad_norm": 0.1720760315656662, "learning_rate": 0.0004473562082697087, "loss": 0.5426, "step": 289100 }, { "epoch": 38.96523848019402, "grad_norm": 0.16455575823783875, "learning_rate": 0.00044731878199946105, "loss": 0.5411, "step": 289200 }, { "epoch": 38.978711937483155, "grad_norm": 0.16846053302288055, "learning_rate": 0.00044728135572921345, "loss": 0.5422, "step": 289300 }, { "epoch": 38.9921853947723, "grad_norm": 0.18402361869812012, "learning_rate": 0.00044724392945896585, "loss": 0.5429, "step": 289400 }, { "epoch": 39.0, "eval_loss": 0.5293181538581848, "eval_runtime": 5.0065, "eval_samples_per_second": 998.701, "eval_steps_per_second": 15.779, "step": 289458 }, { "epoch": 39.00565885206144, "grad_norm": 0.17421624064445496, "learning_rate": 0.00044720650318871825, "loss": 0.5409, "step": 289500 }, { "epoch": 39.01913230935058, "grad_norm": 0.16574575006961823, "learning_rate": 0.00044716907691847064, "loss": 0.5409, "step": 289600 }, { "epoch": 39.03260576663972, "grad_norm": 0.1652921736240387, "learning_rate": 0.00044713165064822304, "loss": 0.5409, "step": 289700 }, { "epoch": 39.046079223928864, "grad_norm": 0.15647625923156738, "learning_rate": 0.00044709422437797544, "loss": 0.5407, "step": 289800 }, { "epoch": 39.059552681218, "grad_norm": 0.15543963015079498, "learning_rate": 0.0004470567981077278, "loss": 0.5412, "step": 289900 }, { "epoch": 39.07302613850714, "grad_norm": 0.16520971059799194, "learning_rate": 0.00044701937183748013, "loss": 0.5413, "step": 290000 }, { "epoch": 39.08649959579628, "grad_norm": 0.16320188343524933, "learning_rate": 0.0004469819455672325, "loss": 0.5411, "step": 290100 }, { "epoch": 39.09997305308542, "grad_norm": 0.1606891006231308, "learning_rate": 0.0004469445192969849, "loss": 0.5413, "step": 290200 }, { "epoch": 39.113446510374565, "grad_norm": 0.15945537388324738, "learning_rate": 0.0004469070930267373, "loss": 0.5418, "step": 290300 }, { "epoch": 39.1269199676637, "grad_norm": 0.16005119681358337, "learning_rate": 0.0004468696667564897, "loss": 0.5404, "step": 290400 }, { "epoch": 39.14039342495284, "grad_norm": 0.15635517239570618, "learning_rate": 0.0004468322404862421, "loss": 0.5418, "step": 290500 }, { "epoch": 39.153866882241985, "grad_norm": 0.20741207897663116, "learning_rate": 0.0004467948142159945, "loss": 0.5412, "step": 290600 }, { "epoch": 39.16734033953112, "grad_norm": 0.16091643273830414, "learning_rate": 0.00044675738794574686, "loss": 0.5402, "step": 290700 }, { "epoch": 39.18081379682027, "grad_norm": 0.17330510914325714, "learning_rate": 0.00044671996167549926, "loss": 0.5406, "step": 290800 }, { "epoch": 39.194287254109405, "grad_norm": 0.17084236443042755, "learning_rate": 0.00044668253540525166, "loss": 0.5422, "step": 290900 }, { "epoch": 39.20776071139854, "grad_norm": 0.15219415724277496, "learning_rate": 0.00044664510913500406, "loss": 0.5408, "step": 291000 }, { "epoch": 39.22123416868769, "grad_norm": 0.15481777489185333, "learning_rate": 0.00044660768286475646, "loss": 0.5413, "step": 291100 }, { "epoch": 39.234707625976824, "grad_norm": 0.15235944092273712, "learning_rate": 0.00044657025659450885, "loss": 0.5409, "step": 291200 }, { "epoch": 39.24818108326597, "grad_norm": 0.16384808719158173, "learning_rate": 0.00044653283032426125, "loss": 0.5417, "step": 291300 }, { "epoch": 39.26165454055511, "grad_norm": 0.16030670702457428, "learning_rate": 0.0004464954040540136, "loss": 0.5416, "step": 291400 }, { "epoch": 39.275127997844244, "grad_norm": 0.18253934383392334, "learning_rate": 0.000446457977783766, "loss": 0.542, "step": 291500 }, { "epoch": 39.28860145513339, "grad_norm": 0.15785913169384003, "learning_rate": 0.0004464205515135184, "loss": 0.5422, "step": 291600 }, { "epoch": 39.302074912422526, "grad_norm": 0.16375069320201874, "learning_rate": 0.0004463831252432708, "loss": 0.5414, "step": 291700 }, { "epoch": 39.31554836971167, "grad_norm": 0.17003779113292694, "learning_rate": 0.00044634569897302313, "loss": 0.5418, "step": 291800 }, { "epoch": 39.32902182700081, "grad_norm": 0.16497643291950226, "learning_rate": 0.00044630827270277553, "loss": 0.542, "step": 291900 }, { "epoch": 39.342495284289946, "grad_norm": 0.16848674416542053, "learning_rate": 0.00044627084643252793, "loss": 0.5419, "step": 292000 }, { "epoch": 39.35596874157909, "grad_norm": 0.14892050623893738, "learning_rate": 0.0004462334201622803, "loss": 0.5415, "step": 292100 }, { "epoch": 39.36944219886823, "grad_norm": 0.1481846123933792, "learning_rate": 0.0004461959938920327, "loss": 0.542, "step": 292200 }, { "epoch": 39.38291565615737, "grad_norm": 0.16322053968906403, "learning_rate": 0.00044615856762178507, "loss": 0.5412, "step": 292300 }, { "epoch": 39.39638911344651, "grad_norm": 0.16297951340675354, "learning_rate": 0.00044612114135153747, "loss": 0.5405, "step": 292400 }, { "epoch": 39.40986257073565, "grad_norm": 0.15490208566188812, "learning_rate": 0.00044608371508128987, "loss": 0.5428, "step": 292500 }, { "epoch": 39.42333602802479, "grad_norm": 0.18383954465389252, "learning_rate": 0.00044604628881104227, "loss": 0.542, "step": 292600 }, { "epoch": 39.43680948531393, "grad_norm": 0.16536033153533936, "learning_rate": 0.00044600886254079466, "loss": 0.5416, "step": 292700 }, { "epoch": 39.450282942603074, "grad_norm": 0.16486318409442902, "learning_rate": 0.00044597143627054706, "loss": 0.5404, "step": 292800 }, { "epoch": 39.46375639989221, "grad_norm": 0.15453729033470154, "learning_rate": 0.0004459340100002994, "loss": 0.5419, "step": 292900 }, { "epoch": 39.47722985718135, "grad_norm": 0.1734340339899063, "learning_rate": 0.0004458965837300518, "loss": 0.542, "step": 293000 }, { "epoch": 39.490703314470494, "grad_norm": 0.17347460985183716, "learning_rate": 0.0004458591574598042, "loss": 0.5427, "step": 293100 }, { "epoch": 39.50417677175963, "grad_norm": 0.15425196290016174, "learning_rate": 0.0004458217311895566, "loss": 0.5408, "step": 293200 }, { "epoch": 39.517650229048776, "grad_norm": 0.15546655654907227, "learning_rate": 0.000445784304919309, "loss": 0.5406, "step": 293300 }, { "epoch": 39.531123686337914, "grad_norm": 0.15889400243759155, "learning_rate": 0.0004457468786490614, "loss": 0.5422, "step": 293400 }, { "epoch": 39.54459714362706, "grad_norm": 0.16462524235248566, "learning_rate": 0.0004457094523788138, "loss": 0.5416, "step": 293500 }, { "epoch": 39.558070600916196, "grad_norm": 0.1606285274028778, "learning_rate": 0.0004456720261085661, "loss": 0.5422, "step": 293600 }, { "epoch": 39.57154405820533, "grad_norm": 0.14818914234638214, "learning_rate": 0.0004456345998383185, "loss": 0.5418, "step": 293700 }, { "epoch": 39.58501751549448, "grad_norm": 0.14688317477703094, "learning_rate": 0.0004455971735680709, "loss": 0.5414, "step": 293800 }, { "epoch": 39.598490972783615, "grad_norm": 0.16393886506557465, "learning_rate": 0.0004455597472978233, "loss": 0.5411, "step": 293900 }, { "epoch": 39.61196443007276, "grad_norm": 0.1733819842338562, "learning_rate": 0.0004455223210275757, "loss": 0.5422, "step": 294000 }, { "epoch": 39.6254378873619, "grad_norm": 0.15922453999519348, "learning_rate": 0.0004454848947573281, "loss": 0.542, "step": 294100 }, { "epoch": 39.638911344651035, "grad_norm": 0.16287484765052795, "learning_rate": 0.0004454474684870805, "loss": 0.541, "step": 294200 }, { "epoch": 39.65238480194018, "grad_norm": 0.15801692008972168, "learning_rate": 0.0004454100422168328, "loss": 0.5412, "step": 294300 }, { "epoch": 39.66585825922932, "grad_norm": 0.16367682814598083, "learning_rate": 0.0004453726159465852, "loss": 0.5416, "step": 294400 }, { "epoch": 39.67933171651846, "grad_norm": 0.1694125384092331, "learning_rate": 0.0004453351896763376, "loss": 0.5415, "step": 294500 }, { "epoch": 39.6928051738076, "grad_norm": 0.15980815887451172, "learning_rate": 0.00044529776340609, "loss": 0.5418, "step": 294600 }, { "epoch": 39.70627863109674, "grad_norm": 0.16409245133399963, "learning_rate": 0.0004452603371358424, "loss": 0.5416, "step": 294700 }, { "epoch": 39.71975208838588, "grad_norm": 0.1710311323404312, "learning_rate": 0.0004452229108655948, "loss": 0.5408, "step": 294800 }, { "epoch": 39.73322554567502, "grad_norm": 0.15901906788349152, "learning_rate": 0.0004451854845953472, "loss": 0.5412, "step": 294900 }, { "epoch": 39.746699002964164, "grad_norm": 0.15753750503063202, "learning_rate": 0.0004451480583250996, "loss": 0.5408, "step": 295000 }, { "epoch": 39.7601724602533, "grad_norm": 0.16277019679546356, "learning_rate": 0.00044511063205485195, "loss": 0.5416, "step": 295100 }, { "epoch": 39.77364591754244, "grad_norm": 0.15850792825222015, "learning_rate": 0.00044507320578460435, "loss": 0.5423, "step": 295200 }, { "epoch": 39.78711937483158, "grad_norm": 0.1672496348619461, "learning_rate": 0.00044503577951435675, "loss": 0.5432, "step": 295300 }, { "epoch": 39.80059283212072, "grad_norm": 0.170857235789299, "learning_rate": 0.0004449983532441091, "loss": 0.5415, "step": 295400 }, { "epoch": 39.814066289409865, "grad_norm": 0.1743311583995819, "learning_rate": 0.0004449609269738615, "loss": 0.5407, "step": 295500 }, { "epoch": 39.827539746699, "grad_norm": 0.16261868178844452, "learning_rate": 0.0004449235007036139, "loss": 0.5418, "step": 295600 }, { "epoch": 39.84101320398814, "grad_norm": 0.15245771408081055, "learning_rate": 0.0004448860744333663, "loss": 0.5416, "step": 295700 }, { "epoch": 39.854486661277285, "grad_norm": 0.16751962900161743, "learning_rate": 0.00044484864816311863, "loss": 0.5414, "step": 295800 }, { "epoch": 39.86796011856642, "grad_norm": 0.1717243790626526, "learning_rate": 0.00044481122189287103, "loss": 0.5408, "step": 295900 }, { "epoch": 39.88143357585557, "grad_norm": 0.16761094331741333, "learning_rate": 0.0004447737956226234, "loss": 0.5403, "step": 296000 }, { "epoch": 39.894907033144705, "grad_norm": 0.16734249889850616, "learning_rate": 0.0004447363693523758, "loss": 0.5421, "step": 296100 }, { "epoch": 39.90838049043384, "grad_norm": 0.15944868326187134, "learning_rate": 0.0004446989430821282, "loss": 0.5417, "step": 296200 }, { "epoch": 39.92185394772299, "grad_norm": 0.16264590620994568, "learning_rate": 0.0004446615168118806, "loss": 0.5416, "step": 296300 }, { "epoch": 39.935327405012124, "grad_norm": 0.16068172454833984, "learning_rate": 0.000444624090541633, "loss": 0.5419, "step": 296400 }, { "epoch": 39.94880086230127, "grad_norm": 0.16696514189243317, "learning_rate": 0.00044458666427138536, "loss": 0.5416, "step": 296500 }, { "epoch": 39.962274319590406, "grad_norm": 0.1731678992509842, "learning_rate": 0.00044454923800113776, "loss": 0.5418, "step": 296600 }, { "epoch": 39.975747776879544, "grad_norm": 0.16581419110298157, "learning_rate": 0.00044451181173089016, "loss": 0.5414, "step": 296700 }, { "epoch": 39.98922123416869, "grad_norm": 0.16944678127765656, "learning_rate": 0.00044447438546064256, "loss": 0.5412, "step": 296800 }, { "epoch": 40.0, "eval_loss": 0.5293787121772766, "eval_runtime": 4.9541, "eval_samples_per_second": 1009.264, "eval_steps_per_second": 15.946, "step": 296880 }, { "epoch": 40.002694691457826, "grad_norm": 0.15418203175067902, "learning_rate": 0.00044443695919039496, "loss": 0.5408, "step": 296900 }, { "epoch": 40.01616814874697, "grad_norm": 0.15446141362190247, "learning_rate": 0.00044439953292014735, "loss": 0.5398, "step": 297000 }, { "epoch": 40.02964160603611, "grad_norm": 0.15812774002552032, "learning_rate": 0.00044436210664989975, "loss": 0.5398, "step": 297100 }, { "epoch": 40.043115063325246, "grad_norm": 0.1557961404323578, "learning_rate": 0.00044432468037965204, "loss": 0.5407, "step": 297200 }, { "epoch": 40.05658852061439, "grad_norm": 0.1622840315103531, "learning_rate": 0.00044428725410940444, "loss": 0.5403, "step": 297300 }, { "epoch": 40.07006197790353, "grad_norm": 0.2056179642677307, "learning_rate": 0.00044424982783915684, "loss": 0.5406, "step": 297400 }, { "epoch": 40.08353543519267, "grad_norm": 0.16057004034519196, "learning_rate": 0.00044421240156890924, "loss": 0.5409, "step": 297500 }, { "epoch": 40.09700889248181, "grad_norm": 0.15509197115898132, "learning_rate": 0.00044417497529866164, "loss": 0.5405, "step": 297600 }, { "epoch": 40.110482349770955, "grad_norm": 0.16229718923568726, "learning_rate": 0.00044413754902841403, "loss": 0.5406, "step": 297700 }, { "epoch": 40.12395580706009, "grad_norm": 0.15591126680374146, "learning_rate": 0.00044410012275816643, "loss": 0.5407, "step": 297800 }, { "epoch": 40.13742926434923, "grad_norm": 0.15090732276439667, "learning_rate": 0.00044406269648791883, "loss": 0.5398, "step": 297900 }, { "epoch": 40.150902721638374, "grad_norm": 0.16222235560417175, "learning_rate": 0.0004440252702176712, "loss": 0.5409, "step": 298000 }, { "epoch": 40.16437617892751, "grad_norm": 0.16945649683475494, "learning_rate": 0.00044398784394742357, "loss": 0.5413, "step": 298100 }, { "epoch": 40.177849636216656, "grad_norm": 0.18067216873168945, "learning_rate": 0.00044395041767717597, "loss": 0.5414, "step": 298200 }, { "epoch": 40.191323093505794, "grad_norm": 0.1564316749572754, "learning_rate": 0.00044391299140692837, "loss": 0.5418, "step": 298300 }, { "epoch": 40.20479655079493, "grad_norm": 0.16339795291423798, "learning_rate": 0.00044387556513668077, "loss": 0.5407, "step": 298400 }, { "epoch": 40.218270008084076, "grad_norm": 0.16268610954284668, "learning_rate": 0.00044383813886643317, "loss": 0.5405, "step": 298500 }, { "epoch": 40.231743465373214, "grad_norm": 0.1775568127632141, "learning_rate": 0.00044380071259618556, "loss": 0.5405, "step": 298600 }, { "epoch": 40.24521692266236, "grad_norm": 0.16329458355903625, "learning_rate": 0.0004437632863259379, "loss": 0.5413, "step": 298700 }, { "epoch": 40.258690379951496, "grad_norm": 0.16200651228427887, "learning_rate": 0.0004437258600556903, "loss": 0.5413, "step": 298800 }, { "epoch": 40.27216383724063, "grad_norm": 0.16230377554893494, "learning_rate": 0.0004436884337854427, "loss": 0.5404, "step": 298900 }, { "epoch": 40.28563729452978, "grad_norm": 0.1574954241514206, "learning_rate": 0.00044365100751519505, "loss": 0.5403, "step": 299000 }, { "epoch": 40.299110751818915, "grad_norm": 0.16502368450164795, "learning_rate": 0.00044361358124494745, "loss": 0.5411, "step": 299100 }, { "epoch": 40.31258420910806, "grad_norm": 0.1615886688232422, "learning_rate": 0.00044357615497469984, "loss": 0.5415, "step": 299200 }, { "epoch": 40.3260576663972, "grad_norm": 0.1629536896944046, "learning_rate": 0.00044353872870445224, "loss": 0.5413, "step": 299300 }, { "epoch": 40.339531123686335, "grad_norm": 0.1673208773136139, "learning_rate": 0.0004435013024342046, "loss": 0.5399, "step": 299400 }, { "epoch": 40.35300458097548, "grad_norm": 0.1784520000219345, "learning_rate": 0.000443463876163957, "loss": 0.541, "step": 299500 }, { "epoch": 40.36647803826462, "grad_norm": 0.16594311594963074, "learning_rate": 0.0004434264498937094, "loss": 0.5406, "step": 299600 }, { "epoch": 40.37995149555376, "grad_norm": 0.15568844974040985, "learning_rate": 0.0004433890236234618, "loss": 0.5404, "step": 299700 }, { "epoch": 40.3934249528429, "grad_norm": 0.16115763783454895, "learning_rate": 0.0004433515973532142, "loss": 0.5404, "step": 299800 }, { "epoch": 40.40689841013204, "grad_norm": 0.16859614849090576, "learning_rate": 0.0004433141710829666, "loss": 0.5409, "step": 299900 }, { "epoch": 40.42037186742118, "grad_norm": 0.15394900739192963, "learning_rate": 0.000443276744812719, "loss": 0.5407, "step": 300000 }, { "epoch": 40.43384532471032, "grad_norm": 0.15111714601516724, "learning_rate": 0.0004432393185424714, "loss": 0.5408, "step": 300100 }, { "epoch": 40.44731878199946, "grad_norm": 0.15229469537734985, "learning_rate": 0.0004432018922722237, "loss": 0.541, "step": 300200 }, { "epoch": 40.4607922392886, "grad_norm": 0.20892100036144257, "learning_rate": 0.0004431644660019761, "loss": 0.5418, "step": 300300 }, { "epoch": 40.47426569657774, "grad_norm": 0.15388375520706177, "learning_rate": 0.0004431270397317285, "loss": 0.5405, "step": 300400 }, { "epoch": 40.48773915386688, "grad_norm": 0.17628993093967438, "learning_rate": 0.0004430896134614809, "loss": 0.5409, "step": 300500 }, { "epoch": 40.50121261115602, "grad_norm": 0.15850894153118134, "learning_rate": 0.0004430521871912333, "loss": 0.5407, "step": 300600 }, { "epoch": 40.514686068445165, "grad_norm": 0.16217687726020813, "learning_rate": 0.0004430147609209857, "loss": 0.5403, "step": 300700 }, { "epoch": 40.5281595257343, "grad_norm": 0.15982140600681305, "learning_rate": 0.00044297733465073805, "loss": 0.5418, "step": 300800 }, { "epoch": 40.54163298302345, "grad_norm": 0.16470809280872345, "learning_rate": 0.0004429399083804904, "loss": 0.5402, "step": 300900 }, { "epoch": 40.555106440312585, "grad_norm": 0.1745022088289261, "learning_rate": 0.0004429024821102428, "loss": 0.5409, "step": 301000 }, { "epoch": 40.56857989760172, "grad_norm": 0.15736602246761322, "learning_rate": 0.0004428650558399952, "loss": 0.5408, "step": 301100 }, { "epoch": 40.58205335489087, "grad_norm": 0.1737998127937317, "learning_rate": 0.0004428276295697476, "loss": 0.5414, "step": 301200 }, { "epoch": 40.595526812180005, "grad_norm": 0.19420695304870605, "learning_rate": 0.0004427902032995, "loss": 0.5413, "step": 301300 }, { "epoch": 40.60900026946915, "grad_norm": 0.16078442335128784, "learning_rate": 0.0004427527770292524, "loss": 0.5417, "step": 301400 }, { "epoch": 40.62247372675829, "grad_norm": 0.16083437204360962, "learning_rate": 0.0004427153507590048, "loss": 0.5422, "step": 301500 }, { "epoch": 40.635947184047424, "grad_norm": 0.1484527885913849, "learning_rate": 0.00044267792448875713, "loss": 0.541, "step": 301600 }, { "epoch": 40.64942064133657, "grad_norm": 0.16715426743030548, "learning_rate": 0.00044264049821850953, "loss": 0.5408, "step": 301700 }, { "epoch": 40.662894098625706, "grad_norm": 0.1660909354686737, "learning_rate": 0.00044260307194826193, "loss": 0.5401, "step": 301800 }, { "epoch": 40.67636755591485, "grad_norm": 0.1567399501800537, "learning_rate": 0.0004425656456780143, "loss": 0.5408, "step": 301900 }, { "epoch": 40.68984101320399, "grad_norm": 0.16137635707855225, "learning_rate": 0.0004425282194077667, "loss": 0.5399, "step": 302000 }, { "epoch": 40.703314470493126, "grad_norm": 0.15806880593299866, "learning_rate": 0.0004424907931375191, "loss": 0.5396, "step": 302100 }, { "epoch": 40.71678792778227, "grad_norm": 0.15574197471141815, "learning_rate": 0.0004424533668672715, "loss": 0.5406, "step": 302200 }, { "epoch": 40.73026138507141, "grad_norm": 0.1597529649734497, "learning_rate": 0.00044241594059702386, "loss": 0.5419, "step": 302300 }, { "epoch": 40.74373484236055, "grad_norm": 0.15756022930145264, "learning_rate": 0.00044237851432677626, "loss": 0.5398, "step": 302400 }, { "epoch": 40.75720829964969, "grad_norm": 0.1773592233657837, "learning_rate": 0.00044234108805652866, "loss": 0.5403, "step": 302500 }, { "epoch": 40.77068175693883, "grad_norm": 0.15994875133037567, "learning_rate": 0.000442303661786281, "loss": 0.5408, "step": 302600 }, { "epoch": 40.78415521422797, "grad_norm": 0.15765586495399475, "learning_rate": 0.0004422662355160334, "loss": 0.5411, "step": 302700 }, { "epoch": 40.79762867151711, "grad_norm": 0.17288655042648315, "learning_rate": 0.0004422288092457858, "loss": 0.5402, "step": 302800 }, { "epoch": 40.811102128806255, "grad_norm": 0.1509028673171997, "learning_rate": 0.0004421913829755382, "loss": 0.5417, "step": 302900 }, { "epoch": 40.82457558609539, "grad_norm": 0.19738410413265228, "learning_rate": 0.0004421539567052906, "loss": 0.5415, "step": 303000 }, { "epoch": 40.83804904338453, "grad_norm": 0.17848564684391022, "learning_rate": 0.00044211653043504294, "loss": 0.5415, "step": 303100 }, { "epoch": 40.851522500673674, "grad_norm": 0.16032171249389648, "learning_rate": 0.00044207910416479534, "loss": 0.5411, "step": 303200 }, { "epoch": 40.86499595796281, "grad_norm": 0.1651422679424286, "learning_rate": 0.00044204167789454774, "loss": 0.5404, "step": 303300 }, { "epoch": 40.878469415251956, "grad_norm": 0.16628240048885345, "learning_rate": 0.00044200425162430014, "loss": 0.5406, "step": 303400 }, { "epoch": 40.891942872541094, "grad_norm": 0.16874919831752777, "learning_rate": 0.00044196682535405253, "loss": 0.5404, "step": 303500 }, { "epoch": 40.90541632983023, "grad_norm": 0.16377593576908112, "learning_rate": 0.00044192939908380493, "loss": 0.5402, "step": 303600 }, { "epoch": 40.918889787119376, "grad_norm": 0.16435506939888, "learning_rate": 0.00044189197281355733, "loss": 0.5403, "step": 303700 }, { "epoch": 40.93236324440851, "grad_norm": 0.16044414043426514, "learning_rate": 0.0004418545465433097, "loss": 0.5415, "step": 303800 }, { "epoch": 40.94583670169766, "grad_norm": 0.20025722682476044, "learning_rate": 0.0004418171202730621, "loss": 0.5406, "step": 303900 }, { "epoch": 40.959310158986796, "grad_norm": 0.1542777568101883, "learning_rate": 0.00044177969400281447, "loss": 0.5414, "step": 304000 }, { "epoch": 40.97278361627593, "grad_norm": 0.16083243489265442, "learning_rate": 0.00044174226773256687, "loss": 0.5405, "step": 304100 }, { "epoch": 40.98625707356508, "grad_norm": 0.16638696193695068, "learning_rate": 0.00044170484146231927, "loss": 0.5407, "step": 304200 }, { "epoch": 40.999730530854215, "grad_norm": 0.15287384390830994, "learning_rate": 0.00044166741519207167, "loss": 0.54, "step": 304300 }, { "epoch": 41.0, "eval_loss": 0.528369128704071, "eval_runtime": 4.9676, "eval_samples_per_second": 1006.518, "eval_steps_per_second": 15.903, "step": 304302 }, { "epoch": 41.01320398814336, "grad_norm": 0.1471395492553711, "learning_rate": 0.000441629988921824, "loss": 0.5399, "step": 304400 }, { "epoch": 41.0266774454325, "grad_norm": 0.1587388813495636, "learning_rate": 0.00044159256265157635, "loss": 0.5391, "step": 304500 }, { "epoch": 41.040150902721635, "grad_norm": 0.1808934360742569, "learning_rate": 0.00044155513638132875, "loss": 0.5397, "step": 304600 }, { "epoch": 41.05362436001078, "grad_norm": 0.17997236549854279, "learning_rate": 0.00044151771011108115, "loss": 0.5398, "step": 304700 }, { "epoch": 41.06709781729992, "grad_norm": 0.15667693316936493, "learning_rate": 0.00044148028384083355, "loss": 0.5397, "step": 304800 }, { "epoch": 41.08057127458906, "grad_norm": 0.15824592113494873, "learning_rate": 0.00044144285757058595, "loss": 0.5395, "step": 304900 }, { "epoch": 41.0940447318782, "grad_norm": 0.17008696496486664, "learning_rate": 0.00044140543130033835, "loss": 0.5407, "step": 305000 }, { "epoch": 41.107518189167344, "grad_norm": 0.1639477163553238, "learning_rate": 0.00044136800503009074, "loss": 0.541, "step": 305100 }, { "epoch": 41.12099164645648, "grad_norm": 0.15201248228549957, "learning_rate": 0.00044133057875984314, "loss": 0.5408, "step": 305200 }, { "epoch": 41.13446510374562, "grad_norm": 0.1609974056482315, "learning_rate": 0.0004412931524895955, "loss": 0.5403, "step": 305300 }, { "epoch": 41.14793856103476, "grad_norm": 0.15146829187870026, "learning_rate": 0.0004412557262193479, "loss": 0.5414, "step": 305400 }, { "epoch": 41.1614120183239, "grad_norm": 0.1613481044769287, "learning_rate": 0.0004412182999491003, "loss": 0.5401, "step": 305500 }, { "epoch": 41.174885475613046, "grad_norm": 0.16257226467132568, "learning_rate": 0.0004411808736788527, "loss": 0.5402, "step": 305600 }, { "epoch": 41.18835893290218, "grad_norm": 0.176707923412323, "learning_rate": 0.0004411434474086051, "loss": 0.5392, "step": 305700 }, { "epoch": 41.20183239019132, "grad_norm": 0.16037070751190186, "learning_rate": 0.0004411060211383575, "loss": 0.5403, "step": 305800 }, { "epoch": 41.215305847480465, "grad_norm": 0.15859931707382202, "learning_rate": 0.0004410685948681099, "loss": 0.5408, "step": 305900 }, { "epoch": 41.2287793047696, "grad_norm": 0.18641893565654755, "learning_rate": 0.0004410311685978622, "loss": 0.5391, "step": 306000 }, { "epoch": 41.24225276205875, "grad_norm": 0.1831163763999939, "learning_rate": 0.0004409937423276146, "loss": 0.5397, "step": 306100 }, { "epoch": 41.255726219347885, "grad_norm": 0.17167928814888, "learning_rate": 0.00044095631605736696, "loss": 0.5399, "step": 306200 }, { "epoch": 41.26919967663702, "grad_norm": 0.16403551399707794, "learning_rate": 0.00044091888978711936, "loss": 0.5403, "step": 306300 }, { "epoch": 41.28267313392617, "grad_norm": 0.15914443135261536, "learning_rate": 0.00044088146351687176, "loss": 0.5395, "step": 306400 }, { "epoch": 41.296146591215305, "grad_norm": 0.16766662895679474, "learning_rate": 0.00044084403724662416, "loss": 0.5406, "step": 306500 }, { "epoch": 41.30962004850445, "grad_norm": 0.16560514271259308, "learning_rate": 0.00044080661097637655, "loss": 0.5398, "step": 306600 }, { "epoch": 41.32309350579359, "grad_norm": 0.16262447834014893, "learning_rate": 0.0004407691847061289, "loss": 0.5394, "step": 306700 }, { "epoch": 41.336566963082724, "grad_norm": 0.17176228761672974, "learning_rate": 0.0004407317584358813, "loss": 0.5404, "step": 306800 }, { "epoch": 41.35004042037187, "grad_norm": 0.15477122366428375, "learning_rate": 0.0004406943321656337, "loss": 0.5391, "step": 306900 }, { "epoch": 41.363513877661006, "grad_norm": 0.17035891115665436, "learning_rate": 0.0004406569058953861, "loss": 0.5393, "step": 307000 }, { "epoch": 41.37698733495015, "grad_norm": 0.15949642658233643, "learning_rate": 0.0004406194796251385, "loss": 0.5391, "step": 307100 }, { "epoch": 41.39046079223929, "grad_norm": 0.15490780770778656, "learning_rate": 0.0004405820533548909, "loss": 0.5411, "step": 307200 }, { "epoch": 41.403934249528426, "grad_norm": 0.15694892406463623, "learning_rate": 0.0004405446270846433, "loss": 0.54, "step": 307300 }, { "epoch": 41.41740770681757, "grad_norm": 0.16645295917987823, "learning_rate": 0.00044050720081439563, "loss": 0.5396, "step": 307400 }, { "epoch": 41.43088116410671, "grad_norm": 0.16150474548339844, "learning_rate": 0.00044046977454414803, "loss": 0.5414, "step": 307500 }, { "epoch": 41.44435462139585, "grad_norm": 0.17871999740600586, "learning_rate": 0.00044043234827390043, "loss": 0.5404, "step": 307600 }, { "epoch": 41.45782807868499, "grad_norm": 0.16713058948516846, "learning_rate": 0.0004403949220036528, "loss": 0.5398, "step": 307700 }, { "epoch": 41.47130153597413, "grad_norm": 0.16057606041431427, "learning_rate": 0.0004403574957334052, "loss": 0.5406, "step": 307800 }, { "epoch": 41.48477499326327, "grad_norm": 0.15873870253562927, "learning_rate": 0.0004403200694631576, "loss": 0.5398, "step": 307900 }, { "epoch": 41.49824845055241, "grad_norm": 0.16017259657382965, "learning_rate": 0.00044028264319291, "loss": 0.5398, "step": 308000 }, { "epoch": 41.511721907841554, "grad_norm": 0.1943824142217636, "learning_rate": 0.00044024521692266237, "loss": 0.5406, "step": 308100 }, { "epoch": 41.52519536513069, "grad_norm": 0.16625982522964478, "learning_rate": 0.0004402077906524147, "loss": 0.5405, "step": 308200 }, { "epoch": 41.53866882241983, "grad_norm": 0.162593275308609, "learning_rate": 0.0004401703643821671, "loss": 0.5393, "step": 308300 }, { "epoch": 41.552142279708974, "grad_norm": 0.17530430853366852, "learning_rate": 0.0004401329381119195, "loss": 0.5412, "step": 308400 }, { "epoch": 41.56561573699811, "grad_norm": 0.1566816121339798, "learning_rate": 0.0004400955118416719, "loss": 0.5399, "step": 308500 }, { "epoch": 41.579089194287256, "grad_norm": 0.1785976141691208, "learning_rate": 0.0004400580855714243, "loss": 0.5412, "step": 308600 }, { "epoch": 41.592562651576394, "grad_norm": 0.16173692047595978, "learning_rate": 0.0004400206593011767, "loss": 0.5406, "step": 308700 }, { "epoch": 41.60603610886554, "grad_norm": 0.1530049443244934, "learning_rate": 0.0004399832330309291, "loss": 0.5395, "step": 308800 }, { "epoch": 41.619509566154676, "grad_norm": 0.16963498294353485, "learning_rate": 0.00043994580676068144, "loss": 0.541, "step": 308900 }, { "epoch": 41.63298302344381, "grad_norm": 0.16012391448020935, "learning_rate": 0.00043990838049043384, "loss": 0.5409, "step": 309000 }, { "epoch": 41.64645648073296, "grad_norm": 0.15854138135910034, "learning_rate": 0.00043987095422018624, "loss": 0.5403, "step": 309100 }, { "epoch": 41.659929938022096, "grad_norm": 0.16325640678405762, "learning_rate": 0.00043983352794993864, "loss": 0.5406, "step": 309200 }, { "epoch": 41.67340339531124, "grad_norm": 0.16188134253025055, "learning_rate": 0.00043979610167969104, "loss": 0.5394, "step": 309300 }, { "epoch": 41.68687685260038, "grad_norm": 0.1504247784614563, "learning_rate": 0.00043975867540944343, "loss": 0.5409, "step": 309400 }, { "epoch": 41.700350309889515, "grad_norm": 0.14685972034931183, "learning_rate": 0.00043972124913919583, "loss": 0.5395, "step": 309500 }, { "epoch": 41.71382376717866, "grad_norm": 0.1552925407886505, "learning_rate": 0.0004396838228689482, "loss": 0.54, "step": 309600 }, { "epoch": 41.7272972244678, "grad_norm": 0.15065596997737885, "learning_rate": 0.0004396463965987006, "loss": 0.5405, "step": 309700 }, { "epoch": 41.74077068175694, "grad_norm": 0.16305072605609894, "learning_rate": 0.000439608970328453, "loss": 0.5403, "step": 309800 }, { "epoch": 41.75424413904608, "grad_norm": 0.16749843955039978, "learning_rate": 0.0004395715440582053, "loss": 0.5398, "step": 309900 }, { "epoch": 41.76771759633522, "grad_norm": 0.1666700392961502, "learning_rate": 0.0004395341177879577, "loss": 0.541, "step": 310000 }, { "epoch": 41.78119105362436, "grad_norm": 0.15356387197971344, "learning_rate": 0.0004394966915177101, "loss": 0.5406, "step": 310100 }, { "epoch": 41.7946645109135, "grad_norm": 0.15895235538482666, "learning_rate": 0.0004394592652474625, "loss": 0.5404, "step": 310200 }, { "epoch": 41.808137968202644, "grad_norm": 0.15727539360523224, "learning_rate": 0.00043942183897721486, "loss": 0.5398, "step": 310300 }, { "epoch": 41.82161142549178, "grad_norm": 0.1649629920721054, "learning_rate": 0.00043938441270696725, "loss": 0.5403, "step": 310400 }, { "epoch": 41.83508488278092, "grad_norm": 0.16111265122890472, "learning_rate": 0.00043934698643671965, "loss": 0.5407, "step": 310500 }, { "epoch": 41.84855834007006, "grad_norm": 0.15896311402320862, "learning_rate": 0.00043930956016647205, "loss": 0.5391, "step": 310600 }, { "epoch": 41.8620317973592, "grad_norm": 0.1649448573589325, "learning_rate": 0.00043927213389622445, "loss": 0.5404, "step": 310700 }, { "epoch": 41.875505254648345, "grad_norm": 0.18347127735614777, "learning_rate": 0.00043923470762597685, "loss": 0.5392, "step": 310800 }, { "epoch": 41.88897871193748, "grad_norm": 0.16251984238624573, "learning_rate": 0.00043919728135572925, "loss": 0.5403, "step": 310900 }, { "epoch": 41.90245216922662, "grad_norm": 0.16747277975082397, "learning_rate": 0.00043915985508548164, "loss": 0.5409, "step": 311000 }, { "epoch": 41.915925626515765, "grad_norm": 0.1736704558134079, "learning_rate": 0.000439122428815234, "loss": 0.5396, "step": 311100 }, { "epoch": 41.9293990838049, "grad_norm": 0.178583562374115, "learning_rate": 0.0004390850025449864, "loss": 0.5394, "step": 311200 }, { "epoch": 41.94287254109405, "grad_norm": 0.15126048028469086, "learning_rate": 0.0004390475762747388, "loss": 0.5399, "step": 311300 }, { "epoch": 41.956345998383185, "grad_norm": 0.16783130168914795, "learning_rate": 0.0004390101500044912, "loss": 0.5402, "step": 311400 }, { "epoch": 41.96981945567232, "grad_norm": 0.1912906914949417, "learning_rate": 0.0004389727237342436, "loss": 0.5406, "step": 311500 }, { "epoch": 41.98329291296147, "grad_norm": 0.1647043526172638, "learning_rate": 0.000438935297463996, "loss": 0.5406, "step": 311600 }, { "epoch": 41.996766370250604, "grad_norm": 0.1716320663690567, "learning_rate": 0.0004388978711937483, "loss": 0.5395, "step": 311700 }, { "epoch": 42.0, "eval_loss": 0.5271638631820679, "eval_runtime": 4.9959, "eval_samples_per_second": 1000.815, "eval_steps_per_second": 15.813, "step": 311724 }, { "epoch": 42.01023982753975, "grad_norm": 0.17649519443511963, "learning_rate": 0.00043886044492350067, "loss": 0.5387, "step": 311800 }, { "epoch": 42.02371328482889, "grad_norm": 0.15869951248168945, "learning_rate": 0.00043882301865325306, "loss": 0.5397, "step": 311900 }, { "epoch": 42.037186742118024, "grad_norm": 0.15677355229854584, "learning_rate": 0.00043878559238300546, "loss": 0.5394, "step": 312000 }, { "epoch": 42.05066019940717, "grad_norm": 0.15884560346603394, "learning_rate": 0.00043874816611275786, "loss": 0.5401, "step": 312100 }, { "epoch": 42.064133656696306, "grad_norm": 0.1569085568189621, "learning_rate": 0.00043871073984251026, "loss": 0.5396, "step": 312200 }, { "epoch": 42.07760711398545, "grad_norm": 0.1893143355846405, "learning_rate": 0.00043867331357226266, "loss": 0.5394, "step": 312300 }, { "epoch": 42.09108057127459, "grad_norm": 0.16864433884620667, "learning_rate": 0.00043863588730201506, "loss": 0.5388, "step": 312400 }, { "epoch": 42.10455402856373, "grad_norm": 0.1547868400812149, "learning_rate": 0.0004385984610317674, "loss": 0.54, "step": 312500 }, { "epoch": 42.11802748585287, "grad_norm": 0.17105774581432343, "learning_rate": 0.0004385610347615198, "loss": 0.5399, "step": 312600 }, { "epoch": 42.13150094314201, "grad_norm": 0.17889279127120972, "learning_rate": 0.0004385236084912722, "loss": 0.5397, "step": 312700 }, { "epoch": 42.14497440043115, "grad_norm": 0.15174615383148193, "learning_rate": 0.0004384861822210246, "loss": 0.5395, "step": 312800 }, { "epoch": 42.15844785772029, "grad_norm": 0.16011589765548706, "learning_rate": 0.000438448755950777, "loss": 0.5382, "step": 312900 }, { "epoch": 42.171921315009435, "grad_norm": 0.16813501715660095, "learning_rate": 0.0004384113296805294, "loss": 0.5389, "step": 313000 }, { "epoch": 42.18539477229857, "grad_norm": 0.15899017453193665, "learning_rate": 0.0004383739034102818, "loss": 0.5396, "step": 313100 }, { "epoch": 42.19886822958771, "grad_norm": 0.16155275702476501, "learning_rate": 0.0004383364771400342, "loss": 0.5402, "step": 313200 }, { "epoch": 42.212341686876854, "grad_norm": 0.16723108291625977, "learning_rate": 0.00043829905086978653, "loss": 0.5401, "step": 313300 }, { "epoch": 42.22581514416599, "grad_norm": 0.15910322964191437, "learning_rate": 0.00043826162459953893, "loss": 0.5393, "step": 313400 }, { "epoch": 42.23928860145514, "grad_norm": 0.16229908168315887, "learning_rate": 0.0004382241983292913, "loss": 0.5388, "step": 313500 }, { "epoch": 42.252762058744274, "grad_norm": 0.17421451210975647, "learning_rate": 0.00043818677205904367, "loss": 0.5395, "step": 313600 }, { "epoch": 42.26623551603341, "grad_norm": 0.17244483530521393, "learning_rate": 0.00043814934578879607, "loss": 0.5411, "step": 313700 }, { "epoch": 42.279708973322556, "grad_norm": 0.16874419152736664, "learning_rate": 0.00043811191951854847, "loss": 0.5401, "step": 313800 }, { "epoch": 42.293182430611694, "grad_norm": 0.1545574814081192, "learning_rate": 0.00043807449324830087, "loss": 0.5384, "step": 313900 }, { "epoch": 42.30665588790084, "grad_norm": 0.16079173982143402, "learning_rate": 0.0004380370669780532, "loss": 0.5392, "step": 314000 }, { "epoch": 42.320129345189976, "grad_norm": 0.17354518175125122, "learning_rate": 0.0004379996407078056, "loss": 0.5403, "step": 314100 }, { "epoch": 42.33360280247911, "grad_norm": 0.1591595560312271, "learning_rate": 0.000437962214437558, "loss": 0.5398, "step": 314200 }, { "epoch": 42.34707625976826, "grad_norm": 0.17825888097286224, "learning_rate": 0.0004379247881673104, "loss": 0.54, "step": 314300 }, { "epoch": 42.360549717057395, "grad_norm": 0.17150963842868805, "learning_rate": 0.0004378873618970628, "loss": 0.5399, "step": 314400 }, { "epoch": 42.37402317434654, "grad_norm": 0.15663255751132965, "learning_rate": 0.0004378499356268152, "loss": 0.5381, "step": 314500 }, { "epoch": 42.38749663163568, "grad_norm": 0.16488175094127655, "learning_rate": 0.0004378125093565676, "loss": 0.5385, "step": 314600 }, { "epoch": 42.400970088924815, "grad_norm": 0.17738498747348785, "learning_rate": 0.00043777508308631994, "loss": 0.5394, "step": 314700 }, { "epoch": 42.41444354621396, "grad_norm": 0.19634293019771576, "learning_rate": 0.00043773765681607234, "loss": 0.54, "step": 314800 }, { "epoch": 42.4279170035031, "grad_norm": 0.17325522005558014, "learning_rate": 0.00043770023054582474, "loss": 0.5399, "step": 314900 }, { "epoch": 42.44139046079224, "grad_norm": 0.1514672040939331, "learning_rate": 0.00043766280427557714, "loss": 0.5389, "step": 315000 }, { "epoch": 42.45486391808138, "grad_norm": 0.1574205458164215, "learning_rate": 0.00043762537800532954, "loss": 0.5398, "step": 315100 }, { "epoch": 42.46833737537052, "grad_norm": 0.17759157717227936, "learning_rate": 0.00043758795173508194, "loss": 0.5384, "step": 315200 }, { "epoch": 42.48181083265966, "grad_norm": 0.14840683341026306, "learning_rate": 0.0004375505254648343, "loss": 0.5396, "step": 315300 }, { "epoch": 42.4952842899488, "grad_norm": 0.15943163633346558, "learning_rate": 0.0004375130991945866, "loss": 0.54, "step": 315400 }, { "epoch": 42.508757747237944, "grad_norm": 0.17665527760982513, "learning_rate": 0.000437475672924339, "loss": 0.539, "step": 315500 }, { "epoch": 42.52223120452708, "grad_norm": 0.1609688103199005, "learning_rate": 0.0004374382466540914, "loss": 0.5394, "step": 315600 }, { "epoch": 42.53570466181622, "grad_norm": 0.15406116843223572, "learning_rate": 0.0004374008203838438, "loss": 0.5411, "step": 315700 }, { "epoch": 42.54917811910536, "grad_norm": 0.17138652503490448, "learning_rate": 0.0004373633941135962, "loss": 0.5401, "step": 315800 }, { "epoch": 42.5626515763945, "grad_norm": 0.16889971494674683, "learning_rate": 0.0004373259678433486, "loss": 0.539, "step": 315900 }, { "epoch": 42.576125033683645, "grad_norm": 0.14693647623062134, "learning_rate": 0.000437288541573101, "loss": 0.5397, "step": 316000 }, { "epoch": 42.58959849097278, "grad_norm": 0.1575612723827362, "learning_rate": 0.0004372511153028534, "loss": 0.5383, "step": 316100 }, { "epoch": 42.60307194826193, "grad_norm": 0.16323421895503998, "learning_rate": 0.00043721368903260576, "loss": 0.5406, "step": 316200 }, { "epoch": 42.616545405551065, "grad_norm": 0.1704384982585907, "learning_rate": 0.00043717626276235815, "loss": 0.5389, "step": 316300 }, { "epoch": 42.6300188628402, "grad_norm": 0.15978944301605225, "learning_rate": 0.00043713883649211055, "loss": 0.5403, "step": 316400 }, { "epoch": 42.64349232012935, "grad_norm": 0.1764036864042282, "learning_rate": 0.00043710141022186295, "loss": 0.5404, "step": 316500 }, { "epoch": 42.656965777418485, "grad_norm": 0.15360479056835175, "learning_rate": 0.00043706398395161535, "loss": 0.5396, "step": 316600 }, { "epoch": 42.67043923470763, "grad_norm": 0.15218062698841095, "learning_rate": 0.00043702655768136775, "loss": 0.5388, "step": 316700 }, { "epoch": 42.68391269199677, "grad_norm": 0.17752915620803833, "learning_rate": 0.00043698913141112014, "loss": 0.5396, "step": 316800 }, { "epoch": 42.697386149285904, "grad_norm": 0.17691880464553833, "learning_rate": 0.0004369517051408725, "loss": 0.5401, "step": 316900 }, { "epoch": 42.71085960657505, "grad_norm": 0.15367822349071503, "learning_rate": 0.0004369142788706249, "loss": 0.5396, "step": 317000 }, { "epoch": 42.72433306386419, "grad_norm": 0.16032391786575317, "learning_rate": 0.00043687685260037723, "loss": 0.5392, "step": 317100 }, { "epoch": 42.73780652115333, "grad_norm": 0.1805604100227356, "learning_rate": 0.00043683942633012963, "loss": 0.5382, "step": 317200 }, { "epoch": 42.75127997844247, "grad_norm": 0.1625720113515854, "learning_rate": 0.00043680200005988203, "loss": 0.5388, "step": 317300 }, { "epoch": 42.764753435731606, "grad_norm": 0.15811863541603088, "learning_rate": 0.0004367645737896344, "loss": 0.5389, "step": 317400 }, { "epoch": 42.77822689302075, "grad_norm": 0.1513429433107376, "learning_rate": 0.0004367271475193868, "loss": 0.54, "step": 317500 }, { "epoch": 42.79170035030989, "grad_norm": 0.17483411729335785, "learning_rate": 0.00043668972124913917, "loss": 0.5402, "step": 317600 }, { "epoch": 42.80517380759903, "grad_norm": 0.16211234033107758, "learning_rate": 0.00043665229497889157, "loss": 0.5397, "step": 317700 }, { "epoch": 42.81864726488817, "grad_norm": 0.20465907454490662, "learning_rate": 0.00043661486870864396, "loss": 0.5396, "step": 317800 }, { "epoch": 42.83212072217731, "grad_norm": 0.16823872923851013, "learning_rate": 0.00043657744243839636, "loss": 0.5398, "step": 317900 }, { "epoch": 42.84559417946645, "grad_norm": 0.1616853028535843, "learning_rate": 0.00043654001616814876, "loss": 0.5404, "step": 318000 }, { "epoch": 42.85906763675559, "grad_norm": 0.1690060943365097, "learning_rate": 0.00043650258989790116, "loss": 0.5394, "step": 318100 }, { "epoch": 42.872541094044735, "grad_norm": 0.16141027212142944, "learning_rate": 0.00043646516362765356, "loss": 0.5394, "step": 318200 }, { "epoch": 42.88601455133387, "grad_norm": 0.1599453240633011, "learning_rate": 0.00043642773735740596, "loss": 0.5397, "step": 318300 }, { "epoch": 42.89948800862301, "grad_norm": 0.16172759234905243, "learning_rate": 0.0004363903110871583, "loss": 0.5396, "step": 318400 }, { "epoch": 42.912961465912154, "grad_norm": 0.15461957454681396, "learning_rate": 0.0004363528848169107, "loss": 0.5393, "step": 318500 }, { "epoch": 42.92643492320129, "grad_norm": 0.15597553551197052, "learning_rate": 0.0004363154585466631, "loss": 0.5387, "step": 318600 }, { "epoch": 42.93990838049044, "grad_norm": 0.17167913913726807, "learning_rate": 0.0004362780322764155, "loss": 0.5404, "step": 318700 }, { "epoch": 42.953381837779574, "grad_norm": 0.17331022024154663, "learning_rate": 0.0004362406060061679, "loss": 0.539, "step": 318800 }, { "epoch": 42.96685529506871, "grad_norm": 0.1535799652338028, "learning_rate": 0.00043620317973592024, "loss": 0.5389, "step": 318900 }, { "epoch": 42.980328752357856, "grad_norm": 0.180367112159729, "learning_rate": 0.00043616575346567263, "loss": 0.5389, "step": 319000 }, { "epoch": 42.993802209646994, "grad_norm": 0.16032364964485168, "learning_rate": 0.000436128327195425, "loss": 0.5394, "step": 319100 }, { "epoch": 43.0, "eval_loss": 0.5279526710510254, "eval_runtime": 4.9682, "eval_samples_per_second": 1006.41, "eval_steps_per_second": 15.901, "step": 319146 }, { "epoch": 43.00727566693614, "grad_norm": 0.1563524454832077, "learning_rate": 0.0004360909009251774, "loss": 0.5383, "step": 319200 }, { "epoch": 43.020749124225276, "grad_norm": 0.16533569991588593, "learning_rate": 0.0004360534746549298, "loss": 0.5385, "step": 319300 }, { "epoch": 43.03422258151441, "grad_norm": 0.15109261870384216, "learning_rate": 0.0004360160483846822, "loss": 0.5384, "step": 319400 }, { "epoch": 43.04769603880356, "grad_norm": 0.15598304569721222, "learning_rate": 0.00043597862211443457, "loss": 0.5382, "step": 319500 }, { "epoch": 43.061169496092695, "grad_norm": 0.14419983327388763, "learning_rate": 0.00043594119584418697, "loss": 0.5383, "step": 319600 }, { "epoch": 43.07464295338184, "grad_norm": 0.16413302719593048, "learning_rate": 0.00043590376957393937, "loss": 0.539, "step": 319700 }, { "epoch": 43.08811641067098, "grad_norm": 0.1683787852525711, "learning_rate": 0.0004358663433036917, "loss": 0.5384, "step": 319800 }, { "epoch": 43.101589867960115, "grad_norm": 0.16408658027648926, "learning_rate": 0.0004358289170334441, "loss": 0.5394, "step": 319900 }, { "epoch": 43.11506332524926, "grad_norm": 0.1549149453639984, "learning_rate": 0.0004357914907631965, "loss": 0.5397, "step": 320000 }, { "epoch": 43.1285367825384, "grad_norm": 0.16065484285354614, "learning_rate": 0.0004357540644929489, "loss": 0.5392, "step": 320100 }, { "epoch": 43.14201023982754, "grad_norm": 0.16148333251476288, "learning_rate": 0.0004357166382227013, "loss": 0.5384, "step": 320200 }, { "epoch": 43.15548369711668, "grad_norm": 0.1678134649991989, "learning_rate": 0.0004356792119524537, "loss": 0.5395, "step": 320300 }, { "epoch": 43.168957154405824, "grad_norm": 0.15520022809505463, "learning_rate": 0.0004356417856822061, "loss": 0.5404, "step": 320400 }, { "epoch": 43.18243061169496, "grad_norm": 0.17196735739707947, "learning_rate": 0.0004356043594119585, "loss": 0.5379, "step": 320500 }, { "epoch": 43.1959040689841, "grad_norm": 0.1701822280883789, "learning_rate": 0.00043556693314171084, "loss": 0.5383, "step": 320600 }, { "epoch": 43.209377526273244, "grad_norm": 0.1599099189043045, "learning_rate": 0.0004355295068714632, "loss": 0.5387, "step": 320700 }, { "epoch": 43.22285098356238, "grad_norm": 0.15888634324073792, "learning_rate": 0.0004354920806012156, "loss": 0.5405, "step": 320800 }, { "epoch": 43.236324440851526, "grad_norm": 0.15710200369358063, "learning_rate": 0.000435454654330968, "loss": 0.5399, "step": 320900 }, { "epoch": 43.24979789814066, "grad_norm": 0.1553744077682495, "learning_rate": 0.0004354172280607204, "loss": 0.5388, "step": 321000 }, { "epoch": 43.2632713554298, "grad_norm": 0.15592516958713531, "learning_rate": 0.0004353798017904728, "loss": 0.5378, "step": 321100 }, { "epoch": 43.276744812718945, "grad_norm": 0.16535793244838715, "learning_rate": 0.0004353423755202252, "loss": 0.5394, "step": 321200 }, { "epoch": 43.29021827000808, "grad_norm": 0.1560070961713791, "learning_rate": 0.0004353049492499775, "loss": 0.5395, "step": 321300 }, { "epoch": 43.30369172729723, "grad_norm": 0.1534852534532547, "learning_rate": 0.0004352675229797299, "loss": 0.5395, "step": 321400 }, { "epoch": 43.317165184586365, "grad_norm": 0.17295540869235992, "learning_rate": 0.0004352300967094823, "loss": 0.5386, "step": 321500 }, { "epoch": 43.3306386418755, "grad_norm": 0.15630193054676056, "learning_rate": 0.0004351926704392347, "loss": 0.5392, "step": 321600 }, { "epoch": 43.34411209916465, "grad_norm": 0.16939294338226318, "learning_rate": 0.0004351552441689871, "loss": 0.539, "step": 321700 }, { "epoch": 43.357585556453785, "grad_norm": 0.1659676879644394, "learning_rate": 0.0004351178178987395, "loss": 0.5391, "step": 321800 }, { "epoch": 43.37105901374293, "grad_norm": 0.1900232583284378, "learning_rate": 0.0004350803916284919, "loss": 0.5388, "step": 321900 }, { "epoch": 43.38453247103207, "grad_norm": 0.16294412314891815, "learning_rate": 0.00043504296535824426, "loss": 0.539, "step": 322000 }, { "epoch": 43.398005928321204, "grad_norm": 0.158513605594635, "learning_rate": 0.00043500553908799665, "loss": 0.5395, "step": 322100 }, { "epoch": 43.41147938561035, "grad_norm": 0.18823480606079102, "learning_rate": 0.00043496811281774905, "loss": 0.5381, "step": 322200 }, { "epoch": 43.424952842899486, "grad_norm": 0.21416808664798737, "learning_rate": 0.00043493068654750145, "loss": 0.5402, "step": 322300 }, { "epoch": 43.43842630018863, "grad_norm": 0.17098718881607056, "learning_rate": 0.00043489326027725385, "loss": 0.5385, "step": 322400 }, { "epoch": 43.45189975747777, "grad_norm": 0.16158369183540344, "learning_rate": 0.0004348558340070062, "loss": 0.5399, "step": 322500 }, { "epoch": 43.465373214766906, "grad_norm": 0.16588079929351807, "learning_rate": 0.0004348184077367586, "loss": 0.5395, "step": 322600 }, { "epoch": 43.47884667205605, "grad_norm": 0.15401771664619446, "learning_rate": 0.00043478098146651094, "loss": 0.539, "step": 322700 }, { "epoch": 43.49232012934519, "grad_norm": 0.17583073675632477, "learning_rate": 0.00043474355519626333, "loss": 0.5384, "step": 322800 }, { "epoch": 43.50579358663433, "grad_norm": 0.1724446415901184, "learning_rate": 0.00043470612892601573, "loss": 0.5386, "step": 322900 }, { "epoch": 43.51926704392347, "grad_norm": 0.16406846046447754, "learning_rate": 0.00043466870265576813, "loss": 0.5388, "step": 323000 }, { "epoch": 43.53274050121261, "grad_norm": 0.14369134604930878, "learning_rate": 0.00043463127638552053, "loss": 0.5389, "step": 323100 }, { "epoch": 43.54621395850175, "grad_norm": 0.1896660029888153, "learning_rate": 0.0004345938501152729, "loss": 0.5388, "step": 323200 }, { "epoch": 43.55968741579089, "grad_norm": 0.1636536717414856, "learning_rate": 0.0004345564238450253, "loss": 0.5391, "step": 323300 }, { "epoch": 43.573160873080035, "grad_norm": 0.1764601767063141, "learning_rate": 0.0004345189975747777, "loss": 0.539, "step": 323400 }, { "epoch": 43.58663433036917, "grad_norm": 0.20018726587295532, "learning_rate": 0.00043448157130453007, "loss": 0.5388, "step": 323500 }, { "epoch": 43.60010778765831, "grad_norm": 0.18007300794124603, "learning_rate": 0.00043444414503428247, "loss": 0.5393, "step": 323600 }, { "epoch": 43.613581244947454, "grad_norm": 0.15496060252189636, "learning_rate": 0.00043440671876403486, "loss": 0.5384, "step": 323700 }, { "epoch": 43.62705470223659, "grad_norm": 0.14939910173416138, "learning_rate": 0.00043436929249378726, "loss": 0.5395, "step": 323800 }, { "epoch": 43.640528159525736, "grad_norm": 0.16397763788700104, "learning_rate": 0.00043433186622353966, "loss": 0.5382, "step": 323900 }, { "epoch": 43.654001616814874, "grad_norm": 0.15545520186424255, "learning_rate": 0.00043429443995329206, "loss": 0.5387, "step": 324000 }, { "epoch": 43.66747507410402, "grad_norm": 0.156644806265831, "learning_rate": 0.00043425701368304446, "loss": 0.5379, "step": 324100 }, { "epoch": 43.680948531393156, "grad_norm": 0.17255975306034088, "learning_rate": 0.0004342195874127968, "loss": 0.5396, "step": 324200 }, { "epoch": 43.69442198868229, "grad_norm": 0.17265461385250092, "learning_rate": 0.0004341821611425492, "loss": 0.5382, "step": 324300 }, { "epoch": 43.70789544597144, "grad_norm": 0.15545910596847534, "learning_rate": 0.00043414473487230154, "loss": 0.5385, "step": 324400 }, { "epoch": 43.721368903260576, "grad_norm": 0.16321395337581635, "learning_rate": 0.00043410730860205394, "loss": 0.5387, "step": 324500 }, { "epoch": 43.73484236054972, "grad_norm": 0.16124463081359863, "learning_rate": 0.00043406988233180634, "loss": 0.539, "step": 324600 }, { "epoch": 43.74831581783886, "grad_norm": 0.1679839789867401, "learning_rate": 0.00043403245606155874, "loss": 0.5396, "step": 324700 }, { "epoch": 43.761789275127995, "grad_norm": 0.16070903837680817, "learning_rate": 0.00043399502979131114, "loss": 0.5382, "step": 324800 }, { "epoch": 43.77526273241714, "grad_norm": 0.1595311015844345, "learning_rate": 0.0004339576035210635, "loss": 0.5393, "step": 324900 }, { "epoch": 43.78873618970628, "grad_norm": 0.1571149080991745, "learning_rate": 0.0004339201772508159, "loss": 0.5386, "step": 325000 }, { "epoch": 43.80220964699542, "grad_norm": 0.15341909229755402, "learning_rate": 0.0004338827509805683, "loss": 0.539, "step": 325100 }, { "epoch": 43.81568310428456, "grad_norm": 0.17186079919338226, "learning_rate": 0.0004338453247103207, "loss": 0.5402, "step": 325200 }, { "epoch": 43.8291565615737, "grad_norm": 0.15846183896064758, "learning_rate": 0.0004338078984400731, "loss": 0.5394, "step": 325300 }, { "epoch": 43.84263001886284, "grad_norm": 0.16071408987045288, "learning_rate": 0.00043377047216982547, "loss": 0.5396, "step": 325400 }, { "epoch": 43.85610347615198, "grad_norm": 0.1773345172405243, "learning_rate": 0.00043373304589957787, "loss": 0.539, "step": 325500 }, { "epoch": 43.869576933441124, "grad_norm": 0.16039025783538818, "learning_rate": 0.0004336956196293302, "loss": 0.539, "step": 325600 }, { "epoch": 43.88305039073026, "grad_norm": 0.1663745939731598, "learning_rate": 0.0004336581933590826, "loss": 0.5392, "step": 325700 }, { "epoch": 43.8965238480194, "grad_norm": 0.1597408950328827, "learning_rate": 0.000433620767088835, "loss": 0.5394, "step": 325800 }, { "epoch": 43.90999730530854, "grad_norm": 0.15487222373485565, "learning_rate": 0.0004335833408185874, "loss": 0.5384, "step": 325900 }, { "epoch": 43.92347076259768, "grad_norm": 0.15526586771011353, "learning_rate": 0.0004335459145483398, "loss": 0.5378, "step": 326000 }, { "epoch": 43.936944219886826, "grad_norm": 0.16187934577465057, "learning_rate": 0.0004335084882780922, "loss": 0.5393, "step": 326100 }, { "epoch": 43.95041767717596, "grad_norm": 0.16067305207252502, "learning_rate": 0.00043347106200784455, "loss": 0.5383, "step": 326200 }, { "epoch": 43.9638911344651, "grad_norm": 0.16550444066524506, "learning_rate": 0.00043343363573759695, "loss": 0.5394, "step": 326300 }, { "epoch": 43.977364591754245, "grad_norm": 0.1635962575674057, "learning_rate": 0.0004333962094673493, "loss": 0.5384, "step": 326400 }, { "epoch": 43.99083804904338, "grad_norm": 0.16417285799980164, "learning_rate": 0.0004333587831971017, "loss": 0.5388, "step": 326500 }, { "epoch": 44.0, "eval_loss": 0.526480495929718, "eval_runtime": 4.9539, "eval_samples_per_second": 1009.298, "eval_steps_per_second": 15.947, "step": 326568 }, { "epoch": 44.00431150633253, "grad_norm": 0.1535174697637558, "learning_rate": 0.0004333213569268541, "loss": 0.538, "step": 326600 }, { "epoch": 44.017784963621665, "grad_norm": 0.16555623710155487, "learning_rate": 0.0004332839306566065, "loss": 0.5379, "step": 326700 }, { "epoch": 44.0312584209108, "grad_norm": 0.15324117243289948, "learning_rate": 0.0004332465043863589, "loss": 0.5382, "step": 326800 }, { "epoch": 44.04473187819995, "grad_norm": 0.15100961923599243, "learning_rate": 0.0004332090781161113, "loss": 0.5377, "step": 326900 }, { "epoch": 44.058205335489085, "grad_norm": 0.15869836509227753, "learning_rate": 0.0004331716518458637, "loss": 0.5384, "step": 327000 }, { "epoch": 44.07167879277823, "grad_norm": 0.15142416954040527, "learning_rate": 0.000433134225575616, "loss": 0.5389, "step": 327100 }, { "epoch": 44.08515225006737, "grad_norm": 0.17429080605506897, "learning_rate": 0.0004330967993053684, "loss": 0.5387, "step": 327200 }, { "epoch": 44.098625707356504, "grad_norm": 0.17552974820137024, "learning_rate": 0.0004330593730351208, "loss": 0.538, "step": 327300 }, { "epoch": 44.11209916464565, "grad_norm": 0.1602127104997635, "learning_rate": 0.0004330219467648732, "loss": 0.5381, "step": 327400 }, { "epoch": 44.125572621934786, "grad_norm": 0.16882656514644623, "learning_rate": 0.0004329845204946256, "loss": 0.5376, "step": 327500 }, { "epoch": 44.13904607922393, "grad_norm": 0.16228605806827545, "learning_rate": 0.000432947094224378, "loss": 0.5382, "step": 327600 }, { "epoch": 44.15251953651307, "grad_norm": 0.15977659821510315, "learning_rate": 0.0004329096679541304, "loss": 0.5373, "step": 327700 }, { "epoch": 44.16599299380221, "grad_norm": 0.16384106874465942, "learning_rate": 0.00043287224168388276, "loss": 0.5372, "step": 327800 }, { "epoch": 44.17946645109135, "grad_norm": 0.15863437950611115, "learning_rate": 0.00043283481541363516, "loss": 0.5386, "step": 327900 }, { "epoch": 44.19293990838049, "grad_norm": 0.15809236466884613, "learning_rate": 0.0004327973891433875, "loss": 0.5388, "step": 328000 }, { "epoch": 44.20641336566963, "grad_norm": 0.1591002196073532, "learning_rate": 0.0004327599628731399, "loss": 0.5372, "step": 328100 }, { "epoch": 44.21988682295877, "grad_norm": 0.18714639544487, "learning_rate": 0.0004327225366028923, "loss": 0.5375, "step": 328200 }, { "epoch": 44.233360280247915, "grad_norm": 0.17667406797409058, "learning_rate": 0.0004326851103326447, "loss": 0.5393, "step": 328300 }, { "epoch": 44.24683373753705, "grad_norm": 0.1586468666791916, "learning_rate": 0.0004326476840623971, "loss": 0.538, "step": 328400 }, { "epoch": 44.26030719482619, "grad_norm": 0.1746850311756134, "learning_rate": 0.0004326102577921495, "loss": 0.5384, "step": 328500 }, { "epoch": 44.273780652115335, "grad_norm": 0.1657198965549469, "learning_rate": 0.00043257283152190184, "loss": 0.5377, "step": 328600 }, { "epoch": 44.28725410940447, "grad_norm": 0.16072240471839905, "learning_rate": 0.00043253540525165423, "loss": 0.5381, "step": 328700 }, { "epoch": 44.30072756669362, "grad_norm": 0.15322427451610565, "learning_rate": 0.00043249797898140663, "loss": 0.538, "step": 328800 }, { "epoch": 44.314201023982754, "grad_norm": 0.15856841206550598, "learning_rate": 0.00043246055271115903, "loss": 0.5391, "step": 328900 }, { "epoch": 44.32767448127189, "grad_norm": 0.17050056159496307, "learning_rate": 0.00043242312644091143, "loss": 0.5381, "step": 329000 }, { "epoch": 44.341147938561036, "grad_norm": 0.1565883904695511, "learning_rate": 0.0004323857001706638, "loss": 0.5382, "step": 329100 }, { "epoch": 44.354621395850174, "grad_norm": 0.16658306121826172, "learning_rate": 0.0004323482739004162, "loss": 0.5394, "step": 329200 }, { "epoch": 44.36809485313932, "grad_norm": 0.15555551648139954, "learning_rate": 0.00043231084763016857, "loss": 0.5387, "step": 329300 }, { "epoch": 44.381568310428456, "grad_norm": 0.1750364750623703, "learning_rate": 0.00043227342135992097, "loss": 0.5381, "step": 329400 }, { "epoch": 44.39504176771759, "grad_norm": 0.1543634980916977, "learning_rate": 0.00043223599508967337, "loss": 0.538, "step": 329500 }, { "epoch": 44.40851522500674, "grad_norm": 0.15413375198841095, "learning_rate": 0.00043219856881942576, "loss": 0.5378, "step": 329600 }, { "epoch": 44.421988682295876, "grad_norm": 0.16950470209121704, "learning_rate": 0.00043216114254917816, "loss": 0.5396, "step": 329700 }, { "epoch": 44.43546213958502, "grad_norm": 0.16679498553276062, "learning_rate": 0.0004321237162789305, "loss": 0.5396, "step": 329800 }, { "epoch": 44.44893559687416, "grad_norm": 0.1534895896911621, "learning_rate": 0.0004320862900086829, "loss": 0.5397, "step": 329900 }, { "epoch": 44.462409054163295, "grad_norm": 0.16044855117797852, "learning_rate": 0.00043204886373843525, "loss": 0.5384, "step": 330000 }, { "epoch": 44.47588251145244, "grad_norm": 0.1556829810142517, "learning_rate": 0.00043201143746818765, "loss": 0.5374, "step": 330100 }, { "epoch": 44.48935596874158, "grad_norm": 0.18878360092639923, "learning_rate": 0.00043197401119794004, "loss": 0.5385, "step": 330200 }, { "epoch": 44.50282942603072, "grad_norm": 0.1639781892299652, "learning_rate": 0.00043193658492769244, "loss": 0.5394, "step": 330300 }, { "epoch": 44.51630288331986, "grad_norm": 0.15013033151626587, "learning_rate": 0.00043189915865744484, "loss": 0.538, "step": 330400 }, { "epoch": 44.529776340609, "grad_norm": 0.1707155555486679, "learning_rate": 0.00043186173238719724, "loss": 0.5392, "step": 330500 }, { "epoch": 44.54324979789814, "grad_norm": 0.18457897007465363, "learning_rate": 0.00043182430611694964, "loss": 0.538, "step": 330600 }, { "epoch": 44.55672325518728, "grad_norm": 0.18545015156269073, "learning_rate": 0.000431786879846702, "loss": 0.5391, "step": 330700 }, { "epoch": 44.570196712476424, "grad_norm": 0.16160573065280914, "learning_rate": 0.0004317494535764544, "loss": 0.5379, "step": 330800 }, { "epoch": 44.58367016976556, "grad_norm": 0.16168814897537231, "learning_rate": 0.0004317120273062068, "loss": 0.5384, "step": 330900 }, { "epoch": 44.5971436270547, "grad_norm": 0.15384730696678162, "learning_rate": 0.0004316746010359592, "loss": 0.5379, "step": 331000 }, { "epoch": 44.61061708434384, "grad_norm": 0.1534099131822586, "learning_rate": 0.0004316371747657116, "loss": 0.5385, "step": 331100 }, { "epoch": 44.62409054163298, "grad_norm": 0.16311407089233398, "learning_rate": 0.00043159974849546397, "loss": 0.5378, "step": 331200 }, { "epoch": 44.637563998922126, "grad_norm": 0.17234933376312256, "learning_rate": 0.00043156232222521637, "loss": 0.5387, "step": 331300 }, { "epoch": 44.65103745621126, "grad_norm": 0.1680169254541397, "learning_rate": 0.00043152489595496877, "loss": 0.5384, "step": 331400 }, { "epoch": 44.66451091350041, "grad_norm": 0.15343976020812988, "learning_rate": 0.0004314874696847211, "loss": 0.5375, "step": 331500 }, { "epoch": 44.677984370789545, "grad_norm": 0.15733399987220764, "learning_rate": 0.00043145004341447346, "loss": 0.5387, "step": 331600 }, { "epoch": 44.69145782807868, "grad_norm": 0.15507479012012482, "learning_rate": 0.00043141261714422586, "loss": 0.5375, "step": 331700 }, { "epoch": 44.70493128536783, "grad_norm": 0.16592217981815338, "learning_rate": 0.00043137519087397825, "loss": 0.5399, "step": 331800 }, { "epoch": 44.718404742656965, "grad_norm": 0.1641741394996643, "learning_rate": 0.00043133776460373065, "loss": 0.5383, "step": 331900 }, { "epoch": 44.73187819994611, "grad_norm": 0.1662999987602234, "learning_rate": 0.00043130033833348305, "loss": 0.5385, "step": 332000 }, { "epoch": 44.74535165723525, "grad_norm": 0.17512483894824982, "learning_rate": 0.00043126291206323545, "loss": 0.5389, "step": 332100 }, { "epoch": 44.758825114524385, "grad_norm": 0.16216881573200226, "learning_rate": 0.0004312254857929878, "loss": 0.5387, "step": 332200 }, { "epoch": 44.77229857181353, "grad_norm": 0.16430053114891052, "learning_rate": 0.0004311880595227402, "loss": 0.5391, "step": 332300 }, { "epoch": 44.78577202910267, "grad_norm": 0.15223635733127594, "learning_rate": 0.0004311506332524926, "loss": 0.5386, "step": 332400 }, { "epoch": 44.79924548639181, "grad_norm": 0.15166717767715454, "learning_rate": 0.000431113206982245, "loss": 0.5381, "step": 332500 }, { "epoch": 44.81271894368095, "grad_norm": 0.16295462846755981, "learning_rate": 0.0004310757807119974, "loss": 0.5384, "step": 332600 }, { "epoch": 44.826192400970086, "grad_norm": 0.15732258558273315, "learning_rate": 0.0004310383544417498, "loss": 0.5381, "step": 332700 }, { "epoch": 44.83966585825923, "grad_norm": 0.16663368046283722, "learning_rate": 0.0004310009281715022, "loss": 0.5377, "step": 332800 }, { "epoch": 44.85313931554837, "grad_norm": 0.15967164933681488, "learning_rate": 0.0004309635019012545, "loss": 0.5378, "step": 332900 }, { "epoch": 44.86661277283751, "grad_norm": 0.1777462214231491, "learning_rate": 0.0004309260756310069, "loss": 0.5388, "step": 333000 }, { "epoch": 44.88008623012665, "grad_norm": 0.15077507495880127, "learning_rate": 0.0004308886493607593, "loss": 0.5387, "step": 333100 }, { "epoch": 44.89355968741579, "grad_norm": 0.15538929402828217, "learning_rate": 0.0004308512230905117, "loss": 0.5382, "step": 333200 }, { "epoch": 44.90703314470493, "grad_norm": 0.15637347102165222, "learning_rate": 0.0004308137968202641, "loss": 0.5377, "step": 333300 }, { "epoch": 44.92050660199407, "grad_norm": 0.1611017882823944, "learning_rate": 0.00043077637055001646, "loss": 0.5376, "step": 333400 }, { "epoch": 44.933980059283215, "grad_norm": 0.15884527564048767, "learning_rate": 0.00043073894427976886, "loss": 0.539, "step": 333500 }, { "epoch": 44.94745351657235, "grad_norm": 0.15475323796272278, "learning_rate": 0.0004307015180095212, "loss": 0.5383, "step": 333600 }, { "epoch": 44.96092697386149, "grad_norm": 0.17703673243522644, "learning_rate": 0.0004306640917392736, "loss": 0.5376, "step": 333700 }, { "epoch": 44.974400431150634, "grad_norm": 0.16330265998840332, "learning_rate": 0.000430626665469026, "loss": 0.538, "step": 333800 }, { "epoch": 44.98787388843977, "grad_norm": 0.1732833832502365, "learning_rate": 0.0004305892391987784, "loss": 0.539, "step": 333900 }, { "epoch": 45.0, "eval_loss": 0.5268197059631348, "eval_runtime": 4.9898, "eval_samples_per_second": 1002.044, "eval_steps_per_second": 15.832, "step": 333990 }, { "epoch": 45.00134734572892, "grad_norm": 0.15244244039058685, "learning_rate": 0.0004305518129285308, "loss": 0.5373, "step": 334000 }, { "epoch": 45.014820803018054, "grad_norm": 0.1637188196182251, "learning_rate": 0.0004305143866582832, "loss": 0.5373, "step": 334100 }, { "epoch": 45.02829426030719, "grad_norm": 0.15932372212409973, "learning_rate": 0.0004304769603880356, "loss": 0.537, "step": 334200 }, { "epoch": 45.041767717596336, "grad_norm": 0.1567033976316452, "learning_rate": 0.000430439534117788, "loss": 0.5376, "step": 334300 }, { "epoch": 45.055241174885474, "grad_norm": 0.15267375111579895, "learning_rate": 0.00043040210784754034, "loss": 0.5376, "step": 334400 }, { "epoch": 45.06871463217462, "grad_norm": 0.1795230507850647, "learning_rate": 0.00043036468157729273, "loss": 0.5377, "step": 334500 }, { "epoch": 45.082188089463756, "grad_norm": 0.15106229484081268, "learning_rate": 0.00043032725530704513, "loss": 0.5372, "step": 334600 }, { "epoch": 45.09566154675289, "grad_norm": 0.1770576387643814, "learning_rate": 0.00043028982903679753, "loss": 0.5371, "step": 334700 }, { "epoch": 45.10913500404204, "grad_norm": 0.18898151814937592, "learning_rate": 0.00043025240276654993, "loss": 0.5372, "step": 334800 }, { "epoch": 45.122608461331176, "grad_norm": 0.16120700538158417, "learning_rate": 0.00043021497649630233, "loss": 0.5383, "step": 334900 }, { "epoch": 45.13608191862032, "grad_norm": 0.17458540201187134, "learning_rate": 0.0004301775502260547, "loss": 0.5378, "step": 335000 }, { "epoch": 45.14955537590946, "grad_norm": 0.1595836579799652, "learning_rate": 0.00043014012395580707, "loss": 0.5383, "step": 335100 }, { "epoch": 45.1630288331986, "grad_norm": 0.1643499881029129, "learning_rate": 0.0004301026976855594, "loss": 0.5392, "step": 335200 }, { "epoch": 45.17650229048774, "grad_norm": 0.1696358323097229, "learning_rate": 0.0004300652714153118, "loss": 0.5371, "step": 335300 }, { "epoch": 45.18997574777688, "grad_norm": 0.18551288545131683, "learning_rate": 0.0004300278451450642, "loss": 0.5377, "step": 335400 }, { "epoch": 45.20344920506602, "grad_norm": 0.16496038436889648, "learning_rate": 0.0004299904188748166, "loss": 0.5395, "step": 335500 }, { "epoch": 45.21692266235516, "grad_norm": 0.17440782487392426, "learning_rate": 0.000429952992604569, "loss": 0.5378, "step": 335600 }, { "epoch": 45.230396119644304, "grad_norm": 0.1633365899324417, "learning_rate": 0.0004299155663343214, "loss": 0.5383, "step": 335700 }, { "epoch": 45.24386957693344, "grad_norm": 0.165269672870636, "learning_rate": 0.00042987814006407375, "loss": 0.5372, "step": 335800 }, { "epoch": 45.25734303422258, "grad_norm": 0.15769940614700317, "learning_rate": 0.00042984071379382615, "loss": 0.5382, "step": 335900 }, { "epoch": 45.270816491511724, "grad_norm": 0.15825597941875458, "learning_rate": 0.00042980328752357855, "loss": 0.5382, "step": 336000 }, { "epoch": 45.28428994880086, "grad_norm": 0.1619444489479065, "learning_rate": 0.00042976586125333094, "loss": 0.5388, "step": 336100 }, { "epoch": 45.297763406090006, "grad_norm": 0.1637950837612152, "learning_rate": 0.00042972843498308334, "loss": 0.5369, "step": 336200 }, { "epoch": 45.31123686337914, "grad_norm": 0.16437767446041107, "learning_rate": 0.00042969100871283574, "loss": 0.5375, "step": 336300 }, { "epoch": 45.32471032066828, "grad_norm": 0.1543952226638794, "learning_rate": 0.00042965358244258814, "loss": 0.5372, "step": 336400 }, { "epoch": 45.338183777957425, "grad_norm": 0.17328287661075592, "learning_rate": 0.00042961615617234054, "loss": 0.5374, "step": 336500 }, { "epoch": 45.35165723524656, "grad_norm": 0.15424412488937378, "learning_rate": 0.0004295787299020929, "loss": 0.5375, "step": 336600 }, { "epoch": 45.36513069253571, "grad_norm": 0.16059134900569916, "learning_rate": 0.0004295413036318453, "loss": 0.5383, "step": 336700 }, { "epoch": 45.378604149824845, "grad_norm": 0.1812136173248291, "learning_rate": 0.0004295038773615977, "loss": 0.5377, "step": 336800 }, { "epoch": 45.39207760711398, "grad_norm": 0.16232940554618835, "learning_rate": 0.0004294664510913501, "loss": 0.5382, "step": 336900 }, { "epoch": 45.40555106440313, "grad_norm": 0.1586821973323822, "learning_rate": 0.0004294290248211024, "loss": 0.5374, "step": 337000 }, { "epoch": 45.419024521692265, "grad_norm": 0.15675102174282074, "learning_rate": 0.0004293915985508548, "loss": 0.538, "step": 337100 }, { "epoch": 45.43249797898141, "grad_norm": 0.17377416789531708, "learning_rate": 0.0004293541722806072, "loss": 0.5372, "step": 337200 }, { "epoch": 45.44597143627055, "grad_norm": 0.15278080105781555, "learning_rate": 0.00042931674601035956, "loss": 0.5369, "step": 337300 }, { "epoch": 45.459444893559684, "grad_norm": 0.16028577089309692, "learning_rate": 0.00042927931974011196, "loss": 0.5376, "step": 337400 }, { "epoch": 45.47291835084883, "grad_norm": 0.1597863733768463, "learning_rate": 0.00042924189346986436, "loss": 0.5379, "step": 337500 }, { "epoch": 45.48639180813797, "grad_norm": 0.16006192564964294, "learning_rate": 0.00042920446719961675, "loss": 0.5381, "step": 337600 }, { "epoch": 45.49986526542711, "grad_norm": 0.15167619287967682, "learning_rate": 0.00042916704092936915, "loss": 0.538, "step": 337700 }, { "epoch": 45.51333872271625, "grad_norm": 0.16569939255714417, "learning_rate": 0.00042912961465912155, "loss": 0.537, "step": 337800 }, { "epoch": 45.526812180005386, "grad_norm": 0.1636512279510498, "learning_rate": 0.00042909218838887395, "loss": 0.5377, "step": 337900 }, { "epoch": 45.54028563729453, "grad_norm": 0.17777976393699646, "learning_rate": 0.0004290547621186263, "loss": 0.5382, "step": 338000 }, { "epoch": 45.55375909458367, "grad_norm": 0.17359544336795807, "learning_rate": 0.0004290173358483787, "loss": 0.5382, "step": 338100 }, { "epoch": 45.56723255187281, "grad_norm": 0.15733125805854797, "learning_rate": 0.0004289799095781311, "loss": 0.5378, "step": 338200 }, { "epoch": 45.58070600916195, "grad_norm": 0.15963685512542725, "learning_rate": 0.0004289424833078835, "loss": 0.5376, "step": 338300 }, { "epoch": 45.59417946645109, "grad_norm": 0.1635846495628357, "learning_rate": 0.0004289050570376359, "loss": 0.5374, "step": 338400 }, { "epoch": 45.60765292374023, "grad_norm": 0.1597873866558075, "learning_rate": 0.0004288676307673883, "loss": 0.5384, "step": 338500 }, { "epoch": 45.62112638102937, "grad_norm": 0.15858018398284912, "learning_rate": 0.0004288302044971407, "loss": 0.5374, "step": 338600 }, { "epoch": 45.634599838318515, "grad_norm": 0.1615791767835617, "learning_rate": 0.0004287927782268931, "loss": 0.5382, "step": 338700 }, { "epoch": 45.64807329560765, "grad_norm": 0.16311369836330414, "learning_rate": 0.00042875535195664537, "loss": 0.5381, "step": 338800 }, { "epoch": 45.6615467528968, "grad_norm": 0.15863671898841858, "learning_rate": 0.00042871792568639777, "loss": 0.5379, "step": 338900 }, { "epoch": 45.675020210185934, "grad_norm": 0.1560491919517517, "learning_rate": 0.00042868049941615017, "loss": 0.5383, "step": 339000 }, { "epoch": 45.68849366747507, "grad_norm": 0.18407662212848663, "learning_rate": 0.00042864307314590257, "loss": 0.5375, "step": 339100 }, { "epoch": 45.70196712476422, "grad_norm": 0.17355972528457642, "learning_rate": 0.00042860564687565496, "loss": 0.5374, "step": 339200 }, { "epoch": 45.715440582053354, "grad_norm": 0.1753043383359909, "learning_rate": 0.00042856822060540736, "loss": 0.5372, "step": 339300 }, { "epoch": 45.7289140393425, "grad_norm": 0.1829231232404709, "learning_rate": 0.00042853079433515976, "loss": 0.5379, "step": 339400 }, { "epoch": 45.742387496631636, "grad_norm": 0.15833617746829987, "learning_rate": 0.0004284933680649121, "loss": 0.5382, "step": 339500 }, { "epoch": 45.755860953920774, "grad_norm": 0.16987794637680054, "learning_rate": 0.0004284559417946645, "loss": 0.538, "step": 339600 }, { "epoch": 45.76933441120992, "grad_norm": 0.1640637218952179, "learning_rate": 0.0004284185155244169, "loss": 0.5376, "step": 339700 }, { "epoch": 45.782807868499056, "grad_norm": 0.19275522232055664, "learning_rate": 0.0004283810892541693, "loss": 0.5382, "step": 339800 }, { "epoch": 45.7962813257882, "grad_norm": 0.15555047988891602, "learning_rate": 0.0004283436629839217, "loss": 0.5379, "step": 339900 }, { "epoch": 45.80975478307734, "grad_norm": 0.17588438093662262, "learning_rate": 0.0004283062367136741, "loss": 0.5387, "step": 340000 }, { "epoch": 45.823228240366475, "grad_norm": 0.15249796211719513, "learning_rate": 0.0004282688104434265, "loss": 0.5379, "step": 340100 }, { "epoch": 45.83670169765562, "grad_norm": 0.16909322142601013, "learning_rate": 0.00042823138417317884, "loss": 0.5384, "step": 340200 }, { "epoch": 45.85017515494476, "grad_norm": 0.17301827669143677, "learning_rate": 0.00042819395790293124, "loss": 0.5386, "step": 340300 }, { "epoch": 45.8636486122339, "grad_norm": 0.1647764891386032, "learning_rate": 0.00042815653163268363, "loss": 0.5376, "step": 340400 }, { "epoch": 45.87712206952304, "grad_norm": 0.18106921017169952, "learning_rate": 0.00042811910536243603, "loss": 0.5384, "step": 340500 }, { "epoch": 45.89059552681218, "grad_norm": 0.16761955618858337, "learning_rate": 0.00042808167909218843, "loss": 0.5383, "step": 340600 }, { "epoch": 45.90406898410132, "grad_norm": 0.15074320137500763, "learning_rate": 0.0004280442528219408, "loss": 0.5371, "step": 340700 }, { "epoch": 45.91754244139046, "grad_norm": 0.14524920284748077, "learning_rate": 0.00042800682655169317, "loss": 0.5365, "step": 340800 }, { "epoch": 45.931015898679604, "grad_norm": 0.16712205111980438, "learning_rate": 0.0004279694002814455, "loss": 0.5381, "step": 340900 }, { "epoch": 45.94448935596874, "grad_norm": 0.1647794544696808, "learning_rate": 0.0004279319740111979, "loss": 0.5381, "step": 341000 }, { "epoch": 45.95796281325788, "grad_norm": 0.17693766951560974, "learning_rate": 0.0004278945477409503, "loss": 0.5367, "step": 341100 }, { "epoch": 45.971436270547024, "grad_norm": 0.1597561240196228, "learning_rate": 0.0004278571214707027, "loss": 0.5379, "step": 341200 }, { "epoch": 45.98490972783616, "grad_norm": 0.16247786581516266, "learning_rate": 0.0004278196952004551, "loss": 0.5371, "step": 341300 }, { "epoch": 45.998383185125306, "grad_norm": 0.1590157300233841, "learning_rate": 0.0004277822689302075, "loss": 0.5383, "step": 341400 }, { "epoch": 46.0, "eval_loss": 0.5261401534080505, "eval_runtime": 5.0013, "eval_samples_per_second": 999.739, "eval_steps_per_second": 15.796, "step": 341412 }, { "epoch": 46.01185664241444, "grad_norm": 0.1593475639820099, "learning_rate": 0.0004277448426599599, "loss": 0.5367, "step": 341500 }, { "epoch": 46.02533009970358, "grad_norm": 0.18208596110343933, "learning_rate": 0.0004277074163897123, "loss": 0.5369, "step": 341600 }, { "epoch": 46.038803556992725, "grad_norm": 0.15779320895671844, "learning_rate": 0.00042766999011946465, "loss": 0.5371, "step": 341700 }, { "epoch": 46.05227701428186, "grad_norm": 0.14991651475429535, "learning_rate": 0.00042763256384921705, "loss": 0.5368, "step": 341800 }, { "epoch": 46.06575047157101, "grad_norm": 0.17012453079223633, "learning_rate": 0.00042759513757896945, "loss": 0.5369, "step": 341900 }, { "epoch": 46.079223928860145, "grad_norm": 0.16264134645462036, "learning_rate": 0.00042755771130872184, "loss": 0.5368, "step": 342000 }, { "epoch": 46.09269738614928, "grad_norm": 0.16654811799526215, "learning_rate": 0.00042752028503847424, "loss": 0.537, "step": 342100 }, { "epoch": 46.10617084343843, "grad_norm": 0.17677241563796997, "learning_rate": 0.00042748285876822664, "loss": 0.5374, "step": 342200 }, { "epoch": 46.119644300727565, "grad_norm": 0.16987788677215576, "learning_rate": 0.00042744543249797904, "loss": 0.5372, "step": 342300 }, { "epoch": 46.13311775801671, "grad_norm": 0.1498269885778427, "learning_rate": 0.0004274080062277314, "loss": 0.5364, "step": 342400 }, { "epoch": 46.14659121530585, "grad_norm": 0.17891579866409302, "learning_rate": 0.0004273705799574837, "loss": 0.5372, "step": 342500 }, { "epoch": 46.160064672594984, "grad_norm": 0.16171914339065552, "learning_rate": 0.0004273331536872361, "loss": 0.5363, "step": 342600 }, { "epoch": 46.17353812988413, "grad_norm": 0.1600518673658371, "learning_rate": 0.0004272957274169885, "loss": 0.5367, "step": 342700 }, { "epoch": 46.18701158717327, "grad_norm": 0.17032285034656525, "learning_rate": 0.0004272583011467409, "loss": 0.5368, "step": 342800 }, { "epoch": 46.20048504446241, "grad_norm": 0.1761009246110916, "learning_rate": 0.0004272208748764933, "loss": 0.5375, "step": 342900 }, { "epoch": 46.21395850175155, "grad_norm": 0.17303045094013214, "learning_rate": 0.0004271834486062457, "loss": 0.5376, "step": 343000 }, { "epoch": 46.22743195904069, "grad_norm": 0.15544159710407257, "learning_rate": 0.00042714602233599806, "loss": 0.5376, "step": 343100 }, { "epoch": 46.24090541632983, "grad_norm": 0.17867109179496765, "learning_rate": 0.00042710859606575046, "loss": 0.5375, "step": 343200 }, { "epoch": 46.25437887361897, "grad_norm": 0.18193060159683228, "learning_rate": 0.00042707116979550286, "loss": 0.538, "step": 343300 }, { "epoch": 46.26785233090811, "grad_norm": 0.163673996925354, "learning_rate": 0.00042703374352525526, "loss": 0.537, "step": 343400 }, { "epoch": 46.28132578819725, "grad_norm": 0.17486903071403503, "learning_rate": 0.00042699631725500765, "loss": 0.5374, "step": 343500 }, { "epoch": 46.294799245486395, "grad_norm": 0.17560534179210663, "learning_rate": 0.00042695889098476005, "loss": 0.5375, "step": 343600 }, { "epoch": 46.30827270277553, "grad_norm": 0.16669386625289917, "learning_rate": 0.00042692146471451245, "loss": 0.5379, "step": 343700 }, { "epoch": 46.32174616006467, "grad_norm": 0.15775088965892792, "learning_rate": 0.0004268840384442648, "loss": 0.5369, "step": 343800 }, { "epoch": 46.335219617353815, "grad_norm": 0.1703256368637085, "learning_rate": 0.0004268466121740172, "loss": 0.5377, "step": 343900 }, { "epoch": 46.34869307464295, "grad_norm": 0.15392915904521942, "learning_rate": 0.0004268091859037696, "loss": 0.5364, "step": 344000 }, { "epoch": 46.3621665319321, "grad_norm": 0.16020475327968597, "learning_rate": 0.000426771759633522, "loss": 0.5373, "step": 344100 }, { "epoch": 46.375639989221234, "grad_norm": 0.1775726079940796, "learning_rate": 0.0004267343333632744, "loss": 0.5381, "step": 344200 }, { "epoch": 46.38911344651037, "grad_norm": 0.15932531654834747, "learning_rate": 0.00042669690709302673, "loss": 0.5374, "step": 344300 }, { "epoch": 46.40258690379952, "grad_norm": 0.15405189990997314, "learning_rate": 0.00042665948082277913, "loss": 0.5379, "step": 344400 }, { "epoch": 46.416060361088654, "grad_norm": 0.16119639575481415, "learning_rate": 0.00042662205455253153, "loss": 0.537, "step": 344500 }, { "epoch": 46.4295338183778, "grad_norm": 0.15254344046115875, "learning_rate": 0.00042658462828228387, "loss": 0.5366, "step": 344600 }, { "epoch": 46.443007275666936, "grad_norm": 0.16073550283908844, "learning_rate": 0.00042654720201203627, "loss": 0.5375, "step": 344700 }, { "epoch": 46.456480732956074, "grad_norm": 0.17009542882442474, "learning_rate": 0.00042650977574178867, "loss": 0.538, "step": 344800 }, { "epoch": 46.46995419024522, "grad_norm": 0.1747761368751526, "learning_rate": 0.00042647234947154107, "loss": 0.5369, "step": 344900 }, { "epoch": 46.483427647534356, "grad_norm": 0.18642817437648773, "learning_rate": 0.00042643492320129346, "loss": 0.5377, "step": 345000 }, { "epoch": 46.4969011048235, "grad_norm": 0.19162848591804504, "learning_rate": 0.00042639749693104586, "loss": 0.5375, "step": 345100 }, { "epoch": 46.51037456211264, "grad_norm": 0.15512049198150635, "learning_rate": 0.00042636007066079826, "loss": 0.5383, "step": 345200 }, { "epoch": 46.523848019401775, "grad_norm": 0.15918216109275818, "learning_rate": 0.0004263226443905506, "loss": 0.537, "step": 345300 }, { "epoch": 46.53732147669092, "grad_norm": 0.17244358360767365, "learning_rate": 0.000426285218120303, "loss": 0.5379, "step": 345400 }, { "epoch": 46.55079493398006, "grad_norm": 0.1601395606994629, "learning_rate": 0.0004262477918500554, "loss": 0.5374, "step": 345500 }, { "epoch": 46.5642683912692, "grad_norm": 0.17236573994159698, "learning_rate": 0.0004262103655798078, "loss": 0.5381, "step": 345600 }, { "epoch": 46.57774184855834, "grad_norm": 0.159519761800766, "learning_rate": 0.0004261729393095602, "loss": 0.5374, "step": 345700 }, { "epoch": 46.59121530584748, "grad_norm": 0.1602565199136734, "learning_rate": 0.0004261355130393126, "loss": 0.5375, "step": 345800 }, { "epoch": 46.60468876313662, "grad_norm": 0.1700669378042221, "learning_rate": 0.000426098086769065, "loss": 0.5373, "step": 345900 }, { "epoch": 46.61816222042576, "grad_norm": 0.1608603298664093, "learning_rate": 0.00042606066049881734, "loss": 0.5373, "step": 346000 }, { "epoch": 46.631635677714904, "grad_norm": 0.14741000533103943, "learning_rate": 0.0004260232342285697, "loss": 0.5364, "step": 346100 }, { "epoch": 46.64510913500404, "grad_norm": 0.14954932034015656, "learning_rate": 0.0004259858079583221, "loss": 0.5381, "step": 346200 }, { "epoch": 46.65858259229318, "grad_norm": 0.17221461236476898, "learning_rate": 0.0004259483816880745, "loss": 0.5374, "step": 346300 }, { "epoch": 46.672056049582324, "grad_norm": 0.1694900393486023, "learning_rate": 0.0004259109554178269, "loss": 0.537, "step": 346400 }, { "epoch": 46.68552950687146, "grad_norm": 0.16158990561962128, "learning_rate": 0.0004258735291475793, "loss": 0.537, "step": 346500 }, { "epoch": 46.699002964160606, "grad_norm": 0.15102995932102203, "learning_rate": 0.0004258361028773317, "loss": 0.5368, "step": 346600 }, { "epoch": 46.71247642144974, "grad_norm": 0.15154121816158295, "learning_rate": 0.00042579867660708407, "loss": 0.5375, "step": 346700 }, { "epoch": 46.72594987873889, "grad_norm": 0.16520650684833527, "learning_rate": 0.0004257612503368364, "loss": 0.5364, "step": 346800 }, { "epoch": 46.739423336028025, "grad_norm": 0.1673058122396469, "learning_rate": 0.0004257238240665888, "loss": 0.5377, "step": 346900 }, { "epoch": 46.75289679331716, "grad_norm": 0.19808551669120789, "learning_rate": 0.0004256863977963412, "loss": 0.5365, "step": 347000 }, { "epoch": 46.76637025060631, "grad_norm": 0.15461435914039612, "learning_rate": 0.0004256489715260936, "loss": 0.5373, "step": 347100 }, { "epoch": 46.779843707895445, "grad_norm": 0.1731865108013153, "learning_rate": 0.000425611545255846, "loss": 0.5375, "step": 347200 }, { "epoch": 46.79331716518459, "grad_norm": 0.15704266726970673, "learning_rate": 0.0004255741189855984, "loss": 0.537, "step": 347300 }, { "epoch": 46.80679062247373, "grad_norm": 0.16318312287330627, "learning_rate": 0.0004255366927153508, "loss": 0.5381, "step": 347400 }, { "epoch": 46.820264079762865, "grad_norm": 0.16051694750785828, "learning_rate": 0.00042549926644510315, "loss": 0.537, "step": 347500 }, { "epoch": 46.83373753705201, "grad_norm": 0.15324744582176208, "learning_rate": 0.00042546184017485555, "loss": 0.5371, "step": 347600 }, { "epoch": 46.84721099434115, "grad_norm": 0.16667823493480682, "learning_rate": 0.00042542441390460795, "loss": 0.5377, "step": 347700 }, { "epoch": 46.86068445163029, "grad_norm": 0.15748338401317596, "learning_rate": 0.00042538698763436034, "loss": 0.538, "step": 347800 }, { "epoch": 46.87415790891943, "grad_norm": 0.16614273190498352, "learning_rate": 0.0004253495613641127, "loss": 0.5372, "step": 347900 }, { "epoch": 46.887631366208566, "grad_norm": 0.19189433753490448, "learning_rate": 0.0004253121350938651, "loss": 0.5361, "step": 348000 }, { "epoch": 46.90110482349771, "grad_norm": 0.18182863295078278, "learning_rate": 0.0004252747088236175, "loss": 0.5372, "step": 348100 }, { "epoch": 46.91457828078685, "grad_norm": 0.1412532925605774, "learning_rate": 0.00042523728255336983, "loss": 0.5381, "step": 348200 }, { "epoch": 46.92805173807599, "grad_norm": 0.1776362806558609, "learning_rate": 0.00042519985628312223, "loss": 0.5374, "step": 348300 }, { "epoch": 46.94152519536513, "grad_norm": 0.16506925225257874, "learning_rate": 0.0004251624300128746, "loss": 0.5374, "step": 348400 }, { "epoch": 46.95499865265427, "grad_norm": 0.23601064085960388, "learning_rate": 0.000425125003742627, "loss": 0.5374, "step": 348500 }, { "epoch": 46.96847210994341, "grad_norm": 0.15828123688697815, "learning_rate": 0.0004250875774723794, "loss": 0.5375, "step": 348600 }, { "epoch": 46.98194556723255, "grad_norm": 0.15853647887706757, "learning_rate": 0.0004250501512021318, "loss": 0.5361, "step": 348700 }, { "epoch": 46.995419024521695, "grad_norm": 0.16052967309951782, "learning_rate": 0.0004250127249318842, "loss": 0.5372, "step": 348800 }, { "epoch": 47.0, "eval_loss": 0.5248714089393616, "eval_runtime": 4.9776, "eval_samples_per_second": 1004.506, "eval_steps_per_second": 15.871, "step": 348834 }, { "epoch": 47.00889248181083, "grad_norm": 0.17484883964061737, "learning_rate": 0.00042497529866163656, "loss": 0.5358, "step": 348900 }, { "epoch": 47.02236593909997, "grad_norm": 0.15547503530979156, "learning_rate": 0.00042493787239138896, "loss": 0.537, "step": 349000 }, { "epoch": 47.035839396389115, "grad_norm": 0.15295138955116272, "learning_rate": 0.00042490044612114136, "loss": 0.5374, "step": 349100 }, { "epoch": 47.04931285367825, "grad_norm": 0.16044485569000244, "learning_rate": 0.00042486301985089376, "loss": 0.5355, "step": 349200 }, { "epoch": 47.0627863109674, "grad_norm": 0.17802079021930695, "learning_rate": 0.00042482559358064616, "loss": 0.537, "step": 349300 }, { "epoch": 47.076259768256534, "grad_norm": 0.18690741062164307, "learning_rate": 0.00042478816731039855, "loss": 0.5373, "step": 349400 }, { "epoch": 47.08973322554567, "grad_norm": 0.16098980605602264, "learning_rate": 0.00042475074104015095, "loss": 0.5363, "step": 349500 }, { "epoch": 47.103206682834816, "grad_norm": 0.15716516971588135, "learning_rate": 0.00042471331476990335, "loss": 0.5376, "step": 349600 }, { "epoch": 47.116680140123954, "grad_norm": 0.17561763525009155, "learning_rate": 0.00042467588849965564, "loss": 0.5372, "step": 349700 }, { "epoch": 47.1301535974131, "grad_norm": 0.16518302261829376, "learning_rate": 0.00042463846222940804, "loss": 0.5356, "step": 349800 }, { "epoch": 47.143627054702236, "grad_norm": 0.15119408071041107, "learning_rate": 0.00042460103595916044, "loss": 0.5358, "step": 349900 }, { "epoch": 47.15710051199137, "grad_norm": 0.15630410611629486, "learning_rate": 0.00042456360968891283, "loss": 0.5362, "step": 350000 }, { "epoch": 47.17057396928052, "grad_norm": 0.16125494241714478, "learning_rate": 0.00042452618341866523, "loss": 0.5364, "step": 350100 }, { "epoch": 47.184047426569656, "grad_norm": 0.1622151881456375, "learning_rate": 0.00042448875714841763, "loss": 0.5361, "step": 350200 }, { "epoch": 47.1975208838588, "grad_norm": 0.15453362464904785, "learning_rate": 0.00042445133087817003, "loss": 0.5369, "step": 350300 }, { "epoch": 47.21099434114794, "grad_norm": 0.1753547489643097, "learning_rate": 0.0004244139046079224, "loss": 0.5368, "step": 350400 }, { "epoch": 47.22446779843708, "grad_norm": 0.15847791731357574, "learning_rate": 0.00042437647833767477, "loss": 0.5374, "step": 350500 }, { "epoch": 47.23794125572622, "grad_norm": 0.18098415434360504, "learning_rate": 0.00042433905206742717, "loss": 0.5379, "step": 350600 }, { "epoch": 47.25141471301536, "grad_norm": 0.1485273391008377, "learning_rate": 0.00042430162579717957, "loss": 0.537, "step": 350700 }, { "epoch": 47.2648881703045, "grad_norm": 0.15811693668365479, "learning_rate": 0.00042426419952693197, "loss": 0.5361, "step": 350800 }, { "epoch": 47.27836162759364, "grad_norm": 0.15368446707725525, "learning_rate": 0.00042422677325668436, "loss": 0.5368, "step": 350900 }, { "epoch": 47.291835084882784, "grad_norm": 0.16380159556865692, "learning_rate": 0.00042418934698643676, "loss": 0.5365, "step": 351000 }, { "epoch": 47.30530854217192, "grad_norm": 0.15296198427677155, "learning_rate": 0.0004241519207161891, "loss": 0.537, "step": 351100 }, { "epoch": 47.31878199946106, "grad_norm": 0.16646495461463928, "learning_rate": 0.0004241144944459415, "loss": 0.5368, "step": 351200 }, { "epoch": 47.332255456750204, "grad_norm": 0.16139324009418488, "learning_rate": 0.0004240770681756939, "loss": 0.5371, "step": 351300 }, { "epoch": 47.34572891403934, "grad_norm": 0.16830706596374512, "learning_rate": 0.0004240396419054463, "loss": 0.536, "step": 351400 }, { "epoch": 47.359202371328486, "grad_norm": 0.16094139218330383, "learning_rate": 0.00042400221563519865, "loss": 0.5362, "step": 351500 }, { "epoch": 47.37267582861762, "grad_norm": 0.16942794620990753, "learning_rate": 0.00042396478936495104, "loss": 0.537, "step": 351600 }, { "epoch": 47.38614928590676, "grad_norm": 0.15839697420597076, "learning_rate": 0.00042392736309470344, "loss": 0.5371, "step": 351700 }, { "epoch": 47.399622743195906, "grad_norm": 0.1503293812274933, "learning_rate": 0.0004238899368244558, "loss": 0.5364, "step": 351800 }, { "epoch": 47.41309620048504, "grad_norm": 0.1569593846797943, "learning_rate": 0.0004238525105542082, "loss": 0.5366, "step": 351900 }, { "epoch": 47.42656965777419, "grad_norm": 0.16463486850261688, "learning_rate": 0.0004238150842839606, "loss": 0.5364, "step": 352000 }, { "epoch": 47.440043115063325, "grad_norm": 0.16806866228580475, "learning_rate": 0.000423777658013713, "loss": 0.5362, "step": 352100 }, { "epoch": 47.45351657235246, "grad_norm": 0.17485010623931885, "learning_rate": 0.0004237402317434654, "loss": 0.5381, "step": 352200 }, { "epoch": 47.46699002964161, "grad_norm": 0.16898897290229797, "learning_rate": 0.0004237028054732178, "loss": 0.5374, "step": 352300 }, { "epoch": 47.480463486930745, "grad_norm": 0.1630278378725052, "learning_rate": 0.0004236653792029702, "loss": 0.5365, "step": 352400 }, { "epoch": 47.49393694421989, "grad_norm": 0.15006786584854126, "learning_rate": 0.0004236279529327226, "loss": 0.5372, "step": 352500 }, { "epoch": 47.50741040150903, "grad_norm": 0.16214460134506226, "learning_rate": 0.0004235905266624749, "loss": 0.5368, "step": 352600 }, { "epoch": 47.520883858798165, "grad_norm": 0.18828748166561127, "learning_rate": 0.0004235531003922273, "loss": 0.536, "step": 352700 }, { "epoch": 47.53435731608731, "grad_norm": 0.1889219433069229, "learning_rate": 0.0004235156741219797, "loss": 0.5361, "step": 352800 }, { "epoch": 47.54783077337645, "grad_norm": 0.1675838977098465, "learning_rate": 0.0004234782478517321, "loss": 0.536, "step": 352900 }, { "epoch": 47.56130423066559, "grad_norm": 0.15051600337028503, "learning_rate": 0.0004234408215814845, "loss": 0.5374, "step": 353000 }, { "epoch": 47.57477768795473, "grad_norm": 0.15948353707790375, "learning_rate": 0.0004234033953112369, "loss": 0.5367, "step": 353100 }, { "epoch": 47.588251145243866, "grad_norm": 0.15156616270542145, "learning_rate": 0.0004233659690409893, "loss": 0.5374, "step": 353200 }, { "epoch": 47.60172460253301, "grad_norm": 0.15581117570400238, "learning_rate": 0.0004233285427707416, "loss": 0.5367, "step": 353300 }, { "epoch": 47.61519805982215, "grad_norm": 0.15727347135543823, "learning_rate": 0.000423291116500494, "loss": 0.5364, "step": 353400 }, { "epoch": 47.62867151711129, "grad_norm": 0.15058699250221252, "learning_rate": 0.0004232536902302464, "loss": 0.5379, "step": 353500 }, { "epoch": 47.64214497440043, "grad_norm": 0.1552381068468094, "learning_rate": 0.0004232162639599988, "loss": 0.5377, "step": 353600 }, { "epoch": 47.65561843168957, "grad_norm": 0.16762210428714752, "learning_rate": 0.0004231788376897512, "loss": 0.5369, "step": 353700 }, { "epoch": 47.66909188897871, "grad_norm": 0.1595732569694519, "learning_rate": 0.0004231414114195036, "loss": 0.5365, "step": 353800 }, { "epoch": 47.68256534626785, "grad_norm": 0.17290395498275757, "learning_rate": 0.000423103985149256, "loss": 0.5378, "step": 353900 }, { "epoch": 47.696038803556995, "grad_norm": 0.1859910637140274, "learning_rate": 0.00042306655887900833, "loss": 0.5369, "step": 354000 }, { "epoch": 47.70951226084613, "grad_norm": 0.16668701171875, "learning_rate": 0.00042302913260876073, "loss": 0.5367, "step": 354100 }, { "epoch": 47.72298571813528, "grad_norm": 0.1692536622285843, "learning_rate": 0.0004229917063385131, "loss": 0.5374, "step": 354200 }, { "epoch": 47.736459175424415, "grad_norm": 0.17567028105258942, "learning_rate": 0.0004229542800682655, "loss": 0.5369, "step": 354300 }, { "epoch": 47.74993263271355, "grad_norm": 0.17075923085212708, "learning_rate": 0.0004229168537980179, "loss": 0.5372, "step": 354400 }, { "epoch": 47.7634060900027, "grad_norm": 0.16088275611400604, "learning_rate": 0.0004228794275277703, "loss": 0.5364, "step": 354500 }, { "epoch": 47.776879547291834, "grad_norm": 0.15426446497440338, "learning_rate": 0.0004228420012575227, "loss": 0.5362, "step": 354600 }, { "epoch": 47.79035300458098, "grad_norm": 0.16322071850299835, "learning_rate": 0.0004228045749872751, "loss": 0.5369, "step": 354700 }, { "epoch": 47.803826461870116, "grad_norm": 0.1678885817527771, "learning_rate": 0.00042276714871702746, "loss": 0.5374, "step": 354800 }, { "epoch": 47.817299919159254, "grad_norm": 0.16707101464271545, "learning_rate": 0.00042272972244677986, "loss": 0.5365, "step": 354900 }, { "epoch": 47.8307733764484, "grad_norm": 0.15838731825351715, "learning_rate": 0.00042269229617653226, "loss": 0.5377, "step": 355000 }, { "epoch": 47.844246833737536, "grad_norm": 0.1503244787454605, "learning_rate": 0.00042265486990628466, "loss": 0.5362, "step": 355100 }, { "epoch": 47.85772029102668, "grad_norm": 0.16906489431858063, "learning_rate": 0.000422617443636037, "loss": 0.5367, "step": 355200 }, { "epoch": 47.87119374831582, "grad_norm": 0.17383015155792236, "learning_rate": 0.0004225800173657894, "loss": 0.5359, "step": 355300 }, { "epoch": 47.884667205604956, "grad_norm": 0.15414012968540192, "learning_rate": 0.0004225425910955418, "loss": 0.5366, "step": 355400 }, { "epoch": 47.8981406628941, "grad_norm": 0.1595008224248886, "learning_rate": 0.00042250516482529414, "loss": 0.5366, "step": 355500 }, { "epoch": 47.91161412018324, "grad_norm": 0.15845203399658203, "learning_rate": 0.00042246773855504654, "loss": 0.5364, "step": 355600 }, { "epoch": 47.92508757747238, "grad_norm": 0.15601105988025665, "learning_rate": 0.00042243031228479894, "loss": 0.537, "step": 355700 }, { "epoch": 47.93856103476152, "grad_norm": 0.16053351759910583, "learning_rate": 0.00042239288601455134, "loss": 0.5357, "step": 355800 }, { "epoch": 47.95203449205066, "grad_norm": 0.16165316104888916, "learning_rate": 0.00042235545974430373, "loss": 0.5366, "step": 355900 }, { "epoch": 47.9655079493398, "grad_norm": 0.16783812642097473, "learning_rate": 0.00042231803347405613, "loss": 0.5369, "step": 356000 }, { "epoch": 47.97898140662894, "grad_norm": 0.16281366348266602, "learning_rate": 0.00042228060720380853, "loss": 0.5377, "step": 356100 }, { "epoch": 47.992454863918084, "grad_norm": 0.156035378575325, "learning_rate": 0.0004222431809335609, "loss": 0.5367, "step": 356200 }, { "epoch": 48.0, "eval_loss": 0.5244420170783997, "eval_runtime": 4.9528, "eval_samples_per_second": 1009.537, "eval_steps_per_second": 15.951, "step": 356256 }, { "epoch": 48.00592832120722, "grad_norm": 0.16214235126972198, "learning_rate": 0.00042220575466331327, "loss": 0.5355, "step": 356300 }, { "epoch": 48.01940177849636, "grad_norm": 0.15858575701713562, "learning_rate": 0.00042216832839306567, "loss": 0.5368, "step": 356400 }, { "epoch": 48.032875235785504, "grad_norm": 0.17719505727291107, "learning_rate": 0.00042213090212281807, "loss": 0.5371, "step": 356500 }, { "epoch": 48.04634869307464, "grad_norm": 0.16827790439128876, "learning_rate": 0.00042209347585257047, "loss": 0.5359, "step": 356600 }, { "epoch": 48.059822150363786, "grad_norm": 0.16420228779315948, "learning_rate": 0.00042205604958232287, "loss": 0.5368, "step": 356700 }, { "epoch": 48.07329560765292, "grad_norm": 0.15509283542633057, "learning_rate": 0.00042201862331207526, "loss": 0.5349, "step": 356800 }, { "epoch": 48.08676906494206, "grad_norm": 0.1571352630853653, "learning_rate": 0.00042198119704182766, "loss": 0.5359, "step": 356900 }, { "epoch": 48.100242522231206, "grad_norm": 0.1631690412759781, "learning_rate": 0.00042194377077157995, "loss": 0.5366, "step": 357000 }, { "epoch": 48.11371597952034, "grad_norm": 0.15699271857738495, "learning_rate": 0.00042190634450133235, "loss": 0.5369, "step": 357100 }, { "epoch": 48.12718943680949, "grad_norm": 0.1637302041053772, "learning_rate": 0.00042186891823108475, "loss": 0.5362, "step": 357200 }, { "epoch": 48.140662894098625, "grad_norm": 0.17911136150360107, "learning_rate": 0.00042183149196083715, "loss": 0.5363, "step": 357300 }, { "epoch": 48.15413635138776, "grad_norm": 0.16008812189102173, "learning_rate": 0.00042179406569058954, "loss": 0.5367, "step": 357400 }, { "epoch": 48.16760980867691, "grad_norm": 0.1619536280632019, "learning_rate": 0.00042175663942034194, "loss": 0.5362, "step": 357500 }, { "epoch": 48.181083265966045, "grad_norm": 0.15590330958366394, "learning_rate": 0.00042171921315009434, "loss": 0.5366, "step": 357600 }, { "epoch": 48.19455672325519, "grad_norm": 0.16918116807937622, "learning_rate": 0.0004216817868798467, "loss": 0.536, "step": 357700 }, { "epoch": 48.20803018054433, "grad_norm": 0.173081174492836, "learning_rate": 0.0004216443606095991, "loss": 0.5358, "step": 357800 }, { "epoch": 48.22150363783347, "grad_norm": 0.16209612786769867, "learning_rate": 0.0004216069343393515, "loss": 0.5361, "step": 357900 }, { "epoch": 48.23497709512261, "grad_norm": 0.16327553987503052, "learning_rate": 0.0004215695080691039, "loss": 0.5371, "step": 358000 }, { "epoch": 48.24845055241175, "grad_norm": 0.15475064516067505, "learning_rate": 0.0004215320817988563, "loss": 0.536, "step": 358100 }, { "epoch": 48.26192400970089, "grad_norm": 0.15227502584457397, "learning_rate": 0.0004214946555286087, "loss": 0.5366, "step": 358200 }, { "epoch": 48.27539746699003, "grad_norm": 0.16706374287605286, "learning_rate": 0.0004214572292583611, "loss": 0.5371, "step": 358300 }, { "epoch": 48.28887092427917, "grad_norm": 0.15700851380825043, "learning_rate": 0.0004214198029881134, "loss": 0.5361, "step": 358400 }, { "epoch": 48.30234438156831, "grad_norm": 0.16699005663394928, "learning_rate": 0.0004213823767178658, "loss": 0.5369, "step": 358500 }, { "epoch": 48.31581783885745, "grad_norm": 0.1575392633676529, "learning_rate": 0.0004213449504476182, "loss": 0.5359, "step": 358600 }, { "epoch": 48.32929129614659, "grad_norm": 0.1654680222272873, "learning_rate": 0.0004213075241773706, "loss": 0.5369, "step": 358700 }, { "epoch": 48.34276475343573, "grad_norm": 0.15651962161064148, "learning_rate": 0.00042127009790712296, "loss": 0.5359, "step": 358800 }, { "epoch": 48.356238210724875, "grad_norm": 0.1688709259033203, "learning_rate": 0.00042123267163687536, "loss": 0.5359, "step": 358900 }, { "epoch": 48.36971166801401, "grad_norm": 0.15028266608715057, "learning_rate": 0.00042119524536662775, "loss": 0.5373, "step": 359000 }, { "epoch": 48.38318512530315, "grad_norm": 0.16191382706165314, "learning_rate": 0.0004211578190963801, "loss": 0.5359, "step": 359100 }, { "epoch": 48.396658582592295, "grad_norm": 0.1775227189064026, "learning_rate": 0.0004211203928261325, "loss": 0.5366, "step": 359200 }, { "epoch": 48.41013203988143, "grad_norm": 0.16699646413326263, "learning_rate": 0.0004210829665558849, "loss": 0.5372, "step": 359300 }, { "epoch": 48.42360549717058, "grad_norm": 0.19643212854862213, "learning_rate": 0.0004210455402856373, "loss": 0.5361, "step": 359400 }, { "epoch": 48.437078954459714, "grad_norm": 0.16181132197380066, "learning_rate": 0.0004210081140153897, "loss": 0.536, "step": 359500 }, { "epoch": 48.45055241174885, "grad_norm": 0.1689271479845047, "learning_rate": 0.0004209706877451421, "loss": 0.5365, "step": 359600 }, { "epoch": 48.464025869038, "grad_norm": 0.16232840716838837, "learning_rate": 0.0004209332614748945, "loss": 0.5358, "step": 359700 }, { "epoch": 48.477499326327134, "grad_norm": 0.15785439312458038, "learning_rate": 0.0004208958352046469, "loss": 0.5364, "step": 359800 }, { "epoch": 48.49097278361628, "grad_norm": 0.16743524372577667, "learning_rate": 0.00042085840893439923, "loss": 0.5364, "step": 359900 }, { "epoch": 48.504446240905416, "grad_norm": 0.14939960837364197, "learning_rate": 0.00042082098266415163, "loss": 0.5367, "step": 360000 }, { "epoch": 48.517919698194554, "grad_norm": 0.1541537195444107, "learning_rate": 0.000420783556393904, "loss": 0.5368, "step": 360100 }, { "epoch": 48.5313931554837, "grad_norm": 0.15638019144535065, "learning_rate": 0.0004207461301236564, "loss": 0.5364, "step": 360200 }, { "epoch": 48.544866612772836, "grad_norm": 0.16069968044757843, "learning_rate": 0.0004207087038534088, "loss": 0.5373, "step": 360300 }, { "epoch": 48.55834007006198, "grad_norm": 0.1550784856081009, "learning_rate": 0.0004206712775831612, "loss": 0.5357, "step": 360400 }, { "epoch": 48.57181352735112, "grad_norm": 0.18612807989120483, "learning_rate": 0.0004206338513129136, "loss": 0.5366, "step": 360500 }, { "epoch": 48.585286984640256, "grad_norm": 0.16980792582035065, "learning_rate": 0.0004205964250426659, "loss": 0.536, "step": 360600 }, { "epoch": 48.5987604419294, "grad_norm": 0.1521618813276291, "learning_rate": 0.0004205589987724183, "loss": 0.536, "step": 360700 }, { "epoch": 48.61223389921854, "grad_norm": 0.15422846376895905, "learning_rate": 0.0004205215725021707, "loss": 0.5352, "step": 360800 }, { "epoch": 48.62570735650768, "grad_norm": 0.1603870987892151, "learning_rate": 0.0004204841462319231, "loss": 0.5361, "step": 360900 }, { "epoch": 48.63918081379682, "grad_norm": 0.16706407070159912, "learning_rate": 0.0004204467199616755, "loss": 0.5358, "step": 361000 }, { "epoch": 48.65265427108596, "grad_norm": 0.17526578903198242, "learning_rate": 0.0004204092936914279, "loss": 0.5365, "step": 361100 }, { "epoch": 48.6661277283751, "grad_norm": 0.15904855728149414, "learning_rate": 0.0004203718674211803, "loss": 0.5355, "step": 361200 }, { "epoch": 48.67960118566424, "grad_norm": 0.1558976024389267, "learning_rate": 0.00042033444115093264, "loss": 0.5368, "step": 361300 }, { "epoch": 48.693074642953384, "grad_norm": 0.15806174278259277, "learning_rate": 0.00042029701488068504, "loss": 0.5358, "step": 361400 }, { "epoch": 48.70654810024252, "grad_norm": 0.14656326174736023, "learning_rate": 0.00042025958861043744, "loss": 0.5363, "step": 361500 }, { "epoch": 48.720021557531666, "grad_norm": 0.16138949990272522, "learning_rate": 0.00042022216234018984, "loss": 0.5366, "step": 361600 }, { "epoch": 48.733495014820804, "grad_norm": 0.1528913825750351, "learning_rate": 0.00042018473606994224, "loss": 0.5362, "step": 361700 }, { "epoch": 48.74696847210994, "grad_norm": 0.15490442514419556, "learning_rate": 0.00042014730979969463, "loss": 0.5361, "step": 361800 }, { "epoch": 48.760441929399086, "grad_norm": 0.1772974282503128, "learning_rate": 0.00042010988352944703, "loss": 0.5371, "step": 361900 }, { "epoch": 48.77391538668822, "grad_norm": 0.14545170962810516, "learning_rate": 0.00042007245725919943, "loss": 0.5363, "step": 362000 }, { "epoch": 48.78738884397737, "grad_norm": 0.18924154341220856, "learning_rate": 0.0004200350309889518, "loss": 0.5354, "step": 362100 }, { "epoch": 48.800862301266505, "grad_norm": 0.15657034516334534, "learning_rate": 0.00041999760471870417, "loss": 0.5359, "step": 362200 }, { "epoch": 48.81433575855564, "grad_norm": 0.15621767938137054, "learning_rate": 0.00041996017844845657, "loss": 0.5357, "step": 362300 }, { "epoch": 48.82780921584479, "grad_norm": 0.16632044315338135, "learning_rate": 0.0004199227521782089, "loss": 0.5365, "step": 362400 }, { "epoch": 48.841282673133925, "grad_norm": 0.16137391328811646, "learning_rate": 0.0004198853259079613, "loss": 0.536, "step": 362500 }, { "epoch": 48.85475613042307, "grad_norm": 0.16748879849910736, "learning_rate": 0.0004198478996377137, "loss": 0.5346, "step": 362600 }, { "epoch": 48.86822958771221, "grad_norm": 0.16599422693252563, "learning_rate": 0.0004198104733674661, "loss": 0.5362, "step": 362700 }, { "epoch": 48.881703045001345, "grad_norm": 0.17336969077587128, "learning_rate": 0.00041977304709721845, "loss": 0.5365, "step": 362800 }, { "epoch": 48.89517650229049, "grad_norm": 0.1553795039653778, "learning_rate": 0.00041973562082697085, "loss": 0.5366, "step": 362900 }, { "epoch": 48.90864995957963, "grad_norm": 0.17294219136238098, "learning_rate": 0.00041969819455672325, "loss": 0.5366, "step": 363000 }, { "epoch": 48.92212341686877, "grad_norm": 0.15678703784942627, "learning_rate": 0.00041966076828647565, "loss": 0.5361, "step": 363100 }, { "epoch": 48.93559687415791, "grad_norm": 0.16657818853855133, "learning_rate": 0.00041962334201622805, "loss": 0.5359, "step": 363200 }, { "epoch": 48.94907033144705, "grad_norm": 0.1582270860671997, "learning_rate": 0.00041958591574598044, "loss": 0.5362, "step": 363300 }, { "epoch": 48.96254378873619, "grad_norm": 0.16243626177310944, "learning_rate": 0.00041954848947573284, "loss": 0.5366, "step": 363400 }, { "epoch": 48.97601724602533, "grad_norm": 0.15985989570617676, "learning_rate": 0.0004195110632054852, "loss": 0.5363, "step": 363500 }, { "epoch": 48.98949070331447, "grad_norm": 0.15214091539382935, "learning_rate": 0.0004194736369352376, "loss": 0.5368, "step": 363600 }, { "epoch": 49.0, "eval_loss": 0.5240633487701416, "eval_runtime": 4.9664, "eval_samples_per_second": 1006.763, "eval_steps_per_second": 15.907, "step": 363678 }, { "epoch": 49.00296416060361, "grad_norm": 0.15980744361877441, "learning_rate": 0.00041943621066499, "loss": 0.5356, "step": 363700 }, { "epoch": 49.01643761789275, "grad_norm": 0.16467812657356262, "learning_rate": 0.0004193987843947424, "loss": 0.5355, "step": 363800 }, { "epoch": 49.02991107518189, "grad_norm": 0.1715898960828781, "learning_rate": 0.0004193613581244948, "loss": 0.535, "step": 363900 }, { "epoch": 49.04338453247103, "grad_norm": 0.16534188389778137, "learning_rate": 0.0004193239318542472, "loss": 0.5361, "step": 364000 }, { "epoch": 49.056857989760175, "grad_norm": 0.1649378389120102, "learning_rate": 0.0004192865055839996, "loss": 0.5358, "step": 364100 }, { "epoch": 49.07033144704931, "grad_norm": 0.16485662758350372, "learning_rate": 0.00041924907931375187, "loss": 0.5362, "step": 364200 }, { "epoch": 49.08380490433845, "grad_norm": 0.16828694939613342, "learning_rate": 0.00041921165304350426, "loss": 0.5369, "step": 364300 }, { "epoch": 49.097278361627595, "grad_norm": 0.16905084252357483, "learning_rate": 0.00041917422677325666, "loss": 0.5349, "step": 364400 }, { "epoch": 49.11075181891673, "grad_norm": 0.16231870651245117, "learning_rate": 0.00041913680050300906, "loss": 0.5352, "step": 364500 }, { "epoch": 49.12422527620588, "grad_norm": 0.15930427610874176, "learning_rate": 0.00041909937423276146, "loss": 0.5347, "step": 364600 }, { "epoch": 49.137698733495014, "grad_norm": 0.15655289590358734, "learning_rate": 0.00041906194796251386, "loss": 0.5355, "step": 364700 }, { "epoch": 49.15117219078415, "grad_norm": 0.16439348459243774, "learning_rate": 0.00041902452169226626, "loss": 0.536, "step": 364800 }, { "epoch": 49.1646456480733, "grad_norm": 0.15159694850444794, "learning_rate": 0.00041898709542201865, "loss": 0.5364, "step": 364900 }, { "epoch": 49.178119105362434, "grad_norm": 0.17599663138389587, "learning_rate": 0.000418949669151771, "loss": 0.5358, "step": 365000 }, { "epoch": 49.19159256265158, "grad_norm": 0.16158431768417358, "learning_rate": 0.0004189122428815234, "loss": 0.5361, "step": 365100 }, { "epoch": 49.205066019940716, "grad_norm": 0.170488640666008, "learning_rate": 0.0004188748166112758, "loss": 0.5349, "step": 365200 }, { "epoch": 49.218539477229854, "grad_norm": 0.166050985455513, "learning_rate": 0.0004188373903410282, "loss": 0.5362, "step": 365300 }, { "epoch": 49.232012934519, "grad_norm": 0.18600329756736755, "learning_rate": 0.0004187999640707806, "loss": 0.5358, "step": 365400 }, { "epoch": 49.245486391808136, "grad_norm": 0.16138635575771332, "learning_rate": 0.000418762537800533, "loss": 0.5368, "step": 365500 }, { "epoch": 49.25895984909728, "grad_norm": 0.16342195868492126, "learning_rate": 0.0004187251115302854, "loss": 0.536, "step": 365600 }, { "epoch": 49.27243330638642, "grad_norm": 0.15159574151039124, "learning_rate": 0.00041868768526003773, "loss": 0.5354, "step": 365700 }, { "epoch": 49.28590676367556, "grad_norm": 0.16041214764118195, "learning_rate": 0.00041865025898979013, "loss": 0.5356, "step": 365800 }, { "epoch": 49.2993802209647, "grad_norm": 0.15392962098121643, "learning_rate": 0.00041861283271954253, "loss": 0.5356, "step": 365900 }, { "epoch": 49.31285367825384, "grad_norm": 0.16304752230644226, "learning_rate": 0.00041857540644929487, "loss": 0.5365, "step": 366000 }, { "epoch": 49.32632713554298, "grad_norm": 0.15528923273086548, "learning_rate": 0.00041853798017904727, "loss": 0.5364, "step": 366100 }, { "epoch": 49.33980059283212, "grad_norm": 0.16136358678340912, "learning_rate": 0.00041850055390879967, "loss": 0.535, "step": 366200 }, { "epoch": 49.353274050121264, "grad_norm": 0.1681663542985916, "learning_rate": 0.00041846312763855207, "loss": 0.5352, "step": 366300 }, { "epoch": 49.3667475074104, "grad_norm": 0.1617625206708908, "learning_rate": 0.0004184257013683044, "loss": 0.536, "step": 366400 }, { "epoch": 49.38022096469954, "grad_norm": 0.16360796988010406, "learning_rate": 0.0004183882750980568, "loss": 0.5367, "step": 366500 }, { "epoch": 49.393694421988684, "grad_norm": 0.157668337225914, "learning_rate": 0.0004183508488278092, "loss": 0.5369, "step": 366600 }, { "epoch": 49.40716787927782, "grad_norm": 0.16497674584388733, "learning_rate": 0.0004183134225575616, "loss": 0.5368, "step": 366700 }, { "epoch": 49.420641336566966, "grad_norm": 0.1665722280740738, "learning_rate": 0.000418275996287314, "loss": 0.536, "step": 366800 }, { "epoch": 49.434114793856104, "grad_norm": 0.15928126871585846, "learning_rate": 0.0004182385700170664, "loss": 0.536, "step": 366900 }, { "epoch": 49.44758825114524, "grad_norm": 0.16610145568847656, "learning_rate": 0.0004182011437468188, "loss": 0.5365, "step": 367000 }, { "epoch": 49.461061708434386, "grad_norm": 0.17806021869182587, "learning_rate": 0.00041816371747657114, "loss": 0.5365, "step": 367100 }, { "epoch": 49.47453516572352, "grad_norm": 0.16989752650260925, "learning_rate": 0.00041812629120632354, "loss": 0.5361, "step": 367200 }, { "epoch": 49.48800862301267, "grad_norm": 0.16169023513793945, "learning_rate": 0.00041808886493607594, "loss": 0.5347, "step": 367300 }, { "epoch": 49.501482080301805, "grad_norm": 0.15584781765937805, "learning_rate": 0.00041805143866582834, "loss": 0.5372, "step": 367400 }, { "epoch": 49.51495553759094, "grad_norm": 0.1483347862958908, "learning_rate": 0.00041801401239558074, "loss": 0.5353, "step": 367500 }, { "epoch": 49.52842899488009, "grad_norm": 0.1665424406528473, "learning_rate": 0.00041797658612533313, "loss": 0.5355, "step": 367600 }, { "epoch": 49.541902452169225, "grad_norm": 0.1594177633523941, "learning_rate": 0.00041793915985508553, "loss": 0.5364, "step": 367700 }, { "epoch": 49.55537590945837, "grad_norm": 0.15634430944919586, "learning_rate": 0.0004179017335848379, "loss": 0.5359, "step": 367800 }, { "epoch": 49.56884936674751, "grad_norm": 0.16097307205200195, "learning_rate": 0.0004178643073145902, "loss": 0.5362, "step": 367900 }, { "epoch": 49.582322824036645, "grad_norm": 0.17136341333389282, "learning_rate": 0.0004178268810443426, "loss": 0.5356, "step": 368000 }, { "epoch": 49.59579628132579, "grad_norm": 0.15449944138526917, "learning_rate": 0.000417789454774095, "loss": 0.5349, "step": 368100 }, { "epoch": 49.60926973861493, "grad_norm": 0.1693868339061737, "learning_rate": 0.0004177520285038474, "loss": 0.5365, "step": 368200 }, { "epoch": 49.62274319590407, "grad_norm": 0.15654122829437256, "learning_rate": 0.0004177146022335998, "loss": 0.5363, "step": 368300 }, { "epoch": 49.63621665319321, "grad_norm": 0.16025054454803467, "learning_rate": 0.0004176771759633522, "loss": 0.5364, "step": 368400 }, { "epoch": 49.64969011048235, "grad_norm": 0.16055132448673248, "learning_rate": 0.0004176397496931046, "loss": 0.5358, "step": 368500 }, { "epoch": 49.66316356777149, "grad_norm": 0.16103863716125488, "learning_rate": 0.00041760232342285695, "loss": 0.5357, "step": 368600 }, { "epoch": 49.67663702506063, "grad_norm": 0.1504707932472229, "learning_rate": 0.00041756489715260935, "loss": 0.5355, "step": 368700 }, { "epoch": 49.69011048234977, "grad_norm": 0.1715795248746872, "learning_rate": 0.00041752747088236175, "loss": 0.5358, "step": 368800 }, { "epoch": 49.70358393963891, "grad_norm": 0.15580907464027405, "learning_rate": 0.00041749004461211415, "loss": 0.5361, "step": 368900 }, { "epoch": 49.71705739692805, "grad_norm": 0.15264181792736053, "learning_rate": 0.00041745261834186655, "loss": 0.5353, "step": 369000 }, { "epoch": 49.73053085421719, "grad_norm": 0.1585473120212555, "learning_rate": 0.00041741519207161895, "loss": 0.5353, "step": 369100 }, { "epoch": 49.74400431150633, "grad_norm": 0.15682289004325867, "learning_rate": 0.00041737776580137134, "loss": 0.5366, "step": 369200 }, { "epoch": 49.757477768795475, "grad_norm": 0.17305384576320648, "learning_rate": 0.0004173403395311237, "loss": 0.5353, "step": 369300 }, { "epoch": 49.77095122608461, "grad_norm": 0.159352108836174, "learning_rate": 0.0004173029132608761, "loss": 0.535, "step": 369400 }, { "epoch": 49.78442468337376, "grad_norm": 0.17483201622962952, "learning_rate": 0.0004172654869906285, "loss": 0.536, "step": 369500 }, { "epoch": 49.797898140662895, "grad_norm": 0.155501127243042, "learning_rate": 0.00041722806072038083, "loss": 0.5357, "step": 369600 }, { "epoch": 49.81137159795203, "grad_norm": 0.16870315372943878, "learning_rate": 0.0004171906344501332, "loss": 0.5359, "step": 369700 }, { "epoch": 49.82484505524118, "grad_norm": 0.16058865189552307, "learning_rate": 0.0004171532081798856, "loss": 0.5353, "step": 369800 }, { "epoch": 49.838318512530314, "grad_norm": 0.15703275799751282, "learning_rate": 0.000417115781909638, "loss": 0.5348, "step": 369900 }, { "epoch": 49.85179196981946, "grad_norm": 0.15190359950065613, "learning_rate": 0.0004170783556393904, "loss": 0.5361, "step": 370000 }, { "epoch": 49.865265427108596, "grad_norm": 0.17077045142650604, "learning_rate": 0.00041704092936914277, "loss": 0.5357, "step": 370100 }, { "epoch": 49.878738884397734, "grad_norm": 0.1612505167722702, "learning_rate": 0.00041700350309889516, "loss": 0.5357, "step": 370200 }, { "epoch": 49.89221234168688, "grad_norm": 0.16581054031848907, "learning_rate": 0.00041696607682864756, "loss": 0.5363, "step": 370300 }, { "epoch": 49.905685798976016, "grad_norm": 0.16173523664474487, "learning_rate": 0.00041692865055839996, "loss": 0.5361, "step": 370400 }, { "epoch": 49.91915925626516, "grad_norm": 0.17643627524375916, "learning_rate": 0.00041689122428815236, "loss": 0.5352, "step": 370500 }, { "epoch": 49.9326327135543, "grad_norm": 0.17759360373020172, "learning_rate": 0.00041685379801790476, "loss": 0.5357, "step": 370600 }, { "epoch": 49.946106170843436, "grad_norm": 0.16844573616981506, "learning_rate": 0.00041681637174765715, "loss": 0.5353, "step": 370700 }, { "epoch": 49.95957962813258, "grad_norm": 0.1655067652463913, "learning_rate": 0.0004167789454774095, "loss": 0.5367, "step": 370800 }, { "epoch": 49.97305308542172, "grad_norm": 0.16235190629959106, "learning_rate": 0.0004167415192071619, "loss": 0.5357, "step": 370900 }, { "epoch": 49.98652654271086, "grad_norm": 0.1576276272535324, "learning_rate": 0.0004167040929369143, "loss": 0.5357, "step": 371000 }, { "epoch": 50.0, "grad_norm": 0.18001601099967957, "learning_rate": 0.0004166666666666667, "loss": 0.5358, "step": 371100 }, { "epoch": 50.0, "eval_loss": 0.5237890481948853, "eval_runtime": 4.9605, "eval_samples_per_second": 1007.972, "eval_steps_per_second": 15.926, "step": 371100 }, { "epoch": 50.01347345728914, "grad_norm": 0.15638819336891174, "learning_rate": 0.0004166292403964191, "loss": 0.535, "step": 371200 }, { "epoch": 50.02694691457828, "grad_norm": 0.15592530369758606, "learning_rate": 0.0004165918141261715, "loss": 0.5342, "step": 371300 }, { "epoch": 50.04042037186742, "grad_norm": 0.18377090990543365, "learning_rate": 0.0004165543878559239, "loss": 0.5341, "step": 371400 }, { "epoch": 50.053893829156564, "grad_norm": 0.15710438787937164, "learning_rate": 0.0004165169615856762, "loss": 0.5351, "step": 371500 }, { "epoch": 50.0673672864457, "grad_norm": 0.15581411123275757, "learning_rate": 0.0004164795353154286, "loss": 0.5357, "step": 371600 }, { "epoch": 50.08084074373484, "grad_norm": 0.1520436555147171, "learning_rate": 0.000416442109045181, "loss": 0.5357, "step": 371700 }, { "epoch": 50.094314201023984, "grad_norm": 0.15704195201396942, "learning_rate": 0.00041640468277493337, "loss": 0.5357, "step": 371800 }, { "epoch": 50.10778765831312, "grad_norm": 0.15510277450084686, "learning_rate": 0.00041636725650468577, "loss": 0.5345, "step": 371900 }, { "epoch": 50.121261115602266, "grad_norm": 0.15302430093288422, "learning_rate": 0.00041632983023443817, "loss": 0.5351, "step": 372000 }, { "epoch": 50.134734572891404, "grad_norm": 0.16218456625938416, "learning_rate": 0.00041629240396419057, "loss": 0.5354, "step": 372100 }, { "epoch": 50.14820803018054, "grad_norm": 0.17044590413570404, "learning_rate": 0.0004162549776939429, "loss": 0.5351, "step": 372200 }, { "epoch": 50.161681487469686, "grad_norm": 0.15831322968006134, "learning_rate": 0.0004162175514236953, "loss": 0.5357, "step": 372300 }, { "epoch": 50.17515494475882, "grad_norm": 0.17963460087776184, "learning_rate": 0.0004161801251534477, "loss": 0.5354, "step": 372400 }, { "epoch": 50.18862840204797, "grad_norm": 0.17376472055912018, "learning_rate": 0.0004161426988832001, "loss": 0.5349, "step": 372500 }, { "epoch": 50.202101859337105, "grad_norm": 0.1730252504348755, "learning_rate": 0.0004161052726129525, "loss": 0.5346, "step": 372600 }, { "epoch": 50.21557531662624, "grad_norm": 0.17204579710960388, "learning_rate": 0.0004160678463427049, "loss": 0.5347, "step": 372700 }, { "epoch": 50.22904877391539, "grad_norm": 0.163859024643898, "learning_rate": 0.0004160304200724573, "loss": 0.5362, "step": 372800 }, { "epoch": 50.242522231204525, "grad_norm": 0.17565537989139557, "learning_rate": 0.0004159929938022097, "loss": 0.536, "step": 372900 }, { "epoch": 50.25599568849367, "grad_norm": 0.1506534367799759, "learning_rate": 0.00041595556753196204, "loss": 0.5352, "step": 373000 }, { "epoch": 50.26946914578281, "grad_norm": 0.17567186057567596, "learning_rate": 0.00041591814126171444, "loss": 0.5353, "step": 373100 }, { "epoch": 50.28294260307195, "grad_norm": 0.1640816479921341, "learning_rate": 0.00041588071499146684, "loss": 0.535, "step": 373200 }, { "epoch": 50.29641606036109, "grad_norm": 0.186070054769516, "learning_rate": 0.0004158432887212192, "loss": 0.5358, "step": 373300 }, { "epoch": 50.30988951765023, "grad_norm": 0.17690810561180115, "learning_rate": 0.0004158058624509716, "loss": 0.5364, "step": 373400 }, { "epoch": 50.32336297493937, "grad_norm": 0.1560261845588684, "learning_rate": 0.000415768436180724, "loss": 0.5356, "step": 373500 }, { "epoch": 50.33683643222851, "grad_norm": 0.15926818549633026, "learning_rate": 0.0004157310099104764, "loss": 0.5354, "step": 373600 }, { "epoch": 50.35030988951765, "grad_norm": 0.1566755324602127, "learning_rate": 0.0004156935836402287, "loss": 0.5359, "step": 373700 }, { "epoch": 50.36378334680679, "grad_norm": 0.16231411695480347, "learning_rate": 0.0004156561573699811, "loss": 0.5359, "step": 373800 }, { "epoch": 50.37725680409593, "grad_norm": 0.16948749125003815, "learning_rate": 0.0004156187310997335, "loss": 0.5353, "step": 373900 }, { "epoch": 50.39073026138507, "grad_norm": 0.15407058596611023, "learning_rate": 0.0004155813048294859, "loss": 0.5362, "step": 374000 }, { "epoch": 50.40420371867421, "grad_norm": 0.16842612624168396, "learning_rate": 0.0004155438785592383, "loss": 0.5352, "step": 374100 }, { "epoch": 50.417677175963355, "grad_norm": 0.1568542718887329, "learning_rate": 0.0004155064522889907, "loss": 0.5359, "step": 374200 }, { "epoch": 50.43115063325249, "grad_norm": 0.16860583424568176, "learning_rate": 0.0004154690260187431, "loss": 0.5353, "step": 374300 }, { "epoch": 50.44462409054163, "grad_norm": 0.16031858325004578, "learning_rate": 0.00041543159974849546, "loss": 0.5352, "step": 374400 }, { "epoch": 50.458097547830775, "grad_norm": 0.15846741199493408, "learning_rate": 0.00041539417347824785, "loss": 0.537, "step": 374500 }, { "epoch": 50.47157100511991, "grad_norm": 0.15624095499515533, "learning_rate": 0.00041535674720800025, "loss": 0.5354, "step": 374600 }, { "epoch": 50.48504446240906, "grad_norm": 0.154720738530159, "learning_rate": 0.00041531932093775265, "loss": 0.5359, "step": 374700 }, { "epoch": 50.498517919698195, "grad_norm": 0.16052229702472687, "learning_rate": 0.00041528189466750505, "loss": 0.5353, "step": 374800 }, { "epoch": 50.51199137698733, "grad_norm": 0.1778571754693985, "learning_rate": 0.00041524446839725745, "loss": 0.5353, "step": 374900 }, { "epoch": 50.52546483427648, "grad_norm": 0.1512429565191269, "learning_rate": 0.00041520704212700985, "loss": 0.5359, "step": 375000 }, { "epoch": 50.538938291565614, "grad_norm": 0.1784011870622635, "learning_rate": 0.00041516961585676213, "loss": 0.5346, "step": 375100 }, { "epoch": 50.55241174885476, "grad_norm": 0.15275277197360992, "learning_rate": 0.00041513218958651453, "loss": 0.5337, "step": 375200 }, { "epoch": 50.565885206143896, "grad_norm": 0.1622103452682495, "learning_rate": 0.00041509476331626693, "loss": 0.5354, "step": 375300 }, { "epoch": 50.579358663433034, "grad_norm": 0.17048804461956024, "learning_rate": 0.00041505733704601933, "loss": 0.5345, "step": 375400 }, { "epoch": 50.59283212072218, "grad_norm": 0.15884697437286377, "learning_rate": 0.00041501991077577173, "loss": 0.5347, "step": 375500 }, { "epoch": 50.606305578011316, "grad_norm": 0.16322040557861328, "learning_rate": 0.0004149824845055241, "loss": 0.535, "step": 375600 }, { "epoch": 50.61977903530046, "grad_norm": 0.16570064425468445, "learning_rate": 0.0004149450582352765, "loss": 0.5353, "step": 375700 }, { "epoch": 50.6332524925896, "grad_norm": 0.1689014434814453, "learning_rate": 0.0004149076319650289, "loss": 0.5349, "step": 375800 }, { "epoch": 50.646725949878736, "grad_norm": 0.1710018813610077, "learning_rate": 0.00041487020569478127, "loss": 0.5346, "step": 375900 }, { "epoch": 50.66019940716788, "grad_norm": 0.186722993850708, "learning_rate": 0.00041483277942453366, "loss": 0.5356, "step": 376000 }, { "epoch": 50.67367286445702, "grad_norm": 0.1653611809015274, "learning_rate": 0.00041479535315428606, "loss": 0.5342, "step": 376100 }, { "epoch": 50.68714632174616, "grad_norm": 0.16367118060588837, "learning_rate": 0.00041475792688403846, "loss": 0.5352, "step": 376200 }, { "epoch": 50.7006197790353, "grad_norm": 0.17119716107845306, "learning_rate": 0.00041472050061379086, "loss": 0.5358, "step": 376300 }, { "epoch": 50.71409323632444, "grad_norm": 0.15886220335960388, "learning_rate": 0.00041468307434354326, "loss": 0.5356, "step": 376400 }, { "epoch": 50.72756669361358, "grad_norm": 0.1650625616312027, "learning_rate": 0.00041464564807329566, "loss": 0.5367, "step": 376500 }, { "epoch": 50.74104015090272, "grad_norm": 0.168894425034523, "learning_rate": 0.000414608221803048, "loss": 0.535, "step": 376600 }, { "epoch": 50.754513608191864, "grad_norm": 0.16567488014698029, "learning_rate": 0.0004145707955328004, "loss": 0.536, "step": 376700 }, { "epoch": 50.767987065481, "grad_norm": 0.17705009877681732, "learning_rate": 0.0004145333692625528, "loss": 0.535, "step": 376800 }, { "epoch": 50.781460522770146, "grad_norm": 0.1764899045228958, "learning_rate": 0.00041449594299230514, "loss": 0.5354, "step": 376900 }, { "epoch": 50.794933980059284, "grad_norm": 0.186435267329216, "learning_rate": 0.00041445851672205754, "loss": 0.5361, "step": 377000 }, { "epoch": 50.80840743734842, "grad_norm": 0.16158515214920044, "learning_rate": 0.00041442109045180994, "loss": 0.5359, "step": 377100 }, { "epoch": 50.821880894637566, "grad_norm": 0.1810023933649063, "learning_rate": 0.00041438366418156234, "loss": 0.5347, "step": 377200 }, { "epoch": 50.8353543519267, "grad_norm": 0.16264548897743225, "learning_rate": 0.0004143462379113147, "loss": 0.5355, "step": 377300 }, { "epoch": 50.84882780921585, "grad_norm": 0.15435533225536346, "learning_rate": 0.0004143088116410671, "loss": 0.536, "step": 377400 }, { "epoch": 50.862301266504986, "grad_norm": 0.16134600341320038, "learning_rate": 0.0004142713853708195, "loss": 0.5357, "step": 377500 }, { "epoch": 50.87577472379412, "grad_norm": 0.16606752574443817, "learning_rate": 0.0004142339591005719, "loss": 0.536, "step": 377600 }, { "epoch": 50.88924818108327, "grad_norm": 0.15709799528121948, "learning_rate": 0.00041419653283032427, "loss": 0.5348, "step": 377700 }, { "epoch": 50.902721638372405, "grad_norm": 0.15273939073085785, "learning_rate": 0.00041415910656007667, "loss": 0.5352, "step": 377800 }, { "epoch": 50.91619509566155, "grad_norm": 0.18609808385372162, "learning_rate": 0.00041412168028982907, "loss": 0.5356, "step": 377900 }, { "epoch": 50.92966855295069, "grad_norm": 0.1667564958333969, "learning_rate": 0.00041408425401958147, "loss": 0.5347, "step": 378000 }, { "epoch": 50.943142010239825, "grad_norm": 0.14969079196453094, "learning_rate": 0.0004140468277493338, "loss": 0.5352, "step": 378100 }, { "epoch": 50.95661546752897, "grad_norm": 0.17373034358024597, "learning_rate": 0.0004140094014790862, "loss": 0.5354, "step": 378200 }, { "epoch": 50.97008892481811, "grad_norm": 0.1572829633951187, "learning_rate": 0.0004139719752088386, "loss": 0.5344, "step": 378300 }, { "epoch": 50.98356238210725, "grad_norm": 0.15026821196079254, "learning_rate": 0.000413934548938591, "loss": 0.5361, "step": 378400 }, { "epoch": 50.99703583939639, "grad_norm": 0.1591465324163437, "learning_rate": 0.0004138971226683434, "loss": 0.535, "step": 378500 }, { "epoch": 51.0, "eval_loss": 0.5238462686538696, "eval_runtime": 4.9609, "eval_samples_per_second": 1007.887, "eval_steps_per_second": 15.925, "step": 378522 }, { "epoch": 51.01050929668553, "grad_norm": 0.16450956463813782, "learning_rate": 0.0004138596963980958, "loss": 0.5338, "step": 378600 }, { "epoch": 51.02398275397467, "grad_norm": 0.1675262749195099, "learning_rate": 0.00041382227012784815, "loss": 0.5352, "step": 378700 }, { "epoch": 51.03745621126381, "grad_norm": 0.16040731966495514, "learning_rate": 0.0004137848438576005, "loss": 0.5345, "step": 378800 }, { "epoch": 51.05092966855295, "grad_norm": 0.18233586847782135, "learning_rate": 0.0004137474175873529, "loss": 0.5348, "step": 378900 }, { "epoch": 51.06440312584209, "grad_norm": 0.17257095873355865, "learning_rate": 0.0004137099913171053, "loss": 0.5349, "step": 379000 }, { "epoch": 51.07787658313123, "grad_norm": 0.19906923174858093, "learning_rate": 0.0004136725650468577, "loss": 0.5345, "step": 379100 }, { "epoch": 51.09135004042037, "grad_norm": 0.15706400573253632, "learning_rate": 0.0004136351387766101, "loss": 0.5341, "step": 379200 }, { "epoch": 51.10482349770951, "grad_norm": 0.17616848647594452, "learning_rate": 0.0004135977125063625, "loss": 0.5348, "step": 379300 }, { "epoch": 51.118296954998655, "grad_norm": 0.15747608244419098, "learning_rate": 0.0004135602862361149, "loss": 0.5331, "step": 379400 }, { "epoch": 51.13177041228779, "grad_norm": 0.18173883855342865, "learning_rate": 0.0004135228599658672, "loss": 0.5355, "step": 379500 }, { "epoch": 51.14524386957693, "grad_norm": 0.17049364745616913, "learning_rate": 0.0004134854336956196, "loss": 0.5359, "step": 379600 }, { "epoch": 51.158717326866075, "grad_norm": 0.19384898245334625, "learning_rate": 0.000413448007425372, "loss": 0.5352, "step": 379700 }, { "epoch": 51.17219078415521, "grad_norm": 0.15639127790927887, "learning_rate": 0.0004134105811551244, "loss": 0.5365, "step": 379800 }, { "epoch": 51.18566424144436, "grad_norm": 0.17905782163143158, "learning_rate": 0.0004133731548848768, "loss": 0.5344, "step": 379900 }, { "epoch": 51.199137698733495, "grad_norm": 0.15274903178215027, "learning_rate": 0.0004133357286146292, "loss": 0.534, "step": 380000 }, { "epoch": 51.21261115602263, "grad_norm": 0.1650441288948059, "learning_rate": 0.0004132983023443816, "loss": 0.5353, "step": 380100 }, { "epoch": 51.22608461331178, "grad_norm": 0.16087771952152252, "learning_rate": 0.000413260876074134, "loss": 0.5355, "step": 380200 }, { "epoch": 51.239558070600914, "grad_norm": 0.15426281094551086, "learning_rate": 0.00041322344980388636, "loss": 0.5357, "step": 380300 }, { "epoch": 51.25303152789006, "grad_norm": 0.16411279141902924, "learning_rate": 0.00041318602353363875, "loss": 0.5352, "step": 380400 }, { "epoch": 51.266504985179196, "grad_norm": 0.16224312782287598, "learning_rate": 0.0004131485972633911, "loss": 0.5349, "step": 380500 }, { "epoch": 51.279978442468334, "grad_norm": 0.1603717803955078, "learning_rate": 0.0004131111709931435, "loss": 0.5345, "step": 380600 }, { "epoch": 51.29345189975748, "grad_norm": 0.16184623539447784, "learning_rate": 0.0004130737447228959, "loss": 0.535, "step": 380700 }, { "epoch": 51.306925357046616, "grad_norm": 0.16729411482810974, "learning_rate": 0.0004130363184526483, "loss": 0.5347, "step": 380800 }, { "epoch": 51.32039881433576, "grad_norm": 0.169197678565979, "learning_rate": 0.0004129988921824007, "loss": 0.5347, "step": 380900 }, { "epoch": 51.3338722716249, "grad_norm": 0.15675321221351624, "learning_rate": 0.00041296146591215303, "loss": 0.5352, "step": 381000 }, { "epoch": 51.34734572891404, "grad_norm": 0.16819757223129272, "learning_rate": 0.00041292403964190543, "loss": 0.5363, "step": 381100 }, { "epoch": 51.36081918620318, "grad_norm": 0.14997585117816925, "learning_rate": 0.00041288661337165783, "loss": 0.5349, "step": 381200 }, { "epoch": 51.37429264349232, "grad_norm": 0.16850918531417847, "learning_rate": 0.00041284918710141023, "loss": 0.5351, "step": 381300 }, { "epoch": 51.38776610078146, "grad_norm": 0.16102814674377441, "learning_rate": 0.00041281176083116263, "loss": 0.5355, "step": 381400 }, { "epoch": 51.4012395580706, "grad_norm": 0.16912135481834412, "learning_rate": 0.000412774334560915, "loss": 0.5336, "step": 381500 }, { "epoch": 51.414713015359744, "grad_norm": 0.15917746722698212, "learning_rate": 0.0004127369082906674, "loss": 0.5345, "step": 381600 }, { "epoch": 51.42818647264888, "grad_norm": 0.1795923411846161, "learning_rate": 0.00041269948202041977, "loss": 0.5346, "step": 381700 }, { "epoch": 51.44165992993802, "grad_norm": 0.17376776039600372, "learning_rate": 0.00041266205575017217, "loss": 0.5348, "step": 381800 }, { "epoch": 51.455133387227164, "grad_norm": 0.17877037823200226, "learning_rate": 0.00041262462947992456, "loss": 0.5355, "step": 381900 }, { "epoch": 51.4686068445163, "grad_norm": 0.14823128283023834, "learning_rate": 0.00041258720320967696, "loss": 0.5343, "step": 382000 }, { "epoch": 51.482080301805446, "grad_norm": 0.16996727883815765, "learning_rate": 0.00041254977693942936, "loss": 0.5347, "step": 382100 }, { "epoch": 51.495553759094584, "grad_norm": 0.15616989135742188, "learning_rate": 0.00041251235066918176, "loss": 0.5341, "step": 382200 }, { "epoch": 51.50902721638372, "grad_norm": 0.15826641023159027, "learning_rate": 0.0004124749243989341, "loss": 0.5349, "step": 382300 }, { "epoch": 51.522500673672866, "grad_norm": 0.16151678562164307, "learning_rate": 0.00041243749812868645, "loss": 0.5359, "step": 382400 }, { "epoch": 51.535974130962, "grad_norm": 0.16055332124233246, "learning_rate": 0.00041240007185843885, "loss": 0.5344, "step": 382500 }, { "epoch": 51.54944758825115, "grad_norm": 0.15796847641468048, "learning_rate": 0.00041236264558819124, "loss": 0.535, "step": 382600 }, { "epoch": 51.562921045540286, "grad_norm": 0.16708436608314514, "learning_rate": 0.00041232521931794364, "loss": 0.5356, "step": 382700 }, { "epoch": 51.57639450282942, "grad_norm": 0.15307264029979706, "learning_rate": 0.00041228779304769604, "loss": 0.5346, "step": 382800 }, { "epoch": 51.58986796011857, "grad_norm": 0.15336500108242035, "learning_rate": 0.00041225036677744844, "loss": 0.5352, "step": 382900 }, { "epoch": 51.603341417407705, "grad_norm": 0.15885455906391144, "learning_rate": 0.00041221294050720084, "loss": 0.5345, "step": 383000 }, { "epoch": 51.61681487469685, "grad_norm": 0.16817495226860046, "learning_rate": 0.00041217551423695323, "loss": 0.5345, "step": 383100 }, { "epoch": 51.63028833198599, "grad_norm": 0.16727064549922943, "learning_rate": 0.0004121380879667056, "loss": 0.5342, "step": 383200 }, { "epoch": 51.643761789275125, "grad_norm": 0.14900054037570953, "learning_rate": 0.000412100661696458, "loss": 0.5362, "step": 383300 }, { "epoch": 51.65723524656427, "grad_norm": 0.16726279258728027, "learning_rate": 0.0004120632354262104, "loss": 0.5353, "step": 383400 }, { "epoch": 51.67070870385341, "grad_norm": 0.16145025193691254, "learning_rate": 0.0004120258091559628, "loss": 0.5354, "step": 383500 }, { "epoch": 51.68418216114255, "grad_norm": 0.17545461654663086, "learning_rate": 0.00041198838288571517, "loss": 0.5357, "step": 383600 }, { "epoch": 51.69765561843169, "grad_norm": 0.18434692919254303, "learning_rate": 0.00041195095661546757, "loss": 0.5353, "step": 383700 }, { "epoch": 51.71112907572083, "grad_norm": 0.15852418541908264, "learning_rate": 0.00041191353034521997, "loss": 0.5355, "step": 383800 }, { "epoch": 51.72460253300997, "grad_norm": 0.15774668753147125, "learning_rate": 0.0004118761040749723, "loss": 0.5348, "step": 383900 }, { "epoch": 51.73807599029911, "grad_norm": 0.17925392091274261, "learning_rate": 0.0004118386778047247, "loss": 0.535, "step": 384000 }, { "epoch": 51.75154944758825, "grad_norm": 0.1566782146692276, "learning_rate": 0.00041180125153447705, "loss": 0.5355, "step": 384100 }, { "epoch": 51.76502290487739, "grad_norm": 0.17646028101444244, "learning_rate": 0.00041176382526422945, "loss": 0.5345, "step": 384200 }, { "epoch": 51.778496362166536, "grad_norm": 0.16022889316082, "learning_rate": 0.00041172639899398185, "loss": 0.5354, "step": 384300 }, { "epoch": 51.79196981945567, "grad_norm": 0.17409180104732513, "learning_rate": 0.00041168897272373425, "loss": 0.5341, "step": 384400 }, { "epoch": 51.80544327674481, "grad_norm": 0.15499258041381836, "learning_rate": 0.00041165154645348665, "loss": 0.5343, "step": 384500 }, { "epoch": 51.818916734033955, "grad_norm": 0.16608406603336334, "learning_rate": 0.000411614120183239, "loss": 0.5343, "step": 384600 }, { "epoch": 51.83239019132309, "grad_norm": 0.15954382717609406, "learning_rate": 0.0004115766939129914, "loss": 0.535, "step": 384700 }, { "epoch": 51.84586364861224, "grad_norm": 0.15257330238819122, "learning_rate": 0.0004115392676427438, "loss": 0.5346, "step": 384800 }, { "epoch": 51.859337105901375, "grad_norm": 0.15934553742408752, "learning_rate": 0.0004115018413724962, "loss": 0.5341, "step": 384900 }, { "epoch": 51.87281056319051, "grad_norm": 0.15600048005580902, "learning_rate": 0.0004114644151022486, "loss": 0.5357, "step": 385000 }, { "epoch": 51.88628402047966, "grad_norm": 0.1582929491996765, "learning_rate": 0.000411426988832001, "loss": 0.5349, "step": 385100 }, { "epoch": 51.899757477768794, "grad_norm": 0.16111701726913452, "learning_rate": 0.0004113895625617534, "loss": 0.5349, "step": 385200 }, { "epoch": 51.91323093505794, "grad_norm": 0.16110146045684814, "learning_rate": 0.0004113521362915058, "loss": 0.5354, "step": 385300 }, { "epoch": 51.92670439234708, "grad_norm": 0.16141143441200256, "learning_rate": 0.0004113147100212581, "loss": 0.5349, "step": 385400 }, { "epoch": 51.940177849636214, "grad_norm": 0.1596037745475769, "learning_rate": 0.0004112772837510105, "loss": 0.5343, "step": 385500 }, { "epoch": 51.95365130692536, "grad_norm": 0.16392533481121063, "learning_rate": 0.0004112398574807629, "loss": 0.5343, "step": 385600 }, { "epoch": 51.967124764214496, "grad_norm": 0.17088045179843903, "learning_rate": 0.0004112024312105153, "loss": 0.5346, "step": 385700 }, { "epoch": 51.98059822150364, "grad_norm": 0.17076930403709412, "learning_rate": 0.0004111650049402677, "loss": 0.5351, "step": 385800 }, { "epoch": 51.99407167879278, "grad_norm": 0.15702229738235474, "learning_rate": 0.00041112757867002006, "loss": 0.5352, "step": 385900 }, { "epoch": 52.0, "eval_loss": 0.5226619839668274, "eval_runtime": 4.9585, "eval_samples_per_second": 1008.377, "eval_steps_per_second": 15.932, "step": 385944 }, { "epoch": 52.007545136081916, "grad_norm": 0.1589498668909073, "learning_rate": 0.00041109015239977246, "loss": 0.5335, "step": 386000 }, { "epoch": 52.02101859337106, "grad_norm": 0.15424680709838867, "learning_rate": 0.0004110527261295248, "loss": 0.5346, "step": 386100 }, { "epoch": 52.0344920506602, "grad_norm": 0.1818409562110901, "learning_rate": 0.0004110152998592772, "loss": 0.5342, "step": 386200 }, { "epoch": 52.04796550794934, "grad_norm": 0.15368524193763733, "learning_rate": 0.0004109778735890296, "loss": 0.5337, "step": 386300 }, { "epoch": 52.06143896523848, "grad_norm": 0.16633054614067078, "learning_rate": 0.000410940447318782, "loss": 0.5341, "step": 386400 }, { "epoch": 52.07491242252762, "grad_norm": 0.15077462792396545, "learning_rate": 0.0004109030210485344, "loss": 0.5345, "step": 386500 }, { "epoch": 52.08838587981676, "grad_norm": 0.15223419666290283, "learning_rate": 0.0004108655947782868, "loss": 0.5345, "step": 386600 }, { "epoch": 52.1018593371059, "grad_norm": 0.16710230708122253, "learning_rate": 0.0004108281685080392, "loss": 0.5329, "step": 386700 }, { "epoch": 52.115332794395044, "grad_norm": 0.15318751335144043, "learning_rate": 0.00041079074223779154, "loss": 0.5345, "step": 386800 }, { "epoch": 52.12880625168418, "grad_norm": 0.1820966750383377, "learning_rate": 0.00041075331596754393, "loss": 0.5345, "step": 386900 }, { "epoch": 52.14227970897332, "grad_norm": 0.15346543490886688, "learning_rate": 0.00041071588969729633, "loss": 0.5343, "step": 387000 }, { "epoch": 52.155753166262464, "grad_norm": 0.20120160281658173, "learning_rate": 0.00041067846342704873, "loss": 0.535, "step": 387100 }, { "epoch": 52.1692266235516, "grad_norm": 0.16130881011486053, "learning_rate": 0.00041064103715680113, "loss": 0.5347, "step": 387200 }, { "epoch": 52.182700080840746, "grad_norm": 0.15871186554431915, "learning_rate": 0.0004106036108865535, "loss": 0.5333, "step": 387300 }, { "epoch": 52.196173538129884, "grad_norm": 0.17339283227920532, "learning_rate": 0.0004105661846163059, "loss": 0.5355, "step": 387400 }, { "epoch": 52.20964699541902, "grad_norm": 0.15987877547740936, "learning_rate": 0.00041052875834605827, "loss": 0.5347, "step": 387500 }, { "epoch": 52.223120452708166, "grad_norm": 0.1576877236366272, "learning_rate": 0.00041049133207581067, "loss": 0.5346, "step": 387600 }, { "epoch": 52.2365939099973, "grad_norm": 0.16335347294807434, "learning_rate": 0.00041045390580556307, "loss": 0.5341, "step": 387700 }, { "epoch": 52.25006736728645, "grad_norm": 0.15498851239681244, "learning_rate": 0.0004104164795353154, "loss": 0.5347, "step": 387800 }, { "epoch": 52.263540824575585, "grad_norm": 0.17132532596588135, "learning_rate": 0.0004103790532650678, "loss": 0.5341, "step": 387900 }, { "epoch": 52.27701428186472, "grad_norm": 0.163446843624115, "learning_rate": 0.0004103416269948202, "loss": 0.5339, "step": 388000 }, { "epoch": 52.29048773915387, "grad_norm": 0.15430493652820587, "learning_rate": 0.0004103042007245726, "loss": 0.5356, "step": 388100 }, { "epoch": 52.303961196443005, "grad_norm": 0.15214760601520538, "learning_rate": 0.000410266774454325, "loss": 0.5341, "step": 388200 }, { "epoch": 52.31743465373215, "grad_norm": 0.1560440957546234, "learning_rate": 0.00041022934818407735, "loss": 0.5358, "step": 388300 }, { "epoch": 52.33090811102129, "grad_norm": 0.15578524768352509, "learning_rate": 0.00041019192191382974, "loss": 0.5349, "step": 388400 }, { "epoch": 52.34438156831043, "grad_norm": 0.1698550432920456, "learning_rate": 0.00041015449564358214, "loss": 0.5344, "step": 388500 }, { "epoch": 52.35785502559957, "grad_norm": 0.15528367459774017, "learning_rate": 0.00041011706937333454, "loss": 0.5356, "step": 388600 }, { "epoch": 52.37132848288871, "grad_norm": 0.15835505723953247, "learning_rate": 0.00041007964310308694, "loss": 0.5357, "step": 388700 }, { "epoch": 52.38480194017785, "grad_norm": 0.1552198976278305, "learning_rate": 0.00041004221683283934, "loss": 0.5342, "step": 388800 }, { "epoch": 52.39827539746699, "grad_norm": 0.1628342717885971, "learning_rate": 0.00041000479056259174, "loss": 0.5347, "step": 388900 }, { "epoch": 52.411748854756134, "grad_norm": 0.16433045268058777, "learning_rate": 0.0004099673642923441, "loss": 0.5342, "step": 389000 }, { "epoch": 52.42522231204527, "grad_norm": 0.16250765323638916, "learning_rate": 0.0004099299380220965, "loss": 0.5341, "step": 389100 }, { "epoch": 52.43869576933441, "grad_norm": 0.17097686231136322, "learning_rate": 0.0004098925117518489, "loss": 0.5351, "step": 389200 }, { "epoch": 52.45216922662355, "grad_norm": 0.15374036133289337, "learning_rate": 0.0004098550854816013, "loss": 0.534, "step": 389300 }, { "epoch": 52.46564268391269, "grad_norm": 0.16238868236541748, "learning_rate": 0.00040981765921135367, "loss": 0.5341, "step": 389400 }, { "epoch": 52.479116141201835, "grad_norm": 0.18146024644374847, "learning_rate": 0.00040978023294110607, "loss": 0.5356, "step": 389500 }, { "epoch": 52.49258959849097, "grad_norm": 0.16507862508296967, "learning_rate": 0.0004097428066708584, "loss": 0.5347, "step": 389600 }, { "epoch": 52.50606305578011, "grad_norm": 0.1629118174314499, "learning_rate": 0.00040970538040061076, "loss": 0.534, "step": 389700 }, { "epoch": 52.519536513069255, "grad_norm": 0.15293797850608826, "learning_rate": 0.00040966795413036316, "loss": 0.5343, "step": 389800 }, { "epoch": 52.53300997035839, "grad_norm": 0.15439824759960175, "learning_rate": 0.00040963052786011556, "loss": 0.5338, "step": 389900 }, { "epoch": 52.54648342764754, "grad_norm": 0.1566559374332428, "learning_rate": 0.00040959310158986795, "loss": 0.5345, "step": 390000 }, { "epoch": 52.559956884936675, "grad_norm": 0.17326828837394714, "learning_rate": 0.00040955567531962035, "loss": 0.5341, "step": 390100 }, { "epoch": 52.57343034222581, "grad_norm": 0.16783972084522247, "learning_rate": 0.00040951824904937275, "loss": 0.5333, "step": 390200 }, { "epoch": 52.58690379951496, "grad_norm": 0.1764957159757614, "learning_rate": 0.00040948082277912515, "loss": 0.5349, "step": 390300 }, { "epoch": 52.600377256804094, "grad_norm": 0.1880938708782196, "learning_rate": 0.0004094433965088775, "loss": 0.534, "step": 390400 }, { "epoch": 52.61385071409324, "grad_norm": 0.1654127687215805, "learning_rate": 0.0004094059702386299, "loss": 0.5337, "step": 390500 }, { "epoch": 52.62732417138238, "grad_norm": 0.19481903314590454, "learning_rate": 0.0004093685439683823, "loss": 0.5349, "step": 390600 }, { "epoch": 52.640797628671514, "grad_norm": 0.1630757451057434, "learning_rate": 0.0004093311176981347, "loss": 0.5354, "step": 390700 }, { "epoch": 52.65427108596066, "grad_norm": 0.1671997308731079, "learning_rate": 0.0004092936914278871, "loss": 0.5341, "step": 390800 }, { "epoch": 52.667744543249796, "grad_norm": 0.16666170954704285, "learning_rate": 0.0004092562651576395, "loss": 0.5341, "step": 390900 }, { "epoch": 52.68121800053894, "grad_norm": 0.16686637699604034, "learning_rate": 0.0004092188388873919, "loss": 0.5333, "step": 391000 }, { "epoch": 52.69469145782808, "grad_norm": 0.18331053853034973, "learning_rate": 0.0004091814126171443, "loss": 0.5356, "step": 391100 }, { "epoch": 52.708164915117216, "grad_norm": 0.16908667981624603, "learning_rate": 0.0004091439863468966, "loss": 0.5345, "step": 391200 }, { "epoch": 52.72163837240636, "grad_norm": 0.15570679306983948, "learning_rate": 0.000409106560076649, "loss": 0.5344, "step": 391300 }, { "epoch": 52.7351118296955, "grad_norm": 0.16675889492034912, "learning_rate": 0.00040906913380640137, "loss": 0.5347, "step": 391400 }, { "epoch": 52.74858528698464, "grad_norm": 0.19338847696781158, "learning_rate": 0.00040903170753615376, "loss": 0.5343, "step": 391500 }, { "epoch": 52.76205874427378, "grad_norm": 0.16351962089538574, "learning_rate": 0.00040899428126590616, "loss": 0.5345, "step": 391600 }, { "epoch": 52.77553220156292, "grad_norm": 0.17345304787158966, "learning_rate": 0.00040895685499565856, "loss": 0.534, "step": 391700 }, { "epoch": 52.78900565885206, "grad_norm": 0.1661025434732437, "learning_rate": 0.00040891942872541096, "loss": 0.5342, "step": 391800 }, { "epoch": 52.8024791161412, "grad_norm": 0.15562967956066132, "learning_rate": 0.0004088820024551633, "loss": 0.5348, "step": 391900 }, { "epoch": 52.815952573430344, "grad_norm": 0.16562913358211517, "learning_rate": 0.0004088445761849157, "loss": 0.5347, "step": 392000 }, { "epoch": 52.82942603071948, "grad_norm": 0.19181779026985168, "learning_rate": 0.0004088071499146681, "loss": 0.5342, "step": 392100 }, { "epoch": 52.84289948800863, "grad_norm": 0.15081411600112915, "learning_rate": 0.0004087697236444205, "loss": 0.5338, "step": 392200 }, { "epoch": 52.856372945297764, "grad_norm": 0.16775359213352203, "learning_rate": 0.0004087322973741729, "loss": 0.5344, "step": 392300 }, { "epoch": 52.8698464025869, "grad_norm": 0.15967942774295807, "learning_rate": 0.0004086948711039253, "loss": 0.5343, "step": 392400 }, { "epoch": 52.883319859876046, "grad_norm": 0.17052926123142242, "learning_rate": 0.0004086574448336777, "loss": 0.5348, "step": 392500 }, { "epoch": 52.896793317165184, "grad_norm": 0.17232441902160645, "learning_rate": 0.00040862001856343004, "loss": 0.5339, "step": 392600 }, { "epoch": 52.91026677445433, "grad_norm": 0.1725412756204605, "learning_rate": 0.00040858259229318244, "loss": 0.5347, "step": 392700 }, { "epoch": 52.923740231743466, "grad_norm": 0.16053630411624908, "learning_rate": 0.00040854516602293483, "loss": 0.5333, "step": 392800 }, { "epoch": 52.9372136890326, "grad_norm": 0.17657151818275452, "learning_rate": 0.00040850773975268723, "loss": 0.5349, "step": 392900 }, { "epoch": 52.95068714632175, "grad_norm": 0.1511440873146057, "learning_rate": 0.00040847031348243963, "loss": 0.5339, "step": 393000 }, { "epoch": 52.964160603610885, "grad_norm": 0.16916778683662415, "learning_rate": 0.00040843288721219203, "loss": 0.5342, "step": 393100 }, { "epoch": 52.97763406090003, "grad_norm": 0.16658848524093628, "learning_rate": 0.00040839546094194437, "loss": 0.5358, "step": 393200 }, { "epoch": 52.99110751818917, "grad_norm": 0.1674860119819641, "learning_rate": 0.00040835803467169677, "loss": 0.5342, "step": 393300 }, { "epoch": 53.0, "eval_loss": 0.5226081013679504, "eval_runtime": 4.9709, "eval_samples_per_second": 1005.853, "eval_steps_per_second": 15.892, "step": 393366 }, { "epoch": 53.004580975478305, "grad_norm": 0.17614401876926422, "learning_rate": 0.0004083206084014491, "loss": 0.5346, "step": 393400 }, { "epoch": 53.01805443276745, "grad_norm": 0.16308701038360596, "learning_rate": 0.0004082831821312015, "loss": 0.5334, "step": 393500 }, { "epoch": 53.03152789005659, "grad_norm": 0.15705932676792145, "learning_rate": 0.0004082457558609539, "loss": 0.5337, "step": 393600 }, { "epoch": 53.04500134734573, "grad_norm": 0.15682637691497803, "learning_rate": 0.0004082083295907063, "loss": 0.5332, "step": 393700 }, { "epoch": 53.05847480463487, "grad_norm": 0.1638183742761612, "learning_rate": 0.0004081709033204587, "loss": 0.5337, "step": 393800 }, { "epoch": 53.07194826192401, "grad_norm": 0.16187521815299988, "learning_rate": 0.0004081334770502111, "loss": 0.5342, "step": 393900 }, { "epoch": 53.08542171921315, "grad_norm": 0.1734238713979721, "learning_rate": 0.0004080960507799635, "loss": 0.5333, "step": 394000 }, { "epoch": 53.09889517650229, "grad_norm": 0.16164454817771912, "learning_rate": 0.00040805862450971585, "loss": 0.535, "step": 394100 }, { "epoch": 53.112368633791434, "grad_norm": 0.15889710187911987, "learning_rate": 0.00040802119823946825, "loss": 0.5357, "step": 394200 }, { "epoch": 53.12584209108057, "grad_norm": 0.16178539395332336, "learning_rate": 0.00040798377196922064, "loss": 0.5336, "step": 394300 }, { "epoch": 53.13931554836971, "grad_norm": 0.1654803305864334, "learning_rate": 0.00040794634569897304, "loss": 0.5346, "step": 394400 }, { "epoch": 53.15278900565885, "grad_norm": 0.15787829458713531, "learning_rate": 0.00040790891942872544, "loss": 0.5329, "step": 394500 }, { "epoch": 53.16626246294799, "grad_norm": 0.1571262925863266, "learning_rate": 0.00040787149315847784, "loss": 0.5334, "step": 394600 }, { "epoch": 53.179735920237135, "grad_norm": 0.17500273883342743, "learning_rate": 0.00040783406688823024, "loss": 0.5338, "step": 394700 }, { "epoch": 53.19320937752627, "grad_norm": 0.1699448823928833, "learning_rate": 0.0004077966406179826, "loss": 0.5344, "step": 394800 }, { "epoch": 53.20668283481541, "grad_norm": 0.17638513445854187, "learning_rate": 0.000407759214347735, "loss": 0.535, "step": 394900 }, { "epoch": 53.220156292104555, "grad_norm": 0.17560696601867676, "learning_rate": 0.0004077217880774873, "loss": 0.5338, "step": 395000 }, { "epoch": 53.23362974939369, "grad_norm": 0.16810381412506104, "learning_rate": 0.0004076843618072397, "loss": 0.5344, "step": 395100 }, { "epoch": 53.24710320668284, "grad_norm": 0.15997448563575745, "learning_rate": 0.0004076469355369921, "loss": 0.533, "step": 395200 }, { "epoch": 53.260576663971975, "grad_norm": 0.15981519222259521, "learning_rate": 0.0004076095092667445, "loss": 0.5334, "step": 395300 }, { "epoch": 53.27405012126111, "grad_norm": 0.17203710973262787, "learning_rate": 0.0004075720829964969, "loss": 0.5346, "step": 395400 }, { "epoch": 53.28752357855026, "grad_norm": 0.1699325144290924, "learning_rate": 0.00040753465672624926, "loss": 0.5351, "step": 395500 }, { "epoch": 53.300997035839394, "grad_norm": 0.15891151130199432, "learning_rate": 0.00040749723045600166, "loss": 0.5344, "step": 395600 }, { "epoch": 53.31447049312854, "grad_norm": 0.17479093372821808, "learning_rate": 0.00040745980418575406, "loss": 0.5342, "step": 395700 }, { "epoch": 53.327943950417676, "grad_norm": 0.16602008044719696, "learning_rate": 0.00040742237791550645, "loss": 0.533, "step": 395800 }, { "epoch": 53.34141740770682, "grad_norm": 0.18701249361038208, "learning_rate": 0.00040738495164525885, "loss": 0.5337, "step": 395900 }, { "epoch": 53.35489086499596, "grad_norm": 0.1584673374891281, "learning_rate": 0.00040734752537501125, "loss": 0.535, "step": 396000 }, { "epoch": 53.368364322285096, "grad_norm": 0.21205997467041016, "learning_rate": 0.00040731009910476365, "loss": 0.5346, "step": 396100 }, { "epoch": 53.38183777957424, "grad_norm": 0.1625450849533081, "learning_rate": 0.00040727267283451605, "loss": 0.5344, "step": 396200 }, { "epoch": 53.39531123686338, "grad_norm": 0.1909109205007553, "learning_rate": 0.0004072352465642684, "loss": 0.5346, "step": 396300 }, { "epoch": 53.40878469415252, "grad_norm": 0.15626537799835205, "learning_rate": 0.0004071978202940208, "loss": 0.5342, "step": 396400 }, { "epoch": 53.42225815144166, "grad_norm": 0.18276937305927277, "learning_rate": 0.0004071603940237732, "loss": 0.5333, "step": 396500 }, { "epoch": 53.4357316087308, "grad_norm": 0.15560074150562286, "learning_rate": 0.0004071229677535256, "loss": 0.5336, "step": 396600 }, { "epoch": 53.44920506601994, "grad_norm": 0.15257976949214935, "learning_rate": 0.000407085541483278, "loss": 0.5347, "step": 396700 }, { "epoch": 53.46267852330908, "grad_norm": 0.17421911656856537, "learning_rate": 0.00040704811521303033, "loss": 0.5344, "step": 396800 }, { "epoch": 53.476151980598225, "grad_norm": 0.19284473359584808, "learning_rate": 0.00040701068894278273, "loss": 0.5335, "step": 396900 }, { "epoch": 53.48962543788736, "grad_norm": 0.18741656839847565, "learning_rate": 0.00040697326267253507, "loss": 0.5339, "step": 397000 }, { "epoch": 53.5030988951765, "grad_norm": 0.16688477993011475, "learning_rate": 0.00040693583640228747, "loss": 0.5342, "step": 397100 }, { "epoch": 53.516572352465644, "grad_norm": 0.16316640377044678, "learning_rate": 0.00040689841013203987, "loss": 0.5344, "step": 397200 }, { "epoch": 53.53004580975478, "grad_norm": 0.15453243255615234, "learning_rate": 0.00040686098386179227, "loss": 0.5338, "step": 397300 }, { "epoch": 53.543519267043926, "grad_norm": 0.1605626493692398, "learning_rate": 0.00040682355759154466, "loss": 0.5347, "step": 397400 }, { "epoch": 53.556992724333064, "grad_norm": 0.1528843194246292, "learning_rate": 0.00040678613132129706, "loss": 0.5348, "step": 397500 }, { "epoch": 53.5704661816222, "grad_norm": 0.161251038312912, "learning_rate": 0.00040674870505104946, "loss": 0.534, "step": 397600 }, { "epoch": 53.583939638911346, "grad_norm": 0.17004653811454773, "learning_rate": 0.0004067112787808018, "loss": 0.5348, "step": 397700 }, { "epoch": 53.59741309620048, "grad_norm": 0.16604365408420563, "learning_rate": 0.0004066738525105542, "loss": 0.5336, "step": 397800 }, { "epoch": 53.61088655348963, "grad_norm": 0.16590258479118347, "learning_rate": 0.0004066364262403066, "loss": 0.5343, "step": 397900 }, { "epoch": 53.624360010778766, "grad_norm": 0.1704234778881073, "learning_rate": 0.000406598999970059, "loss": 0.5344, "step": 398000 }, { "epoch": 53.6378334680679, "grad_norm": 0.16301414370536804, "learning_rate": 0.0004065615736998114, "loss": 0.534, "step": 398100 }, { "epoch": 53.65130692535705, "grad_norm": 0.15135295689105988, "learning_rate": 0.0004065241474295638, "loss": 0.533, "step": 398200 }, { "epoch": 53.664780382646185, "grad_norm": 0.16420860588550568, "learning_rate": 0.0004064867211593162, "loss": 0.5347, "step": 398300 }, { "epoch": 53.67825383993533, "grad_norm": 0.17475010454654694, "learning_rate": 0.0004064492948890686, "loss": 0.5335, "step": 398400 }, { "epoch": 53.69172729722447, "grad_norm": 0.1758614182472229, "learning_rate": 0.00040641186861882094, "loss": 0.5332, "step": 398500 }, { "epoch": 53.705200754513605, "grad_norm": 0.1697923243045807, "learning_rate": 0.0004063744423485733, "loss": 0.5351, "step": 398600 }, { "epoch": 53.71867421180275, "grad_norm": 0.1808752864599228, "learning_rate": 0.0004063370160783257, "loss": 0.5336, "step": 398700 }, { "epoch": 53.73214766909189, "grad_norm": 0.17128917574882507, "learning_rate": 0.0004062995898080781, "loss": 0.5335, "step": 398800 }, { "epoch": 53.74562112638103, "grad_norm": 0.1760786473751068, "learning_rate": 0.0004062621635378305, "loss": 0.5338, "step": 398900 }, { "epoch": 53.75909458367017, "grad_norm": 0.16333454847335815, "learning_rate": 0.0004062247372675829, "loss": 0.5347, "step": 399000 }, { "epoch": 53.77256804095931, "grad_norm": 0.16137734055519104, "learning_rate": 0.00040618731099733527, "loss": 0.5327, "step": 399100 }, { "epoch": 53.78604149824845, "grad_norm": 0.16812703013420105, "learning_rate": 0.0004061498847270876, "loss": 0.5339, "step": 399200 }, { "epoch": 53.79951495553759, "grad_norm": 0.16289560496807098, "learning_rate": 0.00040611245845684, "loss": 0.5345, "step": 399300 }, { "epoch": 53.81298841282673, "grad_norm": 0.1701275259256363, "learning_rate": 0.0004060750321865924, "loss": 0.5343, "step": 399400 }, { "epoch": 53.82646187011587, "grad_norm": 0.17643608152866364, "learning_rate": 0.0004060376059163448, "loss": 0.5331, "step": 399500 }, { "epoch": 53.839935327405016, "grad_norm": 0.1601739078760147, "learning_rate": 0.0004060001796460972, "loss": 0.5346, "step": 399600 }, { "epoch": 53.85340878469415, "grad_norm": 0.17004093527793884, "learning_rate": 0.0004059627533758496, "loss": 0.5343, "step": 399700 }, { "epoch": 53.86688224198329, "grad_norm": 0.15341438353061676, "learning_rate": 0.000405925327105602, "loss": 0.5338, "step": 399800 }, { "epoch": 53.880355699272435, "grad_norm": 0.14918842911720276, "learning_rate": 0.00040588790083535435, "loss": 0.5339, "step": 399900 }, { "epoch": 53.89382915656157, "grad_norm": 0.1566653847694397, "learning_rate": 0.00040585047456510675, "loss": 0.5332, "step": 400000 }, { "epoch": 53.90730261385072, "grad_norm": 0.16668584942817688, "learning_rate": 0.00040581304829485915, "loss": 0.5338, "step": 400100 }, { "epoch": 53.920776071139855, "grad_norm": 0.15925435721874237, "learning_rate": 0.00040577562202461154, "loss": 0.5342, "step": 400200 }, { "epoch": 53.93424952842899, "grad_norm": 0.17795869708061218, "learning_rate": 0.00040573819575436394, "loss": 0.5339, "step": 400300 }, { "epoch": 53.94772298571814, "grad_norm": 0.15283392369747162, "learning_rate": 0.0004057007694841163, "loss": 0.5342, "step": 400400 }, { "epoch": 53.961196443007275, "grad_norm": 0.15097463130950928, "learning_rate": 0.0004056633432138687, "loss": 0.5338, "step": 400500 }, { "epoch": 53.97466990029642, "grad_norm": 0.16937889158725739, "learning_rate": 0.00040562591694362103, "loss": 0.5343, "step": 400600 }, { "epoch": 53.98814335758556, "grad_norm": 0.19589370489120483, "learning_rate": 0.0004055884906733734, "loss": 0.5343, "step": 400700 }, { "epoch": 54.0, "eval_loss": 0.5217633843421936, "eval_runtime": 4.9637, "eval_samples_per_second": 1007.323, "eval_steps_per_second": 15.916, "step": 400788 }, { "epoch": 54.001616814874694, "grad_norm": 0.16061806678771973, "learning_rate": 0.0004055510644031258, "loss": 0.5345, "step": 400800 }, { "epoch": 54.01509027216384, "grad_norm": 0.15547320246696472, "learning_rate": 0.0004055136381328782, "loss": 0.5335, "step": 400900 }, { "epoch": 54.028563729452976, "grad_norm": 0.18170513212680817, "learning_rate": 0.0004054762118626306, "loss": 0.5339, "step": 401000 }, { "epoch": 54.04203718674212, "grad_norm": 0.15916971862316132, "learning_rate": 0.000405438785592383, "loss": 0.5339, "step": 401100 }, { "epoch": 54.05551064403126, "grad_norm": 0.16700026392936707, "learning_rate": 0.0004054013593221354, "loss": 0.5335, "step": 401200 }, { "epoch": 54.068984101320396, "grad_norm": 0.16241079568862915, "learning_rate": 0.0004053639330518878, "loss": 0.5335, "step": 401300 }, { "epoch": 54.08245755860954, "grad_norm": 0.1642398089170456, "learning_rate": 0.00040532650678164016, "loss": 0.5336, "step": 401400 }, { "epoch": 54.09593101589868, "grad_norm": 0.1729261577129364, "learning_rate": 0.00040528908051139256, "loss": 0.5333, "step": 401500 }, { "epoch": 54.10940447318782, "grad_norm": 0.1829478144645691, "learning_rate": 0.00040525165424114496, "loss": 0.5338, "step": 401600 }, { "epoch": 54.12287793047696, "grad_norm": 0.1898423135280609, "learning_rate": 0.00040521422797089735, "loss": 0.5332, "step": 401700 }, { "epoch": 54.1363513877661, "grad_norm": 0.16220401227474213, "learning_rate": 0.00040517680170064975, "loss": 0.5352, "step": 401800 }, { "epoch": 54.14982484505524, "grad_norm": 0.16253246366977692, "learning_rate": 0.00040513937543040215, "loss": 0.5344, "step": 401900 }, { "epoch": 54.16329830234438, "grad_norm": 0.15447930991649628, "learning_rate": 0.00040510194916015455, "loss": 0.5341, "step": 402000 }, { "epoch": 54.176771759633525, "grad_norm": 0.16670317947864532, "learning_rate": 0.0004050645228899069, "loss": 0.5342, "step": 402100 }, { "epoch": 54.19024521692266, "grad_norm": 0.17033393681049347, "learning_rate": 0.00040502709661965924, "loss": 0.5336, "step": 402200 }, { "epoch": 54.2037186742118, "grad_norm": 0.15715813636779785, "learning_rate": 0.00040498967034941164, "loss": 0.5326, "step": 402300 }, { "epoch": 54.217192131500944, "grad_norm": 0.1660919189453125, "learning_rate": 0.00040495224407916403, "loss": 0.5333, "step": 402400 }, { "epoch": 54.23066558879008, "grad_norm": 0.17396895587444305, "learning_rate": 0.00040491481780891643, "loss": 0.534, "step": 402500 }, { "epoch": 54.244139046079226, "grad_norm": 0.16776476800441742, "learning_rate": 0.00040487739153866883, "loss": 0.534, "step": 402600 }, { "epoch": 54.257612503368364, "grad_norm": 0.15295152366161346, "learning_rate": 0.00040483996526842123, "loss": 0.5335, "step": 402700 }, { "epoch": 54.2710859606575, "grad_norm": 0.1604563295841217, "learning_rate": 0.00040480253899817357, "loss": 0.5337, "step": 402800 }, { "epoch": 54.284559417946646, "grad_norm": 0.15882207453250885, "learning_rate": 0.00040476511272792597, "loss": 0.5338, "step": 402900 }, { "epoch": 54.29803287523578, "grad_norm": 0.17626291513442993, "learning_rate": 0.00040472768645767837, "loss": 0.5341, "step": 403000 }, { "epoch": 54.31150633252493, "grad_norm": 0.1771312654018402, "learning_rate": 0.00040469026018743077, "loss": 0.5327, "step": 403100 }, { "epoch": 54.324979789814066, "grad_norm": 0.16339096426963806, "learning_rate": 0.00040465283391718317, "loss": 0.5322, "step": 403200 }, { "epoch": 54.3384532471032, "grad_norm": 0.1638442873954773, "learning_rate": 0.00040461540764693556, "loss": 0.5345, "step": 403300 }, { "epoch": 54.35192670439235, "grad_norm": 0.15116825699806213, "learning_rate": 0.00040457798137668796, "loss": 0.5326, "step": 403400 }, { "epoch": 54.365400161681485, "grad_norm": 0.1559973806142807, "learning_rate": 0.00040454055510644036, "loss": 0.5331, "step": 403500 }, { "epoch": 54.37887361897063, "grad_norm": 0.15909384191036224, "learning_rate": 0.0004045031288361927, "loss": 0.5343, "step": 403600 }, { "epoch": 54.39234707625977, "grad_norm": 0.16408325731754303, "learning_rate": 0.0004044657025659451, "loss": 0.535, "step": 403700 }, { "epoch": 54.40582053354891, "grad_norm": 0.1553477793931961, "learning_rate": 0.0004044282762956975, "loss": 0.5328, "step": 403800 }, { "epoch": 54.41929399083805, "grad_norm": 0.15466538071632385, "learning_rate": 0.0004043908500254499, "loss": 0.5348, "step": 403900 }, { "epoch": 54.43276744812719, "grad_norm": 0.158246248960495, "learning_rate": 0.0004043534237552023, "loss": 0.5337, "step": 404000 }, { "epoch": 54.44624090541633, "grad_norm": 0.16234412789344788, "learning_rate": 0.00040431599748495464, "loss": 0.5337, "step": 404100 }, { "epoch": 54.45971436270547, "grad_norm": 0.1833895742893219, "learning_rate": 0.00040427857121470704, "loss": 0.5339, "step": 404200 }, { "epoch": 54.473187819994614, "grad_norm": 0.15472093224525452, "learning_rate": 0.0004042411449444594, "loss": 0.5342, "step": 404300 }, { "epoch": 54.48666127728375, "grad_norm": 0.15869997441768646, "learning_rate": 0.0004042037186742118, "loss": 0.5337, "step": 404400 }, { "epoch": 54.50013473457289, "grad_norm": 0.16048674285411835, "learning_rate": 0.0004041662924039642, "loss": 0.5329, "step": 404500 }, { "epoch": 54.51360819186203, "grad_norm": 0.170028418302536, "learning_rate": 0.0004041288661337166, "loss": 0.5337, "step": 404600 }, { "epoch": 54.52708164915117, "grad_norm": 0.15455254912376404, "learning_rate": 0.000404091439863469, "loss": 0.5339, "step": 404700 }, { "epoch": 54.540555106440316, "grad_norm": 0.16706368327140808, "learning_rate": 0.0004040540135932214, "loss": 0.5331, "step": 404800 }, { "epoch": 54.55402856372945, "grad_norm": 0.18007178604602814, "learning_rate": 0.00040401658732297377, "loss": 0.5335, "step": 404900 }, { "epoch": 54.56750202101859, "grad_norm": 0.175103098154068, "learning_rate": 0.0004039791610527261, "loss": 0.5334, "step": 405000 }, { "epoch": 54.580975478307735, "grad_norm": 0.16708935797214508, "learning_rate": 0.0004039417347824785, "loss": 0.5333, "step": 405100 }, { "epoch": 54.59444893559687, "grad_norm": 0.17473043501377106, "learning_rate": 0.0004039043085122309, "loss": 0.5343, "step": 405200 }, { "epoch": 54.60792239288602, "grad_norm": 0.16038939356803894, "learning_rate": 0.0004038668822419833, "loss": 0.5345, "step": 405300 }, { "epoch": 54.621395850175155, "grad_norm": 0.167421355843544, "learning_rate": 0.0004038294559717357, "loss": 0.5335, "step": 405400 }, { "epoch": 54.63486930746429, "grad_norm": 0.15784400701522827, "learning_rate": 0.0004037920297014881, "loss": 0.5345, "step": 405500 }, { "epoch": 54.64834276475344, "grad_norm": 0.16672486066818237, "learning_rate": 0.0004037546034312405, "loss": 0.5342, "step": 405600 }, { "epoch": 54.661816222042575, "grad_norm": 0.1668253242969513, "learning_rate": 0.00040371717716099285, "loss": 0.5334, "step": 405700 }, { "epoch": 54.67528967933172, "grad_norm": 0.16551053524017334, "learning_rate": 0.00040367975089074525, "loss": 0.5327, "step": 405800 }, { "epoch": 54.68876313662086, "grad_norm": 0.15789887309074402, "learning_rate": 0.0004036423246204976, "loss": 0.5332, "step": 405900 }, { "epoch": 54.702236593909994, "grad_norm": 0.15720219910144806, "learning_rate": 0.00040360489835025, "loss": 0.5337, "step": 406000 }, { "epoch": 54.71571005119914, "grad_norm": 0.1985282301902771, "learning_rate": 0.0004035674720800024, "loss": 0.5342, "step": 406100 }, { "epoch": 54.729183508488276, "grad_norm": 0.17548783123493195, "learning_rate": 0.0004035300458097548, "loss": 0.5341, "step": 406200 }, { "epoch": 54.74265696577742, "grad_norm": 0.16247881948947906, "learning_rate": 0.0004034926195395072, "loss": 0.5332, "step": 406300 }, { "epoch": 54.75613042306656, "grad_norm": 0.17647695541381836, "learning_rate": 0.0004034551932692596, "loss": 0.5342, "step": 406400 }, { "epoch": 54.769603880355696, "grad_norm": 0.16347141563892365, "learning_rate": 0.00040341776699901193, "loss": 0.5338, "step": 406500 }, { "epoch": 54.78307733764484, "grad_norm": 0.16576161980628967, "learning_rate": 0.0004033803407287643, "loss": 0.5334, "step": 406600 }, { "epoch": 54.79655079493398, "grad_norm": 0.15113547444343567, "learning_rate": 0.0004033429144585167, "loss": 0.5337, "step": 406700 }, { "epoch": 54.81002425222312, "grad_norm": 0.16312125325202942, "learning_rate": 0.0004033054881882691, "loss": 0.5337, "step": 406800 }, { "epoch": 54.82349770951226, "grad_norm": 0.16920079290866852, "learning_rate": 0.0004032680619180215, "loss": 0.533, "step": 406900 }, { "epoch": 54.836971166801405, "grad_norm": 0.1669631004333496, "learning_rate": 0.0004032306356477739, "loss": 0.5328, "step": 407000 }, { "epoch": 54.85044462409054, "grad_norm": 0.16862671077251434, "learning_rate": 0.0004031932093775263, "loss": 0.5337, "step": 407100 }, { "epoch": 54.86391808137968, "grad_norm": 0.16104431450366974, "learning_rate": 0.00040315578310727866, "loss": 0.5343, "step": 407200 }, { "epoch": 54.877391538668824, "grad_norm": 0.16758544743061066, "learning_rate": 0.00040311835683703106, "loss": 0.5335, "step": 407300 }, { "epoch": 54.89086499595796, "grad_norm": 0.1622161865234375, "learning_rate": 0.00040308093056678346, "loss": 0.5333, "step": 407400 }, { "epoch": 54.90433845324711, "grad_norm": 0.156871497631073, "learning_rate": 0.00040304350429653586, "loss": 0.5337, "step": 407500 }, { "epoch": 54.917811910536244, "grad_norm": 0.17655618488788605, "learning_rate": 0.00040300607802628825, "loss": 0.5331, "step": 407600 }, { "epoch": 54.93128536782538, "grad_norm": 0.17678005993366241, "learning_rate": 0.0004029686517560406, "loss": 0.5339, "step": 407700 }, { "epoch": 54.944758825114526, "grad_norm": 0.16863121092319489, "learning_rate": 0.000402931225485793, "loss": 0.5326, "step": 407800 }, { "epoch": 54.958232282403664, "grad_norm": 0.15411369502544403, "learning_rate": 0.00040289379921554534, "loss": 0.5342, "step": 407900 }, { "epoch": 54.97170573969281, "grad_norm": 0.16688424348831177, "learning_rate": 0.00040285637294529774, "loss": 0.5342, "step": 408000 }, { "epoch": 54.985179196981946, "grad_norm": 0.1667654663324356, "learning_rate": 0.00040281894667505014, "loss": 0.5333, "step": 408100 }, { "epoch": 54.99865265427108, "grad_norm": 0.17081615328788757, "learning_rate": 0.00040278152040480253, "loss": 0.5339, "step": 408200 }, { "epoch": 55.0, "eval_loss": 0.5219131112098694, "eval_runtime": 4.9713, "eval_samples_per_second": 1005.765, "eval_steps_per_second": 15.891, "step": 408210 }, { "epoch": 55.01212611156023, "grad_norm": 0.16090594232082367, "learning_rate": 0.00040274409413455493, "loss": 0.5327, "step": 408300 }, { "epoch": 55.025599568849366, "grad_norm": 0.17138054966926575, "learning_rate": 0.00040270666786430733, "loss": 0.5326, "step": 408400 }, { "epoch": 55.03907302613851, "grad_norm": 0.1601586639881134, "learning_rate": 0.00040266924159405973, "loss": 0.5324, "step": 408500 }, { "epoch": 55.05254648342765, "grad_norm": 0.17823415994644165, "learning_rate": 0.0004026318153238121, "loss": 0.5335, "step": 408600 }, { "epoch": 55.066019940716785, "grad_norm": 0.17052516341209412, "learning_rate": 0.00040259438905356447, "loss": 0.5323, "step": 408700 }, { "epoch": 55.07949339800593, "grad_norm": 0.16807973384857178, "learning_rate": 0.00040255696278331687, "loss": 0.5325, "step": 408800 }, { "epoch": 55.09296685529507, "grad_norm": 0.16380520164966583, "learning_rate": 0.00040251953651306927, "loss": 0.5335, "step": 408900 }, { "epoch": 55.10644031258421, "grad_norm": 0.17315632104873657, "learning_rate": 0.00040248211024282167, "loss": 0.534, "step": 409000 }, { "epoch": 55.11991376987335, "grad_norm": 0.17065651714801788, "learning_rate": 0.00040244468397257406, "loss": 0.5336, "step": 409100 }, { "epoch": 55.13338722716249, "grad_norm": 0.1656249612569809, "learning_rate": 0.00040240725770232646, "loss": 0.5339, "step": 409200 }, { "epoch": 55.14686068445163, "grad_norm": 0.16015423834323883, "learning_rate": 0.00040236983143207886, "loss": 0.5333, "step": 409300 }, { "epoch": 55.16033414174077, "grad_norm": 0.17105211317539215, "learning_rate": 0.0004023324051618312, "loss": 0.5332, "step": 409400 }, { "epoch": 55.173807599029914, "grad_norm": 0.15695279836654663, "learning_rate": 0.00040229497889158355, "loss": 0.5325, "step": 409500 }, { "epoch": 55.18728105631905, "grad_norm": 0.1606682538986206, "learning_rate": 0.00040225755262133595, "loss": 0.5343, "step": 409600 }, { "epoch": 55.20075451360819, "grad_norm": 0.1706857830286026, "learning_rate": 0.00040222012635108835, "loss": 0.5322, "step": 409700 }, { "epoch": 55.21422797089733, "grad_norm": 0.15508122742176056, "learning_rate": 0.00040218270008084074, "loss": 0.5323, "step": 409800 }, { "epoch": 55.22770142818647, "grad_norm": 0.16881118714809418, "learning_rate": 0.00040214527381059314, "loss": 0.5339, "step": 409900 }, { "epoch": 55.241174885475615, "grad_norm": 0.18320167064666748, "learning_rate": 0.00040210784754034554, "loss": 0.5328, "step": 410000 }, { "epoch": 55.25464834276475, "grad_norm": 0.16760551929473877, "learning_rate": 0.0004020704212700979, "loss": 0.533, "step": 410100 }, { "epoch": 55.26812180005389, "grad_norm": 0.1763678938150406, "learning_rate": 0.0004020329949998503, "loss": 0.5333, "step": 410200 }, { "epoch": 55.281595257343035, "grad_norm": 0.1638425588607788, "learning_rate": 0.0004019955687296027, "loss": 0.5338, "step": 410300 }, { "epoch": 55.29506871463217, "grad_norm": 0.1587848663330078, "learning_rate": 0.0004019581424593551, "loss": 0.5338, "step": 410400 }, { "epoch": 55.30854217192132, "grad_norm": 0.1645885705947876, "learning_rate": 0.0004019207161891075, "loss": 0.5324, "step": 410500 }, { "epoch": 55.322015629210455, "grad_norm": 0.16586418449878693, "learning_rate": 0.0004018832899188599, "loss": 0.5339, "step": 410600 }, { "epoch": 55.33548908649959, "grad_norm": 0.1644919067621231, "learning_rate": 0.0004018458636486123, "loss": 0.5327, "step": 410700 }, { "epoch": 55.34896254378874, "grad_norm": 0.15975262224674225, "learning_rate": 0.0004018084373783646, "loss": 0.5334, "step": 410800 }, { "epoch": 55.362436001077874, "grad_norm": 0.1613089144229889, "learning_rate": 0.000401771011108117, "loss": 0.5344, "step": 410900 }, { "epoch": 55.37590945836702, "grad_norm": 0.15898312628269196, "learning_rate": 0.0004017335848378694, "loss": 0.533, "step": 411000 }, { "epoch": 55.38938291565616, "grad_norm": 0.19507789611816406, "learning_rate": 0.0004016961585676218, "loss": 0.5336, "step": 411100 }, { "epoch": 55.4028563729453, "grad_norm": 0.1545983850955963, "learning_rate": 0.0004016587322973742, "loss": 0.5337, "step": 411200 }, { "epoch": 55.41632983023444, "grad_norm": 0.16735969483852386, "learning_rate": 0.00040162130602712655, "loss": 0.5331, "step": 411300 }, { "epoch": 55.429803287523576, "grad_norm": 0.16265740990638733, "learning_rate": 0.00040158387975687895, "loss": 0.5328, "step": 411400 }, { "epoch": 55.44327674481272, "grad_norm": 0.16494236886501312, "learning_rate": 0.00040154645348663135, "loss": 0.5333, "step": 411500 }, { "epoch": 55.45675020210186, "grad_norm": 0.16657745838165283, "learning_rate": 0.0004015090272163837, "loss": 0.5329, "step": 411600 }, { "epoch": 55.470223659391, "grad_norm": 0.1712654083967209, "learning_rate": 0.0004014716009461361, "loss": 0.532, "step": 411700 }, { "epoch": 55.48369711668014, "grad_norm": 0.16127870976924896, "learning_rate": 0.0004014341746758885, "loss": 0.5325, "step": 411800 }, { "epoch": 55.49717057396928, "grad_norm": 0.15728220343589783, "learning_rate": 0.0004013967484056409, "loss": 0.5343, "step": 411900 }, { "epoch": 55.51064403125842, "grad_norm": 0.17104727029800415, "learning_rate": 0.0004013593221353933, "loss": 0.5328, "step": 412000 }, { "epoch": 55.52411748854756, "grad_norm": 0.18679894506931305, "learning_rate": 0.0004013218958651457, "loss": 0.533, "step": 412100 }, { "epoch": 55.537590945836705, "grad_norm": 0.1638510376214981, "learning_rate": 0.0004012844695948981, "loss": 0.533, "step": 412200 }, { "epoch": 55.55106440312584, "grad_norm": 0.1622994840145111, "learning_rate": 0.00040124704332465043, "loss": 0.5343, "step": 412300 }, { "epoch": 55.56453786041498, "grad_norm": 0.17437787353992462, "learning_rate": 0.0004012096170544028, "loss": 0.5328, "step": 412400 }, { "epoch": 55.578011317704124, "grad_norm": 0.15946462750434875, "learning_rate": 0.0004011721907841552, "loss": 0.5338, "step": 412500 }, { "epoch": 55.59148477499326, "grad_norm": 0.16625472903251648, "learning_rate": 0.0004011347645139076, "loss": 0.5341, "step": 412600 }, { "epoch": 55.60495823228241, "grad_norm": 0.17464125156402588, "learning_rate": 0.00040109733824366, "loss": 0.5332, "step": 412700 }, { "epoch": 55.618431689571544, "grad_norm": 0.1604507863521576, "learning_rate": 0.0004010599119734124, "loss": 0.5333, "step": 412800 }, { "epoch": 55.63190514686068, "grad_norm": 0.15778440237045288, "learning_rate": 0.0004010224857031648, "loss": 0.5328, "step": 412900 }, { "epoch": 55.645378604149826, "grad_norm": 0.16766226291656494, "learning_rate": 0.00040098505943291716, "loss": 0.5342, "step": 413000 }, { "epoch": 55.658852061438964, "grad_norm": 0.16608819365501404, "learning_rate": 0.0004009476331626695, "loss": 0.5338, "step": 413100 }, { "epoch": 55.67232551872811, "grad_norm": 0.15156391263008118, "learning_rate": 0.0004009102068924219, "loss": 0.5332, "step": 413200 }, { "epoch": 55.685798976017246, "grad_norm": 0.16076144576072693, "learning_rate": 0.0004008727806221743, "loss": 0.5325, "step": 413300 }, { "epoch": 55.69927243330638, "grad_norm": 0.1567985564470291, "learning_rate": 0.0004008353543519267, "loss": 0.5336, "step": 413400 }, { "epoch": 55.71274589059553, "grad_norm": 0.17460009455680847, "learning_rate": 0.0004007979280816791, "loss": 0.5334, "step": 413500 }, { "epoch": 55.726219347884665, "grad_norm": 0.15539082884788513, "learning_rate": 0.0004007605018114315, "loss": 0.5327, "step": 413600 }, { "epoch": 55.73969280517381, "grad_norm": 0.1775708943605423, "learning_rate": 0.00040072307554118384, "loss": 0.5327, "step": 413700 }, { "epoch": 55.75316626246295, "grad_norm": 0.15967586636543274, "learning_rate": 0.00040068564927093624, "loss": 0.5325, "step": 413800 }, { "epoch": 55.766639719752085, "grad_norm": 0.161858469247818, "learning_rate": 0.00040064822300068864, "loss": 0.5336, "step": 413900 }, { "epoch": 55.78011317704123, "grad_norm": 0.1666560173034668, "learning_rate": 0.00040061079673044104, "loss": 0.533, "step": 414000 }, { "epoch": 55.79358663433037, "grad_norm": 0.17087894678115845, "learning_rate": 0.00040057337046019343, "loss": 0.5344, "step": 414100 }, { "epoch": 55.80706009161951, "grad_norm": 0.17153605818748474, "learning_rate": 0.00040053594418994583, "loss": 0.5336, "step": 414200 }, { "epoch": 55.82053354890865, "grad_norm": 0.15981899201869965, "learning_rate": 0.00040049851791969823, "loss": 0.5341, "step": 414300 }, { "epoch": 55.83400700619779, "grad_norm": 0.16122595965862274, "learning_rate": 0.00040046109164945063, "loss": 0.5329, "step": 414400 }, { "epoch": 55.84748046348693, "grad_norm": 0.17681366205215454, "learning_rate": 0.000400423665379203, "loss": 0.5326, "step": 414500 }, { "epoch": 55.86095392077607, "grad_norm": 0.15009845793247223, "learning_rate": 0.00040038623910895537, "loss": 0.5332, "step": 414600 }, { "epoch": 55.874427378065214, "grad_norm": 0.14725716412067413, "learning_rate": 0.00040034881283870777, "loss": 0.533, "step": 414700 }, { "epoch": 55.88790083535435, "grad_norm": 0.17313933372497559, "learning_rate": 0.00040031138656846017, "loss": 0.5339, "step": 414800 }, { "epoch": 55.901374292643496, "grad_norm": 0.17927514016628265, "learning_rate": 0.0004002739602982125, "loss": 0.5326, "step": 414900 }, { "epoch": 55.91484774993263, "grad_norm": 0.16691939532756805, "learning_rate": 0.0004002365340279649, "loss": 0.5326, "step": 415000 }, { "epoch": 55.92832120722177, "grad_norm": 0.16007177531719208, "learning_rate": 0.0004001991077577173, "loss": 0.5333, "step": 415100 }, { "epoch": 55.941794664510915, "grad_norm": 0.1608082801103592, "learning_rate": 0.00040016168148746965, "loss": 0.5338, "step": 415200 }, { "epoch": 55.95526812180005, "grad_norm": 0.15851359069347382, "learning_rate": 0.00040012425521722205, "loss": 0.533, "step": 415300 }, { "epoch": 55.9687415790892, "grad_norm": 0.1769058108329773, "learning_rate": 0.00040008682894697445, "loss": 0.5338, "step": 415400 }, { "epoch": 55.982215036378335, "grad_norm": 0.1685706377029419, "learning_rate": 0.00040004940267672685, "loss": 0.5332, "step": 415500 }, { "epoch": 55.99568849366747, "grad_norm": 0.15608832240104675, "learning_rate": 0.00040001197640647925, "loss": 0.5339, "step": 415600 }, { "epoch": 56.0, "eval_loss": 0.5215436220169067, "eval_runtime": 4.9901, "eval_samples_per_second": 1001.98, "eval_steps_per_second": 15.831, "step": 415632 }, { "epoch": 56.00916195095662, "grad_norm": 0.16733983159065247, "learning_rate": 0.00039997455013623164, "loss": 0.5321, "step": 415700 }, { "epoch": 56.022635408245755, "grad_norm": 0.1732570230960846, "learning_rate": 0.00039993712386598404, "loss": 0.5318, "step": 415800 }, { "epoch": 56.0361088655349, "grad_norm": 0.16072948276996613, "learning_rate": 0.0003998996975957364, "loss": 0.5332, "step": 415900 }, { "epoch": 56.04958232282404, "grad_norm": 0.1571209579706192, "learning_rate": 0.0003998622713254888, "loss": 0.533, "step": 416000 }, { "epoch": 56.063055780113174, "grad_norm": 0.1764611005783081, "learning_rate": 0.0003998248450552412, "loss": 0.5334, "step": 416100 }, { "epoch": 56.07652923740232, "grad_norm": 0.17006206512451172, "learning_rate": 0.0003997874187849936, "loss": 0.5332, "step": 416200 }, { "epoch": 56.09000269469146, "grad_norm": 0.1802278608083725, "learning_rate": 0.000399749992514746, "loss": 0.531, "step": 416300 }, { "epoch": 56.1034761519806, "grad_norm": 0.16861173510551453, "learning_rate": 0.0003997125662444984, "loss": 0.5333, "step": 416400 }, { "epoch": 56.11694960926974, "grad_norm": 0.16089081764221191, "learning_rate": 0.0003996751399742508, "loss": 0.531, "step": 416500 }, { "epoch": 56.130423066558876, "grad_norm": 0.15762804448604584, "learning_rate": 0.0003996377137040032, "loss": 0.5324, "step": 416600 }, { "epoch": 56.14389652384802, "grad_norm": 0.15425695478916168, "learning_rate": 0.00039960028743375546, "loss": 0.5325, "step": 416700 }, { "epoch": 56.15736998113716, "grad_norm": 0.17497391998767853, "learning_rate": 0.00039956286116350786, "loss": 0.5327, "step": 416800 }, { "epoch": 56.1708434384263, "grad_norm": 0.16449551284313202, "learning_rate": 0.00039952543489326026, "loss": 0.5323, "step": 416900 }, { "epoch": 56.18431689571544, "grad_norm": 0.17285434901714325, "learning_rate": 0.00039948800862301266, "loss": 0.5326, "step": 417000 }, { "epoch": 56.19779035300458, "grad_norm": 0.16161639988422394, "learning_rate": 0.00039945058235276506, "loss": 0.5326, "step": 417100 }, { "epoch": 56.21126381029372, "grad_norm": 0.15889199078083038, "learning_rate": 0.00039941315608251745, "loss": 0.5323, "step": 417200 }, { "epoch": 56.22473726758286, "grad_norm": 0.16540437936782837, "learning_rate": 0.00039937572981226985, "loss": 0.533, "step": 417300 }, { "epoch": 56.238210724872005, "grad_norm": 0.16353638470172882, "learning_rate": 0.0003993383035420222, "loss": 0.5327, "step": 417400 }, { "epoch": 56.25168418216114, "grad_norm": 0.1693779081106186, "learning_rate": 0.0003993008772717746, "loss": 0.5324, "step": 417500 }, { "epoch": 56.26515763945028, "grad_norm": 0.16311967372894287, "learning_rate": 0.000399263451001527, "loss": 0.5336, "step": 417600 }, { "epoch": 56.278631096739424, "grad_norm": 0.160919189453125, "learning_rate": 0.0003992260247312794, "loss": 0.5328, "step": 417700 }, { "epoch": 56.29210455402856, "grad_norm": 0.1834770143032074, "learning_rate": 0.0003991885984610318, "loss": 0.5326, "step": 417800 }, { "epoch": 56.30557801131771, "grad_norm": 0.16002944111824036, "learning_rate": 0.0003991511721907842, "loss": 0.5336, "step": 417900 }, { "epoch": 56.319051468606844, "grad_norm": 0.15761518478393555, "learning_rate": 0.0003991137459205366, "loss": 0.5334, "step": 418000 }, { "epoch": 56.33252492589598, "grad_norm": 0.19262349605560303, "learning_rate": 0.00039907631965028893, "loss": 0.5331, "step": 418100 }, { "epoch": 56.345998383185126, "grad_norm": 0.18862544000148773, "learning_rate": 0.00039903889338004133, "loss": 0.534, "step": 418200 }, { "epoch": 56.359471840474264, "grad_norm": 0.16646257042884827, "learning_rate": 0.0003990014671097937, "loss": 0.5322, "step": 418300 }, { "epoch": 56.37294529776341, "grad_norm": 0.15726757049560547, "learning_rate": 0.0003989640408395461, "loss": 0.5333, "step": 418400 }, { "epoch": 56.386418755052546, "grad_norm": 0.16282248497009277, "learning_rate": 0.00039892661456929847, "loss": 0.5317, "step": 418500 }, { "epoch": 56.39989221234169, "grad_norm": 0.18287912011146545, "learning_rate": 0.00039888918829905087, "loss": 0.5329, "step": 418600 }, { "epoch": 56.41336566963083, "grad_norm": 0.1649145483970642, "learning_rate": 0.00039885176202880327, "loss": 0.5333, "step": 418700 }, { "epoch": 56.426839126919965, "grad_norm": 0.16444478929042816, "learning_rate": 0.0003988143357585556, "loss": 0.5329, "step": 418800 }, { "epoch": 56.44031258420911, "grad_norm": 0.1787283569574356, "learning_rate": 0.000398776909488308, "loss": 0.5324, "step": 418900 }, { "epoch": 56.45378604149825, "grad_norm": 0.16364824771881104, "learning_rate": 0.0003987394832180604, "loss": 0.5337, "step": 419000 }, { "epoch": 56.46725949878739, "grad_norm": 0.1836814433336258, "learning_rate": 0.0003987020569478128, "loss": 0.5335, "step": 419100 }, { "epoch": 56.48073295607653, "grad_norm": 0.17397461831569672, "learning_rate": 0.0003986646306775652, "loss": 0.5349, "step": 419200 }, { "epoch": 56.49420641336567, "grad_norm": 0.1833903044462204, "learning_rate": 0.0003986272044073176, "loss": 0.5333, "step": 419300 }, { "epoch": 56.50767987065481, "grad_norm": 0.1558668464422226, "learning_rate": 0.00039858977813707, "loss": 0.5329, "step": 419400 }, { "epoch": 56.52115332794395, "grad_norm": 0.16303583979606628, "learning_rate": 0.0003985523518668224, "loss": 0.5326, "step": 419500 }, { "epoch": 56.534626785233094, "grad_norm": 0.16531561315059662, "learning_rate": 0.00039851492559657474, "loss": 0.5326, "step": 419600 }, { "epoch": 56.54810024252223, "grad_norm": 0.19312965869903564, "learning_rate": 0.00039847749932632714, "loss": 0.5326, "step": 419700 }, { "epoch": 56.56157369981137, "grad_norm": 0.16278789937496185, "learning_rate": 0.00039844007305607954, "loss": 0.5328, "step": 419800 }, { "epoch": 56.575047157100514, "grad_norm": 0.15624016523361206, "learning_rate": 0.00039840264678583194, "loss": 0.5332, "step": 419900 }, { "epoch": 56.58852061438965, "grad_norm": 0.17616747319698334, "learning_rate": 0.00039836522051558433, "loss": 0.5324, "step": 420000 }, { "epoch": 56.601994071678796, "grad_norm": 0.16244013607501984, "learning_rate": 0.00039832779424533673, "loss": 0.5324, "step": 420100 }, { "epoch": 56.61546752896793, "grad_norm": 0.14926983416080475, "learning_rate": 0.00039829036797508913, "loss": 0.5317, "step": 420200 }, { "epoch": 56.62894098625707, "grad_norm": 0.16351266205310822, "learning_rate": 0.0003982529417048415, "loss": 0.5326, "step": 420300 }, { "epoch": 56.642414443546215, "grad_norm": 0.16922031342983246, "learning_rate": 0.0003982155154345938, "loss": 0.5335, "step": 420400 }, { "epoch": 56.65588790083535, "grad_norm": 0.15950240194797516, "learning_rate": 0.0003981780891643462, "loss": 0.5326, "step": 420500 }, { "epoch": 56.6693613581245, "grad_norm": 0.17180295288562775, "learning_rate": 0.0003981406628940986, "loss": 0.5333, "step": 420600 }, { "epoch": 56.682834815413635, "grad_norm": 0.17854048311710358, "learning_rate": 0.000398103236623851, "loss": 0.5333, "step": 420700 }, { "epoch": 56.69630827270277, "grad_norm": 0.16291745007038116, "learning_rate": 0.0003980658103536034, "loss": 0.5323, "step": 420800 }, { "epoch": 56.70978172999192, "grad_norm": 0.16731683909893036, "learning_rate": 0.0003980283840833558, "loss": 0.5324, "step": 420900 }, { "epoch": 56.723255187281055, "grad_norm": 0.17534618079662323, "learning_rate": 0.00039799095781310815, "loss": 0.5321, "step": 421000 }, { "epoch": 56.7367286445702, "grad_norm": 0.1622830182313919, "learning_rate": 0.00039795353154286055, "loss": 0.5328, "step": 421100 }, { "epoch": 56.75020210185934, "grad_norm": 0.16572310030460358, "learning_rate": 0.00039791610527261295, "loss": 0.5324, "step": 421200 }, { "epoch": 56.763675559148474, "grad_norm": 0.16909842193126678, "learning_rate": 0.00039787867900236535, "loss": 0.5336, "step": 421300 }, { "epoch": 56.77714901643762, "grad_norm": 0.179739847779274, "learning_rate": 0.00039784125273211775, "loss": 0.5341, "step": 421400 }, { "epoch": 56.790622473726756, "grad_norm": 0.1630297303199768, "learning_rate": 0.00039780382646187014, "loss": 0.5335, "step": 421500 }, { "epoch": 56.8040959310159, "grad_norm": 0.16256679594516754, "learning_rate": 0.00039776640019162254, "loss": 0.5337, "step": 421600 }, { "epoch": 56.81756938830504, "grad_norm": 0.16313040256500244, "learning_rate": 0.00039772897392137494, "loss": 0.5321, "step": 421700 }, { "epoch": 56.831042845594176, "grad_norm": 0.1745389848947525, "learning_rate": 0.0003976915476511273, "loss": 0.5333, "step": 421800 }, { "epoch": 56.84451630288332, "grad_norm": 0.17240141332149506, "learning_rate": 0.0003976541213808797, "loss": 0.532, "step": 421900 }, { "epoch": 56.85798976017246, "grad_norm": 0.18036098778247833, "learning_rate": 0.0003976166951106321, "loss": 0.5329, "step": 422000 }, { "epoch": 56.8714632174616, "grad_norm": 0.15597333014011383, "learning_rate": 0.0003975792688403845, "loss": 0.532, "step": 422100 }, { "epoch": 56.88493667475074, "grad_norm": 0.1662895679473877, "learning_rate": 0.0003975418425701368, "loss": 0.5322, "step": 422200 }, { "epoch": 56.898410132039885, "grad_norm": 0.16163340210914612, "learning_rate": 0.0003975044162998892, "loss": 0.5325, "step": 422300 }, { "epoch": 56.91188358932902, "grad_norm": 0.159813791513443, "learning_rate": 0.0003974669900296416, "loss": 0.5333, "step": 422400 }, { "epoch": 56.92535704661816, "grad_norm": 0.16654448211193085, "learning_rate": 0.00039742956375939396, "loss": 0.5327, "step": 422500 }, { "epoch": 56.938830503907305, "grad_norm": 0.17712745070457458, "learning_rate": 0.00039739213748914636, "loss": 0.5327, "step": 422600 }, { "epoch": 56.95230396119644, "grad_norm": 0.15728163719177246, "learning_rate": 0.00039735471121889876, "loss": 0.5336, "step": 422700 }, { "epoch": 56.96577741848559, "grad_norm": 0.17221461236476898, "learning_rate": 0.00039731728494865116, "loss": 0.5325, "step": 422800 }, { "epoch": 56.979250875774724, "grad_norm": 0.17468085885047913, "learning_rate": 0.00039727985867840356, "loss": 0.5327, "step": 422900 }, { "epoch": 56.99272433306386, "grad_norm": 0.18168891966342926, "learning_rate": 0.00039724243240815596, "loss": 0.5325, "step": 423000 }, { "epoch": 57.0, "eval_loss": 0.5212702751159668, "eval_runtime": 4.9619, "eval_samples_per_second": 1007.672, "eval_steps_per_second": 15.921, "step": 423054 }, { "epoch": 57.006197790353006, "grad_norm": 0.1741379052400589, "learning_rate": 0.00039720500613790835, "loss": 0.5329, "step": 423100 }, { "epoch": 57.019671247642144, "grad_norm": 0.17741447687149048, "learning_rate": 0.0003971675798676607, "loss": 0.5328, "step": 423200 }, { "epoch": 57.03314470493129, "grad_norm": 0.17315010726451874, "learning_rate": 0.0003971301535974131, "loss": 0.5321, "step": 423300 }, { "epoch": 57.046618162220426, "grad_norm": 0.16728916764259338, "learning_rate": 0.0003970927273271655, "loss": 0.5328, "step": 423400 }, { "epoch": 57.06009161950956, "grad_norm": 0.17281128466129303, "learning_rate": 0.0003970553010569179, "loss": 0.5313, "step": 423500 }, { "epoch": 57.07356507679871, "grad_norm": 0.15886937081813812, "learning_rate": 0.0003970178747866703, "loss": 0.5315, "step": 423600 }, { "epoch": 57.087038534087846, "grad_norm": 0.16918517649173737, "learning_rate": 0.0003969804485164227, "loss": 0.5321, "step": 423700 }, { "epoch": 57.10051199137699, "grad_norm": 0.16753724217414856, "learning_rate": 0.0003969430222461751, "loss": 0.5331, "step": 423800 }, { "epoch": 57.11398544866613, "grad_norm": 0.1725275069475174, "learning_rate": 0.00039690559597592743, "loss": 0.5327, "step": 423900 }, { "epoch": 57.127458905955265, "grad_norm": 0.17590458691120148, "learning_rate": 0.0003968681697056798, "loss": 0.5318, "step": 424000 }, { "epoch": 57.14093236324441, "grad_norm": 0.16189634799957275, "learning_rate": 0.0003968307434354322, "loss": 0.5318, "step": 424100 }, { "epoch": 57.15440582053355, "grad_norm": 0.1740255504846573, "learning_rate": 0.00039679331716518457, "loss": 0.5325, "step": 424200 }, { "epoch": 57.16787927782269, "grad_norm": 0.16053909063339233, "learning_rate": 0.00039675589089493697, "loss": 0.532, "step": 424300 }, { "epoch": 57.18135273511183, "grad_norm": 0.16356515884399414, "learning_rate": 0.00039671846462468937, "loss": 0.532, "step": 424400 }, { "epoch": 57.19482619240097, "grad_norm": 0.15960854291915894, "learning_rate": 0.00039668103835444177, "loss": 0.5316, "step": 424500 }, { "epoch": 57.20829964969011, "grad_norm": 0.17442262172698975, "learning_rate": 0.00039664361208419416, "loss": 0.5328, "step": 424600 }, { "epoch": 57.22177310697925, "grad_norm": 0.15985746681690216, "learning_rate": 0.0003966061858139465, "loss": 0.532, "step": 424700 }, { "epoch": 57.235246564268394, "grad_norm": 0.16446448862552643, "learning_rate": 0.0003965687595436989, "loss": 0.5331, "step": 424800 }, { "epoch": 57.24872002155753, "grad_norm": 0.16436736285686493, "learning_rate": 0.0003965313332734513, "loss": 0.5316, "step": 424900 }, { "epoch": 57.26219347884667, "grad_norm": 0.16192424297332764, "learning_rate": 0.0003964939070032037, "loss": 0.5322, "step": 425000 }, { "epoch": 57.27566693613581, "grad_norm": 0.16594868898391724, "learning_rate": 0.0003964564807329561, "loss": 0.5312, "step": 425100 }, { "epoch": 57.28914039342495, "grad_norm": 0.17409563064575195, "learning_rate": 0.0003964190544627085, "loss": 0.5316, "step": 425200 }, { "epoch": 57.302613850714096, "grad_norm": 0.16562150418758392, "learning_rate": 0.0003963816281924609, "loss": 0.532, "step": 425300 }, { "epoch": 57.31608730800323, "grad_norm": 0.1599189043045044, "learning_rate": 0.00039634420192221324, "loss": 0.5317, "step": 425400 }, { "epoch": 57.32956076529237, "grad_norm": 0.16219648718833923, "learning_rate": 0.00039630677565196564, "loss": 0.5315, "step": 425500 }, { "epoch": 57.343034222581515, "grad_norm": 0.18287543952465057, "learning_rate": 0.00039626934938171804, "loss": 0.533, "step": 425600 }, { "epoch": 57.35650767987065, "grad_norm": 0.16604387760162354, "learning_rate": 0.00039623192311147044, "loss": 0.5324, "step": 425700 }, { "epoch": 57.3699811371598, "grad_norm": 0.1700259894132614, "learning_rate": 0.0003961944968412228, "loss": 0.5308, "step": 425800 }, { "epoch": 57.383454594448935, "grad_norm": 0.1667424440383911, "learning_rate": 0.0003961570705709752, "loss": 0.5317, "step": 425900 }, { "epoch": 57.39692805173807, "grad_norm": 0.17270569503307343, "learning_rate": 0.0003961196443007276, "loss": 0.5321, "step": 426000 }, { "epoch": 57.41040150902722, "grad_norm": 0.17160969972610474, "learning_rate": 0.0003960822180304799, "loss": 0.5327, "step": 426100 }, { "epoch": 57.423874966316355, "grad_norm": 0.1596253365278244, "learning_rate": 0.0003960447917602323, "loss": 0.5331, "step": 426200 }, { "epoch": 57.4373484236055, "grad_norm": 0.1773855835199356, "learning_rate": 0.0003960073654899847, "loss": 0.5318, "step": 426300 }, { "epoch": 57.45082188089464, "grad_norm": 0.16689664125442505, "learning_rate": 0.0003959699392197371, "loss": 0.5326, "step": 426400 }, { "epoch": 57.46429533818378, "grad_norm": 0.15811356902122498, "learning_rate": 0.0003959325129494895, "loss": 0.5328, "step": 426500 }, { "epoch": 57.47776879547292, "grad_norm": 0.1818304806947708, "learning_rate": 0.0003958950866792419, "loss": 0.5331, "step": 426600 }, { "epoch": 57.491242252762056, "grad_norm": 0.1719752699136734, "learning_rate": 0.0003958576604089943, "loss": 0.5329, "step": 426700 }, { "epoch": 57.5047157100512, "grad_norm": 0.1653396636247635, "learning_rate": 0.0003958202341387467, "loss": 0.5328, "step": 426800 }, { "epoch": 57.51818916734034, "grad_norm": 0.1608920395374298, "learning_rate": 0.00039578280786849905, "loss": 0.5328, "step": 426900 }, { "epoch": 57.53166262462948, "grad_norm": 0.16755282878875732, "learning_rate": 0.00039574538159825145, "loss": 0.5316, "step": 427000 }, { "epoch": 57.54513608191862, "grad_norm": 0.1642603874206543, "learning_rate": 0.00039570795532800385, "loss": 0.5322, "step": 427100 }, { "epoch": 57.55860953920776, "grad_norm": 0.17173312604427338, "learning_rate": 0.00039567052905775625, "loss": 0.5326, "step": 427200 }, { "epoch": 57.5720829964969, "grad_norm": 0.1689954400062561, "learning_rate": 0.00039563310278750865, "loss": 0.5329, "step": 427300 }, { "epoch": 57.58555645378604, "grad_norm": 0.17017415165901184, "learning_rate": 0.00039559567651726104, "loss": 0.5316, "step": 427400 }, { "epoch": 57.599029911075185, "grad_norm": 0.16675716638565063, "learning_rate": 0.00039555825024701344, "loss": 0.5323, "step": 427500 }, { "epoch": 57.61250336836432, "grad_norm": 0.15378272533416748, "learning_rate": 0.00039552082397676573, "loss": 0.5328, "step": 427600 }, { "epoch": 57.62597682565346, "grad_norm": 0.15733768045902252, "learning_rate": 0.00039548339770651813, "loss": 0.531, "step": 427700 }, { "epoch": 57.639450282942605, "grad_norm": 0.1688629388809204, "learning_rate": 0.00039544597143627053, "loss": 0.532, "step": 427800 }, { "epoch": 57.65292374023174, "grad_norm": 0.15587887167930603, "learning_rate": 0.0003954085451660229, "loss": 0.5318, "step": 427900 }, { "epoch": 57.66639719752089, "grad_norm": 0.15600213408470154, "learning_rate": 0.0003953711188957753, "loss": 0.5327, "step": 428000 }, { "epoch": 57.679870654810024, "grad_norm": 0.1753648966550827, "learning_rate": 0.0003953336926255277, "loss": 0.5326, "step": 428100 }, { "epoch": 57.69334411209916, "grad_norm": 0.16544198989868164, "learning_rate": 0.0003952962663552801, "loss": 0.5325, "step": 428200 }, { "epoch": 57.706817569388306, "grad_norm": 0.15999862551689148, "learning_rate": 0.00039525884008503247, "loss": 0.5326, "step": 428300 }, { "epoch": 57.720291026677444, "grad_norm": 0.17912288010120392, "learning_rate": 0.00039522141381478486, "loss": 0.5333, "step": 428400 }, { "epoch": 57.73376448396659, "grad_norm": 0.15834744274616241, "learning_rate": 0.00039518398754453726, "loss": 0.5335, "step": 428500 }, { "epoch": 57.747237941255726, "grad_norm": 0.17013922333717346, "learning_rate": 0.00039514656127428966, "loss": 0.5318, "step": 428600 }, { "epoch": 57.76071139854486, "grad_norm": 0.1688043177127838, "learning_rate": 0.00039510913500404206, "loss": 0.5329, "step": 428700 }, { "epoch": 57.77418485583401, "grad_norm": 0.1638663411140442, "learning_rate": 0.00039507170873379446, "loss": 0.5324, "step": 428800 }, { "epoch": 57.787658313123146, "grad_norm": 0.182266965508461, "learning_rate": 0.00039503428246354686, "loss": 0.5334, "step": 428900 }, { "epoch": 57.80113177041229, "grad_norm": 0.16389918327331543, "learning_rate": 0.0003949968561932992, "loss": 0.5327, "step": 429000 }, { "epoch": 57.81460522770143, "grad_norm": 0.17295487225055695, "learning_rate": 0.0003949594299230516, "loss": 0.5324, "step": 429100 }, { "epoch": 57.828078684990565, "grad_norm": 0.1992053985595703, "learning_rate": 0.000394922003652804, "loss": 0.5333, "step": 429200 }, { "epoch": 57.84155214227971, "grad_norm": 0.16508732736110687, "learning_rate": 0.0003948845773825564, "loss": 0.5326, "step": 429300 }, { "epoch": 57.85502559956885, "grad_norm": 0.18095728754997253, "learning_rate": 0.00039484715111230874, "loss": 0.5337, "step": 429400 }, { "epoch": 57.86849905685799, "grad_norm": 0.17261917889118195, "learning_rate": 0.00039480972484206114, "loss": 0.5332, "step": 429500 }, { "epoch": 57.88197251414713, "grad_norm": 0.16661140322685242, "learning_rate": 0.00039477229857181353, "loss": 0.5328, "step": 429600 }, { "epoch": 57.895445971436274, "grad_norm": 0.1672297567129135, "learning_rate": 0.00039473487230156593, "loss": 0.5334, "step": 429700 }, { "epoch": 57.90891942872541, "grad_norm": 0.17437425255775452, "learning_rate": 0.0003946974460313183, "loss": 0.5331, "step": 429800 }, { "epoch": 57.92239288601455, "grad_norm": 0.1656084954738617, "learning_rate": 0.0003946600197610707, "loss": 0.5326, "step": 429900 }, { "epoch": 57.935866343303694, "grad_norm": 0.17133209109306335, "learning_rate": 0.0003946225934908231, "loss": 0.5329, "step": 430000 }, { "epoch": 57.94933980059283, "grad_norm": 0.16616758704185486, "learning_rate": 0.00039458516722057547, "loss": 0.5327, "step": 430100 }, { "epoch": 57.962813257881976, "grad_norm": 0.16120696067810059, "learning_rate": 0.00039454774095032787, "loss": 0.5334, "step": 430200 }, { "epoch": 57.97628671517111, "grad_norm": 0.16212531924247742, "learning_rate": 0.00039451031468008027, "loss": 0.5326, "step": 430300 }, { "epoch": 57.98976017246025, "grad_norm": 0.17231859266757965, "learning_rate": 0.00039447288840983267, "loss": 0.532, "step": 430400 }, { "epoch": 58.0, "eval_loss": 0.5205399394035339, "eval_runtime": 4.9693, "eval_samples_per_second": 1006.175, "eval_steps_per_second": 15.898, "step": 430476 }, { "epoch": 58.003233629749396, "grad_norm": 0.1650974452495575, "learning_rate": 0.000394435462139585, "loss": 0.5329, "step": 430500 }, { "epoch": 58.01670708703853, "grad_norm": 0.16885708272457123, "learning_rate": 0.0003943980358693374, "loss": 0.5311, "step": 430600 }, { "epoch": 58.03018054432768, "grad_norm": 0.1835523545742035, "learning_rate": 0.0003943606095990898, "loss": 0.5301, "step": 430700 }, { "epoch": 58.043654001616815, "grad_norm": 0.16076242923736572, "learning_rate": 0.0003943231833288422, "loss": 0.5325, "step": 430800 }, { "epoch": 58.05712745890595, "grad_norm": 0.15435940027236938, "learning_rate": 0.0003942857570585946, "loss": 0.5309, "step": 430900 }, { "epoch": 58.0706009161951, "grad_norm": 0.18238072097301483, "learning_rate": 0.000394248330788347, "loss": 0.5323, "step": 431000 }, { "epoch": 58.084074373484235, "grad_norm": 0.1656237691640854, "learning_rate": 0.0003942109045180994, "loss": 0.5312, "step": 431100 }, { "epoch": 58.09754783077338, "grad_norm": 0.15699079632759094, "learning_rate": 0.0003941734782478517, "loss": 0.5315, "step": 431200 }, { "epoch": 58.11102128806252, "grad_norm": 0.15783850848674774, "learning_rate": 0.0003941360519776041, "loss": 0.5326, "step": 431300 }, { "epoch": 58.124494745351655, "grad_norm": 0.18947680294513702, "learning_rate": 0.0003940986257073565, "loss": 0.5322, "step": 431400 }, { "epoch": 58.1379682026408, "grad_norm": 0.15363310277462006, "learning_rate": 0.0003940611994371089, "loss": 0.532, "step": 431500 }, { "epoch": 58.15144165992994, "grad_norm": 0.16453775763511658, "learning_rate": 0.0003940237731668613, "loss": 0.5328, "step": 431600 }, { "epoch": 58.16491511721908, "grad_norm": 0.18371380865573883, "learning_rate": 0.0003939863468966137, "loss": 0.5319, "step": 431700 }, { "epoch": 58.17838857450822, "grad_norm": 0.18010009825229645, "learning_rate": 0.0003939489206263661, "loss": 0.5329, "step": 431800 }, { "epoch": 58.191862031797356, "grad_norm": 0.19152270257472992, "learning_rate": 0.0003939114943561184, "loss": 0.5315, "step": 431900 }, { "epoch": 58.2053354890865, "grad_norm": 0.15721027553081512, "learning_rate": 0.0003938740680858708, "loss": 0.5328, "step": 432000 }, { "epoch": 58.21880894637564, "grad_norm": 0.17501860857009888, "learning_rate": 0.0003938366418156232, "loss": 0.5325, "step": 432100 }, { "epoch": 58.23228240366478, "grad_norm": 0.15471284091472626, "learning_rate": 0.0003937992155453756, "loss": 0.533, "step": 432200 }, { "epoch": 58.24575586095392, "grad_norm": 0.16894672811031342, "learning_rate": 0.000393761789275128, "loss": 0.5315, "step": 432300 }, { "epoch": 58.25922931824306, "grad_norm": 0.1626005917787552, "learning_rate": 0.0003937243630048804, "loss": 0.5318, "step": 432400 }, { "epoch": 58.2727027755322, "grad_norm": 0.15841448307037354, "learning_rate": 0.0003936869367346328, "loss": 0.5315, "step": 432500 }, { "epoch": 58.28617623282134, "grad_norm": 0.15428443253040314, "learning_rate": 0.0003936495104643852, "loss": 0.5322, "step": 432600 }, { "epoch": 58.299649690110485, "grad_norm": 0.1742153912782669, "learning_rate": 0.00039361208419413755, "loss": 0.5323, "step": 432700 }, { "epoch": 58.31312314739962, "grad_norm": 0.16711537539958954, "learning_rate": 0.00039357465792388995, "loss": 0.5317, "step": 432800 }, { "epoch": 58.32659660468876, "grad_norm": 0.1600601077079773, "learning_rate": 0.00039353723165364235, "loss": 0.5322, "step": 432900 }, { "epoch": 58.340070061977904, "grad_norm": 0.1545344591140747, "learning_rate": 0.0003934998053833947, "loss": 0.5313, "step": 433000 }, { "epoch": 58.35354351926704, "grad_norm": 0.1575859934091568, "learning_rate": 0.0003934623791131471, "loss": 0.5323, "step": 433100 }, { "epoch": 58.36701697655619, "grad_norm": 0.14654594659805298, "learning_rate": 0.0003934249528428995, "loss": 0.5319, "step": 433200 }, { "epoch": 58.380490433845324, "grad_norm": 0.16343405842781067, "learning_rate": 0.0003933875265726519, "loss": 0.5324, "step": 433300 }, { "epoch": 58.39396389113446, "grad_norm": 0.1734415739774704, "learning_rate": 0.00039335010030240423, "loss": 0.5316, "step": 433400 }, { "epoch": 58.407437348423606, "grad_norm": 0.176221564412117, "learning_rate": 0.00039331267403215663, "loss": 0.5317, "step": 433500 }, { "epoch": 58.420910805712744, "grad_norm": 0.16772539913654327, "learning_rate": 0.00039327524776190903, "loss": 0.5322, "step": 433600 }, { "epoch": 58.43438426300189, "grad_norm": 0.18040847778320312, "learning_rate": 0.00039323782149166143, "loss": 0.5315, "step": 433700 }, { "epoch": 58.447857720291026, "grad_norm": 0.1661454141139984, "learning_rate": 0.0003932003952214138, "loss": 0.5325, "step": 433800 }, { "epoch": 58.46133117758017, "grad_norm": 0.17197471857070923, "learning_rate": 0.0003931629689511662, "loss": 0.5328, "step": 433900 }, { "epoch": 58.47480463486931, "grad_norm": 0.17053113877773285, "learning_rate": 0.0003931255426809186, "loss": 0.5318, "step": 434000 }, { "epoch": 58.488278092158446, "grad_norm": 0.17964458465576172, "learning_rate": 0.00039308811641067097, "loss": 0.534, "step": 434100 }, { "epoch": 58.50175154944759, "grad_norm": 0.16791771352291107, "learning_rate": 0.00039305069014042337, "loss": 0.5321, "step": 434200 }, { "epoch": 58.51522500673673, "grad_norm": 0.17017802596092224, "learning_rate": 0.00039301326387017576, "loss": 0.5318, "step": 434300 }, { "epoch": 58.52869846402587, "grad_norm": 0.16511553525924683, "learning_rate": 0.00039297583759992816, "loss": 0.5322, "step": 434400 }, { "epoch": 58.54217192131501, "grad_norm": 0.1609744131565094, "learning_rate": 0.00039293841132968056, "loss": 0.5323, "step": 434500 }, { "epoch": 58.55564537860415, "grad_norm": 0.15700431168079376, "learning_rate": 0.00039290098505943296, "loss": 0.532, "step": 434600 }, { "epoch": 58.56911883589329, "grad_norm": 0.1538897305727005, "learning_rate": 0.00039286355878918536, "loss": 0.5317, "step": 434700 }, { "epoch": 58.58259229318243, "grad_norm": 0.15404987335205078, "learning_rate": 0.0003928261325189377, "loss": 0.5339, "step": 434800 }, { "epoch": 58.596065750471574, "grad_norm": 0.15823377668857574, "learning_rate": 0.00039278870624869004, "loss": 0.5327, "step": 434900 }, { "epoch": 58.60953920776071, "grad_norm": 0.15894301235675812, "learning_rate": 0.00039275127997844244, "loss": 0.5321, "step": 435000 }, { "epoch": 58.62301266504985, "grad_norm": 0.17335838079452515, "learning_rate": 0.00039271385370819484, "loss": 0.5329, "step": 435100 }, { "epoch": 58.636486122338994, "grad_norm": 0.16305634379386902, "learning_rate": 0.00039267642743794724, "loss": 0.5311, "step": 435200 }, { "epoch": 58.64995957962813, "grad_norm": 0.17993609607219696, "learning_rate": 0.00039263900116769964, "loss": 0.5307, "step": 435300 }, { "epoch": 58.663433036917276, "grad_norm": 0.16387666761875153, "learning_rate": 0.00039260157489745204, "loss": 0.5329, "step": 435400 }, { "epoch": 58.67690649420641, "grad_norm": 0.17318862676620483, "learning_rate": 0.00039256414862720443, "loss": 0.5333, "step": 435500 }, { "epoch": 58.69037995149555, "grad_norm": 0.1674841046333313, "learning_rate": 0.0003925267223569568, "loss": 0.5324, "step": 435600 }, { "epoch": 58.703853408784695, "grad_norm": 0.15490388870239258, "learning_rate": 0.0003924892960867092, "loss": 0.5319, "step": 435700 }, { "epoch": 58.71732686607383, "grad_norm": 0.18666067719459534, "learning_rate": 0.0003924518698164616, "loss": 0.5331, "step": 435800 }, { "epoch": 58.73080032336298, "grad_norm": 0.17659056186676025, "learning_rate": 0.00039241444354621397, "loss": 0.5322, "step": 435900 }, { "epoch": 58.744273780652115, "grad_norm": 0.15726184844970703, "learning_rate": 0.00039237701727596637, "loss": 0.5319, "step": 436000 }, { "epoch": 58.75774723794125, "grad_norm": 0.17744648456573486, "learning_rate": 0.00039233959100571877, "loss": 0.5322, "step": 436100 }, { "epoch": 58.7712206952304, "grad_norm": 0.15942205488681793, "learning_rate": 0.00039230216473547117, "loss": 0.5326, "step": 436200 }, { "epoch": 58.784694152519535, "grad_norm": 0.17552779614925385, "learning_rate": 0.0003922647384652235, "loss": 0.5328, "step": 436300 }, { "epoch": 58.79816760980868, "grad_norm": 0.15031267702579498, "learning_rate": 0.0003922273121949759, "loss": 0.5324, "step": 436400 }, { "epoch": 58.81164106709782, "grad_norm": 0.16508065164089203, "learning_rate": 0.0003921898859247283, "loss": 0.5319, "step": 436500 }, { "epoch": 58.825114524386954, "grad_norm": 0.17325721681118011, "learning_rate": 0.0003921524596544807, "loss": 0.5319, "step": 436600 }, { "epoch": 58.8385879816761, "grad_norm": 0.16298386454582214, "learning_rate": 0.00039211503338423305, "loss": 0.531, "step": 436700 }, { "epoch": 58.85206143896524, "grad_norm": 0.16990350186824799, "learning_rate": 0.00039207760711398545, "loss": 0.5315, "step": 436800 }, { "epoch": 58.86553489625438, "grad_norm": 0.16769687831401825, "learning_rate": 0.00039204018084373785, "loss": 0.5316, "step": 436900 }, { "epoch": 58.87900835354352, "grad_norm": 0.1604524552822113, "learning_rate": 0.0003920027545734902, "loss": 0.5313, "step": 437000 }, { "epoch": 58.892481810832656, "grad_norm": 0.15669675171375275, "learning_rate": 0.0003919653283032426, "loss": 0.5321, "step": 437100 }, { "epoch": 58.9059552681218, "grad_norm": 0.15939319133758545, "learning_rate": 0.000391927902032995, "loss": 0.5328, "step": 437200 }, { "epoch": 58.91942872541094, "grad_norm": 0.17015878856182098, "learning_rate": 0.0003918904757627474, "loss": 0.5316, "step": 437300 }, { "epoch": 58.93290218270008, "grad_norm": 0.1761561781167984, "learning_rate": 0.0003918530494924998, "loss": 0.5317, "step": 437400 }, { "epoch": 58.94637563998922, "grad_norm": 0.15599720180034637, "learning_rate": 0.0003918156232222522, "loss": 0.5319, "step": 437500 }, { "epoch": 58.959849097278365, "grad_norm": 0.17739169299602509, "learning_rate": 0.0003917781969520046, "loss": 0.5327, "step": 437600 }, { "epoch": 58.9733225545675, "grad_norm": 0.16505736112594604, "learning_rate": 0.000391740770681757, "loss": 0.5336, "step": 437700 }, { "epoch": 58.98679601185664, "grad_norm": 0.15257415175437927, "learning_rate": 0.0003917033444115093, "loss": 0.5327, "step": 437800 }, { "epoch": 59.0, "eval_loss": 0.5203569531440735, "eval_runtime": 4.9758, "eval_samples_per_second": 1004.855, "eval_steps_per_second": 15.877, "step": 437898 }, { "epoch": 59.000269469145785, "grad_norm": 0.15863528847694397, "learning_rate": 0.0003916659181412617, "loss": 0.5309, "step": 437900 }, { "epoch": 59.01374292643492, "grad_norm": 0.15955188870429993, "learning_rate": 0.0003916284918710141, "loss": 0.5307, "step": 438000 }, { "epoch": 59.02721638372407, "grad_norm": 0.15837879478931427, "learning_rate": 0.0003915910656007665, "loss": 0.5309, "step": 438100 }, { "epoch": 59.040689841013204, "grad_norm": 0.182156041264534, "learning_rate": 0.0003915536393305189, "loss": 0.5313, "step": 438200 }, { "epoch": 59.05416329830234, "grad_norm": 0.15845689177513123, "learning_rate": 0.0003915162130602713, "loss": 0.5311, "step": 438300 }, { "epoch": 59.06763675559149, "grad_norm": 0.15765978395938873, "learning_rate": 0.0003914787867900237, "loss": 0.5311, "step": 438400 }, { "epoch": 59.081110212880624, "grad_norm": 0.1740104854106903, "learning_rate": 0.000391441360519776, "loss": 0.5324, "step": 438500 }, { "epoch": 59.09458367016977, "grad_norm": 0.17054280638694763, "learning_rate": 0.0003914039342495284, "loss": 0.5315, "step": 438600 }, { "epoch": 59.108057127458906, "grad_norm": 0.25340673327445984, "learning_rate": 0.0003913665079792808, "loss": 0.5315, "step": 438700 }, { "epoch": 59.121530584748044, "grad_norm": 0.1711089164018631, "learning_rate": 0.0003913290817090332, "loss": 0.5308, "step": 438800 }, { "epoch": 59.13500404203719, "grad_norm": 0.17276650667190552, "learning_rate": 0.0003912916554387856, "loss": 0.5313, "step": 438900 }, { "epoch": 59.148477499326326, "grad_norm": 0.1847844421863556, "learning_rate": 0.000391254229168538, "loss": 0.5316, "step": 439000 }, { "epoch": 59.16195095661547, "grad_norm": 0.18226875364780426, "learning_rate": 0.0003912168028982904, "loss": 0.5322, "step": 439100 }, { "epoch": 59.17542441390461, "grad_norm": 0.16725198924541473, "learning_rate": 0.00039117937662804273, "loss": 0.532, "step": 439200 }, { "epoch": 59.188897871193745, "grad_norm": 0.1725640743970871, "learning_rate": 0.00039114195035779513, "loss": 0.5302, "step": 439300 }, { "epoch": 59.20237132848289, "grad_norm": 0.17348648607730865, "learning_rate": 0.00039110452408754753, "loss": 0.5316, "step": 439400 }, { "epoch": 59.21584478577203, "grad_norm": 0.16692472994327545, "learning_rate": 0.00039106709781729993, "loss": 0.5321, "step": 439500 }, { "epoch": 59.22931824306117, "grad_norm": 0.16655206680297852, "learning_rate": 0.00039102967154705233, "loss": 0.5315, "step": 439600 }, { "epoch": 59.24279170035031, "grad_norm": 0.19569598138332367, "learning_rate": 0.0003909922452768047, "loss": 0.5316, "step": 439700 }, { "epoch": 59.25626515763945, "grad_norm": 0.1738811433315277, "learning_rate": 0.0003909548190065571, "loss": 0.531, "step": 439800 }, { "epoch": 59.26973861492859, "grad_norm": 0.15299953520298004, "learning_rate": 0.0003909173927363095, "loss": 0.5311, "step": 439900 }, { "epoch": 59.28321207221773, "grad_norm": 0.17782655358314514, "learning_rate": 0.00039087996646606187, "loss": 0.5319, "step": 440000 }, { "epoch": 59.296685529506874, "grad_norm": 0.1613665521144867, "learning_rate": 0.00039084254019581426, "loss": 0.5326, "step": 440100 }, { "epoch": 59.31015898679601, "grad_norm": 0.17613647878170013, "learning_rate": 0.00039080511392556666, "loss": 0.5311, "step": 440200 }, { "epoch": 59.32363244408515, "grad_norm": 0.16461287438869476, "learning_rate": 0.000390767687655319, "loss": 0.5321, "step": 440300 }, { "epoch": 59.337105901374294, "grad_norm": 0.1521161049604416, "learning_rate": 0.0003907302613850714, "loss": 0.5316, "step": 440400 }, { "epoch": 59.35057935866343, "grad_norm": 0.17003169655799866, "learning_rate": 0.0003906928351148238, "loss": 0.5321, "step": 440500 }, { "epoch": 59.364052815952576, "grad_norm": 0.1731773167848587, "learning_rate": 0.0003906554088445762, "loss": 0.5311, "step": 440600 }, { "epoch": 59.37752627324171, "grad_norm": 0.15706676244735718, "learning_rate": 0.00039061798257432855, "loss": 0.5313, "step": 440700 }, { "epoch": 59.39099973053085, "grad_norm": 0.15294428169727325, "learning_rate": 0.00039058055630408094, "loss": 0.5332, "step": 440800 }, { "epoch": 59.404473187819995, "grad_norm": 0.15991553664207458, "learning_rate": 0.00039054313003383334, "loss": 0.5322, "step": 440900 }, { "epoch": 59.41794664510913, "grad_norm": 0.1525879204273224, "learning_rate": 0.00039050570376358574, "loss": 0.5316, "step": 441000 }, { "epoch": 59.43142010239828, "grad_norm": 0.15964531898498535, "learning_rate": 0.00039046827749333814, "loss": 0.5323, "step": 441100 }, { "epoch": 59.444893559687415, "grad_norm": 0.15736539661884308, "learning_rate": 0.00039043085122309054, "loss": 0.5329, "step": 441200 }, { "epoch": 59.45836701697656, "grad_norm": 0.15810218453407288, "learning_rate": 0.00039039342495284293, "loss": 0.531, "step": 441300 }, { "epoch": 59.4718404742657, "grad_norm": 0.16838522255420685, "learning_rate": 0.0003903559986825953, "loss": 0.5321, "step": 441400 }, { "epoch": 59.485313931554835, "grad_norm": 0.17031297087669373, "learning_rate": 0.0003903185724123477, "loss": 0.5327, "step": 441500 }, { "epoch": 59.49878738884398, "grad_norm": 0.1641901582479477, "learning_rate": 0.0003902811461421001, "loss": 0.5326, "step": 441600 }, { "epoch": 59.51226084613312, "grad_norm": 0.1787346750497818, "learning_rate": 0.0003902437198718525, "loss": 0.5319, "step": 441700 }, { "epoch": 59.52573430342226, "grad_norm": 0.15771181881427765, "learning_rate": 0.00039020629360160487, "loss": 0.5308, "step": 441800 }, { "epoch": 59.5392077607114, "grad_norm": 0.1624462753534317, "learning_rate": 0.00039016886733135727, "loss": 0.5314, "step": 441900 }, { "epoch": 59.55268121800054, "grad_norm": 0.16121861338615417, "learning_rate": 0.00039013144106110967, "loss": 0.5316, "step": 442000 }, { "epoch": 59.56615467528968, "grad_norm": 0.16043515503406525, "learning_rate": 0.00039009401479086196, "loss": 0.5318, "step": 442100 }, { "epoch": 59.57962813257882, "grad_norm": 0.1667453944683075, "learning_rate": 0.00039005658852061436, "loss": 0.532, "step": 442200 }, { "epoch": 59.59310158986796, "grad_norm": 0.15283086895942688, "learning_rate": 0.00039001916225036675, "loss": 0.532, "step": 442300 }, { "epoch": 59.6065750471571, "grad_norm": 0.15915945172309875, "learning_rate": 0.00038998173598011915, "loss": 0.5325, "step": 442400 }, { "epoch": 59.62004850444624, "grad_norm": 0.1724628210067749, "learning_rate": 0.00038994430970987155, "loss": 0.5309, "step": 442500 }, { "epoch": 59.63352196173538, "grad_norm": 0.15983447432518005, "learning_rate": 0.00038990688343962395, "loss": 0.5321, "step": 442600 }, { "epoch": 59.64699541902452, "grad_norm": 0.16101975739002228, "learning_rate": 0.00038986945716937635, "loss": 0.5329, "step": 442700 }, { "epoch": 59.660468876313665, "grad_norm": 0.17455124855041504, "learning_rate": 0.00038983203089912875, "loss": 0.5316, "step": 442800 }, { "epoch": 59.6739423336028, "grad_norm": 0.16215543448925018, "learning_rate": 0.0003897946046288811, "loss": 0.5316, "step": 442900 }, { "epoch": 59.68741579089194, "grad_norm": 0.16215290129184723, "learning_rate": 0.0003897571783586335, "loss": 0.5328, "step": 443000 }, { "epoch": 59.700889248181085, "grad_norm": 0.1726350337266922, "learning_rate": 0.0003897197520883859, "loss": 0.5327, "step": 443100 }, { "epoch": 59.71436270547022, "grad_norm": 0.1670963615179062, "learning_rate": 0.0003896823258181383, "loss": 0.5318, "step": 443200 }, { "epoch": 59.72783616275937, "grad_norm": 0.16859392821788788, "learning_rate": 0.0003896448995478907, "loss": 0.5319, "step": 443300 }, { "epoch": 59.741309620048504, "grad_norm": 0.17446286976337433, "learning_rate": 0.0003896074732776431, "loss": 0.5315, "step": 443400 }, { "epoch": 59.75478307733764, "grad_norm": 0.16021886467933655, "learning_rate": 0.0003895700470073955, "loss": 0.5316, "step": 443500 }, { "epoch": 59.768256534626786, "grad_norm": 0.1714743673801422, "learning_rate": 0.0003895326207371478, "loss": 0.5308, "step": 443600 }, { "epoch": 59.781729991915924, "grad_norm": 0.16896235942840576, "learning_rate": 0.0003894951944669002, "loss": 0.5313, "step": 443700 }, { "epoch": 59.79520344920507, "grad_norm": 0.16763935983181, "learning_rate": 0.0003894577681966526, "loss": 0.5319, "step": 443800 }, { "epoch": 59.808676906494206, "grad_norm": 0.16533197462558746, "learning_rate": 0.00038942034192640496, "loss": 0.5329, "step": 443900 }, { "epoch": 59.822150363783344, "grad_norm": 0.15890729427337646, "learning_rate": 0.00038938291565615736, "loss": 0.5319, "step": 444000 }, { "epoch": 59.83562382107249, "grad_norm": 0.16563041508197784, "learning_rate": 0.00038934548938590976, "loss": 0.5308, "step": 444100 }, { "epoch": 59.849097278361626, "grad_norm": 0.18952162563800812, "learning_rate": 0.00038930806311566216, "loss": 0.5316, "step": 444200 }, { "epoch": 59.86257073565077, "grad_norm": 0.14977924525737762, "learning_rate": 0.0003892706368454145, "loss": 0.5322, "step": 444300 }, { "epoch": 59.87604419293991, "grad_norm": 0.17150402069091797, "learning_rate": 0.0003892332105751669, "loss": 0.5315, "step": 444400 }, { "epoch": 59.889517650229045, "grad_norm": 0.16002288460731506, "learning_rate": 0.0003891957843049193, "loss": 0.532, "step": 444500 }, { "epoch": 59.90299110751819, "grad_norm": 0.20196984708309174, "learning_rate": 0.0003891583580346717, "loss": 0.5319, "step": 444600 }, { "epoch": 59.91646456480733, "grad_norm": 0.15274947881698608, "learning_rate": 0.0003891209317644241, "loss": 0.5325, "step": 444700 }, { "epoch": 59.92993802209647, "grad_norm": 0.2092270404100418, "learning_rate": 0.0003890835054941765, "loss": 0.5319, "step": 444800 }, { "epoch": 59.94341147938561, "grad_norm": 0.1830248087644577, "learning_rate": 0.0003890460792239289, "loss": 0.5318, "step": 444900 }, { "epoch": 59.956884936674754, "grad_norm": 0.16355375945568085, "learning_rate": 0.0003890086529536813, "loss": 0.532, "step": 445000 }, { "epoch": 59.97035839396389, "grad_norm": 0.16404463350772858, "learning_rate": 0.00038897122668343363, "loss": 0.5325, "step": 445100 }, { "epoch": 59.98383185125303, "grad_norm": 0.18273763358592987, "learning_rate": 0.00038893380041318603, "loss": 0.5318, "step": 445200 }, { "epoch": 59.997305308542174, "grad_norm": 0.16128095984458923, "learning_rate": 0.00038889637414293843, "loss": 0.5313, "step": 445300 }, { "epoch": 60.0, "eval_loss": 0.5198246836662292, "eval_runtime": 4.9907, "eval_samples_per_second": 1001.867, "eval_steps_per_second": 15.83, "step": 445320 }, { "epoch": 60.01077876583131, "grad_norm": 0.211460143327713, "learning_rate": 0.00038885894787269083, "loss": 0.5312, "step": 445400 }, { "epoch": 60.024252223120456, "grad_norm": 0.1626313179731369, "learning_rate": 0.0003888215216024432, "loss": 0.5313, "step": 445500 }, { "epoch": 60.037725680409594, "grad_norm": 0.16410158574581146, "learning_rate": 0.0003887840953321956, "loss": 0.5317, "step": 445600 }, { "epoch": 60.05119913769873, "grad_norm": 0.1561516523361206, "learning_rate": 0.00038874666906194797, "loss": 0.5321, "step": 445700 }, { "epoch": 60.064672594987876, "grad_norm": 0.16815754771232605, "learning_rate": 0.0003887092427917003, "loss": 0.5314, "step": 445800 }, { "epoch": 60.07814605227701, "grad_norm": 0.1718672662973404, "learning_rate": 0.0003886718165214527, "loss": 0.5305, "step": 445900 }, { "epoch": 60.09161950956616, "grad_norm": 0.18792834877967834, "learning_rate": 0.0003886343902512051, "loss": 0.5316, "step": 446000 }, { "epoch": 60.105092966855295, "grad_norm": 0.17757076025009155, "learning_rate": 0.0003885969639809575, "loss": 0.5317, "step": 446100 }, { "epoch": 60.11856642414443, "grad_norm": 0.16299375891685486, "learning_rate": 0.0003885595377107099, "loss": 0.531, "step": 446200 }, { "epoch": 60.13203988143358, "grad_norm": 0.1609777957201004, "learning_rate": 0.0003885221114404623, "loss": 0.5314, "step": 446300 }, { "epoch": 60.145513338722715, "grad_norm": 0.1566891372203827, "learning_rate": 0.0003884846851702147, "loss": 0.5311, "step": 446400 }, { "epoch": 60.15898679601186, "grad_norm": 0.19197164475917816, "learning_rate": 0.00038844725889996705, "loss": 0.5305, "step": 446500 }, { "epoch": 60.172460253301, "grad_norm": 0.15520845353603363, "learning_rate": 0.00038840983262971944, "loss": 0.5319, "step": 446600 }, { "epoch": 60.185933710590135, "grad_norm": 0.16548766195774078, "learning_rate": 0.00038837240635947184, "loss": 0.5303, "step": 446700 }, { "epoch": 60.19940716787928, "grad_norm": 0.15354874730110168, "learning_rate": 0.00038833498008922424, "loss": 0.5322, "step": 446800 }, { "epoch": 60.21288062516842, "grad_norm": 0.15612976253032684, "learning_rate": 0.00038829755381897664, "loss": 0.5306, "step": 446900 }, { "epoch": 60.22635408245756, "grad_norm": 0.19084995985031128, "learning_rate": 0.00038826012754872904, "loss": 0.5318, "step": 447000 }, { "epoch": 60.2398275397467, "grad_norm": 0.18454930186271667, "learning_rate": 0.00038822270127848144, "loss": 0.5324, "step": 447100 }, { "epoch": 60.253300997035836, "grad_norm": 0.15952953696250916, "learning_rate": 0.0003881852750082338, "loss": 0.5322, "step": 447200 }, { "epoch": 60.26677445432498, "grad_norm": 0.1619080752134323, "learning_rate": 0.0003881478487379862, "loss": 0.5315, "step": 447300 }, { "epoch": 60.28024791161412, "grad_norm": 0.1704082041978836, "learning_rate": 0.0003881104224677386, "loss": 0.5305, "step": 447400 }, { "epoch": 60.29372136890326, "grad_norm": 0.16544653475284576, "learning_rate": 0.0003880729961974909, "loss": 0.5317, "step": 447500 }, { "epoch": 60.3071948261924, "grad_norm": 0.1703435182571411, "learning_rate": 0.0003880355699272433, "loss": 0.5317, "step": 447600 }, { "epoch": 60.32066828348154, "grad_norm": 0.17745940387248993, "learning_rate": 0.0003879981436569957, "loss": 0.5314, "step": 447700 }, { "epoch": 60.33414174077068, "grad_norm": 0.1696242392063141, "learning_rate": 0.0003879607173867481, "loss": 0.5319, "step": 447800 }, { "epoch": 60.34761519805982, "grad_norm": 0.16456498205661774, "learning_rate": 0.0003879232911165005, "loss": 0.531, "step": 447900 }, { "epoch": 60.361088655348965, "grad_norm": 0.16845661401748657, "learning_rate": 0.00038788586484625286, "loss": 0.5319, "step": 448000 }, { "epoch": 60.3745621126381, "grad_norm": 0.1699349582195282, "learning_rate": 0.00038784843857600526, "loss": 0.532, "step": 448100 }, { "epoch": 60.38803556992724, "grad_norm": 0.1688002049922943, "learning_rate": 0.00038781101230575765, "loss": 0.5321, "step": 448200 }, { "epoch": 60.401509027216385, "grad_norm": 0.1724925935268402, "learning_rate": 0.00038777358603551005, "loss": 0.5311, "step": 448300 }, { "epoch": 60.41498248450552, "grad_norm": 0.1725500226020813, "learning_rate": 0.00038773615976526245, "loss": 0.5318, "step": 448400 }, { "epoch": 60.42845594179467, "grad_norm": 0.19304364919662476, "learning_rate": 0.00038769873349501485, "loss": 0.5305, "step": 448500 }, { "epoch": 60.441929399083804, "grad_norm": 0.16192558407783508, "learning_rate": 0.00038766130722476725, "loss": 0.5303, "step": 448600 }, { "epoch": 60.45540285637294, "grad_norm": 0.1615862250328064, "learning_rate": 0.0003876238809545196, "loss": 0.5313, "step": 448700 }, { "epoch": 60.468876313662086, "grad_norm": 0.15851177275180817, "learning_rate": 0.000387586454684272, "loss": 0.5318, "step": 448800 }, { "epoch": 60.482349770951224, "grad_norm": 0.158542662858963, "learning_rate": 0.0003875490284140244, "loss": 0.5311, "step": 448900 }, { "epoch": 60.49582322824037, "grad_norm": 0.16512596607208252, "learning_rate": 0.0003875116021437768, "loss": 0.5317, "step": 449000 }, { "epoch": 60.509296685529506, "grad_norm": 0.1608448326587677, "learning_rate": 0.0003874741758735292, "loss": 0.5328, "step": 449100 }, { "epoch": 60.52277014281865, "grad_norm": 0.15717831254005432, "learning_rate": 0.0003874367496032816, "loss": 0.5316, "step": 449200 }, { "epoch": 60.53624360010779, "grad_norm": 0.17623427510261536, "learning_rate": 0.0003873993233330339, "loss": 0.5306, "step": 449300 }, { "epoch": 60.549717057396926, "grad_norm": 0.19173845648765564, "learning_rate": 0.00038736189706278627, "loss": 0.5317, "step": 449400 }, { "epoch": 60.56319051468607, "grad_norm": 0.16532361507415771, "learning_rate": 0.00038732447079253867, "loss": 0.5307, "step": 449500 }, { "epoch": 60.57666397197521, "grad_norm": 0.16083645820617676, "learning_rate": 0.00038728704452229107, "loss": 0.5312, "step": 449600 }, { "epoch": 60.59013742926435, "grad_norm": 0.16576704382896423, "learning_rate": 0.00038724961825204346, "loss": 0.5315, "step": 449700 }, { "epoch": 60.60361088655349, "grad_norm": 0.15719543397426605, "learning_rate": 0.00038721219198179586, "loss": 0.5317, "step": 449800 }, { "epoch": 60.61708434384263, "grad_norm": 0.1724221557378769, "learning_rate": 0.00038717476571154826, "loss": 0.5307, "step": 449900 }, { "epoch": 60.63055780113177, "grad_norm": 0.17921310663223267, "learning_rate": 0.00038713733944130066, "loss": 0.532, "step": 450000 }, { "epoch": 60.64403125842091, "grad_norm": 0.16082558035850525, "learning_rate": 0.00038709991317105306, "loss": 0.5317, "step": 450100 }, { "epoch": 60.657504715710054, "grad_norm": 0.17311625182628632, "learning_rate": 0.0003870624869008054, "loss": 0.5312, "step": 450200 }, { "epoch": 60.67097817299919, "grad_norm": 0.1890050619840622, "learning_rate": 0.0003870250606305578, "loss": 0.5321, "step": 450300 }, { "epoch": 60.68445163028833, "grad_norm": 0.1763237714767456, "learning_rate": 0.0003869876343603102, "loss": 0.5312, "step": 450400 }, { "epoch": 60.697925087577474, "grad_norm": 0.1572764664888382, "learning_rate": 0.0003869502080900626, "loss": 0.5325, "step": 450500 }, { "epoch": 60.71139854486661, "grad_norm": 0.16465668380260468, "learning_rate": 0.000386912781819815, "loss": 0.5308, "step": 450600 }, { "epoch": 60.724872002155756, "grad_norm": 0.1672065258026123, "learning_rate": 0.0003868753555495674, "loss": 0.5308, "step": 450700 }, { "epoch": 60.73834545944489, "grad_norm": 0.16812825202941895, "learning_rate": 0.0003868379292793198, "loss": 0.5311, "step": 450800 }, { "epoch": 60.75181891673403, "grad_norm": 0.15650871396064758, "learning_rate": 0.00038680050300907214, "loss": 0.5318, "step": 450900 }, { "epoch": 60.765292374023176, "grad_norm": 0.17655012011528015, "learning_rate": 0.00038676307673882453, "loss": 0.5317, "step": 451000 }, { "epoch": 60.77876583131231, "grad_norm": 0.18722525238990784, "learning_rate": 0.0003867256504685769, "loss": 0.5321, "step": 451100 }, { "epoch": 60.79223928860146, "grad_norm": 0.1717829704284668, "learning_rate": 0.0003866882241983293, "loss": 0.5318, "step": 451200 }, { "epoch": 60.805712745890595, "grad_norm": 0.17225591838359833, "learning_rate": 0.0003866507979280817, "loss": 0.5305, "step": 451300 }, { "epoch": 60.81918620317973, "grad_norm": 0.17175020277500153, "learning_rate": 0.00038661337165783407, "loss": 0.5316, "step": 451400 }, { "epoch": 60.83265966046888, "grad_norm": 0.15722791850566864, "learning_rate": 0.00038657594538758647, "loss": 0.5311, "step": 451500 }, { "epoch": 60.846133117758015, "grad_norm": 0.1574077308177948, "learning_rate": 0.0003865385191173388, "loss": 0.5313, "step": 451600 }, { "epoch": 60.85960657504716, "grad_norm": 0.1614358127117157, "learning_rate": 0.0003865010928470912, "loss": 0.531, "step": 451700 }, { "epoch": 60.8730800323363, "grad_norm": 0.17077188193798065, "learning_rate": 0.0003864636665768436, "loss": 0.5319, "step": 451800 }, { "epoch": 60.886553489625435, "grad_norm": 0.15970392525196075, "learning_rate": 0.000386426240306596, "loss": 0.5318, "step": 451900 }, { "epoch": 60.90002694691458, "grad_norm": 0.18566665053367615, "learning_rate": 0.0003863888140363484, "loss": 0.5318, "step": 452000 }, { "epoch": 60.91350040420372, "grad_norm": 0.1600792557001114, "learning_rate": 0.0003863513877661008, "loss": 0.5306, "step": 452100 }, { "epoch": 60.92697386149286, "grad_norm": 0.15982475876808167, "learning_rate": 0.0003863139614958532, "loss": 0.5311, "step": 452200 }, { "epoch": 60.940447318782, "grad_norm": 0.1671268790960312, "learning_rate": 0.00038627653522560555, "loss": 0.5317, "step": 452300 }, { "epoch": 60.95392077607114, "grad_norm": 0.16677145659923553, "learning_rate": 0.00038623910895535795, "loss": 0.5312, "step": 452400 }, { "epoch": 60.96739423336028, "grad_norm": 0.1713666319847107, "learning_rate": 0.00038620168268511034, "loss": 0.5313, "step": 452500 }, { "epoch": 60.98086769064942, "grad_norm": 0.16805773973464966, "learning_rate": 0.00038616425641486274, "loss": 0.5316, "step": 452600 }, { "epoch": 60.99434114793856, "grad_norm": 0.1632053554058075, "learning_rate": 0.00038612683014461514, "loss": 0.5302, "step": 452700 }, { "epoch": 61.0, "eval_loss": 0.5199304819107056, "eval_runtime": 5.0004, "eval_samples_per_second": 999.919, "eval_steps_per_second": 15.799, "step": 452742 }, { "epoch": 61.0078146052277, "grad_norm": 0.1660417765378952, "learning_rate": 0.00038608940387436754, "loss": 0.5303, "step": 452800 }, { "epoch": 61.021288062516845, "grad_norm": 0.1526987999677658, "learning_rate": 0.00038605197760411994, "loss": 0.5306, "step": 452900 }, { "epoch": 61.03476151980598, "grad_norm": 0.1806449592113495, "learning_rate": 0.0003860145513338723, "loss": 0.5306, "step": 453000 }, { "epoch": 61.04823497709512, "grad_norm": 0.16239304840564728, "learning_rate": 0.0003859771250636246, "loss": 0.5305, "step": 453100 }, { "epoch": 61.061708434384265, "grad_norm": 0.16324450075626373, "learning_rate": 0.000385939698793377, "loss": 0.5301, "step": 453200 }, { "epoch": 61.0751818916734, "grad_norm": 0.16678813099861145, "learning_rate": 0.0003859022725231294, "loss": 0.5306, "step": 453300 }, { "epoch": 61.08865534896255, "grad_norm": 0.15487563610076904, "learning_rate": 0.0003858648462528818, "loss": 0.5307, "step": 453400 }, { "epoch": 61.102128806251685, "grad_norm": 0.1535150110721588, "learning_rate": 0.0003858274199826342, "loss": 0.5309, "step": 453500 }, { "epoch": 61.11560226354082, "grad_norm": 0.16856281459331512, "learning_rate": 0.0003857899937123866, "loss": 0.5309, "step": 453600 }, { "epoch": 61.12907572082997, "grad_norm": 0.16161969304084778, "learning_rate": 0.000385752567442139, "loss": 0.5314, "step": 453700 }, { "epoch": 61.142549178119104, "grad_norm": 0.15104152262210846, "learning_rate": 0.00038571514117189136, "loss": 0.5316, "step": 453800 }, { "epoch": 61.15602263540825, "grad_norm": 0.15515421330928802, "learning_rate": 0.00038567771490164376, "loss": 0.5315, "step": 453900 }, { "epoch": 61.169496092697386, "grad_norm": 0.17072004079818726, "learning_rate": 0.00038564028863139616, "loss": 0.5313, "step": 454000 }, { "epoch": 61.182969549986524, "grad_norm": 0.15990492701530457, "learning_rate": 0.00038560286236114855, "loss": 0.5303, "step": 454100 }, { "epoch": 61.19644300727567, "grad_norm": 0.164400115609169, "learning_rate": 0.00038556543609090095, "loss": 0.5299, "step": 454200 }, { "epoch": 61.209916464564806, "grad_norm": 0.17043688893318176, "learning_rate": 0.00038552800982065335, "loss": 0.5314, "step": 454300 }, { "epoch": 61.22338992185395, "grad_norm": 0.1756928563117981, "learning_rate": 0.00038549058355040575, "loss": 0.5315, "step": 454400 }, { "epoch": 61.23686337914309, "grad_norm": 0.1779462844133377, "learning_rate": 0.0003854531572801581, "loss": 0.5304, "step": 454500 }, { "epoch": 61.250336836432226, "grad_norm": 0.16328759491443634, "learning_rate": 0.0003854157310099105, "loss": 0.5304, "step": 454600 }, { "epoch": 61.26381029372137, "grad_norm": 0.17902471125125885, "learning_rate": 0.0003853783047396629, "loss": 0.5309, "step": 454700 }, { "epoch": 61.27728375101051, "grad_norm": 0.17064212262630463, "learning_rate": 0.00038534087846941523, "loss": 0.5303, "step": 454800 }, { "epoch": 61.29075720829965, "grad_norm": 0.18630629777908325, "learning_rate": 0.00038530345219916763, "loss": 0.5313, "step": 454900 }, { "epoch": 61.30423066558879, "grad_norm": 0.16887596249580383, "learning_rate": 0.00038526602592892003, "loss": 0.5313, "step": 455000 }, { "epoch": 61.31770412287793, "grad_norm": 0.17429421842098236, "learning_rate": 0.00038522859965867243, "loss": 0.5306, "step": 455100 }, { "epoch": 61.33117758016707, "grad_norm": 0.17613743245601654, "learning_rate": 0.00038519117338842477, "loss": 0.5312, "step": 455200 }, { "epoch": 61.34465103745621, "grad_norm": 0.18855871260166168, "learning_rate": 0.00038515374711817717, "loss": 0.5318, "step": 455300 }, { "epoch": 61.358124494745354, "grad_norm": 0.17717775702476501, "learning_rate": 0.00038511632084792957, "loss": 0.5315, "step": 455400 }, { "epoch": 61.37159795203449, "grad_norm": 0.17842957377433777, "learning_rate": 0.00038507889457768197, "loss": 0.5314, "step": 455500 }, { "epoch": 61.38507140932363, "grad_norm": 0.17896977066993713, "learning_rate": 0.00038504146830743436, "loss": 0.5312, "step": 455600 }, { "epoch": 61.398544866612774, "grad_norm": 0.15895970165729523, "learning_rate": 0.00038500404203718676, "loss": 0.5306, "step": 455700 }, { "epoch": 61.41201832390191, "grad_norm": 0.16217350959777832, "learning_rate": 0.00038496661576693916, "loss": 0.5315, "step": 455800 }, { "epoch": 61.425491781191056, "grad_norm": 0.16861240565776825, "learning_rate": 0.00038492918949669156, "loss": 0.5312, "step": 455900 }, { "epoch": 61.43896523848019, "grad_norm": 0.17330963909626007, "learning_rate": 0.0003848917632264439, "loss": 0.531, "step": 456000 }, { "epoch": 61.45243869576933, "grad_norm": 0.16217003762722015, "learning_rate": 0.0003848543369561963, "loss": 0.5305, "step": 456100 }, { "epoch": 61.465912153058476, "grad_norm": 0.15853267908096313, "learning_rate": 0.0003848169106859487, "loss": 0.5308, "step": 456200 }, { "epoch": 61.47938561034761, "grad_norm": 0.17645667493343353, "learning_rate": 0.0003847794844157011, "loss": 0.5315, "step": 456300 }, { "epoch": 61.49285906763676, "grad_norm": 0.15650807321071625, "learning_rate": 0.0003847420581454535, "loss": 0.5325, "step": 456400 }, { "epoch": 61.506332524925895, "grad_norm": 0.15399838984012604, "learning_rate": 0.0003847046318752059, "loss": 0.5312, "step": 456500 }, { "epoch": 61.51980598221504, "grad_norm": 0.1767614483833313, "learning_rate": 0.00038466720560495824, "loss": 0.5308, "step": 456600 }, { "epoch": 61.53327943950418, "grad_norm": 0.18883280456066132, "learning_rate": 0.0003846297793347106, "loss": 0.5318, "step": 456700 }, { "epoch": 61.546752896793315, "grad_norm": 0.18113523721694946, "learning_rate": 0.000384592353064463, "loss": 0.5305, "step": 456800 }, { "epoch": 61.56022635408246, "grad_norm": 0.16241683065891266, "learning_rate": 0.0003845549267942154, "loss": 0.5309, "step": 456900 }, { "epoch": 61.5736998113716, "grad_norm": 0.17174234986305237, "learning_rate": 0.0003845175005239678, "loss": 0.5302, "step": 457000 }, { "epoch": 61.58717326866074, "grad_norm": 0.17583321034908295, "learning_rate": 0.0003844800742537202, "loss": 0.5313, "step": 457100 }, { "epoch": 61.60064672594988, "grad_norm": 0.16091890633106232, "learning_rate": 0.0003844426479834726, "loss": 0.5314, "step": 457200 }, { "epoch": 61.61412018323902, "grad_norm": 0.15645577013492584, "learning_rate": 0.00038440522171322497, "loss": 0.5317, "step": 457300 }, { "epoch": 61.62759364052816, "grad_norm": 0.15771707892417908, "learning_rate": 0.0003843677954429773, "loss": 0.532, "step": 457400 }, { "epoch": 61.6410670978173, "grad_norm": 0.17293763160705566, "learning_rate": 0.0003843303691727297, "loss": 0.5309, "step": 457500 }, { "epoch": 61.65454055510644, "grad_norm": 0.17552411556243896, "learning_rate": 0.0003842929429024821, "loss": 0.5324, "step": 457600 }, { "epoch": 61.66801401239558, "grad_norm": 0.15811116993427277, "learning_rate": 0.0003842555166322345, "loss": 0.5309, "step": 457700 }, { "epoch": 61.68148746968472, "grad_norm": 0.18083980679512024, "learning_rate": 0.0003842180903619869, "loss": 0.5312, "step": 457800 }, { "epoch": 61.69496092697386, "grad_norm": 0.1691490262746811, "learning_rate": 0.0003841806640917393, "loss": 0.5295, "step": 457900 }, { "epoch": 61.708434384263, "grad_norm": 0.17406970262527466, "learning_rate": 0.0003841432378214917, "loss": 0.5315, "step": 458000 }, { "epoch": 61.721907841552145, "grad_norm": 0.18699799478054047, "learning_rate": 0.0003841058115512441, "loss": 0.5306, "step": 458100 }, { "epoch": 61.73538129884128, "grad_norm": 0.18533627688884735, "learning_rate": 0.00038406838528099645, "loss": 0.5312, "step": 458200 }, { "epoch": 61.74885475613042, "grad_norm": 0.16559502482414246, "learning_rate": 0.00038403095901074885, "loss": 0.5312, "step": 458300 }, { "epoch": 61.762328213419565, "grad_norm": 0.17552603781223297, "learning_rate": 0.0003839935327405012, "loss": 0.5316, "step": 458400 }, { "epoch": 61.7758016707087, "grad_norm": 0.15600840747356415, "learning_rate": 0.0003839561064702536, "loss": 0.5312, "step": 458500 }, { "epoch": 61.78927512799785, "grad_norm": 0.16065548360347748, "learning_rate": 0.000383918680200006, "loss": 0.5304, "step": 458600 }, { "epoch": 61.802748585286984, "grad_norm": 0.16083459556102753, "learning_rate": 0.0003838812539297584, "loss": 0.5305, "step": 458700 }, { "epoch": 61.81622204257612, "grad_norm": 0.17477260529994965, "learning_rate": 0.0003838438276595108, "loss": 0.5311, "step": 458800 }, { "epoch": 61.82969549986527, "grad_norm": 0.16783417761325836, "learning_rate": 0.0003838064013892631, "loss": 0.5318, "step": 458900 }, { "epoch": 61.843168957154404, "grad_norm": 0.15655438601970673, "learning_rate": 0.0003837689751190155, "loss": 0.5321, "step": 459000 }, { "epoch": 61.85664241444355, "grad_norm": 0.15941192209720612, "learning_rate": 0.0003837315488487679, "loss": 0.5311, "step": 459100 }, { "epoch": 61.870115871732686, "grad_norm": 0.1739235520362854, "learning_rate": 0.0003836941225785203, "loss": 0.5306, "step": 459200 }, { "epoch": 61.883589329021824, "grad_norm": 0.16648058593273163, "learning_rate": 0.0003836566963082727, "loss": 0.5318, "step": 459300 }, { "epoch": 61.89706278631097, "grad_norm": 0.16728639602661133, "learning_rate": 0.0003836192700380251, "loss": 0.5314, "step": 459400 }, { "epoch": 61.910536243600106, "grad_norm": 0.16096553206443787, "learning_rate": 0.0003835818437677775, "loss": 0.5307, "step": 459500 }, { "epoch": 61.92400970088925, "grad_norm": 0.16175739467144012, "learning_rate": 0.00038354441749752986, "loss": 0.5312, "step": 459600 }, { "epoch": 61.93748315817839, "grad_norm": 0.1628226935863495, "learning_rate": 0.00038350699122728226, "loss": 0.5307, "step": 459700 }, { "epoch": 61.950956615467526, "grad_norm": 0.16844508051872253, "learning_rate": 0.00038346956495703466, "loss": 0.5319, "step": 459800 }, { "epoch": 61.96443007275667, "grad_norm": 0.16786840558052063, "learning_rate": 0.00038343213868678705, "loss": 0.5307, "step": 459900 }, { "epoch": 61.97790353004581, "grad_norm": 0.16313284635543823, "learning_rate": 0.00038339471241653945, "loss": 0.5318, "step": 460000 }, { "epoch": 61.99137698733495, "grad_norm": 0.16666193306446075, "learning_rate": 0.00038335728614629185, "loss": 0.5309, "step": 460100 }, { "epoch": 62.0, "eval_loss": 0.5191996097564697, "eval_runtime": 5.0128, "eval_samples_per_second": 997.443, "eval_steps_per_second": 15.76, "step": 460164 }, { "epoch": 62.00485044462409, "grad_norm": 0.1599048227071762, "learning_rate": 0.0003833198598760442, "loss": 0.5296, "step": 460200 }, { "epoch": 62.018323901913234, "grad_norm": 0.17736534774303436, "learning_rate": 0.00038328243360579654, "loss": 0.531, "step": 460300 }, { "epoch": 62.03179735920237, "grad_norm": 0.1768103390932083, "learning_rate": 0.00038324500733554894, "loss": 0.5301, "step": 460400 }, { "epoch": 62.04527081649151, "grad_norm": 0.1783478707075119, "learning_rate": 0.00038320758106530134, "loss": 0.5315, "step": 460500 }, { "epoch": 62.058744273780654, "grad_norm": 0.1686343550682068, "learning_rate": 0.00038317015479505373, "loss": 0.5301, "step": 460600 }, { "epoch": 62.07221773106979, "grad_norm": 0.16592437028884888, "learning_rate": 0.00038313272852480613, "loss": 0.5308, "step": 460700 }, { "epoch": 62.085691188358936, "grad_norm": 0.16779154539108276, "learning_rate": 0.00038309530225455853, "loss": 0.5318, "step": 460800 }, { "epoch": 62.099164645648074, "grad_norm": 0.16301897168159485, "learning_rate": 0.00038305787598431093, "loss": 0.5311, "step": 460900 }, { "epoch": 62.11263810293721, "grad_norm": 0.16207192838191986, "learning_rate": 0.0003830204497140633, "loss": 0.5303, "step": 461000 }, { "epoch": 62.126111560226356, "grad_norm": 0.16368532180786133, "learning_rate": 0.00038298302344381567, "loss": 0.5306, "step": 461100 }, { "epoch": 62.13958501751549, "grad_norm": 0.17190615832805634, "learning_rate": 0.00038294559717356807, "loss": 0.5309, "step": 461200 }, { "epoch": 62.15305847480464, "grad_norm": 0.18091483414173126, "learning_rate": 0.00038290817090332047, "loss": 0.5304, "step": 461300 }, { "epoch": 62.166531932093775, "grad_norm": 0.17004595696926117, "learning_rate": 0.00038287074463307287, "loss": 0.5295, "step": 461400 }, { "epoch": 62.18000538938291, "grad_norm": 0.17312446236610413, "learning_rate": 0.00038283331836282526, "loss": 0.5303, "step": 461500 }, { "epoch": 62.19347884667206, "grad_norm": 0.16484792530536652, "learning_rate": 0.00038279589209257766, "loss": 0.5301, "step": 461600 }, { "epoch": 62.206952303961195, "grad_norm": 0.16115973889827728, "learning_rate": 0.00038275846582233006, "loss": 0.5298, "step": 461700 }, { "epoch": 62.22042576125034, "grad_norm": 0.16410712897777557, "learning_rate": 0.0003827210395520824, "loss": 0.5311, "step": 461800 }, { "epoch": 62.23389921853948, "grad_norm": 0.16626448929309845, "learning_rate": 0.0003826836132818348, "loss": 0.5311, "step": 461900 }, { "epoch": 62.247372675828615, "grad_norm": 0.16726692020893097, "learning_rate": 0.00038264618701158715, "loss": 0.5307, "step": 462000 }, { "epoch": 62.26084613311776, "grad_norm": 0.1729539930820465, "learning_rate": 0.00038260876074133954, "loss": 0.5302, "step": 462100 }, { "epoch": 62.2743195904069, "grad_norm": 0.16282851994037628, "learning_rate": 0.00038257133447109194, "loss": 0.5311, "step": 462200 }, { "epoch": 62.28779304769604, "grad_norm": 0.1745838224887848, "learning_rate": 0.00038253390820084434, "loss": 0.531, "step": 462300 }, { "epoch": 62.30126650498518, "grad_norm": 0.18524739146232605, "learning_rate": 0.00038249648193059674, "loss": 0.5309, "step": 462400 }, { "epoch": 62.31473996227432, "grad_norm": 0.18816104531288147, "learning_rate": 0.0003824590556603491, "loss": 0.5306, "step": 462500 }, { "epoch": 62.32821341956346, "grad_norm": 0.16499707102775574, "learning_rate": 0.0003824216293901015, "loss": 0.5307, "step": 462600 }, { "epoch": 62.3416868768526, "grad_norm": 0.16565051674842834, "learning_rate": 0.0003823842031198539, "loss": 0.5309, "step": 462700 }, { "epoch": 62.35516033414174, "grad_norm": 0.1735220104455948, "learning_rate": 0.0003823467768496063, "loss": 0.5308, "step": 462800 }, { "epoch": 62.36863379143088, "grad_norm": 0.1643969863653183, "learning_rate": 0.0003823093505793587, "loss": 0.5313, "step": 462900 }, { "epoch": 62.38210724872002, "grad_norm": 0.16240926086902618, "learning_rate": 0.0003822719243091111, "loss": 0.5302, "step": 463000 }, { "epoch": 62.39558070600916, "grad_norm": 0.15992116928100586, "learning_rate": 0.0003822344980388635, "loss": 0.5314, "step": 463100 }, { "epoch": 62.4090541632983, "grad_norm": 0.15793904662132263, "learning_rate": 0.00038219707176861587, "loss": 0.5305, "step": 463200 }, { "epoch": 62.422527620587445, "grad_norm": 0.16226573288440704, "learning_rate": 0.0003821596454983682, "loss": 0.5297, "step": 463300 }, { "epoch": 62.43600107787658, "grad_norm": 0.16572386026382446, "learning_rate": 0.0003821222192281206, "loss": 0.5311, "step": 463400 }, { "epoch": 62.44947453516572, "grad_norm": 0.1679881066083908, "learning_rate": 0.000382084792957873, "loss": 0.5308, "step": 463500 }, { "epoch": 62.462947992454865, "grad_norm": 0.1610627919435501, "learning_rate": 0.0003820473666876254, "loss": 0.5302, "step": 463600 }, { "epoch": 62.476421449744, "grad_norm": 0.1577569842338562, "learning_rate": 0.0003820099404173778, "loss": 0.5305, "step": 463700 }, { "epoch": 62.48989490703315, "grad_norm": 0.16195599734783173, "learning_rate": 0.00038197251414713015, "loss": 0.5306, "step": 463800 }, { "epoch": 62.503368364322284, "grad_norm": 0.18758775293827057, "learning_rate": 0.00038193508787688255, "loss": 0.53, "step": 463900 }, { "epoch": 62.51684182161142, "grad_norm": 0.1727546751499176, "learning_rate": 0.0003818976616066349, "loss": 0.5304, "step": 464000 }, { "epoch": 62.53031527890057, "grad_norm": 0.15541566908359528, "learning_rate": 0.0003818602353363873, "loss": 0.5312, "step": 464100 }, { "epoch": 62.543788736189704, "grad_norm": 0.16467051208019257, "learning_rate": 0.0003818228090661397, "loss": 0.5306, "step": 464200 }, { "epoch": 62.55726219347885, "grad_norm": 0.2014835774898529, "learning_rate": 0.0003817853827958921, "loss": 0.5305, "step": 464300 }, { "epoch": 62.570735650767986, "grad_norm": 0.1743200570344925, "learning_rate": 0.0003817479565256445, "loss": 0.5311, "step": 464400 }, { "epoch": 62.58420910805713, "grad_norm": 0.17797483503818512, "learning_rate": 0.0003817105302553969, "loss": 0.531, "step": 464500 }, { "epoch": 62.59768256534627, "grad_norm": 0.16978897154331207, "learning_rate": 0.0003816731039851493, "loss": 0.5308, "step": 464600 }, { "epoch": 62.611156022635406, "grad_norm": 0.16048651933670044, "learning_rate": 0.00038163567771490163, "loss": 0.5303, "step": 464700 }, { "epoch": 62.62462947992455, "grad_norm": 0.1791849285364151, "learning_rate": 0.000381598251444654, "loss": 0.5305, "step": 464800 }, { "epoch": 62.63810293721369, "grad_norm": 0.16371501982212067, "learning_rate": 0.0003815608251744064, "loss": 0.531, "step": 464900 }, { "epoch": 62.65157639450283, "grad_norm": 0.16981597244739532, "learning_rate": 0.0003815233989041588, "loss": 0.5307, "step": 465000 }, { "epoch": 62.66504985179197, "grad_norm": 0.18147772550582886, "learning_rate": 0.0003814859726339112, "loss": 0.5308, "step": 465100 }, { "epoch": 62.67852330908111, "grad_norm": 0.16574156284332275, "learning_rate": 0.0003814485463636636, "loss": 0.5305, "step": 465200 }, { "epoch": 62.69199676637025, "grad_norm": 0.17263171076774597, "learning_rate": 0.000381411120093416, "loss": 0.5312, "step": 465300 }, { "epoch": 62.70547022365939, "grad_norm": 0.16965393722057343, "learning_rate": 0.00038137369382316836, "loss": 0.5308, "step": 465400 }, { "epoch": 62.718943680948534, "grad_norm": 0.1636982262134552, "learning_rate": 0.00038133626755292076, "loss": 0.5303, "step": 465500 }, { "epoch": 62.73241713823767, "grad_norm": 0.16308064758777618, "learning_rate": 0.0003812988412826731, "loss": 0.5299, "step": 465600 }, { "epoch": 62.74589059552681, "grad_norm": 0.1568232923746109, "learning_rate": 0.0003812614150124255, "loss": 0.5319, "step": 465700 }, { "epoch": 62.759364052815954, "grad_norm": 0.15934035181999207, "learning_rate": 0.0003812239887421779, "loss": 0.5322, "step": 465800 }, { "epoch": 62.77283751010509, "grad_norm": 0.15934450924396515, "learning_rate": 0.0003811865624719303, "loss": 0.5303, "step": 465900 }, { "epoch": 62.786310967394236, "grad_norm": 0.17760753631591797, "learning_rate": 0.0003811491362016827, "loss": 0.5313, "step": 466000 }, { "epoch": 62.799784424683374, "grad_norm": 0.16506651043891907, "learning_rate": 0.0003811117099314351, "loss": 0.5315, "step": 466100 }, { "epoch": 62.81325788197251, "grad_norm": 0.16786092519760132, "learning_rate": 0.00038107428366118744, "loss": 0.5297, "step": 466200 }, { "epoch": 62.826731339261656, "grad_norm": 0.17542190849781036, "learning_rate": 0.00038103685739093984, "loss": 0.5307, "step": 466300 }, { "epoch": 62.84020479655079, "grad_norm": 0.15886156260967255, "learning_rate": 0.00038099943112069224, "loss": 0.5311, "step": 466400 }, { "epoch": 62.85367825383994, "grad_norm": 0.1609421670436859, "learning_rate": 0.00038096200485044463, "loss": 0.5312, "step": 466500 }, { "epoch": 62.867151711129075, "grad_norm": 0.18735432624816895, "learning_rate": 0.00038092457858019703, "loss": 0.5308, "step": 466600 }, { "epoch": 62.88062516841821, "grad_norm": 0.1687903106212616, "learning_rate": 0.00038088715230994943, "loss": 0.5302, "step": 466700 }, { "epoch": 62.89409862570736, "grad_norm": 0.1737627536058426, "learning_rate": 0.00038084972603970183, "loss": 0.531, "step": 466800 }, { "epoch": 62.907572082996495, "grad_norm": 0.18485966324806213, "learning_rate": 0.00038081229976945417, "loss": 0.53, "step": 466900 }, { "epoch": 62.92104554028564, "grad_norm": 0.1752188354730606, "learning_rate": 0.00038077487349920657, "loss": 0.5314, "step": 467000 }, { "epoch": 62.93451899757478, "grad_norm": 0.17394258081912994, "learning_rate": 0.00038073744722895897, "loss": 0.5311, "step": 467100 }, { "epoch": 62.947992454863915, "grad_norm": 0.15270273387432098, "learning_rate": 0.00038070002095871137, "loss": 0.5309, "step": 467200 }, { "epoch": 62.96146591215306, "grad_norm": 0.15581753849983215, "learning_rate": 0.00038066259468846377, "loss": 0.5303, "step": 467300 }, { "epoch": 62.9749393694422, "grad_norm": 0.16892029345035553, "learning_rate": 0.0003806251684182161, "loss": 0.5314, "step": 467400 }, { "epoch": 62.98841282673134, "grad_norm": 0.15260067582130432, "learning_rate": 0.0003805877421479685, "loss": 0.5309, "step": 467500 }, { "epoch": 63.0, "eval_loss": 0.5194298028945923, "eval_runtime": 4.9806, "eval_samples_per_second": 1003.899, "eval_steps_per_second": 15.862, "step": 467586 }, { "epoch": 63.00188628402048, "grad_norm": 0.17645375430583954, "learning_rate": 0.00038055031587772085, "loss": 0.5306, "step": 467600 }, { "epoch": 63.015359741309624, "grad_norm": 0.17539045214653015, "learning_rate": 0.00038051288960747325, "loss": 0.5299, "step": 467700 }, { "epoch": 63.02883319859876, "grad_norm": 0.17724251747131348, "learning_rate": 0.00038047546333722565, "loss": 0.5289, "step": 467800 }, { "epoch": 63.0423066558879, "grad_norm": 0.1608201563358307, "learning_rate": 0.00038043803706697805, "loss": 0.53, "step": 467900 }, { "epoch": 63.05578011317704, "grad_norm": 0.19123616814613342, "learning_rate": 0.00038040061079673044, "loss": 0.5294, "step": 468000 }, { "epoch": 63.06925357046618, "grad_norm": 0.17717140913009644, "learning_rate": 0.00038036318452648284, "loss": 0.53, "step": 468100 }, { "epoch": 63.082727027755325, "grad_norm": 0.15242215991020203, "learning_rate": 0.00038032575825623524, "loss": 0.531, "step": 468200 }, { "epoch": 63.09620048504446, "grad_norm": 0.18182472884655, "learning_rate": 0.00038028833198598764, "loss": 0.5303, "step": 468300 }, { "epoch": 63.1096739423336, "grad_norm": 0.16004683077335358, "learning_rate": 0.00038025090571574, "loss": 0.5309, "step": 468400 }, { "epoch": 63.123147399622745, "grad_norm": 0.18186993896961212, "learning_rate": 0.0003802134794454924, "loss": 0.5305, "step": 468500 }, { "epoch": 63.13662085691188, "grad_norm": 0.17065685987472534, "learning_rate": 0.0003801760531752448, "loss": 0.5303, "step": 468600 }, { "epoch": 63.15009431420103, "grad_norm": 0.159909188747406, "learning_rate": 0.0003801386269049972, "loss": 0.5302, "step": 468700 }, { "epoch": 63.163567771490165, "grad_norm": 0.17227111756801605, "learning_rate": 0.0003801012006347496, "loss": 0.5306, "step": 468800 }, { "epoch": 63.1770412287793, "grad_norm": 0.1700202375650406, "learning_rate": 0.000380063774364502, "loss": 0.5308, "step": 468900 }, { "epoch": 63.19051468606845, "grad_norm": 0.15964053571224213, "learning_rate": 0.00038002634809425437, "loss": 0.5296, "step": 469000 }, { "epoch": 63.203988143357584, "grad_norm": 0.15552981197834015, "learning_rate": 0.0003799889218240067, "loss": 0.5302, "step": 469100 }, { "epoch": 63.21746160064673, "grad_norm": 0.16148848831653595, "learning_rate": 0.0003799514955537591, "loss": 0.5294, "step": 469200 }, { "epoch": 63.230935057935866, "grad_norm": 0.1681824028491974, "learning_rate": 0.00037991406928351146, "loss": 0.5309, "step": 469300 }, { "epoch": 63.244408515225004, "grad_norm": 0.16589170694351196, "learning_rate": 0.00037987664301326386, "loss": 0.5305, "step": 469400 }, { "epoch": 63.25788197251415, "grad_norm": 0.1621243804693222, "learning_rate": 0.00037983921674301626, "loss": 0.5306, "step": 469500 }, { "epoch": 63.271355429803286, "grad_norm": 0.16626806557178497, "learning_rate": 0.00037980179047276865, "loss": 0.5306, "step": 469600 }, { "epoch": 63.28482888709243, "grad_norm": 0.1680564135313034, "learning_rate": 0.00037976436420252105, "loss": 0.5312, "step": 469700 }, { "epoch": 63.29830234438157, "grad_norm": 0.1756800264120102, "learning_rate": 0.0003797269379322734, "loss": 0.5294, "step": 469800 }, { "epoch": 63.311775801670706, "grad_norm": 0.16104033589363098, "learning_rate": 0.0003796895116620258, "loss": 0.5308, "step": 469900 }, { "epoch": 63.32524925895985, "grad_norm": 0.1657528281211853, "learning_rate": 0.0003796520853917782, "loss": 0.5293, "step": 470000 }, { "epoch": 63.33872271624899, "grad_norm": 0.17463940382003784, "learning_rate": 0.0003796146591215306, "loss": 0.5306, "step": 470100 }, { "epoch": 63.35219617353813, "grad_norm": 0.1866832822561264, "learning_rate": 0.000379577232851283, "loss": 0.5298, "step": 470200 }, { "epoch": 63.36566963082727, "grad_norm": 0.1647256463766098, "learning_rate": 0.0003795398065810354, "loss": 0.5299, "step": 470300 }, { "epoch": 63.37914308811641, "grad_norm": 0.16036076843738556, "learning_rate": 0.0003795023803107878, "loss": 0.5299, "step": 470400 }, { "epoch": 63.39261654540555, "grad_norm": 0.17116126418113708, "learning_rate": 0.00037946495404054013, "loss": 0.5302, "step": 470500 }, { "epoch": 63.40609000269469, "grad_norm": 0.17683888971805573, "learning_rate": 0.00037942752777029253, "loss": 0.5301, "step": 470600 }, { "epoch": 63.419563459983834, "grad_norm": 0.16254347562789917, "learning_rate": 0.0003793901015000449, "loss": 0.5306, "step": 470700 }, { "epoch": 63.43303691727297, "grad_norm": 0.1701781153678894, "learning_rate": 0.0003793526752297973, "loss": 0.53, "step": 470800 }, { "epoch": 63.44651037456211, "grad_norm": 0.16584670543670654, "learning_rate": 0.0003793152489595497, "loss": 0.5303, "step": 470900 }, { "epoch": 63.459983831851254, "grad_norm": 0.16841068863868713, "learning_rate": 0.0003792778226893021, "loss": 0.5302, "step": 471000 }, { "epoch": 63.47345728914039, "grad_norm": 0.16620628535747528, "learning_rate": 0.00037924039641905446, "loss": 0.5299, "step": 471100 }, { "epoch": 63.486930746429536, "grad_norm": 0.1721305102109909, "learning_rate": 0.00037920297014880686, "loss": 0.5301, "step": 471200 }, { "epoch": 63.500404203718674, "grad_norm": 0.17095503211021423, "learning_rate": 0.0003791655438785592, "loss": 0.5296, "step": 471300 }, { "epoch": 63.51387766100781, "grad_norm": 0.18344305455684662, "learning_rate": 0.0003791281176083116, "loss": 0.5306, "step": 471400 }, { "epoch": 63.527351118296956, "grad_norm": 0.1542639583349228, "learning_rate": 0.000379090691338064, "loss": 0.5314, "step": 471500 }, { "epoch": 63.54082457558609, "grad_norm": 0.16038043797016144, "learning_rate": 0.0003790532650678164, "loss": 0.5312, "step": 471600 }, { "epoch": 63.55429803287524, "grad_norm": 0.1745276153087616, "learning_rate": 0.0003790158387975688, "loss": 0.5304, "step": 471700 }, { "epoch": 63.567771490164375, "grad_norm": 0.16044339537620544, "learning_rate": 0.0003789784125273212, "loss": 0.5295, "step": 471800 }, { "epoch": 63.58124494745352, "grad_norm": 0.1759246587753296, "learning_rate": 0.0003789409862570736, "loss": 0.531, "step": 471900 }, { "epoch": 63.59471840474266, "grad_norm": 0.17016103863716125, "learning_rate": 0.00037890355998682594, "loss": 0.5306, "step": 472000 }, { "epoch": 63.608191862031795, "grad_norm": 0.17679797112941742, "learning_rate": 0.00037886613371657834, "loss": 0.5314, "step": 472100 }, { "epoch": 63.62166531932094, "grad_norm": 0.30374428629875183, "learning_rate": 0.00037882870744633074, "loss": 0.5312, "step": 472200 }, { "epoch": 63.63513877661008, "grad_norm": 0.1679650992155075, "learning_rate": 0.00037879128117608313, "loss": 0.5309, "step": 472300 }, { "epoch": 63.64861223389922, "grad_norm": 0.168943390250206, "learning_rate": 0.00037875385490583553, "loss": 0.5298, "step": 472400 }, { "epoch": 63.66208569118836, "grad_norm": 0.16865819692611694, "learning_rate": 0.00037871642863558793, "loss": 0.5307, "step": 472500 }, { "epoch": 63.6755591484775, "grad_norm": 0.16130997240543365, "learning_rate": 0.00037867900236534033, "loss": 0.5293, "step": 472600 }, { "epoch": 63.68903260576664, "grad_norm": 0.16391806304454803, "learning_rate": 0.0003786415760950927, "loss": 0.531, "step": 472700 }, { "epoch": 63.70250606305578, "grad_norm": 0.16100351512432098, "learning_rate": 0.00037860414982484507, "loss": 0.5301, "step": 472800 }, { "epoch": 63.71597952034492, "grad_norm": 0.1847156137228012, "learning_rate": 0.0003785667235545974, "loss": 0.5308, "step": 472900 }, { "epoch": 63.72945297763406, "grad_norm": 0.16179172694683075, "learning_rate": 0.0003785292972843498, "loss": 0.5318, "step": 473000 }, { "epoch": 63.7429264349232, "grad_norm": 0.169780895113945, "learning_rate": 0.0003784918710141022, "loss": 0.5309, "step": 473100 }, { "epoch": 63.75639989221234, "grad_norm": 0.1774236410856247, "learning_rate": 0.0003784544447438546, "loss": 0.5313, "step": 473200 }, { "epoch": 63.76987334950148, "grad_norm": 0.16259239614009857, "learning_rate": 0.000378417018473607, "loss": 0.5305, "step": 473300 }, { "epoch": 63.783346806790625, "grad_norm": 0.16554813086986542, "learning_rate": 0.00037837959220335935, "loss": 0.5297, "step": 473400 }, { "epoch": 63.79682026407976, "grad_norm": 0.1658412665128708, "learning_rate": 0.00037834216593311175, "loss": 0.5312, "step": 473500 }, { "epoch": 63.8102937213689, "grad_norm": 0.1947665810585022, "learning_rate": 0.00037830473966286415, "loss": 0.531, "step": 473600 }, { "epoch": 63.823767178658045, "grad_norm": 0.15720874071121216, "learning_rate": 0.00037826731339261655, "loss": 0.5299, "step": 473700 }, { "epoch": 63.83724063594718, "grad_norm": 0.18767699599266052, "learning_rate": 0.00037822988712236895, "loss": 0.5316, "step": 473800 }, { "epoch": 63.85071409323633, "grad_norm": 0.1706378310918808, "learning_rate": 0.00037819246085212134, "loss": 0.5299, "step": 473900 }, { "epoch": 63.864187550525465, "grad_norm": 0.16497546434402466, "learning_rate": 0.00037815503458187374, "loss": 0.5303, "step": 474000 }, { "epoch": 63.8776610078146, "grad_norm": 0.15908105671405792, "learning_rate": 0.00037811760831162614, "loss": 0.5306, "step": 474100 }, { "epoch": 63.89113446510375, "grad_norm": 0.1655188649892807, "learning_rate": 0.0003780801820413785, "loss": 0.531, "step": 474200 }, { "epoch": 63.904607922392884, "grad_norm": 0.1711314469575882, "learning_rate": 0.0003780427557711309, "loss": 0.5315, "step": 474300 }, { "epoch": 63.91808137968203, "grad_norm": 0.17418967187404633, "learning_rate": 0.0003780053295008833, "loss": 0.531, "step": 474400 }, { "epoch": 63.931554836971166, "grad_norm": 0.16637040674686432, "learning_rate": 0.0003779679032306357, "loss": 0.5301, "step": 474500 }, { "epoch": 63.945028294260304, "grad_norm": 0.15700410306453705, "learning_rate": 0.0003779304769603881, "loss": 0.531, "step": 474600 }, { "epoch": 63.95850175154945, "grad_norm": 0.1929236799478531, "learning_rate": 0.0003778930506901404, "loss": 0.5312, "step": 474700 }, { "epoch": 63.971975208838586, "grad_norm": 0.17009477317333221, "learning_rate": 0.0003778556244198928, "loss": 0.5301, "step": 474800 }, { "epoch": 63.98544866612773, "grad_norm": 0.16811323165893555, "learning_rate": 0.00037781819814964516, "loss": 0.5311, "step": 474900 }, { "epoch": 63.99892212341687, "grad_norm": 0.15886425971984863, "learning_rate": 0.00037778077187939756, "loss": 0.5315, "step": 475000 }, { "epoch": 64.0, "eval_loss": 0.5181534290313721, "eval_runtime": 4.9605, "eval_samples_per_second": 1007.964, "eval_steps_per_second": 15.926, "step": 475008 }, { "epoch": 64.01239558070601, "grad_norm": 0.16366414725780487, "learning_rate": 0.00037774334560914996, "loss": 0.5304, "step": 475100 }, { "epoch": 64.02586903799515, "grad_norm": 0.15899880230426788, "learning_rate": 0.00037770591933890236, "loss": 0.5292, "step": 475200 }, { "epoch": 64.03934249528429, "grad_norm": 0.16225296258926392, "learning_rate": 0.00037766849306865476, "loss": 0.5306, "step": 475300 }, { "epoch": 64.05281595257343, "grad_norm": 0.1699511855840683, "learning_rate": 0.00037763106679840715, "loss": 0.5292, "step": 475400 }, { "epoch": 64.06628940986258, "grad_norm": 0.16736912727355957, "learning_rate": 0.00037759364052815955, "loss": 0.5301, "step": 475500 }, { "epoch": 64.07976286715171, "grad_norm": 0.15302225947380066, "learning_rate": 0.0003775562142579119, "loss": 0.5309, "step": 475600 }, { "epoch": 64.09323632444085, "grad_norm": 0.16519637405872345, "learning_rate": 0.0003775187879876643, "loss": 0.5296, "step": 475700 }, { "epoch": 64.10670978172999, "grad_norm": 0.18741224706172943, "learning_rate": 0.0003774813617174167, "loss": 0.5301, "step": 475800 }, { "epoch": 64.12018323901913, "grad_norm": 0.16600774228572845, "learning_rate": 0.0003774439354471691, "loss": 0.5298, "step": 475900 }, { "epoch": 64.13365669630828, "grad_norm": 0.18346421420574188, "learning_rate": 0.0003774065091769215, "loss": 0.5306, "step": 476000 }, { "epoch": 64.14713015359742, "grad_norm": 0.17180868983268738, "learning_rate": 0.0003773690829066739, "loss": 0.5303, "step": 476100 }, { "epoch": 64.16060361088655, "grad_norm": 0.16292442381381989, "learning_rate": 0.0003773316566364263, "loss": 0.5299, "step": 476200 }, { "epoch": 64.17407706817569, "grad_norm": 0.1767559051513672, "learning_rate": 0.0003772942303661787, "loss": 0.5303, "step": 476300 }, { "epoch": 64.18755052546483, "grad_norm": 0.16955427825450897, "learning_rate": 0.00037725680409593103, "loss": 0.5299, "step": 476400 }, { "epoch": 64.20102398275398, "grad_norm": 0.17062005400657654, "learning_rate": 0.00037721937782568337, "loss": 0.5298, "step": 476500 }, { "epoch": 64.21449744004312, "grad_norm": 0.17057958245277405, "learning_rate": 0.00037718195155543577, "loss": 0.5292, "step": 476600 }, { "epoch": 64.22797089733226, "grad_norm": 0.15816862881183624, "learning_rate": 0.00037714452528518817, "loss": 0.5301, "step": 476700 }, { "epoch": 64.2414443546214, "grad_norm": 0.1609853357076645, "learning_rate": 0.00037710709901494057, "loss": 0.5305, "step": 476800 }, { "epoch": 64.25491781191053, "grad_norm": 0.17421527206897736, "learning_rate": 0.00037706967274469297, "loss": 0.5302, "step": 476900 }, { "epoch": 64.26839126919968, "grad_norm": 0.17097395658493042, "learning_rate": 0.00037703224647444536, "loss": 0.5306, "step": 477000 }, { "epoch": 64.28186472648882, "grad_norm": 0.15760228037834167, "learning_rate": 0.0003769948202041977, "loss": 0.5297, "step": 477100 }, { "epoch": 64.29533818377796, "grad_norm": 0.157700315117836, "learning_rate": 0.0003769573939339501, "loss": 0.5293, "step": 477200 }, { "epoch": 64.3088116410671, "grad_norm": 0.17529383301734924, "learning_rate": 0.0003769199676637025, "loss": 0.5298, "step": 477300 }, { "epoch": 64.32228509835623, "grad_norm": 0.17382629215717316, "learning_rate": 0.0003768825413934549, "loss": 0.53, "step": 477400 }, { "epoch": 64.33575855564538, "grad_norm": 0.17804501950740814, "learning_rate": 0.0003768451151232073, "loss": 0.53, "step": 477500 }, { "epoch": 64.34923201293452, "grad_norm": 0.17744582891464233, "learning_rate": 0.0003768076888529597, "loss": 0.5298, "step": 477600 }, { "epoch": 64.36270547022366, "grad_norm": 0.172838494181633, "learning_rate": 0.0003767702625827121, "loss": 0.5303, "step": 477700 }, { "epoch": 64.3761789275128, "grad_norm": 0.16664884984493256, "learning_rate": 0.00037673283631246444, "loss": 0.5288, "step": 477800 }, { "epoch": 64.38965238480193, "grad_norm": 0.15760454535484314, "learning_rate": 0.00037669541004221684, "loss": 0.5295, "step": 477900 }, { "epoch": 64.40312584209109, "grad_norm": 0.16310493648052216, "learning_rate": 0.00037665798377196924, "loss": 0.5303, "step": 478000 }, { "epoch": 64.41659929938022, "grad_norm": 0.1682506948709488, "learning_rate": 0.00037662055750172164, "loss": 0.5296, "step": 478100 }, { "epoch": 64.43007275666936, "grad_norm": 0.17104050517082214, "learning_rate": 0.00037658313123147403, "loss": 0.5301, "step": 478200 }, { "epoch": 64.4435462139585, "grad_norm": 0.18621374666690826, "learning_rate": 0.0003765457049612264, "loss": 0.5296, "step": 478300 }, { "epoch": 64.45701967124764, "grad_norm": 0.16126583516597748, "learning_rate": 0.0003765082786909788, "loss": 0.5301, "step": 478400 }, { "epoch": 64.47049312853679, "grad_norm": 0.15911246836185455, "learning_rate": 0.0003764708524207311, "loss": 0.5297, "step": 478500 }, { "epoch": 64.48396658582593, "grad_norm": 0.18784895539283752, "learning_rate": 0.0003764334261504835, "loss": 0.5308, "step": 478600 }, { "epoch": 64.49744004311506, "grad_norm": 0.1726282387971878, "learning_rate": 0.0003763959998802359, "loss": 0.5302, "step": 478700 }, { "epoch": 64.5109135004042, "grad_norm": 0.1639472097158432, "learning_rate": 0.0003763585736099883, "loss": 0.5295, "step": 478800 }, { "epoch": 64.52438695769334, "grad_norm": 0.18458271026611328, "learning_rate": 0.0003763211473397407, "loss": 0.5295, "step": 478900 }, { "epoch": 64.53786041498249, "grad_norm": 0.1717463582754135, "learning_rate": 0.0003762837210694931, "loss": 0.5311, "step": 479000 }, { "epoch": 64.55133387227163, "grad_norm": 0.16589489579200745, "learning_rate": 0.0003762462947992455, "loss": 0.5307, "step": 479100 }, { "epoch": 64.56480732956076, "grad_norm": 0.17040188610553741, "learning_rate": 0.0003762088685289979, "loss": 0.5301, "step": 479200 }, { "epoch": 64.5782807868499, "grad_norm": 0.18555359542369843, "learning_rate": 0.00037617144225875025, "loss": 0.5309, "step": 479300 }, { "epoch": 64.59175424413904, "grad_norm": 0.1612403690814972, "learning_rate": 0.00037613401598850265, "loss": 0.5298, "step": 479400 }, { "epoch": 64.60522770142819, "grad_norm": 0.16849978268146515, "learning_rate": 0.00037609658971825505, "loss": 0.5311, "step": 479500 }, { "epoch": 64.61870115871733, "grad_norm": 0.1721200793981552, "learning_rate": 0.00037605916344800745, "loss": 0.5297, "step": 479600 }, { "epoch": 64.63217461600647, "grad_norm": 0.15923097729682922, "learning_rate": 0.00037602173717775985, "loss": 0.5301, "step": 479700 }, { "epoch": 64.6456480732956, "grad_norm": 0.17431512475013733, "learning_rate": 0.00037598431090751224, "loss": 0.5305, "step": 479800 }, { "epoch": 64.65912153058474, "grad_norm": 0.15602028369903564, "learning_rate": 0.00037594688463726464, "loss": 0.5303, "step": 479900 }, { "epoch": 64.6725949878739, "grad_norm": 0.1832934319972992, "learning_rate": 0.000375909458367017, "loss": 0.5306, "step": 480000 }, { "epoch": 64.68606844516303, "grad_norm": 0.16852447390556335, "learning_rate": 0.00037587203209676933, "loss": 0.5287, "step": 480100 }, { "epoch": 64.69954190245217, "grad_norm": 0.1794508993625641, "learning_rate": 0.00037583460582652173, "loss": 0.5297, "step": 480200 }, { "epoch": 64.7130153597413, "grad_norm": 0.1763547658920288, "learning_rate": 0.0003757971795562741, "loss": 0.5313, "step": 480300 }, { "epoch": 64.72648881703044, "grad_norm": 0.16785641014575958, "learning_rate": 0.0003757597532860265, "loss": 0.5302, "step": 480400 }, { "epoch": 64.7399622743196, "grad_norm": 0.16760727763175964, "learning_rate": 0.0003757223270157789, "loss": 0.5312, "step": 480500 }, { "epoch": 64.75343573160873, "grad_norm": 0.16663624346256256, "learning_rate": 0.0003756849007455313, "loss": 0.5311, "step": 480600 }, { "epoch": 64.76690918889787, "grad_norm": 0.1886647343635559, "learning_rate": 0.00037564747447528366, "loss": 0.5292, "step": 480700 }, { "epoch": 64.78038264618701, "grad_norm": 0.16549107432365417, "learning_rate": 0.00037561004820503606, "loss": 0.5296, "step": 480800 }, { "epoch": 64.79385610347614, "grad_norm": 0.16514188051223755, "learning_rate": 0.00037557262193478846, "loss": 0.5303, "step": 480900 }, { "epoch": 64.8073295607653, "grad_norm": 0.16896314918994904, "learning_rate": 0.00037553519566454086, "loss": 0.5302, "step": 481000 }, { "epoch": 64.82080301805443, "grad_norm": 0.15894095599651337, "learning_rate": 0.00037549776939429326, "loss": 0.5305, "step": 481100 }, { "epoch": 64.83427647534357, "grad_norm": 0.16513077914714813, "learning_rate": 0.00037546034312404566, "loss": 0.5295, "step": 481200 }, { "epoch": 64.84774993263271, "grad_norm": 0.16831229627132416, "learning_rate": 0.00037542291685379805, "loss": 0.5303, "step": 481300 }, { "epoch": 64.86122338992186, "grad_norm": 0.18870121240615845, "learning_rate": 0.00037538549058355045, "loss": 0.53, "step": 481400 }, { "epoch": 64.874696847211, "grad_norm": 0.1674545407295227, "learning_rate": 0.0003753480643133028, "loss": 0.5298, "step": 481500 }, { "epoch": 64.88817030450014, "grad_norm": 0.1608443558216095, "learning_rate": 0.0003753106380430552, "loss": 0.5313, "step": 481600 }, { "epoch": 64.90164376178927, "grad_norm": 0.16306838393211365, "learning_rate": 0.0003752732117728076, "loss": 0.5316, "step": 481700 }, { "epoch": 64.91511721907841, "grad_norm": 0.15771666169166565, "learning_rate": 0.00037523578550256, "loss": 0.5305, "step": 481800 }, { "epoch": 64.92859067636756, "grad_norm": 0.16221052408218384, "learning_rate": 0.00037519835923231234, "loss": 0.5304, "step": 481900 }, { "epoch": 64.9420641336567, "grad_norm": 0.16969269514083862, "learning_rate": 0.00037516093296206473, "loss": 0.5299, "step": 482000 }, { "epoch": 64.95553759094584, "grad_norm": 0.19980625808238983, "learning_rate": 0.00037512350669181713, "loss": 0.5302, "step": 482100 }, { "epoch": 64.96901104823498, "grad_norm": 0.20108343660831451, "learning_rate": 0.0003750860804215695, "loss": 0.5302, "step": 482200 }, { "epoch": 64.98248450552411, "grad_norm": 0.14887802302837372, "learning_rate": 0.0003750486541513219, "loss": 0.5298, "step": 482300 }, { "epoch": 64.99595796281326, "grad_norm": 0.16871879994869232, "learning_rate": 0.00037501122788107427, "loss": 0.5295, "step": 482400 }, { "epoch": 65.0, "eval_loss": 0.5189712643623352, "eval_runtime": 4.9904, "eval_samples_per_second": 1001.925, "eval_steps_per_second": 15.83, "step": 482430 }, { "epoch": 65.0094314201024, "grad_norm": 0.1750590205192566, "learning_rate": 0.00037497380161082667, "loss": 0.53, "step": 482500 }, { "epoch": 65.02290487739154, "grad_norm": 0.17255792021751404, "learning_rate": 0.00037493637534057907, "loss": 0.5283, "step": 482600 }, { "epoch": 65.03637833468068, "grad_norm": 0.1511944979429245, "learning_rate": 0.00037489894907033147, "loss": 0.5295, "step": 482700 }, { "epoch": 65.04985179196981, "grad_norm": 0.18006020784378052, "learning_rate": 0.00037486152280008387, "loss": 0.5291, "step": 482800 }, { "epoch": 65.06332524925897, "grad_norm": 0.15845541656017303, "learning_rate": 0.0003748240965298362, "loss": 0.5288, "step": 482900 }, { "epoch": 65.0767987065481, "grad_norm": 0.16594992578029633, "learning_rate": 0.0003747866702595886, "loss": 0.5296, "step": 483000 }, { "epoch": 65.09027216383724, "grad_norm": 0.19655168056488037, "learning_rate": 0.000374749243989341, "loss": 0.5305, "step": 483100 }, { "epoch": 65.10374562112638, "grad_norm": 0.1614590734243393, "learning_rate": 0.0003747118177190934, "loss": 0.5295, "step": 483200 }, { "epoch": 65.11721907841552, "grad_norm": 0.15794697403907776, "learning_rate": 0.0003746743914488458, "loss": 0.5286, "step": 483300 }, { "epoch": 65.13069253570467, "grad_norm": 0.155609592795372, "learning_rate": 0.0003746369651785982, "loss": 0.5293, "step": 483400 }, { "epoch": 65.1441659929938, "grad_norm": 0.1729879081249237, "learning_rate": 0.0003745995389083506, "loss": 0.5289, "step": 483500 }, { "epoch": 65.15763945028294, "grad_norm": 0.1611943244934082, "learning_rate": 0.000374562112638103, "loss": 0.529, "step": 483600 }, { "epoch": 65.17111290757208, "grad_norm": 0.16293002665042877, "learning_rate": 0.0003745246863678553, "loss": 0.5288, "step": 483700 }, { "epoch": 65.18458636486122, "grad_norm": 0.16942749917507172, "learning_rate": 0.0003744872600976077, "loss": 0.5283, "step": 483800 }, { "epoch": 65.19805982215037, "grad_norm": 0.16748903691768646, "learning_rate": 0.0003744498338273601, "loss": 0.5283, "step": 483900 }, { "epoch": 65.21153327943951, "grad_norm": 0.17204172909259796, "learning_rate": 0.0003744124075571125, "loss": 0.5291, "step": 484000 }, { "epoch": 65.22500673672864, "grad_norm": 0.15835368633270264, "learning_rate": 0.0003743749812868649, "loss": 0.5291, "step": 484100 }, { "epoch": 65.23848019401778, "grad_norm": 0.1541292667388916, "learning_rate": 0.0003743375550166173, "loss": 0.5293, "step": 484200 }, { "epoch": 65.25195365130692, "grad_norm": 0.16561466455459595, "learning_rate": 0.0003743001287463697, "loss": 0.5289, "step": 484300 }, { "epoch": 65.26542710859607, "grad_norm": 0.16290274262428284, "learning_rate": 0.000374262702476122, "loss": 0.5298, "step": 484400 }, { "epoch": 65.27890056588521, "grad_norm": 0.16421489417552948, "learning_rate": 0.0003742252762058744, "loss": 0.5299, "step": 484500 }, { "epoch": 65.29237402317435, "grad_norm": 0.1693597286939621, "learning_rate": 0.0003741878499356268, "loss": 0.5297, "step": 484600 }, { "epoch": 65.30584748046348, "grad_norm": 0.18650765717029572, "learning_rate": 0.0003741504236653792, "loss": 0.5304, "step": 484700 }, { "epoch": 65.31932093775262, "grad_norm": 0.1720425933599472, "learning_rate": 0.0003741129973951316, "loss": 0.5302, "step": 484800 }, { "epoch": 65.33279439504177, "grad_norm": 0.17780843377113342, "learning_rate": 0.000374075571124884, "loss": 0.5308, "step": 484900 }, { "epoch": 65.34626785233091, "grad_norm": 0.15454325079917908, "learning_rate": 0.0003740381448546364, "loss": 0.5287, "step": 485000 }, { "epoch": 65.35974130962005, "grad_norm": 0.16181465983390808, "learning_rate": 0.00037400071858438875, "loss": 0.5297, "step": 485100 }, { "epoch": 65.37321476690919, "grad_norm": 0.1641819328069687, "learning_rate": 0.00037396329231414115, "loss": 0.531, "step": 485200 }, { "epoch": 65.38668822419832, "grad_norm": 0.16842225193977356, "learning_rate": 0.00037392586604389355, "loss": 0.5312, "step": 485300 }, { "epoch": 65.40016168148748, "grad_norm": 0.19312117993831635, "learning_rate": 0.00037388843977364595, "loss": 0.5305, "step": 485400 }, { "epoch": 65.41363513877661, "grad_norm": 0.17619286477565765, "learning_rate": 0.00037385101350339835, "loss": 0.5285, "step": 485500 }, { "epoch": 65.42710859606575, "grad_norm": 0.1808594912290573, "learning_rate": 0.0003738135872331507, "loss": 0.5297, "step": 485600 }, { "epoch": 65.44058205335489, "grad_norm": 0.18137820065021515, "learning_rate": 0.0003737761609629031, "loss": 0.5304, "step": 485700 }, { "epoch": 65.45405551064403, "grad_norm": 0.17392337322235107, "learning_rate": 0.00037373873469265543, "loss": 0.5299, "step": 485800 }, { "epoch": 65.46752896793318, "grad_norm": 0.17429889738559723, "learning_rate": 0.00037370130842240783, "loss": 0.5299, "step": 485900 }, { "epoch": 65.48100242522231, "grad_norm": 0.17585189640522003, "learning_rate": 0.00037366388215216023, "loss": 0.5288, "step": 486000 }, { "epoch": 65.49447588251145, "grad_norm": 0.16492821276187897, "learning_rate": 0.00037362645588191263, "loss": 0.5293, "step": 486100 }, { "epoch": 65.50794933980059, "grad_norm": 0.16612431406974792, "learning_rate": 0.000373589029611665, "loss": 0.5301, "step": 486200 }, { "epoch": 65.52142279708973, "grad_norm": 0.18211393058300018, "learning_rate": 0.0003735516033414174, "loss": 0.5304, "step": 486300 }, { "epoch": 65.53489625437888, "grad_norm": 0.17513959109783173, "learning_rate": 0.0003735141770711698, "loss": 0.5297, "step": 486400 }, { "epoch": 65.54836971166802, "grad_norm": 0.16088628768920898, "learning_rate": 0.0003734767508009222, "loss": 0.5297, "step": 486500 }, { "epoch": 65.56184316895715, "grad_norm": 0.1617375612258911, "learning_rate": 0.00037343932453067456, "loss": 0.5302, "step": 486600 }, { "epoch": 65.57531662624629, "grad_norm": 0.1739509105682373, "learning_rate": 0.00037340189826042696, "loss": 0.5292, "step": 486700 }, { "epoch": 65.58879008353543, "grad_norm": 0.16393284499645233, "learning_rate": 0.00037336447199017936, "loss": 0.5295, "step": 486800 }, { "epoch": 65.60226354082458, "grad_norm": 0.1865531951189041, "learning_rate": 0.00037332704571993176, "loss": 0.5299, "step": 486900 }, { "epoch": 65.61573699811372, "grad_norm": 0.16976016759872437, "learning_rate": 0.00037328961944968416, "loss": 0.5301, "step": 487000 }, { "epoch": 65.62921045540286, "grad_norm": 0.16942566633224487, "learning_rate": 0.00037325219317943656, "loss": 0.5298, "step": 487100 }, { "epoch": 65.642683912692, "grad_norm": 0.16634824872016907, "learning_rate": 0.00037321476690918895, "loss": 0.5293, "step": 487200 }, { "epoch": 65.65615736998113, "grad_norm": 0.20059320330619812, "learning_rate": 0.0003731773406389413, "loss": 0.5298, "step": 487300 }, { "epoch": 65.66963082727028, "grad_norm": 0.1706826388835907, "learning_rate": 0.00037313991436869364, "loss": 0.5291, "step": 487400 }, { "epoch": 65.68310428455942, "grad_norm": 0.17998819053173065, "learning_rate": 0.00037310248809844604, "loss": 0.5306, "step": 487500 }, { "epoch": 65.69657774184856, "grad_norm": 0.18006432056427002, "learning_rate": 0.00037306506182819844, "loss": 0.5311, "step": 487600 }, { "epoch": 65.7100511991377, "grad_norm": 0.16349409520626068, "learning_rate": 0.00037302763555795084, "loss": 0.5298, "step": 487700 }, { "epoch": 65.72352465642683, "grad_norm": 0.18231575191020966, "learning_rate": 0.00037299020928770323, "loss": 0.5308, "step": 487800 }, { "epoch": 65.73699811371598, "grad_norm": 0.16029365360736847, "learning_rate": 0.00037295278301745563, "loss": 0.5296, "step": 487900 }, { "epoch": 65.75047157100512, "grad_norm": 0.16150720417499542, "learning_rate": 0.000372915356747208, "loss": 0.5292, "step": 488000 }, { "epoch": 65.76394502829426, "grad_norm": 0.16666574776172638, "learning_rate": 0.0003728779304769604, "loss": 0.5289, "step": 488100 }, { "epoch": 65.7774184855834, "grad_norm": 0.17585302889347076, "learning_rate": 0.0003728405042067128, "loss": 0.5301, "step": 488200 }, { "epoch": 65.79089194287253, "grad_norm": 0.18461067974567413, "learning_rate": 0.00037280307793646517, "loss": 0.53, "step": 488300 }, { "epoch": 65.80436540016169, "grad_norm": 0.18088191747665405, "learning_rate": 0.00037276565166621757, "loss": 0.5298, "step": 488400 }, { "epoch": 65.81783885745082, "grad_norm": 0.19044668972492218, "learning_rate": 0.00037272822539596997, "loss": 0.5303, "step": 488500 }, { "epoch": 65.83131231473996, "grad_norm": 0.16111624240875244, "learning_rate": 0.00037269079912572237, "loss": 0.5305, "step": 488600 }, { "epoch": 65.8447857720291, "grad_norm": 0.16965247690677643, "learning_rate": 0.0003726533728554747, "loss": 0.5309, "step": 488700 }, { "epoch": 65.85825922931825, "grad_norm": 0.15824063122272491, "learning_rate": 0.0003726159465852271, "loss": 0.5301, "step": 488800 }, { "epoch": 65.87173268660739, "grad_norm": 0.19037875533103943, "learning_rate": 0.0003725785203149795, "loss": 0.5298, "step": 488900 }, { "epoch": 65.88520614389653, "grad_norm": 0.1663791835308075, "learning_rate": 0.0003725410940447319, "loss": 0.53, "step": 489000 }, { "epoch": 65.89867960118566, "grad_norm": 0.15977078676223755, "learning_rate": 0.0003725036677744843, "loss": 0.5299, "step": 489100 }, { "epoch": 65.9121530584748, "grad_norm": 0.16041940450668335, "learning_rate": 0.00037246624150423665, "loss": 0.5313, "step": 489200 }, { "epoch": 65.92562651576395, "grad_norm": 0.15550026297569275, "learning_rate": 0.00037242881523398905, "loss": 0.5303, "step": 489300 }, { "epoch": 65.93909997305309, "grad_norm": 0.16850055754184723, "learning_rate": 0.00037239138896374144, "loss": 0.5308, "step": 489400 }, { "epoch": 65.95257343034223, "grad_norm": 0.16227853298187256, "learning_rate": 0.0003723539626934938, "loss": 0.5299, "step": 489500 }, { "epoch": 65.96604688763136, "grad_norm": 0.2080729454755783, "learning_rate": 0.0003723165364232462, "loss": 0.531, "step": 489600 }, { "epoch": 65.9795203449205, "grad_norm": 0.16233840584754944, "learning_rate": 0.0003722791101529986, "loss": 0.5302, "step": 489700 }, { "epoch": 65.99299380220965, "grad_norm": 0.16270720958709717, "learning_rate": 0.000372241683882751, "loss": 0.5306, "step": 489800 }, { "epoch": 66.0, "eval_loss": 0.5185097455978394, "eval_runtime": 4.9687, "eval_samples_per_second": 1006.298, "eval_steps_per_second": 15.9, "step": 489852 }, { "epoch": 66.00646725949879, "grad_norm": 0.17580965161323547, "learning_rate": 0.0003722042576125034, "loss": 0.5286, "step": 489900 }, { "epoch": 66.01994071678793, "grad_norm": 0.17202188074588776, "learning_rate": 0.0003721668313422558, "loss": 0.5289, "step": 490000 }, { "epoch": 66.03341417407707, "grad_norm": 0.16904623806476593, "learning_rate": 0.0003721294050720082, "loss": 0.5293, "step": 490100 }, { "epoch": 66.0468876313662, "grad_norm": 0.17686672508716583, "learning_rate": 0.0003720919788017605, "loss": 0.5282, "step": 490200 }, { "epoch": 66.06036108865536, "grad_norm": 0.16623269021511078, "learning_rate": 0.0003720545525315129, "loss": 0.5293, "step": 490300 }, { "epoch": 66.0738345459445, "grad_norm": 0.15982434153556824, "learning_rate": 0.0003720171262612653, "loss": 0.5298, "step": 490400 }, { "epoch": 66.08730800323363, "grad_norm": 0.16813980042934418, "learning_rate": 0.0003719796999910177, "loss": 0.5289, "step": 490500 }, { "epoch": 66.10078146052277, "grad_norm": 0.18249264359474182, "learning_rate": 0.0003719422737207701, "loss": 0.5296, "step": 490600 }, { "epoch": 66.1142549178119, "grad_norm": 0.1699482947587967, "learning_rate": 0.0003719048474505225, "loss": 0.5301, "step": 490700 }, { "epoch": 66.12772837510106, "grad_norm": 0.16636419296264648, "learning_rate": 0.0003718674211802749, "loss": 0.5301, "step": 490800 }, { "epoch": 66.1412018323902, "grad_norm": 0.18177931010723114, "learning_rate": 0.00037182999491002725, "loss": 0.5291, "step": 490900 }, { "epoch": 66.15467528967933, "grad_norm": 0.2180538922548294, "learning_rate": 0.0003717925686397796, "loss": 0.5288, "step": 491000 }, { "epoch": 66.16814874696847, "grad_norm": 0.19091586768627167, "learning_rate": 0.000371755142369532, "loss": 0.5287, "step": 491100 }, { "epoch": 66.18162220425761, "grad_norm": 0.17267584800720215, "learning_rate": 0.0003717177160992844, "loss": 0.5297, "step": 491200 }, { "epoch": 66.19509566154676, "grad_norm": 0.16144396364688873, "learning_rate": 0.0003716802898290368, "loss": 0.5299, "step": 491300 }, { "epoch": 66.2085691188359, "grad_norm": 0.16817133128643036, "learning_rate": 0.0003716428635587892, "loss": 0.5297, "step": 491400 }, { "epoch": 66.22204257612503, "grad_norm": 0.16724370419979095, "learning_rate": 0.0003716054372885416, "loss": 0.5293, "step": 491500 }, { "epoch": 66.23551603341417, "grad_norm": 0.16737854480743408, "learning_rate": 0.000371568011018294, "loss": 0.5299, "step": 491600 }, { "epoch": 66.24898949070331, "grad_norm": 0.168152317404747, "learning_rate": 0.00037153058474804633, "loss": 0.5285, "step": 491700 }, { "epoch": 66.26246294799246, "grad_norm": 0.16846148669719696, "learning_rate": 0.00037149315847779873, "loss": 0.5294, "step": 491800 }, { "epoch": 66.2759364052816, "grad_norm": 0.1694139689207077, "learning_rate": 0.00037145573220755113, "loss": 0.5294, "step": 491900 }, { "epoch": 66.28940986257074, "grad_norm": 0.17138232290744781, "learning_rate": 0.0003714183059373035, "loss": 0.5285, "step": 492000 }, { "epoch": 66.30288331985987, "grad_norm": 0.1661110520362854, "learning_rate": 0.0003713808796670559, "loss": 0.5296, "step": 492100 }, { "epoch": 66.31635677714901, "grad_norm": 0.1713724583387375, "learning_rate": 0.0003713434533968083, "loss": 0.5288, "step": 492200 }, { "epoch": 66.32983023443816, "grad_norm": 0.1722675859928131, "learning_rate": 0.0003713060271265607, "loss": 0.5301, "step": 492300 }, { "epoch": 66.3433036917273, "grad_norm": 0.16121907532215118, "learning_rate": 0.00037126860085631307, "loss": 0.5296, "step": 492400 }, { "epoch": 66.35677714901644, "grad_norm": 0.16022473573684692, "learning_rate": 0.00037123117458606546, "loss": 0.5302, "step": 492500 }, { "epoch": 66.37025060630558, "grad_norm": 0.17346617579460144, "learning_rate": 0.00037119374831581786, "loss": 0.5301, "step": 492600 }, { "epoch": 66.38372406359471, "grad_norm": 0.16827809810638428, "learning_rate": 0.00037115632204557026, "loss": 0.53, "step": 492700 }, { "epoch": 66.39719752088386, "grad_norm": 0.17153063416481018, "learning_rate": 0.0003711188957753226, "loss": 0.5293, "step": 492800 }, { "epoch": 66.410670978173, "grad_norm": 0.1679174304008484, "learning_rate": 0.000371081469505075, "loss": 0.5292, "step": 492900 }, { "epoch": 66.42414443546214, "grad_norm": 0.1765991747379303, "learning_rate": 0.0003710440432348274, "loss": 0.529, "step": 493000 }, { "epoch": 66.43761789275128, "grad_norm": 0.17674541473388672, "learning_rate": 0.00037100661696457974, "loss": 0.5294, "step": 493100 }, { "epoch": 66.45109135004041, "grad_norm": 0.1675368696451187, "learning_rate": 0.00037096919069433214, "loss": 0.5294, "step": 493200 }, { "epoch": 66.46456480732957, "grad_norm": 0.1593385934829712, "learning_rate": 0.00037093176442408454, "loss": 0.5299, "step": 493300 }, { "epoch": 66.4780382646187, "grad_norm": 0.18384046852588654, "learning_rate": 0.00037089433815383694, "loss": 0.5288, "step": 493400 }, { "epoch": 66.49151172190784, "grad_norm": 0.1747644543647766, "learning_rate": 0.00037085691188358934, "loss": 0.5284, "step": 493500 }, { "epoch": 66.50498517919698, "grad_norm": 0.19789865612983704, "learning_rate": 0.00037081948561334174, "loss": 0.5307, "step": 493600 }, { "epoch": 66.51845863648612, "grad_norm": 0.1694607138633728, "learning_rate": 0.00037078205934309413, "loss": 0.5296, "step": 493700 }, { "epoch": 66.53193209377527, "grad_norm": 0.1613084375858307, "learning_rate": 0.0003707446330728465, "loss": 0.5291, "step": 493800 }, { "epoch": 66.5454055510644, "grad_norm": 0.17708547413349152, "learning_rate": 0.0003707072068025989, "loss": 0.5292, "step": 493900 }, { "epoch": 66.55887900835354, "grad_norm": 0.15856590867042542, "learning_rate": 0.0003706697805323513, "loss": 0.5294, "step": 494000 }, { "epoch": 66.57235246564268, "grad_norm": 0.16793511807918549, "learning_rate": 0.00037063235426210367, "loss": 0.5309, "step": 494100 }, { "epoch": 66.58582592293182, "grad_norm": 0.1793222725391388, "learning_rate": 0.00037059492799185607, "loss": 0.53, "step": 494200 }, { "epoch": 66.59929938022097, "grad_norm": 0.16658832132816315, "learning_rate": 0.00037055750172160847, "loss": 0.5296, "step": 494300 }, { "epoch": 66.61277283751011, "grad_norm": 0.1738715022802353, "learning_rate": 0.00037052007545136087, "loss": 0.5293, "step": 494400 }, { "epoch": 66.62624629479924, "grad_norm": 0.15992644429206848, "learning_rate": 0.00037048264918111327, "loss": 0.5291, "step": 494500 }, { "epoch": 66.63971975208838, "grad_norm": 0.18051253259181976, "learning_rate": 0.00037044522291086556, "loss": 0.5303, "step": 494600 }, { "epoch": 66.65319320937752, "grad_norm": 0.18546156585216522, "learning_rate": 0.00037040779664061795, "loss": 0.531, "step": 494700 }, { "epoch": 66.66666666666667, "grad_norm": 0.18078356981277466, "learning_rate": 0.00037037037037037035, "loss": 0.5289, "step": 494800 }, { "epoch": 66.68014012395581, "grad_norm": 0.16539201140403748, "learning_rate": 0.00037033294410012275, "loss": 0.5301, "step": 494900 }, { "epoch": 66.69361358124495, "grad_norm": 0.1751711219549179, "learning_rate": 0.00037029551782987515, "loss": 0.5294, "step": 495000 }, { "epoch": 66.70708703853408, "grad_norm": 0.1815720647573471, "learning_rate": 0.00037025809155962755, "loss": 0.5294, "step": 495100 }, { "epoch": 66.72056049582322, "grad_norm": 0.16880014538764954, "learning_rate": 0.00037022066528937994, "loss": 0.5287, "step": 495200 }, { "epoch": 66.73403395311237, "grad_norm": 0.17345701158046722, "learning_rate": 0.0003701832390191323, "loss": 0.5301, "step": 495300 }, { "epoch": 66.74750741040151, "grad_norm": 0.17125393450260162, "learning_rate": 0.0003701458127488847, "loss": 0.5295, "step": 495400 }, { "epoch": 66.76098086769065, "grad_norm": 0.1665409356355667, "learning_rate": 0.0003701083864786371, "loss": 0.5306, "step": 495500 }, { "epoch": 66.77445432497979, "grad_norm": 0.16945381462574005, "learning_rate": 0.0003700709602083895, "loss": 0.5297, "step": 495600 }, { "epoch": 66.78792778226892, "grad_norm": 0.16157814860343933, "learning_rate": 0.0003700335339381419, "loss": 0.5295, "step": 495700 }, { "epoch": 66.80140123955807, "grad_norm": 0.17455680668354034, "learning_rate": 0.0003699961076678943, "loss": 0.5301, "step": 495800 }, { "epoch": 66.81487469684721, "grad_norm": 0.17055314779281616, "learning_rate": 0.0003699586813976467, "loss": 0.5304, "step": 495900 }, { "epoch": 66.82834815413635, "grad_norm": 0.16337551176548004, "learning_rate": 0.000369921255127399, "loss": 0.5288, "step": 496000 }, { "epoch": 66.84182161142549, "grad_norm": 0.16490429639816284, "learning_rate": 0.0003698838288571514, "loss": 0.5308, "step": 496100 }, { "epoch": 66.85529506871464, "grad_norm": 0.16516517102718353, "learning_rate": 0.0003698464025869038, "loss": 0.529, "step": 496200 }, { "epoch": 66.86876852600378, "grad_norm": 0.17289021611213684, "learning_rate": 0.0003698089763166562, "loss": 0.5291, "step": 496300 }, { "epoch": 66.88224198329291, "grad_norm": 0.17264996469020844, "learning_rate": 0.00036977155004640856, "loss": 0.5294, "step": 496400 }, { "epoch": 66.89571544058205, "grad_norm": 0.167804554104805, "learning_rate": 0.00036973412377616096, "loss": 0.5288, "step": 496500 }, { "epoch": 66.90918889787119, "grad_norm": 0.1650325506925583, "learning_rate": 0.00036969669750591336, "loss": 0.5297, "step": 496600 }, { "epoch": 66.92266235516034, "grad_norm": 0.1773194968700409, "learning_rate": 0.0003696592712356657, "loss": 0.5303, "step": 496700 }, { "epoch": 66.93613581244948, "grad_norm": 0.168755903840065, "learning_rate": 0.0003696218449654181, "loss": 0.5298, "step": 496800 }, { "epoch": 66.94960926973862, "grad_norm": 0.16576872766017914, "learning_rate": 0.0003695844186951705, "loss": 0.5289, "step": 496900 }, { "epoch": 66.96308272702775, "grad_norm": 0.19128350913524628, "learning_rate": 0.0003695469924249229, "loss": 0.5295, "step": 497000 }, { "epoch": 66.97655618431689, "grad_norm": 0.16100852191448212, "learning_rate": 0.0003695095661546753, "loss": 0.5303, "step": 497100 }, { "epoch": 66.99002964160604, "grad_norm": 0.2059338241815567, "learning_rate": 0.0003694721398844277, "loss": 0.5293, "step": 497200 }, { "epoch": 67.0, "eval_loss": 0.5174561738967896, "eval_runtime": 4.9552, "eval_samples_per_second": 1009.046, "eval_steps_per_second": 15.943, "step": 497274 }, { "epoch": 67.00350309889518, "grad_norm": 0.15509559214115143, "learning_rate": 0.0003694347136141801, "loss": 0.5293, "step": 497300 }, { "epoch": 67.01697655618432, "grad_norm": 0.16103386878967285, "learning_rate": 0.0003693972873439325, "loss": 0.5283, "step": 497400 }, { "epoch": 67.03045001347346, "grad_norm": 0.17604957520961761, "learning_rate": 0.00036935986107368483, "loss": 0.5295, "step": 497500 }, { "epoch": 67.04392347076259, "grad_norm": 0.1597185581922531, "learning_rate": 0.00036932243480343723, "loss": 0.5295, "step": 497600 }, { "epoch": 67.05739692805174, "grad_norm": 0.17771217226982117, "learning_rate": 0.00036928500853318963, "loss": 0.528, "step": 497700 }, { "epoch": 67.07087038534088, "grad_norm": 0.1827809363603592, "learning_rate": 0.00036924758226294203, "loss": 0.5293, "step": 497800 }, { "epoch": 67.08434384263002, "grad_norm": 0.17443571984767914, "learning_rate": 0.0003692101559926944, "loss": 0.5286, "step": 497900 }, { "epoch": 67.09781729991916, "grad_norm": 0.15538878738880157, "learning_rate": 0.0003691727297224468, "loss": 0.5295, "step": 498000 }, { "epoch": 67.1112907572083, "grad_norm": 0.17943862080574036, "learning_rate": 0.0003691353034521992, "loss": 0.5291, "step": 498100 }, { "epoch": 67.12476421449745, "grad_norm": 0.18899904191493988, "learning_rate": 0.0003690978771819515, "loss": 0.5302, "step": 498200 }, { "epoch": 67.13823767178658, "grad_norm": 0.17308010160923004, "learning_rate": 0.0003690604509117039, "loss": 0.5287, "step": 498300 }, { "epoch": 67.15171112907572, "grad_norm": 0.1724758744239807, "learning_rate": 0.0003690230246414563, "loss": 0.5288, "step": 498400 }, { "epoch": 67.16518458636486, "grad_norm": 0.16791675984859467, "learning_rate": 0.0003689855983712087, "loss": 0.5294, "step": 498500 }, { "epoch": 67.178658043654, "grad_norm": 0.15886333584785461, "learning_rate": 0.0003689481721009611, "loss": 0.528, "step": 498600 }, { "epoch": 67.19213150094315, "grad_norm": 0.17039886116981506, "learning_rate": 0.0003689107458307135, "loss": 0.5283, "step": 498700 }, { "epoch": 67.20560495823229, "grad_norm": 0.15822158753871918, "learning_rate": 0.0003688733195604659, "loss": 0.5288, "step": 498800 }, { "epoch": 67.21907841552142, "grad_norm": 0.1737460196018219, "learning_rate": 0.00036883589329021825, "loss": 0.5283, "step": 498900 }, { "epoch": 67.23255187281056, "grad_norm": 0.17066796123981476, "learning_rate": 0.00036879846701997064, "loss": 0.5284, "step": 499000 }, { "epoch": 67.2460253300997, "grad_norm": 0.17316336929798126, "learning_rate": 0.00036876104074972304, "loss": 0.5285, "step": 499100 }, { "epoch": 67.25949878738885, "grad_norm": 0.16630800068378448, "learning_rate": 0.00036872361447947544, "loss": 0.5288, "step": 499200 }, { "epoch": 67.27297224467799, "grad_norm": 0.15709395706653595, "learning_rate": 0.00036868618820922784, "loss": 0.5281, "step": 499300 }, { "epoch": 67.28644570196712, "grad_norm": 0.15710632503032684, "learning_rate": 0.00036864876193898024, "loss": 0.5294, "step": 499400 }, { "epoch": 67.29991915925626, "grad_norm": 0.17680422961711884, "learning_rate": 0.00036861133566873264, "loss": 0.5286, "step": 499500 }, { "epoch": 67.3133926165454, "grad_norm": 0.1708769053220749, "learning_rate": 0.00036857390939848503, "loss": 0.5291, "step": 499600 }, { "epoch": 67.32686607383455, "grad_norm": 0.15603885054588318, "learning_rate": 0.0003685364831282374, "loss": 0.5289, "step": 499700 }, { "epoch": 67.34033953112369, "grad_norm": 0.17323662340641022, "learning_rate": 0.0003684990568579898, "loss": 0.5283, "step": 499800 }, { "epoch": 67.35381298841283, "grad_norm": 0.16015899181365967, "learning_rate": 0.0003684616305877422, "loss": 0.5281, "step": 499900 }, { "epoch": 67.36728644570196, "grad_norm": 0.15738750994205475, "learning_rate": 0.0003684242043174945, "loss": 0.5295, "step": 500000 }, { "epoch": 67.3807599029911, "grad_norm": 0.15778711438179016, "learning_rate": 0.0003683867780472469, "loss": 0.5308, "step": 500100 }, { "epoch": 67.39423336028025, "grad_norm": 0.1699347198009491, "learning_rate": 0.0003683493517769993, "loss": 0.5294, "step": 500200 }, { "epoch": 67.40770681756939, "grad_norm": 0.1595175564289093, "learning_rate": 0.0003683119255067517, "loss": 0.5301, "step": 500300 }, { "epoch": 67.42118027485853, "grad_norm": 0.1929474174976349, "learning_rate": 0.00036827449923650406, "loss": 0.5288, "step": 500400 }, { "epoch": 67.43465373214767, "grad_norm": 0.1583140641450882, "learning_rate": 0.00036823707296625645, "loss": 0.53, "step": 500500 }, { "epoch": 67.4481271894368, "grad_norm": 0.16895000636577606, "learning_rate": 0.00036819964669600885, "loss": 0.5299, "step": 500600 }, { "epoch": 67.46160064672596, "grad_norm": 0.17087970674037933, "learning_rate": 0.00036816222042576125, "loss": 0.5299, "step": 500700 }, { "epoch": 67.47507410401509, "grad_norm": 0.16473503410816193, "learning_rate": 0.00036812479415551365, "loss": 0.5292, "step": 500800 }, { "epoch": 67.48854756130423, "grad_norm": 0.17105479538440704, "learning_rate": 0.00036808736788526605, "loss": 0.5287, "step": 500900 }, { "epoch": 67.50202101859337, "grad_norm": 0.19351240992546082, "learning_rate": 0.00036804994161501845, "loss": 0.5286, "step": 501000 }, { "epoch": 67.5154944758825, "grad_norm": 0.1674152910709381, "learning_rate": 0.0003680125153447708, "loss": 0.5296, "step": 501100 }, { "epoch": 67.52896793317166, "grad_norm": 0.1741122305393219, "learning_rate": 0.0003679750890745232, "loss": 0.5292, "step": 501200 }, { "epoch": 67.5424413904608, "grad_norm": 0.16239891946315765, "learning_rate": 0.0003679376628042756, "loss": 0.529, "step": 501300 }, { "epoch": 67.55591484774993, "grad_norm": 0.16046348214149475, "learning_rate": 0.000367900236534028, "loss": 0.5308, "step": 501400 }, { "epoch": 67.56938830503907, "grad_norm": 0.1749071180820465, "learning_rate": 0.0003678628102637804, "loss": 0.5287, "step": 501500 }, { "epoch": 67.5828617623282, "grad_norm": 0.1694994568824768, "learning_rate": 0.0003678253839935328, "loss": 0.5292, "step": 501600 }, { "epoch": 67.59633521961736, "grad_norm": 0.16966408491134644, "learning_rate": 0.0003677879577232852, "loss": 0.5285, "step": 501700 }, { "epoch": 67.6098086769065, "grad_norm": 0.16819503903388977, "learning_rate": 0.0003677505314530376, "loss": 0.5298, "step": 501800 }, { "epoch": 67.62328213419563, "grad_norm": 0.17295542359352112, "learning_rate": 0.00036771310518278987, "loss": 0.5301, "step": 501900 }, { "epoch": 67.63675559148477, "grad_norm": 0.17692309617996216, "learning_rate": 0.00036767567891254227, "loss": 0.53, "step": 502000 }, { "epoch": 67.65022904877391, "grad_norm": 0.15853871405124664, "learning_rate": 0.00036763825264229466, "loss": 0.5285, "step": 502100 }, { "epoch": 67.66370250606306, "grad_norm": 0.16104498505592346, "learning_rate": 0.00036760082637204706, "loss": 0.5293, "step": 502200 }, { "epoch": 67.6771759633522, "grad_norm": 0.18042542040348053, "learning_rate": 0.00036756340010179946, "loss": 0.5294, "step": 502300 }, { "epoch": 67.69064942064134, "grad_norm": 0.16404511034488678, "learning_rate": 0.00036752597383155186, "loss": 0.5292, "step": 502400 }, { "epoch": 67.70412287793047, "grad_norm": 0.18852011859416962, "learning_rate": 0.00036748854756130426, "loss": 0.5301, "step": 502500 }, { "epoch": 67.71759633521961, "grad_norm": 0.16889281570911407, "learning_rate": 0.0003674511212910566, "loss": 0.5296, "step": 502600 }, { "epoch": 67.73106979250876, "grad_norm": 0.16979892551898956, "learning_rate": 0.000367413695020809, "loss": 0.529, "step": 502700 }, { "epoch": 67.7445432497979, "grad_norm": 0.18507936596870422, "learning_rate": 0.0003673762687505614, "loss": 0.5303, "step": 502800 }, { "epoch": 67.75801670708704, "grad_norm": 0.17016734182834625, "learning_rate": 0.0003673388424803138, "loss": 0.5291, "step": 502900 }, { "epoch": 67.77149016437617, "grad_norm": 0.16590696573257446, "learning_rate": 0.0003673014162100662, "loss": 0.5298, "step": 503000 }, { "epoch": 67.78496362166531, "grad_norm": 0.162347674369812, "learning_rate": 0.0003672639899398186, "loss": 0.5296, "step": 503100 }, { "epoch": 67.79843707895446, "grad_norm": 0.16410186886787415, "learning_rate": 0.000367226563669571, "loss": 0.53, "step": 503200 }, { "epoch": 67.8119105362436, "grad_norm": 0.16617220640182495, "learning_rate": 0.00036718913739932333, "loss": 0.5294, "step": 503300 }, { "epoch": 67.82538399353274, "grad_norm": 0.16064612567424774, "learning_rate": 0.00036715171112907573, "loss": 0.5294, "step": 503400 }, { "epoch": 67.83885745082188, "grad_norm": 0.17121905088424683, "learning_rate": 0.00036711428485882813, "loss": 0.5296, "step": 503500 }, { "epoch": 67.85233090811101, "grad_norm": 0.17186732590198517, "learning_rate": 0.00036707685858858053, "loss": 0.5285, "step": 503600 }, { "epoch": 67.86580436540017, "grad_norm": 0.19112524390220642, "learning_rate": 0.0003670394323183329, "loss": 0.5291, "step": 503700 }, { "epoch": 67.8792778226893, "grad_norm": 0.1749352216720581, "learning_rate": 0.00036700200604808527, "loss": 0.5289, "step": 503800 }, { "epoch": 67.89275127997844, "grad_norm": 0.2029276341199875, "learning_rate": 0.00036696457977783767, "loss": 0.5294, "step": 503900 }, { "epoch": 67.90622473726758, "grad_norm": 0.17010356485843658, "learning_rate": 0.00036692715350759, "loss": 0.5295, "step": 504000 }, { "epoch": 67.91969819455673, "grad_norm": 0.1829620748758316, "learning_rate": 0.0003668897272373424, "loss": 0.5295, "step": 504100 }, { "epoch": 67.93317165184587, "grad_norm": 0.1687210649251938, "learning_rate": 0.0003668523009670948, "loss": 0.5288, "step": 504200 }, { "epoch": 67.946645109135, "grad_norm": 0.1672811061143875, "learning_rate": 0.0003668148746968472, "loss": 0.5282, "step": 504300 }, { "epoch": 67.96011856642414, "grad_norm": 0.16652578115463257, "learning_rate": 0.0003667774484265996, "loss": 0.5291, "step": 504400 }, { "epoch": 67.97359202371328, "grad_norm": 0.17311646044254303, "learning_rate": 0.000366740022156352, "loss": 0.5276, "step": 504500 }, { "epoch": 67.98706548100243, "grad_norm": 0.17870654165744781, "learning_rate": 0.0003667025958861044, "loss": 0.53, "step": 504600 }, { "epoch": 68.0, "eval_loss": 0.5180266499519348, "eval_runtime": 5.0006, "eval_samples_per_second": 999.87, "eval_steps_per_second": 15.798, "step": 504696 }, { "epoch": 68.00053893829157, "grad_norm": 0.1759510189294815, "learning_rate": 0.0003666651696158568, "loss": 0.5299, "step": 504700 }, { "epoch": 68.0140123955807, "grad_norm": 0.1591038703918457, "learning_rate": 0.00036662774334560915, "loss": 0.528, "step": 504800 }, { "epoch": 68.02748585286984, "grad_norm": 0.1712048053741455, "learning_rate": 0.00036659031707536154, "loss": 0.5285, "step": 504900 }, { "epoch": 68.04095931015898, "grad_norm": 0.15747909247875214, "learning_rate": 0.00036655289080511394, "loss": 0.5283, "step": 505000 }, { "epoch": 68.05443276744813, "grad_norm": 0.19863934814929962, "learning_rate": 0.00036651546453486634, "loss": 0.5292, "step": 505100 }, { "epoch": 68.06790622473727, "grad_norm": 0.2054462432861328, "learning_rate": 0.00036647803826461874, "loss": 0.5288, "step": 505200 }, { "epoch": 68.08137968202641, "grad_norm": 0.17743085324764252, "learning_rate": 0.00036644061199437114, "loss": 0.5292, "step": 505300 }, { "epoch": 68.09485313931555, "grad_norm": 0.1625303477048874, "learning_rate": 0.00036640318572412353, "loss": 0.5285, "step": 505400 }, { "epoch": 68.10832659660468, "grad_norm": 0.17619530856609344, "learning_rate": 0.0003663657594538758, "loss": 0.5292, "step": 505500 }, { "epoch": 68.12180005389384, "grad_norm": 0.15768373012542725, "learning_rate": 0.0003663283331836282, "loss": 0.5277, "step": 505600 }, { "epoch": 68.13527351118297, "grad_norm": 0.1551257073879242, "learning_rate": 0.0003662909069133806, "loss": 0.5283, "step": 505700 }, { "epoch": 68.14874696847211, "grad_norm": 0.17882880568504333, "learning_rate": 0.000366253480643133, "loss": 0.5279, "step": 505800 }, { "epoch": 68.16222042576125, "grad_norm": 0.16932079195976257, "learning_rate": 0.0003662160543728854, "loss": 0.5289, "step": 505900 }, { "epoch": 68.17569388305039, "grad_norm": 0.18277963995933533, "learning_rate": 0.0003661786281026378, "loss": 0.5283, "step": 506000 }, { "epoch": 68.18916734033954, "grad_norm": 0.15992188453674316, "learning_rate": 0.0003661412018323902, "loss": 0.5292, "step": 506100 }, { "epoch": 68.20264079762867, "grad_norm": 0.16750501096248627, "learning_rate": 0.00036610377556214256, "loss": 0.5285, "step": 506200 }, { "epoch": 68.21611425491781, "grad_norm": 0.18068759143352509, "learning_rate": 0.00036606634929189496, "loss": 0.5295, "step": 506300 }, { "epoch": 68.22958771220695, "grad_norm": 0.1667480766773224, "learning_rate": 0.00036602892302164735, "loss": 0.5286, "step": 506400 }, { "epoch": 68.24306116949609, "grad_norm": 0.15941864252090454, "learning_rate": 0.00036599149675139975, "loss": 0.5294, "step": 506500 }, { "epoch": 68.25653462678524, "grad_norm": 0.17099831998348236, "learning_rate": 0.00036595407048115215, "loss": 0.5301, "step": 506600 }, { "epoch": 68.27000808407438, "grad_norm": 0.1712314784526825, "learning_rate": 0.00036591664421090455, "loss": 0.529, "step": 506700 }, { "epoch": 68.28348154136351, "grad_norm": 0.17090310156345367, "learning_rate": 0.00036587921794065695, "loss": 0.5299, "step": 506800 }, { "epoch": 68.29695499865265, "grad_norm": 0.1743193417787552, "learning_rate": 0.00036584179167040935, "loss": 0.5292, "step": 506900 }, { "epoch": 68.31042845594179, "grad_norm": 0.17380252480506897, "learning_rate": 0.0003658043654001617, "loss": 0.5282, "step": 507000 }, { "epoch": 68.32390191323094, "grad_norm": 0.18768231570720673, "learning_rate": 0.0003657669391299141, "loss": 0.5278, "step": 507100 }, { "epoch": 68.33737537052008, "grad_norm": 0.21506698429584503, "learning_rate": 0.0003657295128596665, "loss": 0.53, "step": 507200 }, { "epoch": 68.35084882780922, "grad_norm": 0.17753365635871887, "learning_rate": 0.00036569208658941883, "loss": 0.5285, "step": 507300 }, { "epoch": 68.36432228509835, "grad_norm": 0.15388382971286774, "learning_rate": 0.00036565466031917123, "loss": 0.5284, "step": 507400 }, { "epoch": 68.37779574238749, "grad_norm": 0.16117408871650696, "learning_rate": 0.0003656172340489236, "loss": 0.5291, "step": 507500 }, { "epoch": 68.39126919967664, "grad_norm": 0.19582872092723846, "learning_rate": 0.000365579807778676, "loss": 0.5284, "step": 507600 }, { "epoch": 68.40474265696578, "grad_norm": 0.16270197927951813, "learning_rate": 0.00036554238150842837, "loss": 0.5294, "step": 507700 }, { "epoch": 68.41821611425492, "grad_norm": 0.1739441156387329, "learning_rate": 0.00036550495523818077, "loss": 0.5288, "step": 507800 }, { "epoch": 68.43168957154406, "grad_norm": 0.19180519878864288, "learning_rate": 0.00036546752896793317, "loss": 0.5291, "step": 507900 }, { "epoch": 68.44516302883319, "grad_norm": 0.1574881374835968, "learning_rate": 0.00036543010269768556, "loss": 0.529, "step": 508000 }, { "epoch": 68.45863648612234, "grad_norm": 0.1844193935394287, "learning_rate": 0.00036539267642743796, "loss": 0.5287, "step": 508100 }, { "epoch": 68.47210994341148, "grad_norm": 0.1660059690475464, "learning_rate": 0.00036535525015719036, "loss": 0.5281, "step": 508200 }, { "epoch": 68.48558340070062, "grad_norm": 0.15246924757957458, "learning_rate": 0.00036531782388694276, "loss": 0.5293, "step": 508300 }, { "epoch": 68.49905685798976, "grad_norm": 0.16581739485263824, "learning_rate": 0.0003652803976166951, "loss": 0.5288, "step": 508400 }, { "epoch": 68.5125303152789, "grad_norm": 0.18547876179218292, "learning_rate": 0.0003652429713464475, "loss": 0.529, "step": 508500 }, { "epoch": 68.52600377256805, "grad_norm": 0.1691884696483612, "learning_rate": 0.0003652055450761999, "loss": 0.5288, "step": 508600 }, { "epoch": 68.53947722985718, "grad_norm": 0.17312167584896088, "learning_rate": 0.0003651681188059523, "loss": 0.5282, "step": 508700 }, { "epoch": 68.55295068714632, "grad_norm": 0.17576497793197632, "learning_rate": 0.0003651306925357047, "loss": 0.5287, "step": 508800 }, { "epoch": 68.56642414443546, "grad_norm": 0.1520536243915558, "learning_rate": 0.0003650932662654571, "loss": 0.528, "step": 508900 }, { "epoch": 68.5798976017246, "grad_norm": 0.17256005108356476, "learning_rate": 0.0003650558399952095, "loss": 0.5289, "step": 509000 }, { "epoch": 68.59337105901375, "grad_norm": 0.17109382152557373, "learning_rate": 0.0003650184137249618, "loss": 0.5283, "step": 509100 }, { "epoch": 68.60684451630289, "grad_norm": 0.1743270754814148, "learning_rate": 0.0003649809874547142, "loss": 0.5302, "step": 509200 }, { "epoch": 68.62031797359202, "grad_norm": 0.1653723418712616, "learning_rate": 0.0003649435611844666, "loss": 0.5289, "step": 509300 }, { "epoch": 68.63379143088116, "grad_norm": 0.17867355048656464, "learning_rate": 0.000364906134914219, "loss": 0.5281, "step": 509400 }, { "epoch": 68.6472648881703, "grad_norm": 0.16260485351085663, "learning_rate": 0.0003648687086439714, "loss": 0.529, "step": 509500 }, { "epoch": 68.66073834545945, "grad_norm": 0.17895913124084473, "learning_rate": 0.00036483128237372377, "loss": 0.5287, "step": 509600 }, { "epoch": 68.67421180274859, "grad_norm": 0.16949766874313354, "learning_rate": 0.00036479385610347617, "loss": 0.529, "step": 509700 }, { "epoch": 68.68768526003772, "grad_norm": 0.16151919960975647, "learning_rate": 0.00036475642983322857, "loss": 0.5289, "step": 509800 }, { "epoch": 68.70115871732686, "grad_norm": 0.18438158929347992, "learning_rate": 0.0003647190035629809, "loss": 0.5292, "step": 509900 }, { "epoch": 68.714632174616, "grad_norm": 0.1899958848953247, "learning_rate": 0.0003646815772927333, "loss": 0.5282, "step": 510000 }, { "epoch": 68.72810563190515, "grad_norm": 0.15830737352371216, "learning_rate": 0.0003646441510224857, "loss": 0.5292, "step": 510100 }, { "epoch": 68.74157908919429, "grad_norm": 0.1676456332206726, "learning_rate": 0.0003646067247522381, "loss": 0.5285, "step": 510200 }, { "epoch": 68.75505254648343, "grad_norm": 0.17918558418750763, "learning_rate": 0.0003645692984819905, "loss": 0.5289, "step": 510300 }, { "epoch": 68.76852600377256, "grad_norm": 0.17030011117458344, "learning_rate": 0.0003645318722117429, "loss": 0.5291, "step": 510400 }, { "epoch": 68.7819994610617, "grad_norm": 0.16811759769916534, "learning_rate": 0.0003644944459414953, "loss": 0.5292, "step": 510500 }, { "epoch": 68.79547291835085, "grad_norm": 0.16175460815429688, "learning_rate": 0.00036445701967124765, "loss": 0.529, "step": 510600 }, { "epoch": 68.80894637563999, "grad_norm": 0.174069344997406, "learning_rate": 0.00036441959340100004, "loss": 0.5297, "step": 510700 }, { "epoch": 68.82241983292913, "grad_norm": 0.17747862637043, "learning_rate": 0.00036438216713075244, "loss": 0.5284, "step": 510800 }, { "epoch": 68.83589329021827, "grad_norm": 0.16930299997329712, "learning_rate": 0.0003643447408605048, "loss": 0.5297, "step": 510900 }, { "epoch": 68.8493667475074, "grad_norm": 0.1805487871170044, "learning_rate": 0.0003643073145902572, "loss": 0.5295, "step": 511000 }, { "epoch": 68.86284020479656, "grad_norm": 0.1634327918291092, "learning_rate": 0.0003642698883200096, "loss": 0.5291, "step": 511100 }, { "epoch": 68.87631366208569, "grad_norm": 0.1703648865222931, "learning_rate": 0.000364232462049762, "loss": 0.5294, "step": 511200 }, { "epoch": 68.88978711937483, "grad_norm": 0.16412962973117828, "learning_rate": 0.0003641950357795143, "loss": 0.5291, "step": 511300 }, { "epoch": 68.90326057666397, "grad_norm": 0.15870419144630432, "learning_rate": 0.0003641576095092667, "loss": 0.5281, "step": 511400 }, { "epoch": 68.91673403395312, "grad_norm": 0.16975438594818115, "learning_rate": 0.0003641201832390191, "loss": 0.529, "step": 511500 }, { "epoch": 68.93020749124226, "grad_norm": 0.1803872138261795, "learning_rate": 0.0003640827569687715, "loss": 0.5293, "step": 511600 }, { "epoch": 68.9436809485314, "grad_norm": 0.15912564098834991, "learning_rate": 0.0003640453306985239, "loss": 0.5292, "step": 511700 }, { "epoch": 68.95715440582053, "grad_norm": 0.19963307678699493, "learning_rate": 0.0003640079044282763, "loss": 0.5292, "step": 511800 }, { "epoch": 68.97062786310967, "grad_norm": 0.16891297698020935, "learning_rate": 0.0003639704781580287, "loss": 0.5296, "step": 511900 }, { "epoch": 68.98410132039882, "grad_norm": 0.17959745228290558, "learning_rate": 0.00036393305188778106, "loss": 0.5284, "step": 512000 }, { "epoch": 68.99757477768796, "grad_norm": 0.1640542447566986, "learning_rate": 0.00036389562561753346, "loss": 0.5282, "step": 512100 }, { "epoch": 69.0, "eval_loss": 0.5170564651489258, "eval_runtime": 4.9793, "eval_samples_per_second": 1004.148, "eval_steps_per_second": 15.866, "step": 512118 }, { "epoch": 69.0110482349771, "grad_norm": 0.16515213251113892, "learning_rate": 0.00036385819934728586, "loss": 0.5289, "step": 512200 }, { "epoch": 69.02452169226623, "grad_norm": 0.15811863541603088, "learning_rate": 0.00036382077307703825, "loss": 0.5271, "step": 512300 }, { "epoch": 69.03799514955537, "grad_norm": 0.15986157953739166, "learning_rate": 0.00036378334680679065, "loss": 0.5278, "step": 512400 }, { "epoch": 69.05146860684452, "grad_norm": 0.171857550740242, "learning_rate": 0.00036374592053654305, "loss": 0.5286, "step": 512500 }, { "epoch": 69.06494206413366, "grad_norm": 0.16225264966487885, "learning_rate": 0.00036370849426629545, "loss": 0.5279, "step": 512600 }, { "epoch": 69.0784155214228, "grad_norm": 0.1805567890405655, "learning_rate": 0.0003636710679960478, "loss": 0.5286, "step": 512700 }, { "epoch": 69.09188897871194, "grad_norm": 0.16823497414588928, "learning_rate": 0.00036363364172580014, "loss": 0.5282, "step": 512800 }, { "epoch": 69.10536243600107, "grad_norm": 0.1594049036502838, "learning_rate": 0.00036359621545555253, "loss": 0.5289, "step": 512900 }, { "epoch": 69.11883589329022, "grad_norm": 0.17424048483371735, "learning_rate": 0.00036355878918530493, "loss": 0.528, "step": 513000 }, { "epoch": 69.13230935057936, "grad_norm": 0.16316643357276917, "learning_rate": 0.00036352136291505733, "loss": 0.5293, "step": 513100 }, { "epoch": 69.1457828078685, "grad_norm": 0.17853614687919617, "learning_rate": 0.00036348393664480973, "loss": 0.5287, "step": 513200 }, { "epoch": 69.15925626515764, "grad_norm": 0.16083909571170807, "learning_rate": 0.00036344651037456213, "loss": 0.5287, "step": 513300 }, { "epoch": 69.17272972244677, "grad_norm": 0.18439429998397827, "learning_rate": 0.0003634090841043145, "loss": 0.529, "step": 513400 }, { "epoch": 69.18620317973593, "grad_norm": 0.17795027792453766, "learning_rate": 0.00036337165783406687, "loss": 0.5284, "step": 513500 }, { "epoch": 69.19967663702506, "grad_norm": 0.23241296410560608, "learning_rate": 0.00036333423156381927, "loss": 0.5294, "step": 513600 }, { "epoch": 69.2131500943142, "grad_norm": 0.16582292318344116, "learning_rate": 0.00036329680529357167, "loss": 0.5283, "step": 513700 }, { "epoch": 69.22662355160334, "grad_norm": 0.18982768058776855, "learning_rate": 0.00036325937902332406, "loss": 0.5285, "step": 513800 }, { "epoch": 69.24009700889248, "grad_norm": 0.16919954121112823, "learning_rate": 0.00036322195275307646, "loss": 0.5288, "step": 513900 }, { "epoch": 69.25357046618163, "grad_norm": 0.189523845911026, "learning_rate": 0.00036318452648282886, "loss": 0.5285, "step": 514000 }, { "epoch": 69.26704392347077, "grad_norm": 0.16714832186698914, "learning_rate": 0.00036314710021258126, "loss": 0.528, "step": 514100 }, { "epoch": 69.2805173807599, "grad_norm": 0.16948123276233673, "learning_rate": 0.0003631096739423336, "loss": 0.528, "step": 514200 }, { "epoch": 69.29399083804904, "grad_norm": 0.1670616865158081, "learning_rate": 0.000363072247672086, "loss": 0.529, "step": 514300 }, { "epoch": 69.30746429533818, "grad_norm": 0.17767785489559174, "learning_rate": 0.0003630348214018384, "loss": 0.5278, "step": 514400 }, { "epoch": 69.32093775262733, "grad_norm": 0.1766625940799713, "learning_rate": 0.00036299739513159074, "loss": 0.5281, "step": 514500 }, { "epoch": 69.33441120991647, "grad_norm": 0.1701284497976303, "learning_rate": 0.00036295996886134314, "loss": 0.5295, "step": 514600 }, { "epoch": 69.3478846672056, "grad_norm": 0.17718152701854706, "learning_rate": 0.00036292254259109554, "loss": 0.5291, "step": 514700 }, { "epoch": 69.36135812449474, "grad_norm": 0.17374277114868164, "learning_rate": 0.00036288511632084794, "loss": 0.5289, "step": 514800 }, { "epoch": 69.37483158178388, "grad_norm": 0.17688030004501343, "learning_rate": 0.0003628476900506003, "loss": 0.5295, "step": 514900 }, { "epoch": 69.38830503907303, "grad_norm": 0.1601601392030716, "learning_rate": 0.0003628102637803527, "loss": 0.528, "step": 515000 }, { "epoch": 69.40177849636217, "grad_norm": 0.16447998583316803, "learning_rate": 0.0003627728375101051, "loss": 0.5283, "step": 515100 }, { "epoch": 69.4152519536513, "grad_norm": 0.17933721840381622, "learning_rate": 0.0003627354112398575, "loss": 0.5287, "step": 515200 }, { "epoch": 69.42872541094044, "grad_norm": 0.1788279414176941, "learning_rate": 0.0003626979849696099, "loss": 0.5288, "step": 515300 }, { "epoch": 69.44219886822958, "grad_norm": 0.16208936274051666, "learning_rate": 0.0003626605586993623, "loss": 0.5282, "step": 515400 }, { "epoch": 69.45567232551873, "grad_norm": 0.19414371252059937, "learning_rate": 0.00036262313242911467, "loss": 0.5285, "step": 515500 }, { "epoch": 69.46914578280787, "grad_norm": 0.1698944866657257, "learning_rate": 0.00036258570615886707, "loss": 0.5288, "step": 515600 }, { "epoch": 69.48261924009701, "grad_norm": 0.18245893716812134, "learning_rate": 0.0003625482798886194, "loss": 0.5287, "step": 515700 }, { "epoch": 69.49609269738615, "grad_norm": 0.18446709215641022, "learning_rate": 0.0003625108536183718, "loss": 0.5288, "step": 515800 }, { "epoch": 69.50956615467528, "grad_norm": 0.17982271313667297, "learning_rate": 0.0003624734273481242, "loss": 0.5291, "step": 515900 }, { "epoch": 69.52303961196444, "grad_norm": 0.1762552559375763, "learning_rate": 0.0003624360010778766, "loss": 0.5288, "step": 516000 }, { "epoch": 69.53651306925357, "grad_norm": 0.1776135265827179, "learning_rate": 0.000362398574807629, "loss": 0.5294, "step": 516100 }, { "epoch": 69.54998652654271, "grad_norm": 0.17654308676719666, "learning_rate": 0.0003623611485373814, "loss": 0.529, "step": 516200 }, { "epoch": 69.56345998383185, "grad_norm": 0.1674351841211319, "learning_rate": 0.00036232372226713375, "loss": 0.5276, "step": 516300 }, { "epoch": 69.57693344112099, "grad_norm": 0.16153883934020996, "learning_rate": 0.0003622862959968861, "loss": 0.5291, "step": 516400 }, { "epoch": 69.59040689841014, "grad_norm": 0.14957119524478912, "learning_rate": 0.0003622488697266385, "loss": 0.5281, "step": 516500 }, { "epoch": 69.60388035569927, "grad_norm": 0.17016316950321198, "learning_rate": 0.0003622114434563909, "loss": 0.5284, "step": 516600 }, { "epoch": 69.61735381298841, "grad_norm": 0.184808149933815, "learning_rate": 0.0003621740171861433, "loss": 0.5287, "step": 516700 }, { "epoch": 69.63082727027755, "grad_norm": 0.16121113300323486, "learning_rate": 0.0003621365909158957, "loss": 0.5281, "step": 516800 }, { "epoch": 69.64430072756669, "grad_norm": 0.17068304121494293, "learning_rate": 0.0003620991646456481, "loss": 0.5283, "step": 516900 }, { "epoch": 69.65777418485584, "grad_norm": 0.1720399558544159, "learning_rate": 0.0003620617383754005, "loss": 0.529, "step": 517000 }, { "epoch": 69.67124764214498, "grad_norm": 0.16441209614276886, "learning_rate": 0.0003620243121051528, "loss": 0.5294, "step": 517100 }, { "epoch": 69.68472109943411, "grad_norm": 0.16535291075706482, "learning_rate": 0.0003619868858349052, "loss": 0.5288, "step": 517200 }, { "epoch": 69.69819455672325, "grad_norm": 0.16973046958446503, "learning_rate": 0.0003619494595646576, "loss": 0.5292, "step": 517300 }, { "epoch": 69.71166801401239, "grad_norm": 0.1789030283689499, "learning_rate": 0.00036191203329441, "loss": 0.5293, "step": 517400 }, { "epoch": 69.72514147130154, "grad_norm": 0.16699951887130737, "learning_rate": 0.0003618746070241624, "loss": 0.5281, "step": 517500 }, { "epoch": 69.73861492859068, "grad_norm": 0.16599352657794952, "learning_rate": 0.0003618371807539148, "loss": 0.5282, "step": 517600 }, { "epoch": 69.75208838587982, "grad_norm": 0.16355177760124207, "learning_rate": 0.0003617997544836672, "loss": 0.5282, "step": 517700 }, { "epoch": 69.76556184316895, "grad_norm": 0.16220712661743164, "learning_rate": 0.0003617623282134196, "loss": 0.5296, "step": 517800 }, { "epoch": 69.77903530045809, "grad_norm": 0.17431290447711945, "learning_rate": 0.00036172490194317196, "loss": 0.5278, "step": 517900 }, { "epoch": 69.79250875774724, "grad_norm": 0.1541222333908081, "learning_rate": 0.00036168747567292436, "loss": 0.528, "step": 518000 }, { "epoch": 69.80598221503638, "grad_norm": 0.17392896115779877, "learning_rate": 0.00036165004940267676, "loss": 0.5293, "step": 518100 }, { "epoch": 69.81945567232552, "grad_norm": 0.16785268485546112, "learning_rate": 0.0003616126231324291, "loss": 0.5286, "step": 518200 }, { "epoch": 69.83292912961466, "grad_norm": 0.1801600307226181, "learning_rate": 0.0003615751968621815, "loss": 0.5294, "step": 518300 }, { "epoch": 69.84640258690379, "grad_norm": 0.17435573041439056, "learning_rate": 0.0003615377705919339, "loss": 0.5293, "step": 518400 }, { "epoch": 69.85987604419294, "grad_norm": 0.17061902582645416, "learning_rate": 0.0003615003443216863, "loss": 0.5288, "step": 518500 }, { "epoch": 69.87334950148208, "grad_norm": 0.1643121838569641, "learning_rate": 0.00036146291805143864, "loss": 0.5291, "step": 518600 }, { "epoch": 69.88682295877122, "grad_norm": 0.18151748180389404, "learning_rate": 0.00036142549178119104, "loss": 0.5293, "step": 518700 }, { "epoch": 69.90029641606036, "grad_norm": 0.17181630432605743, "learning_rate": 0.00036138806551094343, "loss": 0.5276, "step": 518800 }, { "epoch": 69.91376987334951, "grad_norm": 0.16087278723716736, "learning_rate": 0.00036135063924069583, "loss": 0.5279, "step": 518900 }, { "epoch": 69.92724333063865, "grad_norm": 0.1823626458644867, "learning_rate": 0.00036131321297044823, "loss": 0.5285, "step": 519000 }, { "epoch": 69.94071678792778, "grad_norm": 0.16472528874874115, "learning_rate": 0.00036127578670020063, "loss": 0.5283, "step": 519100 }, { "epoch": 69.95419024521692, "grad_norm": 0.17278392612934113, "learning_rate": 0.00036123836042995303, "loss": 0.5287, "step": 519200 }, { "epoch": 69.96766370250606, "grad_norm": 0.16273964941501617, "learning_rate": 0.00036120093415970537, "loss": 0.5287, "step": 519300 }, { "epoch": 69.98113715979521, "grad_norm": 0.16963879764080048, "learning_rate": 0.00036116350788945777, "loss": 0.5295, "step": 519400 }, { "epoch": 69.99461061708435, "grad_norm": 0.17228339612483978, "learning_rate": 0.00036112608161921017, "loss": 0.5285, "step": 519500 }, { "epoch": 70.0, "eval_loss": 0.5173009037971497, "eval_runtime": 4.9549, "eval_samples_per_second": 1009.105, "eval_steps_per_second": 15.944, "step": 519540 }, { "epoch": 70.00808407437349, "grad_norm": 0.17375874519348145, "learning_rate": 0.00036108865534896257, "loss": 0.5277, "step": 519600 }, { "epoch": 70.02155753166262, "grad_norm": 0.1732734590768814, "learning_rate": 0.00036105122907871496, "loss": 0.5282, "step": 519700 }, { "epoch": 70.03503098895176, "grad_norm": 0.1814701408147812, "learning_rate": 0.00036101380280846736, "loss": 0.5282, "step": 519800 }, { "epoch": 70.04850444624091, "grad_norm": 0.17470726370811462, "learning_rate": 0.00036097637653821976, "loss": 0.5276, "step": 519900 }, { "epoch": 70.06197790353005, "grad_norm": 0.17623820900917053, "learning_rate": 0.00036093895026797205, "loss": 0.5273, "step": 520000 }, { "epoch": 70.07545136081919, "grad_norm": 0.1700960248708725, "learning_rate": 0.00036090152399772445, "loss": 0.5276, "step": 520100 }, { "epoch": 70.08892481810832, "grad_norm": 0.1612946093082428, "learning_rate": 0.00036086409772747685, "loss": 0.5276, "step": 520200 }, { "epoch": 70.10239827539746, "grad_norm": 0.1703154444694519, "learning_rate": 0.00036082667145722925, "loss": 0.5276, "step": 520300 }, { "epoch": 70.11587173268661, "grad_norm": 0.18643717467784882, "learning_rate": 0.00036078924518698164, "loss": 0.5275, "step": 520400 }, { "epoch": 70.12934518997575, "grad_norm": 0.16570048034191132, "learning_rate": 0.00036075181891673404, "loss": 0.529, "step": 520500 }, { "epoch": 70.14281864726489, "grad_norm": 0.17028014361858368, "learning_rate": 0.00036071439264648644, "loss": 0.5283, "step": 520600 }, { "epoch": 70.15629210455403, "grad_norm": 0.16367508471012115, "learning_rate": 0.00036067696637623884, "loss": 0.5285, "step": 520700 }, { "epoch": 70.16976556184316, "grad_norm": 0.18497583270072937, "learning_rate": 0.0003606395401059912, "loss": 0.5279, "step": 520800 }, { "epoch": 70.18323901913232, "grad_norm": 0.19187624752521515, "learning_rate": 0.0003606021138357436, "loss": 0.5288, "step": 520900 }, { "epoch": 70.19671247642145, "grad_norm": 0.1666116863489151, "learning_rate": 0.000360564687565496, "loss": 0.529, "step": 521000 }, { "epoch": 70.21018593371059, "grad_norm": 0.17487989366054535, "learning_rate": 0.0003605272612952484, "loss": 0.5282, "step": 521100 }, { "epoch": 70.22365939099973, "grad_norm": 0.15832054615020752, "learning_rate": 0.0003604898350250008, "loss": 0.527, "step": 521200 }, { "epoch": 70.23713284828887, "grad_norm": 0.17649781703948975, "learning_rate": 0.0003604524087547532, "loss": 0.5277, "step": 521300 }, { "epoch": 70.25060630557802, "grad_norm": 0.16984125971794128, "learning_rate": 0.00036041498248450557, "loss": 0.5276, "step": 521400 }, { "epoch": 70.26407976286715, "grad_norm": 0.15615403652191162, "learning_rate": 0.0003603775562142579, "loss": 0.5276, "step": 521500 }, { "epoch": 70.27755322015629, "grad_norm": 0.17289279401302338, "learning_rate": 0.0003603401299440103, "loss": 0.5276, "step": 521600 }, { "epoch": 70.29102667744543, "grad_norm": 0.17761312425136566, "learning_rate": 0.0003603027036737627, "loss": 0.528, "step": 521700 }, { "epoch": 70.30450013473457, "grad_norm": 0.16262580454349518, "learning_rate": 0.00036026527740351506, "loss": 0.5295, "step": 521800 }, { "epoch": 70.31797359202372, "grad_norm": 0.168336421251297, "learning_rate": 0.00036022785113326745, "loss": 0.5278, "step": 521900 }, { "epoch": 70.33144704931286, "grad_norm": 0.161973774433136, "learning_rate": 0.00036019042486301985, "loss": 0.5286, "step": 522000 }, { "epoch": 70.344920506602, "grad_norm": 0.1632167249917984, "learning_rate": 0.00036015299859277225, "loss": 0.5282, "step": 522100 }, { "epoch": 70.35839396389113, "grad_norm": 0.16832266747951508, "learning_rate": 0.0003601155723225246, "loss": 0.5289, "step": 522200 }, { "epoch": 70.37186742118027, "grad_norm": 0.1641651839017868, "learning_rate": 0.000360078146052277, "loss": 0.5285, "step": 522300 }, { "epoch": 70.38534087846942, "grad_norm": 0.17084892094135284, "learning_rate": 0.0003600407197820294, "loss": 0.5287, "step": 522400 }, { "epoch": 70.39881433575856, "grad_norm": 0.16870807111263275, "learning_rate": 0.0003600032935117818, "loss": 0.5292, "step": 522500 }, { "epoch": 70.4122877930477, "grad_norm": 0.16287212073802948, "learning_rate": 0.0003599658672415342, "loss": 0.5276, "step": 522600 }, { "epoch": 70.42576125033683, "grad_norm": 0.18008974194526672, "learning_rate": 0.0003599284409712866, "loss": 0.5274, "step": 522700 }, { "epoch": 70.43923470762597, "grad_norm": 0.1624898761510849, "learning_rate": 0.000359891014701039, "loss": 0.5283, "step": 522800 }, { "epoch": 70.45270816491512, "grad_norm": 0.1602194756269455, "learning_rate": 0.0003598535884307914, "loss": 0.5291, "step": 522900 }, { "epoch": 70.46618162220426, "grad_norm": 0.19040976464748383, "learning_rate": 0.0003598161621605437, "loss": 0.5288, "step": 523000 }, { "epoch": 70.4796550794934, "grad_norm": 0.17037346959114075, "learning_rate": 0.0003597787358902961, "loss": 0.529, "step": 523100 }, { "epoch": 70.49312853678254, "grad_norm": 0.159316286444664, "learning_rate": 0.0003597413096200485, "loss": 0.5279, "step": 523200 }, { "epoch": 70.50660199407167, "grad_norm": 0.17125017940998077, "learning_rate": 0.0003597038833498009, "loss": 0.5288, "step": 523300 }, { "epoch": 70.52007545136082, "grad_norm": 0.1877738982439041, "learning_rate": 0.0003596664570795533, "loss": 0.529, "step": 523400 }, { "epoch": 70.53354890864996, "grad_norm": 0.16042105853557587, "learning_rate": 0.0003596290308093057, "loss": 0.529, "step": 523500 }, { "epoch": 70.5470223659391, "grad_norm": 0.16424855589866638, "learning_rate": 0.00035959160453905806, "loss": 0.5288, "step": 523600 }, { "epoch": 70.56049582322824, "grad_norm": 0.1647593230009079, "learning_rate": 0.0003595541782688104, "loss": 0.529, "step": 523700 }, { "epoch": 70.57396928051737, "grad_norm": 0.18196125328540802, "learning_rate": 0.0003595167519985628, "loss": 0.5279, "step": 523800 }, { "epoch": 70.58744273780653, "grad_norm": 0.192519411444664, "learning_rate": 0.0003594793257283152, "loss": 0.5284, "step": 523900 }, { "epoch": 70.60091619509566, "grad_norm": 0.17879928648471832, "learning_rate": 0.0003594418994580676, "loss": 0.5284, "step": 524000 }, { "epoch": 70.6143896523848, "grad_norm": 0.1666075438261032, "learning_rate": 0.00035940447318782, "loss": 0.5276, "step": 524100 }, { "epoch": 70.62786310967394, "grad_norm": 0.17189070582389832, "learning_rate": 0.0003593670469175724, "loss": 0.5285, "step": 524200 }, { "epoch": 70.64133656696308, "grad_norm": 0.17142492532730103, "learning_rate": 0.0003593296206473248, "loss": 0.5284, "step": 524300 }, { "epoch": 70.65481002425223, "grad_norm": 0.17538632452487946, "learning_rate": 0.00035929219437707714, "loss": 0.5279, "step": 524400 }, { "epoch": 70.66828348154137, "grad_norm": 0.17189833521842957, "learning_rate": 0.00035925476810682954, "loss": 0.5284, "step": 524500 }, { "epoch": 70.6817569388305, "grad_norm": 0.17419536411762238, "learning_rate": 0.00035921734183658194, "loss": 0.5292, "step": 524600 }, { "epoch": 70.69523039611964, "grad_norm": 0.1753648966550827, "learning_rate": 0.00035917991556633433, "loss": 0.5285, "step": 524700 }, { "epoch": 70.70870385340878, "grad_norm": 0.15655365586280823, "learning_rate": 0.00035914248929608673, "loss": 0.5277, "step": 524800 }, { "epoch": 70.72217731069793, "grad_norm": 0.17487385869026184, "learning_rate": 0.00035910506302583913, "loss": 0.5285, "step": 524900 }, { "epoch": 70.73565076798707, "grad_norm": 0.17209291458129883, "learning_rate": 0.00035906763675559153, "loss": 0.5287, "step": 525000 }, { "epoch": 70.7491242252762, "grad_norm": 0.187491774559021, "learning_rate": 0.0003590302104853439, "loss": 0.5285, "step": 525100 }, { "epoch": 70.76259768256534, "grad_norm": 0.17080871760845184, "learning_rate": 0.00035899278421509627, "loss": 0.5288, "step": 525200 }, { "epoch": 70.77607113985448, "grad_norm": 0.16773313283920288, "learning_rate": 0.00035895535794484867, "loss": 0.5289, "step": 525300 }, { "epoch": 70.78954459714363, "grad_norm": 0.17704983055591583, "learning_rate": 0.000358917931674601, "loss": 0.5289, "step": 525400 }, { "epoch": 70.80301805443277, "grad_norm": 0.16734862327575684, "learning_rate": 0.0003588805054043534, "loss": 0.5277, "step": 525500 }, { "epoch": 70.8164915117219, "grad_norm": 0.2227095663547516, "learning_rate": 0.0003588430791341058, "loss": 0.5289, "step": 525600 }, { "epoch": 70.82996496901104, "grad_norm": 0.17877240478992462, "learning_rate": 0.0003588056528638582, "loss": 0.5288, "step": 525700 }, { "epoch": 70.84343842630018, "grad_norm": 0.16013169288635254, "learning_rate": 0.0003587682265936106, "loss": 0.5294, "step": 525800 }, { "epoch": 70.85691188358933, "grad_norm": 0.16204605996608734, "learning_rate": 0.00035873080032336295, "loss": 0.5287, "step": 525900 }, { "epoch": 70.87038534087847, "grad_norm": 0.1818481981754303, "learning_rate": 0.00035869337405311535, "loss": 0.5291, "step": 526000 }, { "epoch": 70.88385879816761, "grad_norm": 0.16616572439670563, "learning_rate": 0.00035865594778286775, "loss": 0.5291, "step": 526100 }, { "epoch": 70.89733225545675, "grad_norm": 0.16415759921073914, "learning_rate": 0.00035861852151262014, "loss": 0.5281, "step": 526200 }, { "epoch": 70.91080571274588, "grad_norm": 0.17406323552131653, "learning_rate": 0.00035858109524237254, "loss": 0.5286, "step": 526300 }, { "epoch": 70.92427917003504, "grad_norm": 0.21139372885227203, "learning_rate": 0.00035854366897212494, "loss": 0.5282, "step": 526400 }, { "epoch": 70.93775262732417, "grad_norm": 0.18411031365394592, "learning_rate": 0.00035850624270187734, "loss": 0.5294, "step": 526500 }, { "epoch": 70.95122608461331, "grad_norm": 0.17910660803318024, "learning_rate": 0.0003584688164316297, "loss": 0.5288, "step": 526600 }, { "epoch": 70.96469954190245, "grad_norm": 0.22952014207839966, "learning_rate": 0.0003584313901613821, "loss": 0.5283, "step": 526700 }, { "epoch": 70.9781729991916, "grad_norm": 0.1815253049135208, "learning_rate": 0.0003583939638911345, "loss": 0.5289, "step": 526800 }, { "epoch": 70.99164645648074, "grad_norm": 0.1860535740852356, "learning_rate": 0.0003583565376208869, "loss": 0.528, "step": 526900 }, { "epoch": 71.0, "eval_loss": 0.5171042680740356, "eval_runtime": 4.959, "eval_samples_per_second": 1008.264, "eval_steps_per_second": 15.931, "step": 526962 }, { "epoch": 71.00511991376987, "grad_norm": 0.20414842665195465, "learning_rate": 0.0003583191113506393, "loss": 0.5279, "step": 527000 }, { "epoch": 71.01859337105901, "grad_norm": 0.1678440123796463, "learning_rate": 0.0003582816850803917, "loss": 0.5276, "step": 527100 }, { "epoch": 71.03206682834815, "grad_norm": 0.16157573461532593, "learning_rate": 0.000358244258810144, "loss": 0.5291, "step": 527200 }, { "epoch": 71.0455402856373, "grad_norm": 0.17595107853412628, "learning_rate": 0.00035820683253989636, "loss": 0.5274, "step": 527300 }, { "epoch": 71.05901374292644, "grad_norm": 0.1939457505941391, "learning_rate": 0.00035816940626964876, "loss": 0.528, "step": 527400 }, { "epoch": 71.07248720021558, "grad_norm": 0.1672024428844452, "learning_rate": 0.00035813197999940116, "loss": 0.5287, "step": 527500 }, { "epoch": 71.08596065750471, "grad_norm": 0.17849169671535492, "learning_rate": 0.00035809455372915356, "loss": 0.5285, "step": 527600 }, { "epoch": 71.09943411479385, "grad_norm": 0.1671864539384842, "learning_rate": 0.00035805712745890596, "loss": 0.5278, "step": 527700 }, { "epoch": 71.112907572083, "grad_norm": 0.17994576692581177, "learning_rate": 0.00035801970118865835, "loss": 0.5275, "step": 527800 }, { "epoch": 71.12638102937214, "grad_norm": 0.16920289397239685, "learning_rate": 0.00035798227491841075, "loss": 0.528, "step": 527900 }, { "epoch": 71.13985448666128, "grad_norm": 0.15903447568416595, "learning_rate": 0.00035794484864816315, "loss": 0.5272, "step": 528000 }, { "epoch": 71.15332794395042, "grad_norm": 0.16648566722869873, "learning_rate": 0.0003579074223779155, "loss": 0.5279, "step": 528100 }, { "epoch": 71.16680140123955, "grad_norm": 0.16076532006263733, "learning_rate": 0.0003578699961076679, "loss": 0.528, "step": 528200 }, { "epoch": 71.1802748585287, "grad_norm": 0.17169621586799622, "learning_rate": 0.0003578325698374203, "loss": 0.5286, "step": 528300 }, { "epoch": 71.19374831581784, "grad_norm": 0.1578981876373291, "learning_rate": 0.0003577951435671727, "loss": 0.5277, "step": 528400 }, { "epoch": 71.20722177310698, "grad_norm": 0.16602419316768646, "learning_rate": 0.0003577577172969251, "loss": 0.5275, "step": 528500 }, { "epoch": 71.22069523039612, "grad_norm": 0.16507484018802643, "learning_rate": 0.0003577202910266775, "loss": 0.529, "step": 528600 }, { "epoch": 71.23416868768525, "grad_norm": 0.1582343429327011, "learning_rate": 0.0003576828647564299, "loss": 0.5278, "step": 528700 }, { "epoch": 71.2476421449744, "grad_norm": 0.16325077414512634, "learning_rate": 0.00035764543848618223, "loss": 0.5283, "step": 528800 }, { "epoch": 71.26111560226354, "grad_norm": 0.17080889642238617, "learning_rate": 0.0003576080122159346, "loss": 0.5269, "step": 528900 }, { "epoch": 71.27458905955268, "grad_norm": 0.16492925584316254, "learning_rate": 0.00035757058594568697, "loss": 0.5281, "step": 529000 }, { "epoch": 71.28806251684182, "grad_norm": 0.16396519541740417, "learning_rate": 0.00035753315967543937, "loss": 0.5282, "step": 529100 }, { "epoch": 71.30153597413096, "grad_norm": 0.17938809096813202, "learning_rate": 0.00035749573340519177, "loss": 0.5285, "step": 529200 }, { "epoch": 71.31500943142011, "grad_norm": 0.16011731326580048, "learning_rate": 0.00035745830713494416, "loss": 0.5276, "step": 529300 }, { "epoch": 71.32848288870925, "grad_norm": 0.19241242110729218, "learning_rate": 0.00035742088086469656, "loss": 0.5276, "step": 529400 }, { "epoch": 71.34195634599838, "grad_norm": 0.17008128762245178, "learning_rate": 0.0003573834545944489, "loss": 0.5278, "step": 529500 }, { "epoch": 71.35542980328752, "grad_norm": 0.1657331883907318, "learning_rate": 0.0003573460283242013, "loss": 0.5283, "step": 529600 }, { "epoch": 71.36890326057666, "grad_norm": 0.1858435571193695, "learning_rate": 0.0003573086020539537, "loss": 0.5267, "step": 529700 }, { "epoch": 71.38237671786581, "grad_norm": 0.16542288661003113, "learning_rate": 0.0003572711757837061, "loss": 0.5277, "step": 529800 }, { "epoch": 71.39585017515495, "grad_norm": 0.21264345943927765, "learning_rate": 0.0003572337495134585, "loss": 0.5288, "step": 529900 }, { "epoch": 71.40932363244409, "grad_norm": 0.16190698742866516, "learning_rate": 0.0003571963232432109, "loss": 0.5295, "step": 530000 }, { "epoch": 71.42279708973322, "grad_norm": 0.1591610610485077, "learning_rate": 0.0003571588969729633, "loss": 0.5284, "step": 530100 }, { "epoch": 71.43627054702236, "grad_norm": 0.1844765543937683, "learning_rate": 0.00035712147070271564, "loss": 0.5278, "step": 530200 }, { "epoch": 71.44974400431151, "grad_norm": 0.17924068868160248, "learning_rate": 0.00035708404443246804, "loss": 0.5279, "step": 530300 }, { "epoch": 71.46321746160065, "grad_norm": 0.16548091173171997, "learning_rate": 0.00035704661816222044, "loss": 0.5268, "step": 530400 }, { "epoch": 71.47669091888979, "grad_norm": 0.17338694632053375, "learning_rate": 0.00035700919189197284, "loss": 0.5282, "step": 530500 }, { "epoch": 71.49016437617892, "grad_norm": 0.20434382557868958, "learning_rate": 0.00035697176562172523, "loss": 0.5283, "step": 530600 }, { "epoch": 71.50363783346806, "grad_norm": 0.1593264490365982, "learning_rate": 0.00035693433935147763, "loss": 0.5286, "step": 530700 }, { "epoch": 71.51711129075721, "grad_norm": 0.17438718676567078, "learning_rate": 0.00035689691308123, "loss": 0.528, "step": 530800 }, { "epoch": 71.53058474804635, "grad_norm": 0.1678730696439743, "learning_rate": 0.0003568594868109824, "loss": 0.528, "step": 530900 }, { "epoch": 71.54405820533549, "grad_norm": 0.17203694581985474, "learning_rate": 0.0003568220605407347, "loss": 0.529, "step": 531000 }, { "epoch": 71.55753166262463, "grad_norm": 0.1880297064781189, "learning_rate": 0.0003567846342704871, "loss": 0.5282, "step": 531100 }, { "epoch": 71.57100511991376, "grad_norm": 0.17932672798633575, "learning_rate": 0.0003567472080002395, "loss": 0.5275, "step": 531200 }, { "epoch": 71.58447857720292, "grad_norm": 0.19204267859458923, "learning_rate": 0.0003567097817299919, "loss": 0.5278, "step": 531300 }, { "epoch": 71.59795203449205, "grad_norm": 0.1597428172826767, "learning_rate": 0.0003566723554597443, "loss": 0.5293, "step": 531400 }, { "epoch": 71.61142549178119, "grad_norm": 0.1708197444677353, "learning_rate": 0.0003566349291894967, "loss": 0.5291, "step": 531500 }, { "epoch": 71.62489894907033, "grad_norm": 0.17671073973178864, "learning_rate": 0.0003565975029192491, "loss": 0.5285, "step": 531600 }, { "epoch": 71.63837240635947, "grad_norm": 0.16999030113220215, "learning_rate": 0.00035656007664900145, "loss": 0.5273, "step": 531700 }, { "epoch": 71.65184586364862, "grad_norm": 0.16219577193260193, "learning_rate": 0.00035652265037875385, "loss": 0.5274, "step": 531800 }, { "epoch": 71.66531932093775, "grad_norm": 0.20188835263252258, "learning_rate": 0.00035648522410850625, "loss": 0.5283, "step": 531900 }, { "epoch": 71.67879277822689, "grad_norm": 0.19858555495738983, "learning_rate": 0.00035644779783825865, "loss": 0.5276, "step": 532000 }, { "epoch": 71.69226623551603, "grad_norm": 0.1691129356622696, "learning_rate": 0.00035641037156801104, "loss": 0.5277, "step": 532100 }, { "epoch": 71.70573969280517, "grad_norm": 0.18574170768260956, "learning_rate": 0.00035637294529776344, "loss": 0.5277, "step": 532200 }, { "epoch": 71.71921315009432, "grad_norm": 0.16812726855278015, "learning_rate": 0.00035633551902751584, "loss": 0.5272, "step": 532300 }, { "epoch": 71.73268660738346, "grad_norm": 0.17656190693378448, "learning_rate": 0.0003562980927572682, "loss": 0.5284, "step": 532400 }, { "epoch": 71.7461600646726, "grad_norm": 0.1676611602306366, "learning_rate": 0.0003562606664870206, "loss": 0.5285, "step": 532500 }, { "epoch": 71.75963352196173, "grad_norm": 0.17494429647922516, "learning_rate": 0.000356223240216773, "loss": 0.5277, "step": 532600 }, { "epoch": 71.77310697925087, "grad_norm": 0.16821837425231934, "learning_rate": 0.0003561858139465253, "loss": 0.5295, "step": 532700 }, { "epoch": 71.78658043654002, "grad_norm": 0.1653452068567276, "learning_rate": 0.0003561483876762777, "loss": 0.529, "step": 532800 }, { "epoch": 71.80005389382916, "grad_norm": 0.1713324934244156, "learning_rate": 0.0003561109614060301, "loss": 0.5276, "step": 532900 }, { "epoch": 71.8135273511183, "grad_norm": 0.1653285175561905, "learning_rate": 0.0003560735351357825, "loss": 0.5276, "step": 533000 }, { "epoch": 71.82700080840743, "grad_norm": 0.15718974173069, "learning_rate": 0.0003560361088655349, "loss": 0.5283, "step": 533100 }, { "epoch": 71.84047426569657, "grad_norm": 0.18446508049964905, "learning_rate": 0.00035599868259528726, "loss": 0.5289, "step": 533200 }, { "epoch": 71.85394772298572, "grad_norm": 0.17394620180130005, "learning_rate": 0.00035596125632503966, "loss": 0.5284, "step": 533300 }, { "epoch": 71.86742118027486, "grad_norm": 0.17065860331058502, "learning_rate": 0.00035592383005479206, "loss": 0.5282, "step": 533400 }, { "epoch": 71.880894637564, "grad_norm": 0.190011665225029, "learning_rate": 0.00035588640378454446, "loss": 0.5282, "step": 533500 }, { "epoch": 71.89436809485314, "grad_norm": 0.1674264371395111, "learning_rate": 0.00035584897751429686, "loss": 0.5279, "step": 533600 }, { "epoch": 71.90784155214227, "grad_norm": 0.16993898153305054, "learning_rate": 0.00035581155124404925, "loss": 0.5277, "step": 533700 }, { "epoch": 71.92131500943142, "grad_norm": 0.1686369925737381, "learning_rate": 0.00035577412497380165, "loss": 0.5268, "step": 533800 }, { "epoch": 71.93478846672056, "grad_norm": 0.1746547371149063, "learning_rate": 0.000355736698703554, "loss": 0.5284, "step": 533900 }, { "epoch": 71.9482619240097, "grad_norm": 0.1722799688577652, "learning_rate": 0.0003556992724333064, "loss": 0.5277, "step": 534000 }, { "epoch": 71.96173538129884, "grad_norm": 0.1778189241886139, "learning_rate": 0.0003556618461630588, "loss": 0.5272, "step": 534100 }, { "epoch": 71.97520883858799, "grad_norm": 0.16899435222148895, "learning_rate": 0.0003556244198928112, "loss": 0.5277, "step": 534200 }, { "epoch": 71.98868229587713, "grad_norm": 0.16243501007556915, "learning_rate": 0.0003555869936225636, "loss": 0.5284, "step": 534300 }, { "epoch": 72.0, "eval_loss": 0.5174572467803955, "eval_runtime": 4.9637, "eval_samples_per_second": 1007.305, "eval_steps_per_second": 15.915, "step": 534384 }, { "epoch": 72.00215575316626, "grad_norm": 0.1904071420431137, "learning_rate": 0.000355549567352316, "loss": 0.5282, "step": 534400 }, { "epoch": 72.0156292104554, "grad_norm": 0.18246500194072723, "learning_rate": 0.00035551214108206833, "loss": 0.5277, "step": 534500 }, { "epoch": 72.02910266774454, "grad_norm": 0.18173940479755402, "learning_rate": 0.0003554747148118207, "loss": 0.528, "step": 534600 }, { "epoch": 72.04257612503369, "grad_norm": 0.17749778926372528, "learning_rate": 0.0003554372885415731, "loss": 0.5277, "step": 534700 }, { "epoch": 72.05604958232283, "grad_norm": 0.17017462849617004, "learning_rate": 0.00035539986227132547, "loss": 0.5276, "step": 534800 }, { "epoch": 72.06952303961197, "grad_norm": 0.17305952310562134, "learning_rate": 0.00035536243600107787, "loss": 0.5266, "step": 534900 }, { "epoch": 72.0829964969011, "grad_norm": 0.16483111679553986, "learning_rate": 0.00035532500973083027, "loss": 0.5274, "step": 535000 }, { "epoch": 72.09646995419024, "grad_norm": 0.17517760396003723, "learning_rate": 0.00035528758346058267, "loss": 0.5283, "step": 535100 }, { "epoch": 72.10994341147939, "grad_norm": 0.17756135761737823, "learning_rate": 0.00035525015719033506, "loss": 0.5281, "step": 535200 }, { "epoch": 72.12341686876853, "grad_norm": 0.17205452919006348, "learning_rate": 0.0003552127309200874, "loss": 0.5267, "step": 535300 }, { "epoch": 72.13689032605767, "grad_norm": 0.16483661532402039, "learning_rate": 0.0003551753046498398, "loss": 0.528, "step": 535400 }, { "epoch": 72.1503637833468, "grad_norm": 0.18351486325263977, "learning_rate": 0.0003551378783795922, "loss": 0.5275, "step": 535500 }, { "epoch": 72.16383724063594, "grad_norm": 0.16281700134277344, "learning_rate": 0.0003551004521093446, "loss": 0.5275, "step": 535600 }, { "epoch": 72.1773106979251, "grad_norm": 0.16735686361789703, "learning_rate": 0.000355063025839097, "loss": 0.5282, "step": 535700 }, { "epoch": 72.19078415521423, "grad_norm": 0.15861289203166962, "learning_rate": 0.0003550255995688494, "loss": 0.527, "step": 535800 }, { "epoch": 72.20425761250337, "grad_norm": 0.1703135222196579, "learning_rate": 0.0003549881732986018, "loss": 0.5287, "step": 535900 }, { "epoch": 72.2177310697925, "grad_norm": 0.18154487013816833, "learning_rate": 0.0003549507470283542, "loss": 0.5272, "step": 536000 }, { "epoch": 72.23120452708164, "grad_norm": 0.17033961415290833, "learning_rate": 0.00035491332075810654, "loss": 0.5283, "step": 536100 }, { "epoch": 72.2446779843708, "grad_norm": 0.1794615387916565, "learning_rate": 0.00035487589448785894, "loss": 0.5277, "step": 536200 }, { "epoch": 72.25815144165993, "grad_norm": 0.17723286151885986, "learning_rate": 0.0003548384682176113, "loss": 0.5277, "step": 536300 }, { "epoch": 72.27162489894907, "grad_norm": 0.17403021454811096, "learning_rate": 0.0003548010419473637, "loss": 0.5275, "step": 536400 }, { "epoch": 72.28509835623821, "grad_norm": 0.16149547696113586, "learning_rate": 0.0003547636156771161, "loss": 0.5281, "step": 536500 }, { "epoch": 72.29857181352735, "grad_norm": 0.16944313049316406, "learning_rate": 0.0003547261894068685, "loss": 0.5282, "step": 536600 }, { "epoch": 72.3120452708165, "grad_norm": 0.16829021275043488, "learning_rate": 0.0003546887631366209, "loss": 0.5281, "step": 536700 }, { "epoch": 72.32551872810564, "grad_norm": 0.18277204036712646, "learning_rate": 0.0003546513368663732, "loss": 0.5272, "step": 536800 }, { "epoch": 72.33899218539477, "grad_norm": 0.19695614278316498, "learning_rate": 0.0003546139105961256, "loss": 0.5275, "step": 536900 }, { "epoch": 72.35246564268391, "grad_norm": 0.16019101440906525, "learning_rate": 0.000354576484325878, "loss": 0.5278, "step": 537000 }, { "epoch": 72.36593909997305, "grad_norm": 0.18825414776802063, "learning_rate": 0.0003545390580556304, "loss": 0.5286, "step": 537100 }, { "epoch": 72.3794125572622, "grad_norm": 0.18435190618038177, "learning_rate": 0.0003545016317853828, "loss": 0.5275, "step": 537200 }, { "epoch": 72.39288601455134, "grad_norm": 0.19266678392887115, "learning_rate": 0.0003544642055151352, "loss": 0.5269, "step": 537300 }, { "epoch": 72.40635947184047, "grad_norm": 0.18043886125087738, "learning_rate": 0.0003544267792448876, "loss": 0.528, "step": 537400 }, { "epoch": 72.41983292912961, "grad_norm": 0.1679864078760147, "learning_rate": 0.00035438935297463995, "loss": 0.5274, "step": 537500 }, { "epoch": 72.43330638641875, "grad_norm": 0.17437618970870972, "learning_rate": 0.00035435192670439235, "loss": 0.5274, "step": 537600 }, { "epoch": 72.4467798437079, "grad_norm": 0.17508603632450104, "learning_rate": 0.00035431450043414475, "loss": 0.5278, "step": 537700 }, { "epoch": 72.46025330099704, "grad_norm": 0.17260020971298218, "learning_rate": 0.00035427707416389715, "loss": 0.5279, "step": 537800 }, { "epoch": 72.47372675828618, "grad_norm": 0.16609491407871246, "learning_rate": 0.00035423964789364955, "loss": 0.5292, "step": 537900 }, { "epoch": 72.48720021557531, "grad_norm": 0.16825643181800842, "learning_rate": 0.00035420222162340194, "loss": 0.5273, "step": 538000 }, { "epoch": 72.50067367286445, "grad_norm": 0.16246351599693298, "learning_rate": 0.0003541647953531543, "loss": 0.5277, "step": 538100 }, { "epoch": 72.5141471301536, "grad_norm": 0.1648639291524887, "learning_rate": 0.00035412736908290663, "loss": 0.5286, "step": 538200 }, { "epoch": 72.52762058744274, "grad_norm": 0.16503435373306274, "learning_rate": 0.00035408994281265903, "loss": 0.5279, "step": 538300 }, { "epoch": 72.54109404473188, "grad_norm": 0.18334650993347168, "learning_rate": 0.00035405251654241143, "loss": 0.5274, "step": 538400 }, { "epoch": 72.55456750202102, "grad_norm": 0.16064229607582092, "learning_rate": 0.0003540150902721638, "loss": 0.5284, "step": 538500 }, { "epoch": 72.56804095931015, "grad_norm": 0.16880956292152405, "learning_rate": 0.0003539776640019162, "loss": 0.5272, "step": 538600 }, { "epoch": 72.5815144165993, "grad_norm": 0.17014265060424805, "learning_rate": 0.0003539402377316686, "loss": 0.5274, "step": 538700 }, { "epoch": 72.59498787388844, "grad_norm": 0.16948619484901428, "learning_rate": 0.000353902811461421, "loss": 0.5278, "step": 538800 }, { "epoch": 72.60846133117758, "grad_norm": 0.16856646537780762, "learning_rate": 0.0003538653851911734, "loss": 0.5276, "step": 538900 }, { "epoch": 72.62193478846672, "grad_norm": 0.1764369159936905, "learning_rate": 0.00035382795892092576, "loss": 0.5267, "step": 539000 }, { "epoch": 72.63540824575585, "grad_norm": 0.1748054474592209, "learning_rate": 0.00035379053265067816, "loss": 0.5278, "step": 539100 }, { "epoch": 72.648881703045, "grad_norm": 0.1782694011926651, "learning_rate": 0.00035375310638043056, "loss": 0.5282, "step": 539200 }, { "epoch": 72.66235516033414, "grad_norm": 0.18313667178153992, "learning_rate": 0.00035371568011018296, "loss": 0.5281, "step": 539300 }, { "epoch": 72.67582861762328, "grad_norm": 0.16524413228034973, "learning_rate": 0.00035367825383993536, "loss": 0.5278, "step": 539400 }, { "epoch": 72.68930207491242, "grad_norm": 0.19993364810943604, "learning_rate": 0.00035364082756968775, "loss": 0.528, "step": 539500 }, { "epoch": 72.70277553220156, "grad_norm": 0.17505007982254028, "learning_rate": 0.00035360340129944015, "loss": 0.5277, "step": 539600 }, { "epoch": 72.71624898949071, "grad_norm": 0.16399285197257996, "learning_rate": 0.0003535659750291925, "loss": 0.5286, "step": 539700 }, { "epoch": 72.72972244677985, "grad_norm": 0.16325929760932922, "learning_rate": 0.0003535285487589449, "loss": 0.5271, "step": 539800 }, { "epoch": 72.74319590406898, "grad_norm": 0.1707286536693573, "learning_rate": 0.00035349112248869724, "loss": 0.5284, "step": 539900 }, { "epoch": 72.75666936135812, "grad_norm": 0.1707233488559723, "learning_rate": 0.00035345369621844964, "loss": 0.5277, "step": 540000 }, { "epoch": 72.77014281864726, "grad_norm": 0.16023118793964386, "learning_rate": 0.00035341626994820204, "loss": 0.528, "step": 540100 }, { "epoch": 72.78361627593641, "grad_norm": 0.18924443423748016, "learning_rate": 0.00035337884367795443, "loss": 0.5293, "step": 540200 }, { "epoch": 72.79708973322555, "grad_norm": 0.17475464940071106, "learning_rate": 0.00035334141740770683, "loss": 0.5278, "step": 540300 }, { "epoch": 72.81056319051469, "grad_norm": 0.15779924392700195, "learning_rate": 0.0003533039911374592, "loss": 0.5284, "step": 540400 }, { "epoch": 72.82403664780382, "grad_norm": 0.16399303078651428, "learning_rate": 0.0003532665648672116, "loss": 0.528, "step": 540500 }, { "epoch": 72.83751010509296, "grad_norm": 0.16995081305503845, "learning_rate": 0.00035322913859696397, "loss": 0.5276, "step": 540600 }, { "epoch": 72.85098356238211, "grad_norm": 0.16978920996189117, "learning_rate": 0.00035319171232671637, "loss": 0.5275, "step": 540700 }, { "epoch": 72.86445701967125, "grad_norm": 0.16572736203670502, "learning_rate": 0.00035315428605646877, "loss": 0.5273, "step": 540800 }, { "epoch": 72.87793047696039, "grad_norm": 0.18200436234474182, "learning_rate": 0.00035311685978622117, "loss": 0.5278, "step": 540900 }, { "epoch": 72.89140393424952, "grad_norm": 0.187676340341568, "learning_rate": 0.00035307943351597357, "loss": 0.5274, "step": 541000 }, { "epoch": 72.90487739153866, "grad_norm": 0.18717622756958008, "learning_rate": 0.00035304200724572596, "loss": 0.5265, "step": 541100 }, { "epoch": 72.91835084882781, "grad_norm": 0.15857712924480438, "learning_rate": 0.0003530045809754783, "loss": 0.5285, "step": 541200 }, { "epoch": 72.93182430611695, "grad_norm": 0.1603444218635559, "learning_rate": 0.0003529671547052307, "loss": 0.5276, "step": 541300 }, { "epoch": 72.94529776340609, "grad_norm": 0.17418470978736877, "learning_rate": 0.0003529297284349831, "loss": 0.5282, "step": 541400 }, { "epoch": 72.95877122069523, "grad_norm": 0.20394858717918396, "learning_rate": 0.0003528923021647355, "loss": 0.5289, "step": 541500 }, { "epoch": 72.97224467798438, "grad_norm": 0.17879872024059296, "learning_rate": 0.0003528548758944879, "loss": 0.5279, "step": 541600 }, { "epoch": 72.98571813527352, "grad_norm": 0.17305734753608704, "learning_rate": 0.00035281744962424024, "loss": 0.5268, "step": 541700 }, { "epoch": 72.99919159256265, "grad_norm": 0.1693567931652069, "learning_rate": 0.00035278002335399264, "loss": 0.5277, "step": 541800 }, { "epoch": 73.0, "eval_loss": 0.516471266746521, "eval_runtime": 4.9569, "eval_samples_per_second": 1008.686, "eval_steps_per_second": 15.937, "step": 541806 }, { "epoch": 73.01266504985179, "grad_norm": 0.1635219156742096, "learning_rate": 0.000352742597083745, "loss": 0.5276, "step": 541900 }, { "epoch": 73.02613850714093, "grad_norm": 0.17682500183582306, "learning_rate": 0.0003527051708134974, "loss": 0.5271, "step": 542000 }, { "epoch": 73.03961196443008, "grad_norm": 0.17426583170890808, "learning_rate": 0.0003526677445432498, "loss": 0.5274, "step": 542100 }, { "epoch": 73.05308542171922, "grad_norm": 0.1736755669116974, "learning_rate": 0.0003526303182730022, "loss": 0.5268, "step": 542200 }, { "epoch": 73.06655887900835, "grad_norm": 0.17110484838485718, "learning_rate": 0.0003525928920027546, "loss": 0.5272, "step": 542300 }, { "epoch": 73.08003233629749, "grad_norm": 0.16200505197048187, "learning_rate": 0.000352555465732507, "loss": 0.5273, "step": 542400 }, { "epoch": 73.09350579358663, "grad_norm": 0.18171340227127075, "learning_rate": 0.0003525180394622594, "loss": 0.5271, "step": 542500 }, { "epoch": 73.10697925087578, "grad_norm": 0.16837963461875916, "learning_rate": 0.0003524806131920117, "loss": 0.5263, "step": 542600 }, { "epoch": 73.12045270816492, "grad_norm": 0.16369441151618958, "learning_rate": 0.0003524431869217641, "loss": 0.5272, "step": 542700 }, { "epoch": 73.13392616545406, "grad_norm": 0.22550025582313538, "learning_rate": 0.0003524057606515165, "loss": 0.5272, "step": 542800 }, { "epoch": 73.1473996227432, "grad_norm": 0.19621413946151733, "learning_rate": 0.0003523683343812689, "loss": 0.5274, "step": 542900 }, { "epoch": 73.16087308003233, "grad_norm": 0.1720239222049713, "learning_rate": 0.0003523309081110213, "loss": 0.5277, "step": 543000 }, { "epoch": 73.17434653732148, "grad_norm": 0.1699734479188919, "learning_rate": 0.0003522934818407737, "loss": 0.5271, "step": 543100 }, { "epoch": 73.18781999461062, "grad_norm": 0.16656376421451569, "learning_rate": 0.0003522560555705261, "loss": 0.5278, "step": 543200 }, { "epoch": 73.20129345189976, "grad_norm": 0.1635126769542694, "learning_rate": 0.0003522186293002785, "loss": 0.5268, "step": 543300 }, { "epoch": 73.2147669091889, "grad_norm": 0.17283502221107483, "learning_rate": 0.00035218120303003085, "loss": 0.5275, "step": 543400 }, { "epoch": 73.22824036647803, "grad_norm": 0.16848623752593994, "learning_rate": 0.0003521437767597832, "loss": 0.5274, "step": 543500 }, { "epoch": 73.24171382376718, "grad_norm": 0.16619126498699188, "learning_rate": 0.0003521063504895356, "loss": 0.5274, "step": 543600 }, { "epoch": 73.25518728105632, "grad_norm": 0.1687723696231842, "learning_rate": 0.000352068924219288, "loss": 0.5273, "step": 543700 }, { "epoch": 73.26866073834546, "grad_norm": 0.16665421426296234, "learning_rate": 0.0003520314979490404, "loss": 0.5277, "step": 543800 }, { "epoch": 73.2821341956346, "grad_norm": 0.16781651973724365, "learning_rate": 0.0003519940716787928, "loss": 0.5273, "step": 543900 }, { "epoch": 73.29560765292374, "grad_norm": 0.1823163628578186, "learning_rate": 0.0003519566454085452, "loss": 0.5272, "step": 544000 }, { "epoch": 73.30908111021289, "grad_norm": 0.16019348800182343, "learning_rate": 0.00035191921913829753, "loss": 0.5269, "step": 544100 }, { "epoch": 73.32255456750202, "grad_norm": 0.1768031269311905, "learning_rate": 0.00035188179286804993, "loss": 0.5275, "step": 544200 }, { "epoch": 73.33602802479116, "grad_norm": 0.17094740271568298, "learning_rate": 0.00035184436659780233, "loss": 0.5266, "step": 544300 }, { "epoch": 73.3495014820803, "grad_norm": 0.1830805242061615, "learning_rate": 0.0003518069403275547, "loss": 0.5275, "step": 544400 }, { "epoch": 73.36297493936944, "grad_norm": 0.1618606299161911, "learning_rate": 0.0003517695140573071, "loss": 0.5289, "step": 544500 }, { "epoch": 73.37644839665859, "grad_norm": 0.1799558699131012, "learning_rate": 0.0003517320877870595, "loss": 0.5262, "step": 544600 }, { "epoch": 73.38992185394773, "grad_norm": 0.18183186650276184, "learning_rate": 0.0003516946615168119, "loss": 0.527, "step": 544700 }, { "epoch": 73.40339531123686, "grad_norm": 0.18840822577476501, "learning_rate": 0.00035165723524656426, "loss": 0.5272, "step": 544800 }, { "epoch": 73.416868768526, "grad_norm": 0.1639128029346466, "learning_rate": 0.00035161980897631666, "loss": 0.5271, "step": 544900 }, { "epoch": 73.43034222581514, "grad_norm": 0.16752198338508606, "learning_rate": 0.00035158238270606906, "loss": 0.528, "step": 545000 }, { "epoch": 73.44381568310429, "grad_norm": 0.17636512219905853, "learning_rate": 0.00035154495643582146, "loss": 0.5287, "step": 545100 }, { "epoch": 73.45728914039343, "grad_norm": 0.17550115287303925, "learning_rate": 0.00035150753016557386, "loss": 0.528, "step": 545200 }, { "epoch": 73.47076259768257, "grad_norm": 0.17351317405700684, "learning_rate": 0.0003514701038953262, "loss": 0.5265, "step": 545300 }, { "epoch": 73.4842360549717, "grad_norm": 0.17001713812351227, "learning_rate": 0.0003514326776250786, "loss": 0.5271, "step": 545400 }, { "epoch": 73.49770951226084, "grad_norm": 0.17159713804721832, "learning_rate": 0.00035139525135483094, "loss": 0.5275, "step": 545500 }, { "epoch": 73.51118296954999, "grad_norm": 0.17853081226348877, "learning_rate": 0.00035135782508458334, "loss": 0.5272, "step": 545600 }, { "epoch": 73.52465642683913, "grad_norm": 0.18031266331672668, "learning_rate": 0.00035132039881433574, "loss": 0.5269, "step": 545700 }, { "epoch": 73.53812988412827, "grad_norm": 0.16607913374900818, "learning_rate": 0.00035128297254408814, "loss": 0.5282, "step": 545800 }, { "epoch": 73.5516033414174, "grad_norm": 0.16717948019504547, "learning_rate": 0.00035124554627384054, "loss": 0.5279, "step": 545900 }, { "epoch": 73.56507679870654, "grad_norm": 0.18261808156967163, "learning_rate": 0.00035120812000359293, "loss": 0.5276, "step": 546000 }, { "epoch": 73.5785502559957, "grad_norm": 0.1729024350643158, "learning_rate": 0.00035117069373334533, "loss": 0.528, "step": 546100 }, { "epoch": 73.59202371328483, "grad_norm": 0.16001708805561066, "learning_rate": 0.00035113326746309773, "loss": 0.5278, "step": 546200 }, { "epoch": 73.60549717057397, "grad_norm": 0.18143878877162933, "learning_rate": 0.0003510958411928501, "loss": 0.5274, "step": 546300 }, { "epoch": 73.6189706278631, "grad_norm": 0.1809764802455902, "learning_rate": 0.0003510584149226025, "loss": 0.5271, "step": 546400 }, { "epoch": 73.63244408515224, "grad_norm": 0.1637347936630249, "learning_rate": 0.00035102098865235487, "loss": 0.527, "step": 546500 }, { "epoch": 73.6459175424414, "grad_norm": 0.17943044006824493, "learning_rate": 0.00035098356238210727, "loss": 0.5275, "step": 546600 }, { "epoch": 73.65939099973053, "grad_norm": 0.17253296077251434, "learning_rate": 0.00035094613611185967, "loss": 0.5276, "step": 546700 }, { "epoch": 73.67286445701967, "grad_norm": 0.1662263721227646, "learning_rate": 0.00035090870984161207, "loss": 0.5282, "step": 546800 }, { "epoch": 73.68633791430881, "grad_norm": 0.17495085299015045, "learning_rate": 0.00035087128357136446, "loss": 0.5272, "step": 546900 }, { "epoch": 73.69981137159795, "grad_norm": 0.19026295840740204, "learning_rate": 0.0003508338573011168, "loss": 0.5292, "step": 547000 }, { "epoch": 73.7132848288871, "grad_norm": 0.1671060174703598, "learning_rate": 0.00035079643103086915, "loss": 0.5273, "step": 547100 }, { "epoch": 73.72675828617623, "grad_norm": 0.1605205088853836, "learning_rate": 0.00035075900476062155, "loss": 0.5281, "step": 547200 }, { "epoch": 73.74023174346537, "grad_norm": 0.16976672410964966, "learning_rate": 0.00035072157849037395, "loss": 0.5282, "step": 547300 }, { "epoch": 73.75370520075451, "grad_norm": 0.17684851586818695, "learning_rate": 0.00035068415222012635, "loss": 0.5286, "step": 547400 }, { "epoch": 73.76717865804365, "grad_norm": 0.16691309213638306, "learning_rate": 0.00035064672594987875, "loss": 0.5271, "step": 547500 }, { "epoch": 73.7806521153328, "grad_norm": 0.1770971715450287, "learning_rate": 0.00035060929967963114, "loss": 0.5278, "step": 547600 }, { "epoch": 73.79412557262194, "grad_norm": 0.1937476396560669, "learning_rate": 0.0003505718734093835, "loss": 0.5277, "step": 547700 }, { "epoch": 73.80759902991107, "grad_norm": 0.16799750924110413, "learning_rate": 0.0003505344471391359, "loss": 0.5267, "step": 547800 }, { "epoch": 73.82107248720021, "grad_norm": 0.16689813137054443, "learning_rate": 0.0003504970208688883, "loss": 0.5269, "step": 547900 }, { "epoch": 73.83454594448935, "grad_norm": 0.17822083830833435, "learning_rate": 0.0003504595945986407, "loss": 0.5283, "step": 548000 }, { "epoch": 73.8480194017785, "grad_norm": 0.1653146743774414, "learning_rate": 0.0003504221683283931, "loss": 0.5273, "step": 548100 }, { "epoch": 73.86149285906764, "grad_norm": 0.1758868396282196, "learning_rate": 0.0003503847420581455, "loss": 0.5287, "step": 548200 }, { "epoch": 73.87496631635678, "grad_norm": 0.1884235292673111, "learning_rate": 0.0003503473157878979, "loss": 0.5281, "step": 548300 }, { "epoch": 73.88843977364591, "grad_norm": 0.1821472942829132, "learning_rate": 0.0003503098895176503, "loss": 0.528, "step": 548400 }, { "epoch": 73.90191323093505, "grad_norm": 0.16472332179546356, "learning_rate": 0.0003502724632474026, "loss": 0.5275, "step": 548500 }, { "epoch": 73.9153866882242, "grad_norm": 0.1744505763053894, "learning_rate": 0.000350235036977155, "loss": 0.5283, "step": 548600 }, { "epoch": 73.92886014551334, "grad_norm": 0.16695688664913177, "learning_rate": 0.0003501976107069074, "loss": 0.5289, "step": 548700 }, { "epoch": 73.94233360280248, "grad_norm": 0.17366795241832733, "learning_rate": 0.0003501601844366598, "loss": 0.5274, "step": 548800 }, { "epoch": 73.95580706009162, "grad_norm": 0.18236228823661804, "learning_rate": 0.0003501227581664122, "loss": 0.5277, "step": 548900 }, { "epoch": 73.96928051738075, "grad_norm": 0.17420117557048798, "learning_rate": 0.00035008533189616456, "loss": 0.5271, "step": 549000 }, { "epoch": 73.9827539746699, "grad_norm": 0.17389310896396637, "learning_rate": 0.00035004790562591695, "loss": 0.5276, "step": 549100 }, { "epoch": 73.99622743195904, "grad_norm": 0.18083162605762482, "learning_rate": 0.0003500104793556693, "loss": 0.5276, "step": 549200 }, { "epoch": 74.0, "eval_loss": 0.5159794092178345, "eval_runtime": 4.9504, "eval_samples_per_second": 1010.018, "eval_steps_per_second": 15.958, "step": 549228 }, { "epoch": 74.00970088924818, "grad_norm": 0.16030675172805786, "learning_rate": 0.0003499730530854217, "loss": 0.527, "step": 549300 }, { "epoch": 74.02317434653732, "grad_norm": 0.1658765822649002, "learning_rate": 0.0003499356268151741, "loss": 0.526, "step": 549400 }, { "epoch": 74.03664780382647, "grad_norm": 0.1631772369146347, "learning_rate": 0.0003498982005449265, "loss": 0.5275, "step": 549500 }, { "epoch": 74.0501212611156, "grad_norm": 0.17218880355358124, "learning_rate": 0.0003498607742746789, "loss": 0.5276, "step": 549600 }, { "epoch": 74.06359471840474, "grad_norm": 0.15893732011318207, "learning_rate": 0.0003498233480044313, "loss": 0.5265, "step": 549700 }, { "epoch": 74.07706817569388, "grad_norm": 0.18535307049751282, "learning_rate": 0.0003497859217341837, "loss": 0.5273, "step": 549800 }, { "epoch": 74.09054163298302, "grad_norm": 0.16315698623657227, "learning_rate": 0.00034974849546393603, "loss": 0.5275, "step": 549900 }, { "epoch": 74.10401509027217, "grad_norm": 0.19221745431423187, "learning_rate": 0.00034971106919368843, "loss": 0.5272, "step": 550000 }, { "epoch": 74.11748854756131, "grad_norm": 0.17638404667377472, "learning_rate": 0.00034967364292344083, "loss": 0.5281, "step": 550100 }, { "epoch": 74.13096200485045, "grad_norm": 0.1687018722295761, "learning_rate": 0.0003496362166531932, "loss": 0.5267, "step": 550200 }, { "epoch": 74.14443546213958, "grad_norm": 0.17486533522605896, "learning_rate": 0.0003495987903829456, "loss": 0.527, "step": 550300 }, { "epoch": 74.15790891942872, "grad_norm": 0.16908106207847595, "learning_rate": 0.000349561364112698, "loss": 0.5277, "step": 550400 }, { "epoch": 74.17138237671787, "grad_norm": 0.18251127004623413, "learning_rate": 0.0003495239378424504, "loss": 0.5265, "step": 550500 }, { "epoch": 74.18485583400701, "grad_norm": 0.17783527076244354, "learning_rate": 0.00034948651157220277, "loss": 0.5258, "step": 550600 }, { "epoch": 74.19832929129615, "grad_norm": 0.15969635546207428, "learning_rate": 0.00034944908530195516, "loss": 0.5264, "step": 550700 }, { "epoch": 74.21180274858528, "grad_norm": 0.1671421229839325, "learning_rate": 0.0003494116590317075, "loss": 0.527, "step": 550800 }, { "epoch": 74.22527620587442, "grad_norm": 0.18099305033683777, "learning_rate": 0.0003493742327614599, "loss": 0.5272, "step": 550900 }, { "epoch": 74.23874966316357, "grad_norm": 0.16972649097442627, "learning_rate": 0.0003493368064912123, "loss": 0.5274, "step": 551000 }, { "epoch": 74.25222312045271, "grad_norm": 0.17903591692447662, "learning_rate": 0.0003492993802209647, "loss": 0.5278, "step": 551100 }, { "epoch": 74.26569657774185, "grad_norm": 0.16531816124916077, "learning_rate": 0.0003492619539507171, "loss": 0.5269, "step": 551200 }, { "epoch": 74.27917003503099, "grad_norm": 0.1754881888628006, "learning_rate": 0.0003492245276804695, "loss": 0.5253, "step": 551300 }, { "epoch": 74.29264349232012, "grad_norm": 0.1692098081111908, "learning_rate": 0.00034918710141022184, "loss": 0.5272, "step": 551400 }, { "epoch": 74.30611694960928, "grad_norm": 0.19436238706111908, "learning_rate": 0.00034914967513997424, "loss": 0.527, "step": 551500 }, { "epoch": 74.31959040689841, "grad_norm": 0.17987747490406036, "learning_rate": 0.00034911224886972664, "loss": 0.5273, "step": 551600 }, { "epoch": 74.33306386418755, "grad_norm": 0.16030636429786682, "learning_rate": 0.00034907482259947904, "loss": 0.5265, "step": 551700 }, { "epoch": 74.34653732147669, "grad_norm": 0.18396463990211487, "learning_rate": 0.00034903739632923144, "loss": 0.5267, "step": 551800 }, { "epoch": 74.36001077876583, "grad_norm": 0.1730097234249115, "learning_rate": 0.00034899997005898383, "loss": 0.5276, "step": 551900 }, { "epoch": 74.37348423605498, "grad_norm": 0.19576705992221832, "learning_rate": 0.00034896254378873623, "loss": 0.5273, "step": 552000 }, { "epoch": 74.38695769334412, "grad_norm": 0.1691599041223526, "learning_rate": 0.0003489251175184886, "loss": 0.5277, "step": 552100 }, { "epoch": 74.40043115063325, "grad_norm": 0.17897234857082367, "learning_rate": 0.000348887691248241, "loss": 0.5274, "step": 552200 }, { "epoch": 74.41390460792239, "grad_norm": 0.16327683627605438, "learning_rate": 0.0003488502649779934, "loss": 0.5276, "step": 552300 }, { "epoch": 74.42737806521153, "grad_norm": 0.17264950275421143, "learning_rate": 0.00034881283870774577, "loss": 0.5272, "step": 552400 }, { "epoch": 74.44085152250068, "grad_norm": 0.20191046595573425, "learning_rate": 0.00034877541243749817, "loss": 0.5285, "step": 552500 }, { "epoch": 74.45432497978982, "grad_norm": 0.17441867291927338, "learning_rate": 0.0003487379861672505, "loss": 0.5274, "step": 552600 }, { "epoch": 74.46779843707895, "grad_norm": 0.19100165367126465, "learning_rate": 0.0003487005598970029, "loss": 0.5267, "step": 552700 }, { "epoch": 74.48127189436809, "grad_norm": 0.16100651025772095, "learning_rate": 0.00034866313362675526, "loss": 0.5279, "step": 552800 }, { "epoch": 74.49474535165723, "grad_norm": 0.1623719483613968, "learning_rate": 0.00034862570735650765, "loss": 0.5266, "step": 552900 }, { "epoch": 74.50821880894638, "grad_norm": 0.16974864900112152, "learning_rate": 0.00034858828108626005, "loss": 0.5273, "step": 553000 }, { "epoch": 74.52169226623552, "grad_norm": 0.16911475360393524, "learning_rate": 0.00034855085481601245, "loss": 0.5275, "step": 553100 }, { "epoch": 74.53516572352466, "grad_norm": 0.17156021296977997, "learning_rate": 0.00034851342854576485, "loss": 0.5274, "step": 553200 }, { "epoch": 74.5486391808138, "grad_norm": 0.17134957015514374, "learning_rate": 0.00034847600227551725, "loss": 0.5278, "step": 553300 }, { "epoch": 74.56211263810293, "grad_norm": 0.16161222755908966, "learning_rate": 0.00034843857600526965, "loss": 0.5275, "step": 553400 }, { "epoch": 74.57558609539208, "grad_norm": 0.16443023085594177, "learning_rate": 0.000348401149735022, "loss": 0.5273, "step": 553500 }, { "epoch": 74.58905955268122, "grad_norm": 0.1750527322292328, "learning_rate": 0.0003483637234647744, "loss": 0.5268, "step": 553600 }, { "epoch": 74.60253300997036, "grad_norm": 0.17215029895305634, "learning_rate": 0.0003483262971945268, "loss": 0.5266, "step": 553700 }, { "epoch": 74.6160064672595, "grad_norm": 0.17213335633277893, "learning_rate": 0.0003482888709242792, "loss": 0.5277, "step": 553800 }, { "epoch": 74.62947992454863, "grad_norm": 0.1861678659915924, "learning_rate": 0.0003482514446540316, "loss": 0.5279, "step": 553900 }, { "epoch": 74.64295338183778, "grad_norm": 0.19342027604579926, "learning_rate": 0.000348214018383784, "loss": 0.5265, "step": 554000 }, { "epoch": 74.65642683912692, "grad_norm": 0.16388088464736938, "learning_rate": 0.0003481765921135364, "loss": 0.5275, "step": 554100 }, { "epoch": 74.66990029641606, "grad_norm": 0.15559697151184082, "learning_rate": 0.0003481391658432888, "loss": 0.5271, "step": 554200 }, { "epoch": 74.6833737537052, "grad_norm": 0.17874878644943237, "learning_rate": 0.0003481017395730411, "loss": 0.528, "step": 554300 }, { "epoch": 74.69684721099433, "grad_norm": 0.1702241748571396, "learning_rate": 0.00034806431330279346, "loss": 0.5283, "step": 554400 }, { "epoch": 74.71032066828349, "grad_norm": 0.17701135575771332, "learning_rate": 0.00034802688703254586, "loss": 0.5285, "step": 554500 }, { "epoch": 74.72379412557262, "grad_norm": 0.1753622591495514, "learning_rate": 0.00034798946076229826, "loss": 0.5275, "step": 554600 }, { "epoch": 74.73726758286176, "grad_norm": 0.19364440441131592, "learning_rate": 0.00034795203449205066, "loss": 0.5271, "step": 554700 }, { "epoch": 74.7507410401509, "grad_norm": 0.17288640141487122, "learning_rate": 0.00034791460822180306, "loss": 0.5267, "step": 554800 }, { "epoch": 74.76421449744004, "grad_norm": 0.17682261765003204, "learning_rate": 0.00034787718195155546, "loss": 0.5273, "step": 554900 }, { "epoch": 74.77768795472919, "grad_norm": 0.17045341432094574, "learning_rate": 0.0003478397556813078, "loss": 0.5276, "step": 555000 }, { "epoch": 74.79116141201833, "grad_norm": 0.1645391583442688, "learning_rate": 0.0003478023294110602, "loss": 0.5264, "step": 555100 }, { "epoch": 74.80463486930746, "grad_norm": 0.18193696439266205, "learning_rate": 0.0003477649031408126, "loss": 0.528, "step": 555200 }, { "epoch": 74.8181083265966, "grad_norm": 0.17729395627975464, "learning_rate": 0.000347727476870565, "loss": 0.5271, "step": 555300 }, { "epoch": 74.83158178388574, "grad_norm": 0.17160825431346893, "learning_rate": 0.0003476900506003174, "loss": 0.5278, "step": 555400 }, { "epoch": 74.84505524117489, "grad_norm": 0.1828102022409439, "learning_rate": 0.0003476526243300698, "loss": 0.527, "step": 555500 }, { "epoch": 74.85852869846403, "grad_norm": 0.1732804924249649, "learning_rate": 0.0003476151980598222, "loss": 0.5272, "step": 555600 }, { "epoch": 74.87200215575317, "grad_norm": 0.18993686139583588, "learning_rate": 0.00034757777178957453, "loss": 0.5275, "step": 555700 }, { "epoch": 74.8854756130423, "grad_norm": 0.16782023012638092, "learning_rate": 0.00034754034551932693, "loss": 0.5278, "step": 555800 }, { "epoch": 74.89894907033144, "grad_norm": 0.18591448664665222, "learning_rate": 0.00034750291924907933, "loss": 0.5273, "step": 555900 }, { "epoch": 74.91242252762059, "grad_norm": 0.17494013905525208, "learning_rate": 0.00034746549297883173, "loss": 0.5275, "step": 556000 }, { "epoch": 74.92589598490973, "grad_norm": 0.19005146622657776, "learning_rate": 0.0003474280667085841, "loss": 0.527, "step": 556100 }, { "epoch": 74.93936944219887, "grad_norm": 0.17968367040157318, "learning_rate": 0.00034739064043833647, "loss": 0.528, "step": 556200 }, { "epoch": 74.952842899488, "grad_norm": 0.19568027555942535, "learning_rate": 0.00034735321416808887, "loss": 0.5281, "step": 556300 }, { "epoch": 74.96631635677714, "grad_norm": 0.1632426530122757, "learning_rate": 0.00034731578789784127, "loss": 0.5279, "step": 556400 }, { "epoch": 74.9797898140663, "grad_norm": 0.17301280796527863, "learning_rate": 0.0003472783616275936, "loss": 0.528, "step": 556500 }, { "epoch": 74.99326327135543, "grad_norm": 0.1758497804403305, "learning_rate": 0.000347240935357346, "loss": 0.5277, "step": 556600 }, { "epoch": 75.0, "eval_loss": 0.5157985091209412, "eval_runtime": 4.9643, "eval_samples_per_second": 1007.195, "eval_steps_per_second": 15.914, "step": 556650 }, { "epoch": 75.00673672864457, "grad_norm": 0.1668112874031067, "learning_rate": 0.0003472035090870984, "loss": 0.5264, "step": 556700 }, { "epoch": 75.0202101859337, "grad_norm": 0.17113912105560303, "learning_rate": 0.0003471660828168508, "loss": 0.5259, "step": 556800 }, { "epoch": 75.03368364322286, "grad_norm": 0.18191801011562347, "learning_rate": 0.0003471286565466032, "loss": 0.5263, "step": 556900 }, { "epoch": 75.047157100512, "grad_norm": 0.17364607751369476, "learning_rate": 0.0003470912302763556, "loss": 0.527, "step": 557000 }, { "epoch": 75.06063055780113, "grad_norm": 0.1632552146911621, "learning_rate": 0.000347053804006108, "loss": 0.5268, "step": 557100 }, { "epoch": 75.07410401509027, "grad_norm": 0.171133354306221, "learning_rate": 0.00034701637773586034, "loss": 0.5263, "step": 557200 }, { "epoch": 75.08757747237941, "grad_norm": 0.18421556055545807, "learning_rate": 0.00034697895146561274, "loss": 0.527, "step": 557300 }, { "epoch": 75.10105092966856, "grad_norm": 0.17142996191978455, "learning_rate": 0.00034694152519536514, "loss": 0.5262, "step": 557400 }, { "epoch": 75.1145243869577, "grad_norm": 0.19057653844356537, "learning_rate": 0.00034690409892511754, "loss": 0.5265, "step": 557500 }, { "epoch": 75.12799784424683, "grad_norm": 0.16899925470352173, "learning_rate": 0.00034686667265486994, "loss": 0.526, "step": 557600 }, { "epoch": 75.14147130153597, "grad_norm": 0.17421680688858032, "learning_rate": 0.00034682924638462234, "loss": 0.527, "step": 557700 }, { "epoch": 75.15494475882511, "grad_norm": 0.18070268630981445, "learning_rate": 0.00034679182011437473, "loss": 0.5265, "step": 557800 }, { "epoch": 75.16841821611426, "grad_norm": 0.20989862084388733, "learning_rate": 0.0003467543938441271, "loss": 0.5274, "step": 557900 }, { "epoch": 75.1818916734034, "grad_norm": 0.18283657729625702, "learning_rate": 0.0003467169675738794, "loss": 0.5267, "step": 558000 }, { "epoch": 75.19536513069254, "grad_norm": 0.17854700982570648, "learning_rate": 0.0003466795413036318, "loss": 0.5268, "step": 558100 }, { "epoch": 75.20883858798167, "grad_norm": 0.17380841076374054, "learning_rate": 0.0003466421150333842, "loss": 0.5276, "step": 558200 }, { "epoch": 75.22231204527081, "grad_norm": 0.16388918459415436, "learning_rate": 0.0003466046887631366, "loss": 0.5275, "step": 558300 }, { "epoch": 75.23578550255996, "grad_norm": 0.18495264649391174, "learning_rate": 0.000346567262492889, "loss": 0.527, "step": 558400 }, { "epoch": 75.2492589598491, "grad_norm": 0.17849552631378174, "learning_rate": 0.0003465298362226414, "loss": 0.5259, "step": 558500 }, { "epoch": 75.26273241713824, "grad_norm": 0.18511207401752472, "learning_rate": 0.00034649240995239376, "loss": 0.5265, "step": 558600 }, { "epoch": 75.27620587442738, "grad_norm": 0.17204998433589935, "learning_rate": 0.00034645498368214616, "loss": 0.5282, "step": 558700 }, { "epoch": 75.28967933171651, "grad_norm": 0.17375442385673523, "learning_rate": 0.00034641755741189855, "loss": 0.5271, "step": 558800 }, { "epoch": 75.30315278900567, "grad_norm": 0.19294443726539612, "learning_rate": 0.00034638013114165095, "loss": 0.5267, "step": 558900 }, { "epoch": 75.3166262462948, "grad_norm": 0.16497984528541565, "learning_rate": 0.00034634270487140335, "loss": 0.5276, "step": 559000 }, { "epoch": 75.33009970358394, "grad_norm": 0.16921456158161163, "learning_rate": 0.00034630527860115575, "loss": 0.5275, "step": 559100 }, { "epoch": 75.34357316087308, "grad_norm": 0.1821753978729248, "learning_rate": 0.00034626785233090815, "loss": 0.5268, "step": 559200 }, { "epoch": 75.35704661816222, "grad_norm": 0.16933858394622803, "learning_rate": 0.00034623042606066054, "loss": 0.5275, "step": 559300 }, { "epoch": 75.37052007545137, "grad_norm": 0.16087007522583008, "learning_rate": 0.0003461929997904129, "loss": 0.5261, "step": 559400 }, { "epoch": 75.3839935327405, "grad_norm": 0.17945875227451324, "learning_rate": 0.0003461555735201653, "loss": 0.5267, "step": 559500 }, { "epoch": 75.39746699002964, "grad_norm": 0.16270174086093903, "learning_rate": 0.0003461181472499177, "loss": 0.5284, "step": 559600 }, { "epoch": 75.41094044731878, "grad_norm": 0.1761896163225174, "learning_rate": 0.0003460807209796701, "loss": 0.5277, "step": 559700 }, { "epoch": 75.42441390460792, "grad_norm": 0.17797981202602386, "learning_rate": 0.00034604329470942243, "loss": 0.5269, "step": 559800 }, { "epoch": 75.43788736189707, "grad_norm": 0.166952982544899, "learning_rate": 0.0003460058684391748, "loss": 0.527, "step": 559900 }, { "epoch": 75.4513608191862, "grad_norm": 0.19520649313926697, "learning_rate": 0.0003459684421689272, "loss": 0.5267, "step": 560000 }, { "epoch": 75.46483427647534, "grad_norm": 0.17881262302398682, "learning_rate": 0.00034593101589867957, "loss": 0.5262, "step": 560100 }, { "epoch": 75.47830773376448, "grad_norm": 0.1965353786945343, "learning_rate": 0.00034589358962843197, "loss": 0.5275, "step": 560200 }, { "epoch": 75.49178119105362, "grad_norm": 0.177378311753273, "learning_rate": 0.00034585616335818436, "loss": 0.5275, "step": 560300 }, { "epoch": 75.50525464834277, "grad_norm": 0.18857568502426147, "learning_rate": 0.00034581873708793676, "loss": 0.5272, "step": 560400 }, { "epoch": 75.51872810563191, "grad_norm": 0.18228670954704285, "learning_rate": 0.00034578131081768916, "loss": 0.5268, "step": 560500 }, { "epoch": 75.53220156292105, "grad_norm": 0.1608773171901703, "learning_rate": 0.00034574388454744156, "loss": 0.5268, "step": 560600 }, { "epoch": 75.54567502021018, "grad_norm": 0.1644149124622345, "learning_rate": 0.00034570645827719396, "loss": 0.5272, "step": 560700 }, { "epoch": 75.55914847749932, "grad_norm": 0.1798667460680008, "learning_rate": 0.0003456690320069463, "loss": 0.5276, "step": 560800 }, { "epoch": 75.57262193478847, "grad_norm": 0.16699904203414917, "learning_rate": 0.0003456316057366987, "loss": 0.5268, "step": 560900 }, { "epoch": 75.58609539207761, "grad_norm": 0.1616780161857605, "learning_rate": 0.0003455941794664511, "loss": 0.5272, "step": 561000 }, { "epoch": 75.59956884936675, "grad_norm": 0.17822526395320892, "learning_rate": 0.0003455567531962035, "loss": 0.5272, "step": 561100 }, { "epoch": 75.61304230665588, "grad_norm": 0.17820598185062408, "learning_rate": 0.0003455193269259559, "loss": 0.5272, "step": 561200 }, { "epoch": 75.62651576394502, "grad_norm": 0.1847083419561386, "learning_rate": 0.0003454819006557083, "loss": 0.5274, "step": 561300 }, { "epoch": 75.63998922123417, "grad_norm": 0.16917684674263, "learning_rate": 0.0003454444743854607, "loss": 0.527, "step": 561400 }, { "epoch": 75.65346267852331, "grad_norm": 0.16785278916358948, "learning_rate": 0.0003454070481152131, "loss": 0.5277, "step": 561500 }, { "epoch": 75.66693613581245, "grad_norm": 0.17193926870822906, "learning_rate": 0.0003453696218449654, "loss": 0.5279, "step": 561600 }, { "epoch": 75.68040959310159, "grad_norm": 0.1699514240026474, "learning_rate": 0.0003453321955747178, "loss": 0.5266, "step": 561700 }, { "epoch": 75.69388305039072, "grad_norm": 0.18469177186489105, "learning_rate": 0.0003452947693044702, "loss": 0.5273, "step": 561800 }, { "epoch": 75.70735650767988, "grad_norm": 0.1706123799085617, "learning_rate": 0.0003452573430342226, "loss": 0.5269, "step": 561900 }, { "epoch": 75.72082996496901, "grad_norm": 0.17702627182006836, "learning_rate": 0.00034521991676397497, "loss": 0.5265, "step": 562000 }, { "epoch": 75.73430342225815, "grad_norm": 0.17789588868618011, "learning_rate": 0.00034518249049372737, "loss": 0.5262, "step": 562100 }, { "epoch": 75.74777687954729, "grad_norm": 0.1665324568748474, "learning_rate": 0.00034514506422347977, "loss": 0.5275, "step": 562200 }, { "epoch": 75.76125033683643, "grad_norm": 0.17942196130752563, "learning_rate": 0.0003451076379532321, "loss": 0.5278, "step": 562300 }, { "epoch": 75.77472379412558, "grad_norm": 0.16082844138145447, "learning_rate": 0.0003450702116829845, "loss": 0.5273, "step": 562400 }, { "epoch": 75.78819725141472, "grad_norm": 0.17690101265907288, "learning_rate": 0.0003450327854127369, "loss": 0.5265, "step": 562500 }, { "epoch": 75.80167070870385, "grad_norm": 0.16965077817440033, "learning_rate": 0.0003449953591424893, "loss": 0.526, "step": 562600 }, { "epoch": 75.81514416599299, "grad_norm": 0.182224303483963, "learning_rate": 0.0003449579328722417, "loss": 0.5275, "step": 562700 }, { "epoch": 75.82861762328213, "grad_norm": 0.16860520839691162, "learning_rate": 0.0003449205066019941, "loss": 0.5267, "step": 562800 }, { "epoch": 75.84209108057128, "grad_norm": 0.18669761717319489, "learning_rate": 0.0003448830803317465, "loss": 0.5274, "step": 562900 }, { "epoch": 75.85556453786042, "grad_norm": 0.19127194583415985, "learning_rate": 0.00034484565406149885, "loss": 0.527, "step": 563000 }, { "epoch": 75.86903799514955, "grad_norm": 0.17391806840896606, "learning_rate": 0.00034480822779125124, "loss": 0.5267, "step": 563100 }, { "epoch": 75.88251145243869, "grad_norm": 0.16644509136676788, "learning_rate": 0.00034477080152100364, "loss": 0.527, "step": 563200 }, { "epoch": 75.89598490972783, "grad_norm": 0.17363835871219635, "learning_rate": 0.00034473337525075604, "loss": 0.5272, "step": 563300 }, { "epoch": 75.90945836701698, "grad_norm": 0.16296468675136566, "learning_rate": 0.0003446959489805084, "loss": 0.5276, "step": 563400 }, { "epoch": 75.92293182430612, "grad_norm": 0.17689567804336548, "learning_rate": 0.0003446585227102608, "loss": 0.5273, "step": 563500 }, { "epoch": 75.93640528159526, "grad_norm": 0.17600926756858826, "learning_rate": 0.0003446210964400132, "loss": 0.5275, "step": 563600 }, { "epoch": 75.9498787388844, "grad_norm": 0.17655859887599945, "learning_rate": 0.0003445836701697655, "loss": 0.5275, "step": 563700 }, { "epoch": 75.96335219617353, "grad_norm": 0.18127909302711487, "learning_rate": 0.0003445462438995179, "loss": 0.5271, "step": 563800 }, { "epoch": 75.97682565346268, "grad_norm": 0.19012220203876495, "learning_rate": 0.0003445088176292703, "loss": 0.5275, "step": 563900 }, { "epoch": 75.99029911075182, "grad_norm": 0.17001394927501678, "learning_rate": 0.0003444713913590227, "loss": 0.5267, "step": 564000 }, { "epoch": 76.0, "eval_loss": 0.5154352784156799, "eval_runtime": 4.9523, "eval_samples_per_second": 1009.633, "eval_steps_per_second": 15.952, "step": 564072 }, { "epoch": 76.00377256804096, "grad_norm": 0.16696986556053162, "learning_rate": 0.0003444339650887751, "loss": 0.5265, "step": 564100 }, { "epoch": 76.0172460253301, "grad_norm": 0.1666778326034546, "learning_rate": 0.0003443965388185275, "loss": 0.5263, "step": 564200 }, { "epoch": 76.03071948261925, "grad_norm": 0.15804444253444672, "learning_rate": 0.0003443591125482799, "loss": 0.5268, "step": 564300 }, { "epoch": 76.04419293990838, "grad_norm": 0.17888271808624268, "learning_rate": 0.0003443216862780323, "loss": 0.526, "step": 564400 }, { "epoch": 76.05766639719752, "grad_norm": 0.21644757688045502, "learning_rate": 0.00034428426000778466, "loss": 0.5254, "step": 564500 }, { "epoch": 76.07113985448666, "grad_norm": 0.167195126414299, "learning_rate": 0.00034424683373753705, "loss": 0.5257, "step": 564600 }, { "epoch": 76.0846133117758, "grad_norm": 0.18242061138153076, "learning_rate": 0.00034420940746728945, "loss": 0.5271, "step": 564700 }, { "epoch": 76.09808676906495, "grad_norm": 0.1865101307630539, "learning_rate": 0.00034417198119704185, "loss": 0.526, "step": 564800 }, { "epoch": 76.11156022635409, "grad_norm": 0.16826102137565613, "learning_rate": 0.00034413455492679425, "loss": 0.5264, "step": 564900 }, { "epoch": 76.12503368364322, "grad_norm": 0.20959676802158356, "learning_rate": 0.00034409712865654665, "loss": 0.5261, "step": 565000 }, { "epoch": 76.13850714093236, "grad_norm": 0.170002743601799, "learning_rate": 0.00034405970238629905, "loss": 0.526, "step": 565100 }, { "epoch": 76.1519805982215, "grad_norm": 0.17131438851356506, "learning_rate": 0.0003440222761160514, "loss": 0.5273, "step": 565200 }, { "epoch": 76.16545405551065, "grad_norm": 0.16064147651195526, "learning_rate": 0.00034398484984580373, "loss": 0.5266, "step": 565300 }, { "epoch": 76.17892751279979, "grad_norm": 0.17770282924175262, "learning_rate": 0.00034394742357555613, "loss": 0.5266, "step": 565400 }, { "epoch": 76.19240097008893, "grad_norm": 0.16207338869571686, "learning_rate": 0.00034390999730530853, "loss": 0.5267, "step": 565500 }, { "epoch": 76.20587442737806, "grad_norm": 0.1722482293844223, "learning_rate": 0.00034387257103506093, "loss": 0.5261, "step": 565600 }, { "epoch": 76.2193478846672, "grad_norm": 0.17685651779174805, "learning_rate": 0.0003438351447648133, "loss": 0.526, "step": 565700 }, { "epoch": 76.23282134195635, "grad_norm": 0.16924868524074554, "learning_rate": 0.0003437977184945657, "loss": 0.5272, "step": 565800 }, { "epoch": 76.24629479924549, "grad_norm": 0.1709623634815216, "learning_rate": 0.00034376029222431807, "loss": 0.5269, "step": 565900 }, { "epoch": 76.25976825653463, "grad_norm": 0.17478016018867493, "learning_rate": 0.00034372286595407047, "loss": 0.5275, "step": 566000 }, { "epoch": 76.27324171382377, "grad_norm": 0.18808171153068542, "learning_rate": 0.00034368543968382287, "loss": 0.5267, "step": 566100 }, { "epoch": 76.2867151711129, "grad_norm": 0.16685235500335693, "learning_rate": 0.00034364801341357526, "loss": 0.5267, "step": 566200 }, { "epoch": 76.30018862840205, "grad_norm": 0.18283571302890778, "learning_rate": 0.00034361058714332766, "loss": 0.5267, "step": 566300 }, { "epoch": 76.31366208569119, "grad_norm": 0.17845793068408966, "learning_rate": 0.00034357316087308006, "loss": 0.5278, "step": 566400 }, { "epoch": 76.32713554298033, "grad_norm": 0.18401089310646057, "learning_rate": 0.00034353573460283246, "loss": 0.5261, "step": 566500 }, { "epoch": 76.34060900026947, "grad_norm": 0.1842774897813797, "learning_rate": 0.00034349830833258486, "loss": 0.5265, "step": 566600 }, { "epoch": 76.3540824575586, "grad_norm": 0.16993024945259094, "learning_rate": 0.0003434608820623372, "loss": 0.5272, "step": 566700 }, { "epoch": 76.36755591484776, "grad_norm": 0.17774108052253723, "learning_rate": 0.0003434234557920896, "loss": 0.5257, "step": 566800 }, { "epoch": 76.3810293721369, "grad_norm": 0.164369598031044, "learning_rate": 0.000343386029521842, "loss": 0.5268, "step": 566900 }, { "epoch": 76.39450282942603, "grad_norm": 0.18441277742385864, "learning_rate": 0.0003433486032515944, "loss": 0.5266, "step": 567000 }, { "epoch": 76.40797628671517, "grad_norm": 0.18211469054222107, "learning_rate": 0.00034331117698134674, "loss": 0.5268, "step": 567100 }, { "epoch": 76.4214497440043, "grad_norm": 0.17919544875621796, "learning_rate": 0.00034327375071109914, "loss": 0.5272, "step": 567200 }, { "epoch": 76.43492320129346, "grad_norm": 0.18635427951812744, "learning_rate": 0.00034323632444085154, "loss": 0.5278, "step": 567300 }, { "epoch": 76.4483966585826, "grad_norm": 0.17927032709121704, "learning_rate": 0.0003431988981706039, "loss": 0.5266, "step": 567400 }, { "epoch": 76.46187011587173, "grad_norm": 0.1778843253850937, "learning_rate": 0.0003431614719003563, "loss": 0.5273, "step": 567500 }, { "epoch": 76.47534357316087, "grad_norm": 0.1718340963125229, "learning_rate": 0.0003431240456301087, "loss": 0.5267, "step": 567600 }, { "epoch": 76.48881703045001, "grad_norm": 0.17678038775920868, "learning_rate": 0.0003430866193598611, "loss": 0.5267, "step": 567700 }, { "epoch": 76.50229048773916, "grad_norm": 0.16324174404144287, "learning_rate": 0.0003430491930896135, "loss": 0.5272, "step": 567800 }, { "epoch": 76.5157639450283, "grad_norm": 0.1778813600540161, "learning_rate": 0.00034301176681936587, "loss": 0.5263, "step": 567900 }, { "epoch": 76.52923740231743, "grad_norm": 0.1802339255809784, "learning_rate": 0.00034297434054911827, "loss": 0.5272, "step": 568000 }, { "epoch": 76.54271085960657, "grad_norm": 0.17203085124492645, "learning_rate": 0.0003429369142788706, "loss": 0.5257, "step": 568100 }, { "epoch": 76.55618431689571, "grad_norm": 0.22122175991535187, "learning_rate": 0.000342899488008623, "loss": 0.5276, "step": 568200 }, { "epoch": 76.56965777418486, "grad_norm": 0.16901172697544098, "learning_rate": 0.0003428620617383754, "loss": 0.5267, "step": 568300 }, { "epoch": 76.583131231474, "grad_norm": 0.16048219799995422, "learning_rate": 0.0003428246354681278, "loss": 0.5267, "step": 568400 }, { "epoch": 76.59660468876314, "grad_norm": 0.17565272748470306, "learning_rate": 0.0003427872091978802, "loss": 0.5275, "step": 568500 }, { "epoch": 76.61007814605227, "grad_norm": 0.19285047054290771, "learning_rate": 0.0003427497829276326, "loss": 0.5263, "step": 568600 }, { "epoch": 76.62355160334141, "grad_norm": 0.17936207354068756, "learning_rate": 0.000342712356657385, "loss": 0.5265, "step": 568700 }, { "epoch": 76.63702506063056, "grad_norm": 0.17136716842651367, "learning_rate": 0.00034267493038713735, "loss": 0.5256, "step": 568800 }, { "epoch": 76.6504985179197, "grad_norm": 0.19156783819198608, "learning_rate": 0.0003426375041168897, "loss": 0.5274, "step": 568900 }, { "epoch": 76.66397197520884, "grad_norm": 0.19093012809753418, "learning_rate": 0.0003426000778466421, "loss": 0.5273, "step": 569000 }, { "epoch": 76.67744543249798, "grad_norm": 0.16830845177173615, "learning_rate": 0.0003425626515763945, "loss": 0.5266, "step": 569100 }, { "epoch": 76.69091888978711, "grad_norm": 0.18889053165912628, "learning_rate": 0.0003425252253061469, "loss": 0.5261, "step": 569200 }, { "epoch": 76.70439234707626, "grad_norm": 0.18887895345687866, "learning_rate": 0.0003424877990358993, "loss": 0.5271, "step": 569300 }, { "epoch": 76.7178658043654, "grad_norm": 0.17600315809249878, "learning_rate": 0.0003424503727656517, "loss": 0.5269, "step": 569400 }, { "epoch": 76.73133926165454, "grad_norm": 0.18498869240283966, "learning_rate": 0.0003424129464954041, "loss": 0.5268, "step": 569500 }, { "epoch": 76.74481271894368, "grad_norm": 0.17564785480499268, "learning_rate": 0.0003423755202251564, "loss": 0.5284, "step": 569600 }, { "epoch": 76.75828617623282, "grad_norm": 0.16687224805355072, "learning_rate": 0.0003423380939549088, "loss": 0.5261, "step": 569700 }, { "epoch": 76.77175963352197, "grad_norm": 0.1716606616973877, "learning_rate": 0.0003423006676846612, "loss": 0.5275, "step": 569800 }, { "epoch": 76.7852330908111, "grad_norm": 0.19443944096565247, "learning_rate": 0.0003422632414144136, "loss": 0.5278, "step": 569900 }, { "epoch": 76.79870654810024, "grad_norm": 0.17303310334682465, "learning_rate": 0.000342225815144166, "loss": 0.5274, "step": 570000 }, { "epoch": 76.81218000538938, "grad_norm": 0.19316209852695465, "learning_rate": 0.0003421883888739184, "loss": 0.5276, "step": 570100 }, { "epoch": 76.82565346267852, "grad_norm": 0.16342130303382874, "learning_rate": 0.0003421509626036708, "loss": 0.5274, "step": 570200 }, { "epoch": 76.83912691996767, "grad_norm": 0.172649547457695, "learning_rate": 0.00034211353633342316, "loss": 0.5263, "step": 570300 }, { "epoch": 76.8526003772568, "grad_norm": 0.17515195906162262, "learning_rate": 0.00034207611006317556, "loss": 0.5273, "step": 570400 }, { "epoch": 76.86607383454594, "grad_norm": 0.16162173449993134, "learning_rate": 0.00034203868379292795, "loss": 0.5272, "step": 570500 }, { "epoch": 76.87954729183508, "grad_norm": 0.16894936561584473, "learning_rate": 0.00034200125752268035, "loss": 0.5279, "step": 570600 }, { "epoch": 76.89302074912422, "grad_norm": 0.16294167935848236, "learning_rate": 0.0003419638312524327, "loss": 0.5264, "step": 570700 }, { "epoch": 76.90649420641337, "grad_norm": 0.20492473244667053, "learning_rate": 0.0003419264049821851, "loss": 0.5267, "step": 570800 }, { "epoch": 76.91996766370251, "grad_norm": 0.16554763913154602, "learning_rate": 0.0003418889787119375, "loss": 0.5267, "step": 570900 }, { "epoch": 76.93344112099165, "grad_norm": 0.18140162527561188, "learning_rate": 0.00034185155244168984, "loss": 0.5272, "step": 571000 }, { "epoch": 76.94691457828078, "grad_norm": 0.17438943684101105, "learning_rate": 0.00034181412617144224, "loss": 0.5264, "step": 571100 }, { "epoch": 76.96038803556992, "grad_norm": 0.16629952192306519, "learning_rate": 0.00034177669990119463, "loss": 0.5263, "step": 571200 }, { "epoch": 76.97386149285907, "grad_norm": 0.1672884225845337, "learning_rate": 0.00034173927363094703, "loss": 0.5267, "step": 571300 }, { "epoch": 76.98733495014821, "grad_norm": 0.16132338345050812, "learning_rate": 0.00034170184736069943, "loss": 0.5267, "step": 571400 }, { "epoch": 77.0, "eval_loss": 0.5161170363426208, "eval_runtime": 4.9562, "eval_samples_per_second": 1008.831, "eval_steps_per_second": 15.94, "step": 571494 }, { "epoch": 77.00080840743735, "grad_norm": 0.16224999725818634, "learning_rate": 0.00034166442109045183, "loss": 0.5277, "step": 571500 }, { "epoch": 77.01428186472648, "grad_norm": 0.17482122778892517, "learning_rate": 0.0003416269948202042, "loss": 0.5246, "step": 571600 }, { "epoch": 77.02775532201562, "grad_norm": 0.1990591287612915, "learning_rate": 0.00034158956854995657, "loss": 0.5266, "step": 571700 }, { "epoch": 77.04122877930477, "grad_norm": 0.1739325225353241, "learning_rate": 0.00034155214227970897, "loss": 0.5256, "step": 571800 }, { "epoch": 77.05470223659391, "grad_norm": 0.16907796263694763, "learning_rate": 0.00034151471600946137, "loss": 0.5256, "step": 571900 }, { "epoch": 77.06817569388305, "grad_norm": 0.1702558398246765, "learning_rate": 0.00034147728973921377, "loss": 0.5268, "step": 572000 }, { "epoch": 77.08164915117219, "grad_norm": 0.17276225984096527, "learning_rate": 0.00034143986346896616, "loss": 0.5255, "step": 572100 }, { "epoch": 77.09512260846134, "grad_norm": 0.18859830498695374, "learning_rate": 0.00034140243719871856, "loss": 0.5256, "step": 572200 }, { "epoch": 77.10859606575048, "grad_norm": 0.17316259443759918, "learning_rate": 0.00034136501092847096, "loss": 0.5254, "step": 572300 }, { "epoch": 77.12206952303961, "grad_norm": 0.1657380610704422, "learning_rate": 0.00034132758465822336, "loss": 0.5267, "step": 572400 }, { "epoch": 77.13554298032875, "grad_norm": 0.19296367466449738, "learning_rate": 0.00034129015838797565, "loss": 0.5271, "step": 572500 }, { "epoch": 77.14901643761789, "grad_norm": 0.1779109090566635, "learning_rate": 0.00034125273211772805, "loss": 0.526, "step": 572600 }, { "epoch": 77.16248989490704, "grad_norm": 0.19076280295848846, "learning_rate": 0.00034121530584748044, "loss": 0.5261, "step": 572700 }, { "epoch": 77.17596335219618, "grad_norm": 0.16342516243457794, "learning_rate": 0.00034117787957723284, "loss": 0.5259, "step": 572800 }, { "epoch": 77.18943680948531, "grad_norm": 0.19866904616355896, "learning_rate": 0.00034114045330698524, "loss": 0.5262, "step": 572900 }, { "epoch": 77.20291026677445, "grad_norm": 0.1807924211025238, "learning_rate": 0.00034110302703673764, "loss": 0.5269, "step": 573000 }, { "epoch": 77.21638372406359, "grad_norm": 0.1678183376789093, "learning_rate": 0.00034106560076649004, "loss": 0.5269, "step": 573100 }, { "epoch": 77.22985718135274, "grad_norm": 0.16762562096118927, "learning_rate": 0.0003410281744962424, "loss": 0.5267, "step": 573200 }, { "epoch": 77.24333063864188, "grad_norm": 0.18807679414749146, "learning_rate": 0.0003409907482259948, "loss": 0.5273, "step": 573300 }, { "epoch": 77.25680409593102, "grad_norm": 0.18166925013065338, "learning_rate": 0.0003409533219557472, "loss": 0.5261, "step": 573400 }, { "epoch": 77.27027755322015, "grad_norm": 0.21720992028713226, "learning_rate": 0.0003409158956854996, "loss": 0.5267, "step": 573500 }, { "epoch": 77.28375101050929, "grad_norm": 0.2067132145166397, "learning_rate": 0.000340878469415252, "loss": 0.5273, "step": 573600 }, { "epoch": 77.29722446779844, "grad_norm": 0.1660112589597702, "learning_rate": 0.00034084104314500437, "loss": 0.5269, "step": 573700 }, { "epoch": 77.31069792508758, "grad_norm": 0.1877432018518448, "learning_rate": 0.00034080361687475677, "loss": 0.526, "step": 573800 }, { "epoch": 77.32417138237672, "grad_norm": 0.1766035556793213, "learning_rate": 0.0003407661906045091, "loss": 0.5267, "step": 573900 }, { "epoch": 77.33764483966586, "grad_norm": 0.17661935091018677, "learning_rate": 0.0003407287643342615, "loss": 0.5272, "step": 574000 }, { "epoch": 77.351118296955, "grad_norm": 0.16779685020446777, "learning_rate": 0.0003406913380640139, "loss": 0.5263, "step": 574100 }, { "epoch": 77.36459175424415, "grad_norm": 0.1828848123550415, "learning_rate": 0.0003406539117937663, "loss": 0.5258, "step": 574200 }, { "epoch": 77.37806521153328, "grad_norm": 0.17963197827339172, "learning_rate": 0.00034061648552351865, "loss": 0.527, "step": 574300 }, { "epoch": 77.39153866882242, "grad_norm": 0.1784793734550476, "learning_rate": 0.00034057905925327105, "loss": 0.5264, "step": 574400 }, { "epoch": 77.40501212611156, "grad_norm": 0.17569603025913239, "learning_rate": 0.00034054163298302345, "loss": 0.526, "step": 574500 }, { "epoch": 77.4184855834007, "grad_norm": 0.16268472373485565, "learning_rate": 0.00034050420671277585, "loss": 0.5254, "step": 574600 }, { "epoch": 77.43195904068985, "grad_norm": 0.1763172447681427, "learning_rate": 0.0003404667804425282, "loss": 0.5266, "step": 574700 }, { "epoch": 77.44543249797898, "grad_norm": 0.18863990902900696, "learning_rate": 0.0003404293541722806, "loss": 0.5263, "step": 574800 }, { "epoch": 77.45890595526812, "grad_norm": 0.16473138332366943, "learning_rate": 0.000340391927902033, "loss": 0.5266, "step": 574900 }, { "epoch": 77.47237941255726, "grad_norm": 0.1719098836183548, "learning_rate": 0.0003403545016317854, "loss": 0.5271, "step": 575000 }, { "epoch": 77.4858528698464, "grad_norm": 0.1866026222705841, "learning_rate": 0.0003403170753615378, "loss": 0.5271, "step": 575100 }, { "epoch": 77.49932632713555, "grad_norm": 0.16823147237300873, "learning_rate": 0.0003402796490912902, "loss": 0.5256, "step": 575200 }, { "epoch": 77.51279978442469, "grad_norm": 0.16696280241012573, "learning_rate": 0.0003402422228210426, "loss": 0.5264, "step": 575300 }, { "epoch": 77.52627324171382, "grad_norm": 0.17561958730220795, "learning_rate": 0.0003402047965507949, "loss": 0.526, "step": 575400 }, { "epoch": 77.53974669900296, "grad_norm": 0.18314345180988312, "learning_rate": 0.0003401673702805473, "loss": 0.5259, "step": 575500 }, { "epoch": 77.5532201562921, "grad_norm": 0.19161848723888397, "learning_rate": 0.0003401299440102997, "loss": 0.5265, "step": 575600 }, { "epoch": 77.56669361358125, "grad_norm": 0.1721286028623581, "learning_rate": 0.0003400925177400521, "loss": 0.5267, "step": 575700 }, { "epoch": 77.58016707087039, "grad_norm": 0.16094960272312164, "learning_rate": 0.0003400550914698045, "loss": 0.5266, "step": 575800 }, { "epoch": 77.59364052815953, "grad_norm": 0.17625877261161804, "learning_rate": 0.0003400176651995569, "loss": 0.5266, "step": 575900 }, { "epoch": 77.60711398544866, "grad_norm": 0.16953817009925842, "learning_rate": 0.0003399802389293093, "loss": 0.5267, "step": 576000 }, { "epoch": 77.6205874427378, "grad_norm": 0.18770605325698853, "learning_rate": 0.0003399428126590616, "loss": 0.5268, "step": 576100 }, { "epoch": 77.63406090002695, "grad_norm": 0.17116859555244446, "learning_rate": 0.000339905386388814, "loss": 0.5259, "step": 576200 }, { "epoch": 77.64753435731609, "grad_norm": 0.18533281981945038, "learning_rate": 0.0003398679601185664, "loss": 0.5278, "step": 576300 }, { "epoch": 77.66100781460523, "grad_norm": 0.1789787858724594, "learning_rate": 0.0003398305338483188, "loss": 0.5271, "step": 576400 }, { "epoch": 77.67448127189436, "grad_norm": 0.17311416566371918, "learning_rate": 0.0003397931075780712, "loss": 0.5265, "step": 576500 }, { "epoch": 77.6879547291835, "grad_norm": 0.17027437686920166, "learning_rate": 0.0003397556813078236, "loss": 0.5258, "step": 576600 }, { "epoch": 77.70142818647265, "grad_norm": 0.17785294353961945, "learning_rate": 0.000339718255037576, "loss": 0.5267, "step": 576700 }, { "epoch": 77.71490164376179, "grad_norm": 0.17470069229602814, "learning_rate": 0.00033968082876732834, "loss": 0.5261, "step": 576800 }, { "epoch": 77.72837510105093, "grad_norm": 0.17903584241867065, "learning_rate": 0.00033964340249708074, "loss": 0.527, "step": 576900 }, { "epoch": 77.74184855834007, "grad_norm": 0.17805199325084686, "learning_rate": 0.00033960597622683313, "loss": 0.528, "step": 577000 }, { "epoch": 77.7553220156292, "grad_norm": 0.17504870891571045, "learning_rate": 0.00033956854995658553, "loss": 0.5268, "step": 577100 }, { "epoch": 77.76879547291836, "grad_norm": 0.1714431196451187, "learning_rate": 0.00033953112368633793, "loss": 0.5267, "step": 577200 }, { "epoch": 77.7822689302075, "grad_norm": 0.17343229055404663, "learning_rate": 0.00033949369741609033, "loss": 0.5266, "step": 577300 }, { "epoch": 77.79574238749663, "grad_norm": 0.25227904319763184, "learning_rate": 0.00033945627114584273, "loss": 0.5277, "step": 577400 }, { "epoch": 77.80921584478577, "grad_norm": 0.18699587881565094, "learning_rate": 0.0003394188448755951, "loss": 0.5269, "step": 577500 }, { "epoch": 77.8226893020749, "grad_norm": 0.17048737406730652, "learning_rate": 0.00033938141860534747, "loss": 0.5262, "step": 577600 }, { "epoch": 77.83616275936406, "grad_norm": 0.17489220201969147, "learning_rate": 0.00033934399233509987, "loss": 0.5259, "step": 577700 }, { "epoch": 77.8496362166532, "grad_norm": 0.1637500673532486, "learning_rate": 0.00033930656606485227, "loss": 0.5269, "step": 577800 }, { "epoch": 77.86310967394233, "grad_norm": 0.17138193547725677, "learning_rate": 0.0003392691397946046, "loss": 0.5275, "step": 577900 }, { "epoch": 77.87658313123147, "grad_norm": 0.19398097693920135, "learning_rate": 0.000339231713524357, "loss": 0.5275, "step": 578000 }, { "epoch": 77.89005658852061, "grad_norm": 0.18307459354400635, "learning_rate": 0.0003391942872541094, "loss": 0.5262, "step": 578100 }, { "epoch": 77.90353004580976, "grad_norm": 0.17365333437919617, "learning_rate": 0.0003391568609838618, "loss": 0.5268, "step": 578200 }, { "epoch": 77.9170035030989, "grad_norm": 0.17138779163360596, "learning_rate": 0.00033911943471361415, "loss": 0.5274, "step": 578300 }, { "epoch": 77.93047696038803, "grad_norm": 0.18344426155090332, "learning_rate": 0.00033908200844336655, "loss": 0.5262, "step": 578400 }, { "epoch": 77.94395041767717, "grad_norm": 0.17324841022491455, "learning_rate": 0.00033904458217311895, "loss": 0.5264, "step": 578500 }, { "epoch": 77.95742387496631, "grad_norm": 0.17998364567756653, "learning_rate": 0.00033900715590287134, "loss": 0.5268, "step": 578600 }, { "epoch": 77.97089733225546, "grad_norm": 0.19167739152908325, "learning_rate": 0.00033896972963262374, "loss": 0.5262, "step": 578700 }, { "epoch": 77.9843707895446, "grad_norm": 0.17042063176631927, "learning_rate": 0.00033893230336237614, "loss": 0.5271, "step": 578800 }, { "epoch": 77.99784424683374, "grad_norm": 0.18091917037963867, "learning_rate": 0.00033889487709212854, "loss": 0.5263, "step": 578900 }, { "epoch": 78.0, "eval_loss": 0.5151711106300354, "eval_runtime": 4.9616, "eval_samples_per_second": 1007.738, "eval_steps_per_second": 15.922, "step": 578916 }, { "epoch": 78.01131770412287, "grad_norm": 0.1660514622926712, "learning_rate": 0.0003388574508218809, "loss": 0.5263, "step": 579000 }, { "epoch": 78.02479116141201, "grad_norm": 0.16597354412078857, "learning_rate": 0.0003388200245516333, "loss": 0.5252, "step": 579100 }, { "epoch": 78.03826461870116, "grad_norm": 0.18020303547382355, "learning_rate": 0.0003387825982813857, "loss": 0.5257, "step": 579200 }, { "epoch": 78.0517380759903, "grad_norm": 0.16896681487560272, "learning_rate": 0.0003387451720111381, "loss": 0.5262, "step": 579300 }, { "epoch": 78.06521153327944, "grad_norm": 0.16923467814922333, "learning_rate": 0.0003387077457408905, "loss": 0.5252, "step": 579400 }, { "epoch": 78.07868499056858, "grad_norm": 0.17852084338665009, "learning_rate": 0.0003386703194706429, "loss": 0.5253, "step": 579500 }, { "epoch": 78.09215844785773, "grad_norm": 0.17654986679553986, "learning_rate": 0.00033863289320039527, "loss": 0.5263, "step": 579600 }, { "epoch": 78.10563190514686, "grad_norm": 0.1749502718448639, "learning_rate": 0.00033859546693014756, "loss": 0.5259, "step": 579700 }, { "epoch": 78.119105362436, "grad_norm": 0.22940441966056824, "learning_rate": 0.00033855804065989996, "loss": 0.5273, "step": 579800 }, { "epoch": 78.13257881972514, "grad_norm": 0.16505444049835205, "learning_rate": 0.00033852061438965236, "loss": 0.5264, "step": 579900 }, { "epoch": 78.14605227701428, "grad_norm": 0.1735852211713791, "learning_rate": 0.00033848318811940476, "loss": 0.5263, "step": 580000 }, { "epoch": 78.15952573430343, "grad_norm": 0.17611804604530334, "learning_rate": 0.00033844576184915715, "loss": 0.5261, "step": 580100 }, { "epoch": 78.17299919159257, "grad_norm": 0.16547945141792297, "learning_rate": 0.00033840833557890955, "loss": 0.5258, "step": 580200 }, { "epoch": 78.1864726488817, "grad_norm": 0.1741621494293213, "learning_rate": 0.00033837090930866195, "loss": 0.5251, "step": 580300 }, { "epoch": 78.19994610617084, "grad_norm": 0.16709932684898376, "learning_rate": 0.00033833348303841435, "loss": 0.5272, "step": 580400 }, { "epoch": 78.21341956345998, "grad_norm": 0.16006159782409668, "learning_rate": 0.0003382960567681667, "loss": 0.5261, "step": 580500 }, { "epoch": 78.22689302074913, "grad_norm": 0.1783631443977356, "learning_rate": 0.0003382586304979191, "loss": 0.527, "step": 580600 }, { "epoch": 78.24036647803827, "grad_norm": 0.1760578155517578, "learning_rate": 0.0003382212042276715, "loss": 0.5268, "step": 580700 }, { "epoch": 78.2538399353274, "grad_norm": 0.18785402178764343, "learning_rate": 0.0003381837779574239, "loss": 0.5262, "step": 580800 }, { "epoch": 78.26731339261654, "grad_norm": 0.16079048812389374, "learning_rate": 0.0003381463516871763, "loss": 0.5261, "step": 580900 }, { "epoch": 78.28078684990568, "grad_norm": 0.19194073975086212, "learning_rate": 0.0003381089254169287, "loss": 0.5266, "step": 581000 }, { "epoch": 78.29426030719483, "grad_norm": 0.16927184164524078, "learning_rate": 0.0003380714991466811, "loss": 0.526, "step": 581100 }, { "epoch": 78.30773376448397, "grad_norm": 0.1957925409078598, "learning_rate": 0.0003380340728764334, "loss": 0.5269, "step": 581200 }, { "epoch": 78.32120722177311, "grad_norm": 0.1868658810853958, "learning_rate": 0.0003379966466061858, "loss": 0.5269, "step": 581300 }, { "epoch": 78.33468067906225, "grad_norm": 0.17129193246364594, "learning_rate": 0.0003379592203359382, "loss": 0.5262, "step": 581400 }, { "epoch": 78.34815413635138, "grad_norm": 0.19107632339000702, "learning_rate": 0.0003379217940656906, "loss": 0.5257, "step": 581500 }, { "epoch": 78.36162759364053, "grad_norm": 0.17422020435333252, "learning_rate": 0.00033788436779544297, "loss": 0.5266, "step": 581600 }, { "epoch": 78.37510105092967, "grad_norm": 0.1726553738117218, "learning_rate": 0.00033784694152519536, "loss": 0.5253, "step": 581700 }, { "epoch": 78.38857450821881, "grad_norm": 0.17068849503993988, "learning_rate": 0.00033780951525494776, "loss": 0.5266, "step": 581800 }, { "epoch": 78.40204796550795, "grad_norm": 0.17284341156482697, "learning_rate": 0.0003377720889847001, "loss": 0.5263, "step": 581900 }, { "epoch": 78.41552142279708, "grad_norm": 0.17138715088367462, "learning_rate": 0.0003377346627144525, "loss": 0.5272, "step": 582000 }, { "epoch": 78.42899488008624, "grad_norm": 0.1861540675163269, "learning_rate": 0.0003376972364442049, "loss": 0.5263, "step": 582100 }, { "epoch": 78.44246833737537, "grad_norm": 0.17012225091457367, "learning_rate": 0.0003376598101739573, "loss": 0.527, "step": 582200 }, { "epoch": 78.45594179466451, "grad_norm": 0.17093007266521454, "learning_rate": 0.0003376223839037097, "loss": 0.5257, "step": 582300 }, { "epoch": 78.46941525195365, "grad_norm": 0.18323881924152374, "learning_rate": 0.0003375849576334621, "loss": 0.5263, "step": 582400 }, { "epoch": 78.48288870924279, "grad_norm": 0.1684422641992569, "learning_rate": 0.0003375475313632145, "loss": 0.5267, "step": 582500 }, { "epoch": 78.49636216653194, "grad_norm": 0.18870428204536438, "learning_rate": 0.0003375101050929669, "loss": 0.5258, "step": 582600 }, { "epoch": 78.50983562382108, "grad_norm": 0.16566556692123413, "learning_rate": 0.00033747267882271924, "loss": 0.5272, "step": 582700 }, { "epoch": 78.52330908111021, "grad_norm": 0.17286844551563263, "learning_rate": 0.00033743525255247164, "loss": 0.5261, "step": 582800 }, { "epoch": 78.53678253839935, "grad_norm": 0.17313595116138458, "learning_rate": 0.00033739782628222403, "loss": 0.5263, "step": 582900 }, { "epoch": 78.55025599568849, "grad_norm": 0.18013496696949005, "learning_rate": 0.00033736040001197643, "loss": 0.526, "step": 583000 }, { "epoch": 78.56372945297764, "grad_norm": 0.17947326600551605, "learning_rate": 0.00033732297374172883, "loss": 0.5265, "step": 583100 }, { "epoch": 78.57720291026678, "grad_norm": 0.18075670301914215, "learning_rate": 0.00033728554747148123, "loss": 0.527, "step": 583200 }, { "epoch": 78.59067636755591, "grad_norm": 0.17683744430541992, "learning_rate": 0.0003372481212012336, "loss": 0.5268, "step": 583300 }, { "epoch": 78.60414982484505, "grad_norm": 0.18307355046272278, "learning_rate": 0.0003372106949309859, "loss": 0.5259, "step": 583400 }, { "epoch": 78.61762328213419, "grad_norm": 0.16842374205589294, "learning_rate": 0.0003371732686607383, "loss": 0.526, "step": 583500 }, { "epoch": 78.63109673942334, "grad_norm": 0.18418751657009125, "learning_rate": 0.0003371358423904907, "loss": 0.5265, "step": 583600 }, { "epoch": 78.64457019671248, "grad_norm": 0.17650240659713745, "learning_rate": 0.0003370984161202431, "loss": 0.5256, "step": 583700 }, { "epoch": 78.65804365400162, "grad_norm": 0.19162790477275848, "learning_rate": 0.0003370609898499955, "loss": 0.5264, "step": 583800 }, { "epoch": 78.67151711129075, "grad_norm": 0.1788625717163086, "learning_rate": 0.0003370235635797479, "loss": 0.5263, "step": 583900 }, { "epoch": 78.68499056857989, "grad_norm": 0.16552814841270447, "learning_rate": 0.0003369861373095003, "loss": 0.5259, "step": 584000 }, { "epoch": 78.69846402586904, "grad_norm": 0.1859419196844101, "learning_rate": 0.00033694871103925265, "loss": 0.5264, "step": 584100 }, { "epoch": 78.71193748315818, "grad_norm": 0.183681920170784, "learning_rate": 0.00033691128476900505, "loss": 0.5271, "step": 584200 }, { "epoch": 78.72541094044732, "grad_norm": 0.19056737422943115, "learning_rate": 0.00033687385849875745, "loss": 0.5263, "step": 584300 }, { "epoch": 78.73888439773646, "grad_norm": 0.18498654663562775, "learning_rate": 0.00033683643222850985, "loss": 0.5261, "step": 584400 }, { "epoch": 78.7523578550256, "grad_norm": 0.17115089297294617, "learning_rate": 0.00033679900595826224, "loss": 0.5271, "step": 584500 }, { "epoch": 78.76583131231475, "grad_norm": 0.1850656419992447, "learning_rate": 0.00033676157968801464, "loss": 0.5261, "step": 584600 }, { "epoch": 78.77930476960388, "grad_norm": 0.17523299157619476, "learning_rate": 0.00033672415341776704, "loss": 0.5275, "step": 584700 }, { "epoch": 78.79277822689302, "grad_norm": 0.166998490691185, "learning_rate": 0.00033668672714751944, "loss": 0.5263, "step": 584800 }, { "epoch": 78.80625168418216, "grad_norm": 0.1761331856250763, "learning_rate": 0.0003366493008772718, "loss": 0.5259, "step": 584900 }, { "epoch": 78.8197251414713, "grad_norm": 0.21859779953956604, "learning_rate": 0.0003366118746070242, "loss": 0.5265, "step": 585000 }, { "epoch": 78.83319859876045, "grad_norm": 0.1762695461511612, "learning_rate": 0.0003365744483367766, "loss": 0.5267, "step": 585100 }, { "epoch": 78.84667205604958, "grad_norm": 0.16154643893241882, "learning_rate": 0.0003365370220665289, "loss": 0.5261, "step": 585200 }, { "epoch": 78.86014551333872, "grad_norm": 0.17262960970401764, "learning_rate": 0.0003364995957962813, "loss": 0.526, "step": 585300 }, { "epoch": 78.87361897062786, "grad_norm": 0.16794580221176147, "learning_rate": 0.0003364621695260337, "loss": 0.5254, "step": 585400 }, { "epoch": 78.887092427917, "grad_norm": 0.20804981887340546, "learning_rate": 0.0003364247432557861, "loss": 0.5266, "step": 585500 }, { "epoch": 78.90056588520615, "grad_norm": 0.18074138462543488, "learning_rate": 0.00033638731698553846, "loss": 0.5254, "step": 585600 }, { "epoch": 78.91403934249529, "grad_norm": 0.18516266345977783, "learning_rate": 0.00033634989071529086, "loss": 0.5259, "step": 585700 }, { "epoch": 78.92751279978442, "grad_norm": 0.20182599127292633, "learning_rate": 0.00033631246444504326, "loss": 0.5262, "step": 585800 }, { "epoch": 78.94098625707356, "grad_norm": 0.1804744303226471, "learning_rate": 0.00033627503817479566, "loss": 0.5261, "step": 585900 }, { "epoch": 78.9544597143627, "grad_norm": 0.1746237426996231, "learning_rate": 0.00033623761190454805, "loss": 0.5258, "step": 586000 }, { "epoch": 78.96793317165185, "grad_norm": 0.17251969873905182, "learning_rate": 0.00033620018563430045, "loss": 0.5253, "step": 586100 }, { "epoch": 78.98140662894099, "grad_norm": 0.172410786151886, "learning_rate": 0.00033616275936405285, "loss": 0.5271, "step": 586200 }, { "epoch": 78.99488008623013, "grad_norm": 0.1667579859495163, "learning_rate": 0.0003361253330938052, "loss": 0.5278, "step": 586300 }, { "epoch": 79.0, "eval_loss": 0.5152682065963745, "eval_runtime": 4.9619, "eval_samples_per_second": 1007.672, "eval_steps_per_second": 15.921, "step": 586338 }, { "epoch": 79.00835354351926, "grad_norm": 0.16673311591148376, "learning_rate": 0.0003360879068235576, "loss": 0.5259, "step": 586400 }, { "epoch": 79.0218270008084, "grad_norm": 0.16794447600841522, "learning_rate": 0.00033605048055331, "loss": 0.5261, "step": 586500 }, { "epoch": 79.03530045809755, "grad_norm": 0.16872002184391022, "learning_rate": 0.0003360130542830624, "loss": 0.5248, "step": 586600 }, { "epoch": 79.04877391538669, "grad_norm": 0.17345844209194183, "learning_rate": 0.0003359756280128148, "loss": 0.5265, "step": 586700 }, { "epoch": 79.06224737267583, "grad_norm": 0.1794559359550476, "learning_rate": 0.0003359382017425672, "loss": 0.5248, "step": 586800 }, { "epoch": 79.07572082996496, "grad_norm": 0.16758772730827332, "learning_rate": 0.0003359007754723196, "loss": 0.5266, "step": 586900 }, { "epoch": 79.08919428725412, "grad_norm": 0.1814035326242447, "learning_rate": 0.0003358633492020719, "loss": 0.5253, "step": 587000 }, { "epoch": 79.10266774454325, "grad_norm": 0.17482416331768036, "learning_rate": 0.00033582592293182427, "loss": 0.5259, "step": 587100 }, { "epoch": 79.11614120183239, "grad_norm": 0.18094807863235474, "learning_rate": 0.00033578849666157667, "loss": 0.5267, "step": 587200 }, { "epoch": 79.12961465912153, "grad_norm": 0.18208035826683044, "learning_rate": 0.00033575107039132907, "loss": 0.5255, "step": 587300 }, { "epoch": 79.14308811641067, "grad_norm": 0.17258132994174957, "learning_rate": 0.00033571364412108147, "loss": 0.5271, "step": 587400 }, { "epoch": 79.15656157369982, "grad_norm": 0.17303304374217987, "learning_rate": 0.00033567621785083386, "loss": 0.527, "step": 587500 }, { "epoch": 79.17003503098896, "grad_norm": 0.1675858199596405, "learning_rate": 0.00033563879158058626, "loss": 0.5256, "step": 587600 }, { "epoch": 79.1835084882781, "grad_norm": 0.17353129386901855, "learning_rate": 0.00033560136531033866, "loss": 0.5251, "step": 587700 }, { "epoch": 79.19698194556723, "grad_norm": 0.1924937516450882, "learning_rate": 0.000335563939040091, "loss": 0.526, "step": 587800 }, { "epoch": 79.21045540285637, "grad_norm": 0.18968282639980316, "learning_rate": 0.0003355265127698434, "loss": 0.5258, "step": 587900 }, { "epoch": 79.22392886014552, "grad_norm": 0.1740303933620453, "learning_rate": 0.0003354890864995958, "loss": 0.5269, "step": 588000 }, { "epoch": 79.23740231743466, "grad_norm": 0.1648896038532257, "learning_rate": 0.0003354516602293482, "loss": 0.5256, "step": 588100 }, { "epoch": 79.2508757747238, "grad_norm": 0.20996390283107758, "learning_rate": 0.0003354142339591006, "loss": 0.5268, "step": 588200 }, { "epoch": 79.26434923201293, "grad_norm": 0.17936153709888458, "learning_rate": 0.000335376807688853, "loss": 0.5251, "step": 588300 }, { "epoch": 79.27782268930207, "grad_norm": 0.18319149315357208, "learning_rate": 0.0003353393814186054, "loss": 0.5263, "step": 588400 }, { "epoch": 79.29129614659122, "grad_norm": 0.1654215306043625, "learning_rate": 0.00033530195514835774, "loss": 0.5262, "step": 588500 }, { "epoch": 79.30476960388036, "grad_norm": 0.2004551887512207, "learning_rate": 0.00033526452887811014, "loss": 0.5259, "step": 588600 }, { "epoch": 79.3182430611695, "grad_norm": 0.19028764963150024, "learning_rate": 0.00033522710260786254, "loss": 0.5265, "step": 588700 }, { "epoch": 79.33171651845863, "grad_norm": 0.1743023544549942, "learning_rate": 0.0003351896763376149, "loss": 0.5264, "step": 588800 }, { "epoch": 79.34518997574777, "grad_norm": 0.17699430882930756, "learning_rate": 0.0003351522500673673, "loss": 0.5275, "step": 588900 }, { "epoch": 79.35866343303692, "grad_norm": 0.17242006957530975, "learning_rate": 0.0003351148237971197, "loss": 0.5267, "step": 589000 }, { "epoch": 79.37213689032606, "grad_norm": 0.1561955064535141, "learning_rate": 0.0003350773975268721, "loss": 0.5248, "step": 589100 }, { "epoch": 79.3856103476152, "grad_norm": 0.16771501302719116, "learning_rate": 0.0003350399712566244, "loss": 0.5265, "step": 589200 }, { "epoch": 79.39908380490434, "grad_norm": 0.18103191256523132, "learning_rate": 0.0003350025449863768, "loss": 0.5251, "step": 589300 }, { "epoch": 79.41255726219347, "grad_norm": 0.17678865790367126, "learning_rate": 0.0003349651187161292, "loss": 0.5266, "step": 589400 }, { "epoch": 79.42603071948263, "grad_norm": 0.16828972101211548, "learning_rate": 0.0003349276924458816, "loss": 0.5257, "step": 589500 }, { "epoch": 79.43950417677176, "grad_norm": 0.1990930587053299, "learning_rate": 0.000334890266175634, "loss": 0.5262, "step": 589600 }, { "epoch": 79.4529776340609, "grad_norm": 0.17783519625663757, "learning_rate": 0.0003348528399053864, "loss": 0.5265, "step": 589700 }, { "epoch": 79.46645109135004, "grad_norm": 0.19471684098243713, "learning_rate": 0.0003348154136351388, "loss": 0.526, "step": 589800 }, { "epoch": 79.47992454863918, "grad_norm": 0.16579975187778473, "learning_rate": 0.0003347779873648912, "loss": 0.5253, "step": 589900 }, { "epoch": 79.49339800592833, "grad_norm": 0.17369654774665833, "learning_rate": 0.00033474056109464355, "loss": 0.5267, "step": 590000 }, { "epoch": 79.50687146321746, "grad_norm": 0.17526917159557343, "learning_rate": 0.00033470313482439595, "loss": 0.5261, "step": 590100 }, { "epoch": 79.5203449205066, "grad_norm": 0.1751345694065094, "learning_rate": 0.00033466570855414835, "loss": 0.526, "step": 590200 }, { "epoch": 79.53381837779574, "grad_norm": 0.1740531027317047, "learning_rate": 0.00033462828228390074, "loss": 0.5254, "step": 590300 }, { "epoch": 79.54729183508488, "grad_norm": 0.1693229079246521, "learning_rate": 0.00033459085601365314, "loss": 0.5266, "step": 590400 }, { "epoch": 79.56076529237403, "grad_norm": 0.171098530292511, "learning_rate": 0.00033455342974340554, "loss": 0.5261, "step": 590500 }, { "epoch": 79.57423874966317, "grad_norm": 0.22368094325065613, "learning_rate": 0.0003345160034731579, "loss": 0.5268, "step": 590600 }, { "epoch": 79.5877122069523, "grad_norm": 0.16503006219863892, "learning_rate": 0.00033447857720291023, "loss": 0.5254, "step": 590700 }, { "epoch": 79.60118566424144, "grad_norm": 0.17014744877815247, "learning_rate": 0.00033444115093266263, "loss": 0.5266, "step": 590800 }, { "epoch": 79.61465912153058, "grad_norm": 0.1931743174791336, "learning_rate": 0.000334403724662415, "loss": 0.5262, "step": 590900 }, { "epoch": 79.62813257881973, "grad_norm": 0.1838299036026001, "learning_rate": 0.0003343662983921674, "loss": 0.5257, "step": 591000 }, { "epoch": 79.64160603610887, "grad_norm": 0.17927774786949158, "learning_rate": 0.0003343288721219198, "loss": 0.5256, "step": 591100 }, { "epoch": 79.655079493398, "grad_norm": 0.1772068440914154, "learning_rate": 0.0003342914458516722, "loss": 0.5258, "step": 591200 }, { "epoch": 79.66855295068714, "grad_norm": 0.17227047681808472, "learning_rate": 0.0003342540195814246, "loss": 0.5256, "step": 591300 }, { "epoch": 79.68202640797628, "grad_norm": 0.1623670756816864, "learning_rate": 0.00033421659331117696, "loss": 0.526, "step": 591400 }, { "epoch": 79.69549986526543, "grad_norm": 0.18684259057044983, "learning_rate": 0.00033417916704092936, "loss": 0.5256, "step": 591500 }, { "epoch": 79.70897332255457, "grad_norm": 0.17035043239593506, "learning_rate": 0.00033414174077068176, "loss": 0.5257, "step": 591600 }, { "epoch": 79.72244677984371, "grad_norm": 0.18863505125045776, "learning_rate": 0.00033410431450043416, "loss": 0.5254, "step": 591700 }, { "epoch": 79.73592023713285, "grad_norm": 0.19063520431518555, "learning_rate": 0.00033406688823018656, "loss": 0.5262, "step": 591800 }, { "epoch": 79.74939369442198, "grad_norm": 0.16837453842163086, "learning_rate": 0.00033402946195993895, "loss": 0.5261, "step": 591900 }, { "epoch": 79.76286715171113, "grad_norm": 0.16288872063159943, "learning_rate": 0.00033399203568969135, "loss": 0.5272, "step": 592000 }, { "epoch": 79.77634060900027, "grad_norm": 0.16493667662143707, "learning_rate": 0.0003339546094194437, "loss": 0.5259, "step": 592100 }, { "epoch": 79.78981406628941, "grad_norm": 0.18134473264217377, "learning_rate": 0.0003339171831491961, "loss": 0.5257, "step": 592200 }, { "epoch": 79.80328752357855, "grad_norm": 0.1629263311624527, "learning_rate": 0.0003338797568789485, "loss": 0.5271, "step": 592300 }, { "epoch": 79.81676098086768, "grad_norm": 0.18628916144371033, "learning_rate": 0.00033384233060870084, "loss": 0.5255, "step": 592400 }, { "epoch": 79.83023443815684, "grad_norm": 0.1938614696264267, "learning_rate": 0.00033380490433845323, "loss": 0.5259, "step": 592500 }, { "epoch": 79.84370789544597, "grad_norm": 0.18165044486522675, "learning_rate": 0.00033376747806820563, "loss": 0.5261, "step": 592600 }, { "epoch": 79.85718135273511, "grad_norm": 0.16028577089309692, "learning_rate": 0.00033373005179795803, "loss": 0.5267, "step": 592700 }, { "epoch": 79.87065481002425, "grad_norm": 0.17035113275051117, "learning_rate": 0.00033369262552771043, "loss": 0.5258, "step": 592800 }, { "epoch": 79.88412826731339, "grad_norm": 0.16919878125190735, "learning_rate": 0.0003336551992574628, "loss": 0.5261, "step": 592900 }, { "epoch": 79.89760172460254, "grad_norm": 0.18249641358852386, "learning_rate": 0.00033361777298721517, "loss": 0.5262, "step": 593000 }, { "epoch": 79.91107518189168, "grad_norm": 0.16121414303779602, "learning_rate": 0.00033358034671696757, "loss": 0.5261, "step": 593100 }, { "epoch": 79.92454863918081, "grad_norm": 0.17222881317138672, "learning_rate": 0.00033354292044671997, "loss": 0.5256, "step": 593200 }, { "epoch": 79.93802209646995, "grad_norm": 0.16165569424629211, "learning_rate": 0.00033350549417647237, "loss": 0.5263, "step": 593300 }, { "epoch": 79.95149555375909, "grad_norm": 0.16786980628967285, "learning_rate": 0.00033346806790622476, "loss": 0.5252, "step": 593400 }, { "epoch": 79.96496901104824, "grad_norm": 0.18006762862205505, "learning_rate": 0.00033343064163597716, "loss": 0.5271, "step": 593500 }, { "epoch": 79.97844246833738, "grad_norm": 0.16775979101657867, "learning_rate": 0.0003333932153657295, "loss": 0.5252, "step": 593600 }, { "epoch": 79.99191592562651, "grad_norm": 0.18635866045951843, "learning_rate": 0.0003333557890954819, "loss": 0.5261, "step": 593700 }, { "epoch": 80.0, "eval_loss": 0.514883816242218, "eval_runtime": 4.9616, "eval_samples_per_second": 1007.747, "eval_steps_per_second": 15.922, "step": 593760 }, { "epoch": 80.00538938291565, "grad_norm": 0.17584486305713654, "learning_rate": 0.0003333183628252343, "loss": 0.5265, "step": 593800 }, { "epoch": 80.01886284020479, "grad_norm": 0.1701592355966568, "learning_rate": 0.0003332809365549867, "loss": 0.5257, "step": 593900 }, { "epoch": 80.03233629749394, "grad_norm": 0.16635377705097198, "learning_rate": 0.0003332435102847391, "loss": 0.5261, "step": 594000 }, { "epoch": 80.04580975478308, "grad_norm": 0.18495024740695953, "learning_rate": 0.0003332060840144915, "loss": 0.5253, "step": 594100 }, { "epoch": 80.05928321207222, "grad_norm": 0.1624477207660675, "learning_rate": 0.00033316865774424384, "loss": 0.5256, "step": 594200 }, { "epoch": 80.07275666936135, "grad_norm": 0.17824424803256989, "learning_rate": 0.0003331312314739962, "loss": 0.5255, "step": 594300 }, { "epoch": 80.08623012665049, "grad_norm": 0.1955820471048355, "learning_rate": 0.0003330938052037486, "loss": 0.5264, "step": 594400 }, { "epoch": 80.09970358393964, "grad_norm": 0.19988514482975006, "learning_rate": 0.000333056378933501, "loss": 0.5251, "step": 594500 }, { "epoch": 80.11317704122878, "grad_norm": 0.1738845258951187, "learning_rate": 0.0003330189526632534, "loss": 0.5255, "step": 594600 }, { "epoch": 80.12665049851792, "grad_norm": 0.18276746571063995, "learning_rate": 0.0003329815263930058, "loss": 0.5267, "step": 594700 }, { "epoch": 80.14012395580706, "grad_norm": 0.1842145472764969, "learning_rate": 0.0003329441001227582, "loss": 0.5247, "step": 594800 }, { "epoch": 80.15359741309621, "grad_norm": 0.20660798251628876, "learning_rate": 0.0003329066738525106, "loss": 0.5259, "step": 594900 }, { "epoch": 80.16707087038534, "grad_norm": 0.17922890186309814, "learning_rate": 0.0003328692475822629, "loss": 0.5264, "step": 595000 }, { "epoch": 80.18054432767448, "grad_norm": 0.1751810908317566, "learning_rate": 0.0003328318213120153, "loss": 0.5264, "step": 595100 }, { "epoch": 80.19401778496362, "grad_norm": 0.18197324872016907, "learning_rate": 0.0003327943950417677, "loss": 0.5264, "step": 595200 }, { "epoch": 80.20749124225276, "grad_norm": 0.16665567457675934, "learning_rate": 0.0003327569687715201, "loss": 0.5257, "step": 595300 }, { "epoch": 80.22096469954191, "grad_norm": 0.1820480227470398, "learning_rate": 0.0003327195425012725, "loss": 0.5269, "step": 595400 }, { "epoch": 80.23443815683105, "grad_norm": 0.19373264908790588, "learning_rate": 0.0003326821162310249, "loss": 0.5257, "step": 595500 }, { "epoch": 80.24791161412018, "grad_norm": 0.16981196403503418, "learning_rate": 0.0003326446899607773, "loss": 0.5254, "step": 595600 }, { "epoch": 80.26138507140932, "grad_norm": 0.20686081051826477, "learning_rate": 0.0003326072636905297, "loss": 0.5247, "step": 595700 }, { "epoch": 80.27485852869846, "grad_norm": 0.1632644236087799, "learning_rate": 0.00033256983742028205, "loss": 0.5256, "step": 595800 }, { "epoch": 80.28833198598761, "grad_norm": 0.17188133299350739, "learning_rate": 0.00033253241115003445, "loss": 0.5249, "step": 595900 }, { "epoch": 80.30180544327675, "grad_norm": 0.17062979936599731, "learning_rate": 0.0003324949848797868, "loss": 0.5257, "step": 596000 }, { "epoch": 80.31527890056589, "grad_norm": 0.1719110757112503, "learning_rate": 0.0003324575586095392, "loss": 0.5262, "step": 596100 }, { "epoch": 80.32875235785502, "grad_norm": 0.17535459995269775, "learning_rate": 0.0003324201323392916, "loss": 0.526, "step": 596200 }, { "epoch": 80.34222581514416, "grad_norm": 0.17238779366016388, "learning_rate": 0.000332382706069044, "loss": 0.5255, "step": 596300 }, { "epoch": 80.35569927243331, "grad_norm": 0.16448840498924255, "learning_rate": 0.0003323452797987964, "loss": 0.5261, "step": 596400 }, { "epoch": 80.36917272972245, "grad_norm": 0.19046422839164734, "learning_rate": 0.00033230785352854873, "loss": 0.5264, "step": 596500 }, { "epoch": 80.38264618701159, "grad_norm": 0.17578402161598206, "learning_rate": 0.00033227042725830113, "loss": 0.5247, "step": 596600 }, { "epoch": 80.39611964430073, "grad_norm": 0.2088845819234848, "learning_rate": 0.0003322330009880535, "loss": 0.5253, "step": 596700 }, { "epoch": 80.40959310158986, "grad_norm": 0.16846176981925964, "learning_rate": 0.0003321955747178059, "loss": 0.5261, "step": 596800 }, { "epoch": 80.42306655887901, "grad_norm": 0.18090379238128662, "learning_rate": 0.0003321581484475583, "loss": 0.5268, "step": 596900 }, { "epoch": 80.43654001616815, "grad_norm": 0.1928042769432068, "learning_rate": 0.0003321207221773107, "loss": 0.5252, "step": 597000 }, { "epoch": 80.45001347345729, "grad_norm": 0.1712573915719986, "learning_rate": 0.0003320832959070631, "loss": 0.5258, "step": 597100 }, { "epoch": 80.46348693074643, "grad_norm": 0.19004681706428528, "learning_rate": 0.00033204586963681546, "loss": 0.5259, "step": 597200 }, { "epoch": 80.47696038803556, "grad_norm": 0.19157971441745758, "learning_rate": 0.00033200844336656786, "loss": 0.5262, "step": 597300 }, { "epoch": 80.49043384532472, "grad_norm": 0.16805800795555115, "learning_rate": 0.00033197101709632026, "loss": 0.5259, "step": 597400 }, { "epoch": 80.50390730261385, "grad_norm": 0.17962004244327545, "learning_rate": 0.00033193359082607266, "loss": 0.5261, "step": 597500 }, { "epoch": 80.51738075990299, "grad_norm": 0.19326622784137726, "learning_rate": 0.00033189616455582506, "loss": 0.5239, "step": 597600 }, { "epoch": 80.53085421719213, "grad_norm": 0.18643110990524292, "learning_rate": 0.00033185873828557745, "loss": 0.5268, "step": 597700 }, { "epoch": 80.54432767448127, "grad_norm": 0.20457930862903595, "learning_rate": 0.00033182131201532985, "loss": 0.5256, "step": 597800 }, { "epoch": 80.55780113177042, "grad_norm": 0.17562884092330933, "learning_rate": 0.0003317838857450822, "loss": 0.5255, "step": 597900 }, { "epoch": 80.57127458905956, "grad_norm": 0.1660386621952057, "learning_rate": 0.00033174645947483454, "loss": 0.5258, "step": 598000 }, { "epoch": 80.5847480463487, "grad_norm": 0.1741422861814499, "learning_rate": 0.00033170903320458694, "loss": 0.5264, "step": 598100 }, { "epoch": 80.59822150363783, "grad_norm": 0.15960785746574402, "learning_rate": 0.00033167160693433934, "loss": 0.5244, "step": 598200 }, { "epoch": 80.61169496092697, "grad_norm": 0.17340825498104095, "learning_rate": 0.00033163418066409174, "loss": 0.5263, "step": 598300 }, { "epoch": 80.62516841821612, "grad_norm": 0.1780538707971573, "learning_rate": 0.00033159675439384413, "loss": 0.5249, "step": 598400 }, { "epoch": 80.63864187550526, "grad_norm": 0.179254949092865, "learning_rate": 0.00033155932812359653, "loss": 0.5261, "step": 598500 }, { "epoch": 80.6521153327944, "grad_norm": 0.17093659937381744, "learning_rate": 0.00033152190185334893, "loss": 0.5253, "step": 598600 }, { "epoch": 80.66558879008353, "grad_norm": 0.17106206715106964, "learning_rate": 0.0003314844755831013, "loss": 0.5264, "step": 598700 }, { "epoch": 80.67906224737267, "grad_norm": 0.19688981771469116, "learning_rate": 0.00033144704931285367, "loss": 0.5259, "step": 598800 }, { "epoch": 80.69253570466182, "grad_norm": 0.1690053939819336, "learning_rate": 0.00033140962304260607, "loss": 0.5252, "step": 598900 }, { "epoch": 80.70600916195096, "grad_norm": 0.1867567002773285, "learning_rate": 0.00033137219677235847, "loss": 0.526, "step": 599000 }, { "epoch": 80.7194826192401, "grad_norm": 0.17334353923797607, "learning_rate": 0.00033133477050211087, "loss": 0.5262, "step": 599100 }, { "epoch": 80.73295607652923, "grad_norm": 0.18121610581874847, "learning_rate": 0.00033129734423186327, "loss": 0.5268, "step": 599200 }, { "epoch": 80.74642953381837, "grad_norm": 0.1888413280248642, "learning_rate": 0.00033125991796161566, "loss": 0.5261, "step": 599300 }, { "epoch": 80.75990299110752, "grad_norm": 0.1737643927335739, "learning_rate": 0.000331222491691368, "loss": 0.5264, "step": 599400 }, { "epoch": 80.77337644839666, "grad_norm": 0.17280001938343048, "learning_rate": 0.0003311850654211204, "loss": 0.5259, "step": 599500 }, { "epoch": 80.7868499056858, "grad_norm": 0.1767398864030838, "learning_rate": 0.0003311476391508728, "loss": 0.5245, "step": 599600 }, { "epoch": 80.80032336297494, "grad_norm": 0.16653932631015778, "learning_rate": 0.00033111021288062515, "loss": 0.5254, "step": 599700 }, { "epoch": 80.81379682026407, "grad_norm": 0.17850752174854279, "learning_rate": 0.00033107278661037755, "loss": 0.5264, "step": 599800 }, { "epoch": 80.82727027755323, "grad_norm": 0.1627688705921173, "learning_rate": 0.00033103536034012994, "loss": 0.5258, "step": 599900 }, { "epoch": 80.84074373484236, "grad_norm": 0.18089836835861206, "learning_rate": 0.00033099793406988234, "loss": 0.5264, "step": 600000 }, { "epoch": 80.8542171921315, "grad_norm": 0.18351466953754425, "learning_rate": 0.0003309605077996347, "loss": 0.5256, "step": 600100 }, { "epoch": 80.86769064942064, "grad_norm": 0.17412187159061432, "learning_rate": 0.0003309230815293871, "loss": 0.526, "step": 600200 }, { "epoch": 80.88116410670978, "grad_norm": 0.17181430757045746, "learning_rate": 0.0003308856552591395, "loss": 0.5265, "step": 600300 }, { "epoch": 80.89463756399893, "grad_norm": 0.17248058319091797, "learning_rate": 0.0003308482289888919, "loss": 0.5262, "step": 600400 }, { "epoch": 80.90811102128806, "grad_norm": 0.17402704060077667, "learning_rate": 0.0003308108027186443, "loss": 0.5257, "step": 600500 }, { "epoch": 80.9215844785772, "grad_norm": 0.17357997596263885, "learning_rate": 0.0003307733764483967, "loss": 0.5261, "step": 600600 }, { "epoch": 80.93505793586634, "grad_norm": 0.178225576877594, "learning_rate": 0.0003307359501781491, "loss": 0.5265, "step": 600700 }, { "epoch": 80.94853139315548, "grad_norm": 0.1838589757680893, "learning_rate": 0.0003306985239079015, "loss": 0.5252, "step": 600800 }, { "epoch": 80.96200485044463, "grad_norm": 0.17293983697891235, "learning_rate": 0.0003306610976376538, "loss": 0.5259, "step": 600900 }, { "epoch": 80.97547830773377, "grad_norm": 0.17370212078094482, "learning_rate": 0.0003306236713674062, "loss": 0.5266, "step": 601000 }, { "epoch": 80.9889517650229, "grad_norm": 0.15644042193889618, "learning_rate": 0.0003305862450971586, "loss": 0.5251, "step": 601100 }, { "epoch": 81.0, "eval_loss": 0.5145230293273926, "eval_runtime": 4.9562, "eval_samples_per_second": 1008.844, "eval_steps_per_second": 15.94, "step": 601182 }, { "epoch": 81.00242522231204, "grad_norm": 0.17471301555633545, "learning_rate": 0.000330548818826911, "loss": 0.5257, "step": 601200 }, { "epoch": 81.01589867960118, "grad_norm": 0.1983606517314911, "learning_rate": 0.0003305113925566634, "loss": 0.5252, "step": 601300 }, { "epoch": 81.02937213689033, "grad_norm": 0.1768524944782257, "learning_rate": 0.0003304739662864158, "loss": 0.5253, "step": 601400 }, { "epoch": 81.04284559417947, "grad_norm": 0.18000610172748566, "learning_rate": 0.00033043654001616815, "loss": 0.5253, "step": 601500 }, { "epoch": 81.0563190514686, "grad_norm": 0.18648026883602142, "learning_rate": 0.0003303991137459205, "loss": 0.5248, "step": 601600 }, { "epoch": 81.06979250875774, "grad_norm": 0.17322242259979248, "learning_rate": 0.0003303616874756729, "loss": 0.5246, "step": 601700 }, { "epoch": 81.08326596604688, "grad_norm": 0.17919810116291046, "learning_rate": 0.0003303242612054253, "loss": 0.5251, "step": 601800 }, { "epoch": 81.09673942333603, "grad_norm": 0.19162562489509583, "learning_rate": 0.0003302868349351777, "loss": 0.5256, "step": 601900 }, { "epoch": 81.11021288062517, "grad_norm": 0.19024991989135742, "learning_rate": 0.0003302494086649301, "loss": 0.5254, "step": 602000 }, { "epoch": 81.12368633791431, "grad_norm": 0.1680820733308792, "learning_rate": 0.0003302119823946825, "loss": 0.525, "step": 602100 }, { "epoch": 81.13715979520344, "grad_norm": 0.1769382655620575, "learning_rate": 0.0003301745561244349, "loss": 0.5262, "step": 602200 }, { "epoch": 81.1506332524926, "grad_norm": 0.17421066761016846, "learning_rate": 0.00033013712985418723, "loss": 0.5244, "step": 602300 }, { "epoch": 81.16410670978173, "grad_norm": 0.18113091588020325, "learning_rate": 0.00033009970358393963, "loss": 0.5243, "step": 602400 }, { "epoch": 81.17758016707087, "grad_norm": 0.17257197201251984, "learning_rate": 0.00033006227731369203, "loss": 0.5256, "step": 602500 }, { "epoch": 81.19105362436001, "grad_norm": 0.1716271936893463, "learning_rate": 0.0003300248510434444, "loss": 0.5252, "step": 602600 }, { "epoch": 81.20452708164915, "grad_norm": 0.18829648196697235, "learning_rate": 0.0003299874247731968, "loss": 0.5263, "step": 602700 }, { "epoch": 81.2180005389383, "grad_norm": 0.21031080186367035, "learning_rate": 0.0003299499985029492, "loss": 0.5256, "step": 602800 }, { "epoch": 81.23147399622744, "grad_norm": 0.17182008922100067, "learning_rate": 0.0003299125722327016, "loss": 0.5248, "step": 602900 }, { "epoch": 81.24494745351657, "grad_norm": 0.17262132465839386, "learning_rate": 0.000329875145962454, "loss": 0.5266, "step": 603000 }, { "epoch": 81.25842091080571, "grad_norm": 0.17039577662944794, "learning_rate": 0.00032983771969220636, "loss": 0.526, "step": 603100 }, { "epoch": 81.27189436809485, "grad_norm": 0.16798429191112518, "learning_rate": 0.00032980029342195876, "loss": 0.5262, "step": 603200 }, { "epoch": 81.285367825384, "grad_norm": 0.17778435349464417, "learning_rate": 0.0003297628671517111, "loss": 0.5249, "step": 603300 }, { "epoch": 81.29884128267314, "grad_norm": 0.17744915187358856, "learning_rate": 0.0003297254408814635, "loss": 0.5255, "step": 603400 }, { "epoch": 81.31231473996228, "grad_norm": 0.18954657018184662, "learning_rate": 0.0003296880146112159, "loss": 0.5252, "step": 603500 }, { "epoch": 81.32578819725141, "grad_norm": 0.19815713167190552, "learning_rate": 0.0003296505883409683, "loss": 0.5263, "step": 603600 }, { "epoch": 81.33926165454055, "grad_norm": 0.17860394716262817, "learning_rate": 0.0003296131620707207, "loss": 0.5256, "step": 603700 }, { "epoch": 81.3527351118297, "grad_norm": 0.18822070956230164, "learning_rate": 0.00032957573580047304, "loss": 0.5264, "step": 603800 }, { "epoch": 81.36620856911884, "grad_norm": 0.18098978698253632, "learning_rate": 0.00032953830953022544, "loss": 0.5254, "step": 603900 }, { "epoch": 81.37968202640798, "grad_norm": 0.18972449004650116, "learning_rate": 0.00032950088325997784, "loss": 0.5258, "step": 604000 }, { "epoch": 81.39315548369711, "grad_norm": 0.1987673044204712, "learning_rate": 0.00032946345698973024, "loss": 0.525, "step": 604100 }, { "epoch": 81.40662894098625, "grad_norm": 0.16944968700408936, "learning_rate": 0.00032942603071948264, "loss": 0.5253, "step": 604200 }, { "epoch": 81.4201023982754, "grad_norm": 0.20448020100593567, "learning_rate": 0.00032938860444923503, "loss": 0.5259, "step": 604300 }, { "epoch": 81.43357585556454, "grad_norm": 0.20044949650764465, "learning_rate": 0.00032935117817898743, "loss": 0.5246, "step": 604400 }, { "epoch": 81.44704931285368, "grad_norm": 0.18928442895412445, "learning_rate": 0.0003293137519087398, "loss": 0.5253, "step": 604500 }, { "epoch": 81.46052277014282, "grad_norm": 0.16801312565803528, "learning_rate": 0.0003292763256384922, "loss": 0.5257, "step": 604600 }, { "epoch": 81.47399622743195, "grad_norm": 0.16533108055591583, "learning_rate": 0.00032923889936824457, "loss": 0.5255, "step": 604700 }, { "epoch": 81.4874696847211, "grad_norm": 0.20434489846229553, "learning_rate": 0.00032920147309799697, "loss": 0.525, "step": 604800 }, { "epoch": 81.50094314201024, "grad_norm": 0.1891251504421234, "learning_rate": 0.00032916404682774937, "loss": 0.5257, "step": 604900 }, { "epoch": 81.51441659929938, "grad_norm": 0.17296354472637177, "learning_rate": 0.00032912662055750177, "loss": 0.5251, "step": 605000 }, { "epoch": 81.52789005658852, "grad_norm": 0.18480709195137024, "learning_rate": 0.0003290891942872541, "loss": 0.5262, "step": 605100 }, { "epoch": 81.54136351387766, "grad_norm": 0.19343584775924683, "learning_rate": 0.00032905176801700645, "loss": 0.5269, "step": 605200 }, { "epoch": 81.55483697116681, "grad_norm": 0.17708206176757812, "learning_rate": 0.00032901434174675885, "loss": 0.526, "step": 605300 }, { "epoch": 81.56831042845594, "grad_norm": 0.17085233330726624, "learning_rate": 0.00032897691547651125, "loss": 0.5253, "step": 605400 }, { "epoch": 81.58178388574508, "grad_norm": 0.18603108823299408, "learning_rate": 0.00032893948920626365, "loss": 0.5258, "step": 605500 }, { "epoch": 81.59525734303422, "grad_norm": 0.17961181700229645, "learning_rate": 0.00032890206293601605, "loss": 0.5254, "step": 605600 }, { "epoch": 81.60873080032336, "grad_norm": 0.18065768480300903, "learning_rate": 0.00032886463666576845, "loss": 0.526, "step": 605700 }, { "epoch": 81.62220425761251, "grad_norm": 0.19553224742412567, "learning_rate": 0.00032882721039552084, "loss": 0.5259, "step": 605800 }, { "epoch": 81.63567771490165, "grad_norm": 0.17129439115524292, "learning_rate": 0.00032878978412527324, "loss": 0.5262, "step": 605900 }, { "epoch": 81.64915117219078, "grad_norm": 0.18600261211395264, "learning_rate": 0.0003287523578550256, "loss": 0.5262, "step": 606000 }, { "epoch": 81.66262462947992, "grad_norm": 0.17212966084480286, "learning_rate": 0.000328714931584778, "loss": 0.5252, "step": 606100 }, { "epoch": 81.67609808676906, "grad_norm": 0.17572517693042755, "learning_rate": 0.0003286775053145304, "loss": 0.5245, "step": 606200 }, { "epoch": 81.68957154405821, "grad_norm": 0.16572438180446625, "learning_rate": 0.0003286400790442828, "loss": 0.5264, "step": 606300 }, { "epoch": 81.70304500134735, "grad_norm": 0.17394596338272095, "learning_rate": 0.0003286026527740352, "loss": 0.5252, "step": 606400 }, { "epoch": 81.71651845863649, "grad_norm": 0.172743558883667, "learning_rate": 0.0003285652265037876, "loss": 0.5259, "step": 606500 }, { "epoch": 81.72999191592562, "grad_norm": 0.18762463331222534, "learning_rate": 0.00032852780023354, "loss": 0.5254, "step": 606600 }, { "epoch": 81.74346537321476, "grad_norm": 0.1897730678319931, "learning_rate": 0.0003284903739632923, "loss": 0.5266, "step": 606700 }, { "epoch": 81.75693883050391, "grad_norm": 0.2053108811378479, "learning_rate": 0.0003284529476930447, "loss": 0.5266, "step": 606800 }, { "epoch": 81.77041228779305, "grad_norm": 0.1596875786781311, "learning_rate": 0.00032841552142279706, "loss": 0.526, "step": 606900 }, { "epoch": 81.78388574508219, "grad_norm": 0.17645113170146942, "learning_rate": 0.00032837809515254946, "loss": 0.5256, "step": 607000 }, { "epoch": 81.79735920237133, "grad_norm": 0.18176785111427307, "learning_rate": 0.00032834066888230186, "loss": 0.5257, "step": 607100 }, { "epoch": 81.81083265966046, "grad_norm": 0.1697119176387787, "learning_rate": 0.00032830324261205426, "loss": 0.5251, "step": 607200 }, { "epoch": 81.82430611694961, "grad_norm": 0.1663534939289093, "learning_rate": 0.00032826581634180666, "loss": 0.5252, "step": 607300 }, { "epoch": 81.83777957423875, "grad_norm": 0.18926745653152466, "learning_rate": 0.000328228390071559, "loss": 0.5242, "step": 607400 }, { "epoch": 81.85125303152789, "grad_norm": 0.19714398682117462, "learning_rate": 0.0003281909638013114, "loss": 0.5261, "step": 607500 }, { "epoch": 81.86472648881703, "grad_norm": 0.16625919938087463, "learning_rate": 0.0003281535375310638, "loss": 0.5259, "step": 607600 }, { "epoch": 81.87819994610616, "grad_norm": 0.1855078935623169, "learning_rate": 0.0003281161112608162, "loss": 0.5258, "step": 607700 }, { "epoch": 81.89167340339532, "grad_norm": 0.1719999760389328, "learning_rate": 0.0003280786849905686, "loss": 0.526, "step": 607800 }, { "epoch": 81.90514686068445, "grad_norm": 0.19600725173950195, "learning_rate": 0.000328041258720321, "loss": 0.5256, "step": 607900 }, { "epoch": 81.91862031797359, "grad_norm": 0.19010254740715027, "learning_rate": 0.0003280038324500734, "loss": 0.5262, "step": 608000 }, { "epoch": 81.93209377526273, "grad_norm": 0.19822683930397034, "learning_rate": 0.0003279664061798258, "loss": 0.5252, "step": 608100 }, { "epoch": 81.94556723255187, "grad_norm": 0.18395617604255676, "learning_rate": 0.00032792897990957813, "loss": 0.5259, "step": 608200 }, { "epoch": 81.95904068984102, "grad_norm": 0.1725267618894577, "learning_rate": 0.00032789155363933053, "loss": 0.5263, "step": 608300 }, { "epoch": 81.97251414713016, "grad_norm": 0.1810740977525711, "learning_rate": 0.00032785412736908293, "loss": 0.5259, "step": 608400 }, { "epoch": 81.9859876044193, "grad_norm": 0.16908736526966095, "learning_rate": 0.0003278167010988353, "loss": 0.5257, "step": 608500 }, { "epoch": 81.99946106170843, "grad_norm": 0.16614781320095062, "learning_rate": 0.0003277792748285877, "loss": 0.5271, "step": 608600 }, { "epoch": 82.0, "eval_loss": 0.5142410397529602, "eval_runtime": 4.9646, "eval_samples_per_second": 1007.133, "eval_steps_per_second": 15.913, "step": 608604 }, { "epoch": 82.01293451899757, "grad_norm": 0.18083705008029938, "learning_rate": 0.00032774184855834007, "loss": 0.5254, "step": 608700 }, { "epoch": 82.02640797628672, "grad_norm": 0.1719261109828949, "learning_rate": 0.00032770442228809247, "loss": 0.5243, "step": 608800 }, { "epoch": 82.03988143357586, "grad_norm": 0.18126773834228516, "learning_rate": 0.0003276669960178448, "loss": 0.5248, "step": 608900 }, { "epoch": 82.053354890865, "grad_norm": 0.1853313446044922, "learning_rate": 0.0003276295697475972, "loss": 0.5246, "step": 609000 }, { "epoch": 82.06682834815413, "grad_norm": 0.17043426632881165, "learning_rate": 0.0003275921434773496, "loss": 0.525, "step": 609100 }, { "epoch": 82.08030180544327, "grad_norm": 0.18065226078033447, "learning_rate": 0.000327554717207102, "loss": 0.5258, "step": 609200 }, { "epoch": 82.09377526273242, "grad_norm": 0.17719034850597382, "learning_rate": 0.0003275172909368544, "loss": 0.5245, "step": 609300 }, { "epoch": 82.10724872002156, "grad_norm": 0.1760374754667282, "learning_rate": 0.0003274798646666068, "loss": 0.5257, "step": 609400 }, { "epoch": 82.1207221773107, "grad_norm": 0.18159501254558563, "learning_rate": 0.0003274424383963592, "loss": 0.5253, "step": 609500 }, { "epoch": 82.13419563459983, "grad_norm": 0.18026389181613922, "learning_rate": 0.00032740501212611154, "loss": 0.5241, "step": 609600 }, { "epoch": 82.14766909188899, "grad_norm": 0.1831192672252655, "learning_rate": 0.00032736758585586394, "loss": 0.5259, "step": 609700 }, { "epoch": 82.16114254917812, "grad_norm": 0.19636943936347961, "learning_rate": 0.00032733015958561634, "loss": 0.5248, "step": 609800 }, { "epoch": 82.17461600646726, "grad_norm": 0.18468518555164337, "learning_rate": 0.00032729273331536874, "loss": 0.5244, "step": 609900 }, { "epoch": 82.1880894637564, "grad_norm": 0.1768549084663391, "learning_rate": 0.00032725530704512114, "loss": 0.5248, "step": 610000 }, { "epoch": 82.20156292104554, "grad_norm": 0.18220476806163788, "learning_rate": 0.00032721788077487353, "loss": 0.5251, "step": 610100 }, { "epoch": 82.21503637833469, "grad_norm": 0.17549683153629303, "learning_rate": 0.00032718045450462593, "loss": 0.5259, "step": 610200 }, { "epoch": 82.22850983562383, "grad_norm": 0.16259200870990753, "learning_rate": 0.0003271430282343783, "loss": 0.5249, "step": 610300 }, { "epoch": 82.24198329291296, "grad_norm": 0.16812103986740112, "learning_rate": 0.0003271056019641307, "loss": 0.5257, "step": 610400 }, { "epoch": 82.2554567502021, "grad_norm": 0.1878058910369873, "learning_rate": 0.000327068175693883, "loss": 0.5255, "step": 610500 }, { "epoch": 82.26893020749124, "grad_norm": 0.20485776662826538, "learning_rate": 0.0003270307494236354, "loss": 0.5246, "step": 610600 }, { "epoch": 82.28240366478039, "grad_norm": 0.17950963973999023, "learning_rate": 0.0003269933231533878, "loss": 0.5264, "step": 610700 }, { "epoch": 82.29587712206953, "grad_norm": 0.18534019589424133, "learning_rate": 0.0003269558968831402, "loss": 0.525, "step": 610800 }, { "epoch": 82.30935057935866, "grad_norm": 0.17760014533996582, "learning_rate": 0.0003269184706128926, "loss": 0.525, "step": 610900 }, { "epoch": 82.3228240366478, "grad_norm": 0.18400931358337402, "learning_rate": 0.000326881044342645, "loss": 0.5257, "step": 611000 }, { "epoch": 82.33629749393694, "grad_norm": 0.1849706918001175, "learning_rate": 0.00032684361807239735, "loss": 0.5251, "step": 611100 }, { "epoch": 82.34977095122609, "grad_norm": 0.18218305706977844, "learning_rate": 0.00032680619180214975, "loss": 0.5246, "step": 611200 }, { "epoch": 82.36324440851523, "grad_norm": 0.16818107664585114, "learning_rate": 0.00032676876553190215, "loss": 0.5258, "step": 611300 }, { "epoch": 82.37671786580437, "grad_norm": 0.1829637736082077, "learning_rate": 0.00032673133926165455, "loss": 0.526, "step": 611400 }, { "epoch": 82.3901913230935, "grad_norm": 0.2016962170600891, "learning_rate": 0.00032669391299140695, "loss": 0.5254, "step": 611500 }, { "epoch": 82.40366478038264, "grad_norm": 0.17560918629169464, "learning_rate": 0.00032665648672115935, "loss": 0.5259, "step": 611600 }, { "epoch": 82.4171382376718, "grad_norm": 0.1746102124452591, "learning_rate": 0.00032661906045091174, "loss": 0.525, "step": 611700 }, { "epoch": 82.43061169496093, "grad_norm": 0.174470916390419, "learning_rate": 0.0003265816341806641, "loss": 0.5255, "step": 611800 }, { "epoch": 82.44408515225007, "grad_norm": 0.17255811393260956, "learning_rate": 0.0003265442079104165, "loss": 0.5254, "step": 611900 }, { "epoch": 82.4575586095392, "grad_norm": 0.1805427074432373, "learning_rate": 0.0003265067816401689, "loss": 0.5251, "step": 612000 }, { "epoch": 82.47103206682834, "grad_norm": 0.17377051711082458, "learning_rate": 0.0003264693553699213, "loss": 0.5256, "step": 612100 }, { "epoch": 82.4845055241175, "grad_norm": 0.18164682388305664, "learning_rate": 0.0003264319290996737, "loss": 0.5252, "step": 612200 }, { "epoch": 82.49797898140663, "grad_norm": 0.19084039330482483, "learning_rate": 0.000326394502829426, "loss": 0.5261, "step": 612300 }, { "epoch": 82.51145243869577, "grad_norm": 0.16499996185302734, "learning_rate": 0.0003263570765591784, "loss": 0.5266, "step": 612400 }, { "epoch": 82.52492589598491, "grad_norm": 0.2623251676559448, "learning_rate": 0.00032631965028893077, "loss": 0.5257, "step": 612500 }, { "epoch": 82.53839935327404, "grad_norm": 0.17727480828762054, "learning_rate": 0.00032628222401868317, "loss": 0.5249, "step": 612600 }, { "epoch": 82.5518728105632, "grad_norm": 0.18288792669773102, "learning_rate": 0.00032624479774843556, "loss": 0.5253, "step": 612700 }, { "epoch": 82.56534626785233, "grad_norm": 0.1694965958595276, "learning_rate": 0.00032620737147818796, "loss": 0.5255, "step": 612800 }, { "epoch": 82.57881972514147, "grad_norm": 0.17703893780708313, "learning_rate": 0.00032616994520794036, "loss": 0.5256, "step": 612900 }, { "epoch": 82.59229318243061, "grad_norm": 0.17571516335010529, "learning_rate": 0.00032613251893769276, "loss": 0.525, "step": 613000 }, { "epoch": 82.60576663971975, "grad_norm": 0.17852595448493958, "learning_rate": 0.00032609509266744516, "loss": 0.525, "step": 613100 }, { "epoch": 82.6192400970089, "grad_norm": 0.1800220012664795, "learning_rate": 0.00032605766639719755, "loss": 0.5251, "step": 613200 }, { "epoch": 82.63271355429804, "grad_norm": 0.17791509628295898, "learning_rate": 0.0003260202401269499, "loss": 0.5258, "step": 613300 }, { "epoch": 82.64618701158717, "grad_norm": 0.16795755922794342, "learning_rate": 0.0003259828138567023, "loss": 0.526, "step": 613400 }, { "epoch": 82.65966046887631, "grad_norm": 0.2234017252922058, "learning_rate": 0.0003259453875864547, "loss": 0.5264, "step": 613500 }, { "epoch": 82.67313392616545, "grad_norm": 0.17574714124202728, "learning_rate": 0.0003259079613162071, "loss": 0.5262, "step": 613600 }, { "epoch": 82.6866073834546, "grad_norm": 0.17825762927532196, "learning_rate": 0.0003258705350459595, "loss": 0.5261, "step": 613700 }, { "epoch": 82.70008084074374, "grad_norm": 0.17810530960559845, "learning_rate": 0.0003258331087757119, "loss": 0.5259, "step": 613800 }, { "epoch": 82.71355429803288, "grad_norm": 0.16835454106330872, "learning_rate": 0.0003257956825054643, "loss": 0.5252, "step": 613900 }, { "epoch": 82.72702775532201, "grad_norm": 0.19643503427505493, "learning_rate": 0.00032575825623521663, "loss": 0.5259, "step": 614000 }, { "epoch": 82.74050121261115, "grad_norm": 0.1816793531179428, "learning_rate": 0.00032572082996496903, "loss": 0.5261, "step": 614100 }, { "epoch": 82.7539746699003, "grad_norm": 0.1815301924943924, "learning_rate": 0.0003256834036947214, "loss": 0.5246, "step": 614200 }, { "epoch": 82.76744812718944, "grad_norm": 0.18876560032367706, "learning_rate": 0.00032564597742447377, "loss": 0.525, "step": 614300 }, { "epoch": 82.78092158447858, "grad_norm": 0.16782154142856598, "learning_rate": 0.00032560855115422617, "loss": 0.5255, "step": 614400 }, { "epoch": 82.79439504176771, "grad_norm": 0.17986463010311127, "learning_rate": 0.00032557112488397857, "loss": 0.5256, "step": 614500 }, { "epoch": 82.80786849905685, "grad_norm": 0.17072710394859314, "learning_rate": 0.00032553369861373097, "loss": 0.525, "step": 614600 }, { "epoch": 82.821341956346, "grad_norm": 0.1871005743741989, "learning_rate": 0.0003254962723434833, "loss": 0.5259, "step": 614700 }, { "epoch": 82.83481541363514, "grad_norm": 0.18624424934387207, "learning_rate": 0.0003254588460732357, "loss": 0.5244, "step": 614800 }, { "epoch": 82.84828887092428, "grad_norm": 0.18850770592689514, "learning_rate": 0.0003254214198029881, "loss": 0.5256, "step": 614900 }, { "epoch": 82.86176232821342, "grad_norm": 0.17285622656345367, "learning_rate": 0.0003253839935327405, "loss": 0.5254, "step": 615000 }, { "epoch": 82.87523578550255, "grad_norm": 0.17334526777267456, "learning_rate": 0.0003253465672624929, "loss": 0.5253, "step": 615100 }, { "epoch": 82.8887092427917, "grad_norm": 0.18031227588653564, "learning_rate": 0.0003253091409922453, "loss": 0.5246, "step": 615200 }, { "epoch": 82.90218270008084, "grad_norm": 0.18659663200378418, "learning_rate": 0.0003252717147219977, "loss": 0.5255, "step": 615300 }, { "epoch": 82.91565615736998, "grad_norm": 0.16411511600017548, "learning_rate": 0.00032523428845175004, "loss": 0.526, "step": 615400 }, { "epoch": 82.92912961465912, "grad_norm": 0.1898762434720993, "learning_rate": 0.00032519686218150244, "loss": 0.5246, "step": 615500 }, { "epoch": 82.94260307194826, "grad_norm": 0.1937422901391983, "learning_rate": 0.00032515943591125484, "loss": 0.5248, "step": 615600 }, { "epoch": 82.95607652923741, "grad_norm": 0.18434768915176392, "learning_rate": 0.00032512200964100724, "loss": 0.5249, "step": 615700 }, { "epoch": 82.96954998652654, "grad_norm": 0.17934228479862213, "learning_rate": 0.00032508458337075964, "loss": 0.5253, "step": 615800 }, { "epoch": 82.98302344381568, "grad_norm": 0.18011043965816498, "learning_rate": 0.00032504715710051204, "loss": 0.5259, "step": 615900 }, { "epoch": 82.99649690110482, "grad_norm": 0.17227128148078918, "learning_rate": 0.0003250097308302644, "loss": 0.5253, "step": 616000 }, { "epoch": 83.0, "eval_loss": 0.5136860609054565, "eval_runtime": 4.9552, "eval_samples_per_second": 1009.05, "eval_steps_per_second": 15.943, "step": 616026 }, { "epoch": 83.00997035839396, "grad_norm": 0.17185840010643005, "learning_rate": 0.0003249723045600168, "loss": 0.5252, "step": 616100 }, { "epoch": 83.02344381568311, "grad_norm": 0.15817280113697052, "learning_rate": 0.0003249348782897691, "loss": 0.5239, "step": 616200 }, { "epoch": 83.03691727297225, "grad_norm": 0.18560537695884705, "learning_rate": 0.0003248974520195215, "loss": 0.5246, "step": 616300 }, { "epoch": 83.05039073026138, "grad_norm": 0.18400610983371735, "learning_rate": 0.0003248600257492739, "loss": 0.524, "step": 616400 }, { "epoch": 83.06386418755052, "grad_norm": 0.18046720325946808, "learning_rate": 0.0003248225994790263, "loss": 0.5238, "step": 616500 }, { "epoch": 83.07733764483966, "grad_norm": 0.1696176677942276, "learning_rate": 0.0003247851732087787, "loss": 0.5253, "step": 616600 }, { "epoch": 83.09081110212881, "grad_norm": 0.17994417250156403, "learning_rate": 0.0003247477469385311, "loss": 0.526, "step": 616700 }, { "epoch": 83.10428455941795, "grad_norm": 0.1732337772846222, "learning_rate": 0.0003247103206682835, "loss": 0.5251, "step": 616800 }, { "epoch": 83.11775801670709, "grad_norm": 0.1832607239484787, "learning_rate": 0.00032467289439803586, "loss": 0.5251, "step": 616900 }, { "epoch": 83.13123147399622, "grad_norm": 0.23149845004081726, "learning_rate": 0.00032463546812778825, "loss": 0.524, "step": 617000 }, { "epoch": 83.14470493128536, "grad_norm": 0.20631112158298492, "learning_rate": 0.00032459804185754065, "loss": 0.5258, "step": 617100 }, { "epoch": 83.15817838857451, "grad_norm": 0.17107756435871124, "learning_rate": 0.00032456061558729305, "loss": 0.5246, "step": 617200 }, { "epoch": 83.17165184586365, "grad_norm": 0.16736598312854767, "learning_rate": 0.00032452318931704545, "loss": 0.525, "step": 617300 }, { "epoch": 83.18512530315279, "grad_norm": 0.17778652906417847, "learning_rate": 0.00032448576304679785, "loss": 0.5244, "step": 617400 }, { "epoch": 83.19859876044193, "grad_norm": 0.20845025777816772, "learning_rate": 0.00032444833677655025, "loss": 0.5262, "step": 617500 }, { "epoch": 83.21207221773108, "grad_norm": 0.18301443755626678, "learning_rate": 0.0003244109105063026, "loss": 0.5253, "step": 617600 }, { "epoch": 83.22554567502021, "grad_norm": 0.17165394127368927, "learning_rate": 0.000324373484236055, "loss": 0.5257, "step": 617700 }, { "epoch": 83.23901913230935, "grad_norm": 0.17963092029094696, "learning_rate": 0.00032433605796580733, "loss": 0.5247, "step": 617800 }, { "epoch": 83.25249258959849, "grad_norm": 0.2086455225944519, "learning_rate": 0.00032429863169555973, "loss": 0.5255, "step": 617900 }, { "epoch": 83.26596604688763, "grad_norm": 0.1760541796684265, "learning_rate": 0.00032426120542531213, "loss": 0.525, "step": 618000 }, { "epoch": 83.27943950417678, "grad_norm": 0.1952749639749527, "learning_rate": 0.0003242237791550645, "loss": 0.5254, "step": 618100 }, { "epoch": 83.29291296146592, "grad_norm": 0.17369024455547333, "learning_rate": 0.0003241863528848169, "loss": 0.5244, "step": 618200 }, { "epoch": 83.30638641875505, "grad_norm": 0.17533956468105316, "learning_rate": 0.00032414892661456927, "loss": 0.5257, "step": 618300 }, { "epoch": 83.31985987604419, "grad_norm": 0.17836733162403107, "learning_rate": 0.00032411150034432167, "loss": 0.5258, "step": 618400 }, { "epoch": 83.33333333333333, "grad_norm": 0.16901111602783203, "learning_rate": 0.00032407407407407406, "loss": 0.5246, "step": 618500 }, { "epoch": 83.34680679062248, "grad_norm": 0.1672687530517578, "learning_rate": 0.00032403664780382646, "loss": 0.5261, "step": 618600 }, { "epoch": 83.36028024791162, "grad_norm": 0.17507381737232208, "learning_rate": 0.00032399922153357886, "loss": 0.5249, "step": 618700 }, { "epoch": 83.37375370520076, "grad_norm": 0.1896631270647049, "learning_rate": 0.00032396179526333126, "loss": 0.5254, "step": 618800 }, { "epoch": 83.38722716248989, "grad_norm": 0.16482286155223846, "learning_rate": 0.00032392436899308366, "loss": 0.5255, "step": 618900 }, { "epoch": 83.40070061977903, "grad_norm": 0.19193804264068604, "learning_rate": 0.00032388694272283606, "loss": 0.5251, "step": 619000 }, { "epoch": 83.41417407706818, "grad_norm": 0.18249580264091492, "learning_rate": 0.0003238495164525884, "loss": 0.5244, "step": 619100 }, { "epoch": 83.42764753435732, "grad_norm": 0.1726117879152298, "learning_rate": 0.0003238120901823408, "loss": 0.5258, "step": 619200 }, { "epoch": 83.44112099164646, "grad_norm": 0.18416717648506165, "learning_rate": 0.0003237746639120932, "loss": 0.5258, "step": 619300 }, { "epoch": 83.4545944489356, "grad_norm": 0.1846969872713089, "learning_rate": 0.0003237372376418456, "loss": 0.5243, "step": 619400 }, { "epoch": 83.46806790622473, "grad_norm": 0.1739848554134369, "learning_rate": 0.000323699811371598, "loss": 0.5244, "step": 619500 }, { "epoch": 83.48154136351388, "grad_norm": 0.18679291009902954, "learning_rate": 0.00032366238510135034, "loss": 0.5249, "step": 619600 }, { "epoch": 83.49501482080302, "grad_norm": 0.1757814735174179, "learning_rate": 0.00032362495883110274, "loss": 0.5241, "step": 619700 }, { "epoch": 83.50848827809216, "grad_norm": 0.17353388667106628, "learning_rate": 0.0003235875325608551, "loss": 0.5257, "step": 619800 }, { "epoch": 83.5219617353813, "grad_norm": 0.1691889464855194, "learning_rate": 0.0003235501062906075, "loss": 0.5255, "step": 619900 }, { "epoch": 83.53543519267043, "grad_norm": 0.1814165711402893, "learning_rate": 0.0003235126800203599, "loss": 0.5252, "step": 620000 }, { "epoch": 83.54890864995959, "grad_norm": 0.20164614915847778, "learning_rate": 0.0003234752537501123, "loss": 0.5257, "step": 620100 }, { "epoch": 83.56238210724872, "grad_norm": 0.16609162092208862, "learning_rate": 0.00032343782747986467, "loss": 0.5253, "step": 620200 }, { "epoch": 83.57585556453786, "grad_norm": 0.18846997618675232, "learning_rate": 0.00032340040120961707, "loss": 0.5263, "step": 620300 }, { "epoch": 83.589329021827, "grad_norm": 0.17588414251804352, "learning_rate": 0.00032336297493936947, "loss": 0.5255, "step": 620400 }, { "epoch": 83.60280247911614, "grad_norm": 0.16797366738319397, "learning_rate": 0.0003233255486691218, "loss": 0.5252, "step": 620500 }, { "epoch": 83.61627593640529, "grad_norm": 0.170575350522995, "learning_rate": 0.0003232881223988742, "loss": 0.5246, "step": 620600 }, { "epoch": 83.62974939369442, "grad_norm": 0.1791813224554062, "learning_rate": 0.0003232506961286266, "loss": 0.5255, "step": 620700 }, { "epoch": 83.64322285098356, "grad_norm": 0.1709374040365219, "learning_rate": 0.000323213269858379, "loss": 0.5261, "step": 620800 }, { "epoch": 83.6566963082727, "grad_norm": 0.17402803897857666, "learning_rate": 0.0003231758435881314, "loss": 0.5257, "step": 620900 }, { "epoch": 83.67016976556184, "grad_norm": 0.17131607234477997, "learning_rate": 0.0003231384173178838, "loss": 0.5249, "step": 621000 }, { "epoch": 83.68364322285099, "grad_norm": 0.16851310431957245, "learning_rate": 0.0003231009910476362, "loss": 0.5257, "step": 621100 }, { "epoch": 83.69711668014013, "grad_norm": 0.16777905821800232, "learning_rate": 0.0003230635647773886, "loss": 0.5252, "step": 621200 }, { "epoch": 83.71059013742926, "grad_norm": 0.17471781373023987, "learning_rate": 0.00032302613850714094, "loss": 0.5262, "step": 621300 }, { "epoch": 83.7240635947184, "grad_norm": 0.19219553470611572, "learning_rate": 0.0003229887122368933, "loss": 0.5259, "step": 621400 }, { "epoch": 83.73753705200754, "grad_norm": 0.17863516509532928, "learning_rate": 0.0003229512859666457, "loss": 0.5242, "step": 621500 }, { "epoch": 83.75101050929669, "grad_norm": 0.18359573185443878, "learning_rate": 0.0003229138596963981, "loss": 0.5265, "step": 621600 }, { "epoch": 83.76448396658583, "grad_norm": 0.20578452944755554, "learning_rate": 0.0003228764334261505, "loss": 0.5244, "step": 621700 }, { "epoch": 83.77795742387497, "grad_norm": 0.18617568910121918, "learning_rate": 0.0003228390071559029, "loss": 0.5247, "step": 621800 }, { "epoch": 83.7914308811641, "grad_norm": 0.16746394336223602, "learning_rate": 0.0003228015808856553, "loss": 0.5258, "step": 621900 }, { "epoch": 83.80490433845324, "grad_norm": 0.17124250531196594, "learning_rate": 0.0003227641546154076, "loss": 0.525, "step": 622000 }, { "epoch": 83.81837779574239, "grad_norm": 0.16704684495925903, "learning_rate": 0.00032272672834516, "loss": 0.5248, "step": 622100 }, { "epoch": 83.83185125303153, "grad_norm": 0.1776348054409027, "learning_rate": 0.0003226893020749124, "loss": 0.5244, "step": 622200 }, { "epoch": 83.84532471032067, "grad_norm": 0.17771905660629272, "learning_rate": 0.0003226518758046648, "loss": 0.5249, "step": 622300 }, { "epoch": 83.8587981676098, "grad_norm": 0.19434261322021484, "learning_rate": 0.0003226144495344172, "loss": 0.5243, "step": 622400 }, { "epoch": 83.87227162489894, "grad_norm": 0.1623610109090805, "learning_rate": 0.0003225770232641696, "loss": 0.5253, "step": 622500 }, { "epoch": 83.8857450821881, "grad_norm": 0.17533862590789795, "learning_rate": 0.000322539596993922, "loss": 0.5239, "step": 622600 }, { "epoch": 83.89921853947723, "grad_norm": 0.17957176268100739, "learning_rate": 0.00032250217072367436, "loss": 0.5242, "step": 622700 }, { "epoch": 83.91269199676637, "grad_norm": 0.1686936318874359, "learning_rate": 0.00032246474445342676, "loss": 0.5254, "step": 622800 }, { "epoch": 83.9261654540555, "grad_norm": 0.17860493063926697, "learning_rate": 0.00032242731818317915, "loss": 0.5251, "step": 622900 }, { "epoch": 83.93963891134464, "grad_norm": 0.17402516305446625, "learning_rate": 0.00032238989191293155, "loss": 0.5269, "step": 623000 }, { "epoch": 83.9531123686338, "grad_norm": 0.20762982964515686, "learning_rate": 0.00032235246564268395, "loss": 0.5243, "step": 623100 }, { "epoch": 83.96658582592293, "grad_norm": 0.199662983417511, "learning_rate": 0.0003223150393724363, "loss": 0.5253, "step": 623200 }, { "epoch": 83.98005928321207, "grad_norm": 0.1754889041185379, "learning_rate": 0.0003222776131021887, "loss": 0.5248, "step": 623300 }, { "epoch": 83.99353274050121, "grad_norm": 0.1843092441558838, "learning_rate": 0.00032224018683194104, "loss": 0.5262, "step": 623400 }, { "epoch": 84.0, "eval_loss": 0.5137937664985657, "eval_runtime": 4.952, "eval_samples_per_second": 1009.702, "eval_steps_per_second": 15.953, "step": 623448 }, { "epoch": 84.00700619779035, "grad_norm": 0.1631525158882141, "learning_rate": 0.00032220276056169343, "loss": 0.5245, "step": 623500 }, { "epoch": 84.0204796550795, "grad_norm": 0.19435818493366241, "learning_rate": 0.00032216533429144583, "loss": 0.5247, "step": 623600 }, { "epoch": 84.03395311236864, "grad_norm": 0.17586258053779602, "learning_rate": 0.00032212790802119823, "loss": 0.525, "step": 623700 }, { "epoch": 84.04742656965777, "grad_norm": 0.19734880328178406, "learning_rate": 0.00032209048175095063, "loss": 0.5245, "step": 623800 }, { "epoch": 84.06090002694691, "grad_norm": 0.190974161028862, "learning_rate": 0.00032205305548070303, "loss": 0.5237, "step": 623900 }, { "epoch": 84.07437348423605, "grad_norm": 0.1876324713230133, "learning_rate": 0.0003220156292104554, "loss": 0.525, "step": 624000 }, { "epoch": 84.0878469415252, "grad_norm": 0.18822923302650452, "learning_rate": 0.0003219782029402078, "loss": 0.5238, "step": 624100 }, { "epoch": 84.10132039881434, "grad_norm": 0.1878012865781784, "learning_rate": 0.00032194077666996017, "loss": 0.5246, "step": 624200 }, { "epoch": 84.11479385610347, "grad_norm": 0.1669558882713318, "learning_rate": 0.00032190335039971257, "loss": 0.5238, "step": 624300 }, { "epoch": 84.12826731339261, "grad_norm": 0.1862858533859253, "learning_rate": 0.00032186592412946496, "loss": 0.5243, "step": 624400 }, { "epoch": 84.14174077068175, "grad_norm": 0.16173766553401947, "learning_rate": 0.00032182849785921736, "loss": 0.5257, "step": 624500 }, { "epoch": 84.1552142279709, "grad_norm": 0.15827752649784088, "learning_rate": 0.00032179107158896976, "loss": 0.5245, "step": 624600 }, { "epoch": 84.16868768526004, "grad_norm": 0.1830851286649704, "learning_rate": 0.00032175364531872216, "loss": 0.525, "step": 624700 }, { "epoch": 84.18216114254918, "grad_norm": 0.19096432626247406, "learning_rate": 0.00032171621904847456, "loss": 0.524, "step": 624800 }, { "epoch": 84.19563459983831, "grad_norm": 0.18399503827095032, "learning_rate": 0.0003216787927782269, "loss": 0.5241, "step": 624900 }, { "epoch": 84.20910805712747, "grad_norm": 0.16892164945602417, "learning_rate": 0.00032164136650797925, "loss": 0.5245, "step": 625000 }, { "epoch": 84.2225815144166, "grad_norm": 0.1608998328447342, "learning_rate": 0.00032160394023773164, "loss": 0.5249, "step": 625100 }, { "epoch": 84.23605497170574, "grad_norm": 0.17938201129436493, "learning_rate": 0.00032156651396748404, "loss": 0.5245, "step": 625200 }, { "epoch": 84.24952842899488, "grad_norm": 0.16064997017383575, "learning_rate": 0.00032152908769723644, "loss": 0.5251, "step": 625300 }, { "epoch": 84.26300188628402, "grad_norm": 0.19701489806175232, "learning_rate": 0.00032149166142698884, "loss": 0.5245, "step": 625400 }, { "epoch": 84.27647534357317, "grad_norm": 0.20850777626037598, "learning_rate": 0.00032145423515674124, "loss": 0.5246, "step": 625500 }, { "epoch": 84.2899488008623, "grad_norm": 0.1682354211807251, "learning_rate": 0.0003214168088864936, "loss": 0.5246, "step": 625600 }, { "epoch": 84.30342225815144, "grad_norm": 0.17666421830654144, "learning_rate": 0.000321379382616246, "loss": 0.5262, "step": 625700 }, { "epoch": 84.31689571544058, "grad_norm": 0.1711249053478241, "learning_rate": 0.0003213419563459984, "loss": 0.5249, "step": 625800 }, { "epoch": 84.33036917272972, "grad_norm": 0.17120441794395447, "learning_rate": 0.0003213045300757508, "loss": 0.5247, "step": 625900 }, { "epoch": 84.34384263001887, "grad_norm": 0.2095709592103958, "learning_rate": 0.0003212671038055032, "loss": 0.5253, "step": 626000 }, { "epoch": 84.357316087308, "grad_norm": 0.17378516495227814, "learning_rate": 0.00032122967753525557, "loss": 0.525, "step": 626100 }, { "epoch": 84.37078954459714, "grad_norm": 0.18005120754241943, "learning_rate": 0.00032119225126500797, "loss": 0.5245, "step": 626200 }, { "epoch": 84.38426300188628, "grad_norm": 0.17523068189620972, "learning_rate": 0.00032115482499476037, "loss": 0.5257, "step": 626300 }, { "epoch": 84.39773645917542, "grad_norm": 0.17949169874191284, "learning_rate": 0.0003211173987245127, "loss": 0.5244, "step": 626400 }, { "epoch": 84.41120991646457, "grad_norm": 0.18321385979652405, "learning_rate": 0.0003210799724542651, "loss": 0.5248, "step": 626500 }, { "epoch": 84.42468337375371, "grad_norm": 0.18388502299785614, "learning_rate": 0.0003210425461840175, "loss": 0.5248, "step": 626600 }, { "epoch": 84.43815683104285, "grad_norm": 0.17123845219612122, "learning_rate": 0.0003210051199137699, "loss": 0.5257, "step": 626700 }, { "epoch": 84.45163028833198, "grad_norm": 0.2012043595314026, "learning_rate": 0.00032096769364352225, "loss": 0.5252, "step": 626800 }, { "epoch": 84.46510374562112, "grad_norm": 0.17802776396274567, "learning_rate": 0.00032093026737327465, "loss": 0.5259, "step": 626900 }, { "epoch": 84.47857720291027, "grad_norm": 0.18161429464817047, "learning_rate": 0.00032089284110302705, "loss": 0.5242, "step": 627000 }, { "epoch": 84.49205066019941, "grad_norm": 0.17309492826461792, "learning_rate": 0.0003208554148327794, "loss": 0.5249, "step": 627100 }, { "epoch": 84.50552411748855, "grad_norm": 0.18384374678134918, "learning_rate": 0.0003208179885625318, "loss": 0.5247, "step": 627200 }, { "epoch": 84.51899757477769, "grad_norm": 0.192138209939003, "learning_rate": 0.0003207805622922842, "loss": 0.5253, "step": 627300 }, { "epoch": 84.53247103206682, "grad_norm": 0.17760252952575684, "learning_rate": 0.0003207431360220366, "loss": 0.5258, "step": 627400 }, { "epoch": 84.54594448935597, "grad_norm": 0.17258068919181824, "learning_rate": 0.000320705709751789, "loss": 0.524, "step": 627500 }, { "epoch": 84.55941794664511, "grad_norm": 0.17460624873638153, "learning_rate": 0.0003206682834815414, "loss": 0.5266, "step": 627600 }, { "epoch": 84.57289140393425, "grad_norm": 0.20928388833999634, "learning_rate": 0.0003206308572112938, "loss": 0.5251, "step": 627700 }, { "epoch": 84.58636486122339, "grad_norm": 0.1823420524597168, "learning_rate": 0.0003205934309410461, "loss": 0.5249, "step": 627800 }, { "epoch": 84.59983831851252, "grad_norm": 0.17410005629062653, "learning_rate": 0.0003205560046707985, "loss": 0.5259, "step": 627900 }, { "epoch": 84.61331177580168, "grad_norm": 0.1704656332731247, "learning_rate": 0.0003205185784005509, "loss": 0.5254, "step": 628000 }, { "epoch": 84.62678523309081, "grad_norm": 0.1753152459859848, "learning_rate": 0.0003204811521303033, "loss": 0.5257, "step": 628100 }, { "epoch": 84.64025869037995, "grad_norm": 0.17870837450027466, "learning_rate": 0.0003204437258600557, "loss": 0.5235, "step": 628200 }, { "epoch": 84.65373214766909, "grad_norm": 0.1778959333896637, "learning_rate": 0.0003204062995898081, "loss": 0.5249, "step": 628300 }, { "epoch": 84.66720560495823, "grad_norm": 0.20170004665851593, "learning_rate": 0.0003203688733195605, "loss": 0.526, "step": 628400 }, { "epoch": 84.68067906224738, "grad_norm": 0.18114228546619415, "learning_rate": 0.00032033144704931286, "loss": 0.5248, "step": 628500 }, { "epoch": 84.69415251953652, "grad_norm": 0.17744296789169312, "learning_rate": 0.0003202940207790652, "loss": 0.524, "step": 628600 }, { "epoch": 84.70762597682565, "grad_norm": 0.1895989179611206, "learning_rate": 0.0003202565945088176, "loss": 0.5247, "step": 628700 }, { "epoch": 84.72109943411479, "grad_norm": 0.165429025888443, "learning_rate": 0.00032021916823857, "loss": 0.5247, "step": 628800 }, { "epoch": 84.73457289140393, "grad_norm": 0.17535889148712158, "learning_rate": 0.0003201817419683224, "loss": 0.5251, "step": 628900 }, { "epoch": 84.74804634869308, "grad_norm": 0.16934718191623688, "learning_rate": 0.0003201443156980748, "loss": 0.5265, "step": 629000 }, { "epoch": 84.76151980598222, "grad_norm": 0.17353951930999756, "learning_rate": 0.0003201068894278272, "loss": 0.5253, "step": 629100 }, { "epoch": 84.77499326327136, "grad_norm": 0.17974790930747986, "learning_rate": 0.0003200694631575796, "loss": 0.5246, "step": 629200 }, { "epoch": 84.78846672056049, "grad_norm": 0.17150141298770905, "learning_rate": 0.00032003203688733194, "loss": 0.5254, "step": 629300 }, { "epoch": 84.80194017784963, "grad_norm": 0.17419105768203735, "learning_rate": 0.00031999461061708433, "loss": 0.5253, "step": 629400 }, { "epoch": 84.81541363513878, "grad_norm": 0.17561344802379608, "learning_rate": 0.00031995718434683673, "loss": 0.5255, "step": 629500 }, { "epoch": 84.82888709242792, "grad_norm": 0.2075549066066742, "learning_rate": 0.00031991975807658913, "loss": 0.5243, "step": 629600 }, { "epoch": 84.84236054971706, "grad_norm": 0.18386074900627136, "learning_rate": 0.00031988233180634153, "loss": 0.5244, "step": 629700 }, { "epoch": 84.8558340070062, "grad_norm": 0.16621564328670502, "learning_rate": 0.0003198449055360939, "loss": 0.5242, "step": 629800 }, { "epoch": 84.86930746429533, "grad_norm": 0.17556814849376678, "learning_rate": 0.0003198074792658463, "loss": 0.5237, "step": 629900 }, { "epoch": 84.88278092158448, "grad_norm": 0.16154924035072327, "learning_rate": 0.00031977005299559867, "loss": 0.5246, "step": 630000 }, { "epoch": 84.89625437887362, "grad_norm": 0.16674213111400604, "learning_rate": 0.00031973262672535107, "loss": 0.5249, "step": 630100 }, { "epoch": 84.90972783616276, "grad_norm": 0.16908438503742218, "learning_rate": 0.00031969520045510347, "loss": 0.526, "step": 630200 }, { "epoch": 84.9232012934519, "grad_norm": 0.1936819851398468, "learning_rate": 0.00031965777418485586, "loss": 0.5254, "step": 630300 }, { "epoch": 84.93667475074103, "grad_norm": 0.19563324749469757, "learning_rate": 0.00031962034791460826, "loss": 0.5255, "step": 630400 }, { "epoch": 84.95014820803019, "grad_norm": 0.2072954624891281, "learning_rate": 0.0003195829216443606, "loss": 0.5254, "step": 630500 }, { "epoch": 84.96362166531932, "grad_norm": 0.17326579988002777, "learning_rate": 0.000319545495374113, "loss": 0.5249, "step": 630600 }, { "epoch": 84.97709512260846, "grad_norm": 0.16581010818481445, "learning_rate": 0.00031950806910386535, "loss": 0.5253, "step": 630700 }, { "epoch": 84.9905685798976, "grad_norm": 0.16620996594429016, "learning_rate": 0.00031947064283361775, "loss": 0.5244, "step": 630800 }, { "epoch": 85.0, "eval_loss": 0.5138347744941711, "eval_runtime": 4.9525, "eval_samples_per_second": 1009.594, "eval_steps_per_second": 15.952, "step": 630870 }, { "epoch": 85.00404203718674, "grad_norm": 0.16222234070301056, "learning_rate": 0.00031943321656337014, "loss": 0.5247, "step": 630900 }, { "epoch": 85.01751549447589, "grad_norm": 0.17003203928470612, "learning_rate": 0.00031939579029312254, "loss": 0.523, "step": 631000 }, { "epoch": 85.03098895176502, "grad_norm": 0.18466681241989136, "learning_rate": 0.00031935836402287494, "loss": 0.5252, "step": 631100 }, { "epoch": 85.04446240905416, "grad_norm": 0.19903269410133362, "learning_rate": 0.00031932093775262734, "loss": 0.523, "step": 631200 }, { "epoch": 85.0579358663433, "grad_norm": 0.1804131120443344, "learning_rate": 0.00031928351148237974, "loss": 0.5239, "step": 631300 }, { "epoch": 85.07140932363244, "grad_norm": 0.16761088371276855, "learning_rate": 0.00031924608521213214, "loss": 0.5242, "step": 631400 }, { "epoch": 85.08488278092159, "grad_norm": 0.19078367948532104, "learning_rate": 0.0003192086589418845, "loss": 0.5246, "step": 631500 }, { "epoch": 85.09835623821073, "grad_norm": 0.17686964571475983, "learning_rate": 0.0003191712326716369, "loss": 0.5241, "step": 631600 }, { "epoch": 85.11182969549986, "grad_norm": 0.17401933670043945, "learning_rate": 0.0003191338064013893, "loss": 0.525, "step": 631700 }, { "epoch": 85.125303152789, "grad_norm": 0.1938088834285736, "learning_rate": 0.0003190963801311417, "loss": 0.5243, "step": 631800 }, { "epoch": 85.13877661007814, "grad_norm": 0.17691628634929657, "learning_rate": 0.00031905895386089407, "loss": 0.5244, "step": 631900 }, { "epoch": 85.15225006736729, "grad_norm": 0.21515311300754547, "learning_rate": 0.00031902152759064647, "loss": 0.5253, "step": 632000 }, { "epoch": 85.16572352465643, "grad_norm": 0.1689358502626419, "learning_rate": 0.00031898410132039887, "loss": 0.5241, "step": 632100 }, { "epoch": 85.17919698194557, "grad_norm": 0.21515443921089172, "learning_rate": 0.0003189466750501512, "loss": 0.5244, "step": 632200 }, { "epoch": 85.1926704392347, "grad_norm": 0.1931312382221222, "learning_rate": 0.00031890924877990356, "loss": 0.5237, "step": 632300 }, { "epoch": 85.20614389652386, "grad_norm": 0.18741953372955322, "learning_rate": 0.00031887182250965596, "loss": 0.5251, "step": 632400 }, { "epoch": 85.21961735381299, "grad_norm": 0.18364231288433075, "learning_rate": 0.00031883439623940835, "loss": 0.5259, "step": 632500 }, { "epoch": 85.23309081110213, "grad_norm": 0.18715298175811768, "learning_rate": 0.00031879696996916075, "loss": 0.5248, "step": 632600 }, { "epoch": 85.24656426839127, "grad_norm": 0.17073041200637817, "learning_rate": 0.00031875954369891315, "loss": 0.5249, "step": 632700 }, { "epoch": 85.2600377256804, "grad_norm": 0.1770680546760559, "learning_rate": 0.00031872211742866555, "loss": 0.5249, "step": 632800 }, { "epoch": 85.27351118296956, "grad_norm": 0.17612048983573914, "learning_rate": 0.0003186846911584179, "loss": 0.5242, "step": 632900 }, { "epoch": 85.2869846402587, "grad_norm": 0.2035418599843979, "learning_rate": 0.0003186472648881703, "loss": 0.5246, "step": 633000 }, { "epoch": 85.30045809754783, "grad_norm": 0.1945079118013382, "learning_rate": 0.0003186098386179227, "loss": 0.5249, "step": 633100 }, { "epoch": 85.31393155483697, "grad_norm": 0.1787368506193161, "learning_rate": 0.0003185724123476751, "loss": 0.5245, "step": 633200 }, { "epoch": 85.3274050121261, "grad_norm": 0.18296557664871216, "learning_rate": 0.0003185349860774275, "loss": 0.5253, "step": 633300 }, { "epoch": 85.34087846941526, "grad_norm": 0.17148138582706451, "learning_rate": 0.0003184975598071799, "loss": 0.5248, "step": 633400 }, { "epoch": 85.3543519267044, "grad_norm": 0.17409870028495789, "learning_rate": 0.0003184601335369323, "loss": 0.5245, "step": 633500 }, { "epoch": 85.36782538399353, "grad_norm": 0.18050552904605865, "learning_rate": 0.0003184227072666846, "loss": 0.525, "step": 633600 }, { "epoch": 85.38129884128267, "grad_norm": 0.18351095914840698, "learning_rate": 0.000318385280996437, "loss": 0.5244, "step": 633700 }, { "epoch": 85.39477229857181, "grad_norm": 0.17282089591026306, "learning_rate": 0.0003183478547261894, "loss": 0.5254, "step": 633800 }, { "epoch": 85.40824575586096, "grad_norm": 0.19246892631053925, "learning_rate": 0.0003183104284559418, "loss": 0.5254, "step": 633900 }, { "epoch": 85.4217192131501, "grad_norm": 0.19193905591964722, "learning_rate": 0.0003182730021856942, "loss": 0.5241, "step": 634000 }, { "epoch": 85.43519267043924, "grad_norm": 0.19397033751010895, "learning_rate": 0.00031823557591544656, "loss": 0.5239, "step": 634100 }, { "epoch": 85.44866612772837, "grad_norm": 0.17208139598369598, "learning_rate": 0.00031819814964519896, "loss": 0.5252, "step": 634200 }, { "epoch": 85.46213958501751, "grad_norm": 0.19689098000526428, "learning_rate": 0.00031816072337495136, "loss": 0.5258, "step": 634300 }, { "epoch": 85.47561304230666, "grad_norm": 0.18250831961631775, "learning_rate": 0.0003181232971047037, "loss": 0.5238, "step": 634400 }, { "epoch": 85.4890864995958, "grad_norm": 0.17202527821063995, "learning_rate": 0.0003180858708344561, "loss": 0.5247, "step": 634500 }, { "epoch": 85.50255995688494, "grad_norm": 0.21206504106521606, "learning_rate": 0.0003180484445642085, "loss": 0.5242, "step": 634600 }, { "epoch": 85.51603341417407, "grad_norm": 0.17534807324409485, "learning_rate": 0.0003180110182939609, "loss": 0.5249, "step": 634700 }, { "epoch": 85.52950687146321, "grad_norm": 0.2002815455198288, "learning_rate": 0.0003179735920237133, "loss": 0.5255, "step": 634800 }, { "epoch": 85.54298032875236, "grad_norm": 0.17787857353687286, "learning_rate": 0.0003179361657534657, "loss": 0.5248, "step": 634900 }, { "epoch": 85.5564537860415, "grad_norm": 0.16757109761238098, "learning_rate": 0.0003178987394832181, "loss": 0.5255, "step": 635000 }, { "epoch": 85.56992724333064, "grad_norm": 0.1730855405330658, "learning_rate": 0.00031786131321297044, "loss": 0.5254, "step": 635100 }, { "epoch": 85.58340070061978, "grad_norm": 0.19579915702342987, "learning_rate": 0.00031782388694272284, "loss": 0.5252, "step": 635200 }, { "epoch": 85.59687415790891, "grad_norm": 0.18066026270389557, "learning_rate": 0.00031778646067247523, "loss": 0.5263, "step": 635300 }, { "epoch": 85.61034761519807, "grad_norm": 0.17179277539253235, "learning_rate": 0.00031774903440222763, "loss": 0.5247, "step": 635400 }, { "epoch": 85.6238210724872, "grad_norm": 0.17876331508159637, "learning_rate": 0.00031771160813198003, "loss": 0.5245, "step": 635500 }, { "epoch": 85.63729452977634, "grad_norm": 0.17427976429462433, "learning_rate": 0.00031767418186173243, "loss": 0.5248, "step": 635600 }, { "epoch": 85.65076798706548, "grad_norm": 0.191936194896698, "learning_rate": 0.0003176367555914848, "loss": 0.524, "step": 635700 }, { "epoch": 85.66424144435462, "grad_norm": 0.17531774938106537, "learning_rate": 0.00031759932932123717, "loss": 0.525, "step": 635800 }, { "epoch": 85.67771490164377, "grad_norm": 0.16860218346118927, "learning_rate": 0.0003175619030509895, "loss": 0.5239, "step": 635900 }, { "epoch": 85.6911883589329, "grad_norm": 0.18736489117145538, "learning_rate": 0.0003175244767807419, "loss": 0.5245, "step": 636000 }, { "epoch": 85.70466181622204, "grad_norm": 0.18515194952487946, "learning_rate": 0.0003174870505104943, "loss": 0.5244, "step": 636100 }, { "epoch": 85.71813527351118, "grad_norm": 0.17708969116210938, "learning_rate": 0.0003174496242402467, "loss": 0.5259, "step": 636200 }, { "epoch": 85.73160873080032, "grad_norm": 0.1918165683746338, "learning_rate": 0.0003174121979699991, "loss": 0.5249, "step": 636300 }, { "epoch": 85.74508218808947, "grad_norm": 0.1765337586402893, "learning_rate": 0.0003173747716997515, "loss": 0.5246, "step": 636400 }, { "epoch": 85.7585556453786, "grad_norm": 0.19897343218326569, "learning_rate": 0.00031733734542950385, "loss": 0.5254, "step": 636500 }, { "epoch": 85.77202910266774, "grad_norm": 0.196843683719635, "learning_rate": 0.00031729991915925625, "loss": 0.5253, "step": 636600 }, { "epoch": 85.78550255995688, "grad_norm": 0.17494142055511475, "learning_rate": 0.00031726249288900865, "loss": 0.5259, "step": 636700 }, { "epoch": 85.79897601724602, "grad_norm": 0.17251402139663696, "learning_rate": 0.00031722506661876104, "loss": 0.5247, "step": 636800 }, { "epoch": 85.81244947453517, "grad_norm": 0.17681759595870972, "learning_rate": 0.00031718764034851344, "loss": 0.5242, "step": 636900 }, { "epoch": 85.82592293182431, "grad_norm": 0.1901988685131073, "learning_rate": 0.00031715021407826584, "loss": 0.5243, "step": 637000 }, { "epoch": 85.83939638911345, "grad_norm": 0.1782391220331192, "learning_rate": 0.00031711278780801824, "loss": 0.5249, "step": 637100 }, { "epoch": 85.85286984640258, "grad_norm": 0.18923629820346832, "learning_rate": 0.00031707536153777064, "loss": 0.5244, "step": 637200 }, { "epoch": 85.86634330369172, "grad_norm": 0.17704296112060547, "learning_rate": 0.000317037935267523, "loss": 0.5243, "step": 637300 }, { "epoch": 85.87981676098087, "grad_norm": 0.17573584616184235, "learning_rate": 0.0003170005089972754, "loss": 0.5247, "step": 637400 }, { "epoch": 85.89329021827001, "grad_norm": 0.18796662986278534, "learning_rate": 0.0003169630827270278, "loss": 0.5248, "step": 637500 }, { "epoch": 85.90676367555915, "grad_norm": 0.17710302770137787, "learning_rate": 0.0003169256564567802, "loss": 0.5248, "step": 637600 }, { "epoch": 85.92023713284829, "grad_norm": 0.17643769085407257, "learning_rate": 0.0003168882301865325, "loss": 0.5251, "step": 637700 }, { "epoch": 85.93371059013742, "grad_norm": 0.18521462380886078, "learning_rate": 0.0003168508039162849, "loss": 0.5249, "step": 637800 }, { "epoch": 85.94718404742657, "grad_norm": 0.1756461262702942, "learning_rate": 0.0003168133776460373, "loss": 0.5253, "step": 637900 }, { "epoch": 85.96065750471571, "grad_norm": 0.2123958170413971, "learning_rate": 0.00031677595137578966, "loss": 0.5242, "step": 638000 }, { "epoch": 85.97413096200485, "grad_norm": 0.18216776847839355, "learning_rate": 0.00031673852510554206, "loss": 0.5247, "step": 638100 }, { "epoch": 85.98760441929399, "grad_norm": 0.19412323832511902, "learning_rate": 0.00031670109883529446, "loss": 0.5239, "step": 638200 }, { "epoch": 86.0, "eval_loss": 0.5134301781654358, "eval_runtime": 4.9525, "eval_samples_per_second": 1009.596, "eval_steps_per_second": 15.952, "step": 638292 }, { "epoch": 86.00107787658312, "grad_norm": 0.17572519183158875, "learning_rate": 0.00031666367256504685, "loss": 0.5247, "step": 638300 }, { "epoch": 86.01455133387228, "grad_norm": 0.16446489095687866, "learning_rate": 0.00031662624629479925, "loss": 0.5238, "step": 638400 }, { "epoch": 86.02802479116141, "grad_norm": 0.18854770064353943, "learning_rate": 0.00031658882002455165, "loss": 0.5238, "step": 638500 }, { "epoch": 86.04149824845055, "grad_norm": 0.19018013775348663, "learning_rate": 0.00031655139375430405, "loss": 0.5238, "step": 638600 }, { "epoch": 86.05497170573969, "grad_norm": 0.18732920289039612, "learning_rate": 0.0003165139674840564, "loss": 0.5235, "step": 638700 }, { "epoch": 86.06844516302883, "grad_norm": 0.1692236065864563, "learning_rate": 0.0003164765412138088, "loss": 0.5245, "step": 638800 }, { "epoch": 86.08191862031798, "grad_norm": 0.18056418001651764, "learning_rate": 0.0003164391149435612, "loss": 0.5257, "step": 638900 }, { "epoch": 86.09539207760712, "grad_norm": 0.18813586235046387, "learning_rate": 0.0003164016886733136, "loss": 0.5233, "step": 639000 }, { "epoch": 86.10886553489625, "grad_norm": 0.18205402791500092, "learning_rate": 0.000316364262403066, "loss": 0.5237, "step": 639100 }, { "epoch": 86.12233899218539, "grad_norm": 0.19192513823509216, "learning_rate": 0.0003163268361328184, "loss": 0.5242, "step": 639200 }, { "epoch": 86.13581244947453, "grad_norm": 0.1703474521636963, "learning_rate": 0.0003162894098625708, "loss": 0.5242, "step": 639300 }, { "epoch": 86.14928590676368, "grad_norm": 0.18172642588615417, "learning_rate": 0.0003162519835923232, "loss": 0.5239, "step": 639400 }, { "epoch": 86.16275936405282, "grad_norm": 0.18514147400856018, "learning_rate": 0.00031621455732207547, "loss": 0.5244, "step": 639500 }, { "epoch": 86.17623282134196, "grad_norm": 0.17683954536914825, "learning_rate": 0.00031617713105182787, "loss": 0.5246, "step": 639600 }, { "epoch": 86.18970627863109, "grad_norm": 0.17687979340553284, "learning_rate": 0.00031613970478158027, "loss": 0.5249, "step": 639700 }, { "epoch": 86.20317973592023, "grad_norm": 0.1956094354391098, "learning_rate": 0.00031610227851133267, "loss": 0.5251, "step": 639800 }, { "epoch": 86.21665319320938, "grad_norm": 0.17664426565170288, "learning_rate": 0.00031606485224108506, "loss": 0.5246, "step": 639900 }, { "epoch": 86.23012665049852, "grad_norm": 0.17839829623699188, "learning_rate": 0.00031602742597083746, "loss": 0.5242, "step": 640000 }, { "epoch": 86.24360010778766, "grad_norm": 0.18463453650474548, "learning_rate": 0.00031598999970058986, "loss": 0.5239, "step": 640100 }, { "epoch": 86.2570735650768, "grad_norm": 0.16911724209785461, "learning_rate": 0.0003159525734303422, "loss": 0.5243, "step": 640200 }, { "epoch": 86.27054702236595, "grad_norm": 0.16346919536590576, "learning_rate": 0.0003159151471600946, "loss": 0.5248, "step": 640300 }, { "epoch": 86.28402047965508, "grad_norm": 0.17986489832401276, "learning_rate": 0.000315877720889847, "loss": 0.5239, "step": 640400 }, { "epoch": 86.29749393694422, "grad_norm": 0.19224020838737488, "learning_rate": 0.0003158402946195994, "loss": 0.5241, "step": 640500 }, { "epoch": 86.31096739423336, "grad_norm": 0.1936699002981186, "learning_rate": 0.0003158028683493518, "loss": 0.5239, "step": 640600 }, { "epoch": 86.3244408515225, "grad_norm": 0.18187753856182098, "learning_rate": 0.0003157654420791042, "loss": 0.5242, "step": 640700 }, { "epoch": 86.33791430881165, "grad_norm": 0.17693305015563965, "learning_rate": 0.0003157280158088566, "loss": 0.5235, "step": 640800 }, { "epoch": 86.35138776610079, "grad_norm": 0.18101656436920166, "learning_rate": 0.00031569058953860894, "loss": 0.5253, "step": 640900 }, { "epoch": 86.36486122338992, "grad_norm": 0.17028625309467316, "learning_rate": 0.00031565316326836134, "loss": 0.5248, "step": 641000 }, { "epoch": 86.37833468067906, "grad_norm": 0.18669983744621277, "learning_rate": 0.00031561573699811373, "loss": 0.5257, "step": 641100 }, { "epoch": 86.3918081379682, "grad_norm": 0.17807996273040771, "learning_rate": 0.00031557831072786613, "loss": 0.5247, "step": 641200 }, { "epoch": 86.40528159525735, "grad_norm": 0.16928820312023163, "learning_rate": 0.0003155408844576185, "loss": 0.5238, "step": 641300 }, { "epoch": 86.41875505254649, "grad_norm": 0.1752147227525711, "learning_rate": 0.0003155034581873709, "loss": 0.5238, "step": 641400 }, { "epoch": 86.43222850983562, "grad_norm": 0.19450895488262177, "learning_rate": 0.0003154660319171233, "loss": 0.524, "step": 641500 }, { "epoch": 86.44570196712476, "grad_norm": 0.17235665023326874, "learning_rate": 0.0003154286056468756, "loss": 0.5247, "step": 641600 }, { "epoch": 86.4591754244139, "grad_norm": 0.19433631002902985, "learning_rate": 0.000315391179376628, "loss": 0.524, "step": 641700 }, { "epoch": 86.47264888170305, "grad_norm": 0.18618181347846985, "learning_rate": 0.0003153537531063804, "loss": 0.525, "step": 641800 }, { "epoch": 86.48612233899219, "grad_norm": 0.17484858632087708, "learning_rate": 0.0003153163268361328, "loss": 0.5242, "step": 641900 }, { "epoch": 86.49959579628133, "grad_norm": 0.17318132519721985, "learning_rate": 0.0003152789005658852, "loss": 0.5249, "step": 642000 }, { "epoch": 86.51306925357046, "grad_norm": 0.19604943692684174, "learning_rate": 0.0003152414742956376, "loss": 0.525, "step": 642100 }, { "epoch": 86.5265427108596, "grad_norm": 0.18605369329452515, "learning_rate": 0.00031520404802539, "loss": 0.5244, "step": 642200 }, { "epoch": 86.54001616814875, "grad_norm": 0.18185929954051971, "learning_rate": 0.0003151666217551424, "loss": 0.5248, "step": 642300 }, { "epoch": 86.55348962543789, "grad_norm": 0.18475574254989624, "learning_rate": 0.00031512919548489475, "loss": 0.5251, "step": 642400 }, { "epoch": 86.56696308272703, "grad_norm": 0.1871904730796814, "learning_rate": 0.00031509176921464715, "loss": 0.5254, "step": 642500 }, { "epoch": 86.58043654001617, "grad_norm": 0.17537200450897217, "learning_rate": 0.00031505434294439955, "loss": 0.5255, "step": 642600 }, { "epoch": 86.5939099973053, "grad_norm": 0.1894635707139969, "learning_rate": 0.00031501691667415194, "loss": 0.5248, "step": 642700 }, { "epoch": 86.60738345459445, "grad_norm": 0.17952091991901398, "learning_rate": 0.00031497949040390434, "loss": 0.525, "step": 642800 }, { "epoch": 86.62085691188359, "grad_norm": 0.18400074541568756, "learning_rate": 0.00031494206413365674, "loss": 0.5247, "step": 642900 }, { "epoch": 86.63433036917273, "grad_norm": 0.18384712934494019, "learning_rate": 0.00031490463786340914, "loss": 0.5228, "step": 643000 }, { "epoch": 86.64780382646187, "grad_norm": 0.17953842878341675, "learning_rate": 0.00031486721159316143, "loss": 0.5242, "step": 643100 }, { "epoch": 86.661277283751, "grad_norm": 0.17626610398292542, "learning_rate": 0.0003148297853229138, "loss": 0.5241, "step": 643200 }, { "epoch": 86.67475074104016, "grad_norm": 0.18872688710689545, "learning_rate": 0.0003147923590526662, "loss": 0.5256, "step": 643300 }, { "epoch": 86.6882241983293, "grad_norm": 0.17578807473182678, "learning_rate": 0.0003147549327824186, "loss": 0.5248, "step": 643400 }, { "epoch": 86.70169765561843, "grad_norm": 0.18342025578022003, "learning_rate": 0.000314717506512171, "loss": 0.524, "step": 643500 }, { "epoch": 86.71517111290757, "grad_norm": 0.1841798573732376, "learning_rate": 0.0003146800802419234, "loss": 0.5244, "step": 643600 }, { "epoch": 86.7286445701967, "grad_norm": 0.18922290205955505, "learning_rate": 0.0003146426539716758, "loss": 0.5245, "step": 643700 }, { "epoch": 86.74211802748586, "grad_norm": 0.17717230319976807, "learning_rate": 0.00031460522770142816, "loss": 0.525, "step": 643800 }, { "epoch": 86.755591484775, "grad_norm": 0.18420617282390594, "learning_rate": 0.00031456780143118056, "loss": 0.5255, "step": 643900 }, { "epoch": 86.76906494206413, "grad_norm": 0.22953347861766815, "learning_rate": 0.00031453037516093296, "loss": 0.5243, "step": 644000 }, { "epoch": 86.78253839935327, "grad_norm": 0.19191263616085052, "learning_rate": 0.00031449294889068536, "loss": 0.5245, "step": 644100 }, { "epoch": 86.79601185664241, "grad_norm": 0.17641031742095947, "learning_rate": 0.00031445552262043775, "loss": 0.5245, "step": 644200 }, { "epoch": 86.80948531393156, "grad_norm": 0.1749146580696106, "learning_rate": 0.00031441809635019015, "loss": 0.5256, "step": 644300 }, { "epoch": 86.8229587712207, "grad_norm": 0.20035070180892944, "learning_rate": 0.00031438067007994255, "loss": 0.524, "step": 644400 }, { "epoch": 86.83643222850984, "grad_norm": 0.1709221750497818, "learning_rate": 0.00031434324380969495, "loss": 0.5247, "step": 644500 }, { "epoch": 86.84990568579897, "grad_norm": 0.18480101227760315, "learning_rate": 0.0003143058175394473, "loss": 0.5243, "step": 644600 }, { "epoch": 86.86337914308811, "grad_norm": 0.18094423413276672, "learning_rate": 0.0003142683912691997, "loss": 0.5254, "step": 644700 }, { "epoch": 86.87685260037726, "grad_norm": 0.18257375061511993, "learning_rate": 0.0003142309649989521, "loss": 0.5244, "step": 644800 }, { "epoch": 86.8903260576664, "grad_norm": 0.1767590343952179, "learning_rate": 0.00031419353872870443, "loss": 0.5238, "step": 644900 }, { "epoch": 86.90379951495554, "grad_norm": 0.1861894428730011, "learning_rate": 0.00031415611245845683, "loss": 0.5251, "step": 645000 }, { "epoch": 86.91727297224467, "grad_norm": 0.1885875165462494, "learning_rate": 0.00031411868618820923, "loss": 0.5238, "step": 645100 }, { "epoch": 86.93074642953381, "grad_norm": 0.16717788577079773, "learning_rate": 0.00031408125991796163, "loss": 0.525, "step": 645200 }, { "epoch": 86.94421988682296, "grad_norm": 0.17849616706371307, "learning_rate": 0.00031404383364771397, "loss": 0.5254, "step": 645300 }, { "epoch": 86.9576933441121, "grad_norm": 0.1919020265340805, "learning_rate": 0.00031400640737746637, "loss": 0.524, "step": 645400 }, { "epoch": 86.97116680140124, "grad_norm": 0.20212522149085999, "learning_rate": 0.00031396898110721877, "loss": 0.5243, "step": 645500 }, { "epoch": 86.98464025869038, "grad_norm": 0.18147900700569153, "learning_rate": 0.00031393155483697117, "loss": 0.5244, "step": 645600 }, { "epoch": 86.99811371597951, "grad_norm": 0.17871223390102386, "learning_rate": 0.00031389412856672357, "loss": 0.5246, "step": 645700 }, { "epoch": 87.0, "eval_loss": 0.5132917165756226, "eval_runtime": 4.9558, "eval_samples_per_second": 1008.909, "eval_steps_per_second": 15.941, "step": 645714 }, { "epoch": 87.01158717326867, "grad_norm": 0.2020510584115982, "learning_rate": 0.00031385670229647596, "loss": 0.5237, "step": 645800 }, { "epoch": 87.0250606305578, "grad_norm": 0.17934566736221313, "learning_rate": 0.00031381927602622836, "loss": 0.5243, "step": 645900 }, { "epoch": 87.03853408784694, "grad_norm": 0.21909019351005554, "learning_rate": 0.0003137818497559807, "loss": 0.524, "step": 646000 }, { "epoch": 87.05200754513608, "grad_norm": 0.17216932773590088, "learning_rate": 0.0003137444234857331, "loss": 0.5227, "step": 646100 }, { "epoch": 87.06548100242522, "grad_norm": 0.18134011328220367, "learning_rate": 0.0003137069972154855, "loss": 0.5242, "step": 646200 }, { "epoch": 87.07895445971437, "grad_norm": 0.1981343924999237, "learning_rate": 0.0003136695709452379, "loss": 0.5235, "step": 646300 }, { "epoch": 87.0924279170035, "grad_norm": 0.16881859302520752, "learning_rate": 0.0003136321446749903, "loss": 0.5237, "step": 646400 }, { "epoch": 87.10590137429264, "grad_norm": 0.18076275289058685, "learning_rate": 0.0003135947184047427, "loss": 0.5224, "step": 646500 }, { "epoch": 87.11937483158178, "grad_norm": 0.1773282289505005, "learning_rate": 0.0003135572921344951, "loss": 0.5246, "step": 646600 }, { "epoch": 87.13284828887092, "grad_norm": 0.1995941400527954, "learning_rate": 0.0003135198658642475, "loss": 0.5242, "step": 646700 }, { "epoch": 87.14632174616007, "grad_norm": 0.1830795258283615, "learning_rate": 0.0003134824395939998, "loss": 0.5242, "step": 646800 }, { "epoch": 87.1597952034492, "grad_norm": 0.18728387355804443, "learning_rate": 0.0003134450133237522, "loss": 0.5242, "step": 646900 }, { "epoch": 87.17326866073834, "grad_norm": 0.16196191310882568, "learning_rate": 0.0003134075870535046, "loss": 0.5241, "step": 647000 }, { "epoch": 87.18674211802748, "grad_norm": 0.18554091453552246, "learning_rate": 0.000313370160783257, "loss": 0.5237, "step": 647100 }, { "epoch": 87.20021557531662, "grad_norm": 0.19444464147090912, "learning_rate": 0.0003133327345130094, "loss": 0.5239, "step": 647200 }, { "epoch": 87.21368903260577, "grad_norm": 0.18407104909420013, "learning_rate": 0.0003132953082427618, "loss": 0.5247, "step": 647300 }, { "epoch": 87.22716248989491, "grad_norm": 0.1818556785583496, "learning_rate": 0.00031325788197251417, "loss": 0.5241, "step": 647400 }, { "epoch": 87.24063594718405, "grad_norm": 0.1936110556125641, "learning_rate": 0.0003132204557022665, "loss": 0.5238, "step": 647500 }, { "epoch": 87.25410940447318, "grad_norm": 0.17424429953098297, "learning_rate": 0.0003131830294320189, "loss": 0.5243, "step": 647600 }, { "epoch": 87.26758286176234, "grad_norm": 0.17011578381061554, "learning_rate": 0.0003131456031617713, "loss": 0.5253, "step": 647700 }, { "epoch": 87.28105631905147, "grad_norm": 0.1730143278837204, "learning_rate": 0.0003131081768915237, "loss": 0.5235, "step": 647800 }, { "epoch": 87.29452977634061, "grad_norm": 0.18638795614242554, "learning_rate": 0.0003130707506212761, "loss": 0.5255, "step": 647900 }, { "epoch": 87.30800323362975, "grad_norm": 0.18559981882572174, "learning_rate": 0.0003130333243510285, "loss": 0.5245, "step": 648000 }, { "epoch": 87.32147669091889, "grad_norm": 0.1848166435956955, "learning_rate": 0.0003129958980807809, "loss": 0.5241, "step": 648100 }, { "epoch": 87.33495014820804, "grad_norm": 0.17911119759082794, "learning_rate": 0.00031295847181053325, "loss": 0.5247, "step": 648200 }, { "epoch": 87.34842360549717, "grad_norm": 0.18627670407295227, "learning_rate": 0.00031292104554028565, "loss": 0.5236, "step": 648300 }, { "epoch": 87.36189706278631, "grad_norm": 0.1784815639257431, "learning_rate": 0.00031288361927003805, "loss": 0.5246, "step": 648400 }, { "epoch": 87.37537052007545, "grad_norm": 0.18922454118728638, "learning_rate": 0.00031284619299979044, "loss": 0.5235, "step": 648500 }, { "epoch": 87.38884397736459, "grad_norm": 0.17302563786506653, "learning_rate": 0.0003128087667295428, "loss": 0.5247, "step": 648600 }, { "epoch": 87.40231743465374, "grad_norm": 0.19116069376468658, "learning_rate": 0.0003127713404592952, "loss": 0.5242, "step": 648700 }, { "epoch": 87.41579089194288, "grad_norm": 0.1864149123430252, "learning_rate": 0.0003127339141890476, "loss": 0.5248, "step": 648800 }, { "epoch": 87.42926434923201, "grad_norm": 0.1913546323776245, "learning_rate": 0.00031269648791879993, "loss": 0.5239, "step": 648900 }, { "epoch": 87.44273780652115, "grad_norm": 0.1833522766828537, "learning_rate": 0.00031265906164855233, "loss": 0.524, "step": 649000 }, { "epoch": 87.45621126381029, "grad_norm": 0.17755405604839325, "learning_rate": 0.0003126216353783047, "loss": 0.5236, "step": 649100 }, { "epoch": 87.46968472109944, "grad_norm": 0.2004234343767166, "learning_rate": 0.0003125842091080571, "loss": 0.5263, "step": 649200 }, { "epoch": 87.48315817838858, "grad_norm": 0.1812882274389267, "learning_rate": 0.0003125467828378095, "loss": 0.5251, "step": 649300 }, { "epoch": 87.49663163567772, "grad_norm": 0.2136378139257431, "learning_rate": 0.0003125093565675619, "loss": 0.524, "step": 649400 }, { "epoch": 87.51010509296685, "grad_norm": 0.21527045965194702, "learning_rate": 0.0003124719302973143, "loss": 0.5242, "step": 649500 }, { "epoch": 87.52357855025599, "grad_norm": 0.17152146995067596, "learning_rate": 0.0003124345040270667, "loss": 0.5245, "step": 649600 }, { "epoch": 87.53705200754514, "grad_norm": 0.17270059883594513, "learning_rate": 0.00031239707775681906, "loss": 0.5251, "step": 649700 }, { "epoch": 87.55052546483428, "grad_norm": 0.17365245521068573, "learning_rate": 0.00031235965148657146, "loss": 0.5241, "step": 649800 }, { "epoch": 87.56399892212342, "grad_norm": 0.20427151024341583, "learning_rate": 0.00031232222521632386, "loss": 0.5242, "step": 649900 }, { "epoch": 87.57747237941255, "grad_norm": 0.16835802793502808, "learning_rate": 0.00031228479894607626, "loss": 0.5239, "step": 650000 }, { "epoch": 87.59094583670169, "grad_norm": 0.16418160498142242, "learning_rate": 0.00031224737267582865, "loss": 0.5243, "step": 650100 }, { "epoch": 87.60441929399084, "grad_norm": 0.1889192759990692, "learning_rate": 0.00031220994640558105, "loss": 0.5248, "step": 650200 }, { "epoch": 87.61789275127998, "grad_norm": 0.18046042323112488, "learning_rate": 0.00031217252013533345, "loss": 0.5243, "step": 650300 }, { "epoch": 87.63136620856912, "grad_norm": 0.17526017129421234, "learning_rate": 0.00031213509386508574, "loss": 0.5245, "step": 650400 }, { "epoch": 87.64483966585826, "grad_norm": 0.19057518243789673, "learning_rate": 0.00031209766759483814, "loss": 0.525, "step": 650500 }, { "epoch": 87.6583131231474, "grad_norm": 0.16627953946590424, "learning_rate": 0.00031206024132459054, "loss": 0.5249, "step": 650600 }, { "epoch": 87.67178658043655, "grad_norm": 0.1828208714723587, "learning_rate": 0.00031202281505434293, "loss": 0.5234, "step": 650700 }, { "epoch": 87.68526003772568, "grad_norm": 0.18470612168312073, "learning_rate": 0.00031198538878409533, "loss": 0.5246, "step": 650800 }, { "epoch": 87.69873349501482, "grad_norm": 0.20078806579113007, "learning_rate": 0.00031194796251384773, "loss": 0.524, "step": 650900 }, { "epoch": 87.71220695230396, "grad_norm": 0.16653142869472504, "learning_rate": 0.00031191053624360013, "loss": 0.5253, "step": 651000 }, { "epoch": 87.7256804095931, "grad_norm": 0.17412112653255463, "learning_rate": 0.0003118731099733525, "loss": 0.5234, "step": 651100 }, { "epoch": 87.73915386688225, "grad_norm": 0.1910969465970993, "learning_rate": 0.00031183568370310487, "loss": 0.5248, "step": 651200 }, { "epoch": 87.75262732417139, "grad_norm": 0.193672776222229, "learning_rate": 0.00031179825743285727, "loss": 0.5244, "step": 651300 }, { "epoch": 87.76610078146052, "grad_norm": 0.19432073831558228, "learning_rate": 0.00031176083116260967, "loss": 0.5234, "step": 651400 }, { "epoch": 87.77957423874966, "grad_norm": 0.1851307898759842, "learning_rate": 0.00031172340489236207, "loss": 0.5241, "step": 651500 }, { "epoch": 87.7930476960388, "grad_norm": 0.17093561589717865, "learning_rate": 0.00031168597862211446, "loss": 0.5245, "step": 651600 }, { "epoch": 87.80652115332795, "grad_norm": 0.18786172568798065, "learning_rate": 0.00031164855235186686, "loss": 0.5247, "step": 651700 }, { "epoch": 87.81999461061709, "grad_norm": 0.18444302678108215, "learning_rate": 0.0003116111260816192, "loss": 0.5234, "step": 651800 }, { "epoch": 87.83346806790622, "grad_norm": 0.16623622179031372, "learning_rate": 0.0003115736998113716, "loss": 0.5243, "step": 651900 }, { "epoch": 87.84694152519536, "grad_norm": 0.20925122499465942, "learning_rate": 0.000311536273541124, "loss": 0.5243, "step": 652000 }, { "epoch": 87.8604149824845, "grad_norm": 0.17751923203468323, "learning_rate": 0.0003114988472708764, "loss": 0.5246, "step": 652100 }, { "epoch": 87.87388843977365, "grad_norm": 0.1733647882938385, "learning_rate": 0.00031146142100062875, "loss": 0.524, "step": 652200 }, { "epoch": 87.88736189706279, "grad_norm": 0.20123735070228577, "learning_rate": 0.00031142399473038114, "loss": 0.5244, "step": 652300 }, { "epoch": 87.90083535435193, "grad_norm": 0.1917857825756073, "learning_rate": 0.00031138656846013354, "loss": 0.5243, "step": 652400 }, { "epoch": 87.91430881164106, "grad_norm": 0.18259091675281525, "learning_rate": 0.00031134914218988594, "loss": 0.525, "step": 652500 }, { "epoch": 87.9277822689302, "grad_norm": 0.18820995092391968, "learning_rate": 0.0003113117159196383, "loss": 0.5247, "step": 652600 }, { "epoch": 87.94125572621935, "grad_norm": 0.19120875000953674, "learning_rate": 0.0003112742896493907, "loss": 0.5248, "step": 652700 }, { "epoch": 87.95472918350849, "grad_norm": 0.17935536801815033, "learning_rate": 0.0003112368633791431, "loss": 0.5235, "step": 652800 }, { "epoch": 87.96820264079763, "grad_norm": 0.17052701115608215, "learning_rate": 0.0003111994371088955, "loss": 0.5248, "step": 652900 }, { "epoch": 87.98167609808677, "grad_norm": 0.18595632910728455, "learning_rate": 0.0003111620108386479, "loss": 0.5243, "step": 653000 }, { "epoch": 87.9951495553759, "grad_norm": 0.18954956531524658, "learning_rate": 0.0003111245845684003, "loss": 0.5246, "step": 653100 }, { "epoch": 88.0, "eval_loss": 0.5130694508552551, "eval_runtime": 4.9561, "eval_samples_per_second": 1008.855, "eval_steps_per_second": 15.94, "step": 653136 }, { "epoch": 88.00862301266505, "grad_norm": 0.17873449623584747, "learning_rate": 0.0003110871582981527, "loss": 0.5239, "step": 653200 }, { "epoch": 88.02209646995419, "grad_norm": 0.17209184169769287, "learning_rate": 0.000311049732027905, "loss": 0.5233, "step": 653300 }, { "epoch": 88.03556992724333, "grad_norm": 0.18026894330978394, "learning_rate": 0.0003110123057576574, "loss": 0.5232, "step": 653400 }, { "epoch": 88.04904338453247, "grad_norm": 0.1735898107290268, "learning_rate": 0.0003109748794874098, "loss": 0.5237, "step": 653500 }, { "epoch": 88.0625168418216, "grad_norm": 0.1752881556749344, "learning_rate": 0.0003109374532171622, "loss": 0.5223, "step": 653600 }, { "epoch": 88.07599029911076, "grad_norm": 0.18532757461071014, "learning_rate": 0.0003109000269469146, "loss": 0.5251, "step": 653700 }, { "epoch": 88.0894637563999, "grad_norm": 0.1726042479276657, "learning_rate": 0.000310862600676667, "loss": 0.5232, "step": 653800 }, { "epoch": 88.10293721368903, "grad_norm": 0.2051495760679245, "learning_rate": 0.0003108251744064194, "loss": 0.5243, "step": 653900 }, { "epoch": 88.11641067097817, "grad_norm": 0.18254457414150238, "learning_rate": 0.0003107877481361717, "loss": 0.5235, "step": 654000 }, { "epoch": 88.1298841282673, "grad_norm": 0.1738194078207016, "learning_rate": 0.0003107503218659241, "loss": 0.5224, "step": 654100 }, { "epoch": 88.14335758555646, "grad_norm": 0.17241716384887695, "learning_rate": 0.0003107128955956765, "loss": 0.5249, "step": 654200 }, { "epoch": 88.1568310428456, "grad_norm": 0.17666015028953552, "learning_rate": 0.0003106754693254289, "loss": 0.5248, "step": 654300 }, { "epoch": 88.17030450013473, "grad_norm": 0.1769765317440033, "learning_rate": 0.0003106380430551813, "loss": 0.5228, "step": 654400 }, { "epoch": 88.18377795742387, "grad_norm": 0.17598660290241241, "learning_rate": 0.0003106006167849337, "loss": 0.5233, "step": 654500 }, { "epoch": 88.19725141471301, "grad_norm": 0.1776646375656128, "learning_rate": 0.0003105631905146861, "loss": 0.5243, "step": 654600 }, { "epoch": 88.21072487200216, "grad_norm": 0.16499574482440948, "learning_rate": 0.0003105257642444385, "loss": 0.5242, "step": 654700 }, { "epoch": 88.2241983292913, "grad_norm": 0.17735162377357483, "learning_rate": 0.00031048833797419083, "loss": 0.5239, "step": 654800 }, { "epoch": 88.23767178658044, "grad_norm": 0.18439795076847076, "learning_rate": 0.0003104509117039432, "loss": 0.5237, "step": 654900 }, { "epoch": 88.25114524386957, "grad_norm": 0.18516860902309418, "learning_rate": 0.0003104134854336956, "loss": 0.5242, "step": 655000 }, { "epoch": 88.26461870115872, "grad_norm": 0.16721539199352264, "learning_rate": 0.000310376059163448, "loss": 0.5239, "step": 655100 }, { "epoch": 88.27809215844786, "grad_norm": 0.18117576837539673, "learning_rate": 0.0003103386328932004, "loss": 0.5247, "step": 655200 }, { "epoch": 88.291565615737, "grad_norm": 0.17620432376861572, "learning_rate": 0.0003103012066229528, "loss": 0.5242, "step": 655300 }, { "epoch": 88.30503907302614, "grad_norm": 0.19648203253746033, "learning_rate": 0.0003102637803527052, "loss": 0.5243, "step": 655400 }, { "epoch": 88.31851253031527, "grad_norm": 0.1669141948223114, "learning_rate": 0.00031022635408245756, "loss": 0.5235, "step": 655500 }, { "epoch": 88.33198598760443, "grad_norm": 0.1731535792350769, "learning_rate": 0.00031018892781220996, "loss": 0.524, "step": 655600 }, { "epoch": 88.34545944489356, "grad_norm": 0.18533740937709808, "learning_rate": 0.00031015150154196236, "loss": 0.5241, "step": 655700 }, { "epoch": 88.3589329021827, "grad_norm": 0.1831606924533844, "learning_rate": 0.0003101140752717147, "loss": 0.5236, "step": 655800 }, { "epoch": 88.37240635947184, "grad_norm": 0.1975221186876297, "learning_rate": 0.0003100766490014671, "loss": 0.524, "step": 655900 }, { "epoch": 88.38587981676098, "grad_norm": 0.18156130611896515, "learning_rate": 0.0003100392227312195, "loss": 0.525, "step": 656000 }, { "epoch": 88.39935327405013, "grad_norm": 0.18903182446956635, "learning_rate": 0.0003100017964609719, "loss": 0.5241, "step": 656100 }, { "epoch": 88.41282673133927, "grad_norm": 0.18571822345256805, "learning_rate": 0.00030996437019072424, "loss": 0.5246, "step": 656200 }, { "epoch": 88.4263001886284, "grad_norm": 0.17069150507450104, "learning_rate": 0.00030992694392047664, "loss": 0.5238, "step": 656300 }, { "epoch": 88.43977364591754, "grad_norm": 0.1767728477716446, "learning_rate": 0.00030988951765022904, "loss": 0.5232, "step": 656400 }, { "epoch": 88.45324710320668, "grad_norm": 0.16800053417682648, "learning_rate": 0.00030985209137998144, "loss": 0.5232, "step": 656500 }, { "epoch": 88.46672056049583, "grad_norm": 0.19801490008831024, "learning_rate": 0.00030981466510973383, "loss": 0.525, "step": 656600 }, { "epoch": 88.48019401778497, "grad_norm": 0.17687878012657166, "learning_rate": 0.00030977723883948623, "loss": 0.5242, "step": 656700 }, { "epoch": 88.4936674750741, "grad_norm": 0.1845872700214386, "learning_rate": 0.00030973981256923863, "loss": 0.5247, "step": 656800 }, { "epoch": 88.50714093236324, "grad_norm": 0.19672498106956482, "learning_rate": 0.000309702386298991, "loss": 0.5238, "step": 656900 }, { "epoch": 88.52061438965238, "grad_norm": 0.19268286228179932, "learning_rate": 0.0003096649600287434, "loss": 0.5256, "step": 657000 }, { "epoch": 88.53408784694153, "grad_norm": 0.18386025726795197, "learning_rate": 0.00030962753375849577, "loss": 0.5244, "step": 657100 }, { "epoch": 88.54756130423067, "grad_norm": 0.17457835376262665, "learning_rate": 0.00030959010748824817, "loss": 0.5225, "step": 657200 }, { "epoch": 88.5610347615198, "grad_norm": 0.18888908624649048, "learning_rate": 0.00030955268121800057, "loss": 0.5231, "step": 657300 }, { "epoch": 88.57450821880894, "grad_norm": 0.17668560147285461, "learning_rate": 0.00030951525494775297, "loss": 0.5238, "step": 657400 }, { "epoch": 88.58798167609808, "grad_norm": 0.18237993121147156, "learning_rate": 0.00030947782867750536, "loss": 0.524, "step": 657500 }, { "epoch": 88.60145513338723, "grad_norm": 0.18821614980697632, "learning_rate": 0.0003094404024072577, "loss": 0.5241, "step": 657600 }, { "epoch": 88.61492859067637, "grad_norm": 0.17333835363388062, "learning_rate": 0.00030940297613701005, "loss": 0.525, "step": 657700 }, { "epoch": 88.62840204796551, "grad_norm": 0.17775379121303558, "learning_rate": 0.00030936554986676245, "loss": 0.5247, "step": 657800 }, { "epoch": 88.64187550525465, "grad_norm": 0.19942671060562134, "learning_rate": 0.00030932812359651485, "loss": 0.5243, "step": 657900 }, { "epoch": 88.65534896254378, "grad_norm": 0.18038909137248993, "learning_rate": 0.00030929069732626725, "loss": 0.5228, "step": 658000 }, { "epoch": 88.66882241983294, "grad_norm": 0.1732431799173355, "learning_rate": 0.00030925327105601965, "loss": 0.5253, "step": 658100 }, { "epoch": 88.68229587712207, "grad_norm": 0.1945890337228775, "learning_rate": 0.00030921584478577204, "loss": 0.5233, "step": 658200 }, { "epoch": 88.69576933441121, "grad_norm": 0.17752304673194885, "learning_rate": 0.00030917841851552444, "loss": 0.5247, "step": 658300 }, { "epoch": 88.70924279170035, "grad_norm": 0.16458559036254883, "learning_rate": 0.0003091409922452768, "loss": 0.5227, "step": 658400 }, { "epoch": 88.72271624898949, "grad_norm": 0.19974178075790405, "learning_rate": 0.0003091035659750292, "loss": 0.5235, "step": 658500 }, { "epoch": 88.73618970627864, "grad_norm": 0.20073385536670685, "learning_rate": 0.0003090661397047816, "loss": 0.5242, "step": 658600 }, { "epoch": 88.74966316356777, "grad_norm": 0.2144029140472412, "learning_rate": 0.000309028713434534, "loss": 0.5248, "step": 658700 }, { "epoch": 88.76313662085691, "grad_norm": 0.16888567805290222, "learning_rate": 0.0003089912871642864, "loss": 0.5248, "step": 658800 }, { "epoch": 88.77661007814605, "grad_norm": 0.1943194568157196, "learning_rate": 0.0003089538608940388, "loss": 0.5241, "step": 658900 }, { "epoch": 88.79008353543519, "grad_norm": 0.1903180181980133, "learning_rate": 0.0003089164346237912, "loss": 0.5247, "step": 659000 }, { "epoch": 88.80355699272434, "grad_norm": 0.1803772747516632, "learning_rate": 0.0003088790083535435, "loss": 0.5247, "step": 659100 }, { "epoch": 88.81703045001348, "grad_norm": 0.17798301577568054, "learning_rate": 0.0003088415820832959, "loss": 0.5251, "step": 659200 }, { "epoch": 88.83050390730261, "grad_norm": 0.16757839918136597, "learning_rate": 0.0003088041558130483, "loss": 0.5238, "step": 659300 }, { "epoch": 88.84397736459175, "grad_norm": 0.17566944658756256, "learning_rate": 0.00030876672954280066, "loss": 0.5238, "step": 659400 }, { "epoch": 88.85745082188089, "grad_norm": 0.17961305379867554, "learning_rate": 0.00030872930327255306, "loss": 0.5242, "step": 659500 }, { "epoch": 88.87092427917004, "grad_norm": 0.1787859946489334, "learning_rate": 0.00030869187700230546, "loss": 0.5239, "step": 659600 }, { "epoch": 88.88439773645918, "grad_norm": 0.20082423090934753, "learning_rate": 0.00030865445073205785, "loss": 0.5247, "step": 659700 }, { "epoch": 88.89787119374832, "grad_norm": 0.17774510383605957, "learning_rate": 0.0003086170244618102, "loss": 0.5238, "step": 659800 }, { "epoch": 88.91134465103745, "grad_norm": 0.17574435472488403, "learning_rate": 0.0003085795981915626, "loss": 0.5247, "step": 659900 }, { "epoch": 88.92481810832659, "grad_norm": 0.19486728310585022, "learning_rate": 0.000308542171921315, "loss": 0.5251, "step": 660000 }, { "epoch": 88.93829156561574, "grad_norm": 0.1911998689174652, "learning_rate": 0.0003085047456510674, "loss": 0.5242, "step": 660100 }, { "epoch": 88.95176502290488, "grad_norm": 0.18171550333499908, "learning_rate": 0.0003084673193808198, "loss": 0.5233, "step": 660200 }, { "epoch": 88.96523848019402, "grad_norm": 0.1802893728017807, "learning_rate": 0.0003084298931105722, "loss": 0.5236, "step": 660300 }, { "epoch": 88.97871193748315, "grad_norm": 0.18126343190670013, "learning_rate": 0.0003083924668403246, "loss": 0.5246, "step": 660400 }, { "epoch": 88.99218539477229, "grad_norm": 0.16901229321956635, "learning_rate": 0.000308355040570077, "loss": 0.5241, "step": 660500 }, { "epoch": 89.0, "eval_loss": 0.5130789875984192, "eval_runtime": 4.9599, "eval_samples_per_second": 1008.083, "eval_steps_per_second": 15.928, "step": 660558 }, { "epoch": 89.00565885206144, "grad_norm": 0.18724672496318817, "learning_rate": 0.00030831761429982933, "loss": 0.524, "step": 660600 }, { "epoch": 89.01913230935058, "grad_norm": 0.18376463651657104, "learning_rate": 0.00030828018802958173, "loss": 0.5227, "step": 660700 }, { "epoch": 89.03260576663972, "grad_norm": 0.19445601105690002, "learning_rate": 0.0003082427617593341, "loss": 0.5238, "step": 660800 }, { "epoch": 89.04607922392886, "grad_norm": 0.2045118808746338, "learning_rate": 0.0003082053354890865, "loss": 0.5239, "step": 660900 }, { "epoch": 89.059552681218, "grad_norm": 0.1720426231622696, "learning_rate": 0.0003081679092188389, "loss": 0.5233, "step": 661000 }, { "epoch": 89.07302613850715, "grad_norm": 0.16608412563800812, "learning_rate": 0.0003081304829485913, "loss": 0.5245, "step": 661100 }, { "epoch": 89.08649959579628, "grad_norm": 0.18899571895599365, "learning_rate": 0.00030809305667834367, "loss": 0.5235, "step": 661200 }, { "epoch": 89.09997305308542, "grad_norm": 0.17177589237689972, "learning_rate": 0.000308055630408096, "loss": 0.5228, "step": 661300 }, { "epoch": 89.11344651037456, "grad_norm": 0.16562055051326752, "learning_rate": 0.0003080182041378484, "loss": 0.5236, "step": 661400 }, { "epoch": 89.1269199676637, "grad_norm": 0.16825488209724426, "learning_rate": 0.0003079807778676008, "loss": 0.5235, "step": 661500 }, { "epoch": 89.14039342495285, "grad_norm": 0.18139967322349548, "learning_rate": 0.0003079433515973532, "loss": 0.5235, "step": 661600 }, { "epoch": 89.15386688224199, "grad_norm": 0.18364834785461426, "learning_rate": 0.0003079059253271056, "loss": 0.5243, "step": 661700 }, { "epoch": 89.16734033953112, "grad_norm": 0.17470787465572357, "learning_rate": 0.000307868499056858, "loss": 0.5238, "step": 661800 }, { "epoch": 89.18081379682026, "grad_norm": 0.19538576900959015, "learning_rate": 0.0003078310727866104, "loss": 0.5238, "step": 661900 }, { "epoch": 89.1942872541094, "grad_norm": 0.17883415520191193, "learning_rate": 0.00030779364651636274, "loss": 0.5245, "step": 662000 }, { "epoch": 89.20776071139855, "grad_norm": 0.17098204791545868, "learning_rate": 0.00030775622024611514, "loss": 0.523, "step": 662100 }, { "epoch": 89.22123416868769, "grad_norm": 0.19858162105083466, "learning_rate": 0.00030771879397586754, "loss": 0.5237, "step": 662200 }, { "epoch": 89.23470762597682, "grad_norm": 0.20545166730880737, "learning_rate": 0.00030768136770561994, "loss": 0.5236, "step": 662300 }, { "epoch": 89.24818108326596, "grad_norm": 0.19109337031841278, "learning_rate": 0.00030764394143537234, "loss": 0.5243, "step": 662400 }, { "epoch": 89.2616545405551, "grad_norm": 0.18718485534191132, "learning_rate": 0.00030760651516512473, "loss": 0.5242, "step": 662500 }, { "epoch": 89.27512799784425, "grad_norm": 0.1821664273738861, "learning_rate": 0.00030756908889487713, "loss": 0.5248, "step": 662600 }, { "epoch": 89.28860145513339, "grad_norm": 0.17144852876663208, "learning_rate": 0.00030753166262462953, "loss": 0.5243, "step": 662700 }, { "epoch": 89.30207491242253, "grad_norm": 0.17681989073753357, "learning_rate": 0.0003074942363543819, "loss": 0.5233, "step": 662800 }, { "epoch": 89.31554836971166, "grad_norm": 0.1824473887681961, "learning_rate": 0.00030745681008413427, "loss": 0.5233, "step": 662900 }, { "epoch": 89.32902182700082, "grad_norm": 0.16992133855819702, "learning_rate": 0.00030741938381388667, "loss": 0.5222, "step": 663000 }, { "epoch": 89.34249528428995, "grad_norm": 0.20101132988929749, "learning_rate": 0.000307381957543639, "loss": 0.5237, "step": 663100 }, { "epoch": 89.35596874157909, "grad_norm": 0.1924501359462738, "learning_rate": 0.0003073445312733914, "loss": 0.5233, "step": 663200 }, { "epoch": 89.36944219886823, "grad_norm": 0.17193196713924408, "learning_rate": 0.0003073071050031438, "loss": 0.523, "step": 663300 }, { "epoch": 89.38291565615737, "grad_norm": 0.17880825698375702, "learning_rate": 0.0003072696787328962, "loss": 0.5237, "step": 663400 }, { "epoch": 89.39638911344652, "grad_norm": 0.18296732008457184, "learning_rate": 0.00030723225246264855, "loss": 0.5243, "step": 663500 }, { "epoch": 89.40986257073565, "grad_norm": 0.17693878710269928, "learning_rate": 0.00030719482619240095, "loss": 0.5233, "step": 663600 }, { "epoch": 89.42333602802479, "grad_norm": 0.19942018389701843, "learning_rate": 0.00030715739992215335, "loss": 0.5249, "step": 663700 }, { "epoch": 89.43680948531393, "grad_norm": 0.18195851147174835, "learning_rate": 0.00030711997365190575, "loss": 0.5245, "step": 663800 }, { "epoch": 89.45028294260307, "grad_norm": 0.19300803542137146, "learning_rate": 0.00030708254738165815, "loss": 0.524, "step": 663900 }, { "epoch": 89.46375639989222, "grad_norm": 0.216787189245224, "learning_rate": 0.00030704512111141054, "loss": 0.5242, "step": 664000 }, { "epoch": 89.47722985718136, "grad_norm": 0.1686127483844757, "learning_rate": 0.00030700769484116294, "loss": 0.524, "step": 664100 }, { "epoch": 89.4907033144705, "grad_norm": 0.17126381397247314, "learning_rate": 0.0003069702685709153, "loss": 0.524, "step": 664200 }, { "epoch": 89.50417677175963, "grad_norm": 0.182813823223114, "learning_rate": 0.0003069328423006677, "loss": 0.5237, "step": 664300 }, { "epoch": 89.51765022904877, "grad_norm": 0.19210080802440643, "learning_rate": 0.0003068954160304201, "loss": 0.5244, "step": 664400 }, { "epoch": 89.53112368633792, "grad_norm": 0.1837306171655655, "learning_rate": 0.0003068579897601725, "loss": 0.5243, "step": 664500 }, { "epoch": 89.54459714362706, "grad_norm": 0.20062874257564545, "learning_rate": 0.0003068205634899249, "loss": 0.5237, "step": 664600 }, { "epoch": 89.5580706009162, "grad_norm": 0.18497523665428162, "learning_rate": 0.0003067831372196773, "loss": 0.524, "step": 664700 }, { "epoch": 89.57154405820533, "grad_norm": 0.18935830891132355, "learning_rate": 0.0003067457109494297, "loss": 0.5239, "step": 664800 }, { "epoch": 89.58501751549447, "grad_norm": 0.1850636601448059, "learning_rate": 0.00030670828467918197, "loss": 0.5242, "step": 664900 }, { "epoch": 89.59849097278362, "grad_norm": 0.21705083549022675, "learning_rate": 0.00030667085840893436, "loss": 0.5244, "step": 665000 }, { "epoch": 89.61196443007276, "grad_norm": 0.20809312164783478, "learning_rate": 0.00030663343213868676, "loss": 0.5238, "step": 665100 }, { "epoch": 89.6254378873619, "grad_norm": 0.16991816461086273, "learning_rate": 0.00030659600586843916, "loss": 0.5233, "step": 665200 }, { "epoch": 89.63891134465104, "grad_norm": 0.16637155413627625, "learning_rate": 0.00030655857959819156, "loss": 0.5249, "step": 665300 }, { "epoch": 89.65238480194017, "grad_norm": 0.16870170831680298, "learning_rate": 0.00030652115332794396, "loss": 0.5236, "step": 665400 }, { "epoch": 89.66585825922932, "grad_norm": 0.18139073252677917, "learning_rate": 0.00030648372705769636, "loss": 0.5245, "step": 665500 }, { "epoch": 89.67933171651846, "grad_norm": 0.18917733430862427, "learning_rate": 0.00030644630078744875, "loss": 0.5237, "step": 665600 }, { "epoch": 89.6928051738076, "grad_norm": 0.18651428818702698, "learning_rate": 0.0003064088745172011, "loss": 0.5234, "step": 665700 }, { "epoch": 89.70627863109674, "grad_norm": 0.17850512266159058, "learning_rate": 0.0003063714482469535, "loss": 0.5237, "step": 665800 }, { "epoch": 89.71975208838587, "grad_norm": 0.17366352677345276, "learning_rate": 0.0003063340219767059, "loss": 0.5226, "step": 665900 }, { "epoch": 89.73322554567503, "grad_norm": 0.1740991324186325, "learning_rate": 0.0003062965957064583, "loss": 0.5243, "step": 666000 }, { "epoch": 89.74669900296416, "grad_norm": 0.18239888548851013, "learning_rate": 0.0003062591694362107, "loss": 0.5251, "step": 666100 }, { "epoch": 89.7601724602533, "grad_norm": 0.1930287480354309, "learning_rate": 0.0003062217431659631, "loss": 0.5236, "step": 666200 }, { "epoch": 89.77364591754244, "grad_norm": 0.172322079539299, "learning_rate": 0.0003061843168957155, "loss": 0.5238, "step": 666300 }, { "epoch": 89.78711937483158, "grad_norm": 0.1790483146905899, "learning_rate": 0.00030614689062546783, "loss": 0.5234, "step": 666400 }, { "epoch": 89.80059283212073, "grad_norm": 0.16891717910766602, "learning_rate": 0.00030610946435522023, "loss": 0.524, "step": 666500 }, { "epoch": 89.81406628940987, "grad_norm": 0.19291935861110687, "learning_rate": 0.00030607203808497263, "loss": 0.5239, "step": 666600 }, { "epoch": 89.827539746699, "grad_norm": 0.17823009192943573, "learning_rate": 0.00030603461181472497, "loss": 0.524, "step": 666700 }, { "epoch": 89.84101320398814, "grad_norm": 0.18884176015853882, "learning_rate": 0.00030599718554447737, "loss": 0.5235, "step": 666800 }, { "epoch": 89.85448666127728, "grad_norm": 0.17589342594146729, "learning_rate": 0.00030595975927422977, "loss": 0.5246, "step": 666900 }, { "epoch": 89.86796011856643, "grad_norm": 0.18421490490436554, "learning_rate": 0.00030592233300398217, "loss": 0.5236, "step": 667000 }, { "epoch": 89.88143357585557, "grad_norm": 0.1720883548259735, "learning_rate": 0.0003058849067337345, "loss": 0.5242, "step": 667100 }, { "epoch": 89.8949070331447, "grad_norm": 0.18172003328800201, "learning_rate": 0.0003058474804634869, "loss": 0.524, "step": 667200 }, { "epoch": 89.90838049043384, "grad_norm": 0.18226803839206696, "learning_rate": 0.0003058100541932393, "loss": 0.5243, "step": 667300 }, { "epoch": 89.92185394772298, "grad_norm": 0.19589850306510925, "learning_rate": 0.0003057726279229917, "loss": 0.524, "step": 667400 }, { "epoch": 89.93532740501213, "grad_norm": 0.23034262657165527, "learning_rate": 0.0003057352016527441, "loss": 0.5238, "step": 667500 }, { "epoch": 89.94880086230127, "grad_norm": 0.1941206455230713, "learning_rate": 0.0003056977753824965, "loss": 0.5234, "step": 667600 }, { "epoch": 89.9622743195904, "grad_norm": 0.18109720945358276, "learning_rate": 0.0003056603491122489, "loss": 0.5232, "step": 667700 }, { "epoch": 89.97574777687954, "grad_norm": 0.18716146051883698, "learning_rate": 0.0003056229228420013, "loss": 0.5236, "step": 667800 }, { "epoch": 89.98922123416868, "grad_norm": 0.16889137029647827, "learning_rate": 0.00030558549657175364, "loss": 0.5241, "step": 667900 }, { "epoch": 90.0, "eval_loss": 0.5127894282341003, "eval_runtime": 4.9691, "eval_samples_per_second": 1006.22, "eval_steps_per_second": 15.898, "step": 667980 }, { "epoch": 90.00269469145783, "grad_norm": 0.19807367026805878, "learning_rate": 0.00030554807030150604, "loss": 0.5235, "step": 668000 }, { "epoch": 90.01616814874697, "grad_norm": 0.1857241690158844, "learning_rate": 0.00030551064403125844, "loss": 0.5231, "step": 668100 }, { "epoch": 90.02964160603611, "grad_norm": 0.17161820828914642, "learning_rate": 0.00030547321776101084, "loss": 0.5242, "step": 668200 }, { "epoch": 90.04311506332525, "grad_norm": 0.18829801678657532, "learning_rate": 0.00030543579149076324, "loss": 0.5244, "step": 668300 }, { "epoch": 90.05658852061438, "grad_norm": 0.1807820051908493, "learning_rate": 0.00030539836522051563, "loss": 0.5234, "step": 668400 }, { "epoch": 90.07006197790353, "grad_norm": 0.19162815809249878, "learning_rate": 0.000305360938950268, "loss": 0.5223, "step": 668500 }, { "epoch": 90.08353543519267, "grad_norm": 0.19086907804012299, "learning_rate": 0.0003053235126800203, "loss": 0.5217, "step": 668600 }, { "epoch": 90.09700889248181, "grad_norm": 0.1975257396697998, "learning_rate": 0.0003052860864097727, "loss": 0.5228, "step": 668700 }, { "epoch": 90.11048234977095, "grad_norm": 0.18069685995578766, "learning_rate": 0.0003052486601395251, "loss": 0.5233, "step": 668800 }, { "epoch": 90.12395580706009, "grad_norm": 0.18533265590667725, "learning_rate": 0.0003052112338692775, "loss": 0.5231, "step": 668900 }, { "epoch": 90.13742926434924, "grad_norm": 0.19035382568836212, "learning_rate": 0.0003051738075990299, "loss": 0.5234, "step": 669000 }, { "epoch": 90.15090272163837, "grad_norm": 0.20064306259155273, "learning_rate": 0.0003051363813287823, "loss": 0.5253, "step": 669100 }, { "epoch": 90.16437617892751, "grad_norm": 0.18258042633533478, "learning_rate": 0.0003050989550585347, "loss": 0.5234, "step": 669200 }, { "epoch": 90.17784963621665, "grad_norm": 0.21105457842350006, "learning_rate": 0.00030506152878828705, "loss": 0.5227, "step": 669300 }, { "epoch": 90.19132309350579, "grad_norm": 0.19441543519496918, "learning_rate": 0.00030502410251803945, "loss": 0.5241, "step": 669400 }, { "epoch": 90.20479655079494, "grad_norm": 0.1915389746427536, "learning_rate": 0.00030498667624779185, "loss": 0.5239, "step": 669500 }, { "epoch": 90.21827000808408, "grad_norm": 0.20426565408706665, "learning_rate": 0.00030494924997754425, "loss": 0.5242, "step": 669600 }, { "epoch": 90.23174346537321, "grad_norm": 0.18631303310394287, "learning_rate": 0.00030491182370729665, "loss": 0.523, "step": 669700 }, { "epoch": 90.24521692266235, "grad_norm": 0.17618149518966675, "learning_rate": 0.00030487439743704905, "loss": 0.5224, "step": 669800 }, { "epoch": 90.25869037995149, "grad_norm": 0.18138523399829865, "learning_rate": 0.00030483697116680144, "loss": 0.5243, "step": 669900 }, { "epoch": 90.27216383724064, "grad_norm": 0.21571293473243713, "learning_rate": 0.00030479954489655384, "loss": 0.5231, "step": 670000 }, { "epoch": 90.28563729452978, "grad_norm": 0.18815752863883972, "learning_rate": 0.0003047621186263062, "loss": 0.5239, "step": 670100 }, { "epoch": 90.29911075181892, "grad_norm": 0.16928747296333313, "learning_rate": 0.0003047246923560586, "loss": 0.5234, "step": 670200 }, { "epoch": 90.31258420910805, "grad_norm": 0.2014044225215912, "learning_rate": 0.00030468726608581093, "loss": 0.5232, "step": 670300 }, { "epoch": 90.3260576663972, "grad_norm": 0.1831892430782318, "learning_rate": 0.0003046498398155633, "loss": 0.5231, "step": 670400 }, { "epoch": 90.33953112368634, "grad_norm": 0.18936218321323395, "learning_rate": 0.0003046124135453157, "loss": 0.5239, "step": 670500 }, { "epoch": 90.35300458097548, "grad_norm": 0.18053406476974487, "learning_rate": 0.0003045749872750681, "loss": 0.5231, "step": 670600 }, { "epoch": 90.36647803826462, "grad_norm": 0.2123175710439682, "learning_rate": 0.0003045375610048205, "loss": 0.5228, "step": 670700 }, { "epoch": 90.37995149555375, "grad_norm": 0.18505987524986267, "learning_rate": 0.00030450013473457287, "loss": 0.5233, "step": 670800 }, { "epoch": 90.3934249528429, "grad_norm": 0.21670955419540405, "learning_rate": 0.00030446270846432526, "loss": 0.5241, "step": 670900 }, { "epoch": 90.40689841013204, "grad_norm": 0.18585951626300812, "learning_rate": 0.00030442528219407766, "loss": 0.5236, "step": 671000 }, { "epoch": 90.42037186742118, "grad_norm": 0.19336873292922974, "learning_rate": 0.00030438785592383006, "loss": 0.5231, "step": 671100 }, { "epoch": 90.43384532471032, "grad_norm": 0.23382817208766937, "learning_rate": 0.00030435042965358246, "loss": 0.5248, "step": 671200 }, { "epoch": 90.44731878199946, "grad_norm": 0.17312806844711304, "learning_rate": 0.00030431300338333486, "loss": 0.5246, "step": 671300 }, { "epoch": 90.46079223928861, "grad_norm": 0.1956392079591751, "learning_rate": 0.00030427557711308726, "loss": 0.5242, "step": 671400 }, { "epoch": 90.47426569657775, "grad_norm": 0.17653732001781464, "learning_rate": 0.0003042381508428396, "loss": 0.5238, "step": 671500 }, { "epoch": 90.48773915386688, "grad_norm": 0.17474718391895294, "learning_rate": 0.000304200724572592, "loss": 0.5246, "step": 671600 }, { "epoch": 90.50121261115602, "grad_norm": 0.1785057634115219, "learning_rate": 0.0003041632983023444, "loss": 0.5239, "step": 671700 }, { "epoch": 90.51468606844516, "grad_norm": 0.17429740726947784, "learning_rate": 0.0003041258720320968, "loss": 0.5235, "step": 671800 }, { "epoch": 90.52815952573431, "grad_norm": 0.18376387655735016, "learning_rate": 0.0003040884457618492, "loss": 0.5234, "step": 671900 }, { "epoch": 90.54163298302345, "grad_norm": 0.20074006915092468, "learning_rate": 0.0003040510194916016, "loss": 0.5223, "step": 672000 }, { "epoch": 90.55510644031258, "grad_norm": 0.1854151040315628, "learning_rate": 0.00030401359322135393, "loss": 0.5238, "step": 672100 }, { "epoch": 90.56857989760172, "grad_norm": 0.17612120509147644, "learning_rate": 0.0003039761669511063, "loss": 0.5245, "step": 672200 }, { "epoch": 90.58205335489086, "grad_norm": 0.19302763044834137, "learning_rate": 0.0003039387406808587, "loss": 0.5239, "step": 672300 }, { "epoch": 90.59552681218001, "grad_norm": 0.17072603106498718, "learning_rate": 0.0003039013144106111, "loss": 0.5236, "step": 672400 }, { "epoch": 90.60900026946915, "grad_norm": 0.17483721673488617, "learning_rate": 0.0003038638881403635, "loss": 0.5241, "step": 672500 }, { "epoch": 90.62247372675829, "grad_norm": 0.18877609074115753, "learning_rate": 0.00030382646187011587, "loss": 0.524, "step": 672600 }, { "epoch": 90.63594718404742, "grad_norm": 0.1846694052219391, "learning_rate": 0.00030378903559986827, "loss": 0.5242, "step": 672700 }, { "epoch": 90.64942064133656, "grad_norm": 0.18536566197872162, "learning_rate": 0.00030375160932962067, "loss": 0.5244, "step": 672800 }, { "epoch": 90.66289409862571, "grad_norm": 0.1823270618915558, "learning_rate": 0.00030371418305937307, "loss": 0.5231, "step": 672900 }, { "epoch": 90.67636755591485, "grad_norm": 0.1788761168718338, "learning_rate": 0.0003036767567891254, "loss": 0.5234, "step": 673000 }, { "epoch": 90.68984101320399, "grad_norm": 0.16798661649227142, "learning_rate": 0.0003036393305188778, "loss": 0.5237, "step": 673100 }, { "epoch": 90.70331447049313, "grad_norm": 0.19167625904083252, "learning_rate": 0.0003036019042486302, "loss": 0.5233, "step": 673200 }, { "epoch": 90.71678792778226, "grad_norm": 0.19097916781902313, "learning_rate": 0.0003035644779783826, "loss": 0.5228, "step": 673300 }, { "epoch": 90.73026138507142, "grad_norm": 0.1835690587759018, "learning_rate": 0.000303527051708135, "loss": 0.5233, "step": 673400 }, { "epoch": 90.74373484236055, "grad_norm": 0.18339915573596954, "learning_rate": 0.0003034896254378874, "loss": 0.5225, "step": 673500 }, { "epoch": 90.75720829964969, "grad_norm": 0.17877307534217834, "learning_rate": 0.0003034521991676398, "loss": 0.5244, "step": 673600 }, { "epoch": 90.77068175693883, "grad_norm": 0.20008745789527893, "learning_rate": 0.00030341477289739214, "loss": 0.524, "step": 673700 }, { "epoch": 90.78415521422797, "grad_norm": 0.17006604373455048, "learning_rate": 0.00030337734662714454, "loss": 0.5227, "step": 673800 }, { "epoch": 90.79762867151712, "grad_norm": 0.17672747373580933, "learning_rate": 0.0003033399203568969, "loss": 0.5253, "step": 673900 }, { "epoch": 90.81110212880625, "grad_norm": 0.18535073101520538, "learning_rate": 0.0003033024940866493, "loss": 0.5233, "step": 674000 }, { "epoch": 90.82457558609539, "grad_norm": 0.18292136490345, "learning_rate": 0.0003032650678164017, "loss": 0.5236, "step": 674100 }, { "epoch": 90.83804904338453, "grad_norm": 0.1722126603126526, "learning_rate": 0.0003032276415461541, "loss": 0.5231, "step": 674200 }, { "epoch": 90.85152250067367, "grad_norm": 0.17341171205043793, "learning_rate": 0.0003031902152759065, "loss": 0.5233, "step": 674300 }, { "epoch": 90.86499595796282, "grad_norm": 0.21057361364364624, "learning_rate": 0.0003031527890056588, "loss": 0.5243, "step": 674400 }, { "epoch": 90.87846941525196, "grad_norm": 0.17216452956199646, "learning_rate": 0.0003031153627354112, "loss": 0.524, "step": 674500 }, { "epoch": 90.8919428725411, "grad_norm": 0.18952453136444092, "learning_rate": 0.0003030779364651636, "loss": 0.5249, "step": 674600 }, { "epoch": 90.90541632983023, "grad_norm": 0.18989062309265137, "learning_rate": 0.000303040510194916, "loss": 0.5237, "step": 674700 }, { "epoch": 90.91888978711937, "grad_norm": 0.17239277064800262, "learning_rate": 0.0003030030839246684, "loss": 0.5235, "step": 674800 }, { "epoch": 90.93236324440852, "grad_norm": 0.18030975759029388, "learning_rate": 0.0003029656576544208, "loss": 0.5235, "step": 674900 }, { "epoch": 90.94583670169766, "grad_norm": 0.18575157225131989, "learning_rate": 0.0003029282313841732, "loss": 0.5241, "step": 675000 }, { "epoch": 90.9593101589868, "grad_norm": 0.20609824359416962, "learning_rate": 0.00030289080511392556, "loss": 0.5245, "step": 675100 }, { "epoch": 90.97278361627593, "grad_norm": 0.1742342859506607, "learning_rate": 0.00030285337884367795, "loss": 0.524, "step": 675200 }, { "epoch": 90.98625707356507, "grad_norm": 0.19289614260196686, "learning_rate": 0.00030281595257343035, "loss": 0.524, "step": 675300 }, { "epoch": 90.99973053085422, "grad_norm": 0.20268696546554565, "learning_rate": 0.00030277852630318275, "loss": 0.5236, "step": 675400 }, { "epoch": 91.0, "eval_loss": 0.5123173594474792, "eval_runtime": 4.9508, "eval_samples_per_second": 1009.936, "eval_steps_per_second": 15.957, "step": 675402 }, { "epoch": 91.01320398814336, "grad_norm": 0.1825486123561859, "learning_rate": 0.00030274110003293515, "loss": 0.5234, "step": 675500 }, { "epoch": 91.0266774454325, "grad_norm": 0.2102806270122528, "learning_rate": 0.00030270367376268755, "loss": 0.5213, "step": 675600 }, { "epoch": 91.04015090272163, "grad_norm": 0.1781783550977707, "learning_rate": 0.0003026662474924399, "loss": 0.5233, "step": 675700 }, { "epoch": 91.05362436001077, "grad_norm": 0.20629310607910156, "learning_rate": 0.0003026288212221923, "loss": 0.5227, "step": 675800 }, { "epoch": 91.06709781729992, "grad_norm": 0.17207570374011993, "learning_rate": 0.00030259139495194463, "loss": 0.522, "step": 675900 }, { "epoch": 91.08057127458906, "grad_norm": 0.18987184762954712, "learning_rate": 0.00030255396868169703, "loss": 0.523, "step": 676000 }, { "epoch": 91.0940447318782, "grad_norm": 0.2042580395936966, "learning_rate": 0.00030251654241144943, "loss": 0.523, "step": 676100 }, { "epoch": 91.10751818916734, "grad_norm": 0.17467811703681946, "learning_rate": 0.00030247911614120183, "loss": 0.5238, "step": 676200 }, { "epoch": 91.12099164645647, "grad_norm": 0.17381258308887482, "learning_rate": 0.0003024416898709542, "loss": 0.5234, "step": 676300 }, { "epoch": 91.13446510374563, "grad_norm": 0.1873312145471573, "learning_rate": 0.0003024042636007066, "loss": 0.5229, "step": 676400 }, { "epoch": 91.14793856103476, "grad_norm": 0.1791018843650818, "learning_rate": 0.000302366837330459, "loss": 0.5234, "step": 676500 }, { "epoch": 91.1614120183239, "grad_norm": 0.17077983915805817, "learning_rate": 0.00030232941106021137, "loss": 0.5235, "step": 676600 }, { "epoch": 91.17488547561304, "grad_norm": 0.17599619925022125, "learning_rate": 0.00030229198478996377, "loss": 0.523, "step": 676700 }, { "epoch": 91.18835893290218, "grad_norm": 0.17153578996658325, "learning_rate": 0.00030225455851971616, "loss": 0.5233, "step": 676800 }, { "epoch": 91.20183239019133, "grad_norm": 0.1826757788658142, "learning_rate": 0.00030221713224946856, "loss": 0.5248, "step": 676900 }, { "epoch": 91.21530584748047, "grad_norm": 0.19776244461536407, "learning_rate": 0.00030217970597922096, "loss": 0.523, "step": 677000 }, { "epoch": 91.2287793047696, "grad_norm": 0.18185459077358246, "learning_rate": 0.00030214227970897336, "loss": 0.5231, "step": 677100 }, { "epoch": 91.24225276205874, "grad_norm": 0.20468156039714813, "learning_rate": 0.00030210485343872576, "loss": 0.524, "step": 677200 }, { "epoch": 91.25572621934788, "grad_norm": 0.17127001285552979, "learning_rate": 0.0003020674271684781, "loss": 0.5227, "step": 677300 }, { "epoch": 91.26919967663703, "grad_norm": 0.17113123834133148, "learning_rate": 0.0003020300008982305, "loss": 0.5235, "step": 677400 }, { "epoch": 91.28267313392617, "grad_norm": 0.1739196926355362, "learning_rate": 0.00030199257462798284, "loss": 0.5236, "step": 677500 }, { "epoch": 91.2961465912153, "grad_norm": 0.1812613159418106, "learning_rate": 0.00030195514835773524, "loss": 0.5222, "step": 677600 }, { "epoch": 91.30962004850444, "grad_norm": 0.18832363188266754, "learning_rate": 0.00030191772208748764, "loss": 0.524, "step": 677700 }, { "epoch": 91.3230935057936, "grad_norm": 0.18763235211372375, "learning_rate": 0.00030188029581724004, "loss": 0.5231, "step": 677800 }, { "epoch": 91.33656696308273, "grad_norm": 0.18586254119873047, "learning_rate": 0.00030184286954699244, "loss": 0.524, "step": 677900 }, { "epoch": 91.35004042037187, "grad_norm": 0.1854696273803711, "learning_rate": 0.0003018054432767448, "loss": 0.5225, "step": 678000 }, { "epoch": 91.363513877661, "grad_norm": 0.1824968308210373, "learning_rate": 0.0003017680170064972, "loss": 0.5235, "step": 678100 }, { "epoch": 91.37698733495014, "grad_norm": 0.1788484752178192, "learning_rate": 0.0003017305907362496, "loss": 0.5234, "step": 678200 }, { "epoch": 91.3904607922393, "grad_norm": 0.19671902060508728, "learning_rate": 0.000301693164466002, "loss": 0.5236, "step": 678300 }, { "epoch": 91.40393424952843, "grad_norm": 0.19844557344913483, "learning_rate": 0.00030165573819575437, "loss": 0.5232, "step": 678400 }, { "epoch": 91.41740770681757, "grad_norm": 0.20410820841789246, "learning_rate": 0.00030161831192550677, "loss": 0.5238, "step": 678500 }, { "epoch": 91.43088116410671, "grad_norm": 0.17868372797966003, "learning_rate": 0.00030158088565525917, "loss": 0.5241, "step": 678600 }, { "epoch": 91.44435462139585, "grad_norm": 0.1798778623342514, "learning_rate": 0.00030154345938501157, "loss": 0.5236, "step": 678700 }, { "epoch": 91.457828078685, "grad_norm": 0.18289139866828918, "learning_rate": 0.0003015060331147639, "loss": 0.5231, "step": 678800 }, { "epoch": 91.47130153597413, "grad_norm": 0.20291316509246826, "learning_rate": 0.0003014686068445163, "loss": 0.5238, "step": 678900 }, { "epoch": 91.48477499326327, "grad_norm": 0.19442245364189148, "learning_rate": 0.0003014311805742687, "loss": 0.5234, "step": 679000 }, { "epoch": 91.49824845055241, "grad_norm": 0.17197886109352112, "learning_rate": 0.0003013937543040211, "loss": 0.5226, "step": 679100 }, { "epoch": 91.51172190784155, "grad_norm": 0.18218116462230682, "learning_rate": 0.0003013563280337735, "loss": 0.5234, "step": 679200 }, { "epoch": 91.5251953651307, "grad_norm": 0.20629659295082092, "learning_rate": 0.0003013189017635259, "loss": 0.5236, "step": 679300 }, { "epoch": 91.53866882241984, "grad_norm": 0.18844032287597656, "learning_rate": 0.00030128147549327825, "loss": 0.5236, "step": 679400 }, { "epoch": 91.55214227970897, "grad_norm": 0.1945532262325287, "learning_rate": 0.0003012440492230306, "loss": 0.5234, "step": 679500 }, { "epoch": 91.56561573699811, "grad_norm": 0.18406759202480316, "learning_rate": 0.000301206622952783, "loss": 0.5239, "step": 679600 }, { "epoch": 91.57908919428725, "grad_norm": 0.17625980079174042, "learning_rate": 0.0003011691966825354, "loss": 0.5238, "step": 679700 }, { "epoch": 91.5925626515764, "grad_norm": 0.18760110437870026, "learning_rate": 0.0003011317704122878, "loss": 0.5236, "step": 679800 }, { "epoch": 91.60603610886554, "grad_norm": 0.18515600264072418, "learning_rate": 0.0003010943441420402, "loss": 0.5241, "step": 679900 }, { "epoch": 91.61950956615468, "grad_norm": 0.20092801749706268, "learning_rate": 0.0003010569178717926, "loss": 0.5242, "step": 680000 }, { "epoch": 91.63298302344381, "grad_norm": 0.1910741925239563, "learning_rate": 0.000301019491601545, "loss": 0.5234, "step": 680100 }, { "epoch": 91.64645648073295, "grad_norm": 0.17356230318546295, "learning_rate": 0.0003009820653312973, "loss": 0.5232, "step": 680200 }, { "epoch": 91.6599299380221, "grad_norm": 0.17393893003463745, "learning_rate": 0.0003009446390610497, "loss": 0.5227, "step": 680300 }, { "epoch": 91.67340339531124, "grad_norm": 0.17475645244121552, "learning_rate": 0.0003009072127908021, "loss": 0.5231, "step": 680400 }, { "epoch": 91.68687685260038, "grad_norm": 0.1782800555229187, "learning_rate": 0.0003008697865205545, "loss": 0.5227, "step": 680500 }, { "epoch": 91.70035030988952, "grad_norm": 0.16701611876487732, "learning_rate": 0.0003008323602503069, "loss": 0.5231, "step": 680600 }, { "epoch": 91.71382376717865, "grad_norm": 0.17838706076145172, "learning_rate": 0.0003007949339800593, "loss": 0.5232, "step": 680700 }, { "epoch": 91.7272972244678, "grad_norm": 0.17981387674808502, "learning_rate": 0.0003007575077098117, "loss": 0.5244, "step": 680800 }, { "epoch": 91.74077068175694, "grad_norm": 0.17791983485221863, "learning_rate": 0.0003007200814395641, "loss": 0.523, "step": 680900 }, { "epoch": 91.75424413904608, "grad_norm": 0.1830924153327942, "learning_rate": 0.00030068265516931646, "loss": 0.524, "step": 681000 }, { "epoch": 91.76771759633522, "grad_norm": 0.18576323986053467, "learning_rate": 0.00030064522889906885, "loss": 0.5234, "step": 681100 }, { "epoch": 91.78119105362435, "grad_norm": 0.17027071118354797, "learning_rate": 0.0003006078026288212, "loss": 0.524, "step": 681200 }, { "epoch": 91.7946645109135, "grad_norm": 0.1813175082206726, "learning_rate": 0.0003005703763585736, "loss": 0.5247, "step": 681300 }, { "epoch": 91.80813796820264, "grad_norm": 0.20478390157222748, "learning_rate": 0.000300532950088326, "loss": 0.5229, "step": 681400 }, { "epoch": 91.82161142549178, "grad_norm": 0.17281244695186615, "learning_rate": 0.0003004955238180784, "loss": 0.5242, "step": 681500 }, { "epoch": 91.83508488278092, "grad_norm": 0.17363683879375458, "learning_rate": 0.0003004580975478308, "loss": 0.5242, "step": 681600 }, { "epoch": 91.84855834007006, "grad_norm": 0.18542228639125824, "learning_rate": 0.00030042067127758313, "loss": 0.5231, "step": 681700 }, { "epoch": 91.86203179735921, "grad_norm": 0.1801498681306839, "learning_rate": 0.00030038324500733553, "loss": 0.5235, "step": 681800 }, { "epoch": 91.87550525464835, "grad_norm": 0.18326161801815033, "learning_rate": 0.00030034581873708793, "loss": 0.522, "step": 681900 }, { "epoch": 91.88897871193748, "grad_norm": 0.17079930007457733, "learning_rate": 0.00030030839246684033, "loss": 0.5238, "step": 682000 }, { "epoch": 91.90245216922662, "grad_norm": 0.17512179911136627, "learning_rate": 0.00030027096619659273, "loss": 0.5236, "step": 682100 }, { "epoch": 91.91592562651576, "grad_norm": 0.22300061583518982, "learning_rate": 0.0003002335399263451, "loss": 0.524, "step": 682200 }, { "epoch": 91.92939908380491, "grad_norm": 0.18083910644054413, "learning_rate": 0.0003001961136560975, "loss": 0.524, "step": 682300 }, { "epoch": 91.94287254109405, "grad_norm": 0.1864597052335739, "learning_rate": 0.00030015868738584987, "loss": 0.523, "step": 682400 }, { "epoch": 91.95634599838318, "grad_norm": 0.20166030526161194, "learning_rate": 0.00030012126111560227, "loss": 0.5239, "step": 682500 }, { "epoch": 91.96981945567232, "grad_norm": 0.17999428510665894, "learning_rate": 0.00030008383484535466, "loss": 0.5235, "step": 682600 }, { "epoch": 91.98329291296146, "grad_norm": 0.18549595773220062, "learning_rate": 0.00030004640857510706, "loss": 0.5241, "step": 682700 }, { "epoch": 91.99676637025061, "grad_norm": 0.18400125205516815, "learning_rate": 0.00030000898230485946, "loss": 0.5238, "step": 682800 }, { "epoch": 92.0, "eval_loss": 0.512269139289856, "eval_runtime": 4.9535, "eval_samples_per_second": 1009.386, "eval_steps_per_second": 15.948, "step": 682824 }, { "epoch": 92.01023982753975, "grad_norm": 0.18837805092334747, "learning_rate": 0.00029997155603461186, "loss": 0.5229, "step": 682900 }, { "epoch": 92.02371328482889, "grad_norm": 0.2055080533027649, "learning_rate": 0.0002999341297643642, "loss": 0.5237, "step": 683000 }, { "epoch": 92.03718674211802, "grad_norm": 0.1714886724948883, "learning_rate": 0.00029989670349411655, "loss": 0.5223, "step": 683100 }, { "epoch": 92.05066019940716, "grad_norm": 0.18947507441043854, "learning_rate": 0.00029985927722386895, "loss": 0.5225, "step": 683200 }, { "epoch": 92.06413365669631, "grad_norm": 0.17648854851722717, "learning_rate": 0.00029982185095362134, "loss": 0.5223, "step": 683300 }, { "epoch": 92.07760711398545, "grad_norm": 0.17745250463485718, "learning_rate": 0.00029978442468337374, "loss": 0.5223, "step": 683400 }, { "epoch": 92.09108057127459, "grad_norm": 0.16634680330753326, "learning_rate": 0.00029974699841312614, "loss": 0.5239, "step": 683500 }, { "epoch": 92.10455402856373, "grad_norm": 0.18230025470256805, "learning_rate": 0.00029970957214287854, "loss": 0.5235, "step": 683600 }, { "epoch": 92.11802748585286, "grad_norm": 0.19335703551769257, "learning_rate": 0.00029967214587263094, "loss": 0.5228, "step": 683700 }, { "epoch": 92.13150094314202, "grad_norm": 0.18391941487789154, "learning_rate": 0.00029963471960238333, "loss": 0.5235, "step": 683800 }, { "epoch": 92.14497440043115, "grad_norm": 0.17985199391841888, "learning_rate": 0.0002995972933321357, "loss": 0.524, "step": 683900 }, { "epoch": 92.15844785772029, "grad_norm": 0.18311652541160583, "learning_rate": 0.0002995598670618881, "loss": 0.5219, "step": 684000 }, { "epoch": 92.17192131500943, "grad_norm": 0.16781307756900787, "learning_rate": 0.0002995224407916405, "loss": 0.5224, "step": 684100 }, { "epoch": 92.18539477229857, "grad_norm": 0.18763627111911774, "learning_rate": 0.0002994850145213929, "loss": 0.5234, "step": 684200 }, { "epoch": 92.19886822958772, "grad_norm": 0.1789606809616089, "learning_rate": 0.00029944758825114527, "loss": 0.5227, "step": 684300 }, { "epoch": 92.21234168687685, "grad_norm": 0.19011056423187256, "learning_rate": 0.00029941016198089767, "loss": 0.5219, "step": 684400 }, { "epoch": 92.22581514416599, "grad_norm": 0.1834786832332611, "learning_rate": 0.00029937273571065007, "loss": 0.5223, "step": 684500 }, { "epoch": 92.23928860145513, "grad_norm": 0.19285836815834045, "learning_rate": 0.0002993353094404024, "loss": 0.5235, "step": 684600 }, { "epoch": 92.25276205874427, "grad_norm": 0.1987258344888687, "learning_rate": 0.0002992978831701548, "loss": 0.5237, "step": 684700 }, { "epoch": 92.26623551603342, "grad_norm": 0.19468402862548828, "learning_rate": 0.00029926045689990715, "loss": 0.523, "step": 684800 }, { "epoch": 92.27970897332256, "grad_norm": 0.21445941925048828, "learning_rate": 0.00029922303062965955, "loss": 0.5239, "step": 684900 }, { "epoch": 92.2931824306117, "grad_norm": 0.19969496130943298, "learning_rate": 0.00029918560435941195, "loss": 0.5242, "step": 685000 }, { "epoch": 92.30665588790083, "grad_norm": 0.20302368700504303, "learning_rate": 0.00029914817808916435, "loss": 0.5232, "step": 685100 }, { "epoch": 92.32012934518997, "grad_norm": 0.18164947628974915, "learning_rate": 0.00029911075181891675, "loss": 0.5228, "step": 685200 }, { "epoch": 92.33360280247912, "grad_norm": 0.17792312800884247, "learning_rate": 0.0002990733255486691, "loss": 0.5232, "step": 685300 }, { "epoch": 92.34707625976826, "grad_norm": 0.17592297494411469, "learning_rate": 0.0002990358992784215, "loss": 0.5236, "step": 685400 }, { "epoch": 92.3605497170574, "grad_norm": 0.17347046732902527, "learning_rate": 0.0002989984730081739, "loss": 0.5236, "step": 685500 }, { "epoch": 92.37402317434653, "grad_norm": 0.1719921976327896, "learning_rate": 0.0002989610467379263, "loss": 0.5233, "step": 685600 }, { "epoch": 92.38749663163568, "grad_norm": 0.17772404849529266, "learning_rate": 0.0002989236204676787, "loss": 0.5225, "step": 685700 }, { "epoch": 92.40097008892482, "grad_norm": 0.22136597335338593, "learning_rate": 0.0002988861941974311, "loss": 0.5229, "step": 685800 }, { "epoch": 92.41444354621396, "grad_norm": 0.182691290974617, "learning_rate": 0.0002988487679271835, "loss": 0.5241, "step": 685900 }, { "epoch": 92.4279170035031, "grad_norm": 0.17574550211429596, "learning_rate": 0.0002988113416569359, "loss": 0.5235, "step": 686000 }, { "epoch": 92.44139046079223, "grad_norm": 0.19654104113578796, "learning_rate": 0.0002987739153866882, "loss": 0.5234, "step": 686100 }, { "epoch": 92.45486391808139, "grad_norm": 0.17145505547523499, "learning_rate": 0.0002987364891164406, "loss": 0.524, "step": 686200 }, { "epoch": 92.46833737537052, "grad_norm": 0.19001194834709167, "learning_rate": 0.000298699062846193, "loss": 0.5226, "step": 686300 }, { "epoch": 92.48181083265966, "grad_norm": 0.1749984622001648, "learning_rate": 0.0002986616365759454, "loss": 0.5234, "step": 686400 }, { "epoch": 92.4952842899488, "grad_norm": 0.19105187058448792, "learning_rate": 0.0002986242103056978, "loss": 0.524, "step": 686500 }, { "epoch": 92.50875774723794, "grad_norm": 0.17769064009189606, "learning_rate": 0.00029858678403545016, "loss": 0.5232, "step": 686600 }, { "epoch": 92.52223120452709, "grad_norm": 0.19501124322414398, "learning_rate": 0.00029854935776520256, "loss": 0.5227, "step": 686700 }, { "epoch": 92.53570466181623, "grad_norm": 0.1878000944852829, "learning_rate": 0.0002985119314949549, "loss": 0.5238, "step": 686800 }, { "epoch": 92.54917811910536, "grad_norm": 0.20668621361255646, "learning_rate": 0.0002984745052247073, "loss": 0.5231, "step": 686900 }, { "epoch": 92.5626515763945, "grad_norm": 0.18786738812923431, "learning_rate": 0.0002984370789544597, "loss": 0.5236, "step": 687000 }, { "epoch": 92.57612503368364, "grad_norm": 0.19233089685440063, "learning_rate": 0.0002983996526842121, "loss": 0.5234, "step": 687100 }, { "epoch": 92.58959849097279, "grad_norm": 0.1786269247531891, "learning_rate": 0.0002983622264139645, "loss": 0.5225, "step": 687200 }, { "epoch": 92.60307194826193, "grad_norm": 0.17542928457260132, "learning_rate": 0.0002983248001437169, "loss": 0.5223, "step": 687300 }, { "epoch": 92.61654540555107, "grad_norm": 0.18711170554161072, "learning_rate": 0.0002982873738734693, "loss": 0.5232, "step": 687400 }, { "epoch": 92.6300188628402, "grad_norm": 0.17537043988704681, "learning_rate": 0.00029824994760322164, "loss": 0.5235, "step": 687500 }, { "epoch": 92.64349232012934, "grad_norm": 0.23628173768520355, "learning_rate": 0.00029821252133297403, "loss": 0.5226, "step": 687600 }, { "epoch": 92.65696577741849, "grad_norm": 0.18777911365032196, "learning_rate": 0.00029817509506272643, "loss": 0.524, "step": 687700 }, { "epoch": 92.67043923470763, "grad_norm": 0.20079270005226135, "learning_rate": 0.00029813766879247883, "loss": 0.5228, "step": 687800 }, { "epoch": 92.68391269199677, "grad_norm": 0.18772384524345398, "learning_rate": 0.00029810024252223123, "loss": 0.524, "step": 687900 }, { "epoch": 92.6973861492859, "grad_norm": 0.17133820056915283, "learning_rate": 0.0002980628162519836, "loss": 0.524, "step": 688000 }, { "epoch": 92.71085960657504, "grad_norm": 0.2133195400238037, "learning_rate": 0.000298025389981736, "loss": 0.5229, "step": 688100 }, { "epoch": 92.7243330638642, "grad_norm": 0.1927766054868698, "learning_rate": 0.0002979879637114884, "loss": 0.5227, "step": 688200 }, { "epoch": 92.73780652115333, "grad_norm": 0.1965072900056839, "learning_rate": 0.00029795053744124077, "loss": 0.523, "step": 688300 }, { "epoch": 92.75127997844247, "grad_norm": 0.18084309995174408, "learning_rate": 0.0002979131111709931, "loss": 0.5227, "step": 688400 }, { "epoch": 92.7647534357316, "grad_norm": 0.2024170160293579, "learning_rate": 0.0002978756849007455, "loss": 0.5237, "step": 688500 }, { "epoch": 92.77822689302074, "grad_norm": 0.20845851302146912, "learning_rate": 0.0002978382586304979, "loss": 0.5232, "step": 688600 }, { "epoch": 92.7917003503099, "grad_norm": 0.19282089173793793, "learning_rate": 0.0002978008323602503, "loss": 0.5233, "step": 688700 }, { "epoch": 92.80517380759903, "grad_norm": 0.16794773936271667, "learning_rate": 0.0002977634060900027, "loss": 0.5245, "step": 688800 }, { "epoch": 92.81864726488817, "grad_norm": 0.17128083109855652, "learning_rate": 0.0002977259798197551, "loss": 0.5235, "step": 688900 }, { "epoch": 92.83212072217731, "grad_norm": 0.1931869387626648, "learning_rate": 0.00029768855354950745, "loss": 0.5233, "step": 689000 }, { "epoch": 92.84559417946645, "grad_norm": 0.17516259849071503, "learning_rate": 0.00029765112727925985, "loss": 0.5237, "step": 689100 }, { "epoch": 92.8590676367556, "grad_norm": 0.17594021558761597, "learning_rate": 0.00029761370100901224, "loss": 0.5232, "step": 689200 }, { "epoch": 92.87254109404473, "grad_norm": 0.18191973865032196, "learning_rate": 0.00029757627473876464, "loss": 0.5227, "step": 689300 }, { "epoch": 92.88601455133387, "grad_norm": 0.18488043546676636, "learning_rate": 0.00029753884846851704, "loss": 0.5235, "step": 689400 }, { "epoch": 92.89948800862301, "grad_norm": 0.17599605023860931, "learning_rate": 0.00029750142219826944, "loss": 0.5233, "step": 689500 }, { "epoch": 92.91296146591215, "grad_norm": 0.18544991314411163, "learning_rate": 0.00029746399592802184, "loss": 0.5238, "step": 689600 }, { "epoch": 92.9264349232013, "grad_norm": 0.19827263057231903, "learning_rate": 0.0002974265696577742, "loss": 0.5237, "step": 689700 }, { "epoch": 92.93990838049044, "grad_norm": 0.19711536169052124, "learning_rate": 0.0002973891433875266, "loss": 0.523, "step": 689800 }, { "epoch": 92.95338183777957, "grad_norm": 0.17659662663936615, "learning_rate": 0.000297351717117279, "loss": 0.5234, "step": 689900 }, { "epoch": 92.96685529506871, "grad_norm": 0.18290288746356964, "learning_rate": 0.0002973142908470314, "loss": 0.5234, "step": 690000 }, { "epoch": 92.98032875235785, "grad_norm": 0.1968613564968109, "learning_rate": 0.0002972768645767838, "loss": 0.5225, "step": 690100 }, { "epoch": 92.993802209647, "grad_norm": 0.18878358602523804, "learning_rate": 0.0002972394383065361, "loss": 0.5243, "step": 690200 }, { "epoch": 93.0, "eval_loss": 0.5119098424911499, "eval_runtime": 4.9529, "eval_samples_per_second": 1009.515, "eval_steps_per_second": 15.95, "step": 690246 }, { "epoch": 93.00727566693614, "grad_norm": 0.1790333390235901, "learning_rate": 0.0002972020120362885, "loss": 0.5224, "step": 690300 }, { "epoch": 93.02074912422528, "grad_norm": 0.17507365345954895, "learning_rate": 0.00029716458576604086, "loss": 0.5228, "step": 690400 }, { "epoch": 93.03422258151441, "grad_norm": 0.18600419163703918, "learning_rate": 0.00029712715949579326, "loss": 0.5218, "step": 690500 }, { "epoch": 93.04769603880355, "grad_norm": 0.19886192679405212, "learning_rate": 0.00029708973322554566, "loss": 0.5232, "step": 690600 }, { "epoch": 93.0611694960927, "grad_norm": 0.17947159707546234, "learning_rate": 0.00029705230695529805, "loss": 0.5231, "step": 690700 }, { "epoch": 93.07464295338184, "grad_norm": 0.1885027438402176, "learning_rate": 0.00029701488068505045, "loss": 0.5214, "step": 690800 }, { "epoch": 93.08811641067098, "grad_norm": 0.18667495250701904, "learning_rate": 0.00029697745441480285, "loss": 0.523, "step": 690900 }, { "epoch": 93.10158986796012, "grad_norm": 0.17977279424667358, "learning_rate": 0.00029694002814455525, "loss": 0.5226, "step": 691000 }, { "epoch": 93.11506332524925, "grad_norm": 0.18191565573215485, "learning_rate": 0.00029690260187430765, "loss": 0.5217, "step": 691100 }, { "epoch": 93.1285367825384, "grad_norm": 0.18696977198123932, "learning_rate": 0.00029686517560406, "loss": 0.5235, "step": 691200 }, { "epoch": 93.14201023982754, "grad_norm": 0.1794235110282898, "learning_rate": 0.0002968277493338124, "loss": 0.523, "step": 691300 }, { "epoch": 93.15548369711668, "grad_norm": 0.18160200119018555, "learning_rate": 0.0002967903230635648, "loss": 0.522, "step": 691400 }, { "epoch": 93.16895715440582, "grad_norm": 0.2020624279975891, "learning_rate": 0.0002967528967933172, "loss": 0.522, "step": 691500 }, { "epoch": 93.18243061169495, "grad_norm": 0.17280523478984833, "learning_rate": 0.0002967154705230696, "loss": 0.5231, "step": 691600 }, { "epoch": 93.1959040689841, "grad_norm": 0.190608948469162, "learning_rate": 0.000296678044252822, "loss": 0.5231, "step": 691700 }, { "epoch": 93.20937752627324, "grad_norm": 0.18069763481616974, "learning_rate": 0.0002966406179825744, "loss": 0.5231, "step": 691800 }, { "epoch": 93.22285098356238, "grad_norm": 0.17971430718898773, "learning_rate": 0.0002966031917123267, "loss": 0.5234, "step": 691900 }, { "epoch": 93.23632444085152, "grad_norm": 0.18500179052352905, "learning_rate": 0.00029656576544207907, "loss": 0.5223, "step": 692000 }, { "epoch": 93.24979789814066, "grad_norm": 0.19008444249629974, "learning_rate": 0.00029652833917183147, "loss": 0.5232, "step": 692100 }, { "epoch": 93.26327135542981, "grad_norm": 0.18006250262260437, "learning_rate": 0.00029649091290158386, "loss": 0.5234, "step": 692200 }, { "epoch": 93.27674481271895, "grad_norm": 0.18709927797317505, "learning_rate": 0.00029645348663133626, "loss": 0.5232, "step": 692300 }, { "epoch": 93.29021827000808, "grad_norm": 0.17454521358013153, "learning_rate": 0.00029641606036108866, "loss": 0.523, "step": 692400 }, { "epoch": 93.30369172729722, "grad_norm": 0.1926705688238144, "learning_rate": 0.00029637863409084106, "loss": 0.5236, "step": 692500 }, { "epoch": 93.31716518458636, "grad_norm": 0.1783429980278015, "learning_rate": 0.0002963412078205934, "loss": 0.523, "step": 692600 }, { "epoch": 93.33063864187551, "grad_norm": 0.17730475962162018, "learning_rate": 0.0002963037815503458, "loss": 0.5229, "step": 692700 }, { "epoch": 93.34411209916465, "grad_norm": 0.170645073056221, "learning_rate": 0.0002962663552800982, "loss": 0.5232, "step": 692800 }, { "epoch": 93.35758555645378, "grad_norm": 0.1815531700849533, "learning_rate": 0.0002962289290098506, "loss": 0.5229, "step": 692900 }, { "epoch": 93.37105901374292, "grad_norm": 0.16753125190734863, "learning_rate": 0.000296191502739603, "loss": 0.5231, "step": 693000 }, { "epoch": 93.38453247103207, "grad_norm": 0.19324594736099243, "learning_rate": 0.0002961540764693554, "loss": 0.5231, "step": 693100 }, { "epoch": 93.39800592832121, "grad_norm": 0.17147667706012726, "learning_rate": 0.0002961166501991078, "loss": 0.5232, "step": 693200 }, { "epoch": 93.41147938561035, "grad_norm": 0.19014789164066315, "learning_rate": 0.00029607922392886014, "loss": 0.5231, "step": 693300 }, { "epoch": 93.42495284289949, "grad_norm": 0.18695583939552307, "learning_rate": 0.00029604179765861254, "loss": 0.5225, "step": 693400 }, { "epoch": 93.43842630018862, "grad_norm": 0.1927400678396225, "learning_rate": 0.00029600437138836493, "loss": 0.5241, "step": 693500 }, { "epoch": 93.45189975747778, "grad_norm": 0.18604008853435516, "learning_rate": 0.00029596694511811733, "loss": 0.5229, "step": 693600 }, { "epoch": 93.46537321476691, "grad_norm": 0.21669624745845795, "learning_rate": 0.00029592951884786973, "loss": 0.5231, "step": 693700 }, { "epoch": 93.47884667205605, "grad_norm": 0.19635611772537231, "learning_rate": 0.00029589209257762213, "loss": 0.5224, "step": 693800 }, { "epoch": 93.49232012934519, "grad_norm": 0.19221949577331543, "learning_rate": 0.00029585466630737447, "loss": 0.5234, "step": 693900 }, { "epoch": 93.50579358663433, "grad_norm": 0.202737957239151, "learning_rate": 0.00029581724003712687, "loss": 0.5232, "step": 694000 }, { "epoch": 93.51926704392348, "grad_norm": 0.18027733266353607, "learning_rate": 0.0002957798137668792, "loss": 0.5238, "step": 694100 }, { "epoch": 93.53274050121261, "grad_norm": 0.19829589128494263, "learning_rate": 0.0002957423874966316, "loss": 0.5233, "step": 694200 }, { "epoch": 93.54621395850175, "grad_norm": 0.1766379177570343, "learning_rate": 0.000295704961226384, "loss": 0.5229, "step": 694300 }, { "epoch": 93.55968741579089, "grad_norm": 0.19807255268096924, "learning_rate": 0.0002956675349561364, "loss": 0.5231, "step": 694400 }, { "epoch": 93.57316087308003, "grad_norm": 0.170002281665802, "learning_rate": 0.0002956301086858888, "loss": 0.5236, "step": 694500 }, { "epoch": 93.58663433036918, "grad_norm": 0.18841136991977692, "learning_rate": 0.0002955926824156412, "loss": 0.5233, "step": 694600 }, { "epoch": 93.60010778765832, "grad_norm": 0.17620539665222168, "learning_rate": 0.0002955552561453936, "loss": 0.5226, "step": 694700 }, { "epoch": 93.61358124494745, "grad_norm": 0.17463624477386475, "learning_rate": 0.00029551782987514595, "loss": 0.5235, "step": 694800 }, { "epoch": 93.62705470223659, "grad_norm": 0.1964477002620697, "learning_rate": 0.00029548040360489835, "loss": 0.5229, "step": 694900 }, { "epoch": 93.64052815952573, "grad_norm": 0.17418666183948517, "learning_rate": 0.00029544297733465074, "loss": 0.5224, "step": 695000 }, { "epoch": 93.65400161681488, "grad_norm": 0.18571420013904572, "learning_rate": 0.00029540555106440314, "loss": 0.5234, "step": 695100 }, { "epoch": 93.66747507410402, "grad_norm": 0.1761966049671173, "learning_rate": 0.00029536812479415554, "loss": 0.5237, "step": 695200 }, { "epoch": 93.68094853139316, "grad_norm": 0.17941386997699738, "learning_rate": 0.00029533069852390794, "loss": 0.5237, "step": 695300 }, { "epoch": 93.6944219886823, "grad_norm": 0.1902530938386917, "learning_rate": 0.00029529327225366034, "loss": 0.5229, "step": 695400 }, { "epoch": 93.70789544597143, "grad_norm": 0.17340172827243805, "learning_rate": 0.0002952558459834127, "loss": 0.5226, "step": 695500 }, { "epoch": 93.72136890326058, "grad_norm": 0.19774949550628662, "learning_rate": 0.0002952184197131651, "loss": 0.5227, "step": 695600 }, { "epoch": 93.73484236054972, "grad_norm": 0.21054457128047943, "learning_rate": 0.0002951809934429174, "loss": 0.5227, "step": 695700 }, { "epoch": 93.74831581783886, "grad_norm": 0.1860658973455429, "learning_rate": 0.0002951435671726698, "loss": 0.5233, "step": 695800 }, { "epoch": 93.761789275128, "grad_norm": 0.20093975961208344, "learning_rate": 0.0002951061409024222, "loss": 0.5229, "step": 695900 }, { "epoch": 93.77526273241713, "grad_norm": 0.20171858370304108, "learning_rate": 0.0002950687146321746, "loss": 0.5224, "step": 696000 }, { "epoch": 93.78873618970628, "grad_norm": 0.18930520117282867, "learning_rate": 0.000295031288361927, "loss": 0.5233, "step": 696100 }, { "epoch": 93.80220964699542, "grad_norm": 0.18504808843135834, "learning_rate": 0.0002949938620916794, "loss": 0.5239, "step": 696200 }, { "epoch": 93.81568310428456, "grad_norm": 0.19891873002052307, "learning_rate": 0.00029495643582143176, "loss": 0.5234, "step": 696300 }, { "epoch": 93.8291565615737, "grad_norm": 0.18194499611854553, "learning_rate": 0.00029491900955118416, "loss": 0.5239, "step": 696400 }, { "epoch": 93.84263001886283, "grad_norm": 0.17752869427204132, "learning_rate": 0.00029488158328093656, "loss": 0.5241, "step": 696500 }, { "epoch": 93.85610347615199, "grad_norm": 0.16341407597064972, "learning_rate": 0.00029484415701068895, "loss": 0.5228, "step": 696600 }, { "epoch": 93.86957693344112, "grad_norm": 0.18255843222141266, "learning_rate": 0.00029480673074044135, "loss": 0.5228, "step": 696700 }, { "epoch": 93.88305039073026, "grad_norm": 0.18823067843914032, "learning_rate": 0.00029476930447019375, "loss": 0.5233, "step": 696800 }, { "epoch": 93.8965238480194, "grad_norm": 0.167974054813385, "learning_rate": 0.00029473187819994615, "loss": 0.5228, "step": 696900 }, { "epoch": 93.90999730530854, "grad_norm": 0.19403384625911713, "learning_rate": 0.0002946944519296985, "loss": 0.5231, "step": 697000 }, { "epoch": 93.92347076259769, "grad_norm": 0.2011253535747528, "learning_rate": 0.0002946570256594509, "loss": 0.5234, "step": 697100 }, { "epoch": 93.93694421988683, "grad_norm": 0.18637077510356903, "learning_rate": 0.0002946195993892033, "loss": 0.5219, "step": 697200 }, { "epoch": 93.95041767717596, "grad_norm": 0.1780603677034378, "learning_rate": 0.0002945821731189557, "loss": 0.5225, "step": 697300 }, { "epoch": 93.9638911344651, "grad_norm": 0.18962478637695312, "learning_rate": 0.0002945447468487081, "loss": 0.5236, "step": 697400 }, { "epoch": 93.97736459175424, "grad_norm": 0.1928565949201584, "learning_rate": 0.00029450732057846043, "loss": 0.5235, "step": 697500 }, { "epoch": 93.99083804904339, "grad_norm": 0.1747860312461853, "learning_rate": 0.00029446989430821283, "loss": 0.5241, "step": 697600 }, { "epoch": 94.0, "eval_loss": 0.5118894577026367, "eval_runtime": 4.9519, "eval_samples_per_second": 1009.704, "eval_steps_per_second": 15.953, "step": 697668 }, { "epoch": 94.00431150633253, "grad_norm": 0.21682192385196686, "learning_rate": 0.00029443246803796517, "loss": 0.5217, "step": 697700 }, { "epoch": 94.01778496362166, "grad_norm": 0.21032312512397766, "learning_rate": 0.00029439504176771757, "loss": 0.522, "step": 697800 }, { "epoch": 94.0312584209108, "grad_norm": 0.173993781208992, "learning_rate": 0.00029435761549746997, "loss": 0.5223, "step": 697900 }, { "epoch": 94.04473187819994, "grad_norm": 0.18371060490608215, "learning_rate": 0.00029432018922722237, "loss": 0.5221, "step": 698000 }, { "epoch": 94.05820533548909, "grad_norm": 0.1732446402311325, "learning_rate": 0.00029428276295697476, "loss": 0.5225, "step": 698100 }, { "epoch": 94.07167879277823, "grad_norm": 0.1796966791152954, "learning_rate": 0.00029424533668672716, "loss": 0.5227, "step": 698200 }, { "epoch": 94.08515225006737, "grad_norm": 0.20513981580734253, "learning_rate": 0.00029420791041647956, "loss": 0.523, "step": 698300 }, { "epoch": 94.0986257073565, "grad_norm": 0.1759486347436905, "learning_rate": 0.0002941704841462319, "loss": 0.5242, "step": 698400 }, { "epoch": 94.11209916464564, "grad_norm": 0.17762942612171173, "learning_rate": 0.0002941330578759843, "loss": 0.5217, "step": 698500 }, { "epoch": 94.1255726219348, "grad_norm": 0.18017913401126862, "learning_rate": 0.0002940956316057367, "loss": 0.5226, "step": 698600 }, { "epoch": 94.13904607922393, "grad_norm": 0.17408691346645355, "learning_rate": 0.0002940582053354891, "loss": 0.5222, "step": 698700 }, { "epoch": 94.15251953651307, "grad_norm": 0.1993837207555771, "learning_rate": 0.0002940207790652415, "loss": 0.523, "step": 698800 }, { "epoch": 94.1659929938022, "grad_norm": 0.19028015434741974, "learning_rate": 0.0002939833527949939, "loss": 0.5223, "step": 698900 }, { "epoch": 94.17946645109134, "grad_norm": 0.18538329005241394, "learning_rate": 0.0002939459265247463, "loss": 0.5222, "step": 699000 }, { "epoch": 94.1929399083805, "grad_norm": 0.1857767254114151, "learning_rate": 0.0002939085002544987, "loss": 0.523, "step": 699100 }, { "epoch": 94.20641336566963, "grad_norm": 0.17437945306301117, "learning_rate": 0.00029387107398425104, "loss": 0.523, "step": 699200 }, { "epoch": 94.21988682295877, "grad_norm": 0.1817256510257721, "learning_rate": 0.0002938336477140034, "loss": 0.5224, "step": 699300 }, { "epoch": 94.23336028024791, "grad_norm": 0.18669940531253815, "learning_rate": 0.0002937962214437558, "loss": 0.5228, "step": 699400 }, { "epoch": 94.24683373753705, "grad_norm": 0.17742404341697693, "learning_rate": 0.0002937587951735082, "loss": 0.5228, "step": 699500 }, { "epoch": 94.2603071948262, "grad_norm": 0.18409943580627441, "learning_rate": 0.0002937213689032606, "loss": 0.5224, "step": 699600 }, { "epoch": 94.27378065211533, "grad_norm": 0.20300227403640747, "learning_rate": 0.000293683942633013, "loss": 0.5236, "step": 699700 }, { "epoch": 94.28725410940447, "grad_norm": 0.19445565342903137, "learning_rate": 0.00029364651636276537, "loss": 0.5238, "step": 699800 }, { "epoch": 94.30072756669361, "grad_norm": 0.18765023350715637, "learning_rate": 0.0002936090900925177, "loss": 0.5228, "step": 699900 }, { "epoch": 94.31420102398275, "grad_norm": 0.18512558937072754, "learning_rate": 0.0002935716638222701, "loss": 0.5229, "step": 700000 }, { "epoch": 94.3276744812719, "grad_norm": 0.2284773886203766, "learning_rate": 0.0002935342375520225, "loss": 0.523, "step": 700100 }, { "epoch": 94.34114793856104, "grad_norm": 0.17466995120048523, "learning_rate": 0.0002934968112817749, "loss": 0.5233, "step": 700200 }, { "epoch": 94.35462139585017, "grad_norm": 0.18043150007724762, "learning_rate": 0.0002934593850115273, "loss": 0.5219, "step": 700300 }, { "epoch": 94.36809485313931, "grad_norm": 0.21195508539676666, "learning_rate": 0.0002934219587412797, "loss": 0.5232, "step": 700400 }, { "epoch": 94.38156831042846, "grad_norm": 0.1960461288690567, "learning_rate": 0.0002933845324710321, "loss": 0.5222, "step": 700500 }, { "epoch": 94.3950417677176, "grad_norm": 0.17179718613624573, "learning_rate": 0.00029334710620078445, "loss": 0.5225, "step": 700600 }, { "epoch": 94.40851522500674, "grad_norm": 0.20699681341648102, "learning_rate": 0.00029330967993053685, "loss": 0.5232, "step": 700700 }, { "epoch": 94.42198868229588, "grad_norm": 0.21011970937252045, "learning_rate": 0.00029327225366028925, "loss": 0.5224, "step": 700800 }, { "epoch": 94.43546213958501, "grad_norm": 0.18806912004947662, "learning_rate": 0.00029323482739004164, "loss": 0.5231, "step": 700900 }, { "epoch": 94.44893559687416, "grad_norm": 0.1749875545501709, "learning_rate": 0.00029319740111979404, "loss": 0.5219, "step": 701000 }, { "epoch": 94.4624090541633, "grad_norm": 0.1855938881635666, "learning_rate": 0.0002931599748495464, "loss": 0.5226, "step": 701100 }, { "epoch": 94.47588251145244, "grad_norm": 0.17123809456825256, "learning_rate": 0.0002931225485792988, "loss": 0.5239, "step": 701200 }, { "epoch": 94.48935596874158, "grad_norm": 0.20363645255565643, "learning_rate": 0.00029308512230905113, "loss": 0.5229, "step": 701300 }, { "epoch": 94.50282942603071, "grad_norm": 0.20384660363197327, "learning_rate": 0.0002930476960388035, "loss": 0.5224, "step": 701400 }, { "epoch": 94.51630288331987, "grad_norm": 0.16678699851036072, "learning_rate": 0.0002930102697685559, "loss": 0.5219, "step": 701500 }, { "epoch": 94.529776340609, "grad_norm": 0.1775287389755249, "learning_rate": 0.0002929728434983083, "loss": 0.5226, "step": 701600 }, { "epoch": 94.54324979789814, "grad_norm": 0.19179579615592957, "learning_rate": 0.0002929354172280607, "loss": 0.5231, "step": 701700 }, { "epoch": 94.55672325518728, "grad_norm": 0.1911093145608902, "learning_rate": 0.0002928979909578131, "loss": 0.5243, "step": 701800 }, { "epoch": 94.57019671247642, "grad_norm": 0.18186962604522705, "learning_rate": 0.0002928605646875655, "loss": 0.5224, "step": 701900 }, { "epoch": 94.58367016976557, "grad_norm": 0.19009339809417725, "learning_rate": 0.0002928231384173179, "loss": 0.523, "step": 702000 }, { "epoch": 94.5971436270547, "grad_norm": 0.17108382284641266, "learning_rate": 0.00029278571214707026, "loss": 0.5223, "step": 702100 }, { "epoch": 94.61061708434384, "grad_norm": 0.19059503078460693, "learning_rate": 0.00029274828587682266, "loss": 0.5226, "step": 702200 }, { "epoch": 94.62409054163298, "grad_norm": 0.19769278168678284, "learning_rate": 0.00029271085960657506, "loss": 0.5231, "step": 702300 }, { "epoch": 94.63756399892212, "grad_norm": 0.18687328696250916, "learning_rate": 0.00029267343333632745, "loss": 0.5242, "step": 702400 }, { "epoch": 94.65103745621127, "grad_norm": 0.18281330168247223, "learning_rate": 0.00029263600706607985, "loss": 0.5239, "step": 702500 }, { "epoch": 94.66451091350041, "grad_norm": 0.19114643335342407, "learning_rate": 0.00029259858079583225, "loss": 0.5236, "step": 702600 }, { "epoch": 94.67798437078955, "grad_norm": 0.19333960115909576, "learning_rate": 0.00029256115452558465, "loss": 0.523, "step": 702700 }, { "epoch": 94.69145782807868, "grad_norm": 0.1965240240097046, "learning_rate": 0.000292523728255337, "loss": 0.5227, "step": 702800 }, { "epoch": 94.70493128536782, "grad_norm": 0.19884730875492096, "learning_rate": 0.00029248630198508934, "loss": 0.5226, "step": 702900 }, { "epoch": 94.71840474265697, "grad_norm": 0.18729977309703827, "learning_rate": 0.00029244887571484174, "loss": 0.5239, "step": 703000 }, { "epoch": 94.73187819994611, "grad_norm": 0.1860930621623993, "learning_rate": 0.00029241144944459413, "loss": 0.5222, "step": 703100 }, { "epoch": 94.74535165723525, "grad_norm": 0.1861838400363922, "learning_rate": 0.00029237402317434653, "loss": 0.5221, "step": 703200 }, { "epoch": 94.75882511452438, "grad_norm": 0.17986220121383667, "learning_rate": 0.00029233659690409893, "loss": 0.523, "step": 703300 }, { "epoch": 94.77229857181352, "grad_norm": 0.1926204413175583, "learning_rate": 0.00029229917063385133, "loss": 0.5222, "step": 703400 }, { "epoch": 94.78577202910267, "grad_norm": 0.20427332818508148, "learning_rate": 0.00029226174436360367, "loss": 0.5225, "step": 703500 }, { "epoch": 94.79924548639181, "grad_norm": 0.19252969324588776, "learning_rate": 0.00029222431809335607, "loss": 0.5226, "step": 703600 }, { "epoch": 94.81271894368095, "grad_norm": 0.18984580039978027, "learning_rate": 0.00029218689182310847, "loss": 0.5227, "step": 703700 }, { "epoch": 94.82619240097009, "grad_norm": 0.18676522374153137, "learning_rate": 0.00029214946555286087, "loss": 0.5227, "step": 703800 }, { "epoch": 94.83966585825922, "grad_norm": 0.19871100783348083, "learning_rate": 0.00029211203928261327, "loss": 0.5221, "step": 703900 }, { "epoch": 94.85313931554838, "grad_norm": 0.18752004206180573, "learning_rate": 0.00029207461301236566, "loss": 0.5234, "step": 704000 }, { "epoch": 94.86661277283751, "grad_norm": 0.18200543522834778, "learning_rate": 0.00029203718674211806, "loss": 0.5229, "step": 704100 }, { "epoch": 94.88008623012665, "grad_norm": 0.2073075920343399, "learning_rate": 0.00029199976047187046, "loss": 0.5219, "step": 704200 }, { "epoch": 94.89355968741579, "grad_norm": 0.17967191338539124, "learning_rate": 0.0002919623342016228, "loss": 0.5238, "step": 704300 }, { "epoch": 94.90703314470493, "grad_norm": 0.19033488631248474, "learning_rate": 0.0002919249079313752, "loss": 0.523, "step": 704400 }, { "epoch": 94.92050660199408, "grad_norm": 0.19411924481391907, "learning_rate": 0.0002918874816611276, "loss": 0.524, "step": 704500 }, { "epoch": 94.93398005928321, "grad_norm": 0.1792326271533966, "learning_rate": 0.00029185005539088, "loss": 0.5235, "step": 704600 }, { "epoch": 94.94745351657235, "grad_norm": 0.19451342523097992, "learning_rate": 0.00029181262912063234, "loss": 0.5223, "step": 704700 }, { "epoch": 94.96092697386149, "grad_norm": 0.18401820957660675, "learning_rate": 0.00029177520285038474, "loss": 0.524, "step": 704800 }, { "epoch": 94.97440043115063, "grad_norm": 0.17833800613880157, "learning_rate": 0.00029173777658013714, "loss": 0.5222, "step": 704900 }, { "epoch": 94.98787388843978, "grad_norm": 0.1776718944311142, "learning_rate": 0.0002917003503098895, "loss": 0.5226, "step": 705000 }, { "epoch": 95.0, "eval_loss": 0.5117392539978027, "eval_runtime": 4.9552, "eval_samples_per_second": 1009.038, "eval_steps_per_second": 15.943, "step": 705090 }, { "epoch": 95.00134734572892, "grad_norm": 0.1820594221353531, "learning_rate": 0.0002916629240396419, "loss": 0.5234, "step": 705100 }, { "epoch": 95.01482080301805, "grad_norm": 0.17345169186592102, "learning_rate": 0.0002916254977693943, "loss": 0.5223, "step": 705200 }, { "epoch": 95.02829426030719, "grad_norm": 0.19404955208301544, "learning_rate": 0.0002915880714991467, "loss": 0.523, "step": 705300 }, { "epoch": 95.04176771759633, "grad_norm": 0.17958186566829681, "learning_rate": 0.0002915506452288991, "loss": 0.5221, "step": 705400 }, { "epoch": 95.05524117488548, "grad_norm": 0.19313064217567444, "learning_rate": 0.0002915132189586515, "loss": 0.5216, "step": 705500 }, { "epoch": 95.06871463217462, "grad_norm": 0.1909513920545578, "learning_rate": 0.0002914757926884039, "loss": 0.5225, "step": 705600 }, { "epoch": 95.08218808946376, "grad_norm": 0.19369301199913025, "learning_rate": 0.0002914383664181562, "loss": 0.5227, "step": 705700 }, { "epoch": 95.0956615467529, "grad_norm": 0.18028217554092407, "learning_rate": 0.0002914009401479086, "loss": 0.5232, "step": 705800 }, { "epoch": 95.10913500404203, "grad_norm": 0.19462719559669495, "learning_rate": 0.000291363513877661, "loss": 0.522, "step": 705900 }, { "epoch": 95.12260846133118, "grad_norm": 0.2136775702238083, "learning_rate": 0.0002913260876074134, "loss": 0.5227, "step": 706000 }, { "epoch": 95.13608191862032, "grad_norm": 0.19212883710861206, "learning_rate": 0.0002912886613371658, "loss": 0.5227, "step": 706100 }, { "epoch": 95.14955537590946, "grad_norm": 0.17325977981090546, "learning_rate": 0.0002912512350669182, "loss": 0.5232, "step": 706200 }, { "epoch": 95.1630288331986, "grad_norm": 0.18062567710876465, "learning_rate": 0.0002912138087966706, "loss": 0.5221, "step": 706300 }, { "epoch": 95.17650229048773, "grad_norm": 0.18218502402305603, "learning_rate": 0.000291176382526423, "loss": 0.5231, "step": 706400 }, { "epoch": 95.18997574777688, "grad_norm": 0.18082834780216217, "learning_rate": 0.0002911389562561753, "loss": 0.5233, "step": 706500 }, { "epoch": 95.20344920506602, "grad_norm": 0.19332224130630493, "learning_rate": 0.0002911015299859277, "loss": 0.5217, "step": 706600 }, { "epoch": 95.21692266235516, "grad_norm": 0.17943823337554932, "learning_rate": 0.0002910641037156801, "loss": 0.5228, "step": 706700 }, { "epoch": 95.2303961196443, "grad_norm": 0.20554178953170776, "learning_rate": 0.0002910266774454325, "loss": 0.5226, "step": 706800 }, { "epoch": 95.24386957693343, "grad_norm": 0.19103063642978668, "learning_rate": 0.0002909892511751849, "loss": 0.5229, "step": 706900 }, { "epoch": 95.25734303422259, "grad_norm": 0.2167213261127472, "learning_rate": 0.0002909518249049373, "loss": 0.5228, "step": 707000 }, { "epoch": 95.27081649151172, "grad_norm": 0.20879121124744415, "learning_rate": 0.0002909143986346897, "loss": 0.5221, "step": 707100 }, { "epoch": 95.28428994880086, "grad_norm": 0.19365574419498444, "learning_rate": 0.00029087697236444203, "loss": 0.523, "step": 707200 }, { "epoch": 95.29776340609, "grad_norm": 0.1799764186143875, "learning_rate": 0.0002908395460941944, "loss": 0.5229, "step": 707300 }, { "epoch": 95.31123686337914, "grad_norm": 0.19225572049617767, "learning_rate": 0.0002908021198239468, "loss": 0.523, "step": 707400 }, { "epoch": 95.32471032066829, "grad_norm": 0.20773287117481232, "learning_rate": 0.0002907646935536992, "loss": 0.5217, "step": 707500 }, { "epoch": 95.33818377795743, "grad_norm": 0.18117167055606842, "learning_rate": 0.0002907272672834516, "loss": 0.5219, "step": 707600 }, { "epoch": 95.35165723524656, "grad_norm": 0.1877511590719223, "learning_rate": 0.000290689841013204, "loss": 0.5223, "step": 707700 }, { "epoch": 95.3651306925357, "grad_norm": 0.23672688007354736, "learning_rate": 0.0002906524147429564, "loss": 0.5224, "step": 707800 }, { "epoch": 95.37860414982484, "grad_norm": 0.21316350996494293, "learning_rate": 0.00029061498847270876, "loss": 0.5228, "step": 707900 }, { "epoch": 95.39207760711399, "grad_norm": 0.18906241655349731, "learning_rate": 0.00029057756220246116, "loss": 0.5224, "step": 708000 }, { "epoch": 95.40555106440313, "grad_norm": 0.17778439819812775, "learning_rate": 0.00029054013593221356, "loss": 0.5239, "step": 708100 }, { "epoch": 95.41902452169226, "grad_norm": 0.18994279205799103, "learning_rate": 0.00029050270966196596, "loss": 0.5217, "step": 708200 }, { "epoch": 95.4324979789814, "grad_norm": 0.16524139046669006, "learning_rate": 0.0002904652833917183, "loss": 0.5232, "step": 708300 }, { "epoch": 95.44597143627055, "grad_norm": 0.17890778183937073, "learning_rate": 0.0002904278571214707, "loss": 0.5218, "step": 708400 }, { "epoch": 95.45944489355969, "grad_norm": 0.18178986012935638, "learning_rate": 0.0002903904308512231, "loss": 0.523, "step": 708500 }, { "epoch": 95.47291835084883, "grad_norm": 0.1859983503818512, "learning_rate": 0.00029035300458097544, "loss": 0.5231, "step": 708600 }, { "epoch": 95.48639180813797, "grad_norm": 0.2037556916475296, "learning_rate": 0.00029031557831072784, "loss": 0.5221, "step": 708700 }, { "epoch": 95.4998652654271, "grad_norm": 0.17180684208869934, "learning_rate": 0.00029027815204048024, "loss": 0.5229, "step": 708800 }, { "epoch": 95.51333872271626, "grad_norm": 0.20080384612083435, "learning_rate": 0.00029024072577023264, "loss": 0.5229, "step": 708900 }, { "epoch": 95.5268121800054, "grad_norm": 0.17723606526851654, "learning_rate": 0.00029020329949998503, "loss": 0.5229, "step": 709000 }, { "epoch": 95.54028563729453, "grad_norm": 0.1731012612581253, "learning_rate": 0.00029016587322973743, "loss": 0.5213, "step": 709100 }, { "epoch": 95.55375909458367, "grad_norm": 0.2325502634048462, "learning_rate": 0.00029012844695948983, "loss": 0.5226, "step": 709200 }, { "epoch": 95.5672325518728, "grad_norm": 0.1863650679588318, "learning_rate": 0.00029009102068924223, "loss": 0.5222, "step": 709300 }, { "epoch": 95.58070600916196, "grad_norm": 0.18925268948078156, "learning_rate": 0.00029005359441899457, "loss": 0.5212, "step": 709400 }, { "epoch": 95.5941794664511, "grad_norm": 0.1769581437110901, "learning_rate": 0.00029001616814874697, "loss": 0.5223, "step": 709500 }, { "epoch": 95.60765292374023, "grad_norm": 0.17834044992923737, "learning_rate": 0.00028997874187849937, "loss": 0.5231, "step": 709600 }, { "epoch": 95.62112638102937, "grad_norm": 0.19362734258174896, "learning_rate": 0.00028994131560825177, "loss": 0.5236, "step": 709700 }, { "epoch": 95.63459983831851, "grad_norm": 0.17220856249332428, "learning_rate": 0.00028990388933800417, "loss": 0.5227, "step": 709800 }, { "epoch": 95.64807329560766, "grad_norm": 0.17917141318321228, "learning_rate": 0.00028986646306775656, "loss": 0.5226, "step": 709900 }, { "epoch": 95.6615467528968, "grad_norm": 0.2204906940460205, "learning_rate": 0.00028982903679750896, "loss": 0.524, "step": 710000 }, { "epoch": 95.67502021018593, "grad_norm": 0.18735459446907043, "learning_rate": 0.0002897916105272613, "loss": 0.5241, "step": 710100 }, { "epoch": 95.68849366747507, "grad_norm": 0.18174593150615692, "learning_rate": 0.00028975418425701365, "loss": 0.5219, "step": 710200 }, { "epoch": 95.70196712476421, "grad_norm": 0.2111864686012268, "learning_rate": 0.00028971675798676605, "loss": 0.5231, "step": 710300 }, { "epoch": 95.71544058205336, "grad_norm": 0.18047483265399933, "learning_rate": 0.00028967933171651845, "loss": 0.5232, "step": 710400 }, { "epoch": 95.7289140393425, "grad_norm": 0.18961310386657715, "learning_rate": 0.00028964190544627084, "loss": 0.5232, "step": 710500 }, { "epoch": 95.74238749663164, "grad_norm": 0.1795778125524521, "learning_rate": 0.00028960447917602324, "loss": 0.5233, "step": 710600 }, { "epoch": 95.75586095392077, "grad_norm": 0.21127276122570038, "learning_rate": 0.00028956705290577564, "loss": 0.523, "step": 710700 }, { "epoch": 95.76933441120991, "grad_norm": 0.17938001453876495, "learning_rate": 0.000289529626635528, "loss": 0.5229, "step": 710800 }, { "epoch": 95.78280786849906, "grad_norm": 0.19308243691921234, "learning_rate": 0.0002894922003652804, "loss": 0.523, "step": 710900 }, { "epoch": 95.7962813257882, "grad_norm": 0.1780468076467514, "learning_rate": 0.0002894547740950328, "loss": 0.5219, "step": 711000 }, { "epoch": 95.80975478307734, "grad_norm": 0.21702976524829865, "learning_rate": 0.0002894173478247852, "loss": 0.5233, "step": 711100 }, { "epoch": 95.82322824036648, "grad_norm": 0.17047972977161407, "learning_rate": 0.0002893799215545376, "loss": 0.5231, "step": 711200 }, { "epoch": 95.83670169765561, "grad_norm": 0.18960827589035034, "learning_rate": 0.00028934249528429, "loss": 0.522, "step": 711300 }, { "epoch": 95.85017515494476, "grad_norm": 0.18834882974624634, "learning_rate": 0.0002893050690140424, "loss": 0.5228, "step": 711400 }, { "epoch": 95.8636486122339, "grad_norm": 0.19012999534606934, "learning_rate": 0.00028926764274379477, "loss": 0.5223, "step": 711500 }, { "epoch": 95.87712206952304, "grad_norm": 0.18600955605506897, "learning_rate": 0.0002892302164735471, "loss": 0.5228, "step": 711600 }, { "epoch": 95.89059552681218, "grad_norm": 0.18093451857566833, "learning_rate": 0.0002891927902032995, "loss": 0.5221, "step": 711700 }, { "epoch": 95.90406898410131, "grad_norm": 0.18062306940555573, "learning_rate": 0.0002891553639330519, "loss": 0.5228, "step": 711800 }, { "epoch": 95.91754244139047, "grad_norm": 0.1726381778717041, "learning_rate": 0.0002891179376628043, "loss": 0.5227, "step": 711900 }, { "epoch": 95.9310158986796, "grad_norm": 0.16839486360549927, "learning_rate": 0.00028908051139255666, "loss": 0.5224, "step": 712000 }, { "epoch": 95.94448935596874, "grad_norm": 0.1908797323703766, "learning_rate": 0.00028904308512230905, "loss": 0.523, "step": 712100 }, { "epoch": 95.95796281325788, "grad_norm": 0.193431556224823, "learning_rate": 0.00028900565885206145, "loss": 0.5225, "step": 712200 }, { "epoch": 95.97143627054702, "grad_norm": 0.18694765865802765, "learning_rate": 0.0002889682325818138, "loss": 0.5223, "step": 712300 }, { "epoch": 95.98490972783617, "grad_norm": 0.18116642534732819, "learning_rate": 0.0002889308063115662, "loss": 0.5231, "step": 712400 }, { "epoch": 95.9983831851253, "grad_norm": 0.20007793605327606, "learning_rate": 0.0002888933800413186, "loss": 0.5226, "step": 712500 }, { "epoch": 96.0, "eval_loss": 0.5114685297012329, "eval_runtime": 4.9525, "eval_samples_per_second": 1009.586, "eval_steps_per_second": 15.951, "step": 712512 }, { "epoch": 96.01185664241444, "grad_norm": 0.1811710000038147, "learning_rate": 0.000288855953771071, "loss": 0.5218, "step": 712600 }, { "epoch": 96.02533009970358, "grad_norm": 0.1941787749528885, "learning_rate": 0.0002888185275008234, "loss": 0.5214, "step": 712700 }, { "epoch": 96.03880355699272, "grad_norm": 0.19701707363128662, "learning_rate": 0.0002887811012305758, "loss": 0.5223, "step": 712800 }, { "epoch": 96.05227701428187, "grad_norm": 0.19167600572109222, "learning_rate": 0.0002887436749603282, "loss": 0.5221, "step": 712900 }, { "epoch": 96.06575047157101, "grad_norm": 0.17858314514160156, "learning_rate": 0.00028870624869008053, "loss": 0.5225, "step": 713000 }, { "epoch": 96.07922392886015, "grad_norm": 0.2007930427789688, "learning_rate": 0.00028866882241983293, "loss": 0.5223, "step": 713100 }, { "epoch": 96.09269738614928, "grad_norm": 0.18336080014705658, "learning_rate": 0.0002886313961495853, "loss": 0.5214, "step": 713200 }, { "epoch": 96.10617084343842, "grad_norm": 0.18901588022708893, "learning_rate": 0.0002885939698793377, "loss": 0.5219, "step": 713300 }, { "epoch": 96.11964430072757, "grad_norm": 0.23817165195941925, "learning_rate": 0.0002885565436090901, "loss": 0.522, "step": 713400 }, { "epoch": 96.13311775801671, "grad_norm": 0.18332539498806, "learning_rate": 0.0002885191173388425, "loss": 0.5227, "step": 713500 }, { "epoch": 96.14659121530585, "grad_norm": 0.17459805309772491, "learning_rate": 0.0002884816910685949, "loss": 0.5219, "step": 713600 }, { "epoch": 96.16006467259498, "grad_norm": 0.17003099620342255, "learning_rate": 0.00028844426479834726, "loss": 0.5235, "step": 713700 }, { "epoch": 96.17353812988412, "grad_norm": 0.1794077306985855, "learning_rate": 0.0002884068385280996, "loss": 0.5216, "step": 713800 }, { "epoch": 96.18701158717327, "grad_norm": 0.18562164902687073, "learning_rate": 0.000288369412257852, "loss": 0.5219, "step": 713900 }, { "epoch": 96.20048504446241, "grad_norm": 0.17242754995822906, "learning_rate": 0.0002883319859876044, "loss": 0.5212, "step": 714000 }, { "epoch": 96.21395850175155, "grad_norm": 0.18148313462734222, "learning_rate": 0.0002882945597173568, "loss": 0.5232, "step": 714100 }, { "epoch": 96.22743195904069, "grad_norm": 0.1889973282814026, "learning_rate": 0.0002882571334471092, "loss": 0.5223, "step": 714200 }, { "epoch": 96.24090541632982, "grad_norm": 0.17718039453029633, "learning_rate": 0.0002882197071768616, "loss": 0.5234, "step": 714300 }, { "epoch": 96.25437887361898, "grad_norm": 0.18405863642692566, "learning_rate": 0.000288182280906614, "loss": 0.5227, "step": 714400 }, { "epoch": 96.26785233090811, "grad_norm": 0.17135992646217346, "learning_rate": 0.00028814485463636634, "loss": 0.5227, "step": 714500 }, { "epoch": 96.28132578819725, "grad_norm": 0.19410187005996704, "learning_rate": 0.00028810742836611874, "loss": 0.5224, "step": 714600 }, { "epoch": 96.29479924548639, "grad_norm": 0.18173222243785858, "learning_rate": 0.00028807000209587114, "loss": 0.5232, "step": 714700 }, { "epoch": 96.30827270277553, "grad_norm": 0.18119361996650696, "learning_rate": 0.00028803257582562353, "loss": 0.5227, "step": 714800 }, { "epoch": 96.32174616006468, "grad_norm": 0.17957240343093872, "learning_rate": 0.00028799514955537593, "loss": 0.5217, "step": 714900 }, { "epoch": 96.33521961735381, "grad_norm": 0.1887197196483612, "learning_rate": 0.00028795772328512833, "loss": 0.5227, "step": 715000 }, { "epoch": 96.34869307464295, "grad_norm": 0.23211714625358582, "learning_rate": 0.00028792029701488073, "loss": 0.5228, "step": 715100 }, { "epoch": 96.36216653193209, "grad_norm": 0.17982782423496246, "learning_rate": 0.0002878828707446331, "loss": 0.5213, "step": 715200 }, { "epoch": 96.37563998922123, "grad_norm": 0.17843103408813477, "learning_rate": 0.00028784544447438547, "loss": 0.5234, "step": 715300 }, { "epoch": 96.38911344651038, "grad_norm": 0.17092712223529816, "learning_rate": 0.00028780801820413787, "loss": 0.5227, "step": 715400 }, { "epoch": 96.40258690379952, "grad_norm": 0.17941531538963318, "learning_rate": 0.00028777059193389027, "loss": 0.5224, "step": 715500 }, { "epoch": 96.41606036108865, "grad_norm": 0.19431157410144806, "learning_rate": 0.0002877331656636426, "loss": 0.5222, "step": 715600 }, { "epoch": 96.42953381837779, "grad_norm": 0.20001807808876038, "learning_rate": 0.000287695739393395, "loss": 0.5218, "step": 715700 }, { "epoch": 96.44300727566694, "grad_norm": 0.1816766858100891, "learning_rate": 0.0002876583131231474, "loss": 0.5224, "step": 715800 }, { "epoch": 96.45648073295608, "grad_norm": 0.19021575152873993, "learning_rate": 0.00028762088685289975, "loss": 0.5229, "step": 715900 }, { "epoch": 96.46995419024522, "grad_norm": 0.19579598307609558, "learning_rate": 0.00028758346058265215, "loss": 0.5228, "step": 716000 }, { "epoch": 96.48342764753436, "grad_norm": 0.22743633389472961, "learning_rate": 0.00028754603431240455, "loss": 0.522, "step": 716100 }, { "epoch": 96.4969011048235, "grad_norm": 0.17913639545440674, "learning_rate": 0.00028750860804215695, "loss": 0.5229, "step": 716200 }, { "epoch": 96.51037456211264, "grad_norm": 0.17683516442775726, "learning_rate": 0.00028747118177190935, "loss": 0.5222, "step": 716300 }, { "epoch": 96.52384801940178, "grad_norm": 0.2007060945034027, "learning_rate": 0.00028743375550166174, "loss": 0.5222, "step": 716400 }, { "epoch": 96.53732147669092, "grad_norm": 0.1924578845500946, "learning_rate": 0.00028739632923141414, "loss": 0.5221, "step": 716500 }, { "epoch": 96.55079493398006, "grad_norm": 0.18029673397541046, "learning_rate": 0.0002873589029611665, "loss": 0.5219, "step": 716600 }, { "epoch": 96.5642683912692, "grad_norm": 0.290314644575119, "learning_rate": 0.0002873214766909189, "loss": 0.5222, "step": 716700 }, { "epoch": 96.57774184855835, "grad_norm": 0.1788051426410675, "learning_rate": 0.0002872840504206713, "loss": 0.5223, "step": 716800 }, { "epoch": 96.59121530584748, "grad_norm": 0.20173092186450958, "learning_rate": 0.0002872466241504237, "loss": 0.5226, "step": 716900 }, { "epoch": 96.60468876313662, "grad_norm": 0.1771472692489624, "learning_rate": 0.0002872091978801761, "loss": 0.5224, "step": 717000 }, { "epoch": 96.61816222042576, "grad_norm": 0.22781701385974884, "learning_rate": 0.0002871717716099285, "loss": 0.5227, "step": 717100 }, { "epoch": 96.6316356777149, "grad_norm": 0.2002069056034088, "learning_rate": 0.0002871343453396809, "loss": 0.5222, "step": 717200 }, { "epoch": 96.64510913500405, "grad_norm": 0.18238565325737, "learning_rate": 0.0002870969190694333, "loss": 0.5221, "step": 717300 }, { "epoch": 96.65858259229319, "grad_norm": 0.182054340839386, "learning_rate": 0.00028705949279918556, "loss": 0.5229, "step": 717400 }, { "epoch": 96.67205604958232, "grad_norm": 0.17328771948814392, "learning_rate": 0.00028702206652893796, "loss": 0.5225, "step": 717500 }, { "epoch": 96.68552950687146, "grad_norm": 0.21185079216957092, "learning_rate": 0.00028698464025869036, "loss": 0.523, "step": 717600 }, { "epoch": 96.6990029641606, "grad_norm": 0.18636102974414825, "learning_rate": 0.00028694721398844276, "loss": 0.5224, "step": 717700 }, { "epoch": 96.71247642144975, "grad_norm": 0.2089177966117859, "learning_rate": 0.00028690978771819516, "loss": 0.5227, "step": 717800 }, { "epoch": 96.72594987873889, "grad_norm": 0.19574575126171112, "learning_rate": 0.00028687236144794755, "loss": 0.5219, "step": 717900 }, { "epoch": 96.73942333602803, "grad_norm": 0.18756195902824402, "learning_rate": 0.00028683493517769995, "loss": 0.522, "step": 718000 }, { "epoch": 96.75289679331716, "grad_norm": 0.20693308115005493, "learning_rate": 0.0002867975089074523, "loss": 0.5232, "step": 718100 }, { "epoch": 96.7663702506063, "grad_norm": 0.1783106029033661, "learning_rate": 0.0002867600826372047, "loss": 0.5223, "step": 718200 }, { "epoch": 96.77984370789545, "grad_norm": 0.20046482980251312, "learning_rate": 0.0002867226563669571, "loss": 0.5227, "step": 718300 }, { "epoch": 96.79331716518459, "grad_norm": 0.19648335874080658, "learning_rate": 0.0002866852300967095, "loss": 0.5237, "step": 718400 }, { "epoch": 96.80679062247373, "grad_norm": 0.1909601390361786, "learning_rate": 0.0002866478038264619, "loss": 0.5229, "step": 718500 }, { "epoch": 96.82026407976286, "grad_norm": 0.19479787349700928, "learning_rate": 0.0002866103775562143, "loss": 0.5226, "step": 718600 }, { "epoch": 96.833737537052, "grad_norm": 0.1974005401134491, "learning_rate": 0.0002865729512859667, "loss": 0.5233, "step": 718700 }, { "epoch": 96.84721099434115, "grad_norm": 0.2040482461452484, "learning_rate": 0.00028653552501571903, "loss": 0.5225, "step": 718800 }, { "epoch": 96.86068445163029, "grad_norm": 0.18931913375854492, "learning_rate": 0.00028649809874547143, "loss": 0.5218, "step": 718900 }, { "epoch": 96.87415790891943, "grad_norm": 0.1883486807346344, "learning_rate": 0.0002864606724752238, "loss": 0.5216, "step": 719000 }, { "epoch": 96.88763136620857, "grad_norm": 0.18975022435188293, "learning_rate": 0.0002864232462049762, "loss": 0.5227, "step": 719100 }, { "epoch": 96.9011048234977, "grad_norm": 0.17646729946136475, "learning_rate": 0.00028638581993472857, "loss": 0.5232, "step": 719200 }, { "epoch": 96.91457828078686, "grad_norm": 0.17611093819141388, "learning_rate": 0.00028634839366448097, "loss": 0.5222, "step": 719300 }, { "epoch": 96.928051738076, "grad_norm": 0.1817905604839325, "learning_rate": 0.00028631096739423337, "loss": 0.5225, "step": 719400 }, { "epoch": 96.94152519536513, "grad_norm": 0.2132764458656311, "learning_rate": 0.00028627354112398576, "loss": 0.5227, "step": 719500 }, { "epoch": 96.95499865265427, "grad_norm": 0.17761015892028809, "learning_rate": 0.0002862361148537381, "loss": 0.5225, "step": 719600 }, { "epoch": 96.9684721099434, "grad_norm": 0.1805490106344223, "learning_rate": 0.0002861986885834905, "loss": 0.5226, "step": 719700 }, { "epoch": 96.98194556723256, "grad_norm": 0.19137020409107208, "learning_rate": 0.0002861612623132429, "loss": 0.5225, "step": 719800 }, { "epoch": 96.9954190245217, "grad_norm": 0.2305106520652771, "learning_rate": 0.0002861238360429953, "loss": 0.5227, "step": 719900 }, { "epoch": 97.0, "eval_loss": 0.5115074515342712, "eval_runtime": 4.9582, "eval_samples_per_second": 1008.44, "eval_steps_per_second": 15.933, "step": 719934 }, { "epoch": 97.00889248181083, "grad_norm": 0.18457642197608948, "learning_rate": 0.0002860864097727477, "loss": 0.5219, "step": 720000 }, { "epoch": 97.02236593909997, "grad_norm": 0.17778262495994568, "learning_rate": 0.0002860489835025001, "loss": 0.5212, "step": 720100 }, { "epoch": 97.03583939638911, "grad_norm": 0.18936806917190552, "learning_rate": 0.0002860115572322525, "loss": 0.5234, "step": 720200 }, { "epoch": 97.04931285367826, "grad_norm": 0.2157118171453476, "learning_rate": 0.00028597413096200484, "loss": 0.5218, "step": 720300 }, { "epoch": 97.0627863109674, "grad_norm": 0.1720123440027237, "learning_rate": 0.00028593670469175724, "loss": 0.5227, "step": 720400 }, { "epoch": 97.07625976825653, "grad_norm": 0.18479615449905396, "learning_rate": 0.00028589927842150964, "loss": 0.5229, "step": 720500 }, { "epoch": 97.08973322554567, "grad_norm": 0.1761920154094696, "learning_rate": 0.00028586185215126204, "loss": 0.5224, "step": 720600 }, { "epoch": 97.10320668283481, "grad_norm": 0.17101992666721344, "learning_rate": 0.00028582442588101443, "loss": 0.5216, "step": 720700 }, { "epoch": 97.11668014012396, "grad_norm": 0.18044210970401764, "learning_rate": 0.00028578699961076683, "loss": 0.5218, "step": 720800 }, { "epoch": 97.1301535974131, "grad_norm": 0.20594309270381927, "learning_rate": 0.00028574957334051923, "loss": 0.5226, "step": 720900 }, { "epoch": 97.14362705470224, "grad_norm": 0.18546882271766663, "learning_rate": 0.0002857121470702715, "loss": 0.5231, "step": 721000 }, { "epoch": 97.15710051199137, "grad_norm": 0.1866813451051712, "learning_rate": 0.0002856747208000239, "loss": 0.5226, "step": 721100 }, { "epoch": 97.17057396928051, "grad_norm": 0.1986604928970337, "learning_rate": 0.0002856372945297763, "loss": 0.5219, "step": 721200 }, { "epoch": 97.18404742656966, "grad_norm": 0.17669041454792023, "learning_rate": 0.0002855998682595287, "loss": 0.5216, "step": 721300 }, { "epoch": 97.1975208838588, "grad_norm": 0.1897980272769928, "learning_rate": 0.0002855624419892811, "loss": 0.522, "step": 721400 }, { "epoch": 97.21099434114794, "grad_norm": 0.22246187925338745, "learning_rate": 0.0002855250157190335, "loss": 0.5225, "step": 721500 }, { "epoch": 97.22446779843708, "grad_norm": 0.20526108145713806, "learning_rate": 0.0002854875894487859, "loss": 0.5224, "step": 721600 }, { "epoch": 97.23794125572621, "grad_norm": 0.2021414190530777, "learning_rate": 0.00028545016317853825, "loss": 0.5228, "step": 721700 }, { "epoch": 97.25141471301536, "grad_norm": 0.1957855373620987, "learning_rate": 0.00028541273690829065, "loss": 0.522, "step": 721800 }, { "epoch": 97.2648881703045, "grad_norm": 0.18524347245693207, "learning_rate": 0.00028537531063804305, "loss": 0.522, "step": 721900 }, { "epoch": 97.27836162759364, "grad_norm": 0.2111314833164215, "learning_rate": 0.00028533788436779545, "loss": 0.5217, "step": 722000 }, { "epoch": 97.29183508488278, "grad_norm": 0.19592498242855072, "learning_rate": 0.00028530045809754785, "loss": 0.5225, "step": 722100 }, { "epoch": 97.30530854217191, "grad_norm": 0.1997593641281128, "learning_rate": 0.00028526303182730025, "loss": 0.5225, "step": 722200 }, { "epoch": 97.31878199946107, "grad_norm": 0.18816131353378296, "learning_rate": 0.00028522560555705264, "loss": 0.5226, "step": 722300 }, { "epoch": 97.3322554567502, "grad_norm": 0.18178533017635345, "learning_rate": 0.00028518817928680504, "loss": 0.522, "step": 722400 }, { "epoch": 97.34572891403934, "grad_norm": 0.19400116801261902, "learning_rate": 0.0002851507530165574, "loss": 0.5218, "step": 722500 }, { "epoch": 97.35920237132848, "grad_norm": 0.18395766615867615, "learning_rate": 0.0002851133267463098, "loss": 0.5226, "step": 722600 }, { "epoch": 97.37267582861762, "grad_norm": 0.19589710235595703, "learning_rate": 0.0002850759004760622, "loss": 0.5222, "step": 722700 }, { "epoch": 97.38614928590677, "grad_norm": 0.1902022808790207, "learning_rate": 0.0002850384742058145, "loss": 0.523, "step": 722800 }, { "epoch": 97.3996227431959, "grad_norm": 0.20120863616466522, "learning_rate": 0.0002850010479355669, "loss": 0.5215, "step": 722900 }, { "epoch": 97.41309620048504, "grad_norm": 0.1766006201505661, "learning_rate": 0.0002849636216653193, "loss": 0.5229, "step": 723000 }, { "epoch": 97.42656965777418, "grad_norm": 0.2019764930009842, "learning_rate": 0.0002849261953950717, "loss": 0.5217, "step": 723100 }, { "epoch": 97.44004311506333, "grad_norm": 0.19278813898563385, "learning_rate": 0.00028488876912482406, "loss": 0.5219, "step": 723200 }, { "epoch": 97.45351657235247, "grad_norm": 0.19287006556987762, "learning_rate": 0.00028485134285457646, "loss": 0.5215, "step": 723300 }, { "epoch": 97.46699002964161, "grad_norm": 0.17765329778194427, "learning_rate": 0.00028481391658432886, "loss": 0.5221, "step": 723400 }, { "epoch": 97.48046348693074, "grad_norm": 0.18599018454551697, "learning_rate": 0.00028477649031408126, "loss": 0.523, "step": 723500 }, { "epoch": 97.49393694421988, "grad_norm": 0.18343950808048248, "learning_rate": 0.00028473906404383366, "loss": 0.5223, "step": 723600 }, { "epoch": 97.50741040150903, "grad_norm": 0.1814102977514267, "learning_rate": 0.00028470163777358606, "loss": 0.5215, "step": 723700 }, { "epoch": 97.52088385879817, "grad_norm": 0.19098545610904694, "learning_rate": 0.00028466421150333845, "loss": 0.5222, "step": 723800 }, { "epoch": 97.53435731608731, "grad_norm": 0.2132042795419693, "learning_rate": 0.0002846267852330908, "loss": 0.5226, "step": 723900 }, { "epoch": 97.54783077337645, "grad_norm": 0.18535059690475464, "learning_rate": 0.0002845893589628432, "loss": 0.5227, "step": 724000 }, { "epoch": 97.56130423066558, "grad_norm": 0.17808286845684052, "learning_rate": 0.0002845519326925956, "loss": 0.5234, "step": 724100 }, { "epoch": 97.57477768795474, "grad_norm": 0.18466299772262573, "learning_rate": 0.000284514506422348, "loss": 0.5225, "step": 724200 }, { "epoch": 97.58825114524387, "grad_norm": 0.17276261746883392, "learning_rate": 0.0002844770801521004, "loss": 0.5228, "step": 724300 }, { "epoch": 97.60172460253301, "grad_norm": 0.1928931325674057, "learning_rate": 0.0002844396538818528, "loss": 0.5217, "step": 724400 }, { "epoch": 97.61519805982215, "grad_norm": 0.19677582383155823, "learning_rate": 0.0002844022276116052, "loss": 0.523, "step": 724500 }, { "epoch": 97.62867151711129, "grad_norm": 0.17817717790603638, "learning_rate": 0.0002843648013413575, "loss": 0.5216, "step": 724600 }, { "epoch": 97.64214497440044, "grad_norm": 0.1870831698179245, "learning_rate": 0.0002843273750711099, "loss": 0.5213, "step": 724700 }, { "epoch": 97.65561843168958, "grad_norm": 0.17772425711154938, "learning_rate": 0.0002842899488008623, "loss": 0.5221, "step": 724800 }, { "epoch": 97.66909188897871, "grad_norm": 0.17944775521755219, "learning_rate": 0.00028425252253061467, "loss": 0.5235, "step": 724900 }, { "epoch": 97.68256534626785, "grad_norm": 0.1919795125722885, "learning_rate": 0.00028421509626036707, "loss": 0.5225, "step": 725000 }, { "epoch": 97.69603880355699, "grad_norm": 0.16962558031082153, "learning_rate": 0.00028417766999011947, "loss": 0.5218, "step": 725100 }, { "epoch": 97.70951226084614, "grad_norm": 0.21872000396251678, "learning_rate": 0.00028414024371987187, "loss": 0.5216, "step": 725200 }, { "epoch": 97.72298571813528, "grad_norm": 0.17227484285831451, "learning_rate": 0.00028410281744962427, "loss": 0.5218, "step": 725300 }, { "epoch": 97.73645917542441, "grad_norm": 0.1734735369682312, "learning_rate": 0.0002840653911793766, "loss": 0.5214, "step": 725400 }, { "epoch": 97.74993263271355, "grad_norm": 0.18946576118469238, "learning_rate": 0.000284027964909129, "loss": 0.523, "step": 725500 }, { "epoch": 97.76340609000269, "grad_norm": 0.18027029931545258, "learning_rate": 0.0002839905386388814, "loss": 0.5221, "step": 725600 }, { "epoch": 97.77687954729184, "grad_norm": 0.17338839173316956, "learning_rate": 0.0002839531123686338, "loss": 0.523, "step": 725700 }, { "epoch": 97.79035300458098, "grad_norm": 0.20168249309062958, "learning_rate": 0.0002839156860983862, "loss": 0.5226, "step": 725800 }, { "epoch": 97.80382646187012, "grad_norm": 0.17809076607227325, "learning_rate": 0.0002838782598281386, "loss": 0.5224, "step": 725900 }, { "epoch": 97.81729991915925, "grad_norm": 0.20484261214733124, "learning_rate": 0.000283840833557891, "loss": 0.5224, "step": 726000 }, { "epoch": 97.83077337644839, "grad_norm": 0.18850761651992798, "learning_rate": 0.00028380340728764334, "loss": 0.5212, "step": 726100 }, { "epoch": 97.84424683373754, "grad_norm": 0.17886915802955627, "learning_rate": 0.00028376598101739574, "loss": 0.5222, "step": 726200 }, { "epoch": 97.85772029102668, "grad_norm": 0.19486506283283234, "learning_rate": 0.00028372855474714814, "loss": 0.5221, "step": 726300 }, { "epoch": 97.87119374831582, "grad_norm": 0.1904607117176056, "learning_rate": 0.00028369112847690054, "loss": 0.522, "step": 726400 }, { "epoch": 97.88466720560496, "grad_norm": 0.1813652068376541, "learning_rate": 0.0002836537022066529, "loss": 0.5231, "step": 726500 }, { "epoch": 97.8981406628941, "grad_norm": 0.1900782585144043, "learning_rate": 0.0002836162759364053, "loss": 0.522, "step": 726600 }, { "epoch": 97.91161412018324, "grad_norm": 0.18526817858219147, "learning_rate": 0.0002835788496661577, "loss": 0.522, "step": 726700 }, { "epoch": 97.92508757747238, "grad_norm": 0.17991112172603607, "learning_rate": 0.00028354142339591, "loss": 0.5226, "step": 726800 }, { "epoch": 97.93856103476152, "grad_norm": 0.18820121884346008, "learning_rate": 0.0002835039971256624, "loss": 0.5223, "step": 726900 }, { "epoch": 97.95203449205066, "grad_norm": 0.198014497756958, "learning_rate": 0.0002834665708554148, "loss": 0.5211, "step": 727000 }, { "epoch": 97.9655079493398, "grad_norm": 0.19178232550621033, "learning_rate": 0.0002834291445851672, "loss": 0.5226, "step": 727100 }, { "epoch": 97.97898140662895, "grad_norm": 0.20615476369857788, "learning_rate": 0.0002833917183149196, "loss": 0.5227, "step": 727200 }, { "epoch": 97.99245486391808, "grad_norm": 0.1927938312292099, "learning_rate": 0.000283354292044672, "loss": 0.5221, "step": 727300 }, { "epoch": 98.0, "eval_loss": 0.5112837553024292, "eval_runtime": 4.9536, "eval_samples_per_second": 1009.361, "eval_steps_per_second": 15.948, "step": 727356 }, { "epoch": 98.00592832120722, "grad_norm": 0.19854143261909485, "learning_rate": 0.0002833168657744244, "loss": 0.522, "step": 727400 }, { "epoch": 98.01940177849636, "grad_norm": 0.17603962123394012, "learning_rate": 0.0002832794395041768, "loss": 0.5199, "step": 727500 }, { "epoch": 98.0328752357855, "grad_norm": 0.22543029487133026, "learning_rate": 0.00028324201323392915, "loss": 0.5218, "step": 727600 }, { "epoch": 98.04634869307465, "grad_norm": 0.18253183364868164, "learning_rate": 0.00028320458696368155, "loss": 0.5211, "step": 727700 }, { "epoch": 98.05982215036379, "grad_norm": 0.18220154941082, "learning_rate": 0.00028316716069343395, "loss": 0.5224, "step": 727800 }, { "epoch": 98.07329560765292, "grad_norm": 0.18702445924282074, "learning_rate": 0.00028312973442318635, "loss": 0.522, "step": 727900 }, { "epoch": 98.08676906494206, "grad_norm": 0.19644710421562195, "learning_rate": 0.00028309230815293875, "loss": 0.5239, "step": 728000 }, { "epoch": 98.1002425222312, "grad_norm": 0.18799911439418793, "learning_rate": 0.00028305488188269114, "loss": 0.5219, "step": 728100 }, { "epoch": 98.11371597952035, "grad_norm": 0.18223734200000763, "learning_rate": 0.00028301745561244354, "loss": 0.5223, "step": 728200 }, { "epoch": 98.12718943680949, "grad_norm": 0.17994552850723267, "learning_rate": 0.00028298002934219583, "loss": 0.5225, "step": 728300 }, { "epoch": 98.14066289409863, "grad_norm": 0.16709822416305542, "learning_rate": 0.00028294260307194823, "loss": 0.5214, "step": 728400 }, { "epoch": 98.15413635138776, "grad_norm": 0.17766298353672028, "learning_rate": 0.00028290517680170063, "loss": 0.522, "step": 728500 }, { "epoch": 98.1676098086769, "grad_norm": 0.19416730105876923, "learning_rate": 0.00028286775053145303, "loss": 0.522, "step": 728600 }, { "epoch": 98.18108326596605, "grad_norm": 0.1887141466140747, "learning_rate": 0.0002828303242612054, "loss": 0.523, "step": 728700 }, { "epoch": 98.19455672325519, "grad_norm": 0.19654808938503265, "learning_rate": 0.0002827928979909578, "loss": 0.5229, "step": 728800 }, { "epoch": 98.20803018054433, "grad_norm": 0.18101197481155396, "learning_rate": 0.0002827554717207102, "loss": 0.5216, "step": 728900 }, { "epoch": 98.22150363783346, "grad_norm": 0.19579145312309265, "learning_rate": 0.00028271804545046257, "loss": 0.523, "step": 729000 }, { "epoch": 98.2349770951226, "grad_norm": 0.1735994666814804, "learning_rate": 0.00028268061918021496, "loss": 0.5216, "step": 729100 }, { "epoch": 98.24845055241175, "grad_norm": 0.21671918034553528, "learning_rate": 0.00028264319290996736, "loss": 0.5219, "step": 729200 }, { "epoch": 98.26192400970089, "grad_norm": 0.18034544587135315, "learning_rate": 0.00028260576663971976, "loss": 0.5221, "step": 729300 }, { "epoch": 98.27539746699003, "grad_norm": 0.1811492145061493, "learning_rate": 0.00028256834036947216, "loss": 0.5222, "step": 729400 }, { "epoch": 98.28887092427917, "grad_norm": 0.18334196507930756, "learning_rate": 0.00028253091409922456, "loss": 0.5222, "step": 729500 }, { "epoch": 98.3023443815683, "grad_norm": 0.18905922770500183, "learning_rate": 0.00028249348782897696, "loss": 0.5217, "step": 729600 }, { "epoch": 98.31581783885746, "grad_norm": 0.18788175284862518, "learning_rate": 0.00028245606155872935, "loss": 0.523, "step": 729700 }, { "epoch": 98.3292912961466, "grad_norm": 0.1963902860879898, "learning_rate": 0.0002824186352884817, "loss": 0.5216, "step": 729800 }, { "epoch": 98.34276475343573, "grad_norm": 0.18780285120010376, "learning_rate": 0.0002823812090182341, "loss": 0.5207, "step": 729900 }, { "epoch": 98.35623821072487, "grad_norm": 0.2175162136554718, "learning_rate": 0.0002823437827479865, "loss": 0.5211, "step": 730000 }, { "epoch": 98.369711668014, "grad_norm": 0.19151927530765533, "learning_rate": 0.00028230635647773884, "loss": 0.5226, "step": 730100 }, { "epoch": 98.38318512530316, "grad_norm": 0.18309904634952545, "learning_rate": 0.00028226893020749124, "loss": 0.5224, "step": 730200 }, { "epoch": 98.3966585825923, "grad_norm": 0.17577996850013733, "learning_rate": 0.00028223150393724363, "loss": 0.5219, "step": 730300 }, { "epoch": 98.41013203988143, "grad_norm": 0.18067002296447754, "learning_rate": 0.00028219407766699603, "loss": 0.5228, "step": 730400 }, { "epoch": 98.42360549717057, "grad_norm": 0.17995859682559967, "learning_rate": 0.0002821566513967484, "loss": 0.5214, "step": 730500 }, { "epoch": 98.43707895445971, "grad_norm": 0.2066086083650589, "learning_rate": 0.0002821192251265008, "loss": 0.5217, "step": 730600 }, { "epoch": 98.45055241174886, "grad_norm": 0.21620208024978638, "learning_rate": 0.0002820817988562532, "loss": 0.5222, "step": 730700 }, { "epoch": 98.464025869038, "grad_norm": 0.17805571854114532, "learning_rate": 0.00028204437258600557, "loss": 0.5226, "step": 730800 }, { "epoch": 98.47749932632713, "grad_norm": 0.18712948262691498, "learning_rate": 0.00028200694631575797, "loss": 0.522, "step": 730900 }, { "epoch": 98.49097278361627, "grad_norm": 0.18300162255764008, "learning_rate": 0.00028196952004551037, "loss": 0.5215, "step": 731000 }, { "epoch": 98.50444624090542, "grad_norm": 0.1824488490819931, "learning_rate": 0.00028193209377526277, "loss": 0.522, "step": 731100 }, { "epoch": 98.51791969819456, "grad_norm": 0.19408783316612244, "learning_rate": 0.0002818946675050151, "loss": 0.5223, "step": 731200 }, { "epoch": 98.5313931554837, "grad_norm": 0.19937776029109955, "learning_rate": 0.0002818572412347675, "loss": 0.5226, "step": 731300 }, { "epoch": 98.54486661277284, "grad_norm": 0.19288478791713715, "learning_rate": 0.0002818198149645199, "loss": 0.5222, "step": 731400 }, { "epoch": 98.55834007006197, "grad_norm": 0.21338917315006256, "learning_rate": 0.0002817823886942723, "loss": 0.5216, "step": 731500 }, { "epoch": 98.57181352735113, "grad_norm": 0.18976151943206787, "learning_rate": 0.0002817449624240247, "loss": 0.5215, "step": 731600 }, { "epoch": 98.58528698464026, "grad_norm": 0.25139570236206055, "learning_rate": 0.0002817075361537771, "loss": 0.522, "step": 731700 }, { "epoch": 98.5987604419294, "grad_norm": 0.18516035377979279, "learning_rate": 0.0002816701098835295, "loss": 0.5218, "step": 731800 }, { "epoch": 98.61223389921854, "grad_norm": 0.1785147339105606, "learning_rate": 0.0002816326836132818, "loss": 0.5223, "step": 731900 }, { "epoch": 98.62570735650768, "grad_norm": 0.2071269154548645, "learning_rate": 0.0002815952573430342, "loss": 0.5233, "step": 732000 }, { "epoch": 98.63918081379683, "grad_norm": 0.18347430229187012, "learning_rate": 0.0002815578310727866, "loss": 0.5212, "step": 732100 }, { "epoch": 98.65265427108596, "grad_norm": 0.1893170177936554, "learning_rate": 0.000281520404802539, "loss": 0.5238, "step": 732200 }, { "epoch": 98.6661277283751, "grad_norm": 0.1766822785139084, "learning_rate": 0.0002814829785322914, "loss": 0.5215, "step": 732300 }, { "epoch": 98.67960118566424, "grad_norm": 0.18416531383991241, "learning_rate": 0.0002814455522620438, "loss": 0.521, "step": 732400 }, { "epoch": 98.69307464295338, "grad_norm": 0.2165343463420868, "learning_rate": 0.0002814081259917962, "loss": 0.5225, "step": 732500 }, { "epoch": 98.70654810024253, "grad_norm": 0.18406003713607788, "learning_rate": 0.0002813706997215486, "loss": 0.5212, "step": 732600 }, { "epoch": 98.72002155753167, "grad_norm": 0.1846037358045578, "learning_rate": 0.0002813332734513009, "loss": 0.5229, "step": 732700 }, { "epoch": 98.7334950148208, "grad_norm": 0.21802613139152527, "learning_rate": 0.0002812958471810533, "loss": 0.5219, "step": 732800 }, { "epoch": 98.74696847210994, "grad_norm": 0.1766752302646637, "learning_rate": 0.0002812584209108057, "loss": 0.5216, "step": 732900 }, { "epoch": 98.76044192939908, "grad_norm": 0.18542778491973877, "learning_rate": 0.0002812209946405581, "loss": 0.5232, "step": 733000 }, { "epoch": 98.77391538668823, "grad_norm": 0.18905800580978394, "learning_rate": 0.0002811835683703105, "loss": 0.522, "step": 733100 }, { "epoch": 98.78738884397737, "grad_norm": 0.1926591992378235, "learning_rate": 0.0002811461421000629, "loss": 0.5222, "step": 733200 }, { "epoch": 98.8008623012665, "grad_norm": 0.1843578964471817, "learning_rate": 0.0002811087158298153, "loss": 0.522, "step": 733300 }, { "epoch": 98.81433575855564, "grad_norm": 0.17043329775333405, "learning_rate": 0.00028107128955956765, "loss": 0.5223, "step": 733400 }, { "epoch": 98.82780921584478, "grad_norm": 0.17996962368488312, "learning_rate": 0.00028103386328932005, "loss": 0.5218, "step": 733500 }, { "epoch": 98.84128267313393, "grad_norm": 0.18533097207546234, "learning_rate": 0.00028099643701907245, "loss": 0.5207, "step": 733600 }, { "epoch": 98.85475613042307, "grad_norm": 0.21970729529857635, "learning_rate": 0.0002809590107488248, "loss": 0.5215, "step": 733700 }, { "epoch": 98.86822958771221, "grad_norm": 0.17488951981067657, "learning_rate": 0.0002809215844785772, "loss": 0.5218, "step": 733800 }, { "epoch": 98.88170304500134, "grad_norm": 0.208049476146698, "learning_rate": 0.0002808841582083296, "loss": 0.5217, "step": 733900 }, { "epoch": 98.89517650229048, "grad_norm": 0.18810656666755676, "learning_rate": 0.000280846731938082, "loss": 0.5232, "step": 734000 }, { "epoch": 98.90864995957963, "grad_norm": 0.1920100301504135, "learning_rate": 0.00028080930566783433, "loss": 0.5232, "step": 734100 }, { "epoch": 98.92212341686877, "grad_norm": 0.18189987540245056, "learning_rate": 0.00028077187939758673, "loss": 0.5227, "step": 734200 }, { "epoch": 98.93559687415791, "grad_norm": 0.18371599912643433, "learning_rate": 0.00028073445312733913, "loss": 0.5221, "step": 734300 }, { "epoch": 98.94907033144705, "grad_norm": 0.21399609744548798, "learning_rate": 0.00028069702685709153, "loss": 0.5225, "step": 734400 }, { "epoch": 98.96254378873618, "grad_norm": 0.18611010909080505, "learning_rate": 0.0002806596005868439, "loss": 0.523, "step": 734500 }, { "epoch": 98.97601724602534, "grad_norm": 0.18398486077785492, "learning_rate": 0.0002806221743165963, "loss": 0.5217, "step": 734600 }, { "epoch": 98.98949070331447, "grad_norm": 0.17936468124389648, "learning_rate": 0.0002805847480463487, "loss": 0.5212, "step": 734700 }, { "epoch": 99.0, "eval_loss": 0.5104764699935913, "eval_runtime": 4.9531, "eval_samples_per_second": 1009.478, "eval_steps_per_second": 15.95, "step": 734778 }, { "epoch": 99.00296416060361, "grad_norm": 0.19077911972999573, "learning_rate": 0.00028054732177610107, "loss": 0.5223, "step": 734800 }, { "epoch": 99.01643761789275, "grad_norm": 0.1805621087551117, "learning_rate": 0.00028050989550585347, "loss": 0.5216, "step": 734900 }, { "epoch": 99.02991107518189, "grad_norm": 0.18272632360458374, "learning_rate": 0.00028047246923560586, "loss": 0.5205, "step": 735000 }, { "epoch": 99.04338453247104, "grad_norm": 0.1939617246389389, "learning_rate": 0.00028043504296535826, "loss": 0.5207, "step": 735100 }, { "epoch": 99.05685798976018, "grad_norm": 0.17964553833007812, "learning_rate": 0.00028039761669511066, "loss": 0.5214, "step": 735200 }, { "epoch": 99.07033144704931, "grad_norm": 0.17790572345256805, "learning_rate": 0.00028036019042486306, "loss": 0.5203, "step": 735300 }, { "epoch": 99.08380490433845, "grad_norm": 0.2142571061849594, "learning_rate": 0.00028032276415461546, "loss": 0.5209, "step": 735400 }, { "epoch": 99.09727836162759, "grad_norm": 0.2240862399339676, "learning_rate": 0.0002802853378843678, "loss": 0.523, "step": 735500 }, { "epoch": 99.11075181891674, "grad_norm": 0.18619880080223083, "learning_rate": 0.00028024791161412014, "loss": 0.5217, "step": 735600 }, { "epoch": 99.12422527620588, "grad_norm": 0.20634953677654266, "learning_rate": 0.00028021048534387254, "loss": 0.5218, "step": 735700 }, { "epoch": 99.13769873349501, "grad_norm": 0.17835476994514465, "learning_rate": 0.00028017305907362494, "loss": 0.5217, "step": 735800 }, { "epoch": 99.15117219078415, "grad_norm": 0.18917948007583618, "learning_rate": 0.00028013563280337734, "loss": 0.5214, "step": 735900 }, { "epoch": 99.16464564807329, "grad_norm": 0.17712543904781342, "learning_rate": 0.00028009820653312974, "loss": 0.5211, "step": 736000 }, { "epoch": 99.17811910536244, "grad_norm": 0.18115469813346863, "learning_rate": 0.00028006078026288214, "loss": 0.5221, "step": 736100 }, { "epoch": 99.19159256265158, "grad_norm": 0.19132637977600098, "learning_rate": 0.00028002335399263453, "loss": 0.5216, "step": 736200 }, { "epoch": 99.20506601994072, "grad_norm": 0.22615322470664978, "learning_rate": 0.0002799859277223869, "loss": 0.522, "step": 736300 }, { "epoch": 99.21853947722985, "grad_norm": 0.16756492853164673, "learning_rate": 0.0002799485014521393, "loss": 0.522, "step": 736400 }, { "epoch": 99.23201293451899, "grad_norm": 0.19684313237667084, "learning_rate": 0.0002799110751818917, "loss": 0.5217, "step": 736500 }, { "epoch": 99.24548639180814, "grad_norm": 0.22684475779533386, "learning_rate": 0.00027987364891164407, "loss": 0.5218, "step": 736600 }, { "epoch": 99.25895984909728, "grad_norm": 0.17624573409557343, "learning_rate": 0.00027983622264139647, "loss": 0.5216, "step": 736700 }, { "epoch": 99.27243330638642, "grad_norm": 0.19082270562648773, "learning_rate": 0.00027979879637114887, "loss": 0.5229, "step": 736800 }, { "epoch": 99.28590676367556, "grad_norm": 0.19591474533081055, "learning_rate": 0.00027976137010090127, "loss": 0.5229, "step": 736900 }, { "epoch": 99.2993802209647, "grad_norm": 0.18678347766399384, "learning_rate": 0.0002797239438306536, "loss": 0.5209, "step": 737000 }, { "epoch": 99.31285367825384, "grad_norm": 0.1911178082227707, "learning_rate": 0.000279686517560406, "loss": 0.5215, "step": 737100 }, { "epoch": 99.32632713554298, "grad_norm": 0.19681480526924133, "learning_rate": 0.0002796490912901584, "loss": 0.5214, "step": 737200 }, { "epoch": 99.33980059283212, "grad_norm": 0.18990914523601532, "learning_rate": 0.00027961166501991075, "loss": 0.5217, "step": 737300 }, { "epoch": 99.35327405012126, "grad_norm": 0.1862216591835022, "learning_rate": 0.00027957423874966315, "loss": 0.5225, "step": 737400 }, { "epoch": 99.3667475074104, "grad_norm": 0.16911421716213226, "learning_rate": 0.00027953681247941555, "loss": 0.5209, "step": 737500 }, { "epoch": 99.38022096469955, "grad_norm": 0.176059752702713, "learning_rate": 0.00027949938620916795, "loss": 0.5213, "step": 737600 }, { "epoch": 99.39369442198868, "grad_norm": 0.18155108392238617, "learning_rate": 0.00027946195993892034, "loss": 0.522, "step": 737700 }, { "epoch": 99.40716787927782, "grad_norm": 0.1821502149105072, "learning_rate": 0.0002794245336686727, "loss": 0.5227, "step": 737800 }, { "epoch": 99.42064133656696, "grad_norm": 0.2100301831960678, "learning_rate": 0.0002793871073984251, "loss": 0.5224, "step": 737900 }, { "epoch": 99.4341147938561, "grad_norm": 0.1730816811323166, "learning_rate": 0.0002793496811281775, "loss": 0.5225, "step": 738000 }, { "epoch": 99.44758825114525, "grad_norm": 0.18138352036476135, "learning_rate": 0.0002793122548579299, "loss": 0.5222, "step": 738100 }, { "epoch": 99.46106170843439, "grad_norm": 0.2517598867416382, "learning_rate": 0.0002792748285876823, "loss": 0.5219, "step": 738200 }, { "epoch": 99.47453516572352, "grad_norm": 0.1870316118001938, "learning_rate": 0.0002792374023174347, "loss": 0.5226, "step": 738300 }, { "epoch": 99.48800862301266, "grad_norm": 0.18621349334716797, "learning_rate": 0.0002791999760471871, "loss": 0.5229, "step": 738400 }, { "epoch": 99.5014820803018, "grad_norm": 0.19429346919059753, "learning_rate": 0.0002791625497769394, "loss": 0.5219, "step": 738500 }, { "epoch": 99.51495553759095, "grad_norm": 0.18381521105766296, "learning_rate": 0.0002791251235066918, "loss": 0.5218, "step": 738600 }, { "epoch": 99.52842899488009, "grad_norm": 0.1975812017917633, "learning_rate": 0.0002790876972364442, "loss": 0.5213, "step": 738700 }, { "epoch": 99.54190245216923, "grad_norm": 0.18302606046199799, "learning_rate": 0.0002790502709661966, "loss": 0.5212, "step": 738800 }, { "epoch": 99.55537590945836, "grad_norm": 0.20521660149097443, "learning_rate": 0.000279012844695949, "loss": 0.5225, "step": 738900 }, { "epoch": 99.56884936674751, "grad_norm": 0.17946404218673706, "learning_rate": 0.0002789754184257014, "loss": 0.5222, "step": 739000 }, { "epoch": 99.58232282403665, "grad_norm": 0.17457570135593414, "learning_rate": 0.00027893799215545376, "loss": 0.5225, "step": 739100 }, { "epoch": 99.59579628132579, "grad_norm": 0.1853523999452591, "learning_rate": 0.0002789005658852061, "loss": 0.5234, "step": 739200 }, { "epoch": 99.60926973861493, "grad_norm": 0.18789042532444, "learning_rate": 0.0002788631396149585, "loss": 0.5223, "step": 739300 }, { "epoch": 99.62274319590406, "grad_norm": 0.1964259296655655, "learning_rate": 0.0002788257133447109, "loss": 0.5215, "step": 739400 }, { "epoch": 99.63621665319322, "grad_norm": 0.17950226366519928, "learning_rate": 0.0002787882870744633, "loss": 0.5214, "step": 739500 }, { "epoch": 99.64969011048235, "grad_norm": 0.18100401759147644, "learning_rate": 0.0002787508608042157, "loss": 0.5214, "step": 739600 }, { "epoch": 99.66316356777149, "grad_norm": 0.18264645338058472, "learning_rate": 0.0002787134345339681, "loss": 0.5221, "step": 739700 }, { "epoch": 99.67663702506063, "grad_norm": 0.1909472495317459, "learning_rate": 0.0002786760082637205, "loss": 0.5223, "step": 739800 }, { "epoch": 99.69011048234977, "grad_norm": 0.18415270745754242, "learning_rate": 0.00027863858199347284, "loss": 0.5217, "step": 739900 }, { "epoch": 99.70358393963892, "grad_norm": 0.20566795766353607, "learning_rate": 0.00027860115572322523, "loss": 0.522, "step": 740000 }, { "epoch": 99.71705739692806, "grad_norm": 0.20001435279846191, "learning_rate": 0.00027856372945297763, "loss": 0.5226, "step": 740100 }, { "epoch": 99.73053085421719, "grad_norm": 0.17106536030769348, "learning_rate": 0.00027852630318273003, "loss": 0.5221, "step": 740200 }, { "epoch": 99.74400431150633, "grad_norm": 0.19322806596755981, "learning_rate": 0.00027848887691248243, "loss": 0.5216, "step": 740300 }, { "epoch": 99.75747776879547, "grad_norm": 0.19426825642585754, "learning_rate": 0.0002784514506422348, "loss": 0.5218, "step": 740400 }, { "epoch": 99.77095122608462, "grad_norm": 0.19306285679340363, "learning_rate": 0.0002784140243719872, "loss": 0.5216, "step": 740500 }, { "epoch": 99.78442468337376, "grad_norm": 0.17826297879219055, "learning_rate": 0.0002783765981017396, "loss": 0.5218, "step": 740600 }, { "epoch": 99.7978981406629, "grad_norm": 0.19125112891197205, "learning_rate": 0.00027833917183149197, "loss": 0.5221, "step": 740700 }, { "epoch": 99.81137159795203, "grad_norm": 0.2020578384399414, "learning_rate": 0.00027830174556124436, "loss": 0.5223, "step": 740800 }, { "epoch": 99.82484505524117, "grad_norm": 0.1977124661207199, "learning_rate": 0.0002782643192909967, "loss": 0.5214, "step": 740900 }, { "epoch": 99.83831851253032, "grad_norm": 0.192857027053833, "learning_rate": 0.0002782268930207491, "loss": 0.5225, "step": 741000 }, { "epoch": 99.85179196981946, "grad_norm": 0.1880609393119812, "learning_rate": 0.0002781894667505015, "loss": 0.5218, "step": 741100 }, { "epoch": 99.8652654271086, "grad_norm": 0.1969519406557083, "learning_rate": 0.0002781520404802539, "loss": 0.522, "step": 741200 }, { "epoch": 99.87873888439773, "grad_norm": 0.23663905262947083, "learning_rate": 0.0002781146142100063, "loss": 0.5214, "step": 741300 }, { "epoch": 99.89221234168687, "grad_norm": 0.18858322501182556, "learning_rate": 0.00027807718793975865, "loss": 0.5223, "step": 741400 }, { "epoch": 99.90568579897602, "grad_norm": 0.19380253553390503, "learning_rate": 0.00027803976166951104, "loss": 0.5223, "step": 741500 }, { "epoch": 99.91915925626516, "grad_norm": 0.1837623566389084, "learning_rate": 0.00027800233539926344, "loss": 0.5228, "step": 741600 }, { "epoch": 99.9326327135543, "grad_norm": 0.17632964253425598, "learning_rate": 0.00027796490912901584, "loss": 0.5212, "step": 741700 }, { "epoch": 99.94610617084344, "grad_norm": 0.18754911422729492, "learning_rate": 0.00027792748285876824, "loss": 0.5215, "step": 741800 }, { "epoch": 99.95957962813257, "grad_norm": 0.21998168528079987, "learning_rate": 0.00027789005658852064, "loss": 0.5219, "step": 741900 }, { "epoch": 99.97305308542172, "grad_norm": 0.17607009410858154, "learning_rate": 0.00027785263031827304, "loss": 0.5211, "step": 742000 }, { "epoch": 99.98652654271086, "grad_norm": 0.19744046032428741, "learning_rate": 0.0002778152040480254, "loss": 0.5213, "step": 742100 }, { "epoch": 100.0, "grad_norm": 0.1887170970439911, "learning_rate": 0.0002777777777777778, "loss": 0.5224, "step": 742200 }, { "epoch": 100.0, "eval_loss": 0.51082444190979, "eval_runtime": 4.9739, "eval_samples_per_second": 1005.239, "eval_steps_per_second": 15.883, "step": 742200 }, { "epoch": 100.01347345728914, "grad_norm": 0.17490755021572113, "learning_rate": 0.0002777403515075302, "loss": 0.5203, "step": 742300 }, { "epoch": 100.02694691457828, "grad_norm": 0.21721023321151733, "learning_rate": 0.0002777029252372826, "loss": 0.5209, "step": 742400 }, { "epoch": 100.04042037186743, "grad_norm": 0.18368734419345856, "learning_rate": 0.00027766549896703497, "loss": 0.5209, "step": 742500 }, { "epoch": 100.05389382915656, "grad_norm": 0.1917073279619217, "learning_rate": 0.00027762807269678737, "loss": 0.5215, "step": 742600 }, { "epoch": 100.0673672864457, "grad_norm": 0.18302714824676514, "learning_rate": 0.00027759064642653977, "loss": 0.5215, "step": 742700 }, { "epoch": 100.08084074373484, "grad_norm": 0.18152979016304016, "learning_rate": 0.00027755322015629206, "loss": 0.5213, "step": 742800 }, { "epoch": 100.09431420102398, "grad_norm": 0.17557555437088013, "learning_rate": 0.00027751579388604446, "loss": 0.5219, "step": 742900 }, { "epoch": 100.10778765831313, "grad_norm": 0.19469085335731506, "learning_rate": 0.00027747836761579685, "loss": 0.5209, "step": 743000 }, { "epoch": 100.12126111560227, "grad_norm": 0.18854689598083496, "learning_rate": 0.00027744094134554925, "loss": 0.5215, "step": 743100 }, { "epoch": 100.1347345728914, "grad_norm": 0.1840236783027649, "learning_rate": 0.00027740351507530165, "loss": 0.5216, "step": 743200 }, { "epoch": 100.14820803018054, "grad_norm": 0.17448286712169647, "learning_rate": 0.00027736608880505405, "loss": 0.5225, "step": 743300 }, { "epoch": 100.16168148746968, "grad_norm": 0.1872272938489914, "learning_rate": 0.00027732866253480645, "loss": 0.5223, "step": 743400 }, { "epoch": 100.17515494475883, "grad_norm": 0.21519918739795685, "learning_rate": 0.00027729123626455885, "loss": 0.5214, "step": 743500 }, { "epoch": 100.18862840204797, "grad_norm": 0.182611882686615, "learning_rate": 0.0002772538099943112, "loss": 0.5223, "step": 743600 }, { "epoch": 100.2021018593371, "grad_norm": 0.19064022600650787, "learning_rate": 0.0002772163837240636, "loss": 0.5215, "step": 743700 }, { "epoch": 100.21557531662624, "grad_norm": 0.18026818335056305, "learning_rate": 0.000277178957453816, "loss": 0.5221, "step": 743800 }, { "epoch": 100.22904877391538, "grad_norm": 0.18432694673538208, "learning_rate": 0.0002771415311835684, "loss": 0.5227, "step": 743900 }, { "epoch": 100.24252223120453, "grad_norm": 0.20028568804264069, "learning_rate": 0.0002771041049133208, "loss": 0.5217, "step": 744000 }, { "epoch": 100.25599568849367, "grad_norm": 0.21274645626544952, "learning_rate": 0.0002770666786430732, "loss": 0.5213, "step": 744100 }, { "epoch": 100.26946914578281, "grad_norm": 0.18611915409564972, "learning_rate": 0.0002770292523728256, "loss": 0.5223, "step": 744200 }, { "epoch": 100.28294260307194, "grad_norm": 0.18764811754226685, "learning_rate": 0.0002769918261025779, "loss": 0.5216, "step": 744300 }, { "epoch": 100.29641606036108, "grad_norm": 0.18504805862903595, "learning_rate": 0.0002769543998323303, "loss": 0.521, "step": 744400 }, { "epoch": 100.30988951765023, "grad_norm": 0.19866898655891418, "learning_rate": 0.0002769169735620827, "loss": 0.521, "step": 744500 }, { "epoch": 100.32336297493937, "grad_norm": 0.179465651512146, "learning_rate": 0.00027687954729183506, "loss": 0.5209, "step": 744600 }, { "epoch": 100.33683643222851, "grad_norm": 0.1897486299276352, "learning_rate": 0.00027684212102158746, "loss": 0.5207, "step": 744700 }, { "epoch": 100.35030988951765, "grad_norm": 0.19241151213645935, "learning_rate": 0.00027680469475133986, "loss": 0.5222, "step": 744800 }, { "epoch": 100.36378334680678, "grad_norm": 0.181998148560524, "learning_rate": 0.00027676726848109226, "loss": 0.5213, "step": 744900 }, { "epoch": 100.37725680409594, "grad_norm": 0.2181134968996048, "learning_rate": 0.0002767298422108446, "loss": 0.522, "step": 745000 }, { "epoch": 100.39073026138507, "grad_norm": 0.17835795879364014, "learning_rate": 0.000276692415940597, "loss": 0.5221, "step": 745100 }, { "epoch": 100.40420371867421, "grad_norm": 0.18228372931480408, "learning_rate": 0.0002766549896703494, "loss": 0.5214, "step": 745200 }, { "epoch": 100.41767717596335, "grad_norm": 0.18856355547904968, "learning_rate": 0.0002766175634001018, "loss": 0.5206, "step": 745300 }, { "epoch": 100.43115063325249, "grad_norm": 0.17728352546691895, "learning_rate": 0.0002765801371298542, "loss": 0.5219, "step": 745400 }, { "epoch": 100.44462409054164, "grad_norm": 0.19858944416046143, "learning_rate": 0.0002765427108596066, "loss": 0.521, "step": 745500 }, { "epoch": 100.45809754783077, "grad_norm": 0.17702274024486542, "learning_rate": 0.000276505284589359, "loss": 0.5221, "step": 745600 }, { "epoch": 100.47157100511991, "grad_norm": 0.18873678147792816, "learning_rate": 0.0002764678583191114, "loss": 0.5217, "step": 745700 }, { "epoch": 100.48504446240905, "grad_norm": 0.19320300221443176, "learning_rate": 0.00027643043204886373, "loss": 0.5217, "step": 745800 }, { "epoch": 100.49851791969819, "grad_norm": 0.20071281492710114, "learning_rate": 0.00027639300577861613, "loss": 0.5223, "step": 745900 }, { "epoch": 100.51199137698734, "grad_norm": 0.18405789136886597, "learning_rate": 0.00027635557950836853, "loss": 0.5224, "step": 746000 }, { "epoch": 100.52546483427648, "grad_norm": 0.20928654074668884, "learning_rate": 0.00027631815323812093, "loss": 0.5211, "step": 746100 }, { "epoch": 100.53893829156561, "grad_norm": 0.20183569192886353, "learning_rate": 0.00027628072696787333, "loss": 0.5211, "step": 746200 }, { "epoch": 100.55241174885475, "grad_norm": 0.20459336042404175, "learning_rate": 0.0002762433006976257, "loss": 0.522, "step": 746300 }, { "epoch": 100.5658852061439, "grad_norm": 0.1848243772983551, "learning_rate": 0.00027620587442737807, "loss": 0.5217, "step": 746400 }, { "epoch": 100.57935866343304, "grad_norm": 0.1929187774658203, "learning_rate": 0.0002761684481571304, "loss": 0.5218, "step": 746500 }, { "epoch": 100.59283212072218, "grad_norm": 0.18809592723846436, "learning_rate": 0.0002761310218868828, "loss": 0.5223, "step": 746600 }, { "epoch": 100.60630557801132, "grad_norm": 0.18913257122039795, "learning_rate": 0.0002760935956166352, "loss": 0.5215, "step": 746700 }, { "epoch": 100.61977903530045, "grad_norm": 0.17928314208984375, "learning_rate": 0.0002760561693463876, "loss": 0.522, "step": 746800 }, { "epoch": 100.6332524925896, "grad_norm": 0.1864246279001236, "learning_rate": 0.00027601874307614, "loss": 0.5223, "step": 746900 }, { "epoch": 100.64672594987874, "grad_norm": 0.1862383633852005, "learning_rate": 0.0002759813168058924, "loss": 0.5218, "step": 747000 }, { "epoch": 100.66019940716788, "grad_norm": 0.17971371114253998, "learning_rate": 0.0002759438905356448, "loss": 0.5217, "step": 747100 }, { "epoch": 100.67367286445702, "grad_norm": 0.1970626711845398, "learning_rate": 0.00027590646426539715, "loss": 0.5227, "step": 747200 }, { "epoch": 100.68714632174616, "grad_norm": 0.18194662034511566, "learning_rate": 0.00027586903799514955, "loss": 0.522, "step": 747300 }, { "epoch": 100.7006197790353, "grad_norm": 0.20786727964878082, "learning_rate": 0.00027583161172490194, "loss": 0.5218, "step": 747400 }, { "epoch": 100.71409323632444, "grad_norm": 0.18593810498714447, "learning_rate": 0.00027579418545465434, "loss": 0.5215, "step": 747500 }, { "epoch": 100.72756669361358, "grad_norm": 0.19041290879249573, "learning_rate": 0.00027575675918440674, "loss": 0.522, "step": 747600 }, { "epoch": 100.74104015090272, "grad_norm": 0.18208007514476776, "learning_rate": 0.00027571933291415914, "loss": 0.5223, "step": 747700 }, { "epoch": 100.75451360819186, "grad_norm": 0.1999935358762741, "learning_rate": 0.00027568190664391154, "loss": 0.5214, "step": 747800 }, { "epoch": 100.76798706548101, "grad_norm": 0.18635965883731842, "learning_rate": 0.00027564448037366393, "loss": 0.5217, "step": 747900 }, { "epoch": 100.78146052277015, "grad_norm": 0.18200528621673584, "learning_rate": 0.0002756070541034163, "loss": 0.5212, "step": 748000 }, { "epoch": 100.79493398005928, "grad_norm": 0.20580336451530457, "learning_rate": 0.0002755696278331687, "loss": 0.5217, "step": 748100 }, { "epoch": 100.80840743734842, "grad_norm": 0.18214361369609833, "learning_rate": 0.000275532201562921, "loss": 0.5223, "step": 748200 }, { "epoch": 100.82188089463756, "grad_norm": 0.1821276992559433, "learning_rate": 0.0002754947752926734, "loss": 0.5228, "step": 748300 }, { "epoch": 100.83535435192671, "grad_norm": 0.20312044024467468, "learning_rate": 0.0002754573490224258, "loss": 0.5214, "step": 748400 }, { "epoch": 100.84882780921585, "grad_norm": 0.181726336479187, "learning_rate": 0.0002754199227521782, "loss": 0.5216, "step": 748500 }, { "epoch": 100.86230126650499, "grad_norm": 0.20414972305297852, "learning_rate": 0.0002753824964819306, "loss": 0.5222, "step": 748600 }, { "epoch": 100.87577472379412, "grad_norm": 0.19633565843105316, "learning_rate": 0.00027534507021168296, "loss": 0.522, "step": 748700 }, { "epoch": 100.88924818108326, "grad_norm": 0.1912154257297516, "learning_rate": 0.00027530764394143536, "loss": 0.523, "step": 748800 }, { "epoch": 100.90272163837241, "grad_norm": 0.23211383819580078, "learning_rate": 0.00027527021767118775, "loss": 0.5226, "step": 748900 }, { "epoch": 100.91619509566155, "grad_norm": 0.19414514303207397, "learning_rate": 0.00027523279140094015, "loss": 0.5212, "step": 749000 }, { "epoch": 100.92966855295069, "grad_norm": 0.18434062600135803, "learning_rate": 0.00027519536513069255, "loss": 0.5218, "step": 749100 }, { "epoch": 100.94314201023982, "grad_norm": 0.19020523130893707, "learning_rate": 0.00027515793886044495, "loss": 0.5217, "step": 749200 }, { "epoch": 100.95661546752896, "grad_norm": 0.18483731150627136, "learning_rate": 0.00027512051259019735, "loss": 0.5213, "step": 749300 }, { "epoch": 100.97008892481811, "grad_norm": 0.18839488923549652, "learning_rate": 0.0002750830863199497, "loss": 0.5219, "step": 749400 }, { "epoch": 100.98356238210725, "grad_norm": 0.18681183457374573, "learning_rate": 0.0002750456600497021, "loss": 0.5223, "step": 749500 }, { "epoch": 100.99703583939639, "grad_norm": 0.17703577876091003, "learning_rate": 0.0002750082337794545, "loss": 0.5223, "step": 749600 }, { "epoch": 101.0, "eval_loss": 0.5109626054763794, "eval_runtime": 4.9439, "eval_samples_per_second": 1011.342, "eval_steps_per_second": 15.979, "step": 749622 }, { "epoch": 101.01050929668553, "grad_norm": 0.18624143302440643, "learning_rate": 0.0002749708075092069, "loss": 0.5205, "step": 749700 }, { "epoch": 101.02398275397466, "grad_norm": 0.2315622717142105, "learning_rate": 0.0002749333812389593, "loss": 0.52, "step": 749800 }, { "epoch": 101.03745621126382, "grad_norm": 0.17946207523345947, "learning_rate": 0.0002748959549687117, "loss": 0.5207, "step": 749900 }, { "epoch": 101.05092966855295, "grad_norm": 0.19936777651309967, "learning_rate": 0.000274858528698464, "loss": 0.5219, "step": 750000 }, { "epoch": 101.06440312584209, "grad_norm": 0.19615690410137177, "learning_rate": 0.00027482110242821637, "loss": 0.5222, "step": 750100 }, { "epoch": 101.07787658313123, "grad_norm": 0.17862968146800995, "learning_rate": 0.00027478367615796877, "loss": 0.5212, "step": 750200 }, { "epoch": 101.09135004042037, "grad_norm": 0.17456459999084473, "learning_rate": 0.00027474624988772117, "loss": 0.5223, "step": 750300 }, { "epoch": 101.10482349770952, "grad_norm": 0.1981360912322998, "learning_rate": 0.00027470882361747357, "loss": 0.5201, "step": 750400 }, { "epoch": 101.11829695499866, "grad_norm": 0.21715512871742249, "learning_rate": 0.00027467139734722596, "loss": 0.5208, "step": 750500 }, { "epoch": 101.13177041228779, "grad_norm": 0.1850256323814392, "learning_rate": 0.00027463397107697836, "loss": 0.5218, "step": 750600 }, { "epoch": 101.14524386957693, "grad_norm": 0.20511141419410706, "learning_rate": 0.00027459654480673076, "loss": 0.522, "step": 750700 }, { "epoch": 101.15871732686607, "grad_norm": 0.17400823533535004, "learning_rate": 0.00027455911853648316, "loss": 0.5217, "step": 750800 }, { "epoch": 101.17219078415522, "grad_norm": 0.18329980969429016, "learning_rate": 0.0002745216922662355, "loss": 0.5211, "step": 750900 }, { "epoch": 101.18566424144436, "grad_norm": 0.18357807397842407, "learning_rate": 0.0002744842659959879, "loss": 0.5215, "step": 751000 }, { "epoch": 101.1991376987335, "grad_norm": 0.1955699920654297, "learning_rate": 0.0002744468397257403, "loss": 0.5213, "step": 751100 }, { "epoch": 101.21261115602263, "grad_norm": 0.18529708683490753, "learning_rate": 0.0002744094134554927, "loss": 0.5207, "step": 751200 }, { "epoch": 101.22608461331177, "grad_norm": 0.19266560673713684, "learning_rate": 0.0002743719871852451, "loss": 0.5211, "step": 751300 }, { "epoch": 101.23955807060092, "grad_norm": 0.20185092091560364, "learning_rate": 0.0002743345609149975, "loss": 0.5225, "step": 751400 }, { "epoch": 101.25303152789006, "grad_norm": 0.18219459056854248, "learning_rate": 0.0002742971346447499, "loss": 0.5205, "step": 751500 }, { "epoch": 101.2665049851792, "grad_norm": 0.19371066987514496, "learning_rate": 0.00027425970837450224, "loss": 0.5219, "step": 751600 }, { "epoch": 101.27997844246833, "grad_norm": 0.20023204386234283, "learning_rate": 0.00027422228210425463, "loss": 0.5215, "step": 751700 }, { "epoch": 101.29345189975747, "grad_norm": 0.1768636256456375, "learning_rate": 0.000274184855834007, "loss": 0.5222, "step": 751800 }, { "epoch": 101.30692535704662, "grad_norm": 0.20805758237838745, "learning_rate": 0.0002741474295637594, "loss": 0.5204, "step": 751900 }, { "epoch": 101.32039881433576, "grad_norm": 0.2159169316291809, "learning_rate": 0.0002741100032935118, "loss": 0.5217, "step": 752000 }, { "epoch": 101.3338722716249, "grad_norm": 0.18717271089553833, "learning_rate": 0.00027407257702326417, "loss": 0.5213, "step": 752100 }, { "epoch": 101.34734572891404, "grad_norm": 0.20948590338230133, "learning_rate": 0.00027403515075301657, "loss": 0.5205, "step": 752200 }, { "epoch": 101.36081918620317, "grad_norm": 0.1847599297761917, "learning_rate": 0.0002739977244827689, "loss": 0.522, "step": 752300 }, { "epoch": 101.37429264349232, "grad_norm": 0.19229499995708466, "learning_rate": 0.0002739602982125213, "loss": 0.5212, "step": 752400 }, { "epoch": 101.38776610078146, "grad_norm": 0.18874549865722656, "learning_rate": 0.0002739228719422737, "loss": 0.5219, "step": 752500 }, { "epoch": 101.4012395580706, "grad_norm": 0.20033085346221924, "learning_rate": 0.0002738854456720261, "loss": 0.5207, "step": 752600 }, { "epoch": 101.41471301535974, "grad_norm": 0.2036386877298355, "learning_rate": 0.0002738480194017785, "loss": 0.5217, "step": 752700 }, { "epoch": 101.42818647264887, "grad_norm": 0.1938789337873459, "learning_rate": 0.0002738105931315309, "loss": 0.5222, "step": 752800 }, { "epoch": 101.44165992993803, "grad_norm": 0.18905605375766754, "learning_rate": 0.0002737731668612833, "loss": 0.5228, "step": 752900 }, { "epoch": 101.45513338722716, "grad_norm": 0.21526281535625458, "learning_rate": 0.0002737357405910357, "loss": 0.5214, "step": 753000 }, { "epoch": 101.4686068445163, "grad_norm": 0.19761109352111816, "learning_rate": 0.00027369831432078805, "loss": 0.5211, "step": 753100 }, { "epoch": 101.48208030180544, "grad_norm": 0.1843656748533249, "learning_rate": 0.00027366088805054044, "loss": 0.5213, "step": 753200 }, { "epoch": 101.49555375909458, "grad_norm": 0.20318162441253662, "learning_rate": 0.00027362346178029284, "loss": 0.5223, "step": 753300 }, { "epoch": 101.50902721638373, "grad_norm": 0.19532538950443268, "learning_rate": 0.00027358603551004524, "loss": 0.5222, "step": 753400 }, { "epoch": 101.52250067367287, "grad_norm": 0.17618109285831451, "learning_rate": 0.00027354860923979764, "loss": 0.5223, "step": 753500 }, { "epoch": 101.535974130962, "grad_norm": 0.18537142872810364, "learning_rate": 0.00027351118296955, "loss": 0.5214, "step": 753600 }, { "epoch": 101.54944758825114, "grad_norm": 0.18344026803970337, "learning_rate": 0.0002734737566993024, "loss": 0.522, "step": 753700 }, { "epoch": 101.56292104554029, "grad_norm": 0.1844058483839035, "learning_rate": 0.0002734363304290547, "loss": 0.522, "step": 753800 }, { "epoch": 101.57639450282943, "grad_norm": 0.19062012434005737, "learning_rate": 0.0002733989041588071, "loss": 0.522, "step": 753900 }, { "epoch": 101.58986796011857, "grad_norm": 0.19110701978206635, "learning_rate": 0.0002733614778885595, "loss": 0.5219, "step": 754000 }, { "epoch": 101.6033414174077, "grad_norm": 0.201072096824646, "learning_rate": 0.0002733240516183119, "loss": 0.5211, "step": 754100 }, { "epoch": 101.61681487469684, "grad_norm": 0.18320170044898987, "learning_rate": 0.0002732866253480643, "loss": 0.5205, "step": 754200 }, { "epoch": 101.630288331986, "grad_norm": 0.17679788172245026, "learning_rate": 0.0002732491990778167, "loss": 0.5207, "step": 754300 }, { "epoch": 101.64376178927513, "grad_norm": 0.18247568607330322, "learning_rate": 0.0002732117728075691, "loss": 0.5225, "step": 754400 }, { "epoch": 101.65723524656427, "grad_norm": 0.1855001151561737, "learning_rate": 0.00027317434653732146, "loss": 0.5222, "step": 754500 }, { "epoch": 101.6707087038534, "grad_norm": 0.20135462284088135, "learning_rate": 0.00027313692026707386, "loss": 0.5219, "step": 754600 }, { "epoch": 101.68418216114254, "grad_norm": 0.19905762374401093, "learning_rate": 0.00027309949399682626, "loss": 0.5212, "step": 754700 }, { "epoch": 101.6976556184317, "grad_norm": 0.19517789781093597, "learning_rate": 0.00027306206772657865, "loss": 0.5206, "step": 754800 }, { "epoch": 101.71112907572083, "grad_norm": 0.1787770539522171, "learning_rate": 0.00027302464145633105, "loss": 0.5211, "step": 754900 }, { "epoch": 101.72460253300997, "grad_norm": 0.18777351081371307, "learning_rate": 0.00027298721518608345, "loss": 0.5215, "step": 755000 }, { "epoch": 101.73807599029911, "grad_norm": 0.1953069567680359, "learning_rate": 0.00027294978891583585, "loss": 0.5206, "step": 755100 }, { "epoch": 101.75154944758825, "grad_norm": 0.17508922517299652, "learning_rate": 0.0002729123626455882, "loss": 0.5217, "step": 755200 }, { "epoch": 101.7650229048774, "grad_norm": 0.19074515998363495, "learning_rate": 0.0002728749363753406, "loss": 0.5217, "step": 755300 }, { "epoch": 101.77849636216654, "grad_norm": 0.20605041086673737, "learning_rate": 0.00027283751010509293, "loss": 0.5224, "step": 755400 }, { "epoch": 101.79196981945567, "grad_norm": 0.24417030811309814, "learning_rate": 0.00027280008383484533, "loss": 0.521, "step": 755500 }, { "epoch": 101.80544327674481, "grad_norm": 0.19419938325881958, "learning_rate": 0.00027276265756459773, "loss": 0.5211, "step": 755600 }, { "epoch": 101.81891673403395, "grad_norm": 0.19101062417030334, "learning_rate": 0.00027272523129435013, "loss": 0.5207, "step": 755700 }, { "epoch": 101.8323901913231, "grad_norm": 0.18207313120365143, "learning_rate": 0.00027268780502410253, "loss": 0.5223, "step": 755800 }, { "epoch": 101.84586364861224, "grad_norm": 0.18531742691993713, "learning_rate": 0.0002726503787538549, "loss": 0.5222, "step": 755900 }, { "epoch": 101.85933710590137, "grad_norm": 0.21490031480789185, "learning_rate": 0.00027261295248360727, "loss": 0.5221, "step": 756000 }, { "epoch": 101.87281056319051, "grad_norm": 0.20004914700984955, "learning_rate": 0.00027257552621335967, "loss": 0.5221, "step": 756100 }, { "epoch": 101.88628402047965, "grad_norm": 0.20205718278884888, "learning_rate": 0.00027253809994311207, "loss": 0.5213, "step": 756200 }, { "epoch": 101.8997574777688, "grad_norm": 0.19971634447574615, "learning_rate": 0.00027250067367286446, "loss": 0.5207, "step": 756300 }, { "epoch": 101.91323093505794, "grad_norm": 0.18316441774368286, "learning_rate": 0.00027246324740261686, "loss": 0.5214, "step": 756400 }, { "epoch": 101.92670439234708, "grad_norm": 0.19489476084709167, "learning_rate": 0.00027242582113236926, "loss": 0.5218, "step": 756500 }, { "epoch": 101.94017784963621, "grad_norm": 0.18952083587646484, "learning_rate": 0.00027238839486212166, "loss": 0.522, "step": 756600 }, { "epoch": 101.95365130692535, "grad_norm": 0.18064266443252563, "learning_rate": 0.000272350968591874, "loss": 0.5209, "step": 756700 }, { "epoch": 101.9671247642145, "grad_norm": 0.18252837657928467, "learning_rate": 0.0002723135423216264, "loss": 0.5223, "step": 756800 }, { "epoch": 101.98059822150364, "grad_norm": 0.20892927050590515, "learning_rate": 0.0002722761160513788, "loss": 0.5223, "step": 756900 }, { "epoch": 101.99407167879278, "grad_norm": 0.18699362874031067, "learning_rate": 0.0002722386897811312, "loss": 0.5216, "step": 757000 }, { "epoch": 102.0, "eval_loss": 0.5108630657196045, "eval_runtime": 4.9487, "eval_samples_per_second": 1010.368, "eval_steps_per_second": 15.964, "step": 757044 }, { "epoch": 102.00754513608192, "grad_norm": 0.21695776283740997, "learning_rate": 0.0002722012635108836, "loss": 0.5215, "step": 757100 }, { "epoch": 102.02101859337105, "grad_norm": 0.20097194612026215, "learning_rate": 0.00027216383724063594, "loss": 0.5197, "step": 757200 }, { "epoch": 102.0344920506602, "grad_norm": 0.1767139434814453, "learning_rate": 0.00027212641097038834, "loss": 0.5196, "step": 757300 }, { "epoch": 102.04796550794934, "grad_norm": 0.1969229280948639, "learning_rate": 0.0002720889847001407, "loss": 0.5206, "step": 757400 }, { "epoch": 102.06143896523848, "grad_norm": 0.1865367442369461, "learning_rate": 0.0002720515584298931, "loss": 0.5216, "step": 757500 }, { "epoch": 102.07491242252762, "grad_norm": 0.18276934325695038, "learning_rate": 0.0002720141321596455, "loss": 0.521, "step": 757600 }, { "epoch": 102.08838587981676, "grad_norm": 0.21159857511520386, "learning_rate": 0.0002719767058893979, "loss": 0.5213, "step": 757700 }, { "epoch": 102.1018593371059, "grad_norm": 0.1864270269870758, "learning_rate": 0.0002719392796191503, "loss": 0.5214, "step": 757800 }, { "epoch": 102.11533279439504, "grad_norm": 0.18459643423557281, "learning_rate": 0.0002719018533489027, "loss": 0.5213, "step": 757900 }, { "epoch": 102.12880625168418, "grad_norm": 0.1904495656490326, "learning_rate": 0.00027186442707865507, "loss": 0.5214, "step": 758000 }, { "epoch": 102.14227970897332, "grad_norm": 0.18828411400318146, "learning_rate": 0.0002718270008084074, "loss": 0.5205, "step": 758100 }, { "epoch": 102.15575316626246, "grad_norm": 0.18950700759887695, "learning_rate": 0.0002717895745381598, "loss": 0.5219, "step": 758200 }, { "epoch": 102.16922662355161, "grad_norm": 0.2035508155822754, "learning_rate": 0.0002717521482679122, "loss": 0.5213, "step": 758300 }, { "epoch": 102.18270008084075, "grad_norm": 0.19833151996135712, "learning_rate": 0.0002717147219976646, "loss": 0.5211, "step": 758400 }, { "epoch": 102.19617353812988, "grad_norm": 0.18195171654224396, "learning_rate": 0.000271677295727417, "loss": 0.5208, "step": 758500 }, { "epoch": 102.20964699541902, "grad_norm": 0.22490929067134857, "learning_rate": 0.0002716398694571694, "loss": 0.5212, "step": 758600 }, { "epoch": 102.22312045270816, "grad_norm": 0.1920624077320099, "learning_rate": 0.0002716024431869218, "loss": 0.5214, "step": 758700 }, { "epoch": 102.23659390999731, "grad_norm": 0.19337034225463867, "learning_rate": 0.0002715650169166742, "loss": 0.5216, "step": 758800 }, { "epoch": 102.25006736728645, "grad_norm": 0.1985582709312439, "learning_rate": 0.00027152759064642655, "loss": 0.5212, "step": 758900 }, { "epoch": 102.26354082457559, "grad_norm": 0.20295099914073944, "learning_rate": 0.00027149016437617895, "loss": 0.5212, "step": 759000 }, { "epoch": 102.27701428186472, "grad_norm": 0.19482629001140594, "learning_rate": 0.0002714527381059313, "loss": 0.5212, "step": 759100 }, { "epoch": 102.29048773915386, "grad_norm": 0.194545716047287, "learning_rate": 0.0002714153118356837, "loss": 0.5219, "step": 759200 }, { "epoch": 102.30396119644301, "grad_norm": 0.18482190370559692, "learning_rate": 0.0002713778855654361, "loss": 0.5209, "step": 759300 }, { "epoch": 102.31743465373215, "grad_norm": 0.20092608034610748, "learning_rate": 0.0002713404592951885, "loss": 0.522, "step": 759400 }, { "epoch": 102.33090811102129, "grad_norm": 0.1920200139284134, "learning_rate": 0.0002713030330249409, "loss": 0.5212, "step": 759500 }, { "epoch": 102.34438156831042, "grad_norm": 0.19680219888687134, "learning_rate": 0.0002712656067546932, "loss": 0.5219, "step": 759600 }, { "epoch": 102.35785502559956, "grad_norm": 0.19612626731395721, "learning_rate": 0.0002712281804844456, "loss": 0.5213, "step": 759700 }, { "epoch": 102.37132848288871, "grad_norm": 0.1910802274942398, "learning_rate": 0.000271190754214198, "loss": 0.5214, "step": 759800 }, { "epoch": 102.38480194017785, "grad_norm": 0.2037854790687561, "learning_rate": 0.0002711533279439504, "loss": 0.521, "step": 759900 }, { "epoch": 102.39827539746699, "grad_norm": 0.18862807750701904, "learning_rate": 0.0002711159016737028, "loss": 0.5223, "step": 760000 }, { "epoch": 102.41174885475613, "grad_norm": 0.18247783184051514, "learning_rate": 0.0002710784754034552, "loss": 0.5214, "step": 760100 }, { "epoch": 102.42522231204526, "grad_norm": 0.20065805315971375, "learning_rate": 0.0002710410491332076, "loss": 0.5204, "step": 760200 }, { "epoch": 102.43869576933442, "grad_norm": 0.183046355843544, "learning_rate": 0.00027100362286295996, "loss": 0.5216, "step": 760300 }, { "epoch": 102.45216922662355, "grad_norm": 0.20796267688274384, "learning_rate": 0.00027096619659271236, "loss": 0.5218, "step": 760400 }, { "epoch": 102.46564268391269, "grad_norm": 0.18534713983535767, "learning_rate": 0.00027092877032246476, "loss": 0.5203, "step": 760500 }, { "epoch": 102.47911614120183, "grad_norm": 0.18883676826953888, "learning_rate": 0.00027089134405221716, "loss": 0.5216, "step": 760600 }, { "epoch": 102.49258959849097, "grad_norm": 0.22416996955871582, "learning_rate": 0.00027085391778196955, "loss": 0.5216, "step": 760700 }, { "epoch": 102.50606305578012, "grad_norm": 0.2192162573337555, "learning_rate": 0.00027081649151172195, "loss": 0.5211, "step": 760800 }, { "epoch": 102.51953651306926, "grad_norm": 0.19080010056495667, "learning_rate": 0.0002707790652414743, "loss": 0.5211, "step": 760900 }, { "epoch": 102.53300997035839, "grad_norm": 0.20532920956611633, "learning_rate": 0.0002707416389712267, "loss": 0.5224, "step": 761000 }, { "epoch": 102.54648342764753, "grad_norm": 0.17946206033229828, "learning_rate": 0.00027070421270097904, "loss": 0.521, "step": 761100 }, { "epoch": 102.55995688493667, "grad_norm": 0.18946796655654907, "learning_rate": 0.00027066678643073144, "loss": 0.5213, "step": 761200 }, { "epoch": 102.57343034222582, "grad_norm": 0.17445889115333557, "learning_rate": 0.00027062936016048383, "loss": 0.5218, "step": 761300 }, { "epoch": 102.58690379951496, "grad_norm": 0.20336507260799408, "learning_rate": 0.00027059193389023623, "loss": 0.5219, "step": 761400 }, { "epoch": 102.6003772568041, "grad_norm": 0.186836376786232, "learning_rate": 0.00027055450761998863, "loss": 0.5205, "step": 761500 }, { "epoch": 102.61385071409323, "grad_norm": 0.18645942211151123, "learning_rate": 0.00027051708134974103, "loss": 0.5228, "step": 761600 }, { "epoch": 102.62732417138238, "grad_norm": 0.1843416392803192, "learning_rate": 0.00027047965507949343, "loss": 0.5211, "step": 761700 }, { "epoch": 102.64079762867152, "grad_norm": 0.19058313965797424, "learning_rate": 0.00027044222880924577, "loss": 0.5217, "step": 761800 }, { "epoch": 102.65427108596066, "grad_norm": 0.19660545885562897, "learning_rate": 0.00027040480253899817, "loss": 0.5219, "step": 761900 }, { "epoch": 102.6677445432498, "grad_norm": 0.1835106462240219, "learning_rate": 0.00027036737626875057, "loss": 0.5222, "step": 762000 }, { "epoch": 102.68121800053893, "grad_norm": 0.1805116832256317, "learning_rate": 0.00027032994999850297, "loss": 0.5215, "step": 762100 }, { "epoch": 102.69469145782809, "grad_norm": 0.19263450801372528, "learning_rate": 0.00027029252372825536, "loss": 0.5209, "step": 762200 }, { "epoch": 102.70816491511722, "grad_norm": 0.1902167648077011, "learning_rate": 0.00027025509745800776, "loss": 0.5206, "step": 762300 }, { "epoch": 102.72163837240636, "grad_norm": 0.21243920922279358, "learning_rate": 0.00027021767118776016, "loss": 0.521, "step": 762400 }, { "epoch": 102.7351118296955, "grad_norm": 0.17875900864601135, "learning_rate": 0.0002701802449175125, "loss": 0.5213, "step": 762500 }, { "epoch": 102.74858528698464, "grad_norm": 0.20650449395179749, "learning_rate": 0.0002701428186472649, "loss": 0.5225, "step": 762600 }, { "epoch": 102.76205874427379, "grad_norm": 0.19786791503429413, "learning_rate": 0.00027010539237701725, "loss": 0.5212, "step": 762700 }, { "epoch": 102.77553220156292, "grad_norm": 0.23018227517604828, "learning_rate": 0.00027006796610676965, "loss": 0.5224, "step": 762800 }, { "epoch": 102.78900565885206, "grad_norm": 0.1914496272802353, "learning_rate": 0.00027003053983652204, "loss": 0.5207, "step": 762900 }, { "epoch": 102.8024791161412, "grad_norm": 0.17875656485557556, "learning_rate": 0.00026999311356627444, "loss": 0.5216, "step": 763000 }, { "epoch": 102.81595257343034, "grad_norm": 0.1810566484928131, "learning_rate": 0.00026995568729602684, "loss": 0.5225, "step": 763100 }, { "epoch": 102.82942603071949, "grad_norm": 0.21228301525115967, "learning_rate": 0.0002699182610257792, "loss": 0.5205, "step": 763200 }, { "epoch": 102.84289948800863, "grad_norm": 0.20583851635456085, "learning_rate": 0.0002698808347555316, "loss": 0.5203, "step": 763300 }, { "epoch": 102.85637294529776, "grad_norm": 0.2076621800661087, "learning_rate": 0.000269843408485284, "loss": 0.5219, "step": 763400 }, { "epoch": 102.8698464025869, "grad_norm": 0.18375642597675323, "learning_rate": 0.0002698059822150364, "loss": 0.5203, "step": 763500 }, { "epoch": 102.88331985987604, "grad_norm": 0.1870594620704651, "learning_rate": 0.0002697685559447888, "loss": 0.5216, "step": 763600 }, { "epoch": 102.89679331716519, "grad_norm": 0.18288373947143555, "learning_rate": 0.0002697311296745412, "loss": 0.5215, "step": 763700 }, { "epoch": 102.91026677445433, "grad_norm": 0.19566600024700165, "learning_rate": 0.0002696937034042936, "loss": 0.5212, "step": 763800 }, { "epoch": 102.92374023174347, "grad_norm": 0.20406901836395264, "learning_rate": 0.00026965627713404597, "loss": 0.5213, "step": 763900 }, { "epoch": 102.9372136890326, "grad_norm": 0.20211854577064514, "learning_rate": 0.0002696188508637983, "loss": 0.5215, "step": 764000 }, { "epoch": 102.95068714632174, "grad_norm": 0.2032322883605957, "learning_rate": 0.0002695814245935507, "loss": 0.5204, "step": 764100 }, { "epoch": 102.96416060361089, "grad_norm": 0.18831142783164978, "learning_rate": 0.0002695439983233031, "loss": 0.5216, "step": 764200 }, { "epoch": 102.97763406090003, "grad_norm": 0.21133844554424286, "learning_rate": 0.0002695065720530555, "loss": 0.5232, "step": 764300 }, { "epoch": 102.99110751818917, "grad_norm": 0.18922701478004456, "learning_rate": 0.0002694691457828079, "loss": 0.5213, "step": 764400 }, { "epoch": 103.0, "eval_loss": 0.5101766586303711, "eval_runtime": 4.9632, "eval_samples_per_second": 1007.407, "eval_steps_per_second": 15.917, "step": 764466 }, { "epoch": 103.0045809754783, "grad_norm": 0.17884021997451782, "learning_rate": 0.00026943171951256025, "loss": 0.5207, "step": 764500 }, { "epoch": 103.01805443276744, "grad_norm": 0.19954721629619598, "learning_rate": 0.00026939429324231265, "loss": 0.5211, "step": 764600 }, { "epoch": 103.0315278900566, "grad_norm": 0.19341987371444702, "learning_rate": 0.000269356866972065, "loss": 0.5204, "step": 764700 }, { "epoch": 103.04500134734573, "grad_norm": 0.197254940867424, "learning_rate": 0.0002693194407018174, "loss": 0.5206, "step": 764800 }, { "epoch": 103.05847480463487, "grad_norm": 0.19470667839050293, "learning_rate": 0.0002692820144315698, "loss": 0.5211, "step": 764900 }, { "epoch": 103.071948261924, "grad_norm": 0.1916249394416809, "learning_rate": 0.0002692445881613222, "loss": 0.5202, "step": 765000 }, { "epoch": 103.08542171921314, "grad_norm": 0.18343976140022278, "learning_rate": 0.0002692071618910746, "loss": 0.5211, "step": 765100 }, { "epoch": 103.0988951765023, "grad_norm": 0.1761413961648941, "learning_rate": 0.000269169735620827, "loss": 0.5201, "step": 765200 }, { "epoch": 103.11236863379143, "grad_norm": 0.1943068504333496, "learning_rate": 0.0002691323093505794, "loss": 0.521, "step": 765300 }, { "epoch": 103.12584209108057, "grad_norm": 0.1927172839641571, "learning_rate": 0.00026909488308033173, "loss": 0.5209, "step": 765400 }, { "epoch": 103.13931554836971, "grad_norm": 0.19405677914619446, "learning_rate": 0.0002690574568100841, "loss": 0.5213, "step": 765500 }, { "epoch": 103.15278900565885, "grad_norm": 0.2056225836277008, "learning_rate": 0.0002690200305398365, "loss": 0.5212, "step": 765600 }, { "epoch": 103.166262462948, "grad_norm": 0.17989790439605713, "learning_rate": 0.0002689826042695889, "loss": 0.5214, "step": 765700 }, { "epoch": 103.17973592023714, "grad_norm": 0.21113841235637665, "learning_rate": 0.0002689451779993413, "loss": 0.5204, "step": 765800 }, { "epoch": 103.19320937752627, "grad_norm": 0.20474055409431458, "learning_rate": 0.0002689077517290937, "loss": 0.522, "step": 765900 }, { "epoch": 103.20668283481541, "grad_norm": 0.19753974676132202, "learning_rate": 0.0002688703254588461, "loss": 0.5218, "step": 766000 }, { "epoch": 103.22015629210455, "grad_norm": 0.18657580018043518, "learning_rate": 0.0002688328991885985, "loss": 0.5212, "step": 766100 }, { "epoch": 103.2336297493937, "grad_norm": 0.18400977551937103, "learning_rate": 0.00026879547291835086, "loss": 0.5206, "step": 766200 }, { "epoch": 103.24710320668284, "grad_norm": 0.17860917747020721, "learning_rate": 0.0002687580466481032, "loss": 0.5217, "step": 766300 }, { "epoch": 103.26057666397197, "grad_norm": 0.1681405007839203, "learning_rate": 0.0002687206203778556, "loss": 0.5216, "step": 766400 }, { "epoch": 103.27405012126111, "grad_norm": 0.19615143537521362, "learning_rate": 0.000268683194107608, "loss": 0.5202, "step": 766500 }, { "epoch": 103.28752357855025, "grad_norm": 0.1852668821811676, "learning_rate": 0.0002686457678373604, "loss": 0.5212, "step": 766600 }, { "epoch": 103.3009970358394, "grad_norm": 0.1799110323190689, "learning_rate": 0.0002686083415671128, "loss": 0.5208, "step": 766700 }, { "epoch": 103.31447049312854, "grad_norm": 0.1927529126405716, "learning_rate": 0.0002685709152968652, "loss": 0.5217, "step": 766800 }, { "epoch": 103.32794395041768, "grad_norm": 0.18798907101154327, "learning_rate": 0.00026853348902661754, "loss": 0.521, "step": 766900 }, { "epoch": 103.34141740770681, "grad_norm": 0.18513308465480804, "learning_rate": 0.00026849606275636994, "loss": 0.5212, "step": 767000 }, { "epoch": 103.35489086499595, "grad_norm": 0.20094676315784454, "learning_rate": 0.00026845863648612234, "loss": 0.5213, "step": 767100 }, { "epoch": 103.3683643222851, "grad_norm": 0.1942676603794098, "learning_rate": 0.00026842121021587473, "loss": 0.5225, "step": 767200 }, { "epoch": 103.38183777957424, "grad_norm": 0.1969568282365799, "learning_rate": 0.00026838378394562713, "loss": 0.5225, "step": 767300 }, { "epoch": 103.39531123686338, "grad_norm": 0.18806250393390656, "learning_rate": 0.00026834635767537953, "loss": 0.5216, "step": 767400 }, { "epoch": 103.40878469415252, "grad_norm": 0.1979444921016693, "learning_rate": 0.00026830893140513193, "loss": 0.5211, "step": 767500 }, { "epoch": 103.42225815144165, "grad_norm": 0.21642059087753296, "learning_rate": 0.00026827150513488427, "loss": 0.5215, "step": 767600 }, { "epoch": 103.4357316087308, "grad_norm": 0.20212797820568085, "learning_rate": 0.00026823407886463667, "loss": 0.5202, "step": 767700 }, { "epoch": 103.44920506601994, "grad_norm": 0.18749314546585083, "learning_rate": 0.00026819665259438907, "loss": 0.5197, "step": 767800 }, { "epoch": 103.46267852330908, "grad_norm": 0.18629980087280273, "learning_rate": 0.00026815922632414147, "loss": 0.5212, "step": 767900 }, { "epoch": 103.47615198059822, "grad_norm": 0.20065850019454956, "learning_rate": 0.00026812180005389387, "loss": 0.5213, "step": 768000 }, { "epoch": 103.48962543788736, "grad_norm": 0.18924319744110107, "learning_rate": 0.0002680843737836462, "loss": 0.5216, "step": 768100 }, { "epoch": 103.5030988951765, "grad_norm": 0.19436587393283844, "learning_rate": 0.0002680469475133986, "loss": 0.5217, "step": 768200 }, { "epoch": 103.51657235246564, "grad_norm": 0.2032962292432785, "learning_rate": 0.00026800952124315095, "loss": 0.5203, "step": 768300 }, { "epoch": 103.53004580975478, "grad_norm": 0.1808687001466751, "learning_rate": 0.00026797209497290335, "loss": 0.5201, "step": 768400 }, { "epoch": 103.54351926704392, "grad_norm": 0.20010899007320404, "learning_rate": 0.00026793466870265575, "loss": 0.5211, "step": 768500 }, { "epoch": 103.55699272433307, "grad_norm": 0.19148144125938416, "learning_rate": 0.00026789724243240815, "loss": 0.5223, "step": 768600 }, { "epoch": 103.57046618162221, "grad_norm": 0.18196062743663788, "learning_rate": 0.00026785981616216054, "loss": 0.5218, "step": 768700 }, { "epoch": 103.58393963891135, "grad_norm": 0.20811514556407928, "learning_rate": 0.00026782238989191294, "loss": 0.5217, "step": 768800 }, { "epoch": 103.59741309620048, "grad_norm": 0.18555808067321777, "learning_rate": 0.00026778496362166534, "loss": 0.5223, "step": 768900 }, { "epoch": 103.61088655348962, "grad_norm": 0.21338717639446259, "learning_rate": 0.00026774753735141774, "loss": 0.5213, "step": 769000 }, { "epoch": 103.62436001077877, "grad_norm": 0.1857941597700119, "learning_rate": 0.0002677101110811701, "loss": 0.5214, "step": 769100 }, { "epoch": 103.63783346806791, "grad_norm": 0.19436956942081451, "learning_rate": 0.0002676726848109225, "loss": 0.5221, "step": 769200 }, { "epoch": 103.65130692535705, "grad_norm": 0.20367580652236938, "learning_rate": 0.0002676352585406749, "loss": 0.5209, "step": 769300 }, { "epoch": 103.66478038264619, "grad_norm": 0.18978819251060486, "learning_rate": 0.0002675978322704273, "loss": 0.5222, "step": 769400 }, { "epoch": 103.67825383993532, "grad_norm": 0.20536449551582336, "learning_rate": 0.0002675604060001797, "loss": 0.5201, "step": 769500 }, { "epoch": 103.69172729722447, "grad_norm": 0.18307377398014069, "learning_rate": 0.0002675229797299321, "loss": 0.5207, "step": 769600 }, { "epoch": 103.70520075451361, "grad_norm": 0.18766532838344574, "learning_rate": 0.00026748555345968447, "loss": 0.5204, "step": 769700 }, { "epoch": 103.71867421180275, "grad_norm": 0.21496669948101044, "learning_rate": 0.0002674481271894368, "loss": 0.5213, "step": 769800 }, { "epoch": 103.73214766909189, "grad_norm": 0.18872618675231934, "learning_rate": 0.00026741070091918916, "loss": 0.5209, "step": 769900 }, { "epoch": 103.74562112638102, "grad_norm": 0.1844988316297531, "learning_rate": 0.00026737327464894156, "loss": 0.5212, "step": 770000 }, { "epoch": 103.75909458367018, "grad_norm": 0.19890792667865753, "learning_rate": 0.00026733584837869396, "loss": 0.521, "step": 770100 }, { "epoch": 103.77256804095931, "grad_norm": 0.20319172739982605, "learning_rate": 0.00026729842210844636, "loss": 0.5216, "step": 770200 }, { "epoch": 103.78604149824845, "grad_norm": 0.1937636137008667, "learning_rate": 0.00026726099583819875, "loss": 0.5215, "step": 770300 }, { "epoch": 103.79951495553759, "grad_norm": 0.20016424357891083, "learning_rate": 0.00026722356956795115, "loss": 0.5205, "step": 770400 }, { "epoch": 103.81298841282673, "grad_norm": 0.19301412999629974, "learning_rate": 0.0002671861432977035, "loss": 0.5219, "step": 770500 }, { "epoch": 103.82646187011588, "grad_norm": 0.20926423370838165, "learning_rate": 0.0002671487170274559, "loss": 0.5213, "step": 770600 }, { "epoch": 103.83993532740502, "grad_norm": 0.20399145781993866, "learning_rate": 0.0002671112907572083, "loss": 0.5222, "step": 770700 }, { "epoch": 103.85340878469415, "grad_norm": 0.20036330819129944, "learning_rate": 0.0002670738644869607, "loss": 0.5213, "step": 770800 }, { "epoch": 103.86688224198329, "grad_norm": 0.1920090615749359, "learning_rate": 0.0002670364382167131, "loss": 0.5219, "step": 770900 }, { "epoch": 103.88035569927243, "grad_norm": 0.17641407251358032, "learning_rate": 0.0002669990119464655, "loss": 0.521, "step": 771000 }, { "epoch": 103.89382915656158, "grad_norm": 0.2041090428829193, "learning_rate": 0.0002669615856762179, "loss": 0.5209, "step": 771100 }, { "epoch": 103.90730261385072, "grad_norm": 0.22438199818134308, "learning_rate": 0.0002669241594059703, "loss": 0.5212, "step": 771200 }, { "epoch": 103.92077607113985, "grad_norm": 0.18353873491287231, "learning_rate": 0.00026688673313572263, "loss": 0.5215, "step": 771300 }, { "epoch": 103.93424952842899, "grad_norm": 0.18702727556228638, "learning_rate": 0.000266849306865475, "loss": 0.5207, "step": 771400 }, { "epoch": 103.94772298571813, "grad_norm": 0.20185783505439758, "learning_rate": 0.0002668118805952274, "loss": 0.5209, "step": 771500 }, { "epoch": 103.96119644300728, "grad_norm": 0.20386295020580292, "learning_rate": 0.0002667744543249798, "loss": 0.5202, "step": 771600 }, { "epoch": 103.97466990029642, "grad_norm": 0.2008417695760727, "learning_rate": 0.00026673702805473217, "loss": 0.5209, "step": 771700 }, { "epoch": 103.98814335758556, "grad_norm": 0.18655520677566528, "learning_rate": 0.00026669960178448456, "loss": 0.5221, "step": 771800 }, { "epoch": 104.0, "eval_loss": 0.509543776512146, "eval_runtime": 4.9417, "eval_samples_per_second": 1011.806, "eval_steps_per_second": 15.987, "step": 771888 }, { "epoch": 104.0016168148747, "grad_norm": 0.19537383317947388, "learning_rate": 0.00026666217551423696, "loss": 0.5201, "step": 771900 }, { "epoch": 104.01509027216383, "grad_norm": 0.21651028096675873, "learning_rate": 0.0002666247492439893, "loss": 0.5208, "step": 772000 }, { "epoch": 104.02856372945298, "grad_norm": 0.17716237902641296, "learning_rate": 0.0002665873229737417, "loss": 0.5196, "step": 772100 }, { "epoch": 104.04203718674212, "grad_norm": 0.21253210306167603, "learning_rate": 0.0002665498967034941, "loss": 0.5208, "step": 772200 }, { "epoch": 104.05551064403126, "grad_norm": 0.20278631150722504, "learning_rate": 0.0002665124704332465, "loss": 0.52, "step": 772300 }, { "epoch": 104.0689841013204, "grad_norm": 0.19846078753471375, "learning_rate": 0.0002664750441629989, "loss": 0.5201, "step": 772400 }, { "epoch": 104.08245755860953, "grad_norm": 0.179824098944664, "learning_rate": 0.0002664376178927513, "loss": 0.5203, "step": 772500 }, { "epoch": 104.09593101589869, "grad_norm": 0.23726336658000946, "learning_rate": 0.0002664001916225037, "loss": 0.5204, "step": 772600 }, { "epoch": 104.10940447318782, "grad_norm": 0.21595700085163116, "learning_rate": 0.00026636276535225604, "loss": 0.5211, "step": 772700 }, { "epoch": 104.12287793047696, "grad_norm": 0.2035108208656311, "learning_rate": 0.00026632533908200844, "loss": 0.5207, "step": 772800 }, { "epoch": 104.1363513877661, "grad_norm": 0.20133692026138306, "learning_rate": 0.00026628791281176084, "loss": 0.5203, "step": 772900 }, { "epoch": 104.14982484505524, "grad_norm": 0.19756411015987396, "learning_rate": 0.00026625048654151324, "loss": 0.5207, "step": 773000 }, { "epoch": 104.16329830234439, "grad_norm": 0.1990814357995987, "learning_rate": 0.00026621306027126563, "loss": 0.5203, "step": 773100 }, { "epoch": 104.17677175963352, "grad_norm": 0.18633948266506195, "learning_rate": 0.00026617563400101803, "loss": 0.5215, "step": 773200 }, { "epoch": 104.19024521692266, "grad_norm": 0.2089063972234726, "learning_rate": 0.00026613820773077043, "loss": 0.5214, "step": 773300 }, { "epoch": 104.2037186742118, "grad_norm": 0.1810973435640335, "learning_rate": 0.0002661007814605228, "loss": 0.5214, "step": 773400 }, { "epoch": 104.21719213150094, "grad_norm": 0.1960265338420868, "learning_rate": 0.0002660633551902751, "loss": 0.521, "step": 773500 }, { "epoch": 104.23066558879009, "grad_norm": 0.18715223670005798, "learning_rate": 0.0002660259289200275, "loss": 0.5211, "step": 773600 }, { "epoch": 104.24413904607923, "grad_norm": 0.20602764189243317, "learning_rate": 0.0002659885026497799, "loss": 0.5204, "step": 773700 }, { "epoch": 104.25761250336836, "grad_norm": 0.1924056112766266, "learning_rate": 0.0002659510763795323, "loss": 0.5215, "step": 773800 }, { "epoch": 104.2710859606575, "grad_norm": 0.19356319308280945, "learning_rate": 0.0002659136501092847, "loss": 0.5207, "step": 773900 }, { "epoch": 104.28455941794664, "grad_norm": 0.18460392951965332, "learning_rate": 0.0002658762238390371, "loss": 0.5212, "step": 774000 }, { "epoch": 104.29803287523579, "grad_norm": 0.19397370517253876, "learning_rate": 0.0002658387975687895, "loss": 0.5203, "step": 774100 }, { "epoch": 104.31150633252493, "grad_norm": 0.18641966581344604, "learning_rate": 0.00026580137129854185, "loss": 0.5212, "step": 774200 }, { "epoch": 104.32497978981407, "grad_norm": 0.18444111943244934, "learning_rate": 0.00026576394502829425, "loss": 0.5207, "step": 774300 }, { "epoch": 104.3384532471032, "grad_norm": 0.18622896075248718, "learning_rate": 0.00026572651875804665, "loss": 0.5201, "step": 774400 }, { "epoch": 104.35192670439234, "grad_norm": 0.19096867740154266, "learning_rate": 0.00026568909248779905, "loss": 0.5206, "step": 774500 }, { "epoch": 104.36540016168149, "grad_norm": 0.18897394835948944, "learning_rate": 0.00026565166621755144, "loss": 0.5208, "step": 774600 }, { "epoch": 104.37887361897063, "grad_norm": 0.18294833600521088, "learning_rate": 0.00026561423994730384, "loss": 0.5209, "step": 774700 }, { "epoch": 104.39234707625977, "grad_norm": 0.18290284276008606, "learning_rate": 0.00026557681367705624, "loss": 0.5205, "step": 774800 }, { "epoch": 104.4058205335489, "grad_norm": 0.19536349177360535, "learning_rate": 0.0002655393874068086, "loss": 0.5206, "step": 774900 }, { "epoch": 104.41929399083804, "grad_norm": 0.1809062659740448, "learning_rate": 0.000265501961136561, "loss": 0.5213, "step": 775000 }, { "epoch": 104.4327674481272, "grad_norm": 0.2140905112028122, "learning_rate": 0.0002654645348663134, "loss": 0.5217, "step": 775100 }, { "epoch": 104.44624090541633, "grad_norm": 0.20304127037525177, "learning_rate": 0.0002654271085960658, "loss": 0.5216, "step": 775200 }, { "epoch": 104.45971436270547, "grad_norm": 0.22124521434307098, "learning_rate": 0.0002653896823258182, "loss": 0.5198, "step": 775300 }, { "epoch": 104.4731878199946, "grad_norm": 0.1999436318874359, "learning_rate": 0.0002653522560555705, "loss": 0.5204, "step": 775400 }, { "epoch": 104.48666127728374, "grad_norm": 0.18324607610702515, "learning_rate": 0.0002653148297853229, "loss": 0.5212, "step": 775500 }, { "epoch": 104.5001347345729, "grad_norm": 0.1969374418258667, "learning_rate": 0.00026527740351507526, "loss": 0.521, "step": 775600 }, { "epoch": 104.51360819186203, "grad_norm": 0.18635505437850952, "learning_rate": 0.00026523997724482766, "loss": 0.5204, "step": 775700 }, { "epoch": 104.52708164915117, "grad_norm": 0.19136942923069, "learning_rate": 0.00026520255097458006, "loss": 0.5211, "step": 775800 }, { "epoch": 104.54055510644031, "grad_norm": 0.18568983674049377, "learning_rate": 0.00026516512470433246, "loss": 0.5208, "step": 775900 }, { "epoch": 104.55402856372945, "grad_norm": 0.19421271979808807, "learning_rate": 0.00026512769843408486, "loss": 0.5218, "step": 776000 }, { "epoch": 104.5675020210186, "grad_norm": 0.19376559555530548, "learning_rate": 0.00026509027216383726, "loss": 0.521, "step": 776100 }, { "epoch": 104.58097547830774, "grad_norm": 0.1900901049375534, "learning_rate": 0.00026505284589358965, "loss": 0.522, "step": 776200 }, { "epoch": 104.59444893559687, "grad_norm": 0.19398340582847595, "learning_rate": 0.00026501541962334205, "loss": 0.5212, "step": 776300 }, { "epoch": 104.60792239288601, "grad_norm": 0.19594915211200714, "learning_rate": 0.0002649779933530944, "loss": 0.5218, "step": 776400 }, { "epoch": 104.62139585017516, "grad_norm": 0.19181634485721588, "learning_rate": 0.0002649405670828468, "loss": 0.5216, "step": 776500 }, { "epoch": 104.6348693074643, "grad_norm": 0.20764662325382233, "learning_rate": 0.0002649031408125992, "loss": 0.522, "step": 776600 }, { "epoch": 104.64834276475344, "grad_norm": 0.18342165648937225, "learning_rate": 0.0002648657145423516, "loss": 0.5216, "step": 776700 }, { "epoch": 104.66181622204257, "grad_norm": 0.18536362051963806, "learning_rate": 0.000264828288272104, "loss": 0.5208, "step": 776800 }, { "epoch": 104.67528967933171, "grad_norm": 0.19736982882022858, "learning_rate": 0.0002647908620018564, "loss": 0.5209, "step": 776900 }, { "epoch": 104.68876313662086, "grad_norm": 0.18533942103385925, "learning_rate": 0.0002647534357316088, "loss": 0.5208, "step": 777000 }, { "epoch": 104.70223659391, "grad_norm": 0.20455235242843628, "learning_rate": 0.00026471600946136113, "loss": 0.5215, "step": 777100 }, { "epoch": 104.71571005119914, "grad_norm": 0.19822263717651367, "learning_rate": 0.0002646785831911135, "loss": 0.5212, "step": 777200 }, { "epoch": 104.72918350848828, "grad_norm": 0.23248709738254547, "learning_rate": 0.00026464115692086587, "loss": 0.522, "step": 777300 }, { "epoch": 104.74265696577741, "grad_norm": 0.19801612198352814, "learning_rate": 0.00026460373065061827, "loss": 0.5209, "step": 777400 }, { "epoch": 104.75613042306657, "grad_norm": 0.19239449501037598, "learning_rate": 0.00026456630438037067, "loss": 0.5215, "step": 777500 }, { "epoch": 104.7696038803557, "grad_norm": 0.18424434959888458, "learning_rate": 0.00026452887811012307, "loss": 0.5219, "step": 777600 }, { "epoch": 104.78307733764484, "grad_norm": 0.20848271250724792, "learning_rate": 0.00026449145183987546, "loss": 0.5213, "step": 777700 }, { "epoch": 104.79655079493398, "grad_norm": 0.18370263278484344, "learning_rate": 0.0002644540255696278, "loss": 0.5213, "step": 777800 }, { "epoch": 104.81002425222312, "grad_norm": 0.19741475582122803, "learning_rate": 0.0002644165992993802, "loss": 0.5215, "step": 777900 }, { "epoch": 104.82349770951227, "grad_norm": 0.18318863213062286, "learning_rate": 0.0002643791730291326, "loss": 0.5212, "step": 778000 }, { "epoch": 104.8369711668014, "grad_norm": 0.18992169201374054, "learning_rate": 0.000264341746758885, "loss": 0.5215, "step": 778100 }, { "epoch": 104.85044462409054, "grad_norm": 0.20853684842586517, "learning_rate": 0.0002643043204886374, "loss": 0.5205, "step": 778200 }, { "epoch": 104.86391808137968, "grad_norm": 0.17327748239040375, "learning_rate": 0.0002642668942183898, "loss": 0.5215, "step": 778300 }, { "epoch": 104.87739153866882, "grad_norm": 0.2032080888748169, "learning_rate": 0.0002642294679481422, "loss": 0.5202, "step": 778400 }, { "epoch": 104.89086499595797, "grad_norm": 0.17929968237876892, "learning_rate": 0.00026419204167789454, "loss": 0.5209, "step": 778500 }, { "epoch": 104.9043384532471, "grad_norm": 0.21810749173164368, "learning_rate": 0.00026415461540764694, "loss": 0.5214, "step": 778600 }, { "epoch": 104.91781191053624, "grad_norm": 0.18244801461696625, "learning_rate": 0.00026411718913739934, "loss": 0.5211, "step": 778700 }, { "epoch": 104.93128536782538, "grad_norm": 0.18999376893043518, "learning_rate": 0.00026407976286715174, "loss": 0.5208, "step": 778800 }, { "epoch": 104.94475882511452, "grad_norm": 0.1895948350429535, "learning_rate": 0.00026404233659690413, "loss": 0.522, "step": 778900 }, { "epoch": 104.95823228240367, "grad_norm": 0.17894238233566284, "learning_rate": 0.0002640049103266565, "loss": 0.5224, "step": 779000 }, { "epoch": 104.97170573969281, "grad_norm": 0.21924929320812225, "learning_rate": 0.0002639674840564089, "loss": 0.5208, "step": 779100 }, { "epoch": 104.98517919698195, "grad_norm": 0.20212741196155548, "learning_rate": 0.0002639300577861613, "loss": 0.5216, "step": 779200 }, { "epoch": 104.99865265427108, "grad_norm": 0.19874992966651917, "learning_rate": 0.0002638926315159136, "loss": 0.5206, "step": 779300 }, { "epoch": 105.0, "eval_loss": 0.5100452899932861, "eval_runtime": 4.9339, "eval_samples_per_second": 1013.387, "eval_steps_per_second": 16.012, "step": 779310 }, { "epoch": 105.01212611156022, "grad_norm": 0.19215701520442963, "learning_rate": 0.000263855205245666, "loss": 0.5208, "step": 779400 }, { "epoch": 105.02559956884937, "grad_norm": 0.18789692223072052, "learning_rate": 0.0002638177789754184, "loss": 0.5208, "step": 779500 }, { "epoch": 105.03907302613851, "grad_norm": 0.20408810675144196, "learning_rate": 0.0002637803527051708, "loss": 0.5204, "step": 779600 }, { "epoch": 105.05254648342765, "grad_norm": 0.17384177446365356, "learning_rate": 0.0002637429264349232, "loss": 0.5206, "step": 779700 }, { "epoch": 105.06601994071679, "grad_norm": 0.1911306381225586, "learning_rate": 0.0002637055001646756, "loss": 0.5214, "step": 779800 }, { "epoch": 105.07949339800592, "grad_norm": 0.19679440557956696, "learning_rate": 0.000263668073894428, "loss": 0.5205, "step": 779900 }, { "epoch": 105.09296685529507, "grad_norm": 0.2177019715309143, "learning_rate": 0.00026363064762418035, "loss": 0.5203, "step": 780000 }, { "epoch": 105.10644031258421, "grad_norm": 0.17697252333164215, "learning_rate": 0.00026359322135393275, "loss": 0.5204, "step": 780100 }, { "epoch": 105.11991376987335, "grad_norm": 0.19379976391792297, "learning_rate": 0.00026355579508368515, "loss": 0.5206, "step": 780200 }, { "epoch": 105.13338722716249, "grad_norm": 0.18273116648197174, "learning_rate": 0.00026351836881343755, "loss": 0.5205, "step": 780300 }, { "epoch": 105.14686068445162, "grad_norm": 0.1901829093694687, "learning_rate": 0.00026348094254318995, "loss": 0.5198, "step": 780400 }, { "epoch": 105.16033414174078, "grad_norm": 0.1798679083585739, "learning_rate": 0.00026344351627294234, "loss": 0.5201, "step": 780500 }, { "epoch": 105.17380759902991, "grad_norm": 0.18898028135299683, "learning_rate": 0.00026340609000269474, "loss": 0.5215, "step": 780600 }, { "epoch": 105.18728105631905, "grad_norm": 0.19530221819877625, "learning_rate": 0.0002633686637324471, "loss": 0.5206, "step": 780700 }, { "epoch": 105.20075451360819, "grad_norm": 0.1951768845319748, "learning_rate": 0.00026333123746219943, "loss": 0.5203, "step": 780800 }, { "epoch": 105.21422797089733, "grad_norm": 0.1834346503019333, "learning_rate": 0.00026329381119195183, "loss": 0.5204, "step": 780900 }, { "epoch": 105.22770142818648, "grad_norm": 0.20989909768104553, "learning_rate": 0.0002632563849217042, "loss": 0.52, "step": 781000 }, { "epoch": 105.24117488547562, "grad_norm": 0.19471308588981628, "learning_rate": 0.0002632189586514566, "loss": 0.5211, "step": 781100 }, { "epoch": 105.25464834276475, "grad_norm": 0.23274995386600494, "learning_rate": 0.000263181532381209, "loss": 0.5212, "step": 781200 }, { "epoch": 105.26812180005389, "grad_norm": 0.1880931258201599, "learning_rate": 0.0002631441061109614, "loss": 0.5211, "step": 781300 }, { "epoch": 105.28159525734303, "grad_norm": 0.21757759153842926, "learning_rate": 0.00026310667984071377, "loss": 0.5208, "step": 781400 }, { "epoch": 105.29506871463218, "grad_norm": 0.19025801122188568, "learning_rate": 0.00026306925357046616, "loss": 0.5212, "step": 781500 }, { "epoch": 105.30854217192132, "grad_norm": 0.18365277349948883, "learning_rate": 0.00026303182730021856, "loss": 0.5211, "step": 781600 }, { "epoch": 105.32201562921045, "grad_norm": 0.189235657453537, "learning_rate": 0.00026299440102997096, "loss": 0.5217, "step": 781700 }, { "epoch": 105.33548908649959, "grad_norm": 0.19952765107154846, "learning_rate": 0.00026295697475972336, "loss": 0.5209, "step": 781800 }, { "epoch": 105.34896254378873, "grad_norm": 0.19376379251480103, "learning_rate": 0.00026291954848947576, "loss": 0.5207, "step": 781900 }, { "epoch": 105.36243600107788, "grad_norm": 0.1948009431362152, "learning_rate": 0.00026288212221922815, "loss": 0.5211, "step": 782000 }, { "epoch": 105.37590945836702, "grad_norm": 0.17628145217895508, "learning_rate": 0.00026284469594898055, "loss": 0.5207, "step": 782100 }, { "epoch": 105.38938291565616, "grad_norm": 0.1873674988746643, "learning_rate": 0.0002628072696787329, "loss": 0.5197, "step": 782200 }, { "epoch": 105.4028563729453, "grad_norm": 0.1993507593870163, "learning_rate": 0.0002627698434084853, "loss": 0.52, "step": 782300 }, { "epoch": 105.41632983023443, "grad_norm": 0.1827133148908615, "learning_rate": 0.0002627324171382377, "loss": 0.5205, "step": 782400 }, { "epoch": 105.42980328752358, "grad_norm": 0.21771325170993805, "learning_rate": 0.0002626949908679901, "loss": 0.522, "step": 782500 }, { "epoch": 105.44327674481272, "grad_norm": 0.18261492252349854, "learning_rate": 0.00026265756459774244, "loss": 0.5205, "step": 782600 }, { "epoch": 105.45675020210186, "grad_norm": 0.19488494098186493, "learning_rate": 0.00026262013832749483, "loss": 0.5214, "step": 782700 }, { "epoch": 105.470223659391, "grad_norm": 0.18240045011043549, "learning_rate": 0.00026258271205724723, "loss": 0.5207, "step": 782800 }, { "epoch": 105.48369711668013, "grad_norm": 0.1874956637620926, "learning_rate": 0.0002625452857869996, "loss": 0.5222, "step": 782900 }, { "epoch": 105.49717057396929, "grad_norm": 0.17666879296302795, "learning_rate": 0.000262507859516752, "loss": 0.5209, "step": 783000 }, { "epoch": 105.51064403125842, "grad_norm": 0.1826154887676239, "learning_rate": 0.00026247043324650437, "loss": 0.5197, "step": 783100 }, { "epoch": 105.52411748854756, "grad_norm": 0.19685788452625275, "learning_rate": 0.00026243300697625677, "loss": 0.5204, "step": 783200 }, { "epoch": 105.5375909458367, "grad_norm": 0.17827777564525604, "learning_rate": 0.00026239558070600917, "loss": 0.5208, "step": 783300 }, { "epoch": 105.55106440312584, "grad_norm": 0.18649360537528992, "learning_rate": 0.00026235815443576157, "loss": 0.5204, "step": 783400 }, { "epoch": 105.56453786041499, "grad_norm": 0.20545461773872375, "learning_rate": 0.00026232072816551397, "loss": 0.5219, "step": 783500 }, { "epoch": 105.57801131770412, "grad_norm": 0.21093985438346863, "learning_rate": 0.0002622833018952663, "loss": 0.5207, "step": 783600 }, { "epoch": 105.59148477499326, "grad_norm": 0.20648036897182465, "learning_rate": 0.0002622458756250187, "loss": 0.5212, "step": 783700 }, { "epoch": 105.6049582322824, "grad_norm": 0.1972782015800476, "learning_rate": 0.0002622084493547711, "loss": 0.521, "step": 783800 }, { "epoch": 105.61843168957154, "grad_norm": 0.1923370361328125, "learning_rate": 0.0002621710230845235, "loss": 0.52, "step": 783900 }, { "epoch": 105.63190514686069, "grad_norm": 0.1894480288028717, "learning_rate": 0.0002621335968142759, "loss": 0.5199, "step": 784000 }, { "epoch": 105.64537860414983, "grad_norm": 0.17973493039608002, "learning_rate": 0.0002620961705440283, "loss": 0.5202, "step": 784100 }, { "epoch": 105.65885206143896, "grad_norm": 0.2092074751853943, "learning_rate": 0.0002620587442737807, "loss": 0.5215, "step": 784200 }, { "epoch": 105.6723255187281, "grad_norm": 0.20059601962566376, "learning_rate": 0.0002620213180035331, "loss": 0.5204, "step": 784300 }, { "epoch": 105.68579897601725, "grad_norm": 0.1983763873577118, "learning_rate": 0.0002619838917332854, "loss": 0.5209, "step": 784400 }, { "epoch": 105.69927243330639, "grad_norm": 0.20269107818603516, "learning_rate": 0.0002619464654630378, "loss": 0.5207, "step": 784500 }, { "epoch": 105.71274589059553, "grad_norm": 0.1900489330291748, "learning_rate": 0.0002619090391927902, "loss": 0.521, "step": 784600 }, { "epoch": 105.72621934788467, "grad_norm": 0.19697487354278564, "learning_rate": 0.0002618716129225426, "loss": 0.5206, "step": 784700 }, { "epoch": 105.7396928051738, "grad_norm": 0.19817186892032623, "learning_rate": 0.000261834186652295, "loss": 0.5208, "step": 784800 }, { "epoch": 105.75316626246295, "grad_norm": 0.20158390700817108, "learning_rate": 0.0002617967603820474, "loss": 0.5216, "step": 784900 }, { "epoch": 105.76663971975209, "grad_norm": 0.1950959414243698, "learning_rate": 0.0002617593341117998, "loss": 0.5221, "step": 785000 }, { "epoch": 105.78011317704123, "grad_norm": 0.2382810413837433, "learning_rate": 0.0002617219078415521, "loss": 0.5207, "step": 785100 }, { "epoch": 105.79358663433037, "grad_norm": 0.2109946459531784, "learning_rate": 0.0002616844815713045, "loss": 0.5209, "step": 785200 }, { "epoch": 105.8070600916195, "grad_norm": 0.19368518888950348, "learning_rate": 0.0002616470553010569, "loss": 0.5207, "step": 785300 }, { "epoch": 105.82053354890866, "grad_norm": 0.18676894903182983, "learning_rate": 0.0002616096290308093, "loss": 0.5207, "step": 785400 }, { "epoch": 105.8340070061978, "grad_norm": 0.1807517111301422, "learning_rate": 0.0002615722027605617, "loss": 0.5204, "step": 785500 }, { "epoch": 105.84748046348693, "grad_norm": 0.1967315673828125, "learning_rate": 0.0002615347764903141, "loss": 0.521, "step": 785600 }, { "epoch": 105.86095392077607, "grad_norm": 0.1926768571138382, "learning_rate": 0.0002614973502200665, "loss": 0.5201, "step": 785700 }, { "epoch": 105.8744273780652, "grad_norm": 0.186419278383255, "learning_rate": 0.00026145992394981885, "loss": 0.5207, "step": 785800 }, { "epoch": 105.88790083535436, "grad_norm": 0.18234458565711975, "learning_rate": 0.00026142249767957125, "loss": 0.5211, "step": 785900 }, { "epoch": 105.9013742926435, "grad_norm": 0.20306973159313202, "learning_rate": 0.00026138507140932365, "loss": 0.5213, "step": 786000 }, { "epoch": 105.91484774993263, "grad_norm": 0.20470592379570007, "learning_rate": 0.00026134764513907605, "loss": 0.5214, "step": 786100 }, { "epoch": 105.92832120722177, "grad_norm": 0.19212643802165985, "learning_rate": 0.0002613102188688284, "loss": 0.5203, "step": 786200 }, { "epoch": 105.94179466451091, "grad_norm": 0.18195070326328278, "learning_rate": 0.0002612727925985808, "loss": 0.5209, "step": 786300 }, { "epoch": 105.95526812180006, "grad_norm": 0.1893858164548874, "learning_rate": 0.0002612353663283332, "loss": 0.5211, "step": 786400 }, { "epoch": 105.9687415790892, "grad_norm": 0.21810874342918396, "learning_rate": 0.00026119794005808553, "loss": 0.5213, "step": 786500 }, { "epoch": 105.98221503637834, "grad_norm": 0.1988682895898819, "learning_rate": 0.00026116051378783793, "loss": 0.5211, "step": 786600 }, { "epoch": 105.99568849366747, "grad_norm": 0.1862511783838272, "learning_rate": 0.00026112308751759033, "loss": 0.5209, "step": 786700 }, { "epoch": 106.0, "eval_loss": 0.5096709132194519, "eval_runtime": 4.9361, "eval_samples_per_second": 1012.955, "eval_steps_per_second": 16.005, "step": 786732 }, { "epoch": 106.00916195095661, "grad_norm": 0.19812442362308502, "learning_rate": 0.00026108566124734273, "loss": 0.5201, "step": 786800 }, { "epoch": 106.02263540824576, "grad_norm": 0.19916267693042755, "learning_rate": 0.0002610482349770951, "loss": 0.5188, "step": 786900 }, { "epoch": 106.0361088655349, "grad_norm": 0.1921694427728653, "learning_rate": 0.0002610108087068475, "loss": 0.5203, "step": 787000 }, { "epoch": 106.04958232282404, "grad_norm": 0.20212846994400024, "learning_rate": 0.0002609733824365999, "loss": 0.5202, "step": 787100 }, { "epoch": 106.06305578011317, "grad_norm": 0.21043206751346588, "learning_rate": 0.0002609359561663523, "loss": 0.5205, "step": 787200 }, { "epoch": 106.07652923740231, "grad_norm": 0.22468435764312744, "learning_rate": 0.00026089852989610466, "loss": 0.5203, "step": 787300 }, { "epoch": 106.09000269469146, "grad_norm": 0.19944721460342407, "learning_rate": 0.00026086110362585706, "loss": 0.5193, "step": 787400 }, { "epoch": 106.1034761519806, "grad_norm": 0.19365637004375458, "learning_rate": 0.00026082367735560946, "loss": 0.5202, "step": 787500 }, { "epoch": 106.11694960926974, "grad_norm": 0.1837846040725708, "learning_rate": 0.00026078625108536186, "loss": 0.5203, "step": 787600 }, { "epoch": 106.13042306655888, "grad_norm": 0.19484150409698486, "learning_rate": 0.00026074882481511426, "loss": 0.5202, "step": 787700 }, { "epoch": 106.14389652384801, "grad_norm": 0.1965959668159485, "learning_rate": 0.00026071139854486666, "loss": 0.5211, "step": 787800 }, { "epoch": 106.15736998113717, "grad_norm": 0.20943234860897064, "learning_rate": 0.00026067397227461905, "loss": 0.5203, "step": 787900 }, { "epoch": 106.1708434384263, "grad_norm": 0.20214961469173431, "learning_rate": 0.00026063654600437134, "loss": 0.5202, "step": 788000 }, { "epoch": 106.18431689571544, "grad_norm": 0.1887456625699997, "learning_rate": 0.00026059911973412374, "loss": 0.52, "step": 788100 }, { "epoch": 106.19779035300458, "grad_norm": 0.18793298304080963, "learning_rate": 0.00026056169346387614, "loss": 0.521, "step": 788200 }, { "epoch": 106.21126381029372, "grad_norm": 0.1920541524887085, "learning_rate": 0.00026052426719362854, "loss": 0.5202, "step": 788300 }, { "epoch": 106.22473726758287, "grad_norm": 0.18986952304840088, "learning_rate": 0.00026048684092338094, "loss": 0.5211, "step": 788400 }, { "epoch": 106.238210724872, "grad_norm": 0.18276700377464294, "learning_rate": 0.00026044941465313333, "loss": 0.5208, "step": 788500 }, { "epoch": 106.25168418216114, "grad_norm": 0.19415925443172455, "learning_rate": 0.00026041198838288573, "loss": 0.5205, "step": 788600 }, { "epoch": 106.26515763945028, "grad_norm": 0.2070445865392685, "learning_rate": 0.0002603745621126381, "loss": 0.5207, "step": 788700 }, { "epoch": 106.27863109673942, "grad_norm": 0.18674413859844208, "learning_rate": 0.0002603371358423905, "loss": 0.5204, "step": 788800 }, { "epoch": 106.29210455402857, "grad_norm": 0.1827012002468109, "learning_rate": 0.0002602997095721429, "loss": 0.5202, "step": 788900 }, { "epoch": 106.3055780113177, "grad_norm": 0.18443068861961365, "learning_rate": 0.00026026228330189527, "loss": 0.521, "step": 789000 }, { "epoch": 106.31905146860684, "grad_norm": 0.20162244141101837, "learning_rate": 0.00026022485703164767, "loss": 0.5213, "step": 789100 }, { "epoch": 106.33252492589598, "grad_norm": 0.2360360026359558, "learning_rate": 0.00026018743076140007, "loss": 0.5204, "step": 789200 }, { "epoch": 106.34599838318512, "grad_norm": 0.19736845791339874, "learning_rate": 0.00026015000449115247, "loss": 0.5201, "step": 789300 }, { "epoch": 106.35947184047427, "grad_norm": 0.19134265184402466, "learning_rate": 0.00026011257822090486, "loss": 0.5196, "step": 789400 }, { "epoch": 106.37294529776341, "grad_norm": 0.17223390936851501, "learning_rate": 0.0002600751519506572, "loss": 0.5202, "step": 789500 }, { "epoch": 106.38641875505255, "grad_norm": 0.21065157651901245, "learning_rate": 0.0002600377256804096, "loss": 0.5213, "step": 789600 }, { "epoch": 106.39989221234168, "grad_norm": 0.1863216608762741, "learning_rate": 0.000260000299410162, "loss": 0.5208, "step": 789700 }, { "epoch": 106.41336566963082, "grad_norm": 0.18831866979599, "learning_rate": 0.00025996287313991435, "loss": 0.521, "step": 789800 }, { "epoch": 106.42683912691997, "grad_norm": 0.1971854418516159, "learning_rate": 0.00025992544686966675, "loss": 0.5207, "step": 789900 }, { "epoch": 106.44031258420911, "grad_norm": 0.1900806427001953, "learning_rate": 0.00025988802059941915, "loss": 0.5207, "step": 790000 }, { "epoch": 106.45378604149825, "grad_norm": 0.19586192071437836, "learning_rate": 0.00025985059432917154, "loss": 0.5219, "step": 790100 }, { "epoch": 106.46725949878739, "grad_norm": 0.20514030754566193, "learning_rate": 0.0002598131680589239, "loss": 0.5212, "step": 790200 }, { "epoch": 106.48073295607652, "grad_norm": 0.18994325399398804, "learning_rate": 0.0002597757417886763, "loss": 0.5194, "step": 790300 }, { "epoch": 106.49420641336567, "grad_norm": 0.1772071123123169, "learning_rate": 0.0002597383155184287, "loss": 0.521, "step": 790400 }, { "epoch": 106.50767987065481, "grad_norm": 0.18773430585861206, "learning_rate": 0.0002597008892481811, "loss": 0.5204, "step": 790500 }, { "epoch": 106.52115332794395, "grad_norm": 0.22513750195503235, "learning_rate": 0.0002596634629779335, "loss": 0.5213, "step": 790600 }, { "epoch": 106.53462678523309, "grad_norm": 0.18424831330776215, "learning_rate": 0.0002596260367076859, "loss": 0.52, "step": 790700 }, { "epoch": 106.54810024252222, "grad_norm": 0.18860433995723724, "learning_rate": 0.0002595886104374383, "loss": 0.5211, "step": 790800 }, { "epoch": 106.56157369981138, "grad_norm": 0.20174288749694824, "learning_rate": 0.0002595511841671906, "loss": 0.5199, "step": 790900 }, { "epoch": 106.57504715710051, "grad_norm": 0.1952853500843048, "learning_rate": 0.000259513757896943, "loss": 0.5202, "step": 791000 }, { "epoch": 106.58852061438965, "grad_norm": 0.20942026376724243, "learning_rate": 0.0002594763316266954, "loss": 0.5208, "step": 791100 }, { "epoch": 106.60199407167879, "grad_norm": 0.1802225410938263, "learning_rate": 0.0002594389053564478, "loss": 0.5196, "step": 791200 }, { "epoch": 106.61546752896794, "grad_norm": 0.18602070212364197, "learning_rate": 0.0002594014790862002, "loss": 0.5213, "step": 791300 }, { "epoch": 106.62894098625708, "grad_norm": 0.1804376095533371, "learning_rate": 0.0002593640528159526, "loss": 0.5199, "step": 791400 }, { "epoch": 106.64241444354622, "grad_norm": 0.18981702625751495, "learning_rate": 0.000259326626545705, "loss": 0.5213, "step": 791500 }, { "epoch": 106.65588790083535, "grad_norm": 0.19089367985725403, "learning_rate": 0.00025928920027545735, "loss": 0.5211, "step": 791600 }, { "epoch": 106.66936135812449, "grad_norm": 0.20342430472373962, "learning_rate": 0.0002592517740052097, "loss": 0.5203, "step": 791700 }, { "epoch": 106.68283481541364, "grad_norm": 0.19479729235172272, "learning_rate": 0.0002592143477349621, "loss": 0.5207, "step": 791800 }, { "epoch": 106.69630827270278, "grad_norm": 0.1861610859632492, "learning_rate": 0.0002591769214647145, "loss": 0.521, "step": 791900 }, { "epoch": 106.70978172999192, "grad_norm": 0.1876479834318161, "learning_rate": 0.0002591394951944669, "loss": 0.5217, "step": 792000 }, { "epoch": 106.72325518728105, "grad_norm": 0.2133457213640213, "learning_rate": 0.0002591020689242193, "loss": 0.5205, "step": 792100 }, { "epoch": 106.73672864457019, "grad_norm": 0.20727145671844482, "learning_rate": 0.0002590646426539717, "loss": 0.5211, "step": 792200 }, { "epoch": 106.75020210185934, "grad_norm": 0.2079066038131714, "learning_rate": 0.0002590272163837241, "loss": 0.5212, "step": 792300 }, { "epoch": 106.76367555914848, "grad_norm": 0.1961582452058792, "learning_rate": 0.00025898979011347643, "loss": 0.5208, "step": 792400 }, { "epoch": 106.77714901643762, "grad_norm": 0.183585524559021, "learning_rate": 0.00025895236384322883, "loss": 0.5214, "step": 792500 }, { "epoch": 106.79062247372676, "grad_norm": 0.1866372674703598, "learning_rate": 0.00025891493757298123, "loss": 0.5205, "step": 792600 }, { "epoch": 106.8040959310159, "grad_norm": 0.19664135575294495, "learning_rate": 0.0002588775113027336, "loss": 0.5206, "step": 792700 }, { "epoch": 106.81756938830505, "grad_norm": 0.1887149065732956, "learning_rate": 0.000258840085032486, "loss": 0.521, "step": 792800 }, { "epoch": 106.83104284559418, "grad_norm": 0.18691466748714447, "learning_rate": 0.0002588026587622384, "loss": 0.5208, "step": 792900 }, { "epoch": 106.84451630288332, "grad_norm": 0.1910405308008194, "learning_rate": 0.0002587652324919908, "loss": 0.5206, "step": 793000 }, { "epoch": 106.85798976017246, "grad_norm": 0.21112346649169922, "learning_rate": 0.00025872780622174317, "loss": 0.5215, "step": 793100 }, { "epoch": 106.8714632174616, "grad_norm": 0.2009659707546234, "learning_rate": 0.00025869037995149556, "loss": 0.5208, "step": 793200 }, { "epoch": 106.88493667475075, "grad_norm": 0.2096218764781952, "learning_rate": 0.00025865295368124796, "loss": 0.5208, "step": 793300 }, { "epoch": 106.89841013203988, "grad_norm": 0.18683335185050964, "learning_rate": 0.00025861552741100036, "loss": 0.5211, "step": 793400 }, { "epoch": 106.91188358932902, "grad_norm": 0.17582662403583527, "learning_rate": 0.0002585781011407527, "loss": 0.5217, "step": 793500 }, { "epoch": 106.92535704661816, "grad_norm": 0.19348786771297455, "learning_rate": 0.0002585406748705051, "loss": 0.5206, "step": 793600 }, { "epoch": 106.9388305039073, "grad_norm": 0.21839338541030884, "learning_rate": 0.0002585032486002575, "loss": 0.5216, "step": 793700 }, { "epoch": 106.95230396119645, "grad_norm": 0.18678665161132812, "learning_rate": 0.00025846582233000984, "loss": 0.5215, "step": 793800 }, { "epoch": 106.96577741848559, "grad_norm": 0.1970764845609665, "learning_rate": 0.00025842839605976224, "loss": 0.5211, "step": 793900 }, { "epoch": 106.97925087577472, "grad_norm": 0.18361957371234894, "learning_rate": 0.00025839096978951464, "loss": 0.5212, "step": 794000 }, { "epoch": 106.99272433306386, "grad_norm": 0.21295660734176636, "learning_rate": 0.00025835354351926704, "loss": 0.5198, "step": 794100 }, { "epoch": 107.0, "eval_loss": 0.5100466012954712, "eval_runtime": 4.9718, "eval_samples_per_second": 1005.672, "eval_steps_per_second": 15.89, "step": 794154 }, { "epoch": 107.006197790353, "grad_norm": 0.19165976345539093, "learning_rate": 0.00025831611724901944, "loss": 0.5216, "step": 794200 }, { "epoch": 107.01967124764215, "grad_norm": 0.1814134120941162, "learning_rate": 0.00025827869097877184, "loss": 0.5201, "step": 794300 }, { "epoch": 107.03314470493129, "grad_norm": 0.18374286592006683, "learning_rate": 0.00025824126470852423, "loss": 0.5218, "step": 794400 }, { "epoch": 107.04661816222043, "grad_norm": 0.20750024914741516, "learning_rate": 0.00025820383843827663, "loss": 0.5201, "step": 794500 }, { "epoch": 107.06009161950956, "grad_norm": 0.20285598933696747, "learning_rate": 0.000258166412168029, "loss": 0.5198, "step": 794600 }, { "epoch": 107.0735650767987, "grad_norm": 0.17903392016887665, "learning_rate": 0.0002581289858977814, "loss": 0.5191, "step": 794700 }, { "epoch": 107.08703853408785, "grad_norm": 0.21471340954303741, "learning_rate": 0.0002580915596275338, "loss": 0.52, "step": 794800 }, { "epoch": 107.10051199137699, "grad_norm": 0.18358999490737915, "learning_rate": 0.00025805413335728617, "loss": 0.5211, "step": 794900 }, { "epoch": 107.11398544866613, "grad_norm": 0.1854161024093628, "learning_rate": 0.00025801670708703857, "loss": 0.5197, "step": 795000 }, { "epoch": 107.12745890595527, "grad_norm": 0.21030162274837494, "learning_rate": 0.00025797928081679097, "loss": 0.5205, "step": 795100 }, { "epoch": 107.1409323632444, "grad_norm": 0.19646352529525757, "learning_rate": 0.00025794185454654337, "loss": 0.5204, "step": 795200 }, { "epoch": 107.15440582053355, "grad_norm": 0.18289442360401154, "learning_rate": 0.00025790442827629566, "loss": 0.5204, "step": 795300 }, { "epoch": 107.16787927782269, "grad_norm": 0.21518640220165253, "learning_rate": 0.00025786700200604805, "loss": 0.5203, "step": 795400 }, { "epoch": 107.18135273511183, "grad_norm": 0.19932778179645538, "learning_rate": 0.00025782957573580045, "loss": 0.5196, "step": 795500 }, { "epoch": 107.19482619240097, "grad_norm": 0.19183386862277985, "learning_rate": 0.00025779214946555285, "loss": 0.5195, "step": 795600 }, { "epoch": 107.2082996496901, "grad_norm": 0.1868259459733963, "learning_rate": 0.00025775472319530525, "loss": 0.5201, "step": 795700 }, { "epoch": 107.22177310697926, "grad_norm": 0.1918644905090332, "learning_rate": 0.00025771729692505765, "loss": 0.5205, "step": 795800 }, { "epoch": 107.2352465642684, "grad_norm": 0.18952880799770355, "learning_rate": 0.00025767987065481005, "loss": 0.5204, "step": 795900 }, { "epoch": 107.24872002155753, "grad_norm": 0.19742174446582794, "learning_rate": 0.0002576424443845624, "loss": 0.5215, "step": 796000 }, { "epoch": 107.26219347884667, "grad_norm": 0.21718591451644897, "learning_rate": 0.0002576050181143148, "loss": 0.5198, "step": 796100 }, { "epoch": 107.2756669361358, "grad_norm": 0.19960807263851166, "learning_rate": 0.0002575675918440672, "loss": 0.5209, "step": 796200 }, { "epoch": 107.28914039342496, "grad_norm": 0.18225198984146118, "learning_rate": 0.0002575301655738196, "loss": 0.5206, "step": 796300 }, { "epoch": 107.3026138507141, "grad_norm": 0.20370033383369446, "learning_rate": 0.000257492739303572, "loss": 0.5214, "step": 796400 }, { "epoch": 107.31608730800323, "grad_norm": 0.1865866333246231, "learning_rate": 0.0002574553130333244, "loss": 0.5213, "step": 796500 }, { "epoch": 107.32956076529237, "grad_norm": 0.20228621363639832, "learning_rate": 0.0002574178867630768, "loss": 0.5201, "step": 796600 }, { "epoch": 107.34303422258151, "grad_norm": 0.20552772283554077, "learning_rate": 0.0002573804604928291, "loss": 0.5208, "step": 796700 }, { "epoch": 107.35650767987066, "grad_norm": 0.19622169435024261, "learning_rate": 0.0002573430342225815, "loss": 0.5207, "step": 796800 }, { "epoch": 107.3699811371598, "grad_norm": 0.1860787272453308, "learning_rate": 0.0002573056079523339, "loss": 0.5207, "step": 796900 }, { "epoch": 107.38345459444893, "grad_norm": 0.18785759806632996, "learning_rate": 0.0002572681816820863, "loss": 0.5208, "step": 797000 }, { "epoch": 107.39692805173807, "grad_norm": 0.18963830173015594, "learning_rate": 0.00025723075541183866, "loss": 0.5204, "step": 797100 }, { "epoch": 107.41040150902721, "grad_norm": 0.18585336208343506, "learning_rate": 0.00025719332914159106, "loss": 0.5201, "step": 797200 }, { "epoch": 107.42387496631636, "grad_norm": 0.18615926802158356, "learning_rate": 0.00025715590287134346, "loss": 0.5204, "step": 797300 }, { "epoch": 107.4373484236055, "grad_norm": 0.1879274696111679, "learning_rate": 0.00025711847660109586, "loss": 0.5205, "step": 797400 }, { "epoch": 107.45082188089464, "grad_norm": 0.1978682279586792, "learning_rate": 0.0002570810503308482, "loss": 0.5205, "step": 797500 }, { "epoch": 107.46429533818377, "grad_norm": 0.17711056768894196, "learning_rate": 0.0002570436240606006, "loss": 0.5197, "step": 797600 }, { "epoch": 107.47776879547291, "grad_norm": 0.201747864484787, "learning_rate": 0.000257006197790353, "loss": 0.5207, "step": 797700 }, { "epoch": 107.49124225276206, "grad_norm": 0.18010269105434418, "learning_rate": 0.0002569687715201054, "loss": 0.5198, "step": 797800 }, { "epoch": 107.5047157100512, "grad_norm": 0.19091956317424774, "learning_rate": 0.0002569313452498578, "loss": 0.5213, "step": 797900 }, { "epoch": 107.51818916734034, "grad_norm": 0.18397699296474457, "learning_rate": 0.0002568939189796102, "loss": 0.5205, "step": 798000 }, { "epoch": 107.53166262462948, "grad_norm": 0.2058003693819046, "learning_rate": 0.0002568564927093626, "loss": 0.5203, "step": 798100 }, { "epoch": 107.54513608191861, "grad_norm": 0.2075928896665573, "learning_rate": 0.00025681906643911493, "loss": 0.5203, "step": 798200 }, { "epoch": 107.55860953920777, "grad_norm": 0.20209529995918274, "learning_rate": 0.00025678164016886733, "loss": 0.5203, "step": 798300 }, { "epoch": 107.5720829964969, "grad_norm": 0.19980736076831818, "learning_rate": 0.00025674421389861973, "loss": 0.5207, "step": 798400 }, { "epoch": 107.58555645378604, "grad_norm": 0.20432454347610474, "learning_rate": 0.00025670678762837213, "loss": 0.5208, "step": 798500 }, { "epoch": 107.59902991107518, "grad_norm": 0.1886196732521057, "learning_rate": 0.0002566693613581245, "loss": 0.5208, "step": 798600 }, { "epoch": 107.61250336836432, "grad_norm": 0.21953928470611572, "learning_rate": 0.0002566319350878769, "loss": 0.5209, "step": 798700 }, { "epoch": 107.62597682565347, "grad_norm": 0.19431139528751373, "learning_rate": 0.0002565945088176293, "loss": 0.5199, "step": 798800 }, { "epoch": 107.6394502829426, "grad_norm": 0.22328303754329681, "learning_rate": 0.0002565570825473816, "loss": 0.5205, "step": 798900 }, { "epoch": 107.65292374023174, "grad_norm": 0.1923709213733673, "learning_rate": 0.000256519656277134, "loss": 0.5194, "step": 799000 }, { "epoch": 107.66639719752088, "grad_norm": 0.1789938062429428, "learning_rate": 0.0002564822300068864, "loss": 0.5209, "step": 799100 }, { "epoch": 107.67987065481003, "grad_norm": 0.18811114132404327, "learning_rate": 0.0002564448037366388, "loss": 0.5205, "step": 799200 }, { "epoch": 107.69334411209917, "grad_norm": 0.19331514835357666, "learning_rate": 0.0002564073774663912, "loss": 0.52, "step": 799300 }, { "epoch": 107.7068175693883, "grad_norm": 0.2004549652338028, "learning_rate": 0.0002563699511961436, "loss": 0.5198, "step": 799400 }, { "epoch": 107.72029102667744, "grad_norm": 0.19271144270896912, "learning_rate": 0.000256332524925896, "loss": 0.5208, "step": 799500 }, { "epoch": 107.73376448396658, "grad_norm": 0.18637652695178986, "learning_rate": 0.00025629509865564835, "loss": 0.5213, "step": 799600 }, { "epoch": 107.74723794125573, "grad_norm": 0.20318831503391266, "learning_rate": 0.00025625767238540074, "loss": 0.5207, "step": 799700 }, { "epoch": 107.76071139854487, "grad_norm": 0.1904263198375702, "learning_rate": 0.00025622024611515314, "loss": 0.5217, "step": 799800 }, { "epoch": 107.77418485583401, "grad_norm": 0.19289058446884155, "learning_rate": 0.00025618281984490554, "loss": 0.5204, "step": 799900 }, { "epoch": 107.78765831312315, "grad_norm": 0.20437903702259064, "learning_rate": 0.00025614539357465794, "loss": 0.521, "step": 800000 }, { "epoch": 107.80113177041228, "grad_norm": 0.1895752102136612, "learning_rate": 0.00025610796730441034, "loss": 0.5197, "step": 800100 }, { "epoch": 107.81460522770143, "grad_norm": 0.22690962255001068, "learning_rate": 0.00025607054103416274, "loss": 0.5213, "step": 800200 }, { "epoch": 107.82807868499057, "grad_norm": 0.1989445984363556, "learning_rate": 0.00025603311476391513, "loss": 0.5196, "step": 800300 }, { "epoch": 107.84155214227971, "grad_norm": 0.21036653220653534, "learning_rate": 0.0002559956884936675, "loss": 0.5214, "step": 800400 }, { "epoch": 107.85502559956885, "grad_norm": 0.18454234302043915, "learning_rate": 0.0002559582622234199, "loss": 0.5214, "step": 800500 }, { "epoch": 107.86849905685798, "grad_norm": 0.2008080631494522, "learning_rate": 0.0002559208359531723, "loss": 0.5212, "step": 800600 }, { "epoch": 107.88197251414714, "grad_norm": 0.18611635267734528, "learning_rate": 0.0002558834096829246, "loss": 0.5208, "step": 800700 }, { "epoch": 107.89544597143627, "grad_norm": 0.20894359052181244, "learning_rate": 0.000255845983412677, "loss": 0.5202, "step": 800800 }, { "epoch": 107.90891942872541, "grad_norm": 0.2026352882385254, "learning_rate": 0.0002558085571424294, "loss": 0.5204, "step": 800900 }, { "epoch": 107.92239288601455, "grad_norm": 0.1962963491678238, "learning_rate": 0.0002557711308721818, "loss": 0.5205, "step": 801000 }, { "epoch": 107.93586634330369, "grad_norm": 0.19121114909648895, "learning_rate": 0.00025573370460193416, "loss": 0.5204, "step": 801100 }, { "epoch": 107.94933980059284, "grad_norm": 0.22102288901805878, "learning_rate": 0.00025569627833168656, "loss": 0.5213, "step": 801200 }, { "epoch": 107.96281325788198, "grad_norm": 0.19074784219264984, "learning_rate": 0.00025565885206143895, "loss": 0.5201, "step": 801300 }, { "epoch": 107.97628671517111, "grad_norm": 0.1833614706993103, "learning_rate": 0.00025562142579119135, "loss": 0.5202, "step": 801400 }, { "epoch": 107.98976017246025, "grad_norm": 0.19268953800201416, "learning_rate": 0.00025558399952094375, "loss": 0.5206, "step": 801500 }, { "epoch": 108.0, "eval_loss": 0.5095017552375793, "eval_runtime": 4.9907, "eval_samples_per_second": 1001.862, "eval_steps_per_second": 15.829, "step": 801576 }, { "epoch": 108.00323362974939, "grad_norm": 0.19980740547180176, "learning_rate": 0.00025554657325069615, "loss": 0.5208, "step": 801600 }, { "epoch": 108.01670708703854, "grad_norm": 0.20154106616973877, "learning_rate": 0.00025550914698044855, "loss": 0.5196, "step": 801700 }, { "epoch": 108.03018054432768, "grad_norm": 0.18370771408081055, "learning_rate": 0.0002554717207102009, "loss": 0.5196, "step": 801800 }, { "epoch": 108.04365400161682, "grad_norm": 0.17547112703323364, "learning_rate": 0.0002554342944399533, "loss": 0.5193, "step": 801900 }, { "epoch": 108.05712745890595, "grad_norm": 0.186521977186203, "learning_rate": 0.0002553968681697057, "loss": 0.52, "step": 802000 }, { "epoch": 108.07060091619509, "grad_norm": 0.23677153885364532, "learning_rate": 0.0002553594418994581, "loss": 0.5204, "step": 802100 }, { "epoch": 108.08407437348424, "grad_norm": 0.1821960210800171, "learning_rate": 0.0002553220156292105, "loss": 0.5194, "step": 802200 }, { "epoch": 108.09754783077338, "grad_norm": 0.1979355663061142, "learning_rate": 0.0002552845893589629, "loss": 0.5197, "step": 802300 }, { "epoch": 108.11102128806252, "grad_norm": 0.1797402948141098, "learning_rate": 0.0002552471630887153, "loss": 0.5209, "step": 802400 }, { "epoch": 108.12449474535165, "grad_norm": 0.18540754914283752, "learning_rate": 0.0002552097368184676, "loss": 0.5202, "step": 802500 }, { "epoch": 108.13796820264079, "grad_norm": 0.19462379813194275, "learning_rate": 0.00025517231054821997, "loss": 0.5205, "step": 802600 }, { "epoch": 108.15144165992994, "grad_norm": 0.21041832864284515, "learning_rate": 0.00025513488427797237, "loss": 0.5186, "step": 802700 }, { "epoch": 108.16491511721908, "grad_norm": 0.18830350041389465, "learning_rate": 0.00025509745800772476, "loss": 0.5202, "step": 802800 }, { "epoch": 108.17838857450822, "grad_norm": 0.1933022290468216, "learning_rate": 0.00025506003173747716, "loss": 0.5206, "step": 802900 }, { "epoch": 108.19186203179736, "grad_norm": 0.1892881691455841, "learning_rate": 0.00025502260546722956, "loss": 0.5198, "step": 803000 }, { "epoch": 108.2053354890865, "grad_norm": 0.18478275835514069, "learning_rate": 0.00025498517919698196, "loss": 0.5203, "step": 803100 }, { "epoch": 108.21880894637565, "grad_norm": 0.19790233671665192, "learning_rate": 0.00025494775292673436, "loss": 0.5205, "step": 803200 }, { "epoch": 108.23228240366478, "grad_norm": 0.2073841392993927, "learning_rate": 0.0002549103266564867, "loss": 0.5203, "step": 803300 }, { "epoch": 108.24575586095392, "grad_norm": 0.18867720663547516, "learning_rate": 0.0002548729003862391, "loss": 0.5202, "step": 803400 }, { "epoch": 108.25922931824306, "grad_norm": 0.19721931219100952, "learning_rate": 0.0002548354741159915, "loss": 0.52, "step": 803500 }, { "epoch": 108.2727027755322, "grad_norm": 0.20014697313308716, "learning_rate": 0.0002547980478457439, "loss": 0.5196, "step": 803600 }, { "epoch": 108.28617623282135, "grad_norm": 0.1890314519405365, "learning_rate": 0.0002547606215754963, "loss": 0.5201, "step": 803700 }, { "epoch": 108.29964969011048, "grad_norm": 0.2022112011909485, "learning_rate": 0.0002547231953052487, "loss": 0.5202, "step": 803800 }, { "epoch": 108.31312314739962, "grad_norm": 0.19709227979183197, "learning_rate": 0.0002546857690350011, "loss": 0.5208, "step": 803900 }, { "epoch": 108.32659660468876, "grad_norm": 0.1963728368282318, "learning_rate": 0.00025464834276475343, "loss": 0.5199, "step": 804000 }, { "epoch": 108.3400700619779, "grad_norm": 0.21159707009792328, "learning_rate": 0.00025461091649450583, "loss": 0.5201, "step": 804100 }, { "epoch": 108.35354351926705, "grad_norm": 0.1899135410785675, "learning_rate": 0.00025457349022425823, "loss": 0.5209, "step": 804200 }, { "epoch": 108.36701697655619, "grad_norm": 0.1827055811882019, "learning_rate": 0.0002545360639540106, "loss": 0.5207, "step": 804300 }, { "epoch": 108.38049043384532, "grad_norm": 0.19160686433315277, "learning_rate": 0.000254498637683763, "loss": 0.5195, "step": 804400 }, { "epoch": 108.39396389113446, "grad_norm": 0.18602964282035828, "learning_rate": 0.00025446121141351537, "loss": 0.5197, "step": 804500 }, { "epoch": 108.4074373484236, "grad_norm": 0.19305379688739777, "learning_rate": 0.00025442378514326777, "loss": 0.5205, "step": 804600 }, { "epoch": 108.42091080571275, "grad_norm": 0.19404202699661255, "learning_rate": 0.0002543863588730201, "loss": 0.5211, "step": 804700 }, { "epoch": 108.43438426300189, "grad_norm": 0.23068921267986298, "learning_rate": 0.0002543489326027725, "loss": 0.5204, "step": 804800 }, { "epoch": 108.44785772029103, "grad_norm": 0.18786321580410004, "learning_rate": 0.0002543115063325249, "loss": 0.5209, "step": 804900 }, { "epoch": 108.46133117758016, "grad_norm": 0.1922258585691452, "learning_rate": 0.0002542740800622773, "loss": 0.5201, "step": 805000 }, { "epoch": 108.4748046348693, "grad_norm": 0.1824691742658615, "learning_rate": 0.0002542366537920297, "loss": 0.5213, "step": 805100 }, { "epoch": 108.48827809215845, "grad_norm": 0.19676372408866882, "learning_rate": 0.0002541992275217821, "loss": 0.52, "step": 805200 }, { "epoch": 108.50175154944759, "grad_norm": 0.20554110407829285, "learning_rate": 0.0002541618012515345, "loss": 0.5215, "step": 805300 }, { "epoch": 108.51522500673673, "grad_norm": 0.22177281975746155, "learning_rate": 0.0002541243749812869, "loss": 0.5198, "step": 805400 }, { "epoch": 108.52869846402587, "grad_norm": 0.2199915647506714, "learning_rate": 0.00025408694871103925, "loss": 0.5195, "step": 805500 }, { "epoch": 108.542171921315, "grad_norm": 0.18649564683437347, "learning_rate": 0.00025404952244079164, "loss": 0.5204, "step": 805600 }, { "epoch": 108.55564537860415, "grad_norm": 0.18702827394008636, "learning_rate": 0.00025401209617054404, "loss": 0.5211, "step": 805700 }, { "epoch": 108.56911883589329, "grad_norm": 0.19902516901493073, "learning_rate": 0.00025397466990029644, "loss": 0.5209, "step": 805800 }, { "epoch": 108.58259229318243, "grad_norm": 0.18939878046512604, "learning_rate": 0.00025393724363004884, "loss": 0.52, "step": 805900 }, { "epoch": 108.59606575047157, "grad_norm": 0.20218393206596375, "learning_rate": 0.00025389981735980124, "loss": 0.521, "step": 806000 }, { "epoch": 108.6095392077607, "grad_norm": 0.1930580884218216, "learning_rate": 0.0002538623910895536, "loss": 0.5209, "step": 806100 }, { "epoch": 108.62301266504986, "grad_norm": 0.20073658227920532, "learning_rate": 0.0002538249648193059, "loss": 0.5208, "step": 806200 }, { "epoch": 108.636486122339, "grad_norm": 0.18001414835453033, "learning_rate": 0.0002537875385490583, "loss": 0.5201, "step": 806300 }, { "epoch": 108.64995957962813, "grad_norm": 0.18308231234550476, "learning_rate": 0.0002537501122788107, "loss": 0.5203, "step": 806400 }, { "epoch": 108.66343303691727, "grad_norm": 0.1922195851802826, "learning_rate": 0.0002537126860085631, "loss": 0.5208, "step": 806500 }, { "epoch": 108.6769064942064, "grad_norm": 0.21939638257026672, "learning_rate": 0.0002536752597383155, "loss": 0.5215, "step": 806600 }, { "epoch": 108.69037995149556, "grad_norm": 0.1864059716463089, "learning_rate": 0.0002536378334680679, "loss": 0.5204, "step": 806700 }, { "epoch": 108.7038534087847, "grad_norm": 0.19088053703308105, "learning_rate": 0.0002536004071978203, "loss": 0.5207, "step": 806800 }, { "epoch": 108.71732686607383, "grad_norm": 0.19052888453006744, "learning_rate": 0.00025356298092757266, "loss": 0.5188, "step": 806900 }, { "epoch": 108.73080032336297, "grad_norm": 0.2043793946504593, "learning_rate": 0.00025352555465732506, "loss": 0.5207, "step": 807000 }, { "epoch": 108.74427378065212, "grad_norm": 0.18892468512058258, "learning_rate": 0.00025348812838707745, "loss": 0.5203, "step": 807100 }, { "epoch": 108.75774723794126, "grad_norm": 0.1903337985277176, "learning_rate": 0.00025345070211682985, "loss": 0.5205, "step": 807200 }, { "epoch": 108.7712206952304, "grad_norm": 0.19615206122398376, "learning_rate": 0.00025341327584658225, "loss": 0.5199, "step": 807300 }, { "epoch": 108.78469415251953, "grad_norm": 0.20710544288158417, "learning_rate": 0.00025337584957633465, "loss": 0.521, "step": 807400 }, { "epoch": 108.79816760980867, "grad_norm": 0.19171227514743805, "learning_rate": 0.00025333842330608705, "loss": 0.5206, "step": 807500 }, { "epoch": 108.81164106709782, "grad_norm": 0.19986945390701294, "learning_rate": 0.00025330099703583945, "loss": 0.5211, "step": 807600 }, { "epoch": 108.82511452438696, "grad_norm": 0.19285809993743896, "learning_rate": 0.0002532635707655918, "loss": 0.5196, "step": 807700 }, { "epoch": 108.8385879816761, "grad_norm": 0.18917405605316162, "learning_rate": 0.0002532261444953442, "loss": 0.5211, "step": 807800 }, { "epoch": 108.85206143896524, "grad_norm": 0.21361802518367767, "learning_rate": 0.0002531887182250966, "loss": 0.5208, "step": 807900 }, { "epoch": 108.86553489625437, "grad_norm": 0.20034366846084595, "learning_rate": 0.00025315129195484893, "loss": 0.5213, "step": 808000 }, { "epoch": 108.87900835354353, "grad_norm": 0.22395214438438416, "learning_rate": 0.00025311386568460133, "loss": 0.5204, "step": 808100 }, { "epoch": 108.89248181083266, "grad_norm": 0.23709021508693695, "learning_rate": 0.0002530764394143537, "loss": 0.5199, "step": 808200 }, { "epoch": 108.9059552681218, "grad_norm": 0.20187082886695862, "learning_rate": 0.0002530390131441061, "loss": 0.5208, "step": 808300 }, { "epoch": 108.91942872541094, "grad_norm": 0.19651928544044495, "learning_rate": 0.00025300158687385847, "loss": 0.5196, "step": 808400 }, { "epoch": 108.93290218270008, "grad_norm": 0.2095334827899933, "learning_rate": 0.00025296416060361087, "loss": 0.5208, "step": 808500 }, { "epoch": 108.94637563998923, "grad_norm": 0.18657997250556946, "learning_rate": 0.00025292673433336327, "loss": 0.52, "step": 808600 }, { "epoch": 108.95984909727837, "grad_norm": 0.189545139670372, "learning_rate": 0.00025288930806311566, "loss": 0.5203, "step": 808700 }, { "epoch": 108.9733225545675, "grad_norm": 0.18926046788692474, "learning_rate": 0.00025285188179286806, "loss": 0.5211, "step": 808800 }, { "epoch": 108.98679601185664, "grad_norm": 0.19196662306785583, "learning_rate": 0.00025281445552262046, "loss": 0.5197, "step": 808900 }, { "epoch": 109.0, "eval_loss": 0.5091363787651062, "eval_runtime": 4.9847, "eval_samples_per_second": 1003.069, "eval_steps_per_second": 15.848, "step": 808998 }, { "epoch": 109.00026946914578, "grad_norm": 0.19467176496982574, "learning_rate": 0.00025277702925237286, "loss": 0.5208, "step": 809000 }, { "epoch": 109.01374292643493, "grad_norm": 0.21749576926231384, "learning_rate": 0.0002527396029821252, "loss": 0.5203, "step": 809100 }, { "epoch": 109.02721638372407, "grad_norm": 0.20164860785007477, "learning_rate": 0.0002527021767118776, "loss": 0.5191, "step": 809200 }, { "epoch": 109.0406898410132, "grad_norm": 0.1841711401939392, "learning_rate": 0.00025266475044163, "loss": 0.5193, "step": 809300 }, { "epoch": 109.05416329830234, "grad_norm": 0.19733203947544098, "learning_rate": 0.0002526273241713824, "loss": 0.5188, "step": 809400 }, { "epoch": 109.06763675559148, "grad_norm": 0.19283394515514374, "learning_rate": 0.0002525898979011348, "loss": 0.5191, "step": 809500 }, { "epoch": 109.08111021288063, "grad_norm": 0.18884563446044922, "learning_rate": 0.0002525524716308872, "loss": 0.5194, "step": 809600 }, { "epoch": 109.09458367016977, "grad_norm": 0.1949816197156906, "learning_rate": 0.0002525150453606396, "loss": 0.5191, "step": 809700 }, { "epoch": 109.1080571274589, "grad_norm": 0.19046840071678162, "learning_rate": 0.0002524776190903919, "loss": 0.5198, "step": 809800 }, { "epoch": 109.12153058474804, "grad_norm": 0.1824449598789215, "learning_rate": 0.0002524401928201443, "loss": 0.5207, "step": 809900 }, { "epoch": 109.13500404203718, "grad_norm": 0.185593381524086, "learning_rate": 0.0002524027665498967, "loss": 0.5205, "step": 810000 }, { "epoch": 109.14847749932633, "grad_norm": 0.19695743918418884, "learning_rate": 0.0002523653402796491, "loss": 0.5188, "step": 810100 }, { "epoch": 109.16195095661547, "grad_norm": 0.19484317302703857, "learning_rate": 0.0002523279140094015, "loss": 0.5209, "step": 810200 }, { "epoch": 109.17542441390461, "grad_norm": 0.20502357184886932, "learning_rate": 0.0002522904877391539, "loss": 0.5203, "step": 810300 }, { "epoch": 109.18889787119375, "grad_norm": 0.20539911091327667, "learning_rate": 0.00025225306146890627, "loss": 0.52, "step": 810400 }, { "epoch": 109.20237132848288, "grad_norm": 0.19438447058200836, "learning_rate": 0.00025221563519865867, "loss": 0.5211, "step": 810500 }, { "epoch": 109.21584478577203, "grad_norm": 0.2164972722530365, "learning_rate": 0.000252178208928411, "loss": 0.5204, "step": 810600 }, { "epoch": 109.22931824306117, "grad_norm": 0.19516319036483765, "learning_rate": 0.0002521407826581634, "loss": 0.5201, "step": 810700 }, { "epoch": 109.24279170035031, "grad_norm": 0.19835050404071808, "learning_rate": 0.0002521033563879158, "loss": 0.5206, "step": 810800 }, { "epoch": 109.25626515763945, "grad_norm": 0.19646456837654114, "learning_rate": 0.0002520659301176682, "loss": 0.5203, "step": 810900 }, { "epoch": 109.26973861492858, "grad_norm": 0.1868869513273239, "learning_rate": 0.0002520285038474206, "loss": 0.5203, "step": 811000 }, { "epoch": 109.28321207221774, "grad_norm": 0.18816332519054413, "learning_rate": 0.000251991077577173, "loss": 0.52, "step": 811100 }, { "epoch": 109.29668552950687, "grad_norm": 0.18592488765716553, "learning_rate": 0.0002519536513069254, "loss": 0.5193, "step": 811200 }, { "epoch": 109.31015898679601, "grad_norm": 0.19208721816539764, "learning_rate": 0.00025191622503667775, "loss": 0.5202, "step": 811300 }, { "epoch": 109.32363244408515, "grad_norm": 0.18905915319919586, "learning_rate": 0.00025187879876643015, "loss": 0.5196, "step": 811400 }, { "epoch": 109.33710590137429, "grad_norm": 0.21825799345970154, "learning_rate": 0.00025184137249618254, "loss": 0.5207, "step": 811500 }, { "epoch": 109.35057935866344, "grad_norm": 0.2062133103609085, "learning_rate": 0.0002518039462259349, "loss": 0.5199, "step": 811600 }, { "epoch": 109.36405281595258, "grad_norm": 0.2376926988363266, "learning_rate": 0.0002517665199556873, "loss": 0.5203, "step": 811700 }, { "epoch": 109.37752627324171, "grad_norm": 0.19403518736362457, "learning_rate": 0.0002517290936854397, "loss": 0.52, "step": 811800 }, { "epoch": 109.39099973053085, "grad_norm": 0.21439488232135773, "learning_rate": 0.0002516916674151921, "loss": 0.5199, "step": 811900 }, { "epoch": 109.40447318781999, "grad_norm": 0.19203749299049377, "learning_rate": 0.0002516542411449444, "loss": 0.5208, "step": 812000 }, { "epoch": 109.41794664510914, "grad_norm": 0.19482208788394928, "learning_rate": 0.0002516168148746968, "loss": 0.5198, "step": 812100 }, { "epoch": 109.43142010239828, "grad_norm": 0.1886143982410431, "learning_rate": 0.0002515793886044492, "loss": 0.5209, "step": 812200 }, { "epoch": 109.44489355968742, "grad_norm": 0.20183861255645752, "learning_rate": 0.0002515419623342016, "loss": 0.5199, "step": 812300 }, { "epoch": 109.45836701697655, "grad_norm": 0.20124228298664093, "learning_rate": 0.000251504536063954, "loss": 0.5208, "step": 812400 }, { "epoch": 109.47184047426569, "grad_norm": 0.21100212633609772, "learning_rate": 0.0002514671097937064, "loss": 0.5199, "step": 812500 }, { "epoch": 109.48531393155484, "grad_norm": 0.1863083392381668, "learning_rate": 0.0002514296835234588, "loss": 0.5203, "step": 812600 }, { "epoch": 109.49878738884398, "grad_norm": 0.1901504397392273, "learning_rate": 0.0002513922572532112, "loss": 0.5201, "step": 812700 }, { "epoch": 109.51226084613312, "grad_norm": 0.20257456600666046, "learning_rate": 0.00025135483098296356, "loss": 0.5203, "step": 812800 }, { "epoch": 109.52573430342225, "grad_norm": 0.20628653466701508, "learning_rate": 0.00025131740471271596, "loss": 0.5201, "step": 812900 }, { "epoch": 109.53920776071139, "grad_norm": 0.20471349358558655, "learning_rate": 0.00025127997844246835, "loss": 0.5211, "step": 813000 }, { "epoch": 109.55268121800054, "grad_norm": 0.18132634460926056, "learning_rate": 0.00025124255217222075, "loss": 0.5206, "step": 813100 }, { "epoch": 109.56615467528968, "grad_norm": 0.1981988549232483, "learning_rate": 0.00025120512590197315, "loss": 0.5213, "step": 813200 }, { "epoch": 109.57962813257882, "grad_norm": 0.22428059577941895, "learning_rate": 0.00025116769963172555, "loss": 0.5212, "step": 813300 }, { "epoch": 109.59310158986796, "grad_norm": 0.20285309851169586, "learning_rate": 0.0002511302733614779, "loss": 0.5204, "step": 813400 }, { "epoch": 109.6065750471571, "grad_norm": 0.18810027837753296, "learning_rate": 0.00025109284709123024, "loss": 0.5205, "step": 813500 }, { "epoch": 109.62004850444625, "grad_norm": 0.19792205095291138, "learning_rate": 0.00025105542082098264, "loss": 0.5195, "step": 813600 }, { "epoch": 109.63352196173538, "grad_norm": 0.23709997534751892, "learning_rate": 0.00025101799455073503, "loss": 0.5199, "step": 813700 }, { "epoch": 109.64699541902452, "grad_norm": 0.22395311295986176, "learning_rate": 0.00025098056828048743, "loss": 0.5211, "step": 813800 }, { "epoch": 109.66046887631366, "grad_norm": 0.19379308819770813, "learning_rate": 0.00025094314201023983, "loss": 0.5201, "step": 813900 }, { "epoch": 109.6739423336028, "grad_norm": 0.19178354740142822, "learning_rate": 0.00025090571573999223, "loss": 0.5199, "step": 814000 }, { "epoch": 109.68741579089195, "grad_norm": 0.2037651538848877, "learning_rate": 0.0002508682894697446, "loss": 0.5194, "step": 814100 }, { "epoch": 109.70088924818108, "grad_norm": 0.18493366241455078, "learning_rate": 0.00025083086319949697, "loss": 0.5199, "step": 814200 }, { "epoch": 109.71436270547022, "grad_norm": 0.19619178771972656, "learning_rate": 0.00025079343692924937, "loss": 0.5198, "step": 814300 }, { "epoch": 109.72783616275936, "grad_norm": 0.19608475267887115, "learning_rate": 0.00025075601065900177, "loss": 0.519, "step": 814400 }, { "epoch": 109.74130962004851, "grad_norm": 0.19618189334869385, "learning_rate": 0.00025071858438875417, "loss": 0.5216, "step": 814500 }, { "epoch": 109.75478307733765, "grad_norm": 0.1931634396314621, "learning_rate": 0.00025068115811850656, "loss": 0.5205, "step": 814600 }, { "epoch": 109.76825653462679, "grad_norm": 0.19948728382587433, "learning_rate": 0.00025064373184825896, "loss": 0.5194, "step": 814700 }, { "epoch": 109.78172999191592, "grad_norm": 0.19077368080615997, "learning_rate": 0.00025060630557801136, "loss": 0.5197, "step": 814800 }, { "epoch": 109.79520344920506, "grad_norm": 0.19092519581317902, "learning_rate": 0.0002505688793077637, "loss": 0.52, "step": 814900 }, { "epoch": 109.80867690649421, "grad_norm": 0.21395891904830933, "learning_rate": 0.0002505314530375161, "loss": 0.5189, "step": 815000 }, { "epoch": 109.82215036378335, "grad_norm": 0.19054509699344635, "learning_rate": 0.0002504940267672685, "loss": 0.5207, "step": 815100 }, { "epoch": 109.83562382107249, "grad_norm": 0.18392811715602875, "learning_rate": 0.00025045660049702084, "loss": 0.5211, "step": 815200 }, { "epoch": 109.84909727836163, "grad_norm": 0.199106827378273, "learning_rate": 0.00025041917422677324, "loss": 0.52, "step": 815300 }, { "epoch": 109.86257073565076, "grad_norm": 0.19975467026233673, "learning_rate": 0.00025038174795652564, "loss": 0.5195, "step": 815400 }, { "epoch": 109.87604419293991, "grad_norm": 0.19286876916885376, "learning_rate": 0.00025034432168627804, "loss": 0.5203, "step": 815500 }, { "epoch": 109.88951765022905, "grad_norm": 0.18478140234947205, "learning_rate": 0.00025030689541603044, "loss": 0.5205, "step": 815600 }, { "epoch": 109.90299110751819, "grad_norm": 0.18406930565834045, "learning_rate": 0.0002502694691457828, "loss": 0.5204, "step": 815700 }, { "epoch": 109.91646456480733, "grad_norm": 0.21680176258087158, "learning_rate": 0.0002502320428755352, "loss": 0.5202, "step": 815800 }, { "epoch": 109.92993802209647, "grad_norm": 0.18952837586402893, "learning_rate": 0.0002501946166052876, "loss": 0.5209, "step": 815900 }, { "epoch": 109.94341147938562, "grad_norm": 0.19354364275932312, "learning_rate": 0.00025015719033504, "loss": 0.5202, "step": 816000 }, { "epoch": 109.95688493667475, "grad_norm": 0.18560223281383514, "learning_rate": 0.0002501197640647924, "loss": 0.5206, "step": 816100 }, { "epoch": 109.97035839396389, "grad_norm": 0.19167958199977875, "learning_rate": 0.00025008233779454477, "loss": 0.5194, "step": 816200 }, { "epoch": 109.98383185125303, "grad_norm": 0.20219959318637848, "learning_rate": 0.00025004491152429717, "loss": 0.5196, "step": 816300 }, { "epoch": 109.99730530854217, "grad_norm": 0.20736342668533325, "learning_rate": 0.0002500074852540495, "loss": 0.5208, "step": 816400 }, { "epoch": 110.0, "eval_loss": 0.5098603367805481, "eval_runtime": 4.9937, "eval_samples_per_second": 1001.26, "eval_steps_per_second": 15.82, "step": 816420 }, { "epoch": 110.01077876583132, "grad_norm": 0.23136399686336517, "learning_rate": 0.0002499700589838019, "loss": 0.5194, "step": 816500 }, { "epoch": 110.02425222312046, "grad_norm": 0.19284717738628387, "learning_rate": 0.0002499326327135543, "loss": 0.5199, "step": 816600 }, { "epoch": 110.0377256804096, "grad_norm": 0.19346529245376587, "learning_rate": 0.0002498952064433067, "loss": 0.5189, "step": 816700 }, { "epoch": 110.05119913769873, "grad_norm": 0.19926199316978455, "learning_rate": 0.00024985778017305905, "loss": 0.5178, "step": 816800 }, { "epoch": 110.06467259498787, "grad_norm": 0.1913483738899231, "learning_rate": 0.00024982035390281145, "loss": 0.5201, "step": 816900 }, { "epoch": 110.07814605227702, "grad_norm": 0.2091105431318283, "learning_rate": 0.00024978292763256385, "loss": 0.5205, "step": 817000 }, { "epoch": 110.09161950956616, "grad_norm": 0.18336336314678192, "learning_rate": 0.00024974550136231625, "loss": 0.5195, "step": 817100 }, { "epoch": 110.1050929668553, "grad_norm": 0.18801099061965942, "learning_rate": 0.00024970807509206865, "loss": 0.5198, "step": 817200 }, { "epoch": 110.11856642414443, "grad_norm": 0.20768973231315613, "learning_rate": 0.00024967064882182104, "loss": 0.52, "step": 817300 }, { "epoch": 110.13203988143357, "grad_norm": 0.19022664427757263, "learning_rate": 0.00024963322255157344, "loss": 0.5196, "step": 817400 }, { "epoch": 110.14551333872272, "grad_norm": 0.19782419502735138, "learning_rate": 0.0002495957962813258, "loss": 0.5209, "step": 817500 }, { "epoch": 110.15898679601186, "grad_norm": 0.1964697390794754, "learning_rate": 0.0002495583700110782, "loss": 0.5201, "step": 817600 }, { "epoch": 110.172460253301, "grad_norm": 0.1954570859670639, "learning_rate": 0.0002495209437408306, "loss": 0.5201, "step": 817700 }, { "epoch": 110.18593371059013, "grad_norm": 0.20626261830329895, "learning_rate": 0.000249483517470583, "loss": 0.5202, "step": 817800 }, { "epoch": 110.19940716787927, "grad_norm": 0.22597269713878632, "learning_rate": 0.0002494460912003353, "loss": 0.5193, "step": 817900 }, { "epoch": 110.21288062516842, "grad_norm": 0.21615348756313324, "learning_rate": 0.0002494086649300877, "loss": 0.5199, "step": 818000 }, { "epoch": 110.22635408245756, "grad_norm": 0.2054414004087448, "learning_rate": 0.0002493712386598401, "loss": 0.5198, "step": 818100 }, { "epoch": 110.2398275397467, "grad_norm": 0.22306004166603088, "learning_rate": 0.0002493338123895925, "loss": 0.5196, "step": 818200 }, { "epoch": 110.25330099703584, "grad_norm": 0.1819489300251007, "learning_rate": 0.0002492963861193449, "loss": 0.5196, "step": 818300 }, { "epoch": 110.26677445432497, "grad_norm": 0.20111480355262756, "learning_rate": 0.00024925895984909726, "loss": 0.5197, "step": 818400 }, { "epoch": 110.28024791161413, "grad_norm": 0.18684503436088562, "learning_rate": 0.00024922153357884966, "loss": 0.519, "step": 818500 }, { "epoch": 110.29372136890326, "grad_norm": 0.20253954827785492, "learning_rate": 0.00024918410730860206, "loss": 0.5205, "step": 818600 }, { "epoch": 110.3071948261924, "grad_norm": 0.20365369319915771, "learning_rate": 0.00024914668103835446, "loss": 0.5193, "step": 818700 }, { "epoch": 110.32066828348154, "grad_norm": 0.18746459484100342, "learning_rate": 0.00024910925476810686, "loss": 0.5204, "step": 818800 }, { "epoch": 110.33414174077068, "grad_norm": 0.20577573776245117, "learning_rate": 0.0002490718284978592, "loss": 0.5195, "step": 818900 }, { "epoch": 110.34761519805983, "grad_norm": 0.21558134257793427, "learning_rate": 0.0002490344022276116, "loss": 0.52, "step": 819000 }, { "epoch": 110.36108865534896, "grad_norm": 0.18918585777282715, "learning_rate": 0.000248996975957364, "loss": 0.5189, "step": 819100 }, { "epoch": 110.3745621126381, "grad_norm": 0.196012943983078, "learning_rate": 0.0002489595496871164, "loss": 0.5202, "step": 819200 }, { "epoch": 110.38803556992724, "grad_norm": 0.19441670179367065, "learning_rate": 0.00024892212341686874, "loss": 0.5206, "step": 819300 }, { "epoch": 110.40150902721638, "grad_norm": 0.18935708701610565, "learning_rate": 0.00024888469714662114, "loss": 0.5199, "step": 819400 }, { "epoch": 110.41498248450553, "grad_norm": 0.18920914828777313, "learning_rate": 0.00024884727087637353, "loss": 0.5195, "step": 819500 }, { "epoch": 110.42845594179467, "grad_norm": 0.21265853941440582, "learning_rate": 0.00024880984460612593, "loss": 0.5198, "step": 819600 }, { "epoch": 110.4419293990838, "grad_norm": 0.21876417100429535, "learning_rate": 0.00024877241833587833, "loss": 0.5199, "step": 819700 }, { "epoch": 110.45540285637294, "grad_norm": 0.20936235785484314, "learning_rate": 0.00024873499206563073, "loss": 0.5187, "step": 819800 }, { "epoch": 110.46887631366208, "grad_norm": 0.20754186809062958, "learning_rate": 0.00024869756579538313, "loss": 0.5215, "step": 819900 }, { "epoch": 110.48234977095123, "grad_norm": 0.20511527359485626, "learning_rate": 0.00024866013952513547, "loss": 0.5186, "step": 820000 }, { "epoch": 110.49582322824037, "grad_norm": 0.20159755647182465, "learning_rate": 0.00024862271325488787, "loss": 0.5204, "step": 820100 }, { "epoch": 110.5092966855295, "grad_norm": 0.1865263283252716, "learning_rate": 0.00024858528698464027, "loss": 0.5205, "step": 820200 }, { "epoch": 110.52277014281864, "grad_norm": 0.19038677215576172, "learning_rate": 0.00024854786071439267, "loss": 0.5197, "step": 820300 }, { "epoch": 110.53624360010778, "grad_norm": 0.191759392619133, "learning_rate": 0.000248510434444145, "loss": 0.5207, "step": 820400 }, { "epoch": 110.54971705739693, "grad_norm": 0.21637409925460815, "learning_rate": 0.0002484730081738974, "loss": 0.5195, "step": 820500 }, { "epoch": 110.56319051468607, "grad_norm": 0.18832893669605255, "learning_rate": 0.0002484355819036498, "loss": 0.5218, "step": 820600 }, { "epoch": 110.57666397197521, "grad_norm": 0.21187447011470795, "learning_rate": 0.0002483981556334022, "loss": 0.5205, "step": 820700 }, { "epoch": 110.59013742926435, "grad_norm": 0.19637174904346466, "learning_rate": 0.0002483607293631546, "loss": 0.5201, "step": 820800 }, { "epoch": 110.60361088655348, "grad_norm": 0.21037067472934723, "learning_rate": 0.000248323303092907, "loss": 0.5206, "step": 820900 }, { "epoch": 110.61708434384263, "grad_norm": 0.22109080851078033, "learning_rate": 0.0002482858768226594, "loss": 0.52, "step": 821000 }, { "epoch": 110.63055780113177, "grad_norm": 0.20223502814769745, "learning_rate": 0.00024824845055241174, "loss": 0.5204, "step": 821100 }, { "epoch": 110.64403125842091, "grad_norm": 0.2126893699169159, "learning_rate": 0.00024821102428216414, "loss": 0.5202, "step": 821200 }, { "epoch": 110.65750471571005, "grad_norm": 0.20110350847244263, "learning_rate": 0.00024817359801191654, "loss": 0.5203, "step": 821300 }, { "epoch": 110.67097817299918, "grad_norm": 0.18727003037929535, "learning_rate": 0.00024813617174166894, "loss": 0.5199, "step": 821400 }, { "epoch": 110.68445163028834, "grad_norm": 0.18027883768081665, "learning_rate": 0.0002480987454714213, "loss": 0.52, "step": 821500 }, { "epoch": 110.69792508757747, "grad_norm": 0.19443857669830322, "learning_rate": 0.0002480613192011737, "loss": 0.5202, "step": 821600 }, { "epoch": 110.71139854486661, "grad_norm": 0.19446943700313568, "learning_rate": 0.0002480238929309261, "loss": 0.5206, "step": 821700 }, { "epoch": 110.72487200215575, "grad_norm": 0.18427106738090515, "learning_rate": 0.0002479864666606785, "loss": 0.5203, "step": 821800 }, { "epoch": 110.7383454594449, "grad_norm": 0.17786012589931488, "learning_rate": 0.0002479490403904309, "loss": 0.52, "step": 821900 }, { "epoch": 110.75181891673404, "grad_norm": 0.2108645886182785, "learning_rate": 0.0002479116141201832, "loss": 0.5201, "step": 822000 }, { "epoch": 110.76529237402318, "grad_norm": 0.1883762627840042, "learning_rate": 0.0002478741878499356, "loss": 0.5205, "step": 822100 }, { "epoch": 110.77876583131231, "grad_norm": 0.211094468832016, "learning_rate": 0.000247836761579688, "loss": 0.5196, "step": 822200 }, { "epoch": 110.79223928860145, "grad_norm": 0.20387008786201477, "learning_rate": 0.0002477993353094404, "loss": 0.5184, "step": 822300 }, { "epoch": 110.8057127458906, "grad_norm": 0.19305327534675598, "learning_rate": 0.0002477619090391928, "loss": 0.5201, "step": 822400 }, { "epoch": 110.81918620317974, "grad_norm": 0.18766775727272034, "learning_rate": 0.0002477244827689452, "loss": 0.5207, "step": 822500 }, { "epoch": 110.83265966046888, "grad_norm": 0.18799513578414917, "learning_rate": 0.00024768705649869755, "loss": 0.5202, "step": 822600 }, { "epoch": 110.84613311775801, "grad_norm": 0.1835545301437378, "learning_rate": 0.00024764963022844995, "loss": 0.521, "step": 822700 }, { "epoch": 110.85960657504715, "grad_norm": 0.22203189134597778, "learning_rate": 0.00024761220395820235, "loss": 0.5193, "step": 822800 }, { "epoch": 110.8730800323363, "grad_norm": 0.20705938339233398, "learning_rate": 0.0002475747776879547, "loss": 0.52, "step": 822900 }, { "epoch": 110.88655348962544, "grad_norm": 0.23717479407787323, "learning_rate": 0.0002475373514177071, "loss": 0.5193, "step": 823000 }, { "epoch": 110.90002694691458, "grad_norm": 0.1927008181810379, "learning_rate": 0.0002474999251474595, "loss": 0.5213, "step": 823100 }, { "epoch": 110.91350040420372, "grad_norm": 0.18919432163238525, "learning_rate": 0.0002474624988772119, "loss": 0.5202, "step": 823200 }, { "epoch": 110.92697386149285, "grad_norm": 0.18036115169525146, "learning_rate": 0.0002474250726069643, "loss": 0.5207, "step": 823300 }, { "epoch": 110.940447318782, "grad_norm": 0.20550191402435303, "learning_rate": 0.0002473876463367167, "loss": 0.5208, "step": 823400 }, { "epoch": 110.95392077607114, "grad_norm": 0.19665655493736267, "learning_rate": 0.0002473502200664691, "loss": 0.5211, "step": 823500 }, { "epoch": 110.96739423336028, "grad_norm": 0.18342483043670654, "learning_rate": 0.0002473127937962215, "loss": 0.521, "step": 823600 }, { "epoch": 110.98086769064942, "grad_norm": 0.20508328080177307, "learning_rate": 0.0002472753675259738, "loss": 0.5196, "step": 823700 }, { "epoch": 110.99434114793856, "grad_norm": 0.19059516489505768, "learning_rate": 0.0002472379412557262, "loss": 0.5197, "step": 823800 }, { "epoch": 111.0, "eval_loss": 0.5090236663818359, "eval_runtime": 4.9945, "eval_samples_per_second": 1001.106, "eval_steps_per_second": 15.817, "step": 823842 }, { "epoch": 111.00781460522771, "grad_norm": 0.1844802349805832, "learning_rate": 0.0002472005149854786, "loss": 0.5201, "step": 823900 }, { "epoch": 111.02128806251685, "grad_norm": 0.2044488936662674, "learning_rate": 0.00024716308871523097, "loss": 0.5184, "step": 824000 }, { "epoch": 111.03476151980598, "grad_norm": 0.2031673938035965, "learning_rate": 0.00024712566244498337, "loss": 0.5194, "step": 824100 }, { "epoch": 111.04823497709512, "grad_norm": 0.19852474331855774, "learning_rate": 0.00024708823617473576, "loss": 0.5192, "step": 824200 }, { "epoch": 111.06170843438426, "grad_norm": 0.20172137022018433, "learning_rate": 0.00024705080990448816, "loss": 0.5193, "step": 824300 }, { "epoch": 111.07518189167341, "grad_norm": 0.23259209096431732, "learning_rate": 0.00024701338363424056, "loss": 0.52, "step": 824400 }, { "epoch": 111.08865534896255, "grad_norm": 0.19906646013259888, "learning_rate": 0.00024697595736399296, "loss": 0.5197, "step": 824500 }, { "epoch": 111.10212880625168, "grad_norm": 0.19978976249694824, "learning_rate": 0.00024693853109374536, "loss": 0.5195, "step": 824600 }, { "epoch": 111.11560226354082, "grad_norm": 0.18608041107654572, "learning_rate": 0.0002469011048234977, "loss": 0.5193, "step": 824700 }, { "epoch": 111.12907572082996, "grad_norm": 0.19478021562099457, "learning_rate": 0.0002468636785532501, "loss": 0.5199, "step": 824800 }, { "epoch": 111.14254917811911, "grad_norm": 0.21959766745567322, "learning_rate": 0.0002468262522830025, "loss": 0.5198, "step": 824900 }, { "epoch": 111.15602263540825, "grad_norm": 0.2575378119945526, "learning_rate": 0.0002467888260127549, "loss": 0.5198, "step": 825000 }, { "epoch": 111.16949609269739, "grad_norm": 0.18918853998184204, "learning_rate": 0.00024675139974250724, "loss": 0.5197, "step": 825100 }, { "epoch": 111.18296954998652, "grad_norm": 0.1867135763168335, "learning_rate": 0.00024671397347225964, "loss": 0.5195, "step": 825200 }, { "epoch": 111.19644300727566, "grad_norm": 0.18281888961791992, "learning_rate": 0.00024667654720201204, "loss": 0.5199, "step": 825300 }, { "epoch": 111.20991646456481, "grad_norm": 0.18514187633991241, "learning_rate": 0.00024663912093176443, "loss": 0.5199, "step": 825400 }, { "epoch": 111.22338992185395, "grad_norm": 0.23621436953544617, "learning_rate": 0.00024660169466151683, "loss": 0.5207, "step": 825500 }, { "epoch": 111.23686337914309, "grad_norm": 0.2273499220609665, "learning_rate": 0.0002465642683912692, "loss": 0.5199, "step": 825600 }, { "epoch": 111.25033683643223, "grad_norm": 0.2261454313993454, "learning_rate": 0.0002465268421210216, "loss": 0.5197, "step": 825700 }, { "epoch": 111.26381029372136, "grad_norm": 0.18144933879375458, "learning_rate": 0.00024648941585077397, "loss": 0.5194, "step": 825800 }, { "epoch": 111.27728375101051, "grad_norm": 0.19025234878063202, "learning_rate": 0.00024645198958052637, "loss": 0.52, "step": 825900 }, { "epoch": 111.29075720829965, "grad_norm": 0.19056923687458038, "learning_rate": 0.00024641456331027877, "loss": 0.5189, "step": 826000 }, { "epoch": 111.30423066558879, "grad_norm": 0.19173197448253632, "learning_rate": 0.00024637713704003117, "loss": 0.5202, "step": 826100 }, { "epoch": 111.31770412287793, "grad_norm": 0.20920312404632568, "learning_rate": 0.0002463397107697835, "loss": 0.5195, "step": 826200 }, { "epoch": 111.33117758016706, "grad_norm": 0.21464134752750397, "learning_rate": 0.0002463022844995359, "loss": 0.5202, "step": 826300 }, { "epoch": 111.34465103745622, "grad_norm": 0.2103624939918518, "learning_rate": 0.0002462648582292883, "loss": 0.5212, "step": 826400 }, { "epoch": 111.35812449474535, "grad_norm": 0.20444197952747345, "learning_rate": 0.0002462274319590407, "loss": 0.5193, "step": 826500 }, { "epoch": 111.37159795203449, "grad_norm": 0.18951478600502014, "learning_rate": 0.00024619000568879305, "loss": 0.5192, "step": 826600 }, { "epoch": 111.38507140932363, "grad_norm": 0.19754959642887115, "learning_rate": 0.00024615257941854545, "loss": 0.5201, "step": 826700 }, { "epoch": 111.39854486661277, "grad_norm": 0.19273391366004944, "learning_rate": 0.00024611515314829785, "loss": 0.52, "step": 826800 }, { "epoch": 111.41201832390192, "grad_norm": 0.19261085987091064, "learning_rate": 0.00024607772687805025, "loss": 0.5201, "step": 826900 }, { "epoch": 111.42549178119106, "grad_norm": 0.18818797171115875, "learning_rate": 0.00024604030060780264, "loss": 0.5195, "step": 827000 }, { "epoch": 111.4389652384802, "grad_norm": 0.1887395828962326, "learning_rate": 0.00024600287433755504, "loss": 0.5209, "step": 827100 }, { "epoch": 111.45243869576933, "grad_norm": 0.1870594173669815, "learning_rate": 0.00024596544806730744, "loss": 0.5198, "step": 827200 }, { "epoch": 111.46591215305847, "grad_norm": 0.1780320554971695, "learning_rate": 0.0002459280217970598, "loss": 0.5203, "step": 827300 }, { "epoch": 111.47938561034762, "grad_norm": 0.20199674367904663, "learning_rate": 0.0002458905955268122, "loss": 0.5194, "step": 827400 }, { "epoch": 111.49285906763676, "grad_norm": 0.2048376202583313, "learning_rate": 0.0002458531692565646, "loss": 0.5197, "step": 827500 }, { "epoch": 111.5063325249259, "grad_norm": 0.20534583926200867, "learning_rate": 0.000245815742986317, "loss": 0.5203, "step": 827600 }, { "epoch": 111.51980598221503, "grad_norm": 0.2005864530801773, "learning_rate": 0.0002457783167160693, "loss": 0.5197, "step": 827700 }, { "epoch": 111.53327943950417, "grad_norm": 0.23216459155082703, "learning_rate": 0.0002457408904458217, "loss": 0.5203, "step": 827800 }, { "epoch": 111.54675289679332, "grad_norm": 0.18927843868732452, "learning_rate": 0.0002457034641755741, "loss": 0.5205, "step": 827900 }, { "epoch": 111.56022635408246, "grad_norm": 0.21112696826457977, "learning_rate": 0.0002456660379053265, "loss": 0.5203, "step": 828000 }, { "epoch": 111.5736998113716, "grad_norm": 0.20949241518974304, "learning_rate": 0.0002456286116350789, "loss": 0.5201, "step": 828100 }, { "epoch": 111.58717326866073, "grad_norm": 0.1956051141023636, "learning_rate": 0.0002455911853648313, "loss": 0.5202, "step": 828200 }, { "epoch": 111.60064672594987, "grad_norm": 0.19976630806922913, "learning_rate": 0.00024555375909458366, "loss": 0.5204, "step": 828300 }, { "epoch": 111.61412018323902, "grad_norm": 0.2123948186635971, "learning_rate": 0.00024551633282433606, "loss": 0.5194, "step": 828400 }, { "epoch": 111.62759364052816, "grad_norm": 0.2255697250366211, "learning_rate": 0.00024547890655408845, "loss": 0.5208, "step": 828500 }, { "epoch": 111.6410670978173, "grad_norm": 0.18350036442279816, "learning_rate": 0.00024544148028384085, "loss": 0.5202, "step": 828600 }, { "epoch": 111.65454055510644, "grad_norm": 0.2052236944437027, "learning_rate": 0.00024540405401359325, "loss": 0.5201, "step": 828700 }, { "epoch": 111.66801401239557, "grad_norm": 0.1925097405910492, "learning_rate": 0.0002453666277433456, "loss": 0.5211, "step": 828800 }, { "epoch": 111.68148746968473, "grad_norm": 0.2124231457710266, "learning_rate": 0.000245329201473098, "loss": 0.5194, "step": 828900 }, { "epoch": 111.69496092697386, "grad_norm": 0.1880052387714386, "learning_rate": 0.0002452917752028504, "loss": 0.5193, "step": 829000 }, { "epoch": 111.708434384263, "grad_norm": 0.19350405037403107, "learning_rate": 0.0002452543489326028, "loss": 0.5196, "step": 829100 }, { "epoch": 111.72190784155214, "grad_norm": 0.20446030795574188, "learning_rate": 0.00024521692266235513, "loss": 0.5187, "step": 829200 }, { "epoch": 111.73538129884128, "grad_norm": 0.19047404825687408, "learning_rate": 0.00024517949639210753, "loss": 0.5202, "step": 829300 }, { "epoch": 111.74885475613043, "grad_norm": 0.18195895850658417, "learning_rate": 0.00024514207012185993, "loss": 0.5202, "step": 829400 }, { "epoch": 111.76232821341956, "grad_norm": 0.23306667804718018, "learning_rate": 0.00024510464385161233, "loss": 0.5198, "step": 829500 }, { "epoch": 111.7758016707087, "grad_norm": 0.1877521425485611, "learning_rate": 0.0002450672175813647, "loss": 0.5194, "step": 829600 }, { "epoch": 111.78927512799784, "grad_norm": 0.19151648879051208, "learning_rate": 0.0002450297913111171, "loss": 0.5196, "step": 829700 }, { "epoch": 111.80274858528699, "grad_norm": 0.19029901921749115, "learning_rate": 0.0002449923650408695, "loss": 0.5191, "step": 829800 }, { "epoch": 111.81622204257613, "grad_norm": 0.19361014664173126, "learning_rate": 0.00024495493877062187, "loss": 0.5209, "step": 829900 }, { "epoch": 111.82969549986527, "grad_norm": 0.21912072598934174, "learning_rate": 0.00024491751250037427, "loss": 0.5204, "step": 830000 }, { "epoch": 111.8431689571544, "grad_norm": 0.22969774901866913, "learning_rate": 0.00024488008623012666, "loss": 0.5193, "step": 830100 }, { "epoch": 111.85664241444354, "grad_norm": 0.19225451350212097, "learning_rate": 0.000244842659959879, "loss": 0.5188, "step": 830200 }, { "epoch": 111.8701158717327, "grad_norm": 0.20907530188560486, "learning_rate": 0.0002448052336896314, "loss": 0.5202, "step": 830300 }, { "epoch": 111.88358932902183, "grad_norm": 0.1922881156206131, "learning_rate": 0.0002447678074193838, "loss": 0.5194, "step": 830400 }, { "epoch": 111.89706278631097, "grad_norm": 0.18467219173908234, "learning_rate": 0.0002447303811491362, "loss": 0.5195, "step": 830500 }, { "epoch": 111.9105362436001, "grad_norm": 0.1929246038198471, "learning_rate": 0.0002446929548788886, "loss": 0.5197, "step": 830600 }, { "epoch": 111.92400970088924, "grad_norm": 0.19733673334121704, "learning_rate": 0.000244655528608641, "loss": 0.519, "step": 830700 }, { "epoch": 111.9374831581784, "grad_norm": 0.1902424395084381, "learning_rate": 0.0002446181023383934, "loss": 0.5208, "step": 830800 }, { "epoch": 111.95095661546753, "grad_norm": 0.1863129436969757, "learning_rate": 0.0002445806760681458, "loss": 0.5199, "step": 830900 }, { "epoch": 111.96443007275667, "grad_norm": 0.2079223245382309, "learning_rate": 0.00024454324979789814, "loss": 0.5202, "step": 831000 }, { "epoch": 111.97790353004581, "grad_norm": 0.18655626475811005, "learning_rate": 0.00024450582352765054, "loss": 0.5205, "step": 831100 }, { "epoch": 111.99137698733495, "grad_norm": 0.19315369427204132, "learning_rate": 0.00024446839725740294, "loss": 0.5208, "step": 831200 }, { "epoch": 112.0, "eval_loss": 0.5089700818061829, "eval_runtime": 4.962, "eval_samples_per_second": 1007.659, "eval_steps_per_second": 15.921, "step": 831264 }, { "epoch": 112.0048504446241, "grad_norm": 0.2026798278093338, "learning_rate": 0.0002444309709871553, "loss": 0.5198, "step": 831300 }, { "epoch": 112.01832390191323, "grad_norm": 0.1954711675643921, "learning_rate": 0.0002443935447169077, "loss": 0.5196, "step": 831400 }, { "epoch": 112.03179735920237, "grad_norm": 0.19997602701187134, "learning_rate": 0.0002443561184466601, "loss": 0.5207, "step": 831500 }, { "epoch": 112.04527081649151, "grad_norm": 0.19357948005199432, "learning_rate": 0.0002443186921764125, "loss": 0.5194, "step": 831600 }, { "epoch": 112.05874427378065, "grad_norm": 0.202244833111763, "learning_rate": 0.00024428126590616487, "loss": 0.5197, "step": 831700 }, { "epoch": 112.0722177310698, "grad_norm": 0.19543196260929108, "learning_rate": 0.00024424383963591727, "loss": 0.5195, "step": 831800 }, { "epoch": 112.08569118835894, "grad_norm": 0.19916044175624847, "learning_rate": 0.00024420641336566967, "loss": 0.5198, "step": 831900 }, { "epoch": 112.09916464564807, "grad_norm": 0.1920393705368042, "learning_rate": 0.000244168987095422, "loss": 0.5185, "step": 832000 }, { "epoch": 112.11263810293721, "grad_norm": 0.20102255046367645, "learning_rate": 0.0002441315608251744, "loss": 0.5197, "step": 832100 }, { "epoch": 112.12611156022635, "grad_norm": 0.18993563950061798, "learning_rate": 0.00024409413455492678, "loss": 0.5203, "step": 832200 }, { "epoch": 112.1395850175155, "grad_norm": 0.20579706132411957, "learning_rate": 0.00024405670828467918, "loss": 0.5183, "step": 832300 }, { "epoch": 112.15305847480464, "grad_norm": 0.20664657652378082, "learning_rate": 0.00024401928201443158, "loss": 0.5192, "step": 832400 }, { "epoch": 112.16653193209378, "grad_norm": 0.1831059455871582, "learning_rate": 0.00024398185574418398, "loss": 0.5188, "step": 832500 }, { "epoch": 112.18000538938291, "grad_norm": 0.2024809867143631, "learning_rate": 0.00024394442947393635, "loss": 0.5204, "step": 832600 }, { "epoch": 112.19347884667205, "grad_norm": 0.20346593856811523, "learning_rate": 0.00024390700320368875, "loss": 0.5201, "step": 832700 }, { "epoch": 112.2069523039612, "grad_norm": 0.18972986936569214, "learning_rate": 0.00024386957693344114, "loss": 0.519, "step": 832800 }, { "epoch": 112.22042576125034, "grad_norm": 0.19553148746490479, "learning_rate": 0.00024383215066319352, "loss": 0.5192, "step": 832900 }, { "epoch": 112.23389921853948, "grad_norm": 0.19045090675354004, "learning_rate": 0.0002437947243929459, "loss": 0.5194, "step": 833000 }, { "epoch": 112.24737267582861, "grad_norm": 0.21275341510772705, "learning_rate": 0.00024375729812269828, "loss": 0.5186, "step": 833100 }, { "epoch": 112.26084613311775, "grad_norm": 0.2007904350757599, "learning_rate": 0.00024371987185245068, "loss": 0.5191, "step": 833200 }, { "epoch": 112.2743195904069, "grad_norm": 0.19586201012134552, "learning_rate": 0.00024368244558220305, "loss": 0.5194, "step": 833300 }, { "epoch": 112.28779304769604, "grad_norm": 0.19082295894622803, "learning_rate": 0.00024364501931195545, "loss": 0.5201, "step": 833400 }, { "epoch": 112.30126650498518, "grad_norm": 0.19674809277057648, "learning_rate": 0.00024360759304170785, "loss": 0.5185, "step": 833500 }, { "epoch": 112.31473996227432, "grad_norm": 0.20549966394901276, "learning_rate": 0.00024357016677146025, "loss": 0.5199, "step": 833600 }, { "epoch": 112.32821341956345, "grad_norm": 0.19899539649486542, "learning_rate": 0.00024353274050121262, "loss": 0.5195, "step": 833700 }, { "epoch": 112.3416868768526, "grad_norm": 0.18698181211948395, "learning_rate": 0.000243495314230965, "loss": 0.5199, "step": 833800 }, { "epoch": 112.35516033414174, "grad_norm": 0.19805455207824707, "learning_rate": 0.0002434578879607174, "loss": 0.5198, "step": 833900 }, { "epoch": 112.36863379143088, "grad_norm": 0.19860272109508514, "learning_rate": 0.0002434204616904698, "loss": 0.5202, "step": 834000 }, { "epoch": 112.38210724872002, "grad_norm": 0.19312173128128052, "learning_rate": 0.00024338303542022216, "loss": 0.5189, "step": 834100 }, { "epoch": 112.39558070600916, "grad_norm": 0.20862038433551788, "learning_rate": 0.00024334560914997456, "loss": 0.5196, "step": 834200 }, { "epoch": 112.40905416329831, "grad_norm": 0.19325079023838043, "learning_rate": 0.00024330818287972696, "loss": 0.5194, "step": 834300 }, { "epoch": 112.42252762058745, "grad_norm": 0.1891675591468811, "learning_rate": 0.00024327075660947933, "loss": 0.5207, "step": 834400 }, { "epoch": 112.43600107787658, "grad_norm": 0.19617486000061035, "learning_rate": 0.00024323333033923172, "loss": 0.5194, "step": 834500 }, { "epoch": 112.44947453516572, "grad_norm": 0.20370179414749146, "learning_rate": 0.00024319590406898412, "loss": 0.5199, "step": 834600 }, { "epoch": 112.46294799245486, "grad_norm": 0.2265009731054306, "learning_rate": 0.0002431584777987365, "loss": 0.5194, "step": 834700 }, { "epoch": 112.47642144974401, "grad_norm": 0.20110471546649933, "learning_rate": 0.00024312105152848887, "loss": 0.5196, "step": 834800 }, { "epoch": 112.48989490703315, "grad_norm": 0.20358878374099731, "learning_rate": 0.00024308362525824126, "loss": 0.5196, "step": 834900 }, { "epoch": 112.50336836432228, "grad_norm": 0.1884831041097641, "learning_rate": 0.00024304619898799366, "loss": 0.5204, "step": 835000 }, { "epoch": 112.51684182161142, "grad_norm": 0.18657414615154266, "learning_rate": 0.00024300877271774603, "loss": 0.5195, "step": 835100 }, { "epoch": 112.53031527890056, "grad_norm": 0.2069774866104126, "learning_rate": 0.00024297134644749843, "loss": 0.521, "step": 835200 }, { "epoch": 112.54378873618971, "grad_norm": 0.19428150355815887, "learning_rate": 0.00024293392017725083, "loss": 0.5203, "step": 835300 }, { "epoch": 112.55726219347885, "grad_norm": 0.1939903199672699, "learning_rate": 0.00024289649390700323, "loss": 0.5187, "step": 835400 }, { "epoch": 112.57073565076799, "grad_norm": 0.2188117653131485, "learning_rate": 0.0002428590676367556, "loss": 0.5201, "step": 835500 }, { "epoch": 112.58420910805712, "grad_norm": 0.21727463603019714, "learning_rate": 0.00024282164136650797, "loss": 0.5194, "step": 835600 }, { "epoch": 112.59768256534626, "grad_norm": 0.18680819869041443, "learning_rate": 0.00024278421509626037, "loss": 0.5209, "step": 835700 }, { "epoch": 112.61115602263541, "grad_norm": 0.23813334107398987, "learning_rate": 0.00024274678882601277, "loss": 0.5199, "step": 835800 }, { "epoch": 112.62462947992455, "grad_norm": 0.21008794009685516, "learning_rate": 0.00024270936255576514, "loss": 0.5186, "step": 835900 }, { "epoch": 112.63810293721369, "grad_norm": 0.19194869697093964, "learning_rate": 0.00024267193628551754, "loss": 0.5193, "step": 836000 }, { "epoch": 112.65157639450283, "grad_norm": 0.2016589492559433, "learning_rate": 0.00024263451001526993, "loss": 0.52, "step": 836100 }, { "epoch": 112.66504985179196, "grad_norm": 0.18819846212863922, "learning_rate": 0.0002425970837450223, "loss": 0.5198, "step": 836200 }, { "epoch": 112.67852330908111, "grad_norm": 0.19013358652591705, "learning_rate": 0.0002425596574747747, "loss": 0.5196, "step": 836300 }, { "epoch": 112.69199676637025, "grad_norm": 0.2024158537387848, "learning_rate": 0.0002425222312045271, "loss": 0.5197, "step": 836400 }, { "epoch": 112.70547022365939, "grad_norm": 0.1881946325302124, "learning_rate": 0.00024248480493427947, "loss": 0.5194, "step": 836500 }, { "epoch": 112.71894368094853, "grad_norm": 0.20089943706989288, "learning_rate": 0.00024244737866403184, "loss": 0.5201, "step": 836600 }, { "epoch": 112.73241713823766, "grad_norm": 0.22305893898010254, "learning_rate": 0.00024240995239378424, "loss": 0.5193, "step": 836700 }, { "epoch": 112.74589059552682, "grad_norm": 0.19288070499897003, "learning_rate": 0.00024237252612353664, "loss": 0.5196, "step": 836800 }, { "epoch": 112.75936405281595, "grad_norm": 0.19731760025024414, "learning_rate": 0.00024233509985328904, "loss": 0.5203, "step": 836900 }, { "epoch": 112.77283751010509, "grad_norm": 0.1989191323518753, "learning_rate": 0.0002422976735830414, "loss": 0.5206, "step": 837000 }, { "epoch": 112.78631096739423, "grad_norm": 0.2032754272222519, "learning_rate": 0.0002422602473127938, "loss": 0.5205, "step": 837100 }, { "epoch": 112.79978442468338, "grad_norm": 0.2010553777217865, "learning_rate": 0.0002422228210425462, "loss": 0.5196, "step": 837200 }, { "epoch": 112.81325788197252, "grad_norm": 0.226334348320961, "learning_rate": 0.00024218539477229858, "loss": 0.5196, "step": 837300 }, { "epoch": 112.82673133926166, "grad_norm": 0.19482490420341492, "learning_rate": 0.00024214796850205095, "loss": 0.5198, "step": 837400 }, { "epoch": 112.8402047965508, "grad_norm": 0.19287273287773132, "learning_rate": 0.00024211054223180335, "loss": 0.5198, "step": 837500 }, { "epoch": 112.85367825383993, "grad_norm": 0.19083750247955322, "learning_rate": 0.00024207311596155574, "loss": 0.5188, "step": 837600 }, { "epoch": 112.86715171112908, "grad_norm": 0.19897794723510742, "learning_rate": 0.00024203568969130812, "loss": 0.52, "step": 837700 }, { "epoch": 112.88062516841822, "grad_norm": 0.1985556036233902, "learning_rate": 0.00024199826342106051, "loss": 0.5198, "step": 837800 }, { "epoch": 112.89409862570736, "grad_norm": 0.21074476838111877, "learning_rate": 0.0002419608371508129, "loss": 0.5194, "step": 837900 }, { "epoch": 112.9075720829965, "grad_norm": 0.19549933075904846, "learning_rate": 0.0002419234108805653, "loss": 0.5192, "step": 838000 }, { "epoch": 112.92104554028563, "grad_norm": 0.18978920578956604, "learning_rate": 0.00024188598461031768, "loss": 0.5202, "step": 838100 }, { "epoch": 112.93451899757478, "grad_norm": 0.21248503029346466, "learning_rate": 0.00024184855834007008, "loss": 0.5195, "step": 838200 }, { "epoch": 112.94799245486392, "grad_norm": 0.19700045883655548, "learning_rate": 0.00024181113206982245, "loss": 0.5202, "step": 838300 }, { "epoch": 112.96146591215306, "grad_norm": 0.18873977661132812, "learning_rate": 0.00024177370579957482, "loss": 0.5197, "step": 838400 }, { "epoch": 112.9749393694422, "grad_norm": 0.20010407269001007, "learning_rate": 0.00024173627952932722, "loss": 0.5199, "step": 838500 }, { "epoch": 112.98841282673133, "grad_norm": 0.1944817155599594, "learning_rate": 0.00024169885325907962, "loss": 0.5203, "step": 838600 }, { "epoch": 113.0, "eval_loss": 0.5082443356513977, "eval_runtime": 4.9656, "eval_samples_per_second": 1006.921, "eval_steps_per_second": 15.909, "step": 838686 }, { "epoch": 113.00188628402049, "grad_norm": 0.19041135907173157, "learning_rate": 0.00024166142698883202, "loss": 0.5207, "step": 838700 }, { "epoch": 113.01535974130962, "grad_norm": 0.18976888060569763, "learning_rate": 0.0002416240007185844, "loss": 0.5196, "step": 838800 }, { "epoch": 113.02883319859876, "grad_norm": 0.1945093721151352, "learning_rate": 0.00024158657444833679, "loss": 0.5195, "step": 838900 }, { "epoch": 113.0423066558879, "grad_norm": 0.18546228110790253, "learning_rate": 0.00024154914817808918, "loss": 0.5193, "step": 839000 }, { "epoch": 113.05578011317704, "grad_norm": 0.200356125831604, "learning_rate": 0.00024151172190784158, "loss": 0.5194, "step": 839100 }, { "epoch": 113.06925357046619, "grad_norm": 0.19929072260856628, "learning_rate": 0.00024147429563759393, "loss": 0.5185, "step": 839200 }, { "epoch": 113.08272702775533, "grad_norm": 0.19458383321762085, "learning_rate": 0.00024143686936734632, "loss": 0.5188, "step": 839300 }, { "epoch": 113.09620048504446, "grad_norm": 0.2007177323102951, "learning_rate": 0.00024139944309709872, "loss": 0.5193, "step": 839400 }, { "epoch": 113.1096739423336, "grad_norm": 0.19397470355033875, "learning_rate": 0.0002413620168268511, "loss": 0.5184, "step": 839500 }, { "epoch": 113.12314739962274, "grad_norm": 0.2033820003271103, "learning_rate": 0.0002413245905566035, "loss": 0.519, "step": 839600 }, { "epoch": 113.13662085691189, "grad_norm": 0.19249758124351501, "learning_rate": 0.0002412871642863559, "loss": 0.5187, "step": 839700 }, { "epoch": 113.15009431420103, "grad_norm": 0.18440566956996918, "learning_rate": 0.0002412497380161083, "loss": 0.5202, "step": 839800 }, { "epoch": 113.16356777149016, "grad_norm": 0.22973601520061493, "learning_rate": 0.00024121231174586066, "loss": 0.5198, "step": 839900 }, { "epoch": 113.1770412287793, "grad_norm": 0.21327659487724304, "learning_rate": 0.00024117488547561306, "loss": 0.5199, "step": 840000 }, { "epoch": 113.19051468606844, "grad_norm": 0.18743188679218292, "learning_rate": 0.00024113745920536543, "loss": 0.5197, "step": 840100 }, { "epoch": 113.20398814335759, "grad_norm": 0.2085546851158142, "learning_rate": 0.0002411000329351178, "loss": 0.52, "step": 840200 }, { "epoch": 113.21746160064673, "grad_norm": 0.19646874070167542, "learning_rate": 0.0002410626066648702, "loss": 0.5195, "step": 840300 }, { "epoch": 113.23093505793587, "grad_norm": 0.20940910279750824, "learning_rate": 0.0002410251803946226, "loss": 0.5188, "step": 840400 }, { "epoch": 113.244408515225, "grad_norm": 0.21438883244991302, "learning_rate": 0.000240987754124375, "loss": 0.5202, "step": 840500 }, { "epoch": 113.25788197251414, "grad_norm": 0.19484347105026245, "learning_rate": 0.00024095032785412737, "loss": 0.5196, "step": 840600 }, { "epoch": 113.2713554298033, "grad_norm": 0.1933070868253708, "learning_rate": 0.00024091290158387976, "loss": 0.5194, "step": 840700 }, { "epoch": 113.28482888709243, "grad_norm": 0.19113284349441528, "learning_rate": 0.00024087547531363216, "loss": 0.5188, "step": 840800 }, { "epoch": 113.29830234438157, "grad_norm": 0.186537504196167, "learning_rate": 0.00024083804904338456, "loss": 0.52, "step": 840900 }, { "epoch": 113.3117758016707, "grad_norm": 0.18936121463775635, "learning_rate": 0.0002408006227731369, "loss": 0.5194, "step": 841000 }, { "epoch": 113.32524925895984, "grad_norm": 0.19701804220676422, "learning_rate": 0.0002407631965028893, "loss": 0.5192, "step": 841100 }, { "epoch": 113.338722716249, "grad_norm": 0.19012042880058289, "learning_rate": 0.0002407257702326417, "loss": 0.5193, "step": 841200 }, { "epoch": 113.35219617353813, "grad_norm": 0.22334171831607819, "learning_rate": 0.00024068834396239407, "loss": 0.5204, "step": 841300 }, { "epoch": 113.36566963082727, "grad_norm": 0.20417070388793945, "learning_rate": 0.00024065091769214647, "loss": 0.5203, "step": 841400 }, { "epoch": 113.37914308811641, "grad_norm": 0.1956081986427307, "learning_rate": 0.00024061349142189887, "loss": 0.5203, "step": 841500 }, { "epoch": 113.39261654540555, "grad_norm": 0.21126769483089447, "learning_rate": 0.00024057606515165127, "loss": 0.5196, "step": 841600 }, { "epoch": 113.4060900026947, "grad_norm": 0.1901216059923172, "learning_rate": 0.00024053863888140364, "loss": 0.5193, "step": 841700 }, { "epoch": 113.41956345998383, "grad_norm": 0.21137046813964844, "learning_rate": 0.00024050121261115604, "loss": 0.519, "step": 841800 }, { "epoch": 113.43303691727297, "grad_norm": 0.19860400259494781, "learning_rate": 0.0002404637863409084, "loss": 0.519, "step": 841900 }, { "epoch": 113.44651037456211, "grad_norm": 0.20846879482269287, "learning_rate": 0.0002404263600706608, "loss": 0.5193, "step": 842000 }, { "epoch": 113.45998383185125, "grad_norm": 0.20310623943805695, "learning_rate": 0.00024038893380041318, "loss": 0.5188, "step": 842100 }, { "epoch": 113.4734572891404, "grad_norm": 0.203337162733078, "learning_rate": 0.00024035150753016558, "loss": 0.5186, "step": 842200 }, { "epoch": 113.48693074642954, "grad_norm": 0.2071727216243744, "learning_rate": 0.00024031408125991797, "loss": 0.5196, "step": 842300 }, { "epoch": 113.50040420371867, "grad_norm": 0.19264768064022064, "learning_rate": 0.00024027665498967034, "loss": 0.519, "step": 842400 }, { "epoch": 113.51387766100781, "grad_norm": 0.1989620476961136, "learning_rate": 0.00024023922871942274, "loss": 0.5193, "step": 842500 }, { "epoch": 113.52735111829695, "grad_norm": 0.21783877909183502, "learning_rate": 0.00024020180244917514, "loss": 0.5202, "step": 842600 }, { "epoch": 113.5408245755861, "grad_norm": 0.2001928985118866, "learning_rate": 0.00024016437617892754, "loss": 0.5196, "step": 842700 }, { "epoch": 113.55429803287524, "grad_norm": 0.19222983717918396, "learning_rate": 0.00024012694990867988, "loss": 0.5188, "step": 842800 }, { "epoch": 113.56777149016438, "grad_norm": 0.19462484121322632, "learning_rate": 0.00024008952363843228, "loss": 0.5202, "step": 842900 }, { "epoch": 113.58124494745351, "grad_norm": 0.20626480877399445, "learning_rate": 0.00024005209736818468, "loss": 0.5191, "step": 843000 }, { "epoch": 113.59471840474265, "grad_norm": 0.1943737417459488, "learning_rate": 0.00024001467109793708, "loss": 0.519, "step": 843100 }, { "epoch": 113.6081918620318, "grad_norm": 0.22319205105304718, "learning_rate": 0.00023997724482768945, "loss": 0.5196, "step": 843200 }, { "epoch": 113.62166531932094, "grad_norm": 0.1938953548669815, "learning_rate": 0.00023993981855744185, "loss": 0.5214, "step": 843300 }, { "epoch": 113.63513877661008, "grad_norm": 0.19629675149917603, "learning_rate": 0.00023990239228719425, "loss": 0.5201, "step": 843400 }, { "epoch": 113.64861223389921, "grad_norm": 0.18626563251018524, "learning_rate": 0.00023986496601694662, "loss": 0.5208, "step": 843500 }, { "epoch": 113.66208569118835, "grad_norm": 0.18595388531684875, "learning_rate": 0.00023982753974669902, "loss": 0.5192, "step": 843600 }, { "epoch": 113.6755591484775, "grad_norm": 0.21355195343494415, "learning_rate": 0.00023979011347645139, "loss": 0.5198, "step": 843700 }, { "epoch": 113.68903260576664, "grad_norm": 0.21358774602413177, "learning_rate": 0.00023975268720620378, "loss": 0.5202, "step": 843800 }, { "epoch": 113.70250606305578, "grad_norm": 0.20183417201042175, "learning_rate": 0.00023971526093595616, "loss": 0.5194, "step": 843900 }, { "epoch": 113.71597952034492, "grad_norm": 0.1927167773246765, "learning_rate": 0.00023967783466570855, "loss": 0.519, "step": 844000 }, { "epoch": 113.72945297763405, "grad_norm": 0.19207045435905457, "learning_rate": 0.00023964040839546095, "loss": 0.5201, "step": 844100 }, { "epoch": 113.7429264349232, "grad_norm": 0.1943420171737671, "learning_rate": 0.00023960298212521332, "loss": 0.5195, "step": 844200 }, { "epoch": 113.75639989221234, "grad_norm": 0.20136010646820068, "learning_rate": 0.00023956555585496572, "loss": 0.5194, "step": 844300 }, { "epoch": 113.76987334950148, "grad_norm": 0.20302709937095642, "learning_rate": 0.00023952812958471812, "loss": 0.5202, "step": 844400 }, { "epoch": 113.78334680679062, "grad_norm": 0.21241231262683868, "learning_rate": 0.00023949070331447052, "loss": 0.5193, "step": 844500 }, { "epoch": 113.79682026407977, "grad_norm": 0.18242508172988892, "learning_rate": 0.00023945327704422286, "loss": 0.5188, "step": 844600 }, { "epoch": 113.81029372136891, "grad_norm": 0.21473875641822815, "learning_rate": 0.00023941585077397526, "loss": 0.5199, "step": 844700 }, { "epoch": 113.82376717865804, "grad_norm": 0.20326557755470276, "learning_rate": 0.00023937842450372766, "loss": 0.5207, "step": 844800 }, { "epoch": 113.83724063594718, "grad_norm": 0.2155199497938156, "learning_rate": 0.00023934099823348006, "loss": 0.5196, "step": 844900 }, { "epoch": 113.85071409323632, "grad_norm": 0.2059047371149063, "learning_rate": 0.00023930357196323243, "loss": 0.5181, "step": 845000 }, { "epoch": 113.86418755052547, "grad_norm": 0.23421604931354523, "learning_rate": 0.00023926614569298483, "loss": 0.5199, "step": 845100 }, { "epoch": 113.87766100781461, "grad_norm": 0.2056703120470047, "learning_rate": 0.00023922871942273722, "loss": 0.5199, "step": 845200 }, { "epoch": 113.89113446510375, "grad_norm": 0.1873481422662735, "learning_rate": 0.0002391912931524896, "loss": 0.5199, "step": 845300 }, { "epoch": 113.90460792239288, "grad_norm": 0.18311859667301178, "learning_rate": 0.000239153866882242, "loss": 0.5186, "step": 845400 }, { "epoch": 113.91808137968202, "grad_norm": 0.2118682563304901, "learning_rate": 0.00023911644061199436, "loss": 0.5201, "step": 845500 }, { "epoch": 113.93155483697117, "grad_norm": 0.18407678604125977, "learning_rate": 0.00023907901434174676, "loss": 0.5187, "step": 845600 }, { "epoch": 113.94502829426031, "grad_norm": 0.19531773030757904, "learning_rate": 0.00023904158807149913, "loss": 0.5199, "step": 845700 }, { "epoch": 113.95850175154945, "grad_norm": 0.19562207162380219, "learning_rate": 0.00023900416180125153, "loss": 0.52, "step": 845800 }, { "epoch": 113.97197520883859, "grad_norm": 0.19461636245250702, "learning_rate": 0.00023896673553100393, "loss": 0.5196, "step": 845900 }, { "epoch": 113.98544866612772, "grad_norm": 0.19626574218273163, "learning_rate": 0.00023892930926075633, "loss": 0.5205, "step": 846000 }, { "epoch": 113.99892212341688, "grad_norm": 0.2117072343826294, "learning_rate": 0.0002388918829905087, "loss": 0.5189, "step": 846100 }, { "epoch": 114.0, "eval_loss": 0.5088672041893005, "eval_runtime": 4.938, "eval_samples_per_second": 1012.556, "eval_steps_per_second": 15.998, "step": 846108 }, { "epoch": 114.01239558070601, "grad_norm": 0.22046390175819397, "learning_rate": 0.0002388544567202611, "loss": 0.5192, "step": 846200 }, { "epoch": 114.02586903799515, "grad_norm": 0.2072417289018631, "learning_rate": 0.0002388170304500135, "loss": 0.5193, "step": 846300 }, { "epoch": 114.03934249528429, "grad_norm": 0.1960465908050537, "learning_rate": 0.00023877960417976584, "loss": 0.5184, "step": 846400 }, { "epoch": 114.05281595257343, "grad_norm": 0.20021767914295197, "learning_rate": 0.00023874217790951824, "loss": 0.5183, "step": 846500 }, { "epoch": 114.06628940986258, "grad_norm": 0.19531738758087158, "learning_rate": 0.00023870475163927064, "loss": 0.5178, "step": 846600 }, { "epoch": 114.07976286715171, "grad_norm": 0.20029804110527039, "learning_rate": 0.00023866732536902304, "loss": 0.5193, "step": 846700 }, { "epoch": 114.09323632444085, "grad_norm": 0.20839013159275055, "learning_rate": 0.0002386298990987754, "loss": 0.5193, "step": 846800 }, { "epoch": 114.10670978172999, "grad_norm": 0.22360654175281525, "learning_rate": 0.0002385924728285278, "loss": 0.5187, "step": 846900 }, { "epoch": 114.12018323901913, "grad_norm": 0.20030498504638672, "learning_rate": 0.0002385550465582802, "loss": 0.52, "step": 847000 }, { "epoch": 114.13365669630828, "grad_norm": 0.20306046307086945, "learning_rate": 0.0002385176202880326, "loss": 0.5195, "step": 847100 }, { "epoch": 114.14713015359742, "grad_norm": 0.18842637538909912, "learning_rate": 0.00023848019401778497, "loss": 0.5193, "step": 847200 }, { "epoch": 114.16060361088655, "grad_norm": 0.20362123847007751, "learning_rate": 0.00023844276774753737, "loss": 0.5183, "step": 847300 }, { "epoch": 114.17407706817569, "grad_norm": 0.21941077709197998, "learning_rate": 0.00023840534147728974, "loss": 0.5185, "step": 847400 }, { "epoch": 114.18755052546483, "grad_norm": 0.1984620839357376, "learning_rate": 0.0002383679152070421, "loss": 0.52, "step": 847500 }, { "epoch": 114.20102398275398, "grad_norm": 0.2121335119009018, "learning_rate": 0.0002383304889367945, "loss": 0.5183, "step": 847600 }, { "epoch": 114.21449744004312, "grad_norm": 0.19270627200603485, "learning_rate": 0.0002382930626665469, "loss": 0.5195, "step": 847700 }, { "epoch": 114.22797089733226, "grad_norm": 0.1973070204257965, "learning_rate": 0.0002382556363962993, "loss": 0.5182, "step": 847800 }, { "epoch": 114.2414443546214, "grad_norm": 0.20472311973571777, "learning_rate": 0.00023821821012605168, "loss": 0.5193, "step": 847900 }, { "epoch": 114.25491781191053, "grad_norm": 0.21480385959148407, "learning_rate": 0.00023818078385580408, "loss": 0.5191, "step": 848000 }, { "epoch": 114.26839126919968, "grad_norm": 0.20492316782474518, "learning_rate": 0.00023814335758555648, "loss": 0.5191, "step": 848100 }, { "epoch": 114.28186472648882, "grad_norm": 0.22009693086147308, "learning_rate": 0.00023810593131530887, "loss": 0.5196, "step": 848200 }, { "epoch": 114.29533818377796, "grad_norm": 0.20619292557239532, "learning_rate": 0.00023806850504506122, "loss": 0.5206, "step": 848300 }, { "epoch": 114.3088116410671, "grad_norm": 0.19024108350276947, "learning_rate": 0.00023803107877481362, "loss": 0.5207, "step": 848400 }, { "epoch": 114.32228509835623, "grad_norm": 0.19356748461723328, "learning_rate": 0.00023799365250456601, "loss": 0.5197, "step": 848500 }, { "epoch": 114.33575855564538, "grad_norm": 0.210598424077034, "learning_rate": 0.00023795622623431838, "loss": 0.5199, "step": 848600 }, { "epoch": 114.34923201293452, "grad_norm": 0.1992703080177307, "learning_rate": 0.00023791879996407078, "loss": 0.5195, "step": 848700 }, { "epoch": 114.36270547022366, "grad_norm": 0.21016865968704224, "learning_rate": 0.00023788137369382318, "loss": 0.5198, "step": 848800 }, { "epoch": 114.3761789275128, "grad_norm": 0.2063632309436798, "learning_rate": 0.00023784394742357558, "loss": 0.5184, "step": 848900 }, { "epoch": 114.38965238480193, "grad_norm": 0.20893096923828125, "learning_rate": 0.00023780652115332795, "loss": 0.5202, "step": 849000 }, { "epoch": 114.40312584209109, "grad_norm": 0.19554515182971954, "learning_rate": 0.00023776909488308035, "loss": 0.5189, "step": 849100 }, { "epoch": 114.41659929938022, "grad_norm": 0.20011217892169952, "learning_rate": 0.00023773166861283272, "loss": 0.5199, "step": 849200 }, { "epoch": 114.43007275666936, "grad_norm": 0.19367071986198425, "learning_rate": 0.0002376942423425851, "loss": 0.5186, "step": 849300 }, { "epoch": 114.4435462139585, "grad_norm": 0.20477338135242462, "learning_rate": 0.0002376568160723375, "loss": 0.5188, "step": 849400 }, { "epoch": 114.45701967124764, "grad_norm": 0.2114262580871582, "learning_rate": 0.0002376193898020899, "loss": 0.519, "step": 849500 }, { "epoch": 114.47049312853679, "grad_norm": 0.1954859346151352, "learning_rate": 0.00023758196353184229, "loss": 0.519, "step": 849600 }, { "epoch": 114.48396658582593, "grad_norm": 0.2259969711303711, "learning_rate": 0.00023754453726159466, "loss": 0.5192, "step": 849700 }, { "epoch": 114.49744004311506, "grad_norm": 0.3150918185710907, "learning_rate": 0.00023750711099134706, "loss": 0.5205, "step": 849800 }, { "epoch": 114.5109135004042, "grad_norm": 0.2014339715242386, "learning_rate": 0.00023746968472109945, "loss": 0.5193, "step": 849900 }, { "epoch": 114.52438695769334, "grad_norm": 0.2142029106616974, "learning_rate": 0.00023743225845085185, "loss": 0.5191, "step": 850000 }, { "epoch": 114.53786041498249, "grad_norm": 0.18733952939510345, "learning_rate": 0.0002373948321806042, "loss": 0.5195, "step": 850100 }, { "epoch": 114.55133387227163, "grad_norm": 0.20627473294734955, "learning_rate": 0.0002373574059103566, "loss": 0.5201, "step": 850200 }, { "epoch": 114.56480732956076, "grad_norm": 0.20618990063667297, "learning_rate": 0.000237319979640109, "loss": 0.5191, "step": 850300 }, { "epoch": 114.5782807868499, "grad_norm": 0.19887635111808777, "learning_rate": 0.00023728255336986136, "loss": 0.5206, "step": 850400 }, { "epoch": 114.59175424413904, "grad_norm": 0.2198505401611328, "learning_rate": 0.00023724512709961376, "loss": 0.5193, "step": 850500 }, { "epoch": 114.60522770142819, "grad_norm": 0.20728112757205963, "learning_rate": 0.00023720770082936616, "loss": 0.5199, "step": 850600 }, { "epoch": 114.61870115871733, "grad_norm": 0.1942523866891861, "learning_rate": 0.00023717027455911856, "loss": 0.5188, "step": 850700 }, { "epoch": 114.63217461600647, "grad_norm": 0.1926001012325287, "learning_rate": 0.00023713284828887093, "loss": 0.5187, "step": 850800 }, { "epoch": 114.6456480732956, "grad_norm": 0.22767874598503113, "learning_rate": 0.00023709542201862333, "loss": 0.5187, "step": 850900 }, { "epoch": 114.65912153058474, "grad_norm": 0.20024374127388, "learning_rate": 0.0002370579957483757, "loss": 0.5196, "step": 851000 }, { "epoch": 114.6725949878739, "grad_norm": 0.21877813339233398, "learning_rate": 0.0002370205694781281, "loss": 0.5191, "step": 851100 }, { "epoch": 114.68606844516303, "grad_norm": 0.2094699740409851, "learning_rate": 0.00023698314320788047, "loss": 0.5193, "step": 851200 }, { "epoch": 114.69954190245217, "grad_norm": 0.18945778906345367, "learning_rate": 0.00023694571693763287, "loss": 0.5187, "step": 851300 }, { "epoch": 114.7130153597413, "grad_norm": 0.2082107812166214, "learning_rate": 0.00023690829066738526, "loss": 0.5202, "step": 851400 }, { "epoch": 114.72648881703044, "grad_norm": 0.18800435960292816, "learning_rate": 0.00023687086439713764, "loss": 0.5199, "step": 851500 }, { "epoch": 114.7399622743196, "grad_norm": 0.1933862864971161, "learning_rate": 0.00023683343812689003, "loss": 0.52, "step": 851600 }, { "epoch": 114.75343573160873, "grad_norm": 0.19528943300247192, "learning_rate": 0.00023679601185664243, "loss": 0.5202, "step": 851700 }, { "epoch": 114.76690918889787, "grad_norm": 0.20363707840442657, "learning_rate": 0.00023675858558639483, "loss": 0.5192, "step": 851800 }, { "epoch": 114.78038264618701, "grad_norm": 0.19218693673610687, "learning_rate": 0.00023672115931614717, "loss": 0.5198, "step": 851900 }, { "epoch": 114.79385610347614, "grad_norm": 0.2027115672826767, "learning_rate": 0.00023668373304589957, "loss": 0.5206, "step": 852000 }, { "epoch": 114.8073295607653, "grad_norm": 0.2142135202884674, "learning_rate": 0.00023664630677565197, "loss": 0.5194, "step": 852100 }, { "epoch": 114.82080301805443, "grad_norm": 0.21798297762870789, "learning_rate": 0.00023660888050540437, "loss": 0.5193, "step": 852200 }, { "epoch": 114.83427647534357, "grad_norm": 0.20549002289772034, "learning_rate": 0.00023657145423515674, "loss": 0.52, "step": 852300 }, { "epoch": 114.84774993263271, "grad_norm": 0.20835717022418976, "learning_rate": 0.00023653402796490914, "loss": 0.5193, "step": 852400 }, { "epoch": 114.86122338992186, "grad_norm": 0.2258787453174591, "learning_rate": 0.00023649660169466154, "loss": 0.5193, "step": 852500 }, { "epoch": 114.874696847211, "grad_norm": 0.21595744788646698, "learning_rate": 0.0002364591754244139, "loss": 0.519, "step": 852600 }, { "epoch": 114.88817030450014, "grad_norm": 0.21675053238868713, "learning_rate": 0.0002364217491541663, "loss": 0.5188, "step": 852700 }, { "epoch": 114.90164376178927, "grad_norm": 0.1797557920217514, "learning_rate": 0.00023638432288391868, "loss": 0.5184, "step": 852800 }, { "epoch": 114.91511721907841, "grad_norm": 0.19786302745342255, "learning_rate": 0.00023634689661367108, "loss": 0.5195, "step": 852900 }, { "epoch": 114.92859067636756, "grad_norm": 0.20478269457817078, "learning_rate": 0.00023630947034342345, "loss": 0.5201, "step": 853000 }, { "epoch": 114.9420641336567, "grad_norm": 0.19521716237068176, "learning_rate": 0.00023627204407317584, "loss": 0.5193, "step": 853100 }, { "epoch": 114.95553759094584, "grad_norm": 0.20781056582927704, "learning_rate": 0.00023623461780292824, "loss": 0.5193, "step": 853200 }, { "epoch": 114.96901104823498, "grad_norm": 0.22082573175430298, "learning_rate": 0.00023619719153268064, "loss": 0.5188, "step": 853300 }, { "epoch": 114.98248450552411, "grad_norm": 0.20409317314624786, "learning_rate": 0.000236159765262433, "loss": 0.52, "step": 853400 }, { "epoch": 114.99595796281326, "grad_norm": 0.19238312542438507, "learning_rate": 0.0002361223389921854, "loss": 0.5205, "step": 853500 }, { "epoch": 115.0, "eval_loss": 0.5079935789108276, "eval_runtime": 4.9448, "eval_samples_per_second": 1011.162, "eval_steps_per_second": 15.976, "step": 853530 } ], "logging_steps": 100, "max_steps": 1484400, "num_input_tokens_seen": 0, "num_train_epochs": 200, "save_steps": 500, "total_flos": 1.6199513088e+17, "train_batch_size": 128, "trial_name": null, "trial_params": null }