{ "best_metric": 4.588395118713379, "best_model_checkpoint": "learning_source_20260316/rna/bert-output/rna-medium/checkpoint-59600", "epoch": 0.9115595435441549, "eval_steps": 100, "global_step": 60000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015192659059069249, "grad_norm": 0.36328116059303284, "learning_rate": 3e-06, "loss": 10.2311, "step": 100 }, { "epoch": 0.0015192659059069249, "eval_loss": 10.090826988220215, "eval_runtime": 193.145, "eval_samples_per_second": 51.775, "eval_steps_per_second": 6.472, "step": 100 }, { "epoch": 0.0030385318118138498, "grad_norm": 0.2686520218849182, "learning_rate": 6e-06, "loss": 9.9461, "step": 200 }, { "epoch": 0.0030385318118138498, "eval_loss": 9.758237838745117, "eval_runtime": 193.1826, "eval_samples_per_second": 51.764, "eval_steps_per_second": 6.471, "step": 200 }, { "epoch": 0.004557797717720775, "grad_norm": 0.9466120004653931, "learning_rate": 5.989966555183947e-06, "loss": 9.6291, "step": 300 }, { "epoch": 0.004557797717720775, "eval_loss": 9.439615249633789, "eval_runtime": 193.0329, "eval_samples_per_second": 51.805, "eval_steps_per_second": 6.476, "step": 300 }, { "epoch": 0.0060770636236276996, "grad_norm": 0.4738225042819977, "learning_rate": 5.979933110367893e-06, "loss": 9.3896, "step": 400 }, { "epoch": 0.0060770636236276996, "eval_loss": 9.214252471923828, "eval_runtime": 193.5666, "eval_samples_per_second": 51.662, "eval_steps_per_second": 6.458, "step": 400 }, { "epoch": 0.007596329529534624, "grad_norm": 0.7529183626174927, "learning_rate": 5.96989966555184e-06, "loss": 9.2425, "step": 500 }, { "epoch": 0.007596329529534624, "eval_loss": 9.07589340209961, "eval_runtime": 193.2214, "eval_samples_per_second": 51.754, "eval_steps_per_second": 6.469, "step": 500 }, { "epoch": 0.00911559543544155, "grad_norm": 0.48392170667648315, "learning_rate": 5.959866220735786e-06, "loss": 9.1413, "step": 600 }, { "epoch": 0.00911559543544155, "eval_loss": 8.97410774230957, "eval_runtime": 192.9985, "eval_samples_per_second": 51.814, "eval_steps_per_second": 6.477, "step": 600 }, { "epoch": 0.010634861341348474, "grad_norm": 0.6194415092468262, "learning_rate": 5.949832775919732e-06, "loss": 9.0633, "step": 700 }, { "epoch": 0.010634861341348474, "eval_loss": 8.891473770141602, "eval_runtime": 193.0661, "eval_samples_per_second": 51.796, "eval_steps_per_second": 6.474, "step": 700 }, { "epoch": 0.012154127247255399, "grad_norm": 2.9033119678497314, "learning_rate": 5.939799331103679e-06, "loss": 9.0036, "step": 800 }, { "epoch": 0.012154127247255399, "eval_loss": 8.827642440795898, "eval_runtime": 192.8551, "eval_samples_per_second": 51.852, "eval_steps_per_second": 6.482, "step": 800 }, { "epoch": 0.013673393153162324, "grad_norm": 2.6778995990753174, "learning_rate": 5.929765886287626e-06, "loss": 8.9573, "step": 900 }, { "epoch": 0.013673393153162324, "eval_loss": 8.794682502746582, "eval_runtime": 193.0047, "eval_samples_per_second": 51.812, "eval_steps_per_second": 6.477, "step": 900 }, { "epoch": 0.015192659059069248, "grad_norm": 2.586425304412842, "learning_rate": 5.919732441471572e-06, "loss": 8.927, "step": 1000 }, { "epoch": 0.015192659059069248, "eval_loss": 8.765641212463379, "eval_runtime": 193.0239, "eval_samples_per_second": 51.807, "eval_steps_per_second": 6.476, "step": 1000 }, { "epoch": 0.016711924964976175, "grad_norm": 3.287247657775879, "learning_rate": 5.9096989966555185e-06, "loss": 8.8995, "step": 1100 }, { "epoch": 0.016711924964976175, "eval_loss": 8.736359596252441, "eval_runtime": 193.2903, "eval_samples_per_second": 51.736, "eval_steps_per_second": 6.467, "step": 1100 }, { "epoch": 0.0182311908708831, "grad_norm": 3.2760348320007324, "learning_rate": 5.899665551839465e-06, "loss": 8.8734, "step": 1200 }, { "epoch": 0.0182311908708831, "eval_loss": 8.70866870880127, "eval_runtime": 193.2723, "eval_samples_per_second": 51.74, "eval_steps_per_second": 6.468, "step": 1200 }, { "epoch": 0.019750456776790024, "grad_norm": 3.7369821071624756, "learning_rate": 5.889632107023412e-06, "loss": 8.8472, "step": 1300 }, { "epoch": 0.019750456776790024, "eval_loss": 8.686373710632324, "eval_runtime": 193.3013, "eval_samples_per_second": 51.733, "eval_steps_per_second": 6.467, "step": 1300 }, { "epoch": 0.02126972268269695, "grad_norm": 3.6638362407684326, "learning_rate": 5.879598662207358e-06, "loss": 8.8219, "step": 1400 }, { "epoch": 0.02126972268269695, "eval_loss": 8.66162109375, "eval_runtime": 193.4171, "eval_samples_per_second": 51.702, "eval_steps_per_second": 6.463, "step": 1400 }, { "epoch": 0.022788988588603874, "grad_norm": 3.1928629875183105, "learning_rate": 5.869565217391305e-06, "loss": 8.7973, "step": 1500 }, { "epoch": 0.022788988588603874, "eval_loss": 8.633343696594238, "eval_runtime": 193.7794, "eval_samples_per_second": 51.605, "eval_steps_per_second": 6.451, "step": 1500 }, { "epoch": 0.024308254494510798, "grad_norm": 3.5108275413513184, "learning_rate": 5.8595317725752514e-06, "loss": 8.7682, "step": 1600 }, { "epoch": 0.024308254494510798, "eval_loss": 8.609291076660156, "eval_runtime": 193.2885, "eval_samples_per_second": 51.736, "eval_steps_per_second": 6.467, "step": 1600 }, { "epoch": 0.025827520400417723, "grad_norm": 6.164127349853516, "learning_rate": 5.849498327759197e-06, "loss": 8.7411, "step": 1700 }, { "epoch": 0.025827520400417723, "eval_loss": 8.578601837158203, "eval_runtime": 193.259, "eval_samples_per_second": 51.744, "eval_steps_per_second": 6.468, "step": 1700 }, { "epoch": 0.027346786306324648, "grad_norm": 2.5621981620788574, "learning_rate": 5.839464882943144e-06, "loss": 8.7198, "step": 1800 }, { "epoch": 0.027346786306324648, "eval_loss": 8.555196762084961, "eval_runtime": 193.4982, "eval_samples_per_second": 51.68, "eval_steps_per_second": 6.46, "step": 1800 }, { "epoch": 0.028866052212231572, "grad_norm": 2.957981586456299, "learning_rate": 5.829431438127091e-06, "loss": 8.6935, "step": 1900 }, { "epoch": 0.028866052212231572, "eval_loss": 8.530313491821289, "eval_runtime": 193.7009, "eval_samples_per_second": 51.626, "eval_steps_per_second": 6.453, "step": 1900 }, { "epoch": 0.030385318118138497, "grad_norm": 5.7702836990356445, "learning_rate": 5.819397993311037e-06, "loss": 8.6684, "step": 2000 }, { "epoch": 0.030385318118138497, "eval_loss": 8.509552001953125, "eval_runtime": 193.4646, "eval_samples_per_second": 51.689, "eval_steps_per_second": 6.461, "step": 2000 }, { "epoch": 0.03190458402404542, "grad_norm": 3.653986930847168, "learning_rate": 5.8093645484949836e-06, "loss": 8.6505, "step": 2100 }, { "epoch": 0.03190458402404542, "eval_loss": 8.499613761901855, "eval_runtime": 193.2316, "eval_samples_per_second": 51.751, "eval_steps_per_second": 6.469, "step": 2100 }, { "epoch": 0.03342384992995235, "grad_norm": 4.66618537902832, "learning_rate": 5.79933110367893e-06, "loss": 8.6175, "step": 2200 }, { "epoch": 0.03342384992995235, "eval_loss": 8.473803520202637, "eval_runtime": 193.4189, "eval_samples_per_second": 51.701, "eval_steps_per_second": 6.463, "step": 2200 }, { "epoch": 0.034943115835859274, "grad_norm": 0.7005074620246887, "learning_rate": 5.789297658862876e-06, "loss": 8.5932, "step": 2300 }, { "epoch": 0.034943115835859274, "eval_loss": 8.452431678771973, "eval_runtime": 193.2263, "eval_samples_per_second": 51.753, "eval_steps_per_second": 6.469, "step": 2300 }, { "epoch": 0.0364623817417662, "grad_norm": 5.592404842376709, "learning_rate": 5.779264214046823e-06, "loss": 8.572, "step": 2400 }, { "epoch": 0.0364623817417662, "eval_loss": 8.448452949523926, "eval_runtime": 193.5678, "eval_samples_per_second": 51.661, "eval_steps_per_second": 6.458, "step": 2400 }, { "epoch": 0.037981647647673124, "grad_norm": 4.363527297973633, "learning_rate": 5.76923076923077e-06, "loss": 8.5536, "step": 2500 }, { "epoch": 0.037981647647673124, "eval_loss": 8.417658805847168, "eval_runtime": 193.2714, "eval_samples_per_second": 51.741, "eval_steps_per_second": 6.468, "step": 2500 }, { "epoch": 0.03950091355358005, "grad_norm": 4.716485023498535, "learning_rate": 5.759197324414716e-06, "loss": 8.5332, "step": 2600 }, { "epoch": 0.03950091355358005, "eval_loss": 8.41653823852539, "eval_runtime": 193.336, "eval_samples_per_second": 51.723, "eval_steps_per_second": 6.465, "step": 2600 }, { "epoch": 0.04102017945948697, "grad_norm": 2.145522117614746, "learning_rate": 5.7491638795986624e-06, "loss": 8.5152, "step": 2700 }, { "epoch": 0.04102017945948697, "eval_loss": 8.391885757446289, "eval_runtime": 193.7068, "eval_samples_per_second": 51.624, "eval_steps_per_second": 6.453, "step": 2700 }, { "epoch": 0.0425394453653939, "grad_norm": 3.36438250541687, "learning_rate": 5.739130434782609e-06, "loss": 8.4964, "step": 2800 }, { "epoch": 0.0425394453653939, "eval_loss": 8.382240295410156, "eval_runtime": 193.7119, "eval_samples_per_second": 51.623, "eval_steps_per_second": 6.453, "step": 2800 }, { "epoch": 0.04405871127130082, "grad_norm": 3.0056991577148438, "learning_rate": 5.729096989966555e-06, "loss": 8.4811, "step": 2900 }, { "epoch": 0.04405871127130082, "eval_loss": 8.374021530151367, "eval_runtime": 193.9566, "eval_samples_per_second": 51.558, "eval_steps_per_second": 6.445, "step": 2900 }, { "epoch": 0.04557797717720775, "grad_norm": 2.388469696044922, "learning_rate": 5.719063545150502e-06, "loss": 8.4762, "step": 3000 }, { "epoch": 0.04557797717720775, "eval_loss": 8.371816635131836, "eval_runtime": 193.5842, "eval_samples_per_second": 51.657, "eval_steps_per_second": 6.457, "step": 3000 }, { "epoch": 0.04709724308311467, "grad_norm": 4.248419761657715, "learning_rate": 5.709030100334449e-06, "loss": 8.458, "step": 3100 }, { "epoch": 0.04709724308311467, "eval_loss": 8.359615325927734, "eval_runtime": 193.612, "eval_samples_per_second": 51.65, "eval_steps_per_second": 6.456, "step": 3100 }, { "epoch": 0.048616508989021597, "grad_norm": 1.2234629392623901, "learning_rate": 5.698996655518395e-06, "loss": 8.442, "step": 3200 }, { "epoch": 0.048616508989021597, "eval_loss": 8.356290817260742, "eval_runtime": 193.4203, "eval_samples_per_second": 51.701, "eval_steps_per_second": 6.463, "step": 3200 }, { "epoch": 0.05013577489492852, "grad_norm": 1.149261236190796, "learning_rate": 5.688963210702341e-06, "loss": 8.434, "step": 3300 }, { "epoch": 0.05013577489492852, "eval_loss": 8.348698616027832, "eval_runtime": 193.5122, "eval_samples_per_second": 51.676, "eval_steps_per_second": 6.46, "step": 3300 }, { "epoch": 0.051655040800835446, "grad_norm": 3.746015787124634, "learning_rate": 5.678929765886288e-06, "loss": 8.4225, "step": 3400 }, { "epoch": 0.051655040800835446, "eval_loss": 8.341509819030762, "eval_runtime": 193.694, "eval_samples_per_second": 51.628, "eval_steps_per_second": 6.453, "step": 3400 }, { "epoch": 0.05317430670674237, "grad_norm": 3.512450933456421, "learning_rate": 5.668896321070235e-06, "loss": 8.4084, "step": 3500 }, { "epoch": 0.05317430670674237, "eval_loss": 8.333552360534668, "eval_runtime": 193.637, "eval_samples_per_second": 51.643, "eval_steps_per_second": 6.455, "step": 3500 }, { "epoch": 0.054693572612649295, "grad_norm": 2.823720693588257, "learning_rate": 5.658862876254181e-06, "loss": 8.401, "step": 3600 }, { "epoch": 0.054693572612649295, "eval_loss": 8.334371566772461, "eval_runtime": 193.5758, "eval_samples_per_second": 51.659, "eval_steps_per_second": 6.457, "step": 3600 }, { "epoch": 0.05621283851855622, "grad_norm": 3.2911577224731445, "learning_rate": 5.6488294314381275e-06, "loss": 8.3905, "step": 3700 }, { "epoch": 0.05621283851855622, "eval_loss": 8.324334144592285, "eval_runtime": 193.6614, "eval_samples_per_second": 51.637, "eval_steps_per_second": 6.455, "step": 3700 }, { "epoch": 0.057732104424463145, "grad_norm": 2.3814852237701416, "learning_rate": 5.638795986622074e-06, "loss": 8.3799, "step": 3800 }, { "epoch": 0.057732104424463145, "eval_loss": 8.320505142211914, "eval_runtime": 193.6578, "eval_samples_per_second": 51.637, "eval_steps_per_second": 6.455, "step": 3800 }, { "epoch": 0.05925137033037007, "grad_norm": 3.9368467330932617, "learning_rate": 5.62876254180602e-06, "loss": 8.3716, "step": 3900 }, { "epoch": 0.05925137033037007, "eval_loss": 8.320087432861328, "eval_runtime": 193.6177, "eval_samples_per_second": 51.648, "eval_steps_per_second": 6.456, "step": 3900 }, { "epoch": 0.060770636236276994, "grad_norm": 3.7462780475616455, "learning_rate": 5.618729096989967e-06, "loss": 8.366, "step": 4000 }, { "epoch": 0.060770636236276994, "eval_loss": 8.314221382141113, "eval_runtime": 193.8249, "eval_samples_per_second": 51.593, "eval_steps_per_second": 6.449, "step": 4000 }, { "epoch": 0.06228990214218392, "grad_norm": 4.8095598220825195, "learning_rate": 5.608695652173914e-06, "loss": 8.3588, "step": 4100 }, { "epoch": 0.06228990214218392, "eval_loss": 8.31184196472168, "eval_runtime": 193.6735, "eval_samples_per_second": 51.633, "eval_steps_per_second": 6.454, "step": 4100 }, { "epoch": 0.06380916804809084, "grad_norm": 1.4702892303466797, "learning_rate": 5.59866220735786e-06, "loss": 8.349, "step": 4200 }, { "epoch": 0.06380916804809084, "eval_loss": 8.315442085266113, "eval_runtime": 193.7492, "eval_samples_per_second": 51.613, "eval_steps_per_second": 6.452, "step": 4200 }, { "epoch": 0.06532843395399776, "grad_norm": 1.3424737453460693, "learning_rate": 5.588628762541806e-06, "loss": 8.3377, "step": 4300 }, { "epoch": 0.06532843395399776, "eval_loss": 8.299623489379883, "eval_runtime": 193.8901, "eval_samples_per_second": 51.576, "eval_steps_per_second": 6.447, "step": 4300 }, { "epoch": 0.0668476998599047, "grad_norm": 3.2085587978363037, "learning_rate": 5.578595317725753e-06, "loss": 8.3276, "step": 4400 }, { "epoch": 0.0668476998599047, "eval_loss": 8.291953086853027, "eval_runtime": 193.3945, "eval_samples_per_second": 51.708, "eval_steps_per_second": 6.463, "step": 4400 }, { "epoch": 0.06836696576581162, "grad_norm": 3.0818777084350586, "learning_rate": 5.568561872909699e-06, "loss": 8.3213, "step": 4500 }, { "epoch": 0.06836696576581162, "eval_loss": 8.284076690673828, "eval_runtime": 193.6893, "eval_samples_per_second": 51.629, "eval_steps_per_second": 6.454, "step": 4500 }, { "epoch": 0.06988623167171855, "grad_norm": 2.9899518489837646, "learning_rate": 5.558528428093646e-06, "loss": 8.3146, "step": 4600 }, { "epoch": 0.06988623167171855, "eval_loss": 8.288785934448242, "eval_runtime": 193.4136, "eval_samples_per_second": 51.703, "eval_steps_per_second": 6.463, "step": 4600 }, { "epoch": 0.07140549757762547, "grad_norm": 3.5509963035583496, "learning_rate": 5.548494983277593e-06, "loss": 8.3073, "step": 4700 }, { "epoch": 0.07140549757762547, "eval_loss": 8.283821105957031, "eval_runtime": 193.5669, "eval_samples_per_second": 51.662, "eval_steps_per_second": 6.458, "step": 4700 }, { "epoch": 0.0729247634835324, "grad_norm": 3.2348263263702393, "learning_rate": 5.5384615384615385e-06, "loss": 8.3002, "step": 4800 }, { "epoch": 0.0729247634835324, "eval_loss": 8.275022506713867, "eval_runtime": 193.5313, "eval_samples_per_second": 51.671, "eval_steps_per_second": 6.459, "step": 4800 }, { "epoch": 0.07444402938943932, "grad_norm": 3.035083293914795, "learning_rate": 5.528428093645485e-06, "loss": 8.2929, "step": 4900 }, { "epoch": 0.07444402938943932, "eval_loss": 8.270652770996094, "eval_runtime": 193.6056, "eval_samples_per_second": 51.651, "eval_steps_per_second": 6.456, "step": 4900 }, { "epoch": 0.07596329529534625, "grad_norm": 2.820732593536377, "learning_rate": 5.518394648829432e-06, "loss": 8.2858, "step": 5000 }, { "epoch": 0.07596329529534625, "eval_loss": 8.268174171447754, "eval_runtime": 193.8467, "eval_samples_per_second": 51.587, "eval_steps_per_second": 6.448, "step": 5000 }, { "epoch": 0.07748256120125317, "grad_norm": 3.0247511863708496, "learning_rate": 5.508361204013378e-06, "loss": 8.2786, "step": 5100 }, { "epoch": 0.07748256120125317, "eval_loss": 8.26013469696045, "eval_runtime": 193.8849, "eval_samples_per_second": 51.577, "eval_steps_per_second": 6.447, "step": 5100 }, { "epoch": 0.0790018271071601, "grad_norm": 3.4587104320526123, "learning_rate": 5.498327759197324e-06, "loss": 8.2727, "step": 5200 }, { "epoch": 0.0790018271071601, "eval_loss": 8.258410453796387, "eval_runtime": 193.8157, "eval_samples_per_second": 51.595, "eval_steps_per_second": 6.449, "step": 5200 }, { "epoch": 0.08052109301306702, "grad_norm": 3.1752476692199707, "learning_rate": 5.488294314381271e-06, "loss": 8.269, "step": 5300 }, { "epoch": 0.08052109301306702, "eval_loss": 8.251901626586914, "eval_runtime": 193.7661, "eval_samples_per_second": 51.609, "eval_steps_per_second": 6.451, "step": 5300 }, { "epoch": 0.08204035891897395, "grad_norm": 2.231090784072876, "learning_rate": 5.478260869565217e-06, "loss": 8.2612, "step": 5400 }, { "epoch": 0.08204035891897395, "eval_loss": 8.248543739318848, "eval_runtime": 193.7807, "eval_samples_per_second": 51.605, "eval_steps_per_second": 6.451, "step": 5400 }, { "epoch": 0.08355962482488087, "grad_norm": 1.8496346473693848, "learning_rate": 5.468227424749163e-06, "loss": 8.2594, "step": 5500 }, { "epoch": 0.08355962482488087, "eval_loss": 8.25257396697998, "eval_runtime": 193.5194, "eval_samples_per_second": 51.674, "eval_steps_per_second": 6.459, "step": 5500 }, { "epoch": 0.0850788907307878, "grad_norm": 2.26971435546875, "learning_rate": 5.45819397993311e-06, "loss": 8.2519, "step": 5600 }, { "epoch": 0.0850788907307878, "eval_loss": 8.239155769348145, "eval_runtime": 193.6858, "eval_samples_per_second": 51.63, "eval_steps_per_second": 6.454, "step": 5600 }, { "epoch": 0.08659815663669472, "grad_norm": 4.062191963195801, "learning_rate": 5.448160535117057e-06, "loss": 8.2494, "step": 5700 }, { "epoch": 0.08659815663669472, "eval_loss": 8.248674392700195, "eval_runtime": 193.7941, "eval_samples_per_second": 51.601, "eval_steps_per_second": 6.45, "step": 5700 }, { "epoch": 0.08811742254260164, "grad_norm": 2.0019612312316895, "learning_rate": 5.438127090301003e-06, "loss": 8.246, "step": 5800 }, { "epoch": 0.08811742254260164, "eval_loss": 8.234210968017578, "eval_runtime": 193.7021, "eval_samples_per_second": 51.626, "eval_steps_per_second": 6.453, "step": 5800 }, { "epoch": 0.08963668844850857, "grad_norm": 3.5080573558807373, "learning_rate": 5.4280936454849495e-06, "loss": 8.2391, "step": 5900 }, { "epoch": 0.08963668844850857, "eval_loss": 8.240001678466797, "eval_runtime": 193.5894, "eval_samples_per_second": 51.656, "eval_steps_per_second": 6.457, "step": 5900 }, { "epoch": 0.0911559543544155, "grad_norm": 2.578500747680664, "learning_rate": 5.418060200668896e-06, "loss": 8.2361, "step": 6000 }, { "epoch": 0.0911559543544155, "eval_loss": 8.238499641418457, "eval_runtime": 193.7443, "eval_samples_per_second": 51.614, "eval_steps_per_second": 6.452, "step": 6000 }, { "epoch": 0.09267522026032242, "grad_norm": 2.7456629276275635, "learning_rate": 5.408026755852843e-06, "loss": 8.2331, "step": 6100 }, { "epoch": 0.09267522026032242, "eval_loss": 8.225603103637695, "eval_runtime": 193.7051, "eval_samples_per_second": 51.625, "eval_steps_per_second": 6.453, "step": 6100 }, { "epoch": 0.09419448616622934, "grad_norm": 1.1776982545852661, "learning_rate": 5.397993311036789e-06, "loss": 8.2294, "step": 6200 }, { "epoch": 0.09419448616622934, "eval_loss": 8.235060691833496, "eval_runtime": 193.5474, "eval_samples_per_second": 51.667, "eval_steps_per_second": 6.458, "step": 6200 }, { "epoch": 0.09571375207213627, "grad_norm": 3.159752130508423, "learning_rate": 5.387959866220736e-06, "loss": 8.2273, "step": 6300 }, { "epoch": 0.09571375207213627, "eval_loss": 8.22216510772705, "eval_runtime": 193.5026, "eval_samples_per_second": 51.679, "eval_steps_per_second": 6.46, "step": 6300 }, { "epoch": 0.09723301797804319, "grad_norm": 2.37727427482605, "learning_rate": 5.3779264214046825e-06, "loss": 8.2231, "step": 6400 }, { "epoch": 0.09723301797804319, "eval_loss": 8.222684860229492, "eval_runtime": 193.5974, "eval_samples_per_second": 51.654, "eval_steps_per_second": 6.457, "step": 6400 }, { "epoch": 0.09875228388395012, "grad_norm": 2.0136072635650635, "learning_rate": 5.367892976588628e-06, "loss": 8.2203, "step": 6500 }, { "epoch": 0.09875228388395012, "eval_loss": 8.220030784606934, "eval_runtime": 193.6795, "eval_samples_per_second": 51.632, "eval_steps_per_second": 6.454, "step": 6500 }, { "epoch": 0.10027154978985704, "grad_norm": 2.404653787612915, "learning_rate": 5.357859531772575e-06, "loss": 8.2154, "step": 6600 }, { "epoch": 0.10027154978985704, "eval_loss": 8.219395637512207, "eval_runtime": 193.5124, "eval_samples_per_second": 51.676, "eval_steps_per_second": 6.46, "step": 6600 }, { "epoch": 0.10179081569576397, "grad_norm": 1.6043188571929932, "learning_rate": 5.347826086956522e-06, "loss": 8.2128, "step": 6700 }, { "epoch": 0.10179081569576397, "eval_loss": 8.216435432434082, "eval_runtime": 193.766, "eval_samples_per_second": 51.609, "eval_steps_per_second": 6.451, "step": 6700 }, { "epoch": 0.10331008160167089, "grad_norm": 2.3386034965515137, "learning_rate": 5.337792642140468e-06, "loss": 8.2079, "step": 6800 }, { "epoch": 0.10331008160167089, "eval_loss": 8.212626457214355, "eval_runtime": 193.6166, "eval_samples_per_second": 51.648, "eval_steps_per_second": 6.456, "step": 6800 }, { "epoch": 0.10482934750757782, "grad_norm": 2.259270668029785, "learning_rate": 5.327759197324415e-06, "loss": 8.2067, "step": 6900 }, { "epoch": 0.10482934750757782, "eval_loss": 8.208475112915039, "eval_runtime": 193.3797, "eval_samples_per_second": 51.712, "eval_steps_per_second": 6.464, "step": 6900 }, { "epoch": 0.10634861341348474, "grad_norm": 2.469719409942627, "learning_rate": 5.317725752508361e-06, "loss": 8.1994, "step": 7000 }, { "epoch": 0.10634861341348474, "eval_loss": 8.199501037597656, "eval_runtime": 193.3429, "eval_samples_per_second": 51.722, "eval_steps_per_second": 6.465, "step": 7000 }, { "epoch": 0.10786787931939167, "grad_norm": 4.370075702667236, "learning_rate": 5.307692307692307e-06, "loss": 8.1678, "step": 7100 }, { "epoch": 0.10786787931939167, "eval_loss": 8.09277629852295, "eval_runtime": 193.6702, "eval_samples_per_second": 51.634, "eval_steps_per_second": 6.454, "step": 7100 }, { "epoch": 0.10938714522529859, "grad_norm": 5.548232555389404, "learning_rate": 5.297658862876254e-06, "loss": 8.0856, "step": 7200 }, { "epoch": 0.10938714522529859, "eval_loss": 8.040851593017578, "eval_runtime": 193.617, "eval_samples_per_second": 51.648, "eval_steps_per_second": 6.456, "step": 7200 }, { "epoch": 0.11090641113120552, "grad_norm": 2.152247428894043, "learning_rate": 5.287625418060201e-06, "loss": 8.0478, "step": 7300 }, { "epoch": 0.11090641113120552, "eval_loss": 7.996228218078613, "eval_runtime": 193.613, "eval_samples_per_second": 51.649, "eval_steps_per_second": 6.456, "step": 7300 }, { "epoch": 0.11242567703711244, "grad_norm": 5.204161167144775, "learning_rate": 5.277591973244147e-06, "loss": 8.0095, "step": 7400 }, { "epoch": 0.11242567703711244, "eval_loss": 7.958820343017578, "eval_runtime": 193.5944, "eval_samples_per_second": 51.654, "eval_steps_per_second": 6.457, "step": 7400 }, { "epoch": 0.11394494294301936, "grad_norm": 7.082394123077393, "learning_rate": 5.2675585284280935e-06, "loss": 7.978, "step": 7500 }, { "epoch": 0.11394494294301936, "eval_loss": 7.932178020477295, "eval_runtime": 193.798, "eval_samples_per_second": 51.6, "eval_steps_per_second": 6.45, "step": 7500 }, { "epoch": 0.11546420884892629, "grad_norm": 8.926252365112305, "learning_rate": 5.25752508361204e-06, "loss": 7.9505, "step": 7600 }, { "epoch": 0.11546420884892629, "eval_loss": 7.882853031158447, "eval_runtime": 193.6404, "eval_samples_per_second": 51.642, "eval_steps_per_second": 6.455, "step": 7600 }, { "epoch": 0.11698347475483321, "grad_norm": 3.5671885013580322, "learning_rate": 5.247491638795986e-06, "loss": 7.9086, "step": 7700 }, { "epoch": 0.11698347475483321, "eval_loss": 7.840451717376709, "eval_runtime": 193.8911, "eval_samples_per_second": 51.575, "eval_steps_per_second": 6.447, "step": 7700 }, { "epoch": 0.11850274066074014, "grad_norm": 5.790298938751221, "learning_rate": 5.237458193979933e-06, "loss": 7.861, "step": 7800 }, { "epoch": 0.11850274066074014, "eval_loss": 7.790124416351318, "eval_runtime": 193.8987, "eval_samples_per_second": 51.573, "eval_steps_per_second": 6.447, "step": 7800 }, { "epoch": 0.12002200656664706, "grad_norm": 4.439774513244629, "learning_rate": 5.22742474916388e-06, "loss": 7.8082, "step": 7900 }, { "epoch": 0.12002200656664706, "eval_loss": 7.72878885269165, "eval_runtime": 193.9191, "eval_samples_per_second": 51.568, "eval_steps_per_second": 6.446, "step": 7900 }, { "epoch": 0.12154127247255399, "grad_norm": 3.937167167663574, "learning_rate": 5.2173913043478265e-06, "loss": 7.757, "step": 8000 }, { "epoch": 0.12154127247255399, "eval_loss": 7.679111003875732, "eval_runtime": 193.7697, "eval_samples_per_second": 51.608, "eval_steps_per_second": 6.451, "step": 8000 }, { "epoch": 0.12306053837846091, "grad_norm": 4.227074146270752, "learning_rate": 5.207357859531772e-06, "loss": 7.7088, "step": 8100 }, { "epoch": 0.12306053837846091, "eval_loss": 7.634475231170654, "eval_runtime": 193.766, "eval_samples_per_second": 51.609, "eval_steps_per_second": 6.451, "step": 8100 }, { "epoch": 0.12457980428436784, "grad_norm": 3.042202949523926, "learning_rate": 5.197324414715719e-06, "loss": 7.6639, "step": 8200 }, { "epoch": 0.12457980428436784, "eval_loss": 7.605250835418701, "eval_runtime": 193.8104, "eval_samples_per_second": 51.597, "eval_steps_per_second": 6.45, "step": 8200 }, { "epoch": 0.12609907019027475, "grad_norm": 4.436267375946045, "learning_rate": 5.187290969899666e-06, "loss": 7.6256, "step": 8300 }, { "epoch": 0.12609907019027475, "eval_loss": 7.549579620361328, "eval_runtime": 193.7749, "eval_samples_per_second": 51.606, "eval_steps_per_second": 6.451, "step": 8300 }, { "epoch": 0.12761833609618167, "grad_norm": 3.9829390048980713, "learning_rate": 5.177257525083612e-06, "loss": 7.5838, "step": 8400 }, { "epoch": 0.12761833609618167, "eval_loss": 7.504631042480469, "eval_runtime": 193.7766, "eval_samples_per_second": 51.606, "eval_steps_per_second": 6.451, "step": 8400 }, { "epoch": 0.1291376020020886, "grad_norm": 3.072918176651001, "learning_rate": 5.167224080267559e-06, "loss": 7.5446, "step": 8500 }, { "epoch": 0.1291376020020886, "eval_loss": 7.467813968658447, "eval_runtime": 194.0927, "eval_samples_per_second": 51.522, "eval_steps_per_second": 6.44, "step": 8500 }, { "epoch": 0.13065686790799552, "grad_norm": 3.6358697414398193, "learning_rate": 5.157190635451505e-06, "loss": 7.5114, "step": 8600 }, { "epoch": 0.13065686790799552, "eval_loss": 7.4314045906066895, "eval_runtime": 197.5998, "eval_samples_per_second": 50.607, "eval_steps_per_second": 6.326, "step": 8600 }, { "epoch": 0.13217613381390245, "grad_norm": 2.4115982055664062, "learning_rate": 5.147157190635451e-06, "loss": 7.4749, "step": 8700 }, { "epoch": 0.13217613381390245, "eval_loss": 7.399141311645508, "eval_runtime": 193.7763, "eval_samples_per_second": 51.606, "eval_steps_per_second": 6.451, "step": 8700 }, { "epoch": 0.1336953997198094, "grad_norm": 3.8994717597961426, "learning_rate": 5.137123745819398e-06, "loss": 7.4388, "step": 8800 }, { "epoch": 0.1336953997198094, "eval_loss": 7.3639349937438965, "eval_runtime": 193.6905, "eval_samples_per_second": 51.629, "eval_steps_per_second": 6.454, "step": 8800 }, { "epoch": 0.13521466562571632, "grad_norm": 3.6934337615966797, "learning_rate": 5.127090301003345e-06, "loss": 7.4034, "step": 8900 }, { "epoch": 0.13521466562571632, "eval_loss": 7.327520370483398, "eval_runtime": 193.686, "eval_samples_per_second": 51.63, "eval_steps_per_second": 6.454, "step": 8900 }, { "epoch": 0.13673393153162325, "grad_norm": 3.4741897583007812, "learning_rate": 5.117056856187291e-06, "loss": 7.3684, "step": 9000 }, { "epoch": 0.13673393153162325, "eval_loss": 7.294392108917236, "eval_runtime": 193.3544, "eval_samples_per_second": 51.719, "eval_steps_per_second": 6.465, "step": 9000 }, { "epoch": 0.13825319743753017, "grad_norm": 4.130598545074463, "learning_rate": 5.1070234113712375e-06, "loss": 7.3363, "step": 9100 }, { "epoch": 0.13825319743753017, "eval_loss": 7.256102561950684, "eval_runtime": 193.8255, "eval_samples_per_second": 51.593, "eval_steps_per_second": 6.449, "step": 9100 }, { "epoch": 0.1397724633434371, "grad_norm": 3.8802666664123535, "learning_rate": 5.096989966555184e-06, "loss": 7.3054, "step": 9200 }, { "epoch": 0.1397724633434371, "eval_loss": 7.220945358276367, "eval_runtime": 193.6267, "eval_samples_per_second": 51.646, "eval_steps_per_second": 6.456, "step": 9200 }, { "epoch": 0.14129172924934402, "grad_norm": 3.072411298751831, "learning_rate": 5.08695652173913e-06, "loss": 7.2674, "step": 9300 }, { "epoch": 0.14129172924934402, "eval_loss": 7.181826591491699, "eval_runtime": 193.6566, "eval_samples_per_second": 51.638, "eval_steps_per_second": 6.455, "step": 9300 }, { "epoch": 0.14281099515525095, "grad_norm": 4.051361560821533, "learning_rate": 5.076923076923077e-06, "loss": 7.2294, "step": 9400 }, { "epoch": 0.14281099515525095, "eval_loss": 7.154284477233887, "eval_runtime": 193.737, "eval_samples_per_second": 51.616, "eval_steps_per_second": 6.452, "step": 9400 }, { "epoch": 0.14433026106115787, "grad_norm": 3.4815194606781006, "learning_rate": 5.066889632107024e-06, "loss": 7.1993, "step": 9500 }, { "epoch": 0.14433026106115787, "eval_loss": 7.109873294830322, "eval_runtime": 193.7018, "eval_samples_per_second": 51.626, "eval_steps_per_second": 6.453, "step": 9500 }, { "epoch": 0.1458495269670648, "grad_norm": 4.168730735778809, "learning_rate": 5.05685618729097e-06, "loss": 7.1617, "step": 9600 }, { "epoch": 0.1458495269670648, "eval_loss": 7.068033695220947, "eval_runtime": 193.7844, "eval_samples_per_second": 51.604, "eval_steps_per_second": 6.45, "step": 9600 }, { "epoch": 0.14736879287297172, "grad_norm": 4.632892608642578, "learning_rate": 5.046822742474916e-06, "loss": 7.1265, "step": 9700 }, { "epoch": 0.14736879287297172, "eval_loss": 7.029054641723633, "eval_runtime": 193.7033, "eval_samples_per_second": 51.625, "eval_steps_per_second": 6.453, "step": 9700 }, { "epoch": 0.14888805877887865, "grad_norm": 5.668432235717773, "learning_rate": 5.036789297658863e-06, "loss": 7.0973, "step": 9800 }, { "epoch": 0.14888805877887865, "eval_loss": 7.001068115234375, "eval_runtime": 193.7831, "eval_samples_per_second": 51.604, "eval_steps_per_second": 6.451, "step": 9800 }, { "epoch": 0.15040732468478557, "grad_norm": 6.07447624206543, "learning_rate": 5.02675585284281e-06, "loss": 7.0693, "step": 9900 }, { "epoch": 0.15040732468478557, "eval_loss": 6.974251747131348, "eval_runtime": 193.7698, "eval_samples_per_second": 51.608, "eval_steps_per_second": 6.451, "step": 9900 }, { "epoch": 0.1519265905906925, "grad_norm": 5.610072135925293, "learning_rate": 5.016722408026756e-06, "loss": 7.0395, "step": 10000 }, { "epoch": 0.1519265905906925, "eval_loss": 6.959811687469482, "eval_runtime": 193.7123, "eval_samples_per_second": 51.623, "eval_steps_per_second": 6.453, "step": 10000 }, { "epoch": 0.15344585649659942, "grad_norm": 4.722342491149902, "learning_rate": 5.0066889632107026e-06, "loss": 7.0146, "step": 10100 }, { "epoch": 0.15344585649659942, "eval_loss": 6.919780254364014, "eval_runtime": 193.9627, "eval_samples_per_second": 51.556, "eval_steps_per_second": 6.445, "step": 10100 }, { "epoch": 0.15496512240250634, "grad_norm": 2.454202175140381, "learning_rate": 4.996655518394649e-06, "loss": 6.979, "step": 10200 }, { "epoch": 0.15496512240250634, "eval_loss": 6.8777360916137695, "eval_runtime": 193.5746, "eval_samples_per_second": 51.66, "eval_steps_per_second": 6.457, "step": 10200 }, { "epoch": 0.15648438830841327, "grad_norm": 6.628566265106201, "learning_rate": 4.986622073578595e-06, "loss": 6.9576, "step": 10300 }, { "epoch": 0.15648438830841327, "eval_loss": 6.851335048675537, "eval_runtime": 193.6238, "eval_samples_per_second": 51.647, "eval_steps_per_second": 6.456, "step": 10300 }, { "epoch": 0.1580036542143202, "grad_norm": 4.226571559906006, "learning_rate": 4.976588628762542e-06, "loss": 6.9294, "step": 10400 }, { "epoch": 0.1580036542143202, "eval_loss": 6.837319374084473, "eval_runtime": 193.6321, "eval_samples_per_second": 51.644, "eval_steps_per_second": 6.456, "step": 10400 }, { "epoch": 0.15952292012022712, "grad_norm": 3.949143648147583, "learning_rate": 4.966555183946489e-06, "loss": 6.906, "step": 10500 }, { "epoch": 0.15952292012022712, "eval_loss": 6.798065662384033, "eval_runtime": 193.7227, "eval_samples_per_second": 51.62, "eval_steps_per_second": 6.453, "step": 10500 }, { "epoch": 0.16104218602613404, "grad_norm": 4.327299118041992, "learning_rate": 4.956521739130435e-06, "loss": 6.8789, "step": 10600 }, { "epoch": 0.16104218602613404, "eval_loss": 6.793569564819336, "eval_runtime": 193.6196, "eval_samples_per_second": 51.648, "eval_steps_per_second": 6.456, "step": 10600 }, { "epoch": 0.16256145193204097, "grad_norm": 3.8152856826782227, "learning_rate": 4.9464882943143815e-06, "loss": 6.8557, "step": 10700 }, { "epoch": 0.16256145193204097, "eval_loss": 6.754009246826172, "eval_runtime": 193.7319, "eval_samples_per_second": 51.618, "eval_steps_per_second": 6.452, "step": 10700 }, { "epoch": 0.1640807178379479, "grad_norm": 4.621021747589111, "learning_rate": 4.936454849498328e-06, "loss": 6.8387, "step": 10800 }, { "epoch": 0.1640807178379479, "eval_loss": 6.748414993286133, "eval_runtime": 193.8639, "eval_samples_per_second": 51.583, "eval_steps_per_second": 6.448, "step": 10800 }, { "epoch": 0.16559998374385482, "grad_norm": 4.906980514526367, "learning_rate": 4.926421404682274e-06, "loss": 6.8054, "step": 10900 }, { "epoch": 0.16559998374385482, "eval_loss": 6.722209453582764, "eval_runtime": 193.5438, "eval_samples_per_second": 51.668, "eval_steps_per_second": 6.458, "step": 10900 }, { "epoch": 0.16711924964976174, "grad_norm": 3.9837253093719482, "learning_rate": 4.916387959866221e-06, "loss": 6.7829, "step": 11000 }, { "epoch": 0.16711924964976174, "eval_loss": 6.689162731170654, "eval_runtime": 193.4992, "eval_samples_per_second": 51.68, "eval_steps_per_second": 6.46, "step": 11000 }, { "epoch": 0.16863851555566867, "grad_norm": 4.781426906585693, "learning_rate": 4.906354515050168e-06, "loss": 6.7597, "step": 11100 }, { "epoch": 0.16863851555566867, "eval_loss": 6.658721923828125, "eval_runtime": 193.7146, "eval_samples_per_second": 51.622, "eval_steps_per_second": 6.453, "step": 11100 }, { "epoch": 0.1701577814615756, "grad_norm": 6.702068328857422, "learning_rate": 4.8963210702341136e-06, "loss": 6.7416, "step": 11200 }, { "epoch": 0.1701577814615756, "eval_loss": 6.642455577850342, "eval_runtime": 193.6222, "eval_samples_per_second": 51.647, "eval_steps_per_second": 6.456, "step": 11200 }, { "epoch": 0.17167704736748252, "grad_norm": 3.1839189529418945, "learning_rate": 4.88628762541806e-06, "loss": 6.7201, "step": 11300 }, { "epoch": 0.17167704736748252, "eval_loss": 6.614835262298584, "eval_runtime": 193.735, "eval_samples_per_second": 51.617, "eval_steps_per_second": 6.452, "step": 11300 }, { "epoch": 0.17319631327338944, "grad_norm": 5.427370071411133, "learning_rate": 4.876254180602007e-06, "loss": 6.6993, "step": 11400 }, { "epoch": 0.17319631327338944, "eval_loss": 6.601010799407959, "eval_runtime": 193.6047, "eval_samples_per_second": 51.652, "eval_steps_per_second": 6.456, "step": 11400 }, { "epoch": 0.17471557917929637, "grad_norm": 4.759448051452637, "learning_rate": 4.866220735785953e-06, "loss": 6.6775, "step": 11500 }, { "epoch": 0.17471557917929637, "eval_loss": 6.579466342926025, "eval_runtime": 193.6792, "eval_samples_per_second": 51.632, "eval_steps_per_second": 6.454, "step": 11500 }, { "epoch": 0.1762348450852033, "grad_norm": 3.7401344776153564, "learning_rate": 4.8561872909699e-06, "loss": 6.6565, "step": 11600 }, { "epoch": 0.1762348450852033, "eval_loss": 6.576225757598877, "eval_runtime": 194.0381, "eval_samples_per_second": 51.536, "eval_steps_per_second": 6.442, "step": 11600 }, { "epoch": 0.17775411099111021, "grad_norm": 5.589729309082031, "learning_rate": 4.8461538461538465e-06, "loss": 6.6419, "step": 11700 }, { "epoch": 0.17775411099111021, "eval_loss": 6.542896270751953, "eval_runtime": 193.798, "eval_samples_per_second": 51.6, "eval_steps_per_second": 6.45, "step": 11700 }, { "epoch": 0.17927337689701714, "grad_norm": 4.623971939086914, "learning_rate": 4.8361204013377925e-06, "loss": 6.6199, "step": 11800 }, { "epoch": 0.17927337689701714, "eval_loss": 6.519240856170654, "eval_runtime": 193.739, "eval_samples_per_second": 51.616, "eval_steps_per_second": 6.452, "step": 11800 }, { "epoch": 0.18079264280292406, "grad_norm": 4.685464859008789, "learning_rate": 4.826086956521739e-06, "loss": 6.6012, "step": 11900 }, { "epoch": 0.18079264280292406, "eval_loss": 6.489596843719482, "eval_runtime": 193.7453, "eval_samples_per_second": 51.614, "eval_steps_per_second": 6.452, "step": 11900 }, { "epoch": 0.182311908708831, "grad_norm": 3.054800271987915, "learning_rate": 4.816053511705686e-06, "loss": 6.5813, "step": 12000 }, { "epoch": 0.182311908708831, "eval_loss": 6.497661113739014, "eval_runtime": 193.7138, "eval_samples_per_second": 51.623, "eval_steps_per_second": 6.453, "step": 12000 }, { "epoch": 0.1838311746147379, "grad_norm": 3.547619342803955, "learning_rate": 4.806020066889633e-06, "loss": 6.5623, "step": 12100 }, { "epoch": 0.1838311746147379, "eval_loss": 6.468958377838135, "eval_runtime": 193.766, "eval_samples_per_second": 51.609, "eval_steps_per_second": 6.451, "step": 12100 }, { "epoch": 0.18535044052064484, "grad_norm": 4.697444915771484, "learning_rate": 4.795986622073579e-06, "loss": 6.5448, "step": 12200 }, { "epoch": 0.18535044052064484, "eval_loss": 6.436464309692383, "eval_runtime": 193.7105, "eval_samples_per_second": 51.623, "eval_steps_per_second": 6.453, "step": 12200 }, { "epoch": 0.18686970642655176, "grad_norm": 4.79019021987915, "learning_rate": 4.785953177257525e-06, "loss": 6.525, "step": 12300 }, { "epoch": 0.18686970642655176, "eval_loss": 6.422084331512451, "eval_runtime": 193.7424, "eval_samples_per_second": 51.615, "eval_steps_per_second": 6.452, "step": 12300 }, { "epoch": 0.1883889723324587, "grad_norm": 3.7939579486846924, "learning_rate": 4.775919732441472e-06, "loss": 6.5131, "step": 12400 }, { "epoch": 0.1883889723324587, "eval_loss": 6.425159931182861, "eval_runtime": 193.6634, "eval_samples_per_second": 51.636, "eval_steps_per_second": 6.454, "step": 12400 }, { "epoch": 0.1899082382383656, "grad_norm": 3.271348714828491, "learning_rate": 4.765886287625418e-06, "loss": 6.4978, "step": 12500 }, { "epoch": 0.1899082382383656, "eval_loss": 6.395818710327148, "eval_runtime": 193.5299, "eval_samples_per_second": 51.672, "eval_steps_per_second": 6.459, "step": 12500 }, { "epoch": 0.19142750414427254, "grad_norm": 4.119296073913574, "learning_rate": 4.755852842809365e-06, "loss": 6.483, "step": 12600 }, { "epoch": 0.19142750414427254, "eval_loss": 6.397064208984375, "eval_runtime": 193.5731, "eval_samples_per_second": 51.66, "eval_steps_per_second": 6.458, "step": 12600 }, { "epoch": 0.19294677005017946, "grad_norm": 3.7907373905181885, "learning_rate": 4.745819397993312e-06, "loss": 6.4639, "step": 12700 }, { "epoch": 0.19294677005017946, "eval_loss": 6.348308563232422, "eval_runtime": 193.6279, "eval_samples_per_second": 51.645, "eval_steps_per_second": 6.456, "step": 12700 }, { "epoch": 0.19446603595608639, "grad_norm": 3.8455281257629395, "learning_rate": 4.7357859531772575e-06, "loss": 6.4471, "step": 12800 }, { "epoch": 0.19446603595608639, "eval_loss": 6.347524642944336, "eval_runtime": 193.9737, "eval_samples_per_second": 51.553, "eval_steps_per_second": 6.444, "step": 12800 }, { "epoch": 0.1959853018619933, "grad_norm": 3.5916056632995605, "learning_rate": 4.725752508361204e-06, "loss": 6.4303, "step": 12900 }, { "epoch": 0.1959853018619933, "eval_loss": 6.33302640914917, "eval_runtime": 193.6018, "eval_samples_per_second": 51.652, "eval_steps_per_second": 6.457, "step": 12900 }, { "epoch": 0.19750456776790024, "grad_norm": 3.734985589981079, "learning_rate": 4.715719063545151e-06, "loss": 6.4158, "step": 13000 }, { "epoch": 0.19750456776790024, "eval_loss": 6.3155083656311035, "eval_runtime": 193.6987, "eval_samples_per_second": 51.627, "eval_steps_per_second": 6.453, "step": 13000 }, { "epoch": 0.19902383367380716, "grad_norm": 3.579678535461426, "learning_rate": 4.705685618729097e-06, "loss": 6.4024, "step": 13100 }, { "epoch": 0.19902383367380716, "eval_loss": 6.29377555847168, "eval_runtime": 193.6889, "eval_samples_per_second": 51.629, "eval_steps_per_second": 6.454, "step": 13100 }, { "epoch": 0.20054309957971408, "grad_norm": 4.257501125335693, "learning_rate": 4.695652173913044e-06, "loss": 6.3894, "step": 13200 }, { "epoch": 0.20054309957971408, "eval_loss": 6.288681507110596, "eval_runtime": 193.6264, "eval_samples_per_second": 51.646, "eval_steps_per_second": 6.456, "step": 13200 }, { "epoch": 0.202062365485621, "grad_norm": 3.8430824279785156, "learning_rate": 4.6856187290969905e-06, "loss": 6.3715, "step": 13300 }, { "epoch": 0.202062365485621, "eval_loss": 6.247255802154541, "eval_runtime": 193.7628, "eval_samples_per_second": 51.609, "eval_steps_per_second": 6.451, "step": 13300 }, { "epoch": 0.20358163139152793, "grad_norm": 3.9459517002105713, "learning_rate": 4.675585284280936e-06, "loss": 6.3583, "step": 13400 }, { "epoch": 0.20358163139152793, "eval_loss": 6.250117301940918, "eval_runtime": 193.7665, "eval_samples_per_second": 51.609, "eval_steps_per_second": 6.451, "step": 13400 }, { "epoch": 0.20510089729743486, "grad_norm": 3.475034475326538, "learning_rate": 4.665551839464883e-06, "loss": 6.3431, "step": 13500 }, { "epoch": 0.20510089729743486, "eval_loss": 6.22703742980957, "eval_runtime": 193.7742, "eval_samples_per_second": 51.606, "eval_steps_per_second": 6.451, "step": 13500 }, { "epoch": 0.20662016320334178, "grad_norm": 4.17089319229126, "learning_rate": 4.65551839464883e-06, "loss": 6.3288, "step": 13600 }, { "epoch": 0.20662016320334178, "eval_loss": 6.230484962463379, "eval_runtime": 193.7687, "eval_samples_per_second": 51.608, "eval_steps_per_second": 6.451, "step": 13600 }, { "epoch": 0.2081394291092487, "grad_norm": 2.118986129760742, "learning_rate": 4.645484949832776e-06, "loss": 6.3169, "step": 13700 }, { "epoch": 0.2081394291092487, "eval_loss": 6.206001281738281, "eval_runtime": 193.8247, "eval_samples_per_second": 51.593, "eval_steps_per_second": 6.449, "step": 13700 }, { "epoch": 0.20965869501515563, "grad_norm": 4.810153007507324, "learning_rate": 4.635451505016723e-06, "loss": 6.3032, "step": 13800 }, { "epoch": 0.20965869501515563, "eval_loss": 6.18707275390625, "eval_runtime": 193.7442, "eval_samples_per_second": 51.614, "eval_steps_per_second": 6.452, "step": 13800 }, { "epoch": 0.21117796092106256, "grad_norm": 3.7797763347625732, "learning_rate": 4.625418060200669e-06, "loss": 6.2918, "step": 13900 }, { "epoch": 0.21117796092106256, "eval_loss": 6.1838603019714355, "eval_runtime": 193.7881, "eval_samples_per_second": 51.603, "eval_steps_per_second": 6.45, "step": 13900 }, { "epoch": 0.21269722682696948, "grad_norm": 4.482378959655762, "learning_rate": 4.615384615384616e-06, "loss": 6.2757, "step": 14000 }, { "epoch": 0.21269722682696948, "eval_loss": 6.161965847015381, "eval_runtime": 193.7356, "eval_samples_per_second": 51.617, "eval_steps_per_second": 6.452, "step": 14000 }, { "epoch": 0.2142164927328764, "grad_norm": 4.001418590545654, "learning_rate": 4.605351170568562e-06, "loss": 6.2647, "step": 14100 }, { "epoch": 0.2142164927328764, "eval_loss": 6.15457820892334, "eval_runtime": 193.9234, "eval_samples_per_second": 51.567, "eval_steps_per_second": 6.446, "step": 14100 }, { "epoch": 0.21573575863878333, "grad_norm": 3.8982086181640625, "learning_rate": 4.595317725752509e-06, "loss": 6.2545, "step": 14200 }, { "epoch": 0.21573575863878333, "eval_loss": 6.143900394439697, "eval_runtime": 193.7808, "eval_samples_per_second": 51.605, "eval_steps_per_second": 6.451, "step": 14200 }, { "epoch": 0.21725502454469026, "grad_norm": 3.459050416946411, "learning_rate": 4.585284280936456e-06, "loss": 6.2398, "step": 14300 }, { "epoch": 0.21725502454469026, "eval_loss": 6.131939888000488, "eval_runtime": 193.8856, "eval_samples_per_second": 51.577, "eval_steps_per_second": 6.447, "step": 14300 }, { "epoch": 0.21877429045059718, "grad_norm": 3.335505962371826, "learning_rate": 4.5752508361204015e-06, "loss": 6.2327, "step": 14400 }, { "epoch": 0.21877429045059718, "eval_loss": 6.106751918792725, "eval_runtime": 193.844, "eval_samples_per_second": 51.588, "eval_steps_per_second": 6.448, "step": 14400 }, { "epoch": 0.2202935563565041, "grad_norm": 3.845909357070923, "learning_rate": 4.565217391304348e-06, "loss": 6.2144, "step": 14500 }, { "epoch": 0.2202935563565041, "eval_loss": 6.094777584075928, "eval_runtime": 193.5802, "eval_samples_per_second": 51.658, "eval_steps_per_second": 6.457, "step": 14500 }, { "epoch": 0.22181282226241103, "grad_norm": 3.846149206161499, "learning_rate": 4.555183946488295e-06, "loss": 6.2022, "step": 14600 }, { "epoch": 0.22181282226241103, "eval_loss": 6.085541248321533, "eval_runtime": 193.8098, "eval_samples_per_second": 51.597, "eval_steps_per_second": 6.45, "step": 14600 }, { "epoch": 0.22333208816831795, "grad_norm": 3.50091814994812, "learning_rate": 4.545150501672241e-06, "loss": 6.1915, "step": 14700 }, { "epoch": 0.22333208816831795, "eval_loss": 6.058828830718994, "eval_runtime": 193.4542, "eval_samples_per_second": 51.692, "eval_steps_per_second": 6.461, "step": 14700 }, { "epoch": 0.22485135407422488, "grad_norm": 4.312457084655762, "learning_rate": 4.535117056856188e-06, "loss": 6.1776, "step": 14800 }, { "epoch": 0.22485135407422488, "eval_loss": 6.067806720733643, "eval_runtime": 193.5484, "eval_samples_per_second": 51.667, "eval_steps_per_second": 6.458, "step": 14800 }, { "epoch": 0.2263706199801318, "grad_norm": 3.6476268768310547, "learning_rate": 4.5250836120401345e-06, "loss": 6.1703, "step": 14900 }, { "epoch": 0.2263706199801318, "eval_loss": 6.045175552368164, "eval_runtime": 193.3937, "eval_samples_per_second": 51.708, "eval_steps_per_second": 6.463, "step": 14900 }, { "epoch": 0.22788988588603873, "grad_norm": 4.1377739906311035, "learning_rate": 4.51505016722408e-06, "loss": 6.1577, "step": 15000 }, { "epoch": 0.22788988588603873, "eval_loss": 6.038886547088623, "eval_runtime": 193.5749, "eval_samples_per_second": 51.66, "eval_steps_per_second": 6.457, "step": 15000 }, { "epoch": 0.22940915179194565, "grad_norm": 4.192631244659424, "learning_rate": 4.505016722408027e-06, "loss": 6.1477, "step": 15100 }, { "epoch": 0.22940915179194565, "eval_loss": 6.030833721160889, "eval_runtime": 193.5655, "eval_samples_per_second": 51.662, "eval_steps_per_second": 6.458, "step": 15100 }, { "epoch": 0.23092841769785258, "grad_norm": 3.234416961669922, "learning_rate": 4.494983277591973e-06, "loss": 6.1363, "step": 15200 }, { "epoch": 0.23092841769785258, "eval_loss": 6.008749008178711, "eval_runtime": 193.3663, "eval_samples_per_second": 51.715, "eval_steps_per_second": 6.464, "step": 15200 }, { "epoch": 0.2324476836037595, "grad_norm": 4.860428810119629, "learning_rate": 4.48494983277592e-06, "loss": 6.1298, "step": 15300 }, { "epoch": 0.2324476836037595, "eval_loss": 5.996873378753662, "eval_runtime": 193.6007, "eval_samples_per_second": 51.653, "eval_steps_per_second": 6.457, "step": 15300 }, { "epoch": 0.23396694950966643, "grad_norm": 4.0561323165893555, "learning_rate": 4.474916387959866e-06, "loss": 6.1122, "step": 15400 }, { "epoch": 0.23396694950966643, "eval_loss": 5.984120845794678, "eval_runtime": 193.5721, "eval_samples_per_second": 51.66, "eval_steps_per_second": 6.458, "step": 15400 }, { "epoch": 0.23548621541557335, "grad_norm": 2.9819724559783936, "learning_rate": 4.4648829431438125e-06, "loss": 6.1024, "step": 15500 }, { "epoch": 0.23548621541557335, "eval_loss": 5.982254981994629, "eval_runtime": 193.7425, "eval_samples_per_second": 51.615, "eval_steps_per_second": 6.452, "step": 15500 }, { "epoch": 0.23700548132148028, "grad_norm": 3.733194351196289, "learning_rate": 4.454849498327759e-06, "loss": 6.0924, "step": 15600 }, { "epoch": 0.23700548132148028, "eval_loss": 5.969741344451904, "eval_runtime": 193.8217, "eval_samples_per_second": 51.594, "eval_steps_per_second": 6.449, "step": 15600 }, { "epoch": 0.2385247472273872, "grad_norm": 5.688018321990967, "learning_rate": 4.444816053511705e-06, "loss": 6.0871, "step": 15700 }, { "epoch": 0.2385247472273872, "eval_loss": 5.9461445808410645, "eval_runtime": 193.8283, "eval_samples_per_second": 51.592, "eval_steps_per_second": 6.449, "step": 15700 }, { "epoch": 0.24004401313329413, "grad_norm": 2.9404726028442383, "learning_rate": 4.434782608695652e-06, "loss": 6.0706, "step": 15800 }, { "epoch": 0.24004401313329413, "eval_loss": 5.936134338378906, "eval_runtime": 193.8629, "eval_samples_per_second": 51.583, "eval_steps_per_second": 6.448, "step": 15800 }, { "epoch": 0.24156327903920105, "grad_norm": 4.436812877655029, "learning_rate": 4.424749163879599e-06, "loss": 6.0652, "step": 15900 }, { "epoch": 0.24156327903920105, "eval_loss": 5.9289655685424805, "eval_runtime": 193.8246, "eval_samples_per_second": 51.593, "eval_steps_per_second": 6.449, "step": 15900 }, { "epoch": 0.24308254494510798, "grad_norm": 4.113779544830322, "learning_rate": 4.414715719063545e-06, "loss": 6.0497, "step": 16000 }, { "epoch": 0.24308254494510798, "eval_loss": 5.926904678344727, "eval_runtime": 194.0466, "eval_samples_per_second": 51.534, "eval_steps_per_second": 6.442, "step": 16000 }, { "epoch": 0.2446018108510149, "grad_norm": 3.2827975749969482, "learning_rate": 4.404682274247491e-06, "loss": 6.0365, "step": 16100 }, { "epoch": 0.2446018108510149, "eval_loss": 5.9020185470581055, "eval_runtime": 193.842, "eval_samples_per_second": 51.588, "eval_steps_per_second": 6.449, "step": 16100 }, { "epoch": 0.24612107675692182, "grad_norm": 3.8352739810943604, "learning_rate": 4.394648829431438e-06, "loss": 6.0305, "step": 16200 }, { "epoch": 0.24612107675692182, "eval_loss": 5.900501251220703, "eval_runtime": 193.7914, "eval_samples_per_second": 51.602, "eval_steps_per_second": 6.45, "step": 16200 }, { "epoch": 0.24764034266282875, "grad_norm": 3.2179617881774902, "learning_rate": 4.384615384615384e-06, "loss": 6.0166, "step": 16300 }, { "epoch": 0.24764034266282875, "eval_loss": 5.891448497772217, "eval_runtime": 193.74, "eval_samples_per_second": 51.616, "eval_steps_per_second": 6.452, "step": 16300 }, { "epoch": 0.24915960856873567, "grad_norm": 3.446993112564087, "learning_rate": 4.374581939799331e-06, "loss": 6.0121, "step": 16400 }, { "epoch": 0.24915960856873567, "eval_loss": 5.874625205993652, "eval_runtime": 193.8125, "eval_samples_per_second": 51.596, "eval_steps_per_second": 6.45, "step": 16400 }, { "epoch": 0.2506788744746426, "grad_norm": 4.3962578773498535, "learning_rate": 4.364548494983278e-06, "loss": 6.0029, "step": 16500 }, { "epoch": 0.2506788744746426, "eval_loss": 5.884474754333496, "eval_runtime": 193.754, "eval_samples_per_second": 51.612, "eval_steps_per_second": 6.451, "step": 16500 }, { "epoch": 0.2521981403805495, "grad_norm": 4.52181339263916, "learning_rate": 4.354515050167224e-06, "loss": 5.9925, "step": 16600 }, { "epoch": 0.2521981403805495, "eval_loss": 5.867855548858643, "eval_runtime": 193.7648, "eval_samples_per_second": 51.609, "eval_steps_per_second": 6.451, "step": 16600 }, { "epoch": 0.25371740628645645, "grad_norm": 3.847750186920166, "learning_rate": 4.34448160535117e-06, "loss": 5.9839, "step": 16700 }, { "epoch": 0.25371740628645645, "eval_loss": 5.851235389709473, "eval_runtime": 193.6623, "eval_samples_per_second": 51.636, "eval_steps_per_second": 6.455, "step": 16700 }, { "epoch": 0.25523667219236335, "grad_norm": 2.9024147987365723, "learning_rate": 4.334448160535117e-06, "loss": 5.9744, "step": 16800 }, { "epoch": 0.25523667219236335, "eval_loss": 5.854368686676025, "eval_runtime": 193.8613, "eval_samples_per_second": 51.583, "eval_steps_per_second": 6.448, "step": 16800 }, { "epoch": 0.2567559380982703, "grad_norm": 3.2213125228881836, "learning_rate": 4.324414715719064e-06, "loss": 5.9653, "step": 16900 }, { "epoch": 0.2567559380982703, "eval_loss": 5.836233139038086, "eval_runtime": 193.5642, "eval_samples_per_second": 51.662, "eval_steps_per_second": 6.458, "step": 16900 }, { "epoch": 0.2582752040041772, "grad_norm": 4.198850631713867, "learning_rate": 4.31438127090301e-06, "loss": 5.957, "step": 17000 }, { "epoch": 0.2582752040041772, "eval_loss": 5.818154811859131, "eval_runtime": 193.5777, "eval_samples_per_second": 51.659, "eval_steps_per_second": 6.457, "step": 17000 }, { "epoch": 0.25979446991008415, "grad_norm": 2.9214396476745605, "learning_rate": 4.3043478260869565e-06, "loss": 5.9417, "step": 17100 }, { "epoch": 0.25979446991008415, "eval_loss": 5.829405784606934, "eval_runtime": 193.6055, "eval_samples_per_second": 51.651, "eval_steps_per_second": 6.456, "step": 17100 }, { "epoch": 0.26131373581599104, "grad_norm": 3.7691545486450195, "learning_rate": 4.294314381270903e-06, "loss": 5.934, "step": 17200 }, { "epoch": 0.26131373581599104, "eval_loss": 5.794999122619629, "eval_runtime": 193.5657, "eval_samples_per_second": 51.662, "eval_steps_per_second": 6.458, "step": 17200 }, { "epoch": 0.262833001721898, "grad_norm": 4.013944625854492, "learning_rate": 4.284280936454849e-06, "loss": 5.9269, "step": 17300 }, { "epoch": 0.262833001721898, "eval_loss": 5.787894248962402, "eval_runtime": 193.7935, "eval_samples_per_second": 51.601, "eval_steps_per_second": 6.45, "step": 17300 }, { "epoch": 0.2643522676278049, "grad_norm": 3.784191370010376, "learning_rate": 4.274247491638796e-06, "loss": 5.9224, "step": 17400 }, { "epoch": 0.2643522676278049, "eval_loss": 5.795870780944824, "eval_runtime": 193.6051, "eval_samples_per_second": 51.652, "eval_steps_per_second": 6.456, "step": 17400 }, { "epoch": 0.26587153353371185, "grad_norm": 4.354425430297852, "learning_rate": 4.264214046822743e-06, "loss": 5.909, "step": 17500 }, { "epoch": 0.26587153353371185, "eval_loss": 5.785282611846924, "eval_runtime": 193.5413, "eval_samples_per_second": 51.669, "eval_steps_per_second": 6.459, "step": 17500 }, { "epoch": 0.2673907994396188, "grad_norm": 3.2807064056396484, "learning_rate": 4.254180602006689e-06, "loss": 5.9017, "step": 17600 }, { "epoch": 0.2673907994396188, "eval_loss": 5.772453308105469, "eval_runtime": 193.741, "eval_samples_per_second": 51.615, "eval_steps_per_second": 6.452, "step": 17600 }, { "epoch": 0.2689100653455257, "grad_norm": 3.0385000705718994, "learning_rate": 4.244147157190635e-06, "loss": 5.8906, "step": 17700 }, { "epoch": 0.2689100653455257, "eval_loss": 5.765667915344238, "eval_runtime": 193.8777, "eval_samples_per_second": 51.579, "eval_steps_per_second": 6.447, "step": 17700 }, { "epoch": 0.27042933125143265, "grad_norm": 2.746528148651123, "learning_rate": 4.234113712374582e-06, "loss": 5.8847, "step": 17800 }, { "epoch": 0.27042933125143265, "eval_loss": 5.7541351318359375, "eval_runtime": 193.9065, "eval_samples_per_second": 51.571, "eval_steps_per_second": 6.446, "step": 17800 }, { "epoch": 0.27194859715733954, "grad_norm": 3.3728785514831543, "learning_rate": 4.224080267558528e-06, "loss": 5.8769, "step": 17900 }, { "epoch": 0.27194859715733954, "eval_loss": 5.7371392250061035, "eval_runtime": 193.8325, "eval_samples_per_second": 51.591, "eval_steps_per_second": 6.449, "step": 17900 }, { "epoch": 0.2734678630632465, "grad_norm": 3.4341955184936523, "learning_rate": 4.214046822742475e-06, "loss": 5.8711, "step": 18000 }, { "epoch": 0.2734678630632465, "eval_loss": 5.715305328369141, "eval_runtime": 193.9066, "eval_samples_per_second": 51.571, "eval_steps_per_second": 6.446, "step": 18000 }, { "epoch": 0.2749871289691534, "grad_norm": 4.6379313468933105, "learning_rate": 4.2040133779264216e-06, "loss": 5.861, "step": 18100 }, { "epoch": 0.2749871289691534, "eval_loss": 5.71766996383667, "eval_runtime": 193.7937, "eval_samples_per_second": 51.601, "eval_steps_per_second": 6.45, "step": 18100 }, { "epoch": 0.27650639487506035, "grad_norm": 3.901848554611206, "learning_rate": 4.1939799331103675e-06, "loss": 5.855, "step": 18200 }, { "epoch": 0.27650639487506035, "eval_loss": 5.7228240966796875, "eval_runtime": 194.0011, "eval_samples_per_second": 51.546, "eval_steps_per_second": 6.443, "step": 18200 }, { "epoch": 0.27802566078096724, "grad_norm": 2.7498176097869873, "learning_rate": 4.183946488294314e-06, "loss": 5.8388, "step": 18300 }, { "epoch": 0.27802566078096724, "eval_loss": 5.699355125427246, "eval_runtime": 193.8501, "eval_samples_per_second": 51.586, "eval_steps_per_second": 6.448, "step": 18300 }, { "epoch": 0.2795449266868742, "grad_norm": 3.4318690299987793, "learning_rate": 4.173913043478261e-06, "loss": 5.8356, "step": 18400 }, { "epoch": 0.2795449266868742, "eval_loss": 5.697088241577148, "eval_runtime": 193.7892, "eval_samples_per_second": 51.602, "eval_steps_per_second": 6.45, "step": 18400 }, { "epoch": 0.2810641925927811, "grad_norm": 3.5657687187194824, "learning_rate": 4.163879598662208e-06, "loss": 5.8233, "step": 18500 }, { "epoch": 0.2810641925927811, "eval_loss": 5.683408260345459, "eval_runtime": 193.7355, "eval_samples_per_second": 51.617, "eval_steps_per_second": 6.452, "step": 18500 }, { "epoch": 0.28258345849868804, "grad_norm": 4.344554424285889, "learning_rate": 4.153846153846154e-06, "loss": 5.8187, "step": 18600 }, { "epoch": 0.28258345849868804, "eval_loss": 5.675909042358398, "eval_runtime": 193.8113, "eval_samples_per_second": 51.597, "eval_steps_per_second": 6.45, "step": 18600 }, { "epoch": 0.28410272440459494, "grad_norm": 3.3455545902252197, "learning_rate": 4.1438127090301005e-06, "loss": 5.8091, "step": 18700 }, { "epoch": 0.28410272440459494, "eval_loss": 5.665746688842773, "eval_runtime": 194.1351, "eval_samples_per_second": 51.511, "eval_steps_per_second": 6.439, "step": 18700 }, { "epoch": 0.2856219903105019, "grad_norm": 3.412184476852417, "learning_rate": 4.133779264214047e-06, "loss": 5.8026, "step": 18800 }, { "epoch": 0.2856219903105019, "eval_loss": 5.6578497886657715, "eval_runtime": 193.7841, "eval_samples_per_second": 51.604, "eval_steps_per_second": 6.45, "step": 18800 }, { "epoch": 0.2871412562164088, "grad_norm": 3.717855215072632, "learning_rate": 4.123745819397993e-06, "loss": 5.7957, "step": 18900 }, { "epoch": 0.2871412562164088, "eval_loss": 5.665693759918213, "eval_runtime": 193.7661, "eval_samples_per_second": 51.609, "eval_steps_per_second": 6.451, "step": 18900 }, { "epoch": 0.28866052212231574, "grad_norm": 3.876275062561035, "learning_rate": 4.11371237458194e-06, "loss": 5.7846, "step": 19000 }, { "epoch": 0.28866052212231574, "eval_loss": 5.648958206176758, "eval_runtime": 193.7195, "eval_samples_per_second": 51.621, "eval_steps_per_second": 6.453, "step": 19000 }, { "epoch": 0.29017978802822264, "grad_norm": 3.8186490535736084, "learning_rate": 4.103678929765887e-06, "loss": 5.7777, "step": 19100 }, { "epoch": 0.29017978802822264, "eval_loss": 5.629169940948486, "eval_runtime": 193.5999, "eval_samples_per_second": 51.653, "eval_steps_per_second": 6.457, "step": 19100 }, { "epoch": 0.2916990539341296, "grad_norm": 5.3280839920043945, "learning_rate": 4.0936454849498326e-06, "loss": 5.77, "step": 19200 }, { "epoch": 0.2916990539341296, "eval_loss": 5.620713233947754, "eval_runtime": 193.6523, "eval_samples_per_second": 51.639, "eval_steps_per_second": 6.455, "step": 19200 }, { "epoch": 0.2932183198400365, "grad_norm": 3.260324478149414, "learning_rate": 4.083612040133779e-06, "loss": 5.7611, "step": 19300 }, { "epoch": 0.2932183198400365, "eval_loss": 5.629894733428955, "eval_runtime": 193.685, "eval_samples_per_second": 51.63, "eval_steps_per_second": 6.454, "step": 19300 }, { "epoch": 0.29473758574594344, "grad_norm": 4.145829200744629, "learning_rate": 4.073578595317726e-06, "loss": 5.7538, "step": 19400 }, { "epoch": 0.29473758574594344, "eval_loss": 5.6320037841796875, "eval_runtime": 193.4857, "eval_samples_per_second": 51.683, "eval_steps_per_second": 6.46, "step": 19400 }, { "epoch": 0.29625685165185034, "grad_norm": 4.071881294250488, "learning_rate": 4.063545150501672e-06, "loss": 5.745, "step": 19500 }, { "epoch": 0.29625685165185034, "eval_loss": 5.607526779174805, "eval_runtime": 193.5349, "eval_samples_per_second": 51.67, "eval_steps_per_second": 6.459, "step": 19500 }, { "epoch": 0.2977761175577573, "grad_norm": 3.4075276851654053, "learning_rate": 4.053511705685619e-06, "loss": 5.7404, "step": 19600 }, { "epoch": 0.2977761175577573, "eval_loss": 5.603940010070801, "eval_runtime": 193.5688, "eval_samples_per_second": 51.661, "eval_steps_per_second": 6.458, "step": 19600 }, { "epoch": 0.2992953834636642, "grad_norm": 3.371760129928589, "learning_rate": 4.0434782608695655e-06, "loss": 5.7343, "step": 19700 }, { "epoch": 0.2992953834636642, "eval_loss": 5.597903728485107, "eval_runtime": 193.9636, "eval_samples_per_second": 51.556, "eval_steps_per_second": 6.445, "step": 19700 }, { "epoch": 0.30081464936957114, "grad_norm": 3.1230831146240234, "learning_rate": 4.0334448160535115e-06, "loss": 5.7284, "step": 19800 }, { "epoch": 0.30081464936957114, "eval_loss": 5.580268859863281, "eval_runtime": 194.0245, "eval_samples_per_second": 51.54, "eval_steps_per_second": 6.442, "step": 19800 }, { "epoch": 0.30233391527547804, "grad_norm": 3.339742422103882, "learning_rate": 4.023411371237458e-06, "loss": 5.7206, "step": 19900 }, { "epoch": 0.30233391527547804, "eval_loss": 5.571849822998047, "eval_runtime": 193.8323, "eval_samples_per_second": 51.591, "eval_steps_per_second": 6.449, "step": 19900 }, { "epoch": 0.303853181181385, "grad_norm": 3.2297468185424805, "learning_rate": 4.013377926421405e-06, "loss": 5.7086, "step": 20000 }, { "epoch": 0.303853181181385, "eval_loss": 5.5632548332214355, "eval_runtime": 193.7563, "eval_samples_per_second": 51.611, "eval_steps_per_second": 6.451, "step": 20000 }, { "epoch": 0.3053724470872919, "grad_norm": 2.8698532581329346, "learning_rate": 4.003344481605351e-06, "loss": 5.7027, "step": 20100 }, { "epoch": 0.3053724470872919, "eval_loss": 5.559244632720947, "eval_runtime": 193.8672, "eval_samples_per_second": 51.582, "eval_steps_per_second": 6.448, "step": 20100 }, { "epoch": 0.30689171299319884, "grad_norm": 2.990452289581299, "learning_rate": 3.993311036789298e-06, "loss": 5.6953, "step": 20200 }, { "epoch": 0.30689171299319884, "eval_loss": 5.560790061950684, "eval_runtime": 193.8061, "eval_samples_per_second": 51.598, "eval_steps_per_second": 6.45, "step": 20200 }, { "epoch": 0.30841097889910574, "grad_norm": 3.821631669998169, "learning_rate": 3.9832775919732444e-06, "loss": 5.6881, "step": 20300 }, { "epoch": 0.30841097889910574, "eval_loss": 5.551888465881348, "eval_runtime": 194.0615, "eval_samples_per_second": 51.53, "eval_steps_per_second": 6.441, "step": 20300 }, { "epoch": 0.3099302448050127, "grad_norm": 3.209308624267578, "learning_rate": 3.97324414715719e-06, "loss": 5.683, "step": 20400 }, { "epoch": 0.3099302448050127, "eval_loss": 5.5436787605285645, "eval_runtime": 193.9187, "eval_samples_per_second": 51.568, "eval_steps_per_second": 6.446, "step": 20400 }, { "epoch": 0.3114495107109196, "grad_norm": 4.5453901290893555, "learning_rate": 3.963210702341137e-06, "loss": 5.6747, "step": 20500 }, { "epoch": 0.3114495107109196, "eval_loss": 5.523691177368164, "eval_runtime": 193.8312, "eval_samples_per_second": 51.591, "eval_steps_per_second": 6.449, "step": 20500 }, { "epoch": 0.31296877661682654, "grad_norm": 3.86855411529541, "learning_rate": 3.953177257525084e-06, "loss": 5.6753, "step": 20600 }, { "epoch": 0.31296877661682654, "eval_loss": 5.530142784118652, "eval_runtime": 193.8761, "eval_samples_per_second": 51.579, "eval_steps_per_second": 6.447, "step": 20600 }, { "epoch": 0.31448804252273344, "grad_norm": 3.029080390930176, "learning_rate": 3.943143812709031e-06, "loss": 5.6683, "step": 20700 }, { "epoch": 0.31448804252273344, "eval_loss": 5.527863025665283, "eval_runtime": 193.8252, "eval_samples_per_second": 51.593, "eval_steps_per_second": 6.449, "step": 20700 }, { "epoch": 0.3160073084286404, "grad_norm": 3.5344836711883545, "learning_rate": 3.9331103678929765e-06, "loss": 5.6554, "step": 20800 }, { "epoch": 0.3160073084286404, "eval_loss": 5.510525226593018, "eval_runtime": 193.8394, "eval_samples_per_second": 51.589, "eval_steps_per_second": 6.449, "step": 20800 }, { "epoch": 0.3175265743345473, "grad_norm": 3.153604507446289, "learning_rate": 3.923076923076923e-06, "loss": 5.6508, "step": 20900 }, { "epoch": 0.3175265743345473, "eval_loss": 5.508999824523926, "eval_runtime": 194.0796, "eval_samples_per_second": 51.525, "eval_steps_per_second": 6.441, "step": 20900 }, { "epoch": 0.31904584024045424, "grad_norm": 3.87959623336792, "learning_rate": 3.91304347826087e-06, "loss": 5.644, "step": 21000 }, { "epoch": 0.31904584024045424, "eval_loss": 5.511682987213135, "eval_runtime": 193.8814, "eval_samples_per_second": 51.578, "eval_steps_per_second": 6.447, "step": 21000 }, { "epoch": 0.32056510614636113, "grad_norm": 3.9517741203308105, "learning_rate": 3.903010033444816e-06, "loss": 5.6387, "step": 21100 }, { "epoch": 0.32056510614636113, "eval_loss": 5.499752521514893, "eval_runtime": 193.8141, "eval_samples_per_second": 51.596, "eval_steps_per_second": 6.449, "step": 21100 }, { "epoch": 0.3220843720522681, "grad_norm": 3.191702127456665, "learning_rate": 3.892976588628763e-06, "loss": 5.6333, "step": 21200 }, { "epoch": 0.3220843720522681, "eval_loss": 5.476820945739746, "eval_runtime": 193.9667, "eval_samples_per_second": 51.555, "eval_steps_per_second": 6.444, "step": 21200 }, { "epoch": 0.323603637958175, "grad_norm": 3.1419906616210938, "learning_rate": 3.8829431438127095e-06, "loss": 5.6243, "step": 21300 }, { "epoch": 0.323603637958175, "eval_loss": 5.486774444580078, "eval_runtime": 193.7733, "eval_samples_per_second": 51.607, "eval_steps_per_second": 6.451, "step": 21300 }, { "epoch": 0.32512290386408194, "grad_norm": 4.059791088104248, "learning_rate": 3.8729096989966554e-06, "loss": 5.6163, "step": 21400 }, { "epoch": 0.32512290386408194, "eval_loss": 5.477799415588379, "eval_runtime": 193.6285, "eval_samples_per_second": 51.645, "eval_steps_per_second": 6.456, "step": 21400 }, { "epoch": 0.32664216976998883, "grad_norm": 2.990511417388916, "learning_rate": 3.862876254180602e-06, "loss": 5.6133, "step": 21500 }, { "epoch": 0.32664216976998883, "eval_loss": 5.47875452041626, "eval_runtime": 193.5641, "eval_samples_per_second": 51.662, "eval_steps_per_second": 6.458, "step": 21500 }, { "epoch": 0.3281614356758958, "grad_norm": 2.3832523822784424, "learning_rate": 3.852842809364549e-06, "loss": 5.6062, "step": 21600 }, { "epoch": 0.3281614356758958, "eval_loss": 5.4584455490112305, "eval_runtime": 193.6584, "eval_samples_per_second": 51.637, "eval_steps_per_second": 6.455, "step": 21600 }, { "epoch": 0.3296807015818027, "grad_norm": 3.010307788848877, "learning_rate": 3.842809364548495e-06, "loss": 5.5959, "step": 21700 }, { "epoch": 0.3296807015818027, "eval_loss": 5.451364517211914, "eval_runtime": 193.6921, "eval_samples_per_second": 51.628, "eval_steps_per_second": 6.454, "step": 21700 }, { "epoch": 0.33119996748770963, "grad_norm": 3.718315601348877, "learning_rate": 3.832775919732442e-06, "loss": 5.5919, "step": 21800 }, { "epoch": 0.33119996748770963, "eval_loss": 5.446727752685547, "eval_runtime": 193.8195, "eval_samples_per_second": 51.594, "eval_steps_per_second": 6.449, "step": 21800 }, { "epoch": 0.33271923339361653, "grad_norm": 4.131709098815918, "learning_rate": 3.822742474916388e-06, "loss": 5.5859, "step": 21900 }, { "epoch": 0.33271923339361653, "eval_loss": 5.43417501449585, "eval_runtime": 193.7876, "eval_samples_per_second": 51.603, "eval_steps_per_second": 6.45, "step": 21900 }, { "epoch": 0.3342384992995235, "grad_norm": 3.7145907878875732, "learning_rate": 3.8127090301003347e-06, "loss": 5.5805, "step": 22000 }, { "epoch": 0.3342384992995235, "eval_loss": 5.443439960479736, "eval_runtime": 193.7659, "eval_samples_per_second": 51.609, "eval_steps_per_second": 6.451, "step": 22000 }, { "epoch": 0.3357577652054304, "grad_norm": 3.167874574661255, "learning_rate": 3.802675585284281e-06, "loss": 5.5724, "step": 22100 }, { "epoch": 0.3357577652054304, "eval_loss": 5.419732093811035, "eval_runtime": 193.7654, "eval_samples_per_second": 51.609, "eval_steps_per_second": 6.451, "step": 22100 }, { "epoch": 0.33727703111133733, "grad_norm": 3.820495367050171, "learning_rate": 3.792642140468228e-06, "loss": 5.5694, "step": 22200 }, { "epoch": 0.33727703111133733, "eval_loss": 5.4181647300720215, "eval_runtime": 193.8601, "eval_samples_per_second": 51.584, "eval_steps_per_second": 6.448, "step": 22200 }, { "epoch": 0.33879629701724423, "grad_norm": 3.4039466381073, "learning_rate": 3.782608695652174e-06, "loss": 5.565, "step": 22300 }, { "epoch": 0.33879629701724423, "eval_loss": 5.419365406036377, "eval_runtime": 194.0048, "eval_samples_per_second": 51.545, "eval_steps_per_second": 6.443, "step": 22300 }, { "epoch": 0.3403155629231512, "grad_norm": 2.805332660675049, "learning_rate": 3.7725752508361205e-06, "loss": 5.556, "step": 22400 }, { "epoch": 0.3403155629231512, "eval_loss": 5.415971755981445, "eval_runtime": 193.9692, "eval_samples_per_second": 51.555, "eval_steps_per_second": 6.444, "step": 22400 }, { "epoch": 0.3418348288290581, "grad_norm": 2.43111252784729, "learning_rate": 3.7625418060200673e-06, "loss": 5.5536, "step": 22500 }, { "epoch": 0.3418348288290581, "eval_loss": 5.405710220336914, "eval_runtime": 193.9758, "eval_samples_per_second": 51.553, "eval_steps_per_second": 6.444, "step": 22500 }, { "epoch": 0.34335409473496503, "grad_norm": 3.9612550735473633, "learning_rate": 3.7525083612040136e-06, "loss": 5.544, "step": 22600 }, { "epoch": 0.34335409473496503, "eval_loss": 5.40699577331543, "eval_runtime": 193.9672, "eval_samples_per_second": 51.555, "eval_steps_per_second": 6.444, "step": 22600 }, { "epoch": 0.34487336064087193, "grad_norm": 2.8571434020996094, "learning_rate": 3.74247491638796e-06, "loss": 5.5392, "step": 22700 }, { "epoch": 0.34487336064087193, "eval_loss": 5.397378921508789, "eval_runtime": 194.1852, "eval_samples_per_second": 51.497, "eval_steps_per_second": 6.437, "step": 22700 }, { "epoch": 0.3463926265467789, "grad_norm": 3.1463348865509033, "learning_rate": 3.7324414715719067e-06, "loss": 5.5296, "step": 22800 }, { "epoch": 0.3463926265467789, "eval_loss": 5.3807501792907715, "eval_runtime": 193.8656, "eval_samples_per_second": 51.582, "eval_steps_per_second": 6.448, "step": 22800 }, { "epoch": 0.3479118924526858, "grad_norm": 3.696991205215454, "learning_rate": 3.722408026755853e-06, "loss": 5.5319, "step": 22900 }, { "epoch": 0.3479118924526858, "eval_loss": 5.376627445220947, "eval_runtime": 193.8475, "eval_samples_per_second": 51.587, "eval_steps_per_second": 6.448, "step": 22900 }, { "epoch": 0.34943115835859273, "grad_norm": 3.691133737564087, "learning_rate": 3.7123745819398e-06, "loss": 5.5239, "step": 23000 }, { "epoch": 0.34943115835859273, "eval_loss": 5.379115104675293, "eval_runtime": 193.935, "eval_samples_per_second": 51.564, "eval_steps_per_second": 6.445, "step": 23000 }, { "epoch": 0.3509504242644996, "grad_norm": 2.994180679321289, "learning_rate": 3.702341137123746e-06, "loss": 5.5174, "step": 23100 }, { "epoch": 0.3509504242644996, "eval_loss": 5.36928129196167, "eval_runtime": 193.8929, "eval_samples_per_second": 51.575, "eval_steps_per_second": 6.447, "step": 23100 }, { "epoch": 0.3524696901704066, "grad_norm": 3.528660774230957, "learning_rate": 3.6923076923076925e-06, "loss": 5.5123, "step": 23200 }, { "epoch": 0.3524696901704066, "eval_loss": 5.366269588470459, "eval_runtime": 193.8307, "eval_samples_per_second": 51.591, "eval_steps_per_second": 6.449, "step": 23200 }, { "epoch": 0.3539889560763135, "grad_norm": 2.8609702587127686, "learning_rate": 3.6822742474916393e-06, "loss": 5.5024, "step": 23300 }, { "epoch": 0.3539889560763135, "eval_loss": 5.3636884689331055, "eval_runtime": 193.9144, "eval_samples_per_second": 51.569, "eval_steps_per_second": 6.446, "step": 23300 }, { "epoch": 0.35550822198222043, "grad_norm": 3.146467447280884, "learning_rate": 3.6722408026755856e-06, "loss": 5.4993, "step": 23400 }, { "epoch": 0.35550822198222043, "eval_loss": 5.3594536781311035, "eval_runtime": 193.8636, "eval_samples_per_second": 51.583, "eval_steps_per_second": 6.448, "step": 23400 }, { "epoch": 0.3570274878881273, "grad_norm": 2.4640018939971924, "learning_rate": 3.662207357859532e-06, "loss": 5.4944, "step": 23500 }, { "epoch": 0.3570274878881273, "eval_loss": 5.346569538116455, "eval_runtime": 193.9228, "eval_samples_per_second": 51.567, "eval_steps_per_second": 6.446, "step": 23500 }, { "epoch": 0.3585467537940343, "grad_norm": 4.175319671630859, "learning_rate": 3.6521739130434787e-06, "loss": 5.4874, "step": 23600 }, { "epoch": 0.3585467537940343, "eval_loss": 5.343349456787109, "eval_runtime": 193.585, "eval_samples_per_second": 51.657, "eval_steps_per_second": 6.457, "step": 23600 }, { "epoch": 0.3600660196999412, "grad_norm": 3.4799277782440186, "learning_rate": 3.642140468227425e-06, "loss": 5.4874, "step": 23700 }, { "epoch": 0.3600660196999412, "eval_loss": 5.342945098876953, "eval_runtime": 193.6611, "eval_samples_per_second": 51.637, "eval_steps_per_second": 6.455, "step": 23700 }, { "epoch": 0.36158528560584813, "grad_norm": 1.949639916419983, "learning_rate": 3.6321070234113714e-06, "loss": 5.4786, "step": 23800 }, { "epoch": 0.36158528560584813, "eval_loss": 5.325855255126953, "eval_runtime": 193.5983, "eval_samples_per_second": 51.653, "eval_steps_per_second": 6.457, "step": 23800 }, { "epoch": 0.363104551511755, "grad_norm": 2.983280658721924, "learning_rate": 3.622073578595318e-06, "loss": 5.4736, "step": 23900 }, { "epoch": 0.363104551511755, "eval_loss": 5.320173263549805, "eval_runtime": 193.5482, "eval_samples_per_second": 51.667, "eval_steps_per_second": 6.458, "step": 23900 }, { "epoch": 0.364623817417662, "grad_norm": 3.912425994873047, "learning_rate": 3.6120401337792645e-06, "loss": 5.4694, "step": 24000 }, { "epoch": 0.364623817417662, "eval_loss": 5.3176751136779785, "eval_runtime": 193.8093, "eval_samples_per_second": 51.597, "eval_steps_per_second": 6.45, "step": 24000 }, { "epoch": 0.3661430833235689, "grad_norm": 3.230281114578247, "learning_rate": 3.6020066889632112e-06, "loss": 5.465, "step": 24100 }, { "epoch": 0.3661430833235689, "eval_loss": 5.319676399230957, "eval_runtime": 193.9617, "eval_samples_per_second": 51.557, "eval_steps_per_second": 6.445, "step": 24100 }, { "epoch": 0.3676623492294758, "grad_norm": 2.6516830921173096, "learning_rate": 3.5919732441471576e-06, "loss": 5.4568, "step": 24200 }, { "epoch": 0.3676623492294758, "eval_loss": 5.306182384490967, "eval_runtime": 193.9303, "eval_samples_per_second": 51.565, "eval_steps_per_second": 6.446, "step": 24200 }, { "epoch": 0.3691816151353827, "grad_norm": 3.398289442062378, "learning_rate": 3.581939799331104e-06, "loss": 5.4555, "step": 24300 }, { "epoch": 0.3691816151353827, "eval_loss": 5.304970741271973, "eval_runtime": 193.8994, "eval_samples_per_second": 51.573, "eval_steps_per_second": 6.447, "step": 24300 }, { "epoch": 0.3707008810412897, "grad_norm": 2.9263579845428467, "learning_rate": 3.5719063545150507e-06, "loss": 5.446, "step": 24400 }, { "epoch": 0.3707008810412897, "eval_loss": 5.304412364959717, "eval_runtime": 193.8605, "eval_samples_per_second": 51.583, "eval_steps_per_second": 6.448, "step": 24400 }, { "epoch": 0.3722201469471966, "grad_norm": 3.2696564197540283, "learning_rate": 3.561872909698997e-06, "loss": 5.4418, "step": 24500 }, { "epoch": 0.3722201469471966, "eval_loss": 5.291959285736084, "eval_runtime": 193.8303, "eval_samples_per_second": 51.592, "eval_steps_per_second": 6.449, "step": 24500 }, { "epoch": 0.3737394128531035, "grad_norm": 3.3699710369110107, "learning_rate": 3.5518394648829434e-06, "loss": 5.4367, "step": 24600 }, { "epoch": 0.3737394128531035, "eval_loss": 5.292627334594727, "eval_runtime": 193.9116, "eval_samples_per_second": 51.57, "eval_steps_per_second": 6.446, "step": 24600 }, { "epoch": 0.3752586787590104, "grad_norm": 3.646376848220825, "learning_rate": 3.54180602006689e-06, "loss": 5.4389, "step": 24700 }, { "epoch": 0.3752586787590104, "eval_loss": 5.2781982421875, "eval_runtime": 193.9384, "eval_samples_per_second": 51.563, "eval_steps_per_second": 6.445, "step": 24700 }, { "epoch": 0.3767779446649174, "grad_norm": 2.873612880706787, "learning_rate": 3.5317725752508365e-06, "loss": 5.4344, "step": 24800 }, { "epoch": 0.3767779446649174, "eval_loss": 5.286219596862793, "eval_runtime": 193.9969, "eval_samples_per_second": 51.547, "eval_steps_per_second": 6.443, "step": 24800 }, { "epoch": 0.37829721057082427, "grad_norm": 3.312747001647949, "learning_rate": 3.521739130434783e-06, "loss": 5.427, "step": 24900 }, { "epoch": 0.37829721057082427, "eval_loss": 5.281394004821777, "eval_runtime": 193.9463, "eval_samples_per_second": 51.561, "eval_steps_per_second": 6.445, "step": 24900 }, { "epoch": 0.3798164764767312, "grad_norm": 3.727271556854248, "learning_rate": 3.5117056856187296e-06, "loss": 5.4212, "step": 25000 }, { "epoch": 0.3798164764767312, "eval_loss": 5.266263484954834, "eval_runtime": 193.9471, "eval_samples_per_second": 51.56, "eval_steps_per_second": 6.445, "step": 25000 }, { "epoch": 0.3813357423826381, "grad_norm": 3.3630518913269043, "learning_rate": 3.501672240802676e-06, "loss": 5.4173, "step": 25100 }, { "epoch": 0.3813357423826381, "eval_loss": 5.276744365692139, "eval_runtime": 193.8804, "eval_samples_per_second": 51.578, "eval_steps_per_second": 6.447, "step": 25100 }, { "epoch": 0.3828550082885451, "grad_norm": 4.12694787979126, "learning_rate": 3.491638795986622e-06, "loss": 5.413, "step": 25200 }, { "epoch": 0.3828550082885451, "eval_loss": 5.260261535644531, "eval_runtime": 193.919, "eval_samples_per_second": 51.568, "eval_steps_per_second": 6.446, "step": 25200 }, { "epoch": 0.38437427419445197, "grad_norm": 3.921342611312866, "learning_rate": 3.481605351170568e-06, "loss": 5.4041, "step": 25300 }, { "epoch": 0.38437427419445197, "eval_loss": 5.2696661949157715, "eval_runtime": 193.8783, "eval_samples_per_second": 51.579, "eval_steps_per_second": 6.447, "step": 25300 }, { "epoch": 0.3858935401003589, "grad_norm": 2.464872360229492, "learning_rate": 3.471571906354515e-06, "loss": 5.4035, "step": 25400 }, { "epoch": 0.3858935401003589, "eval_loss": 5.251010894775391, "eval_runtime": 193.8872, "eval_samples_per_second": 51.576, "eval_steps_per_second": 6.447, "step": 25400 }, { "epoch": 0.3874128060062658, "grad_norm": 2.675010919570923, "learning_rate": 3.4615384615384613e-06, "loss": 5.3946, "step": 25500 }, { "epoch": 0.3874128060062658, "eval_loss": 5.2538347244262695, "eval_runtime": 193.8933, "eval_samples_per_second": 51.575, "eval_steps_per_second": 6.447, "step": 25500 }, { "epoch": 0.38893207191217277, "grad_norm": 2.195725202560425, "learning_rate": 3.4515050167224076e-06, "loss": 5.3919, "step": 25600 }, { "epoch": 0.38893207191217277, "eval_loss": 5.230504035949707, "eval_runtime": 194.2054, "eval_samples_per_second": 51.492, "eval_steps_per_second": 6.436, "step": 25600 }, { "epoch": 0.39045133781807967, "grad_norm": 3.359039545059204, "learning_rate": 3.4414715719063544e-06, "loss": 5.3855, "step": 25700 }, { "epoch": 0.39045133781807967, "eval_loss": 5.245420455932617, "eval_runtime": 193.8867, "eval_samples_per_second": 51.577, "eval_steps_per_second": 6.447, "step": 25700 }, { "epoch": 0.3919706037239866, "grad_norm": 3.5205583572387695, "learning_rate": 3.4314381270903007e-06, "loss": 5.3822, "step": 25800 }, { "epoch": 0.3919706037239866, "eval_loss": 5.227876663208008, "eval_runtime": 194.0636, "eval_samples_per_second": 51.529, "eval_steps_per_second": 6.441, "step": 25800 }, { "epoch": 0.3934898696298935, "grad_norm": 3.5619242191314697, "learning_rate": 3.4214046822742475e-06, "loss": 5.3773, "step": 25900 }, { "epoch": 0.3934898696298935, "eval_loss": 5.234467029571533, "eval_runtime": 193.7401, "eval_samples_per_second": 51.616, "eval_steps_per_second": 6.452, "step": 25900 }, { "epoch": 0.39500913553580047, "grad_norm": 3.9920406341552734, "learning_rate": 3.411371237458194e-06, "loss": 5.3735, "step": 26000 }, { "epoch": 0.39500913553580047, "eval_loss": 5.22184944152832, "eval_runtime": 193.7394, "eval_samples_per_second": 51.616, "eval_steps_per_second": 6.452, "step": 26000 }, { "epoch": 0.39652840144170737, "grad_norm": 3.559217691421509, "learning_rate": 3.40133779264214e-06, "loss": 5.3695, "step": 26100 }, { "epoch": 0.39652840144170737, "eval_loss": 5.22100830078125, "eval_runtime": 193.849, "eval_samples_per_second": 51.587, "eval_steps_per_second": 6.448, "step": 26100 }, { "epoch": 0.3980476673476143, "grad_norm": 4.232235908508301, "learning_rate": 3.391304347826087e-06, "loss": 5.3703, "step": 26200 }, { "epoch": 0.3980476673476143, "eval_loss": 5.208474159240723, "eval_runtime": 193.8974, "eval_samples_per_second": 51.574, "eval_steps_per_second": 6.447, "step": 26200 }, { "epoch": 0.3995669332535212, "grad_norm": 1.947100043296814, "learning_rate": 3.3812709030100333e-06, "loss": 5.3627, "step": 26300 }, { "epoch": 0.3995669332535212, "eval_loss": 5.217953681945801, "eval_runtime": 193.928, "eval_samples_per_second": 51.566, "eval_steps_per_second": 6.446, "step": 26300 }, { "epoch": 0.40108619915942817, "grad_norm": 3.8494338989257812, "learning_rate": 3.3712374581939796e-06, "loss": 5.3584, "step": 26400 }, { "epoch": 0.40108619915942817, "eval_loss": 5.212357044219971, "eval_runtime": 193.9466, "eval_samples_per_second": 51.561, "eval_steps_per_second": 6.445, "step": 26400 }, { "epoch": 0.40260546506533507, "grad_norm": 3.837324619293213, "learning_rate": 3.3612040133779264e-06, "loss": 5.3555, "step": 26500 }, { "epoch": 0.40260546506533507, "eval_loss": 5.211539268493652, "eval_runtime": 193.9781, "eval_samples_per_second": 51.552, "eval_steps_per_second": 6.444, "step": 26500 }, { "epoch": 0.404124730971242, "grad_norm": 3.7754664421081543, "learning_rate": 3.3511705685618727e-06, "loss": 5.3476, "step": 26600 }, { "epoch": 0.404124730971242, "eval_loss": 5.1926679611206055, "eval_runtime": 194.0189, "eval_samples_per_second": 51.541, "eval_steps_per_second": 6.443, "step": 26600 }, { "epoch": 0.4056439968771489, "grad_norm": 2.4836502075195312, "learning_rate": 3.3411371237458195e-06, "loss": 5.3471, "step": 26700 }, { "epoch": 0.4056439968771489, "eval_loss": 5.188870429992676, "eval_runtime": 194.0466, "eval_samples_per_second": 51.534, "eval_steps_per_second": 6.442, "step": 26700 }, { "epoch": 0.40716326278305587, "grad_norm": 4.591010093688965, "learning_rate": 3.331103678929766e-06, "loss": 5.3431, "step": 26800 }, { "epoch": 0.40716326278305587, "eval_loss": 5.2042717933654785, "eval_runtime": 193.9239, "eval_samples_per_second": 51.567, "eval_steps_per_second": 6.446, "step": 26800 }, { "epoch": 0.40868252868896277, "grad_norm": 3.4716506004333496, "learning_rate": 3.321070234113712e-06, "loss": 5.3363, "step": 26900 }, { "epoch": 0.40868252868896277, "eval_loss": 5.18259859085083, "eval_runtime": 193.9467, "eval_samples_per_second": 51.561, "eval_steps_per_second": 6.445, "step": 26900 }, { "epoch": 0.4102017945948697, "grad_norm": 2.3968818187713623, "learning_rate": 3.311036789297659e-06, "loss": 5.335, "step": 27000 }, { "epoch": 0.4102017945948697, "eval_loss": 5.189505577087402, "eval_runtime": 193.9236, "eval_samples_per_second": 51.567, "eval_steps_per_second": 6.446, "step": 27000 }, { "epoch": 0.4117210605007766, "grad_norm": 3.8948540687561035, "learning_rate": 3.3010033444816052e-06, "loss": 5.3306, "step": 27100 }, { "epoch": 0.4117210605007766, "eval_loss": 5.193852424621582, "eval_runtime": 193.9813, "eval_samples_per_second": 51.551, "eval_steps_per_second": 6.444, "step": 27100 }, { "epoch": 0.41324032640668357, "grad_norm": 2.8864169120788574, "learning_rate": 3.2909698996655516e-06, "loss": 5.3292, "step": 27200 }, { "epoch": 0.41324032640668357, "eval_loss": 5.173651695251465, "eval_runtime": 193.9832, "eval_samples_per_second": 51.551, "eval_steps_per_second": 6.444, "step": 27200 }, { "epoch": 0.41475959231259046, "grad_norm": 2.733299970626831, "learning_rate": 3.2809364548494983e-06, "loss": 5.33, "step": 27300 }, { "epoch": 0.41475959231259046, "eval_loss": 5.169619083404541, "eval_runtime": 193.8848, "eval_samples_per_second": 51.577, "eval_steps_per_second": 6.447, "step": 27300 }, { "epoch": 0.4162788582184974, "grad_norm": 2.9062700271606445, "learning_rate": 3.2709030100334447e-06, "loss": 5.3173, "step": 27400 }, { "epoch": 0.4162788582184974, "eval_loss": 5.1664323806762695, "eval_runtime": 193.9119, "eval_samples_per_second": 51.57, "eval_steps_per_second": 6.446, "step": 27400 }, { "epoch": 0.4177981241244043, "grad_norm": 3.473586320877075, "learning_rate": 3.260869565217391e-06, "loss": 5.3132, "step": 27500 }, { "epoch": 0.4177981241244043, "eval_loss": 5.160322666168213, "eval_runtime": 193.6677, "eval_samples_per_second": 51.635, "eval_steps_per_second": 6.454, "step": 27500 }, { "epoch": 0.41931739003031127, "grad_norm": 3.763826847076416, "learning_rate": 3.2508361204013378e-06, "loss": 5.3079, "step": 27600 }, { "epoch": 0.41931739003031127, "eval_loss": 5.159815788269043, "eval_runtime": 193.6881, "eval_samples_per_second": 51.629, "eval_steps_per_second": 6.454, "step": 27600 }, { "epoch": 0.42083665593621816, "grad_norm": 3.552210807800293, "learning_rate": 3.240802675585284e-06, "loss": 5.3065, "step": 27700 }, { "epoch": 0.42083665593621816, "eval_loss": 5.150642395019531, "eval_runtime": 193.6169, "eval_samples_per_second": 51.648, "eval_steps_per_second": 6.456, "step": 27700 }, { "epoch": 0.4223559218421251, "grad_norm": 4.059215545654297, "learning_rate": 3.230769230769231e-06, "loss": 5.2979, "step": 27800 }, { "epoch": 0.4223559218421251, "eval_loss": 5.1397881507873535, "eval_runtime": 193.6252, "eval_samples_per_second": 51.646, "eval_steps_per_second": 6.456, "step": 27800 }, { "epoch": 0.423875187748032, "grad_norm": 3.116863250732422, "learning_rate": 3.2207357859531772e-06, "loss": 5.2986, "step": 27900 }, { "epoch": 0.423875187748032, "eval_loss": 5.141936779022217, "eval_runtime": 193.8766, "eval_samples_per_second": 51.579, "eval_steps_per_second": 6.447, "step": 27900 }, { "epoch": 0.42539445365393896, "grad_norm": 3.474275588989258, "learning_rate": 3.2107023411371236e-06, "loss": 5.2969, "step": 28000 }, { "epoch": 0.42539445365393896, "eval_loss": 5.130954742431641, "eval_runtime": 193.6114, "eval_samples_per_second": 51.65, "eval_steps_per_second": 6.456, "step": 28000 }, { "epoch": 0.42691371955984586, "grad_norm": 4.147261619567871, "learning_rate": 3.2006688963210703e-06, "loss": 5.2919, "step": 28100 }, { "epoch": 0.42691371955984586, "eval_loss": 5.131519794464111, "eval_runtime": 193.5876, "eval_samples_per_second": 51.656, "eval_steps_per_second": 6.457, "step": 28100 }, { "epoch": 0.4284329854657528, "grad_norm": 3.2498297691345215, "learning_rate": 3.1906354515050167e-06, "loss": 5.281, "step": 28200 }, { "epoch": 0.4284329854657528, "eval_loss": 5.137979984283447, "eval_runtime": 193.6213, "eval_samples_per_second": 51.647, "eval_steps_per_second": 6.456, "step": 28200 }, { "epoch": 0.4299522513716597, "grad_norm": 2.9977059364318848, "learning_rate": 3.180602006688963e-06, "loss": 5.2799, "step": 28300 }, { "epoch": 0.4299522513716597, "eval_loss": 5.123497009277344, "eval_runtime": 193.6764, "eval_samples_per_second": 51.633, "eval_steps_per_second": 6.454, "step": 28300 }, { "epoch": 0.43147151727756666, "grad_norm": 3.6998023986816406, "learning_rate": 3.1705685618729098e-06, "loss": 5.2772, "step": 28400 }, { "epoch": 0.43147151727756666, "eval_loss": 5.125461101531982, "eval_runtime": 194.1824, "eval_samples_per_second": 51.498, "eval_steps_per_second": 6.437, "step": 28400 }, { "epoch": 0.43299078318347356, "grad_norm": 2.8865628242492676, "learning_rate": 3.160535117056856e-06, "loss": 5.2778, "step": 28500 }, { "epoch": 0.43299078318347356, "eval_loss": 5.130805492401123, "eval_runtime": 194.0322, "eval_samples_per_second": 51.538, "eval_steps_per_second": 6.442, "step": 28500 }, { "epoch": 0.4345100490893805, "grad_norm": 3.853248357772827, "learning_rate": 3.1505016722408024e-06, "loss": 5.2722, "step": 28600 }, { "epoch": 0.4345100490893805, "eval_loss": 5.125495910644531, "eval_runtime": 194.031, "eval_samples_per_second": 51.538, "eval_steps_per_second": 6.442, "step": 28600 }, { "epoch": 0.4360293149952874, "grad_norm": 2.8595046997070312, "learning_rate": 3.140468227424749e-06, "loss": 5.2644, "step": 28700 }, { "epoch": 0.4360293149952874, "eval_loss": 5.113553524017334, "eval_runtime": 194.0586, "eval_samples_per_second": 51.531, "eval_steps_per_second": 6.441, "step": 28700 }, { "epoch": 0.43754858090119436, "grad_norm": 3.5894057750701904, "learning_rate": 3.1304347826086955e-06, "loss": 5.261, "step": 28800 }, { "epoch": 0.43754858090119436, "eval_loss": 5.1062846183776855, "eval_runtime": 194.0309, "eval_samples_per_second": 51.538, "eval_steps_per_second": 6.442, "step": 28800 }, { "epoch": 0.43906784680710126, "grad_norm": 2.79595685005188, "learning_rate": 3.1204013377926423e-06, "loss": 5.257, "step": 28900 }, { "epoch": 0.43906784680710126, "eval_loss": 5.108764171600342, "eval_runtime": 194.0394, "eval_samples_per_second": 51.536, "eval_steps_per_second": 6.442, "step": 28900 }, { "epoch": 0.4405871127130082, "grad_norm": 3.3071796894073486, "learning_rate": 3.1103678929765886e-06, "loss": 5.2543, "step": 29000 }, { "epoch": 0.4405871127130082, "eval_loss": 5.101233005523682, "eval_runtime": 194.0058, "eval_samples_per_second": 51.545, "eval_steps_per_second": 6.443, "step": 29000 }, { "epoch": 0.4421063786189151, "grad_norm": 2.916874408721924, "learning_rate": 3.100334448160535e-06, "loss": 5.2474, "step": 29100 }, { "epoch": 0.4421063786189151, "eval_loss": 5.100154876708984, "eval_runtime": 194.0356, "eval_samples_per_second": 51.537, "eval_steps_per_second": 6.442, "step": 29100 }, { "epoch": 0.44362564452482206, "grad_norm": 2.6649153232574463, "learning_rate": 3.0903010033444818e-06, "loss": 5.2504, "step": 29200 }, { "epoch": 0.44362564452482206, "eval_loss": 5.0921311378479, "eval_runtime": 194.0892, "eval_samples_per_second": 51.523, "eval_steps_per_second": 6.44, "step": 29200 }, { "epoch": 0.44514491043072896, "grad_norm": 2.398049831390381, "learning_rate": 3.080267558528428e-06, "loss": 5.2441, "step": 29300 }, { "epoch": 0.44514491043072896, "eval_loss": 5.0853142738342285, "eval_runtime": 193.8364, "eval_samples_per_second": 51.59, "eval_steps_per_second": 6.449, "step": 29300 }, { "epoch": 0.4466641763366359, "grad_norm": 2.485322952270508, "learning_rate": 3.0702341137123744e-06, "loss": 5.2415, "step": 29400 }, { "epoch": 0.4466641763366359, "eval_loss": 5.091442584991455, "eval_runtime": 193.9724, "eval_samples_per_second": 51.554, "eval_steps_per_second": 6.444, "step": 29400 }, { "epoch": 0.4481834422425428, "grad_norm": 3.5554513931274414, "learning_rate": 3.060200668896321e-06, "loss": 5.2374, "step": 29500 }, { "epoch": 0.4481834422425428, "eval_loss": 5.077342510223389, "eval_runtime": 194.0745, "eval_samples_per_second": 51.527, "eval_steps_per_second": 6.441, "step": 29500 }, { "epoch": 0.44970270814844976, "grad_norm": 3.598982810974121, "learning_rate": 3.0501672240802675e-06, "loss": 5.2324, "step": 29600 }, { "epoch": 0.44970270814844976, "eval_loss": 5.088211536407471, "eval_runtime": 193.9862, "eval_samples_per_second": 51.55, "eval_steps_per_second": 6.444, "step": 29600 }, { "epoch": 0.45122197405435666, "grad_norm": 3.2339296340942383, "learning_rate": 3.0401337792642143e-06, "loss": 5.2295, "step": 29700 }, { "epoch": 0.45122197405435666, "eval_loss": 5.077876567840576, "eval_runtime": 193.8777, "eval_samples_per_second": 51.579, "eval_steps_per_second": 6.447, "step": 29700 }, { "epoch": 0.4527412399602636, "grad_norm": 2.627495765686035, "learning_rate": 3.0301003344481606e-06, "loss": 5.2275, "step": 29800 }, { "epoch": 0.4527412399602636, "eval_loss": 5.074822902679443, "eval_runtime": 193.8311, "eval_samples_per_second": 51.591, "eval_steps_per_second": 6.449, "step": 29800 }, { "epoch": 0.4542605058661705, "grad_norm": 2.9252991676330566, "learning_rate": 3.020066889632107e-06, "loss": 5.2238, "step": 29900 }, { "epoch": 0.4542605058661705, "eval_loss": 5.063547611236572, "eval_runtime": 193.7302, "eval_samples_per_second": 51.618, "eval_steps_per_second": 6.452, "step": 29900 }, { "epoch": 0.45577977177207746, "grad_norm": 3.155406951904297, "learning_rate": 3.0100334448160537e-06, "loss": 5.2218, "step": 30000 }, { "epoch": 0.45577977177207746, "eval_loss": 5.066218852996826, "eval_runtime": 193.6072, "eval_samples_per_second": 51.651, "eval_steps_per_second": 6.456, "step": 30000 }, { "epoch": 0.45729903767798435, "grad_norm": 3.476306915283203, "learning_rate": 3e-06, "loss": 5.2166, "step": 30100 }, { "epoch": 0.45729903767798435, "eval_loss": 5.068021774291992, "eval_runtime": 193.6477, "eval_samples_per_second": 51.64, "eval_steps_per_second": 6.455, "step": 30100 }, { "epoch": 0.4588183035838913, "grad_norm": 3.618774175643921, "learning_rate": 2.9899665551839464e-06, "loss": 5.2154, "step": 30200 }, { "epoch": 0.4588183035838913, "eval_loss": 5.0593461990356445, "eval_runtime": 193.5821, "eval_samples_per_second": 51.658, "eval_steps_per_second": 6.457, "step": 30200 }, { "epoch": 0.4603375694897982, "grad_norm": 2.838336229324341, "learning_rate": 2.979933110367893e-06, "loss": 5.2082, "step": 30300 }, { "epoch": 0.4603375694897982, "eval_loss": 5.061206817626953, "eval_runtime": 193.644, "eval_samples_per_second": 51.641, "eval_steps_per_second": 6.455, "step": 30300 }, { "epoch": 0.46185683539570516, "grad_norm": 2.840545654296875, "learning_rate": 2.9698996655518395e-06, "loss": 5.2028, "step": 30400 }, { "epoch": 0.46185683539570516, "eval_loss": 5.051755428314209, "eval_runtime": 193.6726, "eval_samples_per_second": 51.634, "eval_steps_per_second": 6.454, "step": 30400 }, { "epoch": 0.46337610130161205, "grad_norm": 2.4346399307250977, "learning_rate": 2.959866220735786e-06, "loss": 5.2001, "step": 30500 }, { "epoch": 0.46337610130161205, "eval_loss": 5.050179481506348, "eval_runtime": 193.6621, "eval_samples_per_second": 51.636, "eval_steps_per_second": 6.455, "step": 30500 }, { "epoch": 0.464895367207519, "grad_norm": 2.331064224243164, "learning_rate": 2.9498327759197326e-06, "loss": 5.2018, "step": 30600 }, { "epoch": 0.464895367207519, "eval_loss": 5.039993762969971, "eval_runtime": 193.9744, "eval_samples_per_second": 51.553, "eval_steps_per_second": 6.444, "step": 30600 }, { "epoch": 0.4664146331134259, "grad_norm": 3.012594223022461, "learning_rate": 2.939799331103679e-06, "loss": 5.1991, "step": 30700 }, { "epoch": 0.4664146331134259, "eval_loss": 5.039401054382324, "eval_runtime": 194.0713, "eval_samples_per_second": 51.527, "eval_steps_per_second": 6.441, "step": 30700 }, { "epoch": 0.46793389901933286, "grad_norm": 3.4017112255096436, "learning_rate": 2.9297658862876257e-06, "loss": 5.1937, "step": 30800 }, { "epoch": 0.46793389901933286, "eval_loss": 5.0479512214660645, "eval_runtime": 194.0394, "eval_samples_per_second": 51.536, "eval_steps_per_second": 6.442, "step": 30800 }, { "epoch": 0.46945316492523975, "grad_norm": 2.848475694656372, "learning_rate": 2.919732441471572e-06, "loss": 5.1898, "step": 30900 }, { "epoch": 0.46945316492523975, "eval_loss": 5.043004035949707, "eval_runtime": 194.2051, "eval_samples_per_second": 51.492, "eval_steps_per_second": 6.436, "step": 30900 }, { "epoch": 0.4709724308311467, "grad_norm": 2.964001178741455, "learning_rate": 2.9096989966555184e-06, "loss": 5.1887, "step": 31000 }, { "epoch": 0.4709724308311467, "eval_loss": 5.029993534088135, "eval_runtime": 194.1479, "eval_samples_per_second": 51.507, "eval_steps_per_second": 6.438, "step": 31000 }, { "epoch": 0.4724916967370536, "grad_norm": 2.698634147644043, "learning_rate": 2.899665551839465e-06, "loss": 5.1879, "step": 31100 }, { "epoch": 0.4724916967370536, "eval_loss": 5.028534412384033, "eval_runtime": 194.0408, "eval_samples_per_second": 51.536, "eval_steps_per_second": 6.442, "step": 31100 }, { "epoch": 0.47401096264296055, "grad_norm": 2.757293224334717, "learning_rate": 2.8896321070234115e-06, "loss": 5.1818, "step": 31200 }, { "epoch": 0.47401096264296055, "eval_loss": 5.024392127990723, "eval_runtime": 194.1644, "eval_samples_per_second": 51.503, "eval_steps_per_second": 6.438, "step": 31200 }, { "epoch": 0.47553022854886745, "grad_norm": 3.269547700881958, "learning_rate": 2.879598662207358e-06, "loss": 5.1784, "step": 31300 }, { "epoch": 0.47553022854886745, "eval_loss": 5.026421546936035, "eval_runtime": 193.9523, "eval_samples_per_second": 51.559, "eval_steps_per_second": 6.445, "step": 31300 }, { "epoch": 0.4770494944547744, "grad_norm": 3.1080405712127686, "learning_rate": 2.8695652173913046e-06, "loss": 5.1725, "step": 31400 }, { "epoch": 0.4770494944547744, "eval_loss": 5.025778770446777, "eval_runtime": 193.9136, "eval_samples_per_second": 51.569, "eval_steps_per_second": 6.446, "step": 31400 }, { "epoch": 0.4785687603606813, "grad_norm": 4.382852554321289, "learning_rate": 2.859531772575251e-06, "loss": 5.1691, "step": 31500 }, { "epoch": 0.4785687603606813, "eval_loss": 5.0218424797058105, "eval_runtime": 194.0079, "eval_samples_per_second": 51.544, "eval_steps_per_second": 6.443, "step": 31500 }, { "epoch": 0.48008802626658825, "grad_norm": 2.4219489097595215, "learning_rate": 2.8494983277591977e-06, "loss": 5.1675, "step": 31600 }, { "epoch": 0.48008802626658825, "eval_loss": 5.009864807128906, "eval_runtime": 194.1924, "eval_samples_per_second": 51.495, "eval_steps_per_second": 6.437, "step": 31600 }, { "epoch": 0.48160729217249515, "grad_norm": 3.9848620891571045, "learning_rate": 2.839464882943144e-06, "loss": 5.1607, "step": 31700 }, { "epoch": 0.48160729217249515, "eval_loss": 5.008971214294434, "eval_runtime": 194.2985, "eval_samples_per_second": 51.467, "eval_steps_per_second": 6.433, "step": 31700 }, { "epoch": 0.4831265580784021, "grad_norm": 3.3474831581115723, "learning_rate": 2.8294314381270904e-06, "loss": 5.1598, "step": 31800 }, { "epoch": 0.4831265580784021, "eval_loss": 5.004793167114258, "eval_runtime": 194.03, "eval_samples_per_second": 51.538, "eval_steps_per_second": 6.442, "step": 31800 }, { "epoch": 0.484645823984309, "grad_norm": 3.074587821960449, "learning_rate": 2.819397993311037e-06, "loss": 5.1588, "step": 31900 }, { "epoch": 0.484645823984309, "eval_loss": 5.007466793060303, "eval_runtime": 193.9052, "eval_samples_per_second": 51.572, "eval_steps_per_second": 6.446, "step": 31900 }, { "epoch": 0.48616508989021595, "grad_norm": 2.631606340408325, "learning_rate": 2.8093645484949835e-06, "loss": 5.155, "step": 32000 }, { "epoch": 0.48616508989021595, "eval_loss": 5.00339937210083, "eval_runtime": 194.0167, "eval_samples_per_second": 51.542, "eval_steps_per_second": 6.443, "step": 32000 }, { "epoch": 0.48768435579612285, "grad_norm": 2.5506277084350586, "learning_rate": 2.79933110367893e-06, "loss": 5.1544, "step": 32100 }, { "epoch": 0.48768435579612285, "eval_loss": 4.995656967163086, "eval_runtime": 193.7711, "eval_samples_per_second": 51.607, "eval_steps_per_second": 6.451, "step": 32100 }, { "epoch": 0.4892036217020298, "grad_norm": 2.9476144313812256, "learning_rate": 2.7892976588628766e-06, "loss": 5.1477, "step": 32200 }, { "epoch": 0.4892036217020298, "eval_loss": 4.994040012359619, "eval_runtime": 193.6615, "eval_samples_per_second": 51.636, "eval_steps_per_second": 6.455, "step": 32200 }, { "epoch": 0.4907228876079367, "grad_norm": 3.5395162105560303, "learning_rate": 2.779264214046823e-06, "loss": 5.1424, "step": 32300 }, { "epoch": 0.4907228876079367, "eval_loss": 4.992614269256592, "eval_runtime": 193.7065, "eval_samples_per_second": 51.624, "eval_steps_per_second": 6.453, "step": 32300 }, { "epoch": 0.49224215351384365, "grad_norm": 2.805767297744751, "learning_rate": 2.7692307692307693e-06, "loss": 5.1446, "step": 32400 }, { "epoch": 0.49224215351384365, "eval_loss": 4.987194538116455, "eval_runtime": 193.7203, "eval_samples_per_second": 51.621, "eval_steps_per_second": 6.453, "step": 32400 }, { "epoch": 0.49376141941975055, "grad_norm": 3.9371492862701416, "learning_rate": 2.759197324414716e-06, "loss": 5.1391, "step": 32500 }, { "epoch": 0.49376141941975055, "eval_loss": 4.9901838302612305, "eval_runtime": 193.6911, "eval_samples_per_second": 51.629, "eval_steps_per_second": 6.454, "step": 32500 }, { "epoch": 0.4952806853256575, "grad_norm": 2.755789041519165, "learning_rate": 2.749163879598662e-06, "loss": 5.1393, "step": 32600 }, { "epoch": 0.4952806853256575, "eval_loss": 4.992640018463135, "eval_runtime": 193.6445, "eval_samples_per_second": 51.641, "eval_steps_per_second": 6.455, "step": 32600 }, { "epoch": 0.4967999512315644, "grad_norm": 3.4700164794921875, "learning_rate": 2.7391304347826087e-06, "loss": 5.1375, "step": 32700 }, { "epoch": 0.4967999512315644, "eval_loss": 4.975983142852783, "eval_runtime": 193.7046, "eval_samples_per_second": 51.625, "eval_steps_per_second": 6.453, "step": 32700 }, { "epoch": 0.49831921713747135, "grad_norm": 2.9584505558013916, "learning_rate": 2.729096989966555e-06, "loss": 5.1305, "step": 32800 }, { "epoch": 0.49831921713747135, "eval_loss": 4.978539943695068, "eval_runtime": 193.9644, "eval_samples_per_second": 51.556, "eval_steps_per_second": 6.444, "step": 32800 }, { "epoch": 0.49983848304337825, "grad_norm": 3.1944355964660645, "learning_rate": 2.7190635451505014e-06, "loss": 5.1202, "step": 32900 }, { "epoch": 0.49983848304337825, "eval_loss": 4.972556114196777, "eval_runtime": 194.0092, "eval_samples_per_second": 51.544, "eval_steps_per_second": 6.443, "step": 32900 }, { "epoch": 0.5013577489492852, "grad_norm": 2.980757713317871, "learning_rate": 2.709030100334448e-06, "loss": 5.1282, "step": 33000 }, { "epoch": 0.5013577489492852, "eval_loss": 4.974079132080078, "eval_runtime": 194.0276, "eval_samples_per_second": 51.539, "eval_steps_per_second": 6.442, "step": 33000 }, { "epoch": 0.5028770148551921, "grad_norm": 3.100187063217163, "learning_rate": 2.6989966555183945e-06, "loss": 5.1259, "step": 33100 }, { "epoch": 0.5028770148551921, "eval_loss": 4.963293552398682, "eval_runtime": 194.0336, "eval_samples_per_second": 51.537, "eval_steps_per_second": 6.442, "step": 33100 }, { "epoch": 0.504396280761099, "grad_norm": 2.542158603668213, "learning_rate": 2.6889632107023413e-06, "loss": 5.1217, "step": 33200 }, { "epoch": 0.504396280761099, "eval_loss": 4.9611406326293945, "eval_runtime": 193.9855, "eval_samples_per_second": 51.55, "eval_steps_per_second": 6.444, "step": 33200 }, { "epoch": 0.505915546667006, "grad_norm": 2.545457363128662, "learning_rate": 2.6789297658862876e-06, "loss": 5.1158, "step": 33300 }, { "epoch": 0.505915546667006, "eval_loss": 4.967195510864258, "eval_runtime": 194.147, "eval_samples_per_second": 51.507, "eval_steps_per_second": 6.438, "step": 33300 }, { "epoch": 0.5074348125729129, "grad_norm": 2.822507858276367, "learning_rate": 2.668896321070234e-06, "loss": 5.1109, "step": 33400 }, { "epoch": 0.5074348125729129, "eval_loss": 4.9572014808654785, "eval_runtime": 194.2258, "eval_samples_per_second": 51.486, "eval_steps_per_second": 6.436, "step": 33400 }, { "epoch": 0.5089540784788198, "grad_norm": 2.361830949783325, "learning_rate": 2.6588628762541807e-06, "loss": 5.1154, "step": 33500 }, { "epoch": 0.5089540784788198, "eval_loss": 4.94895076751709, "eval_runtime": 194.1601, "eval_samples_per_second": 51.504, "eval_steps_per_second": 6.438, "step": 33500 }, { "epoch": 0.5104733443847267, "grad_norm": 2.3638288974761963, "learning_rate": 2.648829431438127e-06, "loss": 5.1055, "step": 33600 }, { "epoch": 0.5104733443847267, "eval_loss": 4.947831153869629, "eval_runtime": 194.2013, "eval_samples_per_second": 51.493, "eval_steps_per_second": 6.437, "step": 33600 }, { "epoch": 0.5119926102906337, "grad_norm": 2.163120746612549, "learning_rate": 2.6387959866220734e-06, "loss": 5.1048, "step": 33700 }, { "epoch": 0.5119926102906337, "eval_loss": 4.943573951721191, "eval_runtime": 194.2102, "eval_samples_per_second": 51.491, "eval_steps_per_second": 6.436, "step": 33700 }, { "epoch": 0.5135118761965406, "grad_norm": 2.234380006790161, "learning_rate": 2.62876254180602e-06, "loss": 5.1042, "step": 33800 }, { "epoch": 0.5135118761965406, "eval_loss": 4.945695400238037, "eval_runtime": 194.1949, "eval_samples_per_second": 51.495, "eval_steps_per_second": 6.437, "step": 33800 }, { "epoch": 0.5150311421024475, "grad_norm": 2.8607873916625977, "learning_rate": 2.6187290969899665e-06, "loss": 5.0977, "step": 33900 }, { "epoch": 0.5150311421024475, "eval_loss": 4.940700531005859, "eval_runtime": 194.1567, "eval_samples_per_second": 51.505, "eval_steps_per_second": 6.438, "step": 33900 }, { "epoch": 0.5165504080083544, "grad_norm": 2.85111403465271, "learning_rate": 2.6086956521739132e-06, "loss": 5.0939, "step": 34000 }, { "epoch": 0.5165504080083544, "eval_loss": 4.934571266174316, "eval_runtime": 194.0792, "eval_samples_per_second": 51.525, "eval_steps_per_second": 6.441, "step": 34000 }, { "epoch": 0.5180696739142614, "grad_norm": 3.2021050453186035, "learning_rate": 2.5986622073578596e-06, "loss": 5.0902, "step": 34100 }, { "epoch": 0.5180696739142614, "eval_loss": 4.940134048461914, "eval_runtime": 194.0612, "eval_samples_per_second": 51.53, "eval_steps_per_second": 6.441, "step": 34100 }, { "epoch": 0.5195889398201683, "grad_norm": 2.500246047973633, "learning_rate": 2.588628762541806e-06, "loss": 5.0851, "step": 34200 }, { "epoch": 0.5195889398201683, "eval_loss": 4.938769340515137, "eval_runtime": 194.1164, "eval_samples_per_second": 51.515, "eval_steps_per_second": 6.439, "step": 34200 }, { "epoch": 0.5211082057260752, "grad_norm": 2.7174882888793945, "learning_rate": 2.5785953177257527e-06, "loss": 5.0917, "step": 34300 }, { "epoch": 0.5211082057260752, "eval_loss": 4.933419704437256, "eval_runtime": 194.3056, "eval_samples_per_second": 51.465, "eval_steps_per_second": 6.433, "step": 34300 }, { "epoch": 0.5226274716319821, "grad_norm": 3.255512237548828, "learning_rate": 2.568561872909699e-06, "loss": 5.0836, "step": 34400 }, { "epoch": 0.5226274716319821, "eval_loss": 4.930218696594238, "eval_runtime": 193.8112, "eval_samples_per_second": 51.597, "eval_steps_per_second": 6.45, "step": 34400 }, { "epoch": 0.5241467375378891, "grad_norm": 2.2356820106506348, "learning_rate": 2.5585284280936454e-06, "loss": 5.0815, "step": 34500 }, { "epoch": 0.5241467375378891, "eval_loss": 4.932159423828125, "eval_runtime": 193.6954, "eval_samples_per_second": 51.627, "eval_steps_per_second": 6.453, "step": 34500 }, { "epoch": 0.525666003443796, "grad_norm": 2.4992058277130127, "learning_rate": 2.548494983277592e-06, "loss": 5.0844, "step": 34600 }, { "epoch": 0.525666003443796, "eval_loss": 4.924154758453369, "eval_runtime": 193.7231, "eval_samples_per_second": 51.62, "eval_steps_per_second": 6.453, "step": 34600 }, { "epoch": 0.5271852693497029, "grad_norm": 2.348440647125244, "learning_rate": 2.5384615384615385e-06, "loss": 5.0789, "step": 34700 }, { "epoch": 0.5271852693497029, "eval_loss": 4.925171852111816, "eval_runtime": 193.7816, "eval_samples_per_second": 51.604, "eval_steps_per_second": 6.451, "step": 34700 }, { "epoch": 0.5287045352556098, "grad_norm": 2.589172840118408, "learning_rate": 2.528428093645485e-06, "loss": 5.0708, "step": 34800 }, { "epoch": 0.5287045352556098, "eval_loss": 4.919689178466797, "eval_runtime": 193.7778, "eval_samples_per_second": 51.605, "eval_steps_per_second": 6.451, "step": 34800 }, { "epoch": 0.5302238011615168, "grad_norm": 2.950510263442993, "learning_rate": 2.5183946488294316e-06, "loss": 5.0707, "step": 34900 }, { "epoch": 0.5302238011615168, "eval_loss": 4.9157304763793945, "eval_runtime": 193.5504, "eval_samples_per_second": 51.666, "eval_steps_per_second": 6.458, "step": 34900 }, { "epoch": 0.5317430670674237, "grad_norm": 3.1693990230560303, "learning_rate": 2.508361204013378e-06, "loss": 5.0653, "step": 35000 }, { "epoch": 0.5317430670674237, "eval_loss": 4.910171985626221, "eval_runtime": 194.0296, "eval_samples_per_second": 51.539, "eval_steps_per_second": 6.442, "step": 35000 }, { "epoch": 0.5332623329733306, "grad_norm": 2.8548085689544678, "learning_rate": 2.4983277591973247e-06, "loss": 5.0706, "step": 35100 }, { "epoch": 0.5332623329733306, "eval_loss": 4.9105353355407715, "eval_runtime": 193.9374, "eval_samples_per_second": 51.563, "eval_steps_per_second": 6.445, "step": 35100 }, { "epoch": 0.5347815988792376, "grad_norm": 1.8217041492462158, "learning_rate": 2.488294314381271e-06, "loss": 5.064, "step": 35200 }, { "epoch": 0.5347815988792376, "eval_loss": 4.906797885894775, "eval_runtime": 194.1314, "eval_samples_per_second": 51.511, "eval_steps_per_second": 6.439, "step": 35200 }, { "epoch": 0.5363008647851445, "grad_norm": 2.089233875274658, "learning_rate": 2.4782608695652173e-06, "loss": 5.0612, "step": 35300 }, { "epoch": 0.5363008647851445, "eval_loss": 4.904172420501709, "eval_runtime": 194.1107, "eval_samples_per_second": 51.517, "eval_steps_per_second": 6.44, "step": 35300 }, { "epoch": 0.5378201306910514, "grad_norm": 2.3225550651550293, "learning_rate": 2.468227424749164e-06, "loss": 5.06, "step": 35400 }, { "epoch": 0.5378201306910514, "eval_loss": 4.904652118682861, "eval_runtime": 194.1265, "eval_samples_per_second": 51.513, "eval_steps_per_second": 6.439, "step": 35400 }, { "epoch": 0.5393393965969583, "grad_norm": 3.1568684577941895, "learning_rate": 2.4581939799331104e-06, "loss": 5.0579, "step": 35500 }, { "epoch": 0.5393393965969583, "eval_loss": 4.90002965927124, "eval_runtime": 194.1042, "eval_samples_per_second": 51.519, "eval_steps_per_second": 6.44, "step": 35500 }, { "epoch": 0.5408586625028653, "grad_norm": 2.8267829418182373, "learning_rate": 2.4481605351170568e-06, "loss": 5.0568, "step": 35600 }, { "epoch": 0.5408586625028653, "eval_loss": 4.89033842086792, "eval_runtime": 194.0764, "eval_samples_per_second": 51.526, "eval_steps_per_second": 6.441, "step": 35600 }, { "epoch": 0.5423779284087722, "grad_norm": 1.987886667251587, "learning_rate": 2.4381270903010035e-06, "loss": 5.0541, "step": 35700 }, { "epoch": 0.5423779284087722, "eval_loss": 4.9011454582214355, "eval_runtime": 194.1549, "eval_samples_per_second": 51.505, "eval_steps_per_second": 6.438, "step": 35700 }, { "epoch": 0.5438971943146791, "grad_norm": 3.215435028076172, "learning_rate": 2.42809364548495e-06, "loss": 5.0506, "step": 35800 }, { "epoch": 0.5438971943146791, "eval_loss": 4.890650272369385, "eval_runtime": 194.1843, "eval_samples_per_second": 51.497, "eval_steps_per_second": 6.437, "step": 35800 }, { "epoch": 0.545416460220586, "grad_norm": 1.9231488704681396, "learning_rate": 2.4180602006688962e-06, "loss": 5.0466, "step": 35900 }, { "epoch": 0.545416460220586, "eval_loss": 4.890570163726807, "eval_runtime": 194.206, "eval_samples_per_second": 51.492, "eval_steps_per_second": 6.436, "step": 35900 }, { "epoch": 0.546935726126493, "grad_norm": 2.3541529178619385, "learning_rate": 2.408026755852843e-06, "loss": 5.0444, "step": 36000 }, { "epoch": 0.546935726126493, "eval_loss": 4.887938022613525, "eval_runtime": 194.1495, "eval_samples_per_second": 51.507, "eval_steps_per_second": 6.438, "step": 36000 }, { "epoch": 0.5484549920323999, "grad_norm": 2.646209478378296, "learning_rate": 2.3979933110367893e-06, "loss": 5.0381, "step": 36100 }, { "epoch": 0.5484549920323999, "eval_loss": 4.883460998535156, "eval_runtime": 194.0814, "eval_samples_per_second": 51.525, "eval_steps_per_second": 6.441, "step": 36100 }, { "epoch": 0.5499742579383068, "grad_norm": 2.2432219982147217, "learning_rate": 2.387959866220736e-06, "loss": 5.0363, "step": 36200 }, { "epoch": 0.5499742579383068, "eval_loss": 4.881083011627197, "eval_runtime": 194.1247, "eval_samples_per_second": 51.513, "eval_steps_per_second": 6.439, "step": 36200 }, { "epoch": 0.5514935238442137, "grad_norm": 2.482103109359741, "learning_rate": 2.3779264214046824e-06, "loss": 5.0416, "step": 36300 }, { "epoch": 0.5514935238442137, "eval_loss": 4.881221294403076, "eval_runtime": 194.1059, "eval_samples_per_second": 51.518, "eval_steps_per_second": 6.44, "step": 36300 }, { "epoch": 0.5530127897501207, "grad_norm": 2.0182697772979736, "learning_rate": 2.3678929765886288e-06, "loss": 5.0287, "step": 36400 }, { "epoch": 0.5530127897501207, "eval_loss": 4.877261161804199, "eval_runtime": 194.0469, "eval_samples_per_second": 51.534, "eval_steps_per_second": 6.442, "step": 36400 }, { "epoch": 0.5545320556560276, "grad_norm": 3.02773380279541, "learning_rate": 2.3578595317725755e-06, "loss": 5.0328, "step": 36500 }, { "epoch": 0.5545320556560276, "eval_loss": 4.869913101196289, "eval_runtime": 194.0627, "eval_samples_per_second": 51.53, "eval_steps_per_second": 6.441, "step": 36500 }, { "epoch": 0.5560513215619345, "grad_norm": 3.1895177364349365, "learning_rate": 2.347826086956522e-06, "loss": 5.0272, "step": 36600 }, { "epoch": 0.5560513215619345, "eval_loss": 4.872635364532471, "eval_runtime": 194.0337, "eval_samples_per_second": 51.537, "eval_steps_per_second": 6.442, "step": 36600 }, { "epoch": 0.5575705874678414, "grad_norm": 2.474367141723633, "learning_rate": 2.337792642140468e-06, "loss": 5.0285, "step": 36700 }, { "epoch": 0.5575705874678414, "eval_loss": 4.866065502166748, "eval_runtime": 193.983, "eval_samples_per_second": 51.551, "eval_steps_per_second": 6.444, "step": 36700 }, { "epoch": 0.5590898533737484, "grad_norm": 3.0734000205993652, "learning_rate": 2.327759197324415e-06, "loss": 5.0238, "step": 36800 }, { "epoch": 0.5590898533737484, "eval_loss": 4.873917102813721, "eval_runtime": 193.8114, "eval_samples_per_second": 51.597, "eval_steps_per_second": 6.45, "step": 36800 }, { "epoch": 0.5606091192796553, "grad_norm": 2.379478931427002, "learning_rate": 2.3177257525083613e-06, "loss": 5.0225, "step": 36900 }, { "epoch": 0.5606091192796553, "eval_loss": 4.864801406860352, "eval_runtime": 193.8549, "eval_samples_per_second": 51.585, "eval_steps_per_second": 6.448, "step": 36900 }, { "epoch": 0.5621283851855622, "grad_norm": 2.6084952354431152, "learning_rate": 2.307692307692308e-06, "loss": 5.0177, "step": 37000 }, { "epoch": 0.5621283851855622, "eval_loss": 4.863184452056885, "eval_runtime": 193.8212, "eval_samples_per_second": 51.594, "eval_steps_per_second": 6.449, "step": 37000 }, { "epoch": 0.5636476510914691, "grad_norm": 2.194261312484741, "learning_rate": 2.2976588628762544e-06, "loss": 5.0167, "step": 37100 }, { "epoch": 0.5636476510914691, "eval_loss": 4.855440139770508, "eval_runtime": 193.8552, "eval_samples_per_second": 51.585, "eval_steps_per_second": 6.448, "step": 37100 }, { "epoch": 0.5651669169973761, "grad_norm": 2.195667028427124, "learning_rate": 2.2876254180602008e-06, "loss": 5.0148, "step": 37200 }, { "epoch": 0.5651669169973761, "eval_loss": 4.857753753662109, "eval_runtime": 193.61, "eval_samples_per_second": 51.65, "eval_steps_per_second": 6.456, "step": 37200 }, { "epoch": 0.566686182903283, "grad_norm": 2.308091402053833, "learning_rate": 2.2775919732441475e-06, "loss": 5.0152, "step": 37300 }, { "epoch": 0.566686182903283, "eval_loss": 4.850945949554443, "eval_runtime": 193.675, "eval_samples_per_second": 51.633, "eval_steps_per_second": 6.454, "step": 37300 }, { "epoch": 0.5682054488091899, "grad_norm": 1.5866217613220215, "learning_rate": 2.267558528428094e-06, "loss": 5.0086, "step": 37400 }, { "epoch": 0.5682054488091899, "eval_loss": 4.856834411621094, "eval_runtime": 194.0621, "eval_samples_per_second": 51.53, "eval_steps_per_second": 6.441, "step": 37400 }, { "epoch": 0.5697247147150968, "grad_norm": 2.3778269290924072, "learning_rate": 2.25752508361204e-06, "loss": 5.008, "step": 37500 }, { "epoch": 0.5697247147150968, "eval_loss": 4.849526405334473, "eval_runtime": 193.9807, "eval_samples_per_second": 51.552, "eval_steps_per_second": 6.444, "step": 37500 }, { "epoch": 0.5712439806210038, "grad_norm": 2.434232234954834, "learning_rate": 2.2474916387959865e-06, "loss": 5.0049, "step": 37600 }, { "epoch": 0.5712439806210038, "eval_loss": 4.849723815917969, "eval_runtime": 194.1152, "eval_samples_per_second": 51.516, "eval_steps_per_second": 6.439, "step": 37600 }, { "epoch": 0.5727632465269107, "grad_norm": 1.9899414777755737, "learning_rate": 2.237458193979933e-06, "loss": 5.0034, "step": 37700 }, { "epoch": 0.5727632465269107, "eval_loss": 4.845240592956543, "eval_runtime": 194.1331, "eval_samples_per_second": 51.511, "eval_steps_per_second": 6.439, "step": 37700 }, { "epoch": 0.5742825124328176, "grad_norm": 2.168919086456299, "learning_rate": 2.2274247491638796e-06, "loss": 4.9989, "step": 37800 }, { "epoch": 0.5742825124328176, "eval_loss": 4.840480327606201, "eval_runtime": 194.2185, "eval_samples_per_second": 51.488, "eval_steps_per_second": 6.436, "step": 37800 }, { "epoch": 0.5758017783387245, "grad_norm": 2.4156546592712402, "learning_rate": 2.217391304347826e-06, "loss": 4.9981, "step": 37900 }, { "epoch": 0.5758017783387245, "eval_loss": 4.837850570678711, "eval_runtime": 194.2958, "eval_samples_per_second": 51.468, "eval_steps_per_second": 6.433, "step": 37900 }, { "epoch": 0.5773210442446315, "grad_norm": 2.725648880004883, "learning_rate": 2.2073578595317723e-06, "loss": 4.9999, "step": 38000 }, { "epoch": 0.5773210442446315, "eval_loss": 4.840028285980225, "eval_runtime": 194.1552, "eval_samples_per_second": 51.505, "eval_steps_per_second": 6.438, "step": 38000 }, { "epoch": 0.5788403101505384, "grad_norm": 2.447983503341675, "learning_rate": 2.197324414715719e-06, "loss": 4.9909, "step": 38100 }, { "epoch": 0.5788403101505384, "eval_loss": 4.840633392333984, "eval_runtime": 200.6709, "eval_samples_per_second": 49.833, "eval_steps_per_second": 6.229, "step": 38100 }, { "epoch": 0.5803595760564453, "grad_norm": 2.5275213718414307, "learning_rate": 2.1872909698996654e-06, "loss": 4.9924, "step": 38200 }, { "epoch": 0.5803595760564453, "eval_loss": 4.838108539581299, "eval_runtime": 194.1742, "eval_samples_per_second": 51.5, "eval_steps_per_second": 6.438, "step": 38200 }, { "epoch": 0.5818788419623522, "grad_norm": 3.0799427032470703, "learning_rate": 2.177257525083612e-06, "loss": 4.9892, "step": 38300 }, { "epoch": 0.5818788419623522, "eval_loss": 4.830769062042236, "eval_runtime": 194.1344, "eval_samples_per_second": 51.511, "eval_steps_per_second": 6.439, "step": 38300 }, { "epoch": 0.5833981078682592, "grad_norm": 2.262266159057617, "learning_rate": 2.1672240802675585e-06, "loss": 4.9869, "step": 38400 }, { "epoch": 0.5833981078682592, "eval_loss": 4.82758903503418, "eval_runtime": 194.1511, "eval_samples_per_second": 51.506, "eval_steps_per_second": 6.438, "step": 38400 }, { "epoch": 0.5849173737741661, "grad_norm": 2.2345926761627197, "learning_rate": 2.157190635451505e-06, "loss": 4.9879, "step": 38500 }, { "epoch": 0.5849173737741661, "eval_loss": 4.826181888580322, "eval_runtime": 194.2128, "eval_samples_per_second": 51.49, "eval_steps_per_second": 6.436, "step": 38500 }, { "epoch": 0.586436639680073, "grad_norm": 1.8996378183364868, "learning_rate": 2.1471571906354516e-06, "loss": 4.9854, "step": 38600 }, { "epoch": 0.586436639680073, "eval_loss": 4.823826789855957, "eval_runtime": 193.9959, "eval_samples_per_second": 51.547, "eval_steps_per_second": 6.443, "step": 38600 }, { "epoch": 0.5879559055859799, "grad_norm": 2.0965209007263184, "learning_rate": 2.137123745819398e-06, "loss": 4.9833, "step": 38700 }, { "epoch": 0.5879559055859799, "eval_loss": 4.819667816162109, "eval_runtime": 194.1201, "eval_samples_per_second": 51.515, "eval_steps_per_second": 6.439, "step": 38700 }, { "epoch": 0.5894751714918869, "grad_norm": 2.005686044692993, "learning_rate": 2.1270903010033443e-06, "loss": 4.9753, "step": 38800 }, { "epoch": 0.5894751714918869, "eval_loss": 4.818215847015381, "eval_runtime": 194.3711, "eval_samples_per_second": 51.448, "eval_steps_per_second": 6.431, "step": 38800 }, { "epoch": 0.5909944373977938, "grad_norm": 2.056711196899414, "learning_rate": 2.117056856187291e-06, "loss": 4.9729, "step": 38900 }, { "epoch": 0.5909944373977938, "eval_loss": 4.815535068511963, "eval_runtime": 194.0241, "eval_samples_per_second": 51.54, "eval_steps_per_second": 6.442, "step": 38900 }, { "epoch": 0.5925137033037007, "grad_norm": 2.186563730239868, "learning_rate": 2.1070234113712374e-06, "loss": 4.9738, "step": 39000 }, { "epoch": 0.5925137033037007, "eval_loss": 4.811450958251953, "eval_runtime": 193.7645, "eval_samples_per_second": 51.609, "eval_steps_per_second": 6.451, "step": 39000 }, { "epoch": 0.5940329692096076, "grad_norm": 2.0862069129943848, "learning_rate": 2.0969899665551837e-06, "loss": 4.9714, "step": 39100 }, { "epoch": 0.5940329692096076, "eval_loss": 4.812065601348877, "eval_runtime": 193.9277, "eval_samples_per_second": 51.566, "eval_steps_per_second": 6.446, "step": 39100 }, { "epoch": 0.5955522351155146, "grad_norm": 2.3990869522094727, "learning_rate": 2.0869565217391305e-06, "loss": 4.9745, "step": 39200 }, { "epoch": 0.5955522351155146, "eval_loss": 4.809053421020508, "eval_runtime": 194.0213, "eval_samples_per_second": 51.541, "eval_steps_per_second": 6.443, "step": 39200 }, { "epoch": 0.5970715010214215, "grad_norm": 2.380688428878784, "learning_rate": 2.076923076923077e-06, "loss": 4.9709, "step": 39300 }, { "epoch": 0.5970715010214215, "eval_loss": 4.810598373413086, "eval_runtime": 193.7901, "eval_samples_per_second": 51.602, "eval_steps_per_second": 6.45, "step": 39300 }, { "epoch": 0.5985907669273284, "grad_norm": 2.6398425102233887, "learning_rate": 2.0668896321070236e-06, "loss": 4.967, "step": 39400 }, { "epoch": 0.5985907669273284, "eval_loss": 4.807140827178955, "eval_runtime": 193.6791, "eval_samples_per_second": 51.632, "eval_steps_per_second": 6.454, "step": 39400 }, { "epoch": 0.6001100328332353, "grad_norm": 2.365203619003296, "learning_rate": 2.05685618729097e-06, "loss": 4.9623, "step": 39500 }, { "epoch": 0.6001100328332353, "eval_loss": 4.804749011993408, "eval_runtime": 193.728, "eval_samples_per_second": 51.619, "eval_steps_per_second": 6.452, "step": 39500 }, { "epoch": 0.6016292987391423, "grad_norm": 2.6509780883789062, "learning_rate": 2.0468227424749163e-06, "loss": 4.963, "step": 39600 }, { "epoch": 0.6016292987391423, "eval_loss": 4.8039093017578125, "eval_runtime": 194.0108, "eval_samples_per_second": 51.544, "eval_steps_per_second": 6.443, "step": 39600 }, { "epoch": 0.6031485646450492, "grad_norm": 2.182466506958008, "learning_rate": 2.036789297658863e-06, "loss": 4.9585, "step": 39700 }, { "epoch": 0.6031485646450492, "eval_loss": 4.798705577850342, "eval_runtime": 194.1051, "eval_samples_per_second": 51.518, "eval_steps_per_second": 6.44, "step": 39700 }, { "epoch": 0.6046678305509561, "grad_norm": 1.9312145709991455, "learning_rate": 2.0267558528428094e-06, "loss": 4.9604, "step": 39800 }, { "epoch": 0.6046678305509561, "eval_loss": 4.799111843109131, "eval_runtime": 194.0025, "eval_samples_per_second": 51.546, "eval_steps_per_second": 6.443, "step": 39800 }, { "epoch": 0.606187096456863, "grad_norm": 2.0514976978302, "learning_rate": 2.0167224080267557e-06, "loss": 4.9551, "step": 39900 }, { "epoch": 0.606187096456863, "eval_loss": 4.792530536651611, "eval_runtime": 194.0231, "eval_samples_per_second": 51.54, "eval_steps_per_second": 6.443, "step": 39900 }, { "epoch": 0.60770636236277, "grad_norm": 2.4416747093200684, "learning_rate": 2.0066889632107025e-06, "loss": 4.9522, "step": 40000 }, { "epoch": 0.60770636236277, "eval_loss": 4.7944655418396, "eval_runtime": 194.2247, "eval_samples_per_second": 51.487, "eval_steps_per_second": 6.436, "step": 40000 }, { "epoch": 0.6092256282686769, "grad_norm": 2.400484561920166, "learning_rate": 1.996655518394649e-06, "loss": 4.9543, "step": 40100 }, { "epoch": 0.6092256282686769, "eval_loss": 4.793302059173584, "eval_runtime": 194.2542, "eval_samples_per_second": 51.479, "eval_steps_per_second": 6.435, "step": 40100 }, { "epoch": 0.6107448941745838, "grad_norm": 1.9967873096466064, "learning_rate": 1.986622073578595e-06, "loss": 4.9507, "step": 40200 }, { "epoch": 0.6107448941745838, "eval_loss": 4.793440818786621, "eval_runtime": 194.3425, "eval_samples_per_second": 51.456, "eval_steps_per_second": 6.432, "step": 40200 }, { "epoch": 0.6122641600804907, "grad_norm": 1.917490839958191, "learning_rate": 1.976588628762542e-06, "loss": 4.9505, "step": 40300 }, { "epoch": 0.6122641600804907, "eval_loss": 4.786988258361816, "eval_runtime": 194.1703, "eval_samples_per_second": 51.501, "eval_steps_per_second": 6.438, "step": 40300 }, { "epoch": 0.6137834259863977, "grad_norm": 2.4164531230926514, "learning_rate": 1.9665551839464883e-06, "loss": 4.9423, "step": 40400 }, { "epoch": 0.6137834259863977, "eval_loss": 4.786272048950195, "eval_runtime": 194.6058, "eval_samples_per_second": 51.386, "eval_steps_per_second": 6.423, "step": 40400 }, { "epoch": 0.6153026918923046, "grad_norm": 2.5412399768829346, "learning_rate": 1.956521739130435e-06, "loss": 4.9447, "step": 40500 }, { "epoch": 0.6153026918923046, "eval_loss": 4.785825729370117, "eval_runtime": 194.2484, "eval_samples_per_second": 51.48, "eval_steps_per_second": 6.435, "step": 40500 }, { "epoch": 0.6168219577982115, "grad_norm": 2.2212436199188232, "learning_rate": 1.9464882943143814e-06, "loss": 4.9432, "step": 40600 }, { "epoch": 0.6168219577982115, "eval_loss": 4.7824625968933105, "eval_runtime": 194.1967, "eval_samples_per_second": 51.494, "eval_steps_per_second": 6.437, "step": 40600 }, { "epoch": 0.6183412237041184, "grad_norm": 2.1287331581115723, "learning_rate": 1.9364548494983277e-06, "loss": 4.9416, "step": 40700 }, { "epoch": 0.6183412237041184, "eval_loss": 4.776528358459473, "eval_runtime": 194.1119, "eval_samples_per_second": 51.517, "eval_steps_per_second": 6.44, "step": 40700 }, { "epoch": 0.6198604896100254, "grad_norm": 1.8793989419937134, "learning_rate": 1.9264214046822745e-06, "loss": 4.9357, "step": 40800 }, { "epoch": 0.6198604896100254, "eval_loss": 4.779613494873047, "eval_runtime": 194.0243, "eval_samples_per_second": 51.54, "eval_steps_per_second": 6.442, "step": 40800 }, { "epoch": 0.6213797555159323, "grad_norm": 1.943474531173706, "learning_rate": 1.916387959866221e-06, "loss": 4.9389, "step": 40900 }, { "epoch": 0.6213797555159323, "eval_loss": 4.774796009063721, "eval_runtime": 194.066, "eval_samples_per_second": 51.529, "eval_steps_per_second": 6.441, "step": 40900 }, { "epoch": 0.6228990214218392, "grad_norm": 2.138035774230957, "learning_rate": 1.9063545150501674e-06, "loss": 4.9344, "step": 41000 }, { "epoch": 0.6228990214218392, "eval_loss": 4.774413108825684, "eval_runtime": 194.289, "eval_samples_per_second": 51.47, "eval_steps_per_second": 6.434, "step": 41000 }, { "epoch": 0.6244182873277461, "grad_norm": 2.1911377906799316, "learning_rate": 1.896321070234114e-06, "loss": 4.9307, "step": 41100 }, { "epoch": 0.6244182873277461, "eval_loss": 4.7724833488464355, "eval_runtime": 194.0879, "eval_samples_per_second": 51.523, "eval_steps_per_second": 6.44, "step": 41100 }, { "epoch": 0.6259375532336531, "grad_norm": 2.186774730682373, "learning_rate": 1.8862876254180603e-06, "loss": 4.9316, "step": 41200 }, { "epoch": 0.6259375532336531, "eval_loss": 4.7727203369140625, "eval_runtime": 193.9834, "eval_samples_per_second": 51.551, "eval_steps_per_second": 6.444, "step": 41200 }, { "epoch": 0.62745681913956, "grad_norm": 2.706834554672241, "learning_rate": 1.8762541806020068e-06, "loss": 4.9244, "step": 41300 }, { "epoch": 0.62745681913956, "eval_loss": 4.769220352172852, "eval_runtime": 193.833, "eval_samples_per_second": 51.591, "eval_steps_per_second": 6.449, "step": 41300 }, { "epoch": 0.6289760850454669, "grad_norm": 2.0782527923583984, "learning_rate": 1.8662207357859534e-06, "loss": 4.9308, "step": 41400 }, { "epoch": 0.6289760850454669, "eval_loss": 4.769233703613281, "eval_runtime": 193.7092, "eval_samples_per_second": 51.624, "eval_steps_per_second": 6.453, "step": 41400 }, { "epoch": 0.6304953509513738, "grad_norm": 2.107680559158325, "learning_rate": 1.8561872909699e-06, "loss": 4.9286, "step": 41500 }, { "epoch": 0.6304953509513738, "eval_loss": 4.765684604644775, "eval_runtime": 193.8101, "eval_samples_per_second": 51.597, "eval_steps_per_second": 6.45, "step": 41500 }, { "epoch": 0.6320146168572808, "grad_norm": 1.861700177192688, "learning_rate": 1.8461538461538462e-06, "loss": 4.925, "step": 41600 }, { "epoch": 0.6320146168572808, "eval_loss": 4.761124134063721, "eval_runtime": 194.0209, "eval_samples_per_second": 51.541, "eval_steps_per_second": 6.443, "step": 41600 }, { "epoch": 0.6335338827631877, "grad_norm": 2.256538152694702, "learning_rate": 1.8361204013377928e-06, "loss": 4.9214, "step": 41700 }, { "epoch": 0.6335338827631877, "eval_loss": 4.761186122894287, "eval_runtime": 193.8553, "eval_samples_per_second": 51.585, "eval_steps_per_second": 6.448, "step": 41700 }, { "epoch": 0.6350531486690946, "grad_norm": 1.720786213874817, "learning_rate": 1.8260869565217394e-06, "loss": 4.9188, "step": 41800 }, { "epoch": 0.6350531486690946, "eval_loss": 4.75638484954834, "eval_runtime": 194.0169, "eval_samples_per_second": 51.542, "eval_steps_per_second": 6.443, "step": 41800 }, { "epoch": 0.6365724145750015, "grad_norm": 1.9223599433898926, "learning_rate": 1.8160535117056857e-06, "loss": 4.9162, "step": 41900 }, { "epoch": 0.6365724145750015, "eval_loss": 4.757732391357422, "eval_runtime": 194.1596, "eval_samples_per_second": 51.504, "eval_steps_per_second": 6.438, "step": 41900 }, { "epoch": 0.6380916804809085, "grad_norm": 1.7804436683654785, "learning_rate": 1.8060200668896322e-06, "loss": 4.9158, "step": 42000 }, { "epoch": 0.6380916804809085, "eval_loss": 4.757546424865723, "eval_runtime": 194.2282, "eval_samples_per_second": 51.486, "eval_steps_per_second": 6.436, "step": 42000 }, { "epoch": 0.6396109463868154, "grad_norm": 1.9580631256103516, "learning_rate": 1.7959866220735788e-06, "loss": 4.9095, "step": 42100 }, { "epoch": 0.6396109463868154, "eval_loss": 4.752386093139648, "eval_runtime": 194.1745, "eval_samples_per_second": 51.5, "eval_steps_per_second": 6.438, "step": 42100 }, { "epoch": 0.6411302122927223, "grad_norm": 2.1417272090911865, "learning_rate": 1.7859531772575253e-06, "loss": 4.9134, "step": 42200 }, { "epoch": 0.6411302122927223, "eval_loss": 4.749510765075684, "eval_runtime": 194.1938, "eval_samples_per_second": 51.495, "eval_steps_per_second": 6.437, "step": 42200 }, { "epoch": 0.6426494781986292, "grad_norm": 2.4839389324188232, "learning_rate": 1.7759197324414717e-06, "loss": 4.9116, "step": 42300 }, { "epoch": 0.6426494781986292, "eval_loss": 4.752679824829102, "eval_runtime": 194.1618, "eval_samples_per_second": 51.503, "eval_steps_per_second": 6.438, "step": 42300 }, { "epoch": 0.6441687441045362, "grad_norm": 2.5596067905426025, "learning_rate": 1.7658862876254182e-06, "loss": 4.9078, "step": 42400 }, { "epoch": 0.6441687441045362, "eval_loss": 4.742520332336426, "eval_runtime": 194.1697, "eval_samples_per_second": 51.501, "eval_steps_per_second": 6.438, "step": 42400 }, { "epoch": 0.6456880100104431, "grad_norm": 1.7020114660263062, "learning_rate": 1.7558528428093648e-06, "loss": 4.9063, "step": 42500 }, { "epoch": 0.6456880100104431, "eval_loss": 4.745018005371094, "eval_runtime": 194.2677, "eval_samples_per_second": 51.475, "eval_steps_per_second": 6.434, "step": 42500 }, { "epoch": 0.64720727591635, "grad_norm": 1.83507239818573, "learning_rate": 1.745819397993311e-06, "loss": 4.9067, "step": 42600 }, { "epoch": 0.64720727591635, "eval_loss": 4.749469757080078, "eval_runtime": 194.1266, "eval_samples_per_second": 51.513, "eval_steps_per_second": 6.439, "step": 42600 }, { "epoch": 0.6487265418222569, "grad_norm": 1.7852286100387573, "learning_rate": 1.7357859531772575e-06, "loss": 4.909, "step": 42700 }, { "epoch": 0.6487265418222569, "eval_loss": 4.74142599105835, "eval_runtime": 194.3029, "eval_samples_per_second": 51.466, "eval_steps_per_second": 6.433, "step": 42700 }, { "epoch": 0.6502458077281639, "grad_norm": 1.9356688261032104, "learning_rate": 1.7257525083612038e-06, "loss": 4.8968, "step": 42800 }, { "epoch": 0.6502458077281639, "eval_loss": 4.742361068725586, "eval_runtime": 194.1912, "eval_samples_per_second": 51.496, "eval_steps_per_second": 6.437, "step": 42800 }, { "epoch": 0.6517650736340708, "grad_norm": 2.4372880458831787, "learning_rate": 1.7157190635451504e-06, "loss": 4.9034, "step": 42900 }, { "epoch": 0.6517650736340708, "eval_loss": 4.737247467041016, "eval_runtime": 194.1333, "eval_samples_per_second": 51.511, "eval_steps_per_second": 6.439, "step": 42900 }, { "epoch": 0.6532843395399777, "grad_norm": 1.9416236877441406, "learning_rate": 1.705685618729097e-06, "loss": 4.8978, "step": 43000 }, { "epoch": 0.6532843395399777, "eval_loss": 4.7349853515625, "eval_runtime": 194.1706, "eval_samples_per_second": 51.501, "eval_steps_per_second": 6.438, "step": 43000 }, { "epoch": 0.6548036054458846, "grad_norm": 2.3514084815979004, "learning_rate": 1.6956521739130435e-06, "loss": 4.8963, "step": 43100 }, { "epoch": 0.6548036054458846, "eval_loss": 4.7388434410095215, "eval_runtime": 194.3316, "eval_samples_per_second": 51.458, "eval_steps_per_second": 6.432, "step": 43100 }, { "epoch": 0.6563228713517916, "grad_norm": 2.028310537338257, "learning_rate": 1.6856187290969898e-06, "loss": 4.8961, "step": 43200 }, { "epoch": 0.6563228713517916, "eval_loss": 4.735996723175049, "eval_runtime": 194.0261, "eval_samples_per_second": 51.539, "eval_steps_per_second": 6.442, "step": 43200 }, { "epoch": 0.6578421372576985, "grad_norm": 2.360321521759033, "learning_rate": 1.6755852842809363e-06, "loss": 4.8892, "step": 43300 }, { "epoch": 0.6578421372576985, "eval_loss": 4.731908798217773, "eval_runtime": 194.2382, "eval_samples_per_second": 51.483, "eval_steps_per_second": 6.435, "step": 43300 }, { "epoch": 0.6593614031636054, "grad_norm": 2.0614426136016846, "learning_rate": 1.665551839464883e-06, "loss": 4.8911, "step": 43400 }, { "epoch": 0.6593614031636054, "eval_loss": 4.727632999420166, "eval_runtime": 194.0495, "eval_samples_per_second": 51.533, "eval_steps_per_second": 6.442, "step": 43400 }, { "epoch": 0.6608806690695123, "grad_norm": 2.058509349822998, "learning_rate": 1.6555183946488294e-06, "loss": 4.8883, "step": 43500 }, { "epoch": 0.6608806690695123, "eval_loss": 4.72844123840332, "eval_runtime": 194.0456, "eval_samples_per_second": 51.534, "eval_steps_per_second": 6.442, "step": 43500 }, { "epoch": 0.6623999349754193, "grad_norm": 1.7108250856399536, "learning_rate": 1.6454849498327758e-06, "loss": 4.8866, "step": 43600 }, { "epoch": 0.6623999349754193, "eval_loss": 4.726889133453369, "eval_runtime": 193.8998, "eval_samples_per_second": 51.573, "eval_steps_per_second": 6.447, "step": 43600 }, { "epoch": 0.6639192008813262, "grad_norm": 1.871711254119873, "learning_rate": 1.6354515050167223e-06, "loss": 4.888, "step": 43700 }, { "epoch": 0.6639192008813262, "eval_loss": 4.726442813873291, "eval_runtime": 193.895, "eval_samples_per_second": 51.574, "eval_steps_per_second": 6.447, "step": 43700 }, { "epoch": 0.6654384667872331, "grad_norm": 1.9516098499298096, "learning_rate": 1.6254180602006689e-06, "loss": 4.887, "step": 43800 }, { "epoch": 0.6654384667872331, "eval_loss": 4.72707986831665, "eval_runtime": 193.7412, "eval_samples_per_second": 51.615, "eval_steps_per_second": 6.452, "step": 43800 }, { "epoch": 0.66695773269314, "grad_norm": 1.870690107345581, "learning_rate": 1.6153846153846154e-06, "loss": 4.8794, "step": 43900 }, { "epoch": 0.66695773269314, "eval_loss": 4.7214789390563965, "eval_runtime": 193.7498, "eval_samples_per_second": 51.613, "eval_steps_per_second": 6.452, "step": 43900 }, { "epoch": 0.668476998599047, "grad_norm": 1.8577009439468384, "learning_rate": 1.6053511705685618e-06, "loss": 4.8803, "step": 44000 }, { "epoch": 0.668476998599047, "eval_loss": 4.719671726226807, "eval_runtime": 193.9858, "eval_samples_per_second": 51.55, "eval_steps_per_second": 6.444, "step": 44000 }, { "epoch": 0.6699962645049539, "grad_norm": 2.1134140491485596, "learning_rate": 1.5953177257525083e-06, "loss": 4.879, "step": 44100 }, { "epoch": 0.6699962645049539, "eval_loss": 4.717536926269531, "eval_runtime": 193.7322, "eval_samples_per_second": 51.618, "eval_steps_per_second": 6.452, "step": 44100 }, { "epoch": 0.6715155304108608, "grad_norm": 2.10524845123291, "learning_rate": 1.5852842809364549e-06, "loss": 4.8782, "step": 44200 }, { "epoch": 0.6715155304108608, "eval_loss": 4.712420463562012, "eval_runtime": 194.1278, "eval_samples_per_second": 51.512, "eval_steps_per_second": 6.439, "step": 44200 }, { "epoch": 0.6730347963167677, "grad_norm": 1.9747872352600098, "learning_rate": 1.5752508361204012e-06, "loss": 4.8782, "step": 44300 }, { "epoch": 0.6730347963167677, "eval_loss": 4.716573238372803, "eval_runtime": 194.2422, "eval_samples_per_second": 51.482, "eval_steps_per_second": 6.435, "step": 44300 }, { "epoch": 0.6745540622226747, "grad_norm": 1.9124640226364136, "learning_rate": 1.5652173913043478e-06, "loss": 4.8808, "step": 44400 }, { "epoch": 0.6745540622226747, "eval_loss": 4.715909481048584, "eval_runtime": 199.7142, "eval_samples_per_second": 50.072, "eval_steps_per_second": 6.259, "step": 44400 }, { "epoch": 0.6760733281285816, "grad_norm": 1.971144676208496, "learning_rate": 1.5551839464882943e-06, "loss": 4.8739, "step": 44500 }, { "epoch": 0.6760733281285816, "eval_loss": 4.714458465576172, "eval_runtime": 194.1832, "eval_samples_per_second": 51.498, "eval_steps_per_second": 6.437, "step": 44500 }, { "epoch": 0.6775925940344885, "grad_norm": 2.0993101596832275, "learning_rate": 1.5451505016722409e-06, "loss": 4.8733, "step": 44600 }, { "epoch": 0.6775925940344885, "eval_loss": 4.708896636962891, "eval_runtime": 194.1323, "eval_samples_per_second": 51.511, "eval_steps_per_second": 6.439, "step": 44600 }, { "epoch": 0.6791118599403954, "grad_norm": 1.5517523288726807, "learning_rate": 1.5351170568561872e-06, "loss": 4.877, "step": 44700 }, { "epoch": 0.6791118599403954, "eval_loss": 4.706016540527344, "eval_runtime": 194.2224, "eval_samples_per_second": 51.487, "eval_steps_per_second": 6.436, "step": 44700 }, { "epoch": 0.6806311258463024, "grad_norm": 1.6051702499389648, "learning_rate": 1.5250836120401338e-06, "loss": 4.873, "step": 44800 }, { "epoch": 0.6806311258463024, "eval_loss": 4.71004581451416, "eval_runtime": 194.3369, "eval_samples_per_second": 51.457, "eval_steps_per_second": 6.432, "step": 44800 }, { "epoch": 0.6821503917522093, "grad_norm": 1.8578929901123047, "learning_rate": 1.5150501672240803e-06, "loss": 4.8645, "step": 44900 }, { "epoch": 0.6821503917522093, "eval_loss": 4.7041826248168945, "eval_runtime": 194.6352, "eval_samples_per_second": 51.378, "eval_steps_per_second": 6.422, "step": 44900 }, { "epoch": 0.6836696576581162, "grad_norm": 1.8288882970809937, "learning_rate": 1.5050167224080269e-06, "loss": 4.8717, "step": 45000 }, { "epoch": 0.6836696576581162, "eval_loss": 4.704262733459473, "eval_runtime": 194.5807, "eval_samples_per_second": 51.393, "eval_steps_per_second": 6.424, "step": 45000 }, { "epoch": 0.6851889235640231, "grad_norm": 1.766317367553711, "learning_rate": 1.4949832775919732e-06, "loss": 4.8658, "step": 45100 }, { "epoch": 0.6851889235640231, "eval_loss": 4.700209140777588, "eval_runtime": 194.2902, "eval_samples_per_second": 51.469, "eval_steps_per_second": 6.434, "step": 45100 }, { "epoch": 0.6867081894699301, "grad_norm": 2.1722605228424072, "learning_rate": 1.4849498327759198e-06, "loss": 4.868, "step": 45200 }, { "epoch": 0.6867081894699301, "eval_loss": 4.7045111656188965, "eval_runtime": 194.3122, "eval_samples_per_second": 51.464, "eval_steps_per_second": 6.433, "step": 45200 }, { "epoch": 0.688227455375837, "grad_norm": 2.2012276649475098, "learning_rate": 1.4749163879598663e-06, "loss": 4.861, "step": 45300 }, { "epoch": 0.688227455375837, "eval_loss": 4.699077606201172, "eval_runtime": 194.099, "eval_samples_per_second": 51.52, "eval_steps_per_second": 6.44, "step": 45300 }, { "epoch": 0.6897467212817439, "grad_norm": 1.9373100996017456, "learning_rate": 1.4648829431438129e-06, "loss": 4.8624, "step": 45400 }, { "epoch": 0.6897467212817439, "eval_loss": 4.699510097503662, "eval_runtime": 194.2648, "eval_samples_per_second": 51.476, "eval_steps_per_second": 6.435, "step": 45400 }, { "epoch": 0.6912659871876508, "grad_norm": 1.5436214208602905, "learning_rate": 1.4548494983277592e-06, "loss": 4.8669, "step": 45500 }, { "epoch": 0.6912659871876508, "eval_loss": 4.6950531005859375, "eval_runtime": 194.1491, "eval_samples_per_second": 51.507, "eval_steps_per_second": 6.438, "step": 45500 }, { "epoch": 0.6927852530935578, "grad_norm": 1.868397831916809, "learning_rate": 1.4448160535117058e-06, "loss": 4.8588, "step": 45600 }, { "epoch": 0.6927852530935578, "eval_loss": 4.699548244476318, "eval_runtime": 194.2333, "eval_samples_per_second": 51.484, "eval_steps_per_second": 6.436, "step": 45600 }, { "epoch": 0.6943045189994647, "grad_norm": 1.9601666927337646, "learning_rate": 1.4347826086956523e-06, "loss": 4.8583, "step": 45700 }, { "epoch": 0.6943045189994647, "eval_loss": 4.697216510772705, "eval_runtime": 194.113, "eval_samples_per_second": 51.516, "eval_steps_per_second": 6.44, "step": 45700 }, { "epoch": 0.6958237849053716, "grad_norm": 2.128359317779541, "learning_rate": 1.4247491638795989e-06, "loss": 4.8553, "step": 45800 }, { "epoch": 0.6958237849053716, "eval_loss": 4.695890426635742, "eval_runtime": 194.2141, "eval_samples_per_second": 51.49, "eval_steps_per_second": 6.436, "step": 45800 }, { "epoch": 0.6973430508112785, "grad_norm": 1.7737051248550415, "learning_rate": 1.4147157190635452e-06, "loss": 4.8552, "step": 45900 }, { "epoch": 0.6973430508112785, "eval_loss": 4.692898273468018, "eval_runtime": 194.0574, "eval_samples_per_second": 51.531, "eval_steps_per_second": 6.441, "step": 45900 }, { "epoch": 0.6988623167171855, "grad_norm": 1.8772127628326416, "learning_rate": 1.4046822742474917e-06, "loss": 4.8528, "step": 46000 }, { "epoch": 0.6988623167171855, "eval_loss": 4.690573215484619, "eval_runtime": 193.9137, "eval_samples_per_second": 51.569, "eval_steps_per_second": 6.446, "step": 46000 }, { "epoch": 0.7003815826230924, "grad_norm": 1.9277006387710571, "learning_rate": 1.3946488294314383e-06, "loss": 4.851, "step": 46100 }, { "epoch": 0.7003815826230924, "eval_loss": 4.688443183898926, "eval_runtime": 193.7729, "eval_samples_per_second": 51.607, "eval_steps_per_second": 6.451, "step": 46100 }, { "epoch": 0.7019008485289993, "grad_norm": 1.4775947332382202, "learning_rate": 1.3846153846153846e-06, "loss": 4.8477, "step": 46200 }, { "epoch": 0.7019008485289993, "eval_loss": 4.689602375030518, "eval_runtime": 193.799, "eval_samples_per_second": 51.6, "eval_steps_per_second": 6.45, "step": 46200 }, { "epoch": 0.7034201144349062, "grad_norm": 1.9227460622787476, "learning_rate": 1.374581939799331e-06, "loss": 4.8447, "step": 46300 }, { "epoch": 0.7034201144349062, "eval_loss": 4.6872076988220215, "eval_runtime": 193.971, "eval_samples_per_second": 51.554, "eval_steps_per_second": 6.444, "step": 46300 }, { "epoch": 0.7049393803408132, "grad_norm": 1.8744120597839355, "learning_rate": 1.3645484949832775e-06, "loss": 4.8443, "step": 46400 }, { "epoch": 0.7049393803408132, "eval_loss": 4.684128284454346, "eval_runtime": 193.7921, "eval_samples_per_second": 51.602, "eval_steps_per_second": 6.45, "step": 46400 }, { "epoch": 0.7064586462467201, "grad_norm": 1.858379602432251, "learning_rate": 1.354515050167224e-06, "loss": 4.8418, "step": 46500 }, { "epoch": 0.7064586462467201, "eval_loss": 4.681851387023926, "eval_runtime": 194.0781, "eval_samples_per_second": 51.526, "eval_steps_per_second": 6.441, "step": 46500 }, { "epoch": 0.707977912152627, "grad_norm": 1.8594979047775269, "learning_rate": 1.3444816053511706e-06, "loss": 4.8433, "step": 46600 }, { "epoch": 0.707977912152627, "eval_loss": 4.6782755851745605, "eval_runtime": 194.1927, "eval_samples_per_second": 51.495, "eval_steps_per_second": 6.437, "step": 46600 }, { "epoch": 0.7094971780585339, "grad_norm": 1.8931249380111694, "learning_rate": 1.334448160535117e-06, "loss": 4.8404, "step": 46700 }, { "epoch": 0.7094971780585339, "eval_loss": 4.683481216430664, "eval_runtime": 194.2623, "eval_samples_per_second": 51.477, "eval_steps_per_second": 6.435, "step": 46700 }, { "epoch": 0.7110164439644409, "grad_norm": 1.5091091394424438, "learning_rate": 1.3244147157190635e-06, "loss": 4.8423, "step": 46800 }, { "epoch": 0.7110164439644409, "eval_loss": 4.679195880889893, "eval_runtime": 194.1717, "eval_samples_per_second": 51.501, "eval_steps_per_second": 6.438, "step": 46800 }, { "epoch": 0.7125357098703478, "grad_norm": 1.5617057085037231, "learning_rate": 1.31438127090301e-06, "loss": 4.8384, "step": 46900 }, { "epoch": 0.7125357098703478, "eval_loss": 4.675555229187012, "eval_runtime": 195.1352, "eval_samples_per_second": 51.247, "eval_steps_per_second": 6.406, "step": 46900 }, { "epoch": 0.7140549757762547, "grad_norm": 1.5074530839920044, "learning_rate": 1.3043478260869566e-06, "loss": 4.8389, "step": 47000 }, { "epoch": 0.7140549757762547, "eval_loss": 4.67551851272583, "eval_runtime": 194.1856, "eval_samples_per_second": 51.497, "eval_steps_per_second": 6.437, "step": 47000 }, { "epoch": 0.7155742416821615, "grad_norm": 1.5850820541381836, "learning_rate": 1.294314381270903e-06, "loss": 4.8428, "step": 47100 }, { "epoch": 0.7155742416821615, "eval_loss": 4.677995681762695, "eval_runtime": 194.3756, "eval_samples_per_second": 51.447, "eval_steps_per_second": 6.431, "step": 47100 }, { "epoch": 0.7170935075880686, "grad_norm": 1.7918612957000732, "learning_rate": 1.2842809364548495e-06, "loss": 4.8399, "step": 47200 }, { "epoch": 0.7170935075880686, "eval_loss": 4.672911167144775, "eval_runtime": 194.224, "eval_samples_per_second": 51.487, "eval_steps_per_second": 6.436, "step": 47200 }, { "epoch": 0.7186127734939755, "grad_norm": 1.955620527267456, "learning_rate": 1.274247491638796e-06, "loss": 4.8338, "step": 47300 }, { "epoch": 0.7186127734939755, "eval_loss": 4.67067289352417, "eval_runtime": 194.1805, "eval_samples_per_second": 51.498, "eval_steps_per_second": 6.437, "step": 47300 }, { "epoch": 0.7201320393998824, "grad_norm": 1.994454264640808, "learning_rate": 1.2642140468227424e-06, "loss": 4.8314, "step": 47400 }, { "epoch": 0.7201320393998824, "eval_loss": 4.672824859619141, "eval_runtime": 194.2432, "eval_samples_per_second": 51.482, "eval_steps_per_second": 6.435, "step": 47400 }, { "epoch": 0.7216513053057892, "grad_norm": 1.8769866228103638, "learning_rate": 1.254180602006689e-06, "loss": 4.8321, "step": 47500 }, { "epoch": 0.7216513053057892, "eval_loss": 4.67031717300415, "eval_runtime": 194.2044, "eval_samples_per_second": 51.492, "eval_steps_per_second": 6.437, "step": 47500 }, { "epoch": 0.7231705712116963, "grad_norm": 1.7346811294555664, "learning_rate": 1.2441471571906355e-06, "loss": 4.8351, "step": 47600 }, { "epoch": 0.7231705712116963, "eval_loss": 4.667263031005859, "eval_runtime": 194.3279, "eval_samples_per_second": 51.459, "eval_steps_per_second": 6.432, "step": 47600 }, { "epoch": 0.7246898371176032, "grad_norm": 2.0054638385772705, "learning_rate": 1.234113712374582e-06, "loss": 4.8311, "step": 47700 }, { "epoch": 0.7246898371176032, "eval_loss": 4.670699119567871, "eval_runtime": 194.3103, "eval_samples_per_second": 51.464, "eval_steps_per_second": 6.433, "step": 47700 }, { "epoch": 0.72620910302351, "grad_norm": 1.9293532371520996, "learning_rate": 1.2240802675585284e-06, "loss": 4.8253, "step": 47800 }, { "epoch": 0.72620910302351, "eval_loss": 4.665504455566406, "eval_runtime": 194.5301, "eval_samples_per_second": 51.406, "eval_steps_per_second": 6.426, "step": 47800 }, { "epoch": 0.727728368929417, "grad_norm": 1.612265944480896, "learning_rate": 1.214046822742475e-06, "loss": 4.826, "step": 47900 }, { "epoch": 0.727728368929417, "eval_loss": 4.665849685668945, "eval_runtime": 194.1122, "eval_samples_per_second": 51.517, "eval_steps_per_second": 6.44, "step": 47900 }, { "epoch": 0.729247634835324, "grad_norm": 1.7139407396316528, "learning_rate": 1.2040133779264215e-06, "loss": 4.8267, "step": 48000 }, { "epoch": 0.729247634835324, "eval_loss": 4.663412570953369, "eval_runtime": 194.3177, "eval_samples_per_second": 51.462, "eval_steps_per_second": 6.433, "step": 48000 }, { "epoch": 0.7307669007412309, "grad_norm": 1.8362255096435547, "learning_rate": 1.193979933110368e-06, "loss": 4.826, "step": 48100 }, { "epoch": 0.7307669007412309, "eval_loss": 4.6637797355651855, "eval_runtime": 194.0467, "eval_samples_per_second": 51.534, "eval_steps_per_second": 6.442, "step": 48100 }, { "epoch": 0.7322861666471377, "grad_norm": 1.3808461427688599, "learning_rate": 1.1839464882943144e-06, "loss": 4.8203, "step": 48200 }, { "epoch": 0.7322861666471377, "eval_loss": 4.66359281539917, "eval_runtime": 194.0835, "eval_samples_per_second": 51.524, "eval_steps_per_second": 6.441, "step": 48200 }, { "epoch": 0.7338054325530446, "grad_norm": 2.090758800506592, "learning_rate": 1.173913043478261e-06, "loss": 4.8246, "step": 48300 }, { "epoch": 0.7338054325530446, "eval_loss": 4.658617973327637, "eval_runtime": 193.919, "eval_samples_per_second": 51.568, "eval_steps_per_second": 6.446, "step": 48300 }, { "epoch": 0.7353246984589517, "grad_norm": 1.410666584968567, "learning_rate": 1.1638795986622075e-06, "loss": 4.8198, "step": 48400 }, { "epoch": 0.7353246984589517, "eval_loss": 4.662432670593262, "eval_runtime": 193.8752, "eval_samples_per_second": 51.58, "eval_steps_per_second": 6.447, "step": 48400 }, { "epoch": 0.7368439643648586, "grad_norm": 1.5587624311447144, "learning_rate": 1.153846153846154e-06, "loss": 4.8185, "step": 48500 }, { "epoch": 0.7368439643648586, "eval_loss": 4.656804084777832, "eval_runtime": 193.773, "eval_samples_per_second": 51.607, "eval_steps_per_second": 6.451, "step": 48500 }, { "epoch": 0.7383632302707654, "grad_norm": 1.3816115856170654, "learning_rate": 1.1438127090301004e-06, "loss": 4.8168, "step": 48600 }, { "epoch": 0.7383632302707654, "eval_loss": 4.656231880187988, "eval_runtime": 193.8565, "eval_samples_per_second": 51.585, "eval_steps_per_second": 6.448, "step": 48600 }, { "epoch": 0.7398824961766723, "grad_norm": 1.927064299583435, "learning_rate": 1.133779264214047e-06, "loss": 4.8182, "step": 48700 }, { "epoch": 0.7398824961766723, "eval_loss": 4.656589031219482, "eval_runtime": 193.8744, "eval_samples_per_second": 51.58, "eval_steps_per_second": 6.447, "step": 48700 }, { "epoch": 0.7414017620825794, "grad_norm": 1.6699544191360474, "learning_rate": 1.1237458193979933e-06, "loss": 4.8185, "step": 48800 }, { "epoch": 0.7414017620825794, "eval_loss": 4.655017852783203, "eval_runtime": 193.8675, "eval_samples_per_second": 51.582, "eval_steps_per_second": 6.448, "step": 48800 }, { "epoch": 0.7429210279884862, "grad_norm": 1.3378312587738037, "learning_rate": 1.1137123745819398e-06, "loss": 4.815, "step": 48900 }, { "epoch": 0.7429210279884862, "eval_loss": 4.657501220703125, "eval_runtime": 194.1277, "eval_samples_per_second": 51.512, "eval_steps_per_second": 6.439, "step": 48900 }, { "epoch": 0.7444402938943931, "grad_norm": 1.6146018505096436, "learning_rate": 1.1036789297658862e-06, "loss": 4.8145, "step": 49000 }, { "epoch": 0.7444402938943931, "eval_loss": 4.6548943519592285, "eval_runtime": 194.2412, "eval_samples_per_second": 51.482, "eval_steps_per_second": 6.435, "step": 49000 }, { "epoch": 0.7459595598003, "grad_norm": 1.5952975749969482, "learning_rate": 1.0936454849498327e-06, "loss": 4.813, "step": 49100 }, { "epoch": 0.7459595598003, "eval_loss": 4.651684284210205, "eval_runtime": 194.2268, "eval_samples_per_second": 51.486, "eval_steps_per_second": 6.436, "step": 49100 }, { "epoch": 0.747478825706207, "grad_norm": 1.5946011543273926, "learning_rate": 1.0836120401337793e-06, "loss": 4.8148, "step": 49200 }, { "epoch": 0.747478825706207, "eval_loss": 4.651627540588379, "eval_runtime": 194.2502, "eval_samples_per_second": 51.48, "eval_steps_per_second": 6.435, "step": 49200 }, { "epoch": 0.748998091612114, "grad_norm": 1.4675341844558716, "learning_rate": 1.0735785953177258e-06, "loss": 4.81, "step": 49300 }, { "epoch": 0.748998091612114, "eval_loss": 4.650761127471924, "eval_runtime": 194.1574, "eval_samples_per_second": 51.505, "eval_steps_per_second": 6.438, "step": 49300 }, { "epoch": 0.7505173575180208, "grad_norm": 1.6807961463928223, "learning_rate": 1.0635451505016722e-06, "loss": 4.8115, "step": 49400 }, { "epoch": 0.7505173575180208, "eval_loss": 4.6511101722717285, "eval_runtime": 194.374, "eval_samples_per_second": 51.447, "eval_steps_per_second": 6.431, "step": 49400 }, { "epoch": 0.7520366234239277, "grad_norm": 1.4846396446228027, "learning_rate": 1.0535117056856187e-06, "loss": 4.8069, "step": 49500 }, { "epoch": 0.7520366234239277, "eval_loss": 4.647155284881592, "eval_runtime": 194.3314, "eval_samples_per_second": 51.458, "eval_steps_per_second": 6.432, "step": 49500 }, { "epoch": 0.7535558893298347, "grad_norm": 1.5872676372528076, "learning_rate": 1.0434782608695653e-06, "loss": 4.8084, "step": 49600 }, { "epoch": 0.7535558893298347, "eval_loss": 4.644804954528809, "eval_runtime": 194.3764, "eval_samples_per_second": 51.447, "eval_steps_per_second": 6.431, "step": 49600 }, { "epoch": 0.7550751552357416, "grad_norm": 1.6138330698013306, "learning_rate": 1.0334448160535118e-06, "loss": 4.8086, "step": 49700 }, { "epoch": 0.7550751552357416, "eval_loss": 4.644802093505859, "eval_runtime": 194.3935, "eval_samples_per_second": 51.442, "eval_steps_per_second": 6.43, "step": 49700 }, { "epoch": 0.7565944211416485, "grad_norm": 1.6802724599838257, "learning_rate": 1.0234113712374581e-06, "loss": 4.8052, "step": 49800 }, { "epoch": 0.7565944211416485, "eval_loss": 4.646471977233887, "eval_runtime": 194.4012, "eval_samples_per_second": 51.44, "eval_steps_per_second": 6.43, "step": 49800 }, { "epoch": 0.7581136870475554, "grad_norm": 1.7580209970474243, "learning_rate": 1.0133779264214047e-06, "loss": 4.805, "step": 49900 }, { "epoch": 0.7581136870475554, "eval_loss": 4.641211032867432, "eval_runtime": 194.5581, "eval_samples_per_second": 51.399, "eval_steps_per_second": 6.425, "step": 49900 }, { "epoch": 0.7596329529534624, "grad_norm": 1.732718586921692, "learning_rate": 1.0033444816053512e-06, "loss": 4.803, "step": 50000 }, { "epoch": 0.7596329529534624, "eval_loss": 4.643296241760254, "eval_runtime": 194.3028, "eval_samples_per_second": 51.466, "eval_steps_per_second": 6.433, "step": 50000 }, { "epoch": 0.7611522188593693, "grad_norm": 1.6775901317596436, "learning_rate": 9.933110367892976e-07, "loss": 4.8009, "step": 50100 }, { "epoch": 0.7611522188593693, "eval_loss": 4.639660358428955, "eval_runtime": 194.3411, "eval_samples_per_second": 51.456, "eval_steps_per_second": 6.432, "step": 50100 }, { "epoch": 0.7626714847652762, "grad_norm": 1.4055508375167847, "learning_rate": 9.832775919732441e-07, "loss": 4.8022, "step": 50200 }, { "epoch": 0.7626714847652762, "eval_loss": 4.637509346008301, "eval_runtime": 194.3272, "eval_samples_per_second": 51.46, "eval_steps_per_second": 6.432, "step": 50200 }, { "epoch": 0.7641907506711831, "grad_norm": 1.6316554546356201, "learning_rate": 9.732441471571907e-07, "loss": 4.8019, "step": 50300 }, { "epoch": 0.7641907506711831, "eval_loss": 4.6399359703063965, "eval_runtime": 194.5106, "eval_samples_per_second": 51.411, "eval_steps_per_second": 6.426, "step": 50300 }, { "epoch": 0.7657100165770901, "grad_norm": 1.87636137008667, "learning_rate": 9.632107023411372e-07, "loss": 4.8021, "step": 50400 }, { "epoch": 0.7657100165770901, "eval_loss": 4.637732028961182, "eval_runtime": 194.1555, "eval_samples_per_second": 51.505, "eval_steps_per_second": 6.438, "step": 50400 }, { "epoch": 0.767229282482997, "grad_norm": 1.5560215711593628, "learning_rate": 9.531772575250837e-07, "loss": 4.797, "step": 50500 }, { "epoch": 0.767229282482997, "eval_loss": 4.636757850646973, "eval_runtime": 194.187, "eval_samples_per_second": 51.497, "eval_steps_per_second": 6.437, "step": 50500 }, { "epoch": 0.7687485483889039, "grad_norm": 1.5681828260421753, "learning_rate": 9.431438127090301e-07, "loss": 4.7981, "step": 50600 }, { "epoch": 0.7687485483889039, "eval_loss": 4.63712215423584, "eval_runtime": 194.0875, "eval_samples_per_second": 51.523, "eval_steps_per_second": 6.44, "step": 50600 }, { "epoch": 0.7702678142948108, "grad_norm": 1.725135087966919, "learning_rate": 9.331103678929767e-07, "loss": 4.7988, "step": 50700 }, { "epoch": 0.7702678142948108, "eval_loss": 4.633908271789551, "eval_runtime": 193.9714, "eval_samples_per_second": 51.554, "eval_steps_per_second": 6.444, "step": 50700 }, { "epoch": 0.7717870802007178, "grad_norm": 1.5292387008666992, "learning_rate": 9.230769230769231e-07, "loss": 4.7942, "step": 50800 }, { "epoch": 0.7717870802007178, "eval_loss": 4.634795188903809, "eval_runtime": 193.9264, "eval_samples_per_second": 51.566, "eval_steps_per_second": 6.446, "step": 50800 }, { "epoch": 0.7733063461066247, "grad_norm": 1.313671350479126, "learning_rate": 9.130434782608697e-07, "loss": 4.7971, "step": 50900 }, { "epoch": 0.7733063461066247, "eval_loss": 4.632637977600098, "eval_runtime": 193.9004, "eval_samples_per_second": 51.573, "eval_steps_per_second": 6.447, "step": 50900 }, { "epoch": 0.7748256120125316, "grad_norm": 1.3143532276153564, "learning_rate": 9.030100334448161e-07, "loss": 4.7945, "step": 51000 }, { "epoch": 0.7748256120125316, "eval_loss": 4.6306681632995605, "eval_runtime": 194.3643, "eval_samples_per_second": 51.45, "eval_steps_per_second": 6.431, "step": 51000 }, { "epoch": 0.7763448779184385, "grad_norm": 1.3034121990203857, "learning_rate": 8.929765886287627e-07, "loss": 4.7888, "step": 51100 }, { "epoch": 0.7763448779184385, "eval_loss": 4.629621982574463, "eval_runtime": 194.0292, "eval_samples_per_second": 51.539, "eval_steps_per_second": 6.442, "step": 51100 }, { "epoch": 0.7778641438243455, "grad_norm": 1.739376425743103, "learning_rate": 8.829431438127091e-07, "loss": 4.7934, "step": 51200 }, { "epoch": 0.7778641438243455, "eval_loss": 4.62890625, "eval_runtime": 194.358, "eval_samples_per_second": 51.451, "eval_steps_per_second": 6.431, "step": 51200 }, { "epoch": 0.7793834097302524, "grad_norm": 1.3741992712020874, "learning_rate": 8.729096989966555e-07, "loss": 4.7887, "step": 51300 }, { "epoch": 0.7793834097302524, "eval_loss": 4.625428199768066, "eval_runtime": 194.4886, "eval_samples_per_second": 51.417, "eval_steps_per_second": 6.427, "step": 51300 }, { "epoch": 0.7809026756361593, "grad_norm": 1.423168420791626, "learning_rate": 8.628762541806019e-07, "loss": 4.7888, "step": 51400 }, { "epoch": 0.7809026756361593, "eval_loss": 4.626926422119141, "eval_runtime": 194.5901, "eval_samples_per_second": 51.39, "eval_steps_per_second": 6.424, "step": 51400 }, { "epoch": 0.7824219415420662, "grad_norm": 1.5038503408432007, "learning_rate": 8.528428093645485e-07, "loss": 4.791, "step": 51500 }, { "epoch": 0.7824219415420662, "eval_loss": 4.630486488342285, "eval_runtime": 194.4102, "eval_samples_per_second": 51.438, "eval_steps_per_second": 6.43, "step": 51500 }, { "epoch": 0.7839412074479732, "grad_norm": 1.6092890501022339, "learning_rate": 8.428093645484949e-07, "loss": 4.7863, "step": 51600 }, { "epoch": 0.7839412074479732, "eval_loss": 4.626857280731201, "eval_runtime": 194.4616, "eval_samples_per_second": 51.424, "eval_steps_per_second": 6.428, "step": 51600 }, { "epoch": 0.7854604733538801, "grad_norm": 1.6199829578399658, "learning_rate": 8.327759197324414e-07, "loss": 4.7875, "step": 51700 }, { "epoch": 0.7854604733538801, "eval_loss": 4.623871326446533, "eval_runtime": 194.5129, "eval_samples_per_second": 51.41, "eval_steps_per_second": 6.426, "step": 51700 }, { "epoch": 0.786979739259787, "grad_norm": 1.33729088306427, "learning_rate": 8.227424749163879e-07, "loss": 4.7836, "step": 51800 }, { "epoch": 0.786979739259787, "eval_loss": 4.625426769256592, "eval_runtime": 194.4935, "eval_samples_per_second": 51.416, "eval_steps_per_second": 6.427, "step": 51800 }, { "epoch": 0.7884990051656939, "grad_norm": 1.6848562955856323, "learning_rate": 8.127090301003344e-07, "loss": 4.7874, "step": 51900 }, { "epoch": 0.7884990051656939, "eval_loss": 4.626620292663574, "eval_runtime": 194.6564, "eval_samples_per_second": 51.373, "eval_steps_per_second": 6.422, "step": 51900 }, { "epoch": 0.7900182710716009, "grad_norm": 1.2945283651351929, "learning_rate": 8.026755852842809e-07, "loss": 4.7892, "step": 52000 }, { "epoch": 0.7900182710716009, "eval_loss": 4.624682903289795, "eval_runtime": 194.5982, "eval_samples_per_second": 51.388, "eval_steps_per_second": 6.423, "step": 52000 }, { "epoch": 0.7915375369775078, "grad_norm": 1.5469530820846558, "learning_rate": 7.926421404682274e-07, "loss": 4.7828, "step": 52100 }, { "epoch": 0.7915375369775078, "eval_loss": 4.622786521911621, "eval_runtime": 194.4896, "eval_samples_per_second": 51.417, "eval_steps_per_second": 6.427, "step": 52100 }, { "epoch": 0.7930568028834147, "grad_norm": 1.4468382596969604, "learning_rate": 7.826086956521739e-07, "loss": 4.7772, "step": 52200 }, { "epoch": 0.7930568028834147, "eval_loss": 4.625532150268555, "eval_runtime": 194.4728, "eval_samples_per_second": 51.421, "eval_steps_per_second": 6.428, "step": 52200 }, { "epoch": 0.7945760687893216, "grad_norm": 1.244032382965088, "learning_rate": 7.725752508361204e-07, "loss": 4.7794, "step": 52300 }, { "epoch": 0.7945760687893216, "eval_loss": 4.621998310089111, "eval_runtime": 194.4943, "eval_samples_per_second": 51.415, "eval_steps_per_second": 6.427, "step": 52300 }, { "epoch": 0.7960953346952286, "grad_norm": 1.416409969329834, "learning_rate": 7.625418060200669e-07, "loss": 4.784, "step": 52400 }, { "epoch": 0.7960953346952286, "eval_loss": 4.620311260223389, "eval_runtime": 194.4398, "eval_samples_per_second": 51.43, "eval_steps_per_second": 6.429, "step": 52400 }, { "epoch": 0.7976146006011355, "grad_norm": 1.3747918605804443, "learning_rate": 7.525083612040134e-07, "loss": 4.7776, "step": 52500 }, { "epoch": 0.7976146006011355, "eval_loss": 4.619593143463135, "eval_runtime": 194.5835, "eval_samples_per_second": 51.392, "eval_steps_per_second": 6.424, "step": 52500 }, { "epoch": 0.7991338665070424, "grad_norm": 1.4532439708709717, "learning_rate": 7.424749163879599e-07, "loss": 4.7805, "step": 52600 }, { "epoch": 0.7991338665070424, "eval_loss": 4.619747161865234, "eval_runtime": 194.2642, "eval_samples_per_second": 51.476, "eval_steps_per_second": 6.435, "step": 52600 }, { "epoch": 0.8006531324129494, "grad_norm": 1.34298574924469, "learning_rate": 7.324414715719064e-07, "loss": 4.7778, "step": 52700 }, { "epoch": 0.8006531324129494, "eval_loss": 4.61711311340332, "eval_runtime": 194.691, "eval_samples_per_second": 51.363, "eval_steps_per_second": 6.42, "step": 52700 }, { "epoch": 0.8021723983188563, "grad_norm": 1.4666342735290527, "learning_rate": 7.224080267558529e-07, "loss": 4.7792, "step": 52800 }, { "epoch": 0.8021723983188563, "eval_loss": 4.615002155303955, "eval_runtime": 194.4007, "eval_samples_per_second": 51.44, "eval_steps_per_second": 6.43, "step": 52800 }, { "epoch": 0.8036916642247632, "grad_norm": 1.1881191730499268, "learning_rate": 7.123745819397994e-07, "loss": 4.7789, "step": 52900 }, { "epoch": 0.8036916642247632, "eval_loss": 4.613386154174805, "eval_runtime": 194.3959, "eval_samples_per_second": 51.441, "eval_steps_per_second": 6.43, "step": 52900 }, { "epoch": 0.8052109301306701, "grad_norm": 1.1752644777297974, "learning_rate": 7.023411371237459e-07, "loss": 4.7766, "step": 53000 }, { "epoch": 0.8052109301306701, "eval_loss": 4.616655349731445, "eval_runtime": 194.0766, "eval_samples_per_second": 51.526, "eval_steps_per_second": 6.441, "step": 53000 }, { "epoch": 0.8067301960365771, "grad_norm": 1.3520350456237793, "learning_rate": 6.923076923076923e-07, "loss": 4.7748, "step": 53100 }, { "epoch": 0.8067301960365771, "eval_loss": 4.616769313812256, "eval_runtime": 194.2084, "eval_samples_per_second": 51.491, "eval_steps_per_second": 6.436, "step": 53100 }, { "epoch": 0.808249461942484, "grad_norm": 1.5536683797836304, "learning_rate": 6.822742474916388e-07, "loss": 4.7798, "step": 53200 }, { "epoch": 0.808249461942484, "eval_loss": 4.615866661071777, "eval_runtime": 193.8562, "eval_samples_per_second": 51.585, "eval_steps_per_second": 6.448, "step": 53200 }, { "epoch": 0.8097687278483909, "grad_norm": 1.2618976831436157, "learning_rate": 6.722408026755853e-07, "loss": 4.7762, "step": 53300 }, { "epoch": 0.8097687278483909, "eval_loss": 4.616024017333984, "eval_runtime": 193.9219, "eval_samples_per_second": 51.567, "eval_steps_per_second": 6.446, "step": 53300 }, { "epoch": 0.8112879937542978, "grad_norm": 1.8162367343902588, "learning_rate": 6.622073578595318e-07, "loss": 4.7761, "step": 53400 }, { "epoch": 0.8112879937542978, "eval_loss": 4.613333702087402, "eval_runtime": 194.0415, "eval_samples_per_second": 51.535, "eval_steps_per_second": 6.442, "step": 53400 }, { "epoch": 0.8128072596602048, "grad_norm": 1.1924686431884766, "learning_rate": 6.521739130434783e-07, "loss": 4.7721, "step": 53500 }, { "epoch": 0.8128072596602048, "eval_loss": 4.615184307098389, "eval_runtime": 193.9446, "eval_samples_per_second": 51.561, "eval_steps_per_second": 6.445, "step": 53500 }, { "epoch": 0.8143265255661117, "grad_norm": 1.1603306531906128, "learning_rate": 6.421404682274248e-07, "loss": 4.7746, "step": 53600 }, { "epoch": 0.8143265255661117, "eval_loss": 4.611873626708984, "eval_runtime": 194.2028, "eval_samples_per_second": 51.493, "eval_steps_per_second": 6.437, "step": 53600 }, { "epoch": 0.8158457914720186, "grad_norm": 1.202577829360962, "learning_rate": 6.321070234113712e-07, "loss": 4.7745, "step": 53700 }, { "epoch": 0.8158457914720186, "eval_loss": 4.610635757446289, "eval_runtime": 194.3713, "eval_samples_per_second": 51.448, "eval_steps_per_second": 6.431, "step": 53700 }, { "epoch": 0.8173650573779255, "grad_norm": 1.3371776342391968, "learning_rate": 6.220735785953178e-07, "loss": 4.7755, "step": 53800 }, { "epoch": 0.8173650573779255, "eval_loss": 4.611499786376953, "eval_runtime": 194.3936, "eval_samples_per_second": 51.442, "eval_steps_per_second": 6.43, "step": 53800 }, { "epoch": 0.8188843232838325, "grad_norm": 1.3666436672210693, "learning_rate": 6.120401337792642e-07, "loss": 4.7701, "step": 53900 }, { "epoch": 0.8188843232838325, "eval_loss": 4.610349655151367, "eval_runtime": 194.5735, "eval_samples_per_second": 51.394, "eval_steps_per_second": 6.424, "step": 53900 }, { "epoch": 0.8204035891897394, "grad_norm": 1.4433395862579346, "learning_rate": 6.020066889632107e-07, "loss": 4.7743, "step": 54000 }, { "epoch": 0.8204035891897394, "eval_loss": 4.610903263092041, "eval_runtime": 194.367, "eval_samples_per_second": 51.449, "eval_steps_per_second": 6.431, "step": 54000 }, { "epoch": 0.8219228550956463, "grad_norm": 1.2440968751907349, "learning_rate": 5.919732441471572e-07, "loss": 4.7701, "step": 54100 }, { "epoch": 0.8219228550956463, "eval_loss": 4.611226558685303, "eval_runtime": 194.4358, "eval_samples_per_second": 51.431, "eval_steps_per_second": 6.429, "step": 54100 }, { "epoch": 0.8234421210015532, "grad_norm": 1.311020016670227, "learning_rate": 5.819397993311037e-07, "loss": 4.767, "step": 54200 }, { "epoch": 0.8234421210015532, "eval_loss": 4.608744144439697, "eval_runtime": 194.5925, "eval_samples_per_second": 51.389, "eval_steps_per_second": 6.424, "step": 54200 }, { "epoch": 0.8249613869074602, "grad_norm": 1.2300583124160767, "learning_rate": 5.719063545150502e-07, "loss": 4.7713, "step": 54300 }, { "epoch": 0.8249613869074602, "eval_loss": 4.607234477996826, "eval_runtime": 194.4772, "eval_samples_per_second": 51.42, "eval_steps_per_second": 6.427, "step": 54300 }, { "epoch": 0.8264806528133671, "grad_norm": 1.3106154203414917, "learning_rate": 5.618729096989966e-07, "loss": 4.7698, "step": 54400 }, { "epoch": 0.8264806528133671, "eval_loss": 4.604393005371094, "eval_runtime": 194.687, "eval_samples_per_second": 51.364, "eval_steps_per_second": 6.421, "step": 54400 }, { "epoch": 0.827999918719274, "grad_norm": 1.2660140991210938, "learning_rate": 5.518394648829431e-07, "loss": 4.7655, "step": 54500 }, { "epoch": 0.827999918719274, "eval_loss": 4.602825164794922, "eval_runtime": 194.597, "eval_samples_per_second": 51.388, "eval_steps_per_second": 6.424, "step": 54500 }, { "epoch": 0.8295191846251809, "grad_norm": 1.4443926811218262, "learning_rate": 5.418060200668896e-07, "loss": 4.7727, "step": 54600 }, { "epoch": 0.8295191846251809, "eval_loss": 4.606249809265137, "eval_runtime": 194.4722, "eval_samples_per_second": 51.421, "eval_steps_per_second": 6.428, "step": 54600 }, { "epoch": 0.8310384505310879, "grad_norm": 1.339629888534546, "learning_rate": 5.317725752508361e-07, "loss": 4.7639, "step": 54700 }, { "epoch": 0.8310384505310879, "eval_loss": 4.604907512664795, "eval_runtime": 194.6528, "eval_samples_per_second": 51.374, "eval_steps_per_second": 6.422, "step": 54700 }, { "epoch": 0.8325577164369948, "grad_norm": 1.2703863382339478, "learning_rate": 5.217391304347826e-07, "loss": 4.762, "step": 54800 }, { "epoch": 0.8325577164369948, "eval_loss": 4.605154037475586, "eval_runtime": 194.4518, "eval_samples_per_second": 51.427, "eval_steps_per_second": 6.428, "step": 54800 }, { "epoch": 0.8340769823429017, "grad_norm": 1.1100186109542847, "learning_rate": 5.117056856187291e-07, "loss": 4.7635, "step": 54900 }, { "epoch": 0.8340769823429017, "eval_loss": 4.603663444519043, "eval_runtime": 194.4154, "eval_samples_per_second": 51.436, "eval_steps_per_second": 6.43, "step": 54900 }, { "epoch": 0.8355962482488086, "grad_norm": 1.6119050979614258, "learning_rate": 5.016722408026756e-07, "loss": 4.7627, "step": 55000 }, { "epoch": 0.8355962482488086, "eval_loss": 4.603806495666504, "eval_runtime": 194.3808, "eval_samples_per_second": 51.445, "eval_steps_per_second": 6.431, "step": 55000 }, { "epoch": 0.8371155141547156, "grad_norm": 1.22734534740448, "learning_rate": 4.916387959866221e-07, "loss": 4.764, "step": 55100 }, { "epoch": 0.8371155141547156, "eval_loss": 4.604480266571045, "eval_runtime": 194.4351, "eval_samples_per_second": 51.431, "eval_steps_per_second": 6.429, "step": 55100 }, { "epoch": 0.8386347800606225, "grad_norm": 1.1762231588363647, "learning_rate": 4.816053511705686e-07, "loss": 4.7674, "step": 55200 }, { "epoch": 0.8386347800606225, "eval_loss": 4.60023307800293, "eval_runtime": 194.3592, "eval_samples_per_second": 51.451, "eval_steps_per_second": 6.431, "step": 55200 }, { "epoch": 0.8401540459665294, "grad_norm": 1.0889923572540283, "learning_rate": 4.7157190635451506e-07, "loss": 4.762, "step": 55300 }, { "epoch": 0.8401540459665294, "eval_loss": 4.5993733406066895, "eval_runtime": 194.2327, "eval_samples_per_second": 51.485, "eval_steps_per_second": 6.436, "step": 55300 }, { "epoch": 0.8416733118724363, "grad_norm": 1.2975116968154907, "learning_rate": 4.6153846153846156e-07, "loss": 4.7635, "step": 55400 }, { "epoch": 0.8416733118724363, "eval_loss": 4.599579811096191, "eval_runtime": 194.1059, "eval_samples_per_second": 51.518, "eval_steps_per_second": 6.44, "step": 55400 }, { "epoch": 0.8431925777783433, "grad_norm": 1.257307767868042, "learning_rate": 4.5150501672240806e-07, "loss": 4.7607, "step": 55500 }, { "epoch": 0.8431925777783433, "eval_loss": 4.602155685424805, "eval_runtime": 193.9982, "eval_samples_per_second": 51.547, "eval_steps_per_second": 6.443, "step": 55500 }, { "epoch": 0.8447118436842502, "grad_norm": 1.2345635890960693, "learning_rate": 4.4147157190635456e-07, "loss": 4.7602, "step": 55600 }, { "epoch": 0.8447118436842502, "eval_loss": 4.60153341293335, "eval_runtime": 193.9915, "eval_samples_per_second": 51.549, "eval_steps_per_second": 6.444, "step": 55600 }, { "epoch": 0.8462311095901571, "grad_norm": 1.2262383699417114, "learning_rate": 4.3143812709030095e-07, "loss": 4.7619, "step": 55700 }, { "epoch": 0.8462311095901571, "eval_loss": 4.600053310394287, "eval_runtime": 194.0432, "eval_samples_per_second": 51.535, "eval_steps_per_second": 6.442, "step": 55700 }, { "epoch": 0.847750375496064, "grad_norm": 1.3070259094238281, "learning_rate": 4.2140468227424745e-07, "loss": 4.7564, "step": 55800 }, { "epoch": 0.847750375496064, "eval_loss": 4.597591876983643, "eval_runtime": 193.9858, "eval_samples_per_second": 51.55, "eval_steps_per_second": 6.444, "step": 55800 }, { "epoch": 0.849269641401971, "grad_norm": 1.2372263669967651, "learning_rate": 4.1137123745819395e-07, "loss": 4.7601, "step": 55900 }, { "epoch": 0.849269641401971, "eval_loss": 4.602851867675781, "eval_runtime": 194.1982, "eval_samples_per_second": 51.494, "eval_steps_per_second": 6.437, "step": 55900 }, { "epoch": 0.8507889073078779, "grad_norm": 1.1839525699615479, "learning_rate": 4.0133779264214045e-07, "loss": 4.7609, "step": 56000 }, { "epoch": 0.8507889073078779, "eval_loss": 4.595503330230713, "eval_runtime": 194.4031, "eval_samples_per_second": 51.44, "eval_steps_per_second": 6.43, "step": 56000 }, { "epoch": 0.8523081732137848, "grad_norm": 1.4197345972061157, "learning_rate": 3.9130434782608694e-07, "loss": 4.7594, "step": 56100 }, { "epoch": 0.8523081732137848, "eval_loss": 4.59796142578125, "eval_runtime": 194.4678, "eval_samples_per_second": 51.422, "eval_steps_per_second": 6.428, "step": 56100 }, { "epoch": 0.8538274391196917, "grad_norm": 1.1221038103103638, "learning_rate": 3.8127090301003344e-07, "loss": 4.7568, "step": 56200 }, { "epoch": 0.8538274391196917, "eval_loss": 4.596600532531738, "eval_runtime": 194.6362, "eval_samples_per_second": 51.378, "eval_steps_per_second": 6.422, "step": 56200 }, { "epoch": 0.8553467050255987, "grad_norm": 1.2606701850891113, "learning_rate": 3.7123745819397994e-07, "loss": 4.7543, "step": 56300 }, { "epoch": 0.8553467050255987, "eval_loss": 4.598119258880615, "eval_runtime": 194.3842, "eval_samples_per_second": 51.445, "eval_steps_per_second": 6.431, "step": 56300 }, { "epoch": 0.8568659709315056, "grad_norm": 1.3233997821807861, "learning_rate": 3.6120401337792644e-07, "loss": 4.7576, "step": 56400 }, { "epoch": 0.8568659709315056, "eval_loss": 4.596184730529785, "eval_runtime": 194.3736, "eval_samples_per_second": 51.447, "eval_steps_per_second": 6.431, "step": 56400 }, { "epoch": 0.8583852368374125, "grad_norm": 1.2004015445709229, "learning_rate": 3.5117056856187294e-07, "loss": 4.7616, "step": 56500 }, { "epoch": 0.8583852368374125, "eval_loss": 4.594801425933838, "eval_runtime": 194.5029, "eval_samples_per_second": 51.413, "eval_steps_per_second": 6.427, "step": 56500 }, { "epoch": 0.8599045027433194, "grad_norm": 1.2479798793792725, "learning_rate": 3.411371237458194e-07, "loss": 4.7628, "step": 56600 }, { "epoch": 0.8599045027433194, "eval_loss": 4.599001407623291, "eval_runtime": 194.5013, "eval_samples_per_second": 51.414, "eval_steps_per_second": 6.427, "step": 56600 }, { "epoch": 0.8614237686492264, "grad_norm": 1.2455825805664062, "learning_rate": 3.311036789297659e-07, "loss": 4.756, "step": 56700 }, { "epoch": 0.8614237686492264, "eval_loss": 4.596933364868164, "eval_runtime": 194.5096, "eval_samples_per_second": 51.411, "eval_steps_per_second": 6.426, "step": 56700 }, { "epoch": 0.8629430345551333, "grad_norm": 1.2096078395843506, "learning_rate": 3.210702341137124e-07, "loss": 4.7603, "step": 56800 }, { "epoch": 0.8629430345551333, "eval_loss": 4.5964884757995605, "eval_runtime": 194.6292, "eval_samples_per_second": 51.38, "eval_steps_per_second": 6.422, "step": 56800 }, { "epoch": 0.8644623004610402, "grad_norm": 0.9795971512794495, "learning_rate": 3.110367892976589e-07, "loss": 4.7533, "step": 56900 }, { "epoch": 0.8644623004610402, "eval_loss": 4.594615459442139, "eval_runtime": 194.678, "eval_samples_per_second": 51.367, "eval_steps_per_second": 6.421, "step": 56900 }, { "epoch": 0.8659815663669471, "grad_norm": 1.3727303743362427, "learning_rate": 3.010033444816054e-07, "loss": 4.7547, "step": 57000 }, { "epoch": 0.8659815663669471, "eval_loss": 4.596096515655518, "eval_runtime": 194.5316, "eval_samples_per_second": 51.406, "eval_steps_per_second": 6.426, "step": 57000 }, { "epoch": 0.8675008322728541, "grad_norm": 1.1338236331939697, "learning_rate": 2.9096989966555187e-07, "loss": 4.7542, "step": 57100 }, { "epoch": 0.8675008322728541, "eval_loss": 4.5944132804870605, "eval_runtime": 194.4861, "eval_samples_per_second": 51.418, "eval_steps_per_second": 6.427, "step": 57100 }, { "epoch": 0.869020098178761, "grad_norm": 1.1638000011444092, "learning_rate": 2.809364548494983e-07, "loss": 4.7509, "step": 57200 }, { "epoch": 0.869020098178761, "eval_loss": 4.593369483947754, "eval_runtime": 194.5214, "eval_samples_per_second": 51.408, "eval_steps_per_second": 6.426, "step": 57200 }, { "epoch": 0.8705393640846679, "grad_norm": 0.9814125299453735, "learning_rate": 2.709030100334448e-07, "loss": 4.7565, "step": 57300 }, { "epoch": 0.8705393640846679, "eval_loss": 4.595996856689453, "eval_runtime": 194.4485, "eval_samples_per_second": 51.427, "eval_steps_per_second": 6.428, "step": 57300 }, { "epoch": 0.8720586299905748, "grad_norm": 1.0250178575515747, "learning_rate": 2.608695652173913e-07, "loss": 4.7568, "step": 57400 }, { "epoch": 0.8720586299905748, "eval_loss": 4.59307336807251, "eval_runtime": 194.4712, "eval_samples_per_second": 51.421, "eval_steps_per_second": 6.428, "step": 57400 }, { "epoch": 0.8735778958964818, "grad_norm": 0.9920938014984131, "learning_rate": 2.508361204013378e-07, "loss": 4.7567, "step": 57500 }, { "epoch": 0.8735778958964818, "eval_loss": 4.5949625968933105, "eval_runtime": 194.5139, "eval_samples_per_second": 51.41, "eval_steps_per_second": 6.426, "step": 57500 }, { "epoch": 0.8750971618023887, "grad_norm": 1.0698268413543701, "learning_rate": 2.408026755852843e-07, "loss": 4.749, "step": 57600 }, { "epoch": 0.8750971618023887, "eval_loss": 4.590822696685791, "eval_runtime": 194.2365, "eval_samples_per_second": 51.484, "eval_steps_per_second": 6.435, "step": 57600 }, { "epoch": 0.8766164277082956, "grad_norm": 1.0088557004928589, "learning_rate": 2.3076923076923078e-07, "loss": 4.7556, "step": 57700 }, { "epoch": 0.8766164277082956, "eval_loss": 4.592673301696777, "eval_runtime": 194.2155, "eval_samples_per_second": 51.489, "eval_steps_per_second": 6.436, "step": 57700 }, { "epoch": 0.8781356936142025, "grad_norm": 0.989743173122406, "learning_rate": 2.2073578595317728e-07, "loss": 4.755, "step": 57800 }, { "epoch": 0.8781356936142025, "eval_loss": 4.594258785247803, "eval_runtime": 194.0368, "eval_samples_per_second": 51.537, "eval_steps_per_second": 6.442, "step": 57800 }, { "epoch": 0.8796549595201095, "grad_norm": 1.0593464374542236, "learning_rate": 2.1070234113712372e-07, "loss": 4.7553, "step": 57900 }, { "epoch": 0.8796549595201095, "eval_loss": 4.591804504394531, "eval_runtime": 193.9201, "eval_samples_per_second": 51.568, "eval_steps_per_second": 6.446, "step": 57900 }, { "epoch": 0.8811742254260164, "grad_norm": 1.0415208339691162, "learning_rate": 2.0066889632107022e-07, "loss": 4.7526, "step": 58000 }, { "epoch": 0.8811742254260164, "eval_loss": 4.591397285461426, "eval_runtime": 194.0639, "eval_samples_per_second": 51.529, "eval_steps_per_second": 6.441, "step": 58000 }, { "epoch": 0.8826934913319233, "grad_norm": 1.08748197555542, "learning_rate": 1.9063545150501672e-07, "loss": 4.748, "step": 58100 }, { "epoch": 0.8826934913319233, "eval_loss": 4.592258930206299, "eval_runtime": 193.9701, "eval_samples_per_second": 51.554, "eval_steps_per_second": 6.444, "step": 58100 }, { "epoch": 0.8842127572378302, "grad_norm": 0.875297486782074, "learning_rate": 1.8060200668896322e-07, "loss": 4.754, "step": 58200 }, { "epoch": 0.8842127572378302, "eval_loss": 4.590017318725586, "eval_runtime": 193.921, "eval_samples_per_second": 51.567, "eval_steps_per_second": 6.446, "step": 58200 }, { "epoch": 0.8857320231437372, "grad_norm": 0.9465267062187195, "learning_rate": 1.705685618729097e-07, "loss": 4.7541, "step": 58300 }, { "epoch": 0.8857320231437372, "eval_loss": 4.590794563293457, "eval_runtime": 193.8053, "eval_samples_per_second": 51.598, "eval_steps_per_second": 6.45, "step": 58300 }, { "epoch": 0.8872512890496441, "grad_norm": 1.108864426612854, "learning_rate": 1.605351170568562e-07, "loss": 4.7545, "step": 58400 }, { "epoch": 0.8872512890496441, "eval_loss": 4.590878963470459, "eval_runtime": 194.0226, "eval_samples_per_second": 51.54, "eval_steps_per_second": 6.443, "step": 58400 }, { "epoch": 0.888770554955551, "grad_norm": 0.9311940670013428, "learning_rate": 1.505016722408027e-07, "loss": 4.7537, "step": 58500 }, { "epoch": 0.888770554955551, "eval_loss": 4.589045524597168, "eval_runtime": 194.3809, "eval_samples_per_second": 51.445, "eval_steps_per_second": 6.431, "step": 58500 }, { "epoch": 0.8902898208614579, "grad_norm": 1.122527003288269, "learning_rate": 1.4046822742474916e-07, "loss": 4.7517, "step": 58600 }, { "epoch": 0.8902898208614579, "eval_loss": 4.590823650360107, "eval_runtime": 194.4167, "eval_samples_per_second": 51.436, "eval_steps_per_second": 6.429, "step": 58600 }, { "epoch": 0.8918090867673649, "grad_norm": 1.0384498834609985, "learning_rate": 1.3043478260869566e-07, "loss": 4.7491, "step": 58700 }, { "epoch": 0.8918090867673649, "eval_loss": 4.5913920402526855, "eval_runtime": 194.5657, "eval_samples_per_second": 51.397, "eval_steps_per_second": 6.425, "step": 58700 }, { "epoch": 0.8933283526732718, "grad_norm": 0.9574987292289734, "learning_rate": 1.2040133779264215e-07, "loss": 4.7512, "step": 58800 }, { "epoch": 0.8933283526732718, "eval_loss": 4.5905609130859375, "eval_runtime": 194.7064, "eval_samples_per_second": 51.359, "eval_steps_per_second": 6.42, "step": 58800 }, { "epoch": 0.8948476185791787, "grad_norm": 0.8835811614990234, "learning_rate": 1.1036789297658864e-07, "loss": 4.7493, "step": 58900 }, { "epoch": 0.8948476185791787, "eval_loss": 4.59054708480835, "eval_runtime": 194.6192, "eval_samples_per_second": 51.382, "eval_steps_per_second": 6.423, "step": 58900 }, { "epoch": 0.8963668844850856, "grad_norm": 0.8485853672027588, "learning_rate": 1.0033444816053511e-07, "loss": 4.7494, "step": 59000 }, { "epoch": 0.8963668844850856, "eval_loss": 4.590051651000977, "eval_runtime": 194.7537, "eval_samples_per_second": 51.347, "eval_steps_per_second": 6.418, "step": 59000 }, { "epoch": 0.8978861503909926, "grad_norm": 0.9415624737739563, "learning_rate": 9.030100334448161e-08, "loss": 4.7461, "step": 59100 }, { "epoch": 0.8978861503909926, "eval_loss": 4.593616962432861, "eval_runtime": 194.5307, "eval_samples_per_second": 51.406, "eval_steps_per_second": 6.426, "step": 59100 }, { "epoch": 0.8994054162968995, "grad_norm": 1.0554380416870117, "learning_rate": 8.02675585284281e-08, "loss": 4.7523, "step": 59200 }, { "epoch": 0.8994054162968995, "eval_loss": 4.5899271965026855, "eval_runtime": 194.6175, "eval_samples_per_second": 51.383, "eval_steps_per_second": 6.423, "step": 59200 }, { "epoch": 0.9009246822028064, "grad_norm": 0.8636355400085449, "learning_rate": 7.023411371237458e-08, "loss": 4.7511, "step": 59300 }, { "epoch": 0.9009246822028064, "eval_loss": 4.590203285217285, "eval_runtime": 194.5121, "eval_samples_per_second": 51.411, "eval_steps_per_second": 6.426, "step": 59300 }, { "epoch": 0.9024439481087133, "grad_norm": 0.7755019068717957, "learning_rate": 6.020066889632108e-08, "loss": 4.7495, "step": 59400 }, { "epoch": 0.9024439481087133, "eval_loss": 4.591348648071289, "eval_runtime": 194.53, "eval_samples_per_second": 51.406, "eval_steps_per_second": 6.426, "step": 59400 }, { "epoch": 0.9039632140146203, "grad_norm": 0.9905518293380737, "learning_rate": 5.0167224080267556e-08, "loss": 4.7507, "step": 59500 }, { "epoch": 0.9039632140146203, "eval_loss": 4.590855121612549, "eval_runtime": 194.486, "eval_samples_per_second": 51.418, "eval_steps_per_second": 6.427, "step": 59500 }, { "epoch": 0.9054824799205272, "grad_norm": 0.8361491560935974, "learning_rate": 4.013377926421405e-08, "loss": 4.7508, "step": 59600 }, { "epoch": 0.9054824799205272, "eval_loss": 4.588395118713379, "eval_runtime": 194.4894, "eval_samples_per_second": 51.417, "eval_steps_per_second": 6.427, "step": 59600 }, { "epoch": 0.9070017458264341, "grad_norm": 0.8528068661689758, "learning_rate": 3.010033444816054e-08, "loss": 4.7485, "step": 59700 }, { "epoch": 0.9070017458264341, "eval_loss": 4.589570999145508, "eval_runtime": 194.4665, "eval_samples_per_second": 51.423, "eval_steps_per_second": 6.428, "step": 59700 }, { "epoch": 0.908521011732341, "grad_norm": 0.9023746252059937, "learning_rate": 2.0066889632107024e-08, "loss": 4.7502, "step": 59800 }, { "epoch": 0.908521011732341, "eval_loss": 4.588865280151367, "eval_runtime": 194.2101, "eval_samples_per_second": 51.491, "eval_steps_per_second": 6.436, "step": 59800 }, { "epoch": 0.910040277638248, "grad_norm": 0.866371750831604, "learning_rate": 1.0033444816053512e-08, "loss": 4.7503, "step": 59900 }, { "epoch": 0.910040277638248, "eval_loss": 4.589888095855713, "eval_runtime": 194.6389, "eval_samples_per_second": 51.377, "eval_steps_per_second": 6.422, "step": 59900 }, { "epoch": 0.9115595435441549, "grad_norm": 0.7748922109603882, "learning_rate": 0.0, "loss": 4.7524, "step": 60000 }, { "epoch": 0.9115595435441549, "eval_loss": 4.588863372802734, "eval_runtime": 194.1121, "eval_samples_per_second": 51.517, "eval_steps_per_second": 6.44, "step": 60000 } ], "logging_steps": 100, "max_steps": 60000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.15783283933184e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }