| { |
| "best_metric": 4.588395118713379, |
| "best_model_checkpoint": "learning_source_20260316/rna/bert-output/rna-medium/checkpoint-59600", |
| "epoch": 0.9115595435441549, |
| "eval_steps": 100, |
| "global_step": 60000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0015192659059069249, |
| "grad_norm": 0.36328116059303284, |
| "learning_rate": 3e-06, |
| "loss": 10.2311, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.0015192659059069249, |
| "eval_loss": 10.090826988220215, |
| "eval_runtime": 193.145, |
| "eval_samples_per_second": 51.775, |
| "eval_steps_per_second": 6.472, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.0030385318118138498, |
| "grad_norm": 0.2686520218849182, |
| "learning_rate": 6e-06, |
| "loss": 9.9461, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.0030385318118138498, |
| "eval_loss": 9.758237838745117, |
| "eval_runtime": 193.1826, |
| "eval_samples_per_second": 51.764, |
| "eval_steps_per_second": 6.471, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.004557797717720775, |
| "grad_norm": 0.9466120004653931, |
| "learning_rate": 5.989966555183947e-06, |
| "loss": 9.6291, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.004557797717720775, |
| "eval_loss": 9.439615249633789, |
| "eval_runtime": 193.0329, |
| "eval_samples_per_second": 51.805, |
| "eval_steps_per_second": 6.476, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.0060770636236276996, |
| "grad_norm": 0.4738225042819977, |
| "learning_rate": 5.979933110367893e-06, |
| "loss": 9.3896, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.0060770636236276996, |
| "eval_loss": 9.214252471923828, |
| "eval_runtime": 193.5666, |
| "eval_samples_per_second": 51.662, |
| "eval_steps_per_second": 6.458, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.007596329529534624, |
| "grad_norm": 0.7529183626174927, |
| "learning_rate": 5.96989966555184e-06, |
| "loss": 9.2425, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.007596329529534624, |
| "eval_loss": 9.07589340209961, |
| "eval_runtime": 193.2214, |
| "eval_samples_per_second": 51.754, |
| "eval_steps_per_second": 6.469, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.00911559543544155, |
| "grad_norm": 0.48392170667648315, |
| "learning_rate": 5.959866220735786e-06, |
| "loss": 9.1413, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.00911559543544155, |
| "eval_loss": 8.97410774230957, |
| "eval_runtime": 192.9985, |
| "eval_samples_per_second": 51.814, |
| "eval_steps_per_second": 6.477, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.010634861341348474, |
| "grad_norm": 0.6194415092468262, |
| "learning_rate": 5.949832775919732e-06, |
| "loss": 9.0633, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.010634861341348474, |
| "eval_loss": 8.891473770141602, |
| "eval_runtime": 193.0661, |
| "eval_samples_per_second": 51.796, |
| "eval_steps_per_second": 6.474, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.012154127247255399, |
| "grad_norm": 2.9033119678497314, |
| "learning_rate": 5.939799331103679e-06, |
| "loss": 9.0036, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.012154127247255399, |
| "eval_loss": 8.827642440795898, |
| "eval_runtime": 192.8551, |
| "eval_samples_per_second": 51.852, |
| "eval_steps_per_second": 6.482, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.013673393153162324, |
| "grad_norm": 2.6778995990753174, |
| "learning_rate": 5.929765886287626e-06, |
| "loss": 8.9573, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.013673393153162324, |
| "eval_loss": 8.794682502746582, |
| "eval_runtime": 193.0047, |
| "eval_samples_per_second": 51.812, |
| "eval_steps_per_second": 6.477, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.015192659059069248, |
| "grad_norm": 2.586425304412842, |
| "learning_rate": 5.919732441471572e-06, |
| "loss": 8.927, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.015192659059069248, |
| "eval_loss": 8.765641212463379, |
| "eval_runtime": 193.0239, |
| "eval_samples_per_second": 51.807, |
| "eval_steps_per_second": 6.476, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.016711924964976175, |
| "grad_norm": 3.287247657775879, |
| "learning_rate": 5.9096989966555185e-06, |
| "loss": 8.8995, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.016711924964976175, |
| "eval_loss": 8.736359596252441, |
| "eval_runtime": 193.2903, |
| "eval_samples_per_second": 51.736, |
| "eval_steps_per_second": 6.467, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.0182311908708831, |
| "grad_norm": 3.2760348320007324, |
| "learning_rate": 5.899665551839465e-06, |
| "loss": 8.8734, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.0182311908708831, |
| "eval_loss": 8.70866870880127, |
| "eval_runtime": 193.2723, |
| "eval_samples_per_second": 51.74, |
| "eval_steps_per_second": 6.468, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.019750456776790024, |
| "grad_norm": 3.7369821071624756, |
| "learning_rate": 5.889632107023412e-06, |
| "loss": 8.8472, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.019750456776790024, |
| "eval_loss": 8.686373710632324, |
| "eval_runtime": 193.3013, |
| "eval_samples_per_second": 51.733, |
| "eval_steps_per_second": 6.467, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.02126972268269695, |
| "grad_norm": 3.6638362407684326, |
| "learning_rate": 5.879598662207358e-06, |
| "loss": 8.8219, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.02126972268269695, |
| "eval_loss": 8.66162109375, |
| "eval_runtime": 193.4171, |
| "eval_samples_per_second": 51.702, |
| "eval_steps_per_second": 6.463, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.022788988588603874, |
| "grad_norm": 3.1928629875183105, |
| "learning_rate": 5.869565217391305e-06, |
| "loss": 8.7973, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.022788988588603874, |
| "eval_loss": 8.633343696594238, |
| "eval_runtime": 193.7794, |
| "eval_samples_per_second": 51.605, |
| "eval_steps_per_second": 6.451, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.024308254494510798, |
| "grad_norm": 3.5108275413513184, |
| "learning_rate": 5.8595317725752514e-06, |
| "loss": 8.7682, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.024308254494510798, |
| "eval_loss": 8.609291076660156, |
| "eval_runtime": 193.2885, |
| "eval_samples_per_second": 51.736, |
| "eval_steps_per_second": 6.467, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.025827520400417723, |
| "grad_norm": 6.164127349853516, |
| "learning_rate": 5.849498327759197e-06, |
| "loss": 8.7411, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.025827520400417723, |
| "eval_loss": 8.578601837158203, |
| "eval_runtime": 193.259, |
| "eval_samples_per_second": 51.744, |
| "eval_steps_per_second": 6.468, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.027346786306324648, |
| "grad_norm": 2.5621981620788574, |
| "learning_rate": 5.839464882943144e-06, |
| "loss": 8.7198, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.027346786306324648, |
| "eval_loss": 8.555196762084961, |
| "eval_runtime": 193.4982, |
| "eval_samples_per_second": 51.68, |
| "eval_steps_per_second": 6.46, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.028866052212231572, |
| "grad_norm": 2.957981586456299, |
| "learning_rate": 5.829431438127091e-06, |
| "loss": 8.6935, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.028866052212231572, |
| "eval_loss": 8.530313491821289, |
| "eval_runtime": 193.7009, |
| "eval_samples_per_second": 51.626, |
| "eval_steps_per_second": 6.453, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.030385318118138497, |
| "grad_norm": 5.7702836990356445, |
| "learning_rate": 5.819397993311037e-06, |
| "loss": 8.6684, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.030385318118138497, |
| "eval_loss": 8.509552001953125, |
| "eval_runtime": 193.4646, |
| "eval_samples_per_second": 51.689, |
| "eval_steps_per_second": 6.461, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.03190458402404542, |
| "grad_norm": 3.653986930847168, |
| "learning_rate": 5.8093645484949836e-06, |
| "loss": 8.6505, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.03190458402404542, |
| "eval_loss": 8.499613761901855, |
| "eval_runtime": 193.2316, |
| "eval_samples_per_second": 51.751, |
| "eval_steps_per_second": 6.469, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.03342384992995235, |
| "grad_norm": 4.66618537902832, |
| "learning_rate": 5.79933110367893e-06, |
| "loss": 8.6175, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.03342384992995235, |
| "eval_loss": 8.473803520202637, |
| "eval_runtime": 193.4189, |
| "eval_samples_per_second": 51.701, |
| "eval_steps_per_second": 6.463, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.034943115835859274, |
| "grad_norm": 0.7005074620246887, |
| "learning_rate": 5.789297658862876e-06, |
| "loss": 8.5932, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.034943115835859274, |
| "eval_loss": 8.452431678771973, |
| "eval_runtime": 193.2263, |
| "eval_samples_per_second": 51.753, |
| "eval_steps_per_second": 6.469, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.0364623817417662, |
| "grad_norm": 5.592404842376709, |
| "learning_rate": 5.779264214046823e-06, |
| "loss": 8.572, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.0364623817417662, |
| "eval_loss": 8.448452949523926, |
| "eval_runtime": 193.5678, |
| "eval_samples_per_second": 51.661, |
| "eval_steps_per_second": 6.458, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.037981647647673124, |
| "grad_norm": 4.363527297973633, |
| "learning_rate": 5.76923076923077e-06, |
| "loss": 8.5536, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.037981647647673124, |
| "eval_loss": 8.417658805847168, |
| "eval_runtime": 193.2714, |
| "eval_samples_per_second": 51.741, |
| "eval_steps_per_second": 6.468, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.03950091355358005, |
| "grad_norm": 4.716485023498535, |
| "learning_rate": 5.759197324414716e-06, |
| "loss": 8.5332, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.03950091355358005, |
| "eval_loss": 8.41653823852539, |
| "eval_runtime": 193.336, |
| "eval_samples_per_second": 51.723, |
| "eval_steps_per_second": 6.465, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.04102017945948697, |
| "grad_norm": 2.145522117614746, |
| "learning_rate": 5.7491638795986624e-06, |
| "loss": 8.5152, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.04102017945948697, |
| "eval_loss": 8.391885757446289, |
| "eval_runtime": 193.7068, |
| "eval_samples_per_second": 51.624, |
| "eval_steps_per_second": 6.453, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.0425394453653939, |
| "grad_norm": 3.36438250541687, |
| "learning_rate": 5.739130434782609e-06, |
| "loss": 8.4964, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.0425394453653939, |
| "eval_loss": 8.382240295410156, |
| "eval_runtime": 193.7119, |
| "eval_samples_per_second": 51.623, |
| "eval_steps_per_second": 6.453, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.04405871127130082, |
| "grad_norm": 3.0056991577148438, |
| "learning_rate": 5.729096989966555e-06, |
| "loss": 8.4811, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.04405871127130082, |
| "eval_loss": 8.374021530151367, |
| "eval_runtime": 193.9566, |
| "eval_samples_per_second": 51.558, |
| "eval_steps_per_second": 6.445, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.04557797717720775, |
| "grad_norm": 2.388469696044922, |
| "learning_rate": 5.719063545150502e-06, |
| "loss": 8.4762, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.04557797717720775, |
| "eval_loss": 8.371816635131836, |
| "eval_runtime": 193.5842, |
| "eval_samples_per_second": 51.657, |
| "eval_steps_per_second": 6.457, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.04709724308311467, |
| "grad_norm": 4.248419761657715, |
| "learning_rate": 5.709030100334449e-06, |
| "loss": 8.458, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.04709724308311467, |
| "eval_loss": 8.359615325927734, |
| "eval_runtime": 193.612, |
| "eval_samples_per_second": 51.65, |
| "eval_steps_per_second": 6.456, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.048616508989021597, |
| "grad_norm": 1.2234629392623901, |
| "learning_rate": 5.698996655518395e-06, |
| "loss": 8.442, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.048616508989021597, |
| "eval_loss": 8.356290817260742, |
| "eval_runtime": 193.4203, |
| "eval_samples_per_second": 51.701, |
| "eval_steps_per_second": 6.463, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.05013577489492852, |
| "grad_norm": 1.149261236190796, |
| "learning_rate": 5.688963210702341e-06, |
| "loss": 8.434, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.05013577489492852, |
| "eval_loss": 8.348698616027832, |
| "eval_runtime": 193.5122, |
| "eval_samples_per_second": 51.676, |
| "eval_steps_per_second": 6.46, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.051655040800835446, |
| "grad_norm": 3.746015787124634, |
| "learning_rate": 5.678929765886288e-06, |
| "loss": 8.4225, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.051655040800835446, |
| "eval_loss": 8.341509819030762, |
| "eval_runtime": 193.694, |
| "eval_samples_per_second": 51.628, |
| "eval_steps_per_second": 6.453, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.05317430670674237, |
| "grad_norm": 3.512450933456421, |
| "learning_rate": 5.668896321070235e-06, |
| "loss": 8.4084, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.05317430670674237, |
| "eval_loss": 8.333552360534668, |
| "eval_runtime": 193.637, |
| "eval_samples_per_second": 51.643, |
| "eval_steps_per_second": 6.455, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.054693572612649295, |
| "grad_norm": 2.823720693588257, |
| "learning_rate": 5.658862876254181e-06, |
| "loss": 8.401, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.054693572612649295, |
| "eval_loss": 8.334371566772461, |
| "eval_runtime": 193.5758, |
| "eval_samples_per_second": 51.659, |
| "eval_steps_per_second": 6.457, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.05621283851855622, |
| "grad_norm": 3.2911577224731445, |
| "learning_rate": 5.6488294314381275e-06, |
| "loss": 8.3905, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.05621283851855622, |
| "eval_loss": 8.324334144592285, |
| "eval_runtime": 193.6614, |
| "eval_samples_per_second": 51.637, |
| "eval_steps_per_second": 6.455, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.057732104424463145, |
| "grad_norm": 2.3814852237701416, |
| "learning_rate": 5.638795986622074e-06, |
| "loss": 8.3799, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.057732104424463145, |
| "eval_loss": 8.320505142211914, |
| "eval_runtime": 193.6578, |
| "eval_samples_per_second": 51.637, |
| "eval_steps_per_second": 6.455, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.05925137033037007, |
| "grad_norm": 3.9368467330932617, |
| "learning_rate": 5.62876254180602e-06, |
| "loss": 8.3716, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.05925137033037007, |
| "eval_loss": 8.320087432861328, |
| "eval_runtime": 193.6177, |
| "eval_samples_per_second": 51.648, |
| "eval_steps_per_second": 6.456, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.060770636236276994, |
| "grad_norm": 3.7462780475616455, |
| "learning_rate": 5.618729096989967e-06, |
| "loss": 8.366, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.060770636236276994, |
| "eval_loss": 8.314221382141113, |
| "eval_runtime": 193.8249, |
| "eval_samples_per_second": 51.593, |
| "eval_steps_per_second": 6.449, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.06228990214218392, |
| "grad_norm": 4.8095598220825195, |
| "learning_rate": 5.608695652173914e-06, |
| "loss": 8.3588, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.06228990214218392, |
| "eval_loss": 8.31184196472168, |
| "eval_runtime": 193.6735, |
| "eval_samples_per_second": 51.633, |
| "eval_steps_per_second": 6.454, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.06380916804809084, |
| "grad_norm": 1.4702892303466797, |
| "learning_rate": 5.59866220735786e-06, |
| "loss": 8.349, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.06380916804809084, |
| "eval_loss": 8.315442085266113, |
| "eval_runtime": 193.7492, |
| "eval_samples_per_second": 51.613, |
| "eval_steps_per_second": 6.452, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.06532843395399776, |
| "grad_norm": 1.3424737453460693, |
| "learning_rate": 5.588628762541806e-06, |
| "loss": 8.3377, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.06532843395399776, |
| "eval_loss": 8.299623489379883, |
| "eval_runtime": 193.8901, |
| "eval_samples_per_second": 51.576, |
| "eval_steps_per_second": 6.447, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.0668476998599047, |
| "grad_norm": 3.2085587978363037, |
| "learning_rate": 5.578595317725753e-06, |
| "loss": 8.3276, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.0668476998599047, |
| "eval_loss": 8.291953086853027, |
| "eval_runtime": 193.3945, |
| "eval_samples_per_second": 51.708, |
| "eval_steps_per_second": 6.463, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.06836696576581162, |
| "grad_norm": 3.0818777084350586, |
| "learning_rate": 5.568561872909699e-06, |
| "loss": 8.3213, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.06836696576581162, |
| "eval_loss": 8.284076690673828, |
| "eval_runtime": 193.6893, |
| "eval_samples_per_second": 51.629, |
| "eval_steps_per_second": 6.454, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.06988623167171855, |
| "grad_norm": 2.9899518489837646, |
| "learning_rate": 5.558528428093646e-06, |
| "loss": 8.3146, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.06988623167171855, |
| "eval_loss": 8.288785934448242, |
| "eval_runtime": 193.4136, |
| "eval_samples_per_second": 51.703, |
| "eval_steps_per_second": 6.463, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.07140549757762547, |
| "grad_norm": 3.5509963035583496, |
| "learning_rate": 5.548494983277593e-06, |
| "loss": 8.3073, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.07140549757762547, |
| "eval_loss": 8.283821105957031, |
| "eval_runtime": 193.5669, |
| "eval_samples_per_second": 51.662, |
| "eval_steps_per_second": 6.458, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.0729247634835324, |
| "grad_norm": 3.2348263263702393, |
| "learning_rate": 5.5384615384615385e-06, |
| "loss": 8.3002, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.0729247634835324, |
| "eval_loss": 8.275022506713867, |
| "eval_runtime": 193.5313, |
| "eval_samples_per_second": 51.671, |
| "eval_steps_per_second": 6.459, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.07444402938943932, |
| "grad_norm": 3.035083293914795, |
| "learning_rate": 5.528428093645485e-06, |
| "loss": 8.2929, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.07444402938943932, |
| "eval_loss": 8.270652770996094, |
| "eval_runtime": 193.6056, |
| "eval_samples_per_second": 51.651, |
| "eval_steps_per_second": 6.456, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.07596329529534625, |
| "grad_norm": 2.820732593536377, |
| "learning_rate": 5.518394648829432e-06, |
| "loss": 8.2858, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.07596329529534625, |
| "eval_loss": 8.268174171447754, |
| "eval_runtime": 193.8467, |
| "eval_samples_per_second": 51.587, |
| "eval_steps_per_second": 6.448, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.07748256120125317, |
| "grad_norm": 3.0247511863708496, |
| "learning_rate": 5.508361204013378e-06, |
| "loss": 8.2786, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.07748256120125317, |
| "eval_loss": 8.26013469696045, |
| "eval_runtime": 193.8849, |
| "eval_samples_per_second": 51.577, |
| "eval_steps_per_second": 6.447, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.0790018271071601, |
| "grad_norm": 3.4587104320526123, |
| "learning_rate": 5.498327759197324e-06, |
| "loss": 8.2727, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.0790018271071601, |
| "eval_loss": 8.258410453796387, |
| "eval_runtime": 193.8157, |
| "eval_samples_per_second": 51.595, |
| "eval_steps_per_second": 6.449, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.08052109301306702, |
| "grad_norm": 3.1752476692199707, |
| "learning_rate": 5.488294314381271e-06, |
| "loss": 8.269, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.08052109301306702, |
| "eval_loss": 8.251901626586914, |
| "eval_runtime": 193.7661, |
| "eval_samples_per_second": 51.609, |
| "eval_steps_per_second": 6.451, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.08204035891897395, |
| "grad_norm": 2.231090784072876, |
| "learning_rate": 5.478260869565217e-06, |
| "loss": 8.2612, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.08204035891897395, |
| "eval_loss": 8.248543739318848, |
| "eval_runtime": 193.7807, |
| "eval_samples_per_second": 51.605, |
| "eval_steps_per_second": 6.451, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.08355962482488087, |
| "grad_norm": 1.8496346473693848, |
| "learning_rate": 5.468227424749163e-06, |
| "loss": 8.2594, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.08355962482488087, |
| "eval_loss": 8.25257396697998, |
| "eval_runtime": 193.5194, |
| "eval_samples_per_second": 51.674, |
| "eval_steps_per_second": 6.459, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.0850788907307878, |
| "grad_norm": 2.26971435546875, |
| "learning_rate": 5.45819397993311e-06, |
| "loss": 8.2519, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.0850788907307878, |
| "eval_loss": 8.239155769348145, |
| "eval_runtime": 193.6858, |
| "eval_samples_per_second": 51.63, |
| "eval_steps_per_second": 6.454, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.08659815663669472, |
| "grad_norm": 4.062191963195801, |
| "learning_rate": 5.448160535117057e-06, |
| "loss": 8.2494, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.08659815663669472, |
| "eval_loss": 8.248674392700195, |
| "eval_runtime": 193.7941, |
| "eval_samples_per_second": 51.601, |
| "eval_steps_per_second": 6.45, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.08811742254260164, |
| "grad_norm": 2.0019612312316895, |
| "learning_rate": 5.438127090301003e-06, |
| "loss": 8.246, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.08811742254260164, |
| "eval_loss": 8.234210968017578, |
| "eval_runtime": 193.7021, |
| "eval_samples_per_second": 51.626, |
| "eval_steps_per_second": 6.453, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.08963668844850857, |
| "grad_norm": 3.5080573558807373, |
| "learning_rate": 5.4280936454849495e-06, |
| "loss": 8.2391, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.08963668844850857, |
| "eval_loss": 8.240001678466797, |
| "eval_runtime": 193.5894, |
| "eval_samples_per_second": 51.656, |
| "eval_steps_per_second": 6.457, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.0911559543544155, |
| "grad_norm": 2.578500747680664, |
| "learning_rate": 5.418060200668896e-06, |
| "loss": 8.2361, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.0911559543544155, |
| "eval_loss": 8.238499641418457, |
| "eval_runtime": 193.7443, |
| "eval_samples_per_second": 51.614, |
| "eval_steps_per_second": 6.452, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.09267522026032242, |
| "grad_norm": 2.7456629276275635, |
| "learning_rate": 5.408026755852843e-06, |
| "loss": 8.2331, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.09267522026032242, |
| "eval_loss": 8.225603103637695, |
| "eval_runtime": 193.7051, |
| "eval_samples_per_second": 51.625, |
| "eval_steps_per_second": 6.453, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.09419448616622934, |
| "grad_norm": 1.1776982545852661, |
| "learning_rate": 5.397993311036789e-06, |
| "loss": 8.2294, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.09419448616622934, |
| "eval_loss": 8.235060691833496, |
| "eval_runtime": 193.5474, |
| "eval_samples_per_second": 51.667, |
| "eval_steps_per_second": 6.458, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.09571375207213627, |
| "grad_norm": 3.159752130508423, |
| "learning_rate": 5.387959866220736e-06, |
| "loss": 8.2273, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.09571375207213627, |
| "eval_loss": 8.22216510772705, |
| "eval_runtime": 193.5026, |
| "eval_samples_per_second": 51.679, |
| "eval_steps_per_second": 6.46, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.09723301797804319, |
| "grad_norm": 2.37727427482605, |
| "learning_rate": 5.3779264214046825e-06, |
| "loss": 8.2231, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.09723301797804319, |
| "eval_loss": 8.222684860229492, |
| "eval_runtime": 193.5974, |
| "eval_samples_per_second": 51.654, |
| "eval_steps_per_second": 6.457, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.09875228388395012, |
| "grad_norm": 2.0136072635650635, |
| "learning_rate": 5.367892976588628e-06, |
| "loss": 8.2203, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.09875228388395012, |
| "eval_loss": 8.220030784606934, |
| "eval_runtime": 193.6795, |
| "eval_samples_per_second": 51.632, |
| "eval_steps_per_second": 6.454, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.10027154978985704, |
| "grad_norm": 2.404653787612915, |
| "learning_rate": 5.357859531772575e-06, |
| "loss": 8.2154, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.10027154978985704, |
| "eval_loss": 8.219395637512207, |
| "eval_runtime": 193.5124, |
| "eval_samples_per_second": 51.676, |
| "eval_steps_per_second": 6.46, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.10179081569576397, |
| "grad_norm": 1.6043188571929932, |
| "learning_rate": 5.347826086956522e-06, |
| "loss": 8.2128, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.10179081569576397, |
| "eval_loss": 8.216435432434082, |
| "eval_runtime": 193.766, |
| "eval_samples_per_second": 51.609, |
| "eval_steps_per_second": 6.451, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.10331008160167089, |
| "grad_norm": 2.3386034965515137, |
| "learning_rate": 5.337792642140468e-06, |
| "loss": 8.2079, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.10331008160167089, |
| "eval_loss": 8.212626457214355, |
| "eval_runtime": 193.6166, |
| "eval_samples_per_second": 51.648, |
| "eval_steps_per_second": 6.456, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.10482934750757782, |
| "grad_norm": 2.259270668029785, |
| "learning_rate": 5.327759197324415e-06, |
| "loss": 8.2067, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.10482934750757782, |
| "eval_loss": 8.208475112915039, |
| "eval_runtime": 193.3797, |
| "eval_samples_per_second": 51.712, |
| "eval_steps_per_second": 6.464, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.10634861341348474, |
| "grad_norm": 2.469719409942627, |
| "learning_rate": 5.317725752508361e-06, |
| "loss": 8.1994, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.10634861341348474, |
| "eval_loss": 8.199501037597656, |
| "eval_runtime": 193.3429, |
| "eval_samples_per_second": 51.722, |
| "eval_steps_per_second": 6.465, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.10786787931939167, |
| "grad_norm": 4.370075702667236, |
| "learning_rate": 5.307692307692307e-06, |
| "loss": 8.1678, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.10786787931939167, |
| "eval_loss": 8.09277629852295, |
| "eval_runtime": 193.6702, |
| "eval_samples_per_second": 51.634, |
| "eval_steps_per_second": 6.454, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.10938714522529859, |
| "grad_norm": 5.548232555389404, |
| "learning_rate": 5.297658862876254e-06, |
| "loss": 8.0856, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.10938714522529859, |
| "eval_loss": 8.040851593017578, |
| "eval_runtime": 193.617, |
| "eval_samples_per_second": 51.648, |
| "eval_steps_per_second": 6.456, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.11090641113120552, |
| "grad_norm": 2.152247428894043, |
| "learning_rate": 5.287625418060201e-06, |
| "loss": 8.0478, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.11090641113120552, |
| "eval_loss": 7.996228218078613, |
| "eval_runtime": 193.613, |
| "eval_samples_per_second": 51.649, |
| "eval_steps_per_second": 6.456, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.11242567703711244, |
| "grad_norm": 5.204161167144775, |
| "learning_rate": 5.277591973244147e-06, |
| "loss": 8.0095, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.11242567703711244, |
| "eval_loss": 7.958820343017578, |
| "eval_runtime": 193.5944, |
| "eval_samples_per_second": 51.654, |
| "eval_steps_per_second": 6.457, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.11394494294301936, |
| "grad_norm": 7.082394123077393, |
| "learning_rate": 5.2675585284280935e-06, |
| "loss": 7.978, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.11394494294301936, |
| "eval_loss": 7.932178020477295, |
| "eval_runtime": 193.798, |
| "eval_samples_per_second": 51.6, |
| "eval_steps_per_second": 6.45, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.11546420884892629, |
| "grad_norm": 8.926252365112305, |
| "learning_rate": 5.25752508361204e-06, |
| "loss": 7.9505, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.11546420884892629, |
| "eval_loss": 7.882853031158447, |
| "eval_runtime": 193.6404, |
| "eval_samples_per_second": 51.642, |
| "eval_steps_per_second": 6.455, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.11698347475483321, |
| "grad_norm": 3.5671885013580322, |
| "learning_rate": 5.247491638795986e-06, |
| "loss": 7.9086, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.11698347475483321, |
| "eval_loss": 7.840451717376709, |
| "eval_runtime": 193.8911, |
| "eval_samples_per_second": 51.575, |
| "eval_steps_per_second": 6.447, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.11850274066074014, |
| "grad_norm": 5.790298938751221, |
| "learning_rate": 5.237458193979933e-06, |
| "loss": 7.861, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.11850274066074014, |
| "eval_loss": 7.790124416351318, |
| "eval_runtime": 193.8987, |
| "eval_samples_per_second": 51.573, |
| "eval_steps_per_second": 6.447, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.12002200656664706, |
| "grad_norm": 4.439774513244629, |
| "learning_rate": 5.22742474916388e-06, |
| "loss": 7.8082, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.12002200656664706, |
| "eval_loss": 7.72878885269165, |
| "eval_runtime": 193.9191, |
| "eval_samples_per_second": 51.568, |
| "eval_steps_per_second": 6.446, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.12154127247255399, |
| "grad_norm": 3.937167167663574, |
| "learning_rate": 5.2173913043478265e-06, |
| "loss": 7.757, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.12154127247255399, |
| "eval_loss": 7.679111003875732, |
| "eval_runtime": 193.7697, |
| "eval_samples_per_second": 51.608, |
| "eval_steps_per_second": 6.451, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.12306053837846091, |
| "grad_norm": 4.227074146270752, |
| "learning_rate": 5.207357859531772e-06, |
| "loss": 7.7088, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.12306053837846091, |
| "eval_loss": 7.634475231170654, |
| "eval_runtime": 193.766, |
| "eval_samples_per_second": 51.609, |
| "eval_steps_per_second": 6.451, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.12457980428436784, |
| "grad_norm": 3.042202949523926, |
| "learning_rate": 5.197324414715719e-06, |
| "loss": 7.6639, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.12457980428436784, |
| "eval_loss": 7.605250835418701, |
| "eval_runtime": 193.8104, |
| "eval_samples_per_second": 51.597, |
| "eval_steps_per_second": 6.45, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.12609907019027475, |
| "grad_norm": 4.436267375946045, |
| "learning_rate": 5.187290969899666e-06, |
| "loss": 7.6256, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.12609907019027475, |
| "eval_loss": 7.549579620361328, |
| "eval_runtime": 193.7749, |
| "eval_samples_per_second": 51.606, |
| "eval_steps_per_second": 6.451, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.12761833609618167, |
| "grad_norm": 3.9829390048980713, |
| "learning_rate": 5.177257525083612e-06, |
| "loss": 7.5838, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.12761833609618167, |
| "eval_loss": 7.504631042480469, |
| "eval_runtime": 193.7766, |
| "eval_samples_per_second": 51.606, |
| "eval_steps_per_second": 6.451, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.1291376020020886, |
| "grad_norm": 3.072918176651001, |
| "learning_rate": 5.167224080267559e-06, |
| "loss": 7.5446, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.1291376020020886, |
| "eval_loss": 7.467813968658447, |
| "eval_runtime": 194.0927, |
| "eval_samples_per_second": 51.522, |
| "eval_steps_per_second": 6.44, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.13065686790799552, |
| "grad_norm": 3.6358697414398193, |
| "learning_rate": 5.157190635451505e-06, |
| "loss": 7.5114, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.13065686790799552, |
| "eval_loss": 7.4314045906066895, |
| "eval_runtime": 197.5998, |
| "eval_samples_per_second": 50.607, |
| "eval_steps_per_second": 6.326, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.13217613381390245, |
| "grad_norm": 2.4115982055664062, |
| "learning_rate": 5.147157190635451e-06, |
| "loss": 7.4749, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.13217613381390245, |
| "eval_loss": 7.399141311645508, |
| "eval_runtime": 193.7763, |
| "eval_samples_per_second": 51.606, |
| "eval_steps_per_second": 6.451, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.1336953997198094, |
| "grad_norm": 3.8994717597961426, |
| "learning_rate": 5.137123745819398e-06, |
| "loss": 7.4388, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.1336953997198094, |
| "eval_loss": 7.3639349937438965, |
| "eval_runtime": 193.6905, |
| "eval_samples_per_second": 51.629, |
| "eval_steps_per_second": 6.454, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.13521466562571632, |
| "grad_norm": 3.6934337615966797, |
| "learning_rate": 5.127090301003345e-06, |
| "loss": 7.4034, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.13521466562571632, |
| "eval_loss": 7.327520370483398, |
| "eval_runtime": 193.686, |
| "eval_samples_per_second": 51.63, |
| "eval_steps_per_second": 6.454, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.13673393153162325, |
| "grad_norm": 3.4741897583007812, |
| "learning_rate": 5.117056856187291e-06, |
| "loss": 7.3684, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.13673393153162325, |
| "eval_loss": 7.294392108917236, |
| "eval_runtime": 193.3544, |
| "eval_samples_per_second": 51.719, |
| "eval_steps_per_second": 6.465, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.13825319743753017, |
| "grad_norm": 4.130598545074463, |
| "learning_rate": 5.1070234113712375e-06, |
| "loss": 7.3363, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.13825319743753017, |
| "eval_loss": 7.256102561950684, |
| "eval_runtime": 193.8255, |
| "eval_samples_per_second": 51.593, |
| "eval_steps_per_second": 6.449, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.1397724633434371, |
| "grad_norm": 3.8802666664123535, |
| "learning_rate": 5.096989966555184e-06, |
| "loss": 7.3054, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.1397724633434371, |
| "eval_loss": 7.220945358276367, |
| "eval_runtime": 193.6267, |
| "eval_samples_per_second": 51.646, |
| "eval_steps_per_second": 6.456, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.14129172924934402, |
| "grad_norm": 3.072411298751831, |
| "learning_rate": 5.08695652173913e-06, |
| "loss": 7.2674, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.14129172924934402, |
| "eval_loss": 7.181826591491699, |
| "eval_runtime": 193.6566, |
| "eval_samples_per_second": 51.638, |
| "eval_steps_per_second": 6.455, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.14281099515525095, |
| "grad_norm": 4.051361560821533, |
| "learning_rate": 5.076923076923077e-06, |
| "loss": 7.2294, |
| "step": 9400 |
| }, |
| { |
| "epoch": 0.14281099515525095, |
| "eval_loss": 7.154284477233887, |
| "eval_runtime": 193.737, |
| "eval_samples_per_second": 51.616, |
| "eval_steps_per_second": 6.452, |
| "step": 9400 |
| }, |
| { |
| "epoch": 0.14433026106115787, |
| "grad_norm": 3.4815194606781006, |
| "learning_rate": 5.066889632107024e-06, |
| "loss": 7.1993, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.14433026106115787, |
| "eval_loss": 7.109873294830322, |
| "eval_runtime": 193.7018, |
| "eval_samples_per_second": 51.626, |
| "eval_steps_per_second": 6.453, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.1458495269670648, |
| "grad_norm": 4.168730735778809, |
| "learning_rate": 5.05685618729097e-06, |
| "loss": 7.1617, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.1458495269670648, |
| "eval_loss": 7.068033695220947, |
| "eval_runtime": 193.7844, |
| "eval_samples_per_second": 51.604, |
| "eval_steps_per_second": 6.45, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.14736879287297172, |
| "grad_norm": 4.632892608642578, |
| "learning_rate": 5.046822742474916e-06, |
| "loss": 7.1265, |
| "step": 9700 |
| }, |
| { |
| "epoch": 0.14736879287297172, |
| "eval_loss": 7.029054641723633, |
| "eval_runtime": 193.7033, |
| "eval_samples_per_second": 51.625, |
| "eval_steps_per_second": 6.453, |
| "step": 9700 |
| }, |
| { |
| "epoch": 0.14888805877887865, |
| "grad_norm": 5.668432235717773, |
| "learning_rate": 5.036789297658863e-06, |
| "loss": 7.0973, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.14888805877887865, |
| "eval_loss": 7.001068115234375, |
| "eval_runtime": 193.7831, |
| "eval_samples_per_second": 51.604, |
| "eval_steps_per_second": 6.451, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.15040732468478557, |
| "grad_norm": 6.07447624206543, |
| "learning_rate": 5.02675585284281e-06, |
| "loss": 7.0693, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.15040732468478557, |
| "eval_loss": 6.974251747131348, |
| "eval_runtime": 193.7698, |
| "eval_samples_per_second": 51.608, |
| "eval_steps_per_second": 6.451, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.1519265905906925, |
| "grad_norm": 5.610072135925293, |
| "learning_rate": 5.016722408026756e-06, |
| "loss": 7.0395, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.1519265905906925, |
| "eval_loss": 6.959811687469482, |
| "eval_runtime": 193.7123, |
| "eval_samples_per_second": 51.623, |
| "eval_steps_per_second": 6.453, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.15344585649659942, |
| "grad_norm": 4.722342491149902, |
| "learning_rate": 5.0066889632107026e-06, |
| "loss": 7.0146, |
| "step": 10100 |
| }, |
| { |
| "epoch": 0.15344585649659942, |
| "eval_loss": 6.919780254364014, |
| "eval_runtime": 193.9627, |
| "eval_samples_per_second": 51.556, |
| "eval_steps_per_second": 6.445, |
| "step": 10100 |
| }, |
| { |
| "epoch": 0.15496512240250634, |
| "grad_norm": 2.454202175140381, |
| "learning_rate": 4.996655518394649e-06, |
| "loss": 6.979, |
| "step": 10200 |
| }, |
| { |
| "epoch": 0.15496512240250634, |
| "eval_loss": 6.8777360916137695, |
| "eval_runtime": 193.5746, |
| "eval_samples_per_second": 51.66, |
| "eval_steps_per_second": 6.457, |
| "step": 10200 |
| }, |
| { |
| "epoch": 0.15648438830841327, |
| "grad_norm": 6.628566265106201, |
| "learning_rate": 4.986622073578595e-06, |
| "loss": 6.9576, |
| "step": 10300 |
| }, |
| { |
| "epoch": 0.15648438830841327, |
| "eval_loss": 6.851335048675537, |
| "eval_runtime": 193.6238, |
| "eval_samples_per_second": 51.647, |
| "eval_steps_per_second": 6.456, |
| "step": 10300 |
| }, |
| { |
| "epoch": 0.1580036542143202, |
| "grad_norm": 4.226571559906006, |
| "learning_rate": 4.976588628762542e-06, |
| "loss": 6.9294, |
| "step": 10400 |
| }, |
| { |
| "epoch": 0.1580036542143202, |
| "eval_loss": 6.837319374084473, |
| "eval_runtime": 193.6321, |
| "eval_samples_per_second": 51.644, |
| "eval_steps_per_second": 6.456, |
| "step": 10400 |
| }, |
| { |
| "epoch": 0.15952292012022712, |
| "grad_norm": 3.949143648147583, |
| "learning_rate": 4.966555183946489e-06, |
| "loss": 6.906, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.15952292012022712, |
| "eval_loss": 6.798065662384033, |
| "eval_runtime": 193.7227, |
| "eval_samples_per_second": 51.62, |
| "eval_steps_per_second": 6.453, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.16104218602613404, |
| "grad_norm": 4.327299118041992, |
| "learning_rate": 4.956521739130435e-06, |
| "loss": 6.8789, |
| "step": 10600 |
| }, |
| { |
| "epoch": 0.16104218602613404, |
| "eval_loss": 6.793569564819336, |
| "eval_runtime": 193.6196, |
| "eval_samples_per_second": 51.648, |
| "eval_steps_per_second": 6.456, |
| "step": 10600 |
| }, |
| { |
| "epoch": 0.16256145193204097, |
| "grad_norm": 3.8152856826782227, |
| "learning_rate": 4.9464882943143815e-06, |
| "loss": 6.8557, |
| "step": 10700 |
| }, |
| { |
| "epoch": 0.16256145193204097, |
| "eval_loss": 6.754009246826172, |
| "eval_runtime": 193.7319, |
| "eval_samples_per_second": 51.618, |
| "eval_steps_per_second": 6.452, |
| "step": 10700 |
| }, |
| { |
| "epoch": 0.1640807178379479, |
| "grad_norm": 4.621021747589111, |
| "learning_rate": 4.936454849498328e-06, |
| "loss": 6.8387, |
| "step": 10800 |
| }, |
| { |
| "epoch": 0.1640807178379479, |
| "eval_loss": 6.748414993286133, |
| "eval_runtime": 193.8639, |
| "eval_samples_per_second": 51.583, |
| "eval_steps_per_second": 6.448, |
| "step": 10800 |
| }, |
| { |
| "epoch": 0.16559998374385482, |
| "grad_norm": 4.906980514526367, |
| "learning_rate": 4.926421404682274e-06, |
| "loss": 6.8054, |
| "step": 10900 |
| }, |
| { |
| "epoch": 0.16559998374385482, |
| "eval_loss": 6.722209453582764, |
| "eval_runtime": 193.5438, |
| "eval_samples_per_second": 51.668, |
| "eval_steps_per_second": 6.458, |
| "step": 10900 |
| }, |
| { |
| "epoch": 0.16711924964976174, |
| "grad_norm": 3.9837253093719482, |
| "learning_rate": 4.916387959866221e-06, |
| "loss": 6.7829, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.16711924964976174, |
| "eval_loss": 6.689162731170654, |
| "eval_runtime": 193.4992, |
| "eval_samples_per_second": 51.68, |
| "eval_steps_per_second": 6.46, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.16863851555566867, |
| "grad_norm": 4.781426906585693, |
| "learning_rate": 4.906354515050168e-06, |
| "loss": 6.7597, |
| "step": 11100 |
| }, |
| { |
| "epoch": 0.16863851555566867, |
| "eval_loss": 6.658721923828125, |
| "eval_runtime": 193.7146, |
| "eval_samples_per_second": 51.622, |
| "eval_steps_per_second": 6.453, |
| "step": 11100 |
| }, |
| { |
| "epoch": 0.1701577814615756, |
| "grad_norm": 6.702068328857422, |
| "learning_rate": 4.8963210702341136e-06, |
| "loss": 6.7416, |
| "step": 11200 |
| }, |
| { |
| "epoch": 0.1701577814615756, |
| "eval_loss": 6.642455577850342, |
| "eval_runtime": 193.6222, |
| "eval_samples_per_second": 51.647, |
| "eval_steps_per_second": 6.456, |
| "step": 11200 |
| }, |
| { |
| "epoch": 0.17167704736748252, |
| "grad_norm": 3.1839189529418945, |
| "learning_rate": 4.88628762541806e-06, |
| "loss": 6.7201, |
| "step": 11300 |
| }, |
| { |
| "epoch": 0.17167704736748252, |
| "eval_loss": 6.614835262298584, |
| "eval_runtime": 193.735, |
| "eval_samples_per_second": 51.617, |
| "eval_steps_per_second": 6.452, |
| "step": 11300 |
| }, |
| { |
| "epoch": 0.17319631327338944, |
| "grad_norm": 5.427370071411133, |
| "learning_rate": 4.876254180602007e-06, |
| "loss": 6.6993, |
| "step": 11400 |
| }, |
| { |
| "epoch": 0.17319631327338944, |
| "eval_loss": 6.601010799407959, |
| "eval_runtime": 193.6047, |
| "eval_samples_per_second": 51.652, |
| "eval_steps_per_second": 6.456, |
| "step": 11400 |
| }, |
| { |
| "epoch": 0.17471557917929637, |
| "grad_norm": 4.759448051452637, |
| "learning_rate": 4.866220735785953e-06, |
| "loss": 6.6775, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.17471557917929637, |
| "eval_loss": 6.579466342926025, |
| "eval_runtime": 193.6792, |
| "eval_samples_per_second": 51.632, |
| "eval_steps_per_second": 6.454, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.1762348450852033, |
| "grad_norm": 3.7401344776153564, |
| "learning_rate": 4.8561872909699e-06, |
| "loss": 6.6565, |
| "step": 11600 |
| }, |
| { |
| "epoch": 0.1762348450852033, |
| "eval_loss": 6.576225757598877, |
| "eval_runtime": 194.0381, |
| "eval_samples_per_second": 51.536, |
| "eval_steps_per_second": 6.442, |
| "step": 11600 |
| }, |
| { |
| "epoch": 0.17775411099111021, |
| "grad_norm": 5.589729309082031, |
| "learning_rate": 4.8461538461538465e-06, |
| "loss": 6.6419, |
| "step": 11700 |
| }, |
| { |
| "epoch": 0.17775411099111021, |
| "eval_loss": 6.542896270751953, |
| "eval_runtime": 193.798, |
| "eval_samples_per_second": 51.6, |
| "eval_steps_per_second": 6.45, |
| "step": 11700 |
| }, |
| { |
| "epoch": 0.17927337689701714, |
| "grad_norm": 4.623971939086914, |
| "learning_rate": 4.8361204013377925e-06, |
| "loss": 6.6199, |
| "step": 11800 |
| }, |
| { |
| "epoch": 0.17927337689701714, |
| "eval_loss": 6.519240856170654, |
| "eval_runtime": 193.739, |
| "eval_samples_per_second": 51.616, |
| "eval_steps_per_second": 6.452, |
| "step": 11800 |
| }, |
| { |
| "epoch": 0.18079264280292406, |
| "grad_norm": 4.685464859008789, |
| "learning_rate": 4.826086956521739e-06, |
| "loss": 6.6012, |
| "step": 11900 |
| }, |
| { |
| "epoch": 0.18079264280292406, |
| "eval_loss": 6.489596843719482, |
| "eval_runtime": 193.7453, |
| "eval_samples_per_second": 51.614, |
| "eval_steps_per_second": 6.452, |
| "step": 11900 |
| }, |
| { |
| "epoch": 0.182311908708831, |
| "grad_norm": 3.054800271987915, |
| "learning_rate": 4.816053511705686e-06, |
| "loss": 6.5813, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.182311908708831, |
| "eval_loss": 6.497661113739014, |
| "eval_runtime": 193.7138, |
| "eval_samples_per_second": 51.623, |
| "eval_steps_per_second": 6.453, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.1838311746147379, |
| "grad_norm": 3.547619342803955, |
| "learning_rate": 4.806020066889633e-06, |
| "loss": 6.5623, |
| "step": 12100 |
| }, |
| { |
| "epoch": 0.1838311746147379, |
| "eval_loss": 6.468958377838135, |
| "eval_runtime": 193.766, |
| "eval_samples_per_second": 51.609, |
| "eval_steps_per_second": 6.451, |
| "step": 12100 |
| }, |
| { |
| "epoch": 0.18535044052064484, |
| "grad_norm": 4.697444915771484, |
| "learning_rate": 4.795986622073579e-06, |
| "loss": 6.5448, |
| "step": 12200 |
| }, |
| { |
| "epoch": 0.18535044052064484, |
| "eval_loss": 6.436464309692383, |
| "eval_runtime": 193.7105, |
| "eval_samples_per_second": 51.623, |
| "eval_steps_per_second": 6.453, |
| "step": 12200 |
| }, |
| { |
| "epoch": 0.18686970642655176, |
| "grad_norm": 4.79019021987915, |
| "learning_rate": 4.785953177257525e-06, |
| "loss": 6.525, |
| "step": 12300 |
| }, |
| { |
| "epoch": 0.18686970642655176, |
| "eval_loss": 6.422084331512451, |
| "eval_runtime": 193.7424, |
| "eval_samples_per_second": 51.615, |
| "eval_steps_per_second": 6.452, |
| "step": 12300 |
| }, |
| { |
| "epoch": 0.1883889723324587, |
| "grad_norm": 3.7939579486846924, |
| "learning_rate": 4.775919732441472e-06, |
| "loss": 6.5131, |
| "step": 12400 |
| }, |
| { |
| "epoch": 0.1883889723324587, |
| "eval_loss": 6.425159931182861, |
| "eval_runtime": 193.6634, |
| "eval_samples_per_second": 51.636, |
| "eval_steps_per_second": 6.454, |
| "step": 12400 |
| }, |
| { |
| "epoch": 0.1899082382383656, |
| "grad_norm": 3.271348714828491, |
| "learning_rate": 4.765886287625418e-06, |
| "loss": 6.4978, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.1899082382383656, |
| "eval_loss": 6.395818710327148, |
| "eval_runtime": 193.5299, |
| "eval_samples_per_second": 51.672, |
| "eval_steps_per_second": 6.459, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.19142750414427254, |
| "grad_norm": 4.119296073913574, |
| "learning_rate": 4.755852842809365e-06, |
| "loss": 6.483, |
| "step": 12600 |
| }, |
| { |
| "epoch": 0.19142750414427254, |
| "eval_loss": 6.397064208984375, |
| "eval_runtime": 193.5731, |
| "eval_samples_per_second": 51.66, |
| "eval_steps_per_second": 6.458, |
| "step": 12600 |
| }, |
| { |
| "epoch": 0.19294677005017946, |
| "grad_norm": 3.7907373905181885, |
| "learning_rate": 4.745819397993312e-06, |
| "loss": 6.4639, |
| "step": 12700 |
| }, |
| { |
| "epoch": 0.19294677005017946, |
| "eval_loss": 6.348308563232422, |
| "eval_runtime": 193.6279, |
| "eval_samples_per_second": 51.645, |
| "eval_steps_per_second": 6.456, |
| "step": 12700 |
| }, |
| { |
| "epoch": 0.19446603595608639, |
| "grad_norm": 3.8455281257629395, |
| "learning_rate": 4.7357859531772575e-06, |
| "loss": 6.4471, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.19446603595608639, |
| "eval_loss": 6.347524642944336, |
| "eval_runtime": 193.9737, |
| "eval_samples_per_second": 51.553, |
| "eval_steps_per_second": 6.444, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.1959853018619933, |
| "grad_norm": 3.5916056632995605, |
| "learning_rate": 4.725752508361204e-06, |
| "loss": 6.4303, |
| "step": 12900 |
| }, |
| { |
| "epoch": 0.1959853018619933, |
| "eval_loss": 6.33302640914917, |
| "eval_runtime": 193.6018, |
| "eval_samples_per_second": 51.652, |
| "eval_steps_per_second": 6.457, |
| "step": 12900 |
| }, |
| { |
| "epoch": 0.19750456776790024, |
| "grad_norm": 3.734985589981079, |
| "learning_rate": 4.715719063545151e-06, |
| "loss": 6.4158, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.19750456776790024, |
| "eval_loss": 6.3155083656311035, |
| "eval_runtime": 193.6987, |
| "eval_samples_per_second": 51.627, |
| "eval_steps_per_second": 6.453, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.19902383367380716, |
| "grad_norm": 3.579678535461426, |
| "learning_rate": 4.705685618729097e-06, |
| "loss": 6.4024, |
| "step": 13100 |
| }, |
| { |
| "epoch": 0.19902383367380716, |
| "eval_loss": 6.29377555847168, |
| "eval_runtime": 193.6889, |
| "eval_samples_per_second": 51.629, |
| "eval_steps_per_second": 6.454, |
| "step": 13100 |
| }, |
| { |
| "epoch": 0.20054309957971408, |
| "grad_norm": 4.257501125335693, |
| "learning_rate": 4.695652173913044e-06, |
| "loss": 6.3894, |
| "step": 13200 |
| }, |
| { |
| "epoch": 0.20054309957971408, |
| "eval_loss": 6.288681507110596, |
| "eval_runtime": 193.6264, |
| "eval_samples_per_second": 51.646, |
| "eval_steps_per_second": 6.456, |
| "step": 13200 |
| }, |
| { |
| "epoch": 0.202062365485621, |
| "grad_norm": 3.8430824279785156, |
| "learning_rate": 4.6856187290969905e-06, |
| "loss": 6.3715, |
| "step": 13300 |
| }, |
| { |
| "epoch": 0.202062365485621, |
| "eval_loss": 6.247255802154541, |
| "eval_runtime": 193.7628, |
| "eval_samples_per_second": 51.609, |
| "eval_steps_per_second": 6.451, |
| "step": 13300 |
| }, |
| { |
| "epoch": 0.20358163139152793, |
| "grad_norm": 3.9459517002105713, |
| "learning_rate": 4.675585284280936e-06, |
| "loss": 6.3583, |
| "step": 13400 |
| }, |
| { |
| "epoch": 0.20358163139152793, |
| "eval_loss": 6.250117301940918, |
| "eval_runtime": 193.7665, |
| "eval_samples_per_second": 51.609, |
| "eval_steps_per_second": 6.451, |
| "step": 13400 |
| }, |
| { |
| "epoch": 0.20510089729743486, |
| "grad_norm": 3.475034475326538, |
| "learning_rate": 4.665551839464883e-06, |
| "loss": 6.3431, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.20510089729743486, |
| "eval_loss": 6.22703742980957, |
| "eval_runtime": 193.7742, |
| "eval_samples_per_second": 51.606, |
| "eval_steps_per_second": 6.451, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.20662016320334178, |
| "grad_norm": 4.17089319229126, |
| "learning_rate": 4.65551839464883e-06, |
| "loss": 6.3288, |
| "step": 13600 |
| }, |
| { |
| "epoch": 0.20662016320334178, |
| "eval_loss": 6.230484962463379, |
| "eval_runtime": 193.7687, |
| "eval_samples_per_second": 51.608, |
| "eval_steps_per_second": 6.451, |
| "step": 13600 |
| }, |
| { |
| "epoch": 0.2081394291092487, |
| "grad_norm": 2.118986129760742, |
| "learning_rate": 4.645484949832776e-06, |
| "loss": 6.3169, |
| "step": 13700 |
| }, |
| { |
| "epoch": 0.2081394291092487, |
| "eval_loss": 6.206001281738281, |
| "eval_runtime": 193.8247, |
| "eval_samples_per_second": 51.593, |
| "eval_steps_per_second": 6.449, |
| "step": 13700 |
| }, |
| { |
| "epoch": 0.20965869501515563, |
| "grad_norm": 4.810153007507324, |
| "learning_rate": 4.635451505016723e-06, |
| "loss": 6.3032, |
| "step": 13800 |
| }, |
| { |
| "epoch": 0.20965869501515563, |
| "eval_loss": 6.18707275390625, |
| "eval_runtime": 193.7442, |
| "eval_samples_per_second": 51.614, |
| "eval_steps_per_second": 6.452, |
| "step": 13800 |
| }, |
| { |
| "epoch": 0.21117796092106256, |
| "grad_norm": 3.7797763347625732, |
| "learning_rate": 4.625418060200669e-06, |
| "loss": 6.2918, |
| "step": 13900 |
| }, |
| { |
| "epoch": 0.21117796092106256, |
| "eval_loss": 6.1838603019714355, |
| "eval_runtime": 193.7881, |
| "eval_samples_per_second": 51.603, |
| "eval_steps_per_second": 6.45, |
| "step": 13900 |
| }, |
| { |
| "epoch": 0.21269722682696948, |
| "grad_norm": 4.482378959655762, |
| "learning_rate": 4.615384615384616e-06, |
| "loss": 6.2757, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.21269722682696948, |
| "eval_loss": 6.161965847015381, |
| "eval_runtime": 193.7356, |
| "eval_samples_per_second": 51.617, |
| "eval_steps_per_second": 6.452, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.2142164927328764, |
| "grad_norm": 4.001418590545654, |
| "learning_rate": 4.605351170568562e-06, |
| "loss": 6.2647, |
| "step": 14100 |
| }, |
| { |
| "epoch": 0.2142164927328764, |
| "eval_loss": 6.15457820892334, |
| "eval_runtime": 193.9234, |
| "eval_samples_per_second": 51.567, |
| "eval_steps_per_second": 6.446, |
| "step": 14100 |
| }, |
| { |
| "epoch": 0.21573575863878333, |
| "grad_norm": 3.8982086181640625, |
| "learning_rate": 4.595317725752509e-06, |
| "loss": 6.2545, |
| "step": 14200 |
| }, |
| { |
| "epoch": 0.21573575863878333, |
| "eval_loss": 6.143900394439697, |
| "eval_runtime": 193.7808, |
| "eval_samples_per_second": 51.605, |
| "eval_steps_per_second": 6.451, |
| "step": 14200 |
| }, |
| { |
| "epoch": 0.21725502454469026, |
| "grad_norm": 3.459050416946411, |
| "learning_rate": 4.585284280936456e-06, |
| "loss": 6.2398, |
| "step": 14300 |
| }, |
| { |
| "epoch": 0.21725502454469026, |
| "eval_loss": 6.131939888000488, |
| "eval_runtime": 193.8856, |
| "eval_samples_per_second": 51.577, |
| "eval_steps_per_second": 6.447, |
| "step": 14300 |
| }, |
| { |
| "epoch": 0.21877429045059718, |
| "grad_norm": 3.335505962371826, |
| "learning_rate": 4.5752508361204015e-06, |
| "loss": 6.2327, |
| "step": 14400 |
| }, |
| { |
| "epoch": 0.21877429045059718, |
| "eval_loss": 6.106751918792725, |
| "eval_runtime": 193.844, |
| "eval_samples_per_second": 51.588, |
| "eval_steps_per_second": 6.448, |
| "step": 14400 |
| }, |
| { |
| "epoch": 0.2202935563565041, |
| "grad_norm": 3.845909357070923, |
| "learning_rate": 4.565217391304348e-06, |
| "loss": 6.2144, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.2202935563565041, |
| "eval_loss": 6.094777584075928, |
| "eval_runtime": 193.5802, |
| "eval_samples_per_second": 51.658, |
| "eval_steps_per_second": 6.457, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.22181282226241103, |
| "grad_norm": 3.846149206161499, |
| "learning_rate": 4.555183946488295e-06, |
| "loss": 6.2022, |
| "step": 14600 |
| }, |
| { |
| "epoch": 0.22181282226241103, |
| "eval_loss": 6.085541248321533, |
| "eval_runtime": 193.8098, |
| "eval_samples_per_second": 51.597, |
| "eval_steps_per_second": 6.45, |
| "step": 14600 |
| }, |
| { |
| "epoch": 0.22333208816831795, |
| "grad_norm": 3.50091814994812, |
| "learning_rate": 4.545150501672241e-06, |
| "loss": 6.1915, |
| "step": 14700 |
| }, |
| { |
| "epoch": 0.22333208816831795, |
| "eval_loss": 6.058828830718994, |
| "eval_runtime": 193.4542, |
| "eval_samples_per_second": 51.692, |
| "eval_steps_per_second": 6.461, |
| "step": 14700 |
| }, |
| { |
| "epoch": 0.22485135407422488, |
| "grad_norm": 4.312457084655762, |
| "learning_rate": 4.535117056856188e-06, |
| "loss": 6.1776, |
| "step": 14800 |
| }, |
| { |
| "epoch": 0.22485135407422488, |
| "eval_loss": 6.067806720733643, |
| "eval_runtime": 193.5484, |
| "eval_samples_per_second": 51.667, |
| "eval_steps_per_second": 6.458, |
| "step": 14800 |
| }, |
| { |
| "epoch": 0.2263706199801318, |
| "grad_norm": 3.6476268768310547, |
| "learning_rate": 4.5250836120401345e-06, |
| "loss": 6.1703, |
| "step": 14900 |
| }, |
| { |
| "epoch": 0.2263706199801318, |
| "eval_loss": 6.045175552368164, |
| "eval_runtime": 193.3937, |
| "eval_samples_per_second": 51.708, |
| "eval_steps_per_second": 6.463, |
| "step": 14900 |
| }, |
| { |
| "epoch": 0.22788988588603873, |
| "grad_norm": 4.1377739906311035, |
| "learning_rate": 4.51505016722408e-06, |
| "loss": 6.1577, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.22788988588603873, |
| "eval_loss": 6.038886547088623, |
| "eval_runtime": 193.5749, |
| "eval_samples_per_second": 51.66, |
| "eval_steps_per_second": 6.457, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.22940915179194565, |
| "grad_norm": 4.192631244659424, |
| "learning_rate": 4.505016722408027e-06, |
| "loss": 6.1477, |
| "step": 15100 |
| }, |
| { |
| "epoch": 0.22940915179194565, |
| "eval_loss": 6.030833721160889, |
| "eval_runtime": 193.5655, |
| "eval_samples_per_second": 51.662, |
| "eval_steps_per_second": 6.458, |
| "step": 15100 |
| }, |
| { |
| "epoch": 0.23092841769785258, |
| "grad_norm": 3.234416961669922, |
| "learning_rate": 4.494983277591973e-06, |
| "loss": 6.1363, |
| "step": 15200 |
| }, |
| { |
| "epoch": 0.23092841769785258, |
| "eval_loss": 6.008749008178711, |
| "eval_runtime": 193.3663, |
| "eval_samples_per_second": 51.715, |
| "eval_steps_per_second": 6.464, |
| "step": 15200 |
| }, |
| { |
| "epoch": 0.2324476836037595, |
| "grad_norm": 4.860428810119629, |
| "learning_rate": 4.48494983277592e-06, |
| "loss": 6.1298, |
| "step": 15300 |
| }, |
| { |
| "epoch": 0.2324476836037595, |
| "eval_loss": 5.996873378753662, |
| "eval_runtime": 193.6007, |
| "eval_samples_per_second": 51.653, |
| "eval_steps_per_second": 6.457, |
| "step": 15300 |
| }, |
| { |
| "epoch": 0.23396694950966643, |
| "grad_norm": 4.0561323165893555, |
| "learning_rate": 4.474916387959866e-06, |
| "loss": 6.1122, |
| "step": 15400 |
| }, |
| { |
| "epoch": 0.23396694950966643, |
| "eval_loss": 5.984120845794678, |
| "eval_runtime": 193.5721, |
| "eval_samples_per_second": 51.66, |
| "eval_steps_per_second": 6.458, |
| "step": 15400 |
| }, |
| { |
| "epoch": 0.23548621541557335, |
| "grad_norm": 2.9819724559783936, |
| "learning_rate": 4.4648829431438125e-06, |
| "loss": 6.1024, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.23548621541557335, |
| "eval_loss": 5.982254981994629, |
| "eval_runtime": 193.7425, |
| "eval_samples_per_second": 51.615, |
| "eval_steps_per_second": 6.452, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.23700548132148028, |
| "grad_norm": 3.733194351196289, |
| "learning_rate": 4.454849498327759e-06, |
| "loss": 6.0924, |
| "step": 15600 |
| }, |
| { |
| "epoch": 0.23700548132148028, |
| "eval_loss": 5.969741344451904, |
| "eval_runtime": 193.8217, |
| "eval_samples_per_second": 51.594, |
| "eval_steps_per_second": 6.449, |
| "step": 15600 |
| }, |
| { |
| "epoch": 0.2385247472273872, |
| "grad_norm": 5.688018321990967, |
| "learning_rate": 4.444816053511705e-06, |
| "loss": 6.0871, |
| "step": 15700 |
| }, |
| { |
| "epoch": 0.2385247472273872, |
| "eval_loss": 5.9461445808410645, |
| "eval_runtime": 193.8283, |
| "eval_samples_per_second": 51.592, |
| "eval_steps_per_second": 6.449, |
| "step": 15700 |
| }, |
| { |
| "epoch": 0.24004401313329413, |
| "grad_norm": 2.9404726028442383, |
| "learning_rate": 4.434782608695652e-06, |
| "loss": 6.0706, |
| "step": 15800 |
| }, |
| { |
| "epoch": 0.24004401313329413, |
| "eval_loss": 5.936134338378906, |
| "eval_runtime": 193.8629, |
| "eval_samples_per_second": 51.583, |
| "eval_steps_per_second": 6.448, |
| "step": 15800 |
| }, |
| { |
| "epoch": 0.24156327903920105, |
| "grad_norm": 4.436812877655029, |
| "learning_rate": 4.424749163879599e-06, |
| "loss": 6.0652, |
| "step": 15900 |
| }, |
| { |
| "epoch": 0.24156327903920105, |
| "eval_loss": 5.9289655685424805, |
| "eval_runtime": 193.8246, |
| "eval_samples_per_second": 51.593, |
| "eval_steps_per_second": 6.449, |
| "step": 15900 |
| }, |
| { |
| "epoch": 0.24308254494510798, |
| "grad_norm": 4.113779544830322, |
| "learning_rate": 4.414715719063545e-06, |
| "loss": 6.0497, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.24308254494510798, |
| "eval_loss": 5.926904678344727, |
| "eval_runtime": 194.0466, |
| "eval_samples_per_second": 51.534, |
| "eval_steps_per_second": 6.442, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.2446018108510149, |
| "grad_norm": 3.2827975749969482, |
| "learning_rate": 4.404682274247491e-06, |
| "loss": 6.0365, |
| "step": 16100 |
| }, |
| { |
| "epoch": 0.2446018108510149, |
| "eval_loss": 5.9020185470581055, |
| "eval_runtime": 193.842, |
| "eval_samples_per_second": 51.588, |
| "eval_steps_per_second": 6.449, |
| "step": 16100 |
| }, |
| { |
| "epoch": 0.24612107675692182, |
| "grad_norm": 3.8352739810943604, |
| "learning_rate": 4.394648829431438e-06, |
| "loss": 6.0305, |
| "step": 16200 |
| }, |
| { |
| "epoch": 0.24612107675692182, |
| "eval_loss": 5.900501251220703, |
| "eval_runtime": 193.7914, |
| "eval_samples_per_second": 51.602, |
| "eval_steps_per_second": 6.45, |
| "step": 16200 |
| }, |
| { |
| "epoch": 0.24764034266282875, |
| "grad_norm": 3.2179617881774902, |
| "learning_rate": 4.384615384615384e-06, |
| "loss": 6.0166, |
| "step": 16300 |
| }, |
| { |
| "epoch": 0.24764034266282875, |
| "eval_loss": 5.891448497772217, |
| "eval_runtime": 193.74, |
| "eval_samples_per_second": 51.616, |
| "eval_steps_per_second": 6.452, |
| "step": 16300 |
| }, |
| { |
| "epoch": 0.24915960856873567, |
| "grad_norm": 3.446993112564087, |
| "learning_rate": 4.374581939799331e-06, |
| "loss": 6.0121, |
| "step": 16400 |
| }, |
| { |
| "epoch": 0.24915960856873567, |
| "eval_loss": 5.874625205993652, |
| "eval_runtime": 193.8125, |
| "eval_samples_per_second": 51.596, |
| "eval_steps_per_second": 6.45, |
| "step": 16400 |
| }, |
| { |
| "epoch": 0.2506788744746426, |
| "grad_norm": 4.3962578773498535, |
| "learning_rate": 4.364548494983278e-06, |
| "loss": 6.0029, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.2506788744746426, |
| "eval_loss": 5.884474754333496, |
| "eval_runtime": 193.754, |
| "eval_samples_per_second": 51.612, |
| "eval_steps_per_second": 6.451, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.2521981403805495, |
| "grad_norm": 4.52181339263916, |
| "learning_rate": 4.354515050167224e-06, |
| "loss": 5.9925, |
| "step": 16600 |
| }, |
| { |
| "epoch": 0.2521981403805495, |
| "eval_loss": 5.867855548858643, |
| "eval_runtime": 193.7648, |
| "eval_samples_per_second": 51.609, |
| "eval_steps_per_second": 6.451, |
| "step": 16600 |
| }, |
| { |
| "epoch": 0.25371740628645645, |
| "grad_norm": 3.847750186920166, |
| "learning_rate": 4.34448160535117e-06, |
| "loss": 5.9839, |
| "step": 16700 |
| }, |
| { |
| "epoch": 0.25371740628645645, |
| "eval_loss": 5.851235389709473, |
| "eval_runtime": 193.6623, |
| "eval_samples_per_second": 51.636, |
| "eval_steps_per_second": 6.455, |
| "step": 16700 |
| }, |
| { |
| "epoch": 0.25523667219236335, |
| "grad_norm": 2.9024147987365723, |
| "learning_rate": 4.334448160535117e-06, |
| "loss": 5.9744, |
| "step": 16800 |
| }, |
| { |
| "epoch": 0.25523667219236335, |
| "eval_loss": 5.854368686676025, |
| "eval_runtime": 193.8613, |
| "eval_samples_per_second": 51.583, |
| "eval_steps_per_second": 6.448, |
| "step": 16800 |
| }, |
| { |
| "epoch": 0.2567559380982703, |
| "grad_norm": 3.2213125228881836, |
| "learning_rate": 4.324414715719064e-06, |
| "loss": 5.9653, |
| "step": 16900 |
| }, |
| { |
| "epoch": 0.2567559380982703, |
| "eval_loss": 5.836233139038086, |
| "eval_runtime": 193.5642, |
| "eval_samples_per_second": 51.662, |
| "eval_steps_per_second": 6.458, |
| "step": 16900 |
| }, |
| { |
| "epoch": 0.2582752040041772, |
| "grad_norm": 4.198850631713867, |
| "learning_rate": 4.31438127090301e-06, |
| "loss": 5.957, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.2582752040041772, |
| "eval_loss": 5.818154811859131, |
| "eval_runtime": 193.5777, |
| "eval_samples_per_second": 51.659, |
| "eval_steps_per_second": 6.457, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.25979446991008415, |
| "grad_norm": 2.9214396476745605, |
| "learning_rate": 4.3043478260869565e-06, |
| "loss": 5.9417, |
| "step": 17100 |
| }, |
| { |
| "epoch": 0.25979446991008415, |
| "eval_loss": 5.829405784606934, |
| "eval_runtime": 193.6055, |
| "eval_samples_per_second": 51.651, |
| "eval_steps_per_second": 6.456, |
| "step": 17100 |
| }, |
| { |
| "epoch": 0.26131373581599104, |
| "grad_norm": 3.7691545486450195, |
| "learning_rate": 4.294314381270903e-06, |
| "loss": 5.934, |
| "step": 17200 |
| }, |
| { |
| "epoch": 0.26131373581599104, |
| "eval_loss": 5.794999122619629, |
| "eval_runtime": 193.5657, |
| "eval_samples_per_second": 51.662, |
| "eval_steps_per_second": 6.458, |
| "step": 17200 |
| }, |
| { |
| "epoch": 0.262833001721898, |
| "grad_norm": 4.013944625854492, |
| "learning_rate": 4.284280936454849e-06, |
| "loss": 5.9269, |
| "step": 17300 |
| }, |
| { |
| "epoch": 0.262833001721898, |
| "eval_loss": 5.787894248962402, |
| "eval_runtime": 193.7935, |
| "eval_samples_per_second": 51.601, |
| "eval_steps_per_second": 6.45, |
| "step": 17300 |
| }, |
| { |
| "epoch": 0.2643522676278049, |
| "grad_norm": 3.784191370010376, |
| "learning_rate": 4.274247491638796e-06, |
| "loss": 5.9224, |
| "step": 17400 |
| }, |
| { |
| "epoch": 0.2643522676278049, |
| "eval_loss": 5.795870780944824, |
| "eval_runtime": 193.6051, |
| "eval_samples_per_second": 51.652, |
| "eval_steps_per_second": 6.456, |
| "step": 17400 |
| }, |
| { |
| "epoch": 0.26587153353371185, |
| "grad_norm": 4.354425430297852, |
| "learning_rate": 4.264214046822743e-06, |
| "loss": 5.909, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.26587153353371185, |
| "eval_loss": 5.785282611846924, |
| "eval_runtime": 193.5413, |
| "eval_samples_per_second": 51.669, |
| "eval_steps_per_second": 6.459, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.2673907994396188, |
| "grad_norm": 3.2807064056396484, |
| "learning_rate": 4.254180602006689e-06, |
| "loss": 5.9017, |
| "step": 17600 |
| }, |
| { |
| "epoch": 0.2673907994396188, |
| "eval_loss": 5.772453308105469, |
| "eval_runtime": 193.741, |
| "eval_samples_per_second": 51.615, |
| "eval_steps_per_second": 6.452, |
| "step": 17600 |
| }, |
| { |
| "epoch": 0.2689100653455257, |
| "grad_norm": 3.0385000705718994, |
| "learning_rate": 4.244147157190635e-06, |
| "loss": 5.8906, |
| "step": 17700 |
| }, |
| { |
| "epoch": 0.2689100653455257, |
| "eval_loss": 5.765667915344238, |
| "eval_runtime": 193.8777, |
| "eval_samples_per_second": 51.579, |
| "eval_steps_per_second": 6.447, |
| "step": 17700 |
| }, |
| { |
| "epoch": 0.27042933125143265, |
| "grad_norm": 2.746528148651123, |
| "learning_rate": 4.234113712374582e-06, |
| "loss": 5.8847, |
| "step": 17800 |
| }, |
| { |
| "epoch": 0.27042933125143265, |
| "eval_loss": 5.7541351318359375, |
| "eval_runtime": 193.9065, |
| "eval_samples_per_second": 51.571, |
| "eval_steps_per_second": 6.446, |
| "step": 17800 |
| }, |
| { |
| "epoch": 0.27194859715733954, |
| "grad_norm": 3.3728785514831543, |
| "learning_rate": 4.224080267558528e-06, |
| "loss": 5.8769, |
| "step": 17900 |
| }, |
| { |
| "epoch": 0.27194859715733954, |
| "eval_loss": 5.7371392250061035, |
| "eval_runtime": 193.8325, |
| "eval_samples_per_second": 51.591, |
| "eval_steps_per_second": 6.449, |
| "step": 17900 |
| }, |
| { |
| "epoch": 0.2734678630632465, |
| "grad_norm": 3.4341955184936523, |
| "learning_rate": 4.214046822742475e-06, |
| "loss": 5.8711, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.2734678630632465, |
| "eval_loss": 5.715305328369141, |
| "eval_runtime": 193.9066, |
| "eval_samples_per_second": 51.571, |
| "eval_steps_per_second": 6.446, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.2749871289691534, |
| "grad_norm": 4.6379313468933105, |
| "learning_rate": 4.2040133779264216e-06, |
| "loss": 5.861, |
| "step": 18100 |
| }, |
| { |
| "epoch": 0.2749871289691534, |
| "eval_loss": 5.71766996383667, |
| "eval_runtime": 193.7937, |
| "eval_samples_per_second": 51.601, |
| "eval_steps_per_second": 6.45, |
| "step": 18100 |
| }, |
| { |
| "epoch": 0.27650639487506035, |
| "grad_norm": 3.901848554611206, |
| "learning_rate": 4.1939799331103675e-06, |
| "loss": 5.855, |
| "step": 18200 |
| }, |
| { |
| "epoch": 0.27650639487506035, |
| "eval_loss": 5.7228240966796875, |
| "eval_runtime": 194.0011, |
| "eval_samples_per_second": 51.546, |
| "eval_steps_per_second": 6.443, |
| "step": 18200 |
| }, |
| { |
| "epoch": 0.27802566078096724, |
| "grad_norm": 2.7498176097869873, |
| "learning_rate": 4.183946488294314e-06, |
| "loss": 5.8388, |
| "step": 18300 |
| }, |
| { |
| "epoch": 0.27802566078096724, |
| "eval_loss": 5.699355125427246, |
| "eval_runtime": 193.8501, |
| "eval_samples_per_second": 51.586, |
| "eval_steps_per_second": 6.448, |
| "step": 18300 |
| }, |
| { |
| "epoch": 0.2795449266868742, |
| "grad_norm": 3.4318690299987793, |
| "learning_rate": 4.173913043478261e-06, |
| "loss": 5.8356, |
| "step": 18400 |
| }, |
| { |
| "epoch": 0.2795449266868742, |
| "eval_loss": 5.697088241577148, |
| "eval_runtime": 193.7892, |
| "eval_samples_per_second": 51.602, |
| "eval_steps_per_second": 6.45, |
| "step": 18400 |
| }, |
| { |
| "epoch": 0.2810641925927811, |
| "grad_norm": 3.5657687187194824, |
| "learning_rate": 4.163879598662208e-06, |
| "loss": 5.8233, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.2810641925927811, |
| "eval_loss": 5.683408260345459, |
| "eval_runtime": 193.7355, |
| "eval_samples_per_second": 51.617, |
| "eval_steps_per_second": 6.452, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.28258345849868804, |
| "grad_norm": 4.344554424285889, |
| "learning_rate": 4.153846153846154e-06, |
| "loss": 5.8187, |
| "step": 18600 |
| }, |
| { |
| "epoch": 0.28258345849868804, |
| "eval_loss": 5.675909042358398, |
| "eval_runtime": 193.8113, |
| "eval_samples_per_second": 51.597, |
| "eval_steps_per_second": 6.45, |
| "step": 18600 |
| }, |
| { |
| "epoch": 0.28410272440459494, |
| "grad_norm": 3.3455545902252197, |
| "learning_rate": 4.1438127090301005e-06, |
| "loss": 5.8091, |
| "step": 18700 |
| }, |
| { |
| "epoch": 0.28410272440459494, |
| "eval_loss": 5.665746688842773, |
| "eval_runtime": 194.1351, |
| "eval_samples_per_second": 51.511, |
| "eval_steps_per_second": 6.439, |
| "step": 18700 |
| }, |
| { |
| "epoch": 0.2856219903105019, |
| "grad_norm": 3.412184476852417, |
| "learning_rate": 4.133779264214047e-06, |
| "loss": 5.8026, |
| "step": 18800 |
| }, |
| { |
| "epoch": 0.2856219903105019, |
| "eval_loss": 5.6578497886657715, |
| "eval_runtime": 193.7841, |
| "eval_samples_per_second": 51.604, |
| "eval_steps_per_second": 6.45, |
| "step": 18800 |
| }, |
| { |
| "epoch": 0.2871412562164088, |
| "grad_norm": 3.717855215072632, |
| "learning_rate": 4.123745819397993e-06, |
| "loss": 5.7957, |
| "step": 18900 |
| }, |
| { |
| "epoch": 0.2871412562164088, |
| "eval_loss": 5.665693759918213, |
| "eval_runtime": 193.7661, |
| "eval_samples_per_second": 51.609, |
| "eval_steps_per_second": 6.451, |
| "step": 18900 |
| }, |
| { |
| "epoch": 0.28866052212231574, |
| "grad_norm": 3.876275062561035, |
| "learning_rate": 4.11371237458194e-06, |
| "loss": 5.7846, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.28866052212231574, |
| "eval_loss": 5.648958206176758, |
| "eval_runtime": 193.7195, |
| "eval_samples_per_second": 51.621, |
| "eval_steps_per_second": 6.453, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.29017978802822264, |
| "grad_norm": 3.8186490535736084, |
| "learning_rate": 4.103678929765887e-06, |
| "loss": 5.7777, |
| "step": 19100 |
| }, |
| { |
| "epoch": 0.29017978802822264, |
| "eval_loss": 5.629169940948486, |
| "eval_runtime": 193.5999, |
| "eval_samples_per_second": 51.653, |
| "eval_steps_per_second": 6.457, |
| "step": 19100 |
| }, |
| { |
| "epoch": 0.2916990539341296, |
| "grad_norm": 5.3280839920043945, |
| "learning_rate": 4.0936454849498326e-06, |
| "loss": 5.77, |
| "step": 19200 |
| }, |
| { |
| "epoch": 0.2916990539341296, |
| "eval_loss": 5.620713233947754, |
| "eval_runtime": 193.6523, |
| "eval_samples_per_second": 51.639, |
| "eval_steps_per_second": 6.455, |
| "step": 19200 |
| }, |
| { |
| "epoch": 0.2932183198400365, |
| "grad_norm": 3.260324478149414, |
| "learning_rate": 4.083612040133779e-06, |
| "loss": 5.7611, |
| "step": 19300 |
| }, |
| { |
| "epoch": 0.2932183198400365, |
| "eval_loss": 5.629894733428955, |
| "eval_runtime": 193.685, |
| "eval_samples_per_second": 51.63, |
| "eval_steps_per_second": 6.454, |
| "step": 19300 |
| }, |
| { |
| "epoch": 0.29473758574594344, |
| "grad_norm": 4.145829200744629, |
| "learning_rate": 4.073578595317726e-06, |
| "loss": 5.7538, |
| "step": 19400 |
| }, |
| { |
| "epoch": 0.29473758574594344, |
| "eval_loss": 5.6320037841796875, |
| "eval_runtime": 193.4857, |
| "eval_samples_per_second": 51.683, |
| "eval_steps_per_second": 6.46, |
| "step": 19400 |
| }, |
| { |
| "epoch": 0.29625685165185034, |
| "grad_norm": 4.071881294250488, |
| "learning_rate": 4.063545150501672e-06, |
| "loss": 5.745, |
| "step": 19500 |
| }, |
| { |
| "epoch": 0.29625685165185034, |
| "eval_loss": 5.607526779174805, |
| "eval_runtime": 193.5349, |
| "eval_samples_per_second": 51.67, |
| "eval_steps_per_second": 6.459, |
| "step": 19500 |
| }, |
| { |
| "epoch": 0.2977761175577573, |
| "grad_norm": 3.4075276851654053, |
| "learning_rate": 4.053511705685619e-06, |
| "loss": 5.7404, |
| "step": 19600 |
| }, |
| { |
| "epoch": 0.2977761175577573, |
| "eval_loss": 5.603940010070801, |
| "eval_runtime": 193.5688, |
| "eval_samples_per_second": 51.661, |
| "eval_steps_per_second": 6.458, |
| "step": 19600 |
| }, |
| { |
| "epoch": 0.2992953834636642, |
| "grad_norm": 3.371760129928589, |
| "learning_rate": 4.0434782608695655e-06, |
| "loss": 5.7343, |
| "step": 19700 |
| }, |
| { |
| "epoch": 0.2992953834636642, |
| "eval_loss": 5.597903728485107, |
| "eval_runtime": 193.9636, |
| "eval_samples_per_second": 51.556, |
| "eval_steps_per_second": 6.445, |
| "step": 19700 |
| }, |
| { |
| "epoch": 0.30081464936957114, |
| "grad_norm": 3.1230831146240234, |
| "learning_rate": 4.0334448160535115e-06, |
| "loss": 5.7284, |
| "step": 19800 |
| }, |
| { |
| "epoch": 0.30081464936957114, |
| "eval_loss": 5.580268859863281, |
| "eval_runtime": 194.0245, |
| "eval_samples_per_second": 51.54, |
| "eval_steps_per_second": 6.442, |
| "step": 19800 |
| }, |
| { |
| "epoch": 0.30233391527547804, |
| "grad_norm": 3.339742422103882, |
| "learning_rate": 4.023411371237458e-06, |
| "loss": 5.7206, |
| "step": 19900 |
| }, |
| { |
| "epoch": 0.30233391527547804, |
| "eval_loss": 5.571849822998047, |
| "eval_runtime": 193.8323, |
| "eval_samples_per_second": 51.591, |
| "eval_steps_per_second": 6.449, |
| "step": 19900 |
| }, |
| { |
| "epoch": 0.303853181181385, |
| "grad_norm": 3.2297468185424805, |
| "learning_rate": 4.013377926421405e-06, |
| "loss": 5.7086, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.303853181181385, |
| "eval_loss": 5.5632548332214355, |
| "eval_runtime": 193.7563, |
| "eval_samples_per_second": 51.611, |
| "eval_steps_per_second": 6.451, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.3053724470872919, |
| "grad_norm": 2.8698532581329346, |
| "learning_rate": 4.003344481605351e-06, |
| "loss": 5.7027, |
| "step": 20100 |
| }, |
| { |
| "epoch": 0.3053724470872919, |
| "eval_loss": 5.559244632720947, |
| "eval_runtime": 193.8672, |
| "eval_samples_per_second": 51.582, |
| "eval_steps_per_second": 6.448, |
| "step": 20100 |
| }, |
| { |
| "epoch": 0.30689171299319884, |
| "grad_norm": 2.990452289581299, |
| "learning_rate": 3.993311036789298e-06, |
| "loss": 5.6953, |
| "step": 20200 |
| }, |
| { |
| "epoch": 0.30689171299319884, |
| "eval_loss": 5.560790061950684, |
| "eval_runtime": 193.8061, |
| "eval_samples_per_second": 51.598, |
| "eval_steps_per_second": 6.45, |
| "step": 20200 |
| }, |
| { |
| "epoch": 0.30841097889910574, |
| "grad_norm": 3.821631669998169, |
| "learning_rate": 3.9832775919732444e-06, |
| "loss": 5.6881, |
| "step": 20300 |
| }, |
| { |
| "epoch": 0.30841097889910574, |
| "eval_loss": 5.551888465881348, |
| "eval_runtime": 194.0615, |
| "eval_samples_per_second": 51.53, |
| "eval_steps_per_second": 6.441, |
| "step": 20300 |
| }, |
| { |
| "epoch": 0.3099302448050127, |
| "grad_norm": 3.209308624267578, |
| "learning_rate": 3.97324414715719e-06, |
| "loss": 5.683, |
| "step": 20400 |
| }, |
| { |
| "epoch": 0.3099302448050127, |
| "eval_loss": 5.5436787605285645, |
| "eval_runtime": 193.9187, |
| "eval_samples_per_second": 51.568, |
| "eval_steps_per_second": 6.446, |
| "step": 20400 |
| }, |
| { |
| "epoch": 0.3114495107109196, |
| "grad_norm": 4.5453901290893555, |
| "learning_rate": 3.963210702341137e-06, |
| "loss": 5.6747, |
| "step": 20500 |
| }, |
| { |
| "epoch": 0.3114495107109196, |
| "eval_loss": 5.523691177368164, |
| "eval_runtime": 193.8312, |
| "eval_samples_per_second": 51.591, |
| "eval_steps_per_second": 6.449, |
| "step": 20500 |
| }, |
| { |
| "epoch": 0.31296877661682654, |
| "grad_norm": 3.86855411529541, |
| "learning_rate": 3.953177257525084e-06, |
| "loss": 5.6753, |
| "step": 20600 |
| }, |
| { |
| "epoch": 0.31296877661682654, |
| "eval_loss": 5.530142784118652, |
| "eval_runtime": 193.8761, |
| "eval_samples_per_second": 51.579, |
| "eval_steps_per_second": 6.447, |
| "step": 20600 |
| }, |
| { |
| "epoch": 0.31448804252273344, |
| "grad_norm": 3.029080390930176, |
| "learning_rate": 3.943143812709031e-06, |
| "loss": 5.6683, |
| "step": 20700 |
| }, |
| { |
| "epoch": 0.31448804252273344, |
| "eval_loss": 5.527863025665283, |
| "eval_runtime": 193.8252, |
| "eval_samples_per_second": 51.593, |
| "eval_steps_per_second": 6.449, |
| "step": 20700 |
| }, |
| { |
| "epoch": 0.3160073084286404, |
| "grad_norm": 3.5344836711883545, |
| "learning_rate": 3.9331103678929765e-06, |
| "loss": 5.6554, |
| "step": 20800 |
| }, |
| { |
| "epoch": 0.3160073084286404, |
| "eval_loss": 5.510525226593018, |
| "eval_runtime": 193.8394, |
| "eval_samples_per_second": 51.589, |
| "eval_steps_per_second": 6.449, |
| "step": 20800 |
| }, |
| { |
| "epoch": 0.3175265743345473, |
| "grad_norm": 3.153604507446289, |
| "learning_rate": 3.923076923076923e-06, |
| "loss": 5.6508, |
| "step": 20900 |
| }, |
| { |
| "epoch": 0.3175265743345473, |
| "eval_loss": 5.508999824523926, |
| "eval_runtime": 194.0796, |
| "eval_samples_per_second": 51.525, |
| "eval_steps_per_second": 6.441, |
| "step": 20900 |
| }, |
| { |
| "epoch": 0.31904584024045424, |
| "grad_norm": 3.87959623336792, |
| "learning_rate": 3.91304347826087e-06, |
| "loss": 5.644, |
| "step": 21000 |
| }, |
| { |
| "epoch": 0.31904584024045424, |
| "eval_loss": 5.511682987213135, |
| "eval_runtime": 193.8814, |
| "eval_samples_per_second": 51.578, |
| "eval_steps_per_second": 6.447, |
| "step": 21000 |
| }, |
| { |
| "epoch": 0.32056510614636113, |
| "grad_norm": 3.9517741203308105, |
| "learning_rate": 3.903010033444816e-06, |
| "loss": 5.6387, |
| "step": 21100 |
| }, |
| { |
| "epoch": 0.32056510614636113, |
| "eval_loss": 5.499752521514893, |
| "eval_runtime": 193.8141, |
| "eval_samples_per_second": 51.596, |
| "eval_steps_per_second": 6.449, |
| "step": 21100 |
| }, |
| { |
| "epoch": 0.3220843720522681, |
| "grad_norm": 3.191702127456665, |
| "learning_rate": 3.892976588628763e-06, |
| "loss": 5.6333, |
| "step": 21200 |
| }, |
| { |
| "epoch": 0.3220843720522681, |
| "eval_loss": 5.476820945739746, |
| "eval_runtime": 193.9667, |
| "eval_samples_per_second": 51.555, |
| "eval_steps_per_second": 6.444, |
| "step": 21200 |
| }, |
| { |
| "epoch": 0.323603637958175, |
| "grad_norm": 3.1419906616210938, |
| "learning_rate": 3.8829431438127095e-06, |
| "loss": 5.6243, |
| "step": 21300 |
| }, |
| { |
| "epoch": 0.323603637958175, |
| "eval_loss": 5.486774444580078, |
| "eval_runtime": 193.7733, |
| "eval_samples_per_second": 51.607, |
| "eval_steps_per_second": 6.451, |
| "step": 21300 |
| }, |
| { |
| "epoch": 0.32512290386408194, |
| "grad_norm": 4.059791088104248, |
| "learning_rate": 3.8729096989966554e-06, |
| "loss": 5.6163, |
| "step": 21400 |
| }, |
| { |
| "epoch": 0.32512290386408194, |
| "eval_loss": 5.477799415588379, |
| "eval_runtime": 193.6285, |
| "eval_samples_per_second": 51.645, |
| "eval_steps_per_second": 6.456, |
| "step": 21400 |
| }, |
| { |
| "epoch": 0.32664216976998883, |
| "grad_norm": 2.990511417388916, |
| "learning_rate": 3.862876254180602e-06, |
| "loss": 5.6133, |
| "step": 21500 |
| }, |
| { |
| "epoch": 0.32664216976998883, |
| "eval_loss": 5.47875452041626, |
| "eval_runtime": 193.5641, |
| "eval_samples_per_second": 51.662, |
| "eval_steps_per_second": 6.458, |
| "step": 21500 |
| }, |
| { |
| "epoch": 0.3281614356758958, |
| "grad_norm": 2.3832523822784424, |
| "learning_rate": 3.852842809364549e-06, |
| "loss": 5.6062, |
| "step": 21600 |
| }, |
| { |
| "epoch": 0.3281614356758958, |
| "eval_loss": 5.4584455490112305, |
| "eval_runtime": 193.6584, |
| "eval_samples_per_second": 51.637, |
| "eval_steps_per_second": 6.455, |
| "step": 21600 |
| }, |
| { |
| "epoch": 0.3296807015818027, |
| "grad_norm": 3.010307788848877, |
| "learning_rate": 3.842809364548495e-06, |
| "loss": 5.5959, |
| "step": 21700 |
| }, |
| { |
| "epoch": 0.3296807015818027, |
| "eval_loss": 5.451364517211914, |
| "eval_runtime": 193.6921, |
| "eval_samples_per_second": 51.628, |
| "eval_steps_per_second": 6.454, |
| "step": 21700 |
| }, |
| { |
| "epoch": 0.33119996748770963, |
| "grad_norm": 3.718315601348877, |
| "learning_rate": 3.832775919732442e-06, |
| "loss": 5.5919, |
| "step": 21800 |
| }, |
| { |
| "epoch": 0.33119996748770963, |
| "eval_loss": 5.446727752685547, |
| "eval_runtime": 193.8195, |
| "eval_samples_per_second": 51.594, |
| "eval_steps_per_second": 6.449, |
| "step": 21800 |
| }, |
| { |
| "epoch": 0.33271923339361653, |
| "grad_norm": 4.131709098815918, |
| "learning_rate": 3.822742474916388e-06, |
| "loss": 5.5859, |
| "step": 21900 |
| }, |
| { |
| "epoch": 0.33271923339361653, |
| "eval_loss": 5.43417501449585, |
| "eval_runtime": 193.7876, |
| "eval_samples_per_second": 51.603, |
| "eval_steps_per_second": 6.45, |
| "step": 21900 |
| }, |
| { |
| "epoch": 0.3342384992995235, |
| "grad_norm": 3.7145907878875732, |
| "learning_rate": 3.8127090301003347e-06, |
| "loss": 5.5805, |
| "step": 22000 |
| }, |
| { |
| "epoch": 0.3342384992995235, |
| "eval_loss": 5.443439960479736, |
| "eval_runtime": 193.7659, |
| "eval_samples_per_second": 51.609, |
| "eval_steps_per_second": 6.451, |
| "step": 22000 |
| }, |
| { |
| "epoch": 0.3357577652054304, |
| "grad_norm": 3.167874574661255, |
| "learning_rate": 3.802675585284281e-06, |
| "loss": 5.5724, |
| "step": 22100 |
| }, |
| { |
| "epoch": 0.3357577652054304, |
| "eval_loss": 5.419732093811035, |
| "eval_runtime": 193.7654, |
| "eval_samples_per_second": 51.609, |
| "eval_steps_per_second": 6.451, |
| "step": 22100 |
| }, |
| { |
| "epoch": 0.33727703111133733, |
| "grad_norm": 3.820495367050171, |
| "learning_rate": 3.792642140468228e-06, |
| "loss": 5.5694, |
| "step": 22200 |
| }, |
| { |
| "epoch": 0.33727703111133733, |
| "eval_loss": 5.4181647300720215, |
| "eval_runtime": 193.8601, |
| "eval_samples_per_second": 51.584, |
| "eval_steps_per_second": 6.448, |
| "step": 22200 |
| }, |
| { |
| "epoch": 0.33879629701724423, |
| "grad_norm": 3.4039466381073, |
| "learning_rate": 3.782608695652174e-06, |
| "loss": 5.565, |
| "step": 22300 |
| }, |
| { |
| "epoch": 0.33879629701724423, |
| "eval_loss": 5.419365406036377, |
| "eval_runtime": 194.0048, |
| "eval_samples_per_second": 51.545, |
| "eval_steps_per_second": 6.443, |
| "step": 22300 |
| }, |
| { |
| "epoch": 0.3403155629231512, |
| "grad_norm": 2.805332660675049, |
| "learning_rate": 3.7725752508361205e-06, |
| "loss": 5.556, |
| "step": 22400 |
| }, |
| { |
| "epoch": 0.3403155629231512, |
| "eval_loss": 5.415971755981445, |
| "eval_runtime": 193.9692, |
| "eval_samples_per_second": 51.555, |
| "eval_steps_per_second": 6.444, |
| "step": 22400 |
| }, |
| { |
| "epoch": 0.3418348288290581, |
| "grad_norm": 2.43111252784729, |
| "learning_rate": 3.7625418060200673e-06, |
| "loss": 5.5536, |
| "step": 22500 |
| }, |
| { |
| "epoch": 0.3418348288290581, |
| "eval_loss": 5.405710220336914, |
| "eval_runtime": 193.9758, |
| "eval_samples_per_second": 51.553, |
| "eval_steps_per_second": 6.444, |
| "step": 22500 |
| }, |
| { |
| "epoch": 0.34335409473496503, |
| "grad_norm": 3.9612550735473633, |
| "learning_rate": 3.7525083612040136e-06, |
| "loss": 5.544, |
| "step": 22600 |
| }, |
| { |
| "epoch": 0.34335409473496503, |
| "eval_loss": 5.40699577331543, |
| "eval_runtime": 193.9672, |
| "eval_samples_per_second": 51.555, |
| "eval_steps_per_second": 6.444, |
| "step": 22600 |
| }, |
| { |
| "epoch": 0.34487336064087193, |
| "grad_norm": 2.8571434020996094, |
| "learning_rate": 3.74247491638796e-06, |
| "loss": 5.5392, |
| "step": 22700 |
| }, |
| { |
| "epoch": 0.34487336064087193, |
| "eval_loss": 5.397378921508789, |
| "eval_runtime": 194.1852, |
| "eval_samples_per_second": 51.497, |
| "eval_steps_per_second": 6.437, |
| "step": 22700 |
| }, |
| { |
| "epoch": 0.3463926265467789, |
| "grad_norm": 3.1463348865509033, |
| "learning_rate": 3.7324414715719067e-06, |
| "loss": 5.5296, |
| "step": 22800 |
| }, |
| { |
| "epoch": 0.3463926265467789, |
| "eval_loss": 5.3807501792907715, |
| "eval_runtime": 193.8656, |
| "eval_samples_per_second": 51.582, |
| "eval_steps_per_second": 6.448, |
| "step": 22800 |
| }, |
| { |
| "epoch": 0.3479118924526858, |
| "grad_norm": 3.696991205215454, |
| "learning_rate": 3.722408026755853e-06, |
| "loss": 5.5319, |
| "step": 22900 |
| }, |
| { |
| "epoch": 0.3479118924526858, |
| "eval_loss": 5.376627445220947, |
| "eval_runtime": 193.8475, |
| "eval_samples_per_second": 51.587, |
| "eval_steps_per_second": 6.448, |
| "step": 22900 |
| }, |
| { |
| "epoch": 0.34943115835859273, |
| "grad_norm": 3.691133737564087, |
| "learning_rate": 3.7123745819398e-06, |
| "loss": 5.5239, |
| "step": 23000 |
| }, |
| { |
| "epoch": 0.34943115835859273, |
| "eval_loss": 5.379115104675293, |
| "eval_runtime": 193.935, |
| "eval_samples_per_second": 51.564, |
| "eval_steps_per_second": 6.445, |
| "step": 23000 |
| }, |
| { |
| "epoch": 0.3509504242644996, |
| "grad_norm": 2.994180679321289, |
| "learning_rate": 3.702341137123746e-06, |
| "loss": 5.5174, |
| "step": 23100 |
| }, |
| { |
| "epoch": 0.3509504242644996, |
| "eval_loss": 5.36928129196167, |
| "eval_runtime": 193.8929, |
| "eval_samples_per_second": 51.575, |
| "eval_steps_per_second": 6.447, |
| "step": 23100 |
| }, |
| { |
| "epoch": 0.3524696901704066, |
| "grad_norm": 3.528660774230957, |
| "learning_rate": 3.6923076923076925e-06, |
| "loss": 5.5123, |
| "step": 23200 |
| }, |
| { |
| "epoch": 0.3524696901704066, |
| "eval_loss": 5.366269588470459, |
| "eval_runtime": 193.8307, |
| "eval_samples_per_second": 51.591, |
| "eval_steps_per_second": 6.449, |
| "step": 23200 |
| }, |
| { |
| "epoch": 0.3539889560763135, |
| "grad_norm": 2.8609702587127686, |
| "learning_rate": 3.6822742474916393e-06, |
| "loss": 5.5024, |
| "step": 23300 |
| }, |
| { |
| "epoch": 0.3539889560763135, |
| "eval_loss": 5.3636884689331055, |
| "eval_runtime": 193.9144, |
| "eval_samples_per_second": 51.569, |
| "eval_steps_per_second": 6.446, |
| "step": 23300 |
| }, |
| { |
| "epoch": 0.35550822198222043, |
| "grad_norm": 3.146467447280884, |
| "learning_rate": 3.6722408026755856e-06, |
| "loss": 5.4993, |
| "step": 23400 |
| }, |
| { |
| "epoch": 0.35550822198222043, |
| "eval_loss": 5.3594536781311035, |
| "eval_runtime": 193.8636, |
| "eval_samples_per_second": 51.583, |
| "eval_steps_per_second": 6.448, |
| "step": 23400 |
| }, |
| { |
| "epoch": 0.3570274878881273, |
| "grad_norm": 2.4640018939971924, |
| "learning_rate": 3.662207357859532e-06, |
| "loss": 5.4944, |
| "step": 23500 |
| }, |
| { |
| "epoch": 0.3570274878881273, |
| "eval_loss": 5.346569538116455, |
| "eval_runtime": 193.9228, |
| "eval_samples_per_second": 51.567, |
| "eval_steps_per_second": 6.446, |
| "step": 23500 |
| }, |
| { |
| "epoch": 0.3585467537940343, |
| "grad_norm": 4.175319671630859, |
| "learning_rate": 3.6521739130434787e-06, |
| "loss": 5.4874, |
| "step": 23600 |
| }, |
| { |
| "epoch": 0.3585467537940343, |
| "eval_loss": 5.343349456787109, |
| "eval_runtime": 193.585, |
| "eval_samples_per_second": 51.657, |
| "eval_steps_per_second": 6.457, |
| "step": 23600 |
| }, |
| { |
| "epoch": 0.3600660196999412, |
| "grad_norm": 3.4799277782440186, |
| "learning_rate": 3.642140468227425e-06, |
| "loss": 5.4874, |
| "step": 23700 |
| }, |
| { |
| "epoch": 0.3600660196999412, |
| "eval_loss": 5.342945098876953, |
| "eval_runtime": 193.6611, |
| "eval_samples_per_second": 51.637, |
| "eval_steps_per_second": 6.455, |
| "step": 23700 |
| }, |
| { |
| "epoch": 0.36158528560584813, |
| "grad_norm": 1.949639916419983, |
| "learning_rate": 3.6321070234113714e-06, |
| "loss": 5.4786, |
| "step": 23800 |
| }, |
| { |
| "epoch": 0.36158528560584813, |
| "eval_loss": 5.325855255126953, |
| "eval_runtime": 193.5983, |
| "eval_samples_per_second": 51.653, |
| "eval_steps_per_second": 6.457, |
| "step": 23800 |
| }, |
| { |
| "epoch": 0.363104551511755, |
| "grad_norm": 2.983280658721924, |
| "learning_rate": 3.622073578595318e-06, |
| "loss": 5.4736, |
| "step": 23900 |
| }, |
| { |
| "epoch": 0.363104551511755, |
| "eval_loss": 5.320173263549805, |
| "eval_runtime": 193.5482, |
| "eval_samples_per_second": 51.667, |
| "eval_steps_per_second": 6.458, |
| "step": 23900 |
| }, |
| { |
| "epoch": 0.364623817417662, |
| "grad_norm": 3.912425994873047, |
| "learning_rate": 3.6120401337792645e-06, |
| "loss": 5.4694, |
| "step": 24000 |
| }, |
| { |
| "epoch": 0.364623817417662, |
| "eval_loss": 5.3176751136779785, |
| "eval_runtime": 193.8093, |
| "eval_samples_per_second": 51.597, |
| "eval_steps_per_second": 6.45, |
| "step": 24000 |
| }, |
| { |
| "epoch": 0.3661430833235689, |
| "grad_norm": 3.230281114578247, |
| "learning_rate": 3.6020066889632112e-06, |
| "loss": 5.465, |
| "step": 24100 |
| }, |
| { |
| "epoch": 0.3661430833235689, |
| "eval_loss": 5.319676399230957, |
| "eval_runtime": 193.9617, |
| "eval_samples_per_second": 51.557, |
| "eval_steps_per_second": 6.445, |
| "step": 24100 |
| }, |
| { |
| "epoch": 0.3676623492294758, |
| "grad_norm": 2.6516830921173096, |
| "learning_rate": 3.5919732441471576e-06, |
| "loss": 5.4568, |
| "step": 24200 |
| }, |
| { |
| "epoch": 0.3676623492294758, |
| "eval_loss": 5.306182384490967, |
| "eval_runtime": 193.9303, |
| "eval_samples_per_second": 51.565, |
| "eval_steps_per_second": 6.446, |
| "step": 24200 |
| }, |
| { |
| "epoch": 0.3691816151353827, |
| "grad_norm": 3.398289442062378, |
| "learning_rate": 3.581939799331104e-06, |
| "loss": 5.4555, |
| "step": 24300 |
| }, |
| { |
| "epoch": 0.3691816151353827, |
| "eval_loss": 5.304970741271973, |
| "eval_runtime": 193.8994, |
| "eval_samples_per_second": 51.573, |
| "eval_steps_per_second": 6.447, |
| "step": 24300 |
| }, |
| { |
| "epoch": 0.3707008810412897, |
| "grad_norm": 2.9263579845428467, |
| "learning_rate": 3.5719063545150507e-06, |
| "loss": 5.446, |
| "step": 24400 |
| }, |
| { |
| "epoch": 0.3707008810412897, |
| "eval_loss": 5.304412364959717, |
| "eval_runtime": 193.8605, |
| "eval_samples_per_second": 51.583, |
| "eval_steps_per_second": 6.448, |
| "step": 24400 |
| }, |
| { |
| "epoch": 0.3722201469471966, |
| "grad_norm": 3.2696564197540283, |
| "learning_rate": 3.561872909698997e-06, |
| "loss": 5.4418, |
| "step": 24500 |
| }, |
| { |
| "epoch": 0.3722201469471966, |
| "eval_loss": 5.291959285736084, |
| "eval_runtime": 193.8303, |
| "eval_samples_per_second": 51.592, |
| "eval_steps_per_second": 6.449, |
| "step": 24500 |
| }, |
| { |
| "epoch": 0.3737394128531035, |
| "grad_norm": 3.3699710369110107, |
| "learning_rate": 3.5518394648829434e-06, |
| "loss": 5.4367, |
| "step": 24600 |
| }, |
| { |
| "epoch": 0.3737394128531035, |
| "eval_loss": 5.292627334594727, |
| "eval_runtime": 193.9116, |
| "eval_samples_per_second": 51.57, |
| "eval_steps_per_second": 6.446, |
| "step": 24600 |
| }, |
| { |
| "epoch": 0.3752586787590104, |
| "grad_norm": 3.646376848220825, |
| "learning_rate": 3.54180602006689e-06, |
| "loss": 5.4389, |
| "step": 24700 |
| }, |
| { |
| "epoch": 0.3752586787590104, |
| "eval_loss": 5.2781982421875, |
| "eval_runtime": 193.9384, |
| "eval_samples_per_second": 51.563, |
| "eval_steps_per_second": 6.445, |
| "step": 24700 |
| }, |
| { |
| "epoch": 0.3767779446649174, |
| "grad_norm": 2.873612880706787, |
| "learning_rate": 3.5317725752508365e-06, |
| "loss": 5.4344, |
| "step": 24800 |
| }, |
| { |
| "epoch": 0.3767779446649174, |
| "eval_loss": 5.286219596862793, |
| "eval_runtime": 193.9969, |
| "eval_samples_per_second": 51.547, |
| "eval_steps_per_second": 6.443, |
| "step": 24800 |
| }, |
| { |
| "epoch": 0.37829721057082427, |
| "grad_norm": 3.312747001647949, |
| "learning_rate": 3.521739130434783e-06, |
| "loss": 5.427, |
| "step": 24900 |
| }, |
| { |
| "epoch": 0.37829721057082427, |
| "eval_loss": 5.281394004821777, |
| "eval_runtime": 193.9463, |
| "eval_samples_per_second": 51.561, |
| "eval_steps_per_second": 6.445, |
| "step": 24900 |
| }, |
| { |
| "epoch": 0.3798164764767312, |
| "grad_norm": 3.727271556854248, |
| "learning_rate": 3.5117056856187296e-06, |
| "loss": 5.4212, |
| "step": 25000 |
| }, |
| { |
| "epoch": 0.3798164764767312, |
| "eval_loss": 5.266263484954834, |
| "eval_runtime": 193.9471, |
| "eval_samples_per_second": 51.56, |
| "eval_steps_per_second": 6.445, |
| "step": 25000 |
| }, |
| { |
| "epoch": 0.3813357423826381, |
| "grad_norm": 3.3630518913269043, |
| "learning_rate": 3.501672240802676e-06, |
| "loss": 5.4173, |
| "step": 25100 |
| }, |
| { |
| "epoch": 0.3813357423826381, |
| "eval_loss": 5.276744365692139, |
| "eval_runtime": 193.8804, |
| "eval_samples_per_second": 51.578, |
| "eval_steps_per_second": 6.447, |
| "step": 25100 |
| }, |
| { |
| "epoch": 0.3828550082885451, |
| "grad_norm": 4.12694787979126, |
| "learning_rate": 3.491638795986622e-06, |
| "loss": 5.413, |
| "step": 25200 |
| }, |
| { |
| "epoch": 0.3828550082885451, |
| "eval_loss": 5.260261535644531, |
| "eval_runtime": 193.919, |
| "eval_samples_per_second": 51.568, |
| "eval_steps_per_second": 6.446, |
| "step": 25200 |
| }, |
| { |
| "epoch": 0.38437427419445197, |
| "grad_norm": 3.921342611312866, |
| "learning_rate": 3.481605351170568e-06, |
| "loss": 5.4041, |
| "step": 25300 |
| }, |
| { |
| "epoch": 0.38437427419445197, |
| "eval_loss": 5.2696661949157715, |
| "eval_runtime": 193.8783, |
| "eval_samples_per_second": 51.579, |
| "eval_steps_per_second": 6.447, |
| "step": 25300 |
| }, |
| { |
| "epoch": 0.3858935401003589, |
| "grad_norm": 2.464872360229492, |
| "learning_rate": 3.471571906354515e-06, |
| "loss": 5.4035, |
| "step": 25400 |
| }, |
| { |
| "epoch": 0.3858935401003589, |
| "eval_loss": 5.251010894775391, |
| "eval_runtime": 193.8872, |
| "eval_samples_per_second": 51.576, |
| "eval_steps_per_second": 6.447, |
| "step": 25400 |
| }, |
| { |
| "epoch": 0.3874128060062658, |
| "grad_norm": 2.675010919570923, |
| "learning_rate": 3.4615384615384613e-06, |
| "loss": 5.3946, |
| "step": 25500 |
| }, |
| { |
| "epoch": 0.3874128060062658, |
| "eval_loss": 5.2538347244262695, |
| "eval_runtime": 193.8933, |
| "eval_samples_per_second": 51.575, |
| "eval_steps_per_second": 6.447, |
| "step": 25500 |
| }, |
| { |
| "epoch": 0.38893207191217277, |
| "grad_norm": 2.195725202560425, |
| "learning_rate": 3.4515050167224076e-06, |
| "loss": 5.3919, |
| "step": 25600 |
| }, |
| { |
| "epoch": 0.38893207191217277, |
| "eval_loss": 5.230504035949707, |
| "eval_runtime": 194.2054, |
| "eval_samples_per_second": 51.492, |
| "eval_steps_per_second": 6.436, |
| "step": 25600 |
| }, |
| { |
| "epoch": 0.39045133781807967, |
| "grad_norm": 3.359039545059204, |
| "learning_rate": 3.4414715719063544e-06, |
| "loss": 5.3855, |
| "step": 25700 |
| }, |
| { |
| "epoch": 0.39045133781807967, |
| "eval_loss": 5.245420455932617, |
| "eval_runtime": 193.8867, |
| "eval_samples_per_second": 51.577, |
| "eval_steps_per_second": 6.447, |
| "step": 25700 |
| }, |
| { |
| "epoch": 0.3919706037239866, |
| "grad_norm": 3.5205583572387695, |
| "learning_rate": 3.4314381270903007e-06, |
| "loss": 5.3822, |
| "step": 25800 |
| }, |
| { |
| "epoch": 0.3919706037239866, |
| "eval_loss": 5.227876663208008, |
| "eval_runtime": 194.0636, |
| "eval_samples_per_second": 51.529, |
| "eval_steps_per_second": 6.441, |
| "step": 25800 |
| }, |
| { |
| "epoch": 0.3934898696298935, |
| "grad_norm": 3.5619242191314697, |
| "learning_rate": 3.4214046822742475e-06, |
| "loss": 5.3773, |
| "step": 25900 |
| }, |
| { |
| "epoch": 0.3934898696298935, |
| "eval_loss": 5.234467029571533, |
| "eval_runtime": 193.7401, |
| "eval_samples_per_second": 51.616, |
| "eval_steps_per_second": 6.452, |
| "step": 25900 |
| }, |
| { |
| "epoch": 0.39500913553580047, |
| "grad_norm": 3.9920406341552734, |
| "learning_rate": 3.411371237458194e-06, |
| "loss": 5.3735, |
| "step": 26000 |
| }, |
| { |
| "epoch": 0.39500913553580047, |
| "eval_loss": 5.22184944152832, |
| "eval_runtime": 193.7394, |
| "eval_samples_per_second": 51.616, |
| "eval_steps_per_second": 6.452, |
| "step": 26000 |
| }, |
| { |
| "epoch": 0.39652840144170737, |
| "grad_norm": 3.559217691421509, |
| "learning_rate": 3.40133779264214e-06, |
| "loss": 5.3695, |
| "step": 26100 |
| }, |
| { |
| "epoch": 0.39652840144170737, |
| "eval_loss": 5.22100830078125, |
| "eval_runtime": 193.849, |
| "eval_samples_per_second": 51.587, |
| "eval_steps_per_second": 6.448, |
| "step": 26100 |
| }, |
| { |
| "epoch": 0.3980476673476143, |
| "grad_norm": 4.232235908508301, |
| "learning_rate": 3.391304347826087e-06, |
| "loss": 5.3703, |
| "step": 26200 |
| }, |
| { |
| "epoch": 0.3980476673476143, |
| "eval_loss": 5.208474159240723, |
| "eval_runtime": 193.8974, |
| "eval_samples_per_second": 51.574, |
| "eval_steps_per_second": 6.447, |
| "step": 26200 |
| }, |
| { |
| "epoch": 0.3995669332535212, |
| "grad_norm": 1.947100043296814, |
| "learning_rate": 3.3812709030100333e-06, |
| "loss": 5.3627, |
| "step": 26300 |
| }, |
| { |
| "epoch": 0.3995669332535212, |
| "eval_loss": 5.217953681945801, |
| "eval_runtime": 193.928, |
| "eval_samples_per_second": 51.566, |
| "eval_steps_per_second": 6.446, |
| "step": 26300 |
| }, |
| { |
| "epoch": 0.40108619915942817, |
| "grad_norm": 3.8494338989257812, |
| "learning_rate": 3.3712374581939796e-06, |
| "loss": 5.3584, |
| "step": 26400 |
| }, |
| { |
| "epoch": 0.40108619915942817, |
| "eval_loss": 5.212357044219971, |
| "eval_runtime": 193.9466, |
| "eval_samples_per_second": 51.561, |
| "eval_steps_per_second": 6.445, |
| "step": 26400 |
| }, |
| { |
| "epoch": 0.40260546506533507, |
| "grad_norm": 3.837324619293213, |
| "learning_rate": 3.3612040133779264e-06, |
| "loss": 5.3555, |
| "step": 26500 |
| }, |
| { |
| "epoch": 0.40260546506533507, |
| "eval_loss": 5.211539268493652, |
| "eval_runtime": 193.9781, |
| "eval_samples_per_second": 51.552, |
| "eval_steps_per_second": 6.444, |
| "step": 26500 |
| }, |
| { |
| "epoch": 0.404124730971242, |
| "grad_norm": 3.7754664421081543, |
| "learning_rate": 3.3511705685618727e-06, |
| "loss": 5.3476, |
| "step": 26600 |
| }, |
| { |
| "epoch": 0.404124730971242, |
| "eval_loss": 5.1926679611206055, |
| "eval_runtime": 194.0189, |
| "eval_samples_per_second": 51.541, |
| "eval_steps_per_second": 6.443, |
| "step": 26600 |
| }, |
| { |
| "epoch": 0.4056439968771489, |
| "grad_norm": 2.4836502075195312, |
| "learning_rate": 3.3411371237458195e-06, |
| "loss": 5.3471, |
| "step": 26700 |
| }, |
| { |
| "epoch": 0.4056439968771489, |
| "eval_loss": 5.188870429992676, |
| "eval_runtime": 194.0466, |
| "eval_samples_per_second": 51.534, |
| "eval_steps_per_second": 6.442, |
| "step": 26700 |
| }, |
| { |
| "epoch": 0.40716326278305587, |
| "grad_norm": 4.591010093688965, |
| "learning_rate": 3.331103678929766e-06, |
| "loss": 5.3431, |
| "step": 26800 |
| }, |
| { |
| "epoch": 0.40716326278305587, |
| "eval_loss": 5.2042717933654785, |
| "eval_runtime": 193.9239, |
| "eval_samples_per_second": 51.567, |
| "eval_steps_per_second": 6.446, |
| "step": 26800 |
| }, |
| { |
| "epoch": 0.40868252868896277, |
| "grad_norm": 3.4716506004333496, |
| "learning_rate": 3.321070234113712e-06, |
| "loss": 5.3363, |
| "step": 26900 |
| }, |
| { |
| "epoch": 0.40868252868896277, |
| "eval_loss": 5.18259859085083, |
| "eval_runtime": 193.9467, |
| "eval_samples_per_second": 51.561, |
| "eval_steps_per_second": 6.445, |
| "step": 26900 |
| }, |
| { |
| "epoch": 0.4102017945948697, |
| "grad_norm": 2.3968818187713623, |
| "learning_rate": 3.311036789297659e-06, |
| "loss": 5.335, |
| "step": 27000 |
| }, |
| { |
| "epoch": 0.4102017945948697, |
| "eval_loss": 5.189505577087402, |
| "eval_runtime": 193.9236, |
| "eval_samples_per_second": 51.567, |
| "eval_steps_per_second": 6.446, |
| "step": 27000 |
| }, |
| { |
| "epoch": 0.4117210605007766, |
| "grad_norm": 3.8948540687561035, |
| "learning_rate": 3.3010033444816052e-06, |
| "loss": 5.3306, |
| "step": 27100 |
| }, |
| { |
| "epoch": 0.4117210605007766, |
| "eval_loss": 5.193852424621582, |
| "eval_runtime": 193.9813, |
| "eval_samples_per_second": 51.551, |
| "eval_steps_per_second": 6.444, |
| "step": 27100 |
| }, |
| { |
| "epoch": 0.41324032640668357, |
| "grad_norm": 2.8864169120788574, |
| "learning_rate": 3.2909698996655516e-06, |
| "loss": 5.3292, |
| "step": 27200 |
| }, |
| { |
| "epoch": 0.41324032640668357, |
| "eval_loss": 5.173651695251465, |
| "eval_runtime": 193.9832, |
| "eval_samples_per_second": 51.551, |
| "eval_steps_per_second": 6.444, |
| "step": 27200 |
| }, |
| { |
| "epoch": 0.41475959231259046, |
| "grad_norm": 2.733299970626831, |
| "learning_rate": 3.2809364548494983e-06, |
| "loss": 5.33, |
| "step": 27300 |
| }, |
| { |
| "epoch": 0.41475959231259046, |
| "eval_loss": 5.169619083404541, |
| "eval_runtime": 193.8848, |
| "eval_samples_per_second": 51.577, |
| "eval_steps_per_second": 6.447, |
| "step": 27300 |
| }, |
| { |
| "epoch": 0.4162788582184974, |
| "grad_norm": 2.9062700271606445, |
| "learning_rate": 3.2709030100334447e-06, |
| "loss": 5.3173, |
| "step": 27400 |
| }, |
| { |
| "epoch": 0.4162788582184974, |
| "eval_loss": 5.1664323806762695, |
| "eval_runtime": 193.9119, |
| "eval_samples_per_second": 51.57, |
| "eval_steps_per_second": 6.446, |
| "step": 27400 |
| }, |
| { |
| "epoch": 0.4177981241244043, |
| "grad_norm": 3.473586320877075, |
| "learning_rate": 3.260869565217391e-06, |
| "loss": 5.3132, |
| "step": 27500 |
| }, |
| { |
| "epoch": 0.4177981241244043, |
| "eval_loss": 5.160322666168213, |
| "eval_runtime": 193.6677, |
| "eval_samples_per_second": 51.635, |
| "eval_steps_per_second": 6.454, |
| "step": 27500 |
| }, |
| { |
| "epoch": 0.41931739003031127, |
| "grad_norm": 3.763826847076416, |
| "learning_rate": 3.2508361204013378e-06, |
| "loss": 5.3079, |
| "step": 27600 |
| }, |
| { |
| "epoch": 0.41931739003031127, |
| "eval_loss": 5.159815788269043, |
| "eval_runtime": 193.6881, |
| "eval_samples_per_second": 51.629, |
| "eval_steps_per_second": 6.454, |
| "step": 27600 |
| }, |
| { |
| "epoch": 0.42083665593621816, |
| "grad_norm": 3.552210807800293, |
| "learning_rate": 3.240802675585284e-06, |
| "loss": 5.3065, |
| "step": 27700 |
| }, |
| { |
| "epoch": 0.42083665593621816, |
| "eval_loss": 5.150642395019531, |
| "eval_runtime": 193.6169, |
| "eval_samples_per_second": 51.648, |
| "eval_steps_per_second": 6.456, |
| "step": 27700 |
| }, |
| { |
| "epoch": 0.4223559218421251, |
| "grad_norm": 4.059215545654297, |
| "learning_rate": 3.230769230769231e-06, |
| "loss": 5.2979, |
| "step": 27800 |
| }, |
| { |
| "epoch": 0.4223559218421251, |
| "eval_loss": 5.1397881507873535, |
| "eval_runtime": 193.6252, |
| "eval_samples_per_second": 51.646, |
| "eval_steps_per_second": 6.456, |
| "step": 27800 |
| }, |
| { |
| "epoch": 0.423875187748032, |
| "grad_norm": 3.116863250732422, |
| "learning_rate": 3.2207357859531772e-06, |
| "loss": 5.2986, |
| "step": 27900 |
| }, |
| { |
| "epoch": 0.423875187748032, |
| "eval_loss": 5.141936779022217, |
| "eval_runtime": 193.8766, |
| "eval_samples_per_second": 51.579, |
| "eval_steps_per_second": 6.447, |
| "step": 27900 |
| }, |
| { |
| "epoch": 0.42539445365393896, |
| "grad_norm": 3.474275588989258, |
| "learning_rate": 3.2107023411371236e-06, |
| "loss": 5.2969, |
| "step": 28000 |
| }, |
| { |
| "epoch": 0.42539445365393896, |
| "eval_loss": 5.130954742431641, |
| "eval_runtime": 193.6114, |
| "eval_samples_per_second": 51.65, |
| "eval_steps_per_second": 6.456, |
| "step": 28000 |
| }, |
| { |
| "epoch": 0.42691371955984586, |
| "grad_norm": 4.147261619567871, |
| "learning_rate": 3.2006688963210703e-06, |
| "loss": 5.2919, |
| "step": 28100 |
| }, |
| { |
| "epoch": 0.42691371955984586, |
| "eval_loss": 5.131519794464111, |
| "eval_runtime": 193.5876, |
| "eval_samples_per_second": 51.656, |
| "eval_steps_per_second": 6.457, |
| "step": 28100 |
| }, |
| { |
| "epoch": 0.4284329854657528, |
| "grad_norm": 3.2498297691345215, |
| "learning_rate": 3.1906354515050167e-06, |
| "loss": 5.281, |
| "step": 28200 |
| }, |
| { |
| "epoch": 0.4284329854657528, |
| "eval_loss": 5.137979984283447, |
| "eval_runtime": 193.6213, |
| "eval_samples_per_second": 51.647, |
| "eval_steps_per_second": 6.456, |
| "step": 28200 |
| }, |
| { |
| "epoch": 0.4299522513716597, |
| "grad_norm": 2.9977059364318848, |
| "learning_rate": 3.180602006688963e-06, |
| "loss": 5.2799, |
| "step": 28300 |
| }, |
| { |
| "epoch": 0.4299522513716597, |
| "eval_loss": 5.123497009277344, |
| "eval_runtime": 193.6764, |
| "eval_samples_per_second": 51.633, |
| "eval_steps_per_second": 6.454, |
| "step": 28300 |
| }, |
| { |
| "epoch": 0.43147151727756666, |
| "grad_norm": 3.6998023986816406, |
| "learning_rate": 3.1705685618729098e-06, |
| "loss": 5.2772, |
| "step": 28400 |
| }, |
| { |
| "epoch": 0.43147151727756666, |
| "eval_loss": 5.125461101531982, |
| "eval_runtime": 194.1824, |
| "eval_samples_per_second": 51.498, |
| "eval_steps_per_second": 6.437, |
| "step": 28400 |
| }, |
| { |
| "epoch": 0.43299078318347356, |
| "grad_norm": 2.8865628242492676, |
| "learning_rate": 3.160535117056856e-06, |
| "loss": 5.2778, |
| "step": 28500 |
| }, |
| { |
| "epoch": 0.43299078318347356, |
| "eval_loss": 5.130805492401123, |
| "eval_runtime": 194.0322, |
| "eval_samples_per_second": 51.538, |
| "eval_steps_per_second": 6.442, |
| "step": 28500 |
| }, |
| { |
| "epoch": 0.4345100490893805, |
| "grad_norm": 3.853248357772827, |
| "learning_rate": 3.1505016722408024e-06, |
| "loss": 5.2722, |
| "step": 28600 |
| }, |
| { |
| "epoch": 0.4345100490893805, |
| "eval_loss": 5.125495910644531, |
| "eval_runtime": 194.031, |
| "eval_samples_per_second": 51.538, |
| "eval_steps_per_second": 6.442, |
| "step": 28600 |
| }, |
| { |
| "epoch": 0.4360293149952874, |
| "grad_norm": 2.8595046997070312, |
| "learning_rate": 3.140468227424749e-06, |
| "loss": 5.2644, |
| "step": 28700 |
| }, |
| { |
| "epoch": 0.4360293149952874, |
| "eval_loss": 5.113553524017334, |
| "eval_runtime": 194.0586, |
| "eval_samples_per_second": 51.531, |
| "eval_steps_per_second": 6.441, |
| "step": 28700 |
| }, |
| { |
| "epoch": 0.43754858090119436, |
| "grad_norm": 3.5894057750701904, |
| "learning_rate": 3.1304347826086955e-06, |
| "loss": 5.261, |
| "step": 28800 |
| }, |
| { |
| "epoch": 0.43754858090119436, |
| "eval_loss": 5.1062846183776855, |
| "eval_runtime": 194.0309, |
| "eval_samples_per_second": 51.538, |
| "eval_steps_per_second": 6.442, |
| "step": 28800 |
| }, |
| { |
| "epoch": 0.43906784680710126, |
| "grad_norm": 2.79595685005188, |
| "learning_rate": 3.1204013377926423e-06, |
| "loss": 5.257, |
| "step": 28900 |
| }, |
| { |
| "epoch": 0.43906784680710126, |
| "eval_loss": 5.108764171600342, |
| "eval_runtime": 194.0394, |
| "eval_samples_per_second": 51.536, |
| "eval_steps_per_second": 6.442, |
| "step": 28900 |
| }, |
| { |
| "epoch": 0.4405871127130082, |
| "grad_norm": 3.3071796894073486, |
| "learning_rate": 3.1103678929765886e-06, |
| "loss": 5.2543, |
| "step": 29000 |
| }, |
| { |
| "epoch": 0.4405871127130082, |
| "eval_loss": 5.101233005523682, |
| "eval_runtime": 194.0058, |
| "eval_samples_per_second": 51.545, |
| "eval_steps_per_second": 6.443, |
| "step": 29000 |
| }, |
| { |
| "epoch": 0.4421063786189151, |
| "grad_norm": 2.916874408721924, |
| "learning_rate": 3.100334448160535e-06, |
| "loss": 5.2474, |
| "step": 29100 |
| }, |
| { |
| "epoch": 0.4421063786189151, |
| "eval_loss": 5.100154876708984, |
| "eval_runtime": 194.0356, |
| "eval_samples_per_second": 51.537, |
| "eval_steps_per_second": 6.442, |
| "step": 29100 |
| }, |
| { |
| "epoch": 0.44362564452482206, |
| "grad_norm": 2.6649153232574463, |
| "learning_rate": 3.0903010033444818e-06, |
| "loss": 5.2504, |
| "step": 29200 |
| }, |
| { |
| "epoch": 0.44362564452482206, |
| "eval_loss": 5.0921311378479, |
| "eval_runtime": 194.0892, |
| "eval_samples_per_second": 51.523, |
| "eval_steps_per_second": 6.44, |
| "step": 29200 |
| }, |
| { |
| "epoch": 0.44514491043072896, |
| "grad_norm": 2.398049831390381, |
| "learning_rate": 3.080267558528428e-06, |
| "loss": 5.2441, |
| "step": 29300 |
| }, |
| { |
| "epoch": 0.44514491043072896, |
| "eval_loss": 5.0853142738342285, |
| "eval_runtime": 193.8364, |
| "eval_samples_per_second": 51.59, |
| "eval_steps_per_second": 6.449, |
| "step": 29300 |
| }, |
| { |
| "epoch": 0.4466641763366359, |
| "grad_norm": 2.485322952270508, |
| "learning_rate": 3.0702341137123744e-06, |
| "loss": 5.2415, |
| "step": 29400 |
| }, |
| { |
| "epoch": 0.4466641763366359, |
| "eval_loss": 5.091442584991455, |
| "eval_runtime": 193.9724, |
| "eval_samples_per_second": 51.554, |
| "eval_steps_per_second": 6.444, |
| "step": 29400 |
| }, |
| { |
| "epoch": 0.4481834422425428, |
| "grad_norm": 3.5554513931274414, |
| "learning_rate": 3.060200668896321e-06, |
| "loss": 5.2374, |
| "step": 29500 |
| }, |
| { |
| "epoch": 0.4481834422425428, |
| "eval_loss": 5.077342510223389, |
| "eval_runtime": 194.0745, |
| "eval_samples_per_second": 51.527, |
| "eval_steps_per_second": 6.441, |
| "step": 29500 |
| }, |
| { |
| "epoch": 0.44970270814844976, |
| "grad_norm": 3.598982810974121, |
| "learning_rate": 3.0501672240802675e-06, |
| "loss": 5.2324, |
| "step": 29600 |
| }, |
| { |
| "epoch": 0.44970270814844976, |
| "eval_loss": 5.088211536407471, |
| "eval_runtime": 193.9862, |
| "eval_samples_per_second": 51.55, |
| "eval_steps_per_second": 6.444, |
| "step": 29600 |
| }, |
| { |
| "epoch": 0.45122197405435666, |
| "grad_norm": 3.2339296340942383, |
| "learning_rate": 3.0401337792642143e-06, |
| "loss": 5.2295, |
| "step": 29700 |
| }, |
| { |
| "epoch": 0.45122197405435666, |
| "eval_loss": 5.077876567840576, |
| "eval_runtime": 193.8777, |
| "eval_samples_per_second": 51.579, |
| "eval_steps_per_second": 6.447, |
| "step": 29700 |
| }, |
| { |
| "epoch": 0.4527412399602636, |
| "grad_norm": 2.627495765686035, |
| "learning_rate": 3.0301003344481606e-06, |
| "loss": 5.2275, |
| "step": 29800 |
| }, |
| { |
| "epoch": 0.4527412399602636, |
| "eval_loss": 5.074822902679443, |
| "eval_runtime": 193.8311, |
| "eval_samples_per_second": 51.591, |
| "eval_steps_per_second": 6.449, |
| "step": 29800 |
| }, |
| { |
| "epoch": 0.4542605058661705, |
| "grad_norm": 2.9252991676330566, |
| "learning_rate": 3.020066889632107e-06, |
| "loss": 5.2238, |
| "step": 29900 |
| }, |
| { |
| "epoch": 0.4542605058661705, |
| "eval_loss": 5.063547611236572, |
| "eval_runtime": 193.7302, |
| "eval_samples_per_second": 51.618, |
| "eval_steps_per_second": 6.452, |
| "step": 29900 |
| }, |
| { |
| "epoch": 0.45577977177207746, |
| "grad_norm": 3.155406951904297, |
| "learning_rate": 3.0100334448160537e-06, |
| "loss": 5.2218, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.45577977177207746, |
| "eval_loss": 5.066218852996826, |
| "eval_runtime": 193.6072, |
| "eval_samples_per_second": 51.651, |
| "eval_steps_per_second": 6.456, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.45729903767798435, |
| "grad_norm": 3.476306915283203, |
| "learning_rate": 3e-06, |
| "loss": 5.2166, |
| "step": 30100 |
| }, |
| { |
| "epoch": 0.45729903767798435, |
| "eval_loss": 5.068021774291992, |
| "eval_runtime": 193.6477, |
| "eval_samples_per_second": 51.64, |
| "eval_steps_per_second": 6.455, |
| "step": 30100 |
| }, |
| { |
| "epoch": 0.4588183035838913, |
| "grad_norm": 3.618774175643921, |
| "learning_rate": 2.9899665551839464e-06, |
| "loss": 5.2154, |
| "step": 30200 |
| }, |
| { |
| "epoch": 0.4588183035838913, |
| "eval_loss": 5.0593461990356445, |
| "eval_runtime": 193.5821, |
| "eval_samples_per_second": 51.658, |
| "eval_steps_per_second": 6.457, |
| "step": 30200 |
| }, |
| { |
| "epoch": 0.4603375694897982, |
| "grad_norm": 2.838336229324341, |
| "learning_rate": 2.979933110367893e-06, |
| "loss": 5.2082, |
| "step": 30300 |
| }, |
| { |
| "epoch": 0.4603375694897982, |
| "eval_loss": 5.061206817626953, |
| "eval_runtime": 193.644, |
| "eval_samples_per_second": 51.641, |
| "eval_steps_per_second": 6.455, |
| "step": 30300 |
| }, |
| { |
| "epoch": 0.46185683539570516, |
| "grad_norm": 2.840545654296875, |
| "learning_rate": 2.9698996655518395e-06, |
| "loss": 5.2028, |
| "step": 30400 |
| }, |
| { |
| "epoch": 0.46185683539570516, |
| "eval_loss": 5.051755428314209, |
| "eval_runtime": 193.6726, |
| "eval_samples_per_second": 51.634, |
| "eval_steps_per_second": 6.454, |
| "step": 30400 |
| }, |
| { |
| "epoch": 0.46337610130161205, |
| "grad_norm": 2.4346399307250977, |
| "learning_rate": 2.959866220735786e-06, |
| "loss": 5.2001, |
| "step": 30500 |
| }, |
| { |
| "epoch": 0.46337610130161205, |
| "eval_loss": 5.050179481506348, |
| "eval_runtime": 193.6621, |
| "eval_samples_per_second": 51.636, |
| "eval_steps_per_second": 6.455, |
| "step": 30500 |
| }, |
| { |
| "epoch": 0.464895367207519, |
| "grad_norm": 2.331064224243164, |
| "learning_rate": 2.9498327759197326e-06, |
| "loss": 5.2018, |
| "step": 30600 |
| }, |
| { |
| "epoch": 0.464895367207519, |
| "eval_loss": 5.039993762969971, |
| "eval_runtime": 193.9744, |
| "eval_samples_per_second": 51.553, |
| "eval_steps_per_second": 6.444, |
| "step": 30600 |
| }, |
| { |
| "epoch": 0.4664146331134259, |
| "grad_norm": 3.012594223022461, |
| "learning_rate": 2.939799331103679e-06, |
| "loss": 5.1991, |
| "step": 30700 |
| }, |
| { |
| "epoch": 0.4664146331134259, |
| "eval_loss": 5.039401054382324, |
| "eval_runtime": 194.0713, |
| "eval_samples_per_second": 51.527, |
| "eval_steps_per_second": 6.441, |
| "step": 30700 |
| }, |
| { |
| "epoch": 0.46793389901933286, |
| "grad_norm": 3.4017112255096436, |
| "learning_rate": 2.9297658862876257e-06, |
| "loss": 5.1937, |
| "step": 30800 |
| }, |
| { |
| "epoch": 0.46793389901933286, |
| "eval_loss": 5.0479512214660645, |
| "eval_runtime": 194.0394, |
| "eval_samples_per_second": 51.536, |
| "eval_steps_per_second": 6.442, |
| "step": 30800 |
| }, |
| { |
| "epoch": 0.46945316492523975, |
| "grad_norm": 2.848475694656372, |
| "learning_rate": 2.919732441471572e-06, |
| "loss": 5.1898, |
| "step": 30900 |
| }, |
| { |
| "epoch": 0.46945316492523975, |
| "eval_loss": 5.043004035949707, |
| "eval_runtime": 194.2051, |
| "eval_samples_per_second": 51.492, |
| "eval_steps_per_second": 6.436, |
| "step": 30900 |
| }, |
| { |
| "epoch": 0.4709724308311467, |
| "grad_norm": 2.964001178741455, |
| "learning_rate": 2.9096989966555184e-06, |
| "loss": 5.1887, |
| "step": 31000 |
| }, |
| { |
| "epoch": 0.4709724308311467, |
| "eval_loss": 5.029993534088135, |
| "eval_runtime": 194.1479, |
| "eval_samples_per_second": 51.507, |
| "eval_steps_per_second": 6.438, |
| "step": 31000 |
| }, |
| { |
| "epoch": 0.4724916967370536, |
| "grad_norm": 2.698634147644043, |
| "learning_rate": 2.899665551839465e-06, |
| "loss": 5.1879, |
| "step": 31100 |
| }, |
| { |
| "epoch": 0.4724916967370536, |
| "eval_loss": 5.028534412384033, |
| "eval_runtime": 194.0408, |
| "eval_samples_per_second": 51.536, |
| "eval_steps_per_second": 6.442, |
| "step": 31100 |
| }, |
| { |
| "epoch": 0.47401096264296055, |
| "grad_norm": 2.757293224334717, |
| "learning_rate": 2.8896321070234115e-06, |
| "loss": 5.1818, |
| "step": 31200 |
| }, |
| { |
| "epoch": 0.47401096264296055, |
| "eval_loss": 5.024392127990723, |
| "eval_runtime": 194.1644, |
| "eval_samples_per_second": 51.503, |
| "eval_steps_per_second": 6.438, |
| "step": 31200 |
| }, |
| { |
| "epoch": 0.47553022854886745, |
| "grad_norm": 3.269547700881958, |
| "learning_rate": 2.879598662207358e-06, |
| "loss": 5.1784, |
| "step": 31300 |
| }, |
| { |
| "epoch": 0.47553022854886745, |
| "eval_loss": 5.026421546936035, |
| "eval_runtime": 193.9523, |
| "eval_samples_per_second": 51.559, |
| "eval_steps_per_second": 6.445, |
| "step": 31300 |
| }, |
| { |
| "epoch": 0.4770494944547744, |
| "grad_norm": 3.1080405712127686, |
| "learning_rate": 2.8695652173913046e-06, |
| "loss": 5.1725, |
| "step": 31400 |
| }, |
| { |
| "epoch": 0.4770494944547744, |
| "eval_loss": 5.025778770446777, |
| "eval_runtime": 193.9136, |
| "eval_samples_per_second": 51.569, |
| "eval_steps_per_second": 6.446, |
| "step": 31400 |
| }, |
| { |
| "epoch": 0.4785687603606813, |
| "grad_norm": 4.382852554321289, |
| "learning_rate": 2.859531772575251e-06, |
| "loss": 5.1691, |
| "step": 31500 |
| }, |
| { |
| "epoch": 0.4785687603606813, |
| "eval_loss": 5.0218424797058105, |
| "eval_runtime": 194.0079, |
| "eval_samples_per_second": 51.544, |
| "eval_steps_per_second": 6.443, |
| "step": 31500 |
| }, |
| { |
| "epoch": 0.48008802626658825, |
| "grad_norm": 2.4219489097595215, |
| "learning_rate": 2.8494983277591977e-06, |
| "loss": 5.1675, |
| "step": 31600 |
| }, |
| { |
| "epoch": 0.48008802626658825, |
| "eval_loss": 5.009864807128906, |
| "eval_runtime": 194.1924, |
| "eval_samples_per_second": 51.495, |
| "eval_steps_per_second": 6.437, |
| "step": 31600 |
| }, |
| { |
| "epoch": 0.48160729217249515, |
| "grad_norm": 3.9848620891571045, |
| "learning_rate": 2.839464882943144e-06, |
| "loss": 5.1607, |
| "step": 31700 |
| }, |
| { |
| "epoch": 0.48160729217249515, |
| "eval_loss": 5.008971214294434, |
| "eval_runtime": 194.2985, |
| "eval_samples_per_second": 51.467, |
| "eval_steps_per_second": 6.433, |
| "step": 31700 |
| }, |
| { |
| "epoch": 0.4831265580784021, |
| "grad_norm": 3.3474831581115723, |
| "learning_rate": 2.8294314381270904e-06, |
| "loss": 5.1598, |
| "step": 31800 |
| }, |
| { |
| "epoch": 0.4831265580784021, |
| "eval_loss": 5.004793167114258, |
| "eval_runtime": 194.03, |
| "eval_samples_per_second": 51.538, |
| "eval_steps_per_second": 6.442, |
| "step": 31800 |
| }, |
| { |
| "epoch": 0.484645823984309, |
| "grad_norm": 3.074587821960449, |
| "learning_rate": 2.819397993311037e-06, |
| "loss": 5.1588, |
| "step": 31900 |
| }, |
| { |
| "epoch": 0.484645823984309, |
| "eval_loss": 5.007466793060303, |
| "eval_runtime": 193.9052, |
| "eval_samples_per_second": 51.572, |
| "eval_steps_per_second": 6.446, |
| "step": 31900 |
| }, |
| { |
| "epoch": 0.48616508989021595, |
| "grad_norm": 2.631606340408325, |
| "learning_rate": 2.8093645484949835e-06, |
| "loss": 5.155, |
| "step": 32000 |
| }, |
| { |
| "epoch": 0.48616508989021595, |
| "eval_loss": 5.00339937210083, |
| "eval_runtime": 194.0167, |
| "eval_samples_per_second": 51.542, |
| "eval_steps_per_second": 6.443, |
| "step": 32000 |
| }, |
| { |
| "epoch": 0.48768435579612285, |
| "grad_norm": 2.5506277084350586, |
| "learning_rate": 2.79933110367893e-06, |
| "loss": 5.1544, |
| "step": 32100 |
| }, |
| { |
| "epoch": 0.48768435579612285, |
| "eval_loss": 4.995656967163086, |
| "eval_runtime": 193.7711, |
| "eval_samples_per_second": 51.607, |
| "eval_steps_per_second": 6.451, |
| "step": 32100 |
| }, |
| { |
| "epoch": 0.4892036217020298, |
| "grad_norm": 2.9476144313812256, |
| "learning_rate": 2.7892976588628766e-06, |
| "loss": 5.1477, |
| "step": 32200 |
| }, |
| { |
| "epoch": 0.4892036217020298, |
| "eval_loss": 4.994040012359619, |
| "eval_runtime": 193.6615, |
| "eval_samples_per_second": 51.636, |
| "eval_steps_per_second": 6.455, |
| "step": 32200 |
| }, |
| { |
| "epoch": 0.4907228876079367, |
| "grad_norm": 3.5395162105560303, |
| "learning_rate": 2.779264214046823e-06, |
| "loss": 5.1424, |
| "step": 32300 |
| }, |
| { |
| "epoch": 0.4907228876079367, |
| "eval_loss": 4.992614269256592, |
| "eval_runtime": 193.7065, |
| "eval_samples_per_second": 51.624, |
| "eval_steps_per_second": 6.453, |
| "step": 32300 |
| }, |
| { |
| "epoch": 0.49224215351384365, |
| "grad_norm": 2.805767297744751, |
| "learning_rate": 2.7692307692307693e-06, |
| "loss": 5.1446, |
| "step": 32400 |
| }, |
| { |
| "epoch": 0.49224215351384365, |
| "eval_loss": 4.987194538116455, |
| "eval_runtime": 193.7203, |
| "eval_samples_per_second": 51.621, |
| "eval_steps_per_second": 6.453, |
| "step": 32400 |
| }, |
| { |
| "epoch": 0.49376141941975055, |
| "grad_norm": 3.9371492862701416, |
| "learning_rate": 2.759197324414716e-06, |
| "loss": 5.1391, |
| "step": 32500 |
| }, |
| { |
| "epoch": 0.49376141941975055, |
| "eval_loss": 4.9901838302612305, |
| "eval_runtime": 193.6911, |
| "eval_samples_per_second": 51.629, |
| "eval_steps_per_second": 6.454, |
| "step": 32500 |
| }, |
| { |
| "epoch": 0.4952806853256575, |
| "grad_norm": 2.755789041519165, |
| "learning_rate": 2.749163879598662e-06, |
| "loss": 5.1393, |
| "step": 32600 |
| }, |
| { |
| "epoch": 0.4952806853256575, |
| "eval_loss": 4.992640018463135, |
| "eval_runtime": 193.6445, |
| "eval_samples_per_second": 51.641, |
| "eval_steps_per_second": 6.455, |
| "step": 32600 |
| }, |
| { |
| "epoch": 0.4967999512315644, |
| "grad_norm": 3.4700164794921875, |
| "learning_rate": 2.7391304347826087e-06, |
| "loss": 5.1375, |
| "step": 32700 |
| }, |
| { |
| "epoch": 0.4967999512315644, |
| "eval_loss": 4.975983142852783, |
| "eval_runtime": 193.7046, |
| "eval_samples_per_second": 51.625, |
| "eval_steps_per_second": 6.453, |
| "step": 32700 |
| }, |
| { |
| "epoch": 0.49831921713747135, |
| "grad_norm": 2.9584505558013916, |
| "learning_rate": 2.729096989966555e-06, |
| "loss": 5.1305, |
| "step": 32800 |
| }, |
| { |
| "epoch": 0.49831921713747135, |
| "eval_loss": 4.978539943695068, |
| "eval_runtime": 193.9644, |
| "eval_samples_per_second": 51.556, |
| "eval_steps_per_second": 6.444, |
| "step": 32800 |
| }, |
| { |
| "epoch": 0.49983848304337825, |
| "grad_norm": 3.1944355964660645, |
| "learning_rate": 2.7190635451505014e-06, |
| "loss": 5.1202, |
| "step": 32900 |
| }, |
| { |
| "epoch": 0.49983848304337825, |
| "eval_loss": 4.972556114196777, |
| "eval_runtime": 194.0092, |
| "eval_samples_per_second": 51.544, |
| "eval_steps_per_second": 6.443, |
| "step": 32900 |
| }, |
| { |
| "epoch": 0.5013577489492852, |
| "grad_norm": 2.980757713317871, |
| "learning_rate": 2.709030100334448e-06, |
| "loss": 5.1282, |
| "step": 33000 |
| }, |
| { |
| "epoch": 0.5013577489492852, |
| "eval_loss": 4.974079132080078, |
| "eval_runtime": 194.0276, |
| "eval_samples_per_second": 51.539, |
| "eval_steps_per_second": 6.442, |
| "step": 33000 |
| }, |
| { |
| "epoch": 0.5028770148551921, |
| "grad_norm": 3.100187063217163, |
| "learning_rate": 2.6989966555183945e-06, |
| "loss": 5.1259, |
| "step": 33100 |
| }, |
| { |
| "epoch": 0.5028770148551921, |
| "eval_loss": 4.963293552398682, |
| "eval_runtime": 194.0336, |
| "eval_samples_per_second": 51.537, |
| "eval_steps_per_second": 6.442, |
| "step": 33100 |
| }, |
| { |
| "epoch": 0.504396280761099, |
| "grad_norm": 2.542158603668213, |
| "learning_rate": 2.6889632107023413e-06, |
| "loss": 5.1217, |
| "step": 33200 |
| }, |
| { |
| "epoch": 0.504396280761099, |
| "eval_loss": 4.9611406326293945, |
| "eval_runtime": 193.9855, |
| "eval_samples_per_second": 51.55, |
| "eval_steps_per_second": 6.444, |
| "step": 33200 |
| }, |
| { |
| "epoch": 0.505915546667006, |
| "grad_norm": 2.545457363128662, |
| "learning_rate": 2.6789297658862876e-06, |
| "loss": 5.1158, |
| "step": 33300 |
| }, |
| { |
| "epoch": 0.505915546667006, |
| "eval_loss": 4.967195510864258, |
| "eval_runtime": 194.147, |
| "eval_samples_per_second": 51.507, |
| "eval_steps_per_second": 6.438, |
| "step": 33300 |
| }, |
| { |
| "epoch": 0.5074348125729129, |
| "grad_norm": 2.822507858276367, |
| "learning_rate": 2.668896321070234e-06, |
| "loss": 5.1109, |
| "step": 33400 |
| }, |
| { |
| "epoch": 0.5074348125729129, |
| "eval_loss": 4.9572014808654785, |
| "eval_runtime": 194.2258, |
| "eval_samples_per_second": 51.486, |
| "eval_steps_per_second": 6.436, |
| "step": 33400 |
| }, |
| { |
| "epoch": 0.5089540784788198, |
| "grad_norm": 2.361830949783325, |
| "learning_rate": 2.6588628762541807e-06, |
| "loss": 5.1154, |
| "step": 33500 |
| }, |
| { |
| "epoch": 0.5089540784788198, |
| "eval_loss": 4.94895076751709, |
| "eval_runtime": 194.1601, |
| "eval_samples_per_second": 51.504, |
| "eval_steps_per_second": 6.438, |
| "step": 33500 |
| }, |
| { |
| "epoch": 0.5104733443847267, |
| "grad_norm": 2.3638288974761963, |
| "learning_rate": 2.648829431438127e-06, |
| "loss": 5.1055, |
| "step": 33600 |
| }, |
| { |
| "epoch": 0.5104733443847267, |
| "eval_loss": 4.947831153869629, |
| "eval_runtime": 194.2013, |
| "eval_samples_per_second": 51.493, |
| "eval_steps_per_second": 6.437, |
| "step": 33600 |
| }, |
| { |
| "epoch": 0.5119926102906337, |
| "grad_norm": 2.163120746612549, |
| "learning_rate": 2.6387959866220734e-06, |
| "loss": 5.1048, |
| "step": 33700 |
| }, |
| { |
| "epoch": 0.5119926102906337, |
| "eval_loss": 4.943573951721191, |
| "eval_runtime": 194.2102, |
| "eval_samples_per_second": 51.491, |
| "eval_steps_per_second": 6.436, |
| "step": 33700 |
| }, |
| { |
| "epoch": 0.5135118761965406, |
| "grad_norm": 2.234380006790161, |
| "learning_rate": 2.62876254180602e-06, |
| "loss": 5.1042, |
| "step": 33800 |
| }, |
| { |
| "epoch": 0.5135118761965406, |
| "eval_loss": 4.945695400238037, |
| "eval_runtime": 194.1949, |
| "eval_samples_per_second": 51.495, |
| "eval_steps_per_second": 6.437, |
| "step": 33800 |
| }, |
| { |
| "epoch": 0.5150311421024475, |
| "grad_norm": 2.8607873916625977, |
| "learning_rate": 2.6187290969899665e-06, |
| "loss": 5.0977, |
| "step": 33900 |
| }, |
| { |
| "epoch": 0.5150311421024475, |
| "eval_loss": 4.940700531005859, |
| "eval_runtime": 194.1567, |
| "eval_samples_per_second": 51.505, |
| "eval_steps_per_second": 6.438, |
| "step": 33900 |
| }, |
| { |
| "epoch": 0.5165504080083544, |
| "grad_norm": 2.85111403465271, |
| "learning_rate": 2.6086956521739132e-06, |
| "loss": 5.0939, |
| "step": 34000 |
| }, |
| { |
| "epoch": 0.5165504080083544, |
| "eval_loss": 4.934571266174316, |
| "eval_runtime": 194.0792, |
| "eval_samples_per_second": 51.525, |
| "eval_steps_per_second": 6.441, |
| "step": 34000 |
| }, |
| { |
| "epoch": 0.5180696739142614, |
| "grad_norm": 3.2021050453186035, |
| "learning_rate": 2.5986622073578596e-06, |
| "loss": 5.0902, |
| "step": 34100 |
| }, |
| { |
| "epoch": 0.5180696739142614, |
| "eval_loss": 4.940134048461914, |
| "eval_runtime": 194.0612, |
| "eval_samples_per_second": 51.53, |
| "eval_steps_per_second": 6.441, |
| "step": 34100 |
| }, |
| { |
| "epoch": 0.5195889398201683, |
| "grad_norm": 2.500246047973633, |
| "learning_rate": 2.588628762541806e-06, |
| "loss": 5.0851, |
| "step": 34200 |
| }, |
| { |
| "epoch": 0.5195889398201683, |
| "eval_loss": 4.938769340515137, |
| "eval_runtime": 194.1164, |
| "eval_samples_per_second": 51.515, |
| "eval_steps_per_second": 6.439, |
| "step": 34200 |
| }, |
| { |
| "epoch": 0.5211082057260752, |
| "grad_norm": 2.7174882888793945, |
| "learning_rate": 2.5785953177257527e-06, |
| "loss": 5.0917, |
| "step": 34300 |
| }, |
| { |
| "epoch": 0.5211082057260752, |
| "eval_loss": 4.933419704437256, |
| "eval_runtime": 194.3056, |
| "eval_samples_per_second": 51.465, |
| "eval_steps_per_second": 6.433, |
| "step": 34300 |
| }, |
| { |
| "epoch": 0.5226274716319821, |
| "grad_norm": 3.255512237548828, |
| "learning_rate": 2.568561872909699e-06, |
| "loss": 5.0836, |
| "step": 34400 |
| }, |
| { |
| "epoch": 0.5226274716319821, |
| "eval_loss": 4.930218696594238, |
| "eval_runtime": 193.8112, |
| "eval_samples_per_second": 51.597, |
| "eval_steps_per_second": 6.45, |
| "step": 34400 |
| }, |
| { |
| "epoch": 0.5241467375378891, |
| "grad_norm": 2.2356820106506348, |
| "learning_rate": 2.5585284280936454e-06, |
| "loss": 5.0815, |
| "step": 34500 |
| }, |
| { |
| "epoch": 0.5241467375378891, |
| "eval_loss": 4.932159423828125, |
| "eval_runtime": 193.6954, |
| "eval_samples_per_second": 51.627, |
| "eval_steps_per_second": 6.453, |
| "step": 34500 |
| }, |
| { |
| "epoch": 0.525666003443796, |
| "grad_norm": 2.4992058277130127, |
| "learning_rate": 2.548494983277592e-06, |
| "loss": 5.0844, |
| "step": 34600 |
| }, |
| { |
| "epoch": 0.525666003443796, |
| "eval_loss": 4.924154758453369, |
| "eval_runtime": 193.7231, |
| "eval_samples_per_second": 51.62, |
| "eval_steps_per_second": 6.453, |
| "step": 34600 |
| }, |
| { |
| "epoch": 0.5271852693497029, |
| "grad_norm": 2.348440647125244, |
| "learning_rate": 2.5384615384615385e-06, |
| "loss": 5.0789, |
| "step": 34700 |
| }, |
| { |
| "epoch": 0.5271852693497029, |
| "eval_loss": 4.925171852111816, |
| "eval_runtime": 193.7816, |
| "eval_samples_per_second": 51.604, |
| "eval_steps_per_second": 6.451, |
| "step": 34700 |
| }, |
| { |
| "epoch": 0.5287045352556098, |
| "grad_norm": 2.589172840118408, |
| "learning_rate": 2.528428093645485e-06, |
| "loss": 5.0708, |
| "step": 34800 |
| }, |
| { |
| "epoch": 0.5287045352556098, |
| "eval_loss": 4.919689178466797, |
| "eval_runtime": 193.7778, |
| "eval_samples_per_second": 51.605, |
| "eval_steps_per_second": 6.451, |
| "step": 34800 |
| }, |
| { |
| "epoch": 0.5302238011615168, |
| "grad_norm": 2.950510263442993, |
| "learning_rate": 2.5183946488294316e-06, |
| "loss": 5.0707, |
| "step": 34900 |
| }, |
| { |
| "epoch": 0.5302238011615168, |
| "eval_loss": 4.9157304763793945, |
| "eval_runtime": 193.5504, |
| "eval_samples_per_second": 51.666, |
| "eval_steps_per_second": 6.458, |
| "step": 34900 |
| }, |
| { |
| "epoch": 0.5317430670674237, |
| "grad_norm": 3.1693990230560303, |
| "learning_rate": 2.508361204013378e-06, |
| "loss": 5.0653, |
| "step": 35000 |
| }, |
| { |
| "epoch": 0.5317430670674237, |
| "eval_loss": 4.910171985626221, |
| "eval_runtime": 194.0296, |
| "eval_samples_per_second": 51.539, |
| "eval_steps_per_second": 6.442, |
| "step": 35000 |
| }, |
| { |
| "epoch": 0.5332623329733306, |
| "grad_norm": 2.8548085689544678, |
| "learning_rate": 2.4983277591973247e-06, |
| "loss": 5.0706, |
| "step": 35100 |
| }, |
| { |
| "epoch": 0.5332623329733306, |
| "eval_loss": 4.9105353355407715, |
| "eval_runtime": 193.9374, |
| "eval_samples_per_second": 51.563, |
| "eval_steps_per_second": 6.445, |
| "step": 35100 |
| }, |
| { |
| "epoch": 0.5347815988792376, |
| "grad_norm": 1.8217041492462158, |
| "learning_rate": 2.488294314381271e-06, |
| "loss": 5.064, |
| "step": 35200 |
| }, |
| { |
| "epoch": 0.5347815988792376, |
| "eval_loss": 4.906797885894775, |
| "eval_runtime": 194.1314, |
| "eval_samples_per_second": 51.511, |
| "eval_steps_per_second": 6.439, |
| "step": 35200 |
| }, |
| { |
| "epoch": 0.5363008647851445, |
| "grad_norm": 2.089233875274658, |
| "learning_rate": 2.4782608695652173e-06, |
| "loss": 5.0612, |
| "step": 35300 |
| }, |
| { |
| "epoch": 0.5363008647851445, |
| "eval_loss": 4.904172420501709, |
| "eval_runtime": 194.1107, |
| "eval_samples_per_second": 51.517, |
| "eval_steps_per_second": 6.44, |
| "step": 35300 |
| }, |
| { |
| "epoch": 0.5378201306910514, |
| "grad_norm": 2.3225550651550293, |
| "learning_rate": 2.468227424749164e-06, |
| "loss": 5.06, |
| "step": 35400 |
| }, |
| { |
| "epoch": 0.5378201306910514, |
| "eval_loss": 4.904652118682861, |
| "eval_runtime": 194.1265, |
| "eval_samples_per_second": 51.513, |
| "eval_steps_per_second": 6.439, |
| "step": 35400 |
| }, |
| { |
| "epoch": 0.5393393965969583, |
| "grad_norm": 3.1568684577941895, |
| "learning_rate": 2.4581939799331104e-06, |
| "loss": 5.0579, |
| "step": 35500 |
| }, |
| { |
| "epoch": 0.5393393965969583, |
| "eval_loss": 4.90002965927124, |
| "eval_runtime": 194.1042, |
| "eval_samples_per_second": 51.519, |
| "eval_steps_per_second": 6.44, |
| "step": 35500 |
| }, |
| { |
| "epoch": 0.5408586625028653, |
| "grad_norm": 2.8267829418182373, |
| "learning_rate": 2.4481605351170568e-06, |
| "loss": 5.0568, |
| "step": 35600 |
| }, |
| { |
| "epoch": 0.5408586625028653, |
| "eval_loss": 4.89033842086792, |
| "eval_runtime": 194.0764, |
| "eval_samples_per_second": 51.526, |
| "eval_steps_per_second": 6.441, |
| "step": 35600 |
| }, |
| { |
| "epoch": 0.5423779284087722, |
| "grad_norm": 1.987886667251587, |
| "learning_rate": 2.4381270903010035e-06, |
| "loss": 5.0541, |
| "step": 35700 |
| }, |
| { |
| "epoch": 0.5423779284087722, |
| "eval_loss": 4.9011454582214355, |
| "eval_runtime": 194.1549, |
| "eval_samples_per_second": 51.505, |
| "eval_steps_per_second": 6.438, |
| "step": 35700 |
| }, |
| { |
| "epoch": 0.5438971943146791, |
| "grad_norm": 3.215435028076172, |
| "learning_rate": 2.42809364548495e-06, |
| "loss": 5.0506, |
| "step": 35800 |
| }, |
| { |
| "epoch": 0.5438971943146791, |
| "eval_loss": 4.890650272369385, |
| "eval_runtime": 194.1843, |
| "eval_samples_per_second": 51.497, |
| "eval_steps_per_second": 6.437, |
| "step": 35800 |
| }, |
| { |
| "epoch": 0.545416460220586, |
| "grad_norm": 1.9231488704681396, |
| "learning_rate": 2.4180602006688962e-06, |
| "loss": 5.0466, |
| "step": 35900 |
| }, |
| { |
| "epoch": 0.545416460220586, |
| "eval_loss": 4.890570163726807, |
| "eval_runtime": 194.206, |
| "eval_samples_per_second": 51.492, |
| "eval_steps_per_second": 6.436, |
| "step": 35900 |
| }, |
| { |
| "epoch": 0.546935726126493, |
| "grad_norm": 2.3541529178619385, |
| "learning_rate": 2.408026755852843e-06, |
| "loss": 5.0444, |
| "step": 36000 |
| }, |
| { |
| "epoch": 0.546935726126493, |
| "eval_loss": 4.887938022613525, |
| "eval_runtime": 194.1495, |
| "eval_samples_per_second": 51.507, |
| "eval_steps_per_second": 6.438, |
| "step": 36000 |
| }, |
| { |
| "epoch": 0.5484549920323999, |
| "grad_norm": 2.646209478378296, |
| "learning_rate": 2.3979933110367893e-06, |
| "loss": 5.0381, |
| "step": 36100 |
| }, |
| { |
| "epoch": 0.5484549920323999, |
| "eval_loss": 4.883460998535156, |
| "eval_runtime": 194.0814, |
| "eval_samples_per_second": 51.525, |
| "eval_steps_per_second": 6.441, |
| "step": 36100 |
| }, |
| { |
| "epoch": 0.5499742579383068, |
| "grad_norm": 2.2432219982147217, |
| "learning_rate": 2.387959866220736e-06, |
| "loss": 5.0363, |
| "step": 36200 |
| }, |
| { |
| "epoch": 0.5499742579383068, |
| "eval_loss": 4.881083011627197, |
| "eval_runtime": 194.1247, |
| "eval_samples_per_second": 51.513, |
| "eval_steps_per_second": 6.439, |
| "step": 36200 |
| }, |
| { |
| "epoch": 0.5514935238442137, |
| "grad_norm": 2.482103109359741, |
| "learning_rate": 2.3779264214046824e-06, |
| "loss": 5.0416, |
| "step": 36300 |
| }, |
| { |
| "epoch": 0.5514935238442137, |
| "eval_loss": 4.881221294403076, |
| "eval_runtime": 194.1059, |
| "eval_samples_per_second": 51.518, |
| "eval_steps_per_second": 6.44, |
| "step": 36300 |
| }, |
| { |
| "epoch": 0.5530127897501207, |
| "grad_norm": 2.0182697772979736, |
| "learning_rate": 2.3678929765886288e-06, |
| "loss": 5.0287, |
| "step": 36400 |
| }, |
| { |
| "epoch": 0.5530127897501207, |
| "eval_loss": 4.877261161804199, |
| "eval_runtime": 194.0469, |
| "eval_samples_per_second": 51.534, |
| "eval_steps_per_second": 6.442, |
| "step": 36400 |
| }, |
| { |
| "epoch": 0.5545320556560276, |
| "grad_norm": 3.02773380279541, |
| "learning_rate": 2.3578595317725755e-06, |
| "loss": 5.0328, |
| "step": 36500 |
| }, |
| { |
| "epoch": 0.5545320556560276, |
| "eval_loss": 4.869913101196289, |
| "eval_runtime": 194.0627, |
| "eval_samples_per_second": 51.53, |
| "eval_steps_per_second": 6.441, |
| "step": 36500 |
| }, |
| { |
| "epoch": 0.5560513215619345, |
| "grad_norm": 3.1895177364349365, |
| "learning_rate": 2.347826086956522e-06, |
| "loss": 5.0272, |
| "step": 36600 |
| }, |
| { |
| "epoch": 0.5560513215619345, |
| "eval_loss": 4.872635364532471, |
| "eval_runtime": 194.0337, |
| "eval_samples_per_second": 51.537, |
| "eval_steps_per_second": 6.442, |
| "step": 36600 |
| }, |
| { |
| "epoch": 0.5575705874678414, |
| "grad_norm": 2.474367141723633, |
| "learning_rate": 2.337792642140468e-06, |
| "loss": 5.0285, |
| "step": 36700 |
| }, |
| { |
| "epoch": 0.5575705874678414, |
| "eval_loss": 4.866065502166748, |
| "eval_runtime": 193.983, |
| "eval_samples_per_second": 51.551, |
| "eval_steps_per_second": 6.444, |
| "step": 36700 |
| }, |
| { |
| "epoch": 0.5590898533737484, |
| "grad_norm": 3.0734000205993652, |
| "learning_rate": 2.327759197324415e-06, |
| "loss": 5.0238, |
| "step": 36800 |
| }, |
| { |
| "epoch": 0.5590898533737484, |
| "eval_loss": 4.873917102813721, |
| "eval_runtime": 193.8114, |
| "eval_samples_per_second": 51.597, |
| "eval_steps_per_second": 6.45, |
| "step": 36800 |
| }, |
| { |
| "epoch": 0.5606091192796553, |
| "grad_norm": 2.379478931427002, |
| "learning_rate": 2.3177257525083613e-06, |
| "loss": 5.0225, |
| "step": 36900 |
| }, |
| { |
| "epoch": 0.5606091192796553, |
| "eval_loss": 4.864801406860352, |
| "eval_runtime": 193.8549, |
| "eval_samples_per_second": 51.585, |
| "eval_steps_per_second": 6.448, |
| "step": 36900 |
| }, |
| { |
| "epoch": 0.5621283851855622, |
| "grad_norm": 2.6084952354431152, |
| "learning_rate": 2.307692307692308e-06, |
| "loss": 5.0177, |
| "step": 37000 |
| }, |
| { |
| "epoch": 0.5621283851855622, |
| "eval_loss": 4.863184452056885, |
| "eval_runtime": 193.8212, |
| "eval_samples_per_second": 51.594, |
| "eval_steps_per_second": 6.449, |
| "step": 37000 |
| }, |
| { |
| "epoch": 0.5636476510914691, |
| "grad_norm": 2.194261312484741, |
| "learning_rate": 2.2976588628762544e-06, |
| "loss": 5.0167, |
| "step": 37100 |
| }, |
| { |
| "epoch": 0.5636476510914691, |
| "eval_loss": 4.855440139770508, |
| "eval_runtime": 193.8552, |
| "eval_samples_per_second": 51.585, |
| "eval_steps_per_second": 6.448, |
| "step": 37100 |
| }, |
| { |
| "epoch": 0.5651669169973761, |
| "grad_norm": 2.195667028427124, |
| "learning_rate": 2.2876254180602008e-06, |
| "loss": 5.0148, |
| "step": 37200 |
| }, |
| { |
| "epoch": 0.5651669169973761, |
| "eval_loss": 4.857753753662109, |
| "eval_runtime": 193.61, |
| "eval_samples_per_second": 51.65, |
| "eval_steps_per_second": 6.456, |
| "step": 37200 |
| }, |
| { |
| "epoch": 0.566686182903283, |
| "grad_norm": 2.308091402053833, |
| "learning_rate": 2.2775919732441475e-06, |
| "loss": 5.0152, |
| "step": 37300 |
| }, |
| { |
| "epoch": 0.566686182903283, |
| "eval_loss": 4.850945949554443, |
| "eval_runtime": 193.675, |
| "eval_samples_per_second": 51.633, |
| "eval_steps_per_second": 6.454, |
| "step": 37300 |
| }, |
| { |
| "epoch": 0.5682054488091899, |
| "grad_norm": 1.5866217613220215, |
| "learning_rate": 2.267558528428094e-06, |
| "loss": 5.0086, |
| "step": 37400 |
| }, |
| { |
| "epoch": 0.5682054488091899, |
| "eval_loss": 4.856834411621094, |
| "eval_runtime": 194.0621, |
| "eval_samples_per_second": 51.53, |
| "eval_steps_per_second": 6.441, |
| "step": 37400 |
| }, |
| { |
| "epoch": 0.5697247147150968, |
| "grad_norm": 2.3778269290924072, |
| "learning_rate": 2.25752508361204e-06, |
| "loss": 5.008, |
| "step": 37500 |
| }, |
| { |
| "epoch": 0.5697247147150968, |
| "eval_loss": 4.849526405334473, |
| "eval_runtime": 193.9807, |
| "eval_samples_per_second": 51.552, |
| "eval_steps_per_second": 6.444, |
| "step": 37500 |
| }, |
| { |
| "epoch": 0.5712439806210038, |
| "grad_norm": 2.434232234954834, |
| "learning_rate": 2.2474916387959865e-06, |
| "loss": 5.0049, |
| "step": 37600 |
| }, |
| { |
| "epoch": 0.5712439806210038, |
| "eval_loss": 4.849723815917969, |
| "eval_runtime": 194.1152, |
| "eval_samples_per_second": 51.516, |
| "eval_steps_per_second": 6.439, |
| "step": 37600 |
| }, |
| { |
| "epoch": 0.5727632465269107, |
| "grad_norm": 1.9899414777755737, |
| "learning_rate": 2.237458193979933e-06, |
| "loss": 5.0034, |
| "step": 37700 |
| }, |
| { |
| "epoch": 0.5727632465269107, |
| "eval_loss": 4.845240592956543, |
| "eval_runtime": 194.1331, |
| "eval_samples_per_second": 51.511, |
| "eval_steps_per_second": 6.439, |
| "step": 37700 |
| }, |
| { |
| "epoch": 0.5742825124328176, |
| "grad_norm": 2.168919086456299, |
| "learning_rate": 2.2274247491638796e-06, |
| "loss": 4.9989, |
| "step": 37800 |
| }, |
| { |
| "epoch": 0.5742825124328176, |
| "eval_loss": 4.840480327606201, |
| "eval_runtime": 194.2185, |
| "eval_samples_per_second": 51.488, |
| "eval_steps_per_second": 6.436, |
| "step": 37800 |
| }, |
| { |
| "epoch": 0.5758017783387245, |
| "grad_norm": 2.4156546592712402, |
| "learning_rate": 2.217391304347826e-06, |
| "loss": 4.9981, |
| "step": 37900 |
| }, |
| { |
| "epoch": 0.5758017783387245, |
| "eval_loss": 4.837850570678711, |
| "eval_runtime": 194.2958, |
| "eval_samples_per_second": 51.468, |
| "eval_steps_per_second": 6.433, |
| "step": 37900 |
| }, |
| { |
| "epoch": 0.5773210442446315, |
| "grad_norm": 2.725648880004883, |
| "learning_rate": 2.2073578595317723e-06, |
| "loss": 4.9999, |
| "step": 38000 |
| }, |
| { |
| "epoch": 0.5773210442446315, |
| "eval_loss": 4.840028285980225, |
| "eval_runtime": 194.1552, |
| "eval_samples_per_second": 51.505, |
| "eval_steps_per_second": 6.438, |
| "step": 38000 |
| }, |
| { |
| "epoch": 0.5788403101505384, |
| "grad_norm": 2.447983503341675, |
| "learning_rate": 2.197324414715719e-06, |
| "loss": 4.9909, |
| "step": 38100 |
| }, |
| { |
| "epoch": 0.5788403101505384, |
| "eval_loss": 4.840633392333984, |
| "eval_runtime": 200.6709, |
| "eval_samples_per_second": 49.833, |
| "eval_steps_per_second": 6.229, |
| "step": 38100 |
| }, |
| { |
| "epoch": 0.5803595760564453, |
| "grad_norm": 2.5275213718414307, |
| "learning_rate": 2.1872909698996654e-06, |
| "loss": 4.9924, |
| "step": 38200 |
| }, |
| { |
| "epoch": 0.5803595760564453, |
| "eval_loss": 4.838108539581299, |
| "eval_runtime": 194.1742, |
| "eval_samples_per_second": 51.5, |
| "eval_steps_per_second": 6.438, |
| "step": 38200 |
| }, |
| { |
| "epoch": 0.5818788419623522, |
| "grad_norm": 3.0799427032470703, |
| "learning_rate": 2.177257525083612e-06, |
| "loss": 4.9892, |
| "step": 38300 |
| }, |
| { |
| "epoch": 0.5818788419623522, |
| "eval_loss": 4.830769062042236, |
| "eval_runtime": 194.1344, |
| "eval_samples_per_second": 51.511, |
| "eval_steps_per_second": 6.439, |
| "step": 38300 |
| }, |
| { |
| "epoch": 0.5833981078682592, |
| "grad_norm": 2.262266159057617, |
| "learning_rate": 2.1672240802675585e-06, |
| "loss": 4.9869, |
| "step": 38400 |
| }, |
| { |
| "epoch": 0.5833981078682592, |
| "eval_loss": 4.82758903503418, |
| "eval_runtime": 194.1511, |
| "eval_samples_per_second": 51.506, |
| "eval_steps_per_second": 6.438, |
| "step": 38400 |
| }, |
| { |
| "epoch": 0.5849173737741661, |
| "grad_norm": 2.2345926761627197, |
| "learning_rate": 2.157190635451505e-06, |
| "loss": 4.9879, |
| "step": 38500 |
| }, |
| { |
| "epoch": 0.5849173737741661, |
| "eval_loss": 4.826181888580322, |
| "eval_runtime": 194.2128, |
| "eval_samples_per_second": 51.49, |
| "eval_steps_per_second": 6.436, |
| "step": 38500 |
| }, |
| { |
| "epoch": 0.586436639680073, |
| "grad_norm": 1.8996378183364868, |
| "learning_rate": 2.1471571906354516e-06, |
| "loss": 4.9854, |
| "step": 38600 |
| }, |
| { |
| "epoch": 0.586436639680073, |
| "eval_loss": 4.823826789855957, |
| "eval_runtime": 193.9959, |
| "eval_samples_per_second": 51.547, |
| "eval_steps_per_second": 6.443, |
| "step": 38600 |
| }, |
| { |
| "epoch": 0.5879559055859799, |
| "grad_norm": 2.0965209007263184, |
| "learning_rate": 2.137123745819398e-06, |
| "loss": 4.9833, |
| "step": 38700 |
| }, |
| { |
| "epoch": 0.5879559055859799, |
| "eval_loss": 4.819667816162109, |
| "eval_runtime": 194.1201, |
| "eval_samples_per_second": 51.515, |
| "eval_steps_per_second": 6.439, |
| "step": 38700 |
| }, |
| { |
| "epoch": 0.5894751714918869, |
| "grad_norm": 2.005686044692993, |
| "learning_rate": 2.1270903010033443e-06, |
| "loss": 4.9753, |
| "step": 38800 |
| }, |
| { |
| "epoch": 0.5894751714918869, |
| "eval_loss": 4.818215847015381, |
| "eval_runtime": 194.3711, |
| "eval_samples_per_second": 51.448, |
| "eval_steps_per_second": 6.431, |
| "step": 38800 |
| }, |
| { |
| "epoch": 0.5909944373977938, |
| "grad_norm": 2.056711196899414, |
| "learning_rate": 2.117056856187291e-06, |
| "loss": 4.9729, |
| "step": 38900 |
| }, |
| { |
| "epoch": 0.5909944373977938, |
| "eval_loss": 4.815535068511963, |
| "eval_runtime": 194.0241, |
| "eval_samples_per_second": 51.54, |
| "eval_steps_per_second": 6.442, |
| "step": 38900 |
| }, |
| { |
| "epoch": 0.5925137033037007, |
| "grad_norm": 2.186563730239868, |
| "learning_rate": 2.1070234113712374e-06, |
| "loss": 4.9738, |
| "step": 39000 |
| }, |
| { |
| "epoch": 0.5925137033037007, |
| "eval_loss": 4.811450958251953, |
| "eval_runtime": 193.7645, |
| "eval_samples_per_second": 51.609, |
| "eval_steps_per_second": 6.451, |
| "step": 39000 |
| }, |
| { |
| "epoch": 0.5940329692096076, |
| "grad_norm": 2.0862069129943848, |
| "learning_rate": 2.0969899665551837e-06, |
| "loss": 4.9714, |
| "step": 39100 |
| }, |
| { |
| "epoch": 0.5940329692096076, |
| "eval_loss": 4.812065601348877, |
| "eval_runtime": 193.9277, |
| "eval_samples_per_second": 51.566, |
| "eval_steps_per_second": 6.446, |
| "step": 39100 |
| }, |
| { |
| "epoch": 0.5955522351155146, |
| "grad_norm": 2.3990869522094727, |
| "learning_rate": 2.0869565217391305e-06, |
| "loss": 4.9745, |
| "step": 39200 |
| }, |
| { |
| "epoch": 0.5955522351155146, |
| "eval_loss": 4.809053421020508, |
| "eval_runtime": 194.0213, |
| "eval_samples_per_second": 51.541, |
| "eval_steps_per_second": 6.443, |
| "step": 39200 |
| }, |
| { |
| "epoch": 0.5970715010214215, |
| "grad_norm": 2.380688428878784, |
| "learning_rate": 2.076923076923077e-06, |
| "loss": 4.9709, |
| "step": 39300 |
| }, |
| { |
| "epoch": 0.5970715010214215, |
| "eval_loss": 4.810598373413086, |
| "eval_runtime": 193.7901, |
| "eval_samples_per_second": 51.602, |
| "eval_steps_per_second": 6.45, |
| "step": 39300 |
| }, |
| { |
| "epoch": 0.5985907669273284, |
| "grad_norm": 2.6398425102233887, |
| "learning_rate": 2.0668896321070236e-06, |
| "loss": 4.967, |
| "step": 39400 |
| }, |
| { |
| "epoch": 0.5985907669273284, |
| "eval_loss": 4.807140827178955, |
| "eval_runtime": 193.6791, |
| "eval_samples_per_second": 51.632, |
| "eval_steps_per_second": 6.454, |
| "step": 39400 |
| }, |
| { |
| "epoch": 0.6001100328332353, |
| "grad_norm": 2.365203619003296, |
| "learning_rate": 2.05685618729097e-06, |
| "loss": 4.9623, |
| "step": 39500 |
| }, |
| { |
| "epoch": 0.6001100328332353, |
| "eval_loss": 4.804749011993408, |
| "eval_runtime": 193.728, |
| "eval_samples_per_second": 51.619, |
| "eval_steps_per_second": 6.452, |
| "step": 39500 |
| }, |
| { |
| "epoch": 0.6016292987391423, |
| "grad_norm": 2.6509780883789062, |
| "learning_rate": 2.0468227424749163e-06, |
| "loss": 4.963, |
| "step": 39600 |
| }, |
| { |
| "epoch": 0.6016292987391423, |
| "eval_loss": 4.8039093017578125, |
| "eval_runtime": 194.0108, |
| "eval_samples_per_second": 51.544, |
| "eval_steps_per_second": 6.443, |
| "step": 39600 |
| }, |
| { |
| "epoch": 0.6031485646450492, |
| "grad_norm": 2.182466506958008, |
| "learning_rate": 2.036789297658863e-06, |
| "loss": 4.9585, |
| "step": 39700 |
| }, |
| { |
| "epoch": 0.6031485646450492, |
| "eval_loss": 4.798705577850342, |
| "eval_runtime": 194.1051, |
| "eval_samples_per_second": 51.518, |
| "eval_steps_per_second": 6.44, |
| "step": 39700 |
| }, |
| { |
| "epoch": 0.6046678305509561, |
| "grad_norm": 1.9312145709991455, |
| "learning_rate": 2.0267558528428094e-06, |
| "loss": 4.9604, |
| "step": 39800 |
| }, |
| { |
| "epoch": 0.6046678305509561, |
| "eval_loss": 4.799111843109131, |
| "eval_runtime": 194.0025, |
| "eval_samples_per_second": 51.546, |
| "eval_steps_per_second": 6.443, |
| "step": 39800 |
| }, |
| { |
| "epoch": 0.606187096456863, |
| "grad_norm": 2.0514976978302, |
| "learning_rate": 2.0167224080267557e-06, |
| "loss": 4.9551, |
| "step": 39900 |
| }, |
| { |
| "epoch": 0.606187096456863, |
| "eval_loss": 4.792530536651611, |
| "eval_runtime": 194.0231, |
| "eval_samples_per_second": 51.54, |
| "eval_steps_per_second": 6.443, |
| "step": 39900 |
| }, |
| { |
| "epoch": 0.60770636236277, |
| "grad_norm": 2.4416747093200684, |
| "learning_rate": 2.0066889632107025e-06, |
| "loss": 4.9522, |
| "step": 40000 |
| }, |
| { |
| "epoch": 0.60770636236277, |
| "eval_loss": 4.7944655418396, |
| "eval_runtime": 194.2247, |
| "eval_samples_per_second": 51.487, |
| "eval_steps_per_second": 6.436, |
| "step": 40000 |
| }, |
| { |
| "epoch": 0.6092256282686769, |
| "grad_norm": 2.400484561920166, |
| "learning_rate": 1.996655518394649e-06, |
| "loss": 4.9543, |
| "step": 40100 |
| }, |
| { |
| "epoch": 0.6092256282686769, |
| "eval_loss": 4.793302059173584, |
| "eval_runtime": 194.2542, |
| "eval_samples_per_second": 51.479, |
| "eval_steps_per_second": 6.435, |
| "step": 40100 |
| }, |
| { |
| "epoch": 0.6107448941745838, |
| "grad_norm": 1.9967873096466064, |
| "learning_rate": 1.986622073578595e-06, |
| "loss": 4.9507, |
| "step": 40200 |
| }, |
| { |
| "epoch": 0.6107448941745838, |
| "eval_loss": 4.793440818786621, |
| "eval_runtime": 194.3425, |
| "eval_samples_per_second": 51.456, |
| "eval_steps_per_second": 6.432, |
| "step": 40200 |
| }, |
| { |
| "epoch": 0.6122641600804907, |
| "grad_norm": 1.917490839958191, |
| "learning_rate": 1.976588628762542e-06, |
| "loss": 4.9505, |
| "step": 40300 |
| }, |
| { |
| "epoch": 0.6122641600804907, |
| "eval_loss": 4.786988258361816, |
| "eval_runtime": 194.1703, |
| "eval_samples_per_second": 51.501, |
| "eval_steps_per_second": 6.438, |
| "step": 40300 |
| }, |
| { |
| "epoch": 0.6137834259863977, |
| "grad_norm": 2.4164531230926514, |
| "learning_rate": 1.9665551839464883e-06, |
| "loss": 4.9423, |
| "step": 40400 |
| }, |
| { |
| "epoch": 0.6137834259863977, |
| "eval_loss": 4.786272048950195, |
| "eval_runtime": 194.6058, |
| "eval_samples_per_second": 51.386, |
| "eval_steps_per_second": 6.423, |
| "step": 40400 |
| }, |
| { |
| "epoch": 0.6153026918923046, |
| "grad_norm": 2.5412399768829346, |
| "learning_rate": 1.956521739130435e-06, |
| "loss": 4.9447, |
| "step": 40500 |
| }, |
| { |
| "epoch": 0.6153026918923046, |
| "eval_loss": 4.785825729370117, |
| "eval_runtime": 194.2484, |
| "eval_samples_per_second": 51.48, |
| "eval_steps_per_second": 6.435, |
| "step": 40500 |
| }, |
| { |
| "epoch": 0.6168219577982115, |
| "grad_norm": 2.2212436199188232, |
| "learning_rate": 1.9464882943143814e-06, |
| "loss": 4.9432, |
| "step": 40600 |
| }, |
| { |
| "epoch": 0.6168219577982115, |
| "eval_loss": 4.7824625968933105, |
| "eval_runtime": 194.1967, |
| "eval_samples_per_second": 51.494, |
| "eval_steps_per_second": 6.437, |
| "step": 40600 |
| }, |
| { |
| "epoch": 0.6183412237041184, |
| "grad_norm": 2.1287331581115723, |
| "learning_rate": 1.9364548494983277e-06, |
| "loss": 4.9416, |
| "step": 40700 |
| }, |
| { |
| "epoch": 0.6183412237041184, |
| "eval_loss": 4.776528358459473, |
| "eval_runtime": 194.1119, |
| "eval_samples_per_second": 51.517, |
| "eval_steps_per_second": 6.44, |
| "step": 40700 |
| }, |
| { |
| "epoch": 0.6198604896100254, |
| "grad_norm": 1.8793989419937134, |
| "learning_rate": 1.9264214046822745e-06, |
| "loss": 4.9357, |
| "step": 40800 |
| }, |
| { |
| "epoch": 0.6198604896100254, |
| "eval_loss": 4.779613494873047, |
| "eval_runtime": 194.0243, |
| "eval_samples_per_second": 51.54, |
| "eval_steps_per_second": 6.442, |
| "step": 40800 |
| }, |
| { |
| "epoch": 0.6213797555159323, |
| "grad_norm": 1.943474531173706, |
| "learning_rate": 1.916387959866221e-06, |
| "loss": 4.9389, |
| "step": 40900 |
| }, |
| { |
| "epoch": 0.6213797555159323, |
| "eval_loss": 4.774796009063721, |
| "eval_runtime": 194.066, |
| "eval_samples_per_second": 51.529, |
| "eval_steps_per_second": 6.441, |
| "step": 40900 |
| }, |
| { |
| "epoch": 0.6228990214218392, |
| "grad_norm": 2.138035774230957, |
| "learning_rate": 1.9063545150501674e-06, |
| "loss": 4.9344, |
| "step": 41000 |
| }, |
| { |
| "epoch": 0.6228990214218392, |
| "eval_loss": 4.774413108825684, |
| "eval_runtime": 194.289, |
| "eval_samples_per_second": 51.47, |
| "eval_steps_per_second": 6.434, |
| "step": 41000 |
| }, |
| { |
| "epoch": 0.6244182873277461, |
| "grad_norm": 2.1911377906799316, |
| "learning_rate": 1.896321070234114e-06, |
| "loss": 4.9307, |
| "step": 41100 |
| }, |
| { |
| "epoch": 0.6244182873277461, |
| "eval_loss": 4.7724833488464355, |
| "eval_runtime": 194.0879, |
| "eval_samples_per_second": 51.523, |
| "eval_steps_per_second": 6.44, |
| "step": 41100 |
| }, |
| { |
| "epoch": 0.6259375532336531, |
| "grad_norm": 2.186774730682373, |
| "learning_rate": 1.8862876254180603e-06, |
| "loss": 4.9316, |
| "step": 41200 |
| }, |
| { |
| "epoch": 0.6259375532336531, |
| "eval_loss": 4.7727203369140625, |
| "eval_runtime": 193.9834, |
| "eval_samples_per_second": 51.551, |
| "eval_steps_per_second": 6.444, |
| "step": 41200 |
| }, |
| { |
| "epoch": 0.62745681913956, |
| "grad_norm": 2.706834554672241, |
| "learning_rate": 1.8762541806020068e-06, |
| "loss": 4.9244, |
| "step": 41300 |
| }, |
| { |
| "epoch": 0.62745681913956, |
| "eval_loss": 4.769220352172852, |
| "eval_runtime": 193.833, |
| "eval_samples_per_second": 51.591, |
| "eval_steps_per_second": 6.449, |
| "step": 41300 |
| }, |
| { |
| "epoch": 0.6289760850454669, |
| "grad_norm": 2.0782527923583984, |
| "learning_rate": 1.8662207357859534e-06, |
| "loss": 4.9308, |
| "step": 41400 |
| }, |
| { |
| "epoch": 0.6289760850454669, |
| "eval_loss": 4.769233703613281, |
| "eval_runtime": 193.7092, |
| "eval_samples_per_second": 51.624, |
| "eval_steps_per_second": 6.453, |
| "step": 41400 |
| }, |
| { |
| "epoch": 0.6304953509513738, |
| "grad_norm": 2.107680559158325, |
| "learning_rate": 1.8561872909699e-06, |
| "loss": 4.9286, |
| "step": 41500 |
| }, |
| { |
| "epoch": 0.6304953509513738, |
| "eval_loss": 4.765684604644775, |
| "eval_runtime": 193.8101, |
| "eval_samples_per_second": 51.597, |
| "eval_steps_per_second": 6.45, |
| "step": 41500 |
| }, |
| { |
| "epoch": 0.6320146168572808, |
| "grad_norm": 1.861700177192688, |
| "learning_rate": 1.8461538461538462e-06, |
| "loss": 4.925, |
| "step": 41600 |
| }, |
| { |
| "epoch": 0.6320146168572808, |
| "eval_loss": 4.761124134063721, |
| "eval_runtime": 194.0209, |
| "eval_samples_per_second": 51.541, |
| "eval_steps_per_second": 6.443, |
| "step": 41600 |
| }, |
| { |
| "epoch": 0.6335338827631877, |
| "grad_norm": 2.256538152694702, |
| "learning_rate": 1.8361204013377928e-06, |
| "loss": 4.9214, |
| "step": 41700 |
| }, |
| { |
| "epoch": 0.6335338827631877, |
| "eval_loss": 4.761186122894287, |
| "eval_runtime": 193.8553, |
| "eval_samples_per_second": 51.585, |
| "eval_steps_per_second": 6.448, |
| "step": 41700 |
| }, |
| { |
| "epoch": 0.6350531486690946, |
| "grad_norm": 1.720786213874817, |
| "learning_rate": 1.8260869565217394e-06, |
| "loss": 4.9188, |
| "step": 41800 |
| }, |
| { |
| "epoch": 0.6350531486690946, |
| "eval_loss": 4.75638484954834, |
| "eval_runtime": 194.0169, |
| "eval_samples_per_second": 51.542, |
| "eval_steps_per_second": 6.443, |
| "step": 41800 |
| }, |
| { |
| "epoch": 0.6365724145750015, |
| "grad_norm": 1.9223599433898926, |
| "learning_rate": 1.8160535117056857e-06, |
| "loss": 4.9162, |
| "step": 41900 |
| }, |
| { |
| "epoch": 0.6365724145750015, |
| "eval_loss": 4.757732391357422, |
| "eval_runtime": 194.1596, |
| "eval_samples_per_second": 51.504, |
| "eval_steps_per_second": 6.438, |
| "step": 41900 |
| }, |
| { |
| "epoch": 0.6380916804809085, |
| "grad_norm": 1.7804436683654785, |
| "learning_rate": 1.8060200668896322e-06, |
| "loss": 4.9158, |
| "step": 42000 |
| }, |
| { |
| "epoch": 0.6380916804809085, |
| "eval_loss": 4.757546424865723, |
| "eval_runtime": 194.2282, |
| "eval_samples_per_second": 51.486, |
| "eval_steps_per_second": 6.436, |
| "step": 42000 |
| }, |
| { |
| "epoch": 0.6396109463868154, |
| "grad_norm": 1.9580631256103516, |
| "learning_rate": 1.7959866220735788e-06, |
| "loss": 4.9095, |
| "step": 42100 |
| }, |
| { |
| "epoch": 0.6396109463868154, |
| "eval_loss": 4.752386093139648, |
| "eval_runtime": 194.1745, |
| "eval_samples_per_second": 51.5, |
| "eval_steps_per_second": 6.438, |
| "step": 42100 |
| }, |
| { |
| "epoch": 0.6411302122927223, |
| "grad_norm": 2.1417272090911865, |
| "learning_rate": 1.7859531772575253e-06, |
| "loss": 4.9134, |
| "step": 42200 |
| }, |
| { |
| "epoch": 0.6411302122927223, |
| "eval_loss": 4.749510765075684, |
| "eval_runtime": 194.1938, |
| "eval_samples_per_second": 51.495, |
| "eval_steps_per_second": 6.437, |
| "step": 42200 |
| }, |
| { |
| "epoch": 0.6426494781986292, |
| "grad_norm": 2.4839389324188232, |
| "learning_rate": 1.7759197324414717e-06, |
| "loss": 4.9116, |
| "step": 42300 |
| }, |
| { |
| "epoch": 0.6426494781986292, |
| "eval_loss": 4.752679824829102, |
| "eval_runtime": 194.1618, |
| "eval_samples_per_second": 51.503, |
| "eval_steps_per_second": 6.438, |
| "step": 42300 |
| }, |
| { |
| "epoch": 0.6441687441045362, |
| "grad_norm": 2.5596067905426025, |
| "learning_rate": 1.7658862876254182e-06, |
| "loss": 4.9078, |
| "step": 42400 |
| }, |
| { |
| "epoch": 0.6441687441045362, |
| "eval_loss": 4.742520332336426, |
| "eval_runtime": 194.1697, |
| "eval_samples_per_second": 51.501, |
| "eval_steps_per_second": 6.438, |
| "step": 42400 |
| }, |
| { |
| "epoch": 0.6456880100104431, |
| "grad_norm": 1.7020114660263062, |
| "learning_rate": 1.7558528428093648e-06, |
| "loss": 4.9063, |
| "step": 42500 |
| }, |
| { |
| "epoch": 0.6456880100104431, |
| "eval_loss": 4.745018005371094, |
| "eval_runtime": 194.2677, |
| "eval_samples_per_second": 51.475, |
| "eval_steps_per_second": 6.434, |
| "step": 42500 |
| }, |
| { |
| "epoch": 0.64720727591635, |
| "grad_norm": 1.83507239818573, |
| "learning_rate": 1.745819397993311e-06, |
| "loss": 4.9067, |
| "step": 42600 |
| }, |
| { |
| "epoch": 0.64720727591635, |
| "eval_loss": 4.749469757080078, |
| "eval_runtime": 194.1266, |
| "eval_samples_per_second": 51.513, |
| "eval_steps_per_second": 6.439, |
| "step": 42600 |
| }, |
| { |
| "epoch": 0.6487265418222569, |
| "grad_norm": 1.7852286100387573, |
| "learning_rate": 1.7357859531772575e-06, |
| "loss": 4.909, |
| "step": 42700 |
| }, |
| { |
| "epoch": 0.6487265418222569, |
| "eval_loss": 4.74142599105835, |
| "eval_runtime": 194.3029, |
| "eval_samples_per_second": 51.466, |
| "eval_steps_per_second": 6.433, |
| "step": 42700 |
| }, |
| { |
| "epoch": 0.6502458077281639, |
| "grad_norm": 1.9356688261032104, |
| "learning_rate": 1.7257525083612038e-06, |
| "loss": 4.8968, |
| "step": 42800 |
| }, |
| { |
| "epoch": 0.6502458077281639, |
| "eval_loss": 4.742361068725586, |
| "eval_runtime": 194.1912, |
| "eval_samples_per_second": 51.496, |
| "eval_steps_per_second": 6.437, |
| "step": 42800 |
| }, |
| { |
| "epoch": 0.6517650736340708, |
| "grad_norm": 2.4372880458831787, |
| "learning_rate": 1.7157190635451504e-06, |
| "loss": 4.9034, |
| "step": 42900 |
| }, |
| { |
| "epoch": 0.6517650736340708, |
| "eval_loss": 4.737247467041016, |
| "eval_runtime": 194.1333, |
| "eval_samples_per_second": 51.511, |
| "eval_steps_per_second": 6.439, |
| "step": 42900 |
| }, |
| { |
| "epoch": 0.6532843395399777, |
| "grad_norm": 1.9416236877441406, |
| "learning_rate": 1.705685618729097e-06, |
| "loss": 4.8978, |
| "step": 43000 |
| }, |
| { |
| "epoch": 0.6532843395399777, |
| "eval_loss": 4.7349853515625, |
| "eval_runtime": 194.1706, |
| "eval_samples_per_second": 51.501, |
| "eval_steps_per_second": 6.438, |
| "step": 43000 |
| }, |
| { |
| "epoch": 0.6548036054458846, |
| "grad_norm": 2.3514084815979004, |
| "learning_rate": 1.6956521739130435e-06, |
| "loss": 4.8963, |
| "step": 43100 |
| }, |
| { |
| "epoch": 0.6548036054458846, |
| "eval_loss": 4.7388434410095215, |
| "eval_runtime": 194.3316, |
| "eval_samples_per_second": 51.458, |
| "eval_steps_per_second": 6.432, |
| "step": 43100 |
| }, |
| { |
| "epoch": 0.6563228713517916, |
| "grad_norm": 2.028310537338257, |
| "learning_rate": 1.6856187290969898e-06, |
| "loss": 4.8961, |
| "step": 43200 |
| }, |
| { |
| "epoch": 0.6563228713517916, |
| "eval_loss": 4.735996723175049, |
| "eval_runtime": 194.0261, |
| "eval_samples_per_second": 51.539, |
| "eval_steps_per_second": 6.442, |
| "step": 43200 |
| }, |
| { |
| "epoch": 0.6578421372576985, |
| "grad_norm": 2.360321521759033, |
| "learning_rate": 1.6755852842809363e-06, |
| "loss": 4.8892, |
| "step": 43300 |
| }, |
| { |
| "epoch": 0.6578421372576985, |
| "eval_loss": 4.731908798217773, |
| "eval_runtime": 194.2382, |
| "eval_samples_per_second": 51.483, |
| "eval_steps_per_second": 6.435, |
| "step": 43300 |
| }, |
| { |
| "epoch": 0.6593614031636054, |
| "grad_norm": 2.0614426136016846, |
| "learning_rate": 1.665551839464883e-06, |
| "loss": 4.8911, |
| "step": 43400 |
| }, |
| { |
| "epoch": 0.6593614031636054, |
| "eval_loss": 4.727632999420166, |
| "eval_runtime": 194.0495, |
| "eval_samples_per_second": 51.533, |
| "eval_steps_per_second": 6.442, |
| "step": 43400 |
| }, |
| { |
| "epoch": 0.6608806690695123, |
| "grad_norm": 2.058509349822998, |
| "learning_rate": 1.6555183946488294e-06, |
| "loss": 4.8883, |
| "step": 43500 |
| }, |
| { |
| "epoch": 0.6608806690695123, |
| "eval_loss": 4.72844123840332, |
| "eval_runtime": 194.0456, |
| "eval_samples_per_second": 51.534, |
| "eval_steps_per_second": 6.442, |
| "step": 43500 |
| }, |
| { |
| "epoch": 0.6623999349754193, |
| "grad_norm": 1.7108250856399536, |
| "learning_rate": 1.6454849498327758e-06, |
| "loss": 4.8866, |
| "step": 43600 |
| }, |
| { |
| "epoch": 0.6623999349754193, |
| "eval_loss": 4.726889133453369, |
| "eval_runtime": 193.8998, |
| "eval_samples_per_second": 51.573, |
| "eval_steps_per_second": 6.447, |
| "step": 43600 |
| }, |
| { |
| "epoch": 0.6639192008813262, |
| "grad_norm": 1.871711254119873, |
| "learning_rate": 1.6354515050167223e-06, |
| "loss": 4.888, |
| "step": 43700 |
| }, |
| { |
| "epoch": 0.6639192008813262, |
| "eval_loss": 4.726442813873291, |
| "eval_runtime": 193.895, |
| "eval_samples_per_second": 51.574, |
| "eval_steps_per_second": 6.447, |
| "step": 43700 |
| }, |
| { |
| "epoch": 0.6654384667872331, |
| "grad_norm": 1.9516098499298096, |
| "learning_rate": 1.6254180602006689e-06, |
| "loss": 4.887, |
| "step": 43800 |
| }, |
| { |
| "epoch": 0.6654384667872331, |
| "eval_loss": 4.72707986831665, |
| "eval_runtime": 193.7412, |
| "eval_samples_per_second": 51.615, |
| "eval_steps_per_second": 6.452, |
| "step": 43800 |
| }, |
| { |
| "epoch": 0.66695773269314, |
| "grad_norm": 1.870690107345581, |
| "learning_rate": 1.6153846153846154e-06, |
| "loss": 4.8794, |
| "step": 43900 |
| }, |
| { |
| "epoch": 0.66695773269314, |
| "eval_loss": 4.7214789390563965, |
| "eval_runtime": 193.7498, |
| "eval_samples_per_second": 51.613, |
| "eval_steps_per_second": 6.452, |
| "step": 43900 |
| }, |
| { |
| "epoch": 0.668476998599047, |
| "grad_norm": 1.8577009439468384, |
| "learning_rate": 1.6053511705685618e-06, |
| "loss": 4.8803, |
| "step": 44000 |
| }, |
| { |
| "epoch": 0.668476998599047, |
| "eval_loss": 4.719671726226807, |
| "eval_runtime": 193.9858, |
| "eval_samples_per_second": 51.55, |
| "eval_steps_per_second": 6.444, |
| "step": 44000 |
| }, |
| { |
| "epoch": 0.6699962645049539, |
| "grad_norm": 2.1134140491485596, |
| "learning_rate": 1.5953177257525083e-06, |
| "loss": 4.879, |
| "step": 44100 |
| }, |
| { |
| "epoch": 0.6699962645049539, |
| "eval_loss": 4.717536926269531, |
| "eval_runtime": 193.7322, |
| "eval_samples_per_second": 51.618, |
| "eval_steps_per_second": 6.452, |
| "step": 44100 |
| }, |
| { |
| "epoch": 0.6715155304108608, |
| "grad_norm": 2.10524845123291, |
| "learning_rate": 1.5852842809364549e-06, |
| "loss": 4.8782, |
| "step": 44200 |
| }, |
| { |
| "epoch": 0.6715155304108608, |
| "eval_loss": 4.712420463562012, |
| "eval_runtime": 194.1278, |
| "eval_samples_per_second": 51.512, |
| "eval_steps_per_second": 6.439, |
| "step": 44200 |
| }, |
| { |
| "epoch": 0.6730347963167677, |
| "grad_norm": 1.9747872352600098, |
| "learning_rate": 1.5752508361204012e-06, |
| "loss": 4.8782, |
| "step": 44300 |
| }, |
| { |
| "epoch": 0.6730347963167677, |
| "eval_loss": 4.716573238372803, |
| "eval_runtime": 194.2422, |
| "eval_samples_per_second": 51.482, |
| "eval_steps_per_second": 6.435, |
| "step": 44300 |
| }, |
| { |
| "epoch": 0.6745540622226747, |
| "grad_norm": 1.9124640226364136, |
| "learning_rate": 1.5652173913043478e-06, |
| "loss": 4.8808, |
| "step": 44400 |
| }, |
| { |
| "epoch": 0.6745540622226747, |
| "eval_loss": 4.715909481048584, |
| "eval_runtime": 199.7142, |
| "eval_samples_per_second": 50.072, |
| "eval_steps_per_second": 6.259, |
| "step": 44400 |
| }, |
| { |
| "epoch": 0.6760733281285816, |
| "grad_norm": 1.971144676208496, |
| "learning_rate": 1.5551839464882943e-06, |
| "loss": 4.8739, |
| "step": 44500 |
| }, |
| { |
| "epoch": 0.6760733281285816, |
| "eval_loss": 4.714458465576172, |
| "eval_runtime": 194.1832, |
| "eval_samples_per_second": 51.498, |
| "eval_steps_per_second": 6.437, |
| "step": 44500 |
| }, |
| { |
| "epoch": 0.6775925940344885, |
| "grad_norm": 2.0993101596832275, |
| "learning_rate": 1.5451505016722409e-06, |
| "loss": 4.8733, |
| "step": 44600 |
| }, |
| { |
| "epoch": 0.6775925940344885, |
| "eval_loss": 4.708896636962891, |
| "eval_runtime": 194.1323, |
| "eval_samples_per_second": 51.511, |
| "eval_steps_per_second": 6.439, |
| "step": 44600 |
| }, |
| { |
| "epoch": 0.6791118599403954, |
| "grad_norm": 1.5517523288726807, |
| "learning_rate": 1.5351170568561872e-06, |
| "loss": 4.877, |
| "step": 44700 |
| }, |
| { |
| "epoch": 0.6791118599403954, |
| "eval_loss": 4.706016540527344, |
| "eval_runtime": 194.2224, |
| "eval_samples_per_second": 51.487, |
| "eval_steps_per_second": 6.436, |
| "step": 44700 |
| }, |
| { |
| "epoch": 0.6806311258463024, |
| "grad_norm": 1.6051702499389648, |
| "learning_rate": 1.5250836120401338e-06, |
| "loss": 4.873, |
| "step": 44800 |
| }, |
| { |
| "epoch": 0.6806311258463024, |
| "eval_loss": 4.71004581451416, |
| "eval_runtime": 194.3369, |
| "eval_samples_per_second": 51.457, |
| "eval_steps_per_second": 6.432, |
| "step": 44800 |
| }, |
| { |
| "epoch": 0.6821503917522093, |
| "grad_norm": 1.8578929901123047, |
| "learning_rate": 1.5150501672240803e-06, |
| "loss": 4.8645, |
| "step": 44900 |
| }, |
| { |
| "epoch": 0.6821503917522093, |
| "eval_loss": 4.7041826248168945, |
| "eval_runtime": 194.6352, |
| "eval_samples_per_second": 51.378, |
| "eval_steps_per_second": 6.422, |
| "step": 44900 |
| }, |
| { |
| "epoch": 0.6836696576581162, |
| "grad_norm": 1.8288882970809937, |
| "learning_rate": 1.5050167224080269e-06, |
| "loss": 4.8717, |
| "step": 45000 |
| }, |
| { |
| "epoch": 0.6836696576581162, |
| "eval_loss": 4.704262733459473, |
| "eval_runtime": 194.5807, |
| "eval_samples_per_second": 51.393, |
| "eval_steps_per_second": 6.424, |
| "step": 45000 |
| }, |
| { |
| "epoch": 0.6851889235640231, |
| "grad_norm": 1.766317367553711, |
| "learning_rate": 1.4949832775919732e-06, |
| "loss": 4.8658, |
| "step": 45100 |
| }, |
| { |
| "epoch": 0.6851889235640231, |
| "eval_loss": 4.700209140777588, |
| "eval_runtime": 194.2902, |
| "eval_samples_per_second": 51.469, |
| "eval_steps_per_second": 6.434, |
| "step": 45100 |
| }, |
| { |
| "epoch": 0.6867081894699301, |
| "grad_norm": 2.1722605228424072, |
| "learning_rate": 1.4849498327759198e-06, |
| "loss": 4.868, |
| "step": 45200 |
| }, |
| { |
| "epoch": 0.6867081894699301, |
| "eval_loss": 4.7045111656188965, |
| "eval_runtime": 194.3122, |
| "eval_samples_per_second": 51.464, |
| "eval_steps_per_second": 6.433, |
| "step": 45200 |
| }, |
| { |
| "epoch": 0.688227455375837, |
| "grad_norm": 2.2012276649475098, |
| "learning_rate": 1.4749163879598663e-06, |
| "loss": 4.861, |
| "step": 45300 |
| }, |
| { |
| "epoch": 0.688227455375837, |
| "eval_loss": 4.699077606201172, |
| "eval_runtime": 194.099, |
| "eval_samples_per_second": 51.52, |
| "eval_steps_per_second": 6.44, |
| "step": 45300 |
| }, |
| { |
| "epoch": 0.6897467212817439, |
| "grad_norm": 1.9373100996017456, |
| "learning_rate": 1.4648829431438129e-06, |
| "loss": 4.8624, |
| "step": 45400 |
| }, |
| { |
| "epoch": 0.6897467212817439, |
| "eval_loss": 4.699510097503662, |
| "eval_runtime": 194.2648, |
| "eval_samples_per_second": 51.476, |
| "eval_steps_per_second": 6.435, |
| "step": 45400 |
| }, |
| { |
| "epoch": 0.6912659871876508, |
| "grad_norm": 1.5436214208602905, |
| "learning_rate": 1.4548494983277592e-06, |
| "loss": 4.8669, |
| "step": 45500 |
| }, |
| { |
| "epoch": 0.6912659871876508, |
| "eval_loss": 4.6950531005859375, |
| "eval_runtime": 194.1491, |
| "eval_samples_per_second": 51.507, |
| "eval_steps_per_second": 6.438, |
| "step": 45500 |
| }, |
| { |
| "epoch": 0.6927852530935578, |
| "grad_norm": 1.868397831916809, |
| "learning_rate": 1.4448160535117058e-06, |
| "loss": 4.8588, |
| "step": 45600 |
| }, |
| { |
| "epoch": 0.6927852530935578, |
| "eval_loss": 4.699548244476318, |
| "eval_runtime": 194.2333, |
| "eval_samples_per_second": 51.484, |
| "eval_steps_per_second": 6.436, |
| "step": 45600 |
| }, |
| { |
| "epoch": 0.6943045189994647, |
| "grad_norm": 1.9601666927337646, |
| "learning_rate": 1.4347826086956523e-06, |
| "loss": 4.8583, |
| "step": 45700 |
| }, |
| { |
| "epoch": 0.6943045189994647, |
| "eval_loss": 4.697216510772705, |
| "eval_runtime": 194.113, |
| "eval_samples_per_second": 51.516, |
| "eval_steps_per_second": 6.44, |
| "step": 45700 |
| }, |
| { |
| "epoch": 0.6958237849053716, |
| "grad_norm": 2.128359317779541, |
| "learning_rate": 1.4247491638795989e-06, |
| "loss": 4.8553, |
| "step": 45800 |
| }, |
| { |
| "epoch": 0.6958237849053716, |
| "eval_loss": 4.695890426635742, |
| "eval_runtime": 194.2141, |
| "eval_samples_per_second": 51.49, |
| "eval_steps_per_second": 6.436, |
| "step": 45800 |
| }, |
| { |
| "epoch": 0.6973430508112785, |
| "grad_norm": 1.7737051248550415, |
| "learning_rate": 1.4147157190635452e-06, |
| "loss": 4.8552, |
| "step": 45900 |
| }, |
| { |
| "epoch": 0.6973430508112785, |
| "eval_loss": 4.692898273468018, |
| "eval_runtime": 194.0574, |
| "eval_samples_per_second": 51.531, |
| "eval_steps_per_second": 6.441, |
| "step": 45900 |
| }, |
| { |
| "epoch": 0.6988623167171855, |
| "grad_norm": 1.8772127628326416, |
| "learning_rate": 1.4046822742474917e-06, |
| "loss": 4.8528, |
| "step": 46000 |
| }, |
| { |
| "epoch": 0.6988623167171855, |
| "eval_loss": 4.690573215484619, |
| "eval_runtime": 193.9137, |
| "eval_samples_per_second": 51.569, |
| "eval_steps_per_second": 6.446, |
| "step": 46000 |
| }, |
| { |
| "epoch": 0.7003815826230924, |
| "grad_norm": 1.9277006387710571, |
| "learning_rate": 1.3946488294314383e-06, |
| "loss": 4.851, |
| "step": 46100 |
| }, |
| { |
| "epoch": 0.7003815826230924, |
| "eval_loss": 4.688443183898926, |
| "eval_runtime": 193.7729, |
| "eval_samples_per_second": 51.607, |
| "eval_steps_per_second": 6.451, |
| "step": 46100 |
| }, |
| { |
| "epoch": 0.7019008485289993, |
| "grad_norm": 1.4775947332382202, |
| "learning_rate": 1.3846153846153846e-06, |
| "loss": 4.8477, |
| "step": 46200 |
| }, |
| { |
| "epoch": 0.7019008485289993, |
| "eval_loss": 4.689602375030518, |
| "eval_runtime": 193.799, |
| "eval_samples_per_second": 51.6, |
| "eval_steps_per_second": 6.45, |
| "step": 46200 |
| }, |
| { |
| "epoch": 0.7034201144349062, |
| "grad_norm": 1.9227460622787476, |
| "learning_rate": 1.374581939799331e-06, |
| "loss": 4.8447, |
| "step": 46300 |
| }, |
| { |
| "epoch": 0.7034201144349062, |
| "eval_loss": 4.6872076988220215, |
| "eval_runtime": 193.971, |
| "eval_samples_per_second": 51.554, |
| "eval_steps_per_second": 6.444, |
| "step": 46300 |
| }, |
| { |
| "epoch": 0.7049393803408132, |
| "grad_norm": 1.8744120597839355, |
| "learning_rate": 1.3645484949832775e-06, |
| "loss": 4.8443, |
| "step": 46400 |
| }, |
| { |
| "epoch": 0.7049393803408132, |
| "eval_loss": 4.684128284454346, |
| "eval_runtime": 193.7921, |
| "eval_samples_per_second": 51.602, |
| "eval_steps_per_second": 6.45, |
| "step": 46400 |
| }, |
| { |
| "epoch": 0.7064586462467201, |
| "grad_norm": 1.858379602432251, |
| "learning_rate": 1.354515050167224e-06, |
| "loss": 4.8418, |
| "step": 46500 |
| }, |
| { |
| "epoch": 0.7064586462467201, |
| "eval_loss": 4.681851387023926, |
| "eval_runtime": 194.0781, |
| "eval_samples_per_second": 51.526, |
| "eval_steps_per_second": 6.441, |
| "step": 46500 |
| }, |
| { |
| "epoch": 0.707977912152627, |
| "grad_norm": 1.8594979047775269, |
| "learning_rate": 1.3444816053511706e-06, |
| "loss": 4.8433, |
| "step": 46600 |
| }, |
| { |
| "epoch": 0.707977912152627, |
| "eval_loss": 4.6782755851745605, |
| "eval_runtime": 194.1927, |
| "eval_samples_per_second": 51.495, |
| "eval_steps_per_second": 6.437, |
| "step": 46600 |
| }, |
| { |
| "epoch": 0.7094971780585339, |
| "grad_norm": 1.8931249380111694, |
| "learning_rate": 1.334448160535117e-06, |
| "loss": 4.8404, |
| "step": 46700 |
| }, |
| { |
| "epoch": 0.7094971780585339, |
| "eval_loss": 4.683481216430664, |
| "eval_runtime": 194.2623, |
| "eval_samples_per_second": 51.477, |
| "eval_steps_per_second": 6.435, |
| "step": 46700 |
| }, |
| { |
| "epoch": 0.7110164439644409, |
| "grad_norm": 1.5091091394424438, |
| "learning_rate": 1.3244147157190635e-06, |
| "loss": 4.8423, |
| "step": 46800 |
| }, |
| { |
| "epoch": 0.7110164439644409, |
| "eval_loss": 4.679195880889893, |
| "eval_runtime": 194.1717, |
| "eval_samples_per_second": 51.501, |
| "eval_steps_per_second": 6.438, |
| "step": 46800 |
| }, |
| { |
| "epoch": 0.7125357098703478, |
| "grad_norm": 1.5617057085037231, |
| "learning_rate": 1.31438127090301e-06, |
| "loss": 4.8384, |
| "step": 46900 |
| }, |
| { |
| "epoch": 0.7125357098703478, |
| "eval_loss": 4.675555229187012, |
| "eval_runtime": 195.1352, |
| "eval_samples_per_second": 51.247, |
| "eval_steps_per_second": 6.406, |
| "step": 46900 |
| }, |
| { |
| "epoch": 0.7140549757762547, |
| "grad_norm": 1.5074530839920044, |
| "learning_rate": 1.3043478260869566e-06, |
| "loss": 4.8389, |
| "step": 47000 |
| }, |
| { |
| "epoch": 0.7140549757762547, |
| "eval_loss": 4.67551851272583, |
| "eval_runtime": 194.1856, |
| "eval_samples_per_second": 51.497, |
| "eval_steps_per_second": 6.437, |
| "step": 47000 |
| }, |
| { |
| "epoch": 0.7155742416821615, |
| "grad_norm": 1.5850820541381836, |
| "learning_rate": 1.294314381270903e-06, |
| "loss": 4.8428, |
| "step": 47100 |
| }, |
| { |
| "epoch": 0.7155742416821615, |
| "eval_loss": 4.677995681762695, |
| "eval_runtime": 194.3756, |
| "eval_samples_per_second": 51.447, |
| "eval_steps_per_second": 6.431, |
| "step": 47100 |
| }, |
| { |
| "epoch": 0.7170935075880686, |
| "grad_norm": 1.7918612957000732, |
| "learning_rate": 1.2842809364548495e-06, |
| "loss": 4.8399, |
| "step": 47200 |
| }, |
| { |
| "epoch": 0.7170935075880686, |
| "eval_loss": 4.672911167144775, |
| "eval_runtime": 194.224, |
| "eval_samples_per_second": 51.487, |
| "eval_steps_per_second": 6.436, |
| "step": 47200 |
| }, |
| { |
| "epoch": 0.7186127734939755, |
| "grad_norm": 1.955620527267456, |
| "learning_rate": 1.274247491638796e-06, |
| "loss": 4.8338, |
| "step": 47300 |
| }, |
| { |
| "epoch": 0.7186127734939755, |
| "eval_loss": 4.67067289352417, |
| "eval_runtime": 194.1805, |
| "eval_samples_per_second": 51.498, |
| "eval_steps_per_second": 6.437, |
| "step": 47300 |
| }, |
| { |
| "epoch": 0.7201320393998824, |
| "grad_norm": 1.994454264640808, |
| "learning_rate": 1.2642140468227424e-06, |
| "loss": 4.8314, |
| "step": 47400 |
| }, |
| { |
| "epoch": 0.7201320393998824, |
| "eval_loss": 4.672824859619141, |
| "eval_runtime": 194.2432, |
| "eval_samples_per_second": 51.482, |
| "eval_steps_per_second": 6.435, |
| "step": 47400 |
| }, |
| { |
| "epoch": 0.7216513053057892, |
| "grad_norm": 1.8769866228103638, |
| "learning_rate": 1.254180602006689e-06, |
| "loss": 4.8321, |
| "step": 47500 |
| }, |
| { |
| "epoch": 0.7216513053057892, |
| "eval_loss": 4.67031717300415, |
| "eval_runtime": 194.2044, |
| "eval_samples_per_second": 51.492, |
| "eval_steps_per_second": 6.437, |
| "step": 47500 |
| }, |
| { |
| "epoch": 0.7231705712116963, |
| "grad_norm": 1.7346811294555664, |
| "learning_rate": 1.2441471571906355e-06, |
| "loss": 4.8351, |
| "step": 47600 |
| }, |
| { |
| "epoch": 0.7231705712116963, |
| "eval_loss": 4.667263031005859, |
| "eval_runtime": 194.3279, |
| "eval_samples_per_second": 51.459, |
| "eval_steps_per_second": 6.432, |
| "step": 47600 |
| }, |
| { |
| "epoch": 0.7246898371176032, |
| "grad_norm": 2.0054638385772705, |
| "learning_rate": 1.234113712374582e-06, |
| "loss": 4.8311, |
| "step": 47700 |
| }, |
| { |
| "epoch": 0.7246898371176032, |
| "eval_loss": 4.670699119567871, |
| "eval_runtime": 194.3103, |
| "eval_samples_per_second": 51.464, |
| "eval_steps_per_second": 6.433, |
| "step": 47700 |
| }, |
| { |
| "epoch": 0.72620910302351, |
| "grad_norm": 1.9293532371520996, |
| "learning_rate": 1.2240802675585284e-06, |
| "loss": 4.8253, |
| "step": 47800 |
| }, |
| { |
| "epoch": 0.72620910302351, |
| "eval_loss": 4.665504455566406, |
| "eval_runtime": 194.5301, |
| "eval_samples_per_second": 51.406, |
| "eval_steps_per_second": 6.426, |
| "step": 47800 |
| }, |
| { |
| "epoch": 0.727728368929417, |
| "grad_norm": 1.612265944480896, |
| "learning_rate": 1.214046822742475e-06, |
| "loss": 4.826, |
| "step": 47900 |
| }, |
| { |
| "epoch": 0.727728368929417, |
| "eval_loss": 4.665849685668945, |
| "eval_runtime": 194.1122, |
| "eval_samples_per_second": 51.517, |
| "eval_steps_per_second": 6.44, |
| "step": 47900 |
| }, |
| { |
| "epoch": 0.729247634835324, |
| "grad_norm": 1.7139407396316528, |
| "learning_rate": 1.2040133779264215e-06, |
| "loss": 4.8267, |
| "step": 48000 |
| }, |
| { |
| "epoch": 0.729247634835324, |
| "eval_loss": 4.663412570953369, |
| "eval_runtime": 194.3177, |
| "eval_samples_per_second": 51.462, |
| "eval_steps_per_second": 6.433, |
| "step": 48000 |
| }, |
| { |
| "epoch": 0.7307669007412309, |
| "grad_norm": 1.8362255096435547, |
| "learning_rate": 1.193979933110368e-06, |
| "loss": 4.826, |
| "step": 48100 |
| }, |
| { |
| "epoch": 0.7307669007412309, |
| "eval_loss": 4.6637797355651855, |
| "eval_runtime": 194.0467, |
| "eval_samples_per_second": 51.534, |
| "eval_steps_per_second": 6.442, |
| "step": 48100 |
| }, |
| { |
| "epoch": 0.7322861666471377, |
| "grad_norm": 1.3808461427688599, |
| "learning_rate": 1.1839464882943144e-06, |
| "loss": 4.8203, |
| "step": 48200 |
| }, |
| { |
| "epoch": 0.7322861666471377, |
| "eval_loss": 4.66359281539917, |
| "eval_runtime": 194.0835, |
| "eval_samples_per_second": 51.524, |
| "eval_steps_per_second": 6.441, |
| "step": 48200 |
| }, |
| { |
| "epoch": 0.7338054325530446, |
| "grad_norm": 2.090758800506592, |
| "learning_rate": 1.173913043478261e-06, |
| "loss": 4.8246, |
| "step": 48300 |
| }, |
| { |
| "epoch": 0.7338054325530446, |
| "eval_loss": 4.658617973327637, |
| "eval_runtime": 193.919, |
| "eval_samples_per_second": 51.568, |
| "eval_steps_per_second": 6.446, |
| "step": 48300 |
| }, |
| { |
| "epoch": 0.7353246984589517, |
| "grad_norm": 1.410666584968567, |
| "learning_rate": 1.1638795986622075e-06, |
| "loss": 4.8198, |
| "step": 48400 |
| }, |
| { |
| "epoch": 0.7353246984589517, |
| "eval_loss": 4.662432670593262, |
| "eval_runtime": 193.8752, |
| "eval_samples_per_second": 51.58, |
| "eval_steps_per_second": 6.447, |
| "step": 48400 |
| }, |
| { |
| "epoch": 0.7368439643648586, |
| "grad_norm": 1.5587624311447144, |
| "learning_rate": 1.153846153846154e-06, |
| "loss": 4.8185, |
| "step": 48500 |
| }, |
| { |
| "epoch": 0.7368439643648586, |
| "eval_loss": 4.656804084777832, |
| "eval_runtime": 193.773, |
| "eval_samples_per_second": 51.607, |
| "eval_steps_per_second": 6.451, |
| "step": 48500 |
| }, |
| { |
| "epoch": 0.7383632302707654, |
| "grad_norm": 1.3816115856170654, |
| "learning_rate": 1.1438127090301004e-06, |
| "loss": 4.8168, |
| "step": 48600 |
| }, |
| { |
| "epoch": 0.7383632302707654, |
| "eval_loss": 4.656231880187988, |
| "eval_runtime": 193.8565, |
| "eval_samples_per_second": 51.585, |
| "eval_steps_per_second": 6.448, |
| "step": 48600 |
| }, |
| { |
| "epoch": 0.7398824961766723, |
| "grad_norm": 1.927064299583435, |
| "learning_rate": 1.133779264214047e-06, |
| "loss": 4.8182, |
| "step": 48700 |
| }, |
| { |
| "epoch": 0.7398824961766723, |
| "eval_loss": 4.656589031219482, |
| "eval_runtime": 193.8744, |
| "eval_samples_per_second": 51.58, |
| "eval_steps_per_second": 6.447, |
| "step": 48700 |
| }, |
| { |
| "epoch": 0.7414017620825794, |
| "grad_norm": 1.6699544191360474, |
| "learning_rate": 1.1237458193979933e-06, |
| "loss": 4.8185, |
| "step": 48800 |
| }, |
| { |
| "epoch": 0.7414017620825794, |
| "eval_loss": 4.655017852783203, |
| "eval_runtime": 193.8675, |
| "eval_samples_per_second": 51.582, |
| "eval_steps_per_second": 6.448, |
| "step": 48800 |
| }, |
| { |
| "epoch": 0.7429210279884862, |
| "grad_norm": 1.3378312587738037, |
| "learning_rate": 1.1137123745819398e-06, |
| "loss": 4.815, |
| "step": 48900 |
| }, |
| { |
| "epoch": 0.7429210279884862, |
| "eval_loss": 4.657501220703125, |
| "eval_runtime": 194.1277, |
| "eval_samples_per_second": 51.512, |
| "eval_steps_per_second": 6.439, |
| "step": 48900 |
| }, |
| { |
| "epoch": 0.7444402938943931, |
| "grad_norm": 1.6146018505096436, |
| "learning_rate": 1.1036789297658862e-06, |
| "loss": 4.8145, |
| "step": 49000 |
| }, |
| { |
| "epoch": 0.7444402938943931, |
| "eval_loss": 4.6548943519592285, |
| "eval_runtime": 194.2412, |
| "eval_samples_per_second": 51.482, |
| "eval_steps_per_second": 6.435, |
| "step": 49000 |
| }, |
| { |
| "epoch": 0.7459595598003, |
| "grad_norm": 1.5952975749969482, |
| "learning_rate": 1.0936454849498327e-06, |
| "loss": 4.813, |
| "step": 49100 |
| }, |
| { |
| "epoch": 0.7459595598003, |
| "eval_loss": 4.651684284210205, |
| "eval_runtime": 194.2268, |
| "eval_samples_per_second": 51.486, |
| "eval_steps_per_second": 6.436, |
| "step": 49100 |
| }, |
| { |
| "epoch": 0.747478825706207, |
| "grad_norm": 1.5946011543273926, |
| "learning_rate": 1.0836120401337793e-06, |
| "loss": 4.8148, |
| "step": 49200 |
| }, |
| { |
| "epoch": 0.747478825706207, |
| "eval_loss": 4.651627540588379, |
| "eval_runtime": 194.2502, |
| "eval_samples_per_second": 51.48, |
| "eval_steps_per_second": 6.435, |
| "step": 49200 |
| }, |
| { |
| "epoch": 0.748998091612114, |
| "grad_norm": 1.4675341844558716, |
| "learning_rate": 1.0735785953177258e-06, |
| "loss": 4.81, |
| "step": 49300 |
| }, |
| { |
| "epoch": 0.748998091612114, |
| "eval_loss": 4.650761127471924, |
| "eval_runtime": 194.1574, |
| "eval_samples_per_second": 51.505, |
| "eval_steps_per_second": 6.438, |
| "step": 49300 |
| }, |
| { |
| "epoch": 0.7505173575180208, |
| "grad_norm": 1.6807961463928223, |
| "learning_rate": 1.0635451505016722e-06, |
| "loss": 4.8115, |
| "step": 49400 |
| }, |
| { |
| "epoch": 0.7505173575180208, |
| "eval_loss": 4.6511101722717285, |
| "eval_runtime": 194.374, |
| "eval_samples_per_second": 51.447, |
| "eval_steps_per_second": 6.431, |
| "step": 49400 |
| }, |
| { |
| "epoch": 0.7520366234239277, |
| "grad_norm": 1.4846396446228027, |
| "learning_rate": 1.0535117056856187e-06, |
| "loss": 4.8069, |
| "step": 49500 |
| }, |
| { |
| "epoch": 0.7520366234239277, |
| "eval_loss": 4.647155284881592, |
| "eval_runtime": 194.3314, |
| "eval_samples_per_second": 51.458, |
| "eval_steps_per_second": 6.432, |
| "step": 49500 |
| }, |
| { |
| "epoch": 0.7535558893298347, |
| "grad_norm": 1.5872676372528076, |
| "learning_rate": 1.0434782608695653e-06, |
| "loss": 4.8084, |
| "step": 49600 |
| }, |
| { |
| "epoch": 0.7535558893298347, |
| "eval_loss": 4.644804954528809, |
| "eval_runtime": 194.3764, |
| "eval_samples_per_second": 51.447, |
| "eval_steps_per_second": 6.431, |
| "step": 49600 |
| }, |
| { |
| "epoch": 0.7550751552357416, |
| "grad_norm": 1.6138330698013306, |
| "learning_rate": 1.0334448160535118e-06, |
| "loss": 4.8086, |
| "step": 49700 |
| }, |
| { |
| "epoch": 0.7550751552357416, |
| "eval_loss": 4.644802093505859, |
| "eval_runtime": 194.3935, |
| "eval_samples_per_second": 51.442, |
| "eval_steps_per_second": 6.43, |
| "step": 49700 |
| }, |
| { |
| "epoch": 0.7565944211416485, |
| "grad_norm": 1.6802724599838257, |
| "learning_rate": 1.0234113712374581e-06, |
| "loss": 4.8052, |
| "step": 49800 |
| }, |
| { |
| "epoch": 0.7565944211416485, |
| "eval_loss": 4.646471977233887, |
| "eval_runtime": 194.4012, |
| "eval_samples_per_second": 51.44, |
| "eval_steps_per_second": 6.43, |
| "step": 49800 |
| }, |
| { |
| "epoch": 0.7581136870475554, |
| "grad_norm": 1.7580209970474243, |
| "learning_rate": 1.0133779264214047e-06, |
| "loss": 4.805, |
| "step": 49900 |
| }, |
| { |
| "epoch": 0.7581136870475554, |
| "eval_loss": 4.641211032867432, |
| "eval_runtime": 194.5581, |
| "eval_samples_per_second": 51.399, |
| "eval_steps_per_second": 6.425, |
| "step": 49900 |
| }, |
| { |
| "epoch": 0.7596329529534624, |
| "grad_norm": 1.732718586921692, |
| "learning_rate": 1.0033444816053512e-06, |
| "loss": 4.803, |
| "step": 50000 |
| }, |
| { |
| "epoch": 0.7596329529534624, |
| "eval_loss": 4.643296241760254, |
| "eval_runtime": 194.3028, |
| "eval_samples_per_second": 51.466, |
| "eval_steps_per_second": 6.433, |
| "step": 50000 |
| }, |
| { |
| "epoch": 0.7611522188593693, |
| "grad_norm": 1.6775901317596436, |
| "learning_rate": 9.933110367892976e-07, |
| "loss": 4.8009, |
| "step": 50100 |
| }, |
| { |
| "epoch": 0.7611522188593693, |
| "eval_loss": 4.639660358428955, |
| "eval_runtime": 194.3411, |
| "eval_samples_per_second": 51.456, |
| "eval_steps_per_second": 6.432, |
| "step": 50100 |
| }, |
| { |
| "epoch": 0.7626714847652762, |
| "grad_norm": 1.4055508375167847, |
| "learning_rate": 9.832775919732441e-07, |
| "loss": 4.8022, |
| "step": 50200 |
| }, |
| { |
| "epoch": 0.7626714847652762, |
| "eval_loss": 4.637509346008301, |
| "eval_runtime": 194.3272, |
| "eval_samples_per_second": 51.46, |
| "eval_steps_per_second": 6.432, |
| "step": 50200 |
| }, |
| { |
| "epoch": 0.7641907506711831, |
| "grad_norm": 1.6316554546356201, |
| "learning_rate": 9.732441471571907e-07, |
| "loss": 4.8019, |
| "step": 50300 |
| }, |
| { |
| "epoch": 0.7641907506711831, |
| "eval_loss": 4.6399359703063965, |
| "eval_runtime": 194.5106, |
| "eval_samples_per_second": 51.411, |
| "eval_steps_per_second": 6.426, |
| "step": 50300 |
| }, |
| { |
| "epoch": 0.7657100165770901, |
| "grad_norm": 1.87636137008667, |
| "learning_rate": 9.632107023411372e-07, |
| "loss": 4.8021, |
| "step": 50400 |
| }, |
| { |
| "epoch": 0.7657100165770901, |
| "eval_loss": 4.637732028961182, |
| "eval_runtime": 194.1555, |
| "eval_samples_per_second": 51.505, |
| "eval_steps_per_second": 6.438, |
| "step": 50400 |
| }, |
| { |
| "epoch": 0.767229282482997, |
| "grad_norm": 1.5560215711593628, |
| "learning_rate": 9.531772575250837e-07, |
| "loss": 4.797, |
| "step": 50500 |
| }, |
| { |
| "epoch": 0.767229282482997, |
| "eval_loss": 4.636757850646973, |
| "eval_runtime": 194.187, |
| "eval_samples_per_second": 51.497, |
| "eval_steps_per_second": 6.437, |
| "step": 50500 |
| }, |
| { |
| "epoch": 0.7687485483889039, |
| "grad_norm": 1.5681828260421753, |
| "learning_rate": 9.431438127090301e-07, |
| "loss": 4.7981, |
| "step": 50600 |
| }, |
| { |
| "epoch": 0.7687485483889039, |
| "eval_loss": 4.63712215423584, |
| "eval_runtime": 194.0875, |
| "eval_samples_per_second": 51.523, |
| "eval_steps_per_second": 6.44, |
| "step": 50600 |
| }, |
| { |
| "epoch": 0.7702678142948108, |
| "grad_norm": 1.725135087966919, |
| "learning_rate": 9.331103678929767e-07, |
| "loss": 4.7988, |
| "step": 50700 |
| }, |
| { |
| "epoch": 0.7702678142948108, |
| "eval_loss": 4.633908271789551, |
| "eval_runtime": 193.9714, |
| "eval_samples_per_second": 51.554, |
| "eval_steps_per_second": 6.444, |
| "step": 50700 |
| }, |
| { |
| "epoch": 0.7717870802007178, |
| "grad_norm": 1.5292387008666992, |
| "learning_rate": 9.230769230769231e-07, |
| "loss": 4.7942, |
| "step": 50800 |
| }, |
| { |
| "epoch": 0.7717870802007178, |
| "eval_loss": 4.634795188903809, |
| "eval_runtime": 193.9264, |
| "eval_samples_per_second": 51.566, |
| "eval_steps_per_second": 6.446, |
| "step": 50800 |
| }, |
| { |
| "epoch": 0.7733063461066247, |
| "grad_norm": 1.313671350479126, |
| "learning_rate": 9.130434782608697e-07, |
| "loss": 4.7971, |
| "step": 50900 |
| }, |
| { |
| "epoch": 0.7733063461066247, |
| "eval_loss": 4.632637977600098, |
| "eval_runtime": 193.9004, |
| "eval_samples_per_second": 51.573, |
| "eval_steps_per_second": 6.447, |
| "step": 50900 |
| }, |
| { |
| "epoch": 0.7748256120125316, |
| "grad_norm": 1.3143532276153564, |
| "learning_rate": 9.030100334448161e-07, |
| "loss": 4.7945, |
| "step": 51000 |
| }, |
| { |
| "epoch": 0.7748256120125316, |
| "eval_loss": 4.6306681632995605, |
| "eval_runtime": 194.3643, |
| "eval_samples_per_second": 51.45, |
| "eval_steps_per_second": 6.431, |
| "step": 51000 |
| }, |
| { |
| "epoch": 0.7763448779184385, |
| "grad_norm": 1.3034121990203857, |
| "learning_rate": 8.929765886287627e-07, |
| "loss": 4.7888, |
| "step": 51100 |
| }, |
| { |
| "epoch": 0.7763448779184385, |
| "eval_loss": 4.629621982574463, |
| "eval_runtime": 194.0292, |
| "eval_samples_per_second": 51.539, |
| "eval_steps_per_second": 6.442, |
| "step": 51100 |
| }, |
| { |
| "epoch": 0.7778641438243455, |
| "grad_norm": 1.739376425743103, |
| "learning_rate": 8.829431438127091e-07, |
| "loss": 4.7934, |
| "step": 51200 |
| }, |
| { |
| "epoch": 0.7778641438243455, |
| "eval_loss": 4.62890625, |
| "eval_runtime": 194.358, |
| "eval_samples_per_second": 51.451, |
| "eval_steps_per_second": 6.431, |
| "step": 51200 |
| }, |
| { |
| "epoch": 0.7793834097302524, |
| "grad_norm": 1.3741992712020874, |
| "learning_rate": 8.729096989966555e-07, |
| "loss": 4.7887, |
| "step": 51300 |
| }, |
| { |
| "epoch": 0.7793834097302524, |
| "eval_loss": 4.625428199768066, |
| "eval_runtime": 194.4886, |
| "eval_samples_per_second": 51.417, |
| "eval_steps_per_second": 6.427, |
| "step": 51300 |
| }, |
| { |
| "epoch": 0.7809026756361593, |
| "grad_norm": 1.423168420791626, |
| "learning_rate": 8.628762541806019e-07, |
| "loss": 4.7888, |
| "step": 51400 |
| }, |
| { |
| "epoch": 0.7809026756361593, |
| "eval_loss": 4.626926422119141, |
| "eval_runtime": 194.5901, |
| "eval_samples_per_second": 51.39, |
| "eval_steps_per_second": 6.424, |
| "step": 51400 |
| }, |
| { |
| "epoch": 0.7824219415420662, |
| "grad_norm": 1.5038503408432007, |
| "learning_rate": 8.528428093645485e-07, |
| "loss": 4.791, |
| "step": 51500 |
| }, |
| { |
| "epoch": 0.7824219415420662, |
| "eval_loss": 4.630486488342285, |
| "eval_runtime": 194.4102, |
| "eval_samples_per_second": 51.438, |
| "eval_steps_per_second": 6.43, |
| "step": 51500 |
| }, |
| { |
| "epoch": 0.7839412074479732, |
| "grad_norm": 1.6092890501022339, |
| "learning_rate": 8.428093645484949e-07, |
| "loss": 4.7863, |
| "step": 51600 |
| }, |
| { |
| "epoch": 0.7839412074479732, |
| "eval_loss": 4.626857280731201, |
| "eval_runtime": 194.4616, |
| "eval_samples_per_second": 51.424, |
| "eval_steps_per_second": 6.428, |
| "step": 51600 |
| }, |
| { |
| "epoch": 0.7854604733538801, |
| "grad_norm": 1.6199829578399658, |
| "learning_rate": 8.327759197324414e-07, |
| "loss": 4.7875, |
| "step": 51700 |
| }, |
| { |
| "epoch": 0.7854604733538801, |
| "eval_loss": 4.623871326446533, |
| "eval_runtime": 194.5129, |
| "eval_samples_per_second": 51.41, |
| "eval_steps_per_second": 6.426, |
| "step": 51700 |
| }, |
| { |
| "epoch": 0.786979739259787, |
| "grad_norm": 1.33729088306427, |
| "learning_rate": 8.227424749163879e-07, |
| "loss": 4.7836, |
| "step": 51800 |
| }, |
| { |
| "epoch": 0.786979739259787, |
| "eval_loss": 4.625426769256592, |
| "eval_runtime": 194.4935, |
| "eval_samples_per_second": 51.416, |
| "eval_steps_per_second": 6.427, |
| "step": 51800 |
| }, |
| { |
| "epoch": 0.7884990051656939, |
| "grad_norm": 1.6848562955856323, |
| "learning_rate": 8.127090301003344e-07, |
| "loss": 4.7874, |
| "step": 51900 |
| }, |
| { |
| "epoch": 0.7884990051656939, |
| "eval_loss": 4.626620292663574, |
| "eval_runtime": 194.6564, |
| "eval_samples_per_second": 51.373, |
| "eval_steps_per_second": 6.422, |
| "step": 51900 |
| }, |
| { |
| "epoch": 0.7900182710716009, |
| "grad_norm": 1.2945283651351929, |
| "learning_rate": 8.026755852842809e-07, |
| "loss": 4.7892, |
| "step": 52000 |
| }, |
| { |
| "epoch": 0.7900182710716009, |
| "eval_loss": 4.624682903289795, |
| "eval_runtime": 194.5982, |
| "eval_samples_per_second": 51.388, |
| "eval_steps_per_second": 6.423, |
| "step": 52000 |
| }, |
| { |
| "epoch": 0.7915375369775078, |
| "grad_norm": 1.5469530820846558, |
| "learning_rate": 7.926421404682274e-07, |
| "loss": 4.7828, |
| "step": 52100 |
| }, |
| { |
| "epoch": 0.7915375369775078, |
| "eval_loss": 4.622786521911621, |
| "eval_runtime": 194.4896, |
| "eval_samples_per_second": 51.417, |
| "eval_steps_per_second": 6.427, |
| "step": 52100 |
| }, |
| { |
| "epoch": 0.7930568028834147, |
| "grad_norm": 1.4468382596969604, |
| "learning_rate": 7.826086956521739e-07, |
| "loss": 4.7772, |
| "step": 52200 |
| }, |
| { |
| "epoch": 0.7930568028834147, |
| "eval_loss": 4.625532150268555, |
| "eval_runtime": 194.4728, |
| "eval_samples_per_second": 51.421, |
| "eval_steps_per_second": 6.428, |
| "step": 52200 |
| }, |
| { |
| "epoch": 0.7945760687893216, |
| "grad_norm": 1.244032382965088, |
| "learning_rate": 7.725752508361204e-07, |
| "loss": 4.7794, |
| "step": 52300 |
| }, |
| { |
| "epoch": 0.7945760687893216, |
| "eval_loss": 4.621998310089111, |
| "eval_runtime": 194.4943, |
| "eval_samples_per_second": 51.415, |
| "eval_steps_per_second": 6.427, |
| "step": 52300 |
| }, |
| { |
| "epoch": 0.7960953346952286, |
| "grad_norm": 1.416409969329834, |
| "learning_rate": 7.625418060200669e-07, |
| "loss": 4.784, |
| "step": 52400 |
| }, |
| { |
| "epoch": 0.7960953346952286, |
| "eval_loss": 4.620311260223389, |
| "eval_runtime": 194.4398, |
| "eval_samples_per_second": 51.43, |
| "eval_steps_per_second": 6.429, |
| "step": 52400 |
| }, |
| { |
| "epoch": 0.7976146006011355, |
| "grad_norm": 1.3747918605804443, |
| "learning_rate": 7.525083612040134e-07, |
| "loss": 4.7776, |
| "step": 52500 |
| }, |
| { |
| "epoch": 0.7976146006011355, |
| "eval_loss": 4.619593143463135, |
| "eval_runtime": 194.5835, |
| "eval_samples_per_second": 51.392, |
| "eval_steps_per_second": 6.424, |
| "step": 52500 |
| }, |
| { |
| "epoch": 0.7991338665070424, |
| "grad_norm": 1.4532439708709717, |
| "learning_rate": 7.424749163879599e-07, |
| "loss": 4.7805, |
| "step": 52600 |
| }, |
| { |
| "epoch": 0.7991338665070424, |
| "eval_loss": 4.619747161865234, |
| "eval_runtime": 194.2642, |
| "eval_samples_per_second": 51.476, |
| "eval_steps_per_second": 6.435, |
| "step": 52600 |
| }, |
| { |
| "epoch": 0.8006531324129494, |
| "grad_norm": 1.34298574924469, |
| "learning_rate": 7.324414715719064e-07, |
| "loss": 4.7778, |
| "step": 52700 |
| }, |
| { |
| "epoch": 0.8006531324129494, |
| "eval_loss": 4.61711311340332, |
| "eval_runtime": 194.691, |
| "eval_samples_per_second": 51.363, |
| "eval_steps_per_second": 6.42, |
| "step": 52700 |
| }, |
| { |
| "epoch": 0.8021723983188563, |
| "grad_norm": 1.4666342735290527, |
| "learning_rate": 7.224080267558529e-07, |
| "loss": 4.7792, |
| "step": 52800 |
| }, |
| { |
| "epoch": 0.8021723983188563, |
| "eval_loss": 4.615002155303955, |
| "eval_runtime": 194.4007, |
| "eval_samples_per_second": 51.44, |
| "eval_steps_per_second": 6.43, |
| "step": 52800 |
| }, |
| { |
| "epoch": 0.8036916642247632, |
| "grad_norm": 1.1881191730499268, |
| "learning_rate": 7.123745819397994e-07, |
| "loss": 4.7789, |
| "step": 52900 |
| }, |
| { |
| "epoch": 0.8036916642247632, |
| "eval_loss": 4.613386154174805, |
| "eval_runtime": 194.3959, |
| "eval_samples_per_second": 51.441, |
| "eval_steps_per_second": 6.43, |
| "step": 52900 |
| }, |
| { |
| "epoch": 0.8052109301306701, |
| "grad_norm": 1.1752644777297974, |
| "learning_rate": 7.023411371237459e-07, |
| "loss": 4.7766, |
| "step": 53000 |
| }, |
| { |
| "epoch": 0.8052109301306701, |
| "eval_loss": 4.616655349731445, |
| "eval_runtime": 194.0766, |
| "eval_samples_per_second": 51.526, |
| "eval_steps_per_second": 6.441, |
| "step": 53000 |
| }, |
| { |
| "epoch": 0.8067301960365771, |
| "grad_norm": 1.3520350456237793, |
| "learning_rate": 6.923076923076923e-07, |
| "loss": 4.7748, |
| "step": 53100 |
| }, |
| { |
| "epoch": 0.8067301960365771, |
| "eval_loss": 4.616769313812256, |
| "eval_runtime": 194.2084, |
| "eval_samples_per_second": 51.491, |
| "eval_steps_per_second": 6.436, |
| "step": 53100 |
| }, |
| { |
| "epoch": 0.808249461942484, |
| "grad_norm": 1.5536683797836304, |
| "learning_rate": 6.822742474916388e-07, |
| "loss": 4.7798, |
| "step": 53200 |
| }, |
| { |
| "epoch": 0.808249461942484, |
| "eval_loss": 4.615866661071777, |
| "eval_runtime": 193.8562, |
| "eval_samples_per_second": 51.585, |
| "eval_steps_per_second": 6.448, |
| "step": 53200 |
| }, |
| { |
| "epoch": 0.8097687278483909, |
| "grad_norm": 1.2618976831436157, |
| "learning_rate": 6.722408026755853e-07, |
| "loss": 4.7762, |
| "step": 53300 |
| }, |
| { |
| "epoch": 0.8097687278483909, |
| "eval_loss": 4.616024017333984, |
| "eval_runtime": 193.9219, |
| "eval_samples_per_second": 51.567, |
| "eval_steps_per_second": 6.446, |
| "step": 53300 |
| }, |
| { |
| "epoch": 0.8112879937542978, |
| "grad_norm": 1.8162367343902588, |
| "learning_rate": 6.622073578595318e-07, |
| "loss": 4.7761, |
| "step": 53400 |
| }, |
| { |
| "epoch": 0.8112879937542978, |
| "eval_loss": 4.613333702087402, |
| "eval_runtime": 194.0415, |
| "eval_samples_per_second": 51.535, |
| "eval_steps_per_second": 6.442, |
| "step": 53400 |
| }, |
| { |
| "epoch": 0.8128072596602048, |
| "grad_norm": 1.1924686431884766, |
| "learning_rate": 6.521739130434783e-07, |
| "loss": 4.7721, |
| "step": 53500 |
| }, |
| { |
| "epoch": 0.8128072596602048, |
| "eval_loss": 4.615184307098389, |
| "eval_runtime": 193.9446, |
| "eval_samples_per_second": 51.561, |
| "eval_steps_per_second": 6.445, |
| "step": 53500 |
| }, |
| { |
| "epoch": 0.8143265255661117, |
| "grad_norm": 1.1603306531906128, |
| "learning_rate": 6.421404682274248e-07, |
| "loss": 4.7746, |
| "step": 53600 |
| }, |
| { |
| "epoch": 0.8143265255661117, |
| "eval_loss": 4.611873626708984, |
| "eval_runtime": 194.2028, |
| "eval_samples_per_second": 51.493, |
| "eval_steps_per_second": 6.437, |
| "step": 53600 |
| }, |
| { |
| "epoch": 0.8158457914720186, |
| "grad_norm": 1.202577829360962, |
| "learning_rate": 6.321070234113712e-07, |
| "loss": 4.7745, |
| "step": 53700 |
| }, |
| { |
| "epoch": 0.8158457914720186, |
| "eval_loss": 4.610635757446289, |
| "eval_runtime": 194.3713, |
| "eval_samples_per_second": 51.448, |
| "eval_steps_per_second": 6.431, |
| "step": 53700 |
| }, |
| { |
| "epoch": 0.8173650573779255, |
| "grad_norm": 1.3371776342391968, |
| "learning_rate": 6.220735785953178e-07, |
| "loss": 4.7755, |
| "step": 53800 |
| }, |
| { |
| "epoch": 0.8173650573779255, |
| "eval_loss": 4.611499786376953, |
| "eval_runtime": 194.3936, |
| "eval_samples_per_second": 51.442, |
| "eval_steps_per_second": 6.43, |
| "step": 53800 |
| }, |
| { |
| "epoch": 0.8188843232838325, |
| "grad_norm": 1.3666436672210693, |
| "learning_rate": 6.120401337792642e-07, |
| "loss": 4.7701, |
| "step": 53900 |
| }, |
| { |
| "epoch": 0.8188843232838325, |
| "eval_loss": 4.610349655151367, |
| "eval_runtime": 194.5735, |
| "eval_samples_per_second": 51.394, |
| "eval_steps_per_second": 6.424, |
| "step": 53900 |
| }, |
| { |
| "epoch": 0.8204035891897394, |
| "grad_norm": 1.4433395862579346, |
| "learning_rate": 6.020066889632107e-07, |
| "loss": 4.7743, |
| "step": 54000 |
| }, |
| { |
| "epoch": 0.8204035891897394, |
| "eval_loss": 4.610903263092041, |
| "eval_runtime": 194.367, |
| "eval_samples_per_second": 51.449, |
| "eval_steps_per_second": 6.431, |
| "step": 54000 |
| }, |
| { |
| "epoch": 0.8219228550956463, |
| "grad_norm": 1.2440968751907349, |
| "learning_rate": 5.919732441471572e-07, |
| "loss": 4.7701, |
| "step": 54100 |
| }, |
| { |
| "epoch": 0.8219228550956463, |
| "eval_loss": 4.611226558685303, |
| "eval_runtime": 194.4358, |
| "eval_samples_per_second": 51.431, |
| "eval_steps_per_second": 6.429, |
| "step": 54100 |
| }, |
| { |
| "epoch": 0.8234421210015532, |
| "grad_norm": 1.311020016670227, |
| "learning_rate": 5.819397993311037e-07, |
| "loss": 4.767, |
| "step": 54200 |
| }, |
| { |
| "epoch": 0.8234421210015532, |
| "eval_loss": 4.608744144439697, |
| "eval_runtime": 194.5925, |
| "eval_samples_per_second": 51.389, |
| "eval_steps_per_second": 6.424, |
| "step": 54200 |
| }, |
| { |
| "epoch": 0.8249613869074602, |
| "grad_norm": 1.2300583124160767, |
| "learning_rate": 5.719063545150502e-07, |
| "loss": 4.7713, |
| "step": 54300 |
| }, |
| { |
| "epoch": 0.8249613869074602, |
| "eval_loss": 4.607234477996826, |
| "eval_runtime": 194.4772, |
| "eval_samples_per_second": 51.42, |
| "eval_steps_per_second": 6.427, |
| "step": 54300 |
| }, |
| { |
| "epoch": 0.8264806528133671, |
| "grad_norm": 1.3106154203414917, |
| "learning_rate": 5.618729096989966e-07, |
| "loss": 4.7698, |
| "step": 54400 |
| }, |
| { |
| "epoch": 0.8264806528133671, |
| "eval_loss": 4.604393005371094, |
| "eval_runtime": 194.687, |
| "eval_samples_per_second": 51.364, |
| "eval_steps_per_second": 6.421, |
| "step": 54400 |
| }, |
| { |
| "epoch": 0.827999918719274, |
| "grad_norm": 1.2660140991210938, |
| "learning_rate": 5.518394648829431e-07, |
| "loss": 4.7655, |
| "step": 54500 |
| }, |
| { |
| "epoch": 0.827999918719274, |
| "eval_loss": 4.602825164794922, |
| "eval_runtime": 194.597, |
| "eval_samples_per_second": 51.388, |
| "eval_steps_per_second": 6.424, |
| "step": 54500 |
| }, |
| { |
| "epoch": 0.8295191846251809, |
| "grad_norm": 1.4443926811218262, |
| "learning_rate": 5.418060200668896e-07, |
| "loss": 4.7727, |
| "step": 54600 |
| }, |
| { |
| "epoch": 0.8295191846251809, |
| "eval_loss": 4.606249809265137, |
| "eval_runtime": 194.4722, |
| "eval_samples_per_second": 51.421, |
| "eval_steps_per_second": 6.428, |
| "step": 54600 |
| }, |
| { |
| "epoch": 0.8310384505310879, |
| "grad_norm": 1.339629888534546, |
| "learning_rate": 5.317725752508361e-07, |
| "loss": 4.7639, |
| "step": 54700 |
| }, |
| { |
| "epoch": 0.8310384505310879, |
| "eval_loss": 4.604907512664795, |
| "eval_runtime": 194.6528, |
| "eval_samples_per_second": 51.374, |
| "eval_steps_per_second": 6.422, |
| "step": 54700 |
| }, |
| { |
| "epoch": 0.8325577164369948, |
| "grad_norm": 1.2703863382339478, |
| "learning_rate": 5.217391304347826e-07, |
| "loss": 4.762, |
| "step": 54800 |
| }, |
| { |
| "epoch": 0.8325577164369948, |
| "eval_loss": 4.605154037475586, |
| "eval_runtime": 194.4518, |
| "eval_samples_per_second": 51.427, |
| "eval_steps_per_second": 6.428, |
| "step": 54800 |
| }, |
| { |
| "epoch": 0.8340769823429017, |
| "grad_norm": 1.1100186109542847, |
| "learning_rate": 5.117056856187291e-07, |
| "loss": 4.7635, |
| "step": 54900 |
| }, |
| { |
| "epoch": 0.8340769823429017, |
| "eval_loss": 4.603663444519043, |
| "eval_runtime": 194.4154, |
| "eval_samples_per_second": 51.436, |
| "eval_steps_per_second": 6.43, |
| "step": 54900 |
| }, |
| { |
| "epoch": 0.8355962482488086, |
| "grad_norm": 1.6119050979614258, |
| "learning_rate": 5.016722408026756e-07, |
| "loss": 4.7627, |
| "step": 55000 |
| }, |
| { |
| "epoch": 0.8355962482488086, |
| "eval_loss": 4.603806495666504, |
| "eval_runtime": 194.3808, |
| "eval_samples_per_second": 51.445, |
| "eval_steps_per_second": 6.431, |
| "step": 55000 |
| }, |
| { |
| "epoch": 0.8371155141547156, |
| "grad_norm": 1.22734534740448, |
| "learning_rate": 4.916387959866221e-07, |
| "loss": 4.764, |
| "step": 55100 |
| }, |
| { |
| "epoch": 0.8371155141547156, |
| "eval_loss": 4.604480266571045, |
| "eval_runtime": 194.4351, |
| "eval_samples_per_second": 51.431, |
| "eval_steps_per_second": 6.429, |
| "step": 55100 |
| }, |
| { |
| "epoch": 0.8386347800606225, |
| "grad_norm": 1.1762231588363647, |
| "learning_rate": 4.816053511705686e-07, |
| "loss": 4.7674, |
| "step": 55200 |
| }, |
| { |
| "epoch": 0.8386347800606225, |
| "eval_loss": 4.60023307800293, |
| "eval_runtime": 194.3592, |
| "eval_samples_per_second": 51.451, |
| "eval_steps_per_second": 6.431, |
| "step": 55200 |
| }, |
| { |
| "epoch": 0.8401540459665294, |
| "grad_norm": 1.0889923572540283, |
| "learning_rate": 4.7157190635451506e-07, |
| "loss": 4.762, |
| "step": 55300 |
| }, |
| { |
| "epoch": 0.8401540459665294, |
| "eval_loss": 4.5993733406066895, |
| "eval_runtime": 194.2327, |
| "eval_samples_per_second": 51.485, |
| "eval_steps_per_second": 6.436, |
| "step": 55300 |
| }, |
| { |
| "epoch": 0.8416733118724363, |
| "grad_norm": 1.2975116968154907, |
| "learning_rate": 4.6153846153846156e-07, |
| "loss": 4.7635, |
| "step": 55400 |
| }, |
| { |
| "epoch": 0.8416733118724363, |
| "eval_loss": 4.599579811096191, |
| "eval_runtime": 194.1059, |
| "eval_samples_per_second": 51.518, |
| "eval_steps_per_second": 6.44, |
| "step": 55400 |
| }, |
| { |
| "epoch": 0.8431925777783433, |
| "grad_norm": 1.257307767868042, |
| "learning_rate": 4.5150501672240806e-07, |
| "loss": 4.7607, |
| "step": 55500 |
| }, |
| { |
| "epoch": 0.8431925777783433, |
| "eval_loss": 4.602155685424805, |
| "eval_runtime": 193.9982, |
| "eval_samples_per_second": 51.547, |
| "eval_steps_per_second": 6.443, |
| "step": 55500 |
| }, |
| { |
| "epoch": 0.8447118436842502, |
| "grad_norm": 1.2345635890960693, |
| "learning_rate": 4.4147157190635456e-07, |
| "loss": 4.7602, |
| "step": 55600 |
| }, |
| { |
| "epoch": 0.8447118436842502, |
| "eval_loss": 4.60153341293335, |
| "eval_runtime": 193.9915, |
| "eval_samples_per_second": 51.549, |
| "eval_steps_per_second": 6.444, |
| "step": 55600 |
| }, |
| { |
| "epoch": 0.8462311095901571, |
| "grad_norm": 1.2262383699417114, |
| "learning_rate": 4.3143812709030095e-07, |
| "loss": 4.7619, |
| "step": 55700 |
| }, |
| { |
| "epoch": 0.8462311095901571, |
| "eval_loss": 4.600053310394287, |
| "eval_runtime": 194.0432, |
| "eval_samples_per_second": 51.535, |
| "eval_steps_per_second": 6.442, |
| "step": 55700 |
| }, |
| { |
| "epoch": 0.847750375496064, |
| "grad_norm": 1.3070259094238281, |
| "learning_rate": 4.2140468227424745e-07, |
| "loss": 4.7564, |
| "step": 55800 |
| }, |
| { |
| "epoch": 0.847750375496064, |
| "eval_loss": 4.597591876983643, |
| "eval_runtime": 193.9858, |
| "eval_samples_per_second": 51.55, |
| "eval_steps_per_second": 6.444, |
| "step": 55800 |
| }, |
| { |
| "epoch": 0.849269641401971, |
| "grad_norm": 1.2372263669967651, |
| "learning_rate": 4.1137123745819395e-07, |
| "loss": 4.7601, |
| "step": 55900 |
| }, |
| { |
| "epoch": 0.849269641401971, |
| "eval_loss": 4.602851867675781, |
| "eval_runtime": 194.1982, |
| "eval_samples_per_second": 51.494, |
| "eval_steps_per_second": 6.437, |
| "step": 55900 |
| }, |
| { |
| "epoch": 0.8507889073078779, |
| "grad_norm": 1.1839525699615479, |
| "learning_rate": 4.0133779264214045e-07, |
| "loss": 4.7609, |
| "step": 56000 |
| }, |
| { |
| "epoch": 0.8507889073078779, |
| "eval_loss": 4.595503330230713, |
| "eval_runtime": 194.4031, |
| "eval_samples_per_second": 51.44, |
| "eval_steps_per_second": 6.43, |
| "step": 56000 |
| }, |
| { |
| "epoch": 0.8523081732137848, |
| "grad_norm": 1.4197345972061157, |
| "learning_rate": 3.9130434782608694e-07, |
| "loss": 4.7594, |
| "step": 56100 |
| }, |
| { |
| "epoch": 0.8523081732137848, |
| "eval_loss": 4.59796142578125, |
| "eval_runtime": 194.4678, |
| "eval_samples_per_second": 51.422, |
| "eval_steps_per_second": 6.428, |
| "step": 56100 |
| }, |
| { |
| "epoch": 0.8538274391196917, |
| "grad_norm": 1.1221038103103638, |
| "learning_rate": 3.8127090301003344e-07, |
| "loss": 4.7568, |
| "step": 56200 |
| }, |
| { |
| "epoch": 0.8538274391196917, |
| "eval_loss": 4.596600532531738, |
| "eval_runtime": 194.6362, |
| "eval_samples_per_second": 51.378, |
| "eval_steps_per_second": 6.422, |
| "step": 56200 |
| }, |
| { |
| "epoch": 0.8553467050255987, |
| "grad_norm": 1.2606701850891113, |
| "learning_rate": 3.7123745819397994e-07, |
| "loss": 4.7543, |
| "step": 56300 |
| }, |
| { |
| "epoch": 0.8553467050255987, |
| "eval_loss": 4.598119258880615, |
| "eval_runtime": 194.3842, |
| "eval_samples_per_second": 51.445, |
| "eval_steps_per_second": 6.431, |
| "step": 56300 |
| }, |
| { |
| "epoch": 0.8568659709315056, |
| "grad_norm": 1.3233997821807861, |
| "learning_rate": 3.6120401337792644e-07, |
| "loss": 4.7576, |
| "step": 56400 |
| }, |
| { |
| "epoch": 0.8568659709315056, |
| "eval_loss": 4.596184730529785, |
| "eval_runtime": 194.3736, |
| "eval_samples_per_second": 51.447, |
| "eval_steps_per_second": 6.431, |
| "step": 56400 |
| }, |
| { |
| "epoch": 0.8583852368374125, |
| "grad_norm": 1.2004015445709229, |
| "learning_rate": 3.5117056856187294e-07, |
| "loss": 4.7616, |
| "step": 56500 |
| }, |
| { |
| "epoch": 0.8583852368374125, |
| "eval_loss": 4.594801425933838, |
| "eval_runtime": 194.5029, |
| "eval_samples_per_second": 51.413, |
| "eval_steps_per_second": 6.427, |
| "step": 56500 |
| }, |
| { |
| "epoch": 0.8599045027433194, |
| "grad_norm": 1.2479798793792725, |
| "learning_rate": 3.411371237458194e-07, |
| "loss": 4.7628, |
| "step": 56600 |
| }, |
| { |
| "epoch": 0.8599045027433194, |
| "eval_loss": 4.599001407623291, |
| "eval_runtime": 194.5013, |
| "eval_samples_per_second": 51.414, |
| "eval_steps_per_second": 6.427, |
| "step": 56600 |
| }, |
| { |
| "epoch": 0.8614237686492264, |
| "grad_norm": 1.2455825805664062, |
| "learning_rate": 3.311036789297659e-07, |
| "loss": 4.756, |
| "step": 56700 |
| }, |
| { |
| "epoch": 0.8614237686492264, |
| "eval_loss": 4.596933364868164, |
| "eval_runtime": 194.5096, |
| "eval_samples_per_second": 51.411, |
| "eval_steps_per_second": 6.426, |
| "step": 56700 |
| }, |
| { |
| "epoch": 0.8629430345551333, |
| "grad_norm": 1.2096078395843506, |
| "learning_rate": 3.210702341137124e-07, |
| "loss": 4.7603, |
| "step": 56800 |
| }, |
| { |
| "epoch": 0.8629430345551333, |
| "eval_loss": 4.5964884757995605, |
| "eval_runtime": 194.6292, |
| "eval_samples_per_second": 51.38, |
| "eval_steps_per_second": 6.422, |
| "step": 56800 |
| }, |
| { |
| "epoch": 0.8644623004610402, |
| "grad_norm": 0.9795971512794495, |
| "learning_rate": 3.110367892976589e-07, |
| "loss": 4.7533, |
| "step": 56900 |
| }, |
| { |
| "epoch": 0.8644623004610402, |
| "eval_loss": 4.594615459442139, |
| "eval_runtime": 194.678, |
| "eval_samples_per_second": 51.367, |
| "eval_steps_per_second": 6.421, |
| "step": 56900 |
| }, |
| { |
| "epoch": 0.8659815663669471, |
| "grad_norm": 1.3727303743362427, |
| "learning_rate": 3.010033444816054e-07, |
| "loss": 4.7547, |
| "step": 57000 |
| }, |
| { |
| "epoch": 0.8659815663669471, |
| "eval_loss": 4.596096515655518, |
| "eval_runtime": 194.5316, |
| "eval_samples_per_second": 51.406, |
| "eval_steps_per_second": 6.426, |
| "step": 57000 |
| }, |
| { |
| "epoch": 0.8675008322728541, |
| "grad_norm": 1.1338236331939697, |
| "learning_rate": 2.9096989966555187e-07, |
| "loss": 4.7542, |
| "step": 57100 |
| }, |
| { |
| "epoch": 0.8675008322728541, |
| "eval_loss": 4.5944132804870605, |
| "eval_runtime": 194.4861, |
| "eval_samples_per_second": 51.418, |
| "eval_steps_per_second": 6.427, |
| "step": 57100 |
| }, |
| { |
| "epoch": 0.869020098178761, |
| "grad_norm": 1.1638000011444092, |
| "learning_rate": 2.809364548494983e-07, |
| "loss": 4.7509, |
| "step": 57200 |
| }, |
| { |
| "epoch": 0.869020098178761, |
| "eval_loss": 4.593369483947754, |
| "eval_runtime": 194.5214, |
| "eval_samples_per_second": 51.408, |
| "eval_steps_per_second": 6.426, |
| "step": 57200 |
| }, |
| { |
| "epoch": 0.8705393640846679, |
| "grad_norm": 0.9814125299453735, |
| "learning_rate": 2.709030100334448e-07, |
| "loss": 4.7565, |
| "step": 57300 |
| }, |
| { |
| "epoch": 0.8705393640846679, |
| "eval_loss": 4.595996856689453, |
| "eval_runtime": 194.4485, |
| "eval_samples_per_second": 51.427, |
| "eval_steps_per_second": 6.428, |
| "step": 57300 |
| }, |
| { |
| "epoch": 0.8720586299905748, |
| "grad_norm": 1.0250178575515747, |
| "learning_rate": 2.608695652173913e-07, |
| "loss": 4.7568, |
| "step": 57400 |
| }, |
| { |
| "epoch": 0.8720586299905748, |
| "eval_loss": 4.59307336807251, |
| "eval_runtime": 194.4712, |
| "eval_samples_per_second": 51.421, |
| "eval_steps_per_second": 6.428, |
| "step": 57400 |
| }, |
| { |
| "epoch": 0.8735778958964818, |
| "grad_norm": 0.9920938014984131, |
| "learning_rate": 2.508361204013378e-07, |
| "loss": 4.7567, |
| "step": 57500 |
| }, |
| { |
| "epoch": 0.8735778958964818, |
| "eval_loss": 4.5949625968933105, |
| "eval_runtime": 194.5139, |
| "eval_samples_per_second": 51.41, |
| "eval_steps_per_second": 6.426, |
| "step": 57500 |
| }, |
| { |
| "epoch": 0.8750971618023887, |
| "grad_norm": 1.0698268413543701, |
| "learning_rate": 2.408026755852843e-07, |
| "loss": 4.749, |
| "step": 57600 |
| }, |
| { |
| "epoch": 0.8750971618023887, |
| "eval_loss": 4.590822696685791, |
| "eval_runtime": 194.2365, |
| "eval_samples_per_second": 51.484, |
| "eval_steps_per_second": 6.435, |
| "step": 57600 |
| }, |
| { |
| "epoch": 0.8766164277082956, |
| "grad_norm": 1.0088557004928589, |
| "learning_rate": 2.3076923076923078e-07, |
| "loss": 4.7556, |
| "step": 57700 |
| }, |
| { |
| "epoch": 0.8766164277082956, |
| "eval_loss": 4.592673301696777, |
| "eval_runtime": 194.2155, |
| "eval_samples_per_second": 51.489, |
| "eval_steps_per_second": 6.436, |
| "step": 57700 |
| }, |
| { |
| "epoch": 0.8781356936142025, |
| "grad_norm": 0.989743173122406, |
| "learning_rate": 2.2073578595317728e-07, |
| "loss": 4.755, |
| "step": 57800 |
| }, |
| { |
| "epoch": 0.8781356936142025, |
| "eval_loss": 4.594258785247803, |
| "eval_runtime": 194.0368, |
| "eval_samples_per_second": 51.537, |
| "eval_steps_per_second": 6.442, |
| "step": 57800 |
| }, |
| { |
| "epoch": 0.8796549595201095, |
| "grad_norm": 1.0593464374542236, |
| "learning_rate": 2.1070234113712372e-07, |
| "loss": 4.7553, |
| "step": 57900 |
| }, |
| { |
| "epoch": 0.8796549595201095, |
| "eval_loss": 4.591804504394531, |
| "eval_runtime": 193.9201, |
| "eval_samples_per_second": 51.568, |
| "eval_steps_per_second": 6.446, |
| "step": 57900 |
| }, |
| { |
| "epoch": 0.8811742254260164, |
| "grad_norm": 1.0415208339691162, |
| "learning_rate": 2.0066889632107022e-07, |
| "loss": 4.7526, |
| "step": 58000 |
| }, |
| { |
| "epoch": 0.8811742254260164, |
| "eval_loss": 4.591397285461426, |
| "eval_runtime": 194.0639, |
| "eval_samples_per_second": 51.529, |
| "eval_steps_per_second": 6.441, |
| "step": 58000 |
| }, |
| { |
| "epoch": 0.8826934913319233, |
| "grad_norm": 1.08748197555542, |
| "learning_rate": 1.9063545150501672e-07, |
| "loss": 4.748, |
| "step": 58100 |
| }, |
| { |
| "epoch": 0.8826934913319233, |
| "eval_loss": 4.592258930206299, |
| "eval_runtime": 193.9701, |
| "eval_samples_per_second": 51.554, |
| "eval_steps_per_second": 6.444, |
| "step": 58100 |
| }, |
| { |
| "epoch": 0.8842127572378302, |
| "grad_norm": 0.875297486782074, |
| "learning_rate": 1.8060200668896322e-07, |
| "loss": 4.754, |
| "step": 58200 |
| }, |
| { |
| "epoch": 0.8842127572378302, |
| "eval_loss": 4.590017318725586, |
| "eval_runtime": 193.921, |
| "eval_samples_per_second": 51.567, |
| "eval_steps_per_second": 6.446, |
| "step": 58200 |
| }, |
| { |
| "epoch": 0.8857320231437372, |
| "grad_norm": 0.9465267062187195, |
| "learning_rate": 1.705685618729097e-07, |
| "loss": 4.7541, |
| "step": 58300 |
| }, |
| { |
| "epoch": 0.8857320231437372, |
| "eval_loss": 4.590794563293457, |
| "eval_runtime": 193.8053, |
| "eval_samples_per_second": 51.598, |
| "eval_steps_per_second": 6.45, |
| "step": 58300 |
| }, |
| { |
| "epoch": 0.8872512890496441, |
| "grad_norm": 1.108864426612854, |
| "learning_rate": 1.605351170568562e-07, |
| "loss": 4.7545, |
| "step": 58400 |
| }, |
| { |
| "epoch": 0.8872512890496441, |
| "eval_loss": 4.590878963470459, |
| "eval_runtime": 194.0226, |
| "eval_samples_per_second": 51.54, |
| "eval_steps_per_second": 6.443, |
| "step": 58400 |
| }, |
| { |
| "epoch": 0.888770554955551, |
| "grad_norm": 0.9311940670013428, |
| "learning_rate": 1.505016722408027e-07, |
| "loss": 4.7537, |
| "step": 58500 |
| }, |
| { |
| "epoch": 0.888770554955551, |
| "eval_loss": 4.589045524597168, |
| "eval_runtime": 194.3809, |
| "eval_samples_per_second": 51.445, |
| "eval_steps_per_second": 6.431, |
| "step": 58500 |
| }, |
| { |
| "epoch": 0.8902898208614579, |
| "grad_norm": 1.122527003288269, |
| "learning_rate": 1.4046822742474916e-07, |
| "loss": 4.7517, |
| "step": 58600 |
| }, |
| { |
| "epoch": 0.8902898208614579, |
| "eval_loss": 4.590823650360107, |
| "eval_runtime": 194.4167, |
| "eval_samples_per_second": 51.436, |
| "eval_steps_per_second": 6.429, |
| "step": 58600 |
| }, |
| { |
| "epoch": 0.8918090867673649, |
| "grad_norm": 1.0384498834609985, |
| "learning_rate": 1.3043478260869566e-07, |
| "loss": 4.7491, |
| "step": 58700 |
| }, |
| { |
| "epoch": 0.8918090867673649, |
| "eval_loss": 4.5913920402526855, |
| "eval_runtime": 194.5657, |
| "eval_samples_per_second": 51.397, |
| "eval_steps_per_second": 6.425, |
| "step": 58700 |
| }, |
| { |
| "epoch": 0.8933283526732718, |
| "grad_norm": 0.9574987292289734, |
| "learning_rate": 1.2040133779264215e-07, |
| "loss": 4.7512, |
| "step": 58800 |
| }, |
| { |
| "epoch": 0.8933283526732718, |
| "eval_loss": 4.5905609130859375, |
| "eval_runtime": 194.7064, |
| "eval_samples_per_second": 51.359, |
| "eval_steps_per_second": 6.42, |
| "step": 58800 |
| }, |
| { |
| "epoch": 0.8948476185791787, |
| "grad_norm": 0.8835811614990234, |
| "learning_rate": 1.1036789297658864e-07, |
| "loss": 4.7493, |
| "step": 58900 |
| }, |
| { |
| "epoch": 0.8948476185791787, |
| "eval_loss": 4.59054708480835, |
| "eval_runtime": 194.6192, |
| "eval_samples_per_second": 51.382, |
| "eval_steps_per_second": 6.423, |
| "step": 58900 |
| }, |
| { |
| "epoch": 0.8963668844850856, |
| "grad_norm": 0.8485853672027588, |
| "learning_rate": 1.0033444816053511e-07, |
| "loss": 4.7494, |
| "step": 59000 |
| }, |
| { |
| "epoch": 0.8963668844850856, |
| "eval_loss": 4.590051651000977, |
| "eval_runtime": 194.7537, |
| "eval_samples_per_second": 51.347, |
| "eval_steps_per_second": 6.418, |
| "step": 59000 |
| }, |
| { |
| "epoch": 0.8978861503909926, |
| "grad_norm": 0.9415624737739563, |
| "learning_rate": 9.030100334448161e-08, |
| "loss": 4.7461, |
| "step": 59100 |
| }, |
| { |
| "epoch": 0.8978861503909926, |
| "eval_loss": 4.593616962432861, |
| "eval_runtime": 194.5307, |
| "eval_samples_per_second": 51.406, |
| "eval_steps_per_second": 6.426, |
| "step": 59100 |
| }, |
| { |
| "epoch": 0.8994054162968995, |
| "grad_norm": 1.0554380416870117, |
| "learning_rate": 8.02675585284281e-08, |
| "loss": 4.7523, |
| "step": 59200 |
| }, |
| { |
| "epoch": 0.8994054162968995, |
| "eval_loss": 4.5899271965026855, |
| "eval_runtime": 194.6175, |
| "eval_samples_per_second": 51.383, |
| "eval_steps_per_second": 6.423, |
| "step": 59200 |
| }, |
| { |
| "epoch": 0.9009246822028064, |
| "grad_norm": 0.8636355400085449, |
| "learning_rate": 7.023411371237458e-08, |
| "loss": 4.7511, |
| "step": 59300 |
| }, |
| { |
| "epoch": 0.9009246822028064, |
| "eval_loss": 4.590203285217285, |
| "eval_runtime": 194.5121, |
| "eval_samples_per_second": 51.411, |
| "eval_steps_per_second": 6.426, |
| "step": 59300 |
| }, |
| { |
| "epoch": 0.9024439481087133, |
| "grad_norm": 0.7755019068717957, |
| "learning_rate": 6.020066889632108e-08, |
| "loss": 4.7495, |
| "step": 59400 |
| }, |
| { |
| "epoch": 0.9024439481087133, |
| "eval_loss": 4.591348648071289, |
| "eval_runtime": 194.53, |
| "eval_samples_per_second": 51.406, |
| "eval_steps_per_second": 6.426, |
| "step": 59400 |
| }, |
| { |
| "epoch": 0.9039632140146203, |
| "grad_norm": 0.9905518293380737, |
| "learning_rate": 5.0167224080267556e-08, |
| "loss": 4.7507, |
| "step": 59500 |
| }, |
| { |
| "epoch": 0.9039632140146203, |
| "eval_loss": 4.590855121612549, |
| "eval_runtime": 194.486, |
| "eval_samples_per_second": 51.418, |
| "eval_steps_per_second": 6.427, |
| "step": 59500 |
| }, |
| { |
| "epoch": 0.9054824799205272, |
| "grad_norm": 0.8361491560935974, |
| "learning_rate": 4.013377926421405e-08, |
| "loss": 4.7508, |
| "step": 59600 |
| }, |
| { |
| "epoch": 0.9054824799205272, |
| "eval_loss": 4.588395118713379, |
| "eval_runtime": 194.4894, |
| "eval_samples_per_second": 51.417, |
| "eval_steps_per_second": 6.427, |
| "step": 59600 |
| }, |
| { |
| "epoch": 0.9070017458264341, |
| "grad_norm": 0.8528068661689758, |
| "learning_rate": 3.010033444816054e-08, |
| "loss": 4.7485, |
| "step": 59700 |
| }, |
| { |
| "epoch": 0.9070017458264341, |
| "eval_loss": 4.589570999145508, |
| "eval_runtime": 194.4665, |
| "eval_samples_per_second": 51.423, |
| "eval_steps_per_second": 6.428, |
| "step": 59700 |
| }, |
| { |
| "epoch": 0.908521011732341, |
| "grad_norm": 0.9023746252059937, |
| "learning_rate": 2.0066889632107024e-08, |
| "loss": 4.7502, |
| "step": 59800 |
| }, |
| { |
| "epoch": 0.908521011732341, |
| "eval_loss": 4.588865280151367, |
| "eval_runtime": 194.2101, |
| "eval_samples_per_second": 51.491, |
| "eval_steps_per_second": 6.436, |
| "step": 59800 |
| }, |
| { |
| "epoch": 0.910040277638248, |
| "grad_norm": 0.866371750831604, |
| "learning_rate": 1.0033444816053512e-08, |
| "loss": 4.7503, |
| "step": 59900 |
| }, |
| { |
| "epoch": 0.910040277638248, |
| "eval_loss": 4.589888095855713, |
| "eval_runtime": 194.6389, |
| "eval_samples_per_second": 51.377, |
| "eval_steps_per_second": 6.422, |
| "step": 59900 |
| }, |
| { |
| "epoch": 0.9115595435441549, |
| "grad_norm": 0.7748922109603882, |
| "learning_rate": 0.0, |
| "loss": 4.7524, |
| "step": 60000 |
| }, |
| { |
| "epoch": 0.9115595435441549, |
| "eval_loss": 4.588863372802734, |
| "eval_runtime": 194.1121, |
| "eval_samples_per_second": 51.517, |
| "eval_steps_per_second": 6.44, |
| "step": 60000 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 60000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 7.15783283933184e+19, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|