| { |
| "best_metric": 0.2536342144012451, |
| "best_model_checkpoint": "learning_source_20260316/compounds/bert-output/compounds-medium/checkpoint-57000", |
| "epoch": 5.783313694163914, |
| "eval_steps": 100, |
| "global_step": 60000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.009638856156939856, |
| "grad_norm": 7.281230449676514, |
| "learning_rate": 3e-06, |
| "loss": 3.8704, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.009638856156939856, |
| "eval_loss": 2.631474494934082, |
| "eval_runtime": 40.2777, |
| "eval_samples_per_second": 248.276, |
| "eval_steps_per_second": 31.035, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.019277712313879713, |
| "grad_norm": 11.868888854980469, |
| "learning_rate": 6e-06, |
| "loss": 2.555, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.019277712313879713, |
| "eval_loss": 2.3589468002319336, |
| "eval_runtime": 40.128, |
| "eval_samples_per_second": 249.202, |
| "eval_steps_per_second": 31.15, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.028916568470819567, |
| "grad_norm": 6.551002025604248, |
| "learning_rate": 5.989966555183947e-06, |
| "loss": 2.3092, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.028916568470819567, |
| "eval_loss": 2.1789472103118896, |
| "eval_runtime": 40.1341, |
| "eval_samples_per_second": 249.164, |
| "eval_steps_per_second": 31.146, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.038555424627759426, |
| "grad_norm": 8.500665664672852, |
| "learning_rate": 5.979933110367893e-06, |
| "loss": 2.1633, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.038555424627759426, |
| "eval_loss": 2.072150945663452, |
| "eval_runtime": 40.1372, |
| "eval_samples_per_second": 249.146, |
| "eval_steps_per_second": 31.143, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.04819428078469928, |
| "grad_norm": 13.641960144042969, |
| "learning_rate": 5.96989966555184e-06, |
| "loss": 2.0894, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.04819428078469928, |
| "eval_loss": 2.0400867462158203, |
| "eval_runtime": 40.2974, |
| "eval_samples_per_second": 248.155, |
| "eval_steps_per_second": 31.019, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.057833136941639135, |
| "grad_norm": 5.950716495513916, |
| "learning_rate": 5.959866220735786e-06, |
| "loss": 2.0528, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.057833136941639135, |
| "eval_loss": 2.0093040466308594, |
| "eval_runtime": 40.1283, |
| "eval_samples_per_second": 249.2, |
| "eval_steps_per_second": 31.15, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.06747199309857899, |
| "grad_norm": 9.331167221069336, |
| "learning_rate": 5.949832775919732e-06, |
| "loss": 2.0191, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.06747199309857899, |
| "eval_loss": 1.9733953475952148, |
| "eval_runtime": 40.1378, |
| "eval_samples_per_second": 249.142, |
| "eval_steps_per_second": 31.143, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.07711084925551885, |
| "grad_norm": 3.786700963973999, |
| "learning_rate": 5.939799331103679e-06, |
| "loss": 1.972, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.07711084925551885, |
| "eval_loss": 1.9259250164031982, |
| "eval_runtime": 40.1305, |
| "eval_samples_per_second": 249.187, |
| "eval_steps_per_second": 31.148, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.0867497054124587, |
| "grad_norm": 3.534658432006836, |
| "learning_rate": 5.929765886287626e-06, |
| "loss": 1.9195, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.0867497054124587, |
| "eval_loss": 1.8756057024002075, |
| "eval_runtime": 40.1635, |
| "eval_samples_per_second": 248.982, |
| "eval_steps_per_second": 31.123, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.09638856156939855, |
| "grad_norm": 6.269195556640625, |
| "learning_rate": 5.919732441471572e-06, |
| "loss": 1.8699, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.09638856156939855, |
| "eval_loss": 1.847890853881836, |
| "eval_runtime": 40.2907, |
| "eval_samples_per_second": 248.196, |
| "eval_steps_per_second": 31.025, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.10602741772633842, |
| "grad_norm": 3.2279481887817383, |
| "learning_rate": 5.9096989966555185e-06, |
| "loss": 1.828, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.10602741772633842, |
| "eval_loss": 1.8036913871765137, |
| "eval_runtime": 40.1142, |
| "eval_samples_per_second": 249.288, |
| "eval_steps_per_second": 31.161, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.11566627388327827, |
| "grad_norm": 3.0232768058776855, |
| "learning_rate": 5.899665551839465e-06, |
| "loss": 1.7788, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.11566627388327827, |
| "eval_loss": 1.7329744100570679, |
| "eval_runtime": 40.1179, |
| "eval_samples_per_second": 249.265, |
| "eval_steps_per_second": 31.158, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.12530513004021812, |
| "grad_norm": 3.1454615592956543, |
| "learning_rate": 5.889632107023412e-06, |
| "loss": 1.7228, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.12530513004021812, |
| "eval_loss": 1.6728882789611816, |
| "eval_runtime": 40.1057, |
| "eval_samples_per_second": 249.341, |
| "eval_steps_per_second": 31.168, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.13494398619715797, |
| "grad_norm": 2.789080858230591, |
| "learning_rate": 5.879598662207358e-06, |
| "loss": 1.6748, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.13494398619715797, |
| "eval_loss": 1.6212955713272095, |
| "eval_runtime": 40.2793, |
| "eval_samples_per_second": 248.267, |
| "eval_steps_per_second": 31.033, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.14458284235409785, |
| "grad_norm": 3.62225604057312, |
| "learning_rate": 5.869565217391305e-06, |
| "loss": 1.6172, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.14458284235409785, |
| "eval_loss": 1.5460147857666016, |
| "eval_runtime": 40.2924, |
| "eval_samples_per_second": 248.186, |
| "eval_steps_per_second": 31.023, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.1542216985110377, |
| "grad_norm": 2.5482733249664307, |
| "learning_rate": 5.8595317725752514e-06, |
| "loss": 1.5611, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.1542216985110377, |
| "eval_loss": 1.485274076461792, |
| "eval_runtime": 40.1252, |
| "eval_samples_per_second": 249.22, |
| "eval_steps_per_second": 31.153, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.16386055466797755, |
| "grad_norm": 1.7880430221557617, |
| "learning_rate": 5.849498327759197e-06, |
| "loss": 1.5098, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.16386055466797755, |
| "eval_loss": 1.4245718717575073, |
| "eval_runtime": 40.128, |
| "eval_samples_per_second": 249.202, |
| "eval_steps_per_second": 31.15, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.1734994108249174, |
| "grad_norm": 2.3302555084228516, |
| "learning_rate": 5.839464882943144e-06, |
| "loss": 1.4607, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.1734994108249174, |
| "eval_loss": 1.3619263172149658, |
| "eval_runtime": 40.1162, |
| "eval_samples_per_second": 249.276, |
| "eval_steps_per_second": 31.159, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.18313826698185726, |
| "grad_norm": 3.174801826477051, |
| "learning_rate": 5.829431438127091e-06, |
| "loss": 1.4193, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.18313826698185726, |
| "eval_loss": 1.2982574701309204, |
| "eval_runtime": 40.283, |
| "eval_samples_per_second": 248.244, |
| "eval_steps_per_second": 31.03, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.1927771231387971, |
| "grad_norm": 2.9484267234802246, |
| "learning_rate": 5.819397993311037e-06, |
| "loss": 1.3769, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.1927771231387971, |
| "eval_loss": 1.2819702625274658, |
| "eval_runtime": 40.3215, |
| "eval_samples_per_second": 248.007, |
| "eval_steps_per_second": 31.001, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.202415979295737, |
| "grad_norm": 1.8851428031921387, |
| "learning_rate": 5.8093645484949836e-06, |
| "loss": 1.34, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.202415979295737, |
| "eval_loss": 1.2403301000595093, |
| "eval_runtime": 40.2922, |
| "eval_samples_per_second": 248.187, |
| "eval_steps_per_second": 31.023, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.21205483545267684, |
| "grad_norm": 2.5137407779693604, |
| "learning_rate": 5.79933110367893e-06, |
| "loss": 1.3021, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.21205483545267684, |
| "eval_loss": 1.2023345232009888, |
| "eval_runtime": 40.1117, |
| "eval_samples_per_second": 249.304, |
| "eval_steps_per_second": 31.163, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.2216936916096167, |
| "grad_norm": 2.809570789337158, |
| "learning_rate": 5.789297658862876e-06, |
| "loss": 1.2718, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.2216936916096167, |
| "eval_loss": 1.1547870635986328, |
| "eval_runtime": 40.114, |
| "eval_samples_per_second": 249.29, |
| "eval_steps_per_second": 31.161, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.23133254776655654, |
| "grad_norm": 1.812084674835205, |
| "learning_rate": 5.779264214046823e-06, |
| "loss": 1.2342, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.23133254776655654, |
| "eval_loss": 1.113013744354248, |
| "eval_runtime": 40.1439, |
| "eval_samples_per_second": 249.104, |
| "eval_steps_per_second": 31.138, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.2409714039234964, |
| "grad_norm": 2.1630361080169678, |
| "learning_rate": 5.76923076923077e-06, |
| "loss": 1.21, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.2409714039234964, |
| "eval_loss": 1.0802332162857056, |
| "eval_runtime": 40.2724, |
| "eval_samples_per_second": 248.309, |
| "eval_steps_per_second": 31.039, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.25061026008043624, |
| "grad_norm": 2.5893359184265137, |
| "learning_rate": 5.759197324414716e-06, |
| "loss": 1.181, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.25061026008043624, |
| "eval_loss": 1.065743088722229, |
| "eval_runtime": 40.113, |
| "eval_samples_per_second": 249.296, |
| "eval_steps_per_second": 31.162, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.2602491162373761, |
| "grad_norm": 2.6710309982299805, |
| "learning_rate": 5.7491638795986624e-06, |
| "loss": 1.1569, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.2602491162373761, |
| "eval_loss": 1.0426665544509888, |
| "eval_runtime": 40.1155, |
| "eval_samples_per_second": 249.28, |
| "eval_steps_per_second": 31.16, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.26988797239431594, |
| "grad_norm": 2.068502902984619, |
| "learning_rate": 5.739130434782609e-06, |
| "loss": 1.1335, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.26988797239431594, |
| "eval_loss": 1.0142563581466675, |
| "eval_runtime": 40.1365, |
| "eval_samples_per_second": 249.15, |
| "eval_steps_per_second": 31.144, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.2795268285512558, |
| "grad_norm": 2.485072374343872, |
| "learning_rate": 5.729096989966555e-06, |
| "loss": 1.1049, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.2795268285512558, |
| "eval_loss": 0.988125205039978, |
| "eval_runtime": 40.1288, |
| "eval_samples_per_second": 249.198, |
| "eval_steps_per_second": 31.15, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.2891656847081957, |
| "grad_norm": 2.3873965740203857, |
| "learning_rate": 5.719063545150502e-06, |
| "loss": 1.0827, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.2891656847081957, |
| "eval_loss": 0.95792555809021, |
| "eval_runtime": 40.2725, |
| "eval_samples_per_second": 248.308, |
| "eval_steps_per_second": 31.039, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.2988045408651355, |
| "grad_norm": 2.1445956230163574, |
| "learning_rate": 5.709030100334449e-06, |
| "loss": 1.0598, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.2988045408651355, |
| "eval_loss": 0.9487287402153015, |
| "eval_runtime": 40.201, |
| "eval_samples_per_second": 248.75, |
| "eval_steps_per_second": 31.094, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.3084433970220754, |
| "grad_norm": 2.3310909271240234, |
| "learning_rate": 5.698996655518395e-06, |
| "loss": 1.041, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.3084433970220754, |
| "eval_loss": 0.9160116910934448, |
| "eval_runtime": 40.2197, |
| "eval_samples_per_second": 248.634, |
| "eval_steps_per_second": 31.079, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.31808225317901523, |
| "grad_norm": 2.3795151710510254, |
| "learning_rate": 5.688963210702341e-06, |
| "loss": 1.0179, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.31808225317901523, |
| "eval_loss": 0.8998765349388123, |
| "eval_runtime": 40.0659, |
| "eval_samples_per_second": 249.589, |
| "eval_steps_per_second": 31.199, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.3277211093359551, |
| "grad_norm": 2.6683619022369385, |
| "learning_rate": 5.678929765886288e-06, |
| "loss": 1.0022, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.3277211093359551, |
| "eval_loss": 0.8904856443405151, |
| "eval_runtime": 40.0602, |
| "eval_samples_per_second": 249.624, |
| "eval_steps_per_second": 31.203, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.33735996549289493, |
| "grad_norm": 2.24725079536438, |
| "learning_rate": 5.668896321070235e-06, |
| "loss": 0.9901, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.33735996549289493, |
| "eval_loss": 0.8752567172050476, |
| "eval_runtime": 40.0856, |
| "eval_samples_per_second": 249.466, |
| "eval_steps_per_second": 31.183, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.3469988216498348, |
| "grad_norm": 2.042285442352295, |
| "learning_rate": 5.658862876254181e-06, |
| "loss": 0.9708, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.3469988216498348, |
| "eval_loss": 0.8505752086639404, |
| "eval_runtime": 40.0593, |
| "eval_samples_per_second": 249.63, |
| "eval_steps_per_second": 31.204, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.3566376778067747, |
| "grad_norm": 1.9062834978103638, |
| "learning_rate": 5.6488294314381275e-06, |
| "loss": 0.9577, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.3566376778067747, |
| "eval_loss": 0.8313038945198059, |
| "eval_runtime": 40.2235, |
| "eval_samples_per_second": 248.611, |
| "eval_steps_per_second": 31.076, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.3662765339637145, |
| "grad_norm": 1.9753172397613525, |
| "learning_rate": 5.638795986622074e-06, |
| "loss": 0.94, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.3662765339637145, |
| "eval_loss": 0.8297803997993469, |
| "eval_runtime": 40.0249, |
| "eval_samples_per_second": 249.844, |
| "eval_steps_per_second": 31.231, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.3759153901206544, |
| "grad_norm": 2.3993759155273438, |
| "learning_rate": 5.62876254180602e-06, |
| "loss": 0.9221, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.3759153901206544, |
| "eval_loss": 0.8069967031478882, |
| "eval_runtime": 40.0327, |
| "eval_samples_per_second": 249.796, |
| "eval_steps_per_second": 31.224, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.3855542462775942, |
| "grad_norm": 2.130056858062744, |
| "learning_rate": 5.618729096989967e-06, |
| "loss": 0.9078, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.3855542462775942, |
| "eval_loss": 0.8021091818809509, |
| "eval_runtime": 40.0159, |
| "eval_samples_per_second": 249.901, |
| "eval_steps_per_second": 31.238, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.3951931024345341, |
| "grad_norm": 2.2790122032165527, |
| "learning_rate": 5.608695652173914e-06, |
| "loss": 0.8954, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.3951931024345341, |
| "eval_loss": 0.797188401222229, |
| "eval_runtime": 40.062, |
| "eval_samples_per_second": 249.613, |
| "eval_steps_per_second": 31.202, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.404831958591474, |
| "grad_norm": 2.6065945625305176, |
| "learning_rate": 5.59866220735786e-06, |
| "loss": 0.8816, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.404831958591474, |
| "eval_loss": 0.7731242179870605, |
| "eval_runtime": 40.1911, |
| "eval_samples_per_second": 248.811, |
| "eval_steps_per_second": 31.101, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.4144708147484138, |
| "grad_norm": 2.0770599842071533, |
| "learning_rate": 5.588628762541806e-06, |
| "loss": 0.869, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.4144708147484138, |
| "eval_loss": 0.7588244080543518, |
| "eval_runtime": 40.0263, |
| "eval_samples_per_second": 249.836, |
| "eval_steps_per_second": 31.229, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.4241096709053537, |
| "grad_norm": 2.3637514114379883, |
| "learning_rate": 5.578595317725753e-06, |
| "loss": 0.8572, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.4241096709053537, |
| "eval_loss": 0.754951000213623, |
| "eval_runtime": 40.0508, |
| "eval_samples_per_second": 249.683, |
| "eval_steps_per_second": 31.21, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.4337485270622935, |
| "grad_norm": 2.044037342071533, |
| "learning_rate": 5.568561872909699e-06, |
| "loss": 0.8505, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.4337485270622935, |
| "eval_loss": 0.7327916026115417, |
| "eval_runtime": 40.0028, |
| "eval_samples_per_second": 249.983, |
| "eval_steps_per_second": 31.248, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.4433873832192334, |
| "grad_norm": 1.824724793434143, |
| "learning_rate": 5.558528428093646e-06, |
| "loss": 0.8372, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.4433873832192334, |
| "eval_loss": 0.7417221665382385, |
| "eval_runtime": 40.1915, |
| "eval_samples_per_second": 248.809, |
| "eval_steps_per_second": 31.101, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.45302623937617326, |
| "grad_norm": 2.4582533836364746, |
| "learning_rate": 5.548494983277593e-06, |
| "loss": 0.8265, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.45302623937617326, |
| "eval_loss": 0.7247117757797241, |
| "eval_runtime": 39.9831, |
| "eval_samples_per_second": 250.105, |
| "eval_steps_per_second": 31.263, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.4626650955331131, |
| "grad_norm": 2.2355294227600098, |
| "learning_rate": 5.5384615384615385e-06, |
| "loss": 0.8125, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.4626650955331131, |
| "eval_loss": 0.7139586806297302, |
| "eval_runtime": 39.9822, |
| "eval_samples_per_second": 250.111, |
| "eval_steps_per_second": 31.264, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.47230395169005296, |
| "grad_norm": 2.021684169769287, |
| "learning_rate": 5.528428093645485e-06, |
| "loss": 0.8085, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.47230395169005296, |
| "eval_loss": 0.7031977772712708, |
| "eval_runtime": 39.9965, |
| "eval_samples_per_second": 250.022, |
| "eval_steps_per_second": 31.253, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.4819428078469928, |
| "grad_norm": 2.307088613510132, |
| "learning_rate": 5.518394648829432e-06, |
| "loss": 0.7937, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.4819428078469928, |
| "eval_loss": 0.6927512884140015, |
| "eval_runtime": 40.0327, |
| "eval_samples_per_second": 249.796, |
| "eval_steps_per_second": 31.225, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.49158166400393266, |
| "grad_norm": 1.824004888534546, |
| "learning_rate": 5.508361204013378e-06, |
| "loss": 0.7882, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.49158166400393266, |
| "eval_loss": 0.6837274432182312, |
| "eval_runtime": 40.1793, |
| "eval_samples_per_second": 248.884, |
| "eval_steps_per_second": 31.111, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.5012205201608725, |
| "grad_norm": 1.8306970596313477, |
| "learning_rate": 5.498327759197324e-06, |
| "loss": 0.7758, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.5012205201608725, |
| "eval_loss": 0.6825730204582214, |
| "eval_runtime": 40.0105, |
| "eval_samples_per_second": 249.934, |
| "eval_steps_per_second": 31.242, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.5108593763178124, |
| "grad_norm": 1.8359283208847046, |
| "learning_rate": 5.488294314381271e-06, |
| "loss": 0.7665, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.5108593763178124, |
| "eval_loss": 0.6687480211257935, |
| "eval_runtime": 40.0097, |
| "eval_samples_per_second": 249.939, |
| "eval_steps_per_second": 31.242, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.5204982324747522, |
| "grad_norm": 1.8846337795257568, |
| "learning_rate": 5.478260869565217e-06, |
| "loss": 0.7621, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.5204982324747522, |
| "eval_loss": 0.6538456082344055, |
| "eval_runtime": 39.9879, |
| "eval_samples_per_second": 250.076, |
| "eval_steps_per_second": 31.259, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.5301370886316921, |
| "grad_norm": 1.734246850013733, |
| "learning_rate": 5.468227424749163e-06, |
| "loss": 0.7518, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.5301370886316921, |
| "eval_loss": 0.6537781953811646, |
| "eval_runtime": 40.0172, |
| "eval_samples_per_second": 249.893, |
| "eval_steps_per_second": 31.237, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.5397759447886319, |
| "grad_norm": 1.805600881576538, |
| "learning_rate": 5.45819397993311e-06, |
| "loss": 0.7433, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.5397759447886319, |
| "eval_loss": 0.6553384065628052, |
| "eval_runtime": 40.1835, |
| "eval_samples_per_second": 248.858, |
| "eval_steps_per_second": 31.107, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.5494148009455718, |
| "grad_norm": 1.7596203088760376, |
| "learning_rate": 5.448160535117057e-06, |
| "loss": 0.7319, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.5494148009455718, |
| "eval_loss": 0.6483535170555115, |
| "eval_runtime": 40.012, |
| "eval_samples_per_second": 249.925, |
| "eval_steps_per_second": 31.241, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.5590536571025116, |
| "grad_norm": 2.0771901607513428, |
| "learning_rate": 5.438127090301003e-06, |
| "loss": 0.727, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.5590536571025116, |
| "eval_loss": 0.645814836025238, |
| "eval_runtime": 40.0155, |
| "eval_samples_per_second": 249.903, |
| "eval_steps_per_second": 31.238, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.5686925132594515, |
| "grad_norm": 1.8789221048355103, |
| "learning_rate": 5.4280936454849495e-06, |
| "loss": 0.7269, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.5686925132594515, |
| "eval_loss": 0.6407626271247864, |
| "eval_runtime": 40.0142, |
| "eval_samples_per_second": 249.911, |
| "eval_steps_per_second": 31.239, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.5783313694163914, |
| "grad_norm": 2.0252554416656494, |
| "learning_rate": 5.418060200668896e-06, |
| "loss": 0.7133, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.5783313694163914, |
| "eval_loss": 0.6353247761726379, |
| "eval_runtime": 40.0245, |
| "eval_samples_per_second": 249.847, |
| "eval_steps_per_second": 31.231, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.5879702255733312, |
| "grad_norm": 2.003438949584961, |
| "learning_rate": 5.408026755852843e-06, |
| "loss": 0.7096, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.5879702255733312, |
| "eval_loss": 0.6176116466522217, |
| "eval_runtime": 40.1568, |
| "eval_samples_per_second": 249.024, |
| "eval_steps_per_second": 31.128, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.597609081730271, |
| "grad_norm": 1.8938828706741333, |
| "learning_rate": 5.397993311036789e-06, |
| "loss": 0.7001, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.597609081730271, |
| "eval_loss": 0.6332681775093079, |
| "eval_runtime": 39.9982, |
| "eval_samples_per_second": 250.011, |
| "eval_steps_per_second": 31.251, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.6072479378872109, |
| "grad_norm": 1.6841580867767334, |
| "learning_rate": 5.387959866220736e-06, |
| "loss": 0.6938, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.6072479378872109, |
| "eval_loss": 0.613711953163147, |
| "eval_runtime": 40.0044, |
| "eval_samples_per_second": 249.972, |
| "eval_steps_per_second": 31.247, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.6168867940441508, |
| "grad_norm": 1.654695987701416, |
| "learning_rate": 5.3779264214046825e-06, |
| "loss": 0.6902, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.6168867940441508, |
| "eval_loss": 0.6061456799507141, |
| "eval_runtime": 39.9958, |
| "eval_samples_per_second": 250.026, |
| "eval_steps_per_second": 31.253, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.6265256502010906, |
| "grad_norm": 1.8732696771621704, |
| "learning_rate": 5.367892976588628e-06, |
| "loss": 0.6801, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.6265256502010906, |
| "eval_loss": 0.5902617573738098, |
| "eval_runtime": 40.1557, |
| "eval_samples_per_second": 249.031, |
| "eval_steps_per_second": 31.129, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.6361645063580305, |
| "grad_norm": 1.8029794692993164, |
| "learning_rate": 5.357859531772575e-06, |
| "loss": 0.6756, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.6361645063580305, |
| "eval_loss": 0.5980539321899414, |
| "eval_runtime": 39.9884, |
| "eval_samples_per_second": 250.072, |
| "eval_steps_per_second": 31.259, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.6458033625149704, |
| "grad_norm": 2.164788007736206, |
| "learning_rate": 5.347826086956522e-06, |
| "loss": 0.6702, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.6458033625149704, |
| "eval_loss": 0.5885195732116699, |
| "eval_runtime": 39.9791, |
| "eval_samples_per_second": 250.131, |
| "eval_steps_per_second": 31.266, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.6554422186719102, |
| "grad_norm": 1.8862236738204956, |
| "learning_rate": 5.337792642140468e-06, |
| "loss": 0.6638, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.6554422186719102, |
| "eval_loss": 0.5894142389297485, |
| "eval_runtime": 39.9968, |
| "eval_samples_per_second": 250.02, |
| "eval_steps_per_second": 31.252, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.66508107482885, |
| "grad_norm": 1.7155135869979858, |
| "learning_rate": 5.327759197324415e-06, |
| "loss": 0.6624, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.66508107482885, |
| "eval_loss": 0.5867804884910583, |
| "eval_runtime": 39.9878, |
| "eval_samples_per_second": 250.077, |
| "eval_steps_per_second": 31.26, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.6747199309857899, |
| "grad_norm": 1.592544436454773, |
| "learning_rate": 5.317725752508361e-06, |
| "loss": 0.6581, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.6747199309857899, |
| "eval_loss": 0.5733709931373596, |
| "eval_runtime": 40.159, |
| "eval_samples_per_second": 249.01, |
| "eval_steps_per_second": 31.126, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.6843587871427298, |
| "grad_norm": 1.7839319705963135, |
| "learning_rate": 5.307692307692307e-06, |
| "loss": 0.6516, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.6843587871427298, |
| "eval_loss": 0.5781156420707703, |
| "eval_runtime": 40.0095, |
| "eval_samples_per_second": 249.94, |
| "eval_steps_per_second": 31.243, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.6939976432996696, |
| "grad_norm": 1.6162323951721191, |
| "learning_rate": 5.297658862876254e-06, |
| "loss": 0.646, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.6939976432996696, |
| "eval_loss": 0.5711376667022705, |
| "eval_runtime": 40.0115, |
| "eval_samples_per_second": 249.928, |
| "eval_steps_per_second": 31.241, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.7036364994566094, |
| "grad_norm": 1.8823449611663818, |
| "learning_rate": 5.287625418060201e-06, |
| "loss": 0.6408, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.7036364994566094, |
| "eval_loss": 0.5678967237472534, |
| "eval_runtime": 40.0142, |
| "eval_samples_per_second": 249.912, |
| "eval_steps_per_second": 31.239, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.7132753556135494, |
| "grad_norm": 1.7034283876419067, |
| "learning_rate": 5.277591973244147e-06, |
| "loss": 0.6353, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.7132753556135494, |
| "eval_loss": 0.5637656450271606, |
| "eval_runtime": 40.0085, |
| "eval_samples_per_second": 249.947, |
| "eval_steps_per_second": 31.243, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.7229142117704892, |
| "grad_norm": 1.791896104812622, |
| "learning_rate": 5.2675585284280935e-06, |
| "loss": 0.6364, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.7229142117704892, |
| "eval_loss": 0.559363842010498, |
| "eval_runtime": 40.1708, |
| "eval_samples_per_second": 248.937, |
| "eval_steps_per_second": 31.117, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.732553067927429, |
| "grad_norm": 1.6761842966079712, |
| "learning_rate": 5.25752508361204e-06, |
| "loss": 0.6317, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.732553067927429, |
| "eval_loss": 0.5549041628837585, |
| "eval_runtime": 40.009, |
| "eval_samples_per_second": 249.944, |
| "eval_steps_per_second": 31.243, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.742191924084369, |
| "grad_norm": 1.7356570959091187, |
| "learning_rate": 5.247491638795986e-06, |
| "loss": 0.6226, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.742191924084369, |
| "eval_loss": 0.5542047619819641, |
| "eval_runtime": 40.0074, |
| "eval_samples_per_second": 249.954, |
| "eval_steps_per_second": 31.244, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.7518307802413088, |
| "grad_norm": 2.016263246536255, |
| "learning_rate": 5.237458193979933e-06, |
| "loss": 0.6205, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.7518307802413088, |
| "eval_loss": 0.5474265813827515, |
| "eval_runtime": 40.0104, |
| "eval_samples_per_second": 249.935, |
| "eval_steps_per_second": 31.242, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.7614696363982486, |
| "grad_norm": 1.9018930196762085, |
| "learning_rate": 5.22742474916388e-06, |
| "loss": 0.613, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.7614696363982486, |
| "eval_loss": 0.5480296611785889, |
| "eval_runtime": 40.0133, |
| "eval_samples_per_second": 249.917, |
| "eval_steps_per_second": 31.24, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.7711084925551884, |
| "grad_norm": 1.6764400005340576, |
| "learning_rate": 5.2173913043478265e-06, |
| "loss": 0.6159, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.7711084925551884, |
| "eval_loss": 0.5415164828300476, |
| "eval_runtime": 40.1661, |
| "eval_samples_per_second": 248.966, |
| "eval_steps_per_second": 31.121, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.7807473487121284, |
| "grad_norm": 1.817726492881775, |
| "learning_rate": 5.207357859531772e-06, |
| "loss": 0.6104, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.7807473487121284, |
| "eval_loss": 0.5427911281585693, |
| "eval_runtime": 39.9874, |
| "eval_samples_per_second": 250.079, |
| "eval_steps_per_second": 31.26, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.7903862048690682, |
| "grad_norm": 1.9600380659103394, |
| "learning_rate": 5.197324414715719e-06, |
| "loss": 0.6054, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.7903862048690682, |
| "eval_loss": 0.5321446657180786, |
| "eval_runtime": 40.0095, |
| "eval_samples_per_second": 249.94, |
| "eval_steps_per_second": 31.243, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.800025061026008, |
| "grad_norm": 1.6928702592849731, |
| "learning_rate": 5.187290969899666e-06, |
| "loss": 0.6019, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.800025061026008, |
| "eval_loss": 0.537187933921814, |
| "eval_runtime": 40.0123, |
| "eval_samples_per_second": 249.923, |
| "eval_steps_per_second": 31.24, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.809663917182948, |
| "grad_norm": 1.5704461336135864, |
| "learning_rate": 5.177257525083612e-06, |
| "loss": 0.6027, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.809663917182948, |
| "eval_loss": 0.5241678357124329, |
| "eval_runtime": 40.156, |
| "eval_samples_per_second": 249.029, |
| "eval_steps_per_second": 31.129, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.8193027733398878, |
| "grad_norm": 1.6146364212036133, |
| "learning_rate": 5.167224080267559e-06, |
| "loss": 0.5972, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.8193027733398878, |
| "eval_loss": 0.5265556573867798, |
| "eval_runtime": 40.0497, |
| "eval_samples_per_second": 249.69, |
| "eval_steps_per_second": 31.211, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.8289416294968276, |
| "grad_norm": 1.5260276794433594, |
| "learning_rate": 5.157190635451505e-06, |
| "loss": 0.5943, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.8289416294968276, |
| "eval_loss": 0.525088369846344, |
| "eval_runtime": 40.0049, |
| "eval_samples_per_second": 249.969, |
| "eval_steps_per_second": 31.246, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.8385804856537674, |
| "grad_norm": 1.4389654397964478, |
| "learning_rate": 5.147157190635451e-06, |
| "loss": 0.5896, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.8385804856537674, |
| "eval_loss": 0.5226108431816101, |
| "eval_runtime": 40.0292, |
| "eval_samples_per_second": 249.818, |
| "eval_steps_per_second": 31.227, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.8482193418107074, |
| "grad_norm": 1.5730172395706177, |
| "learning_rate": 5.137123745819398e-06, |
| "loss": 0.5866, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.8482193418107074, |
| "eval_loss": 0.5186478495597839, |
| "eval_runtime": 40.0165, |
| "eval_samples_per_second": 249.897, |
| "eval_steps_per_second": 31.237, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.8578581979676472, |
| "grad_norm": 1.562483310699463, |
| "learning_rate": 5.127090301003345e-06, |
| "loss": 0.5844, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.8578581979676472, |
| "eval_loss": 0.5097201466560364, |
| "eval_runtime": 40.1856, |
| "eval_samples_per_second": 248.845, |
| "eval_steps_per_second": 31.106, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.867497054124587, |
| "grad_norm": 1.6470036506652832, |
| "learning_rate": 5.117056856187291e-06, |
| "loss": 0.5773, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.867497054124587, |
| "eval_loss": 0.5092134475708008, |
| "eval_runtime": 40.0143, |
| "eval_samples_per_second": 249.911, |
| "eval_steps_per_second": 31.239, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.8771359102815269, |
| "grad_norm": 1.5788582563400269, |
| "learning_rate": 5.1070234113712375e-06, |
| "loss": 0.5797, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.8771359102815269, |
| "eval_loss": 0.5102807879447937, |
| "eval_runtime": 40.0303, |
| "eval_samples_per_second": 249.811, |
| "eval_steps_per_second": 31.226, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.8867747664384668, |
| "grad_norm": 1.756942868232727, |
| "learning_rate": 5.096989966555184e-06, |
| "loss": 0.5776, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.8867747664384668, |
| "eval_loss": 0.5153423547744751, |
| "eval_runtime": 40.0054, |
| "eval_samples_per_second": 249.966, |
| "eval_steps_per_second": 31.246, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.8964136225954066, |
| "grad_norm": 1.5298547744750977, |
| "learning_rate": 5.08695652173913e-06, |
| "loss": 0.5751, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.8964136225954066, |
| "eval_loss": 0.5078564286231995, |
| "eval_runtime": 40.011, |
| "eval_samples_per_second": 249.932, |
| "eval_steps_per_second": 31.241, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.9060524787523465, |
| "grad_norm": 1.7196496725082397, |
| "learning_rate": 5.076923076923077e-06, |
| "loss": 0.5665, |
| "step": 9400 |
| }, |
| { |
| "epoch": 0.9060524787523465, |
| "eval_loss": 0.5065763592720032, |
| "eval_runtime": 40.1717, |
| "eval_samples_per_second": 248.932, |
| "eval_steps_per_second": 31.116, |
| "step": 9400 |
| }, |
| { |
| "epoch": 0.9156913349092863, |
| "grad_norm": 1.6301361322402954, |
| "learning_rate": 5.066889632107024e-06, |
| "loss": 0.5675, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.9156913349092863, |
| "eval_loss": 0.49776747822761536, |
| "eval_runtime": 40.0165, |
| "eval_samples_per_second": 249.897, |
| "eval_steps_per_second": 31.237, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.9253301910662262, |
| "grad_norm": 1.4653514623641968, |
| "learning_rate": 5.05685618729097e-06, |
| "loss": 0.5618, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.9253301910662262, |
| "eval_loss": 0.49646928906440735, |
| "eval_runtime": 40.017, |
| "eval_samples_per_second": 249.894, |
| "eval_steps_per_second": 31.237, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.934969047223166, |
| "grad_norm": 1.6670091152191162, |
| "learning_rate": 5.046822742474916e-06, |
| "loss": 0.563, |
| "step": 9700 |
| }, |
| { |
| "epoch": 0.934969047223166, |
| "eval_loss": 0.49914979934692383, |
| "eval_runtime": 40.0323, |
| "eval_samples_per_second": 249.798, |
| "eval_steps_per_second": 31.225, |
| "step": 9700 |
| }, |
| { |
| "epoch": 0.9446079033801059, |
| "grad_norm": 1.6070232391357422, |
| "learning_rate": 5.036789297658863e-06, |
| "loss": 0.5542, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.9446079033801059, |
| "eval_loss": 0.4884184002876282, |
| "eval_runtime": 40.0566, |
| "eval_samples_per_second": 249.646, |
| "eval_steps_per_second": 31.206, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.9542467595370457, |
| "grad_norm": 1.4984960556030273, |
| "learning_rate": 5.02675585284281e-06, |
| "loss": 0.5581, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.9542467595370457, |
| "eval_loss": 0.4900113642215729, |
| "eval_runtime": 40.2056, |
| "eval_samples_per_second": 248.721, |
| "eval_steps_per_second": 31.09, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.9638856156939856, |
| "grad_norm": 1.537240743637085, |
| "learning_rate": 5.016722408026756e-06, |
| "loss": 0.5539, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.9638856156939856, |
| "eval_loss": 0.4897591769695282, |
| "eval_runtime": 40.0038, |
| "eval_samples_per_second": 249.976, |
| "eval_steps_per_second": 31.247, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.9735244718509255, |
| "grad_norm": 1.6569781303405762, |
| "learning_rate": 5.0066889632107026e-06, |
| "loss": 0.5481, |
| "step": 10100 |
| }, |
| { |
| "epoch": 0.9735244718509255, |
| "eval_loss": 0.4842386245727539, |
| "eval_runtime": 40.0389, |
| "eval_samples_per_second": 249.757, |
| "eval_steps_per_second": 31.22, |
| "step": 10100 |
| }, |
| { |
| "epoch": 0.9831633280078653, |
| "grad_norm": 1.4528193473815918, |
| "learning_rate": 4.996655518394649e-06, |
| "loss": 0.5464, |
| "step": 10200 |
| }, |
| { |
| "epoch": 0.9831633280078653, |
| "eval_loss": 0.4880593717098236, |
| "eval_runtime": 40.0358, |
| "eval_samples_per_second": 249.776, |
| "eval_steps_per_second": 31.222, |
| "step": 10200 |
| }, |
| { |
| "epoch": 0.9928021841648051, |
| "grad_norm": 1.5718955993652344, |
| "learning_rate": 4.986622073578595e-06, |
| "loss": 0.543, |
| "step": 10300 |
| }, |
| { |
| "epoch": 0.9928021841648051, |
| "eval_loss": 0.4782450795173645, |
| "eval_runtime": 36.7041, |
| "eval_samples_per_second": 272.449, |
| "eval_steps_per_second": 34.056, |
| "step": 10300 |
| }, |
| { |
| "epoch": 1.002441040321745, |
| "grad_norm": 1.647326946258545, |
| "learning_rate": 4.976588628762542e-06, |
| "loss": 0.5412, |
| "step": 10400 |
| }, |
| { |
| "epoch": 1.002441040321745, |
| "eval_loss": 0.486726850271225, |
| "eval_runtime": 40.2013, |
| "eval_samples_per_second": 248.748, |
| "eval_steps_per_second": 31.094, |
| "step": 10400 |
| }, |
| { |
| "epoch": 1.0120798964786848, |
| "grad_norm": 1.6924008131027222, |
| "learning_rate": 4.966555183946489e-06, |
| "loss": 0.5394, |
| "step": 10500 |
| }, |
| { |
| "epoch": 1.0120798964786848, |
| "eval_loss": 0.4839903712272644, |
| "eval_runtime": 40.0205, |
| "eval_samples_per_second": 249.872, |
| "eval_steps_per_second": 31.234, |
| "step": 10500 |
| }, |
| { |
| "epoch": 1.0217187526356248, |
| "grad_norm": 1.5169456005096436, |
| "learning_rate": 4.956521739130435e-06, |
| "loss": 0.5397, |
| "step": 10600 |
| }, |
| { |
| "epoch": 1.0217187526356248, |
| "eval_loss": 0.47214385867118835, |
| "eval_runtime": 40.1949, |
| "eval_samples_per_second": 248.788, |
| "eval_steps_per_second": 31.098, |
| "step": 10600 |
| }, |
| { |
| "epoch": 1.0313576087925647, |
| "grad_norm": 1.5480705499649048, |
| "learning_rate": 4.9464882943143815e-06, |
| "loss": 0.5357, |
| "step": 10700 |
| }, |
| { |
| "epoch": 1.0313576087925647, |
| "eval_loss": 0.48204073309898376, |
| "eval_runtime": 40.0052, |
| "eval_samples_per_second": 249.968, |
| "eval_steps_per_second": 31.246, |
| "step": 10700 |
| }, |
| { |
| "epoch": 1.0409964649495045, |
| "grad_norm": 1.3915138244628906, |
| "learning_rate": 4.936454849498328e-06, |
| "loss": 0.534, |
| "step": 10800 |
| }, |
| { |
| "epoch": 1.0409964649495045, |
| "eval_loss": 0.47229185700416565, |
| "eval_runtime": 40.0072, |
| "eval_samples_per_second": 249.955, |
| "eval_steps_per_second": 31.244, |
| "step": 10800 |
| }, |
| { |
| "epoch": 1.0506353211064443, |
| "grad_norm": 1.4983240365982056, |
| "learning_rate": 4.926421404682274e-06, |
| "loss": 0.5276, |
| "step": 10900 |
| }, |
| { |
| "epoch": 1.0506353211064443, |
| "eval_loss": 0.46871235966682434, |
| "eval_runtime": 40.0389, |
| "eval_samples_per_second": 249.757, |
| "eval_steps_per_second": 31.22, |
| "step": 10900 |
| }, |
| { |
| "epoch": 1.0602741772633841, |
| "grad_norm": 1.533703088760376, |
| "learning_rate": 4.916387959866221e-06, |
| "loss": 0.5296, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.0602741772633841, |
| "eval_loss": 0.46552249789237976, |
| "eval_runtime": 40.0022, |
| "eval_samples_per_second": 249.986, |
| "eval_steps_per_second": 31.248, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.069913033420324, |
| "grad_norm": 1.5167957544326782, |
| "learning_rate": 4.906354515050168e-06, |
| "loss": 0.5254, |
| "step": 11100 |
| }, |
| { |
| "epoch": 1.069913033420324, |
| "eval_loss": 0.4699433445930481, |
| "eval_runtime": 40.1743, |
| "eval_samples_per_second": 248.915, |
| "eval_steps_per_second": 31.114, |
| "step": 11100 |
| }, |
| { |
| "epoch": 1.0795518895772638, |
| "grad_norm": 1.4261205196380615, |
| "learning_rate": 4.8963210702341136e-06, |
| "loss": 0.5201, |
| "step": 11200 |
| }, |
| { |
| "epoch": 1.0795518895772638, |
| "eval_loss": 0.4564357399940491, |
| "eval_runtime": 40.025, |
| "eval_samples_per_second": 249.844, |
| "eval_steps_per_second": 31.23, |
| "step": 11200 |
| }, |
| { |
| "epoch": 1.0891907457342038, |
| "grad_norm": 1.3741745948791504, |
| "learning_rate": 4.88628762541806e-06, |
| "loss": 0.5224, |
| "step": 11300 |
| }, |
| { |
| "epoch": 1.0891907457342038, |
| "eval_loss": 0.46628084778785706, |
| "eval_runtime": 40.0189, |
| "eval_samples_per_second": 249.882, |
| "eval_steps_per_second": 31.235, |
| "step": 11300 |
| }, |
| { |
| "epoch": 1.0988296018911436, |
| "grad_norm": 1.4131207466125488, |
| "learning_rate": 4.876254180602007e-06, |
| "loss": 0.5147, |
| "step": 11400 |
| }, |
| { |
| "epoch": 1.0988296018911436, |
| "eval_loss": 0.46065640449523926, |
| "eval_runtime": 40.0026, |
| "eval_samples_per_second": 249.984, |
| "eval_steps_per_second": 31.248, |
| "step": 11400 |
| }, |
| { |
| "epoch": 1.1084684580480835, |
| "grad_norm": 1.434804081916809, |
| "learning_rate": 4.866220735785953e-06, |
| "loss": 0.5186, |
| "step": 11500 |
| }, |
| { |
| "epoch": 1.1084684580480835, |
| "eval_loss": 0.4589378833770752, |
| "eval_runtime": 40.0058, |
| "eval_samples_per_second": 249.964, |
| "eval_steps_per_second": 31.246, |
| "step": 11500 |
| }, |
| { |
| "epoch": 1.1181073142050233, |
| "grad_norm": 1.382394552230835, |
| "learning_rate": 4.8561872909699e-06, |
| "loss": 0.5138, |
| "step": 11600 |
| }, |
| { |
| "epoch": 1.1181073142050233, |
| "eval_loss": 0.4565480053424835, |
| "eval_runtime": 40.1633, |
| "eval_samples_per_second": 248.984, |
| "eval_steps_per_second": 31.123, |
| "step": 11600 |
| }, |
| { |
| "epoch": 1.1277461703619631, |
| "grad_norm": 1.5272095203399658, |
| "learning_rate": 4.8461538461538465e-06, |
| "loss": 0.5164, |
| "step": 11700 |
| }, |
| { |
| "epoch": 1.1277461703619631, |
| "eval_loss": 0.45689794421195984, |
| "eval_runtime": 40.0182, |
| "eval_samples_per_second": 249.887, |
| "eval_steps_per_second": 31.236, |
| "step": 11700 |
| }, |
| { |
| "epoch": 1.137385026518903, |
| "grad_norm": 1.5336707830429077, |
| "learning_rate": 4.8361204013377925e-06, |
| "loss": 0.5089, |
| "step": 11800 |
| }, |
| { |
| "epoch": 1.137385026518903, |
| "eval_loss": 0.4564231336116791, |
| "eval_runtime": 40.0203, |
| "eval_samples_per_second": 249.873, |
| "eval_steps_per_second": 31.234, |
| "step": 11800 |
| }, |
| { |
| "epoch": 1.1470238826758428, |
| "grad_norm": 1.5161396265029907, |
| "learning_rate": 4.826086956521739e-06, |
| "loss": 0.5106, |
| "step": 11900 |
| }, |
| { |
| "epoch": 1.1470238826758428, |
| "eval_loss": 0.44420644640922546, |
| "eval_runtime": 40.0196, |
| "eval_samples_per_second": 249.877, |
| "eval_steps_per_second": 31.235, |
| "step": 11900 |
| }, |
| { |
| "epoch": 1.1566627388327828, |
| "grad_norm": 1.5325267314910889, |
| "learning_rate": 4.816053511705686e-06, |
| "loss": 0.5098, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.1566627388327828, |
| "eval_loss": 0.44834986329078674, |
| "eval_runtime": 40.0681, |
| "eval_samples_per_second": 249.575, |
| "eval_steps_per_second": 31.197, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.1663015949897226, |
| "grad_norm": 1.4805413484573364, |
| "learning_rate": 4.806020066889633e-06, |
| "loss": 0.5055, |
| "step": 12100 |
| }, |
| { |
| "epoch": 1.1663015949897226, |
| "eval_loss": 0.4538011848926544, |
| "eval_runtime": 40.2028, |
| "eval_samples_per_second": 248.739, |
| "eval_steps_per_second": 31.092, |
| "step": 12100 |
| }, |
| { |
| "epoch": 1.1759404511466625, |
| "grad_norm": 1.5557194948196411, |
| "learning_rate": 4.795986622073579e-06, |
| "loss": 0.5058, |
| "step": 12200 |
| }, |
| { |
| "epoch": 1.1759404511466625, |
| "eval_loss": 0.44933125376701355, |
| "eval_runtime": 40.023, |
| "eval_samples_per_second": 249.856, |
| "eval_steps_per_second": 31.232, |
| "step": 12200 |
| }, |
| { |
| "epoch": 1.1855793073036023, |
| "grad_norm": 1.5012928247451782, |
| "learning_rate": 4.785953177257525e-06, |
| "loss": 0.5051, |
| "step": 12300 |
| }, |
| { |
| "epoch": 1.1855793073036023, |
| "eval_loss": 0.44258418679237366, |
| "eval_runtime": 40.0302, |
| "eval_samples_per_second": 249.811, |
| "eval_steps_per_second": 31.226, |
| "step": 12300 |
| }, |
| { |
| "epoch": 1.195218163460542, |
| "grad_norm": 1.4221644401550293, |
| "learning_rate": 4.775919732441472e-06, |
| "loss": 0.4999, |
| "step": 12400 |
| }, |
| { |
| "epoch": 1.195218163460542, |
| "eval_loss": 0.4362325668334961, |
| "eval_runtime": 40.0213, |
| "eval_samples_per_second": 249.867, |
| "eval_steps_per_second": 31.233, |
| "step": 12400 |
| }, |
| { |
| "epoch": 1.204857019617482, |
| "grad_norm": 1.415287971496582, |
| "learning_rate": 4.765886287625418e-06, |
| "loss": 0.4973, |
| "step": 12500 |
| }, |
| { |
| "epoch": 1.204857019617482, |
| "eval_loss": 0.44187524914741516, |
| "eval_runtime": 40.0371, |
| "eval_samples_per_second": 249.769, |
| "eval_steps_per_second": 31.221, |
| "step": 12500 |
| }, |
| { |
| "epoch": 1.2144958757744218, |
| "grad_norm": 1.5381195545196533, |
| "learning_rate": 4.755852842809365e-06, |
| "loss": 0.4989, |
| "step": 12600 |
| }, |
| { |
| "epoch": 1.2144958757744218, |
| "eval_loss": 0.4413149952888489, |
| "eval_runtime": 40.2155, |
| "eval_samples_per_second": 248.66, |
| "eval_steps_per_second": 31.083, |
| "step": 12600 |
| }, |
| { |
| "epoch": 1.2241347319313618, |
| "grad_norm": 1.374925971031189, |
| "learning_rate": 4.745819397993312e-06, |
| "loss": 0.4923, |
| "step": 12700 |
| }, |
| { |
| "epoch": 1.2241347319313618, |
| "eval_loss": 0.440106064081192, |
| "eval_runtime": 40.0356, |
| "eval_samples_per_second": 249.777, |
| "eval_steps_per_second": 31.222, |
| "step": 12700 |
| }, |
| { |
| "epoch": 1.2337735880883016, |
| "grad_norm": 1.6059918403625488, |
| "learning_rate": 4.7357859531772575e-06, |
| "loss": 0.4918, |
| "step": 12800 |
| }, |
| { |
| "epoch": 1.2337735880883016, |
| "eval_loss": 0.44359320402145386, |
| "eval_runtime": 40.0597, |
| "eval_samples_per_second": 249.628, |
| "eval_steps_per_second": 31.203, |
| "step": 12800 |
| }, |
| { |
| "epoch": 1.2434124442452414, |
| "grad_norm": 1.4142322540283203, |
| "learning_rate": 4.725752508361204e-06, |
| "loss": 0.4881, |
| "step": 12900 |
| }, |
| { |
| "epoch": 1.2434124442452414, |
| "eval_loss": 0.44244951009750366, |
| "eval_runtime": 40.061, |
| "eval_samples_per_second": 249.619, |
| "eval_steps_per_second": 31.202, |
| "step": 12900 |
| }, |
| { |
| "epoch": 1.2530513004021813, |
| "grad_norm": 1.446840524673462, |
| "learning_rate": 4.715719063545151e-06, |
| "loss": 0.49, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.2530513004021813, |
| "eval_loss": 0.42693030834198, |
| "eval_runtime": 40.2377, |
| "eval_samples_per_second": 248.523, |
| "eval_steps_per_second": 31.065, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.262690156559121, |
| "grad_norm": 1.363279938697815, |
| "learning_rate": 4.705685618729097e-06, |
| "loss": 0.4839, |
| "step": 13100 |
| }, |
| { |
| "epoch": 1.262690156559121, |
| "eval_loss": 0.43453505635261536, |
| "eval_runtime": 40.0754, |
| "eval_samples_per_second": 249.53, |
| "eval_steps_per_second": 31.191, |
| "step": 13100 |
| }, |
| { |
| "epoch": 1.272329012716061, |
| "grad_norm": 1.4301884174346924, |
| "learning_rate": 4.695652173913044e-06, |
| "loss": 0.4867, |
| "step": 13200 |
| }, |
| { |
| "epoch": 1.272329012716061, |
| "eval_loss": 0.42788979411125183, |
| "eval_runtime": 40.0379, |
| "eval_samples_per_second": 249.763, |
| "eval_steps_per_second": 31.22, |
| "step": 13200 |
| }, |
| { |
| "epoch": 1.281967868873001, |
| "grad_norm": 1.4869626760482788, |
| "learning_rate": 4.6856187290969905e-06, |
| "loss": 0.4822, |
| "step": 13300 |
| }, |
| { |
| "epoch": 1.281967868873001, |
| "eval_loss": 0.42807090282440186, |
| "eval_runtime": 40.0485, |
| "eval_samples_per_second": 249.697, |
| "eval_steps_per_second": 31.212, |
| "step": 13300 |
| }, |
| { |
| "epoch": 1.2916067250299408, |
| "grad_norm": 1.4084575176239014, |
| "learning_rate": 4.675585284280936e-06, |
| "loss": 0.4817, |
| "step": 13400 |
| }, |
| { |
| "epoch": 1.2916067250299408, |
| "eval_loss": 0.431538462638855, |
| "eval_runtime": 40.2297, |
| "eval_samples_per_second": 248.572, |
| "eval_steps_per_second": 31.072, |
| "step": 13400 |
| }, |
| { |
| "epoch": 1.3012455811868806, |
| "grad_norm": 1.3125011920928955, |
| "learning_rate": 4.665551839464883e-06, |
| "loss": 0.4781, |
| "step": 13500 |
| }, |
| { |
| "epoch": 1.3012455811868806, |
| "eval_loss": 0.4229593276977539, |
| "eval_runtime": 40.0557, |
| "eval_samples_per_second": 249.652, |
| "eval_steps_per_second": 31.207, |
| "step": 13500 |
| }, |
| { |
| "epoch": 1.3108844373438204, |
| "grad_norm": 1.495177984237671, |
| "learning_rate": 4.65551839464883e-06, |
| "loss": 0.4782, |
| "step": 13600 |
| }, |
| { |
| "epoch": 1.3108844373438204, |
| "eval_loss": 0.4288323223590851, |
| "eval_runtime": 40.0435, |
| "eval_samples_per_second": 249.728, |
| "eval_steps_per_second": 31.216, |
| "step": 13600 |
| }, |
| { |
| "epoch": 1.3205232935007603, |
| "grad_norm": 1.349768042564392, |
| "learning_rate": 4.645484949832776e-06, |
| "loss": 0.48, |
| "step": 13700 |
| }, |
| { |
| "epoch": 1.3205232935007603, |
| "eval_loss": 0.41586947441101074, |
| "eval_runtime": 40.0452, |
| "eval_samples_per_second": 249.718, |
| "eval_steps_per_second": 31.215, |
| "step": 13700 |
| }, |
| { |
| "epoch": 1.3301621496577, |
| "grad_norm": 1.4638071060180664, |
| "learning_rate": 4.635451505016723e-06, |
| "loss": 0.4808, |
| "step": 13800 |
| }, |
| { |
| "epoch": 1.3301621496577, |
| "eval_loss": 0.4116751551628113, |
| "eval_runtime": 40.0376, |
| "eval_samples_per_second": 249.765, |
| "eval_steps_per_second": 31.221, |
| "step": 13800 |
| }, |
| { |
| "epoch": 1.33980100581464, |
| "grad_norm": 1.4769808053970337, |
| "learning_rate": 4.625418060200669e-06, |
| "loss": 0.4769, |
| "step": 13900 |
| }, |
| { |
| "epoch": 1.33980100581464, |
| "eval_loss": 0.4249781370162964, |
| "eval_runtime": 40.2173, |
| "eval_samples_per_second": 248.649, |
| "eval_steps_per_second": 31.081, |
| "step": 13900 |
| }, |
| { |
| "epoch": 1.3494398619715797, |
| "grad_norm": 1.4002916812896729, |
| "learning_rate": 4.615384615384616e-06, |
| "loss": 0.4703, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.3494398619715797, |
| "eval_loss": 0.41435718536376953, |
| "eval_runtime": 40.0251, |
| "eval_samples_per_second": 249.843, |
| "eval_steps_per_second": 31.23, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.3590787181285198, |
| "grad_norm": 1.324873685836792, |
| "learning_rate": 4.605351170568562e-06, |
| "loss": 0.4715, |
| "step": 14100 |
| }, |
| { |
| "epoch": 1.3590787181285198, |
| "eval_loss": 0.41557979583740234, |
| "eval_runtime": 40.0132, |
| "eval_samples_per_second": 249.917, |
| "eval_steps_per_second": 31.24, |
| "step": 14100 |
| }, |
| { |
| "epoch": 1.3687175742854596, |
| "grad_norm": 1.510293960571289, |
| "learning_rate": 4.595317725752509e-06, |
| "loss": 0.4661, |
| "step": 14200 |
| }, |
| { |
| "epoch": 1.3687175742854596, |
| "eval_loss": 0.41991928219795227, |
| "eval_runtime": 40.0151, |
| "eval_samples_per_second": 249.906, |
| "eval_steps_per_second": 31.238, |
| "step": 14200 |
| }, |
| { |
| "epoch": 1.3783564304423994, |
| "grad_norm": 1.4551657438278198, |
| "learning_rate": 4.585284280936456e-06, |
| "loss": 0.466, |
| "step": 14300 |
| }, |
| { |
| "epoch": 1.3783564304423994, |
| "eval_loss": 0.41806551814079285, |
| "eval_runtime": 40.0069, |
| "eval_samples_per_second": 249.957, |
| "eval_steps_per_second": 31.245, |
| "step": 14300 |
| }, |
| { |
| "epoch": 1.3879952865993392, |
| "grad_norm": 1.5093111991882324, |
| "learning_rate": 4.5752508361204015e-06, |
| "loss": 0.4653, |
| "step": 14400 |
| }, |
| { |
| "epoch": 1.3879952865993392, |
| "eval_loss": 0.41728827357292175, |
| "eval_runtime": 40.1654, |
| "eval_samples_per_second": 248.97, |
| "eval_steps_per_second": 31.121, |
| "step": 14400 |
| }, |
| { |
| "epoch": 1.397634142756279, |
| "grad_norm": 1.4863073825836182, |
| "learning_rate": 4.565217391304348e-06, |
| "loss": 0.467, |
| "step": 14500 |
| }, |
| { |
| "epoch": 1.397634142756279, |
| "eval_loss": 0.41565340757369995, |
| "eval_runtime": 39.9963, |
| "eval_samples_per_second": 250.023, |
| "eval_steps_per_second": 31.253, |
| "step": 14500 |
| }, |
| { |
| "epoch": 1.4072729989132189, |
| "grad_norm": 1.4027386903762817, |
| "learning_rate": 4.555183946488295e-06, |
| "loss": 0.4662, |
| "step": 14600 |
| }, |
| { |
| "epoch": 1.4072729989132189, |
| "eval_loss": 0.4067195951938629, |
| "eval_runtime": 40.0068, |
| "eval_samples_per_second": 249.957, |
| "eval_steps_per_second": 31.245, |
| "step": 14600 |
| }, |
| { |
| "epoch": 1.416911855070159, |
| "grad_norm": 1.4059629440307617, |
| "learning_rate": 4.545150501672241e-06, |
| "loss": 0.4651, |
| "step": 14700 |
| }, |
| { |
| "epoch": 1.416911855070159, |
| "eval_loss": 0.40936049818992615, |
| "eval_runtime": 39.9887, |
| "eval_samples_per_second": 250.071, |
| "eval_steps_per_second": 31.259, |
| "step": 14700 |
| }, |
| { |
| "epoch": 1.4265507112270988, |
| "grad_norm": 1.2644177675247192, |
| "learning_rate": 4.535117056856188e-06, |
| "loss": 0.4622, |
| "step": 14800 |
| }, |
| { |
| "epoch": 1.4265507112270988, |
| "eval_loss": 0.4118250906467438, |
| "eval_runtime": 39.9915, |
| "eval_samples_per_second": 250.053, |
| "eval_steps_per_second": 31.257, |
| "step": 14800 |
| }, |
| { |
| "epoch": 1.4361895673840386, |
| "grad_norm": 1.2830288410186768, |
| "learning_rate": 4.5250836120401345e-06, |
| "loss": 0.4597, |
| "step": 14900 |
| }, |
| { |
| "epoch": 1.4361895673840386, |
| "eval_loss": 0.40810731053352356, |
| "eval_runtime": 40.2112, |
| "eval_samples_per_second": 248.687, |
| "eval_steps_per_second": 31.086, |
| "step": 14900 |
| }, |
| { |
| "epoch": 1.4458284235409784, |
| "grad_norm": 1.2887154817581177, |
| "learning_rate": 4.51505016722408e-06, |
| "loss": 0.4602, |
| "step": 15000 |
| }, |
| { |
| "epoch": 1.4458284235409784, |
| "eval_loss": 0.40011951327323914, |
| "eval_runtime": 40.0118, |
| "eval_samples_per_second": 249.926, |
| "eval_steps_per_second": 31.241, |
| "step": 15000 |
| }, |
| { |
| "epoch": 1.4554672796979182, |
| "grad_norm": 1.348257303237915, |
| "learning_rate": 4.505016722408027e-06, |
| "loss": 0.4574, |
| "step": 15100 |
| }, |
| { |
| "epoch": 1.4554672796979182, |
| "eval_loss": 0.4056689143180847, |
| "eval_runtime": 40.015, |
| "eval_samples_per_second": 249.906, |
| "eval_steps_per_second": 31.238, |
| "step": 15100 |
| }, |
| { |
| "epoch": 1.465106135854858, |
| "grad_norm": 1.540839433670044, |
| "learning_rate": 4.494983277591973e-06, |
| "loss": 0.4549, |
| "step": 15200 |
| }, |
| { |
| "epoch": 1.465106135854858, |
| "eval_loss": 0.39897704124450684, |
| "eval_runtime": 40.0174, |
| "eval_samples_per_second": 249.892, |
| "eval_steps_per_second": 31.236, |
| "step": 15200 |
| }, |
| { |
| "epoch": 1.4747449920117979, |
| "grad_norm": 1.4020183086395264, |
| "learning_rate": 4.48494983277592e-06, |
| "loss": 0.4552, |
| "step": 15300 |
| }, |
| { |
| "epoch": 1.4747449920117979, |
| "eval_loss": 0.39921414852142334, |
| "eval_runtime": 40.2019, |
| "eval_samples_per_second": 248.745, |
| "eval_steps_per_second": 31.093, |
| "step": 15300 |
| }, |
| { |
| "epoch": 1.4843838481687377, |
| "grad_norm": 1.4397515058517456, |
| "learning_rate": 4.474916387959866e-06, |
| "loss": 0.4524, |
| "step": 15400 |
| }, |
| { |
| "epoch": 1.4843838481687377, |
| "eval_loss": 0.40324947237968445, |
| "eval_runtime": 40.0276, |
| "eval_samples_per_second": 249.828, |
| "eval_steps_per_second": 31.228, |
| "step": 15400 |
| }, |
| { |
| "epoch": 1.4940227043256777, |
| "grad_norm": 1.327071189880371, |
| "learning_rate": 4.4648829431438125e-06, |
| "loss": 0.4501, |
| "step": 15500 |
| }, |
| { |
| "epoch": 1.4940227043256777, |
| "eval_loss": 0.39761507511138916, |
| "eval_runtime": 39.9927, |
| "eval_samples_per_second": 250.045, |
| "eval_steps_per_second": 31.256, |
| "step": 15500 |
| }, |
| { |
| "epoch": 1.5036615604826176, |
| "grad_norm": 1.4344964027404785, |
| "learning_rate": 4.454849498327759e-06, |
| "loss": 0.4482, |
| "step": 15600 |
| }, |
| { |
| "epoch": 1.5036615604826176, |
| "eval_loss": 0.39871275424957275, |
| "eval_runtime": 40.227, |
| "eval_samples_per_second": 248.589, |
| "eval_steps_per_second": 31.074, |
| "step": 15600 |
| }, |
| { |
| "epoch": 1.5133004166395574, |
| "grad_norm": 1.3496111631393433, |
| "learning_rate": 4.444816053511705e-06, |
| "loss": 0.4457, |
| "step": 15700 |
| }, |
| { |
| "epoch": 1.5133004166395574, |
| "eval_loss": 0.40053361654281616, |
| "eval_runtime": 40.0154, |
| "eval_samples_per_second": 249.904, |
| "eval_steps_per_second": 31.238, |
| "step": 15700 |
| }, |
| { |
| "epoch": 1.5229392727964972, |
| "grad_norm": 1.3155416250228882, |
| "learning_rate": 4.434782608695652e-06, |
| "loss": 0.4469, |
| "step": 15800 |
| }, |
| { |
| "epoch": 1.5229392727964972, |
| "eval_loss": 0.399395227432251, |
| "eval_runtime": 40.0839, |
| "eval_samples_per_second": 249.476, |
| "eval_steps_per_second": 31.185, |
| "step": 15800 |
| }, |
| { |
| "epoch": 1.5325781289534373, |
| "grad_norm": 1.438828945159912, |
| "learning_rate": 4.424749163879599e-06, |
| "loss": 0.4467, |
| "step": 15900 |
| }, |
| { |
| "epoch": 1.5325781289534373, |
| "eval_loss": 0.39455991983413696, |
| "eval_runtime": 40.1699, |
| "eval_samples_per_second": 248.943, |
| "eval_steps_per_second": 31.118, |
| "step": 15900 |
| }, |
| { |
| "epoch": 1.542216985110377, |
| "grad_norm": 1.3380151987075806, |
| "learning_rate": 4.414715719063545e-06, |
| "loss": 0.4479, |
| "step": 16000 |
| }, |
| { |
| "epoch": 1.542216985110377, |
| "eval_loss": 0.3920137584209442, |
| "eval_runtime": 40.0458, |
| "eval_samples_per_second": 249.714, |
| "eval_steps_per_second": 31.214, |
| "step": 16000 |
| }, |
| { |
| "epoch": 1.551855841267317, |
| "grad_norm": 1.3055827617645264, |
| "learning_rate": 4.404682274247491e-06, |
| "loss": 0.4437, |
| "step": 16100 |
| }, |
| { |
| "epoch": 1.551855841267317, |
| "eval_loss": 0.39583614468574524, |
| "eval_runtime": 40.0719, |
| "eval_samples_per_second": 249.551, |
| "eval_steps_per_second": 31.194, |
| "step": 16100 |
| }, |
| { |
| "epoch": 1.5614946974242567, |
| "grad_norm": 1.3439549207687378, |
| "learning_rate": 4.394648829431438e-06, |
| "loss": 0.4421, |
| "step": 16200 |
| }, |
| { |
| "epoch": 1.5614946974242567, |
| "eval_loss": 0.3982265293598175, |
| "eval_runtime": 40.0502, |
| "eval_samples_per_second": 249.687, |
| "eval_steps_per_second": 31.211, |
| "step": 16200 |
| }, |
| { |
| "epoch": 1.5711335535811966, |
| "grad_norm": 1.2873746156692505, |
| "learning_rate": 4.384615384615384e-06, |
| "loss": 0.4421, |
| "step": 16300 |
| }, |
| { |
| "epoch": 1.5711335535811966, |
| "eval_loss": 0.392922580242157, |
| "eval_runtime": 40.0623, |
| "eval_samples_per_second": 249.611, |
| "eval_steps_per_second": 31.201, |
| "step": 16300 |
| }, |
| { |
| "epoch": 1.5807724097381364, |
| "grad_norm": 1.4414591789245605, |
| "learning_rate": 4.374581939799331e-06, |
| "loss": 0.4388, |
| "step": 16400 |
| }, |
| { |
| "epoch": 1.5807724097381364, |
| "eval_loss": 0.39475396275520325, |
| "eval_runtime": 40.2477, |
| "eval_samples_per_second": 248.462, |
| "eval_steps_per_second": 31.058, |
| "step": 16400 |
| }, |
| { |
| "epoch": 1.5904112658950762, |
| "grad_norm": 1.3728173971176147, |
| "learning_rate": 4.364548494983278e-06, |
| "loss": 0.4414, |
| "step": 16500 |
| }, |
| { |
| "epoch": 1.5904112658950762, |
| "eval_loss": 0.39176425337791443, |
| "eval_runtime": 40.0569, |
| "eval_samples_per_second": 249.645, |
| "eval_steps_per_second": 31.206, |
| "step": 16500 |
| }, |
| { |
| "epoch": 1.600050122052016, |
| "grad_norm": 1.5120171308517456, |
| "learning_rate": 4.354515050167224e-06, |
| "loss": 0.4414, |
| "step": 16600 |
| }, |
| { |
| "epoch": 1.600050122052016, |
| "eval_loss": 0.39494696259498596, |
| "eval_runtime": 40.0501, |
| "eval_samples_per_second": 249.687, |
| "eval_steps_per_second": 31.211, |
| "step": 16600 |
| }, |
| { |
| "epoch": 1.6096889782089558, |
| "grad_norm": 1.7337701320648193, |
| "learning_rate": 4.34448160535117e-06, |
| "loss": 0.436, |
| "step": 16700 |
| }, |
| { |
| "epoch": 1.6096889782089558, |
| "eval_loss": 0.3927867114543915, |
| "eval_runtime": 40.0345, |
| "eval_samples_per_second": 249.785, |
| "eval_steps_per_second": 31.223, |
| "step": 16700 |
| }, |
| { |
| "epoch": 1.6193278343658957, |
| "grad_norm": 1.465790033340454, |
| "learning_rate": 4.334448160535117e-06, |
| "loss": 0.436, |
| "step": 16800 |
| }, |
| { |
| "epoch": 1.6193278343658957, |
| "eval_loss": 0.38513514399528503, |
| "eval_runtime": 40.227, |
| "eval_samples_per_second": 248.589, |
| "eval_steps_per_second": 31.074, |
| "step": 16800 |
| }, |
| { |
| "epoch": 1.6289666905228355, |
| "grad_norm": 1.401219129562378, |
| "learning_rate": 4.324414715719064e-06, |
| "loss": 0.4375, |
| "step": 16900 |
| }, |
| { |
| "epoch": 1.6289666905228355, |
| "eval_loss": 0.37909185886383057, |
| "eval_runtime": 40.1381, |
| "eval_samples_per_second": 249.14, |
| "eval_steps_per_second": 31.142, |
| "step": 16900 |
| }, |
| { |
| "epoch": 1.6386055466797755, |
| "grad_norm": 1.3634768724441528, |
| "learning_rate": 4.31438127090301e-06, |
| "loss": 0.4358, |
| "step": 17000 |
| }, |
| { |
| "epoch": 1.6386055466797755, |
| "eval_loss": 0.3808463513851166, |
| "eval_runtime": 40.0323, |
| "eval_samples_per_second": 249.798, |
| "eval_steps_per_second": 31.225, |
| "step": 17000 |
| }, |
| { |
| "epoch": 1.6482444028367154, |
| "grad_norm": 1.4873460531234741, |
| "learning_rate": 4.3043478260869565e-06, |
| "loss": 0.4321, |
| "step": 17100 |
| }, |
| { |
| "epoch": 1.6482444028367154, |
| "eval_loss": 0.3821844160556793, |
| "eval_runtime": 40.0357, |
| "eval_samples_per_second": 249.777, |
| "eval_steps_per_second": 31.222, |
| "step": 17100 |
| }, |
| { |
| "epoch": 1.6578832589936552, |
| "grad_norm": 1.323455572128296, |
| "learning_rate": 4.294314381270903e-06, |
| "loss": 0.431, |
| "step": 17200 |
| }, |
| { |
| "epoch": 1.6578832589936552, |
| "eval_loss": 0.37755951285362244, |
| "eval_runtime": 40.2365, |
| "eval_samples_per_second": 248.531, |
| "eval_steps_per_second": 31.066, |
| "step": 17200 |
| }, |
| { |
| "epoch": 1.6675221151505952, |
| "grad_norm": 1.3670837879180908, |
| "learning_rate": 4.284280936454849e-06, |
| "loss": 0.4302, |
| "step": 17300 |
| }, |
| { |
| "epoch": 1.6675221151505952, |
| "eval_loss": 0.37524840235710144, |
| "eval_runtime": 40.0419, |
| "eval_samples_per_second": 249.738, |
| "eval_steps_per_second": 31.217, |
| "step": 17300 |
| }, |
| { |
| "epoch": 1.677160971307535, |
| "grad_norm": 1.507103681564331, |
| "learning_rate": 4.274247491638796e-06, |
| "loss": 0.4288, |
| "step": 17400 |
| }, |
| { |
| "epoch": 1.677160971307535, |
| "eval_loss": 0.38275057077407837, |
| "eval_runtime": 40.0423, |
| "eval_samples_per_second": 249.736, |
| "eval_steps_per_second": 31.217, |
| "step": 17400 |
| }, |
| { |
| "epoch": 1.6867998274644749, |
| "grad_norm": 1.3555055856704712, |
| "learning_rate": 4.264214046822743e-06, |
| "loss": 0.4307, |
| "step": 17500 |
| }, |
| { |
| "epoch": 1.6867998274644749, |
| "eval_loss": 0.3769054710865021, |
| "eval_runtime": 40.045, |
| "eval_samples_per_second": 249.719, |
| "eval_steps_per_second": 31.215, |
| "step": 17500 |
| }, |
| { |
| "epoch": 1.6964386836214147, |
| "grad_norm": 1.2957983016967773, |
| "learning_rate": 4.254180602006689e-06, |
| "loss": 0.4287, |
| "step": 17600 |
| }, |
| { |
| "epoch": 1.6964386836214147, |
| "eval_loss": 0.3762720823287964, |
| "eval_runtime": 40.2206, |
| "eval_samples_per_second": 248.629, |
| "eval_steps_per_second": 31.079, |
| "step": 17600 |
| }, |
| { |
| "epoch": 1.7060775397783545, |
| "grad_norm": 1.4515742063522339, |
| "learning_rate": 4.244147157190635e-06, |
| "loss": 0.4248, |
| "step": 17700 |
| }, |
| { |
| "epoch": 1.7060775397783545, |
| "eval_loss": 0.37980878353118896, |
| "eval_runtime": 40.0419, |
| "eval_samples_per_second": 249.738, |
| "eval_steps_per_second": 31.217, |
| "step": 17700 |
| }, |
| { |
| "epoch": 1.7157163959352943, |
| "grad_norm": 1.3719956874847412, |
| "learning_rate": 4.234113712374582e-06, |
| "loss": 0.4237, |
| "step": 17800 |
| }, |
| { |
| "epoch": 1.7157163959352943, |
| "eval_loss": 0.3736819326877594, |
| "eval_runtime": 40.0386, |
| "eval_samples_per_second": 249.759, |
| "eval_steps_per_second": 31.22, |
| "step": 17800 |
| }, |
| { |
| "epoch": 1.7253552520922342, |
| "grad_norm": 1.2918155193328857, |
| "learning_rate": 4.224080267558528e-06, |
| "loss": 0.4234, |
| "step": 17900 |
| }, |
| { |
| "epoch": 1.7253552520922342, |
| "eval_loss": 0.37445351481437683, |
| "eval_runtime": 40.0158, |
| "eval_samples_per_second": 249.901, |
| "eval_steps_per_second": 31.238, |
| "step": 17900 |
| }, |
| { |
| "epoch": 1.734994108249174, |
| "grad_norm": 1.5635416507720947, |
| "learning_rate": 4.214046822742475e-06, |
| "loss": 0.4226, |
| "step": 18000 |
| }, |
| { |
| "epoch": 1.734994108249174, |
| "eval_loss": 0.37227481603622437, |
| "eval_runtime": 40.0091, |
| "eval_samples_per_second": 249.943, |
| "eval_steps_per_second": 31.243, |
| "step": 18000 |
| }, |
| { |
| "epoch": 1.7446329644061138, |
| "grad_norm": 1.3523764610290527, |
| "learning_rate": 4.2040133779264216e-06, |
| "loss": 0.4253, |
| "step": 18100 |
| }, |
| { |
| "epoch": 1.7446329644061138, |
| "eval_loss": 0.37538984417915344, |
| "eval_runtime": 40.1924, |
| "eval_samples_per_second": 248.803, |
| "eval_steps_per_second": 31.1, |
| "step": 18100 |
| }, |
| { |
| "epoch": 1.7542718205630536, |
| "grad_norm": 1.2665579319000244, |
| "learning_rate": 4.1939799331103675e-06, |
| "loss": 0.4228, |
| "step": 18200 |
| }, |
| { |
| "epoch": 1.7542718205630536, |
| "eval_loss": 0.3729080259799957, |
| "eval_runtime": 40.0153, |
| "eval_samples_per_second": 249.905, |
| "eval_steps_per_second": 31.238, |
| "step": 18200 |
| }, |
| { |
| "epoch": 1.7639106767199937, |
| "grad_norm": 1.2079461812973022, |
| "learning_rate": 4.183946488294314e-06, |
| "loss": 0.4184, |
| "step": 18300 |
| }, |
| { |
| "epoch": 1.7639106767199937, |
| "eval_loss": 0.37542304396629333, |
| "eval_runtime": 40.226, |
| "eval_samples_per_second": 248.595, |
| "eval_steps_per_second": 31.074, |
| "step": 18300 |
| }, |
| { |
| "epoch": 1.7735495328769335, |
| "grad_norm": 1.2203370332717896, |
| "learning_rate": 4.173913043478261e-06, |
| "loss": 0.4186, |
| "step": 18400 |
| }, |
| { |
| "epoch": 1.7735495328769335, |
| "eval_loss": 0.3722892701625824, |
| "eval_runtime": 40.0431, |
| "eval_samples_per_second": 249.731, |
| "eval_steps_per_second": 31.216, |
| "step": 18400 |
| }, |
| { |
| "epoch": 1.7831883890338733, |
| "grad_norm": 1.2480080127716064, |
| "learning_rate": 4.163879598662208e-06, |
| "loss": 0.4214, |
| "step": 18500 |
| }, |
| { |
| "epoch": 1.7831883890338733, |
| "eval_loss": 0.3761518597602844, |
| "eval_runtime": 40.0595, |
| "eval_samples_per_second": 249.629, |
| "eval_steps_per_second": 31.204, |
| "step": 18500 |
| }, |
| { |
| "epoch": 1.7928272451908132, |
| "grad_norm": 1.514190912246704, |
| "learning_rate": 4.153846153846154e-06, |
| "loss": 0.4153, |
| "step": 18600 |
| }, |
| { |
| "epoch": 1.7928272451908132, |
| "eval_loss": 0.3671943247318268, |
| "eval_runtime": 40.0447, |
| "eval_samples_per_second": 249.721, |
| "eval_steps_per_second": 31.215, |
| "step": 18600 |
| }, |
| { |
| "epoch": 1.8024661013477532, |
| "grad_norm": 1.4013168811798096, |
| "learning_rate": 4.1438127090301005e-06, |
| "loss": 0.4152, |
| "step": 18700 |
| }, |
| { |
| "epoch": 1.8024661013477532, |
| "eval_loss": 0.36363351345062256, |
| "eval_runtime": 40.2138, |
| "eval_samples_per_second": 248.671, |
| "eval_steps_per_second": 31.084, |
| "step": 18700 |
| }, |
| { |
| "epoch": 1.812104957504693, |
| "grad_norm": 1.3701732158660889, |
| "learning_rate": 4.133779264214047e-06, |
| "loss": 0.4139, |
| "step": 18800 |
| }, |
| { |
| "epoch": 1.812104957504693, |
| "eval_loss": 0.37101006507873535, |
| "eval_runtime": 40.0504, |
| "eval_samples_per_second": 249.685, |
| "eval_steps_per_second": 31.211, |
| "step": 18800 |
| }, |
| { |
| "epoch": 1.8217438136616328, |
| "grad_norm": 1.2950865030288696, |
| "learning_rate": 4.123745819397993e-06, |
| "loss": 0.4127, |
| "step": 18900 |
| }, |
| { |
| "epoch": 1.8217438136616328, |
| "eval_loss": 0.3611432611942291, |
| "eval_runtime": 39.9977, |
| "eval_samples_per_second": 250.014, |
| "eval_steps_per_second": 31.252, |
| "step": 18900 |
| }, |
| { |
| "epoch": 1.8313826698185727, |
| "grad_norm": 1.2140529155731201, |
| "learning_rate": 4.11371237458194e-06, |
| "loss": 0.4175, |
| "step": 19000 |
| }, |
| { |
| "epoch": 1.8313826698185727, |
| "eval_loss": 0.3688327372074127, |
| "eval_runtime": 39.9976, |
| "eval_samples_per_second": 250.015, |
| "eval_steps_per_second": 31.252, |
| "step": 19000 |
| }, |
| { |
| "epoch": 1.8410215259755125, |
| "grad_norm": 1.4030205011367798, |
| "learning_rate": 4.103678929765887e-06, |
| "loss": 0.4123, |
| "step": 19100 |
| }, |
| { |
| "epoch": 1.8410215259755125, |
| "eval_loss": 0.36343154311180115, |
| "eval_runtime": 40.2164, |
| "eval_samples_per_second": 248.655, |
| "eval_steps_per_second": 31.082, |
| "step": 19100 |
| }, |
| { |
| "epoch": 1.8506603821324523, |
| "grad_norm": 1.2921127080917358, |
| "learning_rate": 4.0936454849498326e-06, |
| "loss": 0.4119, |
| "step": 19200 |
| }, |
| { |
| "epoch": 1.8506603821324523, |
| "eval_loss": 0.36798274517059326, |
| "eval_runtime": 40.0209, |
| "eval_samples_per_second": 249.869, |
| "eval_steps_per_second": 31.234, |
| "step": 19200 |
| }, |
| { |
| "epoch": 1.8602992382893921, |
| "grad_norm": 1.2748223543167114, |
| "learning_rate": 4.083612040133779e-06, |
| "loss": 0.4143, |
| "step": 19300 |
| }, |
| { |
| "epoch": 1.8602992382893921, |
| "eval_loss": 0.36462557315826416, |
| "eval_runtime": 40.0258, |
| "eval_samples_per_second": 249.839, |
| "eval_steps_per_second": 31.23, |
| "step": 19300 |
| }, |
| { |
| "epoch": 1.869938094446332, |
| "grad_norm": 1.2742102146148682, |
| "learning_rate": 4.073578595317726e-06, |
| "loss": 0.4093, |
| "step": 19400 |
| }, |
| { |
| "epoch": 1.869938094446332, |
| "eval_loss": 0.36330220103263855, |
| "eval_runtime": 40.0305, |
| "eval_samples_per_second": 249.809, |
| "eval_steps_per_second": 31.226, |
| "step": 19400 |
| }, |
| { |
| "epoch": 1.8795769506032718, |
| "grad_norm": 1.3298977613449097, |
| "learning_rate": 4.063545150501672e-06, |
| "loss": 0.4094, |
| "step": 19500 |
| }, |
| { |
| "epoch": 1.8795769506032718, |
| "eval_loss": 0.36082392930984497, |
| "eval_runtime": 40.0284, |
| "eval_samples_per_second": 249.822, |
| "eval_steps_per_second": 31.228, |
| "step": 19500 |
| }, |
| { |
| "epoch": 1.8892158067602116, |
| "grad_norm": 1.4168245792388916, |
| "learning_rate": 4.053511705685619e-06, |
| "loss": 0.4081, |
| "step": 19600 |
| }, |
| { |
| "epoch": 1.8892158067602116, |
| "eval_loss": 0.3604665994644165, |
| "eval_runtime": 40.2162, |
| "eval_samples_per_second": 248.656, |
| "eval_steps_per_second": 31.082, |
| "step": 19600 |
| }, |
| { |
| "epoch": 1.8988546629171517, |
| "grad_norm": 1.3064316511154175, |
| "learning_rate": 4.0434782608695655e-06, |
| "loss": 0.4096, |
| "step": 19700 |
| }, |
| { |
| "epoch": 1.8988546629171517, |
| "eval_loss": 0.35935744643211365, |
| "eval_runtime": 40.019, |
| "eval_samples_per_second": 249.881, |
| "eval_steps_per_second": 31.235, |
| "step": 19700 |
| }, |
| { |
| "epoch": 1.9084935190740915, |
| "grad_norm": 1.380152702331543, |
| "learning_rate": 4.0334448160535115e-06, |
| "loss": 0.4097, |
| "step": 19800 |
| }, |
| { |
| "epoch": 1.9084935190740915, |
| "eval_loss": 0.36837923526763916, |
| "eval_runtime": 40.0097, |
| "eval_samples_per_second": 249.94, |
| "eval_steps_per_second": 31.242, |
| "step": 19800 |
| }, |
| { |
| "epoch": 1.9181323752310313, |
| "grad_norm": 1.3063116073608398, |
| "learning_rate": 4.023411371237458e-06, |
| "loss": 0.4066, |
| "step": 19900 |
| }, |
| { |
| "epoch": 1.9181323752310313, |
| "eval_loss": 0.3581005036830902, |
| "eval_runtime": 40.0395, |
| "eval_samples_per_second": 249.754, |
| "eval_steps_per_second": 31.219, |
| "step": 19900 |
| }, |
| { |
| "epoch": 1.9277712313879714, |
| "grad_norm": 1.276556134223938, |
| "learning_rate": 4.013377926421405e-06, |
| "loss": 0.4048, |
| "step": 20000 |
| }, |
| { |
| "epoch": 1.9277712313879714, |
| "eval_loss": 0.3602534234523773, |
| "eval_runtime": 40.0325, |
| "eval_samples_per_second": 249.797, |
| "eval_steps_per_second": 31.225, |
| "step": 20000 |
| }, |
| { |
| "epoch": 1.9374100875449112, |
| "grad_norm": 1.4124329090118408, |
| "learning_rate": 4.003344481605351e-06, |
| "loss": 0.4067, |
| "step": 20100 |
| }, |
| { |
| "epoch": 1.9374100875449112, |
| "eval_loss": 0.35985687375068665, |
| "eval_runtime": 40.2758, |
| "eval_samples_per_second": 248.288, |
| "eval_steps_per_second": 31.036, |
| "step": 20100 |
| }, |
| { |
| "epoch": 1.947048943701851, |
| "grad_norm": 1.3454973697662354, |
| "learning_rate": 3.993311036789298e-06, |
| "loss": 0.4051, |
| "step": 20200 |
| }, |
| { |
| "epoch": 1.947048943701851, |
| "eval_loss": 0.353756844997406, |
| "eval_runtime": 40.0628, |
| "eval_samples_per_second": 249.608, |
| "eval_steps_per_second": 31.201, |
| "step": 20200 |
| }, |
| { |
| "epoch": 1.9566877998587908, |
| "grad_norm": 1.326913595199585, |
| "learning_rate": 3.9832775919732444e-06, |
| "loss": 0.4017, |
| "step": 20300 |
| }, |
| { |
| "epoch": 1.9566877998587908, |
| "eval_loss": 0.35797813534736633, |
| "eval_runtime": 40.067, |
| "eval_samples_per_second": 249.582, |
| "eval_steps_per_second": 31.198, |
| "step": 20300 |
| }, |
| { |
| "epoch": 1.9663266560157306, |
| "grad_norm": 1.1353386640548706, |
| "learning_rate": 3.97324414715719e-06, |
| "loss": 0.4039, |
| "step": 20400 |
| }, |
| { |
| "epoch": 1.9663266560157306, |
| "eval_loss": 0.3506408631801605, |
| "eval_runtime": 40.0693, |
| "eval_samples_per_second": 249.568, |
| "eval_steps_per_second": 31.196, |
| "step": 20400 |
| }, |
| { |
| "epoch": 1.9759655121726705, |
| "grad_norm": 1.3176063299179077, |
| "learning_rate": 3.963210702341137e-06, |
| "loss": 0.4042, |
| "step": 20500 |
| }, |
| { |
| "epoch": 1.9759655121726705, |
| "eval_loss": 0.35446348786354065, |
| "eval_runtime": 40.2855, |
| "eval_samples_per_second": 248.228, |
| "eval_steps_per_second": 31.029, |
| "step": 20500 |
| }, |
| { |
| "epoch": 1.9856043683296103, |
| "grad_norm": 1.243143081665039, |
| "learning_rate": 3.953177257525084e-06, |
| "loss": 0.4, |
| "step": 20600 |
| }, |
| { |
| "epoch": 1.9856043683296103, |
| "eval_loss": 0.35905522108078003, |
| "eval_runtime": 40.1017, |
| "eval_samples_per_second": 249.366, |
| "eval_steps_per_second": 31.171, |
| "step": 20600 |
| }, |
| { |
| "epoch": 1.9952432244865501, |
| "grad_norm": 1.3850924968719482, |
| "learning_rate": 3.943143812709031e-06, |
| "loss": 0.4025, |
| "step": 20700 |
| }, |
| { |
| "epoch": 1.9952432244865501, |
| "eval_loss": 0.35461366176605225, |
| "eval_runtime": 40.0328, |
| "eval_samples_per_second": 249.795, |
| "eval_steps_per_second": 31.224, |
| "step": 20700 |
| }, |
| { |
| "epoch": 2.00488208064349, |
| "grad_norm": 1.2397725582122803, |
| "learning_rate": 3.9331103678929765e-06, |
| "loss": 0.3984, |
| "step": 20800 |
| }, |
| { |
| "epoch": 2.00488208064349, |
| "eval_loss": 0.35152626037597656, |
| "eval_runtime": 40.0186, |
| "eval_samples_per_second": 249.884, |
| "eval_steps_per_second": 31.235, |
| "step": 20800 |
| }, |
| { |
| "epoch": 2.0145209368004298, |
| "grad_norm": 1.2805670499801636, |
| "learning_rate": 3.923076923076923e-06, |
| "loss": 0.3973, |
| "step": 20900 |
| }, |
| { |
| "epoch": 2.0145209368004298, |
| "eval_loss": 0.3505365252494812, |
| "eval_runtime": 40.0175, |
| "eval_samples_per_second": 249.89, |
| "eval_steps_per_second": 31.236, |
| "step": 20900 |
| }, |
| { |
| "epoch": 2.0241597929573696, |
| "grad_norm": 1.4358912706375122, |
| "learning_rate": 3.91304347826087e-06, |
| "loss": 0.3955, |
| "step": 21000 |
| }, |
| { |
| "epoch": 2.0241597929573696, |
| "eval_loss": 0.3397263288497925, |
| "eval_runtime": 40.2153, |
| "eval_samples_per_second": 248.661, |
| "eval_steps_per_second": 31.083, |
| "step": 21000 |
| }, |
| { |
| "epoch": 2.0337986491143094, |
| "grad_norm": 1.312941312789917, |
| "learning_rate": 3.903010033444816e-06, |
| "loss": 0.397, |
| "step": 21100 |
| }, |
| { |
| "epoch": 2.0337986491143094, |
| "eval_loss": 0.35078564286231995, |
| "eval_runtime": 40.0071, |
| "eval_samples_per_second": 249.956, |
| "eval_steps_per_second": 31.244, |
| "step": 21100 |
| }, |
| { |
| "epoch": 2.0434375052712497, |
| "grad_norm": 1.678265929222107, |
| "learning_rate": 3.892976588628763e-06, |
| "loss": 0.3929, |
| "step": 21200 |
| }, |
| { |
| "epoch": 2.0434375052712497, |
| "eval_loss": 0.3554067313671112, |
| "eval_runtime": 40.0148, |
| "eval_samples_per_second": 249.907, |
| "eval_steps_per_second": 31.238, |
| "step": 21200 |
| }, |
| { |
| "epoch": 2.0530763614281895, |
| "grad_norm": 1.271547794342041, |
| "learning_rate": 3.8829431438127095e-06, |
| "loss": 0.3969, |
| "step": 21300 |
| }, |
| { |
| "epoch": 2.0530763614281895, |
| "eval_loss": 0.3503868877887726, |
| "eval_runtime": 40.0058, |
| "eval_samples_per_second": 249.964, |
| "eval_steps_per_second": 31.245, |
| "step": 21300 |
| }, |
| { |
| "epoch": 2.0627152175851293, |
| "grad_norm": 1.3128001689910889, |
| "learning_rate": 3.8729096989966554e-06, |
| "loss": 0.3928, |
| "step": 21400 |
| }, |
| { |
| "epoch": 2.0627152175851293, |
| "eval_loss": 0.35138940811157227, |
| "eval_runtime": 40.2205, |
| "eval_samples_per_second": 248.63, |
| "eval_steps_per_second": 31.079, |
| "step": 21400 |
| }, |
| { |
| "epoch": 2.072354073742069, |
| "grad_norm": 1.2826029062271118, |
| "learning_rate": 3.862876254180602e-06, |
| "loss": 0.3956, |
| "step": 21500 |
| }, |
| { |
| "epoch": 2.072354073742069, |
| "eval_loss": 0.34093034267425537, |
| "eval_runtime": 40.0778, |
| "eval_samples_per_second": 249.515, |
| "eval_steps_per_second": 31.189, |
| "step": 21500 |
| }, |
| { |
| "epoch": 2.081992929899009, |
| "grad_norm": 1.2576078176498413, |
| "learning_rate": 3.852842809364549e-06, |
| "loss": 0.3909, |
| "step": 21600 |
| }, |
| { |
| "epoch": 2.081992929899009, |
| "eval_loss": 0.3406901955604553, |
| "eval_runtime": 40.2132, |
| "eval_samples_per_second": 248.675, |
| "eval_steps_per_second": 31.084, |
| "step": 21600 |
| }, |
| { |
| "epoch": 2.091631786055949, |
| "grad_norm": 1.2353730201721191, |
| "learning_rate": 3.842809364548495e-06, |
| "loss": 0.3899, |
| "step": 21700 |
| }, |
| { |
| "epoch": 2.091631786055949, |
| "eval_loss": 0.3464890122413635, |
| "eval_runtime": 40.0356, |
| "eval_samples_per_second": 249.778, |
| "eval_steps_per_second": 31.222, |
| "step": 21700 |
| }, |
| { |
| "epoch": 2.1012706422128886, |
| "grad_norm": 1.2551077604293823, |
| "learning_rate": 3.832775919732442e-06, |
| "loss": 0.3885, |
| "step": 21800 |
| }, |
| { |
| "epoch": 2.1012706422128886, |
| "eval_loss": 0.3510824739933014, |
| "eval_runtime": 40.0097, |
| "eval_samples_per_second": 249.939, |
| "eval_steps_per_second": 31.242, |
| "step": 21800 |
| }, |
| { |
| "epoch": 2.1109094983698284, |
| "grad_norm": 1.2558820247650146, |
| "learning_rate": 3.822742474916388e-06, |
| "loss": 0.393, |
| "step": 21900 |
| }, |
| { |
| "epoch": 2.1109094983698284, |
| "eval_loss": 0.3473747968673706, |
| "eval_runtime": 40.0293, |
| "eval_samples_per_second": 249.817, |
| "eval_steps_per_second": 31.227, |
| "step": 21900 |
| }, |
| { |
| "epoch": 2.1205483545267683, |
| "grad_norm": 1.2809922695159912, |
| "learning_rate": 3.8127090301003347e-06, |
| "loss": 0.3943, |
| "step": 22000 |
| }, |
| { |
| "epoch": 2.1205483545267683, |
| "eval_loss": 0.3469405770301819, |
| "eval_runtime": 40.2871, |
| "eval_samples_per_second": 248.218, |
| "eval_steps_per_second": 31.027, |
| "step": 22000 |
| }, |
| { |
| "epoch": 2.130187210683708, |
| "grad_norm": 1.2375646829605103, |
| "learning_rate": 3.802675585284281e-06, |
| "loss": 0.3895, |
| "step": 22100 |
| }, |
| { |
| "epoch": 2.130187210683708, |
| "eval_loss": 0.34392210841178894, |
| "eval_runtime": 40.0647, |
| "eval_samples_per_second": 249.596, |
| "eval_steps_per_second": 31.2, |
| "step": 22100 |
| }, |
| { |
| "epoch": 2.139826066840648, |
| "grad_norm": 1.3119927644729614, |
| "learning_rate": 3.792642140468228e-06, |
| "loss": 0.3918, |
| "step": 22200 |
| }, |
| { |
| "epoch": 2.139826066840648, |
| "eval_loss": 0.34562018513679504, |
| "eval_runtime": 40.0596, |
| "eval_samples_per_second": 249.628, |
| "eval_steps_per_second": 31.204, |
| "step": 22200 |
| }, |
| { |
| "epoch": 2.1494649229975877, |
| "grad_norm": 1.2270475625991821, |
| "learning_rate": 3.782608695652174e-06, |
| "loss": 0.3882, |
| "step": 22300 |
| }, |
| { |
| "epoch": 2.1494649229975877, |
| "eval_loss": 0.3436911702156067, |
| "eval_runtime": 40.1832, |
| "eval_samples_per_second": 248.86, |
| "eval_steps_per_second": 31.108, |
| "step": 22300 |
| }, |
| { |
| "epoch": 2.1591037791545276, |
| "grad_norm": 1.2549434900283813, |
| "learning_rate": 3.7725752508361205e-06, |
| "loss": 0.3855, |
| "step": 22400 |
| }, |
| { |
| "epoch": 2.1591037791545276, |
| "eval_loss": 0.3425428569316864, |
| "eval_runtime": 40.2231, |
| "eval_samples_per_second": 248.613, |
| "eval_steps_per_second": 31.077, |
| "step": 22400 |
| }, |
| { |
| "epoch": 2.1687426353114674, |
| "grad_norm": 1.2413984537124634, |
| "learning_rate": 3.7625418060200673e-06, |
| "loss": 0.3855, |
| "step": 22500 |
| }, |
| { |
| "epoch": 2.1687426353114674, |
| "eval_loss": 0.34506529569625854, |
| "eval_runtime": 40.0554, |
| "eval_samples_per_second": 249.655, |
| "eval_steps_per_second": 31.207, |
| "step": 22500 |
| }, |
| { |
| "epoch": 2.1783814914684076, |
| "grad_norm": 1.4316176176071167, |
| "learning_rate": 3.7525083612040136e-06, |
| "loss": 0.3846, |
| "step": 22600 |
| }, |
| { |
| "epoch": 2.1783814914684076, |
| "eval_loss": 0.3439718186855316, |
| "eval_runtime": 40.0313, |
| "eval_samples_per_second": 249.805, |
| "eval_steps_per_second": 31.226, |
| "step": 22600 |
| }, |
| { |
| "epoch": 2.1880203476253475, |
| "grad_norm": 1.3857834339141846, |
| "learning_rate": 3.74247491638796e-06, |
| "loss": 0.3845, |
| "step": 22700 |
| }, |
| { |
| "epoch": 2.1880203476253475, |
| "eval_loss": 0.34433916211128235, |
| "eval_runtime": 40.0274, |
| "eval_samples_per_second": 249.829, |
| "eval_steps_per_second": 31.229, |
| "step": 22700 |
| }, |
| { |
| "epoch": 2.1976592037822873, |
| "grad_norm": 1.2255629301071167, |
| "learning_rate": 3.7324414715719067e-06, |
| "loss": 0.3847, |
| "step": 22800 |
| }, |
| { |
| "epoch": 2.1976592037822873, |
| "eval_loss": 0.343479722738266, |
| "eval_runtime": 40.0139, |
| "eval_samples_per_second": 249.913, |
| "eval_steps_per_second": 31.239, |
| "step": 22800 |
| }, |
| { |
| "epoch": 2.207298059939227, |
| "grad_norm": 1.1473214626312256, |
| "learning_rate": 3.722408026755853e-06, |
| "loss": 0.3827, |
| "step": 22900 |
| }, |
| { |
| "epoch": 2.207298059939227, |
| "eval_loss": 0.34567588567733765, |
| "eval_runtime": 40.2165, |
| "eval_samples_per_second": 248.654, |
| "eval_steps_per_second": 31.082, |
| "step": 22900 |
| }, |
| { |
| "epoch": 2.216936916096167, |
| "grad_norm": 1.4547693729400635, |
| "learning_rate": 3.7123745819398e-06, |
| "loss": 0.3815, |
| "step": 23000 |
| }, |
| { |
| "epoch": 2.216936916096167, |
| "eval_loss": 0.340980589389801, |
| "eval_runtime": 40.0285, |
| "eval_samples_per_second": 249.822, |
| "eval_steps_per_second": 31.228, |
| "step": 23000 |
| }, |
| { |
| "epoch": 2.2265757722531068, |
| "grad_norm": 1.3170710802078247, |
| "learning_rate": 3.702341137123746e-06, |
| "loss": 0.3853, |
| "step": 23100 |
| }, |
| { |
| "epoch": 2.2265757722531068, |
| "eval_loss": 0.3409317433834076, |
| "eval_runtime": 40.0248, |
| "eval_samples_per_second": 249.845, |
| "eval_steps_per_second": 31.231, |
| "step": 23100 |
| }, |
| { |
| "epoch": 2.2362146284100466, |
| "grad_norm": 1.328068733215332, |
| "learning_rate": 3.6923076923076925e-06, |
| "loss": 0.3823, |
| "step": 23200 |
| }, |
| { |
| "epoch": 2.2362146284100466, |
| "eval_loss": 0.33983665704727173, |
| "eval_runtime": 40.1406, |
| "eval_samples_per_second": 249.124, |
| "eval_steps_per_second": 31.141, |
| "step": 23200 |
| }, |
| { |
| "epoch": 2.2458534845669864, |
| "grad_norm": 1.3068214654922485, |
| "learning_rate": 3.6822742474916393e-06, |
| "loss": 0.3802, |
| "step": 23300 |
| }, |
| { |
| "epoch": 2.2458534845669864, |
| "eval_loss": 0.3398624658584595, |
| "eval_runtime": 40.0336, |
| "eval_samples_per_second": 249.79, |
| "eval_steps_per_second": 31.224, |
| "step": 23300 |
| }, |
| { |
| "epoch": 2.2554923407239262, |
| "grad_norm": 1.2860172986984253, |
| "learning_rate": 3.6722408026755856e-06, |
| "loss": 0.3852, |
| "step": 23400 |
| }, |
| { |
| "epoch": 2.2554923407239262, |
| "eval_loss": 0.3490051031112671, |
| "eval_runtime": 40.1888, |
| "eval_samples_per_second": 248.826, |
| "eval_steps_per_second": 31.103, |
| "step": 23400 |
| }, |
| { |
| "epoch": 2.265131196880866, |
| "grad_norm": 1.3662693500518799, |
| "learning_rate": 3.662207357859532e-06, |
| "loss": 0.3787, |
| "step": 23500 |
| }, |
| { |
| "epoch": 2.265131196880866, |
| "eval_loss": 0.33699285984039307, |
| "eval_runtime": 40.0257, |
| "eval_samples_per_second": 249.839, |
| "eval_steps_per_second": 31.23, |
| "step": 23500 |
| }, |
| { |
| "epoch": 2.274770053037806, |
| "grad_norm": 1.3295284509658813, |
| "learning_rate": 3.6521739130434787e-06, |
| "loss": 0.3802, |
| "step": 23600 |
| }, |
| { |
| "epoch": 2.274770053037806, |
| "eval_loss": 0.33331725001335144, |
| "eval_runtime": 40.0229, |
| "eval_samples_per_second": 249.857, |
| "eval_steps_per_second": 31.232, |
| "step": 23600 |
| }, |
| { |
| "epoch": 2.2844089091947457, |
| "grad_norm": 1.1981773376464844, |
| "learning_rate": 3.642140468227425e-06, |
| "loss": 0.3777, |
| "step": 23700 |
| }, |
| { |
| "epoch": 2.2844089091947457, |
| "eval_loss": 0.33419373631477356, |
| "eval_runtime": 40.0172, |
| "eval_samples_per_second": 249.893, |
| "eval_steps_per_second": 31.237, |
| "step": 23700 |
| }, |
| { |
| "epoch": 2.2940477653516855, |
| "grad_norm": 1.318207025527954, |
| "learning_rate": 3.6321070234113714e-06, |
| "loss": 0.374, |
| "step": 23800 |
| }, |
| { |
| "epoch": 2.2940477653516855, |
| "eval_loss": 0.3326634168624878, |
| "eval_runtime": 40.0247, |
| "eval_samples_per_second": 249.846, |
| "eval_steps_per_second": 31.231, |
| "step": 23800 |
| }, |
| { |
| "epoch": 2.303686621508626, |
| "grad_norm": 1.204872727394104, |
| "learning_rate": 3.622073578595318e-06, |
| "loss": 0.3774, |
| "step": 23900 |
| }, |
| { |
| "epoch": 2.303686621508626, |
| "eval_loss": 0.3303450047969818, |
| "eval_runtime": 40.1914, |
| "eval_samples_per_second": 248.809, |
| "eval_steps_per_second": 31.101, |
| "step": 23900 |
| }, |
| { |
| "epoch": 2.3133254776655656, |
| "grad_norm": 1.30888831615448, |
| "learning_rate": 3.6120401337792645e-06, |
| "loss": 0.3752, |
| "step": 24000 |
| }, |
| { |
| "epoch": 2.3133254776655656, |
| "eval_loss": 0.3322921395301819, |
| "eval_runtime": 40.0327, |
| "eval_samples_per_second": 249.796, |
| "eval_steps_per_second": 31.224, |
| "step": 24000 |
| }, |
| { |
| "epoch": 2.3229643338225054, |
| "grad_norm": 1.296487808227539, |
| "learning_rate": 3.6020066889632112e-06, |
| "loss": 0.3759, |
| "step": 24100 |
| }, |
| { |
| "epoch": 2.3229643338225054, |
| "eval_loss": 0.3311348259449005, |
| "eval_runtime": 40.1498, |
| "eval_samples_per_second": 249.067, |
| "eval_steps_per_second": 31.133, |
| "step": 24100 |
| }, |
| { |
| "epoch": 2.3326031899794453, |
| "grad_norm": 1.2083899974822998, |
| "learning_rate": 3.5919732441471576e-06, |
| "loss": 0.3744, |
| "step": 24200 |
| }, |
| { |
| "epoch": 2.3326031899794453, |
| "eval_loss": 0.3285525441169739, |
| "eval_runtime": 40.0431, |
| "eval_samples_per_second": 249.731, |
| "eval_steps_per_second": 31.216, |
| "step": 24200 |
| }, |
| { |
| "epoch": 2.342242046136385, |
| "grad_norm": 1.2163653373718262, |
| "learning_rate": 3.581939799331104e-06, |
| "loss": 0.3752, |
| "step": 24300 |
| }, |
| { |
| "epoch": 2.342242046136385, |
| "eval_loss": 0.3351224660873413, |
| "eval_runtime": 40.0491, |
| "eval_samples_per_second": 249.694, |
| "eval_steps_per_second": 31.212, |
| "step": 24300 |
| }, |
| { |
| "epoch": 2.351880902293325, |
| "grad_norm": 1.176680564880371, |
| "learning_rate": 3.5719063545150507e-06, |
| "loss": 0.3689, |
| "step": 24400 |
| }, |
| { |
| "epoch": 2.351880902293325, |
| "eval_loss": 0.3293921947479248, |
| "eval_runtime": 40.2376, |
| "eval_samples_per_second": 248.524, |
| "eval_steps_per_second": 31.065, |
| "step": 24400 |
| }, |
| { |
| "epoch": 2.3615197584502647, |
| "grad_norm": 1.2454367876052856, |
| "learning_rate": 3.561872909698997e-06, |
| "loss": 0.3706, |
| "step": 24500 |
| }, |
| { |
| "epoch": 2.3615197584502647, |
| "eval_loss": 0.3308497369289398, |
| "eval_runtime": 40.0489, |
| "eval_samples_per_second": 249.695, |
| "eval_steps_per_second": 31.212, |
| "step": 24500 |
| }, |
| { |
| "epoch": 2.3711586146072046, |
| "grad_norm": 1.4466218948364258, |
| "learning_rate": 3.5518394648829434e-06, |
| "loss": 0.3719, |
| "step": 24600 |
| }, |
| { |
| "epoch": 2.3711586146072046, |
| "eval_loss": 0.32894277572631836, |
| "eval_runtime": 40.0499, |
| "eval_samples_per_second": 249.688, |
| "eval_steps_per_second": 31.211, |
| "step": 24600 |
| }, |
| { |
| "epoch": 2.3807974707641444, |
| "grad_norm": 1.1407536268234253, |
| "learning_rate": 3.54180602006689e-06, |
| "loss": 0.3726, |
| "step": 24700 |
| }, |
| { |
| "epoch": 2.3807974707641444, |
| "eval_loss": 0.3315465450286865, |
| "eval_runtime": 40.0281, |
| "eval_samples_per_second": 249.825, |
| "eval_steps_per_second": 31.228, |
| "step": 24700 |
| }, |
| { |
| "epoch": 2.390436326921084, |
| "grad_norm": 1.2593247890472412, |
| "learning_rate": 3.5317725752508365e-06, |
| "loss": 0.3707, |
| "step": 24800 |
| }, |
| { |
| "epoch": 2.390436326921084, |
| "eval_loss": 0.32788795232772827, |
| "eval_runtime": 39.9985, |
| "eval_samples_per_second": 250.009, |
| "eval_steps_per_second": 31.251, |
| "step": 24800 |
| }, |
| { |
| "epoch": 2.400075183078024, |
| "grad_norm": 1.183972954750061, |
| "learning_rate": 3.521739130434783e-06, |
| "loss": 0.3725, |
| "step": 24900 |
| }, |
| { |
| "epoch": 2.400075183078024, |
| "eval_loss": 0.3257734477519989, |
| "eval_runtime": 40.195, |
| "eval_samples_per_second": 248.787, |
| "eval_steps_per_second": 31.098, |
| "step": 24900 |
| }, |
| { |
| "epoch": 2.409714039234964, |
| "grad_norm": 1.3162939548492432, |
| "learning_rate": 3.5117056856187296e-06, |
| "loss": 0.3726, |
| "step": 25000 |
| }, |
| { |
| "epoch": 2.409714039234964, |
| "eval_loss": 0.32726946473121643, |
| "eval_runtime": 40.1077, |
| "eval_samples_per_second": 249.329, |
| "eval_steps_per_second": 31.166, |
| "step": 25000 |
| }, |
| { |
| "epoch": 2.4193528953919037, |
| "grad_norm": 1.2255990505218506, |
| "learning_rate": 3.501672240802676e-06, |
| "loss": 0.3723, |
| "step": 25100 |
| }, |
| { |
| "epoch": 2.4193528953919037, |
| "eval_loss": 0.32688045501708984, |
| "eval_runtime": 40.0031, |
| "eval_samples_per_second": 249.98, |
| "eval_steps_per_second": 31.248, |
| "step": 25100 |
| }, |
| { |
| "epoch": 2.4289917515488435, |
| "grad_norm": 1.2405096292495728, |
| "learning_rate": 3.491638795986622e-06, |
| "loss": 0.3701, |
| "step": 25200 |
| }, |
| { |
| "epoch": 2.4289917515488435, |
| "eval_loss": 0.32833394408226013, |
| "eval_runtime": 40.0179, |
| "eval_samples_per_second": 249.888, |
| "eval_steps_per_second": 31.236, |
| "step": 25200 |
| }, |
| { |
| "epoch": 2.4386306077057833, |
| "grad_norm": 1.1878525018692017, |
| "learning_rate": 3.481605351170568e-06, |
| "loss": 0.371, |
| "step": 25300 |
| }, |
| { |
| "epoch": 2.4386306077057833, |
| "eval_loss": 0.3243899941444397, |
| "eval_runtime": 40.2224, |
| "eval_samples_per_second": 248.618, |
| "eval_steps_per_second": 31.077, |
| "step": 25300 |
| }, |
| { |
| "epoch": 2.4482694638627236, |
| "grad_norm": 1.160326600074768, |
| "learning_rate": 3.471571906354515e-06, |
| "loss": 0.3664, |
| "step": 25400 |
| }, |
| { |
| "epoch": 2.4482694638627236, |
| "eval_loss": 0.3271392583847046, |
| "eval_runtime": 40.0288, |
| "eval_samples_per_second": 249.82, |
| "eval_steps_per_second": 31.228, |
| "step": 25400 |
| }, |
| { |
| "epoch": 2.4579083200196634, |
| "grad_norm": 1.1720784902572632, |
| "learning_rate": 3.4615384615384613e-06, |
| "loss": 0.3703, |
| "step": 25500 |
| }, |
| { |
| "epoch": 2.4579083200196634, |
| "eval_loss": 0.3221685588359833, |
| "eval_runtime": 40.0382, |
| "eval_samples_per_second": 249.761, |
| "eval_steps_per_second": 31.22, |
| "step": 25500 |
| }, |
| { |
| "epoch": 2.4675471761766032, |
| "grad_norm": 1.195279836654663, |
| "learning_rate": 3.4515050167224076e-06, |
| "loss": 0.368, |
| "step": 25600 |
| }, |
| { |
| "epoch": 2.4675471761766032, |
| "eval_loss": 0.3238082528114319, |
| "eval_runtime": 40.0341, |
| "eval_samples_per_second": 249.787, |
| "eval_steps_per_second": 31.223, |
| "step": 25600 |
| }, |
| { |
| "epoch": 2.477186032333543, |
| "grad_norm": 1.1934449672698975, |
| "learning_rate": 3.4414715719063544e-06, |
| "loss": 0.3661, |
| "step": 25700 |
| }, |
| { |
| "epoch": 2.477186032333543, |
| "eval_loss": 0.3191760778427124, |
| "eval_runtime": 40.2569, |
| "eval_samples_per_second": 248.404, |
| "eval_steps_per_second": 31.051, |
| "step": 25700 |
| }, |
| { |
| "epoch": 2.486824888490483, |
| "grad_norm": 1.185477375984192, |
| "learning_rate": 3.4314381270903007e-06, |
| "loss": 0.365, |
| "step": 25800 |
| }, |
| { |
| "epoch": 2.486824888490483, |
| "eval_loss": 0.32491880655288696, |
| "eval_runtime": 40.0528, |
| "eval_samples_per_second": 249.67, |
| "eval_steps_per_second": 31.209, |
| "step": 25800 |
| }, |
| { |
| "epoch": 2.4964637446474227, |
| "grad_norm": 1.2137432098388672, |
| "learning_rate": 3.4214046822742475e-06, |
| "loss": 0.3641, |
| "step": 25900 |
| }, |
| { |
| "epoch": 2.4964637446474227, |
| "eval_loss": 0.3219539523124695, |
| "eval_runtime": 40.0329, |
| "eval_samples_per_second": 249.795, |
| "eval_steps_per_second": 31.224, |
| "step": 25900 |
| }, |
| { |
| "epoch": 2.5061026008043625, |
| "grad_norm": 1.2091009616851807, |
| "learning_rate": 3.411371237458194e-06, |
| "loss": 0.3675, |
| "step": 26000 |
| }, |
| { |
| "epoch": 2.5061026008043625, |
| "eval_loss": 0.32177218794822693, |
| "eval_runtime": 40.0916, |
| "eval_samples_per_second": 249.429, |
| "eval_steps_per_second": 31.179, |
| "step": 26000 |
| }, |
| { |
| "epoch": 2.5157414569613024, |
| "grad_norm": 1.2672072649002075, |
| "learning_rate": 3.40133779264214e-06, |
| "loss": 0.3662, |
| "step": 26100 |
| }, |
| { |
| "epoch": 2.5157414569613024, |
| "eval_loss": 0.32613876461982727, |
| "eval_runtime": 40.0426, |
| "eval_samples_per_second": 249.734, |
| "eval_steps_per_second": 31.217, |
| "step": 26100 |
| }, |
| { |
| "epoch": 2.525380313118242, |
| "grad_norm": 1.0901987552642822, |
| "learning_rate": 3.391304347826087e-06, |
| "loss": 0.3665, |
| "step": 26200 |
| }, |
| { |
| "epoch": 2.525380313118242, |
| "eval_loss": 0.3274555206298828, |
| "eval_runtime": 40.2412, |
| "eval_samples_per_second": 248.501, |
| "eval_steps_per_second": 31.063, |
| "step": 26200 |
| }, |
| { |
| "epoch": 2.535019169275182, |
| "grad_norm": 1.1770433187484741, |
| "learning_rate": 3.3812709030100333e-06, |
| "loss": 0.3643, |
| "step": 26300 |
| }, |
| { |
| "epoch": 2.535019169275182, |
| "eval_loss": 0.32851478457450867, |
| "eval_runtime": 40.0516, |
| "eval_samples_per_second": 249.678, |
| "eval_steps_per_second": 31.21, |
| "step": 26300 |
| }, |
| { |
| "epoch": 2.544658025432122, |
| "grad_norm": 1.2189823389053345, |
| "learning_rate": 3.3712374581939796e-06, |
| "loss": 0.3603, |
| "step": 26400 |
| }, |
| { |
| "epoch": 2.544658025432122, |
| "eval_loss": 0.3176610767841339, |
| "eval_runtime": 40.0452, |
| "eval_samples_per_second": 249.718, |
| "eval_steps_per_second": 31.215, |
| "step": 26400 |
| }, |
| { |
| "epoch": 2.554296881589062, |
| "grad_norm": 1.1684179306030273, |
| "learning_rate": 3.3612040133779264e-06, |
| "loss": 0.3639, |
| "step": 26500 |
| }, |
| { |
| "epoch": 2.554296881589062, |
| "eval_loss": 0.3156176805496216, |
| "eval_runtime": 40.0435, |
| "eval_samples_per_second": 249.728, |
| "eval_steps_per_second": 31.216, |
| "step": 26500 |
| }, |
| { |
| "epoch": 2.563935737746002, |
| "grad_norm": 1.2573341131210327, |
| "learning_rate": 3.3511705685618727e-06, |
| "loss": 0.3586, |
| "step": 26600 |
| }, |
| { |
| "epoch": 2.563935737746002, |
| "eval_loss": 0.32222485542297363, |
| "eval_runtime": 40.199, |
| "eval_samples_per_second": 248.762, |
| "eval_steps_per_second": 31.095, |
| "step": 26600 |
| }, |
| { |
| "epoch": 2.5735745939029417, |
| "grad_norm": 1.2410520315170288, |
| "learning_rate": 3.3411371237458195e-06, |
| "loss": 0.3626, |
| "step": 26700 |
| }, |
| { |
| "epoch": 2.5735745939029417, |
| "eval_loss": 0.31266114115715027, |
| "eval_runtime": 40.0104, |
| "eval_samples_per_second": 249.935, |
| "eval_steps_per_second": 31.242, |
| "step": 26700 |
| }, |
| { |
| "epoch": 2.5832134500598816, |
| "grad_norm": 1.2394918203353882, |
| "learning_rate": 3.331103678929766e-06, |
| "loss": 0.3606, |
| "step": 26800 |
| }, |
| { |
| "epoch": 2.5832134500598816, |
| "eval_loss": 0.32011231780052185, |
| "eval_runtime": 40.1084, |
| "eval_samples_per_second": 249.324, |
| "eval_steps_per_second": 31.166, |
| "step": 26800 |
| }, |
| { |
| "epoch": 2.5928523062168214, |
| "grad_norm": 1.2512286901474, |
| "learning_rate": 3.321070234113712e-06, |
| "loss": 0.3614, |
| "step": 26900 |
| }, |
| { |
| "epoch": 2.5928523062168214, |
| "eval_loss": 0.31560882925987244, |
| "eval_runtime": 40.027, |
| "eval_samples_per_second": 249.831, |
| "eval_steps_per_second": 31.229, |
| "step": 26900 |
| }, |
| { |
| "epoch": 2.602491162373761, |
| "grad_norm": 1.3661274909973145, |
| "learning_rate": 3.311036789297659e-06, |
| "loss": 0.3601, |
| "step": 27000 |
| }, |
| { |
| "epoch": 2.602491162373761, |
| "eval_loss": 0.3139115869998932, |
| "eval_runtime": 40.2328, |
| "eval_samples_per_second": 248.554, |
| "eval_steps_per_second": 31.069, |
| "step": 27000 |
| }, |
| { |
| "epoch": 2.612130018530701, |
| "grad_norm": 1.293144702911377, |
| "learning_rate": 3.3010033444816052e-06, |
| "loss": 0.3602, |
| "step": 27100 |
| }, |
| { |
| "epoch": 2.612130018530701, |
| "eval_loss": 0.31472310423851013, |
| "eval_runtime": 40.0369, |
| "eval_samples_per_second": 249.77, |
| "eval_steps_per_second": 31.221, |
| "step": 27100 |
| }, |
| { |
| "epoch": 2.621768874687641, |
| "grad_norm": 1.1856138706207275, |
| "learning_rate": 3.2909698996655516e-06, |
| "loss": 0.3593, |
| "step": 27200 |
| }, |
| { |
| "epoch": 2.621768874687641, |
| "eval_loss": 0.3115543723106384, |
| "eval_runtime": 40.2262, |
| "eval_samples_per_second": 248.594, |
| "eval_steps_per_second": 31.074, |
| "step": 27200 |
| }, |
| { |
| "epoch": 2.6314077308445807, |
| "grad_norm": 1.2642232179641724, |
| "learning_rate": 3.2809364548494983e-06, |
| "loss": 0.3577, |
| "step": 27300 |
| }, |
| { |
| "epoch": 2.6314077308445807, |
| "eval_loss": 0.31309688091278076, |
| "eval_runtime": 40.0885, |
| "eval_samples_per_second": 249.448, |
| "eval_steps_per_second": 31.181, |
| "step": 27300 |
| }, |
| { |
| "epoch": 2.6410465870015205, |
| "grad_norm": 1.2292590141296387, |
| "learning_rate": 3.2709030100334447e-06, |
| "loss": 0.357, |
| "step": 27400 |
| }, |
| { |
| "epoch": 2.6410465870015205, |
| "eval_loss": 0.3183058202266693, |
| "eval_runtime": 40.0953, |
| "eval_samples_per_second": 249.406, |
| "eval_steps_per_second": 31.176, |
| "step": 27400 |
| }, |
| { |
| "epoch": 2.6506854431584603, |
| "grad_norm": 1.255295753479004, |
| "learning_rate": 3.260869565217391e-06, |
| "loss": 0.3552, |
| "step": 27500 |
| }, |
| { |
| "epoch": 2.6506854431584603, |
| "eval_loss": 0.3173259198665619, |
| "eval_runtime": 40.1197, |
| "eval_samples_per_second": 249.254, |
| "eval_steps_per_second": 31.157, |
| "step": 27500 |
| }, |
| { |
| "epoch": 2.6603242993154, |
| "grad_norm": 1.2543479204177856, |
| "learning_rate": 3.2508361204013378e-06, |
| "loss": 0.3562, |
| "step": 27600 |
| }, |
| { |
| "epoch": 2.6603242993154, |
| "eval_loss": 0.31650179624557495, |
| "eval_runtime": 40.2725, |
| "eval_samples_per_second": 248.308, |
| "eval_steps_per_second": 31.039, |
| "step": 27600 |
| }, |
| { |
| "epoch": 2.66996315547234, |
| "grad_norm": 1.1937369108200073, |
| "learning_rate": 3.240802675585284e-06, |
| "loss": 0.3548, |
| "step": 27700 |
| }, |
| { |
| "epoch": 2.66996315547234, |
| "eval_loss": 0.3191297948360443, |
| "eval_runtime": 40.0849, |
| "eval_samples_per_second": 249.47, |
| "eval_steps_per_second": 31.184, |
| "step": 27700 |
| }, |
| { |
| "epoch": 2.67960201162928, |
| "grad_norm": 1.1102817058563232, |
| "learning_rate": 3.230769230769231e-06, |
| "loss": 0.3558, |
| "step": 27800 |
| }, |
| { |
| "epoch": 2.67960201162928, |
| "eval_loss": 0.31787580251693726, |
| "eval_runtime": 40.045, |
| "eval_samples_per_second": 249.719, |
| "eval_steps_per_second": 31.215, |
| "step": 27800 |
| }, |
| { |
| "epoch": 2.6892408677862196, |
| "grad_norm": 1.1875417232513428, |
| "learning_rate": 3.2207357859531772e-06, |
| "loss": 0.3561, |
| "step": 27900 |
| }, |
| { |
| "epoch": 2.6892408677862196, |
| "eval_loss": 0.3153333067893982, |
| "eval_runtime": 40.0433, |
| "eval_samples_per_second": 249.73, |
| "eval_steps_per_second": 31.216, |
| "step": 27900 |
| }, |
| { |
| "epoch": 2.6988797239431594, |
| "grad_norm": 1.203171968460083, |
| "learning_rate": 3.2107023411371236e-06, |
| "loss": 0.3557, |
| "step": 28000 |
| }, |
| { |
| "epoch": 2.6988797239431594, |
| "eval_loss": 0.3114122450351715, |
| "eval_runtime": 40.207, |
| "eval_samples_per_second": 248.713, |
| "eval_steps_per_second": 31.089, |
| "step": 28000 |
| }, |
| { |
| "epoch": 2.7085185801000993, |
| "grad_norm": 1.1906991004943848, |
| "learning_rate": 3.2006688963210703e-06, |
| "loss": 0.3507, |
| "step": 28100 |
| }, |
| { |
| "epoch": 2.7085185801000993, |
| "eval_loss": 0.31228846311569214, |
| "eval_runtime": 40.2407, |
| "eval_samples_per_second": 248.504, |
| "eval_steps_per_second": 31.063, |
| "step": 28100 |
| }, |
| { |
| "epoch": 2.7181574362570395, |
| "grad_norm": 1.0473979711532593, |
| "learning_rate": 3.1906354515050167e-06, |
| "loss": 0.3547, |
| "step": 28200 |
| }, |
| { |
| "epoch": 2.7181574362570395, |
| "eval_loss": 0.31403276324272156, |
| "eval_runtime": 40.052, |
| "eval_samples_per_second": 249.675, |
| "eval_steps_per_second": 31.209, |
| "step": 28200 |
| }, |
| { |
| "epoch": 2.7277962924139794, |
| "grad_norm": 1.3040709495544434, |
| "learning_rate": 3.180602006688963e-06, |
| "loss": 0.3549, |
| "step": 28300 |
| }, |
| { |
| "epoch": 2.7277962924139794, |
| "eval_loss": 0.31078001856803894, |
| "eval_runtime": 40.0565, |
| "eval_samples_per_second": 249.648, |
| "eval_steps_per_second": 31.206, |
| "step": 28300 |
| }, |
| { |
| "epoch": 2.737435148570919, |
| "grad_norm": 1.1051684617996216, |
| "learning_rate": 3.1705685618729098e-06, |
| "loss": 0.3527, |
| "step": 28400 |
| }, |
| { |
| "epoch": 2.737435148570919, |
| "eval_loss": 0.311916321516037, |
| "eval_runtime": 40.0592, |
| "eval_samples_per_second": 249.63, |
| "eval_steps_per_second": 31.204, |
| "step": 28400 |
| }, |
| { |
| "epoch": 2.747074004727859, |
| "grad_norm": 1.2204347848892212, |
| "learning_rate": 3.160535117056856e-06, |
| "loss": 0.3529, |
| "step": 28500 |
| }, |
| { |
| "epoch": 2.747074004727859, |
| "eval_loss": 0.31446966528892517, |
| "eval_runtime": 40.0552, |
| "eval_samples_per_second": 249.655, |
| "eval_steps_per_second": 31.207, |
| "step": 28500 |
| }, |
| { |
| "epoch": 2.756712860884799, |
| "grad_norm": 1.1805849075317383, |
| "learning_rate": 3.1505016722408024e-06, |
| "loss": 0.3514, |
| "step": 28600 |
| }, |
| { |
| "epoch": 2.756712860884799, |
| "eval_loss": 0.305553138256073, |
| "eval_runtime": 40.2179, |
| "eval_samples_per_second": 248.645, |
| "eval_steps_per_second": 31.081, |
| "step": 28600 |
| }, |
| { |
| "epoch": 2.7663517170417387, |
| "grad_norm": 1.2930576801300049, |
| "learning_rate": 3.140468227424749e-06, |
| "loss": 0.3508, |
| "step": 28700 |
| }, |
| { |
| "epoch": 2.7663517170417387, |
| "eval_loss": 0.31795772910118103, |
| "eval_runtime": 40.0252, |
| "eval_samples_per_second": 249.842, |
| "eval_steps_per_second": 31.23, |
| "step": 28700 |
| }, |
| { |
| "epoch": 2.7759905731986785, |
| "grad_norm": 1.2266051769256592, |
| "learning_rate": 3.1304347826086955e-06, |
| "loss": 0.3513, |
| "step": 28800 |
| }, |
| { |
| "epoch": 2.7759905731986785, |
| "eval_loss": 0.31347090005874634, |
| "eval_runtime": 40.0594, |
| "eval_samples_per_second": 249.629, |
| "eval_steps_per_second": 31.204, |
| "step": 28800 |
| }, |
| { |
| "epoch": 2.7856294293556183, |
| "grad_norm": 1.1644967794418335, |
| "learning_rate": 3.1204013377926423e-06, |
| "loss": 0.3477, |
| "step": 28900 |
| }, |
| { |
| "epoch": 2.7856294293556183, |
| "eval_loss": 0.3088667392730713, |
| "eval_runtime": 40.0626, |
| "eval_samples_per_second": 249.609, |
| "eval_steps_per_second": 31.201, |
| "step": 28900 |
| }, |
| { |
| "epoch": 2.795268285512558, |
| "grad_norm": 1.1567316055297852, |
| "learning_rate": 3.1103678929765886e-06, |
| "loss": 0.3512, |
| "step": 29000 |
| }, |
| { |
| "epoch": 2.795268285512558, |
| "eval_loss": 0.30603039264678955, |
| "eval_runtime": 40.0801, |
| "eval_samples_per_second": 249.5, |
| "eval_steps_per_second": 31.188, |
| "step": 29000 |
| }, |
| { |
| "epoch": 2.804907141669498, |
| "grad_norm": 1.324166178703308, |
| "learning_rate": 3.100334448160535e-06, |
| "loss": 0.3508, |
| "step": 29100 |
| }, |
| { |
| "epoch": 2.804907141669498, |
| "eval_loss": 0.3052707612514496, |
| "eval_runtime": 40.1749, |
| "eval_samples_per_second": 248.912, |
| "eval_steps_per_second": 31.114, |
| "step": 29100 |
| }, |
| { |
| "epoch": 2.8145459978264378, |
| "grad_norm": 1.2231703996658325, |
| "learning_rate": 3.0903010033444818e-06, |
| "loss": 0.3491, |
| "step": 29200 |
| }, |
| { |
| "epoch": 2.8145459978264378, |
| "eval_loss": 0.3067169487476349, |
| "eval_runtime": 40.0129, |
| "eval_samples_per_second": 249.92, |
| "eval_steps_per_second": 31.24, |
| "step": 29200 |
| }, |
| { |
| "epoch": 2.824184853983378, |
| "grad_norm": 1.203022837638855, |
| "learning_rate": 3.080267558528428e-06, |
| "loss": 0.3476, |
| "step": 29300 |
| }, |
| { |
| "epoch": 2.824184853983378, |
| "eval_loss": 0.31380364298820496, |
| "eval_runtime": 39.9974, |
| "eval_samples_per_second": 250.016, |
| "eval_steps_per_second": 31.252, |
| "step": 29300 |
| }, |
| { |
| "epoch": 2.833823710140318, |
| "grad_norm": 1.1473413705825806, |
| "learning_rate": 3.0702341137123744e-06, |
| "loss": 0.3497, |
| "step": 29400 |
| }, |
| { |
| "epoch": 2.833823710140318, |
| "eval_loss": 0.30883774161338806, |
| "eval_runtime": 40.0159, |
| "eval_samples_per_second": 249.901, |
| "eval_steps_per_second": 31.238, |
| "step": 29400 |
| }, |
| { |
| "epoch": 2.8434625662972577, |
| "grad_norm": 1.0918190479278564, |
| "learning_rate": 3.060200668896321e-06, |
| "loss": 0.3505, |
| "step": 29500 |
| }, |
| { |
| "epoch": 2.8434625662972577, |
| "eval_loss": 0.3049847185611725, |
| "eval_runtime": 40.2816, |
| "eval_samples_per_second": 248.252, |
| "eval_steps_per_second": 31.032, |
| "step": 29500 |
| }, |
| { |
| "epoch": 2.8531014224541975, |
| "grad_norm": 1.121775507926941, |
| "learning_rate": 3.0501672240802675e-06, |
| "loss": 0.348, |
| "step": 29600 |
| }, |
| { |
| "epoch": 2.8531014224541975, |
| "eval_loss": 0.304568886756897, |
| "eval_runtime": 40.1199, |
| "eval_samples_per_second": 249.253, |
| "eval_steps_per_second": 31.157, |
| "step": 29600 |
| }, |
| { |
| "epoch": 2.8627402786111373, |
| "grad_norm": 1.1627501249313354, |
| "learning_rate": 3.0401337792642143e-06, |
| "loss": 0.3466, |
| "step": 29700 |
| }, |
| { |
| "epoch": 2.8627402786111373, |
| "eval_loss": 0.3042547404766083, |
| "eval_runtime": 40.2234, |
| "eval_samples_per_second": 248.611, |
| "eval_steps_per_second": 31.076, |
| "step": 29700 |
| }, |
| { |
| "epoch": 2.872379134768077, |
| "grad_norm": 1.2211036682128906, |
| "learning_rate": 3.0301003344481606e-06, |
| "loss": 0.3481, |
| "step": 29800 |
| }, |
| { |
| "epoch": 2.872379134768077, |
| "eval_loss": 0.3090800642967224, |
| "eval_runtime": 40.0403, |
| "eval_samples_per_second": 249.748, |
| "eval_steps_per_second": 31.219, |
| "step": 29800 |
| }, |
| { |
| "epoch": 2.882017990925017, |
| "grad_norm": 1.2113616466522217, |
| "learning_rate": 3.020066889632107e-06, |
| "loss": 0.3473, |
| "step": 29900 |
| }, |
| { |
| "epoch": 2.882017990925017, |
| "eval_loss": 0.30445191264152527, |
| "eval_runtime": 40.0405, |
| "eval_samples_per_second": 249.747, |
| "eval_steps_per_second": 31.218, |
| "step": 29900 |
| }, |
| { |
| "epoch": 2.891656847081957, |
| "grad_norm": 1.2371474504470825, |
| "learning_rate": 3.0100334448160537e-06, |
| "loss": 0.3465, |
| "step": 30000 |
| }, |
| { |
| "epoch": 2.891656847081957, |
| "eval_loss": 0.30408233404159546, |
| "eval_runtime": 40.0474, |
| "eval_samples_per_second": 249.704, |
| "eval_steps_per_second": 31.213, |
| "step": 30000 |
| }, |
| { |
| "epoch": 2.9012957032388966, |
| "grad_norm": 1.1639801263809204, |
| "learning_rate": 3e-06, |
| "loss": 0.3454, |
| "step": 30100 |
| }, |
| { |
| "epoch": 2.9012957032388966, |
| "eval_loss": 0.3157171905040741, |
| "eval_runtime": 40.2277, |
| "eval_samples_per_second": 248.585, |
| "eval_steps_per_second": 31.073, |
| "step": 30100 |
| }, |
| { |
| "epoch": 2.9109345593958365, |
| "grad_norm": 1.1769859790802002, |
| "learning_rate": 2.9899665551839464e-06, |
| "loss": 0.344, |
| "step": 30200 |
| }, |
| { |
| "epoch": 2.9109345593958365, |
| "eval_loss": 0.3092867136001587, |
| "eval_runtime": 40.029, |
| "eval_samples_per_second": 249.819, |
| "eval_steps_per_second": 31.227, |
| "step": 30200 |
| }, |
| { |
| "epoch": 2.9205734155527763, |
| "grad_norm": 1.1697938442230225, |
| "learning_rate": 2.979933110367893e-06, |
| "loss": 0.3455, |
| "step": 30300 |
| }, |
| { |
| "epoch": 2.9205734155527763, |
| "eval_loss": 0.30681154131889343, |
| "eval_runtime": 40.0152, |
| "eval_samples_per_second": 249.905, |
| "eval_steps_per_second": 31.238, |
| "step": 30300 |
| }, |
| { |
| "epoch": 2.930212271709716, |
| "grad_norm": 1.064261555671692, |
| "learning_rate": 2.9698996655518395e-06, |
| "loss": 0.3436, |
| "step": 30400 |
| }, |
| { |
| "epoch": 2.930212271709716, |
| "eval_loss": 0.30829107761383057, |
| "eval_runtime": 40.1179, |
| "eval_samples_per_second": 249.265, |
| "eval_steps_per_second": 31.158, |
| "step": 30400 |
| }, |
| { |
| "epoch": 2.939851127866656, |
| "grad_norm": 1.2219582796096802, |
| "learning_rate": 2.959866220735786e-06, |
| "loss": 0.3432, |
| "step": 30500 |
| }, |
| { |
| "epoch": 2.939851127866656, |
| "eval_loss": 0.30284854769706726, |
| "eval_runtime": 40.2058, |
| "eval_samples_per_second": 248.72, |
| "eval_steps_per_second": 31.09, |
| "step": 30500 |
| }, |
| { |
| "epoch": 2.9494899840235957, |
| "grad_norm": 1.1802637577056885, |
| "learning_rate": 2.9498327759197326e-06, |
| "loss": 0.3479, |
| "step": 30600 |
| }, |
| { |
| "epoch": 2.9494899840235957, |
| "eval_loss": 0.30530956387519836, |
| "eval_runtime": 40.0579, |
| "eval_samples_per_second": 249.638, |
| "eval_steps_per_second": 31.205, |
| "step": 30600 |
| }, |
| { |
| "epoch": 2.9591288401805356, |
| "grad_norm": 1.1790562868118286, |
| "learning_rate": 2.939799331103679e-06, |
| "loss": 0.3443, |
| "step": 30700 |
| }, |
| { |
| "epoch": 2.9591288401805356, |
| "eval_loss": 0.3033536672592163, |
| "eval_runtime": 40.0211, |
| "eval_samples_per_second": 249.868, |
| "eval_steps_per_second": 31.234, |
| "step": 30700 |
| }, |
| { |
| "epoch": 2.9687676963374754, |
| "grad_norm": 1.3000251054763794, |
| "learning_rate": 2.9297658862876257e-06, |
| "loss": 0.3445, |
| "step": 30800 |
| }, |
| { |
| "epoch": 2.9687676963374754, |
| "eval_loss": 0.30252954363822937, |
| "eval_runtime": 40.0123, |
| "eval_samples_per_second": 249.923, |
| "eval_steps_per_second": 31.24, |
| "step": 30800 |
| }, |
| { |
| "epoch": 2.978406552494415, |
| "grad_norm": 1.2018824815750122, |
| "learning_rate": 2.919732441471572e-06, |
| "loss": 0.3419, |
| "step": 30900 |
| }, |
| { |
| "epoch": 2.978406552494415, |
| "eval_loss": 0.30083614587783813, |
| "eval_runtime": 40.0481, |
| "eval_samples_per_second": 249.7, |
| "eval_steps_per_second": 31.212, |
| "step": 30900 |
| }, |
| { |
| "epoch": 2.9880454086513555, |
| "grad_norm": 1.2023593187332153, |
| "learning_rate": 2.9096989966555184e-06, |
| "loss": 0.3434, |
| "step": 31000 |
| }, |
| { |
| "epoch": 2.9880454086513555, |
| "eval_loss": 0.30430230498313904, |
| "eval_runtime": 40.2329, |
| "eval_samples_per_second": 248.553, |
| "eval_steps_per_second": 31.069, |
| "step": 31000 |
| }, |
| { |
| "epoch": 2.9976842648082953, |
| "grad_norm": 1.164267659187317, |
| "learning_rate": 2.899665551839465e-06, |
| "loss": 0.3416, |
| "step": 31100 |
| }, |
| { |
| "epoch": 2.9976842648082953, |
| "eval_loss": 0.301971435546875, |
| "eval_runtime": 40.0594, |
| "eval_samples_per_second": 249.629, |
| "eval_steps_per_second": 31.204, |
| "step": 31100 |
| }, |
| { |
| "epoch": 3.007323120965235, |
| "grad_norm": 1.1940221786499023, |
| "learning_rate": 2.8896321070234115e-06, |
| "loss": 0.3403, |
| "step": 31200 |
| }, |
| { |
| "epoch": 3.007323120965235, |
| "eval_loss": 0.2989708483219147, |
| "eval_runtime": 40.053, |
| "eval_samples_per_second": 249.669, |
| "eval_steps_per_second": 31.209, |
| "step": 31200 |
| }, |
| { |
| "epoch": 3.016961977122175, |
| "grad_norm": 1.1385316848754883, |
| "learning_rate": 2.879598662207358e-06, |
| "loss": 0.3412, |
| "step": 31300 |
| }, |
| { |
| "epoch": 3.016961977122175, |
| "eval_loss": 0.3076593279838562, |
| "eval_runtime": 40.0512, |
| "eval_samples_per_second": 249.68, |
| "eval_steps_per_second": 31.21, |
| "step": 31300 |
| }, |
| { |
| "epoch": 3.0266008332791148, |
| "grad_norm": 1.1974833011627197, |
| "learning_rate": 2.8695652173913046e-06, |
| "loss": 0.3404, |
| "step": 31400 |
| }, |
| { |
| "epoch": 3.0266008332791148, |
| "eval_loss": 0.29814204573631287, |
| "eval_runtime": 40.2335, |
| "eval_samples_per_second": 248.549, |
| "eval_steps_per_second": 31.069, |
| "step": 31400 |
| }, |
| { |
| "epoch": 3.0362396894360546, |
| "grad_norm": 1.170362114906311, |
| "learning_rate": 2.859531772575251e-06, |
| "loss": 0.3393, |
| "step": 31500 |
| }, |
| { |
| "epoch": 3.0362396894360546, |
| "eval_loss": 0.29789820313453674, |
| "eval_runtime": 40.0606, |
| "eval_samples_per_second": 249.622, |
| "eval_steps_per_second": 31.203, |
| "step": 31500 |
| }, |
| { |
| "epoch": 3.0458785455929944, |
| "grad_norm": 1.2427632808685303, |
| "learning_rate": 2.8494983277591977e-06, |
| "loss": 0.3425, |
| "step": 31600 |
| }, |
| { |
| "epoch": 3.0458785455929944, |
| "eval_loss": 0.30032023787498474, |
| "eval_runtime": 40.0293, |
| "eval_samples_per_second": 249.817, |
| "eval_steps_per_second": 31.227, |
| "step": 31600 |
| }, |
| { |
| "epoch": 3.0555174017499342, |
| "grad_norm": 1.2110202312469482, |
| "learning_rate": 2.839464882943144e-06, |
| "loss": 0.3411, |
| "step": 31700 |
| }, |
| { |
| "epoch": 3.0555174017499342, |
| "eval_loss": 0.3062838912010193, |
| "eval_runtime": 40.0201, |
| "eval_samples_per_second": 249.875, |
| "eval_steps_per_second": 31.234, |
| "step": 31700 |
| }, |
| { |
| "epoch": 3.065156257906874, |
| "grad_norm": 1.2322800159454346, |
| "learning_rate": 2.8294314381270904e-06, |
| "loss": 0.3378, |
| "step": 31800 |
| }, |
| { |
| "epoch": 3.065156257906874, |
| "eval_loss": 0.2991129755973816, |
| "eval_runtime": 40.0041, |
| "eval_samples_per_second": 249.974, |
| "eval_steps_per_second": 31.247, |
| "step": 31800 |
| }, |
| { |
| "epoch": 3.074795114063814, |
| "grad_norm": 1.1711872816085815, |
| "learning_rate": 2.819397993311037e-06, |
| "loss": 0.3386, |
| "step": 31900 |
| }, |
| { |
| "epoch": 3.074795114063814, |
| "eval_loss": 0.2999269962310791, |
| "eval_runtime": 40.2184, |
| "eval_samples_per_second": 248.642, |
| "eval_steps_per_second": 31.08, |
| "step": 31900 |
| }, |
| { |
| "epoch": 3.0844339702207537, |
| "grad_norm": 1.1980561017990112, |
| "learning_rate": 2.8093645484949835e-06, |
| "loss": 0.3391, |
| "step": 32000 |
| }, |
| { |
| "epoch": 3.0844339702207537, |
| "eval_loss": 0.3014710545539856, |
| "eval_runtime": 40.0414, |
| "eval_samples_per_second": 249.741, |
| "eval_steps_per_second": 31.218, |
| "step": 32000 |
| }, |
| { |
| "epoch": 3.094072826377694, |
| "grad_norm": 1.2509229183197021, |
| "learning_rate": 2.79933110367893e-06, |
| "loss": 0.3357, |
| "step": 32100 |
| }, |
| { |
| "epoch": 3.094072826377694, |
| "eval_loss": 0.29611119627952576, |
| "eval_runtime": 40.0618, |
| "eval_samples_per_second": 249.614, |
| "eval_steps_per_second": 31.202, |
| "step": 32100 |
| }, |
| { |
| "epoch": 3.103711682534634, |
| "grad_norm": 1.1109113693237305, |
| "learning_rate": 2.7892976588628766e-06, |
| "loss": 0.3377, |
| "step": 32200 |
| }, |
| { |
| "epoch": 3.103711682534634, |
| "eval_loss": 0.2999332547187805, |
| "eval_runtime": 40.0641, |
| "eval_samples_per_second": 249.6, |
| "eval_steps_per_second": 31.2, |
| "step": 32200 |
| }, |
| { |
| "epoch": 3.1133505386915736, |
| "grad_norm": 1.1124744415283203, |
| "learning_rate": 2.779264214046823e-06, |
| "loss": 0.3379, |
| "step": 32300 |
| }, |
| { |
| "epoch": 3.1133505386915736, |
| "eval_loss": 0.29913681745529175, |
| "eval_runtime": 40.0747, |
| "eval_samples_per_second": 249.534, |
| "eval_steps_per_second": 31.192, |
| "step": 32300 |
| }, |
| { |
| "epoch": 3.1229893948485135, |
| "grad_norm": 1.1505558490753174, |
| "learning_rate": 2.7692307692307693e-06, |
| "loss": 0.3377, |
| "step": 32400 |
| }, |
| { |
| "epoch": 3.1229893948485135, |
| "eval_loss": 0.29324981570243835, |
| "eval_runtime": 40.2233, |
| "eval_samples_per_second": 248.612, |
| "eval_steps_per_second": 31.077, |
| "step": 32400 |
| }, |
| { |
| "epoch": 3.1326282510054533, |
| "grad_norm": 1.2245198488235474, |
| "learning_rate": 2.759197324414716e-06, |
| "loss": 0.3364, |
| "step": 32500 |
| }, |
| { |
| "epoch": 3.1326282510054533, |
| "eval_loss": 0.2940920293331146, |
| "eval_runtime": 40.0385, |
| "eval_samples_per_second": 249.76, |
| "eval_steps_per_second": 31.22, |
| "step": 32500 |
| }, |
| { |
| "epoch": 3.142267107162393, |
| "grad_norm": 1.1837635040283203, |
| "learning_rate": 2.749163879598662e-06, |
| "loss": 0.3411, |
| "step": 32600 |
| }, |
| { |
| "epoch": 3.142267107162393, |
| "eval_loss": 0.29967695474624634, |
| "eval_runtime": 40.0599, |
| "eval_samples_per_second": 249.626, |
| "eval_steps_per_second": 31.203, |
| "step": 32600 |
| }, |
| { |
| "epoch": 3.151905963319333, |
| "grad_norm": 1.1685434579849243, |
| "learning_rate": 2.7391304347826087e-06, |
| "loss": 0.3395, |
| "step": 32700 |
| }, |
| { |
| "epoch": 3.151905963319333, |
| "eval_loss": 0.2951061427593231, |
| "eval_runtime": 40.0528, |
| "eval_samples_per_second": 249.67, |
| "eval_steps_per_second": 31.209, |
| "step": 32700 |
| }, |
| { |
| "epoch": 3.1615448194762727, |
| "grad_norm": 1.0835115909576416, |
| "learning_rate": 2.729096989966555e-06, |
| "loss": 0.3377, |
| "step": 32800 |
| }, |
| { |
| "epoch": 3.1615448194762727, |
| "eval_loss": 0.29684531688690186, |
| "eval_runtime": 40.2629, |
| "eval_samples_per_second": 248.367, |
| "eval_steps_per_second": 31.046, |
| "step": 32800 |
| }, |
| { |
| "epoch": 3.1711836756332126, |
| "grad_norm": 1.2234493494033813, |
| "learning_rate": 2.7190635451505014e-06, |
| "loss": 0.3361, |
| "step": 32900 |
| }, |
| { |
| "epoch": 3.1711836756332126, |
| "eval_loss": 0.29388144612312317, |
| "eval_runtime": 40.0533, |
| "eval_samples_per_second": 249.667, |
| "eval_steps_per_second": 31.208, |
| "step": 32900 |
| }, |
| { |
| "epoch": 3.1808225317901524, |
| "grad_norm": 1.1904512643814087, |
| "learning_rate": 2.709030100334448e-06, |
| "loss": 0.3339, |
| "step": 33000 |
| }, |
| { |
| "epoch": 3.1808225317901524, |
| "eval_loss": 0.2957378327846527, |
| "eval_runtime": 40.0266, |
| "eval_samples_per_second": 249.834, |
| "eval_steps_per_second": 31.229, |
| "step": 33000 |
| }, |
| { |
| "epoch": 3.190461387947092, |
| "grad_norm": 1.1379910707473755, |
| "learning_rate": 2.6989966555183945e-06, |
| "loss": 0.334, |
| "step": 33100 |
| }, |
| { |
| "epoch": 3.190461387947092, |
| "eval_loss": 0.2910006046295166, |
| "eval_runtime": 40.0239, |
| "eval_samples_per_second": 249.851, |
| "eval_steps_per_second": 31.231, |
| "step": 33100 |
| }, |
| { |
| "epoch": 3.200100244104032, |
| "grad_norm": 1.106467366218567, |
| "learning_rate": 2.6889632107023413e-06, |
| "loss": 0.3348, |
| "step": 33200 |
| }, |
| { |
| "epoch": 3.200100244104032, |
| "eval_loss": 0.29840487241744995, |
| "eval_runtime": 40.0152, |
| "eval_samples_per_second": 249.905, |
| "eval_steps_per_second": 31.238, |
| "step": 33200 |
| }, |
| { |
| "epoch": 3.209739100260972, |
| "grad_norm": 1.0919976234436035, |
| "learning_rate": 2.6789297658862876e-06, |
| "loss": 0.3342, |
| "step": 33300 |
| }, |
| { |
| "epoch": 3.209739100260972, |
| "eval_loss": 0.2929946184158325, |
| "eval_runtime": 40.2471, |
| "eval_samples_per_second": 248.465, |
| "eval_steps_per_second": 31.058, |
| "step": 33300 |
| }, |
| { |
| "epoch": 3.2193779564179117, |
| "grad_norm": 1.1051640510559082, |
| "learning_rate": 2.668896321070234e-06, |
| "loss": 0.3356, |
| "step": 33400 |
| }, |
| { |
| "epoch": 3.2193779564179117, |
| "eval_loss": 0.29255038499832153, |
| "eval_runtime": 40.0458, |
| "eval_samples_per_second": 249.714, |
| "eval_steps_per_second": 31.214, |
| "step": 33400 |
| }, |
| { |
| "epoch": 3.2290168125748515, |
| "grad_norm": 1.156020164489746, |
| "learning_rate": 2.6588628762541807e-06, |
| "loss": 0.3326, |
| "step": 33500 |
| }, |
| { |
| "epoch": 3.2290168125748515, |
| "eval_loss": 0.29639294743537903, |
| "eval_runtime": 40.0474, |
| "eval_samples_per_second": 249.704, |
| "eval_steps_per_second": 31.213, |
| "step": 33500 |
| }, |
| { |
| "epoch": 3.238655668731792, |
| "grad_norm": 1.1010196208953857, |
| "learning_rate": 2.648829431438127e-06, |
| "loss": 0.3321, |
| "step": 33600 |
| }, |
| { |
| "epoch": 3.238655668731792, |
| "eval_loss": 0.28783321380615234, |
| "eval_runtime": 40.0521, |
| "eval_samples_per_second": 249.675, |
| "eval_steps_per_second": 31.209, |
| "step": 33600 |
| }, |
| { |
| "epoch": 3.2482945248887316, |
| "grad_norm": 1.0766925811767578, |
| "learning_rate": 2.6387959866220734e-06, |
| "loss": 0.3346, |
| "step": 33700 |
| }, |
| { |
| "epoch": 3.2482945248887316, |
| "eval_loss": 0.2899346351623535, |
| "eval_runtime": 40.0356, |
| "eval_samples_per_second": 249.778, |
| "eval_steps_per_second": 31.222, |
| "step": 33700 |
| }, |
| { |
| "epoch": 3.2579333810456714, |
| "grad_norm": 1.1832084655761719, |
| "learning_rate": 2.62876254180602e-06, |
| "loss": 0.3323, |
| "step": 33800 |
| }, |
| { |
| "epoch": 3.2579333810456714, |
| "eval_loss": 0.29377686977386475, |
| "eval_runtime": 40.1913, |
| "eval_samples_per_second": 248.81, |
| "eval_steps_per_second": 31.101, |
| "step": 33800 |
| }, |
| { |
| "epoch": 3.2675722372026113, |
| "grad_norm": 1.1712465286254883, |
| "learning_rate": 2.6187290969899665e-06, |
| "loss": 0.3334, |
| "step": 33900 |
| }, |
| { |
| "epoch": 3.2675722372026113, |
| "eval_loss": 0.29734405875205994, |
| "eval_runtime": 40.0053, |
| "eval_samples_per_second": 249.967, |
| "eval_steps_per_second": 31.246, |
| "step": 33900 |
| }, |
| { |
| "epoch": 3.277211093359551, |
| "grad_norm": 1.2784638404846191, |
| "learning_rate": 2.6086956521739132e-06, |
| "loss": 0.3318, |
| "step": 34000 |
| }, |
| { |
| "epoch": 3.277211093359551, |
| "eval_loss": 0.29785603284835815, |
| "eval_runtime": 40.0215, |
| "eval_samples_per_second": 249.866, |
| "eval_steps_per_second": 31.233, |
| "step": 34000 |
| }, |
| { |
| "epoch": 3.286849949516491, |
| "grad_norm": 1.2473056316375732, |
| "learning_rate": 2.5986622073578596e-06, |
| "loss": 0.3315, |
| "step": 34100 |
| }, |
| { |
| "epoch": 3.286849949516491, |
| "eval_loss": 0.30169832706451416, |
| "eval_runtime": 40.0368, |
| "eval_samples_per_second": 249.77, |
| "eval_steps_per_second": 31.221, |
| "step": 34100 |
| }, |
| { |
| "epoch": 3.2964888056734307, |
| "grad_norm": 1.1944090127944946, |
| "learning_rate": 2.588628762541806e-06, |
| "loss": 0.3319, |
| "step": 34200 |
| }, |
| { |
| "epoch": 3.2964888056734307, |
| "eval_loss": 0.2977898120880127, |
| "eval_runtime": 40.0376, |
| "eval_samples_per_second": 249.765, |
| "eval_steps_per_second": 31.221, |
| "step": 34200 |
| }, |
| { |
| "epoch": 3.3061276618303705, |
| "grad_norm": 1.2597849369049072, |
| "learning_rate": 2.5785953177257527e-06, |
| "loss": 0.3323, |
| "step": 34300 |
| }, |
| { |
| "epoch": 3.3061276618303705, |
| "eval_loss": 0.29248785972595215, |
| "eval_runtime": 40.1905, |
| "eval_samples_per_second": 248.815, |
| "eval_steps_per_second": 31.102, |
| "step": 34300 |
| }, |
| { |
| "epoch": 3.3157665179873104, |
| "grad_norm": 1.139083981513977, |
| "learning_rate": 2.568561872909699e-06, |
| "loss": 0.3286, |
| "step": 34400 |
| }, |
| { |
| "epoch": 3.3157665179873104, |
| "eval_loss": 0.289288729429245, |
| "eval_runtime": 40.0409, |
| "eval_samples_per_second": 249.745, |
| "eval_steps_per_second": 31.218, |
| "step": 34400 |
| }, |
| { |
| "epoch": 3.32540537414425, |
| "grad_norm": 1.1464444398880005, |
| "learning_rate": 2.5585284280936454e-06, |
| "loss": 0.3347, |
| "step": 34500 |
| }, |
| { |
| "epoch": 3.32540537414425, |
| "eval_loss": 0.2892674207687378, |
| "eval_runtime": 40.0307, |
| "eval_samples_per_second": 249.808, |
| "eval_steps_per_second": 31.226, |
| "step": 34500 |
| }, |
| { |
| "epoch": 3.33504423030119, |
| "grad_norm": 1.17436945438385, |
| "learning_rate": 2.548494983277592e-06, |
| "loss": 0.3275, |
| "step": 34600 |
| }, |
| { |
| "epoch": 3.33504423030119, |
| "eval_loss": 0.2948303818702698, |
| "eval_runtime": 40.0039, |
| "eval_samples_per_second": 249.976, |
| "eval_steps_per_second": 31.247, |
| "step": 34600 |
| }, |
| { |
| "epoch": 3.34468308645813, |
| "grad_norm": 1.2058557271957397, |
| "learning_rate": 2.5384615384615385e-06, |
| "loss": 0.3324, |
| "step": 34700 |
| }, |
| { |
| "epoch": 3.34468308645813, |
| "eval_loss": 0.2879999577999115, |
| "eval_runtime": 40.215, |
| "eval_samples_per_second": 248.663, |
| "eval_steps_per_second": 31.083, |
| "step": 34700 |
| }, |
| { |
| "epoch": 3.35432194261507, |
| "grad_norm": 1.1361554861068726, |
| "learning_rate": 2.528428093645485e-06, |
| "loss": 0.3283, |
| "step": 34800 |
| }, |
| { |
| "epoch": 3.35432194261507, |
| "eval_loss": 0.29396873712539673, |
| "eval_runtime": 40.0234, |
| "eval_samples_per_second": 249.854, |
| "eval_steps_per_second": 31.232, |
| "step": 34800 |
| }, |
| { |
| "epoch": 3.36396079877201, |
| "grad_norm": 1.1167203187942505, |
| "learning_rate": 2.5183946488294316e-06, |
| "loss": 0.3291, |
| "step": 34900 |
| }, |
| { |
| "epoch": 3.36396079877201, |
| "eval_loss": 0.28754281997680664, |
| "eval_runtime": 40.0006, |
| "eval_samples_per_second": 249.996, |
| "eval_steps_per_second": 31.25, |
| "step": 34900 |
| }, |
| { |
| "epoch": 3.3735996549289498, |
| "grad_norm": 1.1369761228561401, |
| "learning_rate": 2.508361204013378e-06, |
| "loss": 0.3324, |
| "step": 35000 |
| }, |
| { |
| "epoch": 3.3735996549289498, |
| "eval_loss": 0.2906251847743988, |
| "eval_runtime": 40.0165, |
| "eval_samples_per_second": 249.897, |
| "eval_steps_per_second": 31.237, |
| "step": 35000 |
| }, |
| { |
| "epoch": 3.3832385110858896, |
| "grad_norm": 1.1644196510314941, |
| "learning_rate": 2.4983277591973247e-06, |
| "loss": 0.33, |
| "step": 35100 |
| }, |
| { |
| "epoch": 3.3832385110858896, |
| "eval_loss": 0.29271912574768066, |
| "eval_runtime": 40.0328, |
| "eval_samples_per_second": 249.795, |
| "eval_steps_per_second": 31.224, |
| "step": 35100 |
| }, |
| { |
| "epoch": 3.3928773672428294, |
| "grad_norm": 1.2349485158920288, |
| "learning_rate": 2.488294314381271e-06, |
| "loss": 0.3296, |
| "step": 35200 |
| }, |
| { |
| "epoch": 3.3928773672428294, |
| "eval_loss": 0.2844071686267853, |
| "eval_runtime": 40.2281, |
| "eval_samples_per_second": 248.582, |
| "eval_steps_per_second": 31.073, |
| "step": 35200 |
| }, |
| { |
| "epoch": 3.4025162233997692, |
| "grad_norm": 1.0957030057907104, |
| "learning_rate": 2.4782608695652173e-06, |
| "loss": 0.3259, |
| "step": 35300 |
| }, |
| { |
| "epoch": 3.4025162233997692, |
| "eval_loss": 0.28942251205444336, |
| "eval_runtime": 40.049, |
| "eval_samples_per_second": 249.694, |
| "eval_steps_per_second": 31.212, |
| "step": 35300 |
| }, |
| { |
| "epoch": 3.412155079556709, |
| "grad_norm": 1.2255046367645264, |
| "learning_rate": 2.468227424749164e-06, |
| "loss": 0.3278, |
| "step": 35400 |
| }, |
| { |
| "epoch": 3.412155079556709, |
| "eval_loss": 0.2880132794380188, |
| "eval_runtime": 40.0318, |
| "eval_samples_per_second": 249.802, |
| "eval_steps_per_second": 31.225, |
| "step": 35400 |
| }, |
| { |
| "epoch": 3.421793935713649, |
| "grad_norm": 1.0865519046783447, |
| "learning_rate": 2.4581939799331104e-06, |
| "loss": 0.3264, |
| "step": 35500 |
| }, |
| { |
| "epoch": 3.421793935713649, |
| "eval_loss": 0.2906026840209961, |
| "eval_runtime": 40.0258, |
| "eval_samples_per_second": 249.839, |
| "eval_steps_per_second": 31.23, |
| "step": 35500 |
| }, |
| { |
| "epoch": 3.4314327918705887, |
| "grad_norm": 1.1252706050872803, |
| "learning_rate": 2.4481605351170568e-06, |
| "loss": 0.3272, |
| "step": 35600 |
| }, |
| { |
| "epoch": 3.4314327918705887, |
| "eval_loss": 0.28184667229652405, |
| "eval_runtime": 40.2117, |
| "eval_samples_per_second": 248.684, |
| "eval_steps_per_second": 31.085, |
| "step": 35600 |
| }, |
| { |
| "epoch": 3.4410716480275285, |
| "grad_norm": 1.1938196420669556, |
| "learning_rate": 2.4381270903010035e-06, |
| "loss": 0.3273, |
| "step": 35700 |
| }, |
| { |
| "epoch": 3.4410716480275285, |
| "eval_loss": 0.28648287057876587, |
| "eval_runtime": 40.2043, |
| "eval_samples_per_second": 248.73, |
| "eval_steps_per_second": 31.091, |
| "step": 35700 |
| }, |
| { |
| "epoch": 3.4507105041844683, |
| "grad_norm": 1.2802789211273193, |
| "learning_rate": 2.42809364548495e-06, |
| "loss": 0.3293, |
| "step": 35800 |
| }, |
| { |
| "epoch": 3.4507105041844683, |
| "eval_loss": 0.2890487313270569, |
| "eval_runtime": 40.0181, |
| "eval_samples_per_second": 249.887, |
| "eval_steps_per_second": 31.236, |
| "step": 35800 |
| }, |
| { |
| "epoch": 3.460349360341408, |
| "grad_norm": 1.2290120124816895, |
| "learning_rate": 2.4180602006688962e-06, |
| "loss": 0.3279, |
| "step": 35900 |
| }, |
| { |
| "epoch": 3.460349360341408, |
| "eval_loss": 0.2872641086578369, |
| "eval_runtime": 40.013, |
| "eval_samples_per_second": 249.919, |
| "eval_steps_per_second": 31.24, |
| "step": 35900 |
| }, |
| { |
| "epoch": 3.469988216498348, |
| "grad_norm": 1.1569044589996338, |
| "learning_rate": 2.408026755852843e-06, |
| "loss": 0.3263, |
| "step": 36000 |
| }, |
| { |
| "epoch": 3.469988216498348, |
| "eval_loss": 0.2899464964866638, |
| "eval_runtime": 40.0117, |
| "eval_samples_per_second": 249.927, |
| "eval_steps_per_second": 31.241, |
| "step": 36000 |
| }, |
| { |
| "epoch": 3.479627072655288, |
| "grad_norm": 1.1881693601608276, |
| "learning_rate": 2.3979933110367893e-06, |
| "loss": 0.3271, |
| "step": 36100 |
| }, |
| { |
| "epoch": 3.479627072655288, |
| "eval_loss": 0.29130667448043823, |
| "eval_runtime": 40.0206, |
| "eval_samples_per_second": 249.871, |
| "eval_steps_per_second": 31.234, |
| "step": 36100 |
| }, |
| { |
| "epoch": 3.4892659288122276, |
| "grad_norm": 1.3138039112091064, |
| "learning_rate": 2.387959866220736e-06, |
| "loss": 0.3239, |
| "step": 36200 |
| }, |
| { |
| "epoch": 3.4892659288122276, |
| "eval_loss": 0.2865622341632843, |
| "eval_runtime": 40.1762, |
| "eval_samples_per_second": 248.904, |
| "eval_steps_per_second": 31.113, |
| "step": 36200 |
| }, |
| { |
| "epoch": 3.498904784969168, |
| "grad_norm": 1.1673741340637207, |
| "learning_rate": 2.3779264214046824e-06, |
| "loss": 0.3277, |
| "step": 36300 |
| }, |
| { |
| "epoch": 3.498904784969168, |
| "eval_loss": 0.2852942645549774, |
| "eval_runtime": 40.0125, |
| "eval_samples_per_second": 249.922, |
| "eval_steps_per_second": 31.24, |
| "step": 36300 |
| }, |
| { |
| "epoch": 3.5085436411261077, |
| "grad_norm": 1.1211373805999756, |
| "learning_rate": 2.3678929765886288e-06, |
| "loss": 0.3237, |
| "step": 36400 |
| }, |
| { |
| "epoch": 3.5085436411261077, |
| "eval_loss": 0.290414422750473, |
| "eval_runtime": 40.0075, |
| "eval_samples_per_second": 249.953, |
| "eval_steps_per_second": 31.244, |
| "step": 36400 |
| }, |
| { |
| "epoch": 3.5181824972830475, |
| "grad_norm": 1.1853524446487427, |
| "learning_rate": 2.3578595317725755e-06, |
| "loss": 0.324, |
| "step": 36500 |
| }, |
| { |
| "epoch": 3.5181824972830475, |
| "eval_loss": 0.2909868657588959, |
| "eval_runtime": 40.0112, |
| "eval_samples_per_second": 249.93, |
| "eval_steps_per_second": 31.241, |
| "step": 36500 |
| }, |
| { |
| "epoch": 3.5278213534399874, |
| "grad_norm": 1.2360116243362427, |
| "learning_rate": 2.347826086956522e-06, |
| "loss": 0.3261, |
| "step": 36600 |
| }, |
| { |
| "epoch": 3.5278213534399874, |
| "eval_loss": 0.28790682554244995, |
| "eval_runtime": 40.0256, |
| "eval_samples_per_second": 249.84, |
| "eval_steps_per_second": 31.23, |
| "step": 36600 |
| }, |
| { |
| "epoch": 3.537460209596927, |
| "grad_norm": 1.0577596426010132, |
| "learning_rate": 2.337792642140468e-06, |
| "loss": 0.3236, |
| "step": 36700 |
| }, |
| { |
| "epoch": 3.537460209596927, |
| "eval_loss": 0.28858110308647156, |
| "eval_runtime": 40.2009, |
| "eval_samples_per_second": 248.751, |
| "eval_steps_per_second": 31.094, |
| "step": 36700 |
| }, |
| { |
| "epoch": 3.547099065753867, |
| "grad_norm": 1.0951604843139648, |
| "learning_rate": 2.327759197324415e-06, |
| "loss": 0.3216, |
| "step": 36800 |
| }, |
| { |
| "epoch": 3.547099065753867, |
| "eval_loss": 0.28311312198638916, |
| "eval_runtime": 40.0195, |
| "eval_samples_per_second": 249.878, |
| "eval_steps_per_second": 31.235, |
| "step": 36800 |
| }, |
| { |
| "epoch": 3.556737921910807, |
| "grad_norm": 1.11006498336792, |
| "learning_rate": 2.3177257525083613e-06, |
| "loss": 0.3243, |
| "step": 36900 |
| }, |
| { |
| "epoch": 3.556737921910807, |
| "eval_loss": 0.28506574034690857, |
| "eval_runtime": 40.033, |
| "eval_samples_per_second": 249.794, |
| "eval_steps_per_second": 31.224, |
| "step": 36900 |
| }, |
| { |
| "epoch": 3.5663767780677467, |
| "grad_norm": 1.2008014917373657, |
| "learning_rate": 2.307692307692308e-06, |
| "loss": 0.3211, |
| "step": 37000 |
| }, |
| { |
| "epoch": 3.5663767780677467, |
| "eval_loss": 0.2800537049770355, |
| "eval_runtime": 40.0317, |
| "eval_samples_per_second": 249.802, |
| "eval_steps_per_second": 31.225, |
| "step": 37000 |
| }, |
| { |
| "epoch": 3.5760156342246865, |
| "grad_norm": 1.1588464975357056, |
| "learning_rate": 2.2976588628762544e-06, |
| "loss": 0.3225, |
| "step": 37100 |
| }, |
| { |
| "epoch": 3.5760156342246865, |
| "eval_loss": 0.2877085506916046, |
| "eval_runtime": 40.0153, |
| "eval_samples_per_second": 249.904, |
| "eval_steps_per_second": 31.238, |
| "step": 37100 |
| }, |
| { |
| "epoch": 3.5856544903816263, |
| "grad_norm": 1.095901608467102, |
| "learning_rate": 2.2876254180602008e-06, |
| "loss": 0.3233, |
| "step": 37200 |
| }, |
| { |
| "epoch": 3.5856544903816263, |
| "eval_loss": 0.290816992521286, |
| "eval_runtime": 40.1774, |
| "eval_samples_per_second": 248.896, |
| "eval_steps_per_second": 31.112, |
| "step": 37200 |
| }, |
| { |
| "epoch": 3.595293346538566, |
| "grad_norm": 1.2038841247558594, |
| "learning_rate": 2.2775919732441475e-06, |
| "loss": 0.322, |
| "step": 37300 |
| }, |
| { |
| "epoch": 3.595293346538566, |
| "eval_loss": 0.2854418456554413, |
| "eval_runtime": 40.067, |
| "eval_samples_per_second": 249.582, |
| "eval_steps_per_second": 31.198, |
| "step": 37300 |
| }, |
| { |
| "epoch": 3.604932202695506, |
| "grad_norm": 1.1646316051483154, |
| "learning_rate": 2.267558528428094e-06, |
| "loss": 0.3224, |
| "step": 37400 |
| }, |
| { |
| "epoch": 3.604932202695506, |
| "eval_loss": 0.2838365137577057, |
| "eval_runtime": 40.055, |
| "eval_samples_per_second": 249.657, |
| "eval_steps_per_second": 31.207, |
| "step": 37400 |
| }, |
| { |
| "epoch": 3.6145710588524462, |
| "grad_norm": 1.166131615638733, |
| "learning_rate": 2.25752508361204e-06, |
| "loss": 0.3247, |
| "step": 37500 |
| }, |
| { |
| "epoch": 3.6145710588524462, |
| "eval_loss": 0.2809997797012329, |
| "eval_runtime": 40.2416, |
| "eval_samples_per_second": 248.499, |
| "eval_steps_per_second": 31.062, |
| "step": 37500 |
| }, |
| { |
| "epoch": 3.624209915009386, |
| "grad_norm": 1.1528489589691162, |
| "learning_rate": 2.2474916387959865e-06, |
| "loss": 0.3251, |
| "step": 37600 |
| }, |
| { |
| "epoch": 3.624209915009386, |
| "eval_loss": 0.2795826196670532, |
| "eval_runtime": 40.1013, |
| "eval_samples_per_second": 249.368, |
| "eval_steps_per_second": 31.171, |
| "step": 37600 |
| }, |
| { |
| "epoch": 3.633848771166326, |
| "grad_norm": 1.1351743936538696, |
| "learning_rate": 2.237458193979933e-06, |
| "loss": 0.319, |
| "step": 37700 |
| }, |
| { |
| "epoch": 3.633848771166326, |
| "eval_loss": 0.2884068489074707, |
| "eval_runtime": 40.254, |
| "eval_samples_per_second": 248.423, |
| "eval_steps_per_second": 31.053, |
| "step": 37700 |
| }, |
| { |
| "epoch": 3.6434876273232657, |
| "grad_norm": 1.2123479843139648, |
| "learning_rate": 2.2274247491638796e-06, |
| "loss": 0.3216, |
| "step": 37800 |
| }, |
| { |
| "epoch": 3.6434876273232657, |
| "eval_loss": 0.2808574140071869, |
| "eval_runtime": 40.0346, |
| "eval_samples_per_second": 249.784, |
| "eval_steps_per_second": 31.223, |
| "step": 37800 |
| }, |
| { |
| "epoch": 3.6531264834802055, |
| "grad_norm": 1.1580688953399658, |
| "learning_rate": 2.217391304347826e-06, |
| "loss": 0.3182, |
| "step": 37900 |
| }, |
| { |
| "epoch": 3.6531264834802055, |
| "eval_loss": 0.28800562024116516, |
| "eval_runtime": 40.0302, |
| "eval_samples_per_second": 249.811, |
| "eval_steps_per_second": 31.226, |
| "step": 37900 |
| }, |
| { |
| "epoch": 3.6627653396371453, |
| "grad_norm": 1.1823816299438477, |
| "learning_rate": 2.2073578595317723e-06, |
| "loss": 0.3208, |
| "step": 38000 |
| }, |
| { |
| "epoch": 3.6627653396371453, |
| "eval_loss": 0.28609785437583923, |
| "eval_runtime": 40.0272, |
| "eval_samples_per_second": 249.83, |
| "eval_steps_per_second": 31.229, |
| "step": 38000 |
| }, |
| { |
| "epoch": 3.672404195794085, |
| "grad_norm": 1.1534825563430786, |
| "learning_rate": 2.197324414715719e-06, |
| "loss": 0.3165, |
| "step": 38100 |
| }, |
| { |
| "epoch": 3.672404195794085, |
| "eval_loss": 0.28174835443496704, |
| "eval_runtime": 40.198, |
| "eval_samples_per_second": 248.769, |
| "eval_steps_per_second": 31.096, |
| "step": 38100 |
| }, |
| { |
| "epoch": 3.682043051951025, |
| "grad_norm": 1.160309076309204, |
| "learning_rate": 2.1872909698996654e-06, |
| "loss": 0.3203, |
| "step": 38200 |
| }, |
| { |
| "epoch": 3.682043051951025, |
| "eval_loss": 0.2865579128265381, |
| "eval_runtime": 40.0605, |
| "eval_samples_per_second": 249.622, |
| "eval_steps_per_second": 31.203, |
| "step": 38200 |
| }, |
| { |
| "epoch": 3.691681908107965, |
| "grad_norm": 1.1397422552108765, |
| "learning_rate": 2.177257525083612e-06, |
| "loss": 0.3189, |
| "step": 38300 |
| }, |
| { |
| "epoch": 3.691681908107965, |
| "eval_loss": 0.2845374643802643, |
| "eval_runtime": 40.0285, |
| "eval_samples_per_second": 249.822, |
| "eval_steps_per_second": 31.228, |
| "step": 38300 |
| }, |
| { |
| "epoch": 3.7013207642649046, |
| "grad_norm": 1.1168709993362427, |
| "learning_rate": 2.1672240802675585e-06, |
| "loss": 0.32, |
| "step": 38400 |
| }, |
| { |
| "epoch": 3.7013207642649046, |
| "eval_loss": 0.2816096246242523, |
| "eval_runtime": 40.0255, |
| "eval_samples_per_second": 249.841, |
| "eval_steps_per_second": 31.23, |
| "step": 38400 |
| }, |
| { |
| "epoch": 3.7109596204218445, |
| "grad_norm": 1.221742868423462, |
| "learning_rate": 2.157190635451505e-06, |
| "loss": 0.3167, |
| "step": 38500 |
| }, |
| { |
| "epoch": 3.7109596204218445, |
| "eval_loss": 0.283882737159729, |
| "eval_runtime": 40.2082, |
| "eval_samples_per_second": 248.706, |
| "eval_steps_per_second": 31.088, |
| "step": 38500 |
| }, |
| { |
| "epoch": 3.7205984765787843, |
| "grad_norm": 1.132840633392334, |
| "learning_rate": 2.1471571906354516e-06, |
| "loss": 0.3189, |
| "step": 38600 |
| }, |
| { |
| "epoch": 3.7205984765787843, |
| "eval_loss": 0.28340187668800354, |
| "eval_runtime": 40.0209, |
| "eval_samples_per_second": 249.869, |
| "eval_steps_per_second": 31.234, |
| "step": 38600 |
| }, |
| { |
| "epoch": 3.730237332735724, |
| "grad_norm": 1.145599365234375, |
| "learning_rate": 2.137123745819398e-06, |
| "loss": 0.3179, |
| "step": 38700 |
| }, |
| { |
| "epoch": 3.730237332735724, |
| "eval_loss": 0.2839237451553345, |
| "eval_runtime": 40.0677, |
| "eval_samples_per_second": 249.578, |
| "eval_steps_per_second": 31.197, |
| "step": 38700 |
| }, |
| { |
| "epoch": 3.739876188892664, |
| "grad_norm": 1.1449469327926636, |
| "learning_rate": 2.1270903010033443e-06, |
| "loss": 0.3209, |
| "step": 38800 |
| }, |
| { |
| "epoch": 3.739876188892664, |
| "eval_loss": 0.28125128149986267, |
| "eval_runtime": 40.0685, |
| "eval_samples_per_second": 249.573, |
| "eval_steps_per_second": 31.197, |
| "step": 38800 |
| }, |
| { |
| "epoch": 3.7495150450496038, |
| "grad_norm": 1.155439853668213, |
| "learning_rate": 2.117056856187291e-06, |
| "loss": 0.3182, |
| "step": 38900 |
| }, |
| { |
| "epoch": 3.7495150450496038, |
| "eval_loss": 0.2792927622795105, |
| "eval_runtime": 40.048, |
| "eval_samples_per_second": 249.7, |
| "eval_steps_per_second": 31.213, |
| "step": 38900 |
| }, |
| { |
| "epoch": 3.7591539012065436, |
| "grad_norm": 1.1573885679244995, |
| "learning_rate": 2.1070234113712374e-06, |
| "loss": 0.3204, |
| "step": 39000 |
| }, |
| { |
| "epoch": 3.7591539012065436, |
| "eval_loss": 0.27629750967025757, |
| "eval_runtime": 40.2171, |
| "eval_samples_per_second": 248.651, |
| "eval_steps_per_second": 31.081, |
| "step": 39000 |
| }, |
| { |
| "epoch": 3.7687927573634834, |
| "grad_norm": 1.1162937879562378, |
| "learning_rate": 2.0969899665551837e-06, |
| "loss": 0.3183, |
| "step": 39100 |
| }, |
| { |
| "epoch": 3.7687927573634834, |
| "eval_loss": 0.2790432274341583, |
| "eval_runtime": 40.0376, |
| "eval_samples_per_second": 249.765, |
| "eval_steps_per_second": 31.221, |
| "step": 39100 |
| }, |
| { |
| "epoch": 3.7784316135204237, |
| "grad_norm": 1.0601564645767212, |
| "learning_rate": 2.0869565217391305e-06, |
| "loss": 0.3182, |
| "step": 39200 |
| }, |
| { |
| "epoch": 3.7784316135204237, |
| "eval_loss": 0.2791585326194763, |
| "eval_runtime": 40.037, |
| "eval_samples_per_second": 249.769, |
| "eval_steps_per_second": 31.221, |
| "step": 39200 |
| }, |
| { |
| "epoch": 3.7880704696773635, |
| "grad_norm": 1.131667137145996, |
| "learning_rate": 2.076923076923077e-06, |
| "loss": 0.3175, |
| "step": 39300 |
| }, |
| { |
| "epoch": 3.7880704696773635, |
| "eval_loss": 0.27618080377578735, |
| "eval_runtime": 40.0503, |
| "eval_samples_per_second": 249.686, |
| "eval_steps_per_second": 31.211, |
| "step": 39300 |
| }, |
| { |
| "epoch": 3.7977093258343033, |
| "grad_norm": 1.1320117712020874, |
| "learning_rate": 2.0668896321070236e-06, |
| "loss": 0.3189, |
| "step": 39400 |
| }, |
| { |
| "epoch": 3.7977093258343033, |
| "eval_loss": 0.2805337607860565, |
| "eval_runtime": 40.0425, |
| "eval_samples_per_second": 249.735, |
| "eval_steps_per_second": 31.217, |
| "step": 39400 |
| }, |
| { |
| "epoch": 3.807348181991243, |
| "grad_norm": 1.154017448425293, |
| "learning_rate": 2.05685618729097e-06, |
| "loss": 0.3189, |
| "step": 39500 |
| }, |
| { |
| "epoch": 3.807348181991243, |
| "eval_loss": 0.2768949866294861, |
| "eval_runtime": 40.2119, |
| "eval_samples_per_second": 248.683, |
| "eval_steps_per_second": 31.085, |
| "step": 39500 |
| }, |
| { |
| "epoch": 3.816987038148183, |
| "grad_norm": 1.1326124668121338, |
| "learning_rate": 2.0468227424749163e-06, |
| "loss": 0.3176, |
| "step": 39600 |
| }, |
| { |
| "epoch": 3.816987038148183, |
| "eval_loss": 0.28353628516197205, |
| "eval_runtime": 40.027, |
| "eval_samples_per_second": 249.832, |
| "eval_steps_per_second": 31.229, |
| "step": 39600 |
| }, |
| { |
| "epoch": 3.826625894305123, |
| "grad_norm": 1.0950291156768799, |
| "learning_rate": 2.036789297658863e-06, |
| "loss": 0.3152, |
| "step": 39700 |
| }, |
| { |
| "epoch": 3.826625894305123, |
| "eval_loss": 0.27837592363357544, |
| "eval_runtime": 40.0294, |
| "eval_samples_per_second": 249.817, |
| "eval_steps_per_second": 31.227, |
| "step": 39700 |
| }, |
| { |
| "epoch": 3.8362647504620626, |
| "grad_norm": 1.0399600267410278, |
| "learning_rate": 2.0267558528428094e-06, |
| "loss": 0.3162, |
| "step": 39800 |
| }, |
| { |
| "epoch": 3.8362647504620626, |
| "eval_loss": 0.279599666595459, |
| "eval_runtime": 40.03, |
| "eval_samples_per_second": 249.813, |
| "eval_steps_per_second": 31.227, |
| "step": 39800 |
| }, |
| { |
| "epoch": 3.8459036066190024, |
| "grad_norm": 1.1865919828414917, |
| "learning_rate": 2.0167224080267557e-06, |
| "loss": 0.312, |
| "step": 39900 |
| }, |
| { |
| "epoch": 3.8459036066190024, |
| "eval_loss": 0.2764998972415924, |
| "eval_runtime": 40.2252, |
| "eval_samples_per_second": 248.6, |
| "eval_steps_per_second": 31.075, |
| "step": 39900 |
| }, |
| { |
| "epoch": 3.8555424627759423, |
| "grad_norm": 1.1612026691436768, |
| "learning_rate": 2.0066889632107025e-06, |
| "loss": 0.3176, |
| "step": 40000 |
| }, |
| { |
| "epoch": 3.8555424627759423, |
| "eval_loss": 0.28109410405158997, |
| "eval_runtime": 40.0865, |
| "eval_samples_per_second": 249.461, |
| "eval_steps_per_second": 31.183, |
| "step": 40000 |
| }, |
| { |
| "epoch": 3.865181318932882, |
| "grad_norm": 1.1188021898269653, |
| "learning_rate": 1.996655518394649e-06, |
| "loss": 0.3128, |
| "step": 40100 |
| }, |
| { |
| "epoch": 3.865181318932882, |
| "eval_loss": 0.276141881942749, |
| "eval_runtime": 40.0338, |
| "eval_samples_per_second": 249.789, |
| "eval_steps_per_second": 31.224, |
| "step": 40100 |
| }, |
| { |
| "epoch": 3.8748201750898223, |
| "grad_norm": 1.0921941995620728, |
| "learning_rate": 1.986622073578595e-06, |
| "loss": 0.3163, |
| "step": 40200 |
| }, |
| { |
| "epoch": 3.8748201750898223, |
| "eval_loss": 0.27618375420570374, |
| "eval_runtime": 40.0354, |
| "eval_samples_per_second": 249.779, |
| "eval_steps_per_second": 31.222, |
| "step": 40200 |
| }, |
| { |
| "epoch": 3.884459031246762, |
| "grad_norm": 1.1041548252105713, |
| "learning_rate": 1.976588628762542e-06, |
| "loss": 0.312, |
| "step": 40300 |
| }, |
| { |
| "epoch": 3.884459031246762, |
| "eval_loss": 0.2808937728404999, |
| "eval_runtime": 40.0468, |
| "eval_samples_per_second": 249.708, |
| "eval_steps_per_second": 31.214, |
| "step": 40300 |
| }, |
| { |
| "epoch": 3.894097887403702, |
| "grad_norm": 1.1126019954681396, |
| "learning_rate": 1.9665551839464883e-06, |
| "loss": 0.3147, |
| "step": 40400 |
| }, |
| { |
| "epoch": 3.894097887403702, |
| "eval_loss": 0.2763468623161316, |
| "eval_runtime": 43.5695, |
| "eval_samples_per_second": 229.519, |
| "eval_steps_per_second": 28.69, |
| "step": 40400 |
| }, |
| { |
| "epoch": 3.903736743560642, |
| "grad_norm": 1.0650842189788818, |
| "learning_rate": 1.956521739130435e-06, |
| "loss": 0.3143, |
| "step": 40500 |
| }, |
| { |
| "epoch": 3.903736743560642, |
| "eval_loss": 0.27691203355789185, |
| "eval_runtime": 40.0111, |
| "eval_samples_per_second": 249.93, |
| "eval_steps_per_second": 31.241, |
| "step": 40500 |
| }, |
| { |
| "epoch": 3.9133755997175816, |
| "grad_norm": 1.2413527965545654, |
| "learning_rate": 1.9464882943143814e-06, |
| "loss": 0.3144, |
| "step": 40600 |
| }, |
| { |
| "epoch": 3.9133755997175816, |
| "eval_loss": 0.27241915464401245, |
| "eval_runtime": 40.027, |
| "eval_samples_per_second": 249.831, |
| "eval_steps_per_second": 31.229, |
| "step": 40600 |
| }, |
| { |
| "epoch": 3.9230144558745215, |
| "grad_norm": 1.2129082679748535, |
| "learning_rate": 1.9364548494983277e-06, |
| "loss": 0.3148, |
| "step": 40700 |
| }, |
| { |
| "epoch": 3.9230144558745215, |
| "eval_loss": 0.27262434363365173, |
| "eval_runtime": 40.0193, |
| "eval_samples_per_second": 249.88, |
| "eval_steps_per_second": 31.235, |
| "step": 40700 |
| }, |
| { |
| "epoch": 3.9326533120314613, |
| "grad_norm": 1.2504247426986694, |
| "learning_rate": 1.9264214046822745e-06, |
| "loss": 0.3138, |
| "step": 40800 |
| }, |
| { |
| "epoch": 3.9326533120314613, |
| "eval_loss": 0.27585116028785706, |
| "eval_runtime": 40.042, |
| "eval_samples_per_second": 249.738, |
| "eval_steps_per_second": 31.217, |
| "step": 40800 |
| }, |
| { |
| "epoch": 3.942292168188401, |
| "grad_norm": 1.1798555850982666, |
| "learning_rate": 1.916387959866221e-06, |
| "loss": 0.3143, |
| "step": 40900 |
| }, |
| { |
| "epoch": 3.942292168188401, |
| "eval_loss": 0.27270910143852234, |
| "eval_runtime": 40.2588, |
| "eval_samples_per_second": 248.393, |
| "eval_steps_per_second": 31.049, |
| "step": 40900 |
| }, |
| { |
| "epoch": 3.951931024345341, |
| "grad_norm": 0.9959877133369446, |
| "learning_rate": 1.9063545150501674e-06, |
| "loss": 0.3126, |
| "step": 41000 |
| }, |
| { |
| "epoch": 3.951931024345341, |
| "eval_loss": 0.2772971987724304, |
| "eval_runtime": 40.0712, |
| "eval_samples_per_second": 249.555, |
| "eval_steps_per_second": 31.194, |
| "step": 41000 |
| }, |
| { |
| "epoch": 3.9615698805022808, |
| "grad_norm": 1.1942039728164673, |
| "learning_rate": 1.896321070234114e-06, |
| "loss": 0.3137, |
| "step": 41100 |
| }, |
| { |
| "epoch": 3.9615698805022808, |
| "eval_loss": 0.2713697552680969, |
| "eval_runtime": 40.052, |
| "eval_samples_per_second": 249.675, |
| "eval_steps_per_second": 31.209, |
| "step": 41100 |
| }, |
| { |
| "epoch": 3.9712087366592206, |
| "grad_norm": 1.1946539878845215, |
| "learning_rate": 1.8862876254180603e-06, |
| "loss": 0.3143, |
| "step": 41200 |
| }, |
| { |
| "epoch": 3.9712087366592206, |
| "eval_loss": 0.27480974793434143, |
| "eval_runtime": 40.0696, |
| "eval_samples_per_second": 249.566, |
| "eval_steps_per_second": 31.196, |
| "step": 41200 |
| }, |
| { |
| "epoch": 3.9808475928161604, |
| "grad_norm": 1.168750286102295, |
| "learning_rate": 1.8762541806020068e-06, |
| "loss": 0.311, |
| "step": 41300 |
| }, |
| { |
| "epoch": 3.9808475928161604, |
| "eval_loss": 0.2778546214103699, |
| "eval_runtime": 40.2246, |
| "eval_samples_per_second": 248.604, |
| "eval_steps_per_second": 31.076, |
| "step": 41300 |
| }, |
| { |
| "epoch": 3.9904864489731002, |
| "grad_norm": 1.2502557039260864, |
| "learning_rate": 1.8662207357859534e-06, |
| "loss": 0.3174, |
| "step": 41400 |
| }, |
| { |
| "epoch": 3.9904864489731002, |
| "eval_loss": 0.2807627320289612, |
| "eval_runtime": 40.07, |
| "eval_samples_per_second": 249.563, |
| "eval_steps_per_second": 31.195, |
| "step": 41400 |
| }, |
| { |
| "epoch": 4.00012530513004, |
| "grad_norm": 1.09615159034729, |
| "learning_rate": 1.8561872909699e-06, |
| "loss": 0.3124, |
| "step": 41500 |
| }, |
| { |
| "epoch": 4.00012530513004, |
| "eval_loss": 0.27661824226379395, |
| "eval_runtime": 40.0399, |
| "eval_samples_per_second": 249.751, |
| "eval_steps_per_second": 31.219, |
| "step": 41500 |
| }, |
| { |
| "epoch": 4.00976416128698, |
| "grad_norm": 1.100216031074524, |
| "learning_rate": 1.8461538461538462e-06, |
| "loss": 0.3136, |
| "step": 41600 |
| }, |
| { |
| "epoch": 4.00976416128698, |
| "eval_loss": 0.2801878750324249, |
| "eval_runtime": 40.0624, |
| "eval_samples_per_second": 249.611, |
| "eval_steps_per_second": 31.201, |
| "step": 41600 |
| }, |
| { |
| "epoch": 4.01940301744392, |
| "grad_norm": 1.0246331691741943, |
| "learning_rate": 1.8361204013377928e-06, |
| "loss": 0.3146, |
| "step": 41700 |
| }, |
| { |
| "epoch": 4.01940301744392, |
| "eval_loss": 0.2788015604019165, |
| "eval_runtime": 40.0597, |
| "eval_samples_per_second": 249.628, |
| "eval_steps_per_second": 31.203, |
| "step": 41700 |
| }, |
| { |
| "epoch": 4.0290418736008595, |
| "grad_norm": 1.300437092781067, |
| "learning_rate": 1.8260869565217394e-06, |
| "loss": 0.3119, |
| "step": 41800 |
| }, |
| { |
| "epoch": 4.0290418736008595, |
| "eval_loss": 0.272819459438324, |
| "eval_runtime": 40.2236, |
| "eval_samples_per_second": 248.61, |
| "eval_steps_per_second": 31.076, |
| "step": 41800 |
| }, |
| { |
| "epoch": 4.038680729757799, |
| "grad_norm": 1.1649296283721924, |
| "learning_rate": 1.8160535117056857e-06, |
| "loss": 0.312, |
| "step": 41900 |
| }, |
| { |
| "epoch": 4.038680729757799, |
| "eval_loss": 0.2787322402000427, |
| "eval_runtime": 40.0348, |
| "eval_samples_per_second": 249.783, |
| "eval_steps_per_second": 31.223, |
| "step": 41900 |
| }, |
| { |
| "epoch": 4.048319585914739, |
| "grad_norm": 1.1247646808624268, |
| "learning_rate": 1.8060200668896322e-06, |
| "loss": 0.3124, |
| "step": 42000 |
| }, |
| { |
| "epoch": 4.048319585914739, |
| "eval_loss": 0.2764863669872284, |
| "eval_runtime": 40.0195, |
| "eval_samples_per_second": 249.878, |
| "eval_steps_per_second": 31.235, |
| "step": 42000 |
| }, |
| { |
| "epoch": 4.057958442071679, |
| "grad_norm": 1.21001398563385, |
| "learning_rate": 1.7959866220735788e-06, |
| "loss": 0.3099, |
| "step": 42100 |
| }, |
| { |
| "epoch": 4.057958442071679, |
| "eval_loss": 0.275383859872818, |
| "eval_runtime": 40.0245, |
| "eval_samples_per_second": 249.847, |
| "eval_steps_per_second": 31.231, |
| "step": 42100 |
| }, |
| { |
| "epoch": 4.067597298228619, |
| "grad_norm": 1.1185123920440674, |
| "learning_rate": 1.7859531772575253e-06, |
| "loss": 0.3094, |
| "step": 42200 |
| }, |
| { |
| "epoch": 4.067597298228619, |
| "eval_loss": 0.27758368849754333, |
| "eval_runtime": 40.2446, |
| "eval_samples_per_second": 248.481, |
| "eval_steps_per_second": 31.06, |
| "step": 42200 |
| }, |
| { |
| "epoch": 4.077236154385559, |
| "grad_norm": 1.0860310792922974, |
| "learning_rate": 1.7759197324414717e-06, |
| "loss": 0.3077, |
| "step": 42300 |
| }, |
| { |
| "epoch": 4.077236154385559, |
| "eval_loss": 0.2693646550178528, |
| "eval_runtime": 40.0602, |
| "eval_samples_per_second": 249.624, |
| "eval_steps_per_second": 31.203, |
| "step": 42300 |
| }, |
| { |
| "epoch": 4.086875010542499, |
| "grad_norm": 1.1427111625671387, |
| "learning_rate": 1.7658862876254182e-06, |
| "loss": 0.3119, |
| "step": 42400 |
| }, |
| { |
| "epoch": 4.086875010542499, |
| "eval_loss": 0.27336445450782776, |
| "eval_runtime": 40.0655, |
| "eval_samples_per_second": 249.592, |
| "eval_steps_per_second": 31.199, |
| "step": 42400 |
| }, |
| { |
| "epoch": 4.096513866699439, |
| "grad_norm": 1.027213215827942, |
| "learning_rate": 1.7558528428093648e-06, |
| "loss": 0.3103, |
| "step": 42500 |
| }, |
| { |
| "epoch": 4.096513866699439, |
| "eval_loss": 0.27709463238716125, |
| "eval_runtime": 40.0391, |
| "eval_samples_per_second": 249.756, |
| "eval_steps_per_second": 31.22, |
| "step": 42500 |
| }, |
| { |
| "epoch": 4.106152722856379, |
| "grad_norm": 1.1721312999725342, |
| "learning_rate": 1.745819397993311e-06, |
| "loss": 0.3108, |
| "step": 42600 |
| }, |
| { |
| "epoch": 4.106152722856379, |
| "eval_loss": 0.275216281414032, |
| "eval_runtime": 40.0433, |
| "eval_samples_per_second": 249.729, |
| "eval_steps_per_second": 31.216, |
| "step": 42600 |
| }, |
| { |
| "epoch": 4.115791579013319, |
| "grad_norm": 1.1721031665802002, |
| "learning_rate": 1.7357859531772575e-06, |
| "loss": 0.3111, |
| "step": 42700 |
| }, |
| { |
| "epoch": 4.115791579013319, |
| "eval_loss": 0.27178844809532166, |
| "eval_runtime": 40.2332, |
| "eval_samples_per_second": 248.551, |
| "eval_steps_per_second": 31.069, |
| "step": 42700 |
| }, |
| { |
| "epoch": 4.125430435170259, |
| "grad_norm": 1.1004197597503662, |
| "learning_rate": 1.7257525083612038e-06, |
| "loss": 0.3103, |
| "step": 42800 |
| }, |
| { |
| "epoch": 4.125430435170259, |
| "eval_loss": 0.278656542301178, |
| "eval_runtime": 40.0331, |
| "eval_samples_per_second": 249.793, |
| "eval_steps_per_second": 31.224, |
| "step": 42800 |
| }, |
| { |
| "epoch": 4.1350692913271985, |
| "grad_norm": 1.1854575872421265, |
| "learning_rate": 1.7157190635451504e-06, |
| "loss": 0.3124, |
| "step": 42900 |
| }, |
| { |
| "epoch": 4.1350692913271985, |
| "eval_loss": 0.27243393659591675, |
| "eval_runtime": 40.0307, |
| "eval_samples_per_second": 249.808, |
| "eval_steps_per_second": 31.226, |
| "step": 42900 |
| }, |
| { |
| "epoch": 4.144708147484138, |
| "grad_norm": 1.1487313508987427, |
| "learning_rate": 1.705685618729097e-06, |
| "loss": 0.3111, |
| "step": 43000 |
| }, |
| { |
| "epoch": 4.144708147484138, |
| "eval_loss": 0.27582091093063354, |
| "eval_runtime": 40.0545, |
| "eval_samples_per_second": 249.66, |
| "eval_steps_per_second": 31.207, |
| "step": 43000 |
| }, |
| { |
| "epoch": 4.154347003641078, |
| "grad_norm": 1.1287168264389038, |
| "learning_rate": 1.6956521739130435e-06, |
| "loss": 0.3088, |
| "step": 43100 |
| }, |
| { |
| "epoch": 4.154347003641078, |
| "eval_loss": 0.2760840952396393, |
| "eval_runtime": 40.0864, |
| "eval_samples_per_second": 249.461, |
| "eval_steps_per_second": 31.183, |
| "step": 43100 |
| }, |
| { |
| "epoch": 4.163985859798018, |
| "grad_norm": 1.1330616474151611, |
| "learning_rate": 1.6856187290969898e-06, |
| "loss": 0.3111, |
| "step": 43200 |
| }, |
| { |
| "epoch": 4.163985859798018, |
| "eval_loss": 0.2729859948158264, |
| "eval_runtime": 40.1988, |
| "eval_samples_per_second": 248.764, |
| "eval_steps_per_second": 31.095, |
| "step": 43200 |
| }, |
| { |
| "epoch": 4.173624715954958, |
| "grad_norm": 1.1357420682907104, |
| "learning_rate": 1.6755852842809363e-06, |
| "loss": 0.3092, |
| "step": 43300 |
| }, |
| { |
| "epoch": 4.173624715954958, |
| "eval_loss": 0.26835474371910095, |
| "eval_runtime": 40.0242, |
| "eval_samples_per_second": 249.849, |
| "eval_steps_per_second": 31.231, |
| "step": 43300 |
| }, |
| { |
| "epoch": 4.183263572111898, |
| "grad_norm": 1.0758098363876343, |
| "learning_rate": 1.665551839464883e-06, |
| "loss": 0.3091, |
| "step": 43400 |
| }, |
| { |
| "epoch": 4.183263572111898, |
| "eval_loss": 0.271380215883255, |
| "eval_runtime": 40.039, |
| "eval_samples_per_second": 249.757, |
| "eval_steps_per_second": 31.22, |
| "step": 43400 |
| }, |
| { |
| "epoch": 4.192902428268837, |
| "grad_norm": 1.1093837022781372, |
| "learning_rate": 1.6555183946488294e-06, |
| "loss": 0.3092, |
| "step": 43500 |
| }, |
| { |
| "epoch": 4.192902428268837, |
| "eval_loss": 0.276606947183609, |
| "eval_runtime": 40.058, |
| "eval_samples_per_second": 249.638, |
| "eval_steps_per_second": 31.205, |
| "step": 43500 |
| }, |
| { |
| "epoch": 4.202541284425777, |
| "grad_norm": 1.1726568937301636, |
| "learning_rate": 1.6454849498327758e-06, |
| "loss": 0.3089, |
| "step": 43600 |
| }, |
| { |
| "epoch": 4.202541284425777, |
| "eval_loss": 0.27440130710601807, |
| "eval_runtime": 40.2425, |
| "eval_samples_per_second": 248.493, |
| "eval_steps_per_second": 31.062, |
| "step": 43600 |
| }, |
| { |
| "epoch": 4.212180140582717, |
| "grad_norm": 1.137256145477295, |
| "learning_rate": 1.6354515050167223e-06, |
| "loss": 0.3082, |
| "step": 43700 |
| }, |
| { |
| "epoch": 4.212180140582717, |
| "eval_loss": 0.268880695104599, |
| "eval_runtime": 40.1041, |
| "eval_samples_per_second": 249.351, |
| "eval_steps_per_second": 31.169, |
| "step": 43700 |
| }, |
| { |
| "epoch": 4.221818996739657, |
| "grad_norm": 1.194308876991272, |
| "learning_rate": 1.6254180602006689e-06, |
| "loss": 0.3089, |
| "step": 43800 |
| }, |
| { |
| "epoch": 4.221818996739657, |
| "eval_loss": 0.2654968202114105, |
| "eval_runtime": 40.0755, |
| "eval_samples_per_second": 249.529, |
| "eval_steps_per_second": 31.191, |
| "step": 43800 |
| }, |
| { |
| "epoch": 4.231457852896597, |
| "grad_norm": 1.1237385272979736, |
| "learning_rate": 1.6153846153846154e-06, |
| "loss": 0.3096, |
| "step": 43900 |
| }, |
| { |
| "epoch": 4.231457852896597, |
| "eval_loss": 0.2736607491970062, |
| "eval_runtime": 40.0782, |
| "eval_samples_per_second": 249.512, |
| "eval_steps_per_second": 31.189, |
| "step": 43900 |
| }, |
| { |
| "epoch": 4.2410967090535365, |
| "grad_norm": 1.0342357158660889, |
| "learning_rate": 1.6053511705685618e-06, |
| "loss": 0.3088, |
| "step": 44000 |
| }, |
| { |
| "epoch": 4.2410967090535365, |
| "eval_loss": 0.27211064100265503, |
| "eval_runtime": 40.0588, |
| "eval_samples_per_second": 249.633, |
| "eval_steps_per_second": 31.204, |
| "step": 44000 |
| }, |
| { |
| "epoch": 4.250735565210476, |
| "grad_norm": 1.190117597579956, |
| "learning_rate": 1.5953177257525083e-06, |
| "loss": 0.3092, |
| "step": 44100 |
| }, |
| { |
| "epoch": 4.250735565210476, |
| "eval_loss": 0.2710612416267395, |
| "eval_runtime": 40.227, |
| "eval_samples_per_second": 248.589, |
| "eval_steps_per_second": 31.074, |
| "step": 44100 |
| }, |
| { |
| "epoch": 4.260374421367416, |
| "grad_norm": 1.1709164381027222, |
| "learning_rate": 1.5852842809364549e-06, |
| "loss": 0.3081, |
| "step": 44200 |
| }, |
| { |
| "epoch": 4.260374421367416, |
| "eval_loss": 0.272073358297348, |
| "eval_runtime": 40.0531, |
| "eval_samples_per_second": 249.669, |
| "eval_steps_per_second": 31.209, |
| "step": 44200 |
| }, |
| { |
| "epoch": 4.270013277524356, |
| "grad_norm": 1.2000877857208252, |
| "learning_rate": 1.5752508361204012e-06, |
| "loss": 0.3082, |
| "step": 44300 |
| }, |
| { |
| "epoch": 4.270013277524356, |
| "eval_loss": 0.2707884907722473, |
| "eval_runtime": 40.0511, |
| "eval_samples_per_second": 249.681, |
| "eval_steps_per_second": 31.21, |
| "step": 44300 |
| }, |
| { |
| "epoch": 4.279652133681296, |
| "grad_norm": 1.084593415260315, |
| "learning_rate": 1.5652173913043478e-06, |
| "loss": 0.308, |
| "step": 44400 |
| }, |
| { |
| "epoch": 4.279652133681296, |
| "eval_loss": 0.26928406953811646, |
| "eval_runtime": 40.0322, |
| "eval_samples_per_second": 249.799, |
| "eval_steps_per_second": 31.225, |
| "step": 44400 |
| }, |
| { |
| "epoch": 4.289290989838236, |
| "grad_norm": 1.0698468685150146, |
| "learning_rate": 1.5551839464882943e-06, |
| "loss": 0.3061, |
| "step": 44500 |
| }, |
| { |
| "epoch": 4.289290989838236, |
| "eval_loss": 0.2769457697868347, |
| "eval_runtime": 40.2095, |
| "eval_samples_per_second": 248.697, |
| "eval_steps_per_second": 31.087, |
| "step": 44500 |
| }, |
| { |
| "epoch": 4.2989298459951755, |
| "grad_norm": 1.0958250761032104, |
| "learning_rate": 1.5451505016722409e-06, |
| "loss": 0.3097, |
| "step": 44600 |
| }, |
| { |
| "epoch": 4.2989298459951755, |
| "eval_loss": 0.26975879073143005, |
| "eval_runtime": 40.0708, |
| "eval_samples_per_second": 249.558, |
| "eval_steps_per_second": 31.195, |
| "step": 44600 |
| }, |
| { |
| "epoch": 4.308568702152115, |
| "grad_norm": 1.27885103225708, |
| "learning_rate": 1.5351170568561872e-06, |
| "loss": 0.3063, |
| "step": 44700 |
| }, |
| { |
| "epoch": 4.308568702152115, |
| "eval_loss": 0.2665342688560486, |
| "eval_runtime": 40.2308, |
| "eval_samples_per_second": 248.566, |
| "eval_steps_per_second": 31.071, |
| "step": 44700 |
| }, |
| { |
| "epoch": 4.318207558309055, |
| "grad_norm": 1.1225556135177612, |
| "learning_rate": 1.5250836120401338e-06, |
| "loss": 0.3084, |
| "step": 44800 |
| }, |
| { |
| "epoch": 4.318207558309055, |
| "eval_loss": 0.266510546207428, |
| "eval_runtime": 40.0617, |
| "eval_samples_per_second": 249.615, |
| "eval_steps_per_second": 31.202, |
| "step": 44800 |
| }, |
| { |
| "epoch": 4.327846414465995, |
| "grad_norm": 1.1451354026794434, |
| "learning_rate": 1.5150501672240803e-06, |
| "loss": 0.3093, |
| "step": 44900 |
| }, |
| { |
| "epoch": 4.327846414465995, |
| "eval_loss": 0.26877492666244507, |
| "eval_runtime": 40.0203, |
| "eval_samples_per_second": 249.873, |
| "eval_steps_per_second": 31.234, |
| "step": 44900 |
| }, |
| { |
| "epoch": 4.337485270622935, |
| "grad_norm": 1.1867423057556152, |
| "learning_rate": 1.5050167224080269e-06, |
| "loss": 0.3064, |
| "step": 45000 |
| }, |
| { |
| "epoch": 4.337485270622935, |
| "eval_loss": 0.27599042654037476, |
| "eval_runtime": 40.0219, |
| "eval_samples_per_second": 249.863, |
| "eval_steps_per_second": 31.233, |
| "step": 45000 |
| }, |
| { |
| "epoch": 4.347124126779875, |
| "grad_norm": 1.1630709171295166, |
| "learning_rate": 1.4949832775919732e-06, |
| "loss": 0.3078, |
| "step": 45100 |
| }, |
| { |
| "epoch": 4.347124126779875, |
| "eval_loss": 0.27023693919181824, |
| "eval_runtime": 40.0226, |
| "eval_samples_per_second": 249.859, |
| "eval_steps_per_second": 31.232, |
| "step": 45100 |
| }, |
| { |
| "epoch": 4.356762982936815, |
| "grad_norm": 1.017093300819397, |
| "learning_rate": 1.4849498327759198e-06, |
| "loss": 0.3058, |
| "step": 45200 |
| }, |
| { |
| "epoch": 4.356762982936815, |
| "eval_loss": 0.26991012692451477, |
| "eval_runtime": 40.1764, |
| "eval_samples_per_second": 248.902, |
| "eval_steps_per_second": 31.113, |
| "step": 45200 |
| }, |
| { |
| "epoch": 4.366401839093755, |
| "grad_norm": 1.2074520587921143, |
| "learning_rate": 1.4749163879598663e-06, |
| "loss": 0.3068, |
| "step": 45300 |
| }, |
| { |
| "epoch": 4.366401839093755, |
| "eval_loss": 0.26959505677223206, |
| "eval_runtime": 40.0291, |
| "eval_samples_per_second": 249.819, |
| "eval_steps_per_second": 31.227, |
| "step": 45300 |
| }, |
| { |
| "epoch": 4.376040695250695, |
| "grad_norm": 1.1239268779754639, |
| "learning_rate": 1.4648829431438129e-06, |
| "loss": 0.3055, |
| "step": 45400 |
| }, |
| { |
| "epoch": 4.376040695250695, |
| "eval_loss": 0.26887837052345276, |
| "eval_runtime": 40.0551, |
| "eval_samples_per_second": 249.656, |
| "eval_steps_per_second": 31.207, |
| "step": 45400 |
| }, |
| { |
| "epoch": 4.385679551407635, |
| "grad_norm": 1.1952552795410156, |
| "learning_rate": 1.4548494983277592e-06, |
| "loss": 0.3069, |
| "step": 45500 |
| }, |
| { |
| "epoch": 4.385679551407635, |
| "eval_loss": 0.268917053937912, |
| "eval_runtime": 40.0378, |
| "eval_samples_per_second": 249.764, |
| "eval_steps_per_second": 31.22, |
| "step": 45500 |
| }, |
| { |
| "epoch": 4.395318407564575, |
| "grad_norm": 1.2636454105377197, |
| "learning_rate": 1.4448160535117058e-06, |
| "loss": 0.3047, |
| "step": 45600 |
| }, |
| { |
| "epoch": 4.395318407564575, |
| "eval_loss": 0.2656193673610687, |
| "eval_runtime": 40.2451, |
| "eval_samples_per_second": 248.478, |
| "eval_steps_per_second": 31.06, |
| "step": 45600 |
| }, |
| { |
| "epoch": 4.404957263721514, |
| "grad_norm": 1.2565031051635742, |
| "learning_rate": 1.4347826086956523e-06, |
| "loss": 0.3032, |
| "step": 45700 |
| }, |
| { |
| "epoch": 4.404957263721514, |
| "eval_loss": 0.2656501531600952, |
| "eval_runtime": 40.0216, |
| "eval_samples_per_second": 249.865, |
| "eval_steps_per_second": 31.233, |
| "step": 45700 |
| }, |
| { |
| "epoch": 4.414596119878454, |
| "grad_norm": 1.1796543598175049, |
| "learning_rate": 1.4247491638795989e-06, |
| "loss": 0.305, |
| "step": 45800 |
| }, |
| { |
| "epoch": 4.414596119878454, |
| "eval_loss": 0.26977530121803284, |
| "eval_runtime": 40.0013, |
| "eval_samples_per_second": 249.992, |
| "eval_steps_per_second": 31.249, |
| "step": 45800 |
| }, |
| { |
| "epoch": 4.424234976035394, |
| "grad_norm": 1.1382367610931396, |
| "learning_rate": 1.4147157190635452e-06, |
| "loss": 0.3073, |
| "step": 45900 |
| }, |
| { |
| "epoch": 4.424234976035394, |
| "eval_loss": 0.2679605185985565, |
| "eval_runtime": 40.0148, |
| "eval_samples_per_second": 249.908, |
| "eval_steps_per_second": 31.238, |
| "step": 45900 |
| }, |
| { |
| "epoch": 4.433873832192334, |
| "grad_norm": 1.094008445739746, |
| "learning_rate": 1.4046822742474917e-06, |
| "loss": 0.3051, |
| "step": 46000 |
| }, |
| { |
| "epoch": 4.433873832192334, |
| "eval_loss": 0.26570913195610046, |
| "eval_runtime": 40.0087, |
| "eval_samples_per_second": 249.946, |
| "eval_steps_per_second": 31.243, |
| "step": 46000 |
| }, |
| { |
| "epoch": 4.443512688349274, |
| "grad_norm": 0.9775266051292419, |
| "learning_rate": 1.3946488294314383e-06, |
| "loss": 0.3026, |
| "step": 46100 |
| }, |
| { |
| "epoch": 4.443512688349274, |
| "eval_loss": 0.2708422541618347, |
| "eval_runtime": 40.2084, |
| "eval_samples_per_second": 248.705, |
| "eval_steps_per_second": 31.088, |
| "step": 46100 |
| }, |
| { |
| "epoch": 4.4531515445062135, |
| "grad_norm": 1.1069421768188477, |
| "learning_rate": 1.3846153846153846e-06, |
| "loss": 0.3025, |
| "step": 46200 |
| }, |
| { |
| "epoch": 4.4531515445062135, |
| "eval_loss": 0.2691509425640106, |
| "eval_runtime": 40.0314, |
| "eval_samples_per_second": 249.804, |
| "eval_steps_per_second": 31.226, |
| "step": 46200 |
| }, |
| { |
| "epoch": 4.462790400663153, |
| "grad_norm": 1.1647151708602905, |
| "learning_rate": 1.374581939799331e-06, |
| "loss": 0.3027, |
| "step": 46300 |
| }, |
| { |
| "epoch": 4.462790400663153, |
| "eval_loss": 0.26704952120780945, |
| "eval_runtime": 40.0318, |
| "eval_samples_per_second": 249.802, |
| "eval_steps_per_second": 31.225, |
| "step": 46300 |
| }, |
| { |
| "epoch": 4.472429256820093, |
| "grad_norm": 1.1714391708374023, |
| "learning_rate": 1.3645484949832775e-06, |
| "loss": 0.304, |
| "step": 46400 |
| }, |
| { |
| "epoch": 4.472429256820093, |
| "eval_loss": 0.2679431140422821, |
| "eval_runtime": 40.04, |
| "eval_samples_per_second": 249.75, |
| "eval_steps_per_second": 31.219, |
| "step": 46400 |
| }, |
| { |
| "epoch": 4.482068112977033, |
| "grad_norm": 1.1067938804626465, |
| "learning_rate": 1.354515050167224e-06, |
| "loss": 0.3024, |
| "step": 46500 |
| }, |
| { |
| "epoch": 4.482068112977033, |
| "eval_loss": 0.26625776290893555, |
| "eval_runtime": 40.0422, |
| "eval_samples_per_second": 249.737, |
| "eval_steps_per_second": 31.217, |
| "step": 46500 |
| }, |
| { |
| "epoch": 4.491706969133973, |
| "grad_norm": 1.1131701469421387, |
| "learning_rate": 1.3444816053511706e-06, |
| "loss": 0.3026, |
| "step": 46600 |
| }, |
| { |
| "epoch": 4.491706969133973, |
| "eval_loss": 0.2679072916507721, |
| "eval_runtime": 40.2222, |
| "eval_samples_per_second": 248.619, |
| "eval_steps_per_second": 31.077, |
| "step": 46600 |
| }, |
| { |
| "epoch": 4.501345825290913, |
| "grad_norm": 1.0876435041427612, |
| "learning_rate": 1.334448160535117e-06, |
| "loss": 0.306, |
| "step": 46700 |
| }, |
| { |
| "epoch": 4.501345825290913, |
| "eval_loss": 0.2695472836494446, |
| "eval_runtime": 40.0345, |
| "eval_samples_per_second": 249.785, |
| "eval_steps_per_second": 31.223, |
| "step": 46700 |
| }, |
| { |
| "epoch": 4.5109846814478525, |
| "grad_norm": 1.0998533964157104, |
| "learning_rate": 1.3244147157190635e-06, |
| "loss": 0.2985, |
| "step": 46800 |
| }, |
| { |
| "epoch": 4.5109846814478525, |
| "eval_loss": 0.2631019949913025, |
| "eval_runtime": 40.039, |
| "eval_samples_per_second": 249.756, |
| "eval_steps_per_second": 31.22, |
| "step": 46800 |
| }, |
| { |
| "epoch": 4.520623537604792, |
| "grad_norm": 1.0389171838760376, |
| "learning_rate": 1.31438127090301e-06, |
| "loss": 0.3037, |
| "step": 46900 |
| }, |
| { |
| "epoch": 4.520623537604792, |
| "eval_loss": 0.2689988315105438, |
| "eval_runtime": 40.0346, |
| "eval_samples_per_second": 249.784, |
| "eval_steps_per_second": 31.223, |
| "step": 46900 |
| }, |
| { |
| "epoch": 4.530262393761732, |
| "grad_norm": 1.1371253728866577, |
| "learning_rate": 1.3043478260869566e-06, |
| "loss": 0.3016, |
| "step": 47000 |
| }, |
| { |
| "epoch": 4.530262393761732, |
| "eval_loss": 0.2646733820438385, |
| "eval_runtime": 40.2249, |
| "eval_samples_per_second": 248.602, |
| "eval_steps_per_second": 31.075, |
| "step": 47000 |
| }, |
| { |
| "epoch": 4.539901249918672, |
| "grad_norm": 1.1179282665252686, |
| "learning_rate": 1.294314381270903e-06, |
| "loss": 0.3005, |
| "step": 47100 |
| }, |
| { |
| "epoch": 4.539901249918672, |
| "eval_loss": 0.2690794765949249, |
| "eval_runtime": 40.071, |
| "eval_samples_per_second": 249.557, |
| "eval_steps_per_second": 31.195, |
| "step": 47100 |
| }, |
| { |
| "epoch": 4.549540106075612, |
| "grad_norm": 1.0551939010620117, |
| "learning_rate": 1.2842809364548495e-06, |
| "loss": 0.3041, |
| "step": 47200 |
| }, |
| { |
| "epoch": 4.549540106075612, |
| "eval_loss": 0.26324746012687683, |
| "eval_runtime": 40.2273, |
| "eval_samples_per_second": 248.587, |
| "eval_steps_per_second": 31.073, |
| "step": 47200 |
| }, |
| { |
| "epoch": 4.559178962232552, |
| "grad_norm": 1.0626869201660156, |
| "learning_rate": 1.274247491638796e-06, |
| "loss": 0.3038, |
| "step": 47300 |
| }, |
| { |
| "epoch": 4.559178962232552, |
| "eval_loss": 0.26168400049209595, |
| "eval_runtime": 40.2137, |
| "eval_samples_per_second": 248.672, |
| "eval_steps_per_second": 31.084, |
| "step": 47300 |
| }, |
| { |
| "epoch": 4.568817818389491, |
| "grad_norm": 1.1616398096084595, |
| "learning_rate": 1.2642140468227424e-06, |
| "loss": 0.3006, |
| "step": 47400 |
| }, |
| { |
| "epoch": 4.568817818389491, |
| "eval_loss": 0.2649966776371002, |
| "eval_runtime": 40.0842, |
| "eval_samples_per_second": 249.475, |
| "eval_steps_per_second": 31.184, |
| "step": 47400 |
| }, |
| { |
| "epoch": 4.578456674546431, |
| "grad_norm": 1.1772770881652832, |
| "learning_rate": 1.254180602006689e-06, |
| "loss": 0.303, |
| "step": 47500 |
| }, |
| { |
| "epoch": 4.578456674546431, |
| "eval_loss": 0.2650017738342285, |
| "eval_runtime": 40.0723, |
| "eval_samples_per_second": 249.549, |
| "eval_steps_per_second": 31.194, |
| "step": 47500 |
| }, |
| { |
| "epoch": 4.588095530703371, |
| "grad_norm": 1.1673014163970947, |
| "learning_rate": 1.2441471571906355e-06, |
| "loss": 0.3009, |
| "step": 47600 |
| }, |
| { |
| "epoch": 4.588095530703371, |
| "eval_loss": 0.2697572112083435, |
| "eval_runtime": 40.0689, |
| "eval_samples_per_second": 249.57, |
| "eval_steps_per_second": 31.196, |
| "step": 47600 |
| }, |
| { |
| "epoch": 4.597734386860312, |
| "grad_norm": 1.1624304056167603, |
| "learning_rate": 1.234113712374582e-06, |
| "loss": 0.3049, |
| "step": 47700 |
| }, |
| { |
| "epoch": 4.597734386860312, |
| "eval_loss": 0.26627808809280396, |
| "eval_runtime": 40.1927, |
| "eval_samples_per_second": 248.801, |
| "eval_steps_per_second": 31.1, |
| "step": 47700 |
| }, |
| { |
| "epoch": 4.607373243017252, |
| "grad_norm": 1.0767208337783813, |
| "learning_rate": 1.2240802675585284e-06, |
| "loss": 0.3021, |
| "step": 47800 |
| }, |
| { |
| "epoch": 4.607373243017252, |
| "eval_loss": 0.2637022137641907, |
| "eval_runtime": 40.0396, |
| "eval_samples_per_second": 249.753, |
| "eval_steps_per_second": 31.219, |
| "step": 47800 |
| }, |
| { |
| "epoch": 4.617012099174191, |
| "grad_norm": 1.1391677856445312, |
| "learning_rate": 1.214046822742475e-06, |
| "loss": 0.3013, |
| "step": 47900 |
| }, |
| { |
| "epoch": 4.617012099174191, |
| "eval_loss": 0.2667087912559509, |
| "eval_runtime": 40.0721, |
| "eval_samples_per_second": 249.55, |
| "eval_steps_per_second": 31.194, |
| "step": 47900 |
| }, |
| { |
| "epoch": 4.626650955331131, |
| "grad_norm": 1.1649397611618042, |
| "learning_rate": 1.2040133779264215e-06, |
| "loss": 0.3002, |
| "step": 48000 |
| }, |
| { |
| "epoch": 4.626650955331131, |
| "eval_loss": 0.27446261048316956, |
| "eval_runtime": 40.0365, |
| "eval_samples_per_second": 249.772, |
| "eval_steps_per_second": 31.222, |
| "step": 48000 |
| }, |
| { |
| "epoch": 4.636289811488071, |
| "grad_norm": 1.074158787727356, |
| "learning_rate": 1.193979933110368e-06, |
| "loss": 0.3015, |
| "step": 48100 |
| }, |
| { |
| "epoch": 4.636289811488071, |
| "eval_loss": 0.2660681903362274, |
| "eval_runtime": 40.0402, |
| "eval_samples_per_second": 249.749, |
| "eval_steps_per_second": 31.219, |
| "step": 48100 |
| }, |
| { |
| "epoch": 4.645928667645011, |
| "grad_norm": 1.1402333974838257, |
| "learning_rate": 1.1839464882943144e-06, |
| "loss": 0.3013, |
| "step": 48200 |
| }, |
| { |
| "epoch": 4.645928667645011, |
| "eval_loss": 0.2649831175804138, |
| "eval_runtime": 40.1953, |
| "eval_samples_per_second": 248.785, |
| "eval_steps_per_second": 31.098, |
| "step": 48200 |
| }, |
| { |
| "epoch": 4.655567523801951, |
| "grad_norm": 1.1394840478897095, |
| "learning_rate": 1.173913043478261e-06, |
| "loss": 0.2998, |
| "step": 48300 |
| }, |
| { |
| "epoch": 4.655567523801951, |
| "eval_loss": 0.26467251777648926, |
| "eval_runtime": 40.0301, |
| "eval_samples_per_second": 249.812, |
| "eval_steps_per_second": 31.226, |
| "step": 48300 |
| }, |
| { |
| "epoch": 4.6652063799588905, |
| "grad_norm": 1.151784896850586, |
| "learning_rate": 1.1638795986622075e-06, |
| "loss": 0.2998, |
| "step": 48400 |
| }, |
| { |
| "epoch": 4.6652063799588905, |
| "eval_loss": 0.2676987946033478, |
| "eval_runtime": 40.0304, |
| "eval_samples_per_second": 249.81, |
| "eval_steps_per_second": 31.226, |
| "step": 48400 |
| }, |
| { |
| "epoch": 4.67484523611583, |
| "grad_norm": 1.134567379951477, |
| "learning_rate": 1.153846153846154e-06, |
| "loss": 0.3026, |
| "step": 48500 |
| }, |
| { |
| "epoch": 4.67484523611583, |
| "eval_loss": 0.2631213068962097, |
| "eval_runtime": 40.0346, |
| "eval_samples_per_second": 249.784, |
| "eval_steps_per_second": 31.223, |
| "step": 48500 |
| }, |
| { |
| "epoch": 4.68448409227277, |
| "grad_norm": 1.1139414310455322, |
| "learning_rate": 1.1438127090301004e-06, |
| "loss": 0.2987, |
| "step": 48600 |
| }, |
| { |
| "epoch": 4.68448409227277, |
| "eval_loss": 0.2675442397594452, |
| "eval_runtime": 40.2414, |
| "eval_samples_per_second": 248.5, |
| "eval_steps_per_second": 31.063, |
| "step": 48600 |
| }, |
| { |
| "epoch": 4.69412294842971, |
| "grad_norm": 1.140663504600525, |
| "learning_rate": 1.133779264214047e-06, |
| "loss": 0.3005, |
| "step": 48700 |
| }, |
| { |
| "epoch": 4.69412294842971, |
| "eval_loss": 0.26258584856987, |
| "eval_runtime": 40.0562, |
| "eval_samples_per_second": 249.649, |
| "eval_steps_per_second": 31.206, |
| "step": 48700 |
| }, |
| { |
| "epoch": 4.70376180458665, |
| "grad_norm": 1.1587011814117432, |
| "learning_rate": 1.1237458193979933e-06, |
| "loss": 0.2998, |
| "step": 48800 |
| }, |
| { |
| "epoch": 4.70376180458665, |
| "eval_loss": 0.2626008987426758, |
| "eval_runtime": 40.0476, |
| "eval_samples_per_second": 249.703, |
| "eval_steps_per_second": 31.213, |
| "step": 48800 |
| }, |
| { |
| "epoch": 4.71340066074359, |
| "grad_norm": 1.2201813459396362, |
| "learning_rate": 1.1137123745819398e-06, |
| "loss": 0.2994, |
| "step": 48900 |
| }, |
| { |
| "epoch": 4.71340066074359, |
| "eval_loss": 0.26159748435020447, |
| "eval_runtime": 40.0689, |
| "eval_samples_per_second": 249.57, |
| "eval_steps_per_second": 31.196, |
| "step": 48900 |
| }, |
| { |
| "epoch": 4.7230395169005295, |
| "grad_norm": 1.0543116331100464, |
| "learning_rate": 1.1036789297658862e-06, |
| "loss": 0.3009, |
| "step": 49000 |
| }, |
| { |
| "epoch": 4.7230395169005295, |
| "eval_loss": 0.2661503553390503, |
| "eval_runtime": 40.0833, |
| "eval_samples_per_second": 249.481, |
| "eval_steps_per_second": 31.185, |
| "step": 49000 |
| }, |
| { |
| "epoch": 4.732678373057469, |
| "grad_norm": 1.0388127565383911, |
| "learning_rate": 1.0936454849498327e-06, |
| "loss": 0.3005, |
| "step": 49100 |
| }, |
| { |
| "epoch": 4.732678373057469, |
| "eval_loss": 0.2635759711265564, |
| "eval_runtime": 40.2341, |
| "eval_samples_per_second": 248.545, |
| "eval_steps_per_second": 31.068, |
| "step": 49100 |
| }, |
| { |
| "epoch": 4.742317229214409, |
| "grad_norm": 1.133090853691101, |
| "learning_rate": 1.0836120401337793e-06, |
| "loss": 0.298, |
| "step": 49200 |
| }, |
| { |
| "epoch": 4.742317229214409, |
| "eval_loss": 0.25813475251197815, |
| "eval_runtime": 40.0481, |
| "eval_samples_per_second": 249.7, |
| "eval_steps_per_second": 31.212, |
| "step": 49200 |
| }, |
| { |
| "epoch": 4.751956085371349, |
| "grad_norm": 1.149713158607483, |
| "learning_rate": 1.0735785953177258e-06, |
| "loss": 0.3011, |
| "step": 49300 |
| }, |
| { |
| "epoch": 4.751956085371349, |
| "eval_loss": 0.26733270287513733, |
| "eval_runtime": 40.0514, |
| "eval_samples_per_second": 249.679, |
| "eval_steps_per_second": 31.21, |
| "step": 49300 |
| }, |
| { |
| "epoch": 4.761594941528289, |
| "grad_norm": 1.1020680665969849, |
| "learning_rate": 1.0635451505016722e-06, |
| "loss": 0.3003, |
| "step": 49400 |
| }, |
| { |
| "epoch": 4.761594941528289, |
| "eval_loss": 0.26301661133766174, |
| "eval_runtime": 40.0244, |
| "eval_samples_per_second": 249.848, |
| "eval_steps_per_second": 31.231, |
| "step": 49400 |
| }, |
| { |
| "epoch": 4.771233797685229, |
| "grad_norm": 1.1501930952072144, |
| "learning_rate": 1.0535117056856187e-06, |
| "loss": 0.2994, |
| "step": 49500 |
| }, |
| { |
| "epoch": 4.771233797685229, |
| "eval_loss": 0.2652057111263275, |
| "eval_runtime": 40.203, |
| "eval_samples_per_second": 248.737, |
| "eval_steps_per_second": 31.092, |
| "step": 49500 |
| }, |
| { |
| "epoch": 4.780872653842168, |
| "grad_norm": 1.0680407285690308, |
| "learning_rate": 1.0434782608695653e-06, |
| "loss": 0.3026, |
| "step": 49600 |
| }, |
| { |
| "epoch": 4.780872653842168, |
| "eval_loss": 0.26839718222618103, |
| "eval_runtime": 40.0443, |
| "eval_samples_per_second": 249.723, |
| "eval_steps_per_second": 31.215, |
| "step": 49600 |
| }, |
| { |
| "epoch": 4.790511509999108, |
| "grad_norm": 1.1714107990264893, |
| "learning_rate": 1.0334448160535118e-06, |
| "loss": 0.3, |
| "step": 49700 |
| }, |
| { |
| "epoch": 4.790511509999108, |
| "eval_loss": 0.26893192529678345, |
| "eval_runtime": 40.0425, |
| "eval_samples_per_second": 249.735, |
| "eval_steps_per_second": 31.217, |
| "step": 49700 |
| }, |
| { |
| "epoch": 4.800150366156048, |
| "grad_norm": 1.216985821723938, |
| "learning_rate": 1.0234113712374581e-06, |
| "loss": 0.2996, |
| "step": 49800 |
| }, |
| { |
| "epoch": 4.800150366156048, |
| "eval_loss": 0.2666034698486328, |
| "eval_runtime": 40.0734, |
| "eval_samples_per_second": 249.542, |
| "eval_steps_per_second": 31.193, |
| "step": 49800 |
| }, |
| { |
| "epoch": 4.809789222312988, |
| "grad_norm": 1.0444952249526978, |
| "learning_rate": 1.0133779264214047e-06, |
| "loss": 0.2991, |
| "step": 49900 |
| }, |
| { |
| "epoch": 4.809789222312988, |
| "eval_loss": 0.26440900564193726, |
| "eval_runtime": 40.0587, |
| "eval_samples_per_second": 249.634, |
| "eval_steps_per_second": 31.204, |
| "step": 49900 |
| }, |
| { |
| "epoch": 4.819428078469928, |
| "grad_norm": 1.002389907836914, |
| "learning_rate": 1.0033444816053512e-06, |
| "loss": 0.3, |
| "step": 50000 |
| }, |
| { |
| "epoch": 4.819428078469928, |
| "eval_loss": 0.2678601145744324, |
| "eval_runtime": 40.2234, |
| "eval_samples_per_second": 248.612, |
| "eval_steps_per_second": 31.076, |
| "step": 50000 |
| }, |
| { |
| "epoch": 4.8290669346268675, |
| "grad_norm": 1.083066701889038, |
| "learning_rate": 9.933110367892976e-07, |
| "loss": 0.3003, |
| "step": 50100 |
| }, |
| { |
| "epoch": 4.8290669346268675, |
| "eval_loss": 0.26301416754722595, |
| "eval_runtime": 40.0381, |
| "eval_samples_per_second": 249.762, |
| "eval_steps_per_second": 31.22, |
| "step": 50100 |
| }, |
| { |
| "epoch": 4.838705790783807, |
| "grad_norm": 1.0669959783554077, |
| "learning_rate": 9.832775919732441e-07, |
| "loss": 0.2988, |
| "step": 50200 |
| }, |
| { |
| "epoch": 4.838705790783807, |
| "eval_loss": 0.2668324112892151, |
| "eval_runtime": 40.0364, |
| "eval_samples_per_second": 249.772, |
| "eval_steps_per_second": 31.222, |
| "step": 50200 |
| }, |
| { |
| "epoch": 4.848344646940747, |
| "grad_norm": 1.253390908241272, |
| "learning_rate": 9.732441471571907e-07, |
| "loss": 0.2984, |
| "step": 50300 |
| }, |
| { |
| "epoch": 4.848344646940747, |
| "eval_loss": 0.2622229754924774, |
| "eval_runtime": 40.0375, |
| "eval_samples_per_second": 249.766, |
| "eval_steps_per_second": 31.221, |
| "step": 50300 |
| }, |
| { |
| "epoch": 4.857983503097687, |
| "grad_norm": 1.0932843685150146, |
| "learning_rate": 9.632107023411372e-07, |
| "loss": 0.2997, |
| "step": 50400 |
| }, |
| { |
| "epoch": 4.857983503097687, |
| "eval_loss": 0.2585309147834778, |
| "eval_runtime": 40.0358, |
| "eval_samples_per_second": 249.776, |
| "eval_steps_per_second": 31.222, |
| "step": 50400 |
| }, |
| { |
| "epoch": 4.867622359254627, |
| "grad_norm": 1.089406967163086, |
| "learning_rate": 9.531772575250837e-07, |
| "loss": 0.2995, |
| "step": 50500 |
| }, |
| { |
| "epoch": 4.867622359254627, |
| "eval_loss": 0.26007798314094543, |
| "eval_runtime": 40.2316, |
| "eval_samples_per_second": 248.561, |
| "eval_steps_per_second": 31.07, |
| "step": 50500 |
| }, |
| { |
| "epoch": 4.877261215411567, |
| "grad_norm": 1.0903793573379517, |
| "learning_rate": 9.431438127090301e-07, |
| "loss": 0.2966, |
| "step": 50600 |
| }, |
| { |
| "epoch": 4.877261215411567, |
| "eval_loss": 0.26448899507522583, |
| "eval_runtime": 40.0444, |
| "eval_samples_per_second": 249.723, |
| "eval_steps_per_second": 31.215, |
| "step": 50600 |
| }, |
| { |
| "epoch": 4.8869000715685065, |
| "grad_norm": 1.1804698705673218, |
| "learning_rate": 9.331103678929767e-07, |
| "loss": 0.2975, |
| "step": 50700 |
| }, |
| { |
| "epoch": 4.8869000715685065, |
| "eval_loss": 0.2629201114177704, |
| "eval_runtime": 40.056, |
| "eval_samples_per_second": 249.65, |
| "eval_steps_per_second": 31.206, |
| "step": 50700 |
| }, |
| { |
| "epoch": 4.896538927725447, |
| "grad_norm": 1.0072447061538696, |
| "learning_rate": 9.230769230769231e-07, |
| "loss": 0.2971, |
| "step": 50800 |
| }, |
| { |
| "epoch": 4.896538927725447, |
| "eval_loss": 0.2699527442455292, |
| "eval_runtime": 40.2322, |
| "eval_samples_per_second": 248.557, |
| "eval_steps_per_second": 31.07, |
| "step": 50800 |
| }, |
| { |
| "epoch": 4.906177783882387, |
| "grad_norm": 1.1153916120529175, |
| "learning_rate": 9.130434782608697e-07, |
| "loss": 0.2991, |
| "step": 50900 |
| }, |
| { |
| "epoch": 4.906177783882387, |
| "eval_loss": 0.2644394040107727, |
| "eval_runtime": 40.0519, |
| "eval_samples_per_second": 249.676, |
| "eval_steps_per_second": 31.209, |
| "step": 50900 |
| }, |
| { |
| "epoch": 4.915816640039327, |
| "grad_norm": 1.1166541576385498, |
| "learning_rate": 9.030100334448161e-07, |
| "loss": 0.2985, |
| "step": 51000 |
| }, |
| { |
| "epoch": 4.915816640039327, |
| "eval_loss": 0.2613272964954376, |
| "eval_runtime": 40.0716, |
| "eval_samples_per_second": 249.553, |
| "eval_steps_per_second": 31.194, |
| "step": 51000 |
| }, |
| { |
| "epoch": 4.925455496196267, |
| "grad_norm": 1.062063455581665, |
| "learning_rate": 8.929765886287627e-07, |
| "loss": 0.2959, |
| "step": 51100 |
| }, |
| { |
| "epoch": 4.925455496196267, |
| "eval_loss": 0.2620924413204193, |
| "eval_runtime": 40.0632, |
| "eval_samples_per_second": 249.605, |
| "eval_steps_per_second": 31.201, |
| "step": 51100 |
| }, |
| { |
| "epoch": 4.9350943523532065, |
| "grad_norm": 1.0991202592849731, |
| "learning_rate": 8.829431438127091e-07, |
| "loss": 0.2963, |
| "step": 51200 |
| }, |
| { |
| "epoch": 4.9350943523532065, |
| "eval_loss": 0.2621663510799408, |
| "eval_runtime": 40.2344, |
| "eval_samples_per_second": 248.544, |
| "eval_steps_per_second": 31.068, |
| "step": 51200 |
| }, |
| { |
| "epoch": 4.944733208510146, |
| "grad_norm": 1.2747081518173218, |
| "learning_rate": 8.729096989966555e-07, |
| "loss": 0.2981, |
| "step": 51300 |
| }, |
| { |
| "epoch": 4.944733208510146, |
| "eval_loss": 0.2616971433162689, |
| "eval_runtime": 40.0357, |
| "eval_samples_per_second": 249.777, |
| "eval_steps_per_second": 31.222, |
| "step": 51300 |
| }, |
| { |
| "epoch": 4.954372064667086, |
| "grad_norm": 1.046461820602417, |
| "learning_rate": 8.628762541806019e-07, |
| "loss": 0.2988, |
| "step": 51400 |
| }, |
| { |
| "epoch": 4.954372064667086, |
| "eval_loss": 0.26591652631759644, |
| "eval_runtime": 40.0483, |
| "eval_samples_per_second": 249.699, |
| "eval_steps_per_second": 31.212, |
| "step": 51400 |
| }, |
| { |
| "epoch": 4.964010920824026, |
| "grad_norm": 1.3319487571716309, |
| "learning_rate": 8.528428093645485e-07, |
| "loss": 0.2971, |
| "step": 51500 |
| }, |
| { |
| "epoch": 4.964010920824026, |
| "eval_loss": 0.2656097412109375, |
| "eval_runtime": 40.0468, |
| "eval_samples_per_second": 249.708, |
| "eval_steps_per_second": 31.214, |
| "step": 51500 |
| }, |
| { |
| "epoch": 4.973649776980966, |
| "grad_norm": 1.2510554790496826, |
| "learning_rate": 8.428093645484949e-07, |
| "loss": 0.3005, |
| "step": 51600 |
| }, |
| { |
| "epoch": 4.973649776980966, |
| "eval_loss": 0.2569749653339386, |
| "eval_runtime": 40.0994, |
| "eval_samples_per_second": 249.38, |
| "eval_steps_per_second": 31.173, |
| "step": 51600 |
| }, |
| { |
| "epoch": 4.983288633137906, |
| "grad_norm": 1.045254111289978, |
| "learning_rate": 8.327759197324414e-07, |
| "loss": 0.2978, |
| "step": 51700 |
| }, |
| { |
| "epoch": 4.983288633137906, |
| "eval_loss": 0.26062557101249695, |
| "eval_runtime": 40.2488, |
| "eval_samples_per_second": 248.454, |
| "eval_steps_per_second": 31.057, |
| "step": 51700 |
| }, |
| { |
| "epoch": 4.992927489294845, |
| "grad_norm": 1.1483185291290283, |
| "learning_rate": 8.227424749163879e-07, |
| "loss": 0.2976, |
| "step": 51800 |
| }, |
| { |
| "epoch": 4.992927489294845, |
| "eval_loss": 0.2605762183666229, |
| "eval_runtime": 40.0482, |
| "eval_samples_per_second": 249.699, |
| "eval_steps_per_second": 31.212, |
| "step": 51800 |
| }, |
| { |
| "epoch": 5.002566345451785, |
| "grad_norm": 1.193968653678894, |
| "learning_rate": 8.127090301003344e-07, |
| "loss": 0.2958, |
| "step": 51900 |
| }, |
| { |
| "epoch": 5.002566345451785, |
| "eval_loss": 0.26294004917144775, |
| "eval_runtime": 40.0625, |
| "eval_samples_per_second": 249.61, |
| "eval_steps_per_second": 31.201, |
| "step": 51900 |
| }, |
| { |
| "epoch": 5.012205201608725, |
| "grad_norm": 1.3349761962890625, |
| "learning_rate": 8.026755852842809e-07, |
| "loss": 0.2966, |
| "step": 52000 |
| }, |
| { |
| "epoch": 5.012205201608725, |
| "eval_loss": 0.26609280705451965, |
| "eval_runtime": 40.059, |
| "eval_samples_per_second": 249.632, |
| "eval_steps_per_second": 31.204, |
| "step": 52000 |
| }, |
| { |
| "epoch": 5.021844057765665, |
| "grad_norm": 1.1316077709197998, |
| "learning_rate": 7.926421404682274e-07, |
| "loss": 0.2988, |
| "step": 52100 |
| }, |
| { |
| "epoch": 5.021844057765665, |
| "eval_loss": 0.26353102922439575, |
| "eval_runtime": 40.0857, |
| "eval_samples_per_second": 249.465, |
| "eval_steps_per_second": 31.183, |
| "step": 52100 |
| }, |
| { |
| "epoch": 5.031482913922605, |
| "grad_norm": 1.1322309970855713, |
| "learning_rate": 7.826086956521739e-07, |
| "loss": 0.2979, |
| "step": 52200 |
| }, |
| { |
| "epoch": 5.031482913922605, |
| "eval_loss": 0.2596350908279419, |
| "eval_runtime": 40.2317, |
| "eval_samples_per_second": 248.56, |
| "eval_steps_per_second": 31.07, |
| "step": 52200 |
| }, |
| { |
| "epoch": 5.0411217700795445, |
| "grad_norm": 1.1747742891311646, |
| "learning_rate": 7.725752508361204e-07, |
| "loss": 0.2946, |
| "step": 52300 |
| }, |
| { |
| "epoch": 5.0411217700795445, |
| "eval_loss": 0.2601070702075958, |
| "eval_runtime": 40.0484, |
| "eval_samples_per_second": 249.698, |
| "eval_steps_per_second": 31.212, |
| "step": 52300 |
| }, |
| { |
| "epoch": 5.050760626236484, |
| "grad_norm": 1.1011712551116943, |
| "learning_rate": 7.625418060200669e-07, |
| "loss": 0.2953, |
| "step": 52400 |
| }, |
| { |
| "epoch": 5.050760626236484, |
| "eval_loss": 0.25822940468788147, |
| "eval_runtime": 40.0407, |
| "eval_samples_per_second": 249.746, |
| "eval_steps_per_second": 31.218, |
| "step": 52400 |
| }, |
| { |
| "epoch": 5.060399482393424, |
| "grad_norm": 1.0672869682312012, |
| "learning_rate": 7.525083612040134e-07, |
| "loss": 0.2986, |
| "step": 52500 |
| }, |
| { |
| "epoch": 5.060399482393424, |
| "eval_loss": 0.25989893078804016, |
| "eval_runtime": 40.0772, |
| "eval_samples_per_second": 249.518, |
| "eval_steps_per_second": 31.19, |
| "step": 52500 |
| }, |
| { |
| "epoch": 5.070038338550364, |
| "grad_norm": 1.1346361637115479, |
| "learning_rate": 7.424749163879599e-07, |
| "loss": 0.2952, |
| "step": 52600 |
| }, |
| { |
| "epoch": 5.070038338550364, |
| "eval_loss": 0.26356619596481323, |
| "eval_runtime": 40.0484, |
| "eval_samples_per_second": 249.698, |
| "eval_steps_per_second": 31.212, |
| "step": 52600 |
| }, |
| { |
| "epoch": 5.079677194707304, |
| "grad_norm": 1.152971625328064, |
| "learning_rate": 7.324414715719064e-07, |
| "loss": 0.2967, |
| "step": 52700 |
| }, |
| { |
| "epoch": 5.079677194707304, |
| "eval_loss": 0.26063260436058044, |
| "eval_runtime": 40.2268, |
| "eval_samples_per_second": 248.591, |
| "eval_steps_per_second": 31.074, |
| "step": 52700 |
| }, |
| { |
| "epoch": 5.089316050864244, |
| "grad_norm": 1.0548993349075317, |
| "learning_rate": 7.224080267558529e-07, |
| "loss": 0.2974, |
| "step": 52800 |
| }, |
| { |
| "epoch": 5.089316050864244, |
| "eval_loss": 0.2643623650074005, |
| "eval_runtime": 40.0545, |
| "eval_samples_per_second": 249.66, |
| "eval_steps_per_second": 31.208, |
| "step": 52800 |
| }, |
| { |
| "epoch": 5.0989549070211835, |
| "grad_norm": 1.1734200716018677, |
| "learning_rate": 7.123745819397994e-07, |
| "loss": 0.2983, |
| "step": 52900 |
| }, |
| { |
| "epoch": 5.0989549070211835, |
| "eval_loss": 0.2603817284107208, |
| "eval_runtime": 40.0535, |
| "eval_samples_per_second": 249.666, |
| "eval_steps_per_second": 31.208, |
| "step": 52900 |
| }, |
| { |
| "epoch": 5.108593763178123, |
| "grad_norm": 1.1038990020751953, |
| "learning_rate": 7.023411371237459e-07, |
| "loss": 0.2958, |
| "step": 53000 |
| }, |
| { |
| "epoch": 5.108593763178123, |
| "eval_loss": 0.25843310356140137, |
| "eval_runtime": 40.062, |
| "eval_samples_per_second": 249.613, |
| "eval_steps_per_second": 31.202, |
| "step": 53000 |
| }, |
| { |
| "epoch": 5.118232619335063, |
| "grad_norm": 1.1007676124572754, |
| "learning_rate": 6.923076923076923e-07, |
| "loss": 0.2946, |
| "step": 53100 |
| }, |
| { |
| "epoch": 5.118232619335063, |
| "eval_loss": 0.2619819939136505, |
| "eval_runtime": 40.3148, |
| "eval_samples_per_second": 248.048, |
| "eval_steps_per_second": 31.006, |
| "step": 53100 |
| }, |
| { |
| "epoch": 5.127871475492003, |
| "grad_norm": 1.149030089378357, |
| "learning_rate": 6.822742474916388e-07, |
| "loss": 0.2974, |
| "step": 53200 |
| }, |
| { |
| "epoch": 5.127871475492003, |
| "eval_loss": 0.25853490829467773, |
| "eval_runtime": 40.1284, |
| "eval_samples_per_second": 249.2, |
| "eval_steps_per_second": 31.15, |
| "step": 53200 |
| }, |
| { |
| "epoch": 5.137510331648944, |
| "grad_norm": 1.102474331855774, |
| "learning_rate": 6.722408026755853e-07, |
| "loss": 0.2974, |
| "step": 53300 |
| }, |
| { |
| "epoch": 5.137510331648944, |
| "eval_loss": 0.26003292202949524, |
| "eval_runtime": 40.0545, |
| "eval_samples_per_second": 249.66, |
| "eval_steps_per_second": 31.207, |
| "step": 53300 |
| }, |
| { |
| "epoch": 5.1471491878058835, |
| "grad_norm": 1.1578916311264038, |
| "learning_rate": 6.622073578595318e-07, |
| "loss": 0.2969, |
| "step": 53400 |
| }, |
| { |
| "epoch": 5.1471491878058835, |
| "eval_loss": 0.2637874186038971, |
| "eval_runtime": 40.0602, |
| "eval_samples_per_second": 249.625, |
| "eval_steps_per_second": 31.203, |
| "step": 53400 |
| }, |
| { |
| "epoch": 5.156788043962823, |
| "grad_norm": 1.0911157131195068, |
| "learning_rate": 6.521739130434783e-07, |
| "loss": 0.2973, |
| "step": 53500 |
| }, |
| { |
| "epoch": 5.156788043962823, |
| "eval_loss": 0.26259294152259827, |
| "eval_runtime": 40.0592, |
| "eval_samples_per_second": 249.631, |
| "eval_steps_per_second": 31.204, |
| "step": 53500 |
| }, |
| { |
| "epoch": 5.166426900119763, |
| "grad_norm": 1.0731229782104492, |
| "learning_rate": 6.421404682274248e-07, |
| "loss": 0.2985, |
| "step": 53600 |
| }, |
| { |
| "epoch": 5.166426900119763, |
| "eval_loss": 0.2570517659187317, |
| "eval_runtime": 40.0808, |
| "eval_samples_per_second": 249.496, |
| "eval_steps_per_second": 31.187, |
| "step": 53600 |
| }, |
| { |
| "epoch": 5.176065756276703, |
| "grad_norm": 1.133985996246338, |
| "learning_rate": 6.321070234113712e-07, |
| "loss": 0.298, |
| "step": 53700 |
| }, |
| { |
| "epoch": 5.176065756276703, |
| "eval_loss": 0.2622462511062622, |
| "eval_runtime": 40.2293, |
| "eval_samples_per_second": 248.575, |
| "eval_steps_per_second": 31.072, |
| "step": 53700 |
| }, |
| { |
| "epoch": 5.185704612433643, |
| "grad_norm": 1.1667418479919434, |
| "learning_rate": 6.220735785953178e-07, |
| "loss": 0.2951, |
| "step": 53800 |
| }, |
| { |
| "epoch": 5.185704612433643, |
| "eval_loss": 0.2598443031311035, |
| "eval_runtime": 40.0513, |
| "eval_samples_per_second": 249.68, |
| "eval_steps_per_second": 31.21, |
| "step": 53800 |
| }, |
| { |
| "epoch": 5.195343468590583, |
| "grad_norm": 1.1579954624176025, |
| "learning_rate": 6.120401337792642e-07, |
| "loss": 0.2949, |
| "step": 53900 |
| }, |
| { |
| "epoch": 5.195343468590583, |
| "eval_loss": 0.25939348340034485, |
| "eval_runtime": 40.0758, |
| "eval_samples_per_second": 249.527, |
| "eval_steps_per_second": 31.191, |
| "step": 53900 |
| }, |
| { |
| "epoch": 5.204982324747522, |
| "grad_norm": 1.0994336605072021, |
| "learning_rate": 6.020066889632107e-07, |
| "loss": 0.2955, |
| "step": 54000 |
| }, |
| { |
| "epoch": 5.204982324747522, |
| "eval_loss": 0.2639375925064087, |
| "eval_runtime": 40.0757, |
| "eval_samples_per_second": 249.528, |
| "eval_steps_per_second": 31.191, |
| "step": 54000 |
| }, |
| { |
| "epoch": 5.214621180904462, |
| "grad_norm": 1.2090504169464111, |
| "learning_rate": 5.919732441471572e-07, |
| "loss": 0.2934, |
| "step": 54100 |
| }, |
| { |
| "epoch": 5.214621180904462, |
| "eval_loss": 0.2580001652240753, |
| "eval_runtime": 40.2509, |
| "eval_samples_per_second": 248.442, |
| "eval_steps_per_second": 31.055, |
| "step": 54100 |
| }, |
| { |
| "epoch": 5.224260037061402, |
| "grad_norm": 1.0528264045715332, |
| "learning_rate": 5.819397993311037e-07, |
| "loss": 0.2975, |
| "step": 54200 |
| }, |
| { |
| "epoch": 5.224260037061402, |
| "eval_loss": 0.2598421275615692, |
| "eval_runtime": 40.0571, |
| "eval_samples_per_second": 249.644, |
| "eval_steps_per_second": 31.205, |
| "step": 54200 |
| }, |
| { |
| "epoch": 5.233898893218342, |
| "grad_norm": 1.1567331552505493, |
| "learning_rate": 5.719063545150502e-07, |
| "loss": 0.2978, |
| "step": 54300 |
| }, |
| { |
| "epoch": 5.233898893218342, |
| "eval_loss": 0.26616182923316956, |
| "eval_runtime": 40.2149, |
| "eval_samples_per_second": 248.664, |
| "eval_steps_per_second": 31.083, |
| "step": 54300 |
| }, |
| { |
| "epoch": 5.243537749375282, |
| "grad_norm": 1.0571229457855225, |
| "learning_rate": 5.618729096989966e-07, |
| "loss": 0.2942, |
| "step": 54400 |
| }, |
| { |
| "epoch": 5.243537749375282, |
| "eval_loss": 0.26477307081222534, |
| "eval_runtime": 40.0949, |
| "eval_samples_per_second": 249.408, |
| "eval_steps_per_second": 31.176, |
| "step": 54400 |
| }, |
| { |
| "epoch": 5.2531766055322215, |
| "grad_norm": 1.148074984550476, |
| "learning_rate": 5.518394648829431e-07, |
| "loss": 0.295, |
| "step": 54500 |
| }, |
| { |
| "epoch": 5.2531766055322215, |
| "eval_loss": 0.2539403736591339, |
| "eval_runtime": 40.2349, |
| "eval_samples_per_second": 248.54, |
| "eval_steps_per_second": 31.068, |
| "step": 54500 |
| }, |
| { |
| "epoch": 5.262815461689161, |
| "grad_norm": 1.0529351234436035, |
| "learning_rate": 5.418060200668896e-07, |
| "loss": 0.2948, |
| "step": 54600 |
| }, |
| { |
| "epoch": 5.262815461689161, |
| "eval_loss": 0.26054322719573975, |
| "eval_runtime": 40.1935, |
| "eval_samples_per_second": 248.796, |
| "eval_steps_per_second": 31.1, |
| "step": 54600 |
| }, |
| { |
| "epoch": 5.272454317846101, |
| "grad_norm": 1.1665048599243164, |
| "learning_rate": 5.317725752508361e-07, |
| "loss": 0.2939, |
| "step": 54700 |
| }, |
| { |
| "epoch": 5.272454317846101, |
| "eval_loss": 0.2621845304965973, |
| "eval_runtime": 40.2294, |
| "eval_samples_per_second": 248.575, |
| "eval_steps_per_second": 31.072, |
| "step": 54700 |
| }, |
| { |
| "epoch": 5.282093174003041, |
| "grad_norm": 1.1923383474349976, |
| "learning_rate": 5.217391304347826e-07, |
| "loss": 0.2943, |
| "step": 54800 |
| }, |
| { |
| "epoch": 5.282093174003041, |
| "eval_loss": 0.2575650215148926, |
| "eval_runtime": 40.1924, |
| "eval_samples_per_second": 248.803, |
| "eval_steps_per_second": 31.1, |
| "step": 54800 |
| }, |
| { |
| "epoch": 5.291732030159981, |
| "grad_norm": 1.0694471597671509, |
| "learning_rate": 5.117056856187291e-07, |
| "loss": 0.2941, |
| "step": 54900 |
| }, |
| { |
| "epoch": 5.291732030159981, |
| "eval_loss": 0.25641068816185, |
| "eval_runtime": 40.2068, |
| "eval_samples_per_second": 248.714, |
| "eval_steps_per_second": 31.089, |
| "step": 54900 |
| }, |
| { |
| "epoch": 5.301370886316921, |
| "grad_norm": 1.207488775253296, |
| "learning_rate": 5.016722408026756e-07, |
| "loss": 0.2945, |
| "step": 55000 |
| }, |
| { |
| "epoch": 5.301370886316921, |
| "eval_loss": 0.2628210186958313, |
| "eval_runtime": 40.2419, |
| "eval_samples_per_second": 248.497, |
| "eval_steps_per_second": 31.062, |
| "step": 55000 |
| }, |
| { |
| "epoch": 5.3110097424738605, |
| "grad_norm": 1.0832284688949585, |
| "learning_rate": 4.916387959866221e-07, |
| "loss": 0.2946, |
| "step": 55100 |
| }, |
| { |
| "epoch": 5.3110097424738605, |
| "eval_loss": 0.26105597615242004, |
| "eval_runtime": 40.1107, |
| "eval_samples_per_second": 249.31, |
| "eval_steps_per_second": 31.164, |
| "step": 55100 |
| }, |
| { |
| "epoch": 5.3206485986308, |
| "grad_norm": 1.1050950288772583, |
| "learning_rate": 4.816053511705686e-07, |
| "loss": 0.2957, |
| "step": 55200 |
| }, |
| { |
| "epoch": 5.3206485986308, |
| "eval_loss": 0.2594417333602905, |
| "eval_runtime": 40.2651, |
| "eval_samples_per_second": 248.354, |
| "eval_steps_per_second": 31.044, |
| "step": 55200 |
| }, |
| { |
| "epoch": 5.33028745478774, |
| "grad_norm": 1.088828206062317, |
| "learning_rate": 4.7157190635451506e-07, |
| "loss": 0.2924, |
| "step": 55300 |
| }, |
| { |
| "epoch": 5.33028745478774, |
| "eval_loss": 0.26388004422187805, |
| "eval_runtime": 40.2575, |
| "eval_samples_per_second": 248.401, |
| "eval_steps_per_second": 31.05, |
| "step": 55300 |
| }, |
| { |
| "epoch": 5.33992631094468, |
| "grad_norm": 1.1768171787261963, |
| "learning_rate": 4.6153846153846156e-07, |
| "loss": 0.2951, |
| "step": 55400 |
| }, |
| { |
| "epoch": 5.33992631094468, |
| "eval_loss": 0.2630363702774048, |
| "eval_runtime": 40.2627, |
| "eval_samples_per_second": 248.369, |
| "eval_steps_per_second": 31.046, |
| "step": 55400 |
| }, |
| { |
| "epoch": 5.34956516710162, |
| "grad_norm": 1.1391232013702393, |
| "learning_rate": 4.5150501672240806e-07, |
| "loss": 0.2952, |
| "step": 55500 |
| }, |
| { |
| "epoch": 5.34956516710162, |
| "eval_loss": 0.25834527611732483, |
| "eval_runtime": 40.1693, |
| "eval_samples_per_second": 248.946, |
| "eval_steps_per_second": 31.118, |
| "step": 55500 |
| }, |
| { |
| "epoch": 5.35920402325856, |
| "grad_norm": 1.1385185718536377, |
| "learning_rate": 4.4147157190635456e-07, |
| "loss": 0.2935, |
| "step": 55600 |
| }, |
| { |
| "epoch": 5.35920402325856, |
| "eval_loss": 0.25431880354881287, |
| "eval_runtime": 40.1701, |
| "eval_samples_per_second": 248.941, |
| "eval_steps_per_second": 31.118, |
| "step": 55600 |
| }, |
| { |
| "epoch": 5.368842879415499, |
| "grad_norm": 1.0914981365203857, |
| "learning_rate": 4.3143812709030095e-07, |
| "loss": 0.2934, |
| "step": 55700 |
| }, |
| { |
| "epoch": 5.368842879415499, |
| "eval_loss": 0.25752854347229004, |
| "eval_runtime": 40.1548, |
| "eval_samples_per_second": 249.037, |
| "eval_steps_per_second": 31.13, |
| "step": 55700 |
| }, |
| { |
| "epoch": 5.378481735572439, |
| "grad_norm": 1.0367567539215088, |
| "learning_rate": 4.2140468227424745e-07, |
| "loss": 0.2929, |
| "step": 55800 |
| }, |
| { |
| "epoch": 5.378481735572439, |
| "eval_loss": 0.2595641613006592, |
| "eval_runtime": 40.1628, |
| "eval_samples_per_second": 248.986, |
| "eval_steps_per_second": 31.123, |
| "step": 55800 |
| }, |
| { |
| "epoch": 5.388120591729379, |
| "grad_norm": 1.072582483291626, |
| "learning_rate": 4.1137123745819395e-07, |
| "loss": 0.2947, |
| "step": 55900 |
| }, |
| { |
| "epoch": 5.388120591729379, |
| "eval_loss": 0.26283690333366394, |
| "eval_runtime": 40.1998, |
| "eval_samples_per_second": 248.758, |
| "eval_steps_per_second": 31.095, |
| "step": 55900 |
| }, |
| { |
| "epoch": 5.397759447886319, |
| "grad_norm": 1.0787990093231201, |
| "learning_rate": 4.0133779264214045e-07, |
| "loss": 0.2941, |
| "step": 56000 |
| }, |
| { |
| "epoch": 5.397759447886319, |
| "eval_loss": 0.2607322335243225, |
| "eval_runtime": 40.2136, |
| "eval_samples_per_second": 248.672, |
| "eval_steps_per_second": 31.084, |
| "step": 56000 |
| }, |
| { |
| "epoch": 5.407398304043259, |
| "grad_norm": 1.1289974451065063, |
| "learning_rate": 3.9130434782608694e-07, |
| "loss": 0.2951, |
| "step": 56100 |
| }, |
| { |
| "epoch": 5.407398304043259, |
| "eval_loss": 0.2661938965320587, |
| "eval_runtime": 40.0629, |
| "eval_samples_per_second": 249.608, |
| "eval_steps_per_second": 31.201, |
| "step": 56100 |
| }, |
| { |
| "epoch": 5.417037160200199, |
| "grad_norm": 1.1110820770263672, |
| "learning_rate": 3.8127090301003344e-07, |
| "loss": 0.2947, |
| "step": 56200 |
| }, |
| { |
| "epoch": 5.417037160200199, |
| "eval_loss": 0.26097166538238525, |
| "eval_runtime": 40.0727, |
| "eval_samples_per_second": 249.546, |
| "eval_steps_per_second": 31.193, |
| "step": 56200 |
| }, |
| { |
| "epoch": 5.426676016357139, |
| "grad_norm": 1.101787805557251, |
| "learning_rate": 3.7123745819397994e-07, |
| "loss": 0.2934, |
| "step": 56300 |
| }, |
| { |
| "epoch": 5.426676016357139, |
| "eval_loss": 0.2626558542251587, |
| "eval_runtime": 40.0699, |
| "eval_samples_per_second": 249.564, |
| "eval_steps_per_second": 31.195, |
| "step": 56300 |
| }, |
| { |
| "epoch": 5.436314872514079, |
| "grad_norm": 1.143552541732788, |
| "learning_rate": 3.6120401337792644e-07, |
| "loss": 0.2943, |
| "step": 56400 |
| }, |
| { |
| "epoch": 5.436314872514079, |
| "eval_loss": 0.2595503032207489, |
| "eval_runtime": 40.0704, |
| "eval_samples_per_second": 249.561, |
| "eval_steps_per_second": 31.195, |
| "step": 56400 |
| }, |
| { |
| "epoch": 5.445953728671019, |
| "grad_norm": 1.152707576751709, |
| "learning_rate": 3.5117056856187294e-07, |
| "loss": 0.2931, |
| "step": 56500 |
| }, |
| { |
| "epoch": 5.445953728671019, |
| "eval_loss": 0.2601962685585022, |
| "eval_runtime": 40.1774, |
| "eval_samples_per_second": 248.896, |
| "eval_steps_per_second": 31.112, |
| "step": 56500 |
| }, |
| { |
| "epoch": 5.455592584827959, |
| "grad_norm": 1.1379551887512207, |
| "learning_rate": 3.411371237458194e-07, |
| "loss": 0.2934, |
| "step": 56600 |
| }, |
| { |
| "epoch": 5.455592584827959, |
| "eval_loss": 0.2626563608646393, |
| "eval_runtime": 40.008, |
| "eval_samples_per_second": 249.95, |
| "eval_steps_per_second": 31.244, |
| "step": 56600 |
| }, |
| { |
| "epoch": 5.4652314409848985, |
| "grad_norm": 1.171043038368225, |
| "learning_rate": 3.311036789297659e-07, |
| "loss": 0.2935, |
| "step": 56700 |
| }, |
| { |
| "epoch": 5.4652314409848985, |
| "eval_loss": 0.2636509835720062, |
| "eval_runtime": 40.0145, |
| "eval_samples_per_second": 249.91, |
| "eval_steps_per_second": 31.239, |
| "step": 56700 |
| }, |
| { |
| "epoch": 5.474870297141838, |
| "grad_norm": 1.0666399002075195, |
| "learning_rate": 3.210702341137124e-07, |
| "loss": 0.2932, |
| "step": 56800 |
| }, |
| { |
| "epoch": 5.474870297141838, |
| "eval_loss": 0.26089200377464294, |
| "eval_runtime": 40.0108, |
| "eval_samples_per_second": 249.932, |
| "eval_steps_per_second": 31.242, |
| "step": 56800 |
| }, |
| { |
| "epoch": 5.484509153298778, |
| "grad_norm": 1.118462324142456, |
| "learning_rate": 3.110367892976589e-07, |
| "loss": 0.2953, |
| "step": 56900 |
| }, |
| { |
| "epoch": 5.484509153298778, |
| "eval_loss": 0.26198524236679077, |
| "eval_runtime": 40.1717, |
| "eval_samples_per_second": 248.931, |
| "eval_steps_per_second": 31.116, |
| "step": 56900 |
| }, |
| { |
| "epoch": 5.494148009455718, |
| "grad_norm": 1.1037251949310303, |
| "learning_rate": 3.010033444816054e-07, |
| "loss": 0.2926, |
| "step": 57000 |
| }, |
| { |
| "epoch": 5.494148009455718, |
| "eval_loss": 0.2536342144012451, |
| "eval_runtime": 40.0426, |
| "eval_samples_per_second": 249.734, |
| "eval_steps_per_second": 31.217, |
| "step": 57000 |
| }, |
| { |
| "epoch": 5.503786865612658, |
| "grad_norm": 1.0776835680007935, |
| "learning_rate": 2.9096989966555187e-07, |
| "loss": 0.291, |
| "step": 57100 |
| }, |
| { |
| "epoch": 5.503786865612658, |
| "eval_loss": 0.2602877914905548, |
| "eval_runtime": 40.0401, |
| "eval_samples_per_second": 249.75, |
| "eval_steps_per_second": 31.219, |
| "step": 57100 |
| }, |
| { |
| "epoch": 5.513425721769598, |
| "grad_norm": 1.1141369342803955, |
| "learning_rate": 2.809364548494983e-07, |
| "loss": 0.2932, |
| "step": 57200 |
| }, |
| { |
| "epoch": 5.513425721769598, |
| "eval_loss": 0.258954793214798, |
| "eval_runtime": 40.2256, |
| "eval_samples_per_second": 248.598, |
| "eval_steps_per_second": 31.075, |
| "step": 57200 |
| }, |
| { |
| "epoch": 5.5230645779265375, |
| "grad_norm": 1.111812710762024, |
| "learning_rate": 2.709030100334448e-07, |
| "loss": 0.2919, |
| "step": 57300 |
| }, |
| { |
| "epoch": 5.5230645779265375, |
| "eval_loss": 0.2630453407764435, |
| "eval_runtime": 40.0543, |
| "eval_samples_per_second": 249.661, |
| "eval_steps_per_second": 31.208, |
| "step": 57300 |
| }, |
| { |
| "epoch": 5.532703434083477, |
| "grad_norm": 1.165526032447815, |
| "learning_rate": 2.608695652173913e-07, |
| "loss": 0.2926, |
| "step": 57400 |
| }, |
| { |
| "epoch": 5.532703434083477, |
| "eval_loss": 0.25891971588134766, |
| "eval_runtime": 40.055, |
| "eval_samples_per_second": 249.657, |
| "eval_steps_per_second": 31.207, |
| "step": 57400 |
| }, |
| { |
| "epoch": 5.542342290240417, |
| "grad_norm": 1.124131679534912, |
| "learning_rate": 2.508361204013378e-07, |
| "loss": 0.2927, |
| "step": 57500 |
| }, |
| { |
| "epoch": 5.542342290240417, |
| "eval_loss": 0.2566971480846405, |
| "eval_runtime": 40.0829, |
| "eval_samples_per_second": 249.483, |
| "eval_steps_per_second": 31.185, |
| "step": 57500 |
| }, |
| { |
| "epoch": 5.551981146397357, |
| "grad_norm": 1.0840249061584473, |
| "learning_rate": 2.408026755852843e-07, |
| "loss": 0.2944, |
| "step": 57600 |
| }, |
| { |
| "epoch": 5.551981146397357, |
| "eval_loss": 0.25784093141555786, |
| "eval_runtime": 40.2031, |
| "eval_samples_per_second": 248.737, |
| "eval_steps_per_second": 31.092, |
| "step": 57600 |
| }, |
| { |
| "epoch": 5.561620002554297, |
| "grad_norm": 1.1037781238555908, |
| "learning_rate": 2.3076923076923078e-07, |
| "loss": 0.2934, |
| "step": 57700 |
| }, |
| { |
| "epoch": 5.561620002554297, |
| "eval_loss": 0.251878559589386, |
| "eval_runtime": 40.0523, |
| "eval_samples_per_second": 249.674, |
| "eval_steps_per_second": 31.209, |
| "step": 57700 |
| }, |
| { |
| "epoch": 5.571258858711237, |
| "grad_norm": 1.1695195436477661, |
| "learning_rate": 2.2073578595317728e-07, |
| "loss": 0.294, |
| "step": 57800 |
| }, |
| { |
| "epoch": 5.571258858711237, |
| "eval_loss": 0.2621340751647949, |
| "eval_runtime": 40.0487, |
| "eval_samples_per_second": 249.696, |
| "eval_steps_per_second": 31.212, |
| "step": 57800 |
| }, |
| { |
| "epoch": 5.580897714868176, |
| "grad_norm": 1.1314599514007568, |
| "learning_rate": 2.1070234113712372e-07, |
| "loss": 0.2925, |
| "step": 57900 |
| }, |
| { |
| "epoch": 5.580897714868176, |
| "eval_loss": 0.2625725567340851, |
| "eval_runtime": 40.0655, |
| "eval_samples_per_second": 249.592, |
| "eval_steps_per_second": 31.199, |
| "step": 57900 |
| }, |
| { |
| "epoch": 5.590536571025116, |
| "grad_norm": 1.0930743217468262, |
| "learning_rate": 2.0066889632107022e-07, |
| "loss": 0.2933, |
| "step": 58000 |
| }, |
| { |
| "epoch": 5.590536571025116, |
| "eval_loss": 0.25974881649017334, |
| "eval_runtime": 40.0392, |
| "eval_samples_per_second": 249.755, |
| "eval_steps_per_second": 31.219, |
| "step": 58000 |
| }, |
| { |
| "epoch": 5.600175427182056, |
| "grad_norm": 1.1561408042907715, |
| "learning_rate": 1.9063545150501672e-07, |
| "loss": 0.2942, |
| "step": 58100 |
| }, |
| { |
| "epoch": 5.600175427182056, |
| "eval_loss": 0.2584717869758606, |
| "eval_runtime": 40.2075, |
| "eval_samples_per_second": 248.71, |
| "eval_steps_per_second": 31.089, |
| "step": 58100 |
| }, |
| { |
| "epoch": 5.609814283338996, |
| "grad_norm": 1.1130927801132202, |
| "learning_rate": 1.8060200668896322e-07, |
| "loss": 0.2938, |
| "step": 58200 |
| }, |
| { |
| "epoch": 5.609814283338996, |
| "eval_loss": 0.2553929090499878, |
| "eval_runtime": 40.0363, |
| "eval_samples_per_second": 249.774, |
| "eval_steps_per_second": 31.222, |
| "step": 58200 |
| }, |
| { |
| "epoch": 5.619453139495936, |
| "grad_norm": 1.2237213850021362, |
| "learning_rate": 1.705685618729097e-07, |
| "loss": 0.2942, |
| "step": 58300 |
| }, |
| { |
| "epoch": 5.619453139495936, |
| "eval_loss": 0.24946899712085724, |
| "eval_runtime": 40.0454, |
| "eval_samples_per_second": 249.716, |
| "eval_steps_per_second": 31.215, |
| "step": 58300 |
| }, |
| { |
| "epoch": 5.6290919956528755, |
| "grad_norm": 1.1548402309417725, |
| "learning_rate": 1.605351170568562e-07, |
| "loss": 0.2948, |
| "step": 58400 |
| }, |
| { |
| "epoch": 5.6290919956528755, |
| "eval_loss": 0.25441214442253113, |
| "eval_runtime": 40.0484, |
| "eval_samples_per_second": 249.698, |
| "eval_steps_per_second": 31.212, |
| "step": 58400 |
| }, |
| { |
| "epoch": 5.638730851809815, |
| "grad_norm": 1.1330386400222778, |
| "learning_rate": 1.505016722408027e-07, |
| "loss": 0.2913, |
| "step": 58500 |
| }, |
| { |
| "epoch": 5.638730851809815, |
| "eval_loss": 0.2585105895996094, |
| "eval_runtime": 40.2225, |
| "eval_samples_per_second": 248.617, |
| "eval_steps_per_second": 31.077, |
| "step": 58500 |
| }, |
| { |
| "epoch": 5.648369707966756, |
| "grad_norm": 1.1387040615081787, |
| "learning_rate": 1.4046822742474916e-07, |
| "loss": 0.2938, |
| "step": 58600 |
| }, |
| { |
| "epoch": 5.648369707966756, |
| "eval_loss": 0.2615092098712921, |
| "eval_runtime": 40.0294, |
| "eval_samples_per_second": 249.816, |
| "eval_steps_per_second": 31.227, |
| "step": 58600 |
| }, |
| { |
| "epoch": 5.658008564123696, |
| "grad_norm": 1.1653701066970825, |
| "learning_rate": 1.3043478260869566e-07, |
| "loss": 0.2947, |
| "step": 58700 |
| }, |
| { |
| "epoch": 5.658008564123696, |
| "eval_loss": 0.2623739242553711, |
| "eval_runtime": 40.0354, |
| "eval_samples_per_second": 249.779, |
| "eval_steps_per_second": 31.222, |
| "step": 58700 |
| }, |
| { |
| "epoch": 5.667647420280636, |
| "grad_norm": 1.1270488500595093, |
| "learning_rate": 1.2040133779264215e-07, |
| "loss": 0.2928, |
| "step": 58800 |
| }, |
| { |
| "epoch": 5.667647420280636, |
| "eval_loss": 0.26067566871643066, |
| "eval_runtime": 40.0307, |
| "eval_samples_per_second": 249.809, |
| "eval_steps_per_second": 31.226, |
| "step": 58800 |
| }, |
| { |
| "epoch": 5.6772862764375756, |
| "grad_norm": 1.117533802986145, |
| "learning_rate": 1.1036789297658864e-07, |
| "loss": 0.2908, |
| "step": 58900 |
| }, |
| { |
| "epoch": 5.6772862764375756, |
| "eval_loss": 0.2551814019680023, |
| "eval_runtime": 40.0838, |
| "eval_samples_per_second": 249.478, |
| "eval_steps_per_second": 31.185, |
| "step": 58900 |
| }, |
| { |
| "epoch": 5.686925132594515, |
| "grad_norm": 1.0880376100540161, |
| "learning_rate": 1.0033444816053511e-07, |
| "loss": 0.2912, |
| "step": 59000 |
| }, |
| { |
| "epoch": 5.686925132594515, |
| "eval_loss": 0.25541287660598755, |
| "eval_runtime": 40.2323, |
| "eval_samples_per_second": 248.556, |
| "eval_steps_per_second": 31.07, |
| "step": 59000 |
| }, |
| { |
| "epoch": 5.696563988751455, |
| "grad_norm": 1.0031907558441162, |
| "learning_rate": 9.030100334448161e-08, |
| "loss": 0.2935, |
| "step": 59100 |
| }, |
| { |
| "epoch": 5.696563988751455, |
| "eval_loss": 0.2621237337589264, |
| "eval_runtime": 40.0531, |
| "eval_samples_per_second": 249.669, |
| "eval_steps_per_second": 31.209, |
| "step": 59100 |
| }, |
| { |
| "epoch": 5.706202844908395, |
| "grad_norm": 1.104634165763855, |
| "learning_rate": 8.02675585284281e-08, |
| "loss": 0.2951, |
| "step": 59200 |
| }, |
| { |
| "epoch": 5.706202844908395, |
| "eval_loss": 0.2626442015171051, |
| "eval_runtime": 40.0642, |
| "eval_samples_per_second": 249.599, |
| "eval_steps_per_second": 31.2, |
| "step": 59200 |
| }, |
| { |
| "epoch": 5.715841701065335, |
| "grad_norm": 1.1673884391784668, |
| "learning_rate": 7.023411371237458e-08, |
| "loss": 0.2927, |
| "step": 59300 |
| }, |
| { |
| "epoch": 5.715841701065335, |
| "eval_loss": 0.25724858045578003, |
| "eval_runtime": 40.2787, |
| "eval_samples_per_second": 248.27, |
| "eval_steps_per_second": 31.034, |
| "step": 59300 |
| }, |
| { |
| "epoch": 5.725480557222275, |
| "grad_norm": 1.0944865942001343, |
| "learning_rate": 6.020066889632108e-08, |
| "loss": 0.2926, |
| "step": 59400 |
| }, |
| { |
| "epoch": 5.725480557222275, |
| "eval_loss": 0.2603595554828644, |
| "eval_runtime": 40.0311, |
| "eval_samples_per_second": 249.806, |
| "eval_steps_per_second": 31.226, |
| "step": 59400 |
| }, |
| { |
| "epoch": 5.7351194133792145, |
| "grad_norm": 1.142904281616211, |
| "learning_rate": 5.0167224080267556e-08, |
| "loss": 0.2935, |
| "step": 59500 |
| }, |
| { |
| "epoch": 5.7351194133792145, |
| "eval_loss": 0.25873175263404846, |
| "eval_runtime": 40.0183, |
| "eval_samples_per_second": 249.886, |
| "eval_steps_per_second": 31.236, |
| "step": 59500 |
| }, |
| { |
| "epoch": 5.744758269536154, |
| "grad_norm": 1.1131585836410522, |
| "learning_rate": 4.013377926421405e-08, |
| "loss": 0.2913, |
| "step": 59600 |
| }, |
| { |
| "epoch": 5.744758269536154, |
| "eval_loss": 0.25963783264160156, |
| "eval_runtime": 40.0082, |
| "eval_samples_per_second": 249.949, |
| "eval_steps_per_second": 31.244, |
| "step": 59600 |
| }, |
| { |
| "epoch": 5.754397125693094, |
| "grad_norm": 1.1289507150650024, |
| "learning_rate": 3.010033444816054e-08, |
| "loss": 0.2934, |
| "step": 59700 |
| }, |
| { |
| "epoch": 5.754397125693094, |
| "eval_loss": 0.2582632005214691, |
| "eval_runtime": 40.0014, |
| "eval_samples_per_second": 249.991, |
| "eval_steps_per_second": 31.249, |
| "step": 59700 |
| }, |
| { |
| "epoch": 5.764035981850034, |
| "grad_norm": 1.0394152402877808, |
| "learning_rate": 2.0066889632107024e-08, |
| "loss": 0.2915, |
| "step": 59800 |
| }, |
| { |
| "epoch": 5.764035981850034, |
| "eval_loss": 0.2560790181159973, |
| "eval_runtime": 40.2031, |
| "eval_samples_per_second": 248.737, |
| "eval_steps_per_second": 31.092, |
| "step": 59800 |
| }, |
| { |
| "epoch": 5.773674838006974, |
| "grad_norm": 1.1087628602981567, |
| "learning_rate": 1.0033444816053512e-08, |
| "loss": 0.2915, |
| "step": 59900 |
| }, |
| { |
| "epoch": 5.773674838006974, |
| "eval_loss": 0.2614119350910187, |
| "eval_runtime": 40.0216, |
| "eval_samples_per_second": 249.865, |
| "eval_steps_per_second": 31.233, |
| "step": 59900 |
| }, |
| { |
| "epoch": 5.783313694163914, |
| "grad_norm": 1.0176420211791992, |
| "learning_rate": 0.0, |
| "loss": 0.2935, |
| "step": 60000 |
| }, |
| { |
| "epoch": 5.783313694163914, |
| "eval_loss": 0.2631940245628357, |
| "eval_runtime": 40.0188, |
| "eval_samples_per_second": 249.882, |
| "eval_steps_per_second": 31.235, |
| "step": 60000 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 60000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 6, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.7893111401591828e+19, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|