| { | |
| "best_metric": 5.546974182128906, | |
| "best_model_checkpoint": "/users/hr1171/scratch/T5LA/checkpoint-100000", | |
| "epoch": 2.1069, | |
| "eval_steps": 1000, | |
| "global_step": 100000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.005, | |
| "grad_norm": 0.06483318656682968, | |
| "learning_rate": 4.975e-05, | |
| "loss": 10.0248, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.07472355663776398, | |
| "learning_rate": 4.9500000000000004e-05, | |
| "loss": 9.4056, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "eval_accuracy": 0.043492843878108624, | |
| "eval_loss": 9.121541023254395, | |
| "eval_runtime": 110.4799, | |
| "eval_samples_per_second": 32.513, | |
| "eval_steps_per_second": 2.037, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.015, | |
| "grad_norm": 0.08399348706007004, | |
| "learning_rate": 4.9250000000000004e-05, | |
| "loss": 8.8698, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.08570190519094467, | |
| "learning_rate": 4.9e-05, | |
| "loss": 8.4062, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "eval_accuracy": 0.04429728167514647, | |
| "eval_loss": 8.193856239318848, | |
| "eval_runtime": 112.0891, | |
| "eval_samples_per_second": 32.046, | |
| "eval_steps_per_second": 2.007, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.025, | |
| "grad_norm": 0.08781777322292328, | |
| "learning_rate": 4.875e-05, | |
| "loss": 8.0175, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.07474139332771301, | |
| "learning_rate": 4.85e-05, | |
| "loss": 7.7307, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "eval_accuracy": 0.04438749518317016, | |
| "eval_loss": 7.602397918701172, | |
| "eval_runtime": 112.6714, | |
| "eval_samples_per_second": 31.88, | |
| "eval_steps_per_second": 1.997, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.035, | |
| "grad_norm": 0.07133983075618744, | |
| "learning_rate": 4.825e-05, | |
| "loss": 7.5203, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.04410657659173012, | |
| "learning_rate": 4.8e-05, | |
| "loss": 7.39, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "eval_accuracy": 0.0443981085370553, | |
| "eval_loss": 7.333784103393555, | |
| "eval_runtime": 111.8336, | |
| "eval_samples_per_second": 32.119, | |
| "eval_steps_per_second": 2.012, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.045, | |
| "grad_norm": 0.03503479063510895, | |
| "learning_rate": 4.775e-05, | |
| "loss": 7.3043, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.039906181395053864, | |
| "learning_rate": 4.75e-05, | |
| "loss": 7.2546, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "eval_accuracy": 0.04405276632986957, | |
| "eval_loss": 7.245168685913086, | |
| "eval_runtime": 113.4504, | |
| "eval_samples_per_second": 31.661, | |
| "eval_steps_per_second": 1.983, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.055, | |
| "grad_norm": 0.04767516627907753, | |
| "learning_rate": 4.7249999999999997e-05, | |
| "loss": 7.2409, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.07150296866893768, | |
| "learning_rate": 4.7e-05, | |
| "loss": 7.1985, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "eval_accuracy": 0.03687500952480477, | |
| "eval_loss": 7.168196201324463, | |
| "eval_runtime": 110.8688, | |
| "eval_samples_per_second": 32.399, | |
| "eval_steps_per_second": 2.029, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.065, | |
| "grad_norm": 0.10620597004890442, | |
| "learning_rate": 4.6750000000000005e-05, | |
| "loss": 7.1475, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.09856470674276352, | |
| "learning_rate": 4.6500000000000005e-05, | |
| "loss": 7.1009, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "eval_accuracy": 0.03457599379091584, | |
| "eval_loss": 7.071776866912842, | |
| "eval_runtime": 115.3179, | |
| "eval_samples_per_second": 31.149, | |
| "eval_steps_per_second": 1.951, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.075, | |
| "grad_norm": 0.12105035036802292, | |
| "learning_rate": 4.6250000000000006e-05, | |
| "loss": 7.0493, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.1322629749774933, | |
| "learning_rate": 4.600000000000001e-05, | |
| "loss": 7.004, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "eval_accuracy": 0.03315856677269135, | |
| "eval_loss": 6.977820873260498, | |
| "eval_runtime": 110.4899, | |
| "eval_samples_per_second": 32.51, | |
| "eval_steps_per_second": 2.036, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.085, | |
| "grad_norm": 0.16380475461483002, | |
| "learning_rate": 4.575e-05, | |
| "loss": 6.9535, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.1642669439315796, | |
| "learning_rate": 4.55e-05, | |
| "loss": 6.9159, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "eval_accuracy": 0.032452642670689945, | |
| "eval_loss": 6.896429061889648, | |
| "eval_runtime": 112.221, | |
| "eval_samples_per_second": 32.008, | |
| "eval_steps_per_second": 2.005, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.095, | |
| "grad_norm": 0.2253541350364685, | |
| "learning_rate": 4.525e-05, | |
| "loss": 6.8866, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.20757266879081726, | |
| "learning_rate": 4.5e-05, | |
| "loss": 6.8548, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "eval_accuracy": 0.03252571153012995, | |
| "eval_loss": 6.830728054046631, | |
| "eval_runtime": 112.9087, | |
| "eval_samples_per_second": 31.813, | |
| "eval_steps_per_second": 1.993, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.105, | |
| "grad_norm": 0.21346400678157806, | |
| "learning_rate": 4.4750000000000004e-05, | |
| "loss": 6.8302, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.24308614432811737, | |
| "learning_rate": 4.4500000000000004e-05, | |
| "loss": 6.7833, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "eval_accuracy": 0.032619326754142475, | |
| "eval_loss": 6.77016544342041, | |
| "eval_runtime": 112.3436, | |
| "eval_samples_per_second": 31.973, | |
| "eval_steps_per_second": 2.003, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.115, | |
| "grad_norm": 0.27737605571746826, | |
| "learning_rate": 4.4250000000000005e-05, | |
| "loss": 6.7775, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.3234957158565521, | |
| "learning_rate": 4.4000000000000006e-05, | |
| "loss": 6.7376, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "eval_accuracy": 0.03369168370246034, | |
| "eval_loss": 6.716267108917236, | |
| "eval_runtime": 111.859, | |
| "eval_samples_per_second": 32.112, | |
| "eval_steps_per_second": 2.011, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.125, | |
| "grad_norm": 0.27591678500175476, | |
| "learning_rate": 4.375e-05, | |
| "loss": 6.7188, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.31338784098625183, | |
| "learning_rate": 4.35005e-05, | |
| "loss": 6.6821, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "eval_accuracy": 0.03455980162280902, | |
| "eval_loss": 6.661470413208008, | |
| "eval_runtime": 120.7993, | |
| "eval_samples_per_second": 29.735, | |
| "eval_steps_per_second": 1.863, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.135, | |
| "grad_norm": 0.513124406337738, | |
| "learning_rate": 4.32505e-05, | |
| "loss": 6.6565, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.40841469168663025, | |
| "learning_rate": 4.30005e-05, | |
| "loss": 6.6373, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "eval_accuracy": 0.03493249362654492, | |
| "eval_loss": 6.608607292175293, | |
| "eval_runtime": 111.8763, | |
| "eval_samples_per_second": 32.107, | |
| "eval_steps_per_second": 2.011, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.145, | |
| "grad_norm": 0.3946859538555145, | |
| "learning_rate": 4.2750500000000003e-05, | |
| "loss": 6.6188, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.48516473174095154, | |
| "learning_rate": 4.2501000000000005e-05, | |
| "loss": 6.5895, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "eval_accuracy": 0.0343823681168318, | |
| "eval_loss": 6.556947231292725, | |
| "eval_runtime": 112.7205, | |
| "eval_samples_per_second": 31.866, | |
| "eval_steps_per_second": 1.996, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.155, | |
| "grad_norm": 0.543297529220581, | |
| "learning_rate": 4.22515e-05, | |
| "loss": 6.5555, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.46242478489875793, | |
| "learning_rate": 4.200150000000001e-05, | |
| "loss": 6.5421, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_accuracy": 0.035396487687420944, | |
| "eval_loss": 6.511915683746338, | |
| "eval_runtime": 111.3995, | |
| "eval_samples_per_second": 32.244, | |
| "eval_steps_per_second": 2.02, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.165, | |
| "grad_norm": 0.5675227642059326, | |
| "learning_rate": 4.17515e-05, | |
| "loss": 6.5233, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.5619395971298218, | |
| "learning_rate": 4.15015e-05, | |
| "loss": 6.5051, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "eval_accuracy": 0.035478945282990115, | |
| "eval_loss": 6.467820644378662, | |
| "eval_runtime": 111.7569, | |
| "eval_samples_per_second": 32.141, | |
| "eval_steps_per_second": 2.013, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.175, | |
| "grad_norm": 0.4741845726966858, | |
| "learning_rate": 4.1252000000000004e-05, | |
| "loss": 6.491, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.5824158191680908, | |
| "learning_rate": 4.1002000000000005e-05, | |
| "loss": 6.4391, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "eval_accuracy": 0.03599301260322167, | |
| "eval_loss": 6.432408332824707, | |
| "eval_runtime": 113.2507, | |
| "eval_samples_per_second": 31.717, | |
| "eval_steps_per_second": 1.987, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.185, | |
| "grad_norm": 0.740391194820404, | |
| "learning_rate": 4.0752e-05, | |
| "loss": 6.4602, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.55320805311203, | |
| "learning_rate": 4.0502e-05, | |
| "loss": 6.4242, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "eval_accuracy": 0.03552126262989112, | |
| "eval_loss": 6.40146541595459, | |
| "eval_runtime": 111.8361, | |
| "eval_samples_per_second": 32.118, | |
| "eval_steps_per_second": 2.012, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.195, | |
| "grad_norm": 0.7065308094024658, | |
| "learning_rate": 4.0252e-05, | |
| "loss": 6.4047, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.6466010212898254, | |
| "learning_rate": 4.0003e-05, | |
| "loss": 6.3889, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "eval_accuracy": 0.03732308355485308, | |
| "eval_loss": 6.355311393737793, | |
| "eval_runtime": 112.8526, | |
| "eval_samples_per_second": 31.829, | |
| "eval_steps_per_second": 1.994, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.205, | |
| "grad_norm": 0.5905252695083618, | |
| "learning_rate": 3.9753000000000004e-05, | |
| "loss": 6.3693, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.5333964824676514, | |
| "learning_rate": 3.9503000000000004e-05, | |
| "loss": 6.3631, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "eval_accuracy": 0.036739757297089004, | |
| "eval_loss": 6.328461647033691, | |
| "eval_runtime": 112.8636, | |
| "eval_samples_per_second": 31.826, | |
| "eval_steps_per_second": 1.994, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.215, | |
| "grad_norm": 0.5893262028694153, | |
| "learning_rate": 3.9253e-05, | |
| "loss": 6.3469, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.596720278263092, | |
| "learning_rate": 3.9003e-05, | |
| "loss": 6.3296, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "eval_accuracy": 0.03686983891650175, | |
| "eval_loss": 6.3014631271362305, | |
| "eval_runtime": 111.598, | |
| "eval_samples_per_second": 32.187, | |
| "eval_steps_per_second": 2.016, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.225, | |
| "grad_norm": 0.49506717920303345, | |
| "learning_rate": 3.8753e-05, | |
| "loss": 6.3161, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 0.6392377614974976, | |
| "learning_rate": 3.8503e-05, | |
| "loss": 6.3081, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "eval_accuracy": 0.03635019278204852, | |
| "eval_loss": 6.269931316375732, | |
| "eval_runtime": 112.6212, | |
| "eval_samples_per_second": 31.895, | |
| "eval_steps_per_second": 1.998, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.235, | |
| "grad_norm": 0.5753453969955444, | |
| "learning_rate": 3.8253e-05, | |
| "loss": 6.2806, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.5371689796447754, | |
| "learning_rate": 3.80035e-05, | |
| "loss": 6.2784, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "eval_accuracy": 0.036991620348901764, | |
| "eval_loss": 6.2454352378845215, | |
| "eval_runtime": 115.1822, | |
| "eval_samples_per_second": 31.185, | |
| "eval_steps_per_second": 1.953, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.245, | |
| "grad_norm": 0.5090768337249756, | |
| "learning_rate": 3.7753500000000004e-05, | |
| "loss": 6.2632, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.5666926503181458, | |
| "learning_rate": 3.7503500000000004e-05, | |
| "loss": 6.2589, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "eval_accuracy": 0.03744921918371879, | |
| "eval_loss": 6.216660499572754, | |
| "eval_runtime": 112.1843, | |
| "eval_samples_per_second": 32.019, | |
| "eval_steps_per_second": 2.006, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.255, | |
| "grad_norm": 0.5514094233512878, | |
| "learning_rate": 3.7253500000000005e-05, | |
| "loss": 6.2441, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.5683887600898743, | |
| "learning_rate": 3.7004e-05, | |
| "loss": 6.2371, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "eval_accuracy": 0.03695529002214109, | |
| "eval_loss": 6.189018726348877, | |
| "eval_runtime": 110.9014, | |
| "eval_samples_per_second": 32.389, | |
| "eval_steps_per_second": 2.029, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.265, | |
| "grad_norm": 0.6400378346443176, | |
| "learning_rate": 3.6754e-05, | |
| "loss": 6.2257, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.5867771506309509, | |
| "learning_rate": 3.6504e-05, | |
| "loss": 6.1978, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "eval_accuracy": 0.0376371299749416, | |
| "eval_loss": 6.165986061096191, | |
| "eval_runtime": 112.4076, | |
| "eval_samples_per_second": 31.955, | |
| "eval_steps_per_second": 2.002, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.275, | |
| "grad_norm": 0.7539021372795105, | |
| "learning_rate": 3.62545e-05, | |
| "loss": 6.2078, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.5702797174453735, | |
| "learning_rate": 3.6004500000000004e-05, | |
| "loss": 6.1895, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "eval_accuracy": 0.03748677412823544, | |
| "eval_loss": 6.1377997398376465, | |
| "eval_runtime": 111.0948, | |
| "eval_samples_per_second": 32.333, | |
| "eval_steps_per_second": 2.025, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.285, | |
| "grad_norm": 0.5679642558097839, | |
| "learning_rate": 3.5754500000000005e-05, | |
| "loss": 6.1921, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.6666756868362427, | |
| "learning_rate": 3.55045e-05, | |
| "loss": 6.1636, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "eval_accuracy": 0.03662328254163156, | |
| "eval_loss": 6.121272087097168, | |
| "eval_runtime": 111.2726, | |
| "eval_samples_per_second": 32.281, | |
| "eval_steps_per_second": 2.022, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.295, | |
| "grad_norm": 0.5925132632255554, | |
| "learning_rate": 3.52545e-05, | |
| "loss": 6.1639, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.5706653594970703, | |
| "learning_rate": 3.50045e-05, | |
| "loss": 6.1262, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "eval_accuracy": 0.03699611061400702, | |
| "eval_loss": 6.096695899963379, | |
| "eval_runtime": 112.2709, | |
| "eval_samples_per_second": 31.994, | |
| "eval_steps_per_second": 2.004, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.305, | |
| "grad_norm": 0.6464671492576599, | |
| "learning_rate": 3.47545e-05, | |
| "loss": 6.1365, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.5408177375793457, | |
| "learning_rate": 3.4505e-05, | |
| "loss": 6.1345, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "eval_accuracy": 0.03612323029127397, | |
| "eval_loss": 6.074455738067627, | |
| "eval_runtime": 113.9504, | |
| "eval_samples_per_second": 31.522, | |
| "eval_steps_per_second": 1.975, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.315, | |
| "grad_norm": 0.5881854891777039, | |
| "learning_rate": 3.4255e-05, | |
| "loss": 6.1113, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.6097517609596252, | |
| "learning_rate": 3.4005000000000004e-05, | |
| "loss": 6.1096, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_accuracy": 0.03600621126125832, | |
| "eval_loss": 6.055612087249756, | |
| "eval_runtime": 111.7549, | |
| "eval_samples_per_second": 32.142, | |
| "eval_steps_per_second": 2.013, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.325, | |
| "grad_norm": 0.6643844246864319, | |
| "learning_rate": 3.3755000000000005e-05, | |
| "loss": 6.1049, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.6951597929000854, | |
| "learning_rate": 3.3505000000000005e-05, | |
| "loss": 6.0794, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "eval_accuracy": 0.03573761176678053, | |
| "eval_loss": 6.041288375854492, | |
| "eval_runtime": 112.9798, | |
| "eval_samples_per_second": 31.793, | |
| "eval_steps_per_second": 1.992, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.335, | |
| "grad_norm": 0.8083083033561707, | |
| "learning_rate": 3.3255000000000006e-05, | |
| "loss": 6.1019, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.6118751764297485, | |
| "learning_rate": 3.3005e-05, | |
| "loss": 6.0643, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "eval_accuracy": 0.03625793824443153, | |
| "eval_loss": 6.013612270355225, | |
| "eval_runtime": 111.1363, | |
| "eval_samples_per_second": 32.321, | |
| "eval_steps_per_second": 2.025, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.345, | |
| "grad_norm": 0.6557193398475647, | |
| "learning_rate": 3.27555e-05, | |
| "loss": 6.0551, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.5440818667411804, | |
| "learning_rate": 3.25055e-05, | |
| "loss": 6.057, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "eval_accuracy": 0.03619099247377141, | |
| "eval_loss": 5.996499538421631, | |
| "eval_runtime": 114.4917, | |
| "eval_samples_per_second": 31.373, | |
| "eval_steps_per_second": 1.965, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.355, | |
| "grad_norm": 0.5865129828453064, | |
| "learning_rate": 3.2255499999999996e-05, | |
| "loss": 6.0495, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.598119854927063, | |
| "learning_rate": 3.20055e-05, | |
| "loss": 6.0337, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "eval_accuracy": 0.03535702778195055, | |
| "eval_loss": 5.980613708496094, | |
| "eval_runtime": 112.638, | |
| "eval_samples_per_second": 31.89, | |
| "eval_steps_per_second": 1.998, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.365, | |
| "grad_norm": 0.5838174819946289, | |
| "learning_rate": 3.1756000000000006e-05, | |
| "loss": 6.0167, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.7071039080619812, | |
| "learning_rate": 3.1506e-05, | |
| "loss": 6.0217, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "eval_accuracy": 0.03629073078656382, | |
| "eval_loss": 5.958355903625488, | |
| "eval_runtime": 112.2627, | |
| "eval_samples_per_second": 31.996, | |
| "eval_steps_per_second": 2.004, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.375, | |
| "grad_norm": 0.606131374835968, | |
| "learning_rate": 3.1256e-05, | |
| "loss": 6.0086, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.6173387169837952, | |
| "learning_rate": 3.1006e-05, | |
| "loss": 6.0045, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "eval_accuracy": 0.035918719126025685, | |
| "eval_loss": 5.952617168426514, | |
| "eval_runtime": 116.0654, | |
| "eval_samples_per_second": 30.948, | |
| "eval_steps_per_second": 1.939, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.385, | |
| "grad_norm": 0.6699332594871521, | |
| "learning_rate": 3.0756e-05, | |
| "loss": 5.9996, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.6398167610168457, | |
| "learning_rate": 3.0506000000000002e-05, | |
| "loss": 5.9896, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "eval_accuracy": 0.0355204462180538, | |
| "eval_loss": 5.9288482666015625, | |
| "eval_runtime": 110.5071, | |
| "eval_samples_per_second": 32.505, | |
| "eval_steps_per_second": 2.036, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 0.395, | |
| "grad_norm": 0.7196072936058044, | |
| "learning_rate": 3.0256e-05, | |
| "loss": 5.9879, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.6065074801445007, | |
| "learning_rate": 3.0006e-05, | |
| "loss": 5.9711, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "eval_accuracy": 0.03516381031378517, | |
| "eval_loss": 5.915222644805908, | |
| "eval_runtime": 110.7645, | |
| "eval_samples_per_second": 32.429, | |
| "eval_steps_per_second": 2.031, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.405, | |
| "grad_norm": 0.6913782954216003, | |
| "learning_rate": 2.9757e-05, | |
| "loss": 5.9671, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.6546822190284729, | |
| "learning_rate": 2.9507e-05, | |
| "loss": 5.9629, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "eval_accuracy": 0.03489303372107453, | |
| "eval_loss": 5.896161079406738, | |
| "eval_runtime": 111.3465, | |
| "eval_samples_per_second": 32.26, | |
| "eval_steps_per_second": 2.021, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 0.415, | |
| "grad_norm": 0.6649766564369202, | |
| "learning_rate": 2.9257e-05, | |
| "loss": 5.9631, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.7166668176651001, | |
| "learning_rate": 2.9007000000000002e-05, | |
| "loss": 5.9465, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "eval_accuracy": 0.035887695476207584, | |
| "eval_loss": 5.882122993469238, | |
| "eval_runtime": 110.5783, | |
| "eval_samples_per_second": 32.484, | |
| "eval_steps_per_second": 2.035, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 0.425, | |
| "grad_norm": 0.6148796081542969, | |
| "learning_rate": 2.8757e-05, | |
| "loss": 5.9383, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.712359607219696, | |
| "learning_rate": 2.8507e-05, | |
| "loss": 5.9463, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "eval_accuracy": 0.034527145149316284, | |
| "eval_loss": 5.869154930114746, | |
| "eval_runtime": 113.9173, | |
| "eval_samples_per_second": 31.532, | |
| "eval_steps_per_second": 1.975, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 0.435, | |
| "grad_norm": 0.6819197535514832, | |
| "learning_rate": 2.8257e-05, | |
| "loss": 5.9363, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.7428186535835266, | |
| "learning_rate": 2.8007e-05, | |
| "loss": 5.9317, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "eval_accuracy": 0.034260450615792234, | |
| "eval_loss": 5.869858741760254, | |
| "eval_runtime": 111.9079, | |
| "eval_samples_per_second": 32.098, | |
| "eval_steps_per_second": 2.011, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 0.445, | |
| "grad_norm": 0.6326854825019836, | |
| "learning_rate": 2.7757500000000003e-05, | |
| "loss": 5.9114, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 1.00345, | |
| "grad_norm": 0.6722562313079834, | |
| "learning_rate": 2.7507500000000004e-05, | |
| "loss": 5.9097, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 1.00345, | |
| "eval_accuracy": 0.034564291887914274, | |
| "eval_loss": 5.8482561111450195, | |
| "eval_runtime": 111.3739, | |
| "eval_samples_per_second": 32.252, | |
| "eval_steps_per_second": 2.02, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 1.00845, | |
| "grad_norm": 0.6160574555397034, | |
| "learning_rate": 2.72575e-05, | |
| "loss": 5.8948, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 1.01345, | |
| "grad_norm": 0.6146988868713379, | |
| "learning_rate": 2.7007500000000002e-05, | |
| "loss": 5.9107, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 1.01345, | |
| "eval_accuracy": 0.03477846392657083, | |
| "eval_loss": 5.8351731300354, | |
| "eval_runtime": 112.8785, | |
| "eval_samples_per_second": 31.822, | |
| "eval_steps_per_second": 1.993, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 1.01845, | |
| "grad_norm": 0.6342368721961975, | |
| "learning_rate": 2.6758e-05, | |
| "loss": 5.893, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 1.02345, | |
| "grad_norm": 0.7240073680877686, | |
| "learning_rate": 2.6508e-05, | |
| "loss": 5.8838, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 1.02345, | |
| "eval_accuracy": 0.034256504625245196, | |
| "eval_loss": 5.818812847137451, | |
| "eval_runtime": 112.1297, | |
| "eval_samples_per_second": 32.034, | |
| "eval_steps_per_second": 2.007, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 1.02845, | |
| "grad_norm": 0.6988590955734253, | |
| "learning_rate": 2.6257999999999998e-05, | |
| "loss": 5.8907, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 1.03345, | |
| "grad_norm": 0.6514461636543274, | |
| "learning_rate": 2.6008e-05, | |
| "loss": 5.887, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 1.03345, | |
| "eval_accuracy": 0.03395511258863511, | |
| "eval_loss": 5.808635234832764, | |
| "eval_runtime": 112.3746, | |
| "eval_samples_per_second": 31.965, | |
| "eval_steps_per_second": 2.002, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 1.03845, | |
| "grad_norm": 0.6310043931007385, | |
| "learning_rate": 2.5758e-05, | |
| "loss": 5.873, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 1.04345, | |
| "grad_norm": 0.7601247429847717, | |
| "learning_rate": 2.5507999999999997e-05, | |
| "loss": 5.8563, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 1.04345, | |
| "eval_accuracy": 0.03384244775508516, | |
| "eval_loss": 5.797094821929932, | |
| "eval_runtime": 112.1133, | |
| "eval_samples_per_second": 32.039, | |
| "eval_steps_per_second": 2.007, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 1.04845, | |
| "grad_norm": 0.6604752540588379, | |
| "learning_rate": 2.5258500000000002e-05, | |
| "loss": 5.8583, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 1.05345, | |
| "grad_norm": 0.681904673576355, | |
| "learning_rate": 2.5008500000000003e-05, | |
| "loss": 5.8576, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 1.05345, | |
| "eval_accuracy": 0.03392898740984092, | |
| "eval_loss": 5.796751022338867, | |
| "eval_runtime": 113.244, | |
| "eval_samples_per_second": 31.719, | |
| "eval_steps_per_second": 1.987, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 1.0584500000000001, | |
| "grad_norm": 0.6337283849716187, | |
| "learning_rate": 2.47585e-05, | |
| "loss": 5.8495, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 1.06345, | |
| "grad_norm": 0.6972376108169556, | |
| "learning_rate": 2.45085e-05, | |
| "loss": 5.8567, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 1.06345, | |
| "eval_accuracy": 0.034259225998036255, | |
| "eval_loss": 5.779718399047852, | |
| "eval_runtime": 110.7791, | |
| "eval_samples_per_second": 32.425, | |
| "eval_steps_per_second": 2.031, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 1.06845, | |
| "grad_norm": 0.6530519127845764, | |
| "learning_rate": 2.42585e-05, | |
| "loss": 5.8519, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 1.07345, | |
| "grad_norm": 0.6328603625297546, | |
| "learning_rate": 2.4009e-05, | |
| "loss": 5.841, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 1.07345, | |
| "eval_accuracy": 0.033661476464479555, | |
| "eval_loss": 5.767716407775879, | |
| "eval_runtime": 114.652, | |
| "eval_samples_per_second": 31.33, | |
| "eval_steps_per_second": 1.962, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 1.07845, | |
| "grad_norm": 0.6603217124938965, | |
| "learning_rate": 2.3759e-05, | |
| "loss": 5.8284, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 1.08345, | |
| "grad_norm": 0.6852928400039673, | |
| "learning_rate": 2.3509e-05, | |
| "loss": 5.8192, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 1.08345, | |
| "eval_accuracy": 0.03317965741182208, | |
| "eval_loss": 5.76131534576416, | |
| "eval_runtime": 111.561, | |
| "eval_samples_per_second": 32.198, | |
| "eval_steps_per_second": 2.017, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 1.08845, | |
| "grad_norm": 0.6923466324806213, | |
| "learning_rate": 2.3259e-05, | |
| "loss": 5.8147, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 1.09345, | |
| "grad_norm": 0.7163240313529968, | |
| "learning_rate": 2.3009e-05, | |
| "loss": 5.8214, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 1.09345, | |
| "eval_accuracy": 0.03383822962725901, | |
| "eval_loss": 5.748566150665283, | |
| "eval_runtime": 113.5245, | |
| "eval_samples_per_second": 31.641, | |
| "eval_steps_per_second": 1.982, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 1.09845, | |
| "grad_norm": 0.7053197026252747, | |
| "learning_rate": 2.27595e-05, | |
| "loss": 5.8176, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 1.10345, | |
| "grad_norm": 0.7318024039268494, | |
| "learning_rate": 2.25095e-05, | |
| "loss": 5.8166, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 1.10345, | |
| "eval_accuracy": 0.033826935930176105, | |
| "eval_loss": 5.74090576171875, | |
| "eval_runtime": 112.6548, | |
| "eval_samples_per_second": 31.885, | |
| "eval_steps_per_second": 1.997, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 1.10845, | |
| "grad_norm": 0.727443516254425, | |
| "learning_rate": 2.22595e-05, | |
| "loss": 5.8054, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 1.11345, | |
| "grad_norm": 0.6675511598587036, | |
| "learning_rate": 2.2009500000000003e-05, | |
| "loss": 5.806, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 1.11345, | |
| "eval_accuracy": 0.03327177588079952, | |
| "eval_loss": 5.734244346618652, | |
| "eval_runtime": 114.861, | |
| "eval_samples_per_second": 31.273, | |
| "eval_steps_per_second": 1.959, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 1.11845, | |
| "grad_norm": 0.7548192143440247, | |
| "learning_rate": 2.17595e-05, | |
| "loss": 5.7987, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 1.12345, | |
| "grad_norm": 0.6868897676467896, | |
| "learning_rate": 2.15095e-05, | |
| "loss": 5.7961, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 1.12345, | |
| "eval_accuracy": 0.033481321585711266, | |
| "eval_loss": 5.723623752593994, | |
| "eval_runtime": 113.4262, | |
| "eval_samples_per_second": 31.668, | |
| "eval_steps_per_second": 1.984, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 1.12845, | |
| "grad_norm": 0.6014879941940308, | |
| "learning_rate": 2.1259500000000002e-05, | |
| "loss": 5.7892, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 1.13345, | |
| "grad_norm": 0.6603185534477234, | |
| "learning_rate": 2.101e-05, | |
| "loss": 5.7847, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 1.13345, | |
| "eval_accuracy": 0.033258985428681526, | |
| "eval_loss": 5.716407775878906, | |
| "eval_runtime": 112.5491, | |
| "eval_samples_per_second": 31.915, | |
| "eval_steps_per_second": 1.999, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 1.13845, | |
| "grad_norm": 0.7514461278915405, | |
| "learning_rate": 2.076e-05, | |
| "loss": 5.7785, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 1.14345, | |
| "grad_norm": 0.9591528177261353, | |
| "learning_rate": 2.0510000000000002e-05, | |
| "loss": 5.787, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 1.14345, | |
| "eval_accuracy": 0.03300685023958966, | |
| "eval_loss": 5.7095818519592285, | |
| "eval_runtime": 112.4202, | |
| "eval_samples_per_second": 31.952, | |
| "eval_steps_per_second": 2.001, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 1.14845, | |
| "grad_norm": 0.6442362666130066, | |
| "learning_rate": 2.0260000000000003e-05, | |
| "loss": 5.7765, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 1.15345, | |
| "grad_norm": 0.7159613370895386, | |
| "learning_rate": 2.001e-05, | |
| "loss": 5.7711, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 1.15345, | |
| "eval_accuracy": 0.03279621598556148, | |
| "eval_loss": 5.703481674194336, | |
| "eval_runtime": 112.2964, | |
| "eval_samples_per_second": 31.987, | |
| "eval_steps_per_second": 2.004, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 1.15845, | |
| "grad_norm": 0.6875385642051697, | |
| "learning_rate": 1.976e-05, | |
| "loss": 5.7593, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 1.16345, | |
| "grad_norm": 0.6751101016998291, | |
| "learning_rate": 1.951e-05, | |
| "loss": 5.7699, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 1.16345, | |
| "eval_accuracy": 0.03306223017588777, | |
| "eval_loss": 5.6887898445129395, | |
| "eval_runtime": 113.7541, | |
| "eval_samples_per_second": 31.577, | |
| "eval_steps_per_second": 1.978, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 1.16845, | |
| "grad_norm": 0.7514244914054871, | |
| "learning_rate": 1.92605e-05, | |
| "loss": 5.7672, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 1.1734499999999999, | |
| "grad_norm": 0.697762131690979, | |
| "learning_rate": 1.90105e-05, | |
| "loss": 5.763, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 1.1734499999999999, | |
| "eval_accuracy": 0.03338308002795394, | |
| "eval_loss": 5.687499046325684, | |
| "eval_runtime": 111.5862, | |
| "eval_samples_per_second": 32.19, | |
| "eval_steps_per_second": 2.016, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 1.17845, | |
| "grad_norm": 0.6761494874954224, | |
| "learning_rate": 1.87605e-05, | |
| "loss": 5.7669, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 1.1834500000000001, | |
| "grad_norm": 0.6467716693878174, | |
| "learning_rate": 1.8510500000000002e-05, | |
| "loss": 5.7434, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 1.1834500000000001, | |
| "eval_accuracy": 0.03302726053552262, | |
| "eval_loss": 5.680927276611328, | |
| "eval_runtime": 112.7327, | |
| "eval_samples_per_second": 31.863, | |
| "eval_steps_per_second": 1.996, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 1.18845, | |
| "grad_norm": 0.7104445695877075, | |
| "learning_rate": 1.82605e-05, | |
| "loss": 5.7506, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 1.19345, | |
| "grad_norm": 0.6914287805557251, | |
| "learning_rate": 1.8011e-05, | |
| "loss": 5.7477, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 1.19345, | |
| "eval_accuracy": 0.032914323564693565, | |
| "eval_loss": 5.668553829193115, | |
| "eval_runtime": 113.1627, | |
| "eval_samples_per_second": 31.742, | |
| "eval_steps_per_second": 1.988, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 1.19845, | |
| "grad_norm": 0.7188768982887268, | |
| "learning_rate": 1.7760999999999998e-05, | |
| "loss": 5.7329, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 1.20345, | |
| "grad_norm": 0.6479863524436951, | |
| "learning_rate": 1.7511e-05, | |
| "loss": 5.7409, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 1.20345, | |
| "eval_accuracy": 0.03304100346811748, | |
| "eval_loss": 5.662350654602051, | |
| "eval_runtime": 110.8399, | |
| "eval_samples_per_second": 32.407, | |
| "eval_steps_per_second": 2.03, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 1.20845, | |
| "grad_norm": 0.678534209728241, | |
| "learning_rate": 1.7261000000000003e-05, | |
| "loss": 5.7384, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 1.21345, | |
| "grad_norm": 0.8188093900680542, | |
| "learning_rate": 1.7011e-05, | |
| "loss": 5.737, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 1.21345, | |
| "eval_accuracy": 0.03385346931488896, | |
| "eval_loss": 5.675750732421875, | |
| "eval_runtime": 111.9995, | |
| "eval_samples_per_second": 32.072, | |
| "eval_steps_per_second": 2.009, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 1.21845, | |
| "grad_norm": 0.7928422689437866, | |
| "learning_rate": 1.6761e-05, | |
| "loss": 5.7299, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 1.22345, | |
| "grad_norm": 0.7139099836349487, | |
| "learning_rate": 1.6511e-05, | |
| "loss": 5.729, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 1.22345, | |
| "eval_accuracy": 0.032598372183651296, | |
| "eval_loss": 5.654570579528809, | |
| "eval_runtime": 111.7834, | |
| "eval_samples_per_second": 32.134, | |
| "eval_steps_per_second": 2.013, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 1.22845, | |
| "grad_norm": 0.6944179534912109, | |
| "learning_rate": 1.6261000000000002e-05, | |
| "loss": 5.7183, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 1.23345, | |
| "grad_norm": 0.7094106078147888, | |
| "learning_rate": 1.60115e-05, | |
| "loss": 5.7232, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 1.23345, | |
| "eval_accuracy": 0.032930243595521276, | |
| "eval_loss": 5.646746635437012, | |
| "eval_runtime": 117.6522, | |
| "eval_samples_per_second": 30.531, | |
| "eval_steps_per_second": 1.912, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 1.23845, | |
| "grad_norm": 0.6927877068519592, | |
| "learning_rate": 1.57615e-05, | |
| "loss": 5.7162, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 1.24345, | |
| "grad_norm": 0.7557797431945801, | |
| "learning_rate": 1.5511500000000002e-05, | |
| "loss": 5.7127, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 1.24345, | |
| "eval_accuracy": 0.032887245905422496, | |
| "eval_loss": 5.644942283630371, | |
| "eval_runtime": 111.9326, | |
| "eval_samples_per_second": 32.091, | |
| "eval_steps_per_second": 2.01, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 1.24845, | |
| "grad_norm": 0.673581063747406, | |
| "learning_rate": 1.52615e-05, | |
| "loss": 5.7094, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 1.25345, | |
| "grad_norm": 0.6678490042686462, | |
| "learning_rate": 1.50115e-05, | |
| "loss": 5.7187, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 1.25345, | |
| "eval_accuracy": 0.03288316384623591, | |
| "eval_loss": 5.635218143463135, | |
| "eval_runtime": 116.4191, | |
| "eval_samples_per_second": 30.854, | |
| "eval_steps_per_second": 1.933, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 1.25845, | |
| "grad_norm": 0.7218269109725952, | |
| "learning_rate": 1.4761500000000001e-05, | |
| "loss": 5.7138, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 1.26345, | |
| "grad_norm": 0.7058696150779724, | |
| "learning_rate": 1.4512000000000001e-05, | |
| "loss": 5.717, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 1.26345, | |
| "eval_accuracy": 0.03260463134107074, | |
| "eval_loss": 5.626367568969727, | |
| "eval_runtime": 111.4646, | |
| "eval_samples_per_second": 32.225, | |
| "eval_steps_per_second": 2.019, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 1.26845, | |
| "grad_norm": 0.7154064774513245, | |
| "learning_rate": 1.4262e-05, | |
| "loss": 5.6955, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 1.27345, | |
| "grad_norm": 0.6907570362091064, | |
| "learning_rate": 1.4012e-05, | |
| "loss": 5.714, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 1.27345, | |
| "eval_accuracy": 0.03297637086432977, | |
| "eval_loss": 5.6219401359558105, | |
| "eval_runtime": 113.1231, | |
| "eval_samples_per_second": 31.753, | |
| "eval_steps_per_second": 1.989, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 1.2784499999999999, | |
| "grad_norm": 0.708003580570221, | |
| "learning_rate": 1.3762e-05, | |
| "loss": 5.6993, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 1.28345, | |
| "grad_norm": 0.8350797295570374, | |
| "learning_rate": 1.35125e-05, | |
| "loss": 5.7079, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 1.28345, | |
| "eval_accuracy": 0.032977323344806644, | |
| "eval_loss": 5.616916656494141, | |
| "eval_runtime": 112.8971, | |
| "eval_samples_per_second": 31.817, | |
| "eval_steps_per_second": 1.993, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 1.28845, | |
| "grad_norm": 0.6496825218200684, | |
| "learning_rate": 1.32625e-05, | |
| "loss": 5.7047, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 1.29345, | |
| "grad_norm": 0.7288230657577515, | |
| "learning_rate": 1.30125e-05, | |
| "loss": 5.7034, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 1.29345, | |
| "eval_accuracy": 0.032624225225166385, | |
| "eval_loss": 5.613090991973877, | |
| "eval_runtime": 112.7724, | |
| "eval_samples_per_second": 31.852, | |
| "eval_steps_per_second": 1.995, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 1.2984499999999999, | |
| "grad_norm": 0.6955094337463379, | |
| "learning_rate": 1.27625e-05, | |
| "loss": 5.6884, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 1.30345, | |
| "grad_norm": 0.6996705532073975, | |
| "learning_rate": 1.25125e-05, | |
| "loss": 5.6768, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 1.30345, | |
| "eval_accuracy": 0.03247931212404235, | |
| "eval_loss": 5.61249303817749, | |
| "eval_runtime": 113.0871, | |
| "eval_samples_per_second": 31.763, | |
| "eval_steps_per_second": 1.99, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 1.3084500000000001, | |
| "grad_norm": 0.7164750695228577, | |
| "learning_rate": 1.22625e-05, | |
| "loss": 5.685, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 1.31345, | |
| "grad_norm": 0.6893213391304016, | |
| "learning_rate": 1.20125e-05, | |
| "loss": 5.6955, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 1.31345, | |
| "eval_accuracy": 0.0328188033797273, | |
| "eval_loss": 5.6074957847595215, | |
| "eval_runtime": 114.2316, | |
| "eval_samples_per_second": 31.445, | |
| "eval_steps_per_second": 1.97, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 1.31845, | |
| "grad_norm": 0.7451682090759277, | |
| "learning_rate": 1.17625e-05, | |
| "loss": 5.6853, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 1.32345, | |
| "grad_norm": 0.7856729626655579, | |
| "learning_rate": 1.15125e-05, | |
| "loss": 5.6947, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 1.32345, | |
| "eval_accuracy": 0.032477407163088605, | |
| "eval_loss": 5.601708889007568, | |
| "eval_runtime": 111.6415, | |
| "eval_samples_per_second": 32.174, | |
| "eval_steps_per_second": 2.015, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 1.3284500000000001, | |
| "grad_norm": 0.7268499135971069, | |
| "learning_rate": 1.1262500000000001e-05, | |
| "loss": 5.6676, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 1.33345, | |
| "grad_norm": 0.6918110847473145, | |
| "learning_rate": 1.1013000000000001e-05, | |
| "loss": 5.7056, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 1.33345, | |
| "eval_accuracy": 0.032296980147041215, | |
| "eval_loss": 5.595623970031738, | |
| "eval_runtime": 110.9779, | |
| "eval_samples_per_second": 32.367, | |
| "eval_steps_per_second": 2.027, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 1.33845, | |
| "grad_norm": 0.7075666189193726, | |
| "learning_rate": 1.0763e-05, | |
| "loss": 5.6793, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 1.34345, | |
| "grad_norm": 0.7042005658149719, | |
| "learning_rate": 1.0513e-05, | |
| "loss": 5.6636, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 1.34345, | |
| "eval_accuracy": 0.03255183670892414, | |
| "eval_loss": 5.592087268829346, | |
| "eval_runtime": 111.6356, | |
| "eval_samples_per_second": 32.176, | |
| "eval_steps_per_second": 2.015, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 1.34845, | |
| "grad_norm": 0.7470083832740784, | |
| "learning_rate": 1.0263e-05, | |
| "loss": 5.6727, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 1.35345, | |
| "grad_norm": 0.7401617169380188, | |
| "learning_rate": 1.00135e-05, | |
| "loss": 5.6723, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 1.35345, | |
| "eval_accuracy": 0.03255442201307565, | |
| "eval_loss": 5.588088512420654, | |
| "eval_runtime": 112.4329, | |
| "eval_samples_per_second": 31.948, | |
| "eval_steps_per_second": 2.001, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 1.35845, | |
| "grad_norm": 0.7123810648918152, | |
| "learning_rate": 9.7635e-06, | |
| "loss": 5.6695, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 1.36345, | |
| "grad_norm": 0.7234753966331482, | |
| "learning_rate": 9.5135e-06, | |
| "loss": 5.659, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 1.36345, | |
| "eval_accuracy": 0.03243753905169955, | |
| "eval_loss": 5.5822529792785645, | |
| "eval_runtime": 115.5372, | |
| "eval_samples_per_second": 31.09, | |
| "eval_steps_per_second": 1.947, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 1.36845, | |
| "grad_norm": 0.7145719528198242, | |
| "learning_rate": 9.2635e-06, | |
| "loss": 5.6632, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 1.37345, | |
| "grad_norm": 0.7814493179321289, | |
| "learning_rate": 9.013500000000001e-06, | |
| "loss": 5.6729, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 1.37345, | |
| "eval_accuracy": 0.032616741449990966, | |
| "eval_loss": 5.579476833343506, | |
| "eval_runtime": 111.3925, | |
| "eval_samples_per_second": 32.246, | |
| "eval_steps_per_second": 2.02, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 1.37845, | |
| "grad_norm": 0.7196946144104004, | |
| "learning_rate": 8.7635e-06, | |
| "loss": 5.6638, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 1.38345, | |
| "grad_norm": 0.7334076762199402, | |
| "learning_rate": 8.514e-06, | |
| "loss": 5.6595, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 1.38345, | |
| "eval_accuracy": 0.03224581833856925, | |
| "eval_loss": 5.579442977905273, | |
| "eval_runtime": 111.9776, | |
| "eval_samples_per_second": 32.078, | |
| "eval_steps_per_second": 2.009, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 1.38845, | |
| "grad_norm": 0.797461748123169, | |
| "learning_rate": 8.264e-06, | |
| "loss": 5.66, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 1.39345, | |
| "grad_norm": 0.7179501056671143, | |
| "learning_rate": 8.014e-06, | |
| "loss": 5.6565, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 1.39345, | |
| "eval_accuracy": 0.032768049777174, | |
| "eval_loss": 5.575778961181641, | |
| "eval_runtime": 113.2284, | |
| "eval_samples_per_second": 31.723, | |
| "eval_steps_per_second": 1.987, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 1.39845, | |
| "grad_norm": 0.7444576025009155, | |
| "learning_rate": 7.764e-06, | |
| "loss": 5.6539, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 1.4034499999999999, | |
| "grad_norm": 0.681348979473114, | |
| "learning_rate": 7.514500000000001e-06, | |
| "loss": 5.6649, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 1.4034499999999999, | |
| "eval_accuracy": 0.03250598157739475, | |
| "eval_loss": 5.571649074554443, | |
| "eval_runtime": 110.7941, | |
| "eval_samples_per_second": 32.42, | |
| "eval_steps_per_second": 2.031, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 1.40845, | |
| "grad_norm": 0.6666921973228455, | |
| "learning_rate": 7.2645000000000005e-06, | |
| "loss": 5.6487, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 1.41345, | |
| "grad_norm": 0.7330692410469055, | |
| "learning_rate": 7.0145e-06, | |
| "loss": 5.6561, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 1.41345, | |
| "eval_accuracy": 0.032128118965355834, | |
| "eval_loss": 5.5695481300354, | |
| "eval_runtime": 113.9227, | |
| "eval_samples_per_second": 31.53, | |
| "eval_steps_per_second": 1.975, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 1.41845, | |
| "grad_norm": 0.7140607237815857, | |
| "learning_rate": 6.7645e-06, | |
| "loss": 5.6569, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 1.4234499999999999, | |
| "grad_norm": 0.695405125617981, | |
| "learning_rate": 6.5145e-06, | |
| "loss": 5.6405, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 1.4234499999999999, | |
| "eval_accuracy": 0.03226895000729328, | |
| "eval_loss": 5.565379619598389, | |
| "eval_runtime": 110.9885, | |
| "eval_samples_per_second": 32.364, | |
| "eval_steps_per_second": 2.027, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 1.42845, | |
| "grad_norm": 0.748406708240509, | |
| "learning_rate": 6.265e-06, | |
| "loss": 5.661, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 1.4334500000000001, | |
| "grad_norm": 0.7667502760887146, | |
| "learning_rate": 6.015000000000001e-06, | |
| "loss": 5.6482, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 1.4334500000000001, | |
| "eval_accuracy": 0.03212390083752969, | |
| "eval_loss": 5.562798023223877, | |
| "eval_runtime": 113.7863, | |
| "eval_samples_per_second": 31.568, | |
| "eval_steps_per_second": 1.977, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 1.43845, | |
| "grad_norm": 0.7025783658027649, | |
| "learning_rate": 5.765e-06, | |
| "loss": 5.6537, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 1.44345, | |
| "grad_norm": 0.7218544483184814, | |
| "learning_rate": 5.515e-06, | |
| "loss": 5.6425, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 1.44345, | |
| "eval_accuracy": 0.03228555038131876, | |
| "eval_loss": 5.562201023101807, | |
| "eval_runtime": 110.7556, | |
| "eval_samples_per_second": 32.432, | |
| "eval_steps_per_second": 2.032, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 2.0019, | |
| "grad_norm": 0.7732229232788086, | |
| "learning_rate": 5.265e-06, | |
| "loss": 5.6439, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 2.0069, | |
| "grad_norm": 0.7334641814231873, | |
| "learning_rate": 5.015e-06, | |
| "loss": 5.6379, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 2.0069, | |
| "eval_accuracy": 0.032302831098542, | |
| "eval_loss": 5.558170318603516, | |
| "eval_runtime": 110.8037, | |
| "eval_samples_per_second": 32.418, | |
| "eval_steps_per_second": 2.031, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 2.0119, | |
| "grad_norm": 0.7253163456916809, | |
| "learning_rate": 4.765e-06, | |
| "loss": 5.6416, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 2.0169, | |
| "grad_norm": 0.6948328614234924, | |
| "learning_rate": 4.515000000000001e-06, | |
| "loss": 5.6357, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 2.0169, | |
| "eval_accuracy": 0.03219384011825997, | |
| "eval_loss": 5.557282447814941, | |
| "eval_runtime": 112.1174, | |
| "eval_samples_per_second": 32.038, | |
| "eval_steps_per_second": 2.007, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 2.0219, | |
| "grad_norm": 0.8627682328224182, | |
| "learning_rate": 4.2655e-06, | |
| "loss": 5.6417, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 2.0269, | |
| "grad_norm": 0.6853375434875488, | |
| "learning_rate": 4.015500000000001e-06, | |
| "loss": 5.6381, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 2.0269, | |
| "eval_accuracy": 0.032045117095228455, | |
| "eval_loss": 5.556839942932129, | |
| "eval_runtime": 112.6404, | |
| "eval_samples_per_second": 31.889, | |
| "eval_steps_per_second": 1.998, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 2.0319, | |
| "grad_norm": 0.7689598798751831, | |
| "learning_rate": 3.7655e-06, | |
| "loss": 5.6349, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 2.0369, | |
| "grad_norm": 0.714249849319458, | |
| "learning_rate": 3.516e-06, | |
| "loss": 5.6427, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 2.0369, | |
| "eval_accuracy": 0.032379982017168595, | |
| "eval_loss": 5.5526018142700195, | |
| "eval_runtime": 114.2105, | |
| "eval_samples_per_second": 31.451, | |
| "eval_steps_per_second": 1.97, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 2.0419, | |
| "grad_norm": 0.7157047390937805, | |
| "learning_rate": 3.266e-06, | |
| "loss": 5.6238, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 2.0469, | |
| "grad_norm": 0.702506959438324, | |
| "learning_rate": 3.016e-06, | |
| "loss": 5.6364, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 2.0469, | |
| "eval_accuracy": 0.03230364751037931, | |
| "eval_loss": 5.55258321762085, | |
| "eval_runtime": 113.8093, | |
| "eval_samples_per_second": 31.562, | |
| "eval_steps_per_second": 1.977, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 2.0519, | |
| "grad_norm": 0.7739561200141907, | |
| "learning_rate": 2.7660000000000003e-06, | |
| "loss": 5.6289, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 2.0569, | |
| "grad_norm": 0.700249195098877, | |
| "learning_rate": 2.516e-06, | |
| "loss": 5.626, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 2.0569, | |
| "eval_accuracy": 0.032115600650516954, | |
| "eval_loss": 5.550052642822266, | |
| "eval_runtime": 112.9135, | |
| "eval_samples_per_second": 31.812, | |
| "eval_steps_per_second": 1.993, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 2.0619, | |
| "grad_norm": 0.694694995880127, | |
| "learning_rate": 2.266e-06, | |
| "loss": 5.6414, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 2.0669, | |
| "grad_norm": 0.8426607251167297, | |
| "learning_rate": 2.0165e-06, | |
| "loss": 5.636, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 2.0669, | |
| "eval_accuracy": 0.032368688320085694, | |
| "eval_loss": 5.549187183380127, | |
| "eval_runtime": 111.6948, | |
| "eval_samples_per_second": 32.159, | |
| "eval_steps_per_second": 2.014, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 2.0719, | |
| "grad_norm": 0.7448583841323853, | |
| "learning_rate": 1.7665000000000002e-06, | |
| "loss": 5.633, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 2.0769, | |
| "grad_norm": 0.7047140002250671, | |
| "learning_rate": 1.5165e-06, | |
| "loss": 5.632, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 2.0769, | |
| "eval_accuracy": 0.032333582611081, | |
| "eval_loss": 5.548858165740967, | |
| "eval_runtime": 111.3384, | |
| "eval_samples_per_second": 32.262, | |
| "eval_steps_per_second": 2.021, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 2.0819, | |
| "grad_norm": 0.7185714244842529, | |
| "learning_rate": 1.2665e-06, | |
| "loss": 5.6218, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 2.0869, | |
| "grad_norm": 0.906790018081665, | |
| "learning_rate": 1.0165000000000001e-06, | |
| "loss": 5.6133, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 2.0869, | |
| "eval_accuracy": 0.03233031696373172, | |
| "eval_loss": 5.547926425933838, | |
| "eval_runtime": 112.1516, | |
| "eval_samples_per_second": 32.028, | |
| "eval_steps_per_second": 2.006, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 2.0919, | |
| "grad_norm": 0.7219076752662659, | |
| "learning_rate": 7.67e-07, | |
| "loss": 5.6305, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 2.0969, | |
| "grad_norm": 0.7102829217910767, | |
| "learning_rate": 5.17e-07, | |
| "loss": 5.6291, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 2.0969, | |
| "eval_accuracy": 0.032251125015511826, | |
| "eval_loss": 5.54772424697876, | |
| "eval_runtime": 113.02, | |
| "eval_samples_per_second": 31.782, | |
| "eval_steps_per_second": 1.991, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 2.1019, | |
| "grad_norm": 0.7246349453926086, | |
| "learning_rate": 2.67e-07, | |
| "loss": 5.6351, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 2.1069, | |
| "grad_norm": 0.6908907294273376, | |
| "learning_rate": 1.7000000000000003e-08, | |
| "loss": 5.6271, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 2.1069, | |
| "eval_accuracy": 0.0322300343763811, | |
| "eval_loss": 5.546974182128906, | |
| "eval_runtime": 114.2737, | |
| "eval_samples_per_second": 31.433, | |
| "eval_steps_per_second": 1.969, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 2.1069, | |
| "step": 100000, | |
| "total_flos": 9.182034338135409e+17, | |
| "train_loss": 0.0, | |
| "train_runtime": 1033.1844, | |
| "train_samples_per_second": 3097.221, | |
| "train_steps_per_second": 193.576 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 200000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9223372036854775807, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9.182034338135409e+17, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |