Invalid JSON: Unexpected token 'N', ..."al_loss": NaN,
"... is not valid JSON
| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.9993226593794158, | |
| "eval_steps": 500, | |
| "global_step": 32100, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01, | |
| "learning_rate": 9.969479912799752e-06, | |
| "loss": 5.7053, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "learning_rate": 9.938336966677049e-06, | |
| "loss": 4.2137, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "learning_rate": 9.907194020554345e-06, | |
| "loss": 3.7382, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "learning_rate": 9.876051074431642e-06, | |
| "loss": 3.768, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "learning_rate": 9.844908128308939e-06, | |
| "loss": 4.0831, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1241.4479, | |
| "eval_samples_per_second": 183.933, | |
| "eval_steps_per_second": 22.992, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "learning_rate": 9.813765182186236e-06, | |
| "loss": 3.9938, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "learning_rate": 9.782622236063532e-06, | |
| "loss": 3.6619, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "learning_rate": 9.751790719402056e-06, | |
| "loss": 3.9896, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "learning_rate": 9.720647773279352e-06, | |
| "loss": 3.9662, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "learning_rate": 9.68950482715665e-06, | |
| "loss": 4.0557, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1241.0879, | |
| "eval_samples_per_second": 183.986, | |
| "eval_steps_per_second": 22.998, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "learning_rate": 9.658361881033946e-06, | |
| "loss": 3.4231, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "learning_rate": 9.627218934911242e-06, | |
| "loss": 3.4517, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "learning_rate": 9.59607598878854e-06, | |
| "loss": 4.235, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "learning_rate": 9.564933042665836e-06, | |
| "loss": 3.3104, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "learning_rate": 9.533790096543134e-06, | |
| "loss": 3.6169, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1240.8951, | |
| "eval_samples_per_second": 184.015, | |
| "eval_steps_per_second": 23.002, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "learning_rate": 9.50264715042043e-06, | |
| "loss": 3.3399, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "learning_rate": 9.471504204297728e-06, | |
| "loss": 3.9392, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "learning_rate": 9.440361258175024e-06, | |
| "loss": 4.2941, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "learning_rate": 9.40921831205232e-06, | |
| "loss": 4.2293, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "learning_rate": 9.378075365929618e-06, | |
| "loss": 3.411, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1240.9958, | |
| "eval_samples_per_second": 184.0, | |
| "eval_steps_per_second": 23.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "learning_rate": 9.346932419806914e-06, | |
| "loss": 3.1979, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "learning_rate": 9.315789473684212e-06, | |
| "loss": 3.303, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "learning_rate": 9.284646527561508e-06, | |
| "loss": 3.2433, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "learning_rate": 9.253503581438806e-06, | |
| "loss": 3.0607, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "learning_rate": 9.222360635316102e-06, | |
| "loss": 2.8986, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.4997, | |
| "eval_samples_per_second": 183.777, | |
| "eval_steps_per_second": 22.972, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "learning_rate": 9.191217689193398e-06, | |
| "loss": 3.5778, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "learning_rate": 9.160074743070694e-06, | |
| "loss": 3.2627, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "learning_rate": 9.128931796947992e-06, | |
| "loss": 2.8115, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "learning_rate": 9.097788850825288e-06, | |
| "loss": 3.1633, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "learning_rate": 9.066645904702586e-06, | |
| "loss": 3.3414, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1241.065, | |
| "eval_samples_per_second": 183.99, | |
| "eval_steps_per_second": 22.999, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "learning_rate": 9.035502958579882e-06, | |
| "loss": 3.1895, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "learning_rate": 9.00436001245718e-06, | |
| "loss": 3.2972, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "learning_rate": 8.973217066334476e-06, | |
| "loss": 3.3117, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "learning_rate": 8.942074120211772e-06, | |
| "loss": 3.8183, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "learning_rate": 8.911242603550296e-06, | |
| "loss": 3.0445, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1241.4692, | |
| "eval_samples_per_second": 183.93, | |
| "eval_steps_per_second": 22.991, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "learning_rate": 8.880099657427594e-06, | |
| "loss": 2.716, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "learning_rate": 8.84895671130489e-06, | |
| "loss": 2.7579, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "learning_rate": 8.817813765182188e-06, | |
| "loss": 3.6821, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "learning_rate": 8.786670819059484e-06, | |
| "loss": 2.9784, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "learning_rate": 8.755527872936782e-06, | |
| "loss": 3.1284, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1241.3243, | |
| "eval_samples_per_second": 183.951, | |
| "eval_steps_per_second": 22.994, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "learning_rate": 8.724384926814076e-06, | |
| "loss": 3.3944, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "learning_rate": 8.693241980691374e-06, | |
| "loss": 4.3378, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "learning_rate": 8.66209903456867e-06, | |
| "loss": 3.4281, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "learning_rate": 8.630956088445968e-06, | |
| "loss": 3.4173, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "learning_rate": 8.599813142323264e-06, | |
| "loss": 2.987, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.9007, | |
| "eval_samples_per_second": 183.718, | |
| "eval_steps_per_second": 22.965, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "learning_rate": 8.568670196200562e-06, | |
| "loss": 2.9806, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "learning_rate": 8.537527250077858e-06, | |
| "loss": 3.2971, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "learning_rate": 8.506384303955156e-06, | |
| "loss": 2.9308, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "learning_rate": 8.475241357832452e-06, | |
| "loss": 3.95, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "learning_rate": 8.444098411709748e-06, | |
| "loss": 3.6195, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "eval_loss": NaN, | |
| "eval_runtime": 12249.9104, | |
| "eval_samples_per_second": 18.64, | |
| "eval_steps_per_second": 2.33, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "learning_rate": 8.412955465587044e-06, | |
| "loss": 3.3623, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "learning_rate": 8.381812519464342e-06, | |
| "loss": 2.7048, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "learning_rate": 8.350669573341638e-06, | |
| "loss": 2.9451, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "learning_rate": 8.319526627218936e-06, | |
| "loss": 3.9476, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "learning_rate": 8.288383681096232e-06, | |
| "loss": 3.0161, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.1829, | |
| "eval_samples_per_second": 183.824, | |
| "eval_steps_per_second": 22.978, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "learning_rate": 8.25724073497353e-06, | |
| "loss": 2.7617, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "learning_rate": 8.226097788850826e-06, | |
| "loss": 2.6001, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "learning_rate": 8.194954842728122e-06, | |
| "loss": 2.8024, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "learning_rate": 8.16381189660542e-06, | |
| "loss": 2.6335, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "learning_rate": 8.132668950482716e-06, | |
| "loss": 3.0251, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1241.5477, | |
| "eval_samples_per_second": 183.918, | |
| "eval_steps_per_second": 22.99, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "learning_rate": 8.101526004360014e-06, | |
| "loss": 4.0077, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "learning_rate": 8.07038305823731e-06, | |
| "loss": 4.0494, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "learning_rate": 8.039240112114608e-06, | |
| "loss": 2.8992, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "learning_rate": 8.008097165991904e-06, | |
| "loss": 3.0616, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "learning_rate": 7.9769542198692e-06, | |
| "loss": 3.4257, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.4356, | |
| "eval_samples_per_second": 183.787, | |
| "eval_steps_per_second": 22.973, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "learning_rate": 7.945811273746496e-06, | |
| "loss": 2.9658, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "learning_rate": 7.914668327623794e-06, | |
| "loss": 3.288, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "learning_rate": 7.88352538150109e-06, | |
| "loss": 3.1517, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "learning_rate": 7.852382435378388e-06, | |
| "loss": 3.3913, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "learning_rate": 7.821239489255684e-06, | |
| "loss": 2.7966, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1239.9554, | |
| "eval_samples_per_second": 184.154, | |
| "eval_steps_per_second": 23.019, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "learning_rate": 7.790096543132982e-06, | |
| "loss": 3.372, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "learning_rate": 7.758953597010278e-06, | |
| "loss": 2.8074, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "learning_rate": 7.727810650887576e-06, | |
| "loss": 2.6364, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "learning_rate": 7.696667704764872e-06, | |
| "loss": 3.0556, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "learning_rate": 7.665524758642168e-06, | |
| "loss": 2.4966, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1238.3825, | |
| "eval_samples_per_second": 184.388, | |
| "eval_steps_per_second": 23.049, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "learning_rate": 7.634381812519464e-06, | |
| "loss": 3.562, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "learning_rate": 7.603238866396762e-06, | |
| "loss": 4.0498, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "learning_rate": 7.572095920274059e-06, | |
| "loss": 2.8504, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "learning_rate": 7.540952974151356e-06, | |
| "loss": 2.7274, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "learning_rate": 7.509810028028653e-06, | |
| "loss": 2.9404, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.6775, | |
| "eval_samples_per_second": 183.751, | |
| "eval_steps_per_second": 22.969, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "learning_rate": 7.47866708190595e-06, | |
| "loss": 2.6808, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "learning_rate": 7.447835565244473e-06, | |
| "loss": 2.7214, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "learning_rate": 7.4166926191217695e-06, | |
| "loss": 2.7306, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "learning_rate": 7.3855496729990665e-06, | |
| "loss": 2.7578, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "learning_rate": 7.354406726876363e-06, | |
| "loss": 2.6296, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.0617, | |
| "eval_samples_per_second": 183.842, | |
| "eval_steps_per_second": 22.98, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "learning_rate": 7.32326378075366e-06, | |
| "loss": 2.7712, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "learning_rate": 7.2921208346309565e-06, | |
| "loss": 2.9979, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "learning_rate": 7.2609778885082535e-06, | |
| "loss": 2.8045, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "learning_rate": 7.2298349423855505e-06, | |
| "loss": 3.1612, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "learning_rate": 7.198691996262847e-06, | |
| "loss": 2.8809, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.2163, | |
| "eval_samples_per_second": 183.819, | |
| "eval_steps_per_second": 22.977, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "learning_rate": 7.1675490501401435e-06, | |
| "loss": 2.8204, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "learning_rate": 7.1364061040174405e-06, | |
| "loss": 2.508, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "learning_rate": 7.1052631578947375e-06, | |
| "loss": 2.3787, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "learning_rate": 7.0741202117720344e-06, | |
| "loss": 2.5762, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "learning_rate": 7.042977265649331e-06, | |
| "loss": 2.9636, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.6873, | |
| "eval_samples_per_second": 183.749, | |
| "eval_steps_per_second": 22.969, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "learning_rate": 7.011834319526628e-06, | |
| "loss": 2.6669, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "learning_rate": 6.980691373403925e-06, | |
| "loss": 2.9417, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "learning_rate": 6.949548427281222e-06, | |
| "loss": 3.1507, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "learning_rate": 6.9184054811585175e-06, | |
| "loss": 3.0434, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "learning_rate": 6.8872625350358145e-06, | |
| "loss": 3.0385, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.5564, | |
| "eval_samples_per_second": 183.769, | |
| "eval_steps_per_second": 22.971, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "learning_rate": 6.8561195889131115e-06, | |
| "loss": 2.6385, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "learning_rate": 6.8249766427904084e-06, | |
| "loss": 2.5347, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "learning_rate": 6.793833696667705e-06, | |
| "loss": 3.0458, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "learning_rate": 6.763002180006229e-06, | |
| "loss": 3.0999, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "learning_rate": 6.731859233883526e-06, | |
| "loss": 2.5865, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.285, | |
| "eval_samples_per_second": 183.809, | |
| "eval_steps_per_second": 22.976, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "learning_rate": 6.700716287760822e-06, | |
| "loss": 2.7499, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "learning_rate": 6.669573341638119e-06, | |
| "loss": 2.9551, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "learning_rate": 6.638430395515416e-06, | |
| "loss": 2.8254, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "learning_rate": 6.607287449392713e-06, | |
| "loss": 2.8981, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "learning_rate": 6.57614450327001e-06, | |
| "loss": 2.7491, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.9348, | |
| "eval_samples_per_second": 183.713, | |
| "eval_steps_per_second": 22.964, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "learning_rate": 6.545001557147307e-06, | |
| "loss": 2.8125, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "learning_rate": 6.513858611024604e-06, | |
| "loss": 2.6231, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "learning_rate": 6.482715664901901e-06, | |
| "loss": 2.8729, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "learning_rate": 6.451572718779196e-06, | |
| "loss": 2.9655, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "learning_rate": 6.420429772656493e-06, | |
| "loss": 3.1339, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.7737, | |
| "eval_samples_per_second": 183.737, | |
| "eval_steps_per_second": 22.967, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "learning_rate": 6.38928682653379e-06, | |
| "loss": 3.2655, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "learning_rate": 6.358143880411087e-06, | |
| "loss": 2.7888, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "learning_rate": 6.327000934288384e-06, | |
| "loss": 3.7292, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "learning_rate": 6.295857988165681e-06, | |
| "loss": 2.6393, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "learning_rate": 6.264715042042978e-06, | |
| "loss": 2.9632, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.1518, | |
| "eval_samples_per_second": 183.829, | |
| "eval_steps_per_second": 22.979, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "learning_rate": 6.233572095920275e-06, | |
| "loss": 2.4857, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "learning_rate": 6.202429149797571e-06, | |
| "loss": 2.9144, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "learning_rate": 6.171286203674868e-06, | |
| "loss": 2.7133, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "learning_rate": 6.140143257552165e-06, | |
| "loss": 3.8753, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "learning_rate": 6.109000311429461e-06, | |
| "loss": 3.0089, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.2159, | |
| "eval_samples_per_second": 183.819, | |
| "eval_steps_per_second": 22.977, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "learning_rate": 6.077857365306758e-06, | |
| "loss": 2.9934, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "learning_rate": 6.047025848645283e-06, | |
| "loss": 2.5078, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "learning_rate": 6.01588290252258e-06, | |
| "loss": 2.7188, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "learning_rate": 5.984739956399875e-06, | |
| "loss": 2.6552, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "learning_rate": 5.953597010277172e-06, | |
| "loss": 2.3908, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.3282, | |
| "eval_samples_per_second": 183.802, | |
| "eval_steps_per_second": 22.975, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "learning_rate": 5.922454064154469e-06, | |
| "loss": 2.7332, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "learning_rate": 5.891311118031766e-06, | |
| "loss": 2.3849, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "learning_rate": 5.860168171909063e-06, | |
| "loss": 3.4628, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "learning_rate": 5.82902522578636e-06, | |
| "loss": 2.6728, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "learning_rate": 5.797882279663657e-06, | |
| "loss": 2.8807, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1241.8007, | |
| "eval_samples_per_second": 183.881, | |
| "eval_steps_per_second": 22.985, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "learning_rate": 5.766739333540954e-06, | |
| "loss": 2.9648, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "learning_rate": 5.735596387418251e-06, | |
| "loss": 2.8685, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "learning_rate": 5.704453441295547e-06, | |
| "loss": 2.7226, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "learning_rate": 5.673310495172844e-06, | |
| "loss": 2.7493, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "learning_rate": 5.642167549050141e-06, | |
| "loss": 2.4787, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.1112, | |
| "eval_samples_per_second": 183.835, | |
| "eval_steps_per_second": 22.979, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "learning_rate": 5.611024602927437e-06, | |
| "loss": 2.6621, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "learning_rate": 5.580193086265962e-06, | |
| "loss": 2.5317, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "learning_rate": 5.549050140143259e-06, | |
| "loss": 2.8094, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "learning_rate": 5.5179071940205556e-06, | |
| "loss": 2.9184, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "learning_rate": 5.486764247897851e-06, | |
| "loss": 3.0641, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.2939, | |
| "eval_samples_per_second": 183.808, | |
| "eval_steps_per_second": 22.976, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "learning_rate": 5.455621301775148e-06, | |
| "loss": 2.707, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "learning_rate": 5.424478355652445e-06, | |
| "loss": 2.8484, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "learning_rate": 5.393335409529742e-06, | |
| "loss": 2.7637, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "learning_rate": 5.362192463407039e-06, | |
| "loss": 3.2172, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "learning_rate": 5.331049517284336e-06, | |
| "loss": 3.8572, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.3426, | |
| "eval_samples_per_second": 183.8, | |
| "eval_steps_per_second": 22.975, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "learning_rate": 5.299906571161633e-06, | |
| "loss": 2.6719, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "learning_rate": 5.2687636250389296e-06, | |
| "loss": 2.6299, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "learning_rate": 5.237620678916226e-06, | |
| "loss": 2.7315, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "learning_rate": 5.206477732793523e-06, | |
| "loss": 2.5421, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "learning_rate": 5.17533478667082e-06, | |
| "loss": 2.6612, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.7472, | |
| "eval_samples_per_second": 183.741, | |
| "eval_steps_per_second": 22.968, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "learning_rate": 5.1441918405481166e-06, | |
| "loss": 2.4232, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "learning_rate": 5.113048894425413e-06, | |
| "loss": 2.4607, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "learning_rate": 5.08190594830271e-06, | |
| "loss": 2.5239, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "learning_rate": 5.050763002180007e-06, | |
| "loss": 2.7479, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "learning_rate": 5.0196200560573036e-06, | |
| "loss": 2.3672, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.0429, | |
| "eval_samples_per_second": 183.845, | |
| "eval_steps_per_second": 22.981, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "learning_rate": 4.9884771099346005e-06, | |
| "loss": 2.8632, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "learning_rate": 4.9573341638118975e-06, | |
| "loss": 2.6653, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "learning_rate": 4.9261912176891945e-06, | |
| "loss": 2.539, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "learning_rate": 4.8950482715664906e-06, | |
| "loss": 2.8172, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "learning_rate": 4.8639053254437875e-06, | |
| "loss": 2.5157, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "eval_loss": NaN, | |
| "eval_runtime": 11659.1974, | |
| "eval_samples_per_second": 19.585, | |
| "eval_steps_per_second": 2.448, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "learning_rate": 4.8327623793210845e-06, | |
| "loss": 2.5209, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "learning_rate": 4.8016194331983815e-06, | |
| "loss": 2.9114, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "learning_rate": 4.7704764870756776e-06, | |
| "loss": 2.6053, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "learning_rate": 4.7393335409529745e-06, | |
| "loss": 2.8304, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "learning_rate": 4.7081905948302715e-06, | |
| "loss": 2.7648, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1241.8221, | |
| "eval_samples_per_second": 183.877, | |
| "eval_steps_per_second": 22.985, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "learning_rate": 4.6770476487075685e-06, | |
| "loss": 2.9948, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "learning_rate": 4.6459047025848646e-06, | |
| "loss": 2.7007, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "learning_rate": 4.6147617564621615e-06, | |
| "loss": 3.0049, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "learning_rate": 4.5836188103394585e-06, | |
| "loss": 2.4344, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "learning_rate": 4.5524758642167555e-06, | |
| "loss": 2.474, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.354, | |
| "eval_samples_per_second": 183.799, | |
| "eval_steps_per_second": 22.975, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "learning_rate": 4.5213329180940516e-06, | |
| "loss": 2.653, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "learning_rate": 4.4901899719713485e-06, | |
| "loss": 2.7398, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "learning_rate": 4.4590470258486455e-06, | |
| "loss": 2.3064, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "learning_rate": 4.4279040797259425e-06, | |
| "loss": 2.8268, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "learning_rate": 4.3967611336032386e-06, | |
| "loss": 2.3197, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1244.3942, | |
| "eval_samples_per_second": 183.497, | |
| "eval_steps_per_second": 22.937, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "learning_rate": 4.3656181874805355e-06, | |
| "loss": 2.6969, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "learning_rate": 4.33478667081906e-06, | |
| "loss": 2.6233, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "learning_rate": 4.303643724696356e-06, | |
| "loss": 2.9522, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "learning_rate": 4.272500778573653e-06, | |
| "loss": 2.5308, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "learning_rate": 4.24135783245095e-06, | |
| "loss": 2.9766, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.8298, | |
| "eval_samples_per_second": 183.728, | |
| "eval_steps_per_second": 22.966, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "learning_rate": 4.210214886328247e-06, | |
| "loss": 2.4952, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "learning_rate": 4.179071940205543e-06, | |
| "loss": 3.0581, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "learning_rate": 4.14792899408284e-06, | |
| "loss": 2.3722, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "learning_rate": 4.116786047960137e-06, | |
| "loss": 2.6995, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "learning_rate": 4.085643101837434e-06, | |
| "loss": 2.827, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.4071, | |
| "eval_samples_per_second": 183.791, | |
| "eval_steps_per_second": 22.974, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "learning_rate": 4.054500155714731e-06, | |
| "loss": 2.7205, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "learning_rate": 4.023357209592027e-06, | |
| "loss": 2.9195, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "learning_rate": 3.992214263469324e-06, | |
| "loss": 2.8558, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "learning_rate": 3.961071317346621e-06, | |
| "loss": 3.2538, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "learning_rate": 3.929928371223918e-06, | |
| "loss": 2.7213, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.5138, | |
| "eval_samples_per_second": 183.775, | |
| "eval_steps_per_second": 22.972, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "learning_rate": 3.898785425101214e-06, | |
| "loss": 2.7368, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "learning_rate": 3.867642478978511e-06, | |
| "loss": 2.6442, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "learning_rate": 3.836499532855808e-06, | |
| "loss": 2.6891, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "learning_rate": 3.8053565867331056e-06, | |
| "loss": 2.6657, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "learning_rate": 3.7742136406104017e-06, | |
| "loss": 2.6653, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1243.214, | |
| "eval_samples_per_second": 183.672, | |
| "eval_steps_per_second": 22.959, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "learning_rate": 3.7430706944876987e-06, | |
| "loss": 3.0333, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "learning_rate": 3.7119277483649957e-06, | |
| "loss": 2.2654, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "learning_rate": 3.6807848022422926e-06, | |
| "loss": 2.8074, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "learning_rate": 3.649641856119589e-06, | |
| "loss": 2.5416, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "learning_rate": 3.618498909996886e-06, | |
| "loss": 2.671, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.2133, | |
| "eval_samples_per_second": 183.819, | |
| "eval_steps_per_second": 22.978, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "learning_rate": 3.5873559638741827e-06, | |
| "loss": 2.3197, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "learning_rate": 3.5562130177514796e-06, | |
| "loss": 2.6825, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "learning_rate": 3.525070071628776e-06, | |
| "loss": 2.9642, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "learning_rate": 3.493927125506073e-06, | |
| "loss": 2.327, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "learning_rate": 3.463095608844597e-06, | |
| "loss": 2.5401, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.9072, | |
| "eval_samples_per_second": 183.717, | |
| "eval_steps_per_second": 22.965, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "learning_rate": 3.4319526627218935e-06, | |
| "loss": 2.6376, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "learning_rate": 3.4008097165991905e-06, | |
| "loss": 2.8178, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "learning_rate": 3.3696667704764874e-06, | |
| "loss": 2.2995, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "learning_rate": 3.3385238243537844e-06, | |
| "loss": 3.476, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "learning_rate": 3.3073808782310805e-06, | |
| "loss": 2.7193, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.302, | |
| "eval_samples_per_second": 183.806, | |
| "eval_steps_per_second": 22.976, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "learning_rate": 3.2762379321083775e-06, | |
| "loss": 2.2606, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "learning_rate": 3.2450949859856744e-06, | |
| "loss": 2.4979, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "learning_rate": 3.2139520398629714e-06, | |
| "loss": 2.8394, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "learning_rate": 3.1828090937402684e-06, | |
| "loss": 2.5935, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "learning_rate": 3.151666147617565e-06, | |
| "loss": 2.5924, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1239.6995, | |
| "eval_samples_per_second": 184.192, | |
| "eval_steps_per_second": 23.024, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "learning_rate": 3.120523201494862e-06, | |
| "loss": 2.3212, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "learning_rate": 3.0893802553721584e-06, | |
| "loss": 2.5372, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "learning_rate": 3.0582373092494554e-06, | |
| "loss": 3.17, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "learning_rate": 3.027094363126752e-06, | |
| "loss": 2.3103, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "learning_rate": 2.995951417004049e-06, | |
| "loss": 2.5506, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1238.6358, | |
| "eval_samples_per_second": 184.35, | |
| "eval_steps_per_second": 23.044, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "learning_rate": 2.964808470881346e-06, | |
| "loss": 2.3131, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "learning_rate": 2.9336655247586428e-06, | |
| "loss": 2.9797, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "learning_rate": 2.902522578635939e-06, | |
| "loss": 3.3517, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "learning_rate": 2.871379632513236e-06, | |
| "loss": 2.3309, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "learning_rate": 2.840236686390533e-06, | |
| "loss": 2.9167, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1239.5086, | |
| "eval_samples_per_second": 184.221, | |
| "eval_steps_per_second": 23.028, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "learning_rate": 2.8090937402678298e-06, | |
| "loss": 2.612, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "learning_rate": 2.7779507941451263e-06, | |
| "loss": 2.5327, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "learning_rate": 2.746807848022423e-06, | |
| "loss": 2.4244, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "learning_rate": 2.71566490189972e-06, | |
| "loss": 2.6675, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "learning_rate": 2.6845219557770168e-06, | |
| "loss": 2.8272, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.2478, | |
| "eval_samples_per_second": 183.814, | |
| "eval_steps_per_second": 22.977, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "learning_rate": 2.6533790096543133e-06, | |
| "loss": 2.419, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "learning_rate": 2.6222360635316103e-06, | |
| "loss": 2.4781, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "learning_rate": 2.5910931174089072e-06, | |
| "loss": 3.1415, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "learning_rate": 2.559950171286204e-06, | |
| "loss": 2.5226, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "learning_rate": 2.5288072251635003e-06, | |
| "loss": 2.4586, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.6605, | |
| "eval_samples_per_second": 183.753, | |
| "eval_steps_per_second": 22.969, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "learning_rate": 2.4976642790407973e-06, | |
| "loss": 2.6241, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "learning_rate": 2.4665213329180942e-06, | |
| "loss": 2.9088, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "learning_rate": 2.435378386795391e-06, | |
| "loss": 2.3136, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "learning_rate": 2.4042354406726877e-06, | |
| "loss": 2.5916, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "learning_rate": 2.3730924945499847e-06, | |
| "loss": 2.5698, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1243.0815, | |
| "eval_samples_per_second": 183.691, | |
| "eval_steps_per_second": 22.961, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "learning_rate": 2.3419495484272812e-06, | |
| "loss": 2.9015, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "learning_rate": 2.310806602304578e-06, | |
| "loss": 3.1771, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "learning_rate": 2.279663656181875e-06, | |
| "loss": 3.388, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "learning_rate": 2.2485207100591717e-06, | |
| "loss": 2.5991, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "learning_rate": 2.2173777639364687e-06, | |
| "loss": 2.6171, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.7545, | |
| "eval_samples_per_second": 183.739, | |
| "eval_steps_per_second": 22.968, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "learning_rate": 2.1865462472749925e-06, | |
| "loss": 2.506, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "learning_rate": 2.1554033011522895e-06, | |
| "loss": 2.6567, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "learning_rate": 2.124260355029586e-06, | |
| "loss": 3.2163, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "learning_rate": 2.09342883836811e-06, | |
| "loss": 2.8622, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "learning_rate": 2.062285892245407e-06, | |
| "loss": 3.0785, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1241.7925, | |
| "eval_samples_per_second": 183.882, | |
| "eval_steps_per_second": 22.985, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "learning_rate": 2.0311429461227034e-06, | |
| "loss": 2.735, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 2.61, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "learning_rate": 1.968857053877297e-06, | |
| "loss": 2.3942, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "learning_rate": 1.937714107754594e-06, | |
| "loss": 2.1723, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "learning_rate": 1.9065711616318906e-06, | |
| "loss": 2.856, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.4465, | |
| "eval_samples_per_second": 183.785, | |
| "eval_steps_per_second": 22.973, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "learning_rate": 1.8757396449704144e-06, | |
| "loss": 2.9466, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "learning_rate": 1.8445966988477112e-06, | |
| "loss": 3.1076, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "learning_rate": 1.813453752725008e-06, | |
| "loss": 2.2859, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "learning_rate": 1.782310806602305e-06, | |
| "loss": 3.0796, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "learning_rate": 1.7511678604796014e-06, | |
| "loss": 2.3788, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.137, | |
| "eval_samples_per_second": 183.831, | |
| "eval_steps_per_second": 22.979, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "learning_rate": 1.7200249143568984e-06, | |
| "loss": 2.7557, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "learning_rate": 1.688881968234195e-06, | |
| "loss": 2.328, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "learning_rate": 1.657739022111492e-06, | |
| "loss": 2.7651, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "learning_rate": 1.6265960759887886e-06, | |
| "loss": 3.1467, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "learning_rate": 1.5954531298660856e-06, | |
| "loss": 2.5629, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1243.6496, | |
| "eval_samples_per_second": 183.607, | |
| "eval_steps_per_second": 22.951, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "learning_rate": 1.5643101837433821e-06, | |
| "loss": 2.6638, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "learning_rate": 1.5331672376206791e-06, | |
| "loss": 2.7094, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "learning_rate": 1.5020242914979756e-06, | |
| "loss": 2.6052, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "learning_rate": 1.4708813453752726e-06, | |
| "loss": 2.6647, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "learning_rate": 1.4397383992525694e-06, | |
| "loss": 2.5914, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.5399, | |
| "eval_samples_per_second": 183.771, | |
| "eval_steps_per_second": 22.971, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "learning_rate": 1.4085954531298663e-06, | |
| "loss": 2.6187, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "learning_rate": 1.3774525070071629e-06, | |
| "loss": 2.2137, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "learning_rate": 1.3463095608844598e-06, | |
| "loss": 2.5718, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "learning_rate": 1.3151666147617564e-06, | |
| "loss": 2.3102, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "learning_rate": 1.2840236686390533e-06, | |
| "loss": 2.5802, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.0317, | |
| "eval_samples_per_second": 183.846, | |
| "eval_steps_per_second": 22.981, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "learning_rate": 1.25288072251635e-06, | |
| "loss": 3.2864, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "learning_rate": 1.221737776393647e-06, | |
| "loss": 2.4009, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "learning_rate": 1.1905948302709438e-06, | |
| "loss": 2.2293, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "learning_rate": 1.1594518841482405e-06, | |
| "loss": 2.5808, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "learning_rate": 1.1283089380255373e-06, | |
| "loss": 2.2956, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.0354, | |
| "eval_samples_per_second": 183.846, | |
| "eval_steps_per_second": 22.981, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "learning_rate": 1.097165991902834e-06, | |
| "loss": 2.6273, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "learning_rate": 1.0660230457801308e-06, | |
| "loss": 2.5466, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "learning_rate": 1.0348800996574275e-06, | |
| "loss": 2.7805, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "learning_rate": 1.0037371535347245e-06, | |
| "loss": 2.4265, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "learning_rate": 9.725942074120212e-07, | |
| "loss": 2.6872, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1241.1816, | |
| "eval_samples_per_second": 183.972, | |
| "eval_steps_per_second": 22.997, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "learning_rate": 9.41451261289318e-07, | |
| "loss": 2.8077, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "learning_rate": 9.103083151666147e-07, | |
| "loss": 2.7051, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "learning_rate": 8.791653690439116e-07, | |
| "loss": 2.6449, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "learning_rate": 8.480224229212085e-07, | |
| "loss": 2.2203, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "learning_rate": 8.168794767985053e-07, | |
| "loss": 2.7376, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.2627, | |
| "eval_samples_per_second": 183.812, | |
| "eval_steps_per_second": 22.977, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "learning_rate": 7.857365306758021e-07, | |
| "loss": 2.3105, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "learning_rate": 7.545935845530988e-07, | |
| "loss": 3.303, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "learning_rate": 7.234506384303956e-07, | |
| "loss": 2.6327, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "learning_rate": 6.923076923076924e-07, | |
| "loss": 2.4727, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "learning_rate": 6.611647461849892e-07, | |
| "loss": 2.5736, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.5556, | |
| "eval_samples_per_second": 183.769, | |
| "eval_steps_per_second": 22.971, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "learning_rate": 6.300218000622859e-07, | |
| "loss": 2.687, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "learning_rate": 5.988788539395828e-07, | |
| "loss": 2.664, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "learning_rate": 5.677359078168795e-07, | |
| "loss": 2.4285, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "learning_rate": 5.369043911554033e-07, | |
| "loss": 2.4898, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "learning_rate": 5.057614450327001e-07, | |
| "loss": 2.3551, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.2664, | |
| "eval_samples_per_second": 183.812, | |
| "eval_steps_per_second": 22.977, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "learning_rate": 4.7461849890999693e-07, | |
| "loss": 2.8719, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "learning_rate": 4.4347555278729373e-07, | |
| "loss": 2.6584, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "learning_rate": 4.1233260666459054e-07, | |
| "loss": 2.3366, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "learning_rate": 3.811896605418873e-07, | |
| "loss": 2.8612, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "learning_rate": 3.500467144191841e-07, | |
| "loss": 2.7175, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.8604, | |
| "eval_samples_per_second": 183.724, | |
| "eval_steps_per_second": 22.966, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "learning_rate": 3.189037682964809e-07, | |
| "loss": 2.9036, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "learning_rate": 2.8776082217377764e-07, | |
| "loss": 2.8631, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "learning_rate": 2.5661787605107445e-07, | |
| "loss": 2.4594, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "learning_rate": 2.2547492992837125e-07, | |
| "loss": 2.9569, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "learning_rate": 1.9433198380566805e-07, | |
| "loss": 3.0516, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1243.9849, | |
| "eval_samples_per_second": 183.558, | |
| "eval_steps_per_second": 22.945, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "learning_rate": 1.6318903768296483e-07, | |
| "loss": 2.529, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "learning_rate": 1.320460915602616e-07, | |
| "loss": 2.4768, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "learning_rate": 1.009031454375584e-07, | |
| "loss": 2.5839, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "learning_rate": 6.97601993148552e-08, | |
| "loss": 2.6676, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "learning_rate": 3.861725319215198e-08, | |
| "loss": 3.2724, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "eval_loss": NaN, | |
| "eval_runtime": 1242.7853, | |
| "eval_samples_per_second": 183.735, | |
| "eval_steps_per_second": 22.967, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "learning_rate": 7.47430706944877e-09, | |
| "loss": 2.7629, | |
| "step": 32100 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 32110, | |
| "num_train_epochs": 2, | |
| "save_steps": 100, | |
| "total_flos": 1.0814548820133888e+18, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |