| { | |
| "best_metric": 0.6181166768074036, | |
| "best_model_checkpoint": "./id1/checkpoint-110000", | |
| "epoch": 6.0, | |
| "eval_steps": 10000, | |
| "global_step": 113010, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.05, | |
| "learning_rate": 0.00019823024511105212, | |
| "loss": 0.6956, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "learning_rate": 0.0001964622599769932, | |
| "loss": 0.6938, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "learning_rate": 0.0001946925050880453, | |
| "loss": 0.6937, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "learning_rate": 0.00019292275019909742, | |
| "loss": 0.6935, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "learning_rate": 0.00019115299531014955, | |
| "loss": 0.6933, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "learning_rate": 0.0001893832404212017, | |
| "loss": 0.6932, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "learning_rate": 0.00018761525528714274, | |
| "loss": 0.6933, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "learning_rate": 0.00018584550039819485, | |
| "loss": 0.6933, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "learning_rate": 0.000184075745509247, | |
| "loss": 0.6933, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "learning_rate": 0.0001823059906202991, | |
| "loss": 0.6933, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "eval_accuracy": 0.5007803538857862, | |
| "eval_loss": 0.6931997537612915, | |
| "eval_runtime": 359.7161, | |
| "eval_samples_per_second": 147.842, | |
| "eval_steps_per_second": 18.481, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "learning_rate": 0.00018053800548624015, | |
| "loss": 0.6933, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "learning_rate": 0.0001787682505972923, | |
| "loss": 0.6933, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "learning_rate": 0.00017700026546323337, | |
| "loss": 0.6932, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "learning_rate": 0.00017523051057428547, | |
| "loss": 0.6933, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "learning_rate": 0.00017346252544022655, | |
| "loss": 0.6933, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "learning_rate": 0.00017169277055127866, | |
| "loss": 0.6932, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "learning_rate": 0.00016992478541721971, | |
| "loss": 0.6933, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "learning_rate": 0.00016815503052827185, | |
| "loss": 0.6932, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "learning_rate": 0.0001663870453942129, | |
| "loss": 0.6932, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "learning_rate": 0.000164617290505265, | |
| "loss": 0.6933, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "eval_accuracy": 0.4992196461142137, | |
| "eval_loss": 0.6932681202888489, | |
| "eval_runtime": 360.8474, | |
| "eval_samples_per_second": 147.378, | |
| "eval_steps_per_second": 18.423, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "learning_rate": 0.00016284930537120612, | |
| "loss": 0.6931, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "learning_rate": 0.00016107955048225823, | |
| "loss": 0.6933, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "learning_rate": 0.00015931156534819928, | |
| "loss": 0.6932, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "learning_rate": 0.00015754181045925142, | |
| "loss": 0.6932, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "learning_rate": 0.00015577382532519247, | |
| "loss": 0.6932, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "learning_rate": 0.00015400407043624458, | |
| "loss": 0.6931, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "learning_rate": 0.00015223608530218565, | |
| "loss": 0.6934, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "learning_rate": 0.00015046633041323776, | |
| "loss": 0.6932, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "learning_rate": 0.00014869834527917884, | |
| "loss": 0.6933, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "learning_rate": 0.00014692859039023098, | |
| "loss": 0.6933, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "eval_accuracy": 0.5007803538857862, | |
| "eval_loss": 0.6931193470954895, | |
| "eval_runtime": 356.4083, | |
| "eval_samples_per_second": 149.214, | |
| "eval_steps_per_second": 18.653, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "learning_rate": 0.00014516060525617203, | |
| "loss": 0.6932, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "learning_rate": 0.00014339085036722414, | |
| "loss": 0.6932, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "learning_rate": 0.00014162109547827628, | |
| "loss": 0.6932, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "learning_rate": 0.0001398513405893284, | |
| "loss": 0.6932, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "learning_rate": 0.00013808335545526944, | |
| "loss": 0.6932, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "learning_rate": 0.00013631360056632158, | |
| "loss": 0.6932, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "learning_rate": 0.00013454561543226263, | |
| "loss": 0.6931, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "learning_rate": 0.00013277586054331476, | |
| "loss": 0.6932, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "learning_rate": 0.00013100787540925584, | |
| "loss": 0.6932, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "learning_rate": 0.00012923812052030795, | |
| "loss": 0.6933, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "eval_accuracy": 0.5161429833963258, | |
| "eval_loss": 0.6930696368217468, | |
| "eval_runtime": 359.7266, | |
| "eval_samples_per_second": 147.837, | |
| "eval_steps_per_second": 18.481, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "learning_rate": 0.000127470135386249, | |
| "loss": 0.6932, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "learning_rate": 0.00012570038049730114, | |
| "loss": 0.6932, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "learning_rate": 0.0001239323953632422, | |
| "loss": 0.693, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "learning_rate": 0.0001221626404742943, | |
| "loss": 0.6932, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "learning_rate": 0.00012039465534023538, | |
| "loss": 0.6931, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "learning_rate": 0.00011862490045128752, | |
| "loss": 0.6932, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "learning_rate": 0.00011685691531722858, | |
| "loss": 0.6931, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "learning_rate": 0.00011508716042828069, | |
| "loss": 0.6932, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "learning_rate": 0.00011331917529422176, | |
| "loss": 0.6932, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "learning_rate": 0.00011154942040527388, | |
| "loss": 0.6931, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "eval_accuracy": 0.4991068238656663, | |
| "eval_loss": 0.6932998895645142, | |
| "eval_runtime": 359.6843, | |
| "eval_samples_per_second": 147.855, | |
| "eval_steps_per_second": 18.483, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "learning_rate": 0.00010978143527121494, | |
| "loss": 0.6931, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "learning_rate": 0.00010801168038226705, | |
| "loss": 0.6931, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "learning_rate": 0.00010624369524820812, | |
| "loss": 0.6929, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "learning_rate": 0.00010447571011414921, | |
| "loss": 0.6931, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "learning_rate": 0.00010270595522520132, | |
| "loss": 0.6926, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "learning_rate": 0.00010093620033625344, | |
| "loss": 0.6914, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 3.03, | |
| "learning_rate": 9.916644544730555e-05, | |
| "loss": 0.6929, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "learning_rate": 9.740023006813557e-05, | |
| "loss": 0.6932, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 3.13, | |
| "learning_rate": 9.563578444385452e-05, | |
| "loss": 0.693, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 3.19, | |
| "learning_rate": 9.386602955490666e-05, | |
| "loss": 0.6932, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 3.19, | |
| "eval_accuracy": 0.4991068238656663, | |
| "eval_loss": 0.6932001709938049, | |
| "eval_runtime": 363.5383, | |
| "eval_samples_per_second": 146.287, | |
| "eval_steps_per_second": 18.287, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 3.24, | |
| "learning_rate": 9.209627466595877e-05, | |
| "loss": 0.693, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 3.29, | |
| "learning_rate": 9.032651977701089e-05, | |
| "loss": 0.6931, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 3.34, | |
| "learning_rate": 8.8556764888063e-05, | |
| "loss": 0.6912, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "learning_rate": 8.678700999911514e-05, | |
| "loss": 0.6923, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 3.45, | |
| "learning_rate": 8.50190248650562e-05, | |
| "loss": 0.6864, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "learning_rate": 8.324926997610831e-05, | |
| "loss": 0.6833, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "learning_rate": 8.148128484204938e-05, | |
| "loss": 0.6827, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 3.61, | |
| "learning_rate": 7.97115299531015e-05, | |
| "loss": 0.6778, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 3.66, | |
| "learning_rate": 7.794177506415362e-05, | |
| "loss": 0.6766, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 3.72, | |
| "learning_rate": 7.617202017520573e-05, | |
| "loss": 0.6746, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 3.72, | |
| "eval_accuracy": 0.5796054982042459, | |
| "eval_loss": 0.6725409030914307, | |
| "eval_runtime": 362.6905, | |
| "eval_samples_per_second": 146.629, | |
| "eval_steps_per_second": 18.33, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 3.77, | |
| "learning_rate": 7.440226528625786e-05, | |
| "loss": 0.6729, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 3.82, | |
| "learning_rate": 7.263251039730997e-05, | |
| "loss": 0.6683, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "learning_rate": 7.08627555083621e-05, | |
| "loss": 0.6649, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 3.93, | |
| "learning_rate": 6.909300061941421e-05, | |
| "loss": 0.666, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 3.98, | |
| "learning_rate": 6.732324573046634e-05, | |
| "loss": 0.6647, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 4.04, | |
| "learning_rate": 6.555349084151845e-05, | |
| "loss": 0.6674, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 4.09, | |
| "learning_rate": 6.378550570745952e-05, | |
| "loss": 0.6645, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 4.14, | |
| "learning_rate": 6.201575081851165e-05, | |
| "loss": 0.6644, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 4.19, | |
| "learning_rate": 6.024599592956376e-05, | |
| "loss": 0.6605, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 4.25, | |
| "learning_rate": 5.847624104061587e-05, | |
| "loss": 0.6582, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 4.25, | |
| "eval_accuracy": 0.6032229555668378, | |
| "eval_loss": 0.6613653898239136, | |
| "eval_runtime": 359.5282, | |
| "eval_samples_per_second": 147.919, | |
| "eval_steps_per_second": 18.491, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 4.3, | |
| "learning_rate": 5.67100256614459e-05, | |
| "loss": 0.6565, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 4.35, | |
| "learning_rate": 5.4942040527386965e-05, | |
| "loss": 0.6561, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 4.41, | |
| "learning_rate": 5.317228563843908e-05, | |
| "loss": 0.6562, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 4.46, | |
| "learning_rate": 5.14025307494912e-05, | |
| "loss": 0.6558, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 4.51, | |
| "learning_rate": 4.963277586054332e-05, | |
| "loss": 0.6553, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 4.57, | |
| "learning_rate": 4.786302097159544e-05, | |
| "loss": 0.652, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 4.62, | |
| "learning_rate": 4.60950358375365e-05, | |
| "loss": 0.6519, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 4.67, | |
| "learning_rate": 4.432528094858862e-05, | |
| "loss": 0.6475, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 4.73, | |
| "learning_rate": 4.255552605964074e-05, | |
| "loss": 0.6517, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 4.78, | |
| "learning_rate": 4.078754092558181e-05, | |
| "loss": 0.6455, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 4.78, | |
| "eval_accuracy": 0.6132265282713751, | |
| "eval_loss": 0.6466211676597595, | |
| "eval_runtime": 358.035, | |
| "eval_samples_per_second": 148.536, | |
| "eval_steps_per_second": 18.568, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 4.83, | |
| "learning_rate": 3.901778603663393e-05, | |
| "loss": 0.6411, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 4.88, | |
| "learning_rate": 3.7251570657463943e-05, | |
| "loss": 0.6418, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 4.94, | |
| "learning_rate": 3.548181576851607e-05, | |
| "loss": 0.6419, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 4.99, | |
| "learning_rate": 3.371206087956818e-05, | |
| "loss": 0.6403, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 5.04, | |
| "learning_rate": 3.194407574550925e-05, | |
| "loss": 0.6358, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 5.1, | |
| "learning_rate": 3.0176090611450313e-05, | |
| "loss": 0.633, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 5.15, | |
| "learning_rate": 2.8406335722502436e-05, | |
| "loss": 0.6315, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 5.2, | |
| "learning_rate": 2.6636580833554553e-05, | |
| "loss": 0.6315, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 5.26, | |
| "learning_rate": 2.4866825944606673e-05, | |
| "loss": 0.6284, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 5.31, | |
| "learning_rate": 2.309884081054774e-05, | |
| "loss": 0.6256, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 5.31, | |
| "eval_accuracy": 0.6391004306049153, | |
| "eval_loss": 0.6324899196624756, | |
| "eval_runtime": 358.7004, | |
| "eval_samples_per_second": 148.26, | |
| "eval_steps_per_second": 18.534, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 5.36, | |
| "learning_rate": 2.1329085921599857e-05, | |
| "loss": 0.6267, | |
| "step": 101000 | |
| }, | |
| { | |
| "epoch": 5.42, | |
| "learning_rate": 1.9561100787540926e-05, | |
| "loss": 0.6261, | |
| "step": 102000 | |
| }, | |
| { | |
| "epoch": 5.47, | |
| "learning_rate": 1.7791345898593046e-05, | |
| "loss": 0.6263, | |
| "step": 103000 | |
| }, | |
| { | |
| "epoch": 5.52, | |
| "learning_rate": 1.6021591009645165e-05, | |
| "loss": 0.6233, | |
| "step": 104000 | |
| }, | |
| { | |
| "epoch": 5.57, | |
| "learning_rate": 1.4253605875586232e-05, | |
| "loss": 0.6206, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 5.63, | |
| "learning_rate": 1.2483850986638352e-05, | |
| "loss": 0.6205, | |
| "step": 106000 | |
| }, | |
| { | |
| "epoch": 5.68, | |
| "learning_rate": 1.071409609769047e-05, | |
| "loss": 0.6173, | |
| "step": 107000 | |
| }, | |
| { | |
| "epoch": 5.73, | |
| "learning_rate": 8.944341208742588e-06, | |
| "loss": 0.6188, | |
| "step": 108000 | |
| }, | |
| { | |
| "epoch": 5.79, | |
| "learning_rate": 7.178125829572605e-06, | |
| "loss": 0.6193, | |
| "step": 109000 | |
| }, | |
| { | |
| "epoch": 5.84, | |
| "learning_rate": 5.408370940624724e-06, | |
| "loss": 0.6144, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 5.84, | |
| "eval_accuracy": 0.6535040710028017, | |
| "eval_loss": 0.6181166768074036, | |
| "eval_runtime": 370.6733, | |
| "eval_samples_per_second": 143.471, | |
| "eval_steps_per_second": 17.935, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 5.89, | |
| "learning_rate": 3.638616051676843e-06, | |
| "loss": 0.6189, | |
| "step": 111000 | |
| }, | |
| { | |
| "epoch": 5.95, | |
| "learning_rate": 1.8688611627289622e-06, | |
| "loss": 0.6177, | |
| "step": 112000 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "learning_rate": 9.910627378108132e-08, | |
| "loss": 0.6149, | |
| "step": 113000 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "step": 113010, | |
| "total_flos": 1.40117450989583e+20, | |
| "train_loss": 0.6729164371932674, | |
| "train_runtime": 20561.1299, | |
| "train_samples_per_second": 87.94, | |
| "train_steps_per_second": 5.496 | |
| } | |
| ], | |
| "logging_steps": 1000, | |
| "max_steps": 113010, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 6, | |
| "save_steps": 10000, | |
| "total_flos": 1.40117450989583e+20, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |