{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.16326530612245, "eval_steps": 500, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.20408163265306123, "grad_norm": 11.627036094665527, "learning_rate": 4.897959183673469e-06, "loss": 0.8836, "step": 500 }, { "epoch": 0.20408163265306123, "eval_accuracy": 0.81, "eval_loss": 0.5058887004852295, "eval_runtime": 0.9154, "eval_samples_per_second": 218.492, "eval_steps_per_second": 27.312, "step": 500 }, { "epoch": 0.40816326530612246, "grad_norm": 12.688506126403809, "learning_rate": 4.795918367346939e-06, "loss": 0.5443, "step": 1000 }, { "epoch": 0.40816326530612246, "eval_accuracy": 0.82, "eval_loss": 0.4671895503997803, "eval_runtime": 0.9055, "eval_samples_per_second": 220.882, "eval_steps_per_second": 27.61, "step": 1000 }, { "epoch": 0.6122448979591837, "grad_norm": 13.827730178833008, "learning_rate": 4.693877551020409e-06, "loss": 0.515, "step": 1500 }, { "epoch": 0.6122448979591837, "eval_accuracy": 0.81, "eval_loss": 0.46435338258743286, "eval_runtime": 0.9074, "eval_samples_per_second": 220.422, "eval_steps_per_second": 27.553, "step": 1500 }, { "epoch": 0.8163265306122449, "grad_norm": 25.906373977661133, "learning_rate": 4.591836734693878e-06, "loss": 0.4944, "step": 2000 }, { "epoch": 0.8163265306122449, "eval_accuracy": 0.82, "eval_loss": 0.4432211220264435, "eval_runtime": 0.9023, "eval_samples_per_second": 221.663, "eval_steps_per_second": 27.708, "step": 2000 }, { "epoch": 1.0204081632653061, "grad_norm": 7.966475009918213, "learning_rate": 4.489795918367348e-06, "loss": 0.4552, "step": 2500 }, { "epoch": 1.0204081632653061, "eval_accuracy": 0.825, "eval_loss": 0.4824766516685486, "eval_runtime": 0.9074, "eval_samples_per_second": 220.402, "eval_steps_per_second": 27.55, "step": 2500 }, { "epoch": 1.2244897959183674, "grad_norm": 7.073498725891113, "learning_rate": 4.3877551020408165e-06, "loss": 0.3843, "step": 3000 }, { "epoch": 1.2244897959183674, "eval_accuracy": 0.82, "eval_loss": 0.4813213348388672, "eval_runtime": 0.9063, "eval_samples_per_second": 220.671, "eval_steps_per_second": 27.584, "step": 3000 }, { "epoch": 1.4285714285714286, "grad_norm": 31.893320083618164, "learning_rate": 4.2857142857142855e-06, "loss": 0.4123, "step": 3500 }, { "epoch": 1.4285714285714286, "eval_accuracy": 0.84, "eval_loss": 0.4781247079372406, "eval_runtime": 0.9101, "eval_samples_per_second": 219.764, "eval_steps_per_second": 27.471, "step": 3500 }, { "epoch": 1.6326530612244898, "grad_norm": 12.582639694213867, "learning_rate": 4.183673469387755e-06, "loss": 0.4158, "step": 4000 }, { "epoch": 1.6326530612244898, "eval_accuracy": 0.845, "eval_loss": 0.447386771440506, "eval_runtime": 0.9126, "eval_samples_per_second": 219.165, "eval_steps_per_second": 27.396, "step": 4000 }, { "epoch": 1.836734693877551, "grad_norm": 46.914730072021484, "learning_rate": 4.081632653061225e-06, "loss": 0.3748, "step": 4500 }, { "epoch": 1.836734693877551, "eval_accuracy": 0.845, "eval_loss": 0.45318862795829773, "eval_runtime": 0.9051, "eval_samples_per_second": 220.963, "eval_steps_per_second": 27.62, "step": 4500 }, { "epoch": 2.0408163265306123, "grad_norm": 36.80206298828125, "learning_rate": 3.979591836734694e-06, "loss": 0.3714, "step": 5000 }, { "epoch": 2.0408163265306123, "eval_accuracy": 0.85, "eval_loss": 0.4807853698730469, "eval_runtime": 0.9043, "eval_samples_per_second": 221.159, "eval_steps_per_second": 27.645, "step": 5000 }, { "epoch": 2.2448979591836733, "grad_norm": 28.757780075073242, "learning_rate": 3.877551020408164e-06, "loss": 0.3077, "step": 5500 }, { "epoch": 2.2448979591836733, "eval_accuracy": 0.86, "eval_loss": 0.4820670187473297, "eval_runtime": 0.9085, "eval_samples_per_second": 220.139, "eval_steps_per_second": 27.517, "step": 5500 }, { "epoch": 2.4489795918367347, "grad_norm": 4.862767219543457, "learning_rate": 3.7755102040816327e-06, "loss": 0.3405, "step": 6000 }, { "epoch": 2.4489795918367347, "eval_accuracy": 0.85, "eval_loss": 0.555472731590271, "eval_runtime": 0.9092, "eval_samples_per_second": 219.968, "eval_steps_per_second": 27.496, "step": 6000 }, { "epoch": 2.6530612244897958, "grad_norm": 24.65875816345215, "learning_rate": 3.6734693877551024e-06, "loss": 0.322, "step": 6500 }, { "epoch": 2.6530612244897958, "eval_accuracy": 0.855, "eval_loss": 0.5128615498542786, "eval_runtime": 0.904, "eval_samples_per_second": 221.229, "eval_steps_per_second": 27.654, "step": 6500 }, { "epoch": 2.857142857142857, "grad_norm": 28.109399795532227, "learning_rate": 3.5714285714285718e-06, "loss": 0.3269, "step": 7000 }, { "epoch": 2.857142857142857, "eval_accuracy": 0.85, "eval_loss": 0.4487631916999817, "eval_runtime": 0.9021, "eval_samples_per_second": 221.702, "eval_steps_per_second": 27.713, "step": 7000 }, { "epoch": 3.061224489795918, "grad_norm": 77.6505126953125, "learning_rate": 3.469387755102041e-06, "loss": 0.303, "step": 7500 }, { "epoch": 3.061224489795918, "eval_accuracy": 0.86, "eval_loss": 0.44085273146629333, "eval_runtime": 0.9115, "eval_samples_per_second": 219.427, "eval_steps_per_second": 27.428, "step": 7500 }, { "epoch": 3.2653061224489797, "grad_norm": 11.312287330627441, "learning_rate": 3.3673469387755105e-06, "loss": 0.2502, "step": 8000 }, { "epoch": 3.2653061224489797, "eval_accuracy": 0.85, "eval_loss": 0.49349740147590637, "eval_runtime": 0.9089, "eval_samples_per_second": 220.055, "eval_steps_per_second": 27.507, "step": 8000 }, { "epoch": 3.4693877551020407, "grad_norm": 17.491943359375, "learning_rate": 3.2653061224489794e-06, "loss": 0.2597, "step": 8500 }, { "epoch": 3.4693877551020407, "eval_accuracy": 0.86, "eval_loss": 0.5523408651351929, "eval_runtime": 0.9044, "eval_samples_per_second": 221.135, "eval_steps_per_second": 27.642, "step": 8500 }, { "epoch": 3.673469387755102, "grad_norm": 29.348148345947266, "learning_rate": 3.1632653061224496e-06, "loss": 0.2841, "step": 9000 }, { "epoch": 3.673469387755102, "eval_accuracy": 0.855, "eval_loss": 0.5676430463790894, "eval_runtime": 0.9151, "eval_samples_per_second": 218.544, "eval_steps_per_second": 27.318, "step": 9000 }, { "epoch": 3.877551020408163, "grad_norm": 18.88979148864746, "learning_rate": 3.0612244897959185e-06, "loss": 0.2907, "step": 9500 }, { "epoch": 3.877551020408163, "eval_accuracy": 0.865, "eval_loss": 0.5422011017799377, "eval_runtime": 0.9097, "eval_samples_per_second": 219.847, "eval_steps_per_second": 27.481, "step": 9500 }, { "epoch": 4.081632653061225, "grad_norm": 9.325241088867188, "learning_rate": 2.959183673469388e-06, "loss": 0.2521, "step": 10000 }, { "epoch": 4.081632653061225, "eval_accuracy": 0.875, "eval_loss": 0.49767935276031494, "eval_runtime": 0.9168, "eval_samples_per_second": 218.148, "eval_steps_per_second": 27.268, "step": 10000 }, { "epoch": 4.285714285714286, "grad_norm": 22.26582908630371, "learning_rate": 2.8571428571428573e-06, "loss": 0.2369, "step": 10500 }, { "epoch": 4.285714285714286, "eval_accuracy": 0.885, "eval_loss": 0.46531933546066284, "eval_runtime": 0.9131, "eval_samples_per_second": 219.037, "eval_steps_per_second": 27.38, "step": 10500 }, { "epoch": 4.489795918367347, "grad_norm": 21.525835037231445, "learning_rate": 2.7551020408163266e-06, "loss": 0.2312, "step": 11000 }, { "epoch": 4.489795918367347, "eval_accuracy": 0.875, "eval_loss": 0.5096437335014343, "eval_runtime": 0.9046, "eval_samples_per_second": 221.092, "eval_steps_per_second": 27.637, "step": 11000 }, { "epoch": 4.6938775510204085, "grad_norm": 24.357847213745117, "learning_rate": 2.6530612244897964e-06, "loss": 0.2232, "step": 11500 }, { "epoch": 4.6938775510204085, "eval_accuracy": 0.865, "eval_loss": 0.5319286584854126, "eval_runtime": 0.9091, "eval_samples_per_second": 220.002, "eval_steps_per_second": 27.5, "step": 11500 }, { "epoch": 4.8979591836734695, "grad_norm": 25.76366424560547, "learning_rate": 2.5510204081632657e-06, "loss": 0.2463, "step": 12000 }, { "epoch": 4.8979591836734695, "eval_accuracy": 0.87, "eval_loss": 0.5263551473617554, "eval_runtime": 0.9024, "eval_samples_per_second": 221.621, "eval_steps_per_second": 27.703, "step": 12000 }, { "epoch": 5.1020408163265305, "grad_norm": 9.35624885559082, "learning_rate": 2.4489795918367347e-06, "loss": 0.2163, "step": 12500 }, { "epoch": 5.1020408163265305, "eval_accuracy": 0.865, "eval_loss": 0.5473060607910156, "eval_runtime": 0.9093, "eval_samples_per_second": 219.942, "eval_steps_per_second": 27.493, "step": 12500 }, { "epoch": 5.3061224489795915, "grad_norm": 1.3547799587249756, "learning_rate": 2.3469387755102044e-06, "loss": 0.202, "step": 13000 }, { "epoch": 5.3061224489795915, "eval_accuracy": 0.87, "eval_loss": 0.5721442699432373, "eval_runtime": 0.9064, "eval_samples_per_second": 220.652, "eval_steps_per_second": 27.581, "step": 13000 }, { "epoch": 5.510204081632653, "grad_norm": 48.84397888183594, "learning_rate": 2.244897959183674e-06, "loss": 0.2078, "step": 13500 }, { "epoch": 5.510204081632653, "eval_accuracy": 0.865, "eval_loss": 0.5391547679901123, "eval_runtime": 0.9112, "eval_samples_per_second": 219.49, "eval_steps_per_second": 27.436, "step": 13500 }, { "epoch": 5.714285714285714, "grad_norm": 22.743989944458008, "learning_rate": 2.1428571428571427e-06, "loss": 0.1847, "step": 14000 }, { "epoch": 5.714285714285714, "eval_accuracy": 0.88, "eval_loss": 0.5364116430282593, "eval_runtime": 0.9101, "eval_samples_per_second": 219.751, "eval_steps_per_second": 27.469, "step": 14000 }, { "epoch": 5.918367346938775, "grad_norm": 2.1184639930725098, "learning_rate": 2.0408163265306125e-06, "loss": 0.1996, "step": 14500 }, { "epoch": 5.918367346938775, "eval_accuracy": 0.865, "eval_loss": 0.5743934512138367, "eval_runtime": 0.9196, "eval_samples_per_second": 217.49, "eval_steps_per_second": 27.186, "step": 14500 }, { "epoch": 6.122448979591836, "grad_norm": 28.888248443603516, "learning_rate": 1.938775510204082e-06, "loss": 0.1816, "step": 15000 }, { "epoch": 6.122448979591836, "eval_accuracy": 0.865, "eval_loss": 0.6048715114593506, "eval_runtime": 0.9105, "eval_samples_per_second": 219.667, "eval_steps_per_second": 27.458, "step": 15000 }, { "epoch": 6.326530612244898, "grad_norm": 12.823907852172852, "learning_rate": 1.8367346938775512e-06, "loss": 0.1717, "step": 15500 }, { "epoch": 6.326530612244898, "eval_accuracy": 0.86, "eval_loss": 0.5824536681175232, "eval_runtime": 0.9038, "eval_samples_per_second": 221.293, "eval_steps_per_second": 27.662, "step": 15500 }, { "epoch": 6.530612244897959, "grad_norm": 26.713756561279297, "learning_rate": 1.7346938775510206e-06, "loss": 0.1738, "step": 16000 }, { "epoch": 6.530612244897959, "eval_accuracy": 0.85, "eval_loss": 0.5947970747947693, "eval_runtime": 0.9197, "eval_samples_per_second": 217.458, "eval_steps_per_second": 27.182, "step": 16000 }, { "epoch": 6.73469387755102, "grad_norm": 9.862221717834473, "learning_rate": 1.6326530612244897e-06, "loss": 0.1614, "step": 16500 }, { "epoch": 6.73469387755102, "eval_accuracy": 0.87, "eval_loss": 0.5696790218353271, "eval_runtime": 0.9031, "eval_samples_per_second": 221.45, "eval_steps_per_second": 27.681, "step": 16500 }, { "epoch": 6.938775510204081, "grad_norm": 3.705087423324585, "learning_rate": 1.5306122448979593e-06, "loss": 0.1699, "step": 17000 }, { "epoch": 6.938775510204081, "eval_accuracy": 0.875, "eval_loss": 0.6074076294898987, "eval_runtime": 0.9047, "eval_samples_per_second": 221.062, "eval_steps_per_second": 27.633, "step": 17000 }, { "epoch": 7.142857142857143, "grad_norm": 10.12613582611084, "learning_rate": 1.4285714285714286e-06, "loss": 0.1616, "step": 17500 }, { "epoch": 7.142857142857143, "eval_accuracy": 0.865, "eval_loss": 0.6130539774894714, "eval_runtime": 0.9083, "eval_samples_per_second": 220.181, "eval_steps_per_second": 27.523, "step": 17500 }, { "epoch": 7.346938775510204, "grad_norm": 0.138712540268898, "learning_rate": 1.3265306122448982e-06, "loss": 0.1519, "step": 18000 }, { "epoch": 7.346938775510204, "eval_accuracy": 0.865, "eval_loss": 0.5894231200218201, "eval_runtime": 0.9116, "eval_samples_per_second": 219.392, "eval_steps_per_second": 27.424, "step": 18000 }, { "epoch": 7.551020408163265, "grad_norm": 34.15790557861328, "learning_rate": 1.2244897959183673e-06, "loss": 0.1631, "step": 18500 }, { "epoch": 7.551020408163265, "eval_accuracy": 0.885, "eval_loss": 0.5699704885482788, "eval_runtime": 0.9023, "eval_samples_per_second": 221.661, "eval_steps_per_second": 27.708, "step": 18500 }, { "epoch": 7.755102040816326, "grad_norm": 17.932504653930664, "learning_rate": 1.122448979591837e-06, "loss": 0.1353, "step": 19000 }, { "epoch": 7.755102040816326, "eval_accuracy": 0.865, "eval_loss": 0.6146803498268127, "eval_runtime": 0.922, "eval_samples_per_second": 216.922, "eval_steps_per_second": 27.115, "step": 19000 }, { "epoch": 7.959183673469388, "grad_norm": 47.22719192504883, "learning_rate": 1.0204081632653063e-06, "loss": 0.1617, "step": 19500 }, { "epoch": 7.959183673469388, "eval_accuracy": 0.87, "eval_loss": 0.5830011963844299, "eval_runtime": 0.9078, "eval_samples_per_second": 220.317, "eval_steps_per_second": 27.54, "step": 19500 }, { "epoch": 8.16326530612245, "grad_norm": 14.763110160827637, "learning_rate": 9.183673469387756e-07, "loss": 0.1374, "step": 20000 }, { "epoch": 8.16326530612245, "eval_accuracy": 0.865, "eval_loss": 0.5952904224395752, "eval_runtime": 0.911, "eval_samples_per_second": 219.542, "eval_steps_per_second": 27.443, "step": 20000 } ], "logging_steps": 500, "max_steps": 24500, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, "total_flos": 2.119553974272e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }