| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 8.16326530612245, |
| "eval_steps": 500, |
| "global_step": 20000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.20408163265306123, |
| "grad_norm": 11.627036094665527, |
| "learning_rate": 4.897959183673469e-06, |
| "loss": 0.8836, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.20408163265306123, |
| "eval_accuracy": 0.81, |
| "eval_loss": 0.5058887004852295, |
| "eval_runtime": 0.9154, |
| "eval_samples_per_second": 218.492, |
| "eval_steps_per_second": 27.312, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.40816326530612246, |
| "grad_norm": 12.688506126403809, |
| "learning_rate": 4.795918367346939e-06, |
| "loss": 0.5443, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.40816326530612246, |
| "eval_accuracy": 0.82, |
| "eval_loss": 0.4671895503997803, |
| "eval_runtime": 0.9055, |
| "eval_samples_per_second": 220.882, |
| "eval_steps_per_second": 27.61, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.6122448979591837, |
| "grad_norm": 13.827730178833008, |
| "learning_rate": 4.693877551020409e-06, |
| "loss": 0.515, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.6122448979591837, |
| "eval_accuracy": 0.81, |
| "eval_loss": 0.46435338258743286, |
| "eval_runtime": 0.9074, |
| "eval_samples_per_second": 220.422, |
| "eval_steps_per_second": 27.553, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.8163265306122449, |
| "grad_norm": 25.906373977661133, |
| "learning_rate": 4.591836734693878e-06, |
| "loss": 0.4944, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.8163265306122449, |
| "eval_accuracy": 0.82, |
| "eval_loss": 0.4432211220264435, |
| "eval_runtime": 0.9023, |
| "eval_samples_per_second": 221.663, |
| "eval_steps_per_second": 27.708, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.0204081632653061, |
| "grad_norm": 7.966475009918213, |
| "learning_rate": 4.489795918367348e-06, |
| "loss": 0.4552, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.0204081632653061, |
| "eval_accuracy": 0.825, |
| "eval_loss": 0.4824766516685486, |
| "eval_runtime": 0.9074, |
| "eval_samples_per_second": 220.402, |
| "eval_steps_per_second": 27.55, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.2244897959183674, |
| "grad_norm": 7.073498725891113, |
| "learning_rate": 4.3877551020408165e-06, |
| "loss": 0.3843, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.2244897959183674, |
| "eval_accuracy": 0.82, |
| "eval_loss": 0.4813213348388672, |
| "eval_runtime": 0.9063, |
| "eval_samples_per_second": 220.671, |
| "eval_steps_per_second": 27.584, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.4285714285714286, |
| "grad_norm": 31.893320083618164, |
| "learning_rate": 4.2857142857142855e-06, |
| "loss": 0.4123, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.4285714285714286, |
| "eval_accuracy": 0.84, |
| "eval_loss": 0.4781247079372406, |
| "eval_runtime": 0.9101, |
| "eval_samples_per_second": 219.764, |
| "eval_steps_per_second": 27.471, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.6326530612244898, |
| "grad_norm": 12.582639694213867, |
| "learning_rate": 4.183673469387755e-06, |
| "loss": 0.4158, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.6326530612244898, |
| "eval_accuracy": 0.845, |
| "eval_loss": 0.447386771440506, |
| "eval_runtime": 0.9126, |
| "eval_samples_per_second": 219.165, |
| "eval_steps_per_second": 27.396, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.836734693877551, |
| "grad_norm": 46.914730072021484, |
| "learning_rate": 4.081632653061225e-06, |
| "loss": 0.3748, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.836734693877551, |
| "eval_accuracy": 0.845, |
| "eval_loss": 0.45318862795829773, |
| "eval_runtime": 0.9051, |
| "eval_samples_per_second": 220.963, |
| "eval_steps_per_second": 27.62, |
| "step": 4500 |
| }, |
| { |
| "epoch": 2.0408163265306123, |
| "grad_norm": 36.80206298828125, |
| "learning_rate": 3.979591836734694e-06, |
| "loss": 0.3714, |
| "step": 5000 |
| }, |
| { |
| "epoch": 2.0408163265306123, |
| "eval_accuracy": 0.85, |
| "eval_loss": 0.4807853698730469, |
| "eval_runtime": 0.9043, |
| "eval_samples_per_second": 221.159, |
| "eval_steps_per_second": 27.645, |
| "step": 5000 |
| }, |
| { |
| "epoch": 2.2448979591836733, |
| "grad_norm": 28.757780075073242, |
| "learning_rate": 3.877551020408164e-06, |
| "loss": 0.3077, |
| "step": 5500 |
| }, |
| { |
| "epoch": 2.2448979591836733, |
| "eval_accuracy": 0.86, |
| "eval_loss": 0.4820670187473297, |
| "eval_runtime": 0.9085, |
| "eval_samples_per_second": 220.139, |
| "eval_steps_per_second": 27.517, |
| "step": 5500 |
| }, |
| { |
| "epoch": 2.4489795918367347, |
| "grad_norm": 4.862767219543457, |
| "learning_rate": 3.7755102040816327e-06, |
| "loss": 0.3405, |
| "step": 6000 |
| }, |
| { |
| "epoch": 2.4489795918367347, |
| "eval_accuracy": 0.85, |
| "eval_loss": 0.555472731590271, |
| "eval_runtime": 0.9092, |
| "eval_samples_per_second": 219.968, |
| "eval_steps_per_second": 27.496, |
| "step": 6000 |
| }, |
| { |
| "epoch": 2.6530612244897958, |
| "grad_norm": 24.65875816345215, |
| "learning_rate": 3.6734693877551024e-06, |
| "loss": 0.322, |
| "step": 6500 |
| }, |
| { |
| "epoch": 2.6530612244897958, |
| "eval_accuracy": 0.855, |
| "eval_loss": 0.5128615498542786, |
| "eval_runtime": 0.904, |
| "eval_samples_per_second": 221.229, |
| "eval_steps_per_second": 27.654, |
| "step": 6500 |
| }, |
| { |
| "epoch": 2.857142857142857, |
| "grad_norm": 28.109399795532227, |
| "learning_rate": 3.5714285714285718e-06, |
| "loss": 0.3269, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.857142857142857, |
| "eval_accuracy": 0.85, |
| "eval_loss": 0.4487631916999817, |
| "eval_runtime": 0.9021, |
| "eval_samples_per_second": 221.702, |
| "eval_steps_per_second": 27.713, |
| "step": 7000 |
| }, |
| { |
| "epoch": 3.061224489795918, |
| "grad_norm": 77.6505126953125, |
| "learning_rate": 3.469387755102041e-06, |
| "loss": 0.303, |
| "step": 7500 |
| }, |
| { |
| "epoch": 3.061224489795918, |
| "eval_accuracy": 0.86, |
| "eval_loss": 0.44085273146629333, |
| "eval_runtime": 0.9115, |
| "eval_samples_per_second": 219.427, |
| "eval_steps_per_second": 27.428, |
| "step": 7500 |
| }, |
| { |
| "epoch": 3.2653061224489797, |
| "grad_norm": 11.312287330627441, |
| "learning_rate": 3.3673469387755105e-06, |
| "loss": 0.2502, |
| "step": 8000 |
| }, |
| { |
| "epoch": 3.2653061224489797, |
| "eval_accuracy": 0.85, |
| "eval_loss": 0.49349740147590637, |
| "eval_runtime": 0.9089, |
| "eval_samples_per_second": 220.055, |
| "eval_steps_per_second": 27.507, |
| "step": 8000 |
| }, |
| { |
| "epoch": 3.4693877551020407, |
| "grad_norm": 17.491943359375, |
| "learning_rate": 3.2653061224489794e-06, |
| "loss": 0.2597, |
| "step": 8500 |
| }, |
| { |
| "epoch": 3.4693877551020407, |
| "eval_accuracy": 0.86, |
| "eval_loss": 0.5523408651351929, |
| "eval_runtime": 0.9044, |
| "eval_samples_per_second": 221.135, |
| "eval_steps_per_second": 27.642, |
| "step": 8500 |
| }, |
| { |
| "epoch": 3.673469387755102, |
| "grad_norm": 29.348148345947266, |
| "learning_rate": 3.1632653061224496e-06, |
| "loss": 0.2841, |
| "step": 9000 |
| }, |
| { |
| "epoch": 3.673469387755102, |
| "eval_accuracy": 0.855, |
| "eval_loss": 0.5676430463790894, |
| "eval_runtime": 0.9151, |
| "eval_samples_per_second": 218.544, |
| "eval_steps_per_second": 27.318, |
| "step": 9000 |
| }, |
| { |
| "epoch": 3.877551020408163, |
| "grad_norm": 18.88979148864746, |
| "learning_rate": 3.0612244897959185e-06, |
| "loss": 0.2907, |
| "step": 9500 |
| }, |
| { |
| "epoch": 3.877551020408163, |
| "eval_accuracy": 0.865, |
| "eval_loss": 0.5422011017799377, |
| "eval_runtime": 0.9097, |
| "eval_samples_per_second": 219.847, |
| "eval_steps_per_second": 27.481, |
| "step": 9500 |
| }, |
| { |
| "epoch": 4.081632653061225, |
| "grad_norm": 9.325241088867188, |
| "learning_rate": 2.959183673469388e-06, |
| "loss": 0.2521, |
| "step": 10000 |
| }, |
| { |
| "epoch": 4.081632653061225, |
| "eval_accuracy": 0.875, |
| "eval_loss": 0.49767935276031494, |
| "eval_runtime": 0.9168, |
| "eval_samples_per_second": 218.148, |
| "eval_steps_per_second": 27.268, |
| "step": 10000 |
| }, |
| { |
| "epoch": 4.285714285714286, |
| "grad_norm": 22.26582908630371, |
| "learning_rate": 2.8571428571428573e-06, |
| "loss": 0.2369, |
| "step": 10500 |
| }, |
| { |
| "epoch": 4.285714285714286, |
| "eval_accuracy": 0.885, |
| "eval_loss": 0.46531933546066284, |
| "eval_runtime": 0.9131, |
| "eval_samples_per_second": 219.037, |
| "eval_steps_per_second": 27.38, |
| "step": 10500 |
| }, |
| { |
| "epoch": 4.489795918367347, |
| "grad_norm": 21.525835037231445, |
| "learning_rate": 2.7551020408163266e-06, |
| "loss": 0.2312, |
| "step": 11000 |
| }, |
| { |
| "epoch": 4.489795918367347, |
| "eval_accuracy": 0.875, |
| "eval_loss": 0.5096437335014343, |
| "eval_runtime": 0.9046, |
| "eval_samples_per_second": 221.092, |
| "eval_steps_per_second": 27.637, |
| "step": 11000 |
| }, |
| { |
| "epoch": 4.6938775510204085, |
| "grad_norm": 24.357847213745117, |
| "learning_rate": 2.6530612244897964e-06, |
| "loss": 0.2232, |
| "step": 11500 |
| }, |
| { |
| "epoch": 4.6938775510204085, |
| "eval_accuracy": 0.865, |
| "eval_loss": 0.5319286584854126, |
| "eval_runtime": 0.9091, |
| "eval_samples_per_second": 220.002, |
| "eval_steps_per_second": 27.5, |
| "step": 11500 |
| }, |
| { |
| "epoch": 4.8979591836734695, |
| "grad_norm": 25.76366424560547, |
| "learning_rate": 2.5510204081632657e-06, |
| "loss": 0.2463, |
| "step": 12000 |
| }, |
| { |
| "epoch": 4.8979591836734695, |
| "eval_accuracy": 0.87, |
| "eval_loss": 0.5263551473617554, |
| "eval_runtime": 0.9024, |
| "eval_samples_per_second": 221.621, |
| "eval_steps_per_second": 27.703, |
| "step": 12000 |
| }, |
| { |
| "epoch": 5.1020408163265305, |
| "grad_norm": 9.35624885559082, |
| "learning_rate": 2.4489795918367347e-06, |
| "loss": 0.2163, |
| "step": 12500 |
| }, |
| { |
| "epoch": 5.1020408163265305, |
| "eval_accuracy": 0.865, |
| "eval_loss": 0.5473060607910156, |
| "eval_runtime": 0.9093, |
| "eval_samples_per_second": 219.942, |
| "eval_steps_per_second": 27.493, |
| "step": 12500 |
| }, |
| { |
| "epoch": 5.3061224489795915, |
| "grad_norm": 1.3547799587249756, |
| "learning_rate": 2.3469387755102044e-06, |
| "loss": 0.202, |
| "step": 13000 |
| }, |
| { |
| "epoch": 5.3061224489795915, |
| "eval_accuracy": 0.87, |
| "eval_loss": 0.5721442699432373, |
| "eval_runtime": 0.9064, |
| "eval_samples_per_second": 220.652, |
| "eval_steps_per_second": 27.581, |
| "step": 13000 |
| }, |
| { |
| "epoch": 5.510204081632653, |
| "grad_norm": 48.84397888183594, |
| "learning_rate": 2.244897959183674e-06, |
| "loss": 0.2078, |
| "step": 13500 |
| }, |
| { |
| "epoch": 5.510204081632653, |
| "eval_accuracy": 0.865, |
| "eval_loss": 0.5391547679901123, |
| "eval_runtime": 0.9112, |
| "eval_samples_per_second": 219.49, |
| "eval_steps_per_second": 27.436, |
| "step": 13500 |
| }, |
| { |
| "epoch": 5.714285714285714, |
| "grad_norm": 22.743989944458008, |
| "learning_rate": 2.1428571428571427e-06, |
| "loss": 0.1847, |
| "step": 14000 |
| }, |
| { |
| "epoch": 5.714285714285714, |
| "eval_accuracy": 0.88, |
| "eval_loss": 0.5364116430282593, |
| "eval_runtime": 0.9101, |
| "eval_samples_per_second": 219.751, |
| "eval_steps_per_second": 27.469, |
| "step": 14000 |
| }, |
| { |
| "epoch": 5.918367346938775, |
| "grad_norm": 2.1184639930725098, |
| "learning_rate": 2.0408163265306125e-06, |
| "loss": 0.1996, |
| "step": 14500 |
| }, |
| { |
| "epoch": 5.918367346938775, |
| "eval_accuracy": 0.865, |
| "eval_loss": 0.5743934512138367, |
| "eval_runtime": 0.9196, |
| "eval_samples_per_second": 217.49, |
| "eval_steps_per_second": 27.186, |
| "step": 14500 |
| }, |
| { |
| "epoch": 6.122448979591836, |
| "grad_norm": 28.888248443603516, |
| "learning_rate": 1.938775510204082e-06, |
| "loss": 0.1816, |
| "step": 15000 |
| }, |
| { |
| "epoch": 6.122448979591836, |
| "eval_accuracy": 0.865, |
| "eval_loss": 0.6048715114593506, |
| "eval_runtime": 0.9105, |
| "eval_samples_per_second": 219.667, |
| "eval_steps_per_second": 27.458, |
| "step": 15000 |
| }, |
| { |
| "epoch": 6.326530612244898, |
| "grad_norm": 12.823907852172852, |
| "learning_rate": 1.8367346938775512e-06, |
| "loss": 0.1717, |
| "step": 15500 |
| }, |
| { |
| "epoch": 6.326530612244898, |
| "eval_accuracy": 0.86, |
| "eval_loss": 0.5824536681175232, |
| "eval_runtime": 0.9038, |
| "eval_samples_per_second": 221.293, |
| "eval_steps_per_second": 27.662, |
| "step": 15500 |
| }, |
| { |
| "epoch": 6.530612244897959, |
| "grad_norm": 26.713756561279297, |
| "learning_rate": 1.7346938775510206e-06, |
| "loss": 0.1738, |
| "step": 16000 |
| }, |
| { |
| "epoch": 6.530612244897959, |
| "eval_accuracy": 0.85, |
| "eval_loss": 0.5947970747947693, |
| "eval_runtime": 0.9197, |
| "eval_samples_per_second": 217.458, |
| "eval_steps_per_second": 27.182, |
| "step": 16000 |
| }, |
| { |
| "epoch": 6.73469387755102, |
| "grad_norm": 9.862221717834473, |
| "learning_rate": 1.6326530612244897e-06, |
| "loss": 0.1614, |
| "step": 16500 |
| }, |
| { |
| "epoch": 6.73469387755102, |
| "eval_accuracy": 0.87, |
| "eval_loss": 0.5696790218353271, |
| "eval_runtime": 0.9031, |
| "eval_samples_per_second": 221.45, |
| "eval_steps_per_second": 27.681, |
| "step": 16500 |
| }, |
| { |
| "epoch": 6.938775510204081, |
| "grad_norm": 3.705087423324585, |
| "learning_rate": 1.5306122448979593e-06, |
| "loss": 0.1699, |
| "step": 17000 |
| }, |
| { |
| "epoch": 6.938775510204081, |
| "eval_accuracy": 0.875, |
| "eval_loss": 0.6074076294898987, |
| "eval_runtime": 0.9047, |
| "eval_samples_per_second": 221.062, |
| "eval_steps_per_second": 27.633, |
| "step": 17000 |
| }, |
| { |
| "epoch": 7.142857142857143, |
| "grad_norm": 10.12613582611084, |
| "learning_rate": 1.4285714285714286e-06, |
| "loss": 0.1616, |
| "step": 17500 |
| }, |
| { |
| "epoch": 7.142857142857143, |
| "eval_accuracy": 0.865, |
| "eval_loss": 0.6130539774894714, |
| "eval_runtime": 0.9083, |
| "eval_samples_per_second": 220.181, |
| "eval_steps_per_second": 27.523, |
| "step": 17500 |
| }, |
| { |
| "epoch": 7.346938775510204, |
| "grad_norm": 0.138712540268898, |
| "learning_rate": 1.3265306122448982e-06, |
| "loss": 0.1519, |
| "step": 18000 |
| }, |
| { |
| "epoch": 7.346938775510204, |
| "eval_accuracy": 0.865, |
| "eval_loss": 0.5894231200218201, |
| "eval_runtime": 0.9116, |
| "eval_samples_per_second": 219.392, |
| "eval_steps_per_second": 27.424, |
| "step": 18000 |
| }, |
| { |
| "epoch": 7.551020408163265, |
| "grad_norm": 34.15790557861328, |
| "learning_rate": 1.2244897959183673e-06, |
| "loss": 0.1631, |
| "step": 18500 |
| }, |
| { |
| "epoch": 7.551020408163265, |
| "eval_accuracy": 0.885, |
| "eval_loss": 0.5699704885482788, |
| "eval_runtime": 0.9023, |
| "eval_samples_per_second": 221.661, |
| "eval_steps_per_second": 27.708, |
| "step": 18500 |
| }, |
| { |
| "epoch": 7.755102040816326, |
| "grad_norm": 17.932504653930664, |
| "learning_rate": 1.122448979591837e-06, |
| "loss": 0.1353, |
| "step": 19000 |
| }, |
| { |
| "epoch": 7.755102040816326, |
| "eval_accuracy": 0.865, |
| "eval_loss": 0.6146803498268127, |
| "eval_runtime": 0.922, |
| "eval_samples_per_second": 216.922, |
| "eval_steps_per_second": 27.115, |
| "step": 19000 |
| }, |
| { |
| "epoch": 7.959183673469388, |
| "grad_norm": 47.22719192504883, |
| "learning_rate": 1.0204081632653063e-06, |
| "loss": 0.1617, |
| "step": 19500 |
| }, |
| { |
| "epoch": 7.959183673469388, |
| "eval_accuracy": 0.87, |
| "eval_loss": 0.5830011963844299, |
| "eval_runtime": 0.9078, |
| "eval_samples_per_second": 220.317, |
| "eval_steps_per_second": 27.54, |
| "step": 19500 |
| }, |
| { |
| "epoch": 8.16326530612245, |
| "grad_norm": 14.763110160827637, |
| "learning_rate": 9.183673469387756e-07, |
| "loss": 0.1374, |
| "step": 20000 |
| }, |
| { |
| "epoch": 8.16326530612245, |
| "eval_accuracy": 0.865, |
| "eval_loss": 0.5952904224395752, |
| "eval_runtime": 0.911, |
| "eval_samples_per_second": 219.542, |
| "eval_steps_per_second": 27.443, |
| "step": 20000 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 24500, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 10000, |
| "total_flos": 2.119553974272e+16, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|