{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9988571428571429, "eval_steps": 10, "global_step": 437, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022857142857142857, "grad_norm": 6.883157253265381, "learning_rate": 9.77116704805492e-05, "loss": 0.9709, "step": 10 }, { "epoch": 0.022857142857142857, "eval_accuracy": 0.6398571133613586, "eval_loss": 0.8923419117927551, "eval_runtime": 252.6626, "eval_samples_per_second": 27.705, "eval_steps_per_second": 6.926, "step": 10 }, { "epoch": 0.045714285714285714, "grad_norm": 4.793847560882568, "learning_rate": 9.542334096109841e-05, "loss": 0.9219, "step": 20 }, { "epoch": 0.045714285714285714, "eval_accuracy": 0.7664285898208618, "eval_loss": 0.6903320550918579, "eval_runtime": 260.5483, "eval_samples_per_second": 26.866, "eval_steps_per_second": 6.717, "step": 20 }, { "epoch": 0.06857142857142857, "grad_norm": 6.191551685333252, "learning_rate": 9.31350114416476e-05, "loss": 0.7112, "step": 30 }, { "epoch": 0.06857142857142857, "eval_accuracy": 0.7908571362495422, "eval_loss": 0.5838488936424255, "eval_runtime": 254.6091, "eval_samples_per_second": 27.493, "eval_steps_per_second": 6.873, "step": 30 }, { "epoch": 0.09142857142857143, "grad_norm": 9.833272933959961, "learning_rate": 9.08466819221968e-05, "loss": 0.567, "step": 40 }, { "epoch": 0.09142857142857143, "eval_accuracy": 0.8158571720123291, "eval_loss": 0.5405334830284119, "eval_runtime": 263.3184, "eval_samples_per_second": 26.584, "eval_steps_per_second": 6.646, "step": 40 }, { "epoch": 0.11428571428571428, "grad_norm": 9.925666809082031, "learning_rate": 8.878718535469108e-05, "loss": 0.6184, "step": 50 }, { "epoch": 0.11428571428571428, "eval_accuracy": 0.8581428527832031, "eval_loss": 0.41476812958717346, "eval_runtime": 259.1036, "eval_samples_per_second": 27.016, "eval_steps_per_second": 6.754, "step": 50 }, { "epoch": 0.13714285714285715, "grad_norm": 3.723980665206909, "learning_rate": 8.649885583524028e-05, "loss": 0.5291, "step": 60 }, { "epoch": 0.13714285714285715, "eval_accuracy": 0.8511428833007812, "eval_loss": 0.44439756870269775, "eval_runtime": 253.5826, "eval_samples_per_second": 27.604, "eval_steps_per_second": 6.901, "step": 60 }, { "epoch": 0.16, "grad_norm": 10.508088111877441, "learning_rate": 8.421052631578948e-05, "loss": 0.533, "step": 70 }, { "epoch": 0.16, "eval_accuracy": 0.8271428346633911, "eval_loss": 0.4642958641052246, "eval_runtime": 260.9488, "eval_samples_per_second": 26.825, "eval_steps_per_second": 6.706, "step": 70 }, { "epoch": 0.18285714285714286, "grad_norm": 7.824756622314453, "learning_rate": 8.192219679633868e-05, "loss": 0.4753, "step": 80 }, { "epoch": 0.18285714285714286, "eval_accuracy": 0.876714289188385, "eval_loss": 0.35598087310791016, "eval_runtime": 262.7831, "eval_samples_per_second": 26.638, "eval_steps_per_second": 6.659, "step": 80 }, { "epoch": 0.2057142857142857, "grad_norm": 5.332316875457764, "learning_rate": 7.963386727688788e-05, "loss": 0.4252, "step": 90 }, { "epoch": 0.2057142857142857, "eval_accuracy": 0.8102856874465942, "eval_loss": 0.5888535380363464, "eval_runtime": 262.7552, "eval_samples_per_second": 26.641, "eval_steps_per_second": 6.66, "step": 90 }, { "epoch": 0.22857142857142856, "grad_norm": 17.482688903808594, "learning_rate": 7.734553775743708e-05, "loss": 0.5007, "step": 100 }, { "epoch": 0.22857142857142856, "eval_accuracy": 0.8662857413291931, "eval_loss": 0.38821107149124146, "eval_runtime": 261.4572, "eval_samples_per_second": 26.773, "eval_steps_per_second": 6.693, "step": 100 }, { "epoch": 0.25142857142857145, "grad_norm": 8.691084861755371, "learning_rate": 7.505720823798627e-05, "loss": 0.5605, "step": 110 }, { "epoch": 0.25142857142857145, "eval_accuracy": 0.8921428322792053, "eval_loss": 0.32210296392440796, "eval_runtime": 261.1514, "eval_samples_per_second": 26.804, "eval_steps_per_second": 6.701, "step": 110 }, { "epoch": 0.2742857142857143, "grad_norm": 11.754142761230469, "learning_rate": 7.276887871853547e-05, "loss": 0.4875, "step": 120 }, { "epoch": 0.2742857142857143, "eval_accuracy": 0.8558571338653564, "eval_loss": 0.36388570070266724, "eval_runtime": 265.2182, "eval_samples_per_second": 26.393, "eval_steps_per_second": 6.598, "step": 120 }, { "epoch": 0.29714285714285715, "grad_norm": 7.222925662994385, "learning_rate": 7.048054919908466e-05, "loss": 0.4277, "step": 130 }, { "epoch": 0.29714285714285715, "eval_accuracy": 0.8745714426040649, "eval_loss": 0.35708051919937134, "eval_runtime": 264.6016, "eval_samples_per_second": 26.455, "eval_steps_per_second": 6.614, "step": 130 }, { "epoch": 0.32, "grad_norm": 6.181695938110352, "learning_rate": 6.819221967963387e-05, "loss": 0.3415, "step": 140 }, { "epoch": 0.32, "eval_accuracy": 0.8861428499221802, "eval_loss": 0.33818891644477844, "eval_runtime": 262.5039, "eval_samples_per_second": 26.666, "eval_steps_per_second": 6.667, "step": 140 }, { "epoch": 0.34285714285714286, "grad_norm": 8.087543487548828, "learning_rate": 6.590389016018307e-05, "loss": 0.413, "step": 150 }, { "epoch": 0.34285714285714286, "eval_accuracy": 0.9104285836219788, "eval_loss": 0.2596481442451477, "eval_runtime": 265.6837, "eval_samples_per_second": 26.347, "eval_steps_per_second": 6.587, "step": 150 }, { "epoch": 0.3657142857142857, "grad_norm": 11.313796997070312, "learning_rate": 6.361556064073226e-05, "loss": 0.377, "step": 160 }, { "epoch": 0.3657142857142857, "eval_accuracy": 0.8711428642272949, "eval_loss": 0.3518799841403961, "eval_runtime": 264.3798, "eval_samples_per_second": 26.477, "eval_steps_per_second": 6.619, "step": 160 }, { "epoch": 0.38857142857142857, "grad_norm": 7.65640115737915, "learning_rate": 6.132723112128147e-05, "loss": 0.4219, "step": 170 }, { "epoch": 0.38857142857142857, "eval_accuracy": 0.8947142958641052, "eval_loss": 0.2979215681552887, "eval_runtime": 262.8341, "eval_samples_per_second": 26.633, "eval_steps_per_second": 6.658, "step": 170 }, { "epoch": 0.4114285714285714, "grad_norm": 6.2714433670043945, "learning_rate": 5.903890160183066e-05, "loss": 0.3317, "step": 180 }, { "epoch": 0.4114285714285714, "eval_accuracy": 0.9225714206695557, "eval_loss": 0.22266168892383575, "eval_runtime": 265.1248, "eval_samples_per_second": 26.403, "eval_steps_per_second": 6.601, "step": 180 }, { "epoch": 0.4342857142857143, "grad_norm": 8.710111618041992, "learning_rate": 5.675057208237986e-05, "loss": 0.3131, "step": 190 }, { "epoch": 0.4342857142857143, "eval_accuracy": 0.8692857027053833, "eval_loss": 0.3680011034011841, "eval_runtime": 260.0056, "eval_samples_per_second": 26.923, "eval_steps_per_second": 6.731, "step": 190 }, { "epoch": 0.45714285714285713, "grad_norm": 4.041360378265381, "learning_rate": 5.446224256292907e-05, "loss": 0.3266, "step": 200 }, { "epoch": 0.45714285714285713, "eval_accuracy": 0.9308571219444275, "eval_loss": 0.20981180667877197, "eval_runtime": 256.153, "eval_samples_per_second": 27.327, "eval_steps_per_second": 6.832, "step": 200 }, { "epoch": 0.48, "grad_norm": 10.932918548583984, "learning_rate": 5.217391304347826e-05, "loss": 0.3306, "step": 210 }, { "epoch": 0.48, "eval_accuracy": 0.8824285864830017, "eval_loss": 0.3848917782306671, "eval_runtime": 253.9958, "eval_samples_per_second": 27.56, "eval_steps_per_second": 6.89, "step": 210 }, { "epoch": 0.5028571428571429, "grad_norm": 9.440160751342773, "learning_rate": 4.9885583524027466e-05, "loss": 0.3037, "step": 220 }, { "epoch": 0.5028571428571429, "eval_accuracy": 0.9024285674095154, "eval_loss": 0.28518444299697876, "eval_runtime": 259.3612, "eval_samples_per_second": 26.989, "eval_steps_per_second": 6.747, "step": 220 }, { "epoch": 0.5257142857142857, "grad_norm": 9.196854591369629, "learning_rate": 4.759725400457666e-05, "loss": 0.3086, "step": 230 }, { "epoch": 0.5257142857142857, "eval_accuracy": 0.9121428728103638, "eval_loss": 0.272481232881546, "eval_runtime": 254.9581, "eval_samples_per_second": 27.455, "eval_steps_per_second": 6.864, "step": 230 }, { "epoch": 0.5485714285714286, "grad_norm": 6.610895156860352, "learning_rate": 4.530892448512586e-05, "loss": 0.2576, "step": 240 }, { "epoch": 0.5485714285714286, "eval_accuracy": 0.9355714321136475, "eval_loss": 0.18688350915908813, "eval_runtime": 255.2292, "eval_samples_per_second": 27.426, "eval_steps_per_second": 6.857, "step": 240 }, { "epoch": 0.5714285714285714, "grad_norm": 15.24905014038086, "learning_rate": 4.302059496567506e-05, "loss": 0.2469, "step": 250 }, { "epoch": 0.5714285714285714, "eval_accuracy": 0.9242857098579407, "eval_loss": 0.2262311726808548, "eval_runtime": 254.9064, "eval_samples_per_second": 27.461, "eval_steps_per_second": 6.865, "step": 250 }, { "epoch": 0.5942857142857143, "grad_norm": 9.8357515335083, "learning_rate": 4.073226544622426e-05, "loss": 0.2405, "step": 260 }, { "epoch": 0.5942857142857143, "eval_accuracy": 0.9347142577171326, "eval_loss": 0.19631564617156982, "eval_runtime": 271.1966, "eval_samples_per_second": 25.812, "eval_steps_per_second": 6.453, "step": 260 }, { "epoch": 0.6171428571428571, "grad_norm": 19.872060775756836, "learning_rate": 3.844393592677346e-05, "loss": 0.2802, "step": 270 }, { "epoch": 0.6171428571428571, "eval_accuracy": 0.8804285526275635, "eval_loss": 0.3679888844490051, "eval_runtime": 256.0669, "eval_samples_per_second": 27.337, "eval_steps_per_second": 6.834, "step": 270 }, { "epoch": 0.64, "grad_norm": 3.6445915699005127, "learning_rate": 3.6155606407322653e-05, "loss": 0.2442, "step": 280 }, { "epoch": 0.64, "eval_accuracy": 0.9292857050895691, "eval_loss": 0.20533673465251923, "eval_runtime": 255.7952, "eval_samples_per_second": 27.366, "eval_steps_per_second": 6.841, "step": 280 }, { "epoch": 0.6628571428571428, "grad_norm": 8.114418983459473, "learning_rate": 3.3867276887871856e-05, "loss": 0.2302, "step": 290 }, { "epoch": 0.6628571428571428, "eval_accuracy": 0.8967142701148987, "eval_loss": 0.3355866074562073, "eval_runtime": 257.891, "eval_samples_per_second": 27.143, "eval_steps_per_second": 6.786, "step": 290 }, { "epoch": 0.6857142857142857, "grad_norm": 5.993322372436523, "learning_rate": 3.157894736842105e-05, "loss": 0.2492, "step": 300 }, { "epoch": 0.6857142857142857, "eval_accuracy": 0.9371428489685059, "eval_loss": 0.18795913457870483, "eval_runtime": 254.5882, "eval_samples_per_second": 27.495, "eval_steps_per_second": 6.874, "step": 300 }, { "epoch": 0.7085714285714285, "grad_norm": 6.529418468475342, "learning_rate": 2.9290617848970254e-05, "loss": 0.2089, "step": 310 }, { "epoch": 0.7085714285714285, "eval_accuracy": 0.928857147693634, "eval_loss": 0.2076321393251419, "eval_runtime": 260.5938, "eval_samples_per_second": 26.862, "eval_steps_per_second": 6.715, "step": 310 }, { "epoch": 0.7314285714285714, "grad_norm": 6.433741092681885, "learning_rate": 2.7002288329519453e-05, "loss": 0.2824, "step": 320 }, { "epoch": 0.7314285714285714, "eval_accuracy": 0.930142879486084, "eval_loss": 0.1999480277299881, "eval_runtime": 255.2396, "eval_samples_per_second": 27.425, "eval_steps_per_second": 6.856, "step": 320 }, { "epoch": 0.7542857142857143, "grad_norm": 5.394837379455566, "learning_rate": 2.4713958810068652e-05, "loss": 0.2009, "step": 330 }, { "epoch": 0.7542857142857143, "eval_accuracy": 0.9521428346633911, "eval_loss": 0.14918017387390137, "eval_runtime": 258.1497, "eval_samples_per_second": 27.116, "eval_steps_per_second": 6.779, "step": 330 }, { "epoch": 0.7771428571428571, "grad_norm": 5.843348503112793, "learning_rate": 2.242562929061785e-05, "loss": 0.2001, "step": 340 }, { "epoch": 0.7771428571428571, "eval_accuracy": 0.951714277267456, "eval_loss": 0.14960123598575592, "eval_runtime": 253.1262, "eval_samples_per_second": 27.654, "eval_steps_per_second": 6.914, "step": 340 }, { "epoch": 0.8, "grad_norm": 7.778473377227783, "learning_rate": 2.0137299771167047e-05, "loss": 0.2298, "step": 350 }, { "epoch": 0.8, "eval_accuracy": 0.9490000009536743, "eval_loss": 0.15794885158538818, "eval_runtime": 258.4154, "eval_samples_per_second": 27.088, "eval_steps_per_second": 6.772, "step": 350 }, { "epoch": 0.8228571428571428, "grad_norm": 7.672749042510986, "learning_rate": 1.784897025171625e-05, "loss": 0.1802, "step": 360 }, { "epoch": 0.8228571428571428, "eval_accuracy": 0.9501428604125977, "eval_loss": 0.15056686103343964, "eval_runtime": 253.0586, "eval_samples_per_second": 27.662, "eval_steps_per_second": 6.915, "step": 360 }, { "epoch": 0.8457142857142858, "grad_norm": 7.994875431060791, "learning_rate": 1.5560640732265445e-05, "loss": 0.1914, "step": 370 }, { "epoch": 0.8457142857142858, "eval_accuracy": 0.9311428666114807, "eval_loss": 0.20363783836364746, "eval_runtime": 261.3379, "eval_samples_per_second": 26.785, "eval_steps_per_second": 6.696, "step": 370 }, { "epoch": 0.8685714285714285, "grad_norm": 3.988149404525757, "learning_rate": 1.3272311212814645e-05, "loss": 0.1897, "step": 380 }, { "epoch": 0.8685714285714285, "eval_accuracy": 0.9382857084274292, "eval_loss": 0.18375040590763092, "eval_runtime": 256.8539, "eval_samples_per_second": 27.253, "eval_steps_per_second": 6.813, "step": 380 }, { "epoch": 0.8914285714285715, "grad_norm": 7.280108451843262, "learning_rate": 1.0983981693363844e-05, "loss": 0.1203, "step": 390 }, { "epoch": 0.8914285714285715, "eval_accuracy": 0.9504285454750061, "eval_loss": 0.1459112912416458, "eval_runtime": 256.3941, "eval_samples_per_second": 27.302, "eval_steps_per_second": 6.825, "step": 390 }, { "epoch": 0.9142857142857143, "grad_norm": 6.386229991912842, "learning_rate": 8.695652173913044e-06, "loss": 0.1372, "step": 400 }, { "epoch": 0.9142857142857143, "eval_accuracy": 0.9418571591377258, "eval_loss": 0.1748434156179428, "eval_runtime": 266.7645, "eval_samples_per_second": 26.24, "eval_steps_per_second": 6.56, "step": 400 }, { "epoch": 0.9371428571428572, "grad_norm": 7.714508056640625, "learning_rate": 6.407322654462243e-06, "loss": 0.1942, "step": 410 }, { "epoch": 0.9371428571428572, "eval_accuracy": 0.9405714273452759, "eval_loss": 0.18131674826145172, "eval_runtime": 266.6389, "eval_samples_per_second": 26.253, "eval_steps_per_second": 6.563, "step": 410 }, { "epoch": 0.96, "grad_norm": 4.493211269378662, "learning_rate": 4.118993135011442e-06, "loss": 0.1886, "step": 420 }, { "epoch": 0.96, "eval_accuracy": 0.9509999752044678, "eval_loss": 0.15357272326946259, "eval_runtime": 273.0321, "eval_samples_per_second": 25.638, "eval_steps_per_second": 6.41, "step": 420 }, { "epoch": 0.9828571428571429, "grad_norm": 4.66563606262207, "learning_rate": 1.8306636155606409e-06, "loss": 0.1872, "step": 430 }, { "epoch": 0.9828571428571429, "eval_accuracy": 0.952571451663971, "eval_loss": 0.1465713381767273, "eval_runtime": 266.7172, "eval_samples_per_second": 26.245, "eval_steps_per_second": 6.561, "step": 430 }, { "epoch": 0.9988571428571429, "step": 437, "total_flos": 1.3128537437918904e+18, "train_loss": 0.3557066834218442, "train_runtime": 12202.3201, "train_samples_per_second": 2.295, "train_steps_per_second": 0.036 } ], "logging_steps": 10, "max_steps": 437, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3128537437918904e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }