| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 50.0, |
| "eval_steps": 5, |
| "global_step": 350, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0, |
| "eval_loss": 1.468239188194275, |
| "eval_runtime": 18.8322, |
| "eval_samples_per_second": 2.655, |
| "eval_steps_per_second": 0.372, |
| "step": 0 |
| }, |
| { |
| "epoch": 0.7142857142857143, |
| "grad_norm": 5.422401428222656, |
| "learning_rate": 4.444444444444444e-06, |
| "loss": 1.3518, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.7142857142857143, |
| "eval_loss": 1.3430589437484741, |
| "eval_runtime": 19.2589, |
| "eval_samples_per_second": 2.596, |
| "eval_steps_per_second": 0.363, |
| "step": 5 |
| }, |
| { |
| "epoch": 1.4285714285714286, |
| "grad_norm": 2.686959981918335, |
| "learning_rate": 1e-05, |
| "loss": 0.9051, |
| "step": 10 |
| }, |
| { |
| "epoch": 1.4285714285714286, |
| "eval_loss": 1.2059158086776733, |
| "eval_runtime": 20.856, |
| "eval_samples_per_second": 2.397, |
| "eval_steps_per_second": 0.336, |
| "step": 10 |
| }, |
| { |
| "epoch": 2.142857142857143, |
| "grad_norm": 1.8475016355514526, |
| "learning_rate": 1.555555555555556e-05, |
| "loss": 0.8625, |
| "step": 15 |
| }, |
| { |
| "epoch": 2.142857142857143, |
| "eval_loss": 1.127707600593567, |
| "eval_runtime": 20.3953, |
| "eval_samples_per_second": 2.452, |
| "eval_steps_per_second": 0.343, |
| "step": 15 |
| }, |
| { |
| "epoch": 2.857142857142857, |
| "grad_norm": 1.3544838428497314, |
| "learning_rate": 1.9999552296652432e-05, |
| "loss": 0.6971, |
| "step": 20 |
| }, |
| { |
| "epoch": 2.857142857142857, |
| "eval_loss": 1.114540696144104, |
| "eval_runtime": 20.5519, |
| "eval_samples_per_second": 2.433, |
| "eval_steps_per_second": 0.341, |
| "step": 20 |
| }, |
| { |
| "epoch": 3.571428571428571, |
| "grad_norm": 1.6196757555007935, |
| "learning_rate": 1.9983886888289515e-05, |
| "loss": 0.4243, |
| "step": 25 |
| }, |
| { |
| "epoch": 3.571428571428571, |
| "eval_loss": 1.1612136363983154, |
| "eval_runtime": 20.8803, |
| "eval_samples_per_second": 2.395, |
| "eval_steps_per_second": 0.335, |
| "step": 25 |
| }, |
| { |
| "epoch": 4.285714285714286, |
| "grad_norm": 1.041768193244934, |
| "learning_rate": 1.994587638407389e-05, |
| "loss": 0.1901, |
| "step": 30 |
| }, |
| { |
| "epoch": 4.285714285714286, |
| "eval_loss": 1.3399298191070557, |
| "eval_runtime": 20.3992, |
| "eval_samples_per_second": 2.451, |
| "eval_steps_per_second": 0.343, |
| "step": 30 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.8406388163566589, |
| "learning_rate": 1.9885605855918887e-05, |
| "loss": 0.1171, |
| "step": 35 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 1.3959039449691772, |
| "eval_runtime": 20.6168, |
| "eval_samples_per_second": 2.425, |
| "eval_steps_per_second": 0.34, |
| "step": 35 |
| }, |
| { |
| "epoch": 5.714285714285714, |
| "grad_norm": 0.7612068057060242, |
| "learning_rate": 1.9803210196251057e-05, |
| "loss": 0.0593, |
| "step": 40 |
| }, |
| { |
| "epoch": 5.714285714285714, |
| "eval_loss": 1.4992326498031616, |
| "eval_runtime": 21.0617, |
| "eval_samples_per_second": 2.374, |
| "eval_steps_per_second": 0.332, |
| "step": 40 |
| }, |
| { |
| "epoch": 6.428571428571429, |
| "grad_norm": 0.5397480130195618, |
| "learning_rate": 1.9698873816105272e-05, |
| "loss": 0.0265, |
| "step": 45 |
| }, |
| { |
| "epoch": 6.428571428571429, |
| "eval_loss": 1.5966981649398804, |
| "eval_runtime": 20.5478, |
| "eval_samples_per_second": 2.433, |
| "eval_steps_per_second": 0.341, |
| "step": 45 |
| }, |
| { |
| "epoch": 7.142857142857143, |
| "grad_norm": 0.34659039974212646, |
| "learning_rate": 1.9572830232391467e-05, |
| "loss": 0.0229, |
| "step": 50 |
| }, |
| { |
| "epoch": 7.142857142857143, |
| "eval_loss": 1.6269463300704956, |
| "eval_runtime": 21.143, |
| "eval_samples_per_second": 2.365, |
| "eval_steps_per_second": 0.331, |
| "step": 50 |
| }, |
| { |
| "epoch": 7.857142857142857, |
| "grad_norm": 0.5654157996177673, |
| "learning_rate": 1.942536154525673e-05, |
| "loss": 0.0167, |
| "step": 55 |
| }, |
| { |
| "epoch": 7.857142857142857, |
| "eval_loss": 1.6504554748535156, |
| "eval_runtime": 20.9122, |
| "eval_samples_per_second": 2.391, |
| "eval_steps_per_second": 0.335, |
| "step": 55 |
| }, |
| { |
| "epoch": 8.571428571428571, |
| "grad_norm": 0.47888287901878357, |
| "learning_rate": 1.9256797806712478e-05, |
| "loss": 0.0119, |
| "step": 60 |
| }, |
| { |
| "epoch": 8.571428571428571, |
| "eval_loss": 1.6625537872314453, |
| "eval_runtime": 20.8644, |
| "eval_samples_per_second": 2.396, |
| "eval_steps_per_second": 0.335, |
| "step": 60 |
| }, |
| { |
| "epoch": 9.285714285714286, |
| "grad_norm": 0.17783451080322266, |
| "learning_rate": 1.9067516281939826e-05, |
| "loss": 0.0094, |
| "step": 65 |
| }, |
| { |
| "epoch": 9.285714285714286, |
| "eval_loss": 1.6874204874038696, |
| "eval_runtime": 21.4952, |
| "eval_samples_per_second": 2.326, |
| "eval_steps_per_second": 0.326, |
| "step": 65 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.3508545160293579, |
| "learning_rate": 1.885794060492637e-05, |
| "loss": 0.0089, |
| "step": 70 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 1.7034786939620972, |
| "eval_runtime": 21.6794, |
| "eval_samples_per_second": 2.306, |
| "eval_steps_per_second": 0.323, |
| "step": 70 |
| }, |
| { |
| "epoch": 10.714285714285714, |
| "grad_norm": 0.3566827178001404, |
| "learning_rate": 1.862853983032423e-05, |
| "loss": 0.0062, |
| "step": 75 |
| }, |
| { |
| "epoch": 10.714285714285714, |
| "eval_loss": 1.7052545547485352, |
| "eval_runtime": 21.1384, |
| "eval_samples_per_second": 2.365, |
| "eval_steps_per_second": 0.331, |
| "step": 75 |
| }, |
| { |
| "epoch": 11.428571428571429, |
| "grad_norm": 0.20090198516845703, |
| "learning_rate": 1.83798273836514e-05, |
| "loss": 0.0072, |
| "step": 80 |
| }, |
| { |
| "epoch": 11.428571428571429, |
| "eval_loss": 1.7082165479660034, |
| "eval_runtime": 21.7116, |
| "eval_samples_per_second": 2.303, |
| "eval_steps_per_second": 0.322, |
| "step": 80 |
| }, |
| { |
| "epoch": 12.142857142857142, |
| "grad_norm": 0.16760864853858948, |
| "learning_rate": 1.8112359912185923e-05, |
| "loss": 0.008, |
| "step": 85 |
| }, |
| { |
| "epoch": 12.142857142857142, |
| "eval_loss": 1.6972074508666992, |
| "eval_runtime": 21.5935, |
| "eval_samples_per_second": 2.316, |
| "eval_steps_per_second": 0.324, |
| "step": 85 |
| }, |
| { |
| "epoch": 12.857142857142858, |
| "grad_norm": 0.1947411298751831, |
| "learning_rate": 1.7826736039124782e-05, |
| "loss": 0.006, |
| "step": 90 |
| }, |
| { |
| "epoch": 12.857142857142858, |
| "eval_loss": 1.6968964338302612, |
| "eval_runtime": 21.3118, |
| "eval_samples_per_second": 2.346, |
| "eval_steps_per_second": 0.328, |
| "step": 90 |
| }, |
| { |
| "epoch": 13.571428571428571, |
| "grad_norm": 0.2383287250995636, |
| "learning_rate": 1.7523595023795814e-05, |
| "loss": 0.0044, |
| "step": 95 |
| }, |
| { |
| "epoch": 13.571428571428571, |
| "eval_loss": 1.7048730850219727, |
| "eval_runtime": 21.8112, |
| "eval_samples_per_second": 2.292, |
| "eval_steps_per_second": 0.321, |
| "step": 95 |
| }, |
| { |
| "epoch": 14.285714285714286, |
| "grad_norm": 0.0442085787653923, |
| "learning_rate": 1.720361533092124e-05, |
| "loss": 0.0036, |
| "step": 100 |
| }, |
| { |
| "epoch": 14.285714285714286, |
| "eval_loss": 1.7229158878326416, |
| "eval_runtime": 21.3718, |
| "eval_samples_per_second": 2.34, |
| "eval_steps_per_second": 0.328, |
| "step": 100 |
| }, |
| { |
| "epoch": 15.0, |
| "grad_norm": 0.15548433363437653, |
| "learning_rate": 1.6867513112135012e-05, |
| "loss": 0.0035, |
| "step": 105 |
| }, |
| { |
| "epoch": 15.0, |
| "eval_loss": 1.7421244382858276, |
| "eval_runtime": 21.513, |
| "eval_samples_per_second": 2.324, |
| "eval_steps_per_second": 0.325, |
| "step": 105 |
| }, |
| { |
| "epoch": 15.714285714285714, |
| "grad_norm": 0.14688356220722198, |
| "learning_rate": 1.6516040603152448e-05, |
| "loss": 0.0026, |
| "step": 110 |
| }, |
| { |
| "epoch": 15.714285714285714, |
| "eval_loss": 1.7549571990966797, |
| "eval_runtime": 21.852, |
| "eval_samples_per_second": 2.288, |
| "eval_steps_per_second": 0.32, |
| "step": 110 |
| }, |
| { |
| "epoch": 16.428571428571427, |
| "grad_norm": 0.11230692267417908, |
| "learning_rate": 1.614998444017954e-05, |
| "loss": 0.0023, |
| "step": 115 |
| }, |
| { |
| "epoch": 16.428571428571427, |
| "eval_loss": 1.7617460489273071, |
| "eval_runtime": 21.5895, |
| "eval_samples_per_second": 2.316, |
| "eval_steps_per_second": 0.324, |
| "step": 115 |
| }, |
| { |
| "epoch": 17.142857142857142, |
| "grad_norm": 0.05092110484838486, |
| "learning_rate": 1.5770163899329943e-05, |
| "loss": 0.0021, |
| "step": 120 |
| }, |
| { |
| "epoch": 17.142857142857142, |
| "eval_loss": 1.7655525207519531, |
| "eval_runtime": 21.8701, |
| "eval_samples_per_second": 2.286, |
| "eval_steps_per_second": 0.32, |
| "step": 120 |
| }, |
| { |
| "epoch": 17.857142857142858, |
| "grad_norm": 0.11048714816570282, |
| "learning_rate": 1.5377429062990122e-05, |
| "loss": 0.0025, |
| "step": 125 |
| }, |
| { |
| "epoch": 17.857142857142858, |
| "eval_loss": 1.768291711807251, |
| "eval_runtime": 21.9645, |
| "eval_samples_per_second": 2.276, |
| "eval_steps_per_second": 0.319, |
| "step": 125 |
| }, |
| { |
| "epoch": 18.571428571428573, |
| "grad_norm": 0.07220487296581268, |
| "learning_rate": 1.497265891723643e-05, |
| "loss": 0.0018, |
| "step": 130 |
| }, |
| { |
| "epoch": 18.571428571428573, |
| "eval_loss": 1.7749742269515991, |
| "eval_runtime": 21.6791, |
| "eval_samples_per_second": 2.306, |
| "eval_steps_per_second": 0.323, |
| "step": 130 |
| }, |
| { |
| "epoch": 19.285714285714285, |
| "grad_norm": 0.09813550859689713, |
| "learning_rate": 1.4556759384562418e-05, |
| "loss": 0.002, |
| "step": 135 |
| }, |
| { |
| "epoch": 19.285714285714285, |
| "eval_loss": 1.7667394876480103, |
| "eval_runtime": 22.2338, |
| "eval_samples_per_second": 2.249, |
| "eval_steps_per_second": 0.315, |
| "step": 135 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 0.06082721799612045, |
| "learning_rate": 1.4130661296319313e-05, |
| "loss": 0.0036, |
| "step": 140 |
| }, |
| { |
| "epoch": 20.0, |
| "eval_loss": 1.7492411136627197, |
| "eval_runtime": 21.9099, |
| "eval_samples_per_second": 2.282, |
| "eval_steps_per_second": 0.319, |
| "step": 140 |
| }, |
| { |
| "epoch": 20.714285714285715, |
| "grad_norm": 0.19404393434524536, |
| "learning_rate": 1.369531830940757e-05, |
| "loss": 0.0025, |
| "step": 145 |
| }, |
| { |
| "epoch": 20.714285714285715, |
| "eval_loss": 1.7377521991729736, |
| "eval_runtime": 21.7672, |
| "eval_samples_per_second": 2.297, |
| "eval_steps_per_second": 0.322, |
| "step": 145 |
| }, |
| { |
| "epoch": 21.428571428571427, |
| "grad_norm": 0.05864373594522476, |
| "learning_rate": 1.325170477188224e-05, |
| "loss": 0.0017, |
| "step": 150 |
| }, |
| { |
| "epoch": 21.428571428571427, |
| "eval_loss": 1.7388665676116943, |
| "eval_runtime": 22.2013, |
| "eval_samples_per_second": 2.252, |
| "eval_steps_per_second": 0.315, |
| "step": 150 |
| }, |
| { |
| "epoch": 22.142857142857142, |
| "grad_norm": 0.04350695759057999, |
| "learning_rate": 1.2800813542249073e-05, |
| "loss": 0.0016, |
| "step": 155 |
| }, |
| { |
| "epoch": 22.142857142857142, |
| "eval_loss": 1.7510356903076172, |
| "eval_runtime": 22.0791, |
| "eval_samples_per_second": 2.265, |
| "eval_steps_per_second": 0.317, |
| "step": 155 |
| }, |
| { |
| "epoch": 22.857142857142858, |
| "grad_norm": 0.09723013639450073, |
| "learning_rate": 1.234365376733215e-05, |
| "loss": 0.0016, |
| "step": 160 |
| }, |
| { |
| "epoch": 22.857142857142858, |
| "eval_loss": 1.7622946500778198, |
| "eval_runtime": 22.0297, |
| "eval_samples_per_second": 2.27, |
| "eval_steps_per_second": 0.318, |
| "step": 160 |
| }, |
| { |
| "epoch": 23.571428571428573, |
| "grad_norm": 0.046282168477773666, |
| "learning_rate": 1.188124862368634e-05, |
| "loss": 0.0014, |
| "step": 165 |
| }, |
| { |
| "epoch": 23.571428571428573, |
| "eval_loss": 1.770511507987976, |
| "eval_runtime": 21.9343, |
| "eval_samples_per_second": 2.28, |
| "eval_steps_per_second": 0.319, |
| "step": 165 |
| }, |
| { |
| "epoch": 24.285714285714285, |
| "grad_norm": 0.03766458481550217, |
| "learning_rate": 1.1414633027609585e-05, |
| "loss": 0.0013, |
| "step": 170 |
| }, |
| { |
| "epoch": 24.285714285714285, |
| "eval_loss": 1.7750704288482666, |
| "eval_runtime": 22.0134, |
| "eval_samples_per_second": 2.271, |
| "eval_steps_per_second": 0.318, |
| "step": 170 |
| }, |
| { |
| "epoch": 25.0, |
| "grad_norm": 0.14919541776180267, |
| "learning_rate": 1.0944851318880314e-05, |
| "loss": 0.0015, |
| "step": 175 |
| }, |
| { |
| "epoch": 25.0, |
| "eval_loss": 1.7801543474197388, |
| "eval_runtime": 22.0082, |
| "eval_samples_per_second": 2.272, |
| "eval_steps_per_second": 0.318, |
| "step": 175 |
| }, |
| { |
| "epoch": 25.714285714285715, |
| "grad_norm": 0.01755683310329914, |
| "learning_rate": 1.047295492340397e-05, |
| "loss": 0.0011, |
| "step": 180 |
| }, |
| { |
| "epoch": 25.714285714285715, |
| "eval_loss": 1.7830265760421753, |
| "eval_runtime": 22.0185, |
| "eval_samples_per_second": 2.271, |
| "eval_steps_per_second": 0.318, |
| "step": 180 |
| }, |
| { |
| "epoch": 26.428571428571427, |
| "grad_norm": 0.051639165729284286, |
| "learning_rate": 1e-05, |
| "loss": 0.0012, |
| "step": 185 |
| }, |
| { |
| "epoch": 26.428571428571427, |
| "eval_loss": 1.7872556447982788, |
| "eval_runtime": 22.2185, |
| "eval_samples_per_second": 2.25, |
| "eval_steps_per_second": 0.315, |
| "step": 185 |
| }, |
| { |
| "epoch": 27.142857142857142, |
| "grad_norm": 0.05077521875500679, |
| "learning_rate": 9.527045076596036e-06, |
| "loss": 0.0011, |
| "step": 190 |
| }, |
| { |
| "epoch": 27.142857142857142, |
| "eval_loss": 1.7918928861618042, |
| "eval_runtime": 22.0395, |
| "eval_samples_per_second": 2.269, |
| "eval_steps_per_second": 0.318, |
| "step": 190 |
| }, |
| { |
| "epoch": 27.857142857142858, |
| "grad_norm": 0.0997517928481102, |
| "learning_rate": 9.055148681119688e-06, |
| "loss": 0.0012, |
| "step": 195 |
| }, |
| { |
| "epoch": 27.857142857142858, |
| "eval_loss": 1.7959425449371338, |
| "eval_runtime": 21.9538, |
| "eval_samples_per_second": 2.278, |
| "eval_steps_per_second": 0.319, |
| "step": 195 |
| }, |
| { |
| "epoch": 28.571428571428573, |
| "grad_norm": 0.07028105854988098, |
| "learning_rate": 8.585366972390416e-06, |
| "loss": 0.0012, |
| "step": 200 |
| }, |
| { |
| "epoch": 28.571428571428573, |
| "eval_loss": 1.7992604970932007, |
| "eval_runtime": 22.074, |
| "eval_samples_per_second": 2.265, |
| "eval_steps_per_second": 0.317, |
| "step": 200 |
| }, |
| { |
| "epoch": 29.285714285714285, |
| "grad_norm": 0.05388140678405762, |
| "learning_rate": 8.118751376313666e-06, |
| "loss": 0.001, |
| "step": 205 |
| }, |
| { |
| "epoch": 29.285714285714285, |
| "eval_loss": 1.801768183708191, |
| "eval_runtime": 21.8246, |
| "eval_samples_per_second": 2.291, |
| "eval_steps_per_second": 0.321, |
| "step": 205 |
| }, |
| { |
| "epoch": 30.0, |
| "grad_norm": 0.0411839634180069, |
| "learning_rate": 7.65634623266785e-06, |
| "loss": 0.0012, |
| "step": 210 |
| }, |
| { |
| "epoch": 30.0, |
| "eval_loss": 1.8040063381195068, |
| "eval_runtime": 22.2164, |
| "eval_samples_per_second": 2.251, |
| "eval_steps_per_second": 0.315, |
| "step": 210 |
| }, |
| { |
| "epoch": 30.714285714285715, |
| "grad_norm": 0.04949762672185898, |
| "learning_rate": 7.199186457750931e-06, |
| "loss": 0.001, |
| "step": 215 |
| }, |
| { |
| "epoch": 30.714285714285715, |
| "eval_loss": 1.807271957397461, |
| "eval_runtime": 22.2074, |
| "eval_samples_per_second": 2.252, |
| "eval_steps_per_second": 0.315, |
| "step": 215 |
| }, |
| { |
| "epoch": 31.428571428571427, |
| "grad_norm": 0.041917722672224045, |
| "learning_rate": 6.748295228117765e-06, |
| "loss": 0.001, |
| "step": 220 |
| }, |
| { |
| "epoch": 31.428571428571427, |
| "eval_loss": 1.8091932535171509, |
| "eval_runtime": 22.225, |
| "eval_samples_per_second": 2.25, |
| "eval_steps_per_second": 0.315, |
| "step": 220 |
| }, |
| { |
| "epoch": 32.142857142857146, |
| "grad_norm": 0.06832437217235565, |
| "learning_rate": 6.304681690592431e-06, |
| "loss": 0.0014, |
| "step": 225 |
| }, |
| { |
| "epoch": 32.142857142857146, |
| "eval_loss": 1.8116446733474731, |
| "eval_runtime": 22.2548, |
| "eval_samples_per_second": 2.247, |
| "eval_steps_per_second": 0.315, |
| "step": 225 |
| }, |
| { |
| "epoch": 32.857142857142854, |
| "grad_norm": 0.0969175174832344, |
| "learning_rate": 5.869338703680691e-06, |
| "loss": 0.0011, |
| "step": 230 |
| }, |
| { |
| "epoch": 32.857142857142854, |
| "eval_loss": 1.8134872913360596, |
| "eval_runtime": 22.0272, |
| "eval_samples_per_second": 2.27, |
| "eval_steps_per_second": 0.318, |
| "step": 230 |
| }, |
| { |
| "epoch": 33.57142857142857, |
| "grad_norm": 0.0360889658331871, |
| "learning_rate": 5.443240615437586e-06, |
| "loss": 0.001, |
| "step": 235 |
| }, |
| { |
| "epoch": 33.57142857142857, |
| "eval_loss": 1.814117670059204, |
| "eval_runtime": 21.875, |
| "eval_samples_per_second": 2.286, |
| "eval_steps_per_second": 0.32, |
| "step": 235 |
| }, |
| { |
| "epoch": 34.285714285714285, |
| "grad_norm": 0.047800276428461075, |
| "learning_rate": 5.027341082763575e-06, |
| "loss": 0.0011, |
| "step": 240 |
| }, |
| { |
| "epoch": 34.285714285714285, |
| "eval_loss": 1.8167412281036377, |
| "eval_runtime": 21.996, |
| "eval_samples_per_second": 2.273, |
| "eval_steps_per_second": 0.318, |
| "step": 240 |
| }, |
| { |
| "epoch": 35.0, |
| "grad_norm": 0.03180164098739624, |
| "learning_rate": 4.622570937009879e-06, |
| "loss": 0.0009, |
| "step": 245 |
| }, |
| { |
| "epoch": 35.0, |
| "eval_loss": 1.8182238340377808, |
| "eval_runtime": 22.4068, |
| "eval_samples_per_second": 2.231, |
| "eval_steps_per_second": 0.312, |
| "step": 245 |
| }, |
| { |
| "epoch": 35.714285714285715, |
| "grad_norm": 0.020492179319262505, |
| "learning_rate": 4.229836100670058e-06, |
| "loss": 0.001, |
| "step": 250 |
| }, |
| { |
| "epoch": 35.714285714285715, |
| "eval_loss": 1.8190386295318604, |
| "eval_runtime": 22.0033, |
| "eval_samples_per_second": 2.272, |
| "eval_steps_per_second": 0.318, |
| "step": 250 |
| }, |
| { |
| "epoch": 36.42857142857143, |
| "grad_norm": 0.09310004115104675, |
| "learning_rate": 3.850015559820465e-06, |
| "loss": 0.0011, |
| "step": 255 |
| }, |
| { |
| "epoch": 36.42857142857143, |
| "eval_loss": 1.8204212188720703, |
| "eval_runtime": 21.9927, |
| "eval_samples_per_second": 2.273, |
| "eval_steps_per_second": 0.318, |
| "step": 255 |
| }, |
| { |
| "epoch": 37.142857142857146, |
| "grad_norm": 0.056946855038404465, |
| "learning_rate": 3.483959396847554e-06, |
| "loss": 0.0012, |
| "step": 260 |
| }, |
| { |
| "epoch": 37.142857142857146, |
| "eval_loss": 1.821565866470337, |
| "eval_runtime": 21.8674, |
| "eval_samples_per_second": 2.287, |
| "eval_steps_per_second": 0.32, |
| "step": 260 |
| }, |
| { |
| "epoch": 37.857142857142854, |
| "grad_norm": 0.04144909605383873, |
| "learning_rate": 3.132486887864992e-06, |
| "loss": 0.0009, |
| "step": 265 |
| }, |
| { |
| "epoch": 37.857142857142854, |
| "eval_loss": 1.8220727443695068, |
| "eval_runtime": 22.0788, |
| "eval_samples_per_second": 2.265, |
| "eval_steps_per_second": 0.317, |
| "step": 265 |
| }, |
| { |
| "epoch": 38.57142857142857, |
| "grad_norm": 0.020420927554368973, |
| "learning_rate": 2.7963846690787633e-06, |
| "loss": 0.001, |
| "step": 270 |
| }, |
| { |
| "epoch": 38.57142857142857, |
| "eval_loss": 1.822296380996704, |
| "eval_runtime": 22.0319, |
| "eval_samples_per_second": 2.269, |
| "eval_steps_per_second": 0.318, |
| "step": 270 |
| }, |
| { |
| "epoch": 39.285714285714285, |
| "grad_norm": 0.03683812543749809, |
| "learning_rate": 2.4764049762041874e-06, |
| "loss": 0.0013, |
| "step": 275 |
| }, |
| { |
| "epoch": 39.285714285714285, |
| "eval_loss": 1.8238246440887451, |
| "eval_runtime": 22.3005, |
| "eval_samples_per_second": 2.242, |
| "eval_steps_per_second": 0.314, |
| "step": 275 |
| }, |
| { |
| "epoch": 40.0, |
| "grad_norm": 0.07886708527803421, |
| "learning_rate": 2.1732639608752173e-06, |
| "loss": 0.0011, |
| "step": 280 |
| }, |
| { |
| "epoch": 40.0, |
| "eval_loss": 1.824698567390442, |
| "eval_runtime": 22.3675, |
| "eval_samples_per_second": 2.235, |
| "eval_steps_per_second": 0.313, |
| "step": 280 |
| }, |
| { |
| "epoch": 40.714285714285715, |
| "grad_norm": 0.04558909684419632, |
| "learning_rate": 1.8876400878140776e-06, |
| "loss": 0.0009, |
| "step": 285 |
| }, |
| { |
| "epoch": 40.714285714285715, |
| "eval_loss": 1.8251045942306519, |
| "eval_runtime": 22.0914, |
| "eval_samples_per_second": 2.263, |
| "eval_steps_per_second": 0.317, |
| "step": 285 |
| }, |
| { |
| "epoch": 41.42857142857143, |
| "grad_norm": 0.037249855697155, |
| "learning_rate": 1.6201726163485997e-06, |
| "loss": 0.0011, |
| "step": 290 |
| }, |
| { |
| "epoch": 41.42857142857143, |
| "eval_loss": 1.8252718448638916, |
| "eval_runtime": 22.0174, |
| "eval_samples_per_second": 2.271, |
| "eval_steps_per_second": 0.318, |
| "step": 290 |
| }, |
| { |
| "epoch": 42.142857142857146, |
| "grad_norm": 0.0819711983203888, |
| "learning_rate": 1.3714601696757713e-06, |
| "loss": 0.001, |
| "step": 295 |
| }, |
| { |
| "epoch": 42.142857142857146, |
| "eval_loss": 1.826216459274292, |
| "eval_runtime": 21.9895, |
| "eval_samples_per_second": 2.274, |
| "eval_steps_per_second": 0.318, |
| "step": 295 |
| }, |
| { |
| "epoch": 42.857142857142854, |
| "grad_norm": 0.04518039524555206, |
| "learning_rate": 1.1420593950736326e-06, |
| "loss": 0.001, |
| "step": 300 |
| }, |
| { |
| "epoch": 42.857142857142854, |
| "eval_loss": 1.8267167806625366, |
| "eval_runtime": 21.8488, |
| "eval_samples_per_second": 2.288, |
| "eval_steps_per_second": 0.32, |
| "step": 300 |
| }, |
| { |
| "epoch": 43.57142857142857, |
| "grad_norm": 0.03789067268371582, |
| "learning_rate": 9.324837180601743e-07, |
| "loss": 0.0011, |
| "step": 305 |
| }, |
| { |
| "epoch": 43.57142857142857, |
| "eval_loss": 1.8267269134521484, |
| "eval_runtime": 22.0372, |
| "eval_samples_per_second": 2.269, |
| "eval_steps_per_second": 0.318, |
| "step": 305 |
| }, |
| { |
| "epoch": 44.285714285714285, |
| "grad_norm": 0.07329820841550827, |
| "learning_rate": 7.432021932875222e-07, |
| "loss": 0.0012, |
| "step": 310 |
| }, |
| { |
| "epoch": 44.285714285714285, |
| "eval_loss": 1.8272424936294556, |
| "eval_runtime": 21.9242, |
| "eval_samples_per_second": 2.281, |
| "eval_steps_per_second": 0.319, |
| "step": 310 |
| }, |
| { |
| "epoch": 45.0, |
| "grad_norm": 0.0455288402736187, |
| "learning_rate": 5.746384547432738e-07, |
| "loss": 0.0009, |
| "step": 315 |
| }, |
| { |
| "epoch": 45.0, |
| "eval_loss": 1.8278112411499023, |
| "eval_runtime": 22.3633, |
| "eval_samples_per_second": 2.236, |
| "eval_steps_per_second": 0.313, |
| "step": 315 |
| }, |
| { |
| "epoch": 45.714285714285715, |
| "grad_norm": 0.05950823426246643, |
| "learning_rate": 4.2716976760853513e-07, |
| "loss": 0.0008, |
| "step": 320 |
| }, |
| { |
| "epoch": 45.714285714285715, |
| "eval_loss": 1.8275575637817383, |
| "eval_runtime": 21.8644, |
| "eval_samples_per_second": 2.287, |
| "eval_steps_per_second": 0.32, |
| "step": 320 |
| }, |
| { |
| "epoch": 46.42857142857143, |
| "grad_norm": 0.04292258992791176, |
| "learning_rate": 3.011261838947277e-07, |
| "loss": 0.0009, |
| "step": 325 |
| }, |
| { |
| "epoch": 46.42857142857143, |
| "eval_loss": 1.8281974792480469, |
| "eval_runtime": 22.2434, |
| "eval_samples_per_second": 2.248, |
| "eval_steps_per_second": 0.315, |
| "step": 325 |
| }, |
| { |
| "epoch": 47.142857142857146, |
| "grad_norm": 0.03321965038776398, |
| "learning_rate": 1.9678980374894352e-07, |
| "loss": 0.001, |
| "step": 330 |
| }, |
| { |
| "epoch": 47.142857142857146, |
| "eval_loss": 1.8281824588775635, |
| "eval_runtime": 21.8282, |
| "eval_samples_per_second": 2.291, |
| "eval_steps_per_second": 0.321, |
| "step": 330 |
| }, |
| { |
| "epoch": 47.857142857142854, |
| "grad_norm": 0.05279669910669327, |
| "learning_rate": 1.1439414408111471e-07, |
| "loss": 0.001, |
| "step": 335 |
| }, |
| { |
| "epoch": 47.857142857142854, |
| "eval_loss": 1.827906847000122, |
| "eval_runtime": 21.7868, |
| "eval_samples_per_second": 2.295, |
| "eval_steps_per_second": 0.321, |
| "step": 335 |
| }, |
| { |
| "epoch": 48.57142857142857, |
| "grad_norm": 0.03341998532414436, |
| "learning_rate": 5.412361592611382e-08, |
| "loss": 0.0008, |
| "step": 340 |
| }, |
| { |
| "epoch": 48.57142857142857, |
| "eval_loss": 1.828213095664978, |
| "eval_runtime": 21.9519, |
| "eval_samples_per_second": 2.278, |
| "eval_steps_per_second": 0.319, |
| "step": 340 |
| }, |
| { |
| "epoch": 49.285714285714285, |
| "grad_norm": 0.04219294339418411, |
| "learning_rate": 1.611311171048735e-08, |
| "loss": 0.0012, |
| "step": 345 |
| }, |
| { |
| "epoch": 49.285714285714285, |
| "eval_loss": 1.8280681371688843, |
| "eval_runtime": 22.1409, |
| "eval_samples_per_second": 2.258, |
| "eval_steps_per_second": 0.316, |
| "step": 345 |
| }, |
| { |
| "epoch": 50.0, |
| "grad_norm": 0.046254031360149384, |
| "learning_rate": 4.477033475702719e-10, |
| "loss": 0.001, |
| "step": 350 |
| }, |
| { |
| "epoch": 50.0, |
| "eval_loss": 1.8277511596679688, |
| "eval_runtime": 21.972, |
| "eval_samples_per_second": 2.276, |
| "eval_steps_per_second": 0.319, |
| "step": 350 |
| }, |
| { |
| "epoch": 50.0, |
| "step": 350, |
| "total_flos": 86960020979712.0, |
| "train_loss": 0.06868666498057012, |
| "train_runtime": 10021.0579, |
| "train_samples_per_second": 0.249, |
| "train_steps_per_second": 0.035 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 350, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 50, |
| "save_steps": 999, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 86960020979712.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|