fleet-daniel's picture
Final model from fleet-sft-overfit-github-Qwen3-32B (exit code: 0)
fa08246 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 50.0,
"eval_steps": 5,
"global_step": 350,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0,
"eval_loss": 1.468239188194275,
"eval_runtime": 18.8322,
"eval_samples_per_second": 2.655,
"eval_steps_per_second": 0.372,
"step": 0
},
{
"epoch": 0.7142857142857143,
"grad_norm": 5.422401428222656,
"learning_rate": 4.444444444444444e-06,
"loss": 1.3518,
"step": 5
},
{
"epoch": 0.7142857142857143,
"eval_loss": 1.3430589437484741,
"eval_runtime": 19.2589,
"eval_samples_per_second": 2.596,
"eval_steps_per_second": 0.363,
"step": 5
},
{
"epoch": 1.4285714285714286,
"grad_norm": 2.686959981918335,
"learning_rate": 1e-05,
"loss": 0.9051,
"step": 10
},
{
"epoch": 1.4285714285714286,
"eval_loss": 1.2059158086776733,
"eval_runtime": 20.856,
"eval_samples_per_second": 2.397,
"eval_steps_per_second": 0.336,
"step": 10
},
{
"epoch": 2.142857142857143,
"grad_norm": 1.8475016355514526,
"learning_rate": 1.555555555555556e-05,
"loss": 0.8625,
"step": 15
},
{
"epoch": 2.142857142857143,
"eval_loss": 1.127707600593567,
"eval_runtime": 20.3953,
"eval_samples_per_second": 2.452,
"eval_steps_per_second": 0.343,
"step": 15
},
{
"epoch": 2.857142857142857,
"grad_norm": 1.3544838428497314,
"learning_rate": 1.9999552296652432e-05,
"loss": 0.6971,
"step": 20
},
{
"epoch": 2.857142857142857,
"eval_loss": 1.114540696144104,
"eval_runtime": 20.5519,
"eval_samples_per_second": 2.433,
"eval_steps_per_second": 0.341,
"step": 20
},
{
"epoch": 3.571428571428571,
"grad_norm": 1.6196757555007935,
"learning_rate": 1.9983886888289515e-05,
"loss": 0.4243,
"step": 25
},
{
"epoch": 3.571428571428571,
"eval_loss": 1.1612136363983154,
"eval_runtime": 20.8803,
"eval_samples_per_second": 2.395,
"eval_steps_per_second": 0.335,
"step": 25
},
{
"epoch": 4.285714285714286,
"grad_norm": 1.041768193244934,
"learning_rate": 1.994587638407389e-05,
"loss": 0.1901,
"step": 30
},
{
"epoch": 4.285714285714286,
"eval_loss": 1.3399298191070557,
"eval_runtime": 20.3992,
"eval_samples_per_second": 2.451,
"eval_steps_per_second": 0.343,
"step": 30
},
{
"epoch": 5.0,
"grad_norm": 0.8406388163566589,
"learning_rate": 1.9885605855918887e-05,
"loss": 0.1171,
"step": 35
},
{
"epoch": 5.0,
"eval_loss": 1.3959039449691772,
"eval_runtime": 20.6168,
"eval_samples_per_second": 2.425,
"eval_steps_per_second": 0.34,
"step": 35
},
{
"epoch": 5.714285714285714,
"grad_norm": 0.7612068057060242,
"learning_rate": 1.9803210196251057e-05,
"loss": 0.0593,
"step": 40
},
{
"epoch": 5.714285714285714,
"eval_loss": 1.4992326498031616,
"eval_runtime": 21.0617,
"eval_samples_per_second": 2.374,
"eval_steps_per_second": 0.332,
"step": 40
},
{
"epoch": 6.428571428571429,
"grad_norm": 0.5397480130195618,
"learning_rate": 1.9698873816105272e-05,
"loss": 0.0265,
"step": 45
},
{
"epoch": 6.428571428571429,
"eval_loss": 1.5966981649398804,
"eval_runtime": 20.5478,
"eval_samples_per_second": 2.433,
"eval_steps_per_second": 0.341,
"step": 45
},
{
"epoch": 7.142857142857143,
"grad_norm": 0.34659039974212646,
"learning_rate": 1.9572830232391467e-05,
"loss": 0.0229,
"step": 50
},
{
"epoch": 7.142857142857143,
"eval_loss": 1.6269463300704956,
"eval_runtime": 21.143,
"eval_samples_per_second": 2.365,
"eval_steps_per_second": 0.331,
"step": 50
},
{
"epoch": 7.857142857142857,
"grad_norm": 0.5654157996177673,
"learning_rate": 1.942536154525673e-05,
"loss": 0.0167,
"step": 55
},
{
"epoch": 7.857142857142857,
"eval_loss": 1.6504554748535156,
"eval_runtime": 20.9122,
"eval_samples_per_second": 2.391,
"eval_steps_per_second": 0.335,
"step": 55
},
{
"epoch": 8.571428571428571,
"grad_norm": 0.47888287901878357,
"learning_rate": 1.9256797806712478e-05,
"loss": 0.0119,
"step": 60
},
{
"epoch": 8.571428571428571,
"eval_loss": 1.6625537872314453,
"eval_runtime": 20.8644,
"eval_samples_per_second": 2.396,
"eval_steps_per_second": 0.335,
"step": 60
},
{
"epoch": 9.285714285714286,
"grad_norm": 0.17783451080322266,
"learning_rate": 1.9067516281939826e-05,
"loss": 0.0094,
"step": 65
},
{
"epoch": 9.285714285714286,
"eval_loss": 1.6874204874038696,
"eval_runtime": 21.4952,
"eval_samples_per_second": 2.326,
"eval_steps_per_second": 0.326,
"step": 65
},
{
"epoch": 10.0,
"grad_norm": 0.3508545160293579,
"learning_rate": 1.885794060492637e-05,
"loss": 0.0089,
"step": 70
},
{
"epoch": 10.0,
"eval_loss": 1.7034786939620972,
"eval_runtime": 21.6794,
"eval_samples_per_second": 2.306,
"eval_steps_per_second": 0.323,
"step": 70
},
{
"epoch": 10.714285714285714,
"grad_norm": 0.3566827178001404,
"learning_rate": 1.862853983032423e-05,
"loss": 0.0062,
"step": 75
},
{
"epoch": 10.714285714285714,
"eval_loss": 1.7052545547485352,
"eval_runtime": 21.1384,
"eval_samples_per_second": 2.365,
"eval_steps_per_second": 0.331,
"step": 75
},
{
"epoch": 11.428571428571429,
"grad_norm": 0.20090198516845703,
"learning_rate": 1.83798273836514e-05,
"loss": 0.0072,
"step": 80
},
{
"epoch": 11.428571428571429,
"eval_loss": 1.7082165479660034,
"eval_runtime": 21.7116,
"eval_samples_per_second": 2.303,
"eval_steps_per_second": 0.322,
"step": 80
},
{
"epoch": 12.142857142857142,
"grad_norm": 0.16760864853858948,
"learning_rate": 1.8112359912185923e-05,
"loss": 0.008,
"step": 85
},
{
"epoch": 12.142857142857142,
"eval_loss": 1.6972074508666992,
"eval_runtime": 21.5935,
"eval_samples_per_second": 2.316,
"eval_steps_per_second": 0.324,
"step": 85
},
{
"epoch": 12.857142857142858,
"grad_norm": 0.1947411298751831,
"learning_rate": 1.7826736039124782e-05,
"loss": 0.006,
"step": 90
},
{
"epoch": 12.857142857142858,
"eval_loss": 1.6968964338302612,
"eval_runtime": 21.3118,
"eval_samples_per_second": 2.346,
"eval_steps_per_second": 0.328,
"step": 90
},
{
"epoch": 13.571428571428571,
"grad_norm": 0.2383287250995636,
"learning_rate": 1.7523595023795814e-05,
"loss": 0.0044,
"step": 95
},
{
"epoch": 13.571428571428571,
"eval_loss": 1.7048730850219727,
"eval_runtime": 21.8112,
"eval_samples_per_second": 2.292,
"eval_steps_per_second": 0.321,
"step": 95
},
{
"epoch": 14.285714285714286,
"grad_norm": 0.0442085787653923,
"learning_rate": 1.720361533092124e-05,
"loss": 0.0036,
"step": 100
},
{
"epoch": 14.285714285714286,
"eval_loss": 1.7229158878326416,
"eval_runtime": 21.3718,
"eval_samples_per_second": 2.34,
"eval_steps_per_second": 0.328,
"step": 100
},
{
"epoch": 15.0,
"grad_norm": 0.15548433363437653,
"learning_rate": 1.6867513112135012e-05,
"loss": 0.0035,
"step": 105
},
{
"epoch": 15.0,
"eval_loss": 1.7421244382858276,
"eval_runtime": 21.513,
"eval_samples_per_second": 2.324,
"eval_steps_per_second": 0.325,
"step": 105
},
{
"epoch": 15.714285714285714,
"grad_norm": 0.14688356220722198,
"learning_rate": 1.6516040603152448e-05,
"loss": 0.0026,
"step": 110
},
{
"epoch": 15.714285714285714,
"eval_loss": 1.7549571990966797,
"eval_runtime": 21.852,
"eval_samples_per_second": 2.288,
"eval_steps_per_second": 0.32,
"step": 110
},
{
"epoch": 16.428571428571427,
"grad_norm": 0.11230692267417908,
"learning_rate": 1.614998444017954e-05,
"loss": 0.0023,
"step": 115
},
{
"epoch": 16.428571428571427,
"eval_loss": 1.7617460489273071,
"eval_runtime": 21.5895,
"eval_samples_per_second": 2.316,
"eval_steps_per_second": 0.324,
"step": 115
},
{
"epoch": 17.142857142857142,
"grad_norm": 0.05092110484838486,
"learning_rate": 1.5770163899329943e-05,
"loss": 0.0021,
"step": 120
},
{
"epoch": 17.142857142857142,
"eval_loss": 1.7655525207519531,
"eval_runtime": 21.8701,
"eval_samples_per_second": 2.286,
"eval_steps_per_second": 0.32,
"step": 120
},
{
"epoch": 17.857142857142858,
"grad_norm": 0.11048714816570282,
"learning_rate": 1.5377429062990122e-05,
"loss": 0.0025,
"step": 125
},
{
"epoch": 17.857142857142858,
"eval_loss": 1.768291711807251,
"eval_runtime": 21.9645,
"eval_samples_per_second": 2.276,
"eval_steps_per_second": 0.319,
"step": 125
},
{
"epoch": 18.571428571428573,
"grad_norm": 0.07220487296581268,
"learning_rate": 1.497265891723643e-05,
"loss": 0.0018,
"step": 130
},
{
"epoch": 18.571428571428573,
"eval_loss": 1.7749742269515991,
"eval_runtime": 21.6791,
"eval_samples_per_second": 2.306,
"eval_steps_per_second": 0.323,
"step": 130
},
{
"epoch": 19.285714285714285,
"grad_norm": 0.09813550859689713,
"learning_rate": 1.4556759384562418e-05,
"loss": 0.002,
"step": 135
},
{
"epoch": 19.285714285714285,
"eval_loss": 1.7667394876480103,
"eval_runtime": 22.2338,
"eval_samples_per_second": 2.249,
"eval_steps_per_second": 0.315,
"step": 135
},
{
"epoch": 20.0,
"grad_norm": 0.06082721799612045,
"learning_rate": 1.4130661296319313e-05,
"loss": 0.0036,
"step": 140
},
{
"epoch": 20.0,
"eval_loss": 1.7492411136627197,
"eval_runtime": 21.9099,
"eval_samples_per_second": 2.282,
"eval_steps_per_second": 0.319,
"step": 140
},
{
"epoch": 20.714285714285715,
"grad_norm": 0.19404393434524536,
"learning_rate": 1.369531830940757e-05,
"loss": 0.0025,
"step": 145
},
{
"epoch": 20.714285714285715,
"eval_loss": 1.7377521991729736,
"eval_runtime": 21.7672,
"eval_samples_per_second": 2.297,
"eval_steps_per_second": 0.322,
"step": 145
},
{
"epoch": 21.428571428571427,
"grad_norm": 0.05864373594522476,
"learning_rate": 1.325170477188224e-05,
"loss": 0.0017,
"step": 150
},
{
"epoch": 21.428571428571427,
"eval_loss": 1.7388665676116943,
"eval_runtime": 22.2013,
"eval_samples_per_second": 2.252,
"eval_steps_per_second": 0.315,
"step": 150
},
{
"epoch": 22.142857142857142,
"grad_norm": 0.04350695759057999,
"learning_rate": 1.2800813542249073e-05,
"loss": 0.0016,
"step": 155
},
{
"epoch": 22.142857142857142,
"eval_loss": 1.7510356903076172,
"eval_runtime": 22.0791,
"eval_samples_per_second": 2.265,
"eval_steps_per_second": 0.317,
"step": 155
},
{
"epoch": 22.857142857142858,
"grad_norm": 0.09723013639450073,
"learning_rate": 1.234365376733215e-05,
"loss": 0.0016,
"step": 160
},
{
"epoch": 22.857142857142858,
"eval_loss": 1.7622946500778198,
"eval_runtime": 22.0297,
"eval_samples_per_second": 2.27,
"eval_steps_per_second": 0.318,
"step": 160
},
{
"epoch": 23.571428571428573,
"grad_norm": 0.046282168477773666,
"learning_rate": 1.188124862368634e-05,
"loss": 0.0014,
"step": 165
},
{
"epoch": 23.571428571428573,
"eval_loss": 1.770511507987976,
"eval_runtime": 21.9343,
"eval_samples_per_second": 2.28,
"eval_steps_per_second": 0.319,
"step": 165
},
{
"epoch": 24.285714285714285,
"grad_norm": 0.03766458481550217,
"learning_rate": 1.1414633027609585e-05,
"loss": 0.0013,
"step": 170
},
{
"epoch": 24.285714285714285,
"eval_loss": 1.7750704288482666,
"eval_runtime": 22.0134,
"eval_samples_per_second": 2.271,
"eval_steps_per_second": 0.318,
"step": 170
},
{
"epoch": 25.0,
"grad_norm": 0.14919541776180267,
"learning_rate": 1.0944851318880314e-05,
"loss": 0.0015,
"step": 175
},
{
"epoch": 25.0,
"eval_loss": 1.7801543474197388,
"eval_runtime": 22.0082,
"eval_samples_per_second": 2.272,
"eval_steps_per_second": 0.318,
"step": 175
},
{
"epoch": 25.714285714285715,
"grad_norm": 0.01755683310329914,
"learning_rate": 1.047295492340397e-05,
"loss": 0.0011,
"step": 180
},
{
"epoch": 25.714285714285715,
"eval_loss": 1.7830265760421753,
"eval_runtime": 22.0185,
"eval_samples_per_second": 2.271,
"eval_steps_per_second": 0.318,
"step": 180
},
{
"epoch": 26.428571428571427,
"grad_norm": 0.051639165729284286,
"learning_rate": 1e-05,
"loss": 0.0012,
"step": 185
},
{
"epoch": 26.428571428571427,
"eval_loss": 1.7872556447982788,
"eval_runtime": 22.2185,
"eval_samples_per_second": 2.25,
"eval_steps_per_second": 0.315,
"step": 185
},
{
"epoch": 27.142857142857142,
"grad_norm": 0.05077521875500679,
"learning_rate": 9.527045076596036e-06,
"loss": 0.0011,
"step": 190
},
{
"epoch": 27.142857142857142,
"eval_loss": 1.7918928861618042,
"eval_runtime": 22.0395,
"eval_samples_per_second": 2.269,
"eval_steps_per_second": 0.318,
"step": 190
},
{
"epoch": 27.857142857142858,
"grad_norm": 0.0997517928481102,
"learning_rate": 9.055148681119688e-06,
"loss": 0.0012,
"step": 195
},
{
"epoch": 27.857142857142858,
"eval_loss": 1.7959425449371338,
"eval_runtime": 21.9538,
"eval_samples_per_second": 2.278,
"eval_steps_per_second": 0.319,
"step": 195
},
{
"epoch": 28.571428571428573,
"grad_norm": 0.07028105854988098,
"learning_rate": 8.585366972390416e-06,
"loss": 0.0012,
"step": 200
},
{
"epoch": 28.571428571428573,
"eval_loss": 1.7992604970932007,
"eval_runtime": 22.074,
"eval_samples_per_second": 2.265,
"eval_steps_per_second": 0.317,
"step": 200
},
{
"epoch": 29.285714285714285,
"grad_norm": 0.05388140678405762,
"learning_rate": 8.118751376313666e-06,
"loss": 0.001,
"step": 205
},
{
"epoch": 29.285714285714285,
"eval_loss": 1.801768183708191,
"eval_runtime": 21.8246,
"eval_samples_per_second": 2.291,
"eval_steps_per_second": 0.321,
"step": 205
},
{
"epoch": 30.0,
"grad_norm": 0.0411839634180069,
"learning_rate": 7.65634623266785e-06,
"loss": 0.0012,
"step": 210
},
{
"epoch": 30.0,
"eval_loss": 1.8040063381195068,
"eval_runtime": 22.2164,
"eval_samples_per_second": 2.251,
"eval_steps_per_second": 0.315,
"step": 210
},
{
"epoch": 30.714285714285715,
"grad_norm": 0.04949762672185898,
"learning_rate": 7.199186457750931e-06,
"loss": 0.001,
"step": 215
},
{
"epoch": 30.714285714285715,
"eval_loss": 1.807271957397461,
"eval_runtime": 22.2074,
"eval_samples_per_second": 2.252,
"eval_steps_per_second": 0.315,
"step": 215
},
{
"epoch": 31.428571428571427,
"grad_norm": 0.041917722672224045,
"learning_rate": 6.748295228117765e-06,
"loss": 0.001,
"step": 220
},
{
"epoch": 31.428571428571427,
"eval_loss": 1.8091932535171509,
"eval_runtime": 22.225,
"eval_samples_per_second": 2.25,
"eval_steps_per_second": 0.315,
"step": 220
},
{
"epoch": 32.142857142857146,
"grad_norm": 0.06832437217235565,
"learning_rate": 6.304681690592431e-06,
"loss": 0.0014,
"step": 225
},
{
"epoch": 32.142857142857146,
"eval_loss": 1.8116446733474731,
"eval_runtime": 22.2548,
"eval_samples_per_second": 2.247,
"eval_steps_per_second": 0.315,
"step": 225
},
{
"epoch": 32.857142857142854,
"grad_norm": 0.0969175174832344,
"learning_rate": 5.869338703680691e-06,
"loss": 0.0011,
"step": 230
},
{
"epoch": 32.857142857142854,
"eval_loss": 1.8134872913360596,
"eval_runtime": 22.0272,
"eval_samples_per_second": 2.27,
"eval_steps_per_second": 0.318,
"step": 230
},
{
"epoch": 33.57142857142857,
"grad_norm": 0.0360889658331871,
"learning_rate": 5.443240615437586e-06,
"loss": 0.001,
"step": 235
},
{
"epoch": 33.57142857142857,
"eval_loss": 1.814117670059204,
"eval_runtime": 21.875,
"eval_samples_per_second": 2.286,
"eval_steps_per_second": 0.32,
"step": 235
},
{
"epoch": 34.285714285714285,
"grad_norm": 0.047800276428461075,
"learning_rate": 5.027341082763575e-06,
"loss": 0.0011,
"step": 240
},
{
"epoch": 34.285714285714285,
"eval_loss": 1.8167412281036377,
"eval_runtime": 21.996,
"eval_samples_per_second": 2.273,
"eval_steps_per_second": 0.318,
"step": 240
},
{
"epoch": 35.0,
"grad_norm": 0.03180164098739624,
"learning_rate": 4.622570937009879e-06,
"loss": 0.0009,
"step": 245
},
{
"epoch": 35.0,
"eval_loss": 1.8182238340377808,
"eval_runtime": 22.4068,
"eval_samples_per_second": 2.231,
"eval_steps_per_second": 0.312,
"step": 245
},
{
"epoch": 35.714285714285715,
"grad_norm": 0.020492179319262505,
"learning_rate": 4.229836100670058e-06,
"loss": 0.001,
"step": 250
},
{
"epoch": 35.714285714285715,
"eval_loss": 1.8190386295318604,
"eval_runtime": 22.0033,
"eval_samples_per_second": 2.272,
"eval_steps_per_second": 0.318,
"step": 250
},
{
"epoch": 36.42857142857143,
"grad_norm": 0.09310004115104675,
"learning_rate": 3.850015559820465e-06,
"loss": 0.0011,
"step": 255
},
{
"epoch": 36.42857142857143,
"eval_loss": 1.8204212188720703,
"eval_runtime": 21.9927,
"eval_samples_per_second": 2.273,
"eval_steps_per_second": 0.318,
"step": 255
},
{
"epoch": 37.142857142857146,
"grad_norm": 0.056946855038404465,
"learning_rate": 3.483959396847554e-06,
"loss": 0.0012,
"step": 260
},
{
"epoch": 37.142857142857146,
"eval_loss": 1.821565866470337,
"eval_runtime": 21.8674,
"eval_samples_per_second": 2.287,
"eval_steps_per_second": 0.32,
"step": 260
},
{
"epoch": 37.857142857142854,
"grad_norm": 0.04144909605383873,
"learning_rate": 3.132486887864992e-06,
"loss": 0.0009,
"step": 265
},
{
"epoch": 37.857142857142854,
"eval_loss": 1.8220727443695068,
"eval_runtime": 22.0788,
"eval_samples_per_second": 2.265,
"eval_steps_per_second": 0.317,
"step": 265
},
{
"epoch": 38.57142857142857,
"grad_norm": 0.020420927554368973,
"learning_rate": 2.7963846690787633e-06,
"loss": 0.001,
"step": 270
},
{
"epoch": 38.57142857142857,
"eval_loss": 1.822296380996704,
"eval_runtime": 22.0319,
"eval_samples_per_second": 2.269,
"eval_steps_per_second": 0.318,
"step": 270
},
{
"epoch": 39.285714285714285,
"grad_norm": 0.03683812543749809,
"learning_rate": 2.4764049762041874e-06,
"loss": 0.0013,
"step": 275
},
{
"epoch": 39.285714285714285,
"eval_loss": 1.8238246440887451,
"eval_runtime": 22.3005,
"eval_samples_per_second": 2.242,
"eval_steps_per_second": 0.314,
"step": 275
},
{
"epoch": 40.0,
"grad_norm": 0.07886708527803421,
"learning_rate": 2.1732639608752173e-06,
"loss": 0.0011,
"step": 280
},
{
"epoch": 40.0,
"eval_loss": 1.824698567390442,
"eval_runtime": 22.3675,
"eval_samples_per_second": 2.235,
"eval_steps_per_second": 0.313,
"step": 280
},
{
"epoch": 40.714285714285715,
"grad_norm": 0.04558909684419632,
"learning_rate": 1.8876400878140776e-06,
"loss": 0.0009,
"step": 285
},
{
"epoch": 40.714285714285715,
"eval_loss": 1.8251045942306519,
"eval_runtime": 22.0914,
"eval_samples_per_second": 2.263,
"eval_steps_per_second": 0.317,
"step": 285
},
{
"epoch": 41.42857142857143,
"grad_norm": 0.037249855697155,
"learning_rate": 1.6201726163485997e-06,
"loss": 0.0011,
"step": 290
},
{
"epoch": 41.42857142857143,
"eval_loss": 1.8252718448638916,
"eval_runtime": 22.0174,
"eval_samples_per_second": 2.271,
"eval_steps_per_second": 0.318,
"step": 290
},
{
"epoch": 42.142857142857146,
"grad_norm": 0.0819711983203888,
"learning_rate": 1.3714601696757713e-06,
"loss": 0.001,
"step": 295
},
{
"epoch": 42.142857142857146,
"eval_loss": 1.826216459274292,
"eval_runtime": 21.9895,
"eval_samples_per_second": 2.274,
"eval_steps_per_second": 0.318,
"step": 295
},
{
"epoch": 42.857142857142854,
"grad_norm": 0.04518039524555206,
"learning_rate": 1.1420593950736326e-06,
"loss": 0.001,
"step": 300
},
{
"epoch": 42.857142857142854,
"eval_loss": 1.8267167806625366,
"eval_runtime": 21.8488,
"eval_samples_per_second": 2.288,
"eval_steps_per_second": 0.32,
"step": 300
},
{
"epoch": 43.57142857142857,
"grad_norm": 0.03789067268371582,
"learning_rate": 9.324837180601743e-07,
"loss": 0.0011,
"step": 305
},
{
"epoch": 43.57142857142857,
"eval_loss": 1.8267269134521484,
"eval_runtime": 22.0372,
"eval_samples_per_second": 2.269,
"eval_steps_per_second": 0.318,
"step": 305
},
{
"epoch": 44.285714285714285,
"grad_norm": 0.07329820841550827,
"learning_rate": 7.432021932875222e-07,
"loss": 0.0012,
"step": 310
},
{
"epoch": 44.285714285714285,
"eval_loss": 1.8272424936294556,
"eval_runtime": 21.9242,
"eval_samples_per_second": 2.281,
"eval_steps_per_second": 0.319,
"step": 310
},
{
"epoch": 45.0,
"grad_norm": 0.0455288402736187,
"learning_rate": 5.746384547432738e-07,
"loss": 0.0009,
"step": 315
},
{
"epoch": 45.0,
"eval_loss": 1.8278112411499023,
"eval_runtime": 22.3633,
"eval_samples_per_second": 2.236,
"eval_steps_per_second": 0.313,
"step": 315
},
{
"epoch": 45.714285714285715,
"grad_norm": 0.05950823426246643,
"learning_rate": 4.2716976760853513e-07,
"loss": 0.0008,
"step": 320
},
{
"epoch": 45.714285714285715,
"eval_loss": 1.8275575637817383,
"eval_runtime": 21.8644,
"eval_samples_per_second": 2.287,
"eval_steps_per_second": 0.32,
"step": 320
},
{
"epoch": 46.42857142857143,
"grad_norm": 0.04292258992791176,
"learning_rate": 3.011261838947277e-07,
"loss": 0.0009,
"step": 325
},
{
"epoch": 46.42857142857143,
"eval_loss": 1.8281974792480469,
"eval_runtime": 22.2434,
"eval_samples_per_second": 2.248,
"eval_steps_per_second": 0.315,
"step": 325
},
{
"epoch": 47.142857142857146,
"grad_norm": 0.03321965038776398,
"learning_rate": 1.9678980374894352e-07,
"loss": 0.001,
"step": 330
},
{
"epoch": 47.142857142857146,
"eval_loss": 1.8281824588775635,
"eval_runtime": 21.8282,
"eval_samples_per_second": 2.291,
"eval_steps_per_second": 0.321,
"step": 330
},
{
"epoch": 47.857142857142854,
"grad_norm": 0.05279669910669327,
"learning_rate": 1.1439414408111471e-07,
"loss": 0.001,
"step": 335
},
{
"epoch": 47.857142857142854,
"eval_loss": 1.827906847000122,
"eval_runtime": 21.7868,
"eval_samples_per_second": 2.295,
"eval_steps_per_second": 0.321,
"step": 335
},
{
"epoch": 48.57142857142857,
"grad_norm": 0.03341998532414436,
"learning_rate": 5.412361592611382e-08,
"loss": 0.0008,
"step": 340
},
{
"epoch": 48.57142857142857,
"eval_loss": 1.828213095664978,
"eval_runtime": 21.9519,
"eval_samples_per_second": 2.278,
"eval_steps_per_second": 0.319,
"step": 340
},
{
"epoch": 49.285714285714285,
"grad_norm": 0.04219294339418411,
"learning_rate": 1.611311171048735e-08,
"loss": 0.0012,
"step": 345
},
{
"epoch": 49.285714285714285,
"eval_loss": 1.8280681371688843,
"eval_runtime": 22.1409,
"eval_samples_per_second": 2.258,
"eval_steps_per_second": 0.316,
"step": 345
},
{
"epoch": 50.0,
"grad_norm": 0.046254031360149384,
"learning_rate": 4.477033475702719e-10,
"loss": 0.001,
"step": 350
},
{
"epoch": 50.0,
"eval_loss": 1.8277511596679688,
"eval_runtime": 21.972,
"eval_samples_per_second": 2.276,
"eval_steps_per_second": 0.319,
"step": 350
},
{
"epoch": 50.0,
"step": 350,
"total_flos": 86960020979712.0,
"train_loss": 0.06868666498057012,
"train_runtime": 10021.0579,
"train_samples_per_second": 0.249,
"train_steps_per_second": 0.035
}
],
"logging_steps": 5,
"max_steps": 350,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 86960020979712.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}