LamaDiab's picture
Training checkpoint - Epoch 2, Step 47030
fba5a2c verified
Invalid JSON: Unexpected token 'N', ..."ad_norm": NaN, "... is not valid JSON
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 47030,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 4.2526047203912395e-05,
"grad_norm": null,
"learning_rate": 0.0,
"loss": 16.5747,
"step": 1
},
{
"epoch": 0.0212630236019562,
"grad_norm": 72.24292755126953,
"learning_rate": 2.4719332578020393e-08,
"loss": 16.0022,
"step": 500
},
{
"epoch": 0.0212630236019562,
"eval_cosine_accuracy": 0.9308079481124878,
"eval_loss": 11.834263801574707,
"eval_runtime": 49.7307,
"eval_samples_per_second": 380.208,
"eval_steps_per_second": 1.488,
"step": 500
},
{
"epoch": 0.0425260472039124,
"grad_norm": 90.77424621582031,
"learning_rate": 4.963801461231515e-08,
"loss": 15.9413,
"step": 1000
},
{
"epoch": 0.0425260472039124,
"eval_cosine_accuracy": 0.9313969612121582,
"eval_loss": 11.764849662780762,
"eval_runtime": 50.6172,
"eval_samples_per_second": 373.549,
"eval_steps_per_second": 1.462,
"step": 1000
},
{
"epoch": 0.0637890708058686,
"grad_norm": 61.71802520751953,
"learning_rate": 7.45566966466099e-08,
"loss": 15.8159,
"step": 1500
},
{
"epoch": 0.0637890708058686,
"eval_cosine_accuracy": 0.9325751066207886,
"eval_loss": 11.667285919189453,
"eval_runtime": 50.0024,
"eval_samples_per_second": 378.142,
"eval_steps_per_second": 1.48,
"step": 1500
},
{
"epoch": 0.0850520944078248,
"grad_norm": 70.19003295898438,
"learning_rate": 9.947537868090464e-08,
"loss": 15.6699,
"step": 2000
},
{
"epoch": 0.0850520944078248,
"eval_cosine_accuracy": 0.934478223323822,
"eval_loss": 11.543947219848633,
"eval_runtime": 50.7737,
"eval_samples_per_second": 372.397,
"eval_steps_per_second": 1.457,
"step": 2000
},
{
"epoch": 0.106315118009781,
"grad_norm": 88.77647399902344,
"learning_rate": 1.2434422335113082e-07,
"loss": 15.3678,
"step": 2500
},
{
"epoch": 0.106315118009781,
"eval_cosine_accuracy": 0.9360641837120056,
"eval_loss": 11.422866821289062,
"eval_runtime": 49.9051,
"eval_samples_per_second": 378.879,
"eval_steps_per_second": 1.483,
"step": 2500
},
{
"epoch": 0.1275781416117372,
"grad_norm": 60.29121398925781,
"learning_rate": 1.4926290538542556e-07,
"loss": 15.0132,
"step": 3000
},
{
"epoch": 0.1275781416117372,
"eval_cosine_accuracy": 0.9384657144546509,
"eval_loss": 11.268472671508789,
"eval_runtime": 50.5525,
"eval_samples_per_second": 374.027,
"eval_steps_per_second": 1.464,
"step": 3000
},
{
"epoch": 0.1488411652136934,
"grad_norm": 58.82876968383789,
"learning_rate": 1.741815874197203e-07,
"loss": 14.5215,
"step": 3500
},
{
"epoch": 0.1488411652136934,
"eval_cosine_accuracy": 0.941592276096344,
"eval_loss": 11.027884483337402,
"eval_runtime": 49.4569,
"eval_samples_per_second": 382.313,
"eval_steps_per_second": 1.496,
"step": 3500
},
{
"epoch": 0.1701041888156496,
"grad_norm": 86.84857177734375,
"learning_rate": 1.9910026945401506e-07,
"loss": 14.0143,
"step": 4000
},
{
"epoch": 0.1701041888156496,
"eval_cosine_accuracy": 0.9445375800132751,
"eval_loss": 10.77338981628418,
"eval_runtime": 49.47,
"eval_samples_per_second": 382.212,
"eval_steps_per_second": 1.496,
"step": 4000
},
{
"epoch": 0.1913672124176058,
"grad_norm": 50.29563903808594,
"learning_rate": 2.2396911412424123e-07,
"loss": 13.3569,
"step": 4500
},
{
"epoch": 0.1913672124176058,
"eval_cosine_accuracy": 0.9471657276153564,
"eval_loss": 10.502735137939453,
"eval_runtime": 50.4762,
"eval_samples_per_second": 374.592,
"eval_steps_per_second": 1.466,
"step": 4500
},
{
"epoch": 0.212630236019562,
"grad_norm": 48.2503776550293,
"learning_rate": 2.48887796158536e-07,
"loss": 12.5679,
"step": 5000
},
{
"epoch": 0.212630236019562,
"eval_cosine_accuracy": 0.9499297738075256,
"eval_loss": 10.223851203918457,
"eval_runtime": 49.7554,
"eval_samples_per_second": 380.019,
"eval_steps_per_second": 1.487,
"step": 5000
},
{
"epoch": 0.2338932596215182,
"grad_norm": 85.99883270263672,
"learning_rate": 2.7380647819283073e-07,
"loss": 11.7751,
"step": 5500
},
{
"epoch": 0.2338932596215182,
"eval_cosine_accuracy": 0.9533734917640686,
"eval_loss": 9.879401206970215,
"eval_runtime": 50.9383,
"eval_samples_per_second": 371.194,
"eval_steps_per_second": 1.453,
"step": 5500
},
{
"epoch": 0.2551562832234744,
"grad_norm": 30.850263595581055,
"learning_rate": 2.9872516022712553e-07,
"loss": 10.924,
"step": 6000
},
{
"epoch": 0.2551562832234744,
"eval_cosine_accuracy": 0.9558656811714172,
"eval_loss": 9.583606719970703,
"eval_runtime": 50.1057,
"eval_samples_per_second": 377.363,
"eval_steps_per_second": 1.477,
"step": 6000
},
{
"epoch": 0.2764193068254306,
"grad_norm": 27.160547256469727,
"learning_rate": 3.2359400489735165e-07,
"loss": 10.207,
"step": 6500
},
{
"epoch": 0.2764193068254306,
"eval_cosine_accuracy": 0.9583125710487366,
"eval_loss": 9.295426368713379,
"eval_runtime": 50.8853,
"eval_samples_per_second": 371.58,
"eval_steps_per_second": 1.454,
"step": 6500
},
{
"epoch": 0.2976823304273868,
"grad_norm": 14.218546867370605,
"learning_rate": 3.4851268693164645e-07,
"loss": 9.6073,
"step": 7000
},
{
"epoch": 0.2976823304273868,
"eval_cosine_accuracy": 0.9617562890052795,
"eval_loss": 9.070219993591309,
"eval_runtime": 49.4097,
"eval_samples_per_second": 382.678,
"eval_steps_per_second": 1.498,
"step": 7000
},
{
"epoch": 0.318945354029343,
"grad_norm": 20.438467025756836,
"learning_rate": 3.7343136896594114e-07,
"loss": 9.3031,
"step": 7500
},
{
"epoch": 0.318945354029343,
"eval_cosine_accuracy": 0.9646109938621521,
"eval_loss": 8.921951293945312,
"eval_runtime": 50.9982,
"eval_samples_per_second": 370.758,
"eval_steps_per_second": 1.451,
"step": 7500
},
{
"epoch": 0.3402083776312992,
"grad_norm": 18.741724014282227,
"learning_rate": 3.983500510002359e-07,
"loss": 9.1113,
"step": 8000
},
{
"epoch": 0.3402083776312992,
"eval_cosine_accuracy": 0.9665141105651855,
"eval_loss": 8.841903686523438,
"eval_runtime": 49.583,
"eval_samples_per_second": 381.34,
"eval_steps_per_second": 1.492,
"step": 8000
},
{
"epoch": 0.3614714012332554,
"grad_norm": 14.93420696258545,
"learning_rate": 4.2321889567046206e-07,
"loss": 8.9949,
"step": 8500
},
{
"epoch": 0.3614714012332554,
"eval_cosine_accuracy": 0.9681000709533691,
"eval_loss": 8.776097297668457,
"eval_runtime": 50.3732,
"eval_samples_per_second": 375.359,
"eval_steps_per_second": 1.469,
"step": 8500
},
{
"epoch": 0.3827344248352116,
"grad_norm": 12.44970417022705,
"learning_rate": 4.481375777047568e-07,
"loss": 8.9394,
"step": 9000
},
{
"epoch": 0.3827344248352116,
"eval_cosine_accuracy": 0.9681000709533691,
"eval_loss": 8.712888717651367,
"eval_runtime": 49.3318,
"eval_samples_per_second": 383.282,
"eval_steps_per_second": 1.5,
"step": 9000
},
{
"epoch": 0.4039974484371678,
"grad_norm": 11.820176124572754,
"learning_rate": 4.7305625973905156e-07,
"loss": 8.8496,
"step": 9500
},
{
"epoch": 0.4039974484371678,
"eval_cosine_accuracy": 0.968235969543457,
"eval_loss": 8.671957015991211,
"eval_runtime": 50.445,
"eval_samples_per_second": 374.824,
"eval_steps_per_second": 1.467,
"step": 9500
},
{
"epoch": 0.425260472039124,
"grad_norm": 19.25648307800293,
"learning_rate": 4.979251044092777e-07,
"loss": 8.8057,
"step": 10000
},
{
"epoch": 0.425260472039124,
"eval_cosine_accuracy": 0.9679641127586365,
"eval_loss": 8.636693000793457,
"eval_runtime": 49.6391,
"eval_samples_per_second": 380.909,
"eval_steps_per_second": 1.491,
"step": 10000
},
{
"epoch": 0.4465234956410802,
"grad_norm": 11.33238697052002,
"learning_rate": 5.228437864435725e-07,
"loss": 8.739,
"step": 10500
},
{
"epoch": 0.4465234956410802,
"eval_cosine_accuracy": 0.9684625267982483,
"eval_loss": 8.61970329284668,
"eval_runtime": 50.2316,
"eval_samples_per_second": 376.416,
"eval_steps_per_second": 1.473,
"step": 10500
},
{
"epoch": 0.4677865192430364,
"grad_norm": 12.054763793945312,
"learning_rate": 5.477624684778672e-07,
"loss": 8.6886,
"step": 11000
},
{
"epoch": 0.4677865192430364,
"eval_cosine_accuracy": 0.968598484992981,
"eval_loss": 8.59864616394043,
"eval_runtime": 49.6249,
"eval_samples_per_second": 381.018,
"eval_steps_per_second": 1.491,
"step": 11000
},
{
"epoch": 0.4890495428449926,
"grad_norm": 12.590415000915527,
"learning_rate": 5.72681150512162e-07,
"loss": 8.6431,
"step": 11500
},
{
"epoch": 0.4890495428449926,
"eval_cosine_accuracy": 0.9678281545639038,
"eval_loss": 8.581836700439453,
"eval_runtime": 50.2665,
"eval_samples_per_second": 376.155,
"eval_steps_per_second": 1.472,
"step": 11500
},
{
"epoch": 0.5103125664469488,
"grad_norm": 15.370965957641602,
"learning_rate": 5.975998325464567e-07,
"loss": 8.6208,
"step": 12000
},
{
"epoch": 0.5103125664469488,
"eval_cosine_accuracy": 0.968598484992981,
"eval_loss": 8.53200912475586,
"eval_runtime": 49.502,
"eval_samples_per_second": 381.964,
"eval_steps_per_second": 1.495,
"step": 12000
},
{
"epoch": 0.531575590048905,
"grad_norm": 10.931774139404297,
"learning_rate": 6.225185145807515e-07,
"loss": 8.5638,
"step": 12500
},
{
"epoch": 0.531575590048905,
"eval_cosine_accuracy": 0.9685078859329224,
"eval_loss": 8.522621154785156,
"eval_runtime": 50.2178,
"eval_samples_per_second": 376.52,
"eval_steps_per_second": 1.474,
"step": 12500
},
{
"epoch": 0.5528386136508612,
"grad_norm": 13.296648025512695,
"learning_rate": 6.474371966150463e-07,
"loss": 8.539,
"step": 13000
},
{
"epoch": 0.5528386136508612,
"eval_cosine_accuracy": 0.9688250422477722,
"eval_loss": 8.52730941772461,
"eval_runtime": 49.7781,
"eval_samples_per_second": 379.845,
"eval_steps_per_second": 1.487,
"step": 13000
},
{
"epoch": 0.5741016372528174,
"grad_norm": 13.039021492004395,
"learning_rate": 6.72355878649341e-07,
"loss": 8.4952,
"step": 13500
},
{
"epoch": 0.5741016372528174,
"eval_cosine_accuracy": 0.9687797427177429,
"eval_loss": 8.488916397094727,
"eval_runtime": 50.244,
"eval_samples_per_second": 376.324,
"eval_steps_per_second": 1.473,
"step": 13500
},
{
"epoch": 0.5953646608547736,
"grad_norm": 14.663399696350098,
"learning_rate": 6.972745606836358e-07,
"loss": 8.437,
"step": 14000
},
{
"epoch": 0.5953646608547736,
"eval_cosine_accuracy": 0.9692781567573547,
"eval_loss": 8.479202270507812,
"eval_runtime": 49.8834,
"eval_samples_per_second": 379.044,
"eval_steps_per_second": 1.483,
"step": 14000
},
{
"epoch": 0.6166276844567298,
"grad_norm": 14.649970054626465,
"learning_rate": 7.221932427179305e-07,
"loss": 8.4223,
"step": 14500
},
{
"epoch": 0.6166276844567298,
"eval_cosine_accuracy": 0.9690062999725342,
"eval_loss": 8.463949203491211,
"eval_runtime": 50.2893,
"eval_samples_per_second": 375.985,
"eval_steps_per_second": 1.471,
"step": 14500
},
{
"epoch": 0.637890708058686,
"grad_norm": 16.208450317382812,
"learning_rate": 7.471119247522253e-07,
"loss": 8.367,
"step": 15000
},
{
"epoch": 0.637890708058686,
"eval_cosine_accuracy": 0.9686437845230103,
"eval_loss": 8.437935829162598,
"eval_runtime": 49.4396,
"eval_samples_per_second": 382.446,
"eval_steps_per_second": 1.497,
"step": 15000
},
{
"epoch": 0.6591537316606422,
"grad_norm": 17.769954681396484,
"learning_rate": 7.7203060678652e-07,
"loss": 8.3536,
"step": 15500
},
{
"epoch": 0.6591537316606422,
"eval_cosine_accuracy": 0.9689610004425049,
"eval_loss": 8.473112106323242,
"eval_runtime": 50.2825,
"eval_samples_per_second": 376.035,
"eval_steps_per_second": 1.472,
"step": 15500
},
{
"epoch": 0.6804167552625984,
"grad_norm": 17.118555068969727,
"learning_rate": 7.968994514567462e-07,
"loss": 8.3289,
"step": 16000
},
{
"epoch": 0.6804167552625984,
"eval_cosine_accuracy": 0.9688703417778015,
"eval_loss": 8.445318222045898,
"eval_runtime": 49.5183,
"eval_samples_per_second": 381.838,
"eval_steps_per_second": 1.494,
"step": 16000
},
{
"epoch": 0.7016797788645546,
"grad_norm": 14.455836296081543,
"learning_rate": 8.218181334910409e-07,
"loss": 8.3108,
"step": 16500
},
{
"epoch": 0.7016797788645546,
"eval_cosine_accuracy": 0.9694594144821167,
"eval_loss": 8.421878814697266,
"eval_runtime": 50.3275,
"eval_samples_per_second": 375.699,
"eval_steps_per_second": 1.47,
"step": 16500
},
{
"epoch": 0.7229428024665108,
"grad_norm": 13.595857620239258,
"learning_rate": 8.467368155253357e-07,
"loss": 8.2663,
"step": 17000
},
{
"epoch": 0.7229428024665108,
"eval_cosine_accuracy": 0.9686437845230103,
"eval_loss": 8.462857246398926,
"eval_runtime": 49.5482,
"eval_samples_per_second": 381.608,
"eval_steps_per_second": 1.493,
"step": 17000
},
{
"epoch": 0.744205826068467,
"grad_norm": 14.019209861755371,
"learning_rate": 8.716056601955618e-07,
"loss": 8.2225,
"step": 17500
},
{
"epoch": 0.744205826068467,
"eval_cosine_accuracy": 0.9691875576972961,
"eval_loss": 8.401082038879395,
"eval_runtime": 50.4644,
"eval_samples_per_second": 374.68,
"eval_steps_per_second": 1.466,
"step": 17500
},
{
"epoch": 0.7654688496704232,
"grad_norm": 13.743831634521484,
"learning_rate": 8.965243422298566e-07,
"loss": 8.2052,
"step": 18000
},
{
"epoch": 0.7654688496704232,
"eval_cosine_accuracy": 0.9689610004425049,
"eval_loss": 8.454690933227539,
"eval_runtime": 49.7999,
"eval_samples_per_second": 379.679,
"eval_steps_per_second": 1.486,
"step": 18000
},
{
"epoch": 0.7867318732723794,
"grad_norm": 14.726578712463379,
"learning_rate": 9.214430242641513e-07,
"loss": 8.1894,
"step": 18500
},
{
"epoch": 0.7867318732723794,
"eval_cosine_accuracy": 0.9687797427177429,
"eval_loss": 8.442588806152344,
"eval_runtime": 50.3938,
"eval_samples_per_second": 375.205,
"eval_steps_per_second": 1.468,
"step": 18500
},
{
"epoch": 0.8079948968743356,
"grad_norm": 16.874422073364258,
"learning_rate": 9.463617062984462e-07,
"loss": 8.1808,
"step": 19000
},
{
"epoch": 0.8079948968743356,
"eval_cosine_accuracy": 0.9687344431877136,
"eval_loss": 8.419596672058105,
"eval_runtime": 49.7216,
"eval_samples_per_second": 380.277,
"eval_steps_per_second": 1.488,
"step": 19000
},
{
"epoch": 0.8292579204762918,
"grad_norm": 18.23941993713379,
"learning_rate": 9.71280388332741e-07,
"loss": 8.1298,
"step": 19500
},
{
"epoch": 0.8292579204762918,
"eval_cosine_accuracy": 0.969504714012146,
"eval_loss": 8.426375389099121,
"eval_runtime": 50.209,
"eval_samples_per_second": 376.586,
"eval_steps_per_second": 1.474,
"step": 19500
},
{
"epoch": 0.850520944078248,
"grad_norm": 18.061824798583984,
"learning_rate": 9.961990703670356e-07,
"loss": 8.1187,
"step": 20000
},
{
"epoch": 0.850520944078248,
"eval_cosine_accuracy": 0.969504714012146,
"eval_loss": 8.403668403625488,
"eval_runtime": 50.0426,
"eval_samples_per_second": 377.838,
"eval_steps_per_second": 1.479,
"step": 20000
},
{
"epoch": 0.8717839676802042,
"grad_norm": 14.699914932250977,
"learning_rate": 1.0211177524013304e-06,
"loss": 8.0909,
"step": 20500
},
{
"epoch": 0.8717839676802042,
"eval_cosine_accuracy": 0.9694141149520874,
"eval_loss": 8.39807415008545,
"eval_runtime": 50.3586,
"eval_samples_per_second": 375.468,
"eval_steps_per_second": 1.469,
"step": 20500
},
{
"epoch": 0.8930469912821604,
"grad_norm": 31.298362731933594,
"learning_rate": 1.046036434435625e-06,
"loss": 8.0338,
"step": 21000
},
{
"epoch": 0.8930469912821604,
"eval_cosine_accuracy": 0.9687344431877136,
"eval_loss": 8.437390327453613,
"eval_runtime": 50.0486,
"eval_samples_per_second": 377.793,
"eval_steps_per_second": 1.479,
"step": 21000
},
{
"epoch": 0.9143100148841166,
"grad_norm": 20.37915802001953,
"learning_rate": 1.0709052791058514e-06,
"loss": 8.0198,
"step": 21500
},
{
"epoch": 0.9143100148841166,
"eval_cosine_accuracy": 0.9688250422477722,
"eval_loss": 8.418041229248047,
"eval_runtime": 50.3563,
"eval_samples_per_second": 375.484,
"eval_steps_per_second": 1.47,
"step": 21500
},
{
"epoch": 0.9355730384860728,
"grad_norm": 16.509695053100586,
"learning_rate": 1.095823961140146e-06,
"loss": 8.0113,
"step": 22000
},
{
"epoch": 0.9355730384860728,
"eval_cosine_accuracy": 0.968598484992981,
"eval_loss": 8.439444541931152,
"eval_runtime": 49.8709,
"eval_samples_per_second": 379.139,
"eval_steps_per_second": 1.484,
"step": 22000
},
{
"epoch": 0.956836062088029,
"grad_norm": 18.794130325317383,
"learning_rate": 1.1207426431744406e-06,
"loss": 8.0024,
"step": 22500
},
{
"epoch": 0.956836062088029,
"eval_cosine_accuracy": 0.9693688154220581,
"eval_loss": 8.415027618408203,
"eval_runtime": 50.599,
"eval_samples_per_second": 373.683,
"eval_steps_per_second": 1.462,
"step": 22500
},
{
"epoch": 0.9780990856899852,
"grad_norm": 26.394250869750977,
"learning_rate": 1.1456613252087355e-06,
"loss": 7.9449,
"step": 23000
},
{
"epoch": 0.9780990856899852,
"eval_cosine_accuracy": 0.9687797427177429,
"eval_loss": 8.452112197875977,
"eval_runtime": 49.7524,
"eval_samples_per_second": 380.042,
"eval_steps_per_second": 1.487,
"step": 23000
},
{
"epoch": 0.9993621092919414,
"grad_norm": 17.094562530517578,
"learning_rate": 1.1705301698789618e-06,
"loss": 7.9241,
"step": 23500
},
{
"epoch": 0.9993621092919414,
"eval_cosine_accuracy": 0.968598484992981,
"eval_loss": 8.457128524780273,
"eval_runtime": 49.272,
"eval_samples_per_second": 383.747,
"eval_steps_per_second": 1.502,
"step": 23500
},
{
"epoch": 1.0206251328938976,
"grad_norm": 19.156238555908203,
"learning_rate": 1.1954488519132564e-06,
"loss": 7.8644,
"step": 24000
},
{
"epoch": 1.0206251328938976,
"eval_cosine_accuracy": 0.9685531854629517,
"eval_loss": 8.447549819946289,
"eval_runtime": 51.0348,
"eval_samples_per_second": 370.493,
"eval_steps_per_second": 1.45,
"step": 24000
},
{
"epoch": 1.0418881564958538,
"grad_norm": 17.439102172851562,
"learning_rate": 1.220367533947551e-06,
"loss": 7.9011,
"step": 24500
},
{
"epoch": 1.0418881564958538,
"eval_cosine_accuracy": 0.9687797427177429,
"eval_loss": 8.469432830810547,
"eval_runtime": 50.4242,
"eval_samples_per_second": 374.978,
"eval_steps_per_second": 1.468,
"step": 24500
},
{
"epoch": 1.06315118009781,
"grad_norm": 21.96286964416504,
"learning_rate": 1.2452862159818461e-06,
"loss": 7.8623,
"step": 25000
},
{
"epoch": 1.06315118009781,
"eval_cosine_accuracy": 0.9688250422477722,
"eval_loss": 8.477595329284668,
"eval_runtime": 49.8803,
"eval_samples_per_second": 379.068,
"eval_steps_per_second": 1.484,
"step": 25000
},
{
"epoch": 1.0844142036997662,
"grad_norm": 17.020614624023438,
"learning_rate": 1.2701550606520722e-06,
"loss": 7.8451,
"step": 25500
},
{
"epoch": 1.0844142036997662,
"eval_cosine_accuracy": 0.9689157009124756,
"eval_loss": 8.455941200256348,
"eval_runtime": 49.3477,
"eval_samples_per_second": 383.159,
"eval_steps_per_second": 1.5,
"step": 25500
},
{
"epoch": 1.1056772273017224,
"grad_norm": 16.207447052001953,
"learning_rate": 1.2950737426863668e-06,
"loss": 7.7974,
"step": 26000
},
{
"epoch": 1.1056772273017224,
"eval_cosine_accuracy": 0.9693688154220581,
"eval_loss": 8.481649398803711,
"eval_runtime": 50.7649,
"eval_samples_per_second": 372.462,
"eval_steps_per_second": 1.458,
"step": 26000
},
{
"epoch": 1.1269402509036786,
"grad_norm": 18.933929443359375,
"learning_rate": 1.3199924247206615e-06,
"loss": 7.8167,
"step": 26500
},
{
"epoch": 1.1269402509036786,
"eval_cosine_accuracy": 0.9689157009124756,
"eval_loss": 8.489095687866211,
"eval_runtime": 50.3783,
"eval_samples_per_second": 375.321,
"eval_steps_per_second": 1.469,
"step": 26500
},
{
"epoch": 1.1482032745056348,
"grad_norm": 21.68393898010254,
"learning_rate": 1.3449111067549565e-06,
"loss": 7.7871,
"step": 27000
},
{
"epoch": 1.1482032745056348,
"eval_cosine_accuracy": 0.9683719277381897,
"eval_loss": 8.516972541809082,
"eval_runtime": 49.7949,
"eval_samples_per_second": 379.718,
"eval_steps_per_second": 1.486,
"step": 27000
},
{
"epoch": 1.169466298107591,
"grad_norm": 29.102264404296875,
"learning_rate": 1.3697799514251824e-06,
"loss": 7.7629,
"step": 27500
},
{
"epoch": 1.169466298107591,
"eval_cosine_accuracy": 0.9687797427177429,
"eval_loss": 8.527128219604492,
"eval_runtime": 50.4479,
"eval_samples_per_second": 374.802,
"eval_steps_per_second": 1.467,
"step": 27500
},
{
"epoch": 1.1907293217095472,
"grad_norm": 29.683128356933594,
"learning_rate": 1.3946487960954087e-06,
"loss": 7.7362,
"step": 28000
},
{
"epoch": 1.1907293217095472,
"eval_cosine_accuracy": 0.9690062999725342,
"eval_loss": 8.560052871704102,
"eval_runtime": 49.7688,
"eval_samples_per_second": 379.917,
"eval_steps_per_second": 1.487,
"step": 28000
},
{
"epoch": 1.2119923453115033,
"grad_norm": 19.49919891357422,
"learning_rate": 1.4195674781297035e-06,
"loss": 7.7203,
"step": 28500
},
{
"epoch": 1.2119923453115033,
"eval_cosine_accuracy": 0.9681906700134277,
"eval_loss": 8.618322372436523,
"eval_runtime": 50.5207,
"eval_samples_per_second": 374.263,
"eval_steps_per_second": 1.465,
"step": 28500
},
{
"epoch": 1.2332553689134595,
"grad_norm": 30.64069366455078,
"learning_rate": 1.4444861601639982e-06,
"loss": 7.713,
"step": 29000
},
{
"epoch": 1.2332553689134595,
"eval_cosine_accuracy": 0.969504714012146,
"eval_loss": 8.546183586120605,
"eval_runtime": 49.6131,
"eval_samples_per_second": 381.109,
"eval_steps_per_second": 1.492,
"step": 29000
},
{
"epoch": 1.2545183925154157,
"grad_norm": 21.36481285095215,
"learning_rate": 1.4694048421982928e-06,
"loss": 7.6853,
"step": 29500
},
{
"epoch": 1.2545183925154157,
"eval_cosine_accuracy": 0.969323456287384,
"eval_loss": 8.578022956848145,
"eval_runtime": 49.7468,
"eval_samples_per_second": 380.085,
"eval_steps_per_second": 1.488,
"step": 29500
},
{
"epoch": 1.275781416117372,
"grad_norm": 37.677242279052734,
"learning_rate": 1.4943235242325879e-06,
"loss": 7.6376,
"step": 30000
},
{
"epoch": 1.275781416117372,
"eval_cosine_accuracy": 0.9687797427177429,
"eval_loss": 8.658872604370117,
"eval_runtime": 51.0348,
"eval_samples_per_second": 370.492,
"eval_steps_per_second": 1.45,
"step": 30000
},
{
"epoch": 1.2970444397193281,
"grad_norm": 20.2037296295166,
"learning_rate": 1.519192368902814e-06,
"loss": 7.6364,
"step": 30500
},
{
"epoch": 1.2970444397193281,
"eval_cosine_accuracy": 0.9686437845230103,
"eval_loss": 8.601694107055664,
"eval_runtime": 50.5448,
"eval_samples_per_second": 374.084,
"eval_steps_per_second": 1.464,
"step": 30500
},
{
"epoch": 1.3183074633212843,
"grad_norm": 18.782926559448242,
"learning_rate": 1.5441110509371086e-06,
"loss": 7.6196,
"step": 31000
},
{
"epoch": 1.3183074633212843,
"eval_cosine_accuracy": 0.968417227268219,
"eval_loss": 8.666234970092773,
"eval_runtime": 49.549,
"eval_samples_per_second": 381.602,
"eval_steps_per_second": 1.493,
"step": 31000
},
{
"epoch": 1.3395704869232405,
"grad_norm": 24.106821060180664,
"learning_rate": 1.5690297329714032e-06,
"loss": 7.6169,
"step": 31500
},
{
"epoch": 1.3395704869232405,
"eval_cosine_accuracy": 0.9685531854629517,
"eval_loss": 8.674742698669434,
"eval_runtime": 50.4812,
"eval_samples_per_second": 374.555,
"eval_steps_per_second": 1.466,
"step": 31500
},
{
"epoch": 1.3608335105251967,
"grad_norm": 21.349994659423828,
"learning_rate": 1.5939484150056983e-06,
"loss": 7.5608,
"step": 32000
},
{
"epoch": 1.3608335105251967,
"eval_cosine_accuracy": 0.9680547118186951,
"eval_loss": 8.65296459197998,
"eval_runtime": 49.9031,
"eval_samples_per_second": 378.894,
"eval_steps_per_second": 1.483,
"step": 32000
},
{
"epoch": 1.382096534127153,
"grad_norm": 23.57134246826172,
"learning_rate": 1.618867097039993e-06,
"loss": 7.5725,
"step": 32500
},
{
"epoch": 1.382096534127153,
"eval_cosine_accuracy": 0.9688250422477722,
"eval_loss": 8.634788513183594,
"eval_runtime": 50.319,
"eval_samples_per_second": 375.763,
"eval_steps_per_second": 1.471,
"step": 32500
},
{
"epoch": 1.4033595577291091,
"grad_norm": 41.65155792236328,
"learning_rate": 1.643735941710219e-06,
"loss": 7.5056,
"step": 33000
},
{
"epoch": 1.4033595577291091,
"eval_cosine_accuracy": 0.967510998249054,
"eval_loss": 8.732030868530273,
"eval_runtime": 49.7065,
"eval_samples_per_second": 380.393,
"eval_steps_per_second": 1.489,
"step": 33000
},
{
"epoch": 1.4246225813310653,
"grad_norm": 21.274824142456055,
"learning_rate": 1.6686546237445138e-06,
"loss": 7.5294,
"step": 33500
},
{
"epoch": 1.4246225813310653,
"eval_cosine_accuracy": 0.9690968990325928,
"eval_loss": 8.727688789367676,
"eval_runtime": 50.402,
"eval_samples_per_second": 375.144,
"eval_steps_per_second": 1.468,
"step": 33500
},
{
"epoch": 1.4458856049330215,
"grad_norm": 24.373477935791016,
"learning_rate": 1.6935733057788085e-06,
"loss": 7.5106,
"step": 34000
},
{
"epoch": 1.4458856049330215,
"eval_cosine_accuracy": 0.9679641127586365,
"eval_loss": 8.750975608825684,
"eval_runtime": 49.6654,
"eval_samples_per_second": 380.708,
"eval_steps_per_second": 1.49,
"step": 34000
},
{
"epoch": 1.4671486285349777,
"grad_norm": 29.188819885253906,
"learning_rate": 1.7184919878131033e-06,
"loss": 7.5032,
"step": 34500
},
{
"epoch": 1.4671486285349777,
"eval_cosine_accuracy": 0.967510998249054,
"eval_loss": 8.7284574508667,
"eval_runtime": 49.6728,
"eval_samples_per_second": 380.651,
"eval_steps_per_second": 1.49,
"step": 34500
},
{
"epoch": 1.488411652136934,
"grad_norm": 24.691267013549805,
"learning_rate": 1.7433608324833296e-06,
"loss": 7.4635,
"step": 35000
},
{
"epoch": 1.488411652136934,
"eval_cosine_accuracy": 0.968235969543457,
"eval_loss": 8.765801429748535,
"eval_runtime": 50.8811,
"eval_samples_per_second": 371.611,
"eval_steps_per_second": 1.454,
"step": 35000
},
{
"epoch": 1.5096746757388901,
"grad_norm": 50.87013626098633,
"learning_rate": 1.7682795145176243e-06,
"loss": 7.4503,
"step": 35500
},
{
"epoch": 1.5096746757388901,
"eval_cosine_accuracy": 0.9684625267982483,
"eval_loss": 8.72378921508789,
"eval_runtime": 49.667,
"eval_samples_per_second": 380.695,
"eval_steps_per_second": 1.49,
"step": 35500
},
{
"epoch": 1.5309376993408463,
"grad_norm": 40.06660842895508,
"learning_rate": 1.793198196551919e-06,
"loss": 7.4227,
"step": 36000
},
{
"epoch": 1.5309376993408463,
"eval_cosine_accuracy": 0.9674656987190247,
"eval_loss": 8.823667526245117,
"eval_runtime": 51.0542,
"eval_samples_per_second": 370.352,
"eval_steps_per_second": 1.449,
"step": 36000
},
{
"epoch": 1.5522007229428025,
"grad_norm": 22.465124130249023,
"learning_rate": 1.8181168785862137e-06,
"loss": 7.4243,
"step": 36500
},
{
"epoch": 1.5522007229428025,
"eval_cosine_accuracy": 0.9679641127586365,
"eval_loss": 8.741899490356445,
"eval_runtime": 49.6301,
"eval_samples_per_second": 380.978,
"eval_steps_per_second": 1.491,
"step": 36500
},
{
"epoch": 1.5734637465447587,
"grad_norm": 33.76041793823242,
"learning_rate": 1.8430355606205084e-06,
"loss": 7.3642,
"step": 37000
},
{
"epoch": 1.5734637465447587,
"eval_cosine_accuracy": 0.9678281545639038,
"eval_loss": 8.775369644165039,
"eval_runtime": 50.8941,
"eval_samples_per_second": 371.517,
"eval_steps_per_second": 1.454,
"step": 37000
},
{
"epoch": 1.594726770146715,
"grad_norm": 70.6234359741211,
"learning_rate": 1.8679044052907347e-06,
"loss": 7.3529,
"step": 37500
},
{
"epoch": 1.594726770146715,
"eval_cosine_accuracy": 0.9670125246047974,
"eval_loss": 8.834284782409668,
"eval_runtime": 49.7944,
"eval_samples_per_second": 379.721,
"eval_steps_per_second": 1.486,
"step": 37500
},
{
"epoch": 1.6159897937486711,
"grad_norm": 20.19392967224121,
"learning_rate": 1.8928230873250293e-06,
"loss": 7.3594,
"step": 38000
},
{
"epoch": 1.6159897937486711,
"eval_cosine_accuracy": 0.9676922559738159,
"eval_loss": 8.824420928955078,
"eval_runtime": 51.008,
"eval_samples_per_second": 370.687,
"eval_steps_per_second": 1.451,
"step": 38000
},
{
"epoch": 1.6372528173506273,
"grad_norm": 28.939029693603516,
"learning_rate": 1.917741769359324e-06,
"loss": 7.3257,
"step": 38500
},
{
"epoch": 1.6372528173506273,
"eval_cosine_accuracy": 0.9672391414642334,
"eval_loss": 8.849225044250488,
"eval_runtime": 49.3783,
"eval_samples_per_second": 382.921,
"eval_steps_per_second": 1.499,
"step": 38500
},
{
"epoch": 1.6585158409525835,
"grad_norm": 24.4239559173584,
"learning_rate": 1.9426604513936186e-06,
"loss": 7.3277,
"step": 39000
},
{
"epoch": 1.6585158409525835,
"eval_cosine_accuracy": 0.9667859673500061,
"eval_loss": 8.900370597839355,
"eval_runtime": 50.6295,
"eval_samples_per_second": 373.458,
"eval_steps_per_second": 1.462,
"step": 39000
},
{
"epoch": 1.6797788645545397,
"grad_norm": 27.921985626220703,
"learning_rate": 1.9675292960638453e-06,
"loss": 7.3037,
"step": 39500
},
{
"epoch": 1.6797788645545397,
"eval_cosine_accuracy": 0.9670578837394714,
"eval_loss": 8.904617309570312,
"eval_runtime": 49.7604,
"eval_samples_per_second": 379.981,
"eval_steps_per_second": 1.487,
"step": 39500
},
{
"epoch": 1.701041888156496,
"grad_norm": 26.04640007019043,
"learning_rate": 1.9924479780981397e-06,
"loss": 7.2672,
"step": 40000
},
{
"epoch": 1.701041888156496,
"eval_cosine_accuracy": 0.9654719233512878,
"eval_loss": 9.0310697555542,
"eval_runtime": 50.5286,
"eval_samples_per_second": 374.204,
"eval_steps_per_second": 1.465,
"step": 40000
},
{
"epoch": 1.7223049117584521,
"grad_norm": 27.62725830078125,
"learning_rate": 2.0173666601324346e-06,
"loss": 7.2515,
"step": 40500
},
{
"epoch": 1.7223049117584521,
"eval_cosine_accuracy": 0.9665141105651855,
"eval_loss": 8.956608772277832,
"eval_runtime": 49.684,
"eval_samples_per_second": 380.565,
"eval_steps_per_second": 1.489,
"step": 40500
},
{
"epoch": 1.7435679353604083,
"grad_norm": 24.697010040283203,
"learning_rate": 2.0422853421667294e-06,
"loss": 7.2205,
"step": 41000
},
{
"epoch": 1.7435679353604083,
"eval_cosine_accuracy": 0.96714848279953,
"eval_loss": 8.937081336975098,
"eval_runtime": 50.465,
"eval_samples_per_second": 374.676,
"eval_steps_per_second": 1.466,
"step": 41000
},
{
"epoch": 1.7648309589623645,
"grad_norm": 29.04935073852539,
"learning_rate": 2.0671541868369553e-06,
"loss": 7.2263,
"step": 41500
},
{
"epoch": 1.7648309589623645,
"eval_cosine_accuracy": 0.9666500687599182,
"eval_loss": 8.94690227508545,
"eval_runtime": 49.547,
"eval_samples_per_second": 381.618,
"eval_steps_per_second": 1.494,
"step": 41500
},
{
"epoch": 1.7860939825643207,
"grad_norm": 35.25505065917969,
"learning_rate": 2.0920230315071816e-06,
"loss": 7.273,
"step": 42000
},
{
"epoch": 1.7860939825643207,
"eval_cosine_accuracy": 0.9672844409942627,
"eval_loss": 8.9337797164917,
"eval_runtime": 50.1569,
"eval_samples_per_second": 376.977,
"eval_steps_per_second": 1.475,
"step": 42000
},
{
"epoch": 1.807357006166277,
"grad_norm": 26.70579719543457,
"learning_rate": 2.116941713541476e-06,
"loss": 7.1962,
"step": 42500
},
{
"epoch": 1.807357006166277,
"eval_cosine_accuracy": 0.9669219255447388,
"eval_loss": 8.93774700164795,
"eval_runtime": 49.8395,
"eval_samples_per_second": 379.378,
"eval_steps_per_second": 1.485,
"step": 42500
},
{
"epoch": 1.8286200297682331,
"grad_norm": 25.269804000854492,
"learning_rate": 2.1418603955757713e-06,
"loss": 7.1955,
"step": 43000
},
{
"epoch": 1.8286200297682331,
"eval_cosine_accuracy": 0.965154767036438,
"eval_loss": 9.037887573242188,
"eval_runtime": 50.6789,
"eval_samples_per_second": 373.094,
"eval_steps_per_second": 1.46,
"step": 43000
},
{
"epoch": 1.8498830533701893,
"grad_norm": 26.97606086730957,
"learning_rate": 2.166779077610066e-06,
"loss": 7.2015,
"step": 43500
},
{
"epoch": 1.8498830533701893,
"eval_cosine_accuracy": 0.9658344388008118,
"eval_loss": 9.061223030090332,
"eval_runtime": 49.7986,
"eval_samples_per_second": 379.689,
"eval_steps_per_second": 1.486,
"step": 43500
},
{
"epoch": 1.8711460769721455,
"grad_norm": 30.354036331176758,
"learning_rate": 2.191647922280292e-06,
"loss": 7.1363,
"step": 44000
},
{
"epoch": 1.8711460769721455,
"eval_cosine_accuracy": 0.963840663433075,
"eval_loss": 9.144049644470215,
"eval_runtime": 50.374,
"eval_samples_per_second": 375.352,
"eval_steps_per_second": 1.469,
"step": 44000
},
{
"epoch": 1.8924091005741017,
"grad_norm": 34.271522521972656,
"learning_rate": 2.216566604314587e-06,
"loss": 7.0981,
"step": 44500
},
{
"epoch": 1.8924091005741017,
"eval_cosine_accuracy": 0.9659703373908997,
"eval_loss": 9.090474128723145,
"eval_runtime": 50.0017,
"eval_samples_per_second": 378.147,
"eval_steps_per_second": 1.48,
"step": 44500
},
{
"epoch": 1.913672124176058,
"grad_norm": 26.35101890563965,
"learning_rate": 2.2414852863488813e-06,
"loss": 7.1226,
"step": 45000
},
{
"epoch": 1.913672124176058,
"eval_cosine_accuracy": 0.9629797339439392,
"eval_loss": 9.179637908935547,
"eval_runtime": 50.6563,
"eval_samples_per_second": 373.261,
"eval_steps_per_second": 1.461,
"step": 45000
},
{
"epoch": 1.9349351477780141,
"grad_norm": 30.242382049560547,
"learning_rate": 2.266403968383176e-06,
"loss": 7.09,
"step": 45500
},
{
"epoch": 1.9349351477780141,
"eval_cosine_accuracy": 0.9639313220977783,
"eval_loss": 9.226616859436035,
"eval_runtime": 49.6704,
"eval_samples_per_second": 380.669,
"eval_steps_per_second": 1.49,
"step": 45500
},
{
"epoch": 1.9561981713799703,
"grad_norm": 32.14158248901367,
"learning_rate": 2.291322650417471e-06,
"loss": 7.0502,
"step": 46000
},
{
"epoch": 1.9561981713799703,
"eval_cosine_accuracy": 0.9655625820159912,
"eval_loss": 9.134511947631836,
"eval_runtime": 50.3879,
"eval_samples_per_second": 375.249,
"eval_steps_per_second": 1.469,
"step": 46000
},
{
"epoch": 1.9774611949819265,
"grad_norm": 34.23884201049805,
"learning_rate": 2.3161914950876973e-06,
"loss": 7.0414,
"step": 46500
},
{
"epoch": 1.9774611949819265,
"eval_cosine_accuracy": 0.9636141061782837,
"eval_loss": 9.34079647064209,
"eval_runtime": 49.6501,
"eval_samples_per_second": 380.825,
"eval_steps_per_second": 1.49,
"step": 46500
},
{
"epoch": 1.9987242185838827,
"grad_norm": 22.901996612548828,
"learning_rate": 2.3410603397579235e-06,
"loss": 7.074,
"step": 47000
},
{
"epoch": 1.9987242185838827,
"eval_cosine_accuracy": 0.9648375511169434,
"eval_loss": 9.222514152526855,
"eval_runtime": 50.3934,
"eval_samples_per_second": 375.208,
"eval_steps_per_second": 1.468,
"step": 47000
}
],
"logging_steps": 500,
"max_steps": 94060,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 256,
"trial_name": null,
"trial_params": null
}