| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 500, | |
| "global_step": 47030, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 4.2526047203912395e-05, | |
| "grad_norm": null, | |
| "learning_rate": 0.0, | |
| "loss": 16.5747, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0212630236019562, | |
| "grad_norm": 72.24292755126953, | |
| "learning_rate": 2.4719332578020393e-08, | |
| "loss": 16.0022, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.0212630236019562, | |
| "eval_cosine_accuracy": 0.9308079481124878, | |
| "eval_loss": 11.834263801574707, | |
| "eval_runtime": 49.7307, | |
| "eval_samples_per_second": 380.208, | |
| "eval_steps_per_second": 1.488, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.0425260472039124, | |
| "grad_norm": 90.77424621582031, | |
| "learning_rate": 4.963801461231515e-08, | |
| "loss": 15.9413, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.0425260472039124, | |
| "eval_cosine_accuracy": 0.9313969612121582, | |
| "eval_loss": 11.764849662780762, | |
| "eval_runtime": 50.6172, | |
| "eval_samples_per_second": 373.549, | |
| "eval_steps_per_second": 1.462, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.0637890708058686, | |
| "grad_norm": 61.71802520751953, | |
| "learning_rate": 7.45566966466099e-08, | |
| "loss": 15.8159, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.0637890708058686, | |
| "eval_cosine_accuracy": 0.9325751066207886, | |
| "eval_loss": 11.667285919189453, | |
| "eval_runtime": 50.0024, | |
| "eval_samples_per_second": 378.142, | |
| "eval_steps_per_second": 1.48, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.0850520944078248, | |
| "grad_norm": 70.19003295898438, | |
| "learning_rate": 9.947537868090464e-08, | |
| "loss": 15.6699, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.0850520944078248, | |
| "eval_cosine_accuracy": 0.934478223323822, | |
| "eval_loss": 11.543947219848633, | |
| "eval_runtime": 50.7737, | |
| "eval_samples_per_second": 372.397, | |
| "eval_steps_per_second": 1.457, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.106315118009781, | |
| "grad_norm": 88.77647399902344, | |
| "learning_rate": 1.2434422335113082e-07, | |
| "loss": 15.3678, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.106315118009781, | |
| "eval_cosine_accuracy": 0.9360641837120056, | |
| "eval_loss": 11.422866821289062, | |
| "eval_runtime": 49.9051, | |
| "eval_samples_per_second": 378.879, | |
| "eval_steps_per_second": 1.483, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.1275781416117372, | |
| "grad_norm": 60.29121398925781, | |
| "learning_rate": 1.4926290538542556e-07, | |
| "loss": 15.0132, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.1275781416117372, | |
| "eval_cosine_accuracy": 0.9384657144546509, | |
| "eval_loss": 11.268472671508789, | |
| "eval_runtime": 50.5525, | |
| "eval_samples_per_second": 374.027, | |
| "eval_steps_per_second": 1.464, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.1488411652136934, | |
| "grad_norm": 58.82876968383789, | |
| "learning_rate": 1.741815874197203e-07, | |
| "loss": 14.5215, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.1488411652136934, | |
| "eval_cosine_accuracy": 0.941592276096344, | |
| "eval_loss": 11.027884483337402, | |
| "eval_runtime": 49.4569, | |
| "eval_samples_per_second": 382.313, | |
| "eval_steps_per_second": 1.496, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.1701041888156496, | |
| "grad_norm": 86.84857177734375, | |
| "learning_rate": 1.9910026945401506e-07, | |
| "loss": 14.0143, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.1701041888156496, | |
| "eval_cosine_accuracy": 0.9445375800132751, | |
| "eval_loss": 10.77338981628418, | |
| "eval_runtime": 49.47, | |
| "eval_samples_per_second": 382.212, | |
| "eval_steps_per_second": 1.496, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.1913672124176058, | |
| "grad_norm": 50.29563903808594, | |
| "learning_rate": 2.2396911412424123e-07, | |
| "loss": 13.3569, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.1913672124176058, | |
| "eval_cosine_accuracy": 0.9471657276153564, | |
| "eval_loss": 10.502735137939453, | |
| "eval_runtime": 50.4762, | |
| "eval_samples_per_second": 374.592, | |
| "eval_steps_per_second": 1.466, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.212630236019562, | |
| "grad_norm": 48.2503776550293, | |
| "learning_rate": 2.48887796158536e-07, | |
| "loss": 12.5679, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.212630236019562, | |
| "eval_cosine_accuracy": 0.9499297738075256, | |
| "eval_loss": 10.223851203918457, | |
| "eval_runtime": 49.7554, | |
| "eval_samples_per_second": 380.019, | |
| "eval_steps_per_second": 1.487, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.2338932596215182, | |
| "grad_norm": 85.99883270263672, | |
| "learning_rate": 2.7380647819283073e-07, | |
| "loss": 11.7751, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.2338932596215182, | |
| "eval_cosine_accuracy": 0.9533734917640686, | |
| "eval_loss": 9.879401206970215, | |
| "eval_runtime": 50.9383, | |
| "eval_samples_per_second": 371.194, | |
| "eval_steps_per_second": 1.453, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.2551562832234744, | |
| "grad_norm": 30.850263595581055, | |
| "learning_rate": 2.9872516022712553e-07, | |
| "loss": 10.924, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.2551562832234744, | |
| "eval_cosine_accuracy": 0.9558656811714172, | |
| "eval_loss": 9.583606719970703, | |
| "eval_runtime": 50.1057, | |
| "eval_samples_per_second": 377.363, | |
| "eval_steps_per_second": 1.477, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.2764193068254306, | |
| "grad_norm": 27.160547256469727, | |
| "learning_rate": 3.2359400489735165e-07, | |
| "loss": 10.207, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.2764193068254306, | |
| "eval_cosine_accuracy": 0.9583125710487366, | |
| "eval_loss": 9.295426368713379, | |
| "eval_runtime": 50.8853, | |
| "eval_samples_per_second": 371.58, | |
| "eval_steps_per_second": 1.454, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.2976823304273868, | |
| "grad_norm": 14.218546867370605, | |
| "learning_rate": 3.4851268693164645e-07, | |
| "loss": 9.6073, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.2976823304273868, | |
| "eval_cosine_accuracy": 0.9617562890052795, | |
| "eval_loss": 9.070219993591309, | |
| "eval_runtime": 49.4097, | |
| "eval_samples_per_second": 382.678, | |
| "eval_steps_per_second": 1.498, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.318945354029343, | |
| "grad_norm": 20.438467025756836, | |
| "learning_rate": 3.7343136896594114e-07, | |
| "loss": 9.3031, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.318945354029343, | |
| "eval_cosine_accuracy": 0.9646109938621521, | |
| "eval_loss": 8.921951293945312, | |
| "eval_runtime": 50.9982, | |
| "eval_samples_per_second": 370.758, | |
| "eval_steps_per_second": 1.451, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.3402083776312992, | |
| "grad_norm": 18.741724014282227, | |
| "learning_rate": 3.983500510002359e-07, | |
| "loss": 9.1113, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.3402083776312992, | |
| "eval_cosine_accuracy": 0.9665141105651855, | |
| "eval_loss": 8.841903686523438, | |
| "eval_runtime": 49.583, | |
| "eval_samples_per_second": 381.34, | |
| "eval_steps_per_second": 1.492, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.3614714012332554, | |
| "grad_norm": 14.93420696258545, | |
| "learning_rate": 4.2321889567046206e-07, | |
| "loss": 8.9949, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.3614714012332554, | |
| "eval_cosine_accuracy": 0.9681000709533691, | |
| "eval_loss": 8.776097297668457, | |
| "eval_runtime": 50.3732, | |
| "eval_samples_per_second": 375.359, | |
| "eval_steps_per_second": 1.469, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.3827344248352116, | |
| "grad_norm": 12.44970417022705, | |
| "learning_rate": 4.481375777047568e-07, | |
| "loss": 8.9394, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.3827344248352116, | |
| "eval_cosine_accuracy": 0.9681000709533691, | |
| "eval_loss": 8.712888717651367, | |
| "eval_runtime": 49.3318, | |
| "eval_samples_per_second": 383.282, | |
| "eval_steps_per_second": 1.5, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.4039974484371678, | |
| "grad_norm": 11.820176124572754, | |
| "learning_rate": 4.7305625973905156e-07, | |
| "loss": 8.8496, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.4039974484371678, | |
| "eval_cosine_accuracy": 0.968235969543457, | |
| "eval_loss": 8.671957015991211, | |
| "eval_runtime": 50.445, | |
| "eval_samples_per_second": 374.824, | |
| "eval_steps_per_second": 1.467, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.425260472039124, | |
| "grad_norm": 19.25648307800293, | |
| "learning_rate": 4.979251044092777e-07, | |
| "loss": 8.8057, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.425260472039124, | |
| "eval_cosine_accuracy": 0.9679641127586365, | |
| "eval_loss": 8.636693000793457, | |
| "eval_runtime": 49.6391, | |
| "eval_samples_per_second": 380.909, | |
| "eval_steps_per_second": 1.491, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.4465234956410802, | |
| "grad_norm": 11.33238697052002, | |
| "learning_rate": 5.228437864435725e-07, | |
| "loss": 8.739, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.4465234956410802, | |
| "eval_cosine_accuracy": 0.9684625267982483, | |
| "eval_loss": 8.61970329284668, | |
| "eval_runtime": 50.2316, | |
| "eval_samples_per_second": 376.416, | |
| "eval_steps_per_second": 1.473, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.4677865192430364, | |
| "grad_norm": 12.054763793945312, | |
| "learning_rate": 5.477624684778672e-07, | |
| "loss": 8.6886, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.4677865192430364, | |
| "eval_cosine_accuracy": 0.968598484992981, | |
| "eval_loss": 8.59864616394043, | |
| "eval_runtime": 49.6249, | |
| "eval_samples_per_second": 381.018, | |
| "eval_steps_per_second": 1.491, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.4890495428449926, | |
| "grad_norm": 12.590415000915527, | |
| "learning_rate": 5.72681150512162e-07, | |
| "loss": 8.6431, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.4890495428449926, | |
| "eval_cosine_accuracy": 0.9678281545639038, | |
| "eval_loss": 8.581836700439453, | |
| "eval_runtime": 50.2665, | |
| "eval_samples_per_second": 376.155, | |
| "eval_steps_per_second": 1.472, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.5103125664469488, | |
| "grad_norm": 15.370965957641602, | |
| "learning_rate": 5.975998325464567e-07, | |
| "loss": 8.6208, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.5103125664469488, | |
| "eval_cosine_accuracy": 0.968598484992981, | |
| "eval_loss": 8.53200912475586, | |
| "eval_runtime": 49.502, | |
| "eval_samples_per_second": 381.964, | |
| "eval_steps_per_second": 1.495, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.531575590048905, | |
| "grad_norm": 10.931774139404297, | |
| "learning_rate": 6.225185145807515e-07, | |
| "loss": 8.5638, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.531575590048905, | |
| "eval_cosine_accuracy": 0.9685078859329224, | |
| "eval_loss": 8.522621154785156, | |
| "eval_runtime": 50.2178, | |
| "eval_samples_per_second": 376.52, | |
| "eval_steps_per_second": 1.474, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.5528386136508612, | |
| "grad_norm": 13.296648025512695, | |
| "learning_rate": 6.474371966150463e-07, | |
| "loss": 8.539, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.5528386136508612, | |
| "eval_cosine_accuracy": 0.9688250422477722, | |
| "eval_loss": 8.52730941772461, | |
| "eval_runtime": 49.7781, | |
| "eval_samples_per_second": 379.845, | |
| "eval_steps_per_second": 1.487, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.5741016372528174, | |
| "grad_norm": 13.039021492004395, | |
| "learning_rate": 6.72355878649341e-07, | |
| "loss": 8.4952, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.5741016372528174, | |
| "eval_cosine_accuracy": 0.9687797427177429, | |
| "eval_loss": 8.488916397094727, | |
| "eval_runtime": 50.244, | |
| "eval_samples_per_second": 376.324, | |
| "eval_steps_per_second": 1.473, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.5953646608547736, | |
| "grad_norm": 14.663399696350098, | |
| "learning_rate": 6.972745606836358e-07, | |
| "loss": 8.437, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.5953646608547736, | |
| "eval_cosine_accuracy": 0.9692781567573547, | |
| "eval_loss": 8.479202270507812, | |
| "eval_runtime": 49.8834, | |
| "eval_samples_per_second": 379.044, | |
| "eval_steps_per_second": 1.483, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.6166276844567298, | |
| "grad_norm": 14.649970054626465, | |
| "learning_rate": 7.221932427179305e-07, | |
| "loss": 8.4223, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.6166276844567298, | |
| "eval_cosine_accuracy": 0.9690062999725342, | |
| "eval_loss": 8.463949203491211, | |
| "eval_runtime": 50.2893, | |
| "eval_samples_per_second": 375.985, | |
| "eval_steps_per_second": 1.471, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.637890708058686, | |
| "grad_norm": 16.208450317382812, | |
| "learning_rate": 7.471119247522253e-07, | |
| "loss": 8.367, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.637890708058686, | |
| "eval_cosine_accuracy": 0.9686437845230103, | |
| "eval_loss": 8.437935829162598, | |
| "eval_runtime": 49.4396, | |
| "eval_samples_per_second": 382.446, | |
| "eval_steps_per_second": 1.497, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.6591537316606422, | |
| "grad_norm": 17.769954681396484, | |
| "learning_rate": 7.7203060678652e-07, | |
| "loss": 8.3536, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.6591537316606422, | |
| "eval_cosine_accuracy": 0.9689610004425049, | |
| "eval_loss": 8.473112106323242, | |
| "eval_runtime": 50.2825, | |
| "eval_samples_per_second": 376.035, | |
| "eval_steps_per_second": 1.472, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.6804167552625984, | |
| "grad_norm": 17.118555068969727, | |
| "learning_rate": 7.968994514567462e-07, | |
| "loss": 8.3289, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.6804167552625984, | |
| "eval_cosine_accuracy": 0.9688703417778015, | |
| "eval_loss": 8.445318222045898, | |
| "eval_runtime": 49.5183, | |
| "eval_samples_per_second": 381.838, | |
| "eval_steps_per_second": 1.494, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.7016797788645546, | |
| "grad_norm": 14.455836296081543, | |
| "learning_rate": 8.218181334910409e-07, | |
| "loss": 8.3108, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.7016797788645546, | |
| "eval_cosine_accuracy": 0.9694594144821167, | |
| "eval_loss": 8.421878814697266, | |
| "eval_runtime": 50.3275, | |
| "eval_samples_per_second": 375.699, | |
| "eval_steps_per_second": 1.47, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.7229428024665108, | |
| "grad_norm": 13.595857620239258, | |
| "learning_rate": 8.467368155253357e-07, | |
| "loss": 8.2663, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.7229428024665108, | |
| "eval_cosine_accuracy": 0.9686437845230103, | |
| "eval_loss": 8.462857246398926, | |
| "eval_runtime": 49.5482, | |
| "eval_samples_per_second": 381.608, | |
| "eval_steps_per_second": 1.493, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.744205826068467, | |
| "grad_norm": 14.019209861755371, | |
| "learning_rate": 8.716056601955618e-07, | |
| "loss": 8.2225, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.744205826068467, | |
| "eval_cosine_accuracy": 0.9691875576972961, | |
| "eval_loss": 8.401082038879395, | |
| "eval_runtime": 50.4644, | |
| "eval_samples_per_second": 374.68, | |
| "eval_steps_per_second": 1.466, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.7654688496704232, | |
| "grad_norm": 13.743831634521484, | |
| "learning_rate": 8.965243422298566e-07, | |
| "loss": 8.2052, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.7654688496704232, | |
| "eval_cosine_accuracy": 0.9689610004425049, | |
| "eval_loss": 8.454690933227539, | |
| "eval_runtime": 49.7999, | |
| "eval_samples_per_second": 379.679, | |
| "eval_steps_per_second": 1.486, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.7867318732723794, | |
| "grad_norm": 14.726578712463379, | |
| "learning_rate": 9.214430242641513e-07, | |
| "loss": 8.1894, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.7867318732723794, | |
| "eval_cosine_accuracy": 0.9687797427177429, | |
| "eval_loss": 8.442588806152344, | |
| "eval_runtime": 50.3938, | |
| "eval_samples_per_second": 375.205, | |
| "eval_steps_per_second": 1.468, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.8079948968743356, | |
| "grad_norm": 16.874422073364258, | |
| "learning_rate": 9.463617062984462e-07, | |
| "loss": 8.1808, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.8079948968743356, | |
| "eval_cosine_accuracy": 0.9687344431877136, | |
| "eval_loss": 8.419596672058105, | |
| "eval_runtime": 49.7216, | |
| "eval_samples_per_second": 380.277, | |
| "eval_steps_per_second": 1.488, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.8292579204762918, | |
| "grad_norm": 18.23941993713379, | |
| "learning_rate": 9.71280388332741e-07, | |
| "loss": 8.1298, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.8292579204762918, | |
| "eval_cosine_accuracy": 0.969504714012146, | |
| "eval_loss": 8.426375389099121, | |
| "eval_runtime": 50.209, | |
| "eval_samples_per_second": 376.586, | |
| "eval_steps_per_second": 1.474, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.850520944078248, | |
| "grad_norm": 18.061824798583984, | |
| "learning_rate": 9.961990703670356e-07, | |
| "loss": 8.1187, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.850520944078248, | |
| "eval_cosine_accuracy": 0.969504714012146, | |
| "eval_loss": 8.403668403625488, | |
| "eval_runtime": 50.0426, | |
| "eval_samples_per_second": 377.838, | |
| "eval_steps_per_second": 1.479, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.8717839676802042, | |
| "grad_norm": 14.699914932250977, | |
| "learning_rate": 1.0211177524013304e-06, | |
| "loss": 8.0909, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.8717839676802042, | |
| "eval_cosine_accuracy": 0.9694141149520874, | |
| "eval_loss": 8.39807415008545, | |
| "eval_runtime": 50.3586, | |
| "eval_samples_per_second": 375.468, | |
| "eval_steps_per_second": 1.469, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.8930469912821604, | |
| "grad_norm": 31.298362731933594, | |
| "learning_rate": 1.046036434435625e-06, | |
| "loss": 8.0338, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.8930469912821604, | |
| "eval_cosine_accuracy": 0.9687344431877136, | |
| "eval_loss": 8.437390327453613, | |
| "eval_runtime": 50.0486, | |
| "eval_samples_per_second": 377.793, | |
| "eval_steps_per_second": 1.479, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.9143100148841166, | |
| "grad_norm": 20.37915802001953, | |
| "learning_rate": 1.0709052791058514e-06, | |
| "loss": 8.0198, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.9143100148841166, | |
| "eval_cosine_accuracy": 0.9688250422477722, | |
| "eval_loss": 8.418041229248047, | |
| "eval_runtime": 50.3563, | |
| "eval_samples_per_second": 375.484, | |
| "eval_steps_per_second": 1.47, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.9355730384860728, | |
| "grad_norm": 16.509695053100586, | |
| "learning_rate": 1.095823961140146e-06, | |
| "loss": 8.0113, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.9355730384860728, | |
| "eval_cosine_accuracy": 0.968598484992981, | |
| "eval_loss": 8.439444541931152, | |
| "eval_runtime": 49.8709, | |
| "eval_samples_per_second": 379.139, | |
| "eval_steps_per_second": 1.484, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.956836062088029, | |
| "grad_norm": 18.794130325317383, | |
| "learning_rate": 1.1207426431744406e-06, | |
| "loss": 8.0024, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.956836062088029, | |
| "eval_cosine_accuracy": 0.9693688154220581, | |
| "eval_loss": 8.415027618408203, | |
| "eval_runtime": 50.599, | |
| "eval_samples_per_second": 373.683, | |
| "eval_steps_per_second": 1.462, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.9780990856899852, | |
| "grad_norm": 26.394250869750977, | |
| "learning_rate": 1.1456613252087355e-06, | |
| "loss": 7.9449, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.9780990856899852, | |
| "eval_cosine_accuracy": 0.9687797427177429, | |
| "eval_loss": 8.452112197875977, | |
| "eval_runtime": 49.7524, | |
| "eval_samples_per_second": 380.042, | |
| "eval_steps_per_second": 1.487, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.9993621092919414, | |
| "grad_norm": 17.094562530517578, | |
| "learning_rate": 1.1705301698789618e-06, | |
| "loss": 7.9241, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.9993621092919414, | |
| "eval_cosine_accuracy": 0.968598484992981, | |
| "eval_loss": 8.457128524780273, | |
| "eval_runtime": 49.272, | |
| "eval_samples_per_second": 383.747, | |
| "eval_steps_per_second": 1.502, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.0206251328938976, | |
| "grad_norm": 19.156238555908203, | |
| "learning_rate": 1.1954488519132564e-06, | |
| "loss": 7.8644, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.0206251328938976, | |
| "eval_cosine_accuracy": 0.9685531854629517, | |
| "eval_loss": 8.447549819946289, | |
| "eval_runtime": 51.0348, | |
| "eval_samples_per_second": 370.493, | |
| "eval_steps_per_second": 1.45, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.0418881564958538, | |
| "grad_norm": 17.439102172851562, | |
| "learning_rate": 1.220367533947551e-06, | |
| "loss": 7.9011, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.0418881564958538, | |
| "eval_cosine_accuracy": 0.9687797427177429, | |
| "eval_loss": 8.469432830810547, | |
| "eval_runtime": 50.4242, | |
| "eval_samples_per_second": 374.978, | |
| "eval_steps_per_second": 1.468, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.06315118009781, | |
| "grad_norm": 21.96286964416504, | |
| "learning_rate": 1.2452862159818461e-06, | |
| "loss": 7.8623, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.06315118009781, | |
| "eval_cosine_accuracy": 0.9688250422477722, | |
| "eval_loss": 8.477595329284668, | |
| "eval_runtime": 49.8803, | |
| "eval_samples_per_second": 379.068, | |
| "eval_steps_per_second": 1.484, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.0844142036997662, | |
| "grad_norm": 17.020614624023438, | |
| "learning_rate": 1.2701550606520722e-06, | |
| "loss": 7.8451, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.0844142036997662, | |
| "eval_cosine_accuracy": 0.9689157009124756, | |
| "eval_loss": 8.455941200256348, | |
| "eval_runtime": 49.3477, | |
| "eval_samples_per_second": 383.159, | |
| "eval_steps_per_second": 1.5, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.1056772273017224, | |
| "grad_norm": 16.207447052001953, | |
| "learning_rate": 1.2950737426863668e-06, | |
| "loss": 7.7974, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.1056772273017224, | |
| "eval_cosine_accuracy": 0.9693688154220581, | |
| "eval_loss": 8.481649398803711, | |
| "eval_runtime": 50.7649, | |
| "eval_samples_per_second": 372.462, | |
| "eval_steps_per_second": 1.458, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.1269402509036786, | |
| "grad_norm": 18.933929443359375, | |
| "learning_rate": 1.3199924247206615e-06, | |
| "loss": 7.8167, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 1.1269402509036786, | |
| "eval_cosine_accuracy": 0.9689157009124756, | |
| "eval_loss": 8.489095687866211, | |
| "eval_runtime": 50.3783, | |
| "eval_samples_per_second": 375.321, | |
| "eval_steps_per_second": 1.469, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 1.1482032745056348, | |
| "grad_norm": 21.68393898010254, | |
| "learning_rate": 1.3449111067549565e-06, | |
| "loss": 7.7871, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.1482032745056348, | |
| "eval_cosine_accuracy": 0.9683719277381897, | |
| "eval_loss": 8.516972541809082, | |
| "eval_runtime": 49.7949, | |
| "eval_samples_per_second": 379.718, | |
| "eval_steps_per_second": 1.486, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.169466298107591, | |
| "grad_norm": 29.102264404296875, | |
| "learning_rate": 1.3697799514251824e-06, | |
| "loss": 7.7629, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 1.169466298107591, | |
| "eval_cosine_accuracy": 0.9687797427177429, | |
| "eval_loss": 8.527128219604492, | |
| "eval_runtime": 50.4479, | |
| "eval_samples_per_second": 374.802, | |
| "eval_steps_per_second": 1.467, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 1.1907293217095472, | |
| "grad_norm": 29.683128356933594, | |
| "learning_rate": 1.3946487960954087e-06, | |
| "loss": 7.7362, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.1907293217095472, | |
| "eval_cosine_accuracy": 0.9690062999725342, | |
| "eval_loss": 8.560052871704102, | |
| "eval_runtime": 49.7688, | |
| "eval_samples_per_second": 379.917, | |
| "eval_steps_per_second": 1.487, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.2119923453115033, | |
| "grad_norm": 19.49919891357422, | |
| "learning_rate": 1.4195674781297035e-06, | |
| "loss": 7.7203, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 1.2119923453115033, | |
| "eval_cosine_accuracy": 0.9681906700134277, | |
| "eval_loss": 8.618322372436523, | |
| "eval_runtime": 50.5207, | |
| "eval_samples_per_second": 374.263, | |
| "eval_steps_per_second": 1.465, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 1.2332553689134595, | |
| "grad_norm": 30.64069366455078, | |
| "learning_rate": 1.4444861601639982e-06, | |
| "loss": 7.713, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.2332553689134595, | |
| "eval_cosine_accuracy": 0.969504714012146, | |
| "eval_loss": 8.546183586120605, | |
| "eval_runtime": 49.6131, | |
| "eval_samples_per_second": 381.109, | |
| "eval_steps_per_second": 1.492, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.2545183925154157, | |
| "grad_norm": 21.36481285095215, | |
| "learning_rate": 1.4694048421982928e-06, | |
| "loss": 7.6853, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 1.2545183925154157, | |
| "eval_cosine_accuracy": 0.969323456287384, | |
| "eval_loss": 8.578022956848145, | |
| "eval_runtime": 49.7468, | |
| "eval_samples_per_second": 380.085, | |
| "eval_steps_per_second": 1.488, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 1.275781416117372, | |
| "grad_norm": 37.677242279052734, | |
| "learning_rate": 1.4943235242325879e-06, | |
| "loss": 7.6376, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.275781416117372, | |
| "eval_cosine_accuracy": 0.9687797427177429, | |
| "eval_loss": 8.658872604370117, | |
| "eval_runtime": 51.0348, | |
| "eval_samples_per_second": 370.492, | |
| "eval_steps_per_second": 1.45, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.2970444397193281, | |
| "grad_norm": 20.2037296295166, | |
| "learning_rate": 1.519192368902814e-06, | |
| "loss": 7.6364, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 1.2970444397193281, | |
| "eval_cosine_accuracy": 0.9686437845230103, | |
| "eval_loss": 8.601694107055664, | |
| "eval_runtime": 50.5448, | |
| "eval_samples_per_second": 374.084, | |
| "eval_steps_per_second": 1.464, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 1.3183074633212843, | |
| "grad_norm": 18.782926559448242, | |
| "learning_rate": 1.5441110509371086e-06, | |
| "loss": 7.6196, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.3183074633212843, | |
| "eval_cosine_accuracy": 0.968417227268219, | |
| "eval_loss": 8.666234970092773, | |
| "eval_runtime": 49.549, | |
| "eval_samples_per_second": 381.602, | |
| "eval_steps_per_second": 1.493, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.3395704869232405, | |
| "grad_norm": 24.106821060180664, | |
| "learning_rate": 1.5690297329714032e-06, | |
| "loss": 7.6169, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 1.3395704869232405, | |
| "eval_cosine_accuracy": 0.9685531854629517, | |
| "eval_loss": 8.674742698669434, | |
| "eval_runtime": 50.4812, | |
| "eval_samples_per_second": 374.555, | |
| "eval_steps_per_second": 1.466, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 1.3608335105251967, | |
| "grad_norm": 21.349994659423828, | |
| "learning_rate": 1.5939484150056983e-06, | |
| "loss": 7.5608, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.3608335105251967, | |
| "eval_cosine_accuracy": 0.9680547118186951, | |
| "eval_loss": 8.65296459197998, | |
| "eval_runtime": 49.9031, | |
| "eval_samples_per_second": 378.894, | |
| "eval_steps_per_second": 1.483, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.382096534127153, | |
| "grad_norm": 23.57134246826172, | |
| "learning_rate": 1.618867097039993e-06, | |
| "loss": 7.5725, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 1.382096534127153, | |
| "eval_cosine_accuracy": 0.9688250422477722, | |
| "eval_loss": 8.634788513183594, | |
| "eval_runtime": 50.319, | |
| "eval_samples_per_second": 375.763, | |
| "eval_steps_per_second": 1.471, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 1.4033595577291091, | |
| "grad_norm": 41.65155792236328, | |
| "learning_rate": 1.643735941710219e-06, | |
| "loss": 7.5056, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 1.4033595577291091, | |
| "eval_cosine_accuracy": 0.967510998249054, | |
| "eval_loss": 8.732030868530273, | |
| "eval_runtime": 49.7065, | |
| "eval_samples_per_second": 380.393, | |
| "eval_steps_per_second": 1.489, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 1.4246225813310653, | |
| "grad_norm": 21.274824142456055, | |
| "learning_rate": 1.6686546237445138e-06, | |
| "loss": 7.5294, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 1.4246225813310653, | |
| "eval_cosine_accuracy": 0.9690968990325928, | |
| "eval_loss": 8.727688789367676, | |
| "eval_runtime": 50.402, | |
| "eval_samples_per_second": 375.144, | |
| "eval_steps_per_second": 1.468, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 1.4458856049330215, | |
| "grad_norm": 24.373477935791016, | |
| "learning_rate": 1.6935733057788085e-06, | |
| "loss": 7.5106, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.4458856049330215, | |
| "eval_cosine_accuracy": 0.9679641127586365, | |
| "eval_loss": 8.750975608825684, | |
| "eval_runtime": 49.6654, | |
| "eval_samples_per_second": 380.708, | |
| "eval_steps_per_second": 1.49, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.4671486285349777, | |
| "grad_norm": 29.188819885253906, | |
| "learning_rate": 1.7184919878131033e-06, | |
| "loss": 7.5032, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 1.4671486285349777, | |
| "eval_cosine_accuracy": 0.967510998249054, | |
| "eval_loss": 8.7284574508667, | |
| "eval_runtime": 49.6728, | |
| "eval_samples_per_second": 380.651, | |
| "eval_steps_per_second": 1.49, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 1.488411652136934, | |
| "grad_norm": 24.691267013549805, | |
| "learning_rate": 1.7433608324833296e-06, | |
| "loss": 7.4635, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 1.488411652136934, | |
| "eval_cosine_accuracy": 0.968235969543457, | |
| "eval_loss": 8.765801429748535, | |
| "eval_runtime": 50.8811, | |
| "eval_samples_per_second": 371.611, | |
| "eval_steps_per_second": 1.454, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 1.5096746757388901, | |
| "grad_norm": 50.87013626098633, | |
| "learning_rate": 1.7682795145176243e-06, | |
| "loss": 7.4503, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 1.5096746757388901, | |
| "eval_cosine_accuracy": 0.9684625267982483, | |
| "eval_loss": 8.72378921508789, | |
| "eval_runtime": 49.667, | |
| "eval_samples_per_second": 380.695, | |
| "eval_steps_per_second": 1.49, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 1.5309376993408463, | |
| "grad_norm": 40.06660842895508, | |
| "learning_rate": 1.793198196551919e-06, | |
| "loss": 7.4227, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.5309376993408463, | |
| "eval_cosine_accuracy": 0.9674656987190247, | |
| "eval_loss": 8.823667526245117, | |
| "eval_runtime": 51.0542, | |
| "eval_samples_per_second": 370.352, | |
| "eval_steps_per_second": 1.449, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.5522007229428025, | |
| "grad_norm": 22.465124130249023, | |
| "learning_rate": 1.8181168785862137e-06, | |
| "loss": 7.4243, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 1.5522007229428025, | |
| "eval_cosine_accuracy": 0.9679641127586365, | |
| "eval_loss": 8.741899490356445, | |
| "eval_runtime": 49.6301, | |
| "eval_samples_per_second": 380.978, | |
| "eval_steps_per_second": 1.491, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 1.5734637465447587, | |
| "grad_norm": 33.76041793823242, | |
| "learning_rate": 1.8430355606205084e-06, | |
| "loss": 7.3642, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 1.5734637465447587, | |
| "eval_cosine_accuracy": 0.9678281545639038, | |
| "eval_loss": 8.775369644165039, | |
| "eval_runtime": 50.8941, | |
| "eval_samples_per_second": 371.517, | |
| "eval_steps_per_second": 1.454, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 1.594726770146715, | |
| "grad_norm": 70.6234359741211, | |
| "learning_rate": 1.8679044052907347e-06, | |
| "loss": 7.3529, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 1.594726770146715, | |
| "eval_cosine_accuracy": 0.9670125246047974, | |
| "eval_loss": 8.834284782409668, | |
| "eval_runtime": 49.7944, | |
| "eval_samples_per_second": 379.721, | |
| "eval_steps_per_second": 1.486, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 1.6159897937486711, | |
| "grad_norm": 20.19392967224121, | |
| "learning_rate": 1.8928230873250293e-06, | |
| "loss": 7.3594, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.6159897937486711, | |
| "eval_cosine_accuracy": 0.9676922559738159, | |
| "eval_loss": 8.824420928955078, | |
| "eval_runtime": 51.008, | |
| "eval_samples_per_second": 370.687, | |
| "eval_steps_per_second": 1.451, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.6372528173506273, | |
| "grad_norm": 28.939029693603516, | |
| "learning_rate": 1.917741769359324e-06, | |
| "loss": 7.3257, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 1.6372528173506273, | |
| "eval_cosine_accuracy": 0.9672391414642334, | |
| "eval_loss": 8.849225044250488, | |
| "eval_runtime": 49.3783, | |
| "eval_samples_per_second": 382.921, | |
| "eval_steps_per_second": 1.499, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 1.6585158409525835, | |
| "grad_norm": 24.4239559173584, | |
| "learning_rate": 1.9426604513936186e-06, | |
| "loss": 7.3277, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 1.6585158409525835, | |
| "eval_cosine_accuracy": 0.9667859673500061, | |
| "eval_loss": 8.900370597839355, | |
| "eval_runtime": 50.6295, | |
| "eval_samples_per_second": 373.458, | |
| "eval_steps_per_second": 1.462, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 1.6797788645545397, | |
| "grad_norm": 27.921985626220703, | |
| "learning_rate": 1.9675292960638453e-06, | |
| "loss": 7.3037, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 1.6797788645545397, | |
| "eval_cosine_accuracy": 0.9670578837394714, | |
| "eval_loss": 8.904617309570312, | |
| "eval_runtime": 49.7604, | |
| "eval_samples_per_second": 379.981, | |
| "eval_steps_per_second": 1.487, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 1.701041888156496, | |
| "grad_norm": 26.04640007019043, | |
| "learning_rate": 1.9924479780981397e-06, | |
| "loss": 7.2672, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 1.701041888156496, | |
| "eval_cosine_accuracy": 0.9654719233512878, | |
| "eval_loss": 9.0310697555542, | |
| "eval_runtime": 50.5286, | |
| "eval_samples_per_second": 374.204, | |
| "eval_steps_per_second": 1.465, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 1.7223049117584521, | |
| "grad_norm": 27.62725830078125, | |
| "learning_rate": 2.0173666601324346e-06, | |
| "loss": 7.2515, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 1.7223049117584521, | |
| "eval_cosine_accuracy": 0.9665141105651855, | |
| "eval_loss": 8.956608772277832, | |
| "eval_runtime": 49.684, | |
| "eval_samples_per_second": 380.565, | |
| "eval_steps_per_second": 1.489, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 1.7435679353604083, | |
| "grad_norm": 24.697010040283203, | |
| "learning_rate": 2.0422853421667294e-06, | |
| "loss": 7.2205, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 1.7435679353604083, | |
| "eval_cosine_accuracy": 0.96714848279953, | |
| "eval_loss": 8.937081336975098, | |
| "eval_runtime": 50.465, | |
| "eval_samples_per_second": 374.676, | |
| "eval_steps_per_second": 1.466, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 1.7648309589623645, | |
| "grad_norm": 29.04935073852539, | |
| "learning_rate": 2.0671541868369553e-06, | |
| "loss": 7.2263, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 1.7648309589623645, | |
| "eval_cosine_accuracy": 0.9666500687599182, | |
| "eval_loss": 8.94690227508545, | |
| "eval_runtime": 49.547, | |
| "eval_samples_per_second": 381.618, | |
| "eval_steps_per_second": 1.494, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 1.7860939825643207, | |
| "grad_norm": 35.25505065917969, | |
| "learning_rate": 2.0920230315071816e-06, | |
| "loss": 7.273, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 1.7860939825643207, | |
| "eval_cosine_accuracy": 0.9672844409942627, | |
| "eval_loss": 8.9337797164917, | |
| "eval_runtime": 50.1569, | |
| "eval_samples_per_second": 376.977, | |
| "eval_steps_per_second": 1.475, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 1.807357006166277, | |
| "grad_norm": 26.70579719543457, | |
| "learning_rate": 2.116941713541476e-06, | |
| "loss": 7.1962, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 1.807357006166277, | |
| "eval_cosine_accuracy": 0.9669219255447388, | |
| "eval_loss": 8.93774700164795, | |
| "eval_runtime": 49.8395, | |
| "eval_samples_per_second": 379.378, | |
| "eval_steps_per_second": 1.485, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 1.8286200297682331, | |
| "grad_norm": 25.269804000854492, | |
| "learning_rate": 2.1418603955757713e-06, | |
| "loss": 7.1955, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 1.8286200297682331, | |
| "eval_cosine_accuracy": 0.965154767036438, | |
| "eval_loss": 9.037887573242188, | |
| "eval_runtime": 50.6789, | |
| "eval_samples_per_second": 373.094, | |
| "eval_steps_per_second": 1.46, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 1.8498830533701893, | |
| "grad_norm": 26.97606086730957, | |
| "learning_rate": 2.166779077610066e-06, | |
| "loss": 7.2015, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 1.8498830533701893, | |
| "eval_cosine_accuracy": 0.9658344388008118, | |
| "eval_loss": 9.061223030090332, | |
| "eval_runtime": 49.7986, | |
| "eval_samples_per_second": 379.689, | |
| "eval_steps_per_second": 1.486, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 1.8711460769721455, | |
| "grad_norm": 30.354036331176758, | |
| "learning_rate": 2.191647922280292e-06, | |
| "loss": 7.1363, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 1.8711460769721455, | |
| "eval_cosine_accuracy": 0.963840663433075, | |
| "eval_loss": 9.144049644470215, | |
| "eval_runtime": 50.374, | |
| "eval_samples_per_second": 375.352, | |
| "eval_steps_per_second": 1.469, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 1.8924091005741017, | |
| "grad_norm": 34.271522521972656, | |
| "learning_rate": 2.216566604314587e-06, | |
| "loss": 7.0981, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 1.8924091005741017, | |
| "eval_cosine_accuracy": 0.9659703373908997, | |
| "eval_loss": 9.090474128723145, | |
| "eval_runtime": 50.0017, | |
| "eval_samples_per_second": 378.147, | |
| "eval_steps_per_second": 1.48, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 1.913672124176058, | |
| "grad_norm": 26.35101890563965, | |
| "learning_rate": 2.2414852863488813e-06, | |
| "loss": 7.1226, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 1.913672124176058, | |
| "eval_cosine_accuracy": 0.9629797339439392, | |
| "eval_loss": 9.179637908935547, | |
| "eval_runtime": 50.6563, | |
| "eval_samples_per_second": 373.261, | |
| "eval_steps_per_second": 1.461, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 1.9349351477780141, | |
| "grad_norm": 30.242382049560547, | |
| "learning_rate": 2.266403968383176e-06, | |
| "loss": 7.09, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 1.9349351477780141, | |
| "eval_cosine_accuracy": 0.9639313220977783, | |
| "eval_loss": 9.226616859436035, | |
| "eval_runtime": 49.6704, | |
| "eval_samples_per_second": 380.669, | |
| "eval_steps_per_second": 1.49, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 1.9561981713799703, | |
| "grad_norm": 32.14158248901367, | |
| "learning_rate": 2.291322650417471e-06, | |
| "loss": 7.0502, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 1.9561981713799703, | |
| "eval_cosine_accuracy": 0.9655625820159912, | |
| "eval_loss": 9.134511947631836, | |
| "eval_runtime": 50.3879, | |
| "eval_samples_per_second": 375.249, | |
| "eval_steps_per_second": 1.469, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 1.9774611949819265, | |
| "grad_norm": 34.23884201049805, | |
| "learning_rate": 2.3161914950876973e-06, | |
| "loss": 7.0414, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 1.9774611949819265, | |
| "eval_cosine_accuracy": 0.9636141061782837, | |
| "eval_loss": 9.34079647064209, | |
| "eval_runtime": 49.6501, | |
| "eval_samples_per_second": 380.825, | |
| "eval_steps_per_second": 1.49, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 1.9987242185838827, | |
| "grad_norm": 22.901996612548828, | |
| "learning_rate": 2.3410603397579235e-06, | |
| "loss": 7.074, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 1.9987242185838827, | |
| "eval_cosine_accuracy": 0.9648375511169434, | |
| "eval_loss": 9.222514152526855, | |
| "eval_runtime": 50.3934, | |
| "eval_samples_per_second": 375.208, | |
| "eval_steps_per_second": 1.468, | |
| "step": 47000 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 94060, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 256, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |