{ "best_global_step": 100, "best_metric": 0.3611111111111111, "best_model_checkpoint": "results/LinalgZero-SFT-Instruct/checkpoint-100", "epoch": 1.0, "eval_steps": 100, "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 2.2502853870391846, "eval_runtime": 46.7668, "eval_samples_per_second": 11.119, "eval_steps_per_second": 11.119, "step": 0 }, { "epoch": 0.010460251046025104, "grad_norm": 7.1875, "learning_rate": 5.333333333333334e-06, "loss": 1.7889, "step": 5 }, { "epoch": 0.02092050209205021, "grad_norm": 7.09375, "learning_rate": 1.2e-05, "loss": 1.8348, "step": 10 }, { "epoch": 0.03138075313807531, "grad_norm": 8.0, "learning_rate": 1.866666666666667e-05, "loss": 1.7877, "step": 15 }, { "epoch": 0.04184100418410042, "grad_norm": 6.90625, "learning_rate": 1.9996685304798592e-05, "loss": 1.703, "step": 20 }, { "epoch": 0.05230125523012552, "grad_norm": 5.875, "learning_rate": 1.998322354001344e-05, "loss": 1.6144, "step": 25 }, { "epoch": 0.06276150627615062, "grad_norm": 5.59375, "learning_rate": 1.995942301836391e-05, "loss": 1.5939, "step": 30 }, { "epoch": 0.07322175732217573, "grad_norm": 4.875, "learning_rate": 1.9925311131777348e-05, "loss": 1.4859, "step": 35 }, { "epoch": 0.08368200836820083, "grad_norm": 4.84375, "learning_rate": 1.9880927139490716e-05, "loss": 1.4745, "step": 40 }, { "epoch": 0.09414225941422594, "grad_norm": 4.9375, "learning_rate": 1.9826322122867276e-05, "loss": 1.4017, "step": 45 }, { "epoch": 0.10460251046025104, "grad_norm": 4.375, "learning_rate": 1.9761558926607257e-05, "loss": 1.3212, "step": 50 }, { "epoch": 0.11506276150627615, "grad_norm": 4.25, "learning_rate": 1.9686712086420124e-05, "loss": 1.2879, "step": 55 }, { "epoch": 0.12552301255230125, "grad_norm": 3.5625, "learning_rate": 1.960186774324174e-05, "loss": 1.2433, "step": 60 }, { "epoch": 0.13598326359832635, "grad_norm": 3.1875, "learning_rate": 1.9507123544095084e-05, "loss": 1.2018, "step": 65 }, { "epoch": 0.14644351464435146, "grad_norm": 3.15625, "learning_rate": 1.940258852970868e-05, "loss": 1.1513, "step": 70 }, { "epoch": 0.15690376569037656, "grad_norm": 3.25, "learning_rate": 1.9288383009022043e-05, "loss": 1.1296, "step": 75 }, { "epoch": 0.16736401673640167, "grad_norm": 2.515625, "learning_rate": 1.9164638420722603e-05, "loss": 1.0689, "step": 80 }, { "epoch": 0.17782426778242677, "grad_norm": 2.484375, "learning_rate": 1.9031497181973415e-05, "loss": 1.0317, "step": 85 }, { "epoch": 0.18828451882845187, "grad_norm": 2.34375, "learning_rate": 1.8889112524505825e-05, "loss": 1.0134, "step": 90 }, { "epoch": 0.19874476987447698, "grad_norm": 2.09375, "learning_rate": 1.8737648318265643e-05, "loss": 0.9905, "step": 95 }, { "epoch": 0.20920502092050208, "grad_norm": 2.125, "learning_rate": 1.85772788828159e-05, "loss": 0.9535, "step": 100 }, { "epoch": 0.20920502092050208, "eval_loss": 0.987014889717102, "eval_runtime": 46.5788, "eval_samples_per_second": 11.164, "eval_steps_per_second": 11.164, "step": 100 }, { "epoch": 0.2196652719665272, "grad_norm": 1.921875, "learning_rate": 1.840818878671309e-05, "loss": 0.9145, "step": 105 }, { "epoch": 0.2301255230125523, "grad_norm": 1.9609375, "learning_rate": 1.8230572635088e-05, "loss": 0.8838, "step": 110 }, { "epoch": 0.2405857740585774, "grad_norm": 1.828125, "learning_rate": 1.8044634845675377e-05, "loss": 0.8743, "step": 115 }, { "epoch": 0.2510460251046025, "grad_norm": 1.609375, "learning_rate": 1.7850589413550384e-05, "loss": 0.8409, "step": 120 }, { "epoch": 0.2615062761506276, "grad_norm": 1.6953125, "learning_rate": 1.7648659664842497e-05, "loss": 0.8184, "step": 125 }, { "epoch": 0.2719665271966527, "grad_norm": 2.0625, "learning_rate": 1.7439077999710325e-05, "loss": 0.8087, "step": 130 }, { "epoch": 0.2824267782426778, "grad_norm": 1.9609375, "learning_rate": 1.7222085624873195e-05, "loss": 0.7774, "step": 135 }, { "epoch": 0.2928870292887029, "grad_norm": 2.34375, "learning_rate": 1.699793227600727e-05, "loss": 0.7697, "step": 140 }, { "epoch": 0.303347280334728, "grad_norm": 2.0, "learning_rate": 1.6766875930325734e-05, "loss": 0.7348, "step": 145 }, { "epoch": 0.3138075313807531, "grad_norm": 1.78125, "learning_rate": 1.6529182509673824e-05, "loss": 0.7337, "step": 150 }, { "epoch": 0.32426778242677823, "grad_norm": 1.3984375, "learning_rate": 1.6285125574480428e-05, "loss": 0.7175, "step": 155 }, { "epoch": 0.33472803347280333, "grad_norm": 1.8515625, "learning_rate": 1.6034986008918444e-05, "loss": 0.7015, "step": 160 }, { "epoch": 0.34518828451882844, "grad_norm": 1.75, "learning_rate": 1.5779051697636304e-05, "loss": 0.6923, "step": 165 }, { "epoch": 0.35564853556485354, "grad_norm": 1.4453125, "learning_rate": 1.5517617194432617e-05, "loss": 0.6865, "step": 170 }, { "epoch": 0.36610878661087864, "grad_norm": 1.671875, "learning_rate": 1.5250983383255404e-05, "loss": 0.6749, "step": 175 }, { "epoch": 0.37656903765690375, "grad_norm": 1.2734375, "learning_rate": 1.4979457131915905e-05, "loss": 0.6706, "step": 180 }, { "epoch": 0.38702928870292885, "grad_norm": 1.2578125, "learning_rate": 1.4703350938915609e-05, "loss": 0.6537, "step": 185 }, { "epoch": 0.39748953974895396, "grad_norm": 1.2578125, "learning_rate": 1.4422982573792985e-05, "loss": 0.637, "step": 190 }, { "epoch": 0.40794979079497906, "grad_norm": 1.1484375, "learning_rate": 1.4138674711403724e-05, "loss": 0.6365, "step": 195 }, { "epoch": 0.41841004184100417, "grad_norm": 1.3515625, "learning_rate": 1.3850754560555532e-05, "loss": 0.6304, "step": 200 }, { "epoch": 0.41841004184100417, "eval_loss": 0.5646117329597473, "eval_runtime": 46.5599, "eval_samples_per_second": 11.168, "eval_steps_per_second": 11.168, "step": 200 }, { "epoch": 0.42887029288702927, "grad_norm": 1.5234375, "learning_rate": 1.3559553487424789e-05, "loss": 0.6149, "step": 205 }, { "epoch": 0.4393305439330544, "grad_norm": 1.203125, "learning_rate": 1.3265406634188494e-05, "loss": 0.6117, "step": 210 }, { "epoch": 0.4497907949790795, "grad_norm": 1.2734375, "learning_rate": 1.296865253331047e-05, "loss": 0.5991, "step": 215 }, { "epoch": 0.4602510460251046, "grad_norm": 1.2109375, "learning_rate": 1.266963271792566e-05, "loss": 0.5889, "step": 220 }, { "epoch": 0.4707112970711297, "grad_norm": 1.21875, "learning_rate": 1.2368691328770967e-05, "loss": 0.5869, "step": 225 }, { "epoch": 0.4811715481171548, "grad_norm": 1.1328125, "learning_rate": 1.206617471811505e-05, "loss": 0.5862, "step": 230 }, { "epoch": 0.4916317991631799, "grad_norm": 1.09375, "learning_rate": 1.176243105114283e-05, "loss": 0.57, "step": 235 }, { "epoch": 0.502092050209205, "grad_norm": 1.125, "learning_rate": 1.1457809905253547e-05, "loss": 0.5688, "step": 240 }, { "epoch": 0.5125523012552301, "grad_norm": 1.0546875, "learning_rate": 1.1152661867733498e-05, "loss": 0.5601, "step": 245 }, { "epoch": 0.5230125523012552, "grad_norm": 1.109375, "learning_rate": 1.0847338132266505e-05, "loss": 0.5646, "step": 250 }, { "epoch": 0.5334728033472803, "grad_norm": 1.125, "learning_rate": 1.0542190094746456e-05, "loss": 0.5557, "step": 255 }, { "epoch": 0.5439330543933054, "grad_norm": 1.0390625, "learning_rate": 1.0237568948857172e-05, "loss": 0.5557, "step": 260 }, { "epoch": 0.5543933054393305, "grad_norm": 1.0703125, "learning_rate": 9.933825281884955e-06, "loss": 0.5469, "step": 265 }, { "epoch": 0.5648535564853556, "grad_norm": 1.1640625, "learning_rate": 9.631308671229034e-06, "loss": 0.5445, "step": 270 }, { "epoch": 0.5753138075313807, "grad_norm": 0.95703125, "learning_rate": 9.330367282074346e-06, "loss": 0.5392, "step": 275 }, { "epoch": 0.5857740585774058, "grad_norm": 1.03125, "learning_rate": 9.03134746668953e-06, "loss": 0.5386, "step": 280 }, { "epoch": 0.5962343096234309, "grad_norm": 1.1796875, "learning_rate": 8.73459336581151e-06, "loss": 0.5391, "step": 285 }, { "epoch": 0.606694560669456, "grad_norm": 0.97265625, "learning_rate": 8.440446512575216e-06, "loss": 0.5343, "step": 290 }, { "epoch": 0.6171548117154811, "grad_norm": 1.1171875, "learning_rate": 8.14924543944447e-06, "loss": 0.5296, "step": 295 }, { "epoch": 0.6276150627615062, "grad_norm": 1.0, "learning_rate": 7.86132528859628e-06, "loss": 0.5242, "step": 300 }, { "epoch": 0.6276150627615062, "eval_loss": 0.44806575775146484, "eval_runtime": 46.6003, "eval_samples_per_second": 11.159, "eval_steps_per_second": 11.159, "step": 300 }, { "epoch": 0.6380753138075314, "grad_norm": 1.015625, "learning_rate": 7.577017426207018e-06, "loss": 0.5331, "step": 305 }, { "epoch": 0.6485355648535565, "grad_norm": 1.1640625, "learning_rate": 7.2966490610843955e-06, "loss": 0.5283, "step": 310 }, { "epoch": 0.6589958158995816, "grad_norm": 1.2265625, "learning_rate": 7.020542868084099e-06, "loss": 0.5208, "step": 315 }, { "epoch": 0.6694560669456067, "grad_norm": 0.94140625, "learning_rate": 6.749016616744599e-06, "loss": 0.5259, "step": 320 }, { "epoch": 0.6799163179916318, "grad_norm": 0.96484375, "learning_rate": 6.482382805567384e-06, "loss": 0.5203, "step": 325 }, { "epoch": 0.6903765690376569, "grad_norm": 1.1015625, "learning_rate": 6.220948302363703e-06, "loss": 0.5204, "step": 330 }, { "epoch": 0.700836820083682, "grad_norm": 1.0625, "learning_rate": 5.9650139910815575e-06, "loss": 0.5174, "step": 335 }, { "epoch": 0.7112970711297071, "grad_norm": 1.15625, "learning_rate": 5.714874425519574e-06, "loss": 0.5202, "step": 340 }, { "epoch": 0.7217573221757322, "grad_norm": 1.15625, "learning_rate": 5.4708174903261804e-06, "loss": 0.5166, "step": 345 }, { "epoch": 0.7322175732217573, "grad_norm": 1.2578125, "learning_rate": 5.233124069674268e-06, "loss": 0.5143, "step": 350 }, { "epoch": 0.7426778242677824, "grad_norm": 0.94140625, "learning_rate": 5.002067723992732e-06, "loss": 0.5196, "step": 355 }, { "epoch": 0.7531380753138075, "grad_norm": 1.015625, "learning_rate": 4.777914375126806e-06, "loss": 0.5186, "step": 360 }, { "epoch": 0.7635983263598326, "grad_norm": 1.0078125, "learning_rate": 4.560922000289677e-06, "loss": 0.5165, "step": 365 }, { "epoch": 0.7740585774058577, "grad_norm": 0.953125, "learning_rate": 4.3513403351575105e-06, "loss": 0.5092, "step": 370 }, { "epoch": 0.7845188284518828, "grad_norm": 0.921875, "learning_rate": 4.149410586449619e-06, "loss": 0.5098, "step": 375 }, { "epoch": 0.7949790794979079, "grad_norm": 1.0234375, "learning_rate": 3.955365154324628e-06, "loss": 0.5089, "step": 380 }, { "epoch": 0.805439330543933, "grad_norm": 0.96484375, "learning_rate": 3.7694273649120038e-06, "loss": 0.5154, "step": 385 }, { "epoch": 0.8158995815899581, "grad_norm": 1.03125, "learning_rate": 3.5918112132869133e-06, "loss": 0.5151, "step": 390 }, { "epoch": 0.8263598326359832, "grad_norm": 1.0546875, "learning_rate": 3.4227211171841044e-06, "loss": 0.5133, "step": 395 }, { "epoch": 0.8368200836820083, "grad_norm": 1.0234375, "learning_rate": 3.262351681734356e-06, "loss": 0.5089, "step": 400 }, { "epoch": 0.8368200836820083, "eval_loss": 0.4229147136211395, "eval_runtime": 46.5673, "eval_samples_per_second": 11.167, "eval_steps_per_second": 11.167, "step": 400 }, { "epoch": 0.8472803347280334, "grad_norm": 0.9765625, "learning_rate": 3.1108874754941786e-06, "loss": 0.5122, "step": 405 }, { "epoch": 0.8577405857740585, "grad_norm": 0.953125, "learning_rate": 2.968502818026586e-06, "loss": 0.5108, "step": 410 }, { "epoch": 0.8682008368200836, "grad_norm": 1.0625, "learning_rate": 2.8353615792774012e-06, "loss": 0.5065, "step": 415 }, { "epoch": 0.8786610878661087, "grad_norm": 1.0, "learning_rate": 2.711616990977959e-06, "loss": 0.5042, "step": 420 }, { "epoch": 0.8891213389121339, "grad_norm": 0.9921875, "learning_rate": 2.5974114702913215e-06, "loss": 0.5071, "step": 425 }, { "epoch": 0.899581589958159, "grad_norm": 0.953125, "learning_rate": 2.4928764559049163e-06, "loss": 0.5148, "step": 430 }, { "epoch": 0.9100418410041841, "grad_norm": 0.953125, "learning_rate": 2.398132256758261e-06, "loss": 0.5017, "step": 435 }, { "epoch": 0.9205020920502092, "grad_norm": 0.9453125, "learning_rate": 2.313287913579879e-06, "loss": 0.5119, "step": 440 }, { "epoch": 0.9309623430962343, "grad_norm": 0.9296875, "learning_rate": 2.2384410733927474e-06, "loss": 0.5054, "step": 445 }, { "epoch": 0.9414225941422594, "grad_norm": 1.0390625, "learning_rate": 2.173677877132726e-06, "loss": 0.511, "step": 450 }, { "epoch": 0.9518828451882845, "grad_norm": 1.15625, "learning_rate": 2.1190728605092853e-06, "loss": 0.506, "step": 455 }, { "epoch": 0.9623430962343096, "grad_norm": 1.0234375, "learning_rate": 2.074688868222654e-06, "loss": 0.5119, "step": 460 }, { "epoch": 0.9728033472803347, "grad_norm": 0.91015625, "learning_rate": 2.0405769816360936e-06, "loss": 0.5082, "step": 465 }, { "epoch": 0.9832635983263598, "grad_norm": 0.953125, "learning_rate": 2.016776459986563e-06, "loss": 0.5074, "step": 470 }, { "epoch": 0.9937238493723849, "grad_norm": 0.9765625, "learning_rate": 2.0033146952014117e-06, "loss": 0.5037, "step": 475 }, { "epoch": 1.0, "step": 478, "total_flos": 3.400475586667315e+17, "train_loss": 0.7476620379850955, "train_runtime": 6830.9614, "train_samples_per_second": 1.12, "train_steps_per_second": 0.07 } ], "logging_steps": 5, "max_steps": 478, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.400475586667315e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }