{
"best_global_step": 100,
"best_metric": 0.3611111111111111,
"best_model_checkpoint": "results/LinalgZero-SFT-Instruct/checkpoint-100",
"epoch": 1.0,
"eval_steps": 100,
"global_step": 478,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0,
"eval_loss": 2.2502853870391846,
"eval_runtime": 46.7668,
"eval_samples_per_second": 11.119,
"eval_steps_per_second": 11.119,
"step": 0
},
{
"epoch": 0.010460251046025104,
"grad_norm": 7.1875,
"learning_rate": 5.333333333333334e-06,
"loss": 1.7889,
"step": 5
},
{
"epoch": 0.02092050209205021,
"grad_norm": 7.09375,
"learning_rate": 1.2e-05,
"loss": 1.8348,
"step": 10
},
{
"epoch": 0.03138075313807531,
"grad_norm": 8.0,
"learning_rate": 1.866666666666667e-05,
"loss": 1.7877,
"step": 15
},
{
"epoch": 0.04184100418410042,
"grad_norm": 6.90625,
"learning_rate": 1.9996685304798592e-05,
"loss": 1.703,
"step": 20
},
{
"epoch": 0.05230125523012552,
"grad_norm": 5.875,
"learning_rate": 1.998322354001344e-05,
"loss": 1.6144,
"step": 25
},
{
"epoch": 0.06276150627615062,
"grad_norm": 5.59375,
"learning_rate": 1.995942301836391e-05,
"loss": 1.5939,
"step": 30
},
{
"epoch": 0.07322175732217573,
"grad_norm": 4.875,
"learning_rate": 1.9925311131777348e-05,
"loss": 1.4859,
"step": 35
},
{
"epoch": 0.08368200836820083,
"grad_norm": 4.84375,
"learning_rate": 1.9880927139490716e-05,
"loss": 1.4745,
"step": 40
},
{
"epoch": 0.09414225941422594,
"grad_norm": 4.9375,
"learning_rate": 1.9826322122867276e-05,
"loss": 1.4017,
"step": 45
},
{
"epoch": 0.10460251046025104,
"grad_norm": 4.375,
"learning_rate": 1.9761558926607257e-05,
"loss": 1.3212,
"step": 50
},
{
"epoch": 0.11506276150627615,
"grad_norm": 4.25,
"learning_rate": 1.9686712086420124e-05,
"loss": 1.2879,
"step": 55
},
{
"epoch": 0.12552301255230125,
"grad_norm": 3.5625,
"learning_rate": 1.960186774324174e-05,
"loss": 1.2433,
"step": 60
},
{
"epoch": 0.13598326359832635,
"grad_norm": 3.1875,
"learning_rate": 1.9507123544095084e-05,
"loss": 1.2018,
"step": 65
},
{
"epoch": 0.14644351464435146,
"grad_norm": 3.15625,
"learning_rate": 1.940258852970868e-05,
"loss": 1.1513,
"step": 70
},
{
"epoch": 0.15690376569037656,
"grad_norm": 3.25,
"learning_rate": 1.9288383009022043e-05,
"loss": 1.1296,
"step": 75
},
{
"epoch": 0.16736401673640167,
"grad_norm": 2.515625,
"learning_rate": 1.9164638420722603e-05,
"loss": 1.0689,
"step": 80
},
{
"epoch": 0.17782426778242677,
"grad_norm": 2.484375,
"learning_rate": 1.9031497181973415e-05,
"loss": 1.0317,
"step": 85
},
{
"epoch": 0.18828451882845187,
"grad_norm": 2.34375,
"learning_rate": 1.8889112524505825e-05,
"loss": 1.0134,
"step": 90
},
{
"epoch": 0.19874476987447698,
"grad_norm": 2.09375,
"learning_rate": 1.8737648318265643e-05,
"loss": 0.9905,
"step": 95
},
{
"epoch": 0.20920502092050208,
"grad_norm": 2.125,
"learning_rate": 1.85772788828159e-05,
"loss": 0.9535,
"step": 100
},
{
"epoch": 0.20920502092050208,
"eval_loss": 0.987014889717102,
"eval_runtime": 46.5788,
"eval_samples_per_second": 11.164,
"eval_steps_per_second": 11.164,
"step": 100
},
{
"epoch": 0.2196652719665272,
"grad_norm": 1.921875,
"learning_rate": 1.840818878671309e-05,
"loss": 0.9145,
"step": 105
},
{
"epoch": 0.2301255230125523,
"grad_norm": 1.9609375,
"learning_rate": 1.8230572635088e-05,
"loss": 0.8838,
"step": 110
},
{
"epoch": 0.2405857740585774,
"grad_norm": 1.828125,
"learning_rate": 1.8044634845675377e-05,
"loss": 0.8743,
"step": 115
},
{
"epoch": 0.2510460251046025,
"grad_norm": 1.609375,
"learning_rate": 1.7850589413550384e-05,
"loss": 0.8409,
"step": 120
},
{
"epoch": 0.2615062761506276,
"grad_norm": 1.6953125,
"learning_rate": 1.7648659664842497e-05,
"loss": 0.8184,
"step": 125
},
{
"epoch": 0.2719665271966527,
"grad_norm": 2.0625,
"learning_rate": 1.7439077999710325e-05,
"loss": 0.8087,
"step": 130
},
{
"epoch": 0.2824267782426778,
"grad_norm": 1.9609375,
"learning_rate": 1.7222085624873195e-05,
"loss": 0.7774,
"step": 135
},
{
"epoch": 0.2928870292887029,
"grad_norm": 2.34375,
"learning_rate": 1.699793227600727e-05,
"loss": 0.7697,
"step": 140
},
{
"epoch": 0.303347280334728,
"grad_norm": 2.0,
"learning_rate": 1.6766875930325734e-05,
"loss": 0.7348,
"step": 145
},
{
"epoch": 0.3138075313807531,
"grad_norm": 1.78125,
"learning_rate": 1.6529182509673824e-05,
"loss": 0.7337,
"step": 150
},
{
"epoch": 0.32426778242677823,
"grad_norm": 1.3984375,
"learning_rate": 1.6285125574480428e-05,
"loss": 0.7175,
"step": 155
},
{
"epoch": 0.33472803347280333,
"grad_norm": 1.8515625,
"learning_rate": 1.6034986008918444e-05,
"loss": 0.7015,
"step": 160
},
{
"epoch": 0.34518828451882844,
"grad_norm": 1.75,
"learning_rate": 1.5779051697636304e-05,
"loss": 0.6923,
"step": 165
},
{
"epoch": 0.35564853556485354,
"grad_norm": 1.4453125,
"learning_rate": 1.5517617194432617e-05,
"loss": 0.6865,
"step": 170
},
{
"epoch": 0.36610878661087864,
"grad_norm": 1.671875,
"learning_rate": 1.5250983383255404e-05,
"loss": 0.6749,
"step": 175
},
{
"epoch": 0.37656903765690375,
"grad_norm": 1.2734375,
"learning_rate": 1.4979457131915905e-05,
"loss": 0.6706,
"step": 180
},
{
"epoch": 0.38702928870292885,
"grad_norm": 1.2578125,
"learning_rate": 1.4703350938915609e-05,
"loss": 0.6537,
"step": 185
},
{
"epoch": 0.39748953974895396,
"grad_norm": 1.2578125,
"learning_rate": 1.4422982573792985e-05,
"loss": 0.637,
"step": 190
},
{
"epoch": 0.40794979079497906,
"grad_norm": 1.1484375,
"learning_rate": 1.4138674711403724e-05,
"loss": 0.6365,
"step": 195
},
{
"epoch": 0.41841004184100417,
"grad_norm": 1.3515625,
"learning_rate": 1.3850754560555532e-05,
"loss": 0.6304,
"step": 200
},
{
"epoch": 0.41841004184100417,
"eval_loss": 0.5646117329597473,
"eval_runtime": 46.5599,
"eval_samples_per_second": 11.168,
"eval_steps_per_second": 11.168,
"step": 200
},
{
"epoch": 0.42887029288702927,
"grad_norm": 1.5234375,
"learning_rate": 1.3559553487424789e-05,
"loss": 0.6149,
"step": 205
},
{
"epoch": 0.4393305439330544,
"grad_norm": 1.203125,
"learning_rate": 1.3265406634188494e-05,
"loss": 0.6117,
"step": 210
},
{
"epoch": 0.4497907949790795,
"grad_norm": 1.2734375,
"learning_rate": 1.296865253331047e-05,
"loss": 0.5991,
"step": 215
},
{
"epoch": 0.4602510460251046,
"grad_norm": 1.2109375,
"learning_rate": 1.266963271792566e-05,
"loss": 0.5889,
"step": 220
},
{
"epoch": 0.4707112970711297,
"grad_norm": 1.21875,
"learning_rate": 1.2368691328770967e-05,
"loss": 0.5869,
"step": 225
},
{
"epoch": 0.4811715481171548,
"grad_norm": 1.1328125,
"learning_rate": 1.206617471811505e-05,
"loss": 0.5862,
"step": 230
},
{
"epoch": 0.4916317991631799,
"grad_norm": 1.09375,
"learning_rate": 1.176243105114283e-05,
"loss": 0.57,
"step": 235
},
{
"epoch": 0.502092050209205,
"grad_norm": 1.125,
"learning_rate": 1.1457809905253547e-05,
"loss": 0.5688,
"step": 240
},
{
"epoch": 0.5125523012552301,
"grad_norm": 1.0546875,
"learning_rate": 1.1152661867733498e-05,
"loss": 0.5601,
"step": 245
},
{
"epoch": 0.5230125523012552,
"grad_norm": 1.109375,
"learning_rate": 1.0847338132266505e-05,
"loss": 0.5646,
"step": 250
},
{
"epoch": 0.5334728033472803,
"grad_norm": 1.125,
"learning_rate": 1.0542190094746456e-05,
"loss": 0.5557,
"step": 255
},
{
"epoch": 0.5439330543933054,
"grad_norm": 1.0390625,
"learning_rate": 1.0237568948857172e-05,
"loss": 0.5557,
"step": 260
},
{
"epoch": 0.5543933054393305,
"grad_norm": 1.0703125,
"learning_rate": 9.933825281884955e-06,
"loss": 0.5469,
"step": 265
},
{
"epoch": 0.5648535564853556,
"grad_norm": 1.1640625,
"learning_rate": 9.631308671229034e-06,
"loss": 0.5445,
"step": 270
},
{
"epoch": 0.5753138075313807,
"grad_norm": 0.95703125,
"learning_rate": 9.330367282074346e-06,
"loss": 0.5392,
"step": 275
},
{
"epoch": 0.5857740585774058,
"grad_norm": 1.03125,
"learning_rate": 9.03134746668953e-06,
"loss": 0.5386,
"step": 280
},
{
"epoch": 0.5962343096234309,
"grad_norm": 1.1796875,
"learning_rate": 8.73459336581151e-06,
"loss": 0.5391,
"step": 285
},
{
"epoch": 0.606694560669456,
"grad_norm": 0.97265625,
"learning_rate": 8.440446512575216e-06,
"loss": 0.5343,
"step": 290
},
{
"epoch": 0.6171548117154811,
"grad_norm": 1.1171875,
"learning_rate": 8.14924543944447e-06,
"loss": 0.5296,
"step": 295
},
{
"epoch": 0.6276150627615062,
"grad_norm": 1.0,
"learning_rate": 7.86132528859628e-06,
"loss": 0.5242,
"step": 300
},
{
"epoch": 0.6276150627615062,
"eval_loss": 0.44806575775146484,
"eval_runtime": 46.6003,
"eval_samples_per_second": 11.159,
"eval_steps_per_second": 11.159,
"step": 300
},
{
"epoch": 0.6380753138075314,
"grad_norm": 1.015625,
"learning_rate": 7.577017426207018e-06,
"loss": 0.5331,
"step": 305
},
{
"epoch": 0.6485355648535565,
"grad_norm": 1.1640625,
"learning_rate": 7.2966490610843955e-06,
"loss": 0.5283,
"step": 310
},
{
"epoch": 0.6589958158995816,
"grad_norm": 1.2265625,
"learning_rate": 7.020542868084099e-06,
"loss": 0.5208,
"step": 315
},
{
"epoch": 0.6694560669456067,
"grad_norm": 0.94140625,
"learning_rate": 6.749016616744599e-06,
"loss": 0.5259,
"step": 320
},
{
"epoch": 0.6799163179916318,
"grad_norm": 0.96484375,
"learning_rate": 6.482382805567384e-06,
"loss": 0.5203,
"step": 325
},
{
"epoch": 0.6903765690376569,
"grad_norm": 1.1015625,
"learning_rate": 6.220948302363703e-06,
"loss": 0.5204,
"step": 330
},
{
"epoch": 0.700836820083682,
"grad_norm": 1.0625,
"learning_rate": 5.9650139910815575e-06,
"loss": 0.5174,
"step": 335
},
{
"epoch": 0.7112970711297071,
"grad_norm": 1.15625,
"learning_rate": 5.714874425519574e-06,
"loss": 0.5202,
"step": 340
},
{
"epoch": 0.7217573221757322,
"grad_norm": 1.15625,
"learning_rate": 5.4708174903261804e-06,
"loss": 0.5166,
"step": 345
},
{
"epoch": 0.7322175732217573,
"grad_norm": 1.2578125,
"learning_rate": 5.233124069674268e-06,
"loss": 0.5143,
"step": 350
},
{
"epoch": 0.7426778242677824,
"grad_norm": 0.94140625,
"learning_rate": 5.002067723992732e-06,
"loss": 0.5196,
"step": 355
},
{
"epoch": 0.7531380753138075,
"grad_norm": 1.015625,
"learning_rate": 4.777914375126806e-06,
"loss": 0.5186,
"step": 360
},
{
"epoch": 0.7635983263598326,
"grad_norm": 1.0078125,
"learning_rate": 4.560922000289677e-06,
"loss": 0.5165,
"step": 365
},
{
"epoch": 0.7740585774058577,
"grad_norm": 0.953125,
"learning_rate": 4.3513403351575105e-06,
"loss": 0.5092,
"step": 370
},
{
"epoch": 0.7845188284518828,
"grad_norm": 0.921875,
"learning_rate": 4.149410586449619e-06,
"loss": 0.5098,
"step": 375
},
{
"epoch": 0.7949790794979079,
"grad_norm": 1.0234375,
"learning_rate": 3.955365154324628e-06,
"loss": 0.5089,
"step": 380
},
{
"epoch": 0.805439330543933,
"grad_norm": 0.96484375,
"learning_rate": 3.7694273649120038e-06,
"loss": 0.5154,
"step": 385
},
{
"epoch": 0.8158995815899581,
"grad_norm": 1.03125,
"learning_rate": 3.5918112132869133e-06,
"loss": 0.5151,
"step": 390
},
{
"epoch": 0.8263598326359832,
"grad_norm": 1.0546875,
"learning_rate": 3.4227211171841044e-06,
"loss": 0.5133,
"step": 395
},
{
"epoch": 0.8368200836820083,
"grad_norm": 1.0234375,
"learning_rate": 3.262351681734356e-06,
"loss": 0.5089,
"step": 400
},
{
"epoch": 0.8368200836820083,
"eval_loss": 0.4229147136211395,
"eval_runtime": 46.5673,
"eval_samples_per_second": 11.167,
"eval_steps_per_second": 11.167,
"step": 400
},
{
"epoch": 0.8472803347280334,
"grad_norm": 0.9765625,
"learning_rate": 3.1108874754941786e-06,
"loss": 0.5122,
"step": 405
},
{
"epoch": 0.8577405857740585,
"grad_norm": 0.953125,
"learning_rate": 2.968502818026586e-06,
"loss": 0.5108,
"step": 410
},
{
"epoch": 0.8682008368200836,
"grad_norm": 1.0625,
"learning_rate": 2.8353615792774012e-06,
"loss": 0.5065,
"step": 415
},
{
"epoch": 0.8786610878661087,
"grad_norm": 1.0,
"learning_rate": 2.711616990977959e-06,
"loss": 0.5042,
"step": 420
},
{
"epoch": 0.8891213389121339,
"grad_norm": 0.9921875,
"learning_rate": 2.5974114702913215e-06,
"loss": 0.5071,
"step": 425
},
{
"epoch": 0.899581589958159,
"grad_norm": 0.953125,
"learning_rate": 2.4928764559049163e-06,
"loss": 0.5148,
"step": 430
},
{
"epoch": 0.9100418410041841,
"grad_norm": 0.953125,
"learning_rate": 2.398132256758261e-06,
"loss": 0.5017,
"step": 435
},
{
"epoch": 0.9205020920502092,
"grad_norm": 0.9453125,
"learning_rate": 2.313287913579879e-06,
"loss": 0.5119,
"step": 440
},
{
"epoch": 0.9309623430962343,
"grad_norm": 0.9296875,
"learning_rate": 2.2384410733927474e-06,
"loss": 0.5054,
"step": 445
},
{
"epoch": 0.9414225941422594,
"grad_norm": 1.0390625,
"learning_rate": 2.173677877132726e-06,
"loss": 0.511,
"step": 450
},
{
"epoch": 0.9518828451882845,
"grad_norm": 1.15625,
"learning_rate": 2.1190728605092853e-06,
"loss": 0.506,
"step": 455
},
{
"epoch": 0.9623430962343096,
"grad_norm": 1.0234375,
"learning_rate": 2.074688868222654e-06,
"loss": 0.5119,
"step": 460
},
{
"epoch": 0.9728033472803347,
"grad_norm": 0.91015625,
"learning_rate": 2.0405769816360936e-06,
"loss": 0.5082,
"step": 465
},
{
"epoch": 0.9832635983263598,
"grad_norm": 0.953125,
"learning_rate": 2.016776459986563e-06,
"loss": 0.5074,
"step": 470
},
{
"epoch": 0.9937238493723849,
"grad_norm": 0.9765625,
"learning_rate": 2.0033146952014117e-06,
"loss": 0.5037,
"step": 475
},
{
"epoch": 1.0,
"step": 478,
"total_flos": 3.400475586667315e+17,
"train_loss": 0.7476620379850955,
"train_runtime": 6830.9614,
"train_samples_per_second": 1.12,
"train_steps_per_second": 0.07
}
],
"logging_steps": 5,
"max_steps": 478,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.400475586667315e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}