{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1677,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011926058437686345,
"grad_norm": 0.859375,
"learning_rate": 2.9761904761904765e-07,
"loss": 1.3391,
"step": 20
},
{
"epoch": 0.02385211687537269,
"grad_norm": 0.9765625,
"learning_rate": 5.952380952380953e-07,
"loss": 1.3358,
"step": 40
},
{
"epoch": 0.03577817531305903,
"grad_norm": 0.484375,
"learning_rate": 8.928571428571429e-07,
"loss": 1.2273,
"step": 60
},
{
"epoch": 0.04770423375074538,
"grad_norm": 0.6015625,
"learning_rate": 1.1904761904761906e-06,
"loss": 1.2731,
"step": 80
},
{
"epoch": 0.05963029218843172,
"grad_norm": 0.4921875,
"learning_rate": 1.4880952380952381e-06,
"loss": 1.2757,
"step": 100
},
{
"epoch": 0.07155635062611806,
"grad_norm": 0.5859375,
"learning_rate": 1.7857142857142859e-06,
"loss": 1.2827,
"step": 120
},
{
"epoch": 0.08348240906380441,
"grad_norm": 0.52734375,
"learning_rate": 2.0833333333333334e-06,
"loss": 1.2206,
"step": 140
},
{
"epoch": 0.09540846750149076,
"grad_norm": 0.66015625,
"learning_rate": 2.380952380952381e-06,
"loss": 1.2604,
"step": 160
},
{
"epoch": 0.1073345259391771,
"grad_norm": 0.412109375,
"learning_rate": 2.6785714285714285e-06,
"loss": 1.2433,
"step": 180
},
{
"epoch": 0.11926058437686345,
"grad_norm": 0.6015625,
"learning_rate": 2.9761904761904763e-06,
"loss": 1.2834,
"step": 200
},
{
"epoch": 0.13118664281454978,
"grad_norm": 0.69140625,
"learning_rate": 3.273809523809524e-06,
"loss": 1.263,
"step": 220
},
{
"epoch": 0.14311270125223613,
"grad_norm": 0.3984375,
"learning_rate": 3.5714285714285718e-06,
"loss": 1.2525,
"step": 240
},
{
"epoch": 0.15503875968992248,
"grad_norm": 0.298828125,
"learning_rate": 3.869047619047619e-06,
"loss": 1.2064,
"step": 260
},
{
"epoch": 0.16696481812760883,
"grad_norm": 0.431640625,
"learning_rate": 4.166666666666667e-06,
"loss": 1.1364,
"step": 280
},
{
"epoch": 0.17889087656529518,
"grad_norm": 0.55078125,
"learning_rate": 4.464285714285715e-06,
"loss": 1.1594,
"step": 300
},
{
"epoch": 0.19081693500298152,
"grad_norm": 0.291015625,
"learning_rate": 4.761904761904762e-06,
"loss": 1.1471,
"step": 320
},
{
"epoch": 0.20274299344066785,
"grad_norm": 0.2578125,
"learning_rate": 4.99989023370455e-06,
"loss": 1.138,
"step": 340
},
{
"epoch": 0.2146690518783542,
"grad_norm": 0.232421875,
"learning_rate": 4.996049425354717e-06,
"loss": 1.1786,
"step": 360
},
{
"epoch": 0.22659511031604054,
"grad_norm": 0.8671875,
"learning_rate": 4.986729937340083e-06,
"loss": 1.2042,
"step": 380
},
{
"epoch": 0.2385211687537269,
"grad_norm": 0.29296875,
"learning_rate": 4.971952225381176e-06,
"loss": 1.1528,
"step": 400
},
{
"epoch": 0.2504472271914132,
"grad_norm": 0.26953125,
"learning_rate": 4.951748725674643e-06,
"loss": 1.1932,
"step": 420
},
{
"epoch": 0.26237328562909956,
"grad_norm": 0.328125,
"learning_rate": 4.9261637836977315e-06,
"loss": 1.1587,
"step": 440
},
{
"epoch": 0.2742993440667859,
"grad_norm": 0.2119140625,
"learning_rate": 4.895253556872611e-06,
"loss": 1.2024,
"step": 460
},
{
"epoch": 0.28622540250447226,
"grad_norm": 0.2236328125,
"learning_rate": 4.8590858913041775e-06,
"loss": 1.1471,
"step": 480
},
{
"epoch": 0.2981514609421586,
"grad_norm": 0.291015625,
"learning_rate": 4.817740172861903e-06,
"loss": 1.137,
"step": 500
},
{
"epoch": 0.31007751937984496,
"grad_norm": 0.234375,
"learning_rate": 4.771307152932579e-06,
"loss": 1.1693,
"step": 520
},
{
"epoch": 0.3220035778175313,
"grad_norm": 0.28125,
"learning_rate": 4.719888749226442e-06,
"loss": 1.1901,
"step": 540
},
{
"epoch": 0.33392963625521765,
"grad_norm": 0.28125,
"learning_rate": 4.663597822073865e-06,
"loss": 1.1139,
"step": 560
},
{
"epoch": 0.345855694692904,
"grad_norm": 0.26953125,
"learning_rate": 4.602557926703675e-06,
"loss": 1.1683,
"step": 580
},
{
"epoch": 0.35778175313059035,
"grad_norm": 0.375,
"learning_rate": 4.536903042046778e-06,
"loss": 1.1746,
"step": 600
},
{
"epoch": 0.3697078115682767,
"grad_norm": 0.216796875,
"learning_rate": 4.4667772766604065e-06,
"loss": 1.1092,
"step": 620
},
{
"epoch": 0.38163387000596305,
"grad_norm": 0.392578125,
"learning_rate": 4.392334552418421e-06,
"loss": 1.125,
"step": 640
},
{
"epoch": 0.3935599284436494,
"grad_norm": 0.25390625,
"learning_rate": 4.313738266661979e-06,
"loss": 1.1584,
"step": 660
},
{
"epoch": 0.4054859868813357,
"grad_norm": 0.2216796875,
"learning_rate": 4.231160933552109e-06,
"loss": 1.1235,
"step": 680
},
{
"epoch": 0.41741204531902204,
"grad_norm": 0.330078125,
"learning_rate": 4.144783805411415e-06,
"loss": 1.2566,
"step": 700
},
{
"epoch": 0.4293381037567084,
"grad_norm": 0.208984375,
"learning_rate": 4.054796474886038e-06,
"loss": 1.164,
"step": 720
},
{
"epoch": 0.44126416219439474,
"grad_norm": 0.248046875,
"learning_rate": 3.961396458801099e-06,
"loss": 1.1195,
"step": 740
},
{
"epoch": 0.4531902206320811,
"grad_norm": 0.255859375,
"learning_rate": 3.864788764623042e-06,
"loss": 1.1012,
"step": 760
},
{
"epoch": 0.46511627906976744,
"grad_norm": 0.4296875,
"learning_rate": 3.7651854404804757e-06,
"loss": 1.1042,
"step": 780
},
{
"epoch": 0.4770423375074538,
"grad_norm": 0.1953125,
"learning_rate": 3.662805109731168e-06,
"loss": 1.1627,
"step": 800
},
{
"epoch": 0.48896839594514013,
"grad_norm": 0.2119140625,
"learning_rate": 3.557872491096812e-06,
"loss": 1.1711,
"step": 820
},
{
"epoch": 0.5008944543828264,
"grad_norm": 0.322265625,
"learning_rate": 3.450617905418834e-06,
"loss": 1.1929,
"step": 840
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.193359375,
"learning_rate": 3.341276770117877e-06,
"loss": 1.0958,
"step": 860
},
{
"epoch": 0.5247465712581991,
"grad_norm": 0.2294921875,
"learning_rate": 3.2300890824665942e-06,
"loss": 1.1335,
"step": 880
},
{
"epoch": 0.5366726296958855,
"grad_norm": 0.3125,
"learning_rate": 3.117298892809953e-06,
"loss": 1.137,
"step": 900
},
{
"epoch": 0.5485986881335718,
"grad_norm": 0.171875,
"learning_rate": 3.003153768889276e-06,
"loss": 1.1752,
"step": 920
},
{
"epoch": 0.5605247465712582,
"grad_norm": 0.1826171875,
"learning_rate": 2.887904252445806e-06,
"loss": 1.1044,
"step": 940
},
{
"epoch": 0.5724508050089445,
"grad_norm": 0.49609375,
"learning_rate": 2.7718033092965267e-06,
"loss": 1.1124,
"step": 960
},
{
"epoch": 0.5843768634466309,
"grad_norm": 0.25390625,
"learning_rate": 2.655105774089278e-06,
"loss": 1.2478,
"step": 980
},
{
"epoch": 0.5963029218843172,
"grad_norm": 0.20703125,
"learning_rate": 2.538067790955892e-06,
"loss": 1.1365,
"step": 1000
},
{
"epoch": 0.6082289803220036,
"grad_norm": 0.2578125,
"learning_rate": 2.420946251291103e-06,
"loss": 1.0598,
"step": 1020
},
{
"epoch": 0.6201550387596899,
"grad_norm": 0.2109375,
"learning_rate": 2.303998229891249e-06,
"loss": 1.1299,
"step": 1040
},
{
"epoch": 0.6320810971973763,
"grad_norm": 0.29296875,
"learning_rate": 2.18748042069042e-06,
"loss": 1.148,
"step": 1060
},
{
"epoch": 0.6440071556350626,
"grad_norm": 0.2412109375,
"learning_rate": 2.0716485733325834e-06,
"loss": 1.1469,
"step": 1080
},
{
"epoch": 0.655933214072749,
"grad_norm": 0.27734375,
"learning_rate": 1.95675693181636e-06,
"loss": 1.1275,
"step": 1100
},
{
"epoch": 0.6678592725104353,
"grad_norm": 0.25,
"learning_rate": 1.8430576764446046e-06,
"loss": 1.1711,
"step": 1120
},
{
"epoch": 0.6797853309481217,
"grad_norm": 0.2412109375,
"learning_rate": 1.730800370303683e-06,
"loss": 1.1191,
"step": 1140
},
{
"epoch": 0.691711389385808,
"grad_norm": 0.328125,
"learning_rate": 1.6202314114873693e-06,
"loss": 1.2033,
"step": 1160
},
{
"epoch": 0.7036374478234944,
"grad_norm": 0.24609375,
"learning_rate": 1.51159349226773e-06,
"loss": 1.1747,
"step": 1180
},
{
"epoch": 0.7155635062611807,
"grad_norm": 0.24609375,
"learning_rate": 1.4051250664000515e-06,
"loss": 1.1467,
"step": 1200
},
{
"epoch": 0.727489564698867,
"grad_norm": 0.21484375,
"learning_rate": 1.3010598257310642e-06,
"loss": 1.1213,
"step": 1220
},
{
"epoch": 0.7394156231365534,
"grad_norm": 0.423828125,
"learning_rate": 1.1996261872592754e-06,
"loss": 1.1539,
"step": 1240
},
{
"epoch": 0.7513416815742398,
"grad_norm": 0.296875,
"learning_rate": 1.1010467917732783e-06,
"loss": 1.0518,
"step": 1260
},
{
"epoch": 0.7632677400119261,
"grad_norm": 0.263671875,
"learning_rate": 1.005538015168487e-06,
"loss": 1.1907,
"step": 1280
},
{
"epoch": 0.7751937984496124,
"grad_norm": 0.2109375,
"learning_rate": 9.133094935149592e-07,
"loss": 1.0732,
"step": 1300
},
{
"epoch": 0.7871198568872988,
"grad_norm": 0.177734375,
"learning_rate": 8.245636629187121e-07,
"loss": 1.1658,
"step": 1320
},
{
"epoch": 0.7990459153249851,
"grad_norm": 0.19921875,
"learning_rate": 7.394953151865444e-07,
"loss": 1.0766,
"step": 1340
},
{
"epoch": 0.8109719737626714,
"grad_norm": 0.208984375,
"learning_rate": 6.582911702696334e-07,
"loss": 1.1737,
"step": 1360
},
{
"epoch": 0.8228980322003577,
"grad_norm": 0.212890625,
"learning_rate": 5.811294664243752e-07,
"loss": 1.0915,
"step": 1380
},
{
"epoch": 0.8348240906380441,
"grad_norm": 0.1884765625,
"learning_rate": 5.081795689900398e-07,
"loss": 1.1312,
"step": 1400
},
{
"epoch": 0.8467501490757304,
"grad_norm": 0.181640625,
"learning_rate": 4.396015986419483e-07,
"loss": 1.1867,
"step": 1420
},
{
"epoch": 0.8586762075134168,
"grad_norm": 0.1904296875,
"learning_rate": 3.7554607993613823e-07,
"loss": 1.1985,
"step": 1440
},
{
"epoch": 0.8706022659511031,
"grad_norm": 0.25,
"learning_rate": 3.1615361091693694e-07,
"loss": 1.1426,
"step": 1460
},
{
"epoch": 0.8825283243887895,
"grad_norm": 0.2060546875,
"learning_rate": 2.615545545126416e-07,
"loss": 1.1924,
"step": 1480
},
{
"epoch": 0.8944543828264758,
"grad_norm": 0.21484375,
"learning_rate": 2.118687523966559e-07,
"loss": 1.1344,
"step": 1500
},
{
"epoch": 0.9063804412641622,
"grad_norm": 0.2138671875,
"learning_rate": 1.6720526194217186e-07,
"loss": 1.153,
"step": 1520
},
{
"epoch": 0.9183064997018485,
"grad_norm": 0.1708984375,
"learning_rate": 1.2766211684773156e-07,
"loss": 1.1558,
"step": 1540
},
{
"epoch": 0.9302325581395349,
"grad_norm": 0.21484375,
"learning_rate": 9.332611195910585e-08,
"loss": 1.1415,
"step": 1560
},
{
"epoch": 0.9421586165772212,
"grad_norm": 0.349609375,
"learning_rate": 6.427261275978369e-08,
"loss": 1.1919,
"step": 1580
},
{
"epoch": 0.9540846750149076,
"grad_norm": 0.2353515625,
"learning_rate": 4.056538994822945e-08,
"loss": 1.0785,
"step": 1600
},
{
"epoch": 0.9660107334525939,
"grad_norm": 0.228515625,
"learning_rate": 2.2256479464999315e-08,
"loss": 1.1849,
"step": 1620
},
{
"epoch": 0.9779367918902803,
"grad_norm": 0.32421875,
"learning_rate": 9.386068276959204e-09,
"loss": 1.1015,
"step": 1640
},
{
"epoch": 0.9898628503279666,
"grad_norm": 0.17578125,
"learning_rate": 1.982406169283857e-09,
"loss": 1.1445,
"step": 1660
},
{
"epoch": 1.0,
"step": 1677,
"total_flos": 1.5441332068889395e+17,
"train_loss": 1.1665670079849415,
"train_runtime": 5262.9957,
"train_samples_per_second": 1.274,
"train_steps_per_second": 0.319
}
],
"logging_steps": 20,
"max_steps": 1677,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.5441332068889395e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}