{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 309,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003236245954692557,
"grad_norm": 44.84460324756473,
"learning_rate": 1.6129032258064514e-08,
"logits/chosen": -0.20905712246894836,
"logits/rejected": -0.22190234065055847,
"logps/chosen": -51.62083435058594,
"logps/rejected": -51.69921112060547,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.032362459546925564,
"grad_norm": 40.23570147271882,
"learning_rate": 1.6129032258064515e-07,
"logits/chosen": -0.5052363872528076,
"logits/rejected": -0.4759008586406708,
"logps/chosen": -117.98110961914062,
"logps/rejected": -115.17385864257812,
"loss": 0.6932,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.005904653575271368,
"rewards/margins": -0.005229531321674585,
"rewards/rejected": -0.0006751217297278345,
"step": 10
},
{
"epoch": 0.06472491909385113,
"grad_norm": 40.64958006423696,
"learning_rate": 3.225806451612903e-07,
"logits/chosen": -0.34268108010292053,
"logits/rejected": -0.32415661215782166,
"logps/chosen": -89.46002960205078,
"logps/rejected": -90.85234069824219,
"loss": 0.6918,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.012587952427566051,
"rewards/margins": -0.0015765244606882334,
"rewards/rejected": 0.014164477586746216,
"step": 20
},
{
"epoch": 0.0970873786407767,
"grad_norm": 44.44417549342928,
"learning_rate": 4.838709677419355e-07,
"logits/chosen": -0.3697855770587921,
"logits/rejected": -0.37569430470466614,
"logps/chosen": -91.7381362915039,
"logps/rejected": -120.64210510253906,
"loss": 0.6917,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": 0.040541667491197586,
"rewards/margins": -0.00646995147690177,
"rewards/rejected": 0.04701162129640579,
"step": 30
},
{
"epoch": 0.12944983818770225,
"grad_norm": 45.39432157926112,
"learning_rate": 4.838129496402878e-07,
"logits/chosen": -0.5134055614471436,
"logits/rejected": -0.5195242166519165,
"logps/chosen": -112.23564147949219,
"logps/rejected": -112.45448303222656,
"loss": 0.6845,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.1366191953420639,
"rewards/margins": 0.0211743526160717,
"rewards/rejected": 0.1154448390007019,
"step": 40
},
{
"epoch": 0.16181229773462782,
"grad_norm": 46.657888703309055,
"learning_rate": 4.6582733812949637e-07,
"logits/chosen": -0.49087825417518616,
"logits/rejected": -0.48370131850242615,
"logps/chosen": -108.74371337890625,
"logps/rejected": -108.59181213378906,
"loss": 0.6816,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.28128570318222046,
"rewards/margins": 0.055680472403764725,
"rewards/rejected": 0.22560521960258484,
"step": 50
},
{
"epoch": 0.1941747572815534,
"grad_norm": 45.70846809487506,
"learning_rate": 4.4784172661870503e-07,
"logits/chosen": -0.5000173449516296,
"logits/rejected": -0.44659870862960815,
"logps/chosen": -109.87890625,
"logps/rejected": -103.29914855957031,
"loss": 0.6765,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.34991931915283203,
"rewards/margins": -0.002201000927016139,
"rewards/rejected": 0.3521203100681305,
"step": 60
},
{
"epoch": 0.22653721682847897,
"grad_norm": 46.326027074357874,
"learning_rate": 4.2985611510791364e-07,
"logits/chosen": -0.42392462491989136,
"logits/rejected": -0.4413270056247711,
"logps/chosen": -106.632568359375,
"logps/rejected": -116.46354675292969,
"loss": 0.6766,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.4661247134208679,
"rewards/margins": 0.0015508796786889434,
"rewards/rejected": 0.4645739197731018,
"step": 70
},
{
"epoch": 0.2588996763754045,
"grad_norm": 43.498065814007035,
"learning_rate": 4.118705035971223e-07,
"logits/chosen": -0.3981110453605652,
"logits/rejected": -0.38208064436912537,
"logps/chosen": -77.17626953125,
"logps/rejected": -82.09029388427734,
"loss": 0.673,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.3067065179347992,
"rewards/margins": 0.007025508675724268,
"rewards/rejected": 0.29968103766441345,
"step": 80
},
{
"epoch": 0.2912621359223301,
"grad_norm": 44.075241479789,
"learning_rate": 3.938848920863309e-07,
"logits/chosen": -0.34064334630966187,
"logits/rejected": -0.39144274592399597,
"logps/chosen": -92.52304077148438,
"logps/rejected": -98.45548248291016,
"loss": 0.6704,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.3761943280696869,
"rewards/margins": 0.042388152331113815,
"rewards/rejected": 0.33380621671676636,
"step": 90
},
{
"epoch": 0.32362459546925565,
"grad_norm": 48.0790851847465,
"learning_rate": 3.7589928057553957e-07,
"logits/chosen": -0.37802955508232117,
"logits/rejected": -0.4160170555114746,
"logps/chosen": -113.50125885009766,
"logps/rejected": -121.9982681274414,
"loss": 0.6773,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.6942413449287415,
"rewards/margins": 0.051541488617658615,
"rewards/rejected": 0.6426998972892761,
"step": 100
},
{
"epoch": 0.3559870550161812,
"grad_norm": 46.010936459755236,
"learning_rate": 3.579136690647482e-07,
"logits/chosen": -0.42405351996421814,
"logits/rejected": -0.40491142868995667,
"logps/chosen": -103.0141830444336,
"logps/rejected": -107.39582824707031,
"loss": 0.6689,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.49587029218673706,
"rewards/margins": 0.06293975561857224,
"rewards/rejected": 0.4329305589199066,
"step": 110
},
{
"epoch": 0.3883495145631068,
"grad_norm": 45.68665961151184,
"learning_rate": 3.3992805755395684e-07,
"logits/chosen": -0.4767892360687256,
"logits/rejected": -0.4402199387550354,
"logps/chosen": -88.38746643066406,
"logps/rejected": -92.61766052246094,
"loss": 0.6499,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.7922600507736206,
"rewards/margins": 0.2213023602962494,
"rewards/rejected": 0.5709576606750488,
"step": 120
},
{
"epoch": 0.42071197411003236,
"grad_norm": 50.16095862731605,
"learning_rate": 3.2194244604316545e-07,
"logits/chosen": -0.37711650133132935,
"logits/rejected": -0.3402002155780792,
"logps/chosen": -90.22920227050781,
"logps/rejected": -94.65870666503906,
"loss": 0.674,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.6651551127433777,
"rewards/margins": 0.20479026436805725,
"rewards/rejected": 0.4603648781776428,
"step": 130
},
{
"epoch": 0.45307443365695793,
"grad_norm": 48.808328658203315,
"learning_rate": 3.039568345323741e-07,
"logits/chosen": -0.43002423644065857,
"logits/rejected": -0.45816200971603394,
"logps/chosen": -113.88044738769531,
"logps/rejected": -114.7729263305664,
"loss": 0.6533,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.5868810415267944,
"rewards/margins": 0.1377699077129364,
"rewards/rejected": 0.44911113381385803,
"step": 140
},
{
"epoch": 0.4854368932038835,
"grad_norm": 49.604479223239935,
"learning_rate": 2.859712230215827e-07,
"logits/chosen": -0.45960181951522827,
"logits/rejected": -0.4378342628479004,
"logps/chosen": -84.22222900390625,
"logps/rejected": -86.26544189453125,
"loss": 0.6639,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.5674360990524292,
"rewards/margins": 0.11405378580093384,
"rewards/rejected": 0.45338231325149536,
"step": 150
},
{
"epoch": 0.517799352750809,
"grad_norm": 47.42218438472324,
"learning_rate": 2.679856115107914e-07,
"logits/chosen": -0.3416453003883362,
"logits/rejected": -0.3230029344558716,
"logps/chosen": -80.31494903564453,
"logps/rejected": -82.15327453613281,
"loss": 0.6573,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.586269736289978,
"rewards/margins": 0.0887608677148819,
"rewards/rejected": 0.49750882387161255,
"step": 160
},
{
"epoch": 0.5501618122977346,
"grad_norm": 47.03382774502366,
"learning_rate": 2.5e-07,
"logits/chosen": -0.41951996088027954,
"logits/rejected": -0.3912803530693054,
"logps/chosen": -87.18314361572266,
"logps/rejected": -93.79847717285156,
"loss": 0.6592,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.5363109707832336,
"rewards/margins": 0.067426897585392,
"rewards/rejected": 0.4688839912414551,
"step": 170
},
{
"epoch": 0.5825242718446602,
"grad_norm": 47.69366741108912,
"learning_rate": 2.3201438848920862e-07,
"logits/chosen": -0.3929893374443054,
"logits/rejected": -0.4231534004211426,
"logps/chosen": -126.5281753540039,
"logps/rejected": -131.83119201660156,
"loss": 0.6558,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.5387030839920044,
"rewards/margins": 0.09579172730445862,
"rewards/rejected": 0.4429113268852234,
"step": 180
},
{
"epoch": 0.6148867313915858,
"grad_norm": 44.68169856182738,
"learning_rate": 2.1402877697841726e-07,
"logits/chosen": -0.44879454374313354,
"logits/rejected": -0.4319379925727844,
"logps/chosen": -92.65638732910156,
"logps/rejected": -87.41607666015625,
"loss": 0.6387,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.4598053991794586,
"rewards/margins": 0.2478960007429123,
"rewards/rejected": 0.21190936863422394,
"step": 190
},
{
"epoch": 0.6472491909385113,
"grad_norm": 52.43786206399067,
"learning_rate": 1.960431654676259e-07,
"logits/chosen": -0.4231666028499603,
"logits/rejected": -0.4151372015476227,
"logps/chosen": -89.03497314453125,
"logps/rejected": -94.46099853515625,
"loss": 0.638,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.564118504524231,
"rewards/margins": 0.19040581583976746,
"rewards/rejected": 0.3737126588821411,
"step": 200
},
{
"epoch": 0.6796116504854369,
"grad_norm": 47.830444290918436,
"learning_rate": 1.7805755395683453e-07,
"logits/chosen": -0.37914031744003296,
"logits/rejected": -0.3839500844478607,
"logps/chosen": -104.35710144042969,
"logps/rejected": -101.65086364746094,
"loss": 0.6557,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.5819724202156067,
"rewards/margins": 0.11125577986240387,
"rewards/rejected": 0.470716655254364,
"step": 210
},
{
"epoch": 0.7119741100323624,
"grad_norm": 56.9972848370308,
"learning_rate": 1.6007194244604316e-07,
"logits/chosen": -0.4811418950557709,
"logits/rejected": -0.4631536900997162,
"logps/chosen": -79.82476806640625,
"logps/rejected": -77.16559600830078,
"loss": 0.6487,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.5332338213920593,
"rewards/margins": 0.061274897307157516,
"rewards/rejected": 0.47195887565612793,
"step": 220
},
{
"epoch": 0.7443365695792881,
"grad_norm": 40.95527477894272,
"learning_rate": 1.420863309352518e-07,
"logits/chosen": -0.4873018264770508,
"logits/rejected": -0.48534002900123596,
"logps/chosen": -97.24694061279297,
"logps/rejected": -99.6893310546875,
"loss": 0.6618,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": 0.36339443922042847,
"rewards/margins": 0.03903265669941902,
"rewards/rejected": 0.32436177134513855,
"step": 230
},
{
"epoch": 0.7766990291262136,
"grad_norm": 51.5213560789745,
"learning_rate": 1.2410071942446043e-07,
"logits/chosen": -0.49588823318481445,
"logits/rejected": -0.4999016225337982,
"logps/chosen": -109.93338775634766,
"logps/rejected": -112.18772888183594,
"loss": 0.6535,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.6505969762802124,
"rewards/margins": 0.1298864632844925,
"rewards/rejected": 0.5207104682922363,
"step": 240
},
{
"epoch": 0.8090614886731392,
"grad_norm": 46.03625656489629,
"learning_rate": 1.0611510791366907e-07,
"logits/chosen": -0.3793638348579407,
"logits/rejected": -0.38035714626312256,
"logps/chosen": -98.56913757324219,
"logps/rejected": -103.80790710449219,
"loss": 0.6503,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.5021631717681885,
"rewards/margins": 0.1653563678264618,
"rewards/rejected": 0.33680686354637146,
"step": 250
},
{
"epoch": 0.8414239482200647,
"grad_norm": 46.447929343856416,
"learning_rate": 8.812949640287769e-08,
"logits/chosen": -0.45624303817749023,
"logits/rejected": -0.4315834641456604,
"logps/chosen": -83.76708984375,
"logps/rejected": -94.65506744384766,
"loss": 0.6707,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.530864953994751,
"rewards/margins": 0.03900914266705513,
"rewards/rejected": 0.4918558597564697,
"step": 260
},
{
"epoch": 0.8737864077669902,
"grad_norm": 48.805366010941604,
"learning_rate": 7.014388489208632e-08,
"logits/chosen": -0.42842593789100647,
"logits/rejected": -0.40372419357299805,
"logps/chosen": -96.93135833740234,
"logps/rejected": -99.37718200683594,
"loss": 0.6434,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.4281619191169739,
"rewards/margins": 0.1487235724925995,
"rewards/rejected": 0.2794383466243744,
"step": 270
},
{
"epoch": 0.9061488673139159,
"grad_norm": 47.872792319475785,
"learning_rate": 5.2158273381294966e-08,
"logits/chosen": -0.4138847291469574,
"logits/rejected": -0.45293694734573364,
"logps/chosen": -102.27735137939453,
"logps/rejected": -107.4918212890625,
"loss": 0.6588,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.5359721183776855,
"rewards/margins": 0.1897757351398468,
"rewards/rejected": 0.34619635343551636,
"step": 280
},
{
"epoch": 0.9385113268608414,
"grad_norm": 47.988007458372486,
"learning_rate": 3.41726618705036e-08,
"logits/chosen": -0.4650436341762543,
"logits/rejected": -0.4407349228858948,
"logps/chosen": -141.83694458007812,
"logps/rejected": -126.32550048828125,
"loss": 0.6488,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.32771816849708557,
"rewards/margins": 0.28930023312568665,
"rewards/rejected": 0.038417913019657135,
"step": 290
},
{
"epoch": 0.970873786407767,
"grad_norm": 50.705167138943,
"learning_rate": 1.618705035971223e-08,
"logits/chosen": -0.4262828230857849,
"logits/rejected": -0.4605466425418854,
"logps/chosen": -109.3924560546875,
"logps/rejected": -106.82768249511719,
"loss": 0.6685,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.4629640579223633,
"rewards/margins": 0.06305359303951263,
"rewards/rejected": 0.39991044998168945,
"step": 300
},
{
"epoch": 1.0,
"step": 309,
"total_flos": 0.0,
"train_loss": 0.6613213452706445,
"train_runtime": 2759.9915,
"train_samples_per_second": 7.162,
"train_steps_per_second": 0.112
}
],
"logging_steps": 10,
"max_steps": 309,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}