AuxKTO / V1 /checkpoint-620 /trainer_state.json
SoonOk's picture
Upload folder using huggingface_hub
b6c054d verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 20.0,
"eval_steps": 200,
"global_step": 620,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.32454361054766734,
"grad_norm": 193.0,
"kl": 0.18435561656951904,
"learning_rate": 6e-08,
"logits/chosen": -60439219.2,
"logits/rejected": -88406048.0,
"logps/chosen": -197.2506591796875,
"logps/rejected": -107.289501953125,
"loss": 3.5312,
"rewards/chosen": -0.0054339878261089325,
"rewards/margins": -0.004661996196955442,
"rewards/rejected": -0.00077199162915349,
"step": 10
},
{
"epoch": 0.6490872210953347,
"grad_norm": 184.0,
"kl": 0.2278171330690384,
"learning_rate": 1.2666666666666666e-07,
"logits/chosen": -60431888.30573248,
"logits/rejected": -90372359.85276073,
"logps/chosen": -227.72788117038218,
"logps/rejected": -115.19334739263803,
"loss": 3.529,
"rewards/chosen": -0.005934715650643512,
"rewards/margins": -0.0023042475658752288,
"rewards/rejected": -0.0036304680847682835,
"step": 20
},
{
"epoch": 0.973630831643002,
"grad_norm": 212.0,
"kl": 0.25923511385917664,
"learning_rate": 1.9333333333333332e-07,
"logits/chosen": -64896310.01834863,
"logits/rejected": -90925383.15654951,
"logps/chosen": -202.24849483944953,
"logps/rejected": -113.89132388178913,
"loss": 3.493,
"rewards/chosen": -0.006282405386641849,
"rewards/margins": 0.008622396832151831,
"rewards/rejected": -0.01490480221879368,
"step": 30
},
{
"epoch": 1.2920892494929006,
"grad_norm": 176.0,
"kl": 0.1959868222475052,
"learning_rate": 2.6e-07,
"logits/chosen": -59720155.54179566,
"logits/rejected": -88351780.93114755,
"logps/chosen": -197.90833010835914,
"logps/rejected": -108.10444415983606,
"loss": 3.5435,
"rewards/chosen": 0.006978450544847424,
"rewards/margins": 0.021960525532540336,
"rewards/rejected": -0.014982074987692912,
"step": 40
},
{
"epoch": 1.616632860040568,
"grad_norm": 183.0,
"kl": 0.21559596061706543,
"learning_rate": 3.2666666666666663e-07,
"logits/chosen": -59467023.25827815,
"logits/rejected": -90559948.49704142,
"logps/chosen": -228.09385347682118,
"logps/rejected": -112.74208348742603,
"loss": 3.5032,
"rewards/chosen": 0.0010771212593609135,
"rewards/margins": 0.03369407513736861,
"rewards/rejected": -0.0326169538780077,
"step": 50
},
{
"epoch": 1.9411764705882353,
"grad_norm": 166.0,
"kl": 0.15380892157554626,
"learning_rate": 3.933333333333333e-07,
"logits/chosen": -65427847.25970149,
"logits/rejected": -90375137.78360656,
"logps/chosen": -201.5051072761194,
"logps/rejected": -115.42854764344263,
"loss": 3.4839,
"rewards/chosen": -0.005884787217894597,
"rewards/margins": 0.059975266923290735,
"rewards/rejected": -0.06586005414118533,
"step": 60
},
{
"epoch": 2.259634888438134,
"grad_norm": 177.0,
"kl": 0.14670009911060333,
"learning_rate": 4.6e-07,
"logits/chosen": -59004054.974358976,
"logits/rejected": -88976837.67088607,
"logps/chosen": -197.41224709535257,
"logps/rejected": -108.29176967958861,
"loss": 3.5035,
"rewards/chosen": 0.0009808578552343906,
"rewards/margins": 0.09745390586015737,
"rewards/rejected": -0.09647304800492298,
"step": 70
},
{
"epoch": 2.584178498985801,
"grad_norm": 147.0,
"kl": 0.17133259773254395,
"learning_rate": 5.266666666666666e-07,
"logits/chosen": -60354307.32467532,
"logits/rejected": -89784492.72289157,
"logps/chosen": -219.9271002435065,
"logps/rejected": -113.98176298945783,
"loss": 3.4706,
"rewards/chosen": -0.01354174180464311,
"rewards/margins": 0.1299854844710652,
"rewards/rejected": -0.1435272262757083,
"step": 80
},
{
"epoch": 2.9087221095334685,
"grad_norm": 197.0,
"kl": 0.18291868269443512,
"learning_rate": 5.933333333333334e-07,
"logits/chosen": -64280637.686746985,
"logits/rejected": -91105393.03896104,
"logps/chosen": -209.47286803463857,
"logps/rejected": -117.5850497159091,
"loss": 3.4451,
"rewards/chosen": -0.034842827233923485,
"rewards/margins": 0.20260719032359223,
"rewards/rejected": -0.2374500175575157,
"step": 90
},
{
"epoch": 3.227180527383367,
"grad_norm": 197.0,
"kl": 0.15808846056461334,
"learning_rate": 6.6e-07,
"logits/chosen": -61499443.52201258,
"logits/rejected": -88841982.34838709,
"logps/chosen": -191.46870086477986,
"logps/rejected": -110.64470766129033,
"loss": 3.4412,
"rewards/chosen": -0.02490143655980908,
"rewards/margins": 0.2772539658390694,
"rewards/rejected": -0.3021554023988785,
"step": 100
},
{
"epoch": 3.5517241379310347,
"grad_norm": 187.0,
"kl": 0.20476070046424866,
"learning_rate": 7.266666666666667e-07,
"logits/chosen": -59606039.973244146,
"logits/rejected": -90108102.19354838,
"logps/chosen": -218.58996132943145,
"logps/rejected": -116.83943135997067,
"loss": 3.4053,
"rewards/chosen": -0.04491897889204249,
"rewards/margins": 0.3912458546913978,
"rewards/rejected": -0.43616483358344027,
"step": 110
},
{
"epoch": 3.8762677484787016,
"grad_norm": 177.0,
"kl": 0.2384704351425171,
"learning_rate": 7.933333333333333e-07,
"logits/chosen": -63627667.66376811,
"logits/rejected": -90490970.25084746,
"logps/chosen": -210.72527173913045,
"logps/rejected": -119.54580243644068,
"loss": 3.3335,
"rewards/chosen": -0.08938233195871546,
"rewards/margins": 0.5145636426175557,
"rewards/rejected": -0.6039459745762712,
"step": 120
},
{
"epoch": 4.1947261663286,
"grad_norm": 180.0,
"kl": 0.1893850862979889,
"learning_rate": 8.599999999999999e-07,
"logits/chosen": -60919781.574193545,
"logits/rejected": -90189328.10062893,
"logps/chosen": -201.86630544354838,
"logps/rejected": -114.74926297169812,
"loss": 3.3418,
"rewards/chosen": -0.05891021605460874,
"rewards/margins": 0.6146469454804984,
"rewards/rejected": -0.6735571615351071,
"step": 130
},
{
"epoch": 4.519269776876268,
"grad_norm": 175.0,
"kl": 0.2661321759223938,
"learning_rate": 9.266666666666665e-07,
"logits/chosen": -59107307.58803987,
"logits/rejected": -89497455.00884956,
"logps/chosen": -215.70617473006644,
"logps/rejected": -120.84332365412979,
"loss": 3.2899,
"rewards/chosen": -0.11127648084266638,
"rewards/margins": 0.7554082443082831,
"rewards/rejected": -0.8666847251509495,
"step": 140
},
{
"epoch": 4.8438133874239355,
"grad_norm": 177.0,
"kl": 0.3608871102333069,
"learning_rate": 9.933333333333333e-07,
"logits/chosen": -64112037.64705882,
"logits/rejected": -91098432.85333334,
"logps/chosen": -204.3817325367647,
"logps/rejected": -125.74838541666666,
"loss": 3.23,
"rewards/chosen": -0.1627967497881721,
"rewards/margins": 0.8743928864422965,
"rewards/rejected": -1.0371896362304687,
"step": 150
},
{
"epoch": 5.162271805273834,
"grad_norm": 137.0,
"kl": 0.2991156578063965,
"learning_rate": 9.99095521855875e-07,
"logits/chosen": -60793345.625396825,
"logits/rejected": -90675919.74440895,
"logps/chosen": -211.70634920634922,
"logps/rejected": -120.8386456669329,
"loss": 3.217,
"rewards/chosen": -0.10563917614164807,
"rewards/margins": 0.9914023347846198,
"rewards/rejected": -1.097041510926268,
"step": 160
},
{
"epoch": 5.486815415821501,
"grad_norm": 171.0,
"kl": 0.35243138670921326,
"learning_rate": 9.959731316773258e-07,
"logits/chosen": -57961989.13712375,
"logits/rejected": -89004902.85043988,
"logps/chosen": -207.07096571906354,
"logps/rejected": -122.59219208211144,
"loss": 3.15,
"rewards/chosen": -0.1434617823980325,
"rewards/margins": 1.1325593303963681,
"rewards/rejected": -1.2760211127944006,
"step": 170
},
{
"epoch": 5.811359026369169,
"grad_norm": 185.0,
"kl": 0.4362719655036926,
"learning_rate": 9.906356050933962e-07,
"logits/chosen": -64087541.48973607,
"logits/rejected": -91216310.36789298,
"logps/chosen": -203.97832661290323,
"logps/rejected": -131.44156302257525,
"loss": 3.0881,
"rewards/chosen": -0.23974457234581195,
"rewards/margins": 1.1809270756605634,
"rewards/rejected": -1.4206716480063755,
"step": 180
},
{
"epoch": 6.129817444219067,
"grad_norm": 209.0,
"kl": 0.45517590641975403,
"learning_rate": 9.831067807935138e-07,
"logits/chosen": -60818541.48427673,
"logits/rejected": -91439209.7032258,
"logps/chosen": -216.79994595125785,
"logps/rejected": -123.5953125,
"loss": 3.1009,
"rewards/chosen": -0.14091354945920548,
"rewards/margins": 1.3181333373540505,
"rewards/rejected": -1.459046886813256,
"step": 190
},
{
"epoch": 6.454361054766734,
"grad_norm": 174.0,
"kl": 0.3785388171672821,
"learning_rate": 9.73420284334652e-07,
"logits/chosen": -57674728.34323432,
"logits/rejected": -88681900.43916914,
"logps/chosen": -205.04843492161717,
"logps/rejected": -123.72198395771514,
"loss": 3.0149,
"rewards/chosen": -0.1368737551245359,
"rewards/margins": 1.4349443391007053,
"rewards/rejected": -1.571818094225241,
"step": 200
},
{
"epoch": 6.454361054766734,
"eval_kl": 0.035786211490631104,
"eval_logits/chosen": -67682705.2972973,
"eval_logits/rejected": -106589274.61946903,
"eval_logps/chosen": -223.63279490427928,
"eval_logps/rejected": -130.98892941095133,
"eval_loss": 0.3622306287288666,
"eval_rewards/chosen": -0.1184040877196166,
"eval_rewards/margins": 1.4728804700617717,
"eval_rewards/rejected": -1.5912845577813883,
"eval_runtime": 14.432,
"eval_samples_per_second": 15.175,
"eval_steps_per_second": 0.97,
"step": 200
},
{
"epoch": 6.778904665314402,
"grad_norm": 173.0,
"kl": 0.486126571893692,
"learning_rate": 9.616193779614293e-07,
"logits/chosen": -62086706.13649852,
"logits/rejected": -91359813.28052805,
"logps/chosen": -207.32766135014836,
"logps/rejected": -134.37520627062707,
"loss": 2.9831,
"rewards/chosen": -0.22745244510095974,
"rewards/margins": 1.4696961454905213,
"rewards/rejected": -1.697148590591481,
"step": 210
},
{
"epoch": 7.0973630831643,
"grad_norm": 238.0,
"kl": 0.4411180913448334,
"learning_rate": 9.477567673864215e-07,
"logits/chosen": -61770599.064935066,
"logits/rejected": -91440755.2,
"logps/chosen": -212.82518262987014,
"logps/rejected": -127.96240234375,
"loss": 2.9719,
"rewards/chosen": -0.23503605731121904,
"rewards/margins": 1.439167243164855,
"rewards/rejected": -1.6742033004760741,
"step": 220
},
{
"epoch": 7.421906693711968,
"grad_norm": 696.0,
"kl": 0.43413224816322327,
"learning_rate": 9.318943663936569e-07,
"logits/chosen": -58800922.256410256,
"logits/rejected": -88753464.19512194,
"logps/chosen": -204.23775540865384,
"logps/rejected": -127.21961937881098,
"loss": 2.9524,
"rewards/chosen": -0.15640850556202424,
"rewards/margins": 1.593839914967225,
"rewards/rejected": -1.7502484205292492,
"step": 230
},
{
"epoch": 7.746450304259635,
"grad_norm": 155.0,
"kl": 0.6114085912704468,
"learning_rate": 9.141030203166256e-07,
"logits/chosen": -60832057.65765766,
"logits/rejected": -91749279.27035831,
"logps/chosen": -210.65941722972974,
"logps/rejected": -133.82223381514657,
"loss": 2.9045,
"rewards/chosen": -0.2403512674051004,
"rewards/margins": 1.6371219486983197,
"rewards/rejected": -1.8774732161034202,
"step": 240
},
{
"epoch": 8.064908722109534,
"grad_norm": 177.0,
"kl": 0.5247067213058472,
"learning_rate": 8.944621896258224e-07,
"logits/chosen": -61470391.79487179,
"logits/rejected": -90531684.4556962,
"logps/chosen": -212.98465044070514,
"logps/rejected": -129.2169822982595,
"loss": 2.9026,
"rewards/chosen": -0.2432682330791767,
"rewards/margins": 1.5595254387340156,
"rewards/rejected": -1.8027936718131923,
"step": 250
},
{
"epoch": 8.3894523326572,
"grad_norm": 144.0,
"kl": 0.4949173033237457,
"learning_rate": 8.730595950389967e-07,
"logits/chosen": -58573346.13333333,
"logits/rejected": -89080557.88307692,
"logps/chosen": -201.41715029761906,
"logps/rejected": -128.4549278846154,
"loss": 2.8883,
"rewards/chosen": -0.1385447789752294,
"rewards/margins": 1.7706690882122706,
"rewards/rejected": -1.9092138671875,
"step": 260
},
{
"epoch": 8.713995943204868,
"grad_norm": 185.0,
"kl": 0.6798511743545532,
"learning_rate": 8.499908257391323e-07,
"logits/chosen": -60232947.512195125,
"logits/rejected": -91839363.28205128,
"logps/chosen": -217.515625,
"logps/rejected": -134.24834735576923,
"loss": 2.8604,
"rewards/chosen": -0.22204266524896388,
"rewards/margins": 1.757865030814738,
"rewards/rejected": -1.9799076960637019,
"step": 270
},
{
"epoch": 9.032454361054766,
"grad_norm": 240.0,
"kl": 0.4586775600910187,
"learning_rate": 8.253589124499511e-07,
"logits/chosen": -61861802.11612903,
"logits/rejected": -91389275.7735849,
"logps/chosen": -207.82133316532259,
"logps/rejected": -133.32677378144655,
"loss": 2.8302,
"rewards/chosen": -0.2830538349766885,
"rewards/margins": 1.7209184404753555,
"rewards/rejected": -2.003972275452044,
"step": 280
},
{
"epoch": 9.356997971602434,
"grad_norm": 164.0,
"kl": 0.61592036485672,
"learning_rate": 7.992738672756908e-07,
"logits/chosen": -58843218.940809965,
"logits/rejected": -88515118.54545455,
"logps/chosen": -197.16750632788163,
"logps/rejected": -129.25817985893417,
"loss": 2.8523,
"rewards/chosen": -0.1824735837562062,
"rewards/margins": 1.795535009381529,
"rewards/rejected": -1.9780085931377351,
"step": 290
},
{
"epoch": 9.681541582150102,
"grad_norm": 137.0,
"kl": 0.7404313087463379,
"learning_rate": 7.718521923603404e-07,
"logits/chosen": -59332853.50157729,
"logits/rejected": -91459862.98452012,
"logps/chosen": -231.09537657728706,
"logps/rejected": -134.74608165634675,
"loss": 2.8027,
"rewards/chosen": -0.2009657730439484,
"rewards/margins": 1.85811701405938,
"rewards/rejected": -2.0590827871033284,
"step": 300
},
{
"epoch": 10.0,
"grad_norm": 207.0,
"kl": 0.44328153133392334,
"learning_rate": 7.43216359560785e-07,
"logits/chosen": -62495649.72698413,
"logits/rejected": -91457778.09584664,
"logps/chosen": -200.75829613095237,
"logps/rejected": -134.88111521565494,
"loss": 2.7823,
"rewards/chosen": -0.29915979778955853,
"rewards/margins": 1.8331558409384765,
"rewards/rejected": -2.132315638728035,
"step": 310
},
{
"epoch": 10.324543610547668,
"grad_norm": 182.0,
"kl": 0.5860379338264465,
"learning_rate": 7.134942634577615e-07,
"logits/chosen": -58812569.6,
"logits/rejected": -88819168.0,
"logps/chosen": -199.1839599609375,
"logps/rejected": -127.58912353515625,
"loss": 2.8078,
"rewards/chosen": -0.19876351356506347,
"rewards/margins": 1.8319713115692138,
"rewards/rejected": -2.030734825134277,
"step": 320
},
{
"epoch": 10.649087221095336,
"grad_norm": 177.0,
"kl": 0.6635628938674927,
"learning_rate": 6.828186501476144e-07,
"logits/chosen": -58466127.89808917,
"logits/rejected": -90940309.20245399,
"logps/chosen": -229.41757066082803,
"logps/rejected": -136.31909029907976,
"loss": 2.7883,
"rewards/chosen": -0.17490250897255671,
"rewards/margins": 1.9413016884488392,
"rewards/rejected": -2.116204197421396,
"step": 330
},
{
"epoch": 10.973630831643002,
"grad_norm": 176.0,
"kl": 0.6288160681724548,
"learning_rate": 6.513265243660057e-07,
"logits/chosen": -62876155.30275229,
"logits/rejected": -91420423.36102237,
"logps/chosen": -204.8246129587156,
"logps/rejected": -136.0083491413738,
"loss": 2.7516,
"rewards/chosen": -0.2638938507173404,
"rewards/margins": 1.962711744392983,
"rewards/rejected": -2.2266055951103234,
"step": 340
},
{
"epoch": 11.2920892494929,
"grad_norm": 165.0,
"kl": 0.6786984205245972,
"learning_rate": 6.191585375915055e-07,
"logits/chosen": -58107764.50773994,
"logits/rejected": -88665648.68196721,
"logps/chosen": -199.96833881578948,
"logps/rejected": -128.3748463114754,
"loss": 2.8082,
"rewards/chosen": -0.19902199193050987,
"rewards/margins": 1.8429992122831171,
"rewards/rejected": -2.042021204213627,
"step": 350
},
{
"epoch": 11.616632860040568,
"grad_norm": 167.0,
"kl": 0.6609476208686829,
"learning_rate": 5.864583598619467e-07,
"logits/chosen": -57476970.80794702,
"logits/rejected": -91102129.23076923,
"logps/chosen": -229.5999586092715,
"logps/rejected": -134.39487795857988,
"loss": 2.7377,
"rewards/chosen": -0.1495321286435159,
"rewards/margins": 2.048365336627841,
"rewards/rejected": -2.197897465271357,
"step": 360
},
{
"epoch": 11.941176470588236,
"grad_norm": 141.0,
"kl": 0.6255931854248047,
"learning_rate": 5.533720381091582e-07,
"logits/chosen": -63461816.16716418,
"logits/rejected": -90867329.2590164,
"logps/chosen": -204.81716417910448,
"logps/rejected": -137.47254098360656,
"loss": 2.7423,
"rewards/chosen": -0.3370915427136777,
"rewards/margins": 1.9331667260491296,
"rewards/rejected": -2.2702582687628072,
"step": 370
},
{
"epoch": 12.259634888438134,
"grad_norm": 160.0,
"kl": 0.6641746163368225,
"learning_rate": 5.200473438779146e-07,
"logits/chosen": -57381835.48717949,
"logits/rejected": -89336883.84810127,
"logps/chosen": -199.01509915865384,
"logps/rejected": -127.97672320015823,
"loss": 2.7637,
"rewards/chosen": -0.159302613674066,
"rewards/margins": 1.9056654471694863,
"rewards/rejected": -2.0649680608435523,
"step": 380
},
{
"epoch": 12.584178498985802,
"grad_norm": 158.0,
"kl": 0.605785071849823,
"learning_rate": 4.866331133423456e-07,
"logits/chosen": -58439706.597402595,
"logits/rejected": -90199761.73493975,
"logps/chosen": -222.0545606737013,
"logps/rejected": -134.65968561746988,
"loss": 2.7272,
"rewards/chosen": -0.22628873354428775,
"rewards/margins": 1.985029133137282,
"rewards/rejected": -2.21131786668157,
"step": 390
},
{
"epoch": 12.908722109533468,
"grad_norm": 192.0,
"kl": 0.7094799280166626,
"learning_rate": 4.5327858256745065e-07,
"logits/chosen": -62306581.590361446,
"logits/rejected": -91545985.66233766,
"logps/chosen": -212.20241905120483,
"logps/rejected": -138.62365564123377,
"loss": 2.7339,
"rewards/chosen": -0.30779806389866105,
"rewards/margins": 2.033511124349614,
"rewards/rejected": -2.341309188248275,
"step": 400
},
{
"epoch": 12.908722109533468,
"eval_kl": 0.035703618079423904,
"eval_logits/chosen": -67034790.05405405,
"eval_logits/rejected": -106966931.25663717,
"eval_logps/chosen": -224.06151463963963,
"eval_logps/rejected": -137.08888447179203,
"eval_loss": 0.3353511095046997,
"eval_rewards/chosen": -0.16127505603137318,
"eval_rewards/margins": 2.040005520961506,
"eval_rewards/rejected": -2.201280576992879,
"eval_runtime": 14.3688,
"eval_samples_per_second": 15.241,
"eval_steps_per_second": 0.974,
"step": 400
},
{
"epoch": 13.227180527383368,
"grad_norm": 160.0,
"kl": 0.6558622121810913,
"learning_rate": 4.201327209846065e-07,
"logits/chosen": -59866034.716981135,
"logits/rejected": -89190664.25806452,
"logps/chosen": -193.23755650550314,
"logps/rejected": -128.39615675403226,
"loss": 2.7624,
"rewards/chosen": -0.20178618521060585,
"rewards/margins": 1.875514249517217,
"rewards/rejected": -2.0773004347278228,
"step": 410
},
{
"epoch": 13.551724137931034,
"grad_norm": 156.0,
"kl": 0.5715658664703369,
"learning_rate": 3.873435660579217e-07,
"logits/chosen": -57941598.18060201,
"logits/rejected": -90468988.62170088,
"logps/chosen": -220.37727320234114,
"logps/rejected": -135.28165093475073,
"loss": 2.7148,
"rewards/chosen": -0.22364995950041805,
"rewards/margins": 2.0567365988756814,
"rewards/rejected": -2.2803865583760996,
"step": 420
},
{
"epoch": 13.876267748478702,
"grad_norm": 147.0,
"kl": 0.8168804049491882,
"learning_rate": 3.5505756211298774e-07,
"logits/chosen": -61979449.136231884,
"logits/rejected": -90915301.96610169,
"logps/chosen": -212.7193161231884,
"logps/rejected": -136.42921080508475,
"loss": 2.7174,
"rewards/chosen": -0.2887859178626019,
"rewards/margins": 2.0035011004769108,
"rewards/rejected": -2.2922870183395125,
"step": 430
},
{
"epoch": 14.1947261663286,
"grad_norm": 188.0,
"kl": 0.6269903779029846,
"learning_rate": 3.234189062809695e-07,
"logits/chosen": -59424662.29677419,
"logits/rejected": -90425550.08805032,
"logps/chosen": -203.18240927419356,
"logps/rejected": -129.64670548349056,
"loss": 2.7431,
"rewards/chosen": -0.19051946824596774,
"rewards/margins": 1.9727823955143509,
"rewards/rejected": -2.1633018637603185,
"step": 440
},
{
"epoch": 14.519269776876268,
"grad_norm": 145.0,
"kl": 0.5869894027709961,
"learning_rate": 2.9256890447921315e-07,
"logits/chosen": -57757369.408637874,
"logits/rejected": -89745903.38643068,
"logps/chosen": -217.05743874584718,
"logps/rejected": -134.99330521755164,
"loss": 2.7134,
"rewards/chosen": -0.24640435000194663,
"rewards/margins": 2.035277397209521,
"rewards/rejected": -2.2816817472114677,
"step": 450
},
{
"epoch": 14.843813387423936,
"grad_norm": 160.0,
"kl": 0.7495726346969604,
"learning_rate": 2.626453403047172e-07,
"logits/chosen": -62740118.5882353,
"logits/rejected": -91247097.17333333,
"logps/chosen": -205.96771599264707,
"logps/rejected": -138.3257421875,
"loss": 2.7088,
"rewards/chosen": -0.32139582914464615,
"rewards/margins": 1.973530928667854,
"rewards/rejected": -2.2949267578125,
"step": 460
},
{
"epoch": 15.162271805273834,
"grad_norm": 138.0,
"kl": 0.6461220979690552,
"learning_rate": 2.3378185965914078e-07,
"logits/chosen": -59542024.12698413,
"logits/rejected": -90673073.48242812,
"logps/chosen": -212.47857142857143,
"logps/rejected": -131.6886232028754,
"loss": 2.7498,
"rewards/chosen": -0.1828639923580109,
"rewards/margins": 1.9991756036633548,
"rewards/rejected": -2.1820395960213657,
"step": 470
},
{
"epoch": 15.486815415821502,
"grad_norm": 153.0,
"kl": 0.6038868427276611,
"learning_rate": 2.0610737385376348e-07,
"logits/chosen": -56933078.04682274,
"logits/rejected": -88981978.46334311,
"logps/chosen": -207.75867474916387,
"logps/rejected": -132.5094391495601,
"loss": 2.7032,
"rewards/chosen": -0.21223314470272,
"rewards/margins": 2.0555130507128774,
"rewards/rejected": -2.2677461954155973,
"step": 480
},
{
"epoch": 15.811359026369168,
"grad_norm": 169.0,
"kl": 0.7091981172561646,
"learning_rate": 1.7974548386027584e-07,
"logits/chosen": -62982918.75659824,
"logits/rejected": -91177055.89297658,
"logps/chosen": -205.08644153225808,
"logps/rejected": -140.23990123327758,
"loss": 2.6976,
"rewards/chosen": -0.3505571203147911,
"rewards/margins": 1.9499471239760744,
"rewards/rejected": -2.3005042442908654,
"step": 490
},
{
"epoch": 16.129817444219068,
"grad_norm": 170.0,
"kl": 0.7773324251174927,
"learning_rate": 1.5481392827883488e-07,
"logits/chosen": -59927378.11320755,
"logits/rejected": -91354719.79354839,
"logps/chosen": -217.3079304245283,
"logps/rejected": -131.10647681451613,
"loss": 2.7533,
"rewards/chosen": -0.19171231347809797,
"rewards/margins": 2.018452205801136,
"rewards/rejected": -2.210164519279234,
"step": 500
},
{
"epoch": 16.454361054766736,
"grad_norm": 159.0,
"kl": 0.592677116394043,
"learning_rate": 1.3142405748889457e-07,
"logits/chosen": -56867931.24752475,
"logits/rejected": -88677786.20771514,
"logps/chosen": -205.24737004950495,
"logps/rejected": -130.59708827893175,
"loss": 2.7022,
"rewards/chosen": -0.15676989413724088,
"rewards/margins": 2.102558342428505,
"rewards/rejected": -2.2593282365657457,
"step": 510
},
{
"epoch": 16.7789046653144,
"grad_norm": 176.0,
"kl": 0.7139925956726074,
"learning_rate": 1.096803363313803e-07,
"logits/chosen": -61236336.4272997,
"logits/rejected": -91304395.61716172,
"logps/chosen": -207.9202290430267,
"logps/rejected": -140.73229940181517,
"loss": 2.7113,
"rewards/chosen": -0.2867103825690838,
"rewards/margins": 2.0461471585561224,
"rewards/rejected": -2.332857541125206,
"step": 520
},
{
"epoch": 17.0973630831643,
"grad_norm": 197.0,
"kl": 0.7058033347129822,
"learning_rate": 8.967987754335022e-08,
"logits/chosen": -61097139.53246753,
"logits/rejected": -91390720.0,
"logps/chosen": -213.40300324675326,
"logps/rejected": -133.4157958984375,
"loss": 2.7195,
"rewards/chosen": -0.29281839147790684,
"rewards/margins": 1.9267250655533432,
"rewards/rejected": -2.21954345703125,
"step": 530
},
{
"epoch": 17.421906693711968,
"grad_norm": 264.0,
"kl": 0.5918253064155579,
"learning_rate": 7.15120080289368e-08,
"logits/chosen": -58138827.48717949,
"logits/rejected": -88649209.75609756,
"logps/chosen": -204.44078024839743,
"logps/rejected": -132.16799256859755,
"loss": 2.73,
"rewards/chosen": -0.17671105800530848,
"rewards/margins": 2.0683754258337728,
"rewards/rejected": -2.2450864838390814,
"step": 540
},
{
"epoch": 17.746450304259636,
"grad_norm": 157.0,
"kl": 0.7473562359809875,
"learning_rate": 5.5257869903709006e-08,
"logits/chosen": -60289140.85285285,
"logits/rejected": -91669847.55700326,
"logps/chosen": -210.96473817567568,
"logps/rejected": -138.43526058631923,
"loss": 2.7077,
"rewards/chosen": -0.27088488329638233,
"rewards/margins": 2.0678908508750347,
"rewards/rejected": -2.338775734171417,
"step": 550
},
{
"epoch": 18.064908722109532,
"grad_norm": 163.0,
"kl": 0.6325186491012573,
"learning_rate": 4.099005809428596e-08,
"logits/chosen": -60974821.743589744,
"logits/rejected": -90441339.1392405,
"logps/chosen": -213.37244591346155,
"logps/rejected": -133.14664507515823,
"loss": 2.7211,
"rewards/chosen": -0.2820472228221404,
"rewards/margins": 1.9137137696867599,
"rewards/rejected": -2.1957609925089003,
"step": 560
},
{
"epoch": 18.3894523326572,
"grad_norm": 138.0,
"kl": 0.6061395406723022,
"learning_rate": 2.8772296111772677e-08,
"logits/chosen": -58149936.76190476,
"logits/rejected": -89025810.11692308,
"logps/chosen": -201.52972470238095,
"logps/rejected": -132.06661057692307,
"loss": 2.7258,
"rewards/chosen": -0.14980345226469494,
"rewards/margins": 2.120578909514151,
"rewards/rejected": -2.270382361778846,
"step": 570
},
{
"epoch": 18.713995943204868,
"grad_norm": 191.0,
"kl": 0.7928330898284912,
"learning_rate": 1.865915144708985e-08,
"logits/chosen": -59847180.487804875,
"logits/rejected": -91838811.8974359,
"logps/chosen": -217.81192835365854,
"logps/rejected": -137.56884765625,
"loss": 2.7109,
"rewards/chosen": -0.25167286105272246,
"rewards/margins": 2.060285256310058,
"rewards/rejected": -2.3119581173627806,
"step": 580
},
{
"epoch": 19.032454361054768,
"grad_norm": 228.0,
"kl": 0.5313221216201782,
"learning_rate": 1.0695791859313297e-08,
"logits/chosen": -61405038.658064514,
"logits/rejected": -91388741.2327044,
"logps/chosen": -208.06592741935484,
"logps/rejected": -136.2116868121069,
"loss": 2.7008,
"rewards/chosen": -0.3075132308467742,
"rewards/margins": 1.9849507315402146,
"rewards/rejected": -2.292463962386989,
"step": 590
},
{
"epoch": 19.356997971602436,
"grad_norm": 159.0,
"kl": 0.7078633308410645,
"learning_rate": 4.917783645496887e-09,
"logits/chosen": -58602562.99065421,
"logits/rejected": -88561907.96238245,
"logps/chosen": -197.22442075545172,
"logps/rejected": -131.6894592476489,
"loss": 2.7409,
"rewards/chosen": -0.1881643990489924,
"rewards/margins": 2.0329730513575512,
"rewards/rejected": -2.2211374504065438,
"step": 600
},
{
"epoch": 19.356997971602436,
"eval_kl": 0.060564398765563965,
"eval_logits/chosen": -67062447.27927928,
"eval_logits/rejected": -106975032.63716814,
"eval_logps/chosen": -223.90932925112614,
"eval_logps/rejected": -137.44655696902655,
"eval_loss": 0.33327072858810425,
"eval_rewards/chosen": -0.14605889878831468,
"eval_rewards/margins": 2.090988279767064,
"eval_rewards/rejected": -2.237047178555379,
"eval_runtime": 14.3431,
"eval_samples_per_second": 15.269,
"eval_steps_per_second": 0.976,
"step": 600
},
{
"epoch": 19.6815415821501,
"grad_norm": 133.0,
"kl": 0.8249608278274536,
"learning_rate": 1.350932792956394e-09,
"logits/chosen": -59016821.90536278,
"logits/rejected": -91488921.75851393,
"logps/chosen": -231.22170741324922,
"logps/rejected": -136.9874467879257,
"loss": 2.7014,
"rewards/chosen": -0.21359941259919657,
"rewards/margins": 2.069619963247359,
"rewards/rejected": -2.2832193758465555,
"step": 610
},
{
"epoch": 20.0,
"grad_norm": 176.0,
"kl": 0.49784502387046814,
"learning_rate": 1.1169723465487279e-11,
"logits/chosen": -62227758.32380953,
"logits/rejected": -91514088.28115016,
"logps/chosen": -201.05374503968255,
"logps/rejected": -136.85892571884983,
"loss": 2.6991,
"rewards/chosen": -0.32870684426928326,
"rewards/margins": 2.0013896107776414,
"rewards/rejected": -2.3300964550469248,
"step": 620
}
],
"logging_steps": 10,
"max_steps": 620,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}