MNLP_M3_dpo_model / trainer_state.json
KaiserSlaughter's picture
first push
9f0c631 verified
Invalid JSON: Unexpected token 'N', ..."/chosen": NaN, "... is not valid JSON
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 50,
"global_step": 2699,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003705934127020892,
"grad_norm": 215.50672912597656,
"learning_rate": 4.983327158206743e-07,
"logits/chosen": -6.070415496826172,
"logits/rejected": -6.099751949310303,
"logps/chosen": -984.36767578125,
"logps/rejected": -897.9577026367188,
"loss": 0.6962,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.055581189692020416,
"rewards/margins": 0.0006150867557153106,
"rewards/rejected": 0.05496610328555107,
"step": 10
},
{
"epoch": 0.007411868254041784,
"grad_norm": 203.74668884277344,
"learning_rate": 4.964801778436458e-07,
"logits/chosen": -6.290555477142334,
"logits/rejected": -6.2849812507629395,
"logps/chosen": -932.5144653320312,
"logps/rejected": -823.7965698242188,
"loss": 0.675,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.27587246894836426,
"rewards/margins": 0.05210857465863228,
"rewards/rejected": 0.22376389801502228,
"step": 20
},
{
"epoch": 0.011117802381062676,
"grad_norm": 201.33848571777344,
"learning_rate": 4.946276398666173e-07,
"logits/chosen": -6.240113735198975,
"logits/rejected": -6.196782112121582,
"logps/chosen": -981.3587646484375,
"logps/rejected": -879.1512451171875,
"loss": 0.67,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.4677122235298157,
"rewards/margins": 0.07471133768558502,
"rewards/rejected": 0.39300084114074707,
"step": 30
},
{
"epoch": 0.014823736508083569,
"grad_norm": 201.69285583496094,
"learning_rate": 4.927751018895887e-07,
"logits/chosen": -6.257163047790527,
"logits/rejected": -6.211418151855469,
"logps/chosen": -998.1868286132812,
"logps/rejected": -941.7491455078125,
"loss": 0.6987,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.5558674931526184,
"rewards/margins": 0.02628883719444275,
"rewards/rejected": 0.5295786261558533,
"step": 40
},
{
"epoch": 0.01852967063510446,
"grad_norm": 213.06088256835938,
"learning_rate": 4.909225639125602e-07,
"logits/chosen": -6.227558612823486,
"logits/rejected": -6.338425636291504,
"logps/chosen": -1029.257080078125,
"logps/rejected": -952.8382568359375,
"loss": 0.6643,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.59141606092453,
"rewards/margins": 0.09994185715913773,
"rewards/rejected": 0.4914742112159729,
"step": 50
},
{
"epoch": 0.01852967063510446,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.348860263824463,
"eval_logps/chosen": -1146.9443359375,
"eval_logps/rejected": -1055.7943115234375,
"eval_loss": 0.6688504815101624,
"eval_rewards/accuracies": 0.5936188101768494,
"eval_rewards/chosen": 0.7400967478752136,
"eval_rewards/margins": 0.09535637497901917,
"eval_rewards/rejected": 0.6447404623031616,
"eval_runtime": 173.9141,
"eval_samples_per_second": 6.848,
"eval_steps_per_second": 6.848,
"step": 50
},
{
"epoch": 0.02223560476212535,
"grad_norm": 173.28237915039062,
"learning_rate": 4.890700259355317e-07,
"logits/chosen": -6.145205974578857,
"logits/rejected": -6.158076763153076,
"logps/chosen": -918.4729614257812,
"logps/rejected": -808.6856079101562,
"loss": 0.6631,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.6317356824874878,
"rewards/margins": 0.10025952756404877,
"rewards/rejected": 0.5314761400222778,
"step": 60
},
{
"epoch": 0.025941538889146246,
"grad_norm": 201.4814910888672,
"learning_rate": 4.872174879585031e-07,
"logits/chosen": -5.990462779998779,
"logits/rejected": -6.091423988342285,
"logps/chosen": -903.2649536132812,
"logps/rejected": -869.3302001953125,
"loss": 0.6871,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.7280756831169128,
"rewards/margins": 0.06912466883659363,
"rewards/rejected": 0.6589510440826416,
"step": 70
},
{
"epoch": 0.029647473016167138,
"grad_norm": 247.76434326171875,
"learning_rate": 4.853649499814746e-07,
"logits/chosen": -6.052975177764893,
"logits/rejected": NaN,
"logps/chosen": -994.2741088867188,
"logps/rejected": -887.9376220703125,
"loss": 0.6728,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.8349438905715942,
"rewards/margins": 0.10054773092269897,
"rewards/rejected": 0.7343961000442505,
"step": 80
},
{
"epoch": 0.03335340714318803,
"grad_norm": 241.12693786621094,
"learning_rate": 4.835124120044461e-07,
"logits/chosen": -6.244847774505615,
"logits/rejected": -6.195946216583252,
"logps/chosen": -953.9434814453125,
"logps/rejected": -816.5172119140625,
"loss": 0.6708,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.7360488176345825,
"rewards/margins": 0.10302430391311646,
"rewards/rejected": 0.6330245733261108,
"step": 90
},
{
"epoch": 0.03705934127020892,
"grad_norm": 222.10406494140625,
"learning_rate": 4.816598740274175e-07,
"logits/chosen": -6.177041530609131,
"logits/rejected": -6.0963640213012695,
"logps/chosen": -1006.8380737304688,
"logps/rejected": -819.0447998046875,
"loss": 0.6398,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.8563984036445618,
"rewards/margins": 0.1775047481060028,
"rewards/rejected": 0.6788936853408813,
"step": 100
},
{
"epoch": 0.03705934127020892,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.329836368560791,
"eval_logps/chosen": -1141.4117431640625,
"eval_logps/rejected": -1050.9219970703125,
"eval_loss": 0.6667112708091736,
"eval_rewards/accuracies": 0.5869017839431763,
"eval_rewards/chosen": 1.2933688163757324,
"eval_rewards/margins": 0.16138586401939392,
"eval_rewards/rejected": 1.1319829225540161,
"eval_runtime": 174.0404,
"eval_samples_per_second": 6.843,
"eval_steps_per_second": 6.843,
"step": 100
},
{
"epoch": 0.040765275397229815,
"grad_norm": 176.17654418945312,
"learning_rate": 4.79807336050389e-07,
"logits/chosen": NaN,
"logits/rejected": -6.241061210632324,
"logps/chosen": -934.3814697265625,
"logps/rejected": -809.5549926757812,
"loss": 0.6532,
"rewards/accuracies": 0.625,
"rewards/chosen": 1.0543756484985352,
"rewards/margins": 0.2122875154018402,
"rewards/rejected": 0.8420880436897278,
"step": 110
},
{
"epoch": 0.0444712095242507,
"grad_norm": 205.62350463867188,
"learning_rate": 4.779547980733605e-07,
"logits/chosen": -6.2480058670043945,
"logits/rejected": -6.166928291320801,
"logps/chosen": -996.0416259765625,
"logps/rejected": -834.2034912109375,
"loss": 0.6599,
"rewards/accuracies": 0.59375,
"rewards/chosen": 1.0299097299575806,
"rewards/margins": 0.1862904280424118,
"rewards/rejected": 0.8436192274093628,
"step": 120
},
{
"epoch": 0.0481771436512716,
"grad_norm": 191.88278198242188,
"learning_rate": 4.7610226009633197e-07,
"logits/chosen": -6.192694664001465,
"logits/rejected": -6.168017387390137,
"logps/chosen": -957.9847412109375,
"logps/rejected": -854.0274658203125,
"loss": 0.6462,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.9972726702690125,
"rewards/margins": 0.20573964715003967,
"rewards/rejected": 0.7915329933166504,
"step": 130
},
{
"epoch": 0.05188307777829249,
"grad_norm": 164.00282287597656,
"learning_rate": 4.742497221193034e-07,
"logits/chosen": -6.139467239379883,
"logits/rejected": -6.123991966247559,
"logps/chosen": -1019.96435546875,
"logps/rejected": -879.1519775390625,
"loss": 0.6461,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 1.0306396484375,
"rewards/margins": 0.21070317924022675,
"rewards/rejected": 0.8199363946914673,
"step": 140
},
{
"epoch": 0.05558901190531338,
"grad_norm": 192.38653564453125,
"learning_rate": 4.7239718414227493e-07,
"logits/chosen": -6.214751243591309,
"logits/rejected": -6.185935020446777,
"logps/chosen": -984.6964721679688,
"logps/rejected": -895.7721557617188,
"loss": 0.661,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.9771279096603394,
"rewards/margins": 0.13651703298091888,
"rewards/rejected": 0.8406108617782593,
"step": 150
},
{
"epoch": 0.05558901190531338,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.335906982421875,
"eval_logps/chosen": -1141.2706298828125,
"eval_logps/rejected": -1050.9266357421875,
"eval_loss": 0.6631777882575989,
"eval_rewards/accuracies": 0.5961377024650574,
"eval_rewards/chosen": 1.3074761629104614,
"eval_rewards/margins": 0.17596358060836792,
"eval_rewards/rejected": 1.1315125226974487,
"eval_runtime": 174.258,
"eval_samples_per_second": 6.835,
"eval_steps_per_second": 6.835,
"step": 150
},
{
"epoch": 0.059294946032334275,
"grad_norm": 174.99664306640625,
"learning_rate": 4.705446461652464e-07,
"logits/chosen": -6.193015098571777,
"logits/rejected": -6.1321492195129395,
"logps/chosen": -1026.88037109375,
"logps/rejected": -896.2667846679688,
"loss": 0.6248,
"rewards/accuracies": 0.65625,
"rewards/chosen": 1.183638572692871,
"rewards/margins": 0.23824377357959747,
"rewards/rejected": 0.9453946352005005,
"step": 160
},
{
"epoch": 0.06300088015935516,
"grad_norm": 192.7080078125,
"learning_rate": 4.6869210818821784e-07,
"logits/chosen": -6.125895023345947,
"logits/rejected": -6.105996131896973,
"logps/chosen": -891.0772705078125,
"logps/rejected": -783.1829223632812,
"loss": 0.6062,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 1.0523326396942139,
"rewards/margins": 0.29890042543411255,
"rewards/rejected": 0.7534322142601013,
"step": 170
},
{
"epoch": 0.06670681428637606,
"grad_norm": 213.39122009277344,
"learning_rate": 4.668395702111893e-07,
"logits/chosen": -6.171431064605713,
"logits/rejected": -6.141777992248535,
"logps/chosen": -950.2371215820312,
"logps/rejected": -837.3072509765625,
"loss": 0.6254,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 1.1676208972930908,
"rewards/margins": 0.2787976562976837,
"rewards/rejected": 0.8888231515884399,
"step": 180
},
{
"epoch": 0.07041274841339695,
"grad_norm": 187.03749084472656,
"learning_rate": 4.649870322341608e-07,
"logits/chosen": -6.19333028793335,
"logits/rejected": -6.163342475891113,
"logps/chosen": -958.0759887695312,
"logps/rejected": -845.267578125,
"loss": 0.6568,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 1.1464588642120361,
"rewards/margins": 0.23536348342895508,
"rewards/rejected": 0.9110953211784363,
"step": 190
},
{
"epoch": 0.07411868254041784,
"grad_norm": 150.2332763671875,
"learning_rate": 4.6313449425713225e-07,
"logits/chosen": -6.098294734954834,
"logits/rejected": -6.079904556274414,
"logps/chosen": -920.4479370117188,
"logps/rejected": -867.9637451171875,
"loss": 0.6275,
"rewards/accuracies": 0.625,
"rewards/chosen": 1.1195321083068848,
"rewards/margins": 0.24117548763751984,
"rewards/rejected": 0.8783566355705261,
"step": 200
},
{
"epoch": 0.07411868254041784,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.339965343475342,
"eval_logps/chosen": -1140.9739990234375,
"eval_logps/rejected": -1050.8843994140625,
"eval_loss": 0.6571991443634033,
"eval_rewards/accuracies": 0.6154491901397705,
"eval_rewards/chosen": 1.3371424674987793,
"eval_rewards/margins": 0.20140083134174347,
"eval_rewards/rejected": 1.1357417106628418,
"eval_runtime": 174.4949,
"eval_samples_per_second": 6.825,
"eval_steps_per_second": 6.825,
"step": 200
},
{
"epoch": 0.07782461666743874,
"grad_norm": 189.8514862060547,
"learning_rate": 4.6128195628010375e-07,
"logits/chosen": -6.194195747375488,
"logits/rejected": -6.146265983581543,
"logps/chosen": -915.0256958007812,
"logps/rejected": -845.5064697265625,
"loss": 0.6606,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.961922287940979,
"rewards/margins": 0.19818060100078583,
"rewards/rejected": 0.763741672039032,
"step": 210
},
{
"epoch": 0.08153055079445963,
"grad_norm": 189.69139099121094,
"learning_rate": 4.594294183030752e-07,
"logits/chosen": -6.284877300262451,
"logits/rejected": -6.263852119445801,
"logps/chosen": -1026.146484375,
"logps/rejected": -913.3385009765625,
"loss": 0.6511,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.9875092506408691,
"rewards/margins": 0.21295031905174255,
"rewards/rejected": 0.7745589017868042,
"step": 220
},
{
"epoch": 0.08523648492148052,
"grad_norm": 264.7326354980469,
"learning_rate": 4.575768803260467e-07,
"logits/chosen": -6.017802715301514,
"logits/rejected": -6.151331424713135,
"logps/chosen": -846.173828125,
"logps/rejected": -828.7672119140625,
"loss": 0.6848,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.7509580254554749,
"rewards/margins": 0.0957236960530281,
"rewards/rejected": 0.6552343368530273,
"step": 230
},
{
"epoch": 0.0889424190485014,
"grad_norm": 224.5631561279297,
"learning_rate": 4.557243423490181e-07,
"logits/chosen": -6.283064365386963,
"logits/rejected": -6.176108360290527,
"logps/chosen": -940.2021484375,
"logps/rejected": -812.8372802734375,
"loss": 0.6557,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.8100060224533081,
"rewards/margins": 0.16767463088035583,
"rewards/rejected": 0.6423314213752747,
"step": 240
},
{
"epoch": 0.09264835317552231,
"grad_norm": 176.48463439941406,
"learning_rate": 4.538718043719896e-07,
"logits/chosen": -6.0912933349609375,
"logits/rejected": -6.187921524047852,
"logps/chosen": -1011.1219482421875,
"logps/rejected": -848.7685546875,
"loss": 0.6152,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.857901930809021,
"rewards/margins": 0.25349634885787964,
"rewards/rejected": 0.6044055819511414,
"step": 250
},
{
"epoch": 0.09264835317552231,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.341000080108643,
"eval_logps/chosen": -1144.0086669921875,
"eval_logps/rejected": -1053.47119140625,
"eval_loss": 0.6642729043960571,
"eval_rewards/accuracies": 0.5801846981048584,
"eval_rewards/chosen": 1.0336804389953613,
"eval_rewards/margins": 0.1566334068775177,
"eval_rewards/rejected": 0.8770471215248108,
"eval_runtime": 174.2528,
"eval_samples_per_second": 6.835,
"eval_steps_per_second": 6.835,
"step": 250
},
{
"epoch": 0.0963542873025432,
"grad_norm": 194.99929809570312,
"learning_rate": 4.5201926639496107e-07,
"logits/chosen": -6.046469688415527,
"logits/rejected": -6.070072650909424,
"logps/chosen": -825.3790283203125,
"logps/rejected": -784.54052734375,
"loss": 0.6457,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.7765441536903381,
"rewards/margins": 0.17580363154411316,
"rewards/rejected": 0.6007404923439026,
"step": 260
},
{
"epoch": 0.10006022142956408,
"grad_norm": 232.19290161132812,
"learning_rate": 4.5016672841793257e-07,
"logits/chosen": -6.138308525085449,
"logits/rejected": -6.1265411376953125,
"logps/chosen": -966.5224609375,
"logps/rejected": -880.6701049804688,
"loss": 0.6047,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 1.0221220254898071,
"rewards/margins": 0.32319146394729614,
"rewards/rejected": 0.6989305019378662,
"step": 270
},
{
"epoch": 0.10376615555658499,
"grad_norm": 199.44570922851562,
"learning_rate": 4.48314190440904e-07,
"logits/chosen": -6.254446983337402,
"logits/rejected": -6.203383922576904,
"logps/chosen": -974.5589599609375,
"logps/rejected": -894.5767822265625,
"loss": 0.6625,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.9916531443595886,
"rewards/margins": 0.18319669365882874,
"rewards/rejected": 0.8084564208984375,
"step": 280
},
{
"epoch": 0.10747208968360587,
"grad_norm": 195.33485412597656,
"learning_rate": 4.4646165246387553e-07,
"logits/chosen": -6.29571533203125,
"logits/rejected": -6.212879180908203,
"logps/chosen": -883.5402221679688,
"logps/rejected": -783.243408203125,
"loss": 0.6307,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.9049898386001587,
"rewards/margins": 0.2685369849205017,
"rewards/rejected": 0.6364529132843018,
"step": 290
},
{
"epoch": 0.11117802381062676,
"grad_norm": 214.13623046875,
"learning_rate": 4.44609114486847e-07,
"logits/chosen": NaN,
"logits/rejected": -6.118219375610352,
"logps/chosen": -953.02001953125,
"logps/rejected": -822.9397583007812,
"loss": 0.6433,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.9801859855651855,
"rewards/margins": 0.20221543312072754,
"rewards/rejected": 0.7779706120491028,
"step": 300
},
{
"epoch": 0.11117802381062676,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.331676483154297,
"eval_logps/chosen": -1140.791748046875,
"eval_logps/rejected": -1050.74365234375,
"eval_loss": 0.6641189455986023,
"eval_rewards/accuracies": 0.6078925132751465,
"eval_rewards/chosen": 1.355363130569458,
"eval_rewards/margins": 0.20555777847766876,
"eval_rewards/rejected": 1.1498054265975952,
"eval_runtime": 174.1181,
"eval_samples_per_second": 6.84,
"eval_steps_per_second": 6.84,
"step": 300
},
{
"epoch": 0.11488395793764766,
"grad_norm": 151.80670166015625,
"learning_rate": 4.4275657650981843e-07,
"logits/chosen": -6.154031276702881,
"logits/rejected": -6.207782745361328,
"logps/chosen": -944.8739013671875,
"logps/rejected": -844.357421875,
"loss": 0.6034,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 1.1336545944213867,
"rewards/margins": 0.3580685555934906,
"rewards/rejected": 0.7755860090255737,
"step": 310
},
{
"epoch": 0.11858989206466855,
"grad_norm": 199.51202392578125,
"learning_rate": 4.409040385327899e-07,
"logits/chosen": -6.148090839385986,
"logits/rejected": -6.058573246002197,
"logps/chosen": -870.2352294921875,
"logps/rejected": -785.9710693359375,
"loss": 0.6577,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 1.0765498876571655,
"rewards/margins": 0.24500849843025208,
"rewards/rejected": 0.8315415382385254,
"step": 320
},
{
"epoch": 0.12229582619168944,
"grad_norm": 178.98838806152344,
"learning_rate": 4.390515005557614e-07,
"logits/chosen": -6.161218166351318,
"logits/rejected": -6.021878242492676,
"logps/chosen": -900.61279296875,
"logps/rejected": -716.9248046875,
"loss": 0.5969,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 1.0982271432876587,
"rewards/margins": 0.3538793623447418,
"rewards/rejected": 0.7443478107452393,
"step": 330
},
{
"epoch": 0.12600176031871033,
"grad_norm": 193.34494018554688,
"learning_rate": 4.3719896257873284e-07,
"logits/chosen": -6.12771463394165,
"logits/rejected": -6.165745735168457,
"logps/chosen": -938.45068359375,
"logps/rejected": -867.9703369140625,
"loss": 0.673,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 1.1986232995986938,
"rewards/margins": 0.20064759254455566,
"rewards/rejected": 0.9979757070541382,
"step": 340
},
{
"epoch": 0.12970769444573121,
"grad_norm": 176.0118865966797,
"learning_rate": 4.3534642460170435e-07,
"logits/chosen": -6.143977165222168,
"logits/rejected": -6.176082611083984,
"logps/chosen": -921.1917114257812,
"logps/rejected": -874.5079345703125,
"loss": 0.623,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 1.1385135650634766,
"rewards/margins": 0.2807597219944,
"rewards/rejected": 0.8577538728713989,
"step": 350
},
{
"epoch": 0.12970769444573121,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.329718589782715,
"eval_logps/chosen": -1140.4703369140625,
"eval_logps/rejected": -1050.4912109375,
"eval_loss": 0.6621597409248352,
"eval_rewards/accuracies": 0.6011754870414734,
"eval_rewards/chosen": 1.3875113725662231,
"eval_rewards/margins": 0.21245607733726501,
"eval_rewards/rejected": 1.1750552654266357,
"eval_runtime": 173.5305,
"eval_samples_per_second": 6.863,
"eval_steps_per_second": 6.863,
"step": 350
},
{
"epoch": 0.13341362857275213,
"grad_norm": 191.9296875,
"learning_rate": 4.334938866246758e-07,
"logits/chosen": -6.161706447601318,
"logits/rejected": -6.022977828979492,
"logps/chosen": -955.3453979492188,
"logps/rejected": -816.9100341796875,
"loss": 0.6297,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 1.105089545249939,
"rewards/margins": 0.2945402264595032,
"rewards/rejected": 0.8105493783950806,
"step": 360
},
{
"epoch": 0.13711956269977302,
"grad_norm": 208.82858276367188,
"learning_rate": 4.3164134864764725e-07,
"logits/chosen": -6.0739240646362305,
"logits/rejected": -6.2275519371032715,
"logps/chosen": -819.2732543945312,
"logps/rejected": -865.2078247070312,
"loss": 0.7031,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.8856269717216492,
"rewards/margins": 0.11217892169952393,
"rewards/rejected": 0.77344810962677,
"step": 370
},
{
"epoch": 0.1408254968267939,
"grad_norm": 180.92535400390625,
"learning_rate": 4.297888106706187e-07,
"logits/chosen": -6.172797203063965,
"logits/rejected": -6.124629497528076,
"logps/chosen": -935.8492431640625,
"logps/rejected": -818.2625732421875,
"loss": 0.6061,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 1.0353577136993408,
"rewards/margins": 0.3515828251838684,
"rewards/rejected": 0.683775007724762,
"step": 380
},
{
"epoch": 0.1445314309538148,
"grad_norm": 214.0187225341797,
"learning_rate": 4.2793627269359016e-07,
"logits/chosen": -6.215329170227051,
"logits/rejected": -6.21909236907959,
"logps/chosen": -1081.975341796875,
"logps/rejected": -913.9468994140625,
"loss": 0.5974,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 1.190735101699829,
"rewards/margins": 0.33388403058052063,
"rewards/rejected": 0.8568509817123413,
"step": 390
},
{
"epoch": 0.14823736508083568,
"grad_norm": 176.70887756347656,
"learning_rate": 4.2608373471656166e-07,
"logits/chosen": -6.156098365783691,
"logits/rejected": -6.209042072296143,
"logps/chosen": -976.1398315429688,
"logps/rejected": -873.5931396484375,
"loss": 0.6532,
"rewards/accuracies": 0.59375,
"rewards/chosen": 1.1283199787139893,
"rewards/margins": 0.22726468741893768,
"rewards/rejected": 0.9010552167892456,
"step": 400
},
{
"epoch": 0.14823736508083568,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.327154636383057,
"eval_logps/chosen": -1140.1060791015625,
"eval_logps/rejected": -1050.1492919921875,
"eval_loss": 0.6659889817237854,
"eval_rewards/accuracies": 0.6028547286987305,
"eval_rewards/chosen": 1.4239270687103271,
"eval_rewards/margins": 0.2146778702735901,
"eval_rewards/rejected": 1.2092490196228027,
"eval_runtime": 173.6869,
"eval_samples_per_second": 6.857,
"eval_steps_per_second": 6.857,
"step": 400
},
{
"epoch": 0.15194329920785657,
"grad_norm": 169.63328552246094,
"learning_rate": 4.242311967395331e-07,
"logits/chosen": -6.142382621765137,
"logits/rejected": -6.167304992675781,
"logps/chosen": -947.6598510742188,
"logps/rejected": -787.0640869140625,
"loss": 0.6019,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 1.2996371984481812,
"rewards/margins": 0.3752737045288086,
"rewards/rejected": 0.9243636131286621,
"step": 410
},
{
"epoch": 0.15564923333487748,
"grad_norm": 270.06866455078125,
"learning_rate": 4.223786587625046e-07,
"logits/chosen": -6.093822002410889,
"logits/rejected": -6.110901832580566,
"logps/chosen": -937.8591918945312,
"logps/rejected": -855.3360595703125,
"loss": 0.6348,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 1.346130132675171,
"rewards/margins": 0.3189659118652344,
"rewards/rejected": 1.0271642208099365,
"step": 420
},
{
"epoch": 0.15935516746189837,
"grad_norm": 124.60543823242188,
"learning_rate": 4.2052612078547607e-07,
"logits/chosen": -6.153736591339111,
"logits/rejected": -6.0936784744262695,
"logps/chosen": -891.9788208007812,
"logps/rejected": -771.3781127929688,
"loss": 0.6269,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 1.1474323272705078,
"rewards/margins": 0.3255314826965332,
"rewards/rejected": 0.8219007253646851,
"step": 430
},
{
"epoch": 0.16306110158891926,
"grad_norm": 169.39234924316406,
"learning_rate": 4.186735828084476e-07,
"logits/chosen": -6.129828453063965,
"logits/rejected": -6.149449348449707,
"logps/chosen": -891.6807861328125,
"logps/rejected": -785.4395751953125,
"loss": 0.6103,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 1.2507822513580322,
"rewards/margins": 0.34615927934646606,
"rewards/rejected": 0.9046230316162109,
"step": 440
},
{
"epoch": 0.16676703571594015,
"grad_norm": 182.51864624023438,
"learning_rate": 4.16821044831419e-07,
"logits/chosen": -6.106880187988281,
"logits/rejected": -6.003333568572998,
"logps/chosen": -994.0611572265625,
"logps/rejected": -866.1448974609375,
"loss": 0.6798,
"rewards/accuracies": 0.59375,
"rewards/chosen": 1.1819612979888916,
"rewards/margins": 0.22734245657920837,
"rewards/rejected": 0.9546189308166504,
"step": 450
},
{
"epoch": 0.16676703571594015,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.315321445465088,
"eval_logps/chosen": -1139.016845703125,
"eval_logps/rejected": -1049.19189453125,
"eval_loss": 0.6655510067939758,
"eval_rewards/accuracies": 0.6053736209869385,
"eval_rewards/chosen": 1.5328552722930908,
"eval_rewards/margins": 0.22787250578403473,
"eval_rewards/rejected": 1.3049829006195068,
"eval_runtime": 174.0654,
"eval_samples_per_second": 6.842,
"eval_steps_per_second": 6.842,
"step": 450
},
{
"epoch": 0.17047296984296104,
"grad_norm": 175.39608764648438,
"learning_rate": 4.149685068543905e-07,
"logits/chosen": -6.059579372406006,
"logits/rejected": -6.104693412780762,
"logps/chosen": -972.4683837890625,
"logps/rejected": -834.6124877929688,
"loss": 0.6116,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 1.3684813976287842,
"rewards/margins": 0.38127750158309937,
"rewards/rejected": 0.9872040748596191,
"step": 460
},
{
"epoch": 0.17417890396998192,
"grad_norm": 230.60934448242188,
"learning_rate": 4.1311596887736194e-07,
"logits/chosen": -6.06960916519165,
"logits/rejected": -6.0307111740112305,
"logps/chosen": -868.5339965820312,
"logps/rejected": -845.7874755859375,
"loss": 0.6496,
"rewards/accuracies": 0.65625,
"rewards/chosen": 1.1452016830444336,
"rewards/margins": 0.25945180654525757,
"rewards/rejected": 0.8857498168945312,
"step": 470
},
{
"epoch": 0.1778848380970028,
"grad_norm": 196.1241912841797,
"learning_rate": 4.1126343090033344e-07,
"logits/chosen": -6.163808822631836,
"logits/rejected": -6.103111267089844,
"logps/chosen": -993.4736328125,
"logps/rejected": -810.2939453125,
"loss": 0.5573,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 1.1358219385147095,
"rewards/margins": 0.4707656502723694,
"rewards/rejected": 0.6650562286376953,
"step": 480
},
{
"epoch": 0.18159077222402373,
"grad_norm": 225.72552490234375,
"learning_rate": 4.094108929233049e-07,
"logits/chosen": -6.20804500579834,
"logits/rejected": -6.198565483093262,
"logps/chosen": -967.2512817382812,
"logps/rejected": -881.41552734375,
"loss": 0.6359,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 1.2158067226409912,
"rewards/margins": 0.3084403872489929,
"rewards/rejected": 0.9073662757873535,
"step": 490
},
{
"epoch": 0.18529670635104462,
"grad_norm": 212.1605224609375,
"learning_rate": 4.075583549462764e-07,
"logits/chosen": -6.132593631744385,
"logits/rejected": -6.051444053649902,
"logps/chosen": -943.2779541015625,
"logps/rejected": -822.4968872070312,
"loss": 0.6209,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 1.2602580785751343,
"rewards/margins": 0.3520536720752716,
"rewards/rejected": 0.9082044363021851,
"step": 500
},
{
"epoch": 0.18529670635104462,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.321292400360107,
"eval_logps/chosen": -1139.3175048828125,
"eval_logps/rejected": -1049.5101318359375,
"eval_loss": 0.6620848774909973,
"eval_rewards/accuracies": 0.6011754870414734,
"eval_rewards/chosen": 1.5027841329574585,
"eval_rewards/margins": 0.2296140044927597,
"eval_rewards/rejected": 1.2731702327728271,
"eval_runtime": 174.6102,
"eval_samples_per_second": 6.821,
"eval_steps_per_second": 6.821,
"step": 500
},
{
"epoch": 0.1890026404780655,
"grad_norm": 153.60992431640625,
"learning_rate": 4.0570581696924785e-07,
"logits/chosen": -5.990109443664551,
"logits/rejected": -5.986026763916016,
"logps/chosen": -898.7940673828125,
"logps/rejected": -799.6437377929688,
"loss": 0.6165,
"rewards/accuracies": 0.65625,
"rewards/chosen": 1.1576520204544067,
"rewards/margins": 0.3633851110935211,
"rewards/rejected": 0.794266939163208,
"step": 510
},
{
"epoch": 0.1927085746050864,
"grad_norm": 172.2601776123047,
"learning_rate": 4.038532789922193e-07,
"logits/chosen": -6.226175308227539,
"logits/rejected": -6.190736293792725,
"logps/chosen": -883.1309814453125,
"logps/rejected": -794.0902099609375,
"loss": 0.6212,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 1.1897079944610596,
"rewards/margins": 0.35320332646369934,
"rewards/rejected": 0.8365045785903931,
"step": 520
},
{
"epoch": 0.19641450873210728,
"grad_norm": 192.50827026367188,
"learning_rate": 4.0200074101519076e-07,
"logits/chosen": -6.055853843688965,
"logits/rejected": NaN,
"logps/chosen": -956.99267578125,
"logps/rejected": -887.1090087890625,
"loss": 0.6176,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 1.1858432292938232,
"rewards/margins": 0.29170137643814087,
"rewards/rejected": 0.8941418528556824,
"step": 530
},
{
"epoch": 0.20012044285912817,
"grad_norm": 207.009521484375,
"learning_rate": 4.0014820303816226e-07,
"logits/chosen": -6.144883155822754,
"logits/rejected": NaN,
"logps/chosen": -929.91943359375,
"logps/rejected": -834.5404052734375,
"loss": 0.6305,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.9971332550048828,
"rewards/margins": 0.3494023382663727,
"rewards/rejected": 0.6477310061454773,
"step": 540
},
{
"epoch": 0.20382637698614908,
"grad_norm": 224.73924255371094,
"learning_rate": 3.982956650611337e-07,
"logits/chosen": -6.141475677490234,
"logits/rejected": -6.262620449066162,
"logps/chosen": -966.7326049804688,
"logps/rejected": -890.4728393554688,
"loss": 0.6286,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.9653292894363403,
"rewards/margins": 0.29025983810424805,
"rewards/rejected": 0.6750694513320923,
"step": 550
},
{
"epoch": 0.20382637698614908,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.334640979766846,
"eval_logps/chosen": -1142.1336669921875,
"eval_logps/rejected": -1052.1123046875,
"eval_loss": 0.660417377948761,
"eval_rewards/accuracies": 0.6179680824279785,
"eval_rewards/chosen": 1.2211687564849854,
"eval_rewards/margins": 0.2082298845052719,
"eval_rewards/rejected": 1.0129389762878418,
"eval_runtime": 174.5819,
"eval_samples_per_second": 6.822,
"eval_steps_per_second": 6.822,
"step": 550
},
{
"epoch": 0.20753231111316997,
"grad_norm": 318.4903869628906,
"learning_rate": 3.964431270841052e-07,
"logits/chosen": -6.1527791023254395,
"logits/rejected": -6.1851677894592285,
"logps/chosen": -932.6305541992188,
"logps/rejected": -875.3505859375,
"loss": 0.7095,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 1.0057893991470337,
"rewards/margins": 0.13861322402954102,
"rewards/rejected": 0.8671760559082031,
"step": 560
},
{
"epoch": 0.21123824524019086,
"grad_norm": 176.8316192626953,
"learning_rate": 3.9459058910707667e-07,
"logits/chosen": -6.179142951965332,
"logits/rejected": -6.174668788909912,
"logps/chosen": -1010.54052734375,
"logps/rejected": -875.7761840820312,
"loss": 0.607,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.096440076828003,
"rewards/margins": 0.36430811882019043,
"rewards/rejected": 0.7321318984031677,
"step": 570
},
{
"epoch": 0.21494417936721175,
"grad_norm": 215.4343719482422,
"learning_rate": 3.927380511300482e-07,
"logits/chosen": -6.139018535614014,
"logits/rejected": -6.070583343505859,
"logps/chosen": -961.7306518554688,
"logps/rejected": -831.40380859375,
"loss": 0.6366,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 1.0454758405685425,
"rewards/margins": 0.31583863496780396,
"rewards/rejected": 0.7296372652053833,
"step": 580
},
{
"epoch": 0.21865011349423263,
"grad_norm": 202.57000732421875,
"learning_rate": 3.908855131530196e-07,
"logits/chosen": -6.21251106262207,
"logits/rejected": -6.157750606536865,
"logps/chosen": -1023.9791259765625,
"logps/rejected": -936.6031494140625,
"loss": 0.6376,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 1.146539568901062,
"rewards/margins": 0.26862841844558716,
"rewards/rejected": 0.8779112100601196,
"step": 590
},
{
"epoch": 0.22235604762125352,
"grad_norm": 182.68316650390625,
"learning_rate": 3.890329751759911e-07,
"logits/chosen": -6.045320987701416,
"logits/rejected": -6.124794006347656,
"logps/chosen": -956.4278564453125,
"logps/rejected": -807.8424682617188,
"loss": 0.6274,
"rewards/accuracies": 0.625,
"rewards/chosen": 1.1107687950134277,
"rewards/margins": 0.26531320810317993,
"rewards/rejected": 0.845455527305603,
"step": 600
},
{
"epoch": 0.22235604762125352,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.337012767791748,
"eval_logps/chosen": -1139.7646484375,
"eval_logps/rejected": -1050.012451171875,
"eval_loss": 0.6612704396247864,
"eval_rewards/accuracies": 0.6179680824279785,
"eval_rewards/chosen": 1.458066701889038,
"eval_rewards/margins": 0.23514851927757263,
"eval_rewards/rejected": 1.2229182720184326,
"eval_runtime": 174.599,
"eval_samples_per_second": 6.821,
"eval_steps_per_second": 6.821,
"step": 600
},
{
"epoch": 0.22606198174827444,
"grad_norm": 190.41612243652344,
"learning_rate": 3.8718043719896253e-07,
"logits/chosen": -6.1819024085998535,
"logits/rejected": -6.2130818367004395,
"logps/chosen": -982.65869140625,
"logps/rejected": -875.1731567382812,
"loss": 0.6125,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 1.2317478656768799,
"rewards/margins": 0.363824725151062,
"rewards/rejected": 0.8679230809211731,
"step": 610
},
{
"epoch": 0.22976791587529533,
"grad_norm": 147.1990966796875,
"learning_rate": 3.8532789922193404e-07,
"logits/chosen": -6.232724666595459,
"logits/rejected": -6.308589458465576,
"logps/chosen": -914.0718994140625,
"logps/rejected": -837.0066528320312,
"loss": 0.6067,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 1.2401645183563232,
"rewards/margins": 0.37453925609588623,
"rewards/rejected": 0.8656252026557922,
"step": 620
},
{
"epoch": 0.2334738500023162,
"grad_norm": 200.28050231933594,
"learning_rate": 3.834753612449055e-07,
"logits/chosen": -6.240880012512207,
"logits/rejected": -6.253479957580566,
"logps/chosen": -951.05712890625,
"logps/rejected": -853.3023681640625,
"loss": 0.6128,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 1.3768174648284912,
"rewards/margins": 0.3513033986091614,
"rewards/rejected": 1.0255142450332642,
"step": 630
},
{
"epoch": 0.2371797841293371,
"grad_norm": 223.86203002929688,
"learning_rate": 3.81622823267877e-07,
"logits/chosen": -6.115043640136719,
"logits/rejected": -6.189513206481934,
"logps/chosen": -951.8463745117188,
"logps/rejected": -889.0245971679688,
"loss": 0.706,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 1.218057632446289,
"rewards/margins": 0.1631278544664383,
"rewards/rejected": 1.0549296140670776,
"step": 640
},
{
"epoch": 0.240885718256358,
"grad_norm": 211.12872314453125,
"learning_rate": 3.7977028529084845e-07,
"logits/chosen": -6.2889084815979,
"logits/rejected": -6.2076416015625,
"logps/chosen": -1071.9197998046875,
"logps/rejected": -939.8733520507812,
"loss": 0.6382,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 1.2747665643692017,
"rewards/margins": 0.3331315815448761,
"rewards/rejected": 0.941635012626648,
"step": 650
},
{
"epoch": 0.240885718256358,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.333639621734619,
"eval_logps/chosen": -1140.6971435546875,
"eval_logps/rejected": -1050.910888671875,
"eval_loss": 0.6616818904876709,
"eval_rewards/accuracies": 0.6204869747161865,
"eval_rewards/chosen": 1.36481773853302,
"eval_rewards/margins": 0.23173516988754272,
"eval_rewards/rejected": 1.133082628250122,
"eval_runtime": 174.4739,
"eval_samples_per_second": 6.826,
"eval_steps_per_second": 6.826,
"step": 650
},
{
"epoch": 0.24459165238337888,
"grad_norm": 157.5903778076172,
"learning_rate": 3.779177473138199e-07,
"logits/chosen": -6.22428560256958,
"logits/rejected": -6.160924434661865,
"logps/chosen": -881.8909301757812,
"logps/rejected": -848.2203979492188,
"loss": 0.6146,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 1.0156034231185913,
"rewards/margins": 0.35982877016067505,
"rewards/rejected": 0.6557747721672058,
"step": 660
},
{
"epoch": 0.24829758651039976,
"grad_norm": 210.9625244140625,
"learning_rate": 3.7606520933679135e-07,
"logits/chosen": -6.125003337860107,
"logits/rejected": -6.0754289627075195,
"logps/chosen": -884.8665771484375,
"logps/rejected": -789.2760620117188,
"loss": 0.651,
"rewards/accuracies": 0.65625,
"rewards/chosen": 1.0800001621246338,
"rewards/margins": 0.25831884145736694,
"rewards/rejected": 0.8216812014579773,
"step": 670
},
{
"epoch": 0.25200352063742065,
"grad_norm": 172.888916015625,
"learning_rate": 3.7421267135976286e-07,
"logits/chosen": -6.232366561889648,
"logits/rejected": -6.1383185386657715,
"logps/chosen": -960.3611450195312,
"logps/rejected": -835.4730224609375,
"loss": 0.6177,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 1.2130458354949951,
"rewards/margins": 0.35953769087791443,
"rewards/rejected": 0.8535081744194031,
"step": 680
},
{
"epoch": 0.25570945476444157,
"grad_norm": 186.8795623779297,
"learning_rate": 3.723601333827343e-07,
"logits/chosen": -6.141107082366943,
"logits/rejected": -6.195657253265381,
"logps/chosen": -938.1901245117188,
"logps/rejected": -814.3914184570312,
"loss": 0.6728,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.9760942459106445,
"rewards/margins": 0.2641645669937134,
"rewards/rejected": 0.7119296789169312,
"step": 690
},
{
"epoch": 0.25941538889146243,
"grad_norm": 152.13043212890625,
"learning_rate": 3.705075954057058e-07,
"logits/chosen": -6.24930477142334,
"logits/rejected": -6.1736040115356445,
"logps/chosen": -936.4893798828125,
"logps/rejected": -848.609375,
"loss": 0.5967,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 1.0887714624404907,
"rewards/margins": 0.3751378059387207,
"rewards/rejected": 0.71363365650177,
"step": 700
},
{
"epoch": 0.25941538889146243,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.339428901672363,
"eval_logps/chosen": -1141.5142822265625,
"eval_logps/rejected": -1051.634521484375,
"eval_loss": 0.6639354825019836,
"eval_rewards/accuracies": 0.6154491901397705,
"eval_rewards/chosen": 1.2831051349639893,
"eval_rewards/margins": 0.22238638997077942,
"eval_rewards/rejected": 1.0607186555862427,
"eval_runtime": 174.5031,
"eval_samples_per_second": 6.825,
"eval_steps_per_second": 6.825,
"step": 700
},
{
"epoch": 0.26312132301848334,
"grad_norm": 189.0890655517578,
"learning_rate": 3.6865505742867727e-07,
"logits/chosen": -6.229816436767578,
"logits/rejected": -6.161192893981934,
"logps/chosen": -850.07568359375,
"logps/rejected": -789.8104248046875,
"loss": 0.6791,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.9719418287277222,
"rewards/margins": 0.19261571764945984,
"rewards/rejected": 0.7793260812759399,
"step": 710
},
{
"epoch": 0.26682725714550426,
"grad_norm": 243.8544921875,
"learning_rate": 3.668025194516488e-07,
"logits/chosen": -6.192295551300049,
"logits/rejected": -6.1518754959106445,
"logps/chosen": -950.3997802734375,
"logps/rejected": -804.35009765625,
"loss": 0.623,
"rewards/accuracies": 0.625,
"rewards/chosen": 1.1841661930084229,
"rewards/margins": 0.35910895466804504,
"rewards/rejected": 0.8250571489334106,
"step": 720
},
{
"epoch": 0.2705331912725251,
"grad_norm": 194.04550170898438,
"learning_rate": 3.649499814746202e-07,
"logits/chosen": -6.118433952331543,
"logits/rejected": -6.096287727355957,
"logps/chosen": -998.7361450195312,
"logps/rejected": -866.0007934570312,
"loss": 0.5717,
"rewards/accuracies": 0.71875,
"rewards/chosen": 1.341344952583313,
"rewards/margins": 0.4694565236568451,
"rewards/rejected": 0.8718884587287903,
"step": 730
},
{
"epoch": 0.27423912539954604,
"grad_norm": 199.3968963623047,
"learning_rate": 3.630974434975917e-07,
"logits/chosen": -6.136265754699707,
"logits/rejected": -6.221334457397461,
"logps/chosen": -959.4265747070312,
"logps/rejected": -911.4915771484375,
"loss": 0.6336,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 1.3660612106323242,
"rewards/margins": 0.3232826888561249,
"rewards/rejected": 1.0427783727645874,
"step": 740
},
{
"epoch": 0.2779450595265669,
"grad_norm": 184.0609130859375,
"learning_rate": 3.6124490552056313e-07,
"logits/chosen": -6.145341873168945,
"logits/rejected": -6.150424480438232,
"logps/chosen": -933.7684326171875,
"logps/rejected": -828.2578125,
"loss": 0.6562,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 1.300537109375,
"rewards/margins": 0.30143502354621887,
"rewards/rejected": 0.999101996421814,
"step": 750
},
{
"epoch": 0.2779450595265669,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.332278251647949,
"eval_logps/chosen": -1140.3084716796875,
"eval_logps/rejected": -1050.5599365234375,
"eval_loss": 0.6633999347686768,
"eval_rewards/accuracies": 0.6171284914016724,
"eval_rewards/chosen": 1.4036915302276611,
"eval_rewards/margins": 0.23550742864608765,
"eval_rewards/rejected": 1.1681841611862183,
"eval_runtime": 174.2841,
"eval_samples_per_second": 6.834,
"eval_steps_per_second": 6.834,
"step": 750
},
{
"epoch": 0.2816509936535878,
"grad_norm": 187.0355224609375,
"learning_rate": 3.5939236754353464e-07,
"logits/chosen": -6.186856746673584,
"logits/rejected": -6.131129264831543,
"logps/chosen": -941.7112426757812,
"logps/rejected": -817.9475708007812,
"loss": 0.5649,
"rewards/accuracies": 0.71875,
"rewards/chosen": 1.3290807008743286,
"rewards/margins": 0.5047949552536011,
"rewards/rejected": 0.8242858052253723,
"step": 760
},
{
"epoch": 0.2853569277806087,
"grad_norm": 211.35397338867188,
"learning_rate": 3.575398295665061e-07,
"logits/chosen": -6.122169494628906,
"logits/rejected": -6.1151204109191895,
"logps/chosen": -887.787109375,
"logps/rejected": -862.0372314453125,
"loss": 0.6854,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 1.1365787982940674,
"rewards/margins": 0.2167353630065918,
"rewards/rejected": 0.9198434948921204,
"step": 770
},
{
"epoch": 0.2890628619076296,
"grad_norm": 198.17198181152344,
"learning_rate": 3.556872915894776e-07,
"logits/chosen": -6.241828918457031,
"logits/rejected": -6.265792369842529,
"logps/chosen": -929.9652099609375,
"logps/rejected": -877.3856201171875,
"loss": 0.614,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 1.117902398109436,
"rewards/margins": 0.3560691475868225,
"rewards/rejected": 0.7618332505226135,
"step": 780
},
{
"epoch": 0.2927687960346505,
"grad_norm": 150.22120666503906,
"learning_rate": 3.5383475361244905e-07,
"logits/chosen": -6.19686222076416,
"logits/rejected": -6.209750175476074,
"logps/chosen": -1054.142578125,
"logps/rejected": -874.3284912109375,
"loss": 0.5739,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.2843711376190186,
"rewards/margins": 0.4667224884033203,
"rewards/rejected": 0.817648708820343,
"step": 790
},
{
"epoch": 0.29647473016167136,
"grad_norm": 243.4776611328125,
"learning_rate": 3.519822156354205e-07,
"logits/chosen": -6.137392520904541,
"logits/rejected": -6.096640586853027,
"logps/chosen": -926.0994262695312,
"logps/rejected": -875.486328125,
"loss": 0.6676,
"rewards/accuracies": 0.625,
"rewards/chosen": 1.1599671840667725,
"rewards/margins": 0.2768460214138031,
"rewards/rejected": 0.883121132850647,
"step": 800
},
{
"epoch": 0.29647473016167136,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.3336334228515625,
"eval_logps/chosen": -1140.1082763671875,
"eval_logps/rejected": -1050.391845703125,
"eval_loss": 0.6643325090408325,
"eval_rewards/accuracies": 0.6272040009498596,
"eval_rewards/chosen": 1.4237107038497925,
"eval_rewards/margins": 0.23872110247612,
"eval_rewards/rejected": 1.1849894523620605,
"eval_runtime": 174.4194,
"eval_samples_per_second": 6.828,
"eval_steps_per_second": 6.828,
"step": 800
},
{
"epoch": 0.3001806642886923,
"grad_norm": 199.17247009277344,
"learning_rate": 3.5012967765839195e-07,
"logits/chosen": -6.241001129150391,
"logits/rejected": -6.098967552185059,
"logps/chosen": -920.8816528320312,
"logps/rejected": -898.8351440429688,
"loss": 0.7121,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 1.0589300394058228,
"rewards/margins": 0.1631755530834198,
"rewards/rejected": 0.8957546353340149,
"step": 810
},
{
"epoch": 0.30388659841571314,
"grad_norm": 250.08518981933594,
"learning_rate": 3.4827713968136346e-07,
"logits/chosen": -6.267752647399902,
"logits/rejected": -6.312867164611816,
"logps/chosen": -1013.2420654296875,
"logps/rejected": -958.6898193359375,
"loss": 0.6355,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 1.1633565425872803,
"rewards/margins": 0.3349772095680237,
"rewards/rejected": 0.8283793330192566,
"step": 820
},
{
"epoch": 0.30759253254273405,
"grad_norm": 208.069580078125,
"learning_rate": 3.464246017043349e-07,
"logits/chosen": -6.2897796630859375,
"logits/rejected": -6.228161811828613,
"logps/chosen": -933.79541015625,
"logps/rejected": -842.0236206054688,
"loss": 0.64,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 1.0309927463531494,
"rewards/margins": 0.28416553139686584,
"rewards/rejected": 0.7468270659446716,
"step": 830
},
{
"epoch": 0.31129846666975497,
"grad_norm": 232.0865478515625,
"learning_rate": 3.445720637273064e-07,
"logits/chosen": -6.142411708831787,
"logits/rejected": -6.164281845092773,
"logps/chosen": -1016.1708984375,
"logps/rejected": -898.3994140625,
"loss": 0.6169,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 1.0414842367172241,
"rewards/margins": 0.35715141892433167,
"rewards/rejected": 0.6843328475952148,
"step": 840
},
{
"epoch": 0.31500440079677583,
"grad_norm": 193.65232849121094,
"learning_rate": 3.4271952575027787e-07,
"logits/chosen": -6.228142738342285,
"logits/rejected": -6.197975158691406,
"logps/chosen": -931.4905395507812,
"logps/rejected": -856.0863037109375,
"loss": 0.6805,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 1.047786831855774,
"rewards/margins": 0.20453695952892303,
"rewards/rejected": 0.8432496786117554,
"step": 850
},
{
"epoch": 0.31500440079677583,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.3389668464660645,
"eval_logps/chosen": -1141.093505859375,
"eval_logps/rejected": -1051.356201171875,
"eval_loss": 0.6609283089637756,
"eval_rewards/accuracies": 0.6120907068252563,
"eval_rewards/chosen": 1.3251850605010986,
"eval_rewards/margins": 0.23663325607776642,
"eval_rewards/rejected": 1.0885517597198486,
"eval_runtime": 174.4195,
"eval_samples_per_second": 6.828,
"eval_steps_per_second": 6.828,
"step": 850
},
{
"epoch": 0.31871033492379675,
"grad_norm": 191.5203094482422,
"learning_rate": 3.4086698777324937e-07,
"logits/chosen": -6.188792705535889,
"logits/rejected": -6.236809730529785,
"logps/chosen": -977.2009887695312,
"logps/rejected": -865.64306640625,
"loss": 0.5709,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 1.2319726943969727,
"rewards/margins": 0.48221221566200256,
"rewards/rejected": 0.7497605085372925,
"step": 860
},
{
"epoch": 0.3224162690508176,
"grad_norm": 179.01022338867188,
"learning_rate": 3.3901444979622077e-07,
"logits/chosen": -6.048168659210205,
"logits/rejected": -6.131080627441406,
"logps/chosen": -959.7575073242188,
"logps/rejected": -822.2799072265625,
"loss": 0.6798,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 1.2015607357025146,
"rewards/margins": 0.25019967555999756,
"rewards/rejected": 0.9513611793518066,
"step": 870
},
{
"epoch": 0.3261222031778385,
"grad_norm": 164.24993896484375,
"learning_rate": 3.371619118191923e-07,
"logits/chosen": -6.164304256439209,
"logits/rejected": -6.14687442779541,
"logps/chosen": -984.1392822265625,
"logps/rejected": -887.4852294921875,
"loss": 0.5921,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.326623558998108,
"rewards/margins": 0.4440253674983978,
"rewards/rejected": 0.8825982213020325,
"step": 880
},
{
"epoch": 0.3298281373048594,
"grad_norm": 208.62112426757812,
"learning_rate": 3.3530937384216373e-07,
"logits/chosen": -6.055702209472656,
"logits/rejected": -6.137775421142578,
"logps/chosen": -966.544921875,
"logps/rejected": -852.4841918945312,
"loss": 0.6072,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 1.298642873764038,
"rewards/margins": 0.36649665236473083,
"rewards/rejected": 0.932146430015564,
"step": 890
},
{
"epoch": 0.3335340714318803,
"grad_norm": 198.2984619140625,
"learning_rate": 3.3345683586513524e-07,
"logits/chosen": -6.077668190002441,
"logits/rejected": -6.017401218414307,
"logps/chosen": -968.5540771484375,
"logps/rejected": -825.6281127929688,
"loss": 0.5936,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 1.3333479166030884,
"rewards/margins": 0.48964110016822815,
"rewards/rejected": 0.8437067866325378,
"step": 900
},
{
"epoch": 0.3335340714318803,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.333832263946533,
"eval_logps/chosen": -1138.702392578125,
"eval_logps/rejected": -1049.201904296875,
"eval_loss": 0.6653527021408081,
"eval_rewards/accuracies": 0.6322417855262756,
"eval_rewards/chosen": 1.564302682876587,
"eval_rewards/margins": 0.26032954454421997,
"eval_rewards/rejected": 1.3039733171463013,
"eval_runtime": 174.3951,
"eval_samples_per_second": 6.829,
"eval_steps_per_second": 6.829,
"step": 900
},
{
"epoch": 0.3372400055589012,
"grad_norm": 172.08016967773438,
"learning_rate": 3.316042978881067e-07,
"logits/chosen": -6.152904033660889,
"logits/rejected": -6.063776969909668,
"logps/chosen": -934.8038940429688,
"logps/rejected": -874.7457885742188,
"loss": 0.6239,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.3152906894683838,
"rewards/margins": 0.36238130927085876,
"rewards/rejected": 0.9529093503952026,
"step": 910
},
{
"epoch": 0.3409459396859221,
"grad_norm": 154.69894409179688,
"learning_rate": 3.297517599110782e-07,
"logits/chosen": -6.253002166748047,
"logits/rejected": -6.253316402435303,
"logps/chosen": -1026.7967529296875,
"logps/rejected": -909.5032348632812,
"loss": 0.6319,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.4157216548919678,
"rewards/margins": 0.3379477262496948,
"rewards/rejected": 1.077773928642273,
"step": 920
},
{
"epoch": 0.344651873812943,
"grad_norm": 128.03990173339844,
"learning_rate": 3.278992219340496e-07,
"logits/chosen": -6.163126468658447,
"logits/rejected": -6.272846698760986,
"logps/chosen": -1002.2288208007812,
"logps/rejected": -850.8206787109375,
"loss": 0.5969,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 1.2641438245773315,
"rewards/margins": 0.45238691568374634,
"rewards/rejected": 0.81175696849823,
"step": 930
},
{
"epoch": 0.34835780793996385,
"grad_norm": 126.5301284790039,
"learning_rate": 3.260466839570211e-07,
"logits/chosen": -6.24020528793335,
"logits/rejected": -6.305496692657471,
"logps/chosen": -831.75244140625,
"logps/rejected": -811.0787353515625,
"loss": 0.6274,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 1.032156229019165,
"rewards/margins": 0.3172861337661743,
"rewards/rejected": 0.7148701548576355,
"step": 940
},
{
"epoch": 0.35206374206698476,
"grad_norm": 211.2845916748047,
"learning_rate": 3.2419414597999255e-07,
"logits/chosen": -6.139876365661621,
"logits/rejected": -6.111436367034912,
"logps/chosen": -975.7356567382812,
"logps/rejected": -844.0784912109375,
"loss": 0.6325,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 1.1637299060821533,
"rewards/margins": 0.3424530327320099,
"rewards/rejected": 0.8212767839431763,
"step": 950
},
{
"epoch": 0.35206374206698476,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.342590808868408,
"eval_logps/chosen": -1141.1739501953125,
"eval_logps/rejected": -1051.595458984375,
"eval_loss": 0.6582168936729431,
"eval_rewards/accuracies": 0.6213266253471375,
"eval_rewards/chosen": 1.3171454668045044,
"eval_rewards/margins": 0.25251859426498413,
"eval_rewards/rejected": 1.064626932144165,
"eval_runtime": 174.1358,
"eval_samples_per_second": 6.839,
"eval_steps_per_second": 6.839,
"step": 950
},
{
"epoch": 0.3557696761940056,
"grad_norm": 203.0992431640625,
"learning_rate": 3.2234160800296406e-07,
"logits/chosen": -6.084280967712402,
"logits/rejected": -6.071610450744629,
"logps/chosen": -821.4600830078125,
"logps/rejected": -727.051025390625,
"loss": 0.6376,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 1.0315455198287964,
"rewards/margins": 0.3190918266773224,
"rewards/rejected": 0.7124537229537964,
"step": 960
},
{
"epoch": 0.35947561032102654,
"grad_norm": 180.57797241210938,
"learning_rate": 3.204890700259355e-07,
"logits/chosen": -6.084843635559082,
"logits/rejected": -6.00299072265625,
"logps/chosen": -981.2433471679688,
"logps/rejected": -845.26123046875,
"loss": 0.5913,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 1.1229795217514038,
"rewards/margins": 0.41206812858581543,
"rewards/rejected": 0.7109113931655884,
"step": 970
},
{
"epoch": 0.36318154444804746,
"grad_norm": 207.84568786621094,
"learning_rate": 3.18636532048907e-07,
"logits/chosen": -6.196352958679199,
"logits/rejected": -6.150284767150879,
"logps/chosen": -915.8230590820312,
"logps/rejected": -826.0857543945312,
"loss": 0.6367,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 1.011979579925537,
"rewards/margins": 0.326684832572937,
"rewards/rejected": 0.6852947473526001,
"step": 980
},
{
"epoch": 0.3668874785750683,
"grad_norm": 169.62612915039062,
"learning_rate": 3.1678399407187847e-07,
"logits/chosen": -6.155111312866211,
"logits/rejected": -6.252329349517822,
"logps/chosen": -945.7684326171875,
"logps/rejected": -895.0965576171875,
"loss": 0.6591,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 1.0974936485290527,
"rewards/margins": 0.277460515499115,
"rewards/rejected": 0.820033073425293,
"step": 990
},
{
"epoch": 0.37059341270208923,
"grad_norm": 164.93836975097656,
"learning_rate": 3.1493145609484997e-07,
"logits/chosen": -6.1941423416137695,
"logits/rejected": -6.119546413421631,
"logps/chosen": -859.4827270507812,
"logps/rejected": -778.8285522460938,
"loss": 0.614,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 1.124022126197815,
"rewards/margins": 0.3852692246437073,
"rewards/rejected": 0.7387528419494629,
"step": 1000
},
{
"epoch": 0.37059341270208923,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.3517584800720215,
"eval_logps/chosen": -1141.3780517578125,
"eval_logps/rejected": -1051.7454833984375,
"eval_loss": 0.6584185361862183,
"eval_rewards/accuracies": 0.6196473836898804,
"eval_rewards/chosen": 1.296731948852539,
"eval_rewards/margins": 0.24711348116397858,
"eval_rewards/rejected": 1.0496186017990112,
"eval_runtime": 174.4804,
"eval_samples_per_second": 6.826,
"eval_steps_per_second": 6.826,
"step": 1000
},
{
"epoch": 0.3742993468291101,
"grad_norm": 161.15005493164062,
"learning_rate": 3.1307891811782137e-07,
"logits/chosen": -6.150378227233887,
"logits/rejected": -6.203757286071777,
"logps/chosen": -1034.891357421875,
"logps/rejected": -888.6922607421875,
"loss": 0.6111,
"rewards/accuracies": 0.65625,
"rewards/chosen": 1.073899507522583,
"rewards/margins": 0.3846450746059418,
"rewards/rejected": 0.6892544031143188,
"step": 1010
},
{
"epoch": 0.378005280956131,
"grad_norm": 167.26861572265625,
"learning_rate": 3.112263801407929e-07,
"logits/chosen": -6.080620765686035,
"logits/rejected": -6.124800682067871,
"logps/chosen": -882.3255615234375,
"logps/rejected": -834.7425537109375,
"loss": 0.6356,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 1.0985982418060303,
"rewards/margins": 0.2840558588504791,
"rewards/rejected": 0.8145424127578735,
"step": 1020
},
{
"epoch": 0.3817112150831519,
"grad_norm": 143.760009765625,
"learning_rate": 3.0937384216376433e-07,
"logits/chosen": NaN,
"logits/rejected": -6.1283392906188965,
"logps/chosen": -931.8167114257812,
"logps/rejected": -770.5465087890625,
"loss": 0.5515,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 1.2119407653808594,
"rewards/margins": 0.5438046455383301,
"rewards/rejected": 0.6681360006332397,
"step": 1030
},
{
"epoch": 0.3854171492101728,
"grad_norm": 213.75035095214844,
"learning_rate": 3.0752130418673583e-07,
"logits/chosen": -6.2023396492004395,
"logits/rejected": -6.126180648803711,
"logps/chosen": -923.2513427734375,
"logps/rejected": -764.4282836914062,
"loss": 0.6242,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 1.2077882289886475,
"rewards/margins": 0.35475489497184753,
"rewards/rejected": 0.8530333638191223,
"step": 1040
},
{
"epoch": 0.3891230833371937,
"grad_norm": 129.1380157470703,
"learning_rate": 3.056687662097073e-07,
"logits/chosen": -6.151089668273926,
"logits/rejected": -6.196557998657227,
"logps/chosen": -891.5910034179688,
"logps/rejected": -825.3084716796875,
"loss": 0.5819,
"rewards/accuracies": 0.65625,
"rewards/chosen": 1.2349982261657715,
"rewards/margins": 0.49733766913414,
"rewards/rejected": 0.7376605272293091,
"step": 1050
},
{
"epoch": 0.3891230833371937,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.354092597961426,
"eval_logps/chosen": -1138.8973388671875,
"eval_logps/rejected": -1049.481689453125,
"eval_loss": 0.6607492566108704,
"eval_rewards/accuracies": 0.6213266253471375,
"eval_rewards/chosen": 1.544799566268921,
"eval_rewards/margins": 0.26879334449768066,
"eval_rewards/rejected": 1.2760061025619507,
"eval_runtime": 174.2371,
"eval_samples_per_second": 6.836,
"eval_steps_per_second": 6.836,
"step": 1050
},
{
"epoch": 0.39282901746421456,
"grad_norm": 164.7945556640625,
"learning_rate": 3.038162282326788e-07,
"logits/chosen": -6.143533229827881,
"logits/rejected": -6.127655982971191,
"logps/chosen": -845.65234375,
"logps/rejected": -816.9622802734375,
"loss": 0.6427,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 1.2775068283081055,
"rewards/margins": 0.3459513187408447,
"rewards/rejected": 0.9315555691719055,
"step": 1060
},
{
"epoch": 0.3965349515912355,
"grad_norm": 149.87388610839844,
"learning_rate": 3.019636902556502e-07,
"logits/chosen": -6.187637805938721,
"logits/rejected": -6.129674434661865,
"logps/chosen": -885.9992065429688,
"logps/rejected": -778.6902465820312,
"loss": 0.608,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 1.1672834157943726,
"rewards/margins": 0.3892399072647095,
"rewards/rejected": 0.7780434489250183,
"step": 1070
},
{
"epoch": 0.40024088571825633,
"grad_norm": 166.63389587402344,
"learning_rate": 3.001111522786217e-07,
"logits/chosen": -6.271850109100342,
"logits/rejected": -6.208528995513916,
"logps/chosen": -913.72998046875,
"logps/rejected": -773.72412109375,
"loss": 0.6183,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 1.156951904296875,
"rewards/margins": 0.3150864243507385,
"rewards/rejected": 0.8418653607368469,
"step": 1080
},
{
"epoch": 0.40394681984527725,
"grad_norm": 168.648681640625,
"learning_rate": 2.9825861430159315e-07,
"logits/chosen": -6.246833801269531,
"logits/rejected": -6.256269931793213,
"logps/chosen": -948.3607177734375,
"logps/rejected": -898.9035034179688,
"loss": 0.6266,
"rewards/accuracies": 0.65625,
"rewards/chosen": 1.2156215906143188,
"rewards/margins": 0.34718313813209534,
"rewards/rejected": 0.8684385418891907,
"step": 1090
},
{
"epoch": 0.40765275397229817,
"grad_norm": 174.19361877441406,
"learning_rate": 2.9640607632456465e-07,
"logits/chosen": -6.159620761871338,
"logits/rejected": -6.141018867492676,
"logps/chosen": -986.7599487304688,
"logps/rejected": -843.1695556640625,
"loss": 0.5832,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 1.3146113157272339,
"rewards/margins": 0.4466518461704254,
"rewards/rejected": 0.8679596185684204,
"step": 1100
},
{
"epoch": 0.40765275397229817,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.353438854217529,
"eval_logps/chosen": -1139.6771240234375,
"eval_logps/rejected": -1050.29736328125,
"eval_loss": 0.6590859293937683,
"eval_rewards/accuracies": 0.6179680824279785,
"eval_rewards/chosen": 1.466820478439331,
"eval_rewards/margins": 0.2723851799964905,
"eval_rewards/rejected": 1.1944352388381958,
"eval_runtime": 174.1419,
"eval_samples_per_second": 6.839,
"eval_steps_per_second": 6.839,
"step": 1100
},
{
"epoch": 0.411358688099319,
"grad_norm": 207.91915893554688,
"learning_rate": 2.945535383475361e-07,
"logits/chosen": -6.101978302001953,
"logits/rejected": -6.107656955718994,
"logps/chosen": -853.7755737304688,
"logps/rejected": -771.1502685546875,
"loss": 0.6212,
"rewards/accuracies": 0.625,
"rewards/chosen": 1.1558765172958374,
"rewards/margins": 0.4091036915779114,
"rewards/rejected": 0.7467728853225708,
"step": 1110
},
{
"epoch": 0.41506462222633994,
"grad_norm": 172.02349853515625,
"learning_rate": 2.927010003705076e-07,
"logits/chosen": -6.207159996032715,
"logits/rejected": -6.222277641296387,
"logps/chosen": -924.0470581054688,
"logps/rejected": -801.3013916015625,
"loss": 0.5918,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 1.1804696321487427,
"rewards/margins": 0.443875253200531,
"rewards/rejected": 0.7365943193435669,
"step": 1120
},
{
"epoch": 0.4187705563533608,
"grad_norm": 147.4911346435547,
"learning_rate": 2.9084846239347906e-07,
"logits/chosen": -6.112509727478027,
"logits/rejected": -6.115456581115723,
"logps/chosen": -937.400390625,
"logps/rejected": -851.7019653320312,
"loss": 0.5924,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 1.4129273891448975,
"rewards/margins": 0.4493323266506195,
"rewards/rejected": 0.9635950326919556,
"step": 1130
},
{
"epoch": 0.4224764904803817,
"grad_norm": 188.7657012939453,
"learning_rate": 2.8899592441645057e-07,
"logits/chosen": -6.175480842590332,
"logits/rejected": -6.202576637268066,
"logps/chosen": -860.0646362304688,
"logps/rejected": -785.8237915039062,
"loss": 0.5818,
"rewards/accuracies": 0.71875,
"rewards/chosen": 1.2907373905181885,
"rewards/margins": 0.48255085945129395,
"rewards/rejected": 0.8081865310668945,
"step": 1140
},
{
"epoch": 0.4261824246074026,
"grad_norm": 195.689208984375,
"learning_rate": 2.8714338643942197e-07,
"logits/chosen": -6.1582794189453125,
"logits/rejected": -6.1286234855651855,
"logps/chosen": -882.5172729492188,
"logps/rejected": -781.136962890625,
"loss": 0.6334,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 1.112781286239624,
"rewards/margins": 0.32989996671676636,
"rewards/rejected": 0.7828812599182129,
"step": 1150
},
{
"epoch": 0.4261824246074026,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.354198932647705,
"eval_logps/chosen": -1139.8521728515625,
"eval_logps/rejected": -1050.4593505859375,
"eval_loss": 0.6620603203773499,
"eval_rewards/accuracies": 0.6230058670043945,
"eval_rewards/chosen": 1.449316382408142,
"eval_rewards/margins": 0.27107417583465576,
"eval_rewards/rejected": 1.1782420873641968,
"eval_runtime": 174.0926,
"eval_samples_per_second": 6.841,
"eval_steps_per_second": 6.841,
"step": 1150
},
{
"epoch": 0.4298883587344235,
"grad_norm": 196.42591857910156,
"learning_rate": 2.852908484623935e-07,
"logits/chosen": -6.155422687530518,
"logits/rejected": -6.14513635635376,
"logps/chosen": -881.4910888671875,
"logps/rejected": -846.4808349609375,
"loss": 0.5939,
"rewards/accuracies": 0.65625,
"rewards/chosen": 1.1745377779006958,
"rewards/margins": 0.4163404405117035,
"rewards/rejected": 0.7581971883773804,
"step": 1160
},
{
"epoch": 0.4335942928614444,
"grad_norm": 143.9552001953125,
"learning_rate": 2.834383104853649e-07,
"logits/chosen": -6.306971549987793,
"logits/rejected": -6.230139255523682,
"logps/chosen": -952.1017456054688,
"logps/rejected": -841.35498046875,
"loss": 0.6176,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.0589168071746826,
"rewards/margins": 0.36980077624320984,
"rewards/rejected": 0.6891158819198608,
"step": 1170
},
{
"epoch": 0.43730022698846527,
"grad_norm": 160.4393310546875,
"learning_rate": 2.8158577250833643e-07,
"logits/chosen": -6.1785125732421875,
"logits/rejected": -6.113655090332031,
"logps/chosen": -856.7884521484375,
"logps/rejected": -804.0035400390625,
"loss": 0.6001,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 1.080625295639038,
"rewards/margins": 0.3962119221687317,
"rewards/rejected": 0.6844133138656616,
"step": 1180
},
{
"epoch": 0.4410061611154862,
"grad_norm": 149.27992248535156,
"learning_rate": 2.797332345313079e-07,
"logits/chosen": -6.293272972106934,
"logits/rejected": -6.266045570373535,
"logps/chosen": -1081.994140625,
"logps/rejected": -923.7223510742188,
"loss": 0.5861,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 1.324501395225525,
"rewards/margins": 0.486712783575058,
"rewards/rejected": 0.8377887606620789,
"step": 1190
},
{
"epoch": 0.44471209524250704,
"grad_norm": 158.3959197998047,
"learning_rate": 2.778806965542794e-07,
"logits/chosen": -6.168979644775391,
"logits/rejected": -6.1329731941223145,
"logps/chosen": -876.14697265625,
"logps/rejected": -876.2384643554688,
"loss": 0.6212,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 1.1340038776397705,
"rewards/margins": 0.32338947057724,
"rewards/rejected": 0.8106144070625305,
"step": 1200
},
{
"epoch": 0.44471209524250704,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.352243900299072,
"eval_logps/chosen": -1139.7337646484375,
"eval_logps/rejected": -1050.3447265625,
"eval_loss": 0.6622124910354614,
"eval_rewards/accuracies": 0.6272040009498596,
"eval_rewards/chosen": 1.4611579179763794,
"eval_rewards/margins": 0.27146124839782715,
"eval_rewards/rejected": 1.1896967887878418,
"eval_runtime": 174.1161,
"eval_samples_per_second": 6.84,
"eval_steps_per_second": 6.84,
"step": 1200
},
{
"epoch": 0.44841802936952796,
"grad_norm": 159.97509765625,
"learning_rate": 2.760281585772508e-07,
"logits/chosen": -6.155325889587402,
"logits/rejected": -6.213382720947266,
"logps/chosen": -876.2828369140625,
"logps/rejected": -887.4181518554688,
"loss": 0.6392,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 1.1726298332214355,
"rewards/margins": 0.33511778712272644,
"rewards/rejected": 0.8375120162963867,
"step": 1210
},
{
"epoch": 0.4521239634965489,
"grad_norm": 149.97601318359375,
"learning_rate": 2.741756206002223e-07,
"logits/chosen": -6.089978218078613,
"logits/rejected": -6.215594291687012,
"logps/chosen": -1006.5177612304688,
"logps/rejected": -837.3438720703125,
"loss": 0.5759,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 1.1888597011566162,
"rewards/margins": 0.4549834132194519,
"rewards/rejected": 0.7338763475418091,
"step": 1220
},
{
"epoch": 0.45582989762356974,
"grad_norm": 124.17080688476562,
"learning_rate": 2.7232308262319375e-07,
"logits/chosen": -6.237065315246582,
"logits/rejected": -6.162973403930664,
"logps/chosen": -900.43505859375,
"logps/rejected": -769.6177978515625,
"loss": 0.5828,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 1.1558102369308472,
"rewards/margins": 0.5107904672622681,
"rewards/rejected": 0.6450197100639343,
"step": 1230
},
{
"epoch": 0.45953583175059065,
"grad_norm": 209.32017517089844,
"learning_rate": 2.7047054464616525e-07,
"logits/chosen": -6.1265363693237305,
"logits/rejected": -6.136591911315918,
"logps/chosen": -850.1053466796875,
"logps/rejected": -755.8872680664062,
"loss": 0.6007,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.0844950675964355,
"rewards/margins": 0.4297495484352112,
"rewards/rejected": 0.6547454595565796,
"step": 1240
},
{
"epoch": 0.4632417658776115,
"grad_norm": 230.33175659179688,
"learning_rate": 2.686180066691367e-07,
"logits/chosen": -6.183014392852783,
"logits/rejected": -6.165501117706299,
"logps/chosen": -842.3179931640625,
"logps/rejected": -821.1053466796875,
"loss": 0.6189,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.9313210248947144,
"rewards/margins": 0.330872505903244,
"rewards/rejected": 0.6004485487937927,
"step": 1250
},
{
"epoch": 0.4632417658776115,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.354373931884766,
"eval_logps/chosen": -1141.2061767578125,
"eval_logps/rejected": -1051.6622314453125,
"eval_loss": 0.6636425852775574,
"eval_rewards/accuracies": 0.6204869747161865,
"eval_rewards/chosen": 1.31391179561615,
"eval_rewards/margins": 0.2559622824192047,
"eval_rewards/rejected": 1.0579496622085571,
"eval_runtime": 174.2497,
"eval_samples_per_second": 6.835,
"eval_steps_per_second": 6.835,
"step": 1250
},
{
"epoch": 0.4669477000046324,
"grad_norm": 212.96624755859375,
"learning_rate": 2.667654686921082e-07,
"logits/chosen": -6.100918292999268,
"logits/rejected": NaN,
"logps/chosen": -1028.841552734375,
"logps/rejected": -886.3626708984375,
"loss": 0.613,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 1.181762933731079,
"rewards/margins": 0.37267082929611206,
"rewards/rejected": 0.8090922236442566,
"step": 1260
},
{
"epoch": 0.4706536341316533,
"grad_norm": 162.5323944091797,
"learning_rate": 2.6491293071507966e-07,
"logits/chosen": -6.123560428619385,
"logits/rejected": -6.198370933532715,
"logps/chosen": -930.6388549804688,
"logps/rejected": -765.5624389648438,
"loss": 0.6066,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 1.0889371633529663,
"rewards/margins": 0.40622156858444214,
"rewards/rejected": 0.6827155351638794,
"step": 1270
},
{
"epoch": 0.4743595682586742,
"grad_norm": 129.6339874267578,
"learning_rate": 2.630603927380511e-07,
"logits/chosen": -6.171866416931152,
"logits/rejected": -6.194737434387207,
"logps/chosen": -922.0595703125,
"logps/rejected": -880.8753051757812,
"loss": 0.6099,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 1.1603076457977295,
"rewards/margins": 0.38798120617866516,
"rewards/rejected": 0.7723264694213867,
"step": 1280
},
{
"epoch": 0.4780655023856951,
"grad_norm": 185.04010009765625,
"learning_rate": 2.6120785476102257e-07,
"logits/chosen": -6.216259479522705,
"logits/rejected": -6.08756685256958,
"logps/chosen": -940.3963623046875,
"logps/rejected": -871.3863525390625,
"loss": 0.587,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 1.1315703392028809,
"rewards/margins": 0.41449031233787537,
"rewards/rejected": 0.7170801162719727,
"step": 1290
},
{
"epoch": 0.481771436512716,
"grad_norm": 188.83816528320312,
"learning_rate": 2.5935531678399407e-07,
"logits/chosen": -6.360658645629883,
"logits/rejected": -6.3473029136657715,
"logps/chosen": -941.73876953125,
"logps/rejected": -903.9150390625,
"loss": 0.581,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 1.2506906986236572,
"rewards/margins": 0.4942537844181061,
"rewards/rejected": 0.7564369440078735,
"step": 1300
},
{
"epoch": 0.481771436512716,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.355961322784424,
"eval_logps/chosen": -1139.5653076171875,
"eval_logps/rejected": -1050.153076171875,
"eval_loss": 0.6662114262580872,
"eval_rewards/accuracies": 0.6288833022117615,
"eval_rewards/chosen": 1.4780066013336182,
"eval_rewards/margins": 0.26914337277412415,
"eval_rewards/rejected": 1.2088632583618164,
"eval_runtime": 174.4932,
"eval_samples_per_second": 6.825,
"eval_steps_per_second": 6.825,
"step": 1300
},
{
"epoch": 0.4854773706397369,
"grad_norm": 176.7130584716797,
"learning_rate": 2.575027788069655e-07,
"logits/chosen": -6.2104811668396,
"logits/rejected": -6.247377395629883,
"logps/chosen": -863.90869140625,
"logps/rejected": -790.2784423828125,
"loss": 0.5685,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 1.1237311363220215,
"rewards/margins": 0.48923033475875854,
"rewards/rejected": 0.6345008015632629,
"step": 1310
},
{
"epoch": 0.48918330476675775,
"grad_norm": 181.07481384277344,
"learning_rate": 2.5565024082993703e-07,
"logits/chosen": -6.231227397918701,
"logits/rejected": NaN,
"logps/chosen": -985.6842651367188,
"logps/rejected": -883.3465576171875,
"loss": 0.5585,
"rewards/accuracies": 0.71875,
"rewards/chosen": 1.381658911705017,
"rewards/margins": 0.5166347026824951,
"rewards/rejected": 0.8650242686271667,
"step": 1320
},
{
"epoch": 0.49288923889377867,
"grad_norm": 203.50254821777344,
"learning_rate": 2.537977028529085e-07,
"logits/chosen": -6.048904895782471,
"logits/rejected": -6.121670722961426,
"logps/chosen": -907.2009887695312,
"logps/rejected": -842.6783447265625,
"loss": 0.6369,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 1.19151771068573,
"rewards/margins": 0.38030725717544556,
"rewards/rejected": 0.8112104535102844,
"step": 1330
},
{
"epoch": 0.49659517302079953,
"grad_norm": 154.94403076171875,
"learning_rate": 2.5194516487588e-07,
"logits/chosen": -6.1506195068359375,
"logits/rejected": -6.043631076812744,
"logps/chosen": -921.5447387695312,
"logps/rejected": -728.7830810546875,
"loss": 0.5662,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 1.1830615997314453,
"rewards/margins": 0.491299569606781,
"rewards/rejected": 0.6917620897293091,
"step": 1340
},
{
"epoch": 0.5003011071478205,
"grad_norm": 227.9466094970703,
"learning_rate": 2.500926268988514e-07,
"logits/chosen": -6.187090873718262,
"logits/rejected": -6.22959041595459,
"logps/chosen": -829.8533935546875,
"logps/rejected": -727.3782348632812,
"loss": 0.5804,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.0849952697753906,
"rewards/margins": 0.42699941992759705,
"rewards/rejected": 0.657995879650116,
"step": 1350
},
{
"epoch": 0.5003011071478205,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.349131107330322,
"eval_logps/chosen": -1137.9072265625,
"eval_logps/rejected": -1048.7745361328125,
"eval_loss": 0.6663568019866943,
"eval_rewards/accuracies": 0.6213266253471375,
"eval_rewards/chosen": 1.6438101530075073,
"eval_rewards/margins": 0.2971048057079315,
"eval_rewards/rejected": 1.346705436706543,
"eval_runtime": 174.0406,
"eval_samples_per_second": 6.843,
"eval_steps_per_second": 6.843,
"step": 1350
},
{
"epoch": 0.5040070412748413,
"grad_norm": 193.2044219970703,
"learning_rate": 2.482400889218229e-07,
"logits/chosen": -6.1281938552856445,
"logits/rejected": -6.117993354797363,
"logps/chosen": -1060.016357421875,
"logps/rejected": -954.3433837890625,
"loss": 0.5773,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 1.4718914031982422,
"rewards/margins": 0.5141991376876831,
"rewards/rejected": 0.9576921463012695,
"step": 1360
},
{
"epoch": 0.5077129754018622,
"grad_norm": 203.0106658935547,
"learning_rate": 2.4638755094479434e-07,
"logits/chosen": -6.16585111618042,
"logits/rejected": -6.127178192138672,
"logps/chosen": -917.00146484375,
"logps/rejected": -872.7060546875,
"loss": 0.6158,
"rewards/accuracies": 0.65625,
"rewards/chosen": 1.3071753978729248,
"rewards/margins": 0.41641944646835327,
"rewards/rejected": 0.8907560110092163,
"step": 1370
},
{
"epoch": 0.5114189095288831,
"grad_norm": 135.35690307617188,
"learning_rate": 2.4453501296776585e-07,
"logits/chosen": -6.066123962402344,
"logits/rejected": -6.084324836730957,
"logps/chosen": -859.6808471679688,
"logps/rejected": -741.0250854492188,
"loss": 0.6318,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 1.2575219869613647,
"rewards/margins": 0.3475838005542755,
"rewards/rejected": 0.9099382162094116,
"step": 1380
},
{
"epoch": 0.515124843655904,
"grad_norm": 171.01341247558594,
"learning_rate": 2.426824749907373e-07,
"logits/chosen": -6.1828107833862305,
"logits/rejected": -6.259852886199951,
"logps/chosen": -894.8861083984375,
"logps/rejected": -804.0977783203125,
"loss": 0.5773,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 1.3444898128509521,
"rewards/margins": 0.5704382658004761,
"rewards/rejected": 0.7740517258644104,
"step": 1390
},
{
"epoch": 0.5188307777829249,
"grad_norm": 250.60726928710938,
"learning_rate": 2.4082993701370875e-07,
"logits/chosen": -6.185898780822754,
"logits/rejected": -6.244287014007568,
"logps/chosen": -987.5439453125,
"logps/rejected": -882.2205200195312,
"loss": 0.5984,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 1.4197866916656494,
"rewards/margins": 0.5091265439987183,
"rewards/rejected": 0.9106601476669312,
"step": 1400
},
{
"epoch": 0.5188307777829249,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.353067398071289,
"eval_logps/chosen": -1137.5255126953125,
"eval_logps/rejected": -1048.356689453125,
"eval_loss": 0.6677223443984985,
"eval_rewards/accuracies": 0.6120907068252563,
"eval_rewards/chosen": 1.6819899082183838,
"eval_rewards/margins": 0.29349878430366516,
"eval_rewards/rejected": 1.388491153717041,
"eval_runtime": 174.2641,
"eval_samples_per_second": 6.834,
"eval_steps_per_second": 6.834,
"step": 1400
},
{
"epoch": 0.5225367119099458,
"grad_norm": 240.53480529785156,
"learning_rate": 2.3897739903668026e-07,
"logits/chosen": -6.275177001953125,
"logits/rejected": -6.181919574737549,
"logps/chosen": -965.3531494140625,
"logps/rejected": -786.497802734375,
"loss": 0.6224,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 1.2736237049102783,
"rewards/margins": 0.4058550298213959,
"rewards/rejected": 0.8677686452865601,
"step": 1410
},
{
"epoch": 0.5262426460369667,
"grad_norm": 165.14010620117188,
"learning_rate": 2.371248610596517e-07,
"logits/chosen": -6.211658954620361,
"logits/rejected": -6.0974249839782715,
"logps/chosen": -882.7542724609375,
"logps/rejected": -741.9580078125,
"loss": 0.5812,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.35499107837677,
"rewards/margins": 0.5230444073677063,
"rewards/rejected": 0.8319465517997742,
"step": 1420
},
{
"epoch": 0.5299485801639876,
"grad_norm": 143.9077606201172,
"learning_rate": 2.352723230826232e-07,
"logits/chosen": -6.10528039932251,
"logits/rejected": -6.128796100616455,
"logps/chosen": -933.9381103515625,
"logps/rejected": -815.3626098632812,
"loss": 0.5783,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 1.3778315782546997,
"rewards/margins": 0.5523862242698669,
"rewards/rejected": 0.8254453539848328,
"step": 1430
},
{
"epoch": 0.5336545142910085,
"grad_norm": 249.5879669189453,
"learning_rate": 2.3341978510559464e-07,
"logits/chosen": -6.190318584442139,
"logits/rejected": -6.109808921813965,
"logps/chosen": -988.9300537109375,
"logps/rejected": -904.2208862304688,
"loss": 0.6759,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 1.5002410411834717,
"rewards/margins": 0.37281566858291626,
"rewards/rejected": 1.1274254322052002,
"step": 1440
},
{
"epoch": 0.5373604484180293,
"grad_norm": 158.98834228515625,
"learning_rate": 2.3156724712856612e-07,
"logits/chosen": NaN,
"logits/rejected": -6.147918701171875,
"logps/chosen": -897.3717651367188,
"logps/rejected": -858.4124755859375,
"loss": 0.6545,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 1.2706643342971802,
"rewards/margins": 0.30182304978370667,
"rewards/rejected": 0.9688412547111511,
"step": 1450
},
{
"epoch": 0.5373604484180293,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.35387659072876,
"eval_logps/chosen": -1137.998046875,
"eval_logps/rejected": -1048.7669677734375,
"eval_loss": 0.6649725437164307,
"eval_rewards/accuracies": 0.6255247592926025,
"eval_rewards/chosen": 1.6347370147705078,
"eval_rewards/margins": 0.2872615456581116,
"eval_rewards/rejected": 1.3474754095077515,
"eval_runtime": 174.8955,
"eval_samples_per_second": 6.81,
"eval_steps_per_second": 6.81,
"step": 1450
},
{
"epoch": 0.5410663825450502,
"grad_norm": 202.708740234375,
"learning_rate": 2.297147091515376e-07,
"logits/chosen": -6.149045944213867,
"logits/rejected": -6.200368881225586,
"logps/chosen": -1000.2081909179688,
"logps/rejected": -880.8834838867188,
"loss": 0.609,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 1.3515934944152832,
"rewards/margins": 0.4020705223083496,
"rewards/rejected": 0.949522852897644,
"step": 1460
},
{
"epoch": 0.5447723166720712,
"grad_norm": 378.9857482910156,
"learning_rate": 2.2786217117450905e-07,
"logits/chosen": -6.143518447875977,
"logits/rejected": -6.165432929992676,
"logps/chosen": -974.8902587890625,
"logps/rejected": -885.5198974609375,
"loss": 0.6535,
"rewards/accuracies": 0.625,
"rewards/chosen": 1.1508817672729492,
"rewards/margins": 0.333347886800766,
"rewards/rejected": 0.8175338506698608,
"step": 1470
},
{
"epoch": 0.5484782507990921,
"grad_norm": 156.70301818847656,
"learning_rate": 2.2600963319748053e-07,
"logits/chosen": -6.1991682052612305,
"logits/rejected": -6.233222007751465,
"logps/chosen": -1007.1458129882812,
"logps/rejected": -901.8226318359375,
"loss": 0.5823,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 1.3570195436477661,
"rewards/margins": 0.5376918315887451,
"rewards/rejected": 0.819327712059021,
"step": 1480
},
{
"epoch": 0.552184184926113,
"grad_norm": 194.2936248779297,
"learning_rate": 2.24157095220452e-07,
"logits/chosen": -6.096491813659668,
"logits/rejected": -6.074574947357178,
"logps/chosen": -922.29833984375,
"logps/rejected": -841.58251953125,
"loss": 0.6448,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 1.311821699142456,
"rewards/margins": 0.2840858995914459,
"rewards/rejected": 1.027735710144043,
"step": 1490
},
{
"epoch": 0.5558901190531338,
"grad_norm": 138.76791381835938,
"learning_rate": 2.223045572434235e-07,
"logits/chosen": -6.16571569442749,
"logits/rejected": -6.221317768096924,
"logps/chosen": -931.6896362304688,
"logps/rejected": -879.6935424804688,
"loss": 0.6187,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 1.2591129541397095,
"rewards/margins": 0.4034864008426666,
"rewards/rejected": 0.8556264638900757,
"step": 1500
},
{
"epoch": 0.5558901190531338,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.352556228637695,
"eval_logps/chosen": -1138.7109375,
"eval_logps/rejected": -1049.4052734375,
"eval_loss": 0.6670076847076416,
"eval_rewards/accuracies": 0.6196473836898804,
"eval_rewards/chosen": 1.5634312629699707,
"eval_rewards/margins": 0.2797936499118805,
"eval_rewards/rejected": 1.2836376428604126,
"eval_runtime": 174.7794,
"eval_samples_per_second": 6.814,
"eval_steps_per_second": 6.814,
"step": 1500
},
{
"epoch": 0.5595960531801547,
"grad_norm": 196.8204345703125,
"learning_rate": 2.2045201926639494e-07,
"logits/chosen": -6.262406349182129,
"logits/rejected": -6.286099433898926,
"logps/chosen": -926.6876831054688,
"logps/rejected": -719.3727416992188,
"loss": 0.5834,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 1.2394769191741943,
"rewards/margins": 0.5388771891593933,
"rewards/rejected": 0.7005997896194458,
"step": 1510
},
{
"epoch": 0.5633019873071756,
"grad_norm": 181.23269653320312,
"learning_rate": 2.1859948128936642e-07,
"logits/chosen": -6.15579080581665,
"logits/rejected": NaN,
"logps/chosen": -907.208984375,
"logps/rejected": -789.55615234375,
"loss": 0.6841,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 1.3312886953353882,
"rewards/margins": 0.29294928908348083,
"rewards/rejected": 1.038339614868164,
"step": 1520
},
{
"epoch": 0.5670079214341965,
"grad_norm": 249.67068481445312,
"learning_rate": 2.167469433123379e-07,
"logits/chosen": -6.171587944030762,
"logits/rejected": -6.20114803314209,
"logps/chosen": -1026.1175537109375,
"logps/rejected": -928.2333984375,
"loss": 0.6224,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 1.4529210329055786,
"rewards/margins": 0.44856566190719604,
"rewards/rejected": 1.0043553113937378,
"step": 1530
},
{
"epoch": 0.5707138555612175,
"grad_norm": 161.0312042236328,
"learning_rate": 2.1489440533530935e-07,
"logits/chosen": -6.143443584442139,
"logits/rejected": -6.267470836639404,
"logps/chosen": -933.2398681640625,
"logps/rejected": -907.0738525390625,
"loss": 0.6238,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 1.4245777130126953,
"rewards/margins": 0.4305481016635895,
"rewards/rejected": 0.9940296411514282,
"step": 1540
},
{
"epoch": 0.5744197896882383,
"grad_norm": 156.3391571044922,
"learning_rate": 2.1304186735828083e-07,
"logits/chosen": -6.159350395202637,
"logits/rejected": -6.219527244567871,
"logps/chosen": -970.1463012695312,
"logps/rejected": -861.26416015625,
"loss": 0.6633,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 1.3536326885223389,
"rewards/margins": 0.36367741227149963,
"rewards/rejected": 0.9899552464485168,
"step": 1550
},
{
"epoch": 0.5744197896882383,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.356298923492432,
"eval_logps/chosen": -1139.156005859375,
"eval_logps/rejected": -1049.7542724609375,
"eval_loss": 0.6682325601577759,
"eval_rewards/accuracies": 0.6162888407707214,
"eval_rewards/chosen": 1.51894211769104,
"eval_rewards/margins": 0.27020886540412903,
"eval_rewards/rejected": 1.2487331628799438,
"eval_runtime": 174.6376,
"eval_samples_per_second": 6.82,
"eval_steps_per_second": 6.82,
"step": 1550
},
{
"epoch": 0.5781257238152592,
"grad_norm": 223.8711395263672,
"learning_rate": 2.111893293812523e-07,
"logits/chosen": -6.078129768371582,
"logits/rejected": -6.09130859375,
"logps/chosen": -902.1170043945312,
"logps/rejected": -816.3302001953125,
"loss": 0.6542,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 1.166666030883789,
"rewards/margins": 0.32527679204940796,
"rewards/rejected": 0.8413891792297363,
"step": 1560
},
{
"epoch": 0.5818316579422801,
"grad_norm": 146.7541961669922,
"learning_rate": 2.093367914042238e-07,
"logits/chosen": -6.186091899871826,
"logits/rejected": -6.257566928863525,
"logps/chosen": -953.74853515625,
"logps/rejected": -876.87451171875,
"loss": 0.6237,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 1.2467100620269775,
"rewards/margins": 0.4072350859642029,
"rewards/rejected": 0.8394750356674194,
"step": 1570
},
{
"epoch": 0.585537592069301,
"grad_norm": 201.52931213378906,
"learning_rate": 2.0748425342719524e-07,
"logits/chosen": -6.206198215484619,
"logits/rejected": -6.1201300621032715,
"logps/chosen": -973.3306884765625,
"logps/rejected": -831.2630004882812,
"loss": 0.6483,
"rewards/accuracies": 0.65625,
"rewards/chosen": 1.2103111743927002,
"rewards/margins": 0.3353997766971588,
"rewards/rejected": 0.8749113082885742,
"step": 1580
},
{
"epoch": 0.5892435261963218,
"grad_norm": 157.7445526123047,
"learning_rate": 2.0563171545016672e-07,
"logits/chosen": -6.155628681182861,
"logits/rejected": -6.149939060211182,
"logps/chosen": -978.5584716796875,
"logps/rejected": -856.3310546875,
"loss": 0.5719,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 1.2967727184295654,
"rewards/margins": 0.4965516924858093,
"rewards/rejected": 0.8002211451530457,
"step": 1590
},
{
"epoch": 0.5929494603233427,
"grad_norm": 161.9136505126953,
"learning_rate": 2.037791774731382e-07,
"logits/chosen": -6.121860027313232,
"logits/rejected": -6.133907794952393,
"logps/chosen": -928.7197265625,
"logps/rejected": -803.7809448242188,
"loss": 0.6081,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 1.312783122062683,
"rewards/margins": 0.4244639277458191,
"rewards/rejected": 0.8883193135261536,
"step": 1600
},
{
"epoch": 0.5929494603233427,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.356760025024414,
"eval_logps/chosen": -1139.2073974609375,
"eval_logps/rejected": -1049.739990234375,
"eval_loss": 0.6694273352622986,
"eval_rewards/accuracies": 0.6171284914016724,
"eval_rewards/chosen": 1.513792872428894,
"eval_rewards/margins": 0.2636261582374573,
"eval_rewards/rejected": 1.250166654586792,
"eval_runtime": 174.1847,
"eval_samples_per_second": 6.838,
"eval_steps_per_second": 6.838,
"step": 1600
},
{
"epoch": 0.5966553944503636,
"grad_norm": 110.379638671875,
"learning_rate": 2.0192663949610965e-07,
"logits/chosen": -6.110042572021484,
"logits/rejected": -6.157367706298828,
"logps/chosen": -851.4879760742188,
"logps/rejected": -773.2803955078125,
"loss": 0.5877,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 1.2313387393951416,
"rewards/margins": 0.5156643390655518,
"rewards/rejected": 0.7156744599342346,
"step": 1610
},
{
"epoch": 0.6003613285773846,
"grad_norm": 196.35398864746094,
"learning_rate": 2.0007410151908113e-07,
"logits/chosen": -6.248660087585449,
"logits/rejected": -6.268450736999512,
"logps/chosen": -1027.9803466796875,
"logps/rejected": -942.8600463867188,
"loss": 0.5961,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 1.4811532497406006,
"rewards/margins": 0.4853101670742035,
"rewards/rejected": 0.9958430528640747,
"step": 1620
},
{
"epoch": 0.6040672627044055,
"grad_norm": 147.44473266601562,
"learning_rate": 1.982215635420526e-07,
"logits/chosen": -6.094088554382324,
"logits/rejected": NaN,
"logps/chosen": -970.6212158203125,
"logps/rejected": -842.7681884765625,
"loss": 0.668,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 1.3443838357925415,
"rewards/margins": 0.37295302748680115,
"rewards/rejected": 0.971430778503418,
"step": 1630
},
{
"epoch": 0.6077731968314263,
"grad_norm": 163.9195556640625,
"learning_rate": 1.963690255650241e-07,
"logits/chosen": -6.2662577629089355,
"logits/rejected": -6.079975128173828,
"logps/chosen": -937.974609375,
"logps/rejected": -781.6622314453125,
"loss": 0.5775,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 1.30463707447052,
"rewards/margins": 0.5367504358291626,
"rewards/rejected": 0.7678866982460022,
"step": 1640
},
{
"epoch": 0.6114791309584472,
"grad_norm": 162.06088256835938,
"learning_rate": 1.9451648758799554e-07,
"logits/chosen": -6.107717990875244,
"logits/rejected": -6.13240909576416,
"logps/chosen": -855.8272705078125,
"logps/rejected": -743.3189086914062,
"loss": 0.6199,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 1.208332896232605,
"rewards/margins": 0.32260221242904663,
"rewards/rejected": 0.8857306241989136,
"step": 1650
},
{
"epoch": 0.6114791309584472,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.351521015167236,
"eval_logps/chosen": -1138.5380859375,
"eval_logps/rejected": -1049.13720703125,
"eval_loss": 0.67048579454422,
"eval_rewards/accuracies": 0.6087321639060974,
"eval_rewards/chosen": 1.580714464187622,
"eval_rewards/margins": 0.2702693045139313,
"eval_rewards/rejected": 1.3104450702667236,
"eval_runtime": 174.7091,
"eval_samples_per_second": 6.817,
"eval_steps_per_second": 6.817,
"step": 1650
},
{
"epoch": 0.6151850650854681,
"grad_norm": 209.0579833984375,
"learning_rate": 1.9266394961096702e-07,
"logits/chosen": -6.237910270690918,
"logits/rejected": -6.252842903137207,
"logps/chosen": -996.7249755859375,
"logps/rejected": -873.1549072265625,
"loss": 0.6163,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 1.4242780208587646,
"rewards/margins": 0.4079625606536865,
"rewards/rejected": 1.0163153409957886,
"step": 1660
},
{
"epoch": 0.618890999212489,
"grad_norm": 152.62171936035156,
"learning_rate": 1.908114116339385e-07,
"logits/chosen": -6.184769630432129,
"logits/rejected": -6.171984672546387,
"logps/chosen": -904.5120849609375,
"logps/rejected": -842.2957153320312,
"loss": 0.6201,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 1.3089262247085571,
"rewards/margins": 0.4640830457210541,
"rewards/rejected": 0.8448432087898254,
"step": 1670
},
{
"epoch": 0.6225969333395099,
"grad_norm": 163.1865997314453,
"learning_rate": 1.8895887365690995e-07,
"logits/chosen": -6.068234443664551,
"logits/rejected": -6.1052398681640625,
"logps/chosen": -892.0081787109375,
"logps/rejected": -850.9469604492188,
"loss": 0.5911,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.2593834400177002,
"rewards/margins": 0.4649893641471863,
"rewards/rejected": 0.7943940162658691,
"step": 1680
},
{
"epoch": 0.6263028674665307,
"grad_norm": 194.63597106933594,
"learning_rate": 1.8710633567988143e-07,
"logits/chosen": -6.147702217102051,
"logits/rejected": -6.189964294433594,
"logps/chosen": -944.7171630859375,
"logps/rejected": -848.2835693359375,
"loss": 0.594,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 1.3190233707427979,
"rewards/margins": 0.39705803990364075,
"rewards/rejected": 0.9219652414321899,
"step": 1690
},
{
"epoch": 0.6300088015935517,
"grad_norm": 196.50355529785156,
"learning_rate": 1.852537977028529e-07,
"logits/chosen": -6.1947102546691895,
"logits/rejected": -6.1808247566223145,
"logps/chosen": -886.1546020507812,
"logps/rejected": -801.2207641601562,
"loss": 0.6283,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 1.2352696657180786,
"rewards/margins": 0.33806803822517395,
"rewards/rejected": 0.8972015380859375,
"step": 1700
},
{
"epoch": 0.6300088015935517,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.353530406951904,
"eval_logps/chosen": -1139.465576171875,
"eval_logps/rejected": -1050.0250244140625,
"eval_loss": 0.6684470772743225,
"eval_rewards/accuracies": 0.6246851682662964,
"eval_rewards/chosen": 1.4879825115203857,
"eval_rewards/margins": 0.26630899310112,
"eval_rewards/rejected": 1.2216734886169434,
"eval_runtime": 174.6538,
"eval_samples_per_second": 6.819,
"eval_steps_per_second": 6.819,
"step": 1700
},
{
"epoch": 0.6337147357205726,
"grad_norm": 148.55259704589844,
"learning_rate": 1.834012597258244e-07,
"logits/chosen": -6.227621078491211,
"logits/rejected": -6.297041893005371,
"logps/chosen": -928.7427978515625,
"logps/rejected": -803.3206176757812,
"loss": 0.5915,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 1.285189151763916,
"rewards/margins": 0.5148311853408813,
"rewards/rejected": 0.7703579664230347,
"step": 1710
},
{
"epoch": 0.6374206698475935,
"grad_norm": 144.889404296875,
"learning_rate": 1.8154872174879584e-07,
"logits/chosen": -6.107190132141113,
"logits/rejected": -6.062108993530273,
"logps/chosen": -983.3319091796875,
"logps/rejected": -892.3099365234375,
"loss": 0.6625,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 1.1306359767913818,
"rewards/margins": 0.367345929145813,
"rewards/rejected": 0.7632900476455688,
"step": 1720
},
{
"epoch": 0.6411266039746144,
"grad_norm": 196.45310974121094,
"learning_rate": 1.7969618377176732e-07,
"logits/chosen": -6.020756721496582,
"logits/rejected": -6.166621208190918,
"logps/chosen": -843.7468872070312,
"logps/rejected": -768.1483154296875,
"loss": 0.6033,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 1.140453577041626,
"rewards/margins": 0.39432471990585327,
"rewards/rejected": 0.7461288571357727,
"step": 1730
},
{
"epoch": 0.6448325381016352,
"grad_norm": 151.2347412109375,
"learning_rate": 1.778436457947388e-07,
"logits/chosen": -6.10150671005249,
"logits/rejected": -6.154606819152832,
"logps/chosen": -896.15185546875,
"logps/rejected": -831.7677612304688,
"loss": 0.5964,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.2138025760650635,
"rewards/margins": 0.4559560716152191,
"rewards/rejected": 0.757846474647522,
"step": 1740
},
{
"epoch": 0.6485384722286561,
"grad_norm": 226.45729064941406,
"learning_rate": 1.7599110781771025e-07,
"logits/chosen": -6.205390453338623,
"logits/rejected": -6.212441444396973,
"logps/chosen": -1027.686767578125,
"logps/rejected": -986.68603515625,
"loss": 0.5979,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 1.3980613946914673,
"rewards/margins": 0.47088512778282166,
"rewards/rejected": 0.9271761775016785,
"step": 1750
},
{
"epoch": 0.6485384722286561,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.354175090789795,
"eval_logps/chosen": -1139.527587890625,
"eval_logps/rejected": -1050.107666015625,
"eval_loss": 0.665941596031189,
"eval_rewards/accuracies": 0.6246851682662964,
"eval_rewards/chosen": 1.4817659854888916,
"eval_rewards/margins": 0.2683611810207367,
"eval_rewards/rejected": 1.2134050130844116,
"eval_runtime": 174.0278,
"eval_samples_per_second": 6.844,
"eval_steps_per_second": 6.844,
"step": 1750
},
{
"epoch": 0.652244406355677,
"grad_norm": 144.0729217529297,
"learning_rate": 1.7413856984068173e-07,
"logits/chosen": -6.174568176269531,
"logits/rejected": -6.175555229187012,
"logps/chosen": -911.03125,
"logps/rejected": -848.8049926757812,
"loss": 0.5949,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 1.23678457736969,
"rewards/margins": 0.4119951128959656,
"rewards/rejected": 0.8247894048690796,
"step": 1760
},
{
"epoch": 0.655950340482698,
"grad_norm": 181.58526611328125,
"learning_rate": 1.722860318636532e-07,
"logits/chosen": -6.1877946853637695,
"logits/rejected": -6.193057060241699,
"logps/chosen": -931.5440673828125,
"logps/rejected": -827.9782104492188,
"loss": 0.6147,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 1.2314589023590088,
"rewards/margins": 0.3570247292518616,
"rewards/rejected": 0.8744341135025024,
"step": 1770
},
{
"epoch": 0.6596562746097188,
"grad_norm": 152.58062744140625,
"learning_rate": 1.7043349388662469e-07,
"logits/chosen": -6.187155246734619,
"logits/rejected": -6.14534854888916,
"logps/chosen": -855.6112060546875,
"logps/rejected": -809.7816162109375,
"loss": 0.6337,
"rewards/accuracies": 0.625,
"rewards/chosen": 1.152524709701538,
"rewards/margins": 0.3496701121330261,
"rewards/rejected": 0.8028545379638672,
"step": 1780
},
{
"epoch": 0.6633622087367397,
"grad_norm": 172.72300720214844,
"learning_rate": 1.6858095590959614e-07,
"logits/chosen": -6.147979259490967,
"logits/rejected": -6.196808815002441,
"logps/chosen": -1047.6871337890625,
"logps/rejected": -875.8937377929688,
"loss": 0.5804,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 1.3021422624588013,
"rewards/margins": 0.5173267126083374,
"rewards/rejected": 0.7848155498504639,
"step": 1790
},
{
"epoch": 0.6670681428637606,
"grad_norm": 157.03228759765625,
"learning_rate": 1.6672841793256762e-07,
"logits/chosen": -6.194676876068115,
"logits/rejected": -6.176183223724365,
"logps/chosen": -866.8079223632812,
"logps/rejected": -801.6738891601562,
"loss": 0.6229,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 1.2280833721160889,
"rewards/margins": 0.37192708253860474,
"rewards/rejected": 0.8561564683914185,
"step": 1800
},
{
"epoch": 0.6670681428637606,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.353016376495361,
"eval_logps/chosen": -1139.4063720703125,
"eval_logps/rejected": -1050.0079345703125,
"eval_loss": 0.6658960580825806,
"eval_rewards/accuracies": 0.6154491901397705,
"eval_rewards/chosen": 1.4938946962356567,
"eval_rewards/margins": 0.27052515745162964,
"eval_rewards/rejected": 1.2233693599700928,
"eval_runtime": 174.6275,
"eval_samples_per_second": 6.82,
"eval_steps_per_second": 6.82,
"step": 1800
},
{
"epoch": 0.6707740769907815,
"grad_norm": 192.70468139648438,
"learning_rate": 1.648758799555391e-07,
"logits/chosen": -6.1675310134887695,
"logits/rejected": -6.276528835296631,
"logps/chosen": -952.8025512695312,
"logps/rejected": -811.5380859375,
"loss": 0.6054,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 1.2332537174224854,
"rewards/margins": 0.4917047917842865,
"rewards/rejected": 0.7415488958358765,
"step": 1810
},
{
"epoch": 0.6744800111178024,
"grad_norm": 204.32887268066406,
"learning_rate": 1.6302334197851055e-07,
"logits/chosen": -6.1494927406311035,
"logits/rejected": -6.112942695617676,
"logps/chosen": -909.9085693359375,
"logps/rejected": -847.9560546875,
"loss": 0.5969,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 1.291352391242981,
"rewards/margins": 0.5246464014053345,
"rewards/rejected": 0.7667059898376465,
"step": 1820
},
{
"epoch": 0.6781859452448232,
"grad_norm": 129.51596069335938,
"learning_rate": 1.6117080400148203e-07,
"logits/chosen": -6.111174583435059,
"logits/rejected": -6.139338970184326,
"logps/chosen": -1006.2116088867188,
"logps/rejected": -902.53076171875,
"loss": 0.6243,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 1.3876721858978271,
"rewards/margins": 0.43100985884666443,
"rewards/rejected": 0.9566623568534851,
"step": 1830
},
{
"epoch": 0.6818918793718441,
"grad_norm": 198.752197265625,
"learning_rate": 1.593182660244535e-07,
"logits/chosen": -6.178341388702393,
"logits/rejected": -6.032105922698975,
"logps/chosen": -931.4904174804688,
"logps/rejected": -842.7693481445312,
"loss": 0.6636,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 1.2749412059783936,
"rewards/margins": 0.29882222414016724,
"rewards/rejected": 0.9761190414428711,
"step": 1840
},
{
"epoch": 0.6855978134988651,
"grad_norm": 173.8763885498047,
"learning_rate": 1.5746572804742499e-07,
"logits/chosen": -6.294638156890869,
"logits/rejected": -6.279183387756348,
"logps/chosen": -1006.7406005859375,
"logps/rejected": -953.181640625,
"loss": 0.6777,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 1.4022231101989746,
"rewards/margins": 0.36820927262306213,
"rewards/rejected": 1.0340137481689453,
"step": 1850
},
{
"epoch": 0.6855978134988651,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.3531341552734375,
"eval_logps/chosen": -1139.8416748046875,
"eval_logps/rejected": -1050.22802734375,
"eval_loss": 0.6716598868370056,
"eval_rewards/accuracies": 0.6078925132751465,
"eval_rewards/chosen": 1.4503740072250366,
"eval_rewards/margins": 0.24901418387889862,
"eval_rewards/rejected": 1.2013598680496216,
"eval_runtime": 174.1779,
"eval_samples_per_second": 6.838,
"eval_steps_per_second": 6.838,
"step": 1850
},
{
"epoch": 0.689303747625886,
"grad_norm": 166.79347229003906,
"learning_rate": 1.5561319007039644e-07,
"logits/chosen": -6.272482872009277,
"logits/rejected": -6.234023094177246,
"logps/chosen": -851.1373291015625,
"logps/rejected": -780.5647583007812,
"loss": 0.5844,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 1.1571626663208008,
"rewards/margins": 0.47672972083091736,
"rewards/rejected": 0.680432915687561,
"step": 1860
},
{
"epoch": 0.6930096817529069,
"grad_norm": 151.9160919189453,
"learning_rate": 1.5376065209336792e-07,
"logits/chosen": -6.101273059844971,
"logits/rejected": -6.033238887786865,
"logps/chosen": -865.0284423828125,
"logps/rejected": -818.7235107421875,
"loss": 0.6109,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 1.0943622589111328,
"rewards/margins": 0.34644466638565063,
"rewards/rejected": 0.7479175925254822,
"step": 1870
},
{
"epoch": 0.6967156158799277,
"grad_norm": 148.6206817626953,
"learning_rate": 1.519081141163394e-07,
"logits/chosen": -6.229578971862793,
"logits/rejected": -6.284262657165527,
"logps/chosen": -960.0841674804688,
"logps/rejected": -855.2936401367188,
"loss": 0.6012,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.3213304281234741,
"rewards/margins": 0.48967212438583374,
"rewards/rejected": 0.8316582441329956,
"step": 1880
},
{
"epoch": 0.7004215500069486,
"grad_norm": 158.1896514892578,
"learning_rate": 1.5005557613931085e-07,
"logits/chosen": -6.0919904708862305,
"logits/rejected": -6.142601013183594,
"logps/chosen": -897.45556640625,
"logps/rejected": -893.6177978515625,
"loss": 0.6336,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 1.1414662599563599,
"rewards/margins": 0.35405582189559937,
"rewards/rejected": 0.7874104380607605,
"step": 1890
},
{
"epoch": 0.7041274841339695,
"grad_norm": 248.24693298339844,
"learning_rate": 1.4820303816228233e-07,
"logits/chosen": -6.12492036819458,
"logits/rejected": -6.182600498199463,
"logps/chosen": -882.8030395507812,
"logps/rejected": -820.3277587890625,
"loss": 0.5957,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 1.1524447202682495,
"rewards/margins": 0.43523526191711426,
"rewards/rejected": 0.71720951795578,
"step": 1900
},
{
"epoch": 0.7041274841339695,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.356428146362305,
"eval_logps/chosen": -1141.09130859375,
"eval_logps/rejected": -1051.447998046875,
"eval_loss": 0.668496310710907,
"eval_rewards/accuracies": 0.6146095991134644,
"eval_rewards/chosen": 1.3254036903381348,
"eval_rewards/margins": 0.24603614211082458,
"eval_rewards/rejected": 1.0793676376342773,
"eval_runtime": 174.5224,
"eval_samples_per_second": 6.824,
"eval_steps_per_second": 6.824,
"step": 1900
},
{
"epoch": 0.7078334182609904,
"grad_norm": 246.97654724121094,
"learning_rate": 1.463505001852538e-07,
"logits/chosen": -6.261816024780273,
"logits/rejected": -6.296639442443848,
"logps/chosen": -889.0240478515625,
"logps/rejected": -812.9447021484375,
"loss": 0.6209,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 1.1191017627716064,
"rewards/margins": 0.4275694489479065,
"rewards/rejected": 0.6915323138237,
"step": 1910
},
{
"epoch": 0.7115393523880112,
"grad_norm": 187.32501220703125,
"learning_rate": 1.4449796220822528e-07,
"logits/chosen": -6.17855167388916,
"logits/rejected": -6.182552337646484,
"logps/chosen": -1076.29833984375,
"logps/rejected": -952.2566528320312,
"loss": 0.5959,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.351000428199768,
"rewards/margins": 0.5084677338600159,
"rewards/rejected": 0.8425326347351074,
"step": 1920
},
{
"epoch": 0.7152452865150322,
"grad_norm": 189.5558624267578,
"learning_rate": 1.4264542423119674e-07,
"logits/chosen": -6.147101402282715,
"logits/rejected": -6.108138084411621,
"logps/chosen": -975.3698120117188,
"logps/rejected": -849.8056640625,
"loss": 0.6439,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 1.0261156558990479,
"rewards/margins": 0.3257748484611511,
"rewards/rejected": 0.7003408670425415,
"step": 1930
},
{
"epoch": 0.7189512206420531,
"grad_norm": 188.531494140625,
"learning_rate": 1.4079288625416822e-07,
"logits/chosen": -6.304642200469971,
"logits/rejected": -6.310070991516113,
"logps/chosen": -951.8046875,
"logps/rejected": -888.8732299804688,
"loss": 0.623,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 1.1070719957351685,
"rewards/margins": 0.3869238495826721,
"rewards/rejected": 0.7201482057571411,
"step": 1940
},
{
"epoch": 0.722657154769074,
"grad_norm": 198.22499084472656,
"learning_rate": 1.389403482771397e-07,
"logits/chosen": -6.263820648193359,
"logits/rejected": NaN,
"logps/chosen": -928.9371337890625,
"logps/rejected": -846.8762817382812,
"loss": 0.6375,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 1.0606300830841064,
"rewards/margins": 0.30186527967453003,
"rewards/rejected": 0.7587647438049316,
"step": 1950
},
{
"epoch": 0.722657154769074,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.365767955780029,
"eval_logps/chosen": -1140.9320068359375,
"eval_logps/rejected": -1051.3402099609375,
"eval_loss": 0.6660320162773132,
"eval_rewards/accuracies": 0.6263644099235535,
"eval_rewards/chosen": 1.341342568397522,
"eval_rewards/margins": 0.2511833608150482,
"eval_rewards/rejected": 1.0901591777801514,
"eval_runtime": 174.295,
"eval_samples_per_second": 6.833,
"eval_steps_per_second": 6.833,
"step": 1950
},
{
"epoch": 0.7263630888960949,
"grad_norm": 158.5247039794922,
"learning_rate": 1.3708781030011115e-07,
"logits/chosen": -6.31934118270874,
"logits/rejected": -6.218142986297607,
"logps/chosen": -926.21826171875,
"logps/rejected": -798.2918090820312,
"loss": 0.607,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 1.078592300415039,
"rewards/margins": 0.38651174306869507,
"rewards/rejected": 0.6920806169509888,
"step": 1960
},
{
"epoch": 0.7300690230231157,
"grad_norm": 178.37115478515625,
"learning_rate": 1.3523527232308263e-07,
"logits/chosen": NaN,
"logits/rejected": -6.076174736022949,
"logps/chosen": -980.4775390625,
"logps/rejected": -835.7880859375,
"loss": 0.5914,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 1.232055902481079,
"rewards/margins": 0.5219300389289856,
"rewards/rejected": 0.710125744342804,
"step": 1970
},
{
"epoch": 0.7337749571501366,
"grad_norm": 164.35330200195312,
"learning_rate": 1.333827343460541e-07,
"logits/chosen": -6.220945835113525,
"logits/rejected": -6.0729475021362305,
"logps/chosen": -925.7268676757812,
"logps/rejected": -770.1295776367188,
"loss": 0.6359,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 1.143280267715454,
"rewards/margins": 0.3542799651622772,
"rewards/rejected": 0.7890002131462097,
"step": 1980
},
{
"epoch": 0.7374808912771575,
"grad_norm": 168.5455322265625,
"learning_rate": 1.3153019636902556e-07,
"logits/chosen": -6.159814834594727,
"logits/rejected": -6.193203926086426,
"logps/chosen": -896.9928588867188,
"logps/rejected": -814.4002075195312,
"loss": 0.5956,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 1.1392806768417358,
"rewards/margins": 0.39641261100769043,
"rewards/rejected": 0.7428680658340454,
"step": 1990
},
{
"epoch": 0.7411868254041785,
"grad_norm": 178.9434814453125,
"learning_rate": 1.2967765839199704e-07,
"logits/chosen": -6.179207801818848,
"logits/rejected": -6.206517219543457,
"logps/chosen": -962.1814575195312,
"logps/rejected": -875.90869140625,
"loss": 0.5861,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 1.4602222442626953,
"rewards/margins": 0.5008874535560608,
"rewards/rejected": 0.9593348503112793,
"step": 2000
},
{
"epoch": 0.7411868254041785,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.363770008087158,
"eval_logps/chosen": -1139.4501953125,
"eval_logps/rejected": -1049.95703125,
"eval_loss": 0.6680687069892883,
"eval_rewards/accuracies": 0.6154491901397705,
"eval_rewards/chosen": 1.4895213842391968,
"eval_rewards/margins": 0.2610515356063843,
"eval_rewards/rejected": 1.2284698486328125,
"eval_runtime": 174.3568,
"eval_samples_per_second": 6.831,
"eval_steps_per_second": 6.831,
"step": 2000
},
{
"epoch": 0.7448927595311994,
"grad_norm": 236.87335205078125,
"learning_rate": 1.2782512041496851e-07,
"logits/chosen": -6.2562150955200195,
"logits/rejected": -6.181870937347412,
"logps/chosen": -1007.4918823242188,
"logps/rejected": -940.9990234375,
"loss": 0.64,
"rewards/accuracies": 0.65625,
"rewards/chosen": 1.4098269939422607,
"rewards/margins": 0.383728563785553,
"rewards/rejected": 1.0260984897613525,
"step": 2010
},
{
"epoch": 0.7485986936582202,
"grad_norm": 180.59913635253906,
"learning_rate": 1.2597258243794e-07,
"logits/chosen": -6.184215068817139,
"logits/rejected": -6.141203880310059,
"logps/chosen": -925.7356567382812,
"logps/rejected": -824.5137939453125,
"loss": 0.5902,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 1.2621865272521973,
"rewards/margins": 0.42541566491127014,
"rewards/rejected": 0.8367708921432495,
"step": 2020
},
{
"epoch": 0.7523046277852411,
"grad_norm": 192.83140563964844,
"learning_rate": 1.2412004446091145e-07,
"logits/chosen": -6.163074970245361,
"logits/rejected": -6.230958461761475,
"logps/chosen": -957.0022583007812,
"logps/rejected": -863.4744873046875,
"loss": 0.5727,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 1.3720769882202148,
"rewards/margins": 0.5459089279174805,
"rewards/rejected": 0.8261680603027344,
"step": 2030
},
{
"epoch": 0.756010561912262,
"grad_norm": 128.6780242919922,
"learning_rate": 1.2226750648388292e-07,
"logits/chosen": -6.119555473327637,
"logits/rejected": -6.239079475402832,
"logps/chosen": -927.7706909179688,
"logps/rejected": -826.4373779296875,
"loss": 0.5713,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.3195993900299072,
"rewards/margins": 0.4843137264251709,
"rewards/rejected": 0.8352855443954468,
"step": 2040
},
{
"epoch": 0.7597164960392829,
"grad_norm": 244.9542694091797,
"learning_rate": 1.2041496850685438e-07,
"logits/chosen": -6.327083110809326,
"logits/rejected": -6.312042236328125,
"logps/chosen": -1045.3717041015625,
"logps/rejected": -913.8675537109375,
"loss": 0.5965,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 1.329573392868042,
"rewards/margins": 0.427541583776474,
"rewards/rejected": 0.9020318984985352,
"step": 2050
},
{
"epoch": 0.7597164960392829,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.36568546295166,
"eval_logps/chosen": -1140.65673828125,
"eval_logps/rejected": -1051.03271484375,
"eval_loss": 0.6708235740661621,
"eval_rewards/accuracies": 0.6112510561943054,
"eval_rewards/chosen": 1.3688610792160034,
"eval_rewards/margins": 0.24796034395694733,
"eval_rewards/rejected": 1.1209006309509277,
"eval_runtime": 174.13,
"eval_samples_per_second": 6.84,
"eval_steps_per_second": 6.84,
"step": 2050
},
{
"epoch": 0.7634224301663038,
"grad_norm": 154.33078002929688,
"learning_rate": 1.1856243052982586e-07,
"logits/chosen": -6.247130393981934,
"logits/rejected": -6.239518642425537,
"logps/chosen": -976.1388549804688,
"logps/rejected": -855.1339111328125,
"loss": 0.5808,
"rewards/accuracies": 0.71875,
"rewards/chosen": 1.2261674404144287,
"rewards/margins": 0.4568137526512146,
"rewards/rejected": 0.7693536877632141,
"step": 2060
},
{
"epoch": 0.7671283642933246,
"grad_norm": 152.96670532226562,
"learning_rate": 1.1670989255279732e-07,
"logits/chosen": -6.12724494934082,
"logits/rejected": -6.189521312713623,
"logps/chosen": -933.6554565429688,
"logps/rejected": -847.5482177734375,
"loss": 0.6256,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 1.2696754932403564,
"rewards/margins": 0.4418273866176605,
"rewards/rejected": 0.8278481364250183,
"step": 2070
},
{
"epoch": 0.7708342984203456,
"grad_norm": 171.41937255859375,
"learning_rate": 1.148573545757688e-07,
"logits/chosen": -6.24149227142334,
"logits/rejected": -6.186224937438965,
"logps/chosen": -952.6795654296875,
"logps/rejected": -866.2142333984375,
"loss": 0.5913,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 1.2056645154953003,
"rewards/margins": 0.43794241547584534,
"rewards/rejected": 0.7677222490310669,
"step": 2080
},
{
"epoch": 0.7745402325473665,
"grad_norm": 202.81951904296875,
"learning_rate": 1.1300481659874027e-07,
"logits/chosen": -6.235200881958008,
"logits/rejected": -6.2063446044921875,
"logps/chosen": -846.9156494140625,
"logps/rejected": -822.10205078125,
"loss": 0.6708,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 1.0181329250335693,
"rewards/margins": 0.2550104558467865,
"rewards/rejected": 0.7631224393844604,
"step": 2090
},
{
"epoch": 0.7782461666743874,
"grad_norm": 183.90293884277344,
"learning_rate": 1.1115227862171175e-07,
"logits/chosen": -6.047713279724121,
"logits/rejected": -6.1482133865356445,
"logps/chosen": -942.37060546875,
"logps/rejected": -869.7897338867188,
"loss": 0.5837,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 1.1601864099502563,
"rewards/margins": 0.4576262831687927,
"rewards/rejected": 0.7025600671768188,
"step": 2100
},
{
"epoch": 0.7782461666743874,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.365157127380371,
"eval_logps/chosen": -1140.8760986328125,
"eval_logps/rejected": -1051.16064453125,
"eval_loss": 0.6740830540657043,
"eval_rewards/accuracies": 0.5994962453842163,
"eval_rewards/chosen": 1.3469244241714478,
"eval_rewards/margins": 0.23882101476192474,
"eval_rewards/rejected": 1.1081035137176514,
"eval_runtime": 174.0197,
"eval_samples_per_second": 6.844,
"eval_steps_per_second": 6.844,
"step": 2100
},
{
"epoch": 0.7819521008014082,
"grad_norm": 165.31671142578125,
"learning_rate": 1.0929974064468321e-07,
"logits/chosen": -6.240053653717041,
"logits/rejected": -6.137989044189453,
"logps/chosen": -927.2156372070312,
"logps/rejected": -787.1029052734375,
"loss": 0.5752,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 1.1965583562850952,
"rewards/margins": 0.5078359842300415,
"rewards/rejected": 0.6887223124504089,
"step": 2110
},
{
"epoch": 0.7856580349284291,
"grad_norm": 187.2274932861328,
"learning_rate": 1.0744720266765468e-07,
"logits/chosen": -6.173762321472168,
"logits/rejected": -6.2248053550720215,
"logps/chosen": -946.9639892578125,
"logps/rejected": -851.115234375,
"loss": 0.5905,
"rewards/accuracies": 0.65625,
"rewards/chosen": 1.1326261758804321,
"rewards/margins": 0.43694519996643066,
"rewards/rejected": 0.6956809163093567,
"step": 2120
},
{
"epoch": 0.78936396905545,
"grad_norm": 203.85891723632812,
"learning_rate": 1.0559466469062616e-07,
"logits/chosen": -6.200101375579834,
"logits/rejected": -6.217113494873047,
"logps/chosen": -970.5763549804688,
"logps/rejected": -895.93310546875,
"loss": 0.6369,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 1.2285493612289429,
"rewards/margins": 0.3296011984348297,
"rewards/rejected": 0.8989483118057251,
"step": 2130
},
{
"epoch": 0.793069903182471,
"grad_norm": 204.75799560546875,
"learning_rate": 1.0374212671359762e-07,
"logits/chosen": -6.152576446533203,
"logits/rejected": -6.173853874206543,
"logps/chosen": -1001.7930908203125,
"logps/rejected": -783.9508056640625,
"loss": 0.607,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 1.1725000143051147,
"rewards/margins": 0.419525682926178,
"rewards/rejected": 0.7529743909835815,
"step": 2140
},
{
"epoch": 0.7967758373094919,
"grad_norm": 217.63076782226562,
"learning_rate": 1.018895887365691e-07,
"logits/chosen": -6.167757987976074,
"logits/rejected": -6.10471248626709,
"logps/chosen": -996.0086669921875,
"logps/rejected": -872.7799072265625,
"loss": 0.6336,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 1.3676767349243164,
"rewards/margins": 0.4060121476650238,
"rewards/rejected": 0.9616644978523254,
"step": 2150
},
{
"epoch": 0.7967758373094919,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.359668731689453,
"eval_logps/chosen": -1139.7021484375,
"eval_logps/rejected": -1050.19970703125,
"eval_loss": 0.6704598665237427,
"eval_rewards/accuracies": 0.6137699484825134,
"eval_rewards/chosen": 1.4643239974975586,
"eval_rewards/margins": 0.2601255178451538,
"eval_rewards/rejected": 1.2041983604431152,
"eval_runtime": 174.4958,
"eval_samples_per_second": 6.825,
"eval_steps_per_second": 6.825,
"step": 2150
},
{
"epoch": 0.8004817714365127,
"grad_norm": 203.3338165283203,
"learning_rate": 1.0003705075954057e-07,
"logits/chosen": -6.282981872558594,
"logits/rejected": -6.3550286293029785,
"logps/chosen": -897.6851806640625,
"logps/rejected": -796.6698608398438,
"loss": 0.6354,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 1.2425462007522583,
"rewards/margins": 0.3381389081478119,
"rewards/rejected": 0.9044073224067688,
"step": 2160
},
{
"epoch": 0.8041877055635336,
"grad_norm": 164.21310424804688,
"learning_rate": 9.818451278251204e-08,
"logits/chosen": -6.134262561798096,
"logits/rejected": -6.197000503540039,
"logps/chosen": -919.0861206054688,
"logps/rejected": -864.6526489257812,
"loss": 0.6287,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 1.2566717863082886,
"rewards/margins": 0.4035015106201172,
"rewards/rejected": 0.8531702160835266,
"step": 2170
},
{
"epoch": 0.8078936396905545,
"grad_norm": 153.4619598388672,
"learning_rate": 9.633197480548351e-08,
"logits/chosen": -6.275097846984863,
"logits/rejected": NaN,
"logps/chosen": -924.9786376953125,
"logps/rejected": -775.713134765625,
"loss": 0.5827,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.353480577468872,
"rewards/margins": 0.5435738563537598,
"rewards/rejected": 0.8099066019058228,
"step": 2180
},
{
"epoch": 0.8115995738175754,
"grad_norm": 150.305419921875,
"learning_rate": 9.447943682845498e-08,
"logits/chosen": -6.12339973449707,
"logits/rejected": -6.132723808288574,
"logps/chosen": -996.9190673828125,
"logps/rejected": -849.7203369140625,
"loss": 0.6283,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 1.3306301832199097,
"rewards/margins": 0.4646291732788086,
"rewards/rejected": 0.8660010099411011,
"step": 2190
},
{
"epoch": 0.8153055079445963,
"grad_norm": 137.26937866210938,
"learning_rate": 9.262689885142645e-08,
"logits/chosen": -6.194148063659668,
"logits/rejected": -6.139514923095703,
"logps/chosen": -874.9927978515625,
"logps/rejected": -870.7437744140625,
"loss": 0.6213,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 1.2261251211166382,
"rewards/margins": 0.37579071521759033,
"rewards/rejected": 0.8503344655036926,
"step": 2200
},
{
"epoch": 0.8153055079445963,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.357300758361816,
"eval_logps/chosen": -1138.9786376953125,
"eval_logps/rejected": -1049.5445556640625,
"eval_loss": 0.6698047518730164,
"eval_rewards/accuracies": 0.6120907068252563,
"eval_rewards/chosen": 1.5366746187210083,
"eval_rewards/margins": 0.26695773005485535,
"eval_rewards/rejected": 1.269716739654541,
"eval_runtime": 174.7913,
"eval_samples_per_second": 6.814,
"eval_steps_per_second": 6.814,
"step": 2200
},
{
"epoch": 0.8190114420716171,
"grad_norm": 195.36293029785156,
"learning_rate": 9.077436087439792e-08,
"logits/chosen": -6.112738609313965,
"logits/rejected": -6.180370807647705,
"logps/chosen": -958.05810546875,
"logps/rejected": -914.5363159179688,
"loss": 0.6249,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 1.3648970127105713,
"rewards/margins": 0.38803738355636597,
"rewards/rejected": 0.9768595695495605,
"step": 2210
},
{
"epoch": 0.822717376198638,
"grad_norm": 218.26048278808594,
"learning_rate": 8.89218228973694e-08,
"logits/chosen": -6.269371032714844,
"logits/rejected": -6.25061559677124,
"logps/chosen": -948.0462036132812,
"logps/rejected": -886.0133666992188,
"loss": 0.6872,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 1.214468002319336,
"rewards/margins": 0.23822224140167236,
"rewards/rejected": 0.9762457013130188,
"step": 2220
},
{
"epoch": 0.826423310325659,
"grad_norm": 211.05848693847656,
"learning_rate": 8.706928492034086e-08,
"logits/chosen": -6.225638389587402,
"logits/rejected": -6.232121467590332,
"logps/chosen": -908.8572998046875,
"logps/rejected": -863.0309448242188,
"loss": 0.6504,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 1.1848371028900146,
"rewards/margins": 0.3589955270290375,
"rewards/rejected": 0.8258415460586548,
"step": 2230
},
{
"epoch": 0.8301292444526799,
"grad_norm": 225.31521606445312,
"learning_rate": 8.521674694331234e-08,
"logits/chosen": -6.275576591491699,
"logits/rejected": -6.208001136779785,
"logps/chosen": -863.1882934570312,
"logps/rejected": -742.2914428710938,
"loss": 0.5924,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 1.097625970840454,
"rewards/margins": 0.4352286756038666,
"rewards/rejected": 0.6623972058296204,
"step": 2240
},
{
"epoch": 0.8338351785797008,
"grad_norm": 229.04183959960938,
"learning_rate": 8.336420896628381e-08,
"logits/chosen": -6.226241111755371,
"logits/rejected": -6.138689994812012,
"logps/chosen": -1008.5877685546875,
"logps/rejected": -819.7078247070312,
"loss": 0.6272,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 1.2099117040634155,
"rewards/margins": 0.3881527781486511,
"rewards/rejected": 0.8217589259147644,
"step": 2250
},
{
"epoch": 0.8338351785797008,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.360867500305176,
"eval_logps/chosen": -1140.3447265625,
"eval_logps/rejected": -1050.7974853515625,
"eval_loss": 0.6691888570785522,
"eval_rewards/accuracies": 0.6146095991134644,
"eval_rewards/chosen": 1.4000587463378906,
"eval_rewards/margins": 0.2556445896625519,
"eval_rewards/rejected": 1.1444141864776611,
"eval_runtime": 174.9273,
"eval_samples_per_second": 6.809,
"eval_steps_per_second": 6.809,
"step": 2250
},
{
"epoch": 0.8375411127067216,
"grad_norm": 179.97850036621094,
"learning_rate": 8.151167098925527e-08,
"logits/chosen": -6.222306251525879,
"logits/rejected": -6.101675510406494,
"logps/chosen": -929.2380981445312,
"logps/rejected": -836.9358520507812,
"loss": 0.5694,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 1.3586682081222534,
"rewards/margins": 0.4944392740726471,
"rewards/rejected": 0.8642290234565735,
"step": 2260
},
{
"epoch": 0.8412470468337425,
"grad_norm": 187.8804473876953,
"learning_rate": 7.965913301222675e-08,
"logits/chosen": -6.19569730758667,
"logits/rejected": -6.270641326904297,
"logps/chosen": -982.6009521484375,
"logps/rejected": -859.2730712890625,
"loss": 0.6045,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 1.1823841333389282,
"rewards/margins": 0.3857570290565491,
"rewards/rejected": 0.7966271638870239,
"step": 2270
},
{
"epoch": 0.8449529809607634,
"grad_norm": 250.08534240722656,
"learning_rate": 7.780659503519822e-08,
"logits/chosen": -6.17086935043335,
"logits/rejected": -6.222559452056885,
"logps/chosen": -1034.419677734375,
"logps/rejected": -955.6583251953125,
"loss": 0.659,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 1.3033745288848877,
"rewards/margins": 0.39839601516723633,
"rewards/rejected": 0.9049783945083618,
"step": 2280
},
{
"epoch": 0.8486589150877843,
"grad_norm": 148.72601318359375,
"learning_rate": 7.59540570581697e-08,
"logits/chosen": -6.179747581481934,
"logits/rejected": -6.1192827224731445,
"logps/chosen": -986.9896240234375,
"logps/rejected": -826.3615112304688,
"loss": 0.5721,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 1.323201298713684,
"rewards/margins": 0.5028241276741028,
"rewards/rejected": 0.8203772306442261,
"step": 2290
},
{
"epoch": 0.8523648492148052,
"grad_norm": 178.256591796875,
"learning_rate": 7.410151908114116e-08,
"logits/chosen": -6.32712459564209,
"logits/rejected": -6.3251752853393555,
"logps/chosen": -896.9401245117188,
"logps/rejected": -792.0689697265625,
"loss": 0.6251,
"rewards/accuracies": 0.65625,
"rewards/chosen": 1.158760905265808,
"rewards/margins": 0.38926878571510315,
"rewards/rejected": 0.7694920897483826,
"step": 2300
},
{
"epoch": 0.8523648492148052,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.361525535583496,
"eval_logps/chosen": -1140.1575927734375,
"eval_logps/rejected": -1050.596923828125,
"eval_loss": 0.667841911315918,
"eval_rewards/accuracies": 0.6095718145370483,
"eval_rewards/chosen": 1.418774962425232,
"eval_rewards/margins": 0.25429922342300415,
"eval_rewards/rejected": 1.164475917816162,
"eval_runtime": 174.9606,
"eval_samples_per_second": 6.807,
"eval_steps_per_second": 6.807,
"step": 2300
},
{
"epoch": 0.8560707833418261,
"grad_norm": 289.4090270996094,
"learning_rate": 7.224898110411264e-08,
"logits/chosen": -6.040954113006592,
"logits/rejected": -6.1116743087768555,
"logps/chosen": -868.0579833984375,
"logps/rejected": -742.7337646484375,
"loss": 0.5972,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 1.136728286743164,
"rewards/margins": 0.48759302496910095,
"rewards/rejected": 0.6491352915763855,
"step": 2310
},
{
"epoch": 0.859776717468847,
"grad_norm": 209.7076416015625,
"learning_rate": 7.039644312708411e-08,
"logits/chosen": -6.146378993988037,
"logits/rejected": -6.205878257751465,
"logps/chosen": -955.7279052734375,
"logps/rejected": -776.606201171875,
"loss": 0.6427,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 1.3216975927352905,
"rewards/margins": 0.4136362671852112,
"rewards/rejected": 0.9080612063407898,
"step": 2320
},
{
"epoch": 0.8634826515958679,
"grad_norm": 157.4209442138672,
"learning_rate": 6.854390515005557e-08,
"logits/chosen": -6.1600141525268555,
"logits/rejected": -6.090916633605957,
"logps/chosen": -1095.3663330078125,
"logps/rejected": -915.3992919921875,
"loss": 0.5802,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 1.4250476360321045,
"rewards/margins": 0.5543738007545471,
"rewards/rejected": 0.8706739544868469,
"step": 2330
},
{
"epoch": 0.8671885857228888,
"grad_norm": 202.49034118652344,
"learning_rate": 6.669136717302705e-08,
"logits/chosen": -6.110904216766357,
"logits/rejected": -6.1261210441589355,
"logps/chosen": -967.0671997070312,
"logps/rejected": -858.1019287109375,
"loss": 0.6087,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 1.3673769235610962,
"rewards/margins": 0.5143194198608398,
"rewards/rejected": 0.8530575037002563,
"step": 2340
},
{
"epoch": 0.8708945198499096,
"grad_norm": 103.83200073242188,
"learning_rate": 6.483882919599852e-08,
"logits/chosen": -6.204503059387207,
"logits/rejected": -6.187712669372559,
"logps/chosen": -872.6140747070312,
"logps/rejected": -760.0308227539062,
"loss": 0.6168,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 1.1666477918624878,
"rewards/margins": 0.4573976993560791,
"rewards/rejected": 0.7092500925064087,
"step": 2350
},
{
"epoch": 0.8708945198499096,
"eval_logits/chosen": NaN,
"eval_logits/rejected": -6.359028339385986,
"eval_logps/chosen": -1138.91455078125,
"eval_logps/rejected": -1049.516845703125,
"eval_loss": 0.668138325214386,
"eval_rewards/accuracies": 0.6221662759780884,
"eval_rewards/chosen": 1.543074131011963,
"eval_rewards/margins": 0.2705841064453125,
"eval_rewards/rejected": 1.2724900245666504,
"eval_runtime": 174.7239,
"eval_samples_per_second": 6.816,
"eval_steps_per_second": 6.816,
"step": 2350
},
{
"epoch": 0.8746004539769305,
"grad_norm": 226.57864379882812,
"learning_rate": 6.298629121897e-08,
"logits/chosen": NaN,
"logits/rejected": -6.086965084075928,
"logps/chosen": -982.00390625,
"logps/rejected": -850.6180419921875,
"loss": 0.6142,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 1.2618882656097412,
"rewards/margins": 0.42980679869651794,
"rewards/rejected": 0.8320814967155457,
"step": 2360
},
{
"epoch": 0.8783063881039515,
"grad_norm": 250.1316680908203,
"learning_rate": 6.113375324194146e-08,
"logits/chosen": -6.260158061981201,
"logits/rejected": -6.332103252410889,
"logps/chosen": -921.3849487304688,
"logps/rejected": -801.4791259765625,
"loss": 0.6029,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 1.3854167461395264,
"rewards/margins": 0.49748069047927856,
"rewards/rejected": 0.8879362344741821,
"step": 2370
},
{
"epoch": 0.8820123222309724,
"grad_norm": 257.1830749511719,
"learning_rate": 5.928121526491293e-08,
"logits/chosen": -6.317752361297607,
"logits/rejected": null,
"logps/chosen": -978.55126953125,
"logps/rejected": -801.6576538085938,
"loss": 0.5736,
"rewards/accuracies": 0.75,
"rewards/chosen": 1.3101433515548706,
"rewards/margins": 0.5170674324035645,
"rewards/rejected": 0.7930759787559509,
"step": 2380
},
{
"epoch": 0.8857182563579933,
"grad_norm": 174.4718475341797,
"learning_rate": 5.74286772878844e-08,
"logits/chosen": -6.182769298553467,
"logits/rejected": null,
"logps/chosen": -841.9371948242188,
"logps/rejected": -779.7135009765625,
"loss": 0.6186,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 1.247337818145752,
"rewards/margins": 0.39551234245300293,
"rewards/rejected": 0.8518252372741699,
"step": 2390
},
{
"epoch": 0.8894241904850141,
"grad_norm": 150.2713165283203,
"learning_rate": 5.557613931085587e-08,
"logits/chosen": null,
"logits/rejected": -6.047909736633301,
"logps/chosen": -949.482421875,
"logps/rejected": -869.8361206054688,
"loss": 0.6183,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 1.4006798267364502,
"rewards/margins": 0.3790472149848938,
"rewards/rejected": 1.0216325521469116,
"step": 2400
},
{
"epoch": 0.8894241904850141,
"eval_logits/chosen": null,
"eval_logits/rejected": -6.359555244445801,
"eval_logps/chosen": -1138.1962890625,
"eval_logps/rejected": -1048.9058837890625,
"eval_loss": 0.6646097898483276,
"eval_rewards/accuracies": 0.6221662759780884,
"eval_rewards/chosen": 1.6149109601974487,
"eval_rewards/margins": 0.2813268303871155,
"eval_rewards/rejected": 1.333584189414978,
"eval_runtime": 174.9379,
"eval_samples_per_second": 6.808,
"eval_steps_per_second": 6.808,
"step": 2400
},
{
"epoch": 0.893130124612035,
"grad_norm": 142.9062042236328,
"learning_rate": 5.372360133382734e-08,
"logits/chosen": -6.116464138031006,
"logits/rejected": -6.133796691894531,
"logps/chosen": -922.8450317382812,
"logps/rejected": -898.6383666992188,
"loss": 0.633,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 1.3375623226165771,
"rewards/margins": 0.38691186904907227,
"rewards/rejected": 0.9506505131721497,
"step": 2410
},
{
"epoch": 0.8968360587390559,
"grad_norm": 194.79432678222656,
"learning_rate": 5.187106335679881e-08,
"logits/chosen": -6.197569370269775,
"logits/rejected": -6.240142345428467,
"logps/chosen": -906.27001953125,
"logps/rejected": -837.5906372070312,
"loss": 0.6182,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 1.3941477537155151,
"rewards/margins": 0.47745227813720703,
"rewards/rejected": 0.9166954159736633,
"step": 2420
},
{
"epoch": 0.9005419928660768,
"grad_norm": 176.5300750732422,
"learning_rate": 5.001852537977028e-08,
"logits/chosen": -6.1893310546875,
"logits/rejected": -6.254434585571289,
"logps/chosen": -949.20654296875,
"logps/rejected": -864.1427001953125,
"loss": 0.6234,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 1.4380910396575928,
"rewards/margins": 0.3952023983001709,
"rewards/rejected": 1.0428886413574219,
"step": 2430
},
{
"epoch": 0.9042479269930978,
"grad_norm": 204.15846252441406,
"learning_rate": 4.8165987402741755e-08,
"logits/chosen": -6.22286319732666,
"logits/rejected": -6.251989841461182,
"logps/chosen": -1061.7998046875,
"logps/rejected": -907.6027221679688,
"loss": 0.6386,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 1.4448530673980713,
"rewards/margins": 0.4636596739292145,
"rewards/rejected": 0.9811934232711792,
"step": 2440
},
{
"epoch": 0.9079538611201186,
"grad_norm": 172.95408630371094,
"learning_rate": 4.631344942571323e-08,
"logits/chosen": -6.1700005531311035,
"logits/rejected": -6.175479412078857,
"logps/chosen": -931.7879028320312,
"logps/rejected": -793.9867553710938,
"loss": 0.5927,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 1.3887077569961548,
"rewards/margins": 0.46491020917892456,
"rewards/rejected": 0.9237974882125854,
"step": 2450
},
{
"epoch": 0.9079538611201186,
"eval_logits/chosen": null,
"eval_logits/rejected": -6.360927581787109,
"eval_logps/chosen": -1138.57568359375,
"eval_logps/rejected": -1049.185302734375,
"eval_loss": 0.6698666214942932,
"eval_rewards/accuracies": 0.6179680824279785,
"eval_rewards/chosen": 1.576967477798462,
"eval_rewards/margins": 0.2713308334350586,
"eval_rewards/rejected": 1.3056366443634033,
"eval_runtime": 174.8797,
"eval_samples_per_second": 6.81,
"eval_steps_per_second": 6.81,
"step": 2450
},
{
"epoch": 0.9116597952471395,
"grad_norm": 199.45687866210938,
"learning_rate": 4.44609114486847e-08,
"logits/chosen": -6.102917194366455,
"logits/rejected": -6.159636974334717,
"logps/chosen": -846.6583251953125,
"logps/rejected": -778.142333984375,
"loss": 0.6016,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 1.2215101718902588,
"rewards/margins": 0.4805546700954437,
"rewards/rejected": 0.7409554719924927,
"step": 2460
},
{
"epoch": 0.9153657293741604,
"grad_norm": 148.47500610351562,
"learning_rate": 4.260837347165617e-08,
"logits/chosen": -6.2448930740356445,
"logits/rejected": -6.158895015716553,
"logps/chosen": -1002.0369262695312,
"logps/rejected": -890.6515502929688,
"loss": 0.5622,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 1.374596357345581,
"rewards/margins": 0.5444284081459045,
"rewards/rejected": 0.8301678895950317,
"step": 2470
},
{
"epoch": 0.9190716635011813,
"grad_norm": 186.5249481201172,
"learning_rate": 4.075583549462764e-08,
"logits/chosen": -6.199591159820557,
"logits/rejected": -6.20455265045166,
"logps/chosen": -952.9064331054688,
"logps/rejected": -825.30859375,
"loss": 0.6153,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 1.4213062524795532,
"rewards/margins": 0.4216051697731018,
"rewards/rejected": 0.9997010231018066,
"step": 2480
},
{
"epoch": 0.9227775976282021,
"grad_norm": 175.28465270996094,
"learning_rate": 3.890329751759911e-08,
"logits/chosen": -6.196984767913818,
"logits/rejected": -6.227551460266113,
"logps/chosen": -967.3463745117188,
"logps/rejected": -785.5040283203125,
"loss": 0.5873,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 1.3924157619476318,
"rewards/margins": 0.5140901803970337,
"rewards/rejected": 0.8783254623413086,
"step": 2490
},
{
"epoch": 0.926483531755223,
"grad_norm": 205.52532958984375,
"learning_rate": 3.705075954057058e-08,
"logits/chosen": -6.220477104187012,
"logits/rejected": -6.22896146774292,
"logps/chosen": -926.861328125,
"logps/rejected": -857.521484375,
"loss": 0.6039,
"rewards/accuracies": 0.65625,
"rewards/chosen": 1.3103541135787964,
"rewards/margins": 0.4116063117980957,
"rewards/rejected": 0.8987478017807007,
"step": 2500
},
{
"epoch": 0.926483531755223,
"eval_logits/chosen": null,
"eval_logits/rejected": -6.359811305999756,
"eval_logps/chosen": -1138.34375,
"eval_logps/rejected": -1049.01904296875,
"eval_loss": 0.6664721965789795,
"eval_rewards/accuracies": 0.6263644099235535,
"eval_rewards/chosen": 1.6001617908477783,
"eval_rewards/margins": 0.2778994143009186,
"eval_rewards/rejected": 1.3222622871398926,
"eval_runtime": 175.0068,
"eval_samples_per_second": 6.805,
"eval_steps_per_second": 6.805,
"step": 2500
},
{
"epoch": 0.9301894658822439,
"grad_norm": 126.9505615234375,
"learning_rate": 3.5198221563542054e-08,
"logits/chosen": null,
"logits/rejected": -6.187649726867676,
"logps/chosen": -1037.003173828125,
"logps/rejected": -877.7476806640625,
"loss": 0.5535,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 1.5431692600250244,
"rewards/margins": 0.6853631734848022,
"rewards/rejected": 0.8578060865402222,
"step": 2510
},
{
"epoch": 0.9338954000092649,
"grad_norm": 217.86968994140625,
"learning_rate": 3.3345683586513526e-08,
"logits/chosen": -6.283205032348633,
"logits/rejected": -6.221907615661621,
"logps/chosen": -924.9992065429688,
"logps/rejected": -855.3511962890625,
"loss": 0.6438,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 1.3765745162963867,
"rewards/margins": 0.3500059247016907,
"rewards/rejected": 1.0265684127807617,
"step": 2520
},
{
"epoch": 0.9376013341362858,
"grad_norm": 170.4639434814453,
"learning_rate": 3.1493145609485e-08,
"logits/chosen": -6.257707595825195,
"logits/rejected": -6.27487850189209,
"logps/chosen": -903.9068603515625,
"logps/rejected": -843.6154174804688,
"loss": 0.6395,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 1.2748887538909912,
"rewards/margins": 0.3705710768699646,
"rewards/rejected": 0.9043177366256714,
"step": 2530
},
{
"epoch": 0.9413072682633066,
"grad_norm": 150.2277069091797,
"learning_rate": 2.9640607632456464e-08,
"logits/chosen": -6.218627452850342,
"logits/rejected": -6.296253204345703,
"logps/chosen": -1027.5751953125,
"logps/rejected": -868.8278198242188,
"loss": 0.6068,
"rewards/accuracies": 0.65625,
"rewards/chosen": 1.4749407768249512,
"rewards/margins": 0.5608575940132141,
"rewards/rejected": 0.9140831232070923,
"step": 2540
},
{
"epoch": 0.9450132023903275,
"grad_norm": 176.552490234375,
"learning_rate": 2.7788069655427936e-08,
"logits/chosen": -6.115626335144043,
"logits/rejected": -6.002920150756836,
"logps/chosen": -835.7297973632812,
"logps/rejected": -763.3617553710938,
"loss": 0.6288,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 1.1424133777618408,
"rewards/margins": 0.3475190997123718,
"rewards/rejected": 0.7948943376541138,
"step": 2550
},
{
"epoch": 0.9450132023903275,
"eval_logits/chosen": null,
"eval_logits/rejected": -6.3612494468688965,
"eval_logps/chosen": -1138.4256591796875,
"eval_logps/rejected": -1049.066162109375,
"eval_loss": 0.6675523519515991,
"eval_rewards/accuracies": 0.6146095991134644,
"eval_rewards/chosen": 1.5919551849365234,
"eval_rewards/margins": 0.2744098901748657,
"eval_rewards/rejected": 1.3175454139709473,
"eval_runtime": 174.7172,
"eval_samples_per_second": 6.817,
"eval_steps_per_second": 6.817,
"step": 2550
},
{
"epoch": 0.9487191365173484,
"grad_norm": 168.2140350341797,
"learning_rate": 2.5935531678399405e-08,
"logits/chosen": -6.1927971839904785,
"logits/rejected": -6.197493553161621,
"logps/chosen": -991.3941650390625,
"logps/rejected": -906.1048583984375,
"loss": 0.6289,
"rewards/accuracies": 0.625,
"rewards/chosen": 1.511885404586792,
"rewards/margins": 0.4927326738834381,
"rewards/rejected": 1.0191527605056763,
"step": 2560
},
{
"epoch": 0.9524250706443693,
"grad_norm": 171.1510009765625,
"learning_rate": 2.4082993701370877e-08,
"logits/chosen": -6.209620475769043,
"logits/rejected": -6.170471668243408,
"logps/chosen": -925.6842041015625,
"logps/rejected": -813.4041137695312,
"loss": 0.6095,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 1.367976188659668,
"rewards/margins": 0.4230473041534424,
"rewards/rejected": 0.9449288249015808,
"step": 2570
},
{
"epoch": 0.9561310047713902,
"grad_norm": 194.6319580078125,
"learning_rate": 2.223045572434235e-08,
"logits/chosen": -6.165289878845215,
"logits/rejected": -6.122786521911621,
"logps/chosen": -965.0526123046875,
"logps/rejected": -807.0406494140625,
"loss": 0.5615,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 1.4062001705169678,
"rewards/margins": 0.6220154166221619,
"rewards/rejected": 0.7841848134994507,
"step": 2580
},
{
"epoch": 0.959836938898411,
"grad_norm": 187.33181762695312,
"learning_rate": 2.037791774731382e-08,
"logits/chosen": -6.2167792320251465,
"logits/rejected": null,
"logps/chosen": -980.70263671875,
"logps/rejected": -831.4769287109375,
"loss": 0.568,
"rewards/accuracies": 0.6875,
"rewards/chosen": 1.3932929039001465,
"rewards/margins": 0.5573422908782959,
"rewards/rejected": 0.8359505534172058,
"step": 2590
},
{
"epoch": 0.963542873025432,
"grad_norm": 142.72923278808594,
"learning_rate": 1.852537977028529e-08,
"logits/chosen": -6.158575534820557,
"logits/rejected": -6.203221321105957,
"logps/chosen": -921.6607666015625,
"logps/rejected": -842.1689453125,
"loss": 0.5714,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 1.325329065322876,
"rewards/margins": 0.510645866394043,
"rewards/rejected": 0.8146833181381226,
"step": 2600
},
{
"epoch": 0.963542873025432,
"eval_logits/chosen": null,
"eval_logits/rejected": -6.361992835998535,
"eval_logps/chosen": -1138.678955078125,
"eval_logps/rejected": -1049.255615234375,
"eval_loss": 0.6706271171569824,
"eval_rewards/accuracies": 0.6120907068252563,
"eval_rewards/chosen": 1.566640853881836,
"eval_rewards/margins": 0.26803797483444214,
"eval_rewards/rejected": 1.2986030578613281,
"eval_runtime": 174.9793,
"eval_samples_per_second": 6.807,
"eval_steps_per_second": 6.807,
"step": 2600
},
{
"epoch": 0.9672488071524529,
"grad_norm": 190.34542846679688,
"learning_rate": 1.6672841793256763e-08,
"logits/chosen": -6.201694488525391,
"logits/rejected": -6.2331223487854,
"logps/chosen": -881.90576171875,
"logps/rejected": -822.5655517578125,
"loss": 0.6516,
"rewards/accuracies": 0.59375,
"rewards/chosen": 1.2538020610809326,
"rewards/margins": 0.3467678427696228,
"rewards/rejected": 0.9070342183113098,
"step": 2610
},
{
"epoch": 0.9709547412794738,
"grad_norm": 273.5181884765625,
"learning_rate": 1.4820303816228232e-08,
"logits/chosen": -6.193233489990234,
"logits/rejected": -6.104687690734863,
"logps/chosen": -866.4398193359375,
"logps/rejected": -785.888916015625,
"loss": 0.6333,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 1.187961220741272,
"rewards/margins": 0.3987189829349518,
"rewards/rejected": 0.7892423868179321,
"step": 2620
},
{
"epoch": 0.9746606754064947,
"grad_norm": 230.2846221923828,
"learning_rate": 1.2967765839199703e-08,
"logits/chosen": -6.259491920471191,
"logits/rejected": -6.279690265655518,
"logps/chosen": -791.6710815429688,
"logps/rejected": -694.3341674804688,
"loss": 0.6119,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 1.2341381311416626,
"rewards/margins": 0.4778687059879303,
"rewards/rejected": 0.7562695741653442,
"step": 2630
},
{
"epoch": 0.9783666095335155,
"grad_norm": 219.6723175048828,
"learning_rate": 1.1115227862171175e-08,
"logits/chosen": -6.238982677459717,
"logits/rejected": -6.274487495422363,
"logps/chosen": -911.5924072265625,
"logps/rejected": -826.4364013671875,
"loss": 0.6341,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 1.3268905878067017,
"rewards/margins": 0.3773989975452423,
"rewards/rejected": 0.9494916200637817,
"step": 2640
},
{
"epoch": 0.9820725436605364,
"grad_norm": 228.98863220214844,
"learning_rate": 9.262689885142645e-09,
"logits/chosen": -6.187376976013184,
"logits/rejected": -6.207940578460693,
"logps/chosen": -867.3173828125,
"logps/rejected": -846.06298828125,
"loss": 0.6064,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 1.1946938037872314,
"rewards/margins": 0.3911550045013428,
"rewards/rejected": 0.8035389184951782,
"step": 2650
},
{
"epoch": 0.9820725436605364,
"eval_logits/chosen": null,
"eval_logits/rejected": -6.362624645233154,
"eval_logps/chosen": -1138.8880615234375,
"eval_logps/rejected": -1049.453125,
"eval_loss": 0.6678369641304016,
"eval_rewards/accuracies": 0.6246851682662964,
"eval_rewards/chosen": 1.5457268953323364,
"eval_rewards/margins": 0.2668676972389221,
"eval_rewards/rejected": 1.2788591384887695,
"eval_runtime": 174.9484,
"eval_samples_per_second": 6.808,
"eval_steps_per_second": 6.808,
"step": 2650
},
{
"epoch": 0.9857784777875573,
"grad_norm": 147.26541137695312,
"learning_rate": 7.410151908114116e-09,
"logits/chosen": -6.185873508453369,
"logits/rejected": -6.177300930023193,
"logps/chosen": -981.9752197265625,
"logps/rejected": -823.0641479492188,
"loss": 0.557,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 1.3892064094543457,
"rewards/margins": 0.5826338529586792,
"rewards/rejected": 0.8065725564956665,
"step": 2660
},
{
"epoch": 0.9894844119145783,
"grad_norm": 143.0415802001953,
"learning_rate": 5.5576139310855874e-09,
"logits/chosen": -6.007561683654785,
"logits/rejected": -6.002453327178955,
"logps/chosen": -852.1906127929688,
"logps/rejected": -811.2535400390625,
"loss": 0.5599,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 1.2815325260162354,
"rewards/margins": 0.49190282821655273,
"rewards/rejected": 0.7896297574043274,
"step": 2670
},
{
"epoch": 0.9931903460415991,
"grad_norm": 210.65750122070312,
"learning_rate": 3.705075954057058e-09,
"logits/chosen": -6.069428443908691,
"logits/rejected": -6.068325996398926,
"logps/chosen": -1039.2158203125,
"logps/rejected": -915.1730346679688,
"loss": 0.5495,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 1.5940990447998047,
"rewards/margins": 0.5972550511360168,
"rewards/rejected": 0.9968441128730774,
"step": 2680
},
{
"epoch": 0.99689628016862,
"grad_norm": 211.33241271972656,
"learning_rate": 1.852537977028529e-09,
"logits/chosen": -6.273778438568115,
"logits/rejected": -6.336636543273926,
"logps/chosen": -950.8792724609375,
"logps/rejected": -863.4602661132812,
"loss": 0.6386,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 1.4326883554458618,
"rewards/margins": 0.3855132758617401,
"rewards/rejected": 1.0471750497817993,
"step": 2690
}
],
"logging_steps": 10,
"max_steps": 2699,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}