{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997172745264349,
"eval_steps": 500,
"global_step": 442,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0022618037885213456,
"grad_norm": 82.20633944321844,
"learning_rate": 1.7777777777777777e-08,
"logits/chosen": -1.0218509435653687,
"logits/rejected": -0.9817585349082947,
"logps/chosen": -1.5688527822494507,
"logps/rejected": -1.7071683406829834,
"loss": 5.3243,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -15.688528060913086,
"rewards/margins": 1.3831559419631958,
"rewards/rejected": -17.071683883666992,
"step": 1
},
{
"epoch": 0.004523607577042691,
"grad_norm": 76.56269102716989,
"learning_rate": 3.5555555555555554e-08,
"logits/chosen": -1.0730209350585938,
"logits/rejected": -1.0666016340255737,
"logps/chosen": -1.6668750047683716,
"logps/rejected": -1.6219017505645752,
"loss": 6.0119,
"rewards/accuracies": 0.515625,
"rewards/chosen": -16.66874885559082,
"rewards/margins": -0.44973334670066833,
"rewards/rejected": -16.21901512145996,
"step": 2
},
{
"epoch": 0.006785411365564037,
"grad_norm": 80.7872495859453,
"learning_rate": 5.333333333333333e-08,
"logits/chosen": -1.0274062156677246,
"logits/rejected": -1.0491466522216797,
"logps/chosen": -1.5775692462921143,
"logps/rejected": -1.8128482103347778,
"loss": 5.5709,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -15.775691986083984,
"rewards/margins": 2.3527896404266357,
"rewards/rejected": -18.128482818603516,
"step": 3
},
{
"epoch": 0.009047215154085382,
"grad_norm": 93.5787326834387,
"learning_rate": 7.111111111111111e-08,
"logits/chosen": -1.0509157180786133,
"logits/rejected": -1.028970718383789,
"logps/chosen": -1.6293585300445557,
"logps/rejected": -1.6357903480529785,
"loss": 5.7155,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -16.29358673095703,
"rewards/margins": 0.06431838870048523,
"rewards/rejected": -16.35790252685547,
"step": 4
},
{
"epoch": 0.01130901894260673,
"grad_norm": 65.5120890183917,
"learning_rate": 8.888888888888888e-08,
"logits/chosen": -1.0577445030212402,
"logits/rejected": -1.0238444805145264,
"logps/chosen": -1.5818334817886353,
"logps/rejected": -1.6137027740478516,
"loss": 5.376,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -15.818334579467773,
"rewards/margins": 0.3186935782432556,
"rewards/rejected": -16.137027740478516,
"step": 5
},
{
"epoch": 0.013570822731128074,
"grad_norm": 88.31949377933185,
"learning_rate": 1.0666666666666666e-07,
"logits/chosen": -1.118450403213501,
"logits/rejected": -1.101908802986145,
"logps/chosen": -1.639064908027649,
"logps/rejected": -1.5965094566345215,
"loss": 6.1786,
"rewards/accuracies": 0.4765625,
"rewards/chosen": -16.390649795532227,
"rewards/margins": -0.4255555272102356,
"rewards/rejected": -15.965093612670898,
"step": 6
},
{
"epoch": 0.01583262651964942,
"grad_norm": 47.1722172267689,
"learning_rate": 1.2444444444444443e-07,
"logits/chosen": -1.0451990365982056,
"logits/rejected": -1.0316812992095947,
"logps/chosen": -1.493871808052063,
"logps/rejected": -1.7871618270874023,
"loss": 4.0395,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -14.93871784210205,
"rewards/margins": 2.932901620864868,
"rewards/rejected": -17.871618270874023,
"step": 7
},
{
"epoch": 0.018094430308170765,
"grad_norm": 104.42110239584217,
"learning_rate": 1.4222222222222222e-07,
"logits/chosen": -1.0138366222381592,
"logits/rejected": -1.02739679813385,
"logps/chosen": -1.5772916078567505,
"logps/rejected": -1.6482716798782349,
"loss": 5.6936,
"rewards/accuracies": 0.546875,
"rewards/chosen": -15.772916793823242,
"rewards/margins": 0.7097985744476318,
"rewards/rejected": -16.482715606689453,
"step": 8
},
{
"epoch": 0.020356234096692113,
"grad_norm": 68.67675316355087,
"learning_rate": 1.6e-07,
"logits/chosen": -1.0876085758209229,
"logits/rejected": -1.0700544118881226,
"logps/chosen": -1.523546576499939,
"logps/rejected": -1.7054587602615356,
"loss": 5.3456,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -15.235466957092285,
"rewards/margins": 1.819122552871704,
"rewards/rejected": -17.054588317871094,
"step": 9
},
{
"epoch": 0.02261803788521346,
"grad_norm": 94.5983360056872,
"learning_rate": 1.7777777777777776e-07,
"logits/chosen": -1.0795514583587646,
"logits/rejected": -1.0769304037094116,
"logps/chosen": -1.6664490699768066,
"logps/rejected": -1.6543259620666504,
"loss": 5.6149,
"rewards/accuracies": 0.578125,
"rewards/chosen": -16.66448974609375,
"rewards/margins": -0.12123118340969086,
"rewards/rejected": -16.543258666992188,
"step": 10
},
{
"epoch": 0.024879841673734804,
"grad_norm": 88.34270236157406,
"learning_rate": 1.9555555555555555e-07,
"logits/chosen": -1.1123073101043701,
"logits/rejected": -1.1089990139007568,
"logps/chosen": -1.6528065204620361,
"logps/rejected": -1.7993955612182617,
"loss": 5.3148,
"rewards/accuracies": 0.5625,
"rewards/chosen": -16.528064727783203,
"rewards/margins": 1.4658915996551514,
"rewards/rejected": -17.993955612182617,
"step": 11
},
{
"epoch": 0.02714164546225615,
"grad_norm": 96.98065425868009,
"learning_rate": 2.133333333333333e-07,
"logits/chosen": -1.0383970737457275,
"logits/rejected": -1.0250272750854492,
"logps/chosen": -1.7430694103240967,
"logps/rejected": -1.8183331489562988,
"loss": 5.9778,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -17.430696487426758,
"rewards/margins": 0.7526383399963379,
"rewards/rejected": -18.183332443237305,
"step": 12
},
{
"epoch": 0.029403449250777494,
"grad_norm": 198.0750276907948,
"learning_rate": 2.3111111111111107e-07,
"logits/chosen": -1.0285305976867676,
"logits/rejected": -0.9945288300514221,
"logps/chosen": -1.6881521940231323,
"logps/rejected": -1.614635705947876,
"loss": 5.9679,
"rewards/accuracies": 0.53125,
"rewards/chosen": -16.88152313232422,
"rewards/margins": -0.735164999961853,
"rewards/rejected": -16.1463565826416,
"step": 13
},
{
"epoch": 0.03166525303929884,
"grad_norm": 46.47950957800464,
"learning_rate": 2.4888888888888886e-07,
"logits/chosen": -1.040562391281128,
"logits/rejected": -1.0334664583206177,
"logps/chosen": -1.3813308477401733,
"logps/rejected": -1.5766117572784424,
"loss": 4.3908,
"rewards/accuracies": 0.578125,
"rewards/chosen": -13.813308715820312,
"rewards/margins": 1.9528083801269531,
"rewards/rejected": -15.766117095947266,
"step": 14
},
{
"epoch": 0.033927056827820185,
"grad_norm": 69.33452676748747,
"learning_rate": 2.666666666666666e-07,
"logits/chosen": -1.0274150371551514,
"logits/rejected": -1.007089614868164,
"logps/chosen": -1.5167428255081177,
"logps/rejected": -1.6422500610351562,
"loss": 4.8089,
"rewards/accuracies": 0.640625,
"rewards/chosen": -15.16742992401123,
"rewards/margins": 1.255070686340332,
"rewards/rejected": -16.422500610351562,
"step": 15
},
{
"epoch": 0.03618886061634153,
"grad_norm": 72.5316696572694,
"learning_rate": 2.8444444444444443e-07,
"logits/chosen": -1.0711323022842407,
"logits/rejected": -1.0499267578125,
"logps/chosen": -1.5601518154144287,
"logps/rejected": -1.5947811603546143,
"loss": 5.6696,
"rewards/accuracies": 0.546875,
"rewards/chosen": -15.601516723632812,
"rewards/margins": 0.3462938070297241,
"rewards/rejected": -15.947811126708984,
"step": 16
},
{
"epoch": 0.038450664404862875,
"grad_norm": 76.85732812220978,
"learning_rate": 3.022222222222222e-07,
"logits/chosen": -1.094878911972046,
"logits/rejected": -1.067216157913208,
"logps/chosen": -1.486512541770935,
"logps/rejected": -1.4571260213851929,
"loss": 6.1338,
"rewards/accuracies": 0.546875,
"rewards/chosen": -14.86512565612793,
"rewards/margins": -0.29386693239212036,
"rewards/rejected": -14.571259498596191,
"step": 17
},
{
"epoch": 0.04071246819338423,
"grad_norm": 92.61472209561433,
"learning_rate": 3.2e-07,
"logits/chosen": -1.0927892923355103,
"logits/rejected": -1.0815608501434326,
"logps/chosen": -1.887829303741455,
"logps/rejected": -1.9170186519622803,
"loss": 5.9261,
"rewards/accuracies": 0.515625,
"rewards/chosen": -18.878293991088867,
"rewards/margins": 0.29189303517341614,
"rewards/rejected": -19.17018699645996,
"step": 18
},
{
"epoch": 0.04297427198190557,
"grad_norm": 67.495547378849,
"learning_rate": 3.3777777777777777e-07,
"logits/chosen": -1.1162761449813843,
"logits/rejected": -1.10536527633667,
"logps/chosen": -1.6044241189956665,
"logps/rejected": -1.6690468788146973,
"loss": 5.0572,
"rewards/accuracies": 0.625,
"rewards/chosen": -16.044240951538086,
"rewards/margins": 0.6462277173995972,
"rewards/rejected": -16.690467834472656,
"step": 19
},
{
"epoch": 0.04523607577042692,
"grad_norm": 46.08429182217552,
"learning_rate": 3.5555555555555553e-07,
"logits/chosen": -1.1088039875030518,
"logits/rejected": -1.091164231300354,
"logps/chosen": -1.4526840448379517,
"logps/rejected": -1.682389736175537,
"loss": 4.2789,
"rewards/accuracies": 0.65625,
"rewards/chosen": -14.526841163635254,
"rewards/margins": 2.297055721282959,
"rewards/rejected": -16.823896408081055,
"step": 20
},
{
"epoch": 0.04749787955894826,
"grad_norm": 81.54476641866651,
"learning_rate": 3.7333333333333334e-07,
"logits/chosen": -1.0202794075012207,
"logits/rejected": -1.0125457048416138,
"logps/chosen": -1.5033711194992065,
"logps/rejected": -1.515246868133545,
"loss": 5.2707,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -15.033713340759277,
"rewards/margins": 0.11875671148300171,
"rewards/rejected": -15.152469635009766,
"step": 21
},
{
"epoch": 0.04975968334746961,
"grad_norm": 77.44489528697936,
"learning_rate": 3.911111111111111e-07,
"logits/chosen": -1.0691096782684326,
"logits/rejected": -1.0430610179901123,
"logps/chosen": -1.5314964056015015,
"logps/rejected": -1.6080042123794556,
"loss": 4.5538,
"rewards/accuracies": 0.625,
"rewards/chosen": -15.314962387084961,
"rewards/margins": 0.7650798559188843,
"rewards/rejected": -16.080041885375977,
"step": 22
},
{
"epoch": 0.05202148713599095,
"grad_norm": 72.65865482531491,
"learning_rate": 4.0888888888888886e-07,
"logits/chosen": -1.0519309043884277,
"logits/rejected": -1.0246856212615967,
"logps/chosen": -1.5432385206222534,
"logps/rejected": -1.5514321327209473,
"loss": 5.6642,
"rewards/accuracies": 0.5625,
"rewards/chosen": -15.432385444641113,
"rewards/margins": 0.08193567395210266,
"rewards/rejected": -15.514322280883789,
"step": 23
},
{
"epoch": 0.0542832909245123,
"grad_norm": 61.554723407219434,
"learning_rate": 4.266666666666666e-07,
"logits/chosen": -1.0789406299591064,
"logits/rejected": -1.0445177555084229,
"logps/chosen": -1.3729901313781738,
"logps/rejected": -1.4575581550598145,
"loss": 4.8554,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -13.729900360107422,
"rewards/margins": 0.8456806540489197,
"rewards/rejected": -14.575581550598145,
"step": 24
},
{
"epoch": 0.05654509471303364,
"grad_norm": 92.24212352163143,
"learning_rate": 4.4444444444444444e-07,
"logits/chosen": -1.0634639263153076,
"logits/rejected": -1.0437037944793701,
"logps/chosen": -1.6674749851226807,
"logps/rejected": -1.6732288599014282,
"loss": 5.5718,
"rewards/accuracies": 0.546875,
"rewards/chosen": -16.674747467041016,
"rewards/margins": 0.057538360357284546,
"rewards/rejected": -16.73228645324707,
"step": 25
},
{
"epoch": 0.05880689850155499,
"grad_norm": 82.27286171700929,
"learning_rate": 4.6222222222222214e-07,
"logits/chosen": -1.0726224184036255,
"logits/rejected": -1.042041301727295,
"logps/chosen": -1.6516973972320557,
"logps/rejected": -1.645316481590271,
"loss": 5.4824,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -16.5169734954834,
"rewards/margins": -0.06380730867385864,
"rewards/rejected": -16.453166961669922,
"step": 26
},
{
"epoch": 0.061068702290076333,
"grad_norm": 71.37945583756039,
"learning_rate": 4.8e-07,
"logits/chosen": -1.0674571990966797,
"logits/rejected": -1.0658835172653198,
"logps/chosen": -1.6513671875,
"logps/rejected": -1.5918333530426025,
"loss": 5.9622,
"rewards/accuracies": 0.53125,
"rewards/chosen": -16.513669967651367,
"rewards/margins": -0.5953378677368164,
"rewards/rejected": -15.918333053588867,
"step": 27
},
{
"epoch": 0.06333050607859768,
"grad_norm": 62.92038574422204,
"learning_rate": 4.977777777777777e-07,
"logits/chosen": -1.069338083267212,
"logits/rejected": -1.0540227890014648,
"logps/chosen": -1.5345242023468018,
"logps/rejected": -1.580329418182373,
"loss": 5.6612,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -15.345240592956543,
"rewards/margins": 0.45805394649505615,
"rewards/rejected": -15.803295135498047,
"step": 28
},
{
"epoch": 0.06559230986711903,
"grad_norm": 75.36368108465321,
"learning_rate": 5.155555555555556e-07,
"logits/chosen": -1.0797611474990845,
"logits/rejected": -1.0751529932022095,
"logps/chosen": -1.5848275423049927,
"logps/rejected": -1.5977083444595337,
"loss": 5.4094,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -15.848276138305664,
"rewards/margins": 0.1288076639175415,
"rewards/rejected": -15.977083206176758,
"step": 29
},
{
"epoch": 0.06785411365564037,
"grad_norm": 82.33258282824598,
"learning_rate": 5.333333333333332e-07,
"logits/chosen": -1.0502032041549683,
"logits/rejected": -1.0390020608901978,
"logps/chosen": -1.5757882595062256,
"logps/rejected": -1.5970449447631836,
"loss": 5.802,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -15.757882118225098,
"rewards/margins": 0.21256688237190247,
"rewards/rejected": -15.970449447631836,
"step": 30
},
{
"epoch": 0.07011591744416172,
"grad_norm": 79.89049476585055,
"learning_rate": 5.511111111111111e-07,
"logits/chosen": -1.0897246599197388,
"logits/rejected": -1.0813220739364624,
"logps/chosen": -1.556617021560669,
"logps/rejected": -1.5337543487548828,
"loss": 5.4678,
"rewards/accuracies": 0.59375,
"rewards/chosen": -15.566169738769531,
"rewards/margins": -0.2286262959241867,
"rewards/rejected": -15.337542533874512,
"step": 31
},
{
"epoch": 0.07237772123268306,
"grad_norm": 73.16917555470789,
"learning_rate": 5.688888888888889e-07,
"logits/chosen": -1.1192123889923096,
"logits/rejected": -1.109642744064331,
"logps/chosen": -1.4663976430892944,
"logps/rejected": -1.4524457454681396,
"loss": 5.5922,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -14.663975715637207,
"rewards/margins": -0.1395174264907837,
"rewards/rejected": -14.524457931518555,
"step": 32
},
{
"epoch": 0.07463952502120441,
"grad_norm": 62.031723886215296,
"learning_rate": 5.866666666666666e-07,
"logits/chosen": -1.0371233224868774,
"logits/rejected": -1.0098799467086792,
"logps/chosen": -1.4417638778686523,
"logps/rejected": -1.5921223163604736,
"loss": 4.6202,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -14.417638778686523,
"rewards/margins": 1.5035836696624756,
"rewards/rejected": -15.921221733093262,
"step": 33
},
{
"epoch": 0.07690132880972575,
"grad_norm": 51.2296834877244,
"learning_rate": 6.044444444444444e-07,
"logits/chosen": -1.0844844579696655,
"logits/rejected": -1.0409635305404663,
"logps/chosen": -1.3941082954406738,
"logps/rejected": -1.51826810836792,
"loss": 4.462,
"rewards/accuracies": 0.625,
"rewards/chosen": -13.941082000732422,
"rewards/margins": 1.2415988445281982,
"rewards/rejected": -15.182682037353516,
"step": 34
},
{
"epoch": 0.0791631325982471,
"grad_norm": 96.14883019900049,
"learning_rate": 6.222222222222223e-07,
"logits/chosen": -1.1195001602172852,
"logits/rejected": -1.0811963081359863,
"logps/chosen": -1.518474817276001,
"logps/rejected": -1.636244297027588,
"loss": 4.8592,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -15.184747695922852,
"rewards/margins": 1.1776940822601318,
"rewards/rejected": -16.362442016601562,
"step": 35
},
{
"epoch": 0.08142493638676845,
"grad_norm": 58.6930145213695,
"learning_rate": 6.4e-07,
"logits/chosen": -1.0167288780212402,
"logits/rejected": -1.0025566816329956,
"logps/chosen": -1.438754677772522,
"logps/rejected": -1.5819575786590576,
"loss": 4.6308,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -14.387547492980957,
"rewards/margins": 1.4320300817489624,
"rewards/rejected": -15.819576263427734,
"step": 36
},
{
"epoch": 0.08368674017528979,
"grad_norm": 68.52944989879505,
"learning_rate": 6.577777777777777e-07,
"logits/chosen": -1.0579607486724854,
"logits/rejected": -1.0365407466888428,
"logps/chosen": -1.3250023126602173,
"logps/rejected": -1.3734800815582275,
"loss": 4.806,
"rewards/accuracies": 0.640625,
"rewards/chosen": -13.250021934509277,
"rewards/margins": 0.4847772419452667,
"rewards/rejected": -13.734800338745117,
"step": 37
},
{
"epoch": 0.08594854396381114,
"grad_norm": 77.13280631717205,
"learning_rate": 6.755555555555555e-07,
"logits/chosen": -1.1369915008544922,
"logits/rejected": -1.1310396194458008,
"logps/chosen": -1.4429665803909302,
"logps/rejected": -1.502832055091858,
"loss": 4.7471,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -14.429665565490723,
"rewards/margins": 0.5986539125442505,
"rewards/rejected": -15.0283203125,
"step": 38
},
{
"epoch": 0.08821034775233248,
"grad_norm": 72.97486900799309,
"learning_rate": 6.933333333333333e-07,
"logits/chosen": -1.1206347942352295,
"logits/rejected": -1.1240882873535156,
"logps/chosen": -1.4795533418655396,
"logps/rejected": -1.5203282833099365,
"loss": 4.953,
"rewards/accuracies": 0.546875,
"rewards/chosen": -14.7955322265625,
"rewards/margins": 0.40774989128112793,
"rewards/rejected": -15.20328140258789,
"step": 39
},
{
"epoch": 0.09047215154085383,
"grad_norm": 52.88449320217243,
"learning_rate": 7.111111111111111e-07,
"logits/chosen": -1.0798068046569824,
"logits/rejected": -1.043776512145996,
"logps/chosen": -1.3615009784698486,
"logps/rejected": -1.5376112461090088,
"loss": 3.9307,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -13.615011215209961,
"rewards/margins": 1.761101245880127,
"rewards/rejected": -15.37611198425293,
"step": 40
},
{
"epoch": 0.09273395532937517,
"grad_norm": 44.35065713868699,
"learning_rate": 7.288888888888888e-07,
"logits/chosen": -1.1410603523254395,
"logits/rejected": -1.107545256614685,
"logps/chosen": -1.3647446632385254,
"logps/rejected": -1.4958362579345703,
"loss": 4.5188,
"rewards/accuracies": 0.609375,
"rewards/chosen": -13.647444725036621,
"rewards/margins": 1.3109173774719238,
"rewards/rejected": -14.95836353302002,
"step": 41
},
{
"epoch": 0.09499575911789652,
"grad_norm": 62.52516819672327,
"learning_rate": 7.466666666666667e-07,
"logits/chosen": -1.024010419845581,
"logits/rejected": -0.980333685874939,
"logps/chosen": -1.4068825244903564,
"logps/rejected": -1.5373163223266602,
"loss": 4.6958,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -14.068826675415039,
"rewards/margins": 1.3043370246887207,
"rewards/rejected": -15.373163223266602,
"step": 42
},
{
"epoch": 0.09725756290641786,
"grad_norm": 75.14472900675813,
"learning_rate": 7.644444444444444e-07,
"logits/chosen": -1.0994362831115723,
"logits/rejected": -1.0756360292434692,
"logps/chosen": -1.3996585607528687,
"logps/rejected": -1.4580457210540771,
"loss": 4.6856,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -13.996583938598633,
"rewards/margins": 0.5838702917098999,
"rewards/rejected": -14.58045482635498,
"step": 43
},
{
"epoch": 0.09951936669493922,
"grad_norm": 54.0175172423554,
"learning_rate": 7.822222222222222e-07,
"logits/chosen": -1.0326025485992432,
"logits/rejected": -1.0035631656646729,
"logps/chosen": -1.2723063230514526,
"logps/rejected": -1.3909995555877686,
"loss": 4.3197,
"rewards/accuracies": 0.625,
"rewards/chosen": -12.723064422607422,
"rewards/margins": 1.1869314908981323,
"rewards/rejected": -13.909995079040527,
"step": 44
},
{
"epoch": 0.10178117048346055,
"grad_norm": 50.25629417898048,
"learning_rate": 8e-07,
"logits/chosen": -1.0961617231369019,
"logits/rejected": -1.079810619354248,
"logps/chosen": -1.3260252475738525,
"logps/rejected": -1.4084868431091309,
"loss": 4.662,
"rewards/accuracies": 0.578125,
"rewards/chosen": -13.260252952575684,
"rewards/margins": 0.8246161937713623,
"rewards/rejected": -14.084867477416992,
"step": 45
},
{
"epoch": 0.1040429742719819,
"grad_norm": 96.18305334292586,
"learning_rate": 7.999874759018868e-07,
"logits/chosen": -1.117138147354126,
"logits/rejected": -1.0853725671768188,
"logps/chosen": -1.529714584350586,
"logps/rejected": -1.6374049186706543,
"loss": 4.7187,
"rewards/accuracies": 0.625,
"rewards/chosen": -15.29714584350586,
"rewards/margins": 1.076902151107788,
"rewards/rejected": -16.374048233032227,
"step": 46
},
{
"epoch": 0.10630477806050326,
"grad_norm": 50.552602097177946,
"learning_rate": 7.999499043918123e-07,
"logits/chosen": -1.1253491640090942,
"logits/rejected": -1.1363474130630493,
"logps/chosen": -1.4145984649658203,
"logps/rejected": -1.4883978366851807,
"loss": 4.8257,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -14.14598274230957,
"rewards/margins": 0.7379940748214722,
"rewards/rejected": -14.883977890014648,
"step": 47
},
{
"epoch": 0.1085665818490246,
"grad_norm": 56.426748642311836,
"learning_rate": 7.998872878225228e-07,
"logits/chosen": -1.0526834726333618,
"logits/rejected": -1.043718695640564,
"logps/chosen": -1.4263412952423096,
"logps/rejected": -1.5427813529968262,
"loss": 4.4355,
"rewards/accuracies": 0.59375,
"rewards/chosen": -14.263413429260254,
"rewards/margins": 1.1644010543823242,
"rewards/rejected": -15.427813529968262,
"step": 48
},
{
"epoch": 0.11082838563754595,
"grad_norm": 55.22325931071655,
"learning_rate": 7.997996301150987e-07,
"logits/chosen": -1.042626142501831,
"logits/rejected": -1.0421124696731567,
"logps/chosen": -1.3600575923919678,
"logps/rejected": -1.4976643323898315,
"loss": 4.2471,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -13.600576400756836,
"rewards/margins": 1.3760665655136108,
"rewards/rejected": -14.976642608642578,
"step": 49
},
{
"epoch": 0.11309018942606729,
"grad_norm": 63.88977382203578,
"learning_rate": 7.996869367587088e-07,
"logits/chosen": -1.0255780220031738,
"logits/rejected": -1.0129433870315552,
"logps/chosen": -1.368567943572998,
"logps/rejected": -1.4815213680267334,
"loss": 4.5417,
"rewards/accuracies": 0.640625,
"rewards/chosen": -13.685680389404297,
"rewards/margins": 1.1295346021652222,
"rewards/rejected": -14.815214157104492,
"step": 50
},
{
"epoch": 0.11535199321458864,
"grad_norm": 49.558562568767535,
"learning_rate": 7.99549214810266e-07,
"logits/chosen": -1.0174915790557861,
"logits/rejected": -1.01373291015625,
"logps/chosen": -1.417051911354065,
"logps/rejected": -1.5000643730163574,
"loss": 4.6103,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -14.170519828796387,
"rewards/margins": 0.8301246166229248,
"rewards/rejected": -15.00064468383789,
"step": 51
},
{
"epoch": 0.11761379700310998,
"grad_norm": 59.766954657505124,
"learning_rate": 7.993864728939867e-07,
"logits/chosen": -1.0445855855941772,
"logits/rejected": -1.002925157546997,
"logps/chosen": -1.3486438989639282,
"logps/rejected": -1.462002158164978,
"loss": 4.7086,
"rewards/accuracies": 0.59375,
"rewards/chosen": -13.486440658569336,
"rewards/margins": 1.1335809230804443,
"rewards/rejected": -14.620019912719727,
"step": 52
},
{
"epoch": 0.11987560079163133,
"grad_norm": 61.66347742848864,
"learning_rate": 7.991987212008491e-07,
"logits/chosen": -1.0562191009521484,
"logits/rejected": -1.0350170135498047,
"logps/chosen": -1.4400745630264282,
"logps/rejected": -1.6580910682678223,
"loss": 4.1834,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -14.400745391845703,
"rewards/margins": 2.1801652908325195,
"rewards/rejected": -16.580909729003906,
"step": 53
},
{
"epoch": 0.12213740458015267,
"grad_norm": 65.57057337291207,
"learning_rate": 7.989859714879565e-07,
"logits/chosen": -1.1002763509750366,
"logits/rejected": -1.066996693611145,
"logps/chosen": -1.4376161098480225,
"logps/rejected": -1.5005946159362793,
"loss": 4.9236,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -14.376161575317383,
"rewards/margins": 0.6297858357429504,
"rewards/rejected": -15.00594711303711,
"step": 54
},
{
"epoch": 0.12439920836867402,
"grad_norm": 55.513799760876196,
"learning_rate": 7.987482370778005e-07,
"logits/chosen": -1.0518475770950317,
"logits/rejected": -1.027663230895996,
"logps/chosen": -1.4021294116973877,
"logps/rejected": -1.530908465385437,
"loss": 4.5889,
"rewards/accuracies": 0.609375,
"rewards/chosen": -14.021293640136719,
"rewards/margins": 1.287792444229126,
"rewards/rejected": -15.309085845947266,
"step": 55
},
{
"epoch": 0.12666101215719536,
"grad_norm": 75.90464831673606,
"learning_rate": 7.984855328574262e-07,
"logits/chosen": -0.9373297691345215,
"logits/rejected": -0.9265519380569458,
"logps/chosen": -1.3669638633728027,
"logps/rejected": -1.4640941619873047,
"loss": 4.55,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -13.669638633728027,
"rewards/margins": 0.9713033437728882,
"rewards/rejected": -14.640941619873047,
"step": 56
},
{
"epoch": 0.1289228159457167,
"grad_norm": 68.96445829802559,
"learning_rate": 7.981978752775009e-07,
"logits/chosen": -0.9993575811386108,
"logits/rejected": -0.9976133704185486,
"logps/chosen": -1.397598385810852,
"logps/rejected": -1.541737675666809,
"loss": 4.59,
"rewards/accuracies": 0.578125,
"rewards/chosen": -13.975984573364258,
"rewards/margins": 1.4413925409317017,
"rewards/rejected": -15.417377471923828,
"step": 57
},
{
"epoch": 0.13118461973423806,
"grad_norm": 80.54101053736677,
"learning_rate": 7.978852823512833e-07,
"logits/chosen": -1.0546932220458984,
"logits/rejected": -1.0237070322036743,
"logps/chosen": -1.5197957754135132,
"logps/rejected": -1.6809276342391968,
"loss": 4.5398,
"rewards/accuracies": 0.578125,
"rewards/chosen": -15.197957992553711,
"rewards/margins": 1.6113194227218628,
"rewards/rejected": -16.809276580810547,
"step": 58
},
{
"epoch": 0.1334464235227594,
"grad_norm": 92.89373223689378,
"learning_rate": 7.975477736534957e-07,
"logits/chosen": -1.0505645275115967,
"logits/rejected": -1.0457043647766113,
"logps/chosen": -1.45807945728302,
"logps/rejected": -1.6856966018676758,
"loss": 4.3795,
"rewards/accuracies": 0.59375,
"rewards/chosen": -14.580793380737305,
"rewards/margins": 2.276172637939453,
"rewards/rejected": -16.856966018676758,
"step": 59
},
{
"epoch": 0.13570822731128074,
"grad_norm": 86.59524139517637,
"learning_rate": 7.971853703190986e-07,
"logits/chosen": -1.0514953136444092,
"logits/rejected": -1.0330663919448853,
"logps/chosen": -1.4829269647598267,
"logps/rejected": -1.656957983970642,
"loss": 4.4199,
"rewards/accuracies": 0.640625,
"rewards/chosen": -14.829270362854004,
"rewards/margins": 1.740309715270996,
"rewards/rejected": -16.569580078125,
"step": 60
},
{
"epoch": 0.1379700310998021,
"grad_norm": 58.04746598511771,
"learning_rate": 7.967980950419664e-07,
"logits/chosen": -0.9681941270828247,
"logits/rejected": -0.9625906348228455,
"logps/chosen": -1.4017956256866455,
"logps/rejected": -1.5850591659545898,
"loss": 4.1725,
"rewards/accuracies": 0.703125,
"rewards/chosen": -14.01795768737793,
"rewards/margins": 1.8326325416564941,
"rewards/rejected": -15.850589752197266,
"step": 61
},
{
"epoch": 0.14023183488832344,
"grad_norm": 75.43808245271408,
"learning_rate": 7.963859720734669e-07,
"logits/chosen": -1.0596580505371094,
"logits/rejected": -1.0476895570755005,
"logps/chosen": -1.3745746612548828,
"logps/rejected": -1.5092250108718872,
"loss": 4.634,
"rewards/accuracies": 0.578125,
"rewards/chosen": -13.745745658874512,
"rewards/margins": 1.3465046882629395,
"rewards/rejected": -15.09225082397461,
"step": 62
},
{
"epoch": 0.14249363867684478,
"grad_norm": 52.97237108595383,
"learning_rate": 7.959490272209427e-07,
"logits/chosen": -1.0601532459259033,
"logits/rejected": -1.01606023311615,
"logps/chosen": -1.3559281826019287,
"logps/rejected": -1.6258801221847534,
"loss": 3.7898,
"rewards/accuracies": 0.65625,
"rewards/chosen": -13.559282302856445,
"rewards/margins": 2.6995184421539307,
"rewards/rejected": -16.258800506591797,
"step": 63
},
{
"epoch": 0.14475544246536612,
"grad_norm": 52.737070381425056,
"learning_rate": 7.954872878460946e-07,
"logits/chosen": -1.0653126239776611,
"logits/rejected": -1.0284615755081177,
"logps/chosen": -1.3986772298812866,
"logps/rejected": -1.6275815963745117,
"loss": 3.9627,
"rewards/accuracies": 0.640625,
"rewards/chosen": -13.986770629882812,
"rewards/margins": 2.2890453338623047,
"rewards/rejected": -16.275815963745117,
"step": 64
},
{
"epoch": 0.14701724625388748,
"grad_norm": 64.78973484996234,
"learning_rate": 7.950007828632691e-07,
"logits/chosen": -1.0072953701019287,
"logits/rejected": -1.0003857612609863,
"logps/chosen": -1.4827585220336914,
"logps/rejected": -1.7651684284210205,
"loss": 4.0168,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -14.82758617401123,
"rewards/margins": 2.8241007328033447,
"rewards/rejected": -17.65168571472168,
"step": 65
},
{
"epoch": 0.14927905004240882,
"grad_norm": 48.16068590765694,
"learning_rate": 7.944895427376465e-07,
"logits/chosen": -1.0329233407974243,
"logits/rejected": -1.0038236379623413,
"logps/chosen": -1.4716813564300537,
"logps/rejected": -1.7506184577941895,
"loss": 4.0156,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -14.716813087463379,
"rewards/margins": 2.789372682571411,
"rewards/rejected": -17.506183624267578,
"step": 66
},
{
"epoch": 0.15154085383093016,
"grad_norm": 52.28364686992911,
"learning_rate": 7.939535994833345e-07,
"logits/chosen": -0.9820632338523865,
"logits/rejected": -0.9789884686470032,
"logps/chosen": -1.3938853740692139,
"logps/rejected": -1.6392000913619995,
"loss": 4.1279,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -13.938854217529297,
"rewards/margins": 2.4531469345092773,
"rewards/rejected": -16.392000198364258,
"step": 67
},
{
"epoch": 0.1538026576194515,
"grad_norm": 59.96935116096323,
"learning_rate": 7.933929866613628e-07,
"logits/chosen": -1.002638339996338,
"logits/rejected": -1.0027940273284912,
"logps/chosen": -1.450176477432251,
"logps/rejected": -1.5936009883880615,
"loss": 4.3626,
"rewards/accuracies": 0.671875,
"rewards/chosen": -14.501765251159668,
"rewards/margins": 1.4342446327209473,
"rewards/rejected": -15.936010360717773,
"step": 68
},
{
"epoch": 0.15606446140797287,
"grad_norm": 69.75195635586249,
"learning_rate": 7.928077393775808e-07,
"logits/chosen": -0.9776244163513184,
"logits/rejected": -0.9800903797149658,
"logps/chosen": -1.4521443843841553,
"logps/rejected": -1.7846593856811523,
"loss": 3.586,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -14.521443367004395,
"rewards/margins": 3.3251500129699707,
"rewards/rejected": -17.846593856811523,
"step": 69
},
{
"epoch": 0.1583262651964942,
"grad_norm": 77.74229614088146,
"learning_rate": 7.921978942804609e-07,
"logits/chosen": -0.9629479050636292,
"logits/rejected": -0.9628820419311523,
"logps/chosen": -1.4668632745742798,
"logps/rejected": -1.7327969074249268,
"loss": 3.7939,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -14.668632507324219,
"rewards/margins": 2.659337282180786,
"rewards/rejected": -17.32796859741211,
"step": 70
},
{
"epoch": 0.16058806898501554,
"grad_norm": 73.23733622048438,
"learning_rate": 7.915634895588021e-07,
"logits/chosen": -1.003068447113037,
"logits/rejected": -0.9916574954986572,
"logps/chosen": -1.5601112842559814,
"logps/rejected": -1.733612060546875,
"loss": 4.3674,
"rewards/accuracies": 0.65625,
"rewards/chosen": -15.601114273071289,
"rewards/margins": 1.735007405281067,
"rewards/rejected": -17.33612060546875,
"step": 71
},
{
"epoch": 0.1628498727735369,
"grad_norm": 75.55585019277092,
"learning_rate": 7.909045649393394e-07,
"logits/chosen": -1.040993332862854,
"logits/rejected": -1.0444133281707764,
"logps/chosen": -1.4949562549591064,
"logps/rejected": -1.6046648025512695,
"loss": 4.5456,
"rewards/accuracies": 0.640625,
"rewards/chosen": -14.949562072753906,
"rewards/margins": 1.0970847606658936,
"rewards/rejected": -16.046646118164062,
"step": 72
},
{
"epoch": 0.16511167656205825,
"grad_norm": 65.91136959416728,
"learning_rate": 7.902211616842556e-07,
"logits/chosen": -1.0223642587661743,
"logits/rejected": -1.014294147491455,
"logps/chosen": -1.514482021331787,
"logps/rejected": -1.7560579776763916,
"loss": 4.1924,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -15.144821166992188,
"rewards/margins": 2.4157588481903076,
"rewards/rejected": -17.56058120727539,
"step": 73
},
{
"epoch": 0.16737348035057958,
"grad_norm": 68.14881598964276,
"learning_rate": 7.89513322588598e-07,
"logits/chosen": -1.006181001663208,
"logits/rejected": -0.9941987991333008,
"logps/chosen": -1.5036894083023071,
"logps/rejected": -1.683767318725586,
"loss": 4.0427,
"rewards/accuracies": 0.65625,
"rewards/chosen": -15.036893844604492,
"rewards/margins": 1.800781011581421,
"rewards/rejected": -16.83767318725586,
"step": 74
},
{
"epoch": 0.16963528413910092,
"grad_norm": 54.7050909412455,
"learning_rate": 7.887810919775976e-07,
"logits/chosen": -0.9531492590904236,
"logits/rejected": -0.939092755317688,
"logps/chosen": -1.5378425121307373,
"logps/rejected": -1.7426013946533203,
"loss": 4.1258,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -15.378423690795898,
"rewards/margins": 2.0475881099700928,
"rewards/rejected": -17.42601203918457,
"step": 75
},
{
"epoch": 0.1718970879276223,
"grad_norm": 43.16218753621578,
"learning_rate": 7.880245157038949e-07,
"logits/chosen": -1.0244200229644775,
"logits/rejected": -0.9850828647613525,
"logps/chosen": -1.5913515090942383,
"logps/rejected": -1.7808334827423096,
"loss": 4.1766,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -15.913515090942383,
"rewards/margins": 1.8948214054107666,
"rewards/rejected": -17.80833625793457,
"step": 76
},
{
"epoch": 0.17415889171614363,
"grad_norm": 68.24029437702886,
"learning_rate": 7.872436411446671e-07,
"logits/chosen": -1.0308367013931274,
"logits/rejected": -1.053020715713501,
"logps/chosen": -1.578061819076538,
"logps/rejected": -1.7366865873336792,
"loss": 4.4062,
"rewards/accuracies": 0.609375,
"rewards/chosen": -15.780617713928223,
"rewards/margins": 1.5862494707107544,
"rewards/rejected": -17.366867065429688,
"step": 77
},
{
"epoch": 0.17642069550466496,
"grad_norm": 47.250623759092406,
"learning_rate": 7.86438517198662e-07,
"logits/chosen": -0.981247067451477,
"logits/rejected": -0.9654079675674438,
"logps/chosen": -1.4891939163208008,
"logps/rejected": -1.692526936531067,
"loss": 4.1759,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -14.89194107055664,
"rewards/margins": 2.0333304405212402,
"rewards/rejected": -16.925270080566406,
"step": 78
},
{
"epoch": 0.1786824992931863,
"grad_norm": 43.21600729867601,
"learning_rate": 7.856091942831366e-07,
"logits/chosen": -0.9647431373596191,
"logits/rejected": -0.9658678770065308,
"logps/chosen": -1.4802823066711426,
"logps/rejected": -1.6921435594558716,
"loss": 4.3292,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -14.802824020385742,
"rewards/margins": 2.1186134815216064,
"rewards/rejected": -16.921438217163086,
"step": 79
},
{
"epoch": 0.18094430308170767,
"grad_norm": 60.91821720218818,
"learning_rate": 7.847557243306982e-07,
"logits/chosen": -1.0512564182281494,
"logits/rejected": -1.0319970846176147,
"logps/chosen": -1.5609538555145264,
"logps/rejected": -1.7701412439346313,
"loss": 4.0368,
"rewards/accuracies": 0.671875,
"rewards/chosen": -15.609538078308105,
"rewards/margins": 2.0918755531311035,
"rewards/rejected": -17.701412200927734,
"step": 80
},
{
"epoch": 0.183206106870229,
"grad_norm": 63.54121135584775,
"learning_rate": 7.838781607860541e-07,
"logits/chosen": -1.0236457586288452,
"logits/rejected": -1.0056768655776978,
"logps/chosen": -1.5709818601608276,
"logps/rejected": -1.7774336338043213,
"loss": 3.6937,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -15.709818840026855,
"rewards/margins": 2.0645182132720947,
"rewards/rejected": -17.774335861206055,
"step": 81
},
{
"epoch": 0.18546791065875035,
"grad_norm": 52.43422564611808,
"learning_rate": 7.82976558602664e-07,
"logits/chosen": -1.0549571514129639,
"logits/rejected": -1.0641645193099976,
"logps/chosen": -1.5252659320831299,
"logps/rejected": -1.6951864957809448,
"loss": 4.1141,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -15.252659797668457,
"rewards/margins": 1.6992065906524658,
"rewards/rejected": -16.951866149902344,
"step": 82
},
{
"epoch": 0.1877297144472717,
"grad_norm": 71.24127939099962,
"learning_rate": 7.820509742392988e-07,
"logits/chosen": -1.0248099565505981,
"logits/rejected": -1.006967306137085,
"logps/chosen": -1.6189182996749878,
"logps/rejected": -1.7908198833465576,
"loss": 4.1344,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -16.18918228149414,
"rewards/margins": 1.7190148830413818,
"rewards/rejected": -17.9081974029541,
"step": 83
},
{
"epoch": 0.18999151823579305,
"grad_norm": 101.94936409837923,
"learning_rate": 7.811014656565054e-07,
"logits/chosen": -1.0417755842208862,
"logits/rejected": -1.0172818899154663,
"logps/chosen": -1.5213223695755005,
"logps/rejected": -1.8359572887420654,
"loss": 3.5607,
"rewards/accuracies": 0.6875,
"rewards/chosen": -15.213224411010742,
"rewards/margins": 3.1463489532470703,
"rewards/rejected": -18.359575271606445,
"step": 84
},
{
"epoch": 0.1922533220243144,
"grad_norm": 55.87779805654747,
"learning_rate": 7.801280923129773e-07,
"logits/chosen": -1.0234826803207397,
"logits/rejected": -1.0111762285232544,
"logps/chosen": -1.5780723094940186,
"logps/rejected": -1.7445272207260132,
"loss": 4.4554,
"rewards/accuracies": 0.609375,
"rewards/chosen": -15.780723571777344,
"rewards/margins": 1.6645516157150269,
"rewards/rejected": -17.44527244567871,
"step": 85
},
{
"epoch": 0.19451512581283573,
"grad_norm": 62.73905684172959,
"learning_rate": 7.791309151618305e-07,
"logits/chosen": -1.0294363498687744,
"logits/rejected": -1.026973843574524,
"logps/chosen": -1.68962824344635,
"logps/rejected": -1.888668417930603,
"loss": 4.1944,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -16.89628028869629,
"rewards/margins": 1.9904035329818726,
"rewards/rejected": -18.88668441772461,
"step": 86
},
{
"epoch": 0.1967769296013571,
"grad_norm": 50.00488780508724,
"learning_rate": 7.781099966467874e-07,
"logits/chosen": -1.05634343624115,
"logits/rejected": -1.0430347919464111,
"logps/chosen": -1.5399338006973267,
"logps/rejected": -1.6823569536209106,
"loss": 3.99,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -15.399337768554688,
"rewards/margins": 1.4242321252822876,
"rewards/rejected": -16.82356834411621,
"step": 87
},
{
"epoch": 0.19903873338987843,
"grad_norm": 76.23850174230617,
"learning_rate": 7.770654006982664e-07,
"logits/chosen": -1.033555269241333,
"logits/rejected": -0.9970846176147461,
"logps/chosen": -1.7175520658493042,
"logps/rejected": -1.9172134399414062,
"loss": 4.364,
"rewards/accuracies": 0.671875,
"rewards/chosen": -17.175518035888672,
"rewards/margins": 1.9966144561767578,
"rewards/rejected": -19.172136306762695,
"step": 88
},
{
"epoch": 0.20130053717839977,
"grad_norm": 60.90592219891859,
"learning_rate": 7.759971927293781e-07,
"logits/chosen": -1.0577505826950073,
"logits/rejected": -1.0415252447128296,
"logps/chosen": -1.566547155380249,
"logps/rejected": -1.7689849138259888,
"loss": 3.9552,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -15.665471076965332,
"rewards/margins": 2.0243773460388184,
"rewards/rejected": -17.689849853515625,
"step": 89
},
{
"epoch": 0.2035623409669211,
"grad_norm": 58.25113147294504,
"learning_rate": 7.749054396318297e-07,
"logits/chosen": -1.0256553888320923,
"logits/rejected": -1.0106987953186035,
"logps/chosen": -1.6812288761138916,
"logps/rejected": -1.8377100229263306,
"loss": 4.4271,
"rewards/accuracies": 0.625,
"rewards/chosen": -16.81229019165039,
"rewards/margins": 1.564809799194336,
"rewards/rejected": -18.377098083496094,
"step": 90
},
{
"epoch": 0.20582414475544247,
"grad_norm": 77.85759380281476,
"learning_rate": 7.737902097717356e-07,
"logits/chosen": -0.990077793598175,
"logits/rejected": -0.9915316700935364,
"logps/chosen": -1.572546124458313,
"logps/rejected": -1.8305258750915527,
"loss": 4.1888,
"rewards/accuracies": 0.59375,
"rewards/chosen": -15.725460052490234,
"rewards/margins": 2.5797994136810303,
"rewards/rejected": -18.305259704589844,
"step": 91
},
{
"epoch": 0.2080859485439638,
"grad_norm": 63.82605757099174,
"learning_rate": 7.726515729853367e-07,
"logits/chosen": -0.989700198173523,
"logits/rejected": -0.9903428554534912,
"logps/chosen": -1.5940182209014893,
"logps/rejected": -1.7212352752685547,
"loss": 4.5859,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -15.940183639526367,
"rewards/margins": 1.2721664905548096,
"rewards/rejected": -17.21234893798828,
"step": 92
},
{
"epoch": 0.21034775233248515,
"grad_norm": 65.72932573481259,
"learning_rate": 7.714896005746272e-07,
"logits/chosen": -1.024734616279602,
"logits/rejected": -1.0021594762802124,
"logps/chosen": -1.6271653175354004,
"logps/rejected": -1.9118987321853638,
"loss": 3.5493,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -16.271652221679688,
"rewards/margins": 2.8473360538482666,
"rewards/rejected": -19.118988037109375,
"step": 93
},
{
"epoch": 0.21260955612100652,
"grad_norm": 82.97110656932769,
"learning_rate": 7.703043653028896e-07,
"logits/chosen": -1.052355170249939,
"logits/rejected": -1.0488598346710205,
"logps/chosen": -1.840329647064209,
"logps/rejected": -2.031944990158081,
"loss": 4.325,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -18.403295516967773,
"rewards/margins": 1.9161533117294312,
"rewards/rejected": -20.31945037841797,
"step": 94
},
{
"epoch": 0.21487135990952785,
"grad_norm": 101.34165470117001,
"learning_rate": 7.690959413901379e-07,
"logits/chosen": -1.0169719457626343,
"logits/rejected": -0.9872531890869141,
"logps/chosen": -1.688145399093628,
"logps/rejected": -1.886854648590088,
"loss": 4.2579,
"rewards/accuracies": 0.703125,
"rewards/chosen": -16.881454467773438,
"rewards/margins": 1.9870920181274414,
"rewards/rejected": -18.868545532226562,
"step": 95
},
{
"epoch": 0.2171331636980492,
"grad_norm": 61.64172871920425,
"learning_rate": 7.678644045084704e-07,
"logits/chosen": -0.9576135873794556,
"logits/rejected": -0.9784174561500549,
"logps/chosen": -1.5893274545669556,
"logps/rejected": -1.8206167221069336,
"loss": 4.2059,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -15.89327621459961,
"rewards/margins": 2.312891721725464,
"rewards/rejected": -18.206167221069336,
"step": 96
},
{
"epoch": 0.21939496748657053,
"grad_norm": 57.351716171177195,
"learning_rate": 7.666098317773308e-07,
"logits/chosen": -1.0138105154037476,
"logits/rejected": -1.0110279321670532,
"logps/chosen": -1.73637855052948,
"logps/rejected": -1.9551489353179932,
"loss": 3.7635,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -17.363784790039062,
"rewards/margins": 2.187703847885132,
"rewards/rejected": -19.551488876342773,
"step": 97
},
{
"epoch": 0.2216567712750919,
"grad_norm": 55.90174304015073,
"learning_rate": 7.653323017586789e-07,
"logits/chosen": -1.036927342414856,
"logits/rejected": -1.022722840309143,
"logps/chosen": -1.5883724689483643,
"logps/rejected": -1.7606651782989502,
"loss": 4.022,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -15.8837251663208,
"rewards/margins": 1.7229257822036743,
"rewards/rejected": -17.60664939880371,
"step": 98
},
{
"epoch": 0.22391857506361323,
"grad_norm": 66.1645588014719,
"learning_rate": 7.640318944520711e-07,
"logits/chosen": -1.0245897769927979,
"logits/rejected": -1.0293288230895996,
"logps/chosen": -1.7076942920684814,
"logps/rejected": -1.9161455631256104,
"loss": 4.0482,
"rewards/accuracies": 0.6875,
"rewards/chosen": -17.07694435119629,
"rewards/margins": 2.0845108032226562,
"rewards/rejected": -19.161455154418945,
"step": 99
},
{
"epoch": 0.22618037885213457,
"grad_norm": 61.755723223908106,
"learning_rate": 7.627086912896511e-07,
"logits/chosen": -0.9213237762451172,
"logits/rejected": -0.9413522481918335,
"logps/chosen": -1.6155421733856201,
"logps/rejected": -1.8583589792251587,
"loss": 3.6863,
"rewards/accuracies": 0.6875,
"rewards/chosen": -16.15542221069336,
"rewards/margins": 2.4281704425811768,
"rewards/rejected": -18.583589553833008,
"step": 100
},
{
"epoch": 0.2284421826406559,
"grad_norm": 52.658945282281614,
"learning_rate": 7.613627751310499e-07,
"logits/chosen": -1.0586283206939697,
"logits/rejected": -1.0511394739151,
"logps/chosen": -1.6844897270202637,
"logps/rejected": -1.8975579738616943,
"loss": 3.6614,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -16.84489631652832,
"rewards/margins": 2.1306824684143066,
"rewards/rejected": -18.97557830810547,
"step": 101
},
{
"epoch": 0.23070398642917728,
"grad_norm": 63.56535705000215,
"learning_rate": 7.599942302581977e-07,
"logits/chosen": -1.026753544807434,
"logits/rejected": -1.029058575630188,
"logps/chosen": -1.7198643684387207,
"logps/rejected": -2.0452115535736084,
"loss": 3.5109,
"rewards/accuracies": 0.71875,
"rewards/chosen": -17.198644638061523,
"rewards/margins": 3.253472328186035,
"rewards/rejected": -20.452116012573242,
"step": 102
},
{
"epoch": 0.23296579021769862,
"grad_norm": 74.0515901650371,
"learning_rate": 7.586031423700457e-07,
"logits/chosen": -1.0166749954223633,
"logits/rejected": -1.0148067474365234,
"logps/chosen": -1.6628804206848145,
"logps/rejected": -1.8891874551773071,
"loss": 3.9207,
"rewards/accuracies": 0.671875,
"rewards/chosen": -16.62880516052246,
"rewards/margins": 2.263070821762085,
"rewards/rejected": -18.891874313354492,
"step": 103
},
{
"epoch": 0.23522759400621995,
"grad_norm": 75.20532489401907,
"learning_rate": 7.571895985772e-07,
"logits/chosen": -0.9868625402450562,
"logits/rejected": -0.9866358041763306,
"logps/chosen": -1.6601110696792603,
"logps/rejected": -1.9702361822128296,
"loss": 3.5455,
"rewards/accuracies": 0.71875,
"rewards/chosen": -16.601110458374023,
"rewards/margins": 3.1012516021728516,
"rewards/rejected": -19.702362060546875,
"step": 104
},
{
"epoch": 0.23748939779474132,
"grad_norm": 71.44625464382334,
"learning_rate": 7.557536873964661e-07,
"logits/chosen": -1.0506460666656494,
"logits/rejected": -1.0464547872543335,
"logps/chosen": -1.9673895835876465,
"logps/rejected": -2.150362491607666,
"loss": 4.3648,
"rewards/accuracies": 0.671875,
"rewards/chosen": -19.67389678955078,
"rewards/margins": 1.829728603363037,
"rewards/rejected": -21.503625869750977,
"step": 105
},
{
"epoch": 0.23975120158326266,
"grad_norm": 64.29011593779744,
"learning_rate": 7.542954987453069e-07,
"logits/chosen": -1.01687753200531,
"logits/rejected": -1.0152884721755981,
"logps/chosen": -1.7988362312316895,
"logps/rejected": -2.022243022918701,
"loss": 3.6928,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -17.98836326599121,
"rewards/margins": 2.2340688705444336,
"rewards/rejected": -20.222431182861328,
"step": 106
},
{
"epoch": 0.242013005371784,
"grad_norm": 71.33174561105508,
"learning_rate": 7.528151239362108e-07,
"logits/chosen": -1.035961627960205,
"logits/rejected": -1.0357584953308105,
"logps/chosen": -1.9293386936187744,
"logps/rejected": -2.1782901287078857,
"loss": 3.773,
"rewards/accuracies": 0.65625,
"rewards/chosen": -19.29338836669922,
"rewards/margins": 2.4895126819610596,
"rewards/rejected": -21.782901763916016,
"step": 107
},
{
"epoch": 0.24427480916030533,
"grad_norm": 80.81275558049109,
"learning_rate": 7.513126556709748e-07,
"logits/chosen": -1.0150071382522583,
"logits/rejected": -0.9945900440216064,
"logps/chosen": -1.8521547317504883,
"logps/rejected": -2.231255054473877,
"loss": 3.4008,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -18.521547317504883,
"rewards/margins": 3.7910025119781494,
"rewards/rejected": -22.312549591064453,
"step": 108
},
{
"epoch": 0.2465366129488267,
"grad_norm": 83.79674539481208,
"learning_rate": 7.497881880348984e-07,
"logits/chosen": -0.9941821098327637,
"logits/rejected": -0.9765617847442627,
"logps/chosen": -1.8816341161727905,
"logps/rejected": -2.1459474563598633,
"loss": 3.7964,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -18.816341400146484,
"rewards/margins": 2.6431331634521484,
"rewards/rejected": -21.459474563598633,
"step": 109
},
{
"epoch": 0.24879841673734804,
"grad_norm": 94.74147514463534,
"learning_rate": 7.482418164908931e-07,
"logits/chosen": -1.0146708488464355,
"logits/rejected": -1.0124180316925049,
"logps/chosen": -1.972477912902832,
"logps/rejected": -2.1618151664733887,
"loss": 4.1515,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -19.724777221679688,
"rewards/margins": 1.8933711051940918,
"rewards/rejected": -21.618152618408203,
"step": 110
},
{
"epoch": 0.2510602205258694,
"grad_norm": 75.01460898156064,
"learning_rate": 7.466736378735035e-07,
"logits/chosen": -0.9929904937744141,
"logits/rejected": -1.0005165338516235,
"logps/chosen": -1.9998325109481812,
"logps/rejected": -2.2924444675445557,
"loss": 3.5312,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -19.99832534790039,
"rewards/margins": 2.926118850708008,
"rewards/rejected": -22.92444610595703,
"step": 111
},
{
"epoch": 0.2533220243143907,
"grad_norm": 78.18245291370165,
"learning_rate": 7.450837503828439e-07,
"logits/chosen": -0.9820634126663208,
"logits/rejected": -0.9711620211601257,
"logps/chosen": -2.0978574752807617,
"logps/rejected": -2.4662513732910156,
"loss": 3.5141,
"rewards/accuracies": 0.703125,
"rewards/chosen": -20.978572845458984,
"rewards/margins": 3.6839394569396973,
"rewards/rejected": -24.662513732910156,
"step": 112
},
{
"epoch": 0.2555838281029121,
"grad_norm": 67.13686530078938,
"learning_rate": 7.43472253578449e-07,
"logits/chosen": -0.9848591089248657,
"logits/rejected": -0.9883791208267212,
"logps/chosen": -1.8714666366577148,
"logps/rejected": -2.1671502590179443,
"loss": 3.6048,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -18.714664459228516,
"rewards/margins": 2.9568369388580322,
"rewards/rejected": -21.6715030670166,
"step": 113
},
{
"epoch": 0.2578456318914334,
"grad_norm": 73.56377763038063,
"learning_rate": 7.418392483730389e-07,
"logits/chosen": -0.9991137981414795,
"logits/rejected": -1.0059112310409546,
"logps/chosen": -2.052945137023926,
"logps/rejected": -2.341433048248291,
"loss": 3.8262,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -20.529449462890625,
"rewards/margins": 2.8848838806152344,
"rewards/rejected": -23.41433334350586,
"step": 114
},
{
"epoch": 0.26010743567995476,
"grad_norm": 59.89075032149932,
"learning_rate": 7.401848370262012e-07,
"logits/chosen": -1.0506592988967896,
"logits/rejected": -1.034238338470459,
"logps/chosen": -2.0077497959136963,
"logps/rejected": -2.254788875579834,
"loss": 3.6761,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -20.07750129699707,
"rewards/margins": 2.470388174057007,
"rewards/rejected": -22.547889709472656,
"step": 115
},
{
"epoch": 0.2623692394684761,
"grad_norm": 83.49021187119062,
"learning_rate": 7.385091231379856e-07,
"logits/chosen": -1.0015934705734253,
"logits/rejected": -1.0052032470703125,
"logps/chosen": -2.1289501190185547,
"logps/rejected": -2.431809663772583,
"loss": 3.8092,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -21.28950309753418,
"rewards/margins": 3.0285940170288086,
"rewards/rejected": -24.318098068237305,
"step": 116
},
{
"epoch": 0.26463104325699743,
"grad_norm": 80.00525356848313,
"learning_rate": 7.368122116424182e-07,
"logits/chosen": -0.9749897718429565,
"logits/rejected": -0.976076602935791,
"logps/chosen": -2.118309736251831,
"logps/rejected": -2.3720223903656006,
"loss": 4.0039,
"rewards/accuracies": 0.671875,
"rewards/chosen": -21.18309783935547,
"rewards/margins": 2.5371272563934326,
"rewards/rejected": -23.720224380493164,
"step": 117
},
{
"epoch": 0.2668928470455188,
"grad_norm": 85.51204116705796,
"learning_rate": 7.350942088009289e-07,
"logits/chosen": -1.0251755714416504,
"logits/rejected": -1.028438687324524,
"logps/chosen": -2.1024088859558105,
"logps/rejected": -2.394990921020508,
"loss": 3.3893,
"rewards/accuracies": 0.78125,
"rewards/chosen": -21.024085998535156,
"rewards/margins": 2.9258241653442383,
"rewards/rejected": -23.94991111755371,
"step": 118
},
{
"epoch": 0.26915465083404017,
"grad_norm": 93.8888274811386,
"learning_rate": 7.333552221956986e-07,
"logits/chosen": -1.137995958328247,
"logits/rejected": -1.1309608221054077,
"logps/chosen": -2.220576524734497,
"logps/rejected": -2.5518410205841064,
"loss": 3.7665,
"rewards/accuracies": 0.6875,
"rewards/chosen": -22.205766677856445,
"rewards/margins": 3.3126463890075684,
"rewards/rejected": -25.51841163635254,
"step": 119
},
{
"epoch": 0.2714164546225615,
"grad_norm": 82.4376030791516,
"learning_rate": 7.315953607229217e-07,
"logits/chosen": -1.0302393436431885,
"logits/rejected": -1.0388896465301514,
"logps/chosen": -2.299226760864258,
"logps/rejected": -2.6036436557769775,
"loss": 3.6506,
"rewards/accuracies": 0.703125,
"rewards/chosen": -22.992267608642578,
"rewards/margins": 3.044168710708618,
"rewards/rejected": -26.03643798828125,
"step": 120
},
{
"epoch": 0.27367825841108284,
"grad_norm": 75.10951235795113,
"learning_rate": 7.298147345859869e-07,
"logits/chosen": -1.0442880392074585,
"logits/rejected": -1.0543183088302612,
"logps/chosen": -2.143893003463745,
"logps/rejected": -2.4186229705810547,
"loss": 3.6677,
"rewards/accuracies": 0.75,
"rewards/chosen": -21.438934326171875,
"rewards/margins": 2.747298240661621,
"rewards/rejected": -24.186229705810547,
"step": 121
},
{
"epoch": 0.2759400621996042,
"grad_norm": 90.5899791792264,
"learning_rate": 7.280134552885762e-07,
"logits/chosen": -1.0085413455963135,
"logits/rejected": -0.9950270056724548,
"logps/chosen": -2.2459306716918945,
"logps/rejected": -2.5612874031066895,
"loss": 3.674,
"rewards/accuracies": 0.71875,
"rewards/chosen": -22.459308624267578,
"rewards/margins": 3.153568744659424,
"rewards/rejected": -25.612876892089844,
"step": 122
},
{
"epoch": 0.2782018659881255,
"grad_norm": 64.12127061593394,
"learning_rate": 7.261916356276831e-07,
"logits/chosen": -1.0203675031661987,
"logits/rejected": -1.0163829326629639,
"logps/chosen": -2.314136028289795,
"logps/rejected": -2.7963342666625977,
"loss": 2.9003,
"rewards/accuracies": 0.796875,
"rewards/chosen": -23.141361236572266,
"rewards/margins": 4.821981906890869,
"rewards/rejected": -27.963342666625977,
"step": 123
},
{
"epoch": 0.2804636697766469,
"grad_norm": 67.5121542894354,
"learning_rate": 7.243493896865486e-07,
"logits/chosen": -1.0232429504394531,
"logits/rejected": -1.0271434783935547,
"logps/chosen": -2.094038486480713,
"logps/rejected": -2.34799861907959,
"loss": 3.6745,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -20.940383911132812,
"rewards/margins": 2.5396037101745605,
"rewards/rejected": -23.47998809814453,
"step": 124
},
{
"epoch": 0.2827254735651682,
"grad_norm": 94.88612662427548,
"learning_rate": 7.224868328275169e-07,
"logits/chosen": -1.0053532123565674,
"logits/rejected": -1.00931715965271,
"logps/chosen": -2.22296142578125,
"logps/rejected": -2.5190277099609375,
"loss": 3.8801,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -22.229612350463867,
"rewards/margins": 2.960665464401245,
"rewards/rejected": -25.190279006958008,
"step": 125
},
{
"epoch": 0.28498727735368956,
"grad_norm": 155.20517774124164,
"learning_rate": 7.206040816848126e-07,
"logits/chosen": -1.0197092294692993,
"logits/rejected": -1.013099193572998,
"logps/chosen": -2.4244911670684814,
"logps/rejected": -2.6366748809814453,
"loss": 4.3686,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -24.244909286499023,
"rewards/margins": 2.1218347549438477,
"rewards/rejected": -26.36674690246582,
"step": 126
},
{
"epoch": 0.2872490811422109,
"grad_norm": 84.56335295481283,
"learning_rate": 7.187012541572356e-07,
"logits/chosen": -1.1143990755081177,
"logits/rejected": -1.1029709577560425,
"logps/chosen": -2.4256863594055176,
"logps/rejected": -2.823179244995117,
"loss": 3.2922,
"rewards/accuracies": 0.71875,
"rewards/chosen": -24.25686264038086,
"rewards/margins": 3.9749279022216797,
"rewards/rejected": -28.23179054260254,
"step": 127
},
{
"epoch": 0.28951088493073224,
"grad_norm": 87.76435547733897,
"learning_rate": 7.167784694007791e-07,
"logits/chosen": -1.0146347284317017,
"logits/rejected": -1.0320950746536255,
"logps/chosen": -2.34283447265625,
"logps/rejected": -2.685307502746582,
"loss": 3.5692,
"rewards/accuracies": 0.71875,
"rewards/chosen": -23.4283447265625,
"rewards/margins": 3.424734115600586,
"rewards/rejected": -26.85307502746582,
"step": 128
},
{
"epoch": 0.2917726887192536,
"grad_norm": 74.4299989128208,
"learning_rate": 7.148358478211682e-07,
"logits/chosen": -1.0819629430770874,
"logits/rejected": -1.0712792873382568,
"logps/chosen": -2.465446949005127,
"logps/rejected": -2.8680379390716553,
"loss": 3.0557,
"rewards/accuracies": 0.78125,
"rewards/chosen": -24.65446662902832,
"rewards/margins": 4.025911808013916,
"rewards/rejected": -28.68037986755371,
"step": 129
},
{
"epoch": 0.29403449250777497,
"grad_norm": 68.46333390817632,
"learning_rate": 7.128735110663187e-07,
"logits/chosen": -1.0613350868225098,
"logits/rejected": -1.029494285583496,
"logps/chosen": -2.2105801105499268,
"logps/rejected": -2.5904829502105713,
"loss": 3.3787,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -22.105798721313477,
"rewards/margins": 3.7990307807922363,
"rewards/rejected": -25.904829025268555,
"step": 130
},
{
"epoch": 0.2962962962962963,
"grad_norm": 98.52621039177454,
"learning_rate": 7.108915820187211e-07,
"logits/chosen": -1.0233975648880005,
"logits/rejected": -1.0252894163131714,
"logps/chosen": -2.538795232772827,
"logps/rejected": -2.9489595890045166,
"loss": 3.5147,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -25.38795280456543,
"rewards/margins": 4.10164213180542,
"rewards/rejected": -29.489593505859375,
"step": 131
},
{
"epoch": 0.29855810008481765,
"grad_norm": 100.09208083182116,
"learning_rate": 7.088901847877447e-07,
"logits/chosen": -0.9954769611358643,
"logits/rejected": -1.0030215978622437,
"logps/chosen": -2.4815452098846436,
"logps/rejected": -2.736799955368042,
"loss": 4.4467,
"rewards/accuracies": 0.71875,
"rewards/chosen": -24.815452575683594,
"rewards/margins": 2.55254864692688,
"rewards/rejected": -27.367998123168945,
"step": 132
},
{
"epoch": 0.300819903873339,
"grad_norm": 84.50154084317974,
"learning_rate": 7.068694447018658e-07,
"logits/chosen": -1.0301378965377808,
"logits/rejected": -1.0450048446655273,
"logps/chosen": -2.5099587440490723,
"logps/rejected": -2.9627416133880615,
"loss": 3.1551,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -25.09958839416504,
"rewards/margins": 4.527829170227051,
"rewards/rejected": -29.627418518066406,
"step": 133
},
{
"epoch": 0.3030817076618603,
"grad_norm": 101.0196603183953,
"learning_rate": 7.048294883008199e-07,
"logits/chosen": -1.0414271354675293,
"logits/rejected": -1.0482277870178223,
"logps/chosen": -2.413658857345581,
"logps/rejected": -2.7206525802612305,
"loss": 3.5561,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -24.13658905029297,
"rewards/margins": 3.0699357986450195,
"rewards/rejected": -27.206523895263672,
"step": 134
},
{
"epoch": 0.3053435114503817,
"grad_norm": 95.0848682674735,
"learning_rate": 7.027704433276776e-07,
"logits/chosen": -0.9876019954681396,
"logits/rejected": -0.9939224123954773,
"logps/chosen": -2.4932754039764404,
"logps/rejected": -2.9078118801116943,
"loss": 3.3544,
"rewards/accuracies": 0.75,
"rewards/chosen": -24.932754516601562,
"rewards/margins": 4.145364761352539,
"rewards/rejected": -29.07811737060547,
"step": 135
},
{
"epoch": 0.307605315238903,
"grad_norm": 105.4740106322944,
"learning_rate": 7.006924387208452e-07,
"logits/chosen": -0.9797443151473999,
"logits/rejected": -0.967667818069458,
"logps/chosen": -2.409311532974243,
"logps/rejected": -2.7358930110931396,
"loss": 3.4948,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -24.093116760253906,
"rewards/margins": 3.265815496444702,
"rewards/rejected": -27.358928680419922,
"step": 136
},
{
"epoch": 0.30986711902742436,
"grad_norm": 76.39737789572504,
"learning_rate": 6.985956046059904e-07,
"logits/chosen": -0.9616111516952515,
"logits/rejected": -0.9658017158508301,
"logps/chosen": -2.336418628692627,
"logps/rejected": -2.7663118839263916,
"loss": 3.4032,
"rewards/accuracies": 0.703125,
"rewards/chosen": -23.364187240600586,
"rewards/margins": 4.298930644989014,
"rewards/rejected": -27.663118362426758,
"step": 137
},
{
"epoch": 0.31212892281594573,
"grad_norm": 65.49308975598159,
"learning_rate": 6.964800722878945e-07,
"logits/chosen": -0.9257555603981018,
"logits/rejected": -0.9293062686920166,
"logps/chosen": -2.520350456237793,
"logps/rejected": -3.011868715286255,
"loss": 2.9539,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -25.20350456237793,
"rewards/margins": 4.915180683135986,
"rewards/rejected": -30.118684768676758,
"step": 138
},
{
"epoch": 0.31439072660446704,
"grad_norm": 70.82118930505452,
"learning_rate": 6.943459742422287e-07,
"logits/chosen": -0.9633641839027405,
"logits/rejected": -0.9400511980056763,
"logps/chosen": -2.5129337310791016,
"logps/rejected": -2.921888589859009,
"loss": 3.524,
"rewards/accuracies": 0.71875,
"rewards/chosen": -25.12933921813965,
"rewards/margins": 4.089549541473389,
"rewards/rejected": -29.218887329101562,
"step": 139
},
{
"epoch": 0.3166525303929884,
"grad_norm": 81.70116714789167,
"learning_rate": 6.921934441072597e-07,
"logits/chosen": -1.03174889087677,
"logits/rejected": -1.023061752319336,
"logps/chosen": -2.756772994995117,
"logps/rejected": -3.09798526763916,
"loss": 3.7751,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -27.567729949951172,
"rewards/margins": 3.412126302719116,
"rewards/rejected": -30.979854583740234,
"step": 140
},
{
"epoch": 0.3189143341815098,
"grad_norm": 116.29276151269757,
"learning_rate": 6.900226166754807e-07,
"logits/chosen": -0.9728091955184937,
"logits/rejected": -0.9851078987121582,
"logps/chosen": -2.8437376022338867,
"logps/rejected": -3.1122090816497803,
"loss": 4.2026,
"rewards/accuracies": 0.6875,
"rewards/chosen": -28.437374114990234,
"rewards/margins": 2.6847147941589355,
"rewards/rejected": -31.122089385986328,
"step": 141
},
{
"epoch": 0.3211761379700311,
"grad_norm": 81.1094387371339,
"learning_rate": 6.8783362788517e-07,
"logits/chosen": -0.9725527763366699,
"logits/rejected": -0.9806532263755798,
"logps/chosen": -2.7617523670196533,
"logps/rejected": -3.1742238998413086,
"loss": 3.9007,
"rewards/accuracies": 0.671875,
"rewards/chosen": -27.617523193359375,
"rewards/margins": 4.124715328216553,
"rewards/rejected": -31.74224090576172,
"step": 142
},
{
"epoch": 0.32343794175855245,
"grad_norm": 83.8384180266702,
"learning_rate": 6.856266148118796e-07,
"logits/chosen": -0.9712091684341431,
"logits/rejected": -0.9775328636169434,
"logps/chosen": -2.507340431213379,
"logps/rejected": -2.992020606994629,
"loss": 3.1458,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -25.07340431213379,
"rewards/margins": 4.846798896789551,
"rewards/rejected": -29.92020606994629,
"step": 143
},
{
"epoch": 0.3256997455470738,
"grad_norm": 95.75252245013684,
"learning_rate": 6.834017156598512e-07,
"logits/chosen": -0.9558267593383789,
"logits/rejected": -0.952300488948822,
"logps/chosen": -2.7634055614471436,
"logps/rejected": -3.159095048904419,
"loss": 3.5389,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -27.634056091308594,
"rewards/margins": 3.9568943977355957,
"rewards/rejected": -31.5909481048584,
"step": 144
},
{
"epoch": 0.3279615493355951,
"grad_norm": 66.20083759659813,
"learning_rate": 6.811590697533607e-07,
"logits/chosen": -1.021683692932129,
"logits/rejected": -1.030705451965332,
"logps/chosen": -2.7327919006347656,
"logps/rejected": -3.0889930725097656,
"loss": 3.5802,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -27.327922821044922,
"rewards/margins": 3.562011241912842,
"rewards/rejected": -30.889930725097656,
"step": 145
},
{
"epoch": 0.3302233531241165,
"grad_norm": 97.21258496539569,
"learning_rate": 6.788988175279951e-07,
"logits/chosen": -0.9693772196769714,
"logits/rejected": -0.9559890031814575,
"logps/chosen": -2.6974761486053467,
"logps/rejected": -3.066542148590088,
"loss": 3.7983,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -26.974760055541992,
"rewards/margins": 3.6906626224517822,
"rewards/rejected": -30.665422439575195,
"step": 146
},
{
"epoch": 0.3324851569126378,
"grad_norm": 109.59653360197825,
"learning_rate": 6.766211005218577e-07,
"logits/chosen": -0.9583339095115662,
"logits/rejected": -0.9542748332023621,
"logps/chosen": -2.6948695182800293,
"logps/rejected": -3.238664150238037,
"loss": 3.0583,
"rewards/accuracies": 0.75,
"rewards/chosen": -26.948694229125977,
"rewards/margins": 5.43794584274292,
"rewards/rejected": -32.38664245605469,
"step": 147
},
{
"epoch": 0.33474696070115917,
"grad_norm": 83.0664156941931,
"learning_rate": 6.743260613667047e-07,
"logits/chosen": -1.0542542934417725,
"logits/rejected": -1.0388411283493042,
"logps/chosen": -2.708282709121704,
"logps/rejected": -3.167202949523926,
"loss": 3.2972,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -27.082826614379883,
"rewards/margins": 4.589202404022217,
"rewards/rejected": -31.672027587890625,
"step": 148
},
{
"epoch": 0.33700876448968053,
"grad_norm": 54.329954302423566,
"learning_rate": 6.720138437790139e-07,
"logits/chosen": -1.0130287408828735,
"logits/rejected": -1.0046091079711914,
"logps/chosen": -2.6365139484405518,
"logps/rejected": -3.088283061981201,
"loss": 3.0963,
"rewards/accuracies": 0.765625,
"rewards/chosen": -26.36513900756836,
"rewards/margins": 4.517690658569336,
"rewards/rejected": -30.882829666137695,
"step": 149
},
{
"epoch": 0.33927056827820185,
"grad_norm": 103.86460592347143,
"learning_rate": 6.696845925509848e-07,
"logits/chosen": -1.0333694219589233,
"logits/rejected": -1.0335674285888672,
"logps/chosen": -2.7684454917907715,
"logps/rejected": -3.083521842956543,
"loss": 3.8223,
"rewards/accuracies": 0.703125,
"rewards/chosen": -27.68445587158203,
"rewards/margins": 3.1507630348205566,
"rewards/rejected": -30.83521842956543,
"step": 150
},
{
"epoch": 0.3415323720667232,
"grad_norm": 93.51507593691238,
"learning_rate": 6.673384535414718e-07,
"logits/chosen": -1.0419607162475586,
"logits/rejected": -1.0317517518997192,
"logps/chosen": -2.868422031402588,
"logps/rejected": -3.1678481101989746,
"loss": 3.9666,
"rewards/accuracies": 0.6875,
"rewards/chosen": -28.684223175048828,
"rewards/margins": 2.9942572116851807,
"rewards/rejected": -31.678478240966797,
"step": 151
},
{
"epoch": 0.3437941758552446,
"grad_norm": 134.42512568099087,
"learning_rate": 6.649755736668511e-07,
"logits/chosen": -0.974323570728302,
"logits/rejected": -0.9719677567481995,
"logps/chosen": -2.5241260528564453,
"logps/rejected": -2.9137468338012695,
"loss": 2.8286,
"rewards/accuracies": 0.78125,
"rewards/chosen": -25.241260528564453,
"rewards/margins": 3.896209716796875,
"rewards/rejected": -29.137470245361328,
"step": 152
},
{
"epoch": 0.3460559796437659,
"grad_norm": 86.3337816462208,
"learning_rate": 6.625961008918192e-07,
"logits/chosen": -0.986197829246521,
"logits/rejected": -0.9864940643310547,
"logps/chosen": -2.601165294647217,
"logps/rejected": -3.0423331260681152,
"loss": 3.0146,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -26.011653900146484,
"rewards/margins": 4.411678314208984,
"rewards/rejected": -30.4233341217041,
"step": 153
},
{
"epoch": 0.34831778343228725,
"grad_norm": 102.48426546292868,
"learning_rate": 6.602001842201289e-07,
"logits/chosen": -0.9800938367843628,
"logits/rejected": -0.9895581603050232,
"logps/chosen": -2.5859339237213135,
"logps/rejected": -2.865299701690674,
"loss": 3.9638,
"rewards/accuracies": 0.6875,
"rewards/chosen": -25.859336853027344,
"rewards/margins": 2.7936594486236572,
"rewards/rejected": -28.652997970581055,
"step": 154
},
{
"epoch": 0.3505795872208086,
"grad_norm": 104.68446919661847,
"learning_rate": 6.577879736852571e-07,
"logits/chosen": -0.9982212781906128,
"logits/rejected": -1.0046367645263672,
"logps/chosen": -2.649374485015869,
"logps/rejected": -2.9364640712738037,
"loss": 3.8496,
"rewards/accuracies": 0.65625,
"rewards/chosen": -26.493745803833008,
"rewards/margins": 2.870893955230713,
"rewards/rejected": -29.364643096923828,
"step": 155
},
{
"epoch": 0.35284139100932993,
"grad_norm": 104.92988420265864,
"learning_rate": 6.553596203410112e-07,
"logits/chosen": -1.0082954168319702,
"logits/rejected": -1.0111855268478394,
"logps/chosen": -2.642954111099243,
"logps/rejected": -3.162132740020752,
"loss": 2.5535,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -26.429540634155273,
"rewards/margins": 5.1917853355407715,
"rewards/rejected": -31.621326446533203,
"step": 156
},
{
"epoch": 0.3551031947978513,
"grad_norm": 76.25729279383924,
"learning_rate": 6.529152762520688e-07,
"logits/chosen": -1.010929822921753,
"logits/rejected": -1.012401819229126,
"logps/chosen": -2.7833664417266846,
"logps/rejected": -3.1698460578918457,
"loss": 3.4851,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -27.83366584777832,
"rewards/margins": 3.8647947311401367,
"rewards/rejected": -31.69845962524414,
"step": 157
},
{
"epoch": 0.3573649985863726,
"grad_norm": 91.64977048048202,
"learning_rate": 6.504550944844558e-07,
"logits/chosen": -0.9399479627609253,
"logits/rejected": -0.9675872325897217,
"logps/chosen": -2.6605355739593506,
"logps/rejected": -3.0566985607147217,
"loss": 3.1926,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -26.60535430908203,
"rewards/margins": 3.961630344390869,
"rewards/rejected": -30.566986083984375,
"step": 158
},
{
"epoch": 0.359626802374894,
"grad_norm": 91.26525865623015,
"learning_rate": 6.479792290959613e-07,
"logits/chosen": -0.9715840816497803,
"logits/rejected": -0.9836963415145874,
"logps/chosen": -2.655123710632324,
"logps/rejected": -3.1809985637664795,
"loss": 3.1864,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -26.551239013671875,
"rewards/margins": 5.258747100830078,
"rewards/rejected": -31.80998420715332,
"step": 159
},
{
"epoch": 0.36188860616341534,
"grad_norm": 77.20624956215188,
"learning_rate": 6.454878351264906e-07,
"logits/chosen": -0.9734132885932922,
"logits/rejected": -0.9633912444114685,
"logps/chosen": -2.620572805404663,
"logps/rejected": -3.0621609687805176,
"loss": 3.3031,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -26.205730438232422,
"rewards/margins": 4.415881156921387,
"rewards/rejected": -30.621610641479492,
"step": 160
},
{
"epoch": 0.36415040995193665,
"grad_norm": 94.30228281712074,
"learning_rate": 6.429810685883565e-07,
"logits/chosen": -1.0244390964508057,
"logits/rejected": -1.0341304540634155,
"logps/chosen": -2.815182685852051,
"logps/rejected": -3.2361655235290527,
"loss": 2.7388,
"rewards/accuracies": 0.796875,
"rewards/chosen": -28.15182876586914,
"rewards/margins": 4.209825038909912,
"rewards/rejected": -32.361656188964844,
"step": 161
},
{
"epoch": 0.366412213740458,
"grad_norm": 116.19130395085337,
"learning_rate": 6.404590864565088e-07,
"logits/chosen": -0.9736944437026978,
"logits/rejected": -0.9617501497268677,
"logps/chosen": -2.7973079681396484,
"logps/rejected": -3.07376766204834,
"loss": 3.8692,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -27.973079681396484,
"rewards/margins": 2.7645981311798096,
"rewards/rejected": -30.73767852783203,
"step": 162
},
{
"epoch": 0.3686740175289794,
"grad_norm": 81.52905794953793,
"learning_rate": 6.379220466587063e-07,
"logits/chosen": -0.9999153017997742,
"logits/rejected": -0.9712975025177002,
"logps/chosen": -2.7478442192077637,
"logps/rejected": -3.222566843032837,
"loss": 3.2696,
"rewards/accuracies": 0.8125,
"rewards/chosen": -27.478445053100586,
"rewards/margins": 4.747226238250732,
"rewards/rejected": -32.225669860839844,
"step": 163
},
{
"epoch": 0.3709358213175007,
"grad_norm": 120.0020137912241,
"learning_rate": 6.353701080656254e-07,
"logits/chosen": -0.980323314666748,
"logits/rejected": -1.0064855813980103,
"logps/chosen": -2.868739604949951,
"logps/rejected": -3.228687286376953,
"loss": 3.4866,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -28.687393188476562,
"rewards/margins": 3.599480390548706,
"rewards/rejected": -32.2868766784668,
"step": 164
},
{
"epoch": 0.37319762510602206,
"grad_norm": 76.2628133649645,
"learning_rate": 6.32803430480913e-07,
"logits/chosen": -0.9936685562133789,
"logits/rejected": -0.998755693435669,
"logps/chosen": -2.7757811546325684,
"logps/rejected": -3.2076919078826904,
"loss": 3.3059,
"rewards/accuracies": 0.765625,
"rewards/chosen": -27.757808685302734,
"rewards/margins": 4.319108963012695,
"rewards/rejected": -32.07691955566406,
"step": 165
},
{
"epoch": 0.3754594288945434,
"grad_norm": 94.14547576362092,
"learning_rate": 6.302221746311782e-07,
"logits/chosen": -0.9984323978424072,
"logits/rejected": -0.9655094146728516,
"logps/chosen": -2.6705729961395264,
"logps/rejected": -3.1013717651367188,
"loss": 3.5603,
"rewards/accuracies": 0.734375,
"rewards/chosen": -26.705730438232422,
"rewards/margins": 4.307986736297607,
"rewards/rejected": -31.013717651367188,
"step": 166
},
{
"epoch": 0.37772123268306473,
"grad_norm": 98.38930298766635,
"learning_rate": 6.276265021559288e-07,
"logits/chosen": -1.0194363594055176,
"logits/rejected": -1.0137174129486084,
"logps/chosen": -2.913635015487671,
"logps/rejected": -3.219327449798584,
"loss": 3.9403,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -29.136350631713867,
"rewards/margins": 3.0569217205047607,
"rewards/rejected": -32.193275451660156,
"step": 167
},
{
"epoch": 0.3799830364715861,
"grad_norm": 80.88897995132001,
"learning_rate": 6.250165755974487e-07,
"logits/chosen": -0.9634656310081482,
"logits/rejected": -0.972878634929657,
"logps/chosen": -2.8597655296325684,
"logps/rejected": -3.2424063682556152,
"loss": 3.4182,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -28.59765625,
"rewards/margins": 3.8264071941375732,
"rewards/rejected": -32.42406463623047,
"step": 168
},
{
"epoch": 0.3822448402601074,
"grad_norm": 76.80218510989977,
"learning_rate": 6.223925583906192e-07,
"logits/chosen": -1.005441665649414,
"logits/rejected": -1.014135718345642,
"logps/chosen": -2.8562896251678467,
"logps/rejected": -3.3525795936584473,
"loss": 3.0492,
"rewards/accuracies": 0.765625,
"rewards/chosen": -28.562894821166992,
"rewards/margins": 4.962900161743164,
"rewards/rejected": -33.525794982910156,
"step": 169
},
{
"epoch": 0.3845066440486288,
"grad_norm": 76.07958470639606,
"learning_rate": 6.19754614852685e-07,
"logits/chosen": -1.013322353363037,
"logits/rejected": -1.016672134399414,
"logps/chosen": -2.7543845176696777,
"logps/rejected": -3.1926443576812744,
"loss": 3.215,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -27.543842315673828,
"rewards/margins": 4.382599830627441,
"rewards/rejected": -31.926441192626953,
"step": 170
},
{
"epoch": 0.38676844783715014,
"grad_norm": 99.45087355333845,
"learning_rate": 6.171029101729644e-07,
"logits/chosen": -0.9391031265258789,
"logits/rejected": -0.9616649746894836,
"logps/chosen": -3.042638063430786,
"logps/rejected": -3.5252275466918945,
"loss": 3.2546,
"rewards/accuracies": 0.703125,
"rewards/chosen": -30.426380157470703,
"rewards/margins": 4.825897693634033,
"rewards/rejected": -35.25227737426758,
"step": 171
},
{
"epoch": 0.38903025162567145,
"grad_norm": 85.11197223778332,
"learning_rate": 6.144376104025055e-07,
"logits/chosen": -1.0180509090423584,
"logits/rejected": -1.0034980773925781,
"logps/chosen": -2.8050010204315186,
"logps/rejected": -3.2974133491516113,
"loss": 2.9998,
"rewards/accuracies": 0.765625,
"rewards/chosen": -28.050006866455078,
"rewards/margins": 4.924124717712402,
"rewards/rejected": -32.9741325378418,
"step": 172
},
{
"epoch": 0.3912920554141928,
"grad_norm": 101.05359206277465,
"learning_rate": 6.117588824436873e-07,
"logits/chosen": -1.0196110010147095,
"logits/rejected": -1.0431230068206787,
"logps/chosen": -2.8156418800354004,
"logps/rejected": -3.218076229095459,
"loss": 3.4264,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -28.15641975402832,
"rewards/margins": 4.024340629577637,
"rewards/rejected": -32.180763244628906,
"step": 173
},
{
"epoch": 0.3935538592027142,
"grad_norm": 103.50825244083374,
"learning_rate": 6.090668940397688e-07,
"logits/chosen": -0.9689127206802368,
"logits/rejected": -0.9795331358909607,
"logps/chosen": -2.831442356109619,
"logps/rejected": -3.2767508029937744,
"loss": 3.475,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -28.314422607421875,
"rewards/margins": 4.4530839920043945,
"rewards/rejected": -32.76750946044922,
"step": 174
},
{
"epoch": 0.3958156629912355,
"grad_norm": 95.78558182335533,
"learning_rate": 6.063618137643844e-07,
"logits/chosen": -0.9743781089782715,
"logits/rejected": -0.9708334803581238,
"logps/chosen": -2.8041203022003174,
"logps/rejected": -3.2117085456848145,
"loss": 3.3677,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -28.041202545166016,
"rewards/margins": 4.075882911682129,
"rewards/rejected": -32.117088317871094,
"step": 175
},
{
"epoch": 0.39807746677975686,
"grad_norm": 97.91758087524954,
"learning_rate": 6.03643811010988e-07,
"logits/chosen": -0.9981604814529419,
"logits/rejected": -1.0190309286117554,
"logps/chosen": -3.009575843811035,
"logps/rejected": -3.4792652130126953,
"loss": 2.9463,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -30.095760345458984,
"rewards/margins": 4.696890830993652,
"rewards/rejected": -34.79264831542969,
"step": 176
},
{
"epoch": 0.4003392705682782,
"grad_norm": 108.50415430037745,
"learning_rate": 6.009130559822453e-07,
"logits/chosen": -1.033525824546814,
"logits/rejected": -1.0412116050720215,
"logps/chosen": -2.977822780609131,
"logps/rejected": -3.300154447555542,
"loss": 3.8532,
"rewards/accuracies": 0.6875,
"rewards/chosen": -29.77822494506836,
"rewards/margins": 3.2233195304870605,
"rewards/rejected": -33.001548767089844,
"step": 177
},
{
"epoch": 0.40260107435679954,
"grad_norm": 111.58568873914501,
"learning_rate": 5.981697196793758e-07,
"logits/chosen": -1.0652039051055908,
"logits/rejected": -1.0723577737808228,
"logps/chosen": -3.18682861328125,
"logps/rejected": -3.5545740127563477,
"loss": 3.2891,
"rewards/accuracies": 0.6875,
"rewards/chosen": -31.868288040161133,
"rewards/margins": 3.6774556636810303,
"rewards/rejected": -35.545745849609375,
"step": 178
},
{
"epoch": 0.4048628781453209,
"grad_norm": 113.22029697753425,
"learning_rate": 5.954139738914446e-07,
"logits/chosen": -1.0194616317749023,
"logits/rejected": -1.0498051643371582,
"logps/chosen": -3.0097670555114746,
"logps/rejected": -3.4376673698425293,
"loss": 3.4262,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -30.09766960144043,
"rewards/margins": 4.279005527496338,
"rewards/rejected": -34.376678466796875,
"step": 179
},
{
"epoch": 0.4071246819338422,
"grad_norm": 88.31224190243825,
"learning_rate": 5.92645991184605e-07,
"logits/chosen": -1.000518560409546,
"logits/rejected": -1.0053133964538574,
"logps/chosen": -3.138739585876465,
"logps/rejected": -3.564058303833008,
"loss": 3.3412,
"rewards/accuracies": 0.75,
"rewards/chosen": -31.38739585876465,
"rewards/margins": 4.2531867027282715,
"rewards/rejected": -35.64058303833008,
"step": 180
},
{
"epoch": 0.4093864857223636,
"grad_norm": 93.89901341102379,
"learning_rate": 5.898659448912917e-07,
"logits/chosen": -0.9921884536743164,
"logits/rejected": -1.0198383331298828,
"logps/chosen": -2.974911689758301,
"logps/rejected": -3.3996355533599854,
"loss": 3.5069,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -29.74911880493164,
"rewards/margins": 4.247236251831055,
"rewards/rejected": -33.99635314941406,
"step": 181
},
{
"epoch": 0.41164828951088495,
"grad_norm": 98.09431340991735,
"learning_rate": 5.870740090993676e-07,
"logits/chosen": -1.0371700525283813,
"logits/rejected": -1.0533473491668701,
"logps/chosen": -3.3046529293060303,
"logps/rejected": -3.7636566162109375,
"loss": 3.2028,
"rewards/accuracies": 0.75,
"rewards/chosen": -33.046531677246094,
"rewards/margins": 4.590036392211914,
"rewards/rejected": -37.636566162109375,
"step": 182
},
{
"epoch": 0.41391009329940626,
"grad_norm": 120.2568476777465,
"learning_rate": 5.842703586412214e-07,
"logits/chosen": -1.0324050188064575,
"logits/rejected": -1.0546314716339111,
"logps/chosen": -3.2569875717163086,
"logps/rejected": -3.61047625541687,
"loss": 3.8317,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -32.56987380981445,
"rewards/margins": 3.534888982772827,
"rewards/rejected": -36.104766845703125,
"step": 183
},
{
"epoch": 0.4161718970879276,
"grad_norm": 85.18463181569176,
"learning_rate": 5.814551690828203e-07,
"logits/chosen": -0.9997261762619019,
"logits/rejected": -1.0231051445007324,
"logps/chosen": -3.0839271545410156,
"logps/rejected": -3.5636889934539795,
"loss": 2.8158,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -30.83926773071289,
"rewards/margins": 4.797619342803955,
"rewards/rejected": -35.63688659667969,
"step": 184
},
{
"epoch": 0.418433700876449,
"grad_norm": 109.35546689042273,
"learning_rate": 5.786286167127155e-07,
"logits/chosen": -1.039783000946045,
"logits/rejected": -1.0381850004196167,
"logps/chosen": -3.022249937057495,
"logps/rejected": -3.558924913406372,
"loss": 3.2005,
"rewards/accuracies": 0.765625,
"rewards/chosen": -30.222497940063477,
"rewards/margins": 5.366751670837402,
"rewards/rejected": -35.58924865722656,
"step": 185
},
{
"epoch": 0.4206955046649703,
"grad_norm": 121.67840212859822,
"learning_rate": 5.757908785310031e-07,
"logits/chosen": -0.9625982046127319,
"logits/rejected": -0.9870297312736511,
"logps/chosen": -2.8242011070251465,
"logps/rejected": -3.244973659515381,
"loss": 3.4056,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -28.242015838623047,
"rewards/margins": 4.207724094390869,
"rewards/rejected": -32.44974136352539,
"step": 186
},
{
"epoch": 0.42295730845349166,
"grad_norm": 83.36428430432693,
"learning_rate": 5.729421322382399e-07,
"logits/chosen": -0.9603241086006165,
"logits/rejected": -0.989672064781189,
"logps/chosen": -2.6210927963256836,
"logps/rejected": -3.126544952392578,
"loss": 2.8853,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -26.21092987060547,
"rewards/margins": 5.0545196533203125,
"rewards/rejected": -31.26544952392578,
"step": 187
},
{
"epoch": 0.42521911224201303,
"grad_norm": 101.06130324143219,
"learning_rate": 5.700825562243163e-07,
"logits/chosen": -0.9474819302558899,
"logits/rejected": -0.9584120512008667,
"logps/chosen": -2.716667413711548,
"logps/rejected": -3.2290937900543213,
"loss": 2.7513,
"rewards/accuracies": 0.796875,
"rewards/chosen": -27.166675567626953,
"rewards/margins": 5.124265670776367,
"rewards/rejected": -32.29093933105469,
"step": 188
},
{
"epoch": 0.42748091603053434,
"grad_norm": 96.1462570190499,
"learning_rate": 5.672123295572854e-07,
"logits/chosen": -1.0214743614196777,
"logits/rejected": -1.0417888164520264,
"logps/chosen": -2.9014813899993896,
"logps/rejected": -3.219127655029297,
"loss": 3.2058,
"rewards/accuracies": 0.78125,
"rewards/chosen": -29.014816284179688,
"rewards/margins": 3.1764631271362305,
"rewards/rejected": -32.19127655029297,
"step": 189
},
{
"epoch": 0.4297427198190557,
"grad_norm": 76.49202012271405,
"learning_rate": 5.643316319721487e-07,
"logits/chosen": -1.008056879043579,
"logits/rejected": -1.0231949090957642,
"logps/chosen": -3.03181791305542,
"logps/rejected": -3.4028077125549316,
"loss": 3.6319,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -30.318180084228516,
"rewards/margins": 3.7098987102508545,
"rewards/rejected": -34.028076171875,
"step": 190
},
{
"epoch": 0.432004523607577,
"grad_norm": 103.69905256755406,
"learning_rate": 5.614406438596026e-07,
"logits/chosen": -1.0403547286987305,
"logits/rejected": -1.0387647151947021,
"logps/chosen": -3.0885541439056396,
"logps/rejected": -3.5047993659973145,
"loss": 3.5887,
"rewards/accuracies": 0.71875,
"rewards/chosen": -30.885540008544922,
"rewards/margins": 4.162451267242432,
"rewards/rejected": -35.04799270629883,
"step": 191
},
{
"epoch": 0.4342663273960984,
"grad_norm": 110.46035775661962,
"learning_rate": 5.585395462547406e-07,
"logits/chosen": -0.9906610250473022,
"logits/rejected": -0.9895243644714355,
"logps/chosen": -3.050811529159546,
"logps/rejected": -3.3403472900390625,
"loss": 3.9036,
"rewards/accuracies": 0.640625,
"rewards/chosen": -30.50811767578125,
"rewards/margins": 2.8953564167022705,
"rewards/rejected": -33.403472900390625,
"step": 192
},
{
"epoch": 0.43652813118461975,
"grad_norm": 101.65651790587738,
"learning_rate": 5.55628520825718e-07,
"logits/chosen": -1.0707571506500244,
"logits/rejected": -1.0764364004135132,
"logps/chosen": -3.1038661003112793,
"logps/rejected": -3.454857110977173,
"loss": 3.4261,
"rewards/accuracies": 0.734375,
"rewards/chosen": -31.03866195678711,
"rewards/margins": 3.5099072456359863,
"rewards/rejected": -34.54856872558594,
"step": 193
},
{
"epoch": 0.43878993497314106,
"grad_norm": 93.93497063277395,
"learning_rate": 5.527077498623752e-07,
"logits/chosen": -1.0300379991531372,
"logits/rejected": -1.0473064184188843,
"logps/chosen": -2.991177797317505,
"logps/rejected": -3.39091157913208,
"loss": 3.1118,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -29.911775588989258,
"rewards/margins": 3.9973347187042236,
"rewards/rejected": -33.90911102294922,
"step": 194
},
{
"epoch": 0.4410517387616624,
"grad_norm": 87.97932049560232,
"learning_rate": 5.497774162648228e-07,
"logits/chosen": -0.9984287023544312,
"logits/rejected": -1.0242127180099487,
"logps/chosen": -2.906052589416504,
"logps/rejected": -3.424265146255493,
"loss": 3.135,
"rewards/accuracies": 0.765625,
"rewards/chosen": -29.06052589416504,
"rewards/margins": 5.18212366104126,
"rewards/rejected": -34.24264907836914,
"step": 195
},
{
"epoch": 0.4433135425501838,
"grad_norm": 104.21099663735023,
"learning_rate": 5.468377035319882e-07,
"logits/chosen": -1.0468281507492065,
"logits/rejected": -1.0517960786819458,
"logps/chosen": -2.991363048553467,
"logps/rejected": -3.451014280319214,
"loss": 3.1655,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -29.913631439208984,
"rewards/margins": 4.596511363983154,
"rewards/rejected": -34.51013946533203,
"step": 196
},
{
"epoch": 0.4455753463387051,
"grad_norm": 97.96259698980502,
"learning_rate": 5.438887957501248e-07,
"logits/chosen": -0.9534517526626587,
"logits/rejected": -0.9541075229644775,
"logps/chosen": -2.9699201583862305,
"logps/rejected": -3.363379716873169,
"loss": 3.4454,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -29.699199676513672,
"rewards/margins": 3.934598922729492,
"rewards/rejected": -33.63379669189453,
"step": 197
},
{
"epoch": 0.44783715012722647,
"grad_norm": 81.95675095435759,
"learning_rate": 5.409308775812844e-07,
"logits/chosen": -1.0043997764587402,
"logits/rejected": -1.0169973373413086,
"logps/chosen": -3.0810163021087646,
"logps/rejected": -3.500107765197754,
"loss": 3.3197,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -30.810165405273438,
"rewards/margins": 4.190912246704102,
"rewards/rejected": -35.001075744628906,
"step": 198
},
{
"epoch": 0.45009895391574783,
"grad_norm": 78.86431644460743,
"learning_rate": 5.379641342517541e-07,
"logits/chosen": -1.0426075458526611,
"logits/rejected": -1.064549207687378,
"logps/chosen": -2.9156980514526367,
"logps/rejected": -3.4033756256103516,
"loss": 3.1621,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -29.156984329223633,
"rewards/margins": 4.876772403717041,
"rewards/rejected": -34.03376007080078,
"step": 199
},
{
"epoch": 0.45236075770426915,
"grad_norm": 78.79345485135647,
"learning_rate": 5.349887515404564e-07,
"logits/chosen": -1.0214426517486572,
"logits/rejected": -1.0430337190628052,
"logps/chosen": -3.1084158420562744,
"logps/rejected": -3.697129964828491,
"loss": 2.5793,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -31.08416175842285,
"rewards/margins": 5.887137413024902,
"rewards/rejected": -36.97129821777344,
"step": 200
},
{
"epoch": 0.4546225614927905,
"grad_norm": 84.91022455477744,
"learning_rate": 5.320049157673163e-07,
"logits/chosen": -0.9889032244682312,
"logits/rejected": -0.9852777719497681,
"logps/chosen": -2.9240407943725586,
"logps/rejected": -3.4134063720703125,
"loss": 2.8051,
"rewards/accuracies": 0.796875,
"rewards/chosen": -29.240407943725586,
"rewards/margins": 4.8936567306518555,
"rewards/rejected": -34.13405990600586,
"step": 201
},
{
"epoch": 0.4568843652813118,
"grad_norm": 84.77083592931683,
"learning_rate": 5.290128137815938e-07,
"logits/chosen": -1.0504995584487915,
"logits/rejected": -1.0646427869796753,
"logps/chosen": -3.179389476776123,
"logps/rejected": -3.651752233505249,
"loss": 2.8529,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -31.793895721435547,
"rewards/margins": 4.723628997802734,
"rewards/rejected": -36.51752471923828,
"step": 202
},
{
"epoch": 0.4591461690698332,
"grad_norm": 74.57762667630237,
"learning_rate": 5.260126329501828e-07,
"logits/chosen": -1.0635945796966553,
"logits/rejected": -1.0678863525390625,
"logps/chosen": -3.0903306007385254,
"logps/rejected": -3.723266363143921,
"loss": 2.6491,
"rewards/accuracies": 0.78125,
"rewards/chosen": -30.903305053710938,
"rewards/margins": 6.329358100891113,
"rewards/rejected": -37.232662200927734,
"step": 203
},
{
"epoch": 0.46140797285835455,
"grad_norm": 143.06703544138904,
"learning_rate": 5.230045611458789e-07,
"logits/chosen": -0.9957941770553589,
"logits/rejected": -1.0184533596038818,
"logps/chosen": -2.955549955368042,
"logps/rejected": -3.382478952407837,
"loss": 3.1943,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -29.555503845214844,
"rewards/margins": 4.269284725189209,
"rewards/rejected": -33.82478713989258,
"step": 204
},
{
"epoch": 0.46366977664687586,
"grad_norm": 84.52091519382871,
"learning_rate": 5.199887867356143e-07,
"logits/chosen": -1.0054672956466675,
"logits/rejected": -1.034124493598938,
"logps/chosen": -3.2448582649230957,
"logps/rejected": -3.7912087440490723,
"loss": 2.5166,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -32.44858169555664,
"rewards/margins": 5.463507652282715,
"rewards/rejected": -37.912086486816406,
"step": 205
},
{
"epoch": 0.46593158043539723,
"grad_norm": 91.18340069666662,
"learning_rate": 5.16965498568662e-07,
"logits/chosen": -1.0431950092315674,
"logits/rejected": -1.0494352579116821,
"logps/chosen": -3.452178478240967,
"logps/rejected": -4.119001388549805,
"loss": 2.7708,
"rewards/accuracies": 0.796875,
"rewards/chosen": -34.521785736083984,
"rewards/margins": 6.668229103088379,
"rewards/rejected": -41.19001388549805,
"step": 206
},
{
"epoch": 0.4681933842239186,
"grad_norm": 97.96611032782644,
"learning_rate": 5.139348859648098e-07,
"logits/chosen": -1.0379126071929932,
"logits/rejected": -1.0373729467391968,
"logps/chosen": -3.139047145843506,
"logps/rejected": -3.602118492126465,
"loss": 2.9431,
"rewards/accuracies": 0.75,
"rewards/chosen": -31.390474319458008,
"rewards/margins": 4.630711555480957,
"rewards/rejected": -36.02118682861328,
"step": 207
},
{
"epoch": 0.4704551880124399,
"grad_norm": 105.3352825410185,
"learning_rate": 5.10897138702506e-07,
"logits/chosen": -0.9758431315422058,
"logits/rejected": -0.9999425411224365,
"logps/chosen": -3.3104453086853027,
"logps/rejected": -3.760322093963623,
"loss": 3.6167,
"rewards/accuracies": 0.703125,
"rewards/chosen": -33.104454040527344,
"rewards/margins": 4.4987688064575195,
"rewards/rejected": -37.60322189331055,
"step": 208
},
{
"epoch": 0.4727169918009613,
"grad_norm": 93.74191045191726,
"learning_rate": 5.078524470069743e-07,
"logits/chosen": -1.0916061401367188,
"logits/rejected": -1.0970686674118042,
"logps/chosen": -3.437713384628296,
"logps/rejected": -3.9724910259246826,
"loss": 2.3738,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -34.377132415771484,
"rewards/margins": 5.347775936126709,
"rewards/rejected": -39.72490692138672,
"step": 209
},
{
"epoch": 0.47497879558948264,
"grad_norm": 105.04884105691238,
"learning_rate": 5.048010015383021e-07,
"logits/chosen": -1.010386347770691,
"logits/rejected": -1.0093263387680054,
"logps/chosen": -3.519493579864502,
"logps/rejected": -4.191654205322266,
"loss": 2.6885,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -35.19493865966797,
"rewards/margins": 6.721607208251953,
"rewards/rejected": -41.91654586791992,
"step": 210
},
{
"epoch": 0.47724059937800395,
"grad_norm": 113.27606664327345,
"learning_rate": 5.01742993379502e-07,
"logits/chosen": -1.0328989028930664,
"logits/rejected": -1.050833821296692,
"logps/chosen": -3.583611249923706,
"logps/rejected": -4.106909275054932,
"loss": 2.7627,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -35.836116790771484,
"rewards/margins": 5.2329792976379395,
"rewards/rejected": -41.069091796875,
"step": 211
},
{
"epoch": 0.4795024031665253,
"grad_norm": 104.82853867162883,
"learning_rate": 4.986786140245446e-07,
"logits/chosen": -1.0020099878311157,
"logits/rejected": -1.0192692279815674,
"logps/chosen": -3.448815107345581,
"logps/rejected": -3.9523656368255615,
"loss": 3.3598,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -34.488155364990234,
"rewards/margins": 5.035503387451172,
"rewards/rejected": -39.523658752441406,
"step": 212
},
{
"epoch": 0.4817642069550466,
"grad_norm": 115.63283025199365,
"learning_rate": 4.956080553663687e-07,
"logits/chosen": -1.0651870965957642,
"logits/rejected": -1.0669838190078735,
"logps/chosen": -3.580437183380127,
"logps/rejected": -4.1571044921875,
"loss": 2.9216,
"rewards/accuracies": 0.796875,
"rewards/chosen": -35.804378509521484,
"rewards/margins": 5.766667366027832,
"rewards/rejected": -41.571044921875,
"step": 213
},
{
"epoch": 0.484026010743568,
"grad_norm": 128.35809725836472,
"learning_rate": 4.925315096848636e-07,
"logits/chosen": -1.0875139236450195,
"logits/rejected": -1.1225805282592773,
"logps/chosen": -3.7847559452056885,
"logps/rejected": -4.454068183898926,
"loss": 2.724,
"rewards/accuracies": 0.75,
"rewards/chosen": -37.847557067871094,
"rewards/margins": 6.693119525909424,
"rewards/rejected": -44.54068374633789,
"step": 214
},
{
"epoch": 0.48628781453208936,
"grad_norm": 102.85432329521814,
"learning_rate": 4.894491696348293e-07,
"logits/chosen": -1.1038923263549805,
"logits/rejected": -1.1225206851959229,
"logps/chosen": -3.6632027626037598,
"logps/rejected": -4.115760803222656,
"loss": 3.1612,
"rewards/accuracies": 0.734375,
"rewards/chosen": -36.63202667236328,
"rewards/margins": 4.52557897567749,
"rewards/rejected": -41.15760803222656,
"step": 215
},
{
"epoch": 0.48854961832061067,
"grad_norm": 100.23233670472042,
"learning_rate": 4.863612282339116e-07,
"logits/chosen": -1.0292141437530518,
"logits/rejected": -1.0395947694778442,
"logps/chosen": -3.849827766418457,
"logps/rejected": -4.370044231414795,
"loss": 3.3243,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -38.4982795715332,
"rewards/margins": 5.202164649963379,
"rewards/rejected": -43.700439453125,
"step": 216
},
{
"epoch": 0.49081142210913203,
"grad_norm": 107.28211566319682,
"learning_rate": 4.832678788505161e-07,
"logits/chosen": -1.070858120918274,
"logits/rejected": -1.0871332883834839,
"logps/chosen": -3.8025472164154053,
"logps/rejected": -4.315918445587158,
"loss": 3.5426,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -38.025474548339844,
"rewards/margins": 5.133708953857422,
"rewards/rejected": -43.1591796875,
"step": 217
},
{
"epoch": 0.4930732258976534,
"grad_norm": 114.43034245836103,
"learning_rate": 4.801693151916985e-07,
"logits/chosen": -1.0845677852630615,
"logits/rejected": -1.1089526414871216,
"logps/chosen": -3.7524938583374023,
"logps/rejected": -4.324914932250977,
"loss": 2.9573,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -37.52493667602539,
"rewards/margins": 5.724216461181641,
"rewards/rejected": -43.24915313720703,
"step": 218
},
{
"epoch": 0.4953350296861747,
"grad_norm": 92.78634682717656,
"learning_rate": 4.770657312910354e-07,
"logits/chosen": -1.1212890148162842,
"logits/rejected": -1.1400455236434937,
"logps/chosen": -3.78830885887146,
"logps/rejected": -4.240726947784424,
"loss": 3.3389,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -37.88309097290039,
"rewards/margins": 4.524179935455322,
"rewards/rejected": -42.40727233886719,
"step": 219
},
{
"epoch": 0.4975968334746961,
"grad_norm": 100.92687567592242,
"learning_rate": 4.739573214964729e-07,
"logits/chosen": -1.0963836908340454,
"logits/rejected": -1.109062671661377,
"logps/chosen": -3.5693228244781494,
"logps/rejected": -4.1547346115112305,
"loss": 2.8707,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -35.6932258605957,
"rewards/margins": 5.854123115539551,
"rewards/rejected": -41.54734802246094,
"step": 220
},
{
"epoch": 0.49985863726321744,
"grad_norm": 114.6619281519479,
"learning_rate": 4.7084428045815733e-07,
"logits/chosen": -1.0976245403289795,
"logits/rejected": -1.1148653030395508,
"logps/chosen": -3.7459495067596436,
"logps/rejected": -4.185751914978027,
"loss": 3.345,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -37.459495544433594,
"rewards/margins": 4.39802360534668,
"rewards/rejected": -41.85751724243164,
"step": 221
},
{
"epoch": 0.5021204410517388,
"grad_norm": 164.15889181575363,
"learning_rate": 4.677268031162457e-07,
"logits/chosen": -1.1060454845428467,
"logits/rejected": -1.1150747537612915,
"logps/chosen": -3.5374410152435303,
"logps/rejected": -4.003762245178223,
"loss": 3.3499,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -35.374412536621094,
"rewards/margins": 4.663212299346924,
"rewards/rejected": -40.03762435913086,
"step": 222
},
{
"epoch": 0.5043822448402601,
"grad_norm": 100.61678360581845,
"learning_rate": 4.646050846886985e-07,
"logits/chosen": -1.0104235410690308,
"logits/rejected": -1.0443775653839111,
"logps/chosen": -3.3526723384857178,
"logps/rejected": -3.873018503189087,
"loss": 2.8564,
"rewards/accuracies": 0.78125,
"rewards/chosen": -33.5267219543457,
"rewards/margins": 5.203460216522217,
"rewards/rejected": -38.730186462402344,
"step": 223
},
{
"epoch": 0.5066440486287814,
"grad_norm": 114.26897252401342,
"learning_rate": 4.6147932065905494e-07,
"logits/chosen": -1.0860874652862549,
"logits/rejected": -1.0895824432373047,
"logps/chosen": -3.5561678409576416,
"logps/rejected": -3.9924020767211914,
"loss": 3.4319,
"rewards/accuracies": 0.75,
"rewards/chosen": -35.561676025390625,
"rewards/margins": 4.362340927124023,
"rewards/rejected": -39.92401885986328,
"step": 224
},
{
"epoch": 0.5089058524173028,
"grad_norm": 106.26494466238337,
"learning_rate": 4.5834970676419214e-07,
"logits/chosen": -1.0927622318267822,
"logits/rejected": -1.1163933277130127,
"logps/chosen": -3.4472174644470215,
"logps/rejected": -3.9276986122131348,
"loss": 3.0706,
"rewards/accuracies": 0.71875,
"rewards/chosen": -34.472171783447266,
"rewards/margins": 4.804813861846924,
"rewards/rejected": -39.2769889831543,
"step": 225
},
{
"epoch": 0.5111676562058242,
"grad_norm": 142.79046408424182,
"learning_rate": 4.552164389820673e-07,
"logits/chosen": -1.003208041191101,
"logits/rejected": -1.0169744491577148,
"logps/chosen": -3.3661296367645264,
"logps/rejected": -4.004166126251221,
"loss": 2.8901,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -33.661293029785156,
"rewards/margins": 6.380366325378418,
"rewards/rejected": -40.041664123535156,
"step": 226
},
{
"epoch": 0.5134294599943455,
"grad_norm": 113.6822648904385,
"learning_rate": 4.5207971351944605e-07,
"logits/chosen": -1.133029580116272,
"logits/rejected": -1.13167142868042,
"logps/chosen": -3.4778950214385986,
"logps/rejected": -4.010115146636963,
"loss": 3.6064,
"rewards/accuracies": 0.6875,
"rewards/chosen": -34.778953552246094,
"rewards/margins": 5.32219934463501,
"rewards/rejected": -40.10114669799805,
"step": 227
},
{
"epoch": 0.5156912637828668,
"grad_norm": 107.25373744217654,
"learning_rate": 4.489397267996157e-07,
"logits/chosen": -1.1207070350646973,
"logits/rejected": -1.1274689435958862,
"logps/chosen": -3.4271583557128906,
"logps/rejected": -3.929474353790283,
"loss": 2.9591,
"rewards/accuracies": 0.796875,
"rewards/chosen": -34.271583557128906,
"rewards/margins": 5.023160934448242,
"rewards/rejected": -39.294742584228516,
"step": 228
},
{
"epoch": 0.5179530675713881,
"grad_norm": 111.56900476327738,
"learning_rate": 4.45796675450085e-07,
"logits/chosen": -1.0740970373153687,
"logits/rejected": -1.0927459001541138,
"logps/chosen": -3.397404193878174,
"logps/rejected": -3.9328582286834717,
"loss": 3.4474,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -33.97404098510742,
"rewards/margins": 5.354538917541504,
"rewards/rejected": -39.328582763671875,
"step": 229
},
{
"epoch": 0.5202148713599095,
"grad_norm": 103.68995444830254,
"learning_rate": 4.4265075629027126e-07,
"logits/chosen": -1.0486955642700195,
"logits/rejected": -1.0680443048477173,
"logps/chosen": -3.4485530853271484,
"logps/rejected": -3.9430947303771973,
"loss": 2.7208,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -34.485530853271484,
"rewards/margins": 4.94541597366333,
"rewards/rejected": -39.430946350097656,
"step": 230
},
{
"epoch": 0.5224766751484309,
"grad_norm": 91.41895983261622,
"learning_rate": 4.3950216631917563e-07,
"logits/chosen": -1.1048505306243896,
"logits/rejected": -1.1197776794433594,
"logps/chosen": -3.47996187210083,
"logps/rejected": -4.138747692108154,
"loss": 2.491,
"rewards/accuracies": 0.828125,
"rewards/chosen": -34.79962158203125,
"rewards/margins": 6.587857723236084,
"rewards/rejected": -41.387481689453125,
"step": 231
},
{
"epoch": 0.5247384789369522,
"grad_norm": 90.33357685407847,
"learning_rate": 4.3635110270304676e-07,
"logits/chosen": -1.0903615951538086,
"logits/rejected": -1.1053102016448975,
"logps/chosen": -3.2950947284698486,
"logps/rejected": -3.8697397708892822,
"loss": 2.243,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -32.950950622558594,
"rewards/margins": 5.746450424194336,
"rewards/rejected": -38.6973991394043,
"step": 232
},
{
"epoch": 0.5270002827254736,
"grad_norm": 164.72132130018886,
"learning_rate": 4.331977627630339e-07,
"logits/chosen": -1.052128553390503,
"logits/rejected": -1.04896080493927,
"logps/chosen": -3.2477636337280273,
"logps/rejected": -3.909946918487549,
"loss": 2.5147,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -32.47763442993164,
"rewards/margins": 6.621834754943848,
"rewards/rejected": -39.09946823120117,
"step": 233
},
{
"epoch": 0.5292620865139949,
"grad_norm": 105.44016716210716,
"learning_rate": 4.300423439628313e-07,
"logits/chosen": -1.100780963897705,
"logits/rejected": -1.1270376443862915,
"logps/chosen": -3.316885232925415,
"logps/rejected": -3.9043822288513184,
"loss": 2.4694,
"rewards/accuracies": 0.828125,
"rewards/chosen": -33.168853759765625,
"rewards/margins": 5.874972343444824,
"rewards/rejected": -39.043827056884766,
"step": 234
},
{
"epoch": 0.5315238903025162,
"grad_norm": 102.37005839911242,
"learning_rate": 4.268850438963118e-07,
"logits/chosen": -1.1160274744033813,
"logits/rejected": -1.1523478031158447,
"logps/chosen": -3.7024288177490234,
"logps/rejected": -4.261665344238281,
"loss": 2.5654,
"rewards/accuracies": 0.796875,
"rewards/chosen": -37.0242919921875,
"rewards/margins": 5.5923662185668945,
"rewards/rejected": -42.61665725708008,
"step": 235
},
{
"epoch": 0.5337856940910376,
"grad_norm": 100.177700252223,
"learning_rate": 4.2372606027515463e-07,
"logits/chosen": -1.0851325988769531,
"logits/rejected": -1.1043192148208618,
"logps/chosen": -3.3892626762390137,
"logps/rejected": -3.893834114074707,
"loss": 3.159,
"rewards/accuracies": 0.734375,
"rewards/chosen": -33.89262771606445,
"rewards/margins": 5.045711994171143,
"rewards/rejected": -38.93833923339844,
"step": 236
},
{
"epoch": 0.536047497879559,
"grad_norm": 137.28638250019023,
"learning_rate": 4.2056559091646387e-07,
"logits/chosen": -1.099539041519165,
"logits/rejected": -1.134901523590088,
"logps/chosen": -3.5579705238342285,
"logps/rejected": -4.083298683166504,
"loss": 3.2432,
"rewards/accuracies": 0.71875,
"rewards/chosen": -35.579708099365234,
"rewards/margins": 5.25327730178833,
"rewards/rejected": -40.832984924316406,
"step": 237
},
{
"epoch": 0.5383093016680803,
"grad_norm": 119.2515847549312,
"learning_rate": 4.1740383373038116e-07,
"logits/chosen": -1.081524133682251,
"logits/rejected": -1.1097904443740845,
"logps/chosen": -3.4320178031921387,
"logps/rejected": -4.001075744628906,
"loss": 2.7358,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -34.32017517089844,
"rewards/margins": 5.690584182739258,
"rewards/rejected": -40.01075744628906,
"step": 238
},
{
"epoch": 0.5405711054566016,
"grad_norm": 106.65847227560742,
"learning_rate": 4.1424098670769255e-07,
"logits/chosen": -1.1375625133514404,
"logits/rejected": -1.1663264036178589,
"logps/chosen": -3.55190372467041,
"logps/rejected": -3.9685091972351074,
"loss": 3.2169,
"rewards/accuracies": 0.71875,
"rewards/chosen": -35.51903533935547,
"rewards/margins": 4.166055679321289,
"rewards/rejected": -39.685096740722656,
"step": 239
},
{
"epoch": 0.542832909245123,
"grad_norm": 100.59018953371722,
"learning_rate": 4.1107724790743007e-07,
"logits/chosen": -1.0572842359542847,
"logits/rejected": -1.0986131429672241,
"logps/chosen": -3.5302083492279053,
"logps/rejected": -4.043272018432617,
"loss": 2.5905,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -35.302085876464844,
"rewards/margins": 5.130636692047119,
"rewards/rejected": -40.43272399902344,
"step": 240
},
{
"epoch": 0.5450947130336443,
"grad_norm": 130.8658906195362,
"learning_rate": 4.0791281544446947e-07,
"logits/chosen": -1.1063083410263062,
"logits/rejected": -1.102048635482788,
"logps/chosen": -3.599591016769409,
"logps/rejected": -4.208369731903076,
"loss": 2.4404,
"rewards/accuracies": 0.78125,
"rewards/chosen": -35.99591064453125,
"rewards/margins": 6.087784290313721,
"rewards/rejected": -42.08369064331055,
"step": 241
},
{
"epoch": 0.5473565168221657,
"grad_norm": 122.85647135040247,
"learning_rate": 4.0474788747712416e-07,
"logits/chosen": -1.1174023151397705,
"logits/rejected": -1.1157442331314087,
"logps/chosen": -3.706411838531494,
"logps/rejected": -4.156275272369385,
"loss": 3.519,
"rewards/accuracies": 0.703125,
"rewards/chosen": -37.064117431640625,
"rewards/margins": 4.498636245727539,
"rewards/rejected": -41.56275177001953,
"step": 242
},
{
"epoch": 0.549618320610687,
"grad_norm": 116.9138011882379,
"learning_rate": 4.0158266219473573e-07,
"logits/chosen": -1.104333519935608,
"logits/rejected": -1.1253178119659424,
"logps/chosen": -3.5006866455078125,
"logps/rejected": -4.054680824279785,
"loss": 2.7425,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -35.00686264038086,
"rewards/margins": 5.539941787719727,
"rewards/rejected": -40.54680633544922,
"step": 243
},
{
"epoch": 0.5518801243992084,
"grad_norm": 110.06536774666624,
"learning_rate": 3.984173378052643e-07,
"logits/chosen": -1.0578244924545288,
"logits/rejected": -1.0700592994689941,
"logps/chosen": -3.424272060394287,
"logps/rejected": -4.020389080047607,
"loss": 2.677,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -34.24272155761719,
"rewards/margins": 5.961171627044678,
"rewards/rejected": -40.203887939453125,
"step": 244
},
{
"epoch": 0.5541419281877297,
"grad_norm": 111.08948121570445,
"learning_rate": 3.9525211252287585e-07,
"logits/chosen": -1.1446008682250977,
"logits/rejected": -1.1535121202468872,
"logps/chosen": -3.7665810585021973,
"logps/rejected": -4.514401912689209,
"loss": 2.6081,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -37.665809631347656,
"rewards/margins": 7.478209018707275,
"rewards/rejected": -45.144020080566406,
"step": 245
},
{
"epoch": 0.556403731976251,
"grad_norm": 107.23381454459822,
"learning_rate": 3.920871845555305e-07,
"logits/chosen": -1.0748775005340576,
"logits/rejected": -1.0747300386428833,
"logps/chosen": -3.663707733154297,
"logps/rejected": -4.138020038604736,
"loss": 2.7566,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -36.63707733154297,
"rewards/margins": 4.743118762969971,
"rewards/rejected": -41.38019943237305,
"step": 246
},
{
"epoch": 0.5586655357647724,
"grad_norm": 137.08053930045634,
"learning_rate": 3.8892275209256984e-07,
"logits/chosen": -1.1169809103012085,
"logits/rejected": -1.124050259590149,
"logps/chosen": -3.999643564224243,
"logps/rejected": -4.464410305023193,
"loss": 3.1658,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -39.996437072753906,
"rewards/margins": 4.64766263961792,
"rewards/rejected": -44.644100189208984,
"step": 247
},
{
"epoch": 0.5609273395532938,
"grad_norm": 132.83729250085113,
"learning_rate": 3.8575901329230747e-07,
"logits/chosen": -1.0912463665008545,
"logits/rejected": -1.091078281402588,
"logps/chosen": -3.834798812866211,
"logps/rejected": -4.405033111572266,
"loss": 2.8859,
"rewards/accuracies": 0.78125,
"rewards/chosen": -38.347991943359375,
"rewards/margins": 5.7023420333862305,
"rewards/rejected": -44.050331115722656,
"step": 248
},
{
"epoch": 0.5631891433418151,
"grad_norm": 133.00485994320712,
"learning_rate": 3.8259616626961886e-07,
"logits/chosen": -1.0844348669052124,
"logits/rejected": -1.0968797206878662,
"logps/chosen": -3.6328611373901367,
"logps/rejected": -4.086342811584473,
"loss": 2.5866,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -36.328609466552734,
"rewards/margins": 4.534820079803467,
"rewards/rejected": -40.863426208496094,
"step": 249
},
{
"epoch": 0.5654509471303364,
"grad_norm": 111.79119137868418,
"learning_rate": 3.794344090835362e-07,
"logits/chosen": -1.1024246215820312,
"logits/rejected": -1.1074460744857788,
"logps/chosen": -3.958535671234131,
"logps/rejected": -4.475435733795166,
"loss": 3.247,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -39.58535385131836,
"rewards/margins": 5.169003009796143,
"rewards/rejected": -44.754356384277344,
"step": 250
},
{
"epoch": 0.5677127509188578,
"grad_norm": 135.70711423835718,
"learning_rate": 3.7627393972484534e-07,
"logits/chosen": -1.1514368057250977,
"logits/rejected": -1.1660606861114502,
"logps/chosen": -3.9443583488464355,
"logps/rejected": -4.3965229988098145,
"loss": 3.3998,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -39.44358825683594,
"rewards/margins": 4.5216474533081055,
"rewards/rejected": -43.965232849121094,
"step": 251
},
{
"epoch": 0.5699745547073791,
"grad_norm": 122.14532370258085,
"learning_rate": 3.7311495610368823e-07,
"logits/chosen": -1.1626390218734741,
"logits/rejected": -1.1832255125045776,
"logps/chosen": -4.036317348480225,
"logps/rejected": -4.556058883666992,
"loss": 3.1401,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -40.36316680908203,
"rewards/margins": 5.1974196434021,
"rewards/rejected": -45.56059265136719,
"step": 252
},
{
"epoch": 0.5722363584959005,
"grad_norm": 102.26046335206325,
"learning_rate": 3.699576560371689e-07,
"logits/chosen": -1.0833961963653564,
"logits/rejected": -1.1061809062957764,
"logps/chosen": -3.965580463409424,
"logps/rejected": -4.775724411010742,
"loss": 2.0592,
"rewards/accuracies": 0.84375,
"rewards/chosen": -39.65580749511719,
"rewards/margins": 8.1014404296875,
"rewards/rejected": -47.75724792480469,
"step": 253
},
{
"epoch": 0.5744981622844219,
"grad_norm": 122.67818060364495,
"learning_rate": 3.66802237236966e-07,
"logits/chosen": -1.1018187999725342,
"logits/rejected": -1.1187875270843506,
"logps/chosen": -3.992328643798828,
"logps/rejected": -4.545815944671631,
"loss": 2.8386,
"rewards/accuracies": 0.78125,
"rewards/chosen": -39.92329025268555,
"rewards/margins": 5.5348687171936035,
"rewards/rejected": -45.458160400390625,
"step": 254
},
{
"epoch": 0.5767599660729432,
"grad_norm": 118.62094863063925,
"learning_rate": 3.636488972969532e-07,
"logits/chosen": -1.0958614349365234,
"logits/rejected": -1.1104106903076172,
"logps/chosen": -3.850203514099121,
"logps/rejected": -4.422935485839844,
"loss": 2.8001,
"rewards/accuracies": 0.78125,
"rewards/chosen": -38.50203323364258,
"rewards/margins": 5.727315425872803,
"rewards/rejected": -44.22935104370117,
"step": 255
},
{
"epoch": 0.5790217698614645,
"grad_norm": 120.96931790013662,
"learning_rate": 3.604978336808244e-07,
"logits/chosen": -1.194389820098877,
"logits/rejected": -1.201671838760376,
"logps/chosen": -3.8203506469726562,
"logps/rejected": -4.354315280914307,
"loss": 2.8406,
"rewards/accuracies": 0.765625,
"rewards/chosen": -38.2035026550293,
"rewards/margins": 5.339643955230713,
"rewards/rejected": -43.54315185546875,
"step": 256
},
{
"epoch": 0.5812835736499858,
"grad_norm": 108.82242695089444,
"learning_rate": 3.5734924370972876e-07,
"logits/chosen": -1.1173956394195557,
"logits/rejected": -1.1346094608306885,
"logps/chosen": -3.774829149246216,
"logps/rejected": -4.319247245788574,
"loss": 2.9131,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -37.74829864501953,
"rewards/margins": 5.444177627563477,
"rewards/rejected": -43.192474365234375,
"step": 257
},
{
"epoch": 0.5835453774385072,
"grad_norm": 123.4677107906652,
"learning_rate": 3.5420332454991504e-07,
"logits/chosen": -1.0675257444381714,
"logits/rejected": -1.0911375284194946,
"logps/chosen": -3.746617078781128,
"logps/rejected": -4.322977066040039,
"loss": 2.8697,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -37.46617126464844,
"rewards/margins": 5.76359748840332,
"rewards/rejected": -43.22977066040039,
"step": 258
},
{
"epoch": 0.5858071812270286,
"grad_norm": 105.5976688373798,
"learning_rate": 3.510602732003843e-07,
"logits/chosen": -1.1454898118972778,
"logits/rejected": -1.1573333740234375,
"logps/chosen": -3.9214420318603516,
"logps/rejected": -4.544137001037598,
"loss": 2.6906,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -39.214420318603516,
"rewards/margins": 6.226948261260986,
"rewards/rejected": -45.441368103027344,
"step": 259
},
{
"epoch": 0.5880689850155499,
"grad_norm": 113.87298735956101,
"learning_rate": 3.4792028648055396e-07,
"logits/chosen": -1.0981009006500244,
"logits/rejected": -1.1229112148284912,
"logps/chosen": -3.649538040161133,
"logps/rejected": -4.314185619354248,
"loss": 2.535,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -36.49538040161133,
"rewards/margins": 6.646478176116943,
"rewards/rejected": -43.14185333251953,
"step": 260
},
{
"epoch": 0.5903307888040712,
"grad_norm": 116.11587475053403,
"learning_rate": 3.447835610179327e-07,
"logits/chosen": -1.0831830501556396,
"logits/rejected": -1.100474238395691,
"logps/chosen": -3.8131837844848633,
"logps/rejected": -4.552441120147705,
"loss": 2.6365,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -38.131832122802734,
"rewards/margins": 7.392579555511475,
"rewards/rejected": -45.5244140625,
"step": 261
},
{
"epoch": 0.5925925925925926,
"grad_norm": 134.3566971386809,
"learning_rate": 3.416502932358079e-07,
"logits/chosen": -1.1478521823883057,
"logits/rejected": -1.1568520069122314,
"logps/chosen": -4.000604152679443,
"logps/rejected": -4.428308486938477,
"loss": 3.1547,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -40.00604248046875,
"rewards/margins": 4.277041435241699,
"rewards/rejected": -44.2830810546875,
"step": 262
},
{
"epoch": 0.5948543963811139,
"grad_norm": 135.07800674537654,
"learning_rate": 3.385206793409451e-07,
"logits/chosen": -1.0648393630981445,
"logits/rejected": -1.0934747457504272,
"logps/chosen": -3.4633100032806396,
"logps/rejected": -3.9857139587402344,
"loss": 2.8446,
"rewards/accuracies": 0.78125,
"rewards/chosen": -34.63310241699219,
"rewards/margins": 5.2240400314331055,
"rewards/rejected": -39.857139587402344,
"step": 263
},
{
"epoch": 0.5971162001696353,
"grad_norm": 135.56690119641354,
"learning_rate": 3.3539491531130163e-07,
"logits/chosen": -1.1003735065460205,
"logits/rejected": -1.1152950525283813,
"logps/chosen": -3.834996223449707,
"logps/rejected": -4.56181526184082,
"loss": 2.74,
"rewards/accuracies": 0.78125,
"rewards/chosen": -38.3499641418457,
"rewards/margins": 7.268193244934082,
"rewards/rejected": -45.61815643310547,
"step": 264
},
{
"epoch": 0.5993780039581567,
"grad_norm": 126.5260644122031,
"learning_rate": 3.3227319688375426e-07,
"logits/chosen": -1.1725904941558838,
"logits/rejected": -1.1713595390319824,
"logps/chosen": -3.9160027503967285,
"logps/rejected": -4.517629623413086,
"loss": 2.4535,
"rewards/accuracies": 0.84375,
"rewards/chosen": -39.16002655029297,
"rewards/margins": 6.016266345977783,
"rewards/rejected": -45.176292419433594,
"step": 265
},
{
"epoch": 0.601639807746678,
"grad_norm": 110.87232953520362,
"learning_rate": 3.291557195418427e-07,
"logits/chosen": -1.169702410697937,
"logits/rejected": -1.1751315593719482,
"logps/chosen": -3.7779271602630615,
"logps/rejected": -4.331457614898682,
"loss": 2.7261,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -37.779273986816406,
"rewards/margins": 5.53530740737915,
"rewards/rejected": -43.314571380615234,
"step": 266
},
{
"epoch": 0.6039016115351993,
"grad_norm": 170.10518211676956,
"learning_rate": 3.260426785035272e-07,
"logits/chosen": -1.1354193687438965,
"logits/rejected": -1.1495718955993652,
"logps/chosen": -3.8486831188201904,
"logps/rejected": -4.348843097686768,
"loss": 3.5745,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -38.4868278503418,
"rewards/margins": 5.001603126525879,
"rewards/rejected": -43.48843002319336,
"step": 267
},
{
"epoch": 0.6061634153237206,
"grad_norm": 116.44936706887347,
"learning_rate": 3.229342687089646e-07,
"logits/chosen": -1.140492558479309,
"logits/rejected": -1.1480599641799927,
"logps/chosen": -3.7502431869506836,
"logps/rejected": -4.402010440826416,
"loss": 2.6712,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -37.502437591552734,
"rewards/margins": 6.517667770385742,
"rewards/rejected": -44.02009963989258,
"step": 268
},
{
"epoch": 0.608425219112242,
"grad_norm": 126.74661926416681,
"learning_rate": 3.1983068480830143e-07,
"logits/chosen": -1.1415023803710938,
"logits/rejected": -1.159563422203064,
"logps/chosen": -3.7056891918182373,
"logps/rejected": -4.368051528930664,
"loss": 2.4807,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -37.05690002441406,
"rewards/margins": 6.6236186027526855,
"rewards/rejected": -43.68051528930664,
"step": 269
},
{
"epoch": 0.6106870229007634,
"grad_norm": 112.01190173093258,
"learning_rate": 3.1673212114948387e-07,
"logits/chosen": -1.1376936435699463,
"logits/rejected": -1.1491377353668213,
"logps/chosen": -3.693352699279785,
"logps/rejected": -4.348179340362549,
"loss": 2.362,
"rewards/accuracies": 0.8515625,
"rewards/chosen": -36.93352508544922,
"rewards/margins": 6.548270225524902,
"rewards/rejected": -43.48179244995117,
"step": 270
},
{
"epoch": 0.6129488266892847,
"grad_norm": 131.75988503177302,
"learning_rate": 3.1363877176608845e-07,
"logits/chosen": -1.106995940208435,
"logits/rejected": -1.1330175399780273,
"logps/chosen": -3.4859375953674316,
"logps/rejected": -4.093271732330322,
"loss": 2.5697,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -34.859375,
"rewards/margins": 6.073338508605957,
"rewards/rejected": -40.932716369628906,
"step": 271
},
{
"epoch": 0.615210630477806,
"grad_norm": 131.91725099894813,
"learning_rate": 3.1055083036517076e-07,
"logits/chosen": -1.116554617881775,
"logits/rejected": -1.1147369146347046,
"logps/chosen": -3.5844948291778564,
"logps/rejected": -4.161045074462891,
"loss": 2.8339,
"rewards/accuracies": 0.765625,
"rewards/chosen": -35.844947814941406,
"rewards/margins": 5.765503883361816,
"rewards/rejected": -41.610450744628906,
"step": 272
},
{
"epoch": 0.6174724342663274,
"grad_norm": 150.42025749523202,
"learning_rate": 3.074684903151364e-07,
"logits/chosen": -1.0365394353866577,
"logits/rejected": -1.0323246717453003,
"logps/chosen": -3.274428129196167,
"logps/rejected": -3.7612733840942383,
"loss": 2.5272,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -32.744285583496094,
"rewards/margins": 4.868447303771973,
"rewards/rejected": -37.612728118896484,
"step": 273
},
{
"epoch": 0.6197342380548487,
"grad_norm": 136.11200767678002,
"learning_rate": 3.0439194463363136e-07,
"logits/chosen": -1.0989680290222168,
"logits/rejected": -1.1113637685775757,
"logps/chosen": -3.509467840194702,
"logps/rejected": -4.074167728424072,
"loss": 2.6865,
"rewards/accuracies": 0.796875,
"rewards/chosen": -35.09468078613281,
"rewards/margins": 5.646995544433594,
"rewards/rejected": -40.741676330566406,
"step": 274
},
{
"epoch": 0.6219960418433701,
"grad_norm": 127.44459660902798,
"learning_rate": 3.0132138597545537e-07,
"logits/chosen": -1.1420754194259644,
"logits/rejected": -1.1679383516311646,
"logps/chosen": -3.78161358833313,
"logps/rejected": -4.458674430847168,
"loss": 2.5574,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -37.81613540649414,
"rewards/margins": 6.7706098556518555,
"rewards/rejected": -44.58674240112305,
"step": 275
},
{
"epoch": 0.6242578456318915,
"grad_norm": 106.31811579148187,
"learning_rate": 2.982570066204981e-07,
"logits/chosen": -1.140415906906128,
"logits/rejected": -1.1640311479568481,
"logps/chosen": -3.6432271003723145,
"logps/rejected": -4.16619348526001,
"loss": 2.6839,
"rewards/accuracies": 0.765625,
"rewards/chosen": -36.43227005004883,
"rewards/margins": 5.229666233062744,
"rewards/rejected": -41.66193389892578,
"step": 276
},
{
"epoch": 0.6265196494204128,
"grad_norm": 136.57760766398937,
"learning_rate": 2.951989984616979e-07,
"logits/chosen": -1.073444128036499,
"logits/rejected": -1.1000826358795166,
"logps/chosen": -3.8470325469970703,
"logps/rejected": -4.511543273925781,
"loss": 3.2317,
"rewards/accuracies": 0.734375,
"rewards/chosen": -38.4703254699707,
"rewards/margins": 6.645102024078369,
"rewards/rejected": -45.11542892456055,
"step": 277
},
{
"epoch": 0.6287814532089341,
"grad_norm": 124.12928037170991,
"learning_rate": 2.9214755299302584e-07,
"logits/chosen": -1.0841182470321655,
"logits/rejected": -1.1026504039764404,
"logps/chosen": -4.121213436126709,
"logps/rejected": -4.8484954833984375,
"loss": 2.2503,
"rewards/accuracies": 0.84375,
"rewards/chosen": -41.212135314941406,
"rewards/margins": 7.272819519042969,
"rewards/rejected": -48.484954833984375,
"step": 278
},
{
"epoch": 0.6310432569974554,
"grad_norm": 153.94308352336938,
"learning_rate": 2.89102861297494e-07,
"logits/chosen": -1.151421308517456,
"logits/rejected": -1.1864724159240723,
"logps/chosen": -3.9869375228881836,
"logps/rejected": -4.583745956420898,
"loss": 3.0198,
"rewards/accuracies": 0.71875,
"rewards/chosen": -39.86937713623047,
"rewards/margins": 5.968080997467041,
"rewards/rejected": -45.83745574951172,
"step": 279
},
{
"epoch": 0.6333050607859768,
"grad_norm": 130.631038930031,
"learning_rate": 2.860651140351902e-07,
"logits/chosen": -1.1522353887557983,
"logits/rejected": -1.1588143110275269,
"logps/chosen": -4.093480587005615,
"logps/rejected": -4.765751838684082,
"loss": 2.5793,
"rewards/accuracies": 0.78125,
"rewards/chosen": -40.93480682373047,
"rewards/margins": 6.7227091789245605,
"rewards/rejected": -47.65751647949219,
"step": 280
},
{
"epoch": 0.6355668645744982,
"grad_norm": 128.14259196954697,
"learning_rate": 2.830345014313381e-07,
"logits/chosen": -1.0641752481460571,
"logits/rejected": -1.0930891036987305,
"logps/chosen": -4.28399658203125,
"logps/rejected": -4.995797634124756,
"loss": 2.376,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -42.83997344970703,
"rewards/margins": 7.118013381958008,
"rewards/rejected": -49.95798110961914,
"step": 281
},
{
"epoch": 0.6378286683630195,
"grad_norm": 131.35675571796293,
"learning_rate": 2.800112132643856e-07,
"logits/chosen": -1.147114634513855,
"logits/rejected": -1.1579588651657104,
"logps/chosen": -4.231831073760986,
"logps/rejected": -4.9878106117248535,
"loss": 2.3918,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -42.31830978393555,
"rewards/margins": 7.559793472290039,
"rewards/rejected": -49.87810516357422,
"step": 282
},
{
"epoch": 0.6400904721515408,
"grad_norm": 140.95255800161883,
"learning_rate": 2.7699543885412105e-07,
"logits/chosen": -1.1515127420425415,
"logits/rejected": -1.154524564743042,
"logps/chosen": -4.58392333984375,
"logps/rejected": -5.317344665527344,
"loss": 2.6597,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -45.8392333984375,
"rewards/margins": 7.334213733673096,
"rewards/rejected": -53.17344665527344,
"step": 283
},
{
"epoch": 0.6423522759400622,
"grad_norm": 129.8600206614132,
"learning_rate": 2.7398736704981725e-07,
"logits/chosen": -1.150620460510254,
"logits/rejected": -1.1516865491867065,
"logps/chosen": -4.295502185821533,
"logps/rejected": -4.943012714385986,
"loss": 2.3597,
"rewards/accuracies": 0.828125,
"rewards/chosen": -42.95501708984375,
"rewards/margins": 6.475111484527588,
"rewards/rejected": -49.43013000488281,
"step": 284
},
{
"epoch": 0.6446140797285835,
"grad_norm": 177.10199379210934,
"learning_rate": 2.709871862184063e-07,
"logits/chosen": -1.1259461641311646,
"logits/rejected": -1.1398793458938599,
"logps/chosen": -4.510852813720703,
"logps/rejected": -5.170764446258545,
"loss": 2.795,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -45.10852813720703,
"rewards/margins": 6.599118232727051,
"rewards/rejected": -51.7076416015625,
"step": 285
},
{
"epoch": 0.6468758835171049,
"grad_norm": 129.64216363994416,
"learning_rate": 2.679950842326837e-07,
"logits/chosen": -1.1655973196029663,
"logits/rejected": -1.1758906841278076,
"logps/chosen": -4.58130407333374,
"logps/rejected": -5.327126502990723,
"loss": 2.3903,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -45.81304168701172,
"rewards/margins": 7.45822286605835,
"rewards/rejected": -53.271263122558594,
"step": 286
},
{
"epoch": 0.6491376873056263,
"grad_norm": 154.94721926801571,
"learning_rate": 2.6501124845954363e-07,
"logits/chosen": -1.1170488595962524,
"logits/rejected": -1.1338824033737183,
"logps/chosen": -4.708388328552246,
"logps/rejected": -5.483621597290039,
"loss": 2.3088,
"rewards/accuracies": 0.796875,
"rewards/chosen": -47.08388137817383,
"rewards/margins": 7.752330780029297,
"rewards/rejected": -54.836212158203125,
"step": 287
},
{
"epoch": 0.6513994910941476,
"grad_norm": 137.88543689039244,
"learning_rate": 2.62035865748246e-07,
"logits/chosen": -1.102371096611023,
"logits/rejected": -1.1110731363296509,
"logps/chosen": -4.388948917388916,
"logps/rejected": -5.055350303649902,
"loss": 2.7012,
"rewards/accuracies": 0.78125,
"rewards/chosen": -43.88949203491211,
"rewards/margins": 6.664010047912598,
"rewards/rejected": -50.55350112915039,
"step": 288
},
{
"epoch": 0.6536612948826689,
"grad_norm": 185.23078201443082,
"learning_rate": 2.5906912241871554e-07,
"logits/chosen": -1.1915696859359741,
"logits/rejected": -1.192091941833496,
"logps/chosen": -4.680731773376465,
"logps/rejected": -5.389235496520996,
"loss": 2.7165,
"rewards/accuracies": 0.796875,
"rewards/chosen": -46.807315826416016,
"rewards/margins": 7.085034370422363,
"rewards/rejected": -53.892356872558594,
"step": 289
},
{
"epoch": 0.6559230986711903,
"grad_norm": 158.84111035426994,
"learning_rate": 2.561112042498753e-07,
"logits/chosen": -1.0850210189819336,
"logits/rejected": -1.1100788116455078,
"logps/chosen": -4.235768795013428,
"logps/rejected": -4.804391384124756,
"loss": 3.4646,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -42.35768508911133,
"rewards/margins": 5.686228275299072,
"rewards/rejected": -48.04391098022461,
"step": 290
},
{
"epoch": 0.6581849024597116,
"grad_norm": 169.51933352380985,
"learning_rate": 2.5316229646801195e-07,
"logits/chosen": -1.1047579050064087,
"logits/rejected": -1.1353652477264404,
"logps/chosen": -4.523907661437988,
"logps/rejected": -5.176398754119873,
"loss": 2.6109,
"rewards/accuracies": 0.78125,
"rewards/chosen": -45.23907470703125,
"rewards/margins": 6.524911880493164,
"rewards/rejected": -51.76398849487305,
"step": 291
},
{
"epoch": 0.660446706248233,
"grad_norm": 173.65393153021836,
"learning_rate": 2.5022258373517714e-07,
"logits/chosen": -1.190868616104126,
"logits/rejected": -1.1945364475250244,
"logps/chosen": -4.056920051574707,
"logps/rejected": -4.662143707275391,
"loss": 2.5368,
"rewards/accuracies": 0.796875,
"rewards/chosen": -40.56919860839844,
"rewards/margins": 6.052234649658203,
"rewards/rejected": -46.62143325805664,
"step": 292
},
{
"epoch": 0.6627085100367544,
"grad_norm": 142.8629393695611,
"learning_rate": 2.4729225013762474e-07,
"logits/chosen": -1.21455717086792,
"logits/rejected": -1.2325047254562378,
"logps/chosen": -4.186022758483887,
"logps/rejected": -4.799120903015137,
"loss": 3.1403,
"rewards/accuracies": 0.765625,
"rewards/chosen": -41.86022186279297,
"rewards/margins": 6.130983829498291,
"rewards/rejected": -47.99120330810547,
"step": 293
},
{
"epoch": 0.6649703138252756,
"grad_norm": 135.41361240770718,
"learning_rate": 2.4437147917428203e-07,
"logits/chosen": -1.1105806827545166,
"logits/rejected": -1.129175066947937,
"logps/chosen": -3.8398869037628174,
"logps/rejected": -4.574884414672852,
"loss": 2.5425,
"rewards/accuracies": 0.8125,
"rewards/chosen": -38.398868560791016,
"rewards/margins": 7.349975109100342,
"rewards/rejected": -45.74884796142578,
"step": 294
},
{
"epoch": 0.667232117613797,
"grad_norm": 168.11239113479994,
"learning_rate": 2.414604537452595e-07,
"logits/chosen": -1.1045446395874023,
"logits/rejected": -1.1168041229248047,
"logps/chosen": -3.7975552082061768,
"logps/rejected": -4.3420491218566895,
"loss": 2.7938,
"rewards/accuracies": 0.78125,
"rewards/chosen": -37.975547790527344,
"rewards/margins": 5.44494104385376,
"rewards/rejected": -43.420494079589844,
"step": 295
},
{
"epoch": 0.6694939214023183,
"grad_norm": 121.89542687766853,
"learning_rate": 2.385593561403974e-07,
"logits/chosen": -1.1402511596679688,
"logits/rejected": -1.1568533182144165,
"logps/chosen": -3.708047866821289,
"logps/rejected": -4.3688859939575195,
"loss": 2.3406,
"rewards/accuracies": 0.84375,
"rewards/chosen": -37.08047866821289,
"rewards/margins": 6.60837984085083,
"rewards/rejected": -43.68885803222656,
"step": 296
},
{
"epoch": 0.6717557251908397,
"grad_norm": 111.82954104467309,
"learning_rate": 2.3566836802785119e-07,
"logits/chosen": -1.136359453201294,
"logits/rejected": -1.145086407661438,
"logps/chosen": -3.72176456451416,
"logps/rejected": -4.468777656555176,
"loss": 2.2578,
"rewards/accuracies": 0.828125,
"rewards/chosen": -37.2176513671875,
"rewards/margins": 7.470129013061523,
"rewards/rejected": -44.687774658203125,
"step": 297
},
{
"epoch": 0.6740175289793611,
"grad_norm": 114.98069010016692,
"learning_rate": 2.327876704427146e-07,
"logits/chosen": -1.098710060119629,
"logits/rejected": -1.1092326641082764,
"logps/chosen": -3.6272008419036865,
"logps/rejected": -4.156996726989746,
"loss": 2.8945,
"rewards/accuracies": 0.765625,
"rewards/chosen": -36.27201461791992,
"rewards/margins": 5.2979583740234375,
"rewards/rejected": -41.569969177246094,
"step": 298
},
{
"epoch": 0.6762793327678824,
"grad_norm": 181.53426849775465,
"learning_rate": 2.2991744377568358e-07,
"logits/chosen": -1.1007070541381836,
"logits/rejected": -1.1016566753387451,
"logps/chosen": -3.71644926071167,
"logps/rejected": -4.26182746887207,
"loss": 2.773,
"rewards/accuracies": 0.78125,
"rewards/chosen": -37.164493560791016,
"rewards/margins": 5.45378303527832,
"rewards/rejected": -42.61827850341797,
"step": 299
},
{
"epoch": 0.6785411365564037,
"grad_norm": 147.43695074107663,
"learning_rate": 2.270578677617601e-07,
"logits/chosen": -1.1499643325805664,
"logits/rejected": -1.1593643426895142,
"logps/chosen": -3.7320542335510254,
"logps/rejected": -4.402863502502441,
"loss": 3.127,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -37.32053756713867,
"rewards/margins": 6.708094596862793,
"rewards/rejected": -44.02863693237305,
"step": 300
},
{
"epoch": 0.6808029403449251,
"grad_norm": 128.47706273777575,
"learning_rate": 2.242091214689971e-07,
"logits/chosen": -1.119197130203247,
"logits/rejected": -1.154642105102539,
"logps/chosen": -3.8752312660217285,
"logps/rejected": -4.5948710441589355,
"loss": 2.5358,
"rewards/accuracies": 0.828125,
"rewards/chosen": -38.75231170654297,
"rewards/margins": 7.196399211883545,
"rewards/rejected": -45.94871139526367,
"step": 301
},
{
"epoch": 0.6830647441334464,
"grad_norm": 156.39742683192418,
"learning_rate": 2.2137138328728456e-07,
"logits/chosen": -1.1946172714233398,
"logits/rejected": -1.1830028295516968,
"logps/chosen": -3.916076183319092,
"logps/rejected": -4.4446516036987305,
"loss": 2.484,
"rewards/accuracies": 0.828125,
"rewards/chosen": -39.16075897216797,
"rewards/margins": 5.285754680633545,
"rewards/rejected": -44.44651412963867,
"step": 302
},
{
"epoch": 0.6853265479219678,
"grad_norm": 112.74171061703916,
"learning_rate": 2.1854483091717974e-07,
"logits/chosen": -1.1835260391235352,
"logits/rejected": -1.2037560939788818,
"logps/chosen": -3.8349528312683105,
"logps/rejected": -4.525196075439453,
"loss": 1.9858,
"rewards/accuracies": 0.828125,
"rewards/chosen": -38.34953308105469,
"rewards/margins": 6.902431964874268,
"rewards/rejected": -45.25196075439453,
"step": 303
},
{
"epoch": 0.6875883517104892,
"grad_norm": 138.95665180597908,
"learning_rate": 2.1572964135877863e-07,
"logits/chosen": -1.158783197402954,
"logits/rejected": -1.1802828311920166,
"logps/chosen": -4.0684814453125,
"logps/rejected": -4.600527286529541,
"loss": 2.9658,
"rewards/accuracies": 0.75,
"rewards/chosen": -40.684814453125,
"rewards/margins": 5.320462226867676,
"rewards/rejected": -46.00527572631836,
"step": 304
},
{
"epoch": 0.6898501554990104,
"grad_norm": 129.34978151860037,
"learning_rate": 2.1292599090063245e-07,
"logits/chosen": -1.1776607036590576,
"logits/rejected": -1.1886625289916992,
"logps/chosen": -4.103504180908203,
"logps/rejected": -4.913661479949951,
"loss": 2.0852,
"rewards/accuracies": 0.828125,
"rewards/chosen": -41.03504180908203,
"rewards/margins": 8.101574897766113,
"rewards/rejected": -49.136619567871094,
"step": 305
},
{
"epoch": 0.6921119592875318,
"grad_norm": 146.9792159571898,
"learning_rate": 2.1013405510870824e-07,
"logits/chosen": -1.1055035591125488,
"logits/rejected": -1.137697458267212,
"logps/chosen": -4.204385757446289,
"logps/rejected": -4.979820728302002,
"loss": 2.2835,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -42.043861389160156,
"rewards/margins": 7.7543511390686035,
"rewards/rejected": -49.79821014404297,
"step": 306
},
{
"epoch": 0.6943737630760531,
"grad_norm": 131.64245125711065,
"learning_rate": 2.0735400881539494e-07,
"logits/chosen": -1.1195533275604248,
"logits/rejected": -1.1339137554168701,
"logps/chosen": -4.579202651977539,
"logps/rejected": -5.318291664123535,
"loss": 2.2426,
"rewards/accuracies": 0.8515625,
"rewards/chosen": -45.792022705078125,
"rewards/margins": 7.390895366668701,
"rewards/rejected": -53.182918548583984,
"step": 307
},
{
"epoch": 0.6966355668645745,
"grad_norm": 152.58870740768677,
"learning_rate": 2.0458602610855536e-07,
"logits/chosen": -1.2092702388763428,
"logits/rejected": -1.2151172161102295,
"logps/chosen": -4.504486560821533,
"logps/rejected": -5.164217472076416,
"loss": 2.5257,
"rewards/accuracies": 0.796875,
"rewards/chosen": -45.044864654541016,
"rewards/margins": 6.59730863571167,
"rewards/rejected": -51.642173767089844,
"step": 308
},
{
"epoch": 0.6988973706530959,
"grad_norm": 165.81164088683033,
"learning_rate": 2.0183028032062422e-07,
"logits/chosen": -1.1569883823394775,
"logits/rejected": -1.1767610311508179,
"logps/chosen": -4.616701126098633,
"logps/rejected": -5.350695610046387,
"loss": 2.8387,
"rewards/accuracies": 0.765625,
"rewards/chosen": -46.16701126098633,
"rewards/margins": 7.339938640594482,
"rewards/rejected": -53.50695037841797,
"step": 309
},
{
"epoch": 0.7011591744416172,
"grad_norm": 140.2670171009395,
"learning_rate": 1.9908694401775473e-07,
"logits/chosen": -1.186819076538086,
"logits/rejected": -1.191609263420105,
"logps/chosen": -4.841033935546875,
"logps/rejected": -5.632945537567139,
"loss": 2.6282,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -48.41033935546875,
"rewards/margins": 7.919118881225586,
"rewards/rejected": -56.32946014404297,
"step": 310
},
{
"epoch": 0.7034209782301385,
"grad_norm": 177.6691174160963,
"learning_rate": 1.9635618898901196e-07,
"logits/chosen": -1.1530416011810303,
"logits/rejected": -1.1640995740890503,
"logps/chosen": -5.184268951416016,
"logps/rejected": -6.004877090454102,
"loss": 2.7581,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -51.842689514160156,
"rewards/margins": 8.20608139038086,
"rewards/rejected": -60.04876708984375,
"step": 311
},
{
"epoch": 0.7056827820186599,
"grad_norm": 148.70727647820473,
"learning_rate": 1.9363818623561565e-07,
"logits/chosen": -1.1177144050598145,
"logits/rejected": -1.1454871892929077,
"logps/chosen": -4.92880392074585,
"logps/rejected": -5.721675395965576,
"loss": 2.7453,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -49.28804016113281,
"rewards/margins": 7.928720951080322,
"rewards/rejected": -57.21676254272461,
"step": 312
},
{
"epoch": 0.7079445858071812,
"grad_norm": 132.10892999630366,
"learning_rate": 1.9093310596023108e-07,
"logits/chosen": -1.0975664854049683,
"logits/rejected": -1.1099908351898193,
"logps/chosen": -4.774985313415527,
"logps/rejected": -5.697840213775635,
"loss": 2.0977,
"rewards/accuracies": 0.8359375,
"rewards/chosen": -47.749847412109375,
"rewards/margins": 9.228551864624023,
"rewards/rejected": -56.97840118408203,
"step": 313
},
{
"epoch": 0.7102063895957026,
"grad_norm": 143.64663603150058,
"learning_rate": 1.8824111755631274e-07,
"logits/chosen": -1.1822034120559692,
"logits/rejected": -1.1960346698760986,
"logps/chosen": -4.618733882904053,
"logps/rejected": -5.318885803222656,
"loss": 2.4163,
"rewards/accuracies": 0.796875,
"rewards/chosen": -46.187339782714844,
"rewards/margins": 7.0015177726745605,
"rewards/rejected": -53.18885040283203,
"step": 314
},
{
"epoch": 0.712468193384224,
"grad_norm": 209.87932133223504,
"learning_rate": 1.8556238959749457e-07,
"logits/chosen": -1.1291964054107666,
"logits/rejected": -1.1407198905944824,
"logps/chosen": -5.23246955871582,
"logps/rejected": -5.747961044311523,
"loss": 3.631,
"rewards/accuracies": 0.734375,
"rewards/chosen": -52.324703216552734,
"rewards/margins": 5.154911518096924,
"rewards/rejected": -57.4796142578125,
"step": 315
},
{
"epoch": 0.7147299971727452,
"grad_norm": 231.18280960384436,
"learning_rate": 1.8289708982703562e-07,
"logits/chosen": -1.1179161071777344,
"logits/rejected": -1.1054469347000122,
"logps/chosen": -4.88719367980957,
"logps/rejected": -5.661530494689941,
"loss": 3.2037,
"rewards/accuracies": 0.78125,
"rewards/chosen": -48.87194061279297,
"rewards/margins": 7.743368148803711,
"rewards/rejected": -56.61531066894531,
"step": 316
},
{
"epoch": 0.7169918009612666,
"grad_norm": 166.68538687840427,
"learning_rate": 1.802453851473151e-07,
"logits/chosen": -1.1730633974075317,
"logits/rejected": -1.1686493158340454,
"logps/chosen": -4.938043117523193,
"logps/rejected": -5.677487373352051,
"loss": 2.6408,
"rewards/accuracies": 0.78125,
"rewards/chosen": -49.38043212890625,
"rewards/margins": 7.394440650939941,
"rewards/rejected": -56.774871826171875,
"step": 317
},
{
"epoch": 0.719253604749788,
"grad_norm": 150.32828613492202,
"learning_rate": 1.7760744160938093e-07,
"logits/chosen": -1.1172575950622559,
"logits/rejected": -1.127094030380249,
"logps/chosen": -4.671790599822998,
"logps/rejected": -5.620820045471191,
"loss": 2.3808,
"rewards/accuracies": 0.8125,
"rewards/chosen": -46.71790313720703,
"rewards/margins": 9.490296363830566,
"rewards/rejected": -56.20820617675781,
"step": 318
},
{
"epoch": 0.7215154085383093,
"grad_norm": 139.98556421446793,
"learning_rate": 1.7498342440255135e-07,
"logits/chosen": -1.162559151649475,
"logits/rejected": -1.1655352115631104,
"logps/chosen": -4.815513610839844,
"logps/rejected": -5.493526935577393,
"loss": 2.6548,
"rewards/accuracies": 0.78125,
"rewards/chosen": -48.1551399230957,
"rewards/margins": 6.780129909515381,
"rewards/rejected": -54.935272216796875,
"step": 319
},
{
"epoch": 0.7237772123268307,
"grad_norm": 148.0972095206849,
"learning_rate": 1.7237349784407115e-07,
"logits/chosen": -1.1615474224090576,
"logits/rejected": -1.1631369590759277,
"logps/chosen": -4.868170261383057,
"logps/rejected": -5.621500015258789,
"loss": 2.4636,
"rewards/accuracies": 0.796875,
"rewards/chosen": -48.68170166015625,
"rewards/margins": 7.533293724060059,
"rewards/rejected": -56.214996337890625,
"step": 320
},
{
"epoch": 0.726039016115352,
"grad_norm": 140.71046770828303,
"learning_rate": 1.6977782536882178e-07,
"logits/chosen": -1.0695641040802002,
"logits/rejected": -1.0845947265625,
"logps/chosen": -4.2239885330200195,
"logps/rejected": -5.09049654006958,
"loss": 2.4795,
"rewards/accuracies": 0.796875,
"rewards/chosen": -42.23988342285156,
"rewards/margins": 8.665081977844238,
"rewards/rejected": -50.904964447021484,
"step": 321
},
{
"epoch": 0.7283008199038733,
"grad_norm": 141.34556612654956,
"learning_rate": 1.6719656951908708e-07,
"logits/chosen": -1.0963406562805176,
"logits/rejected": -1.115116834640503,
"logps/chosen": -4.027679443359375,
"logps/rejected": -4.841724872589111,
"loss": 1.9544,
"rewards/accuracies": 0.8671875,
"rewards/chosen": -40.276798248291016,
"rewards/margins": 8.140453338623047,
"rewards/rejected": -48.41725158691406,
"step": 322
},
{
"epoch": 0.7305626236923947,
"grad_norm": 129.10288785236676,
"learning_rate": 1.6462989193437453e-07,
"logits/chosen": -1.161424994468689,
"logits/rejected": -1.1720256805419922,
"logps/chosen": -4.578709125518799,
"logps/rejected": -5.312513828277588,
"loss": 2.3471,
"rewards/accuracies": 0.828125,
"rewards/chosen": -45.787086486816406,
"rewards/margins": 7.338046550750732,
"rewards/rejected": -53.1251335144043,
"step": 323
},
{
"epoch": 0.732824427480916,
"grad_norm": 145.12918025265157,
"learning_rate": 1.6207795334129365e-07,
"logits/chosen": -1.1244527101516724,
"logits/rejected": -1.1257734298706055,
"logps/chosen": -4.8168206214904785,
"logps/rejected": -5.5553297996521,
"loss": 2.6839,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -48.16820526123047,
"rewards/margins": 7.385091304779053,
"rewards/rejected": -55.55329895019531,
"step": 324
},
{
"epoch": 0.7350862312694374,
"grad_norm": 196.0171375912546,
"learning_rate": 1.5954091354349121e-07,
"logits/chosen": -1.1482946872711182,
"logits/rejected": -1.1611016988754272,
"logps/chosen": -4.442202568054199,
"logps/rejected": -4.989385604858398,
"loss": 3.3545,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -44.422027587890625,
"rewards/margins": 5.471826553344727,
"rewards/rejected": -49.89385223388672,
"step": 325
},
{
"epoch": 0.7373480350579588,
"grad_norm": 182.35479880389357,
"learning_rate": 1.5701893141164364e-07,
"logits/chosen": -1.1409872770309448,
"logits/rejected": -1.1580214500427246,
"logps/chosen": -4.6923394203186035,
"logps/rejected": -5.407514572143555,
"loss": 3.7705,
"rewards/accuracies": 0.734375,
"rewards/chosen": -46.923397064208984,
"rewards/margins": 7.151753902435303,
"rewards/rejected": -54.07514953613281,
"step": 326
},
{
"epoch": 0.73960983884648,
"grad_norm": 166.07956012082315,
"learning_rate": 1.545121648735093e-07,
"logits/chosen": -1.1439409255981445,
"logits/rejected": -1.1424399614334106,
"logps/chosen": -4.474762916564941,
"logps/rejected": -5.130880355834961,
"loss": 3.0764,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -44.74763107299805,
"rewards/margins": 6.561171531677246,
"rewards/rejected": -51.308799743652344,
"step": 327
},
{
"epoch": 0.7418716426350014,
"grad_norm": 140.38393909032104,
"learning_rate": 1.5202077090403863e-07,
"logits/chosen": -1.158162236213684,
"logits/rejected": -1.1361192464828491,
"logps/chosen": -4.179104328155518,
"logps/rejected": -4.780279636383057,
"loss": 2.8063,
"rewards/accuracies": 0.75,
"rewards/chosen": -41.791046142578125,
"rewards/margins": 6.011752128601074,
"rewards/rejected": -47.802799224853516,
"step": 328
},
{
"epoch": 0.7441334464235227,
"grad_norm": 149.78510692087144,
"learning_rate": 1.495449055155443e-07,
"logits/chosen": -1.148727536201477,
"logits/rejected": -1.16141676902771,
"logps/chosen": -4.255738258361816,
"logps/rejected": -5.054032325744629,
"loss": 2.3771,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -42.5573844909668,
"rewards/margins": 7.9829421043396,
"rewards/rejected": -50.54032516479492,
"step": 329
},
{
"epoch": 0.7463952502120441,
"grad_norm": 144.87586841648493,
"learning_rate": 1.4708472374793112e-07,
"logits/chosen": -1.0922203063964844,
"logits/rejected": -1.1075836420059204,
"logps/chosen": -4.270491600036621,
"logps/rejected": -4.7941460609436035,
"loss": 3.2525,
"rewards/accuracies": 0.734375,
"rewards/chosen": -42.704917907714844,
"rewards/margins": 5.236541271209717,
"rewards/rejected": -47.941463470458984,
"step": 330
},
{
"epoch": 0.7486570540005655,
"grad_norm": 151.1541825467292,
"learning_rate": 1.4464037965898878e-07,
"logits/chosen": -1.087780475616455,
"logits/rejected": -1.094712495803833,
"logps/chosen": -4.091693878173828,
"logps/rejected": -4.723832607269287,
"loss": 2.5101,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -40.91693878173828,
"rewards/margins": 6.321380138397217,
"rewards/rejected": -47.238319396972656,
"step": 331
},
{
"epoch": 0.7509188577890868,
"grad_norm": 150.53746416178075,
"learning_rate": 1.4221202631474282e-07,
"logits/chosen": -1.074486494064331,
"logits/rejected": -1.0786263942718506,
"logps/chosen": -4.040347099304199,
"logps/rejected": -4.647793769836426,
"loss": 2.7484,
"rewards/accuracies": 0.765625,
"rewards/chosen": -40.403465270996094,
"rewards/margins": 6.074465751647949,
"rewards/rejected": -46.47793197631836,
"step": 332
},
{
"epoch": 0.7531806615776081,
"grad_norm": 140.80934854690494,
"learning_rate": 1.3979981577987113e-07,
"logits/chosen": -1.1275672912597656,
"logits/rejected": -1.1185460090637207,
"logps/chosen": -3.7870967388153076,
"logps/rejected": -4.483286380767822,
"loss": 2.4486,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -37.87096405029297,
"rewards/margins": 6.961895942687988,
"rewards/rejected": -44.832862854003906,
"step": 333
},
{
"epoch": 0.7554424653661295,
"grad_norm": 136.75785695589838,
"learning_rate": 1.374038991081807e-07,
"logits/chosen": -1.1587432622909546,
"logits/rejected": -1.1672437191009521,
"logps/chosen": -4.0366315841674805,
"logps/rejected": -4.591939926147461,
"loss": 2.7988,
"rewards/accuracies": 0.78125,
"rewards/chosen": -40.36631774902344,
"rewards/margins": 5.55308723449707,
"rewards/rejected": -45.91940689086914,
"step": 334
},
{
"epoch": 0.7577042691546508,
"grad_norm": 133.61424676453677,
"learning_rate": 1.3502442633314882e-07,
"logits/chosen": -1.1029198169708252,
"logits/rejected": -1.1053695678710938,
"logps/chosen": -3.5087780952453613,
"logps/rejected": -4.049241065979004,
"loss": 2.7249,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -35.0877799987793,
"rewards/margins": 5.404629707336426,
"rewards/rejected": -40.49240493774414,
"step": 335
},
{
"epoch": 0.7599660729431722,
"grad_norm": 130.2580688330827,
"learning_rate": 1.3266154645852815e-07,
"logits/chosen": -1.1168211698532104,
"logits/rejected": -1.1052803993225098,
"logps/chosen": -3.832916259765625,
"logps/rejected": -4.46284294128418,
"loss": 2.3583,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -38.329158782958984,
"rewards/margins": 6.2992682456970215,
"rewards/rejected": -44.62842559814453,
"step": 336
},
{
"epoch": 0.7622278767316936,
"grad_norm": 116.1303850461595,
"learning_rate": 1.303154074490152e-07,
"logits/chosen": -1.146256685256958,
"logits/rejected": -1.1414530277252197,
"logps/chosen": -3.628737688064575,
"logps/rejected": -4.385705947875977,
"loss": 2.1664,
"rewards/accuracies": 0.8125,
"rewards/chosen": -36.287376403808594,
"rewards/margins": 7.569684028625488,
"rewards/rejected": -43.8570556640625,
"step": 337
},
{
"epoch": 0.7644896805202148,
"grad_norm": 129.71570053294832,
"learning_rate": 1.2798615622098616e-07,
"logits/chosen": -1.1601388454437256,
"logits/rejected": -1.1707144975662231,
"logps/chosen": -3.6512036323547363,
"logps/rejected": -4.333779335021973,
"loss": 3.0042,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -36.51203536987305,
"rewards/margins": 6.825753688812256,
"rewards/rejected": -43.337791442871094,
"step": 338
},
{
"epoch": 0.7667514843087362,
"grad_norm": 145.5066563342706,
"learning_rate": 1.2567393863329523e-07,
"logits/chosen": -1.1308033466339111,
"logits/rejected": -1.1647956371307373,
"logps/chosen": -3.8886606693267822,
"logps/rejected": -4.585987567901611,
"loss": 2.6023,
"rewards/accuracies": 0.796875,
"rewards/chosen": -38.88660430908203,
"rewards/margins": 6.973270416259766,
"rewards/rejected": -45.8598747253418,
"step": 339
},
{
"epoch": 0.7690132880972576,
"grad_norm": 131.06463836482803,
"learning_rate": 1.233788994781423e-07,
"logits/chosen": -1.1664081811904907,
"logits/rejected": -1.1904940605163574,
"logps/chosen": -3.713836193084717,
"logps/rejected": -4.329701900482178,
"loss": 2.6029,
"rewards/accuracies": 0.8125,
"rewards/chosen": -37.138362884521484,
"rewards/margins": 6.158655166625977,
"rewards/rejected": -43.29701614379883,
"step": 340
},
{
"epoch": 0.7712750918857789,
"grad_norm": 119.23231418301683,
"learning_rate": 1.2110118247200468e-07,
"logits/chosen": -1.1798597574234009,
"logits/rejected": -1.1897577047348022,
"logps/chosen": -3.7017178535461426,
"logps/rejected": -4.307440280914307,
"loss": 2.2494,
"rewards/accuracies": 0.78125,
"rewards/chosen": -37.017181396484375,
"rewards/margins": 6.057225704193115,
"rewards/rejected": -43.07440185546875,
"step": 341
},
{
"epoch": 0.7735368956743003,
"grad_norm": 140.0321300554404,
"learning_rate": 1.1884093024663933e-07,
"logits/chosen": -1.1550508737564087,
"logits/rejected": -1.1637250185012817,
"logps/chosen": -3.4708728790283203,
"logps/rejected": -4.254858493804932,
"loss": 2.5564,
"rewards/accuracies": 0.796875,
"rewards/chosen": -34.70873260498047,
"rewards/margins": 7.839854717254639,
"rewards/rejected": -42.548583984375,
"step": 342
},
{
"epoch": 0.7757986994628217,
"grad_norm": 135.8564211412602,
"learning_rate": 1.1659828434014886e-07,
"logits/chosen": -1.1468806266784668,
"logits/rejected": -1.130220890045166,
"logps/chosen": -3.576955795288086,
"logps/rejected": -4.336724758148193,
"loss": 2.3826,
"rewards/accuracies": 0.828125,
"rewards/chosen": -35.769561767578125,
"rewards/margins": 7.597687244415283,
"rewards/rejected": -43.36724853515625,
"step": 343
},
{
"epoch": 0.7780605032513429,
"grad_norm": 161.11121894922448,
"learning_rate": 1.143733851881203e-07,
"logits/chosen": -1.2072776556015015,
"logits/rejected": -1.2078864574432373,
"logps/chosen": -3.899902820587158,
"logps/rejected": -4.685684680938721,
"loss": 2.4913,
"rewards/accuracies": 0.8125,
"rewards/chosen": -38.999027252197266,
"rewards/margins": 7.857818603515625,
"rewards/rejected": -46.856842041015625,
"step": 344
},
{
"epoch": 0.7803223070398643,
"grad_norm": 125.43996792686391,
"learning_rate": 1.1216637211483005e-07,
"logits/chosen": -1.148529291152954,
"logits/rejected": -1.1545379161834717,
"logps/chosen": -3.8425681591033936,
"logps/rejected": -4.446829795837402,
"loss": 2.4109,
"rewards/accuracies": 0.8125,
"rewards/chosen": -38.425682067871094,
"rewards/margins": 6.0426130294799805,
"rewards/rejected": -44.468292236328125,
"step": 345
},
{
"epoch": 0.7825841108283856,
"grad_norm": 134.62772319760776,
"learning_rate": 1.0997738332451936e-07,
"logits/chosen": -1.1377711296081543,
"logits/rejected": -1.1397600173950195,
"logps/chosen": -3.997607469558716,
"logps/rejected": -4.604636192321777,
"loss": 2.5131,
"rewards/accuracies": 0.8125,
"rewards/chosen": -39.976070404052734,
"rewards/margins": 6.070294380187988,
"rewards/rejected": -46.04636764526367,
"step": 346
},
{
"epoch": 0.784845914616907,
"grad_norm": 134.84346461104718,
"learning_rate": 1.0780655589274031e-07,
"logits/chosen": -1.2021856307983398,
"logits/rejected": -1.1845647096633911,
"logps/chosen": -3.890503406524658,
"logps/rejected": -4.5709123611450195,
"loss": 2.1196,
"rewards/accuracies": 0.875,
"rewards/chosen": -38.905029296875,
"rewards/margins": 6.8040876388549805,
"rewards/rejected": -45.70912170410156,
"step": 347
},
{
"epoch": 0.7871077184054284,
"grad_norm": 139.41519511521523,
"learning_rate": 1.056540257577712e-07,
"logits/chosen": -1.1279267072677612,
"logits/rejected": -1.1376078128814697,
"logps/chosen": -4.36607551574707,
"logps/rejected": -5.16333532333374,
"loss": 2.0742,
"rewards/accuracies": 0.8515625,
"rewards/chosen": -43.66075897216797,
"rewards/margins": 7.972592830657959,
"rewards/rejected": -51.63335037231445,
"step": 348
},
{
"epoch": 0.7893695221939496,
"grad_norm": 132.61876254999865,
"learning_rate": 1.0351992771210554e-07,
"logits/chosen": -1.1291594505310059,
"logits/rejected": -1.152091383934021,
"logps/chosen": -3.9938082695007324,
"logps/rejected": -4.693463325500488,
"loss": 2.5644,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -39.93808364868164,
"rewards/margins": 6.996548175811768,
"rewards/rejected": -46.93463134765625,
"step": 349
},
{
"epoch": 0.791631325982471,
"grad_norm": 153.1527234952499,
"learning_rate": 1.0140439539400953e-07,
"logits/chosen": -1.1234492063522339,
"logits/rejected": -1.135466456413269,
"logps/chosen": -3.981572389602661,
"logps/rejected": -4.654304027557373,
"loss": 2.887,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -39.81572341918945,
"rewards/margins": 6.727319240570068,
"rewards/rejected": -46.54304504394531,
"step": 350
},
{
"epoch": 0.7938931297709924,
"grad_norm": 118.9564494853148,
"learning_rate": 9.930756127915488e-08,
"logits/chosen": -1.1307318210601807,
"logits/rejected": -1.1451183557510376,
"logps/chosen": -3.9668586254119873,
"logps/rejected": -4.713019847869873,
"loss": 2.0547,
"rewards/accuracies": 0.859375,
"rewards/chosen": -39.66858673095703,
"rewards/margins": 7.461606979370117,
"rewards/rejected": -47.13019561767578,
"step": 351
},
{
"epoch": 0.7961549335595137,
"grad_norm": 179.1168561225876,
"learning_rate": 9.722955667232242e-08,
"logits/chosen": -1.175784945487976,
"logits/rejected": -1.1912992000579834,
"logps/chosen": -4.297727584838867,
"logps/rejected": -4.875129699707031,
"loss": 3.1093,
"rewards/accuracies": 0.71875,
"rewards/chosen": -42.977272033691406,
"rewards/margins": 5.774031162261963,
"rewards/rejected": -48.751304626464844,
"step": 352
},
{
"epoch": 0.7984167373480351,
"grad_norm": 149.9462238898871,
"learning_rate": 9.517051169918016e-08,
"logits/chosen": -1.1609903573989868,
"logits/rejected": -1.1678093671798706,
"logps/chosen": -4.031696796417236,
"logps/rejected": -4.656764984130859,
"loss": 2.9365,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -40.31697082519531,
"rewards/margins": 6.25068473815918,
"rewards/rejected": -46.56765365600586,
"step": 353
},
{
"epoch": 0.8006785411365565,
"grad_norm": 144.2370757323942,
"learning_rate": 9.313055529813412e-08,
"logits/chosen": -1.0940794944763184,
"logits/rejected": -1.1267677545547485,
"logps/chosen": -4.094084739685059,
"logps/rejected": -4.81442403793335,
"loss": 2.1865,
"rewards/accuracies": 0.8125,
"rewards/chosen": -40.94084548950195,
"rewards/margins": 7.203390121459961,
"rewards/rejected": -48.14424133300781,
"step": 354
},
{
"epoch": 0.8029403449250777,
"grad_norm": 150.07773110636595,
"learning_rate": 9.110981521225532e-08,
"logits/chosen": -1.1485052108764648,
"logits/rejected": -1.164825439453125,
"logps/chosen": -4.242175579071045,
"logps/rejected": -4.949873924255371,
"loss": 2.6261,
"rewards/accuracies": 0.78125,
"rewards/chosen": -42.421756744384766,
"rewards/margins": 7.07698392868042,
"rewards/rejected": -49.498741149902344,
"step": 355
},
{
"epoch": 0.8052021487135991,
"grad_norm": 158.4977836908745,
"learning_rate": 8.910841798127884e-08,
"logits/chosen": -1.1083470582962036,
"logits/rejected": -1.1305320262908936,
"logps/chosen": -4.242856025695801,
"logps/rejected": -4.95394229888916,
"loss": 2.2088,
"rewards/accuracies": 0.8359375,
"rewards/chosen": -42.428558349609375,
"rewards/margins": 7.110870361328125,
"rewards/rejected": -49.53942108154297,
"step": 356
},
{
"epoch": 0.8074639525021204,
"grad_norm": 142.3023711469036,
"learning_rate": 8.712648893368139e-08,
"logits/chosen": -1.1344428062438965,
"logits/rejected": -1.1723387241363525,
"logps/chosen": -4.20165491104126,
"logps/rejected": -5.037359237670898,
"loss": 2.3976,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -42.01654815673828,
"rewards/margins": 8.357048034667969,
"rewards/rejected": -50.37359619140625,
"step": 357
},
{
"epoch": 0.8097257562906418,
"grad_norm": 133.95168826861777,
"learning_rate": 8.516415217883186e-08,
"logits/chosen": -1.1243913173675537,
"logits/rejected": -1.1292424201965332,
"logps/chosen": -4.095825672149658,
"logps/rejected": -4.912569999694824,
"loss": 2.2118,
"rewards/accuracies": 0.84375,
"rewards/chosen": -40.958255767822266,
"rewards/margins": 8.167447090148926,
"rewards/rejected": -49.125701904296875,
"step": 358
},
{
"epoch": 0.8119875600791632,
"grad_norm": 185.9215975376776,
"learning_rate": 8.32215305992209e-08,
"logits/chosen": -1.1615304946899414,
"logits/rejected": -1.1655793190002441,
"logps/chosen": -4.049884796142578,
"logps/rejected": -4.723489761352539,
"loss": 2.7838,
"rewards/accuracies": 0.75,
"rewards/chosen": -40.49885177612305,
"rewards/margins": 6.736046314239502,
"rewards/rejected": -47.23489761352539,
"step": 359
},
{
"epoch": 0.8142493638676844,
"grad_norm": 125.05686534430966,
"learning_rate": 8.129874584276448e-08,
"logits/chosen": -1.1320921182632446,
"logits/rejected": -1.1268113851547241,
"logps/chosen": -4.095798969268799,
"logps/rejected": -4.970909118652344,
"loss": 1.8914,
"rewards/accuracies": 0.84375,
"rewards/chosen": -40.95799255371094,
"rewards/margins": 8.751094818115234,
"rewards/rejected": -49.70909118652344,
"step": 360
},
{
"epoch": 0.8165111676562058,
"grad_norm": 162.2302808539331,
"learning_rate": 7.939591831518746e-08,
"logits/chosen": -1.1332316398620605,
"logits/rejected": -1.1608184576034546,
"logps/chosen": -4.223212242126465,
"logps/rejected": -4.8841400146484375,
"loss": 2.1666,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -42.23212432861328,
"rewards/margins": 6.609279155731201,
"rewards/rejected": -48.84140396118164,
"step": 361
},
{
"epoch": 0.8187729714447272,
"grad_norm": 152.02329076128746,
"learning_rate": 7.751316717248304e-08,
"logits/chosen": -1.1371338367462158,
"logits/rejected": -1.144465684890747,
"logps/chosen": -4.625061988830566,
"logps/rejected": -5.566699028015137,
"loss": 2.3702,
"rewards/accuracies": 0.796875,
"rewards/chosen": -46.25061798095703,
"rewards/margins": 9.416373252868652,
"rewards/rejected": -55.666996002197266,
"step": 362
},
{
"epoch": 0.8210347752332485,
"grad_norm": 182.21151399037953,
"learning_rate": 7.565061031345142e-08,
"logits/chosen": -1.118225336074829,
"logits/rejected": -1.1225550174713135,
"logps/chosen": -4.930262088775635,
"logps/rejected": -5.776245594024658,
"loss": 2.066,
"rewards/accuracies": 0.8671875,
"rewards/chosen": -49.30262756347656,
"rewards/margins": 8.45982837677002,
"rewards/rejected": -57.76245880126953,
"step": 363
},
{
"epoch": 0.8232965790217699,
"grad_norm": 182.6967377285154,
"learning_rate": 7.380836437231686e-08,
"logits/chosen": -1.1148128509521484,
"logits/rejected": -1.118404746055603,
"logps/chosen": -4.367569446563721,
"logps/rejected": -5.1442484855651855,
"loss": 2.4535,
"rewards/accuracies": 0.796875,
"rewards/chosen": -43.675697326660156,
"rewards/margins": 7.766792297363281,
"rewards/rejected": -51.44248962402344,
"step": 364
},
{
"epoch": 0.8255583828102913,
"grad_norm": 146.88066372474253,
"learning_rate": 7.198654471142371e-08,
"logits/chosen": -1.1199636459350586,
"logits/rejected": -1.1298437118530273,
"logps/chosen": -4.415999412536621,
"logps/rejected": -5.402173042297363,
"loss": 1.7824,
"rewards/accuracies": 0.84375,
"rewards/chosen": -44.15999984741211,
"rewards/margins": 9.861732482910156,
"rewards/rejected": -54.021728515625,
"step": 365
},
{
"epoch": 0.8278201865988125,
"grad_norm": 162.16963854973068,
"learning_rate": 7.01852654140132e-08,
"logits/chosen": -1.1699155569076538,
"logits/rejected": -1.1726248264312744,
"logps/chosen": -4.93940544128418,
"logps/rejected": -5.82332706451416,
"loss": 2.2319,
"rewards/accuracies": 0.8359375,
"rewards/chosen": -49.39405059814453,
"rewards/margins": 8.839221954345703,
"rewards/rejected": -58.2332763671875,
"step": 366
},
{
"epoch": 0.8300819903873339,
"grad_norm": 162.38403071496873,
"learning_rate": 6.840463927707833e-08,
"logits/chosen": -1.1146910190582275,
"logits/rejected": -1.1289540529251099,
"logps/chosen": -4.881972312927246,
"logps/rejected": -5.560351848602295,
"loss": 2.7204,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -48.81972885131836,
"rewards/margins": 6.783794403076172,
"rewards/rejected": -55.603515625,
"step": 367
},
{
"epoch": 0.8323437941758552,
"grad_norm": 155.33335228556993,
"learning_rate": 6.664477780430138e-08,
"logits/chosen": -1.108270287513733,
"logits/rejected": -1.1260305643081665,
"logps/chosen": -4.770946979522705,
"logps/rejected": -5.368707656860352,
"loss": 2.888,
"rewards/accuracies": 0.78125,
"rewards/chosen": -47.70947265625,
"rewards/margins": 5.977604866027832,
"rewards/rejected": -53.68707275390625,
"step": 368
},
{
"epoch": 0.8346055979643766,
"grad_norm": 172.0274452536749,
"learning_rate": 6.49057911990711e-08,
"logits/chosen": -1.0896248817443848,
"logits/rejected": -1.0903116464614868,
"logps/chosen": -4.668241500854492,
"logps/rejected": -5.342168807983398,
"loss": 2.8429,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -46.68241500854492,
"rewards/margins": 6.739270210266113,
"rewards/rejected": -53.421688079833984,
"step": 369
},
{
"epoch": 0.836867401752898,
"grad_norm": 147.87851842328394,
"learning_rate": 6.318778835758189e-08,
"logits/chosen": -1.1355427503585815,
"logits/rejected": -1.1373378038406372,
"logps/chosen": -4.819726943969727,
"logps/rejected": -5.56911563873291,
"loss": 1.9771,
"rewards/accuracies": 0.828125,
"rewards/chosen": -48.19727325439453,
"rewards/margins": 7.493888854980469,
"rewards/rejected": -55.69115447998047,
"step": 370
},
{
"epoch": 0.8391292055414192,
"grad_norm": 193.1168590090841,
"learning_rate": 6.149087686201433e-08,
"logits/chosen": -1.1564326286315918,
"logits/rejected": -1.163049578666687,
"logps/chosen": -4.552361011505127,
"logps/rejected": -5.267241954803467,
"loss": 3.2197,
"rewards/accuracies": 0.78125,
"rewards/chosen": -45.52361297607422,
"rewards/margins": 7.148810863494873,
"rewards/rejected": -52.67242431640625,
"step": 371
},
{
"epoch": 0.8413910093299406,
"grad_norm": 158.69146700001298,
"learning_rate": 5.98151629737988e-08,
"logits/chosen": -1.1418228149414062,
"logits/rejected": -1.139822006225586,
"logps/chosen": -4.647780418395996,
"logps/rejected": -5.4461588859558105,
"loss": 2.7971,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -46.47779846191406,
"rewards/margins": 7.983788967132568,
"rewards/rejected": -54.46159362792969,
"step": 372
},
{
"epoch": 0.843652813118462,
"grad_norm": 127.53521312375436,
"learning_rate": 5.816075162696097e-08,
"logits/chosen": -1.1655972003936768,
"logits/rejected": -1.1918379068374634,
"logps/chosen": -4.663849830627441,
"logps/rejected": -5.401423931121826,
"loss": 1.9571,
"rewards/accuracies": 0.84375,
"rewards/chosen": -46.63849639892578,
"rewards/margins": 7.375744819641113,
"rewards/rejected": -54.01424026489258,
"step": 373
},
{
"epoch": 0.8459146169069833,
"grad_norm": 142.81966600033684,
"learning_rate": 5.6527746421551046e-08,
"logits/chosen": -1.106029987335205,
"logits/rejected": -1.091361403465271,
"logps/chosen": -4.538349628448486,
"logps/rejected": -5.323310375213623,
"loss": 2.36,
"rewards/accuracies": 0.8125,
"rewards/chosen": -45.38349533081055,
"rewards/margins": 7.849606513977051,
"rewards/rejected": -53.23310470581055,
"step": 374
},
{
"epoch": 0.8481764206955047,
"grad_norm": 157.71016320454683,
"learning_rate": 5.4916249617156064e-08,
"logits/chosen": -1.1139298677444458,
"logits/rejected": -1.1261509656906128,
"logps/chosen": -4.135310649871826,
"logps/rejected": -4.878140926361084,
"loss": 2.345,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -41.353111267089844,
"rewards/margins": 7.42829704284668,
"rewards/rejected": -48.781402587890625,
"step": 375
},
{
"epoch": 0.8504382244840261,
"grad_norm": 132.44635331603325,
"learning_rate": 5.332636212649646e-08,
"logits/chosen": -1.1287761926651,
"logits/rejected": -1.1302844285964966,
"logps/chosen": -4.467879295349121,
"logps/rejected": -5.283463954925537,
"loss": 2.1031,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -44.678794860839844,
"rewards/margins": 8.155845642089844,
"rewards/rejected": -52.83464050292969,
"step": 376
},
{
"epoch": 0.8527000282725473,
"grad_norm": 150.5335514700392,
"learning_rate": 5.17581835091069e-08,
"logits/chosen": -1.1303383111953735,
"logits/rejected": -1.1533814668655396,
"logps/chosen": -4.488173961639404,
"logps/rejected": -5.281373977661133,
"loss": 2.7408,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -44.881736755371094,
"rewards/margins": 7.932003498077393,
"rewards/rejected": -52.813743591308594,
"step": 377
},
{
"epoch": 0.8549618320610687,
"grad_norm": 150.25699456017983,
"learning_rate": 5.02118119651016e-08,
"logits/chosen": -1.1570930480957031,
"logits/rejected": -1.159618616104126,
"logps/chosen": -4.484018802642822,
"logps/rejected": -5.202611923217773,
"loss": 2.5301,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -44.840187072753906,
"rewards/margins": 7.1859331130981445,
"rewards/rejected": -52.026119232177734,
"step": 378
},
{
"epoch": 0.85722363584959,
"grad_norm": 162.1310863378,
"learning_rate": 4.868734432902526e-08,
"logits/chosen": -1.2017693519592285,
"logits/rejected": -1.202324390411377,
"logps/chosen": -4.508758068084717,
"logps/rejected": -5.381972312927246,
"loss": 2.9957,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -45.087581634521484,
"rewards/margins": 8.732136726379395,
"rewards/rejected": -53.8197135925293,
"step": 379
},
{
"epoch": 0.8594854396381114,
"grad_norm": 171.7256004142805,
"learning_rate": 4.7184876063789134e-08,
"logits/chosen": -1.14678955078125,
"logits/rejected": -1.1611804962158203,
"logps/chosen": -3.9182119369506836,
"logps/rejected": -4.665888786315918,
"loss": 2.5436,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -39.1821174621582,
"rewards/margins": 7.4767746925354,
"rewards/rejected": -46.65888977050781,
"step": 380
},
{
"epoch": 0.8617472434266328,
"grad_norm": 126.74508418371084,
"learning_rate": 4.570450125469314e-08,
"logits/chosen": -1.1241579055786133,
"logits/rejected": -1.1344544887542725,
"logps/chosen": -4.487060546875,
"logps/rejected": -5.410554885864258,
"loss": 2.0142,
"rewards/accuracies": 0.84375,
"rewards/chosen": -44.870601654052734,
"rewards/margins": 9.234944343566895,
"rewards/rejected": -54.10554504394531,
"step": 381
},
{
"epoch": 0.864009047215154,
"grad_norm": 150.5877064450684,
"learning_rate": 4.424631260353378e-08,
"logits/chosen": -1.1614850759506226,
"logits/rejected": -1.1724066734313965,
"logps/chosen": -4.289636611938477,
"logps/rejected": -4.987452983856201,
"loss": 2.6354,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -42.8963623046875,
"rewards/margins": 6.97816801071167,
"rewards/rejected": -49.874534606933594,
"step": 382
},
{
"epoch": 0.8662708510036754,
"grad_norm": 135.3394329513268,
"learning_rate": 4.281040142280008e-08,
"logits/chosen": -1.1927827596664429,
"logits/rejected": -1.2009069919586182,
"logps/chosen": -4.040445804595947,
"logps/rejected": -4.89124870300293,
"loss": 1.918,
"rewards/accuracies": 0.8515625,
"rewards/chosen": -40.40446090698242,
"rewards/margins": 8.508023262023926,
"rewards/rejected": -48.91248321533203,
"step": 383
},
{
"epoch": 0.8685326547921968,
"grad_norm": 197.32088580990666,
"learning_rate": 4.1396857629954286e-08,
"logits/chosen": -1.152172565460205,
"logits/rejected": -1.1629152297973633,
"logps/chosen": -4.775947093963623,
"logps/rejected": -5.507784366607666,
"loss": 2.7031,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -47.75947189331055,
"rewards/margins": 7.318375110626221,
"rewards/rejected": -55.07784652709961,
"step": 384
},
{
"epoch": 0.8707944585807181,
"grad_norm": 147.48372966461358,
"learning_rate": 4.000576974180232e-08,
"logits/chosen": -1.1277602910995483,
"logits/rejected": -1.1425299644470215,
"logps/chosen": -4.28041410446167,
"logps/rejected": -4.912994861602783,
"loss": 3.3009,
"rewards/accuracies": 0.78125,
"rewards/chosen": -42.80414581298828,
"rewards/margins": 6.325807094573975,
"rewards/rejected": -49.12995147705078,
"step": 385
},
{
"epoch": 0.8730562623692395,
"grad_norm": 216.40246174451113,
"learning_rate": 3.8637224868950066e-08,
"logits/chosen": -1.1443856954574585,
"logits/rejected": -1.158379077911377,
"logps/chosen": -4.138412952423096,
"logps/rejected": -4.75631046295166,
"loss": 2.8595,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -41.38412857055664,
"rewards/margins": 6.178977012634277,
"rewards/rejected": -47.56310272216797,
"step": 386
},
{
"epoch": 0.8753180661577609,
"grad_norm": 145.70726367236028,
"learning_rate": 3.729130871034885e-08,
"logits/chosen": -1.1509549617767334,
"logits/rejected": -1.1646292209625244,
"logps/chosen": -4.321000099182129,
"logps/rejected": -5.116883277893066,
"loss": 2.1179,
"rewards/accuracies": 0.828125,
"rewards/chosen": -43.21000671386719,
"rewards/margins": 7.958827972412109,
"rewards/rejected": -51.1688346862793,
"step": 387
},
{
"epoch": 0.8775798699462821,
"grad_norm": 163.83720976168166,
"learning_rate": 3.596810554792888e-08,
"logits/chosen": -1.146296501159668,
"logits/rejected": -1.171584963798523,
"logps/chosen": -4.1524882316589355,
"logps/rejected": -4.907288074493408,
"loss": 2.833,
"rewards/accuracies": 0.796875,
"rewards/chosen": -41.52488327026367,
"rewards/margins": 7.547997951507568,
"rewards/rejected": -49.07288360595703,
"step": 388
},
{
"epoch": 0.8798416737348035,
"grad_norm": 171.451051766312,
"learning_rate": 3.466769824132116e-08,
"logits/chosen": -1.1617058515548706,
"logits/rejected": -1.151517629623413,
"logps/chosen": -4.187567234039307,
"logps/rejected": -4.928333282470703,
"loss": 2.0192,
"rewards/accuracies": 0.8828125,
"rewards/chosen": -41.87567138671875,
"rewards/margins": 7.407662391662598,
"rewards/rejected": -49.28333282470703,
"step": 389
},
{
"epoch": 0.8821034775233249,
"grad_norm": 149.50679016885917,
"learning_rate": 3.339016822266925e-08,
"logits/chosen": -1.1291420459747314,
"logits/rejected": -1.154278039932251,
"logps/chosen": -4.321372032165527,
"logps/rejected": -5.223550319671631,
"loss": 1.7499,
"rewards/accuracies": 0.8671875,
"rewards/chosen": -43.213722229003906,
"rewards/margins": 9.0217866897583,
"rewards/rejected": -52.235511779785156,
"step": 390
},
{
"epoch": 0.8843652813118462,
"grad_norm": 138.97884749980054,
"learning_rate": 3.213559549152958e-08,
"logits/chosen": -1.1683623790740967,
"logits/rejected": -1.1750186681747437,
"logps/chosen": -4.231035232543945,
"logps/rejected": -5.0254597663879395,
"loss": 2.4769,
"rewards/accuracies": 0.796875,
"rewards/chosen": -42.31035614013672,
"rewards/margins": 7.944244861602783,
"rewards/rejected": -50.25459671020508,
"step": 391
},
{
"epoch": 0.8866270851003676,
"grad_norm": 149.13743982029294,
"learning_rate": 3.090405860986203e-08,
"logits/chosen": -1.1904428005218506,
"logits/rejected": -1.225684404373169,
"logps/chosen": -4.472517967224121,
"logps/rejected": -5.399662494659424,
"loss": 2.1715,
"rewards/accuracies": 0.84375,
"rewards/chosen": -44.72518539428711,
"rewards/margins": 9.271440505981445,
"rewards/rejected": -53.99662399291992,
"step": 392
},
{
"epoch": 0.8888888888888888,
"grad_norm": 137.86076074457236,
"learning_rate": 2.9695634697110315e-08,
"logits/chosen": -1.1298010349273682,
"logits/rejected": -1.1351279020309448,
"logps/chosen": -4.120969772338867,
"logps/rejected": -4.982769966125488,
"loss": 2.6108,
"rewards/accuracies": 0.8359375,
"rewards/chosen": -41.209693908691406,
"rewards/margins": 8.618008613586426,
"rewards/rejected": -49.82770538330078,
"step": 393
},
{
"epoch": 0.8911506926774102,
"grad_norm": 205.78409246729305,
"learning_rate": 2.8510399425372766e-08,
"logits/chosen": -1.169985294342041,
"logits/rejected": -1.1676735877990723,
"logps/chosen": -4.293628692626953,
"logps/rejected": -4.940824508666992,
"loss": 2.4817,
"rewards/accuracies": 0.8125,
"rewards/chosen": -42.93628692626953,
"rewards/margins": 6.471960067749023,
"rewards/rejected": -49.40824890136719,
"step": 394
},
{
"epoch": 0.8934124964659316,
"grad_norm": 156.66716455473116,
"learning_rate": 2.734842701466329e-08,
"logits/chosen": -1.1552950143814087,
"logits/rejected": -1.1497843265533447,
"logps/chosen": -4.565382957458496,
"logps/rejected": -5.28816032409668,
"loss": 2.3395,
"rewards/accuracies": 0.8125,
"rewards/chosen": -45.653831481933594,
"rewards/margins": 7.227770805358887,
"rewards/rejected": -52.88159942626953,
"step": 395
},
{
"epoch": 0.8956743002544529,
"grad_norm": 123.71662611686183,
"learning_rate": 2.6209790228264438e-08,
"logits/chosen": -1.159185528755188,
"logits/rejected": -1.1655610799789429,
"logps/chosen": -3.889777183532715,
"logps/rejected": -4.674656867980957,
"loss": 2.1273,
"rewards/accuracies": 0.84375,
"rewards/chosen": -38.89777755737305,
"rewards/margins": 7.848790168762207,
"rewards/rejected": -46.7465705871582,
"step": 396
},
{
"epoch": 0.8979361040429743,
"grad_norm": 147.95551170482682,
"learning_rate": 2.5094560368170305e-08,
"logits/chosen": -1.1451451778411865,
"logits/rejected": -1.1695401668548584,
"logps/chosen": -4.563682556152344,
"logps/rejected": -5.299858570098877,
"loss": 2.1221,
"rewards/accuracies": 0.8125,
"rewards/chosen": -45.63682556152344,
"rewards/margins": 7.361759185791016,
"rewards/rejected": -52.99858093261719,
"step": 397
},
{
"epoch": 0.9001979078314957,
"grad_norm": 154.64080727041423,
"learning_rate": 2.4002807270621893e-08,
"logits/chosen": -1.1961959600448608,
"logits/rejected": -1.1858330965042114,
"logps/chosen": -4.226797103881836,
"logps/rejected": -4.973693370819092,
"loss": 2.3421,
"rewards/accuracies": 0.8359375,
"rewards/chosen": -42.26797103881836,
"rewards/margins": 7.468969821929932,
"rewards/rejected": -49.7369384765625,
"step": 398
},
{
"epoch": 0.9024597116200169,
"grad_norm": 152.1828038337204,
"learning_rate": 2.293459930173354e-08,
"logits/chosen": -1.195888876914978,
"logits/rejected": -1.2131280899047852,
"logps/chosen": -4.33859395980835,
"logps/rejected": -5.055052757263184,
"loss": 2.6306,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -43.38593673706055,
"rewards/margins": 7.164592742919922,
"rewards/rejected": -50.5505256652832,
"step": 399
},
{
"epoch": 0.9047215154085383,
"grad_norm": 143.11836038738673,
"learning_rate": 2.189000335321256e-08,
"logits/chosen": -1.1481764316558838,
"logits/rejected": -1.150956153869629,
"logps/chosen": -4.231009483337402,
"logps/rejected": -4.886160373687744,
"loss": 2.9376,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -42.310096740722656,
"rewards/margins": 6.551509380340576,
"rewards/rejected": -48.861602783203125,
"step": 400
},
{
"epoch": 0.9069833191970597,
"grad_norm": 149.95401926590577,
"learning_rate": 2.086908483816954e-08,
"logits/chosen": -1.1622329950332642,
"logits/rejected": -1.1632294654846191,
"logps/chosen": -4.43674373626709,
"logps/rejected": -5.167576313018799,
"loss": 2.2468,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -44.367435455322266,
"rewards/margins": 7.308327674865723,
"rewards/rejected": -51.67576217651367,
"step": 401
},
{
"epoch": 0.909245122985581,
"grad_norm": 141.35952916507554,
"learning_rate": 1.9871907687022717e-08,
"logits/chosen": -1.150231957435608,
"logits/rejected": -1.1642488241195679,
"logps/chosen": -4.147550106048584,
"logps/rejected": -4.799522399902344,
"loss": 2.3613,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -41.475502014160156,
"rewards/margins": 6.519725799560547,
"rewards/rejected": -47.99523162841797,
"step": 402
},
{
"epoch": 0.9115069267741024,
"grad_norm": 130.65953710722863,
"learning_rate": 1.889853434349451e-08,
"logits/chosen": -1.1447476148605347,
"logits/rejected": -1.1627604961395264,
"logps/chosen": -4.104800224304199,
"logps/rejected": -4.9086785316467285,
"loss": 2.5277,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -41.04800033569336,
"rewards/margins": 8.03878402709961,
"rewards/rejected": -49.08678436279297,
"step": 403
},
{
"epoch": 0.9137687305626236,
"grad_norm": 144.0791755156075,
"learning_rate": 1.7949025760701164e-08,
"logits/chosen": -1.1415810585021973,
"logits/rejected": -1.1473815441131592,
"logps/chosen": -4.432383060455322,
"logps/rejected": -5.151994705200195,
"loss": 2.2592,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -44.323829650878906,
"rewards/margins": 7.1961164474487305,
"rewards/rejected": -51.51994705200195,
"step": 404
},
{
"epoch": 0.916030534351145,
"grad_norm": 140.1370708813358,
"learning_rate": 1.7023441397336023e-08,
"logits/chosen": -1.1852596998214722,
"logits/rejected": -1.2118213176727295,
"logps/chosen": -4.218780994415283,
"logps/rejected": -5.021721363067627,
"loss": 2.0941,
"rewards/accuracies": 0.8515625,
"rewards/chosen": -42.18781661987305,
"rewards/margins": 8.029399871826172,
"rewards/rejected": -50.21721267700195,
"step": 405
},
{
"epoch": 0.9182923381396664,
"grad_norm": 154.58794315173094,
"learning_rate": 1.6121839213945854e-08,
"logits/chosen": -1.1428247690200806,
"logits/rejected": -1.1743805408477783,
"logps/chosen": -4.260222911834717,
"logps/rejected": -4.994080543518066,
"loss": 3.0364,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -42.602230072021484,
"rewards/margins": 7.3385748863220215,
"rewards/rejected": -49.9408073425293,
"step": 406
},
{
"epoch": 0.9205541419281877,
"grad_norm": 174.30248594338545,
"learning_rate": 1.5244275669301777e-08,
"logits/chosen": -1.1678388118743896,
"logits/rejected": -1.1716469526290894,
"logps/chosen": -4.424648284912109,
"logps/rejected": -5.160457611083984,
"loss": 2.9157,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -44.246482849121094,
"rewards/margins": 7.358092784881592,
"rewards/rejected": -51.604576110839844,
"step": 407
},
{
"epoch": 0.9228159457167091,
"grad_norm": 161.69664780097952,
"learning_rate": 1.4390805716863398e-08,
"logits/chosen": -1.1633141040802002,
"logits/rejected": -1.1669323444366455,
"logps/chosen": -4.22412109375,
"logps/rejected": -4.885556697845459,
"loss": 2.8954,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -42.241207122802734,
"rewards/margins": 6.614358901977539,
"rewards/rejected": -48.855567932128906,
"step": 408
},
{
"epoch": 0.9250777495052305,
"grad_norm": 149.67343386390974,
"learning_rate": 1.3561482801337908e-08,
"logits/chosen": -1.1175371408462524,
"logits/rejected": -1.1360092163085938,
"logps/chosen": -4.270062446594238,
"logps/rejected": -5.0212225914001465,
"loss": 2.8054,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -42.700626373291016,
"rewards/margins": 7.511605262756348,
"rewards/rejected": -50.21223068237305,
"step": 409
},
{
"epoch": 0.9273395532937517,
"grad_norm": 166.06441280121587,
"learning_rate": 1.2756358855332904e-08,
"logits/chosen": -1.1701834201812744,
"logits/rejected": -1.1796129941940308,
"logps/chosen": -4.174693584442139,
"logps/rejected": -4.778913497924805,
"loss": 3.0222,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -41.74692916870117,
"rewards/margins": 6.042202472686768,
"rewards/rejected": -47.78913497924805,
"step": 410
},
{
"epoch": 0.9296013570822731,
"grad_norm": 156.45557835212637,
"learning_rate": 1.1975484296105154e-08,
"logits/chosen": -1.1475261449813843,
"logits/rejected": -1.1557633876800537,
"logps/chosen": -4.335869789123535,
"logps/rejected": -5.053281784057617,
"loss": 2.8032,
"rewards/accuracies": 0.75,
"rewards/chosen": -43.35869598388672,
"rewards/margins": 7.1741180419921875,
"rewards/rejected": -50.53281021118164,
"step": 411
},
{
"epoch": 0.9318631608707945,
"grad_norm": 150.89002422183648,
"learning_rate": 1.1218908022402374e-08,
"logits/chosen": -1.1364606618881226,
"logits/rejected": -1.150048851966858,
"logps/chosen": -4.117517471313477,
"logps/rejected": -4.878651142120361,
"loss": 2.6882,
"rewards/accuracies": 0.796875,
"rewards/chosen": -41.17517852783203,
"rewards/margins": 7.611327648162842,
"rewards/rejected": -48.78650665283203,
"step": 412
},
{
"epoch": 0.9341249646593158,
"grad_norm": 138.25948330744123,
"learning_rate": 1.0486677411402079e-08,
"logits/chosen": -1.2186678647994995,
"logits/rejected": -1.2154583930969238,
"logps/chosen": -4.54112434387207,
"logps/rejected": -5.477083206176758,
"loss": 2.4561,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -45.41124725341797,
"rewards/margins": 9.359580039978027,
"rewards/rejected": -54.77082824707031,
"step": 413
},
{
"epoch": 0.9363867684478372,
"grad_norm": 141.68327115023573,
"learning_rate": 9.778838315744353e-09,
"logits/chosen": -1.1879788637161255,
"logits/rejected": -1.2011134624481201,
"logps/chosen": -4.5947699546813965,
"logps/rejected": -5.385745525360107,
"loss": 2.2977,
"rewards/accuracies": 0.828125,
"rewards/chosen": -45.94770050048828,
"rewards/margins": 7.909754753112793,
"rewards/rejected": -53.85745620727539,
"step": 414
},
{
"epoch": 0.9386485722363584,
"grad_norm": 155.86855773359468,
"learning_rate": 9.095435060660595e-09,
"logits/chosen": -1.131690502166748,
"logits/rejected": -1.1426740884780884,
"logps/chosen": -4.285680770874023,
"logps/rejected": -5.050602436065674,
"loss": 2.6046,
"rewards/accuracies": 0.8125,
"rewards/chosen": -42.85680389404297,
"rewards/margins": 7.64921760559082,
"rewards/rejected": -50.50602340698242,
"step": 415
},
{
"epoch": 0.9409103760248798,
"grad_norm": 179.16381227507324,
"learning_rate": 8.436510441197864e-09,
"logits/chosen": -1.1493927240371704,
"logits/rejected": -1.170986533164978,
"logps/chosen": -4.301328659057617,
"logps/rejected": -5.077293872833252,
"loss": 2.7123,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -43.01328659057617,
"rewards/margins": 7.759650230407715,
"rewards/rejected": -50.77294158935547,
"step": 416
},
{
"epoch": 0.9431721798134012,
"grad_norm": 178.14439119940326,
"learning_rate": 7.802105719539076e-09,
"logits/chosen": -1.1591538190841675,
"logits/rejected": -1.1537411212921143,
"logps/chosen": -4.516260147094727,
"logps/rejected": -5.185364246368408,
"loss": 3.2654,
"rewards/accuracies": 0.75,
"rewards/chosen": -45.162601470947266,
"rewards/margins": 6.691040992736816,
"rewards/rejected": -51.85364532470703,
"step": 417
},
{
"epoch": 0.9454339836019225,
"grad_norm": 143.6671474067502,
"learning_rate": 7.1922606224192e-09,
"logits/chosen": -1.1799875497817993,
"logits/rejected": -1.1871784925460815,
"logps/chosen": -4.451123237609863,
"logps/rejected": -5.2011637687683105,
"loss": 2.2148,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -44.51123046875,
"rewards/margins": 7.50040340423584,
"rewards/rejected": -52.011634826660156,
"step": 418
},
{
"epoch": 0.9476957873904439,
"grad_norm": 180.34081751631268,
"learning_rate": 6.6070133386372906e-09,
"logits/chosen": -1.1689175367355347,
"logits/rejected": -1.1665769815444946,
"logps/chosen": -4.324338912963867,
"logps/rejected": -4.959226131439209,
"loss": 3.0841,
"rewards/accuracies": 0.78125,
"rewards/chosen": -43.24338912963867,
"rewards/margins": 6.348876953125,
"rewards/rejected": -49.59226608276367,
"step": 419
},
{
"epoch": 0.9499575911789653,
"grad_norm": 146.1251333430689,
"learning_rate": 6.046400516665384e-09,
"logits/chosen": -1.1695650815963745,
"logits/rejected": -1.1703170537948608,
"logps/chosen": -4.297419548034668,
"logps/rejected": -5.06644868850708,
"loss": 2.6095,
"rewards/accuracies": 0.78125,
"rewards/chosen": -42.97419738769531,
"rewards/margins": 7.69029426574707,
"rewards/rejected": -50.664485931396484,
"step": 420
},
{
"epoch": 0.9522193949674865,
"grad_norm": 155.5449189325342,
"learning_rate": 5.510457262353396e-09,
"logits/chosen": -1.1957755088806152,
"logits/rejected": -1.2013041973114014,
"logps/chosen": -4.236740589141846,
"logps/rejected": -4.945225715637207,
"loss": 2.1589,
"rewards/accuracies": 0.8515625,
"rewards/chosen": -42.36740493774414,
"rewards/margins": 7.084850311279297,
"rewards/rejected": -49.4522590637207,
"step": 421
},
{
"epoch": 0.9544811987560079,
"grad_norm": 138.48331466546585,
"learning_rate": 4.9992171367309265e-09,
"logits/chosen": -1.1926621198654175,
"logits/rejected": -1.1839208602905273,
"logps/chosen": -4.090144157409668,
"logps/rejected": -4.805689811706543,
"loss": 2.4128,
"rewards/accuracies": 0.796875,
"rewards/chosen": -40.90143966674805,
"rewards/margins": 7.155453205108643,
"rewards/rejected": -48.0568962097168,
"step": 422
},
{
"epoch": 0.9567430025445293,
"grad_norm": 158.48354828470264,
"learning_rate": 4.5127121539052955e-09,
"logits/chosen": -1.1851263046264648,
"logits/rejected": -1.195054292678833,
"logps/chosen": -4.540511131286621,
"logps/rejected": -5.315212249755859,
"loss": 2.4374,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -45.405113220214844,
"rewards/margins": 7.747008323669434,
"rewards/rejected": -53.152122497558594,
"step": 423
},
{
"epoch": 0.9590048063330506,
"grad_norm": 161.76738943972168,
"learning_rate": 4.050972779057327e-09,
"logits/chosen": -1.0983150005340576,
"logits/rejected": -1.1198445558547974,
"logps/chosen": -4.050824165344238,
"logps/rejected": -4.785923004150391,
"loss": 2.516,
"rewards/accuracies": 0.796875,
"rewards/chosen": -40.50823974609375,
"rewards/margins": 7.350987434387207,
"rewards/rejected": -47.859230041503906,
"step": 424
},
{
"epoch": 0.961266610121572,
"grad_norm": 147.16124784211212,
"learning_rate": 3.6140279265330477e-09,
"logits/chosen": -1.1477775573730469,
"logits/rejected": -1.151658058166504,
"logps/chosen": -4.37608003616333,
"logps/rejected": -5.113864421844482,
"loss": 2.3465,
"rewards/accuracies": 0.8125,
"rewards/chosen": -43.76080322265625,
"rewards/margins": 7.377841472625732,
"rewards/rejected": -51.13864517211914,
"step": 425
},
{
"epoch": 0.9635284139100933,
"grad_norm": 156.27921111967768,
"learning_rate": 3.2019049580335853e-09,
"logits/chosen": -1.1753405332565308,
"logits/rejected": -1.172703504562378,
"logps/chosen": -4.146142959594727,
"logps/rejected": -4.741412162780762,
"loss": 2.9867,
"rewards/accuracies": 0.78125,
"rewards/chosen": -41.461429595947266,
"rewards/margins": 5.952691555023193,
"rewards/rejected": -47.41412353515625,
"step": 426
},
{
"epoch": 0.9657902176986146,
"grad_norm": 152.1907684077805,
"learning_rate": 2.814629680901337e-09,
"logits/chosen": -1.2028779983520508,
"logits/rejected": -1.2054622173309326,
"logps/chosen": -4.460807800292969,
"logps/rejected": -5.132265090942383,
"loss": 2.6493,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -44.60807418823242,
"rewards/margins": 6.714574337005615,
"rewards/rejected": -51.32265090942383,
"step": 427
},
{
"epoch": 0.968052021487136,
"grad_norm": 155.37833094133038,
"learning_rate": 2.4522263465041937e-09,
"logits/chosen": -1.1574562788009644,
"logits/rejected": -1.175394058227539,
"logps/chosen": -4.284934043884277,
"logps/rejected": -5.051609039306641,
"loss": 1.8433,
"rewards/accuracies": 0.84375,
"rewards/chosen": -42.84933853149414,
"rewards/margins": 7.666755676269531,
"rewards/rejected": -50.51609802246094,
"step": 428
},
{
"epoch": 0.9703138252756573,
"grad_norm": 144.34283399834695,
"learning_rate": 2.114717648716713e-09,
"logits/chosen": -1.1277655363082886,
"logits/rejected": -1.1411540508270264,
"logps/chosen": -4.294366836547852,
"logps/rejected": -5.148277282714844,
"loss": 2.0689,
"rewards/accuracies": 0.828125,
"rewards/chosen": -42.943668365478516,
"rewards/margins": 8.539103507995605,
"rewards/rejected": -51.4827766418457,
"step": 429
},
{
"epoch": 0.9725756290641787,
"grad_norm": 154.89691227614196,
"learning_rate": 1.802124722499121e-09,
"logits/chosen": -1.1664525270462036,
"logits/rejected": -1.1698546409606934,
"logps/chosen": -4.270165920257568,
"logps/rejected": -5.067376136779785,
"loss": 2.4589,
"rewards/accuracies": 0.796875,
"rewards/chosen": -42.701656341552734,
"rewards/margins": 7.97210168838501,
"rewards/rejected": -50.67375946044922,
"step": 430
},
{
"epoch": 0.9748374328527001,
"grad_norm": 175.9345178956199,
"learning_rate": 1.5144671425737499e-09,
"logits/chosen": -1.1535289287567139,
"logits/rejected": -1.1616544723510742,
"logps/chosen": -3.9937241077423096,
"logps/rejected": -4.663761138916016,
"loss": 3.1278,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -39.93724060058594,
"rewards/margins": 6.700366497039795,
"rewards/rejected": -46.637611389160156,
"step": 431
},
{
"epoch": 0.9770992366412213,
"grad_norm": 159.6613123701801,
"learning_rate": 1.251762922199484e-09,
"logits/chosen": -1.1003191471099854,
"logits/rejected": -1.1101816892623901,
"logps/chosen": -4.444840431213379,
"logps/rejected": -5.245765209197998,
"loss": 1.9898,
"rewards/accuracies": 0.84375,
"rewards/chosen": -44.448402404785156,
"rewards/margins": 8.009248733520508,
"rewards/rejected": -52.45764923095703,
"step": 432
},
{
"epoch": 0.9793610404297427,
"grad_norm": 132.41596768015958,
"learning_rate": 1.0140285120433744e-09,
"logits/chosen": -1.1827129125595093,
"logits/rejected": -1.1924540996551514,
"logps/chosen": -4.455898284912109,
"logps/rejected": -5.210501670837402,
"loss": 2.519,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -44.558982849121094,
"rewards/margins": 7.546034336090088,
"rewards/rejected": -52.10501480102539,
"step": 433
},
{
"epoch": 0.9816228442182641,
"grad_norm": 150.85793494364853,
"learning_rate": 8.012787991508396e-10,
"logits/chosen": -1.147634506225586,
"logits/rejected": -1.1722147464752197,
"logps/chosen": -4.21061897277832,
"logps/rejected": -5.051124572753906,
"loss": 2.6106,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -42.1061897277832,
"rewards/margins": 8.405055046081543,
"rewards/rejected": -50.5112419128418,
"step": 434
},
{
"epoch": 0.9838846480067854,
"grad_norm": 128.47098525306382,
"learning_rate": 6.135271060133007e-10,
"logits/chosen": -1.1236947774887085,
"logits/rejected": -1.1270819902420044,
"logps/chosen": -4.134029865264893,
"logps/rejected": -4.879709243774414,
"loss": 2.4142,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -41.34030532836914,
"rewards/margins": 7.456791400909424,
"rewards/rejected": -48.797096252441406,
"step": 435
},
{
"epoch": 0.9861464517953068,
"grad_norm": 151.15908258364348,
"learning_rate": 4.50785189733871e-10,
"logits/chosen": -1.1433157920837402,
"logits/rejected": -1.1700868606567383,
"logps/chosen": -4.154694557189941,
"logps/rejected": -4.998684883117676,
"loss": 2.2485,
"rewards/accuracies": 0.828125,
"rewards/chosen": -41.54694366455078,
"rewards/margins": 8.439903259277344,
"rewards/rejected": -49.986846923828125,
"step": 436
},
{
"epoch": 0.988408255583828,
"grad_norm": 143.0805408958406,
"learning_rate": 3.1306324129118935e-10,
"logits/chosen": -1.1328377723693848,
"logits/rejected": -1.1593248844146729,
"logps/chosen": -4.32177209854126,
"logps/rejected": -5.004055023193359,
"loss": 2.586,
"rewards/accuracies": 0.796875,
"rewards/chosen": -43.21772766113281,
"rewards/margins": 6.822832107543945,
"rewards/rejected": -50.040550231933594,
"step": 437
},
{
"epoch": 0.9906700593723494,
"grad_norm": 183.31095051381905,
"learning_rate": 2.003698849011748e-10,
"logits/chosen": -1.195257306098938,
"logits/rejected": -1.1942667961120605,
"logps/chosen": -4.546604156494141,
"logps/rejected": -5.205718517303467,
"loss": 2.5836,
"rewards/accuracies": 0.796875,
"rewards/chosen": -45.46604537963867,
"rewards/margins": 6.591144561767578,
"rewards/rejected": -52.057186126708984,
"step": 438
},
{
"epoch": 0.9929318631608708,
"grad_norm": 156.79957243989656,
"learning_rate": 1.1271217747714779e-10,
"logits/chosen": -1.1826034784317017,
"logits/rejected": -1.2057559490203857,
"logps/chosen": -4.436058521270752,
"logps/rejected": -5.084158897399902,
"loss": 2.8188,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -44.3605842590332,
"rewards/margins": 6.481000900268555,
"rewards/rejected": -50.84158706665039,
"step": 439
},
{
"epoch": 0.9951936669493922,
"grad_norm": 154.28655412323974,
"learning_rate": 5.0095608187739055e-11,
"logits/chosen": -1.151612639427185,
"logits/rejected": -1.1684077978134155,
"logps/chosen": -4.0718464851379395,
"logps/rejected": -4.774002552032471,
"loss": 2.6165,
"rewards/accuracies": 0.796875,
"rewards/chosen": -40.71846389770508,
"rewards/margins": 7.021560192108154,
"rewards/rejected": -47.74002456665039,
"step": 440
},
{
"epoch": 0.9974554707379135,
"grad_norm": 170.7047305284887,
"learning_rate": 1.2524098113209092e-11,
"logits/chosen": -1.1892815828323364,
"logits/rejected": -1.1911449432373047,
"logps/chosen": -4.447210788726807,
"logps/rejected": -5.045917987823486,
"loss": 3.3375,
"rewards/accuracies": 0.796875,
"rewards/chosen": -44.472110748291016,
"rewards/margins": 5.987071990966797,
"rewards/rejected": -50.45918273925781,
"step": 441
},
{
"epoch": 0.9997172745264349,
"grad_norm": 162.25573622129824,
"learning_rate": 0.0,
"logits/chosen": -1.1888779401779175,
"logits/rejected": -1.1964941024780273,
"logps/chosen": -4.214119911193848,
"logps/rejected": -4.939702987670898,
"loss": 2.7514,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -42.14120101928711,
"rewards/margins": 7.255833148956299,
"rewards/rejected": -49.39703369140625,
"step": 442
},
{
"epoch": 0.9997172745264349,
"eval_logits/chosen": -1.1595183610916138,
"eval_logits/rejected": -1.1711369752883911,
"eval_logps/chosen": -4.307417392730713,
"eval_logps/rejected": -5.051736831665039,
"eval_loss": 2.3919546604156494,
"eval_rewards/accuracies": 0.8038102388381958,
"eval_rewards/chosen": -43.07417297363281,
"eval_rewards/margins": 7.443192958831787,
"eval_rewards/rejected": -50.517372131347656,
"eval_runtime": 100.8538,
"eval_samples_per_second": 29.538,
"eval_steps_per_second": 1.854,
"step": 442
},
{
"epoch": 0.9997172745264349,
"step": 442,
"total_flos": 134366991482880.0,
"train_loss": 3.293350306571339,
"train_runtime": 7617.916,
"train_samples_per_second": 7.429,
"train_steps_per_second": 0.058
}
],
"logging_steps": 1.0,
"max_steps": 442,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 134366991482880.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}