{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997172745264349,
"eval_steps": 500,
"global_step": 442,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0022618037885213456,
"grad_norm": 141.01397939585976,
"learning_rate": 1.7777777777777777e-08,
"logits/chosen": -1.0503966808319092,
"logits/rejected": -1.0386303663253784,
"logps/chosen": -1.497732400894165,
"logps/rejected": -1.611051321029663,
"loss": 5.5018,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -14.977323532104492,
"rewards/margins": 1.1331881284713745,
"rewards/rejected": -16.110509872436523,
"step": 1
},
{
"epoch": 0.004523607577042691,
"grad_norm": 68.88925767431851,
"learning_rate": 3.5555555555555554e-08,
"logits/chosen": -1.104045033454895,
"logits/rejected": -1.1043524742126465,
"logps/chosen": -1.5607943534851074,
"logps/rejected": -1.5189738273620605,
"loss": 5.9744,
"rewards/accuracies": 0.4765625,
"rewards/chosen": -15.60794448852539,
"rewards/margins": -0.4182056784629822,
"rewards/rejected": -15.189737319946289,
"step": 2
},
{
"epoch": 0.006785411365564037,
"grad_norm": 127.77084111180126,
"learning_rate": 5.333333333333333e-08,
"logits/chosen": -1.0770599842071533,
"logits/rejected": -1.0823699235916138,
"logps/chosen": -1.5453805923461914,
"logps/rejected": -1.722267746925354,
"loss": 6.0253,
"rewards/accuracies": 0.546875,
"rewards/chosen": -15.453805923461914,
"rewards/margins": 1.7688698768615723,
"rewards/rejected": -17.222675323486328,
"step": 3
},
{
"epoch": 0.009047215154085382,
"grad_norm": 106.10699632777116,
"learning_rate": 7.111111111111111e-08,
"logits/chosen": -1.09734308719635,
"logits/rejected": -1.0833051204681396,
"logps/chosen": -1.545915126800537,
"logps/rejected": -1.5207103490829468,
"loss": 5.8916,
"rewards/accuracies": 0.53125,
"rewards/chosen": -15.459149360656738,
"rewards/margins": -0.2520461976528168,
"rewards/rejected": -15.207103729248047,
"step": 4
},
{
"epoch": 0.01130901894260673,
"grad_norm": 77.5519625218596,
"learning_rate": 8.888888888888888e-08,
"logits/chosen": -1.0867865085601807,
"logits/rejected": -1.0832228660583496,
"logps/chosen": -1.5075905323028564,
"logps/rejected": -1.5380187034606934,
"loss": 5.3655,
"rewards/accuracies": 0.5625,
"rewards/chosen": -15.075907707214355,
"rewards/margins": 0.30427923798561096,
"rewards/rejected": -15.380186080932617,
"step": 5
},
{
"epoch": 0.013570822731128074,
"grad_norm": 78.11509558718444,
"learning_rate": 1.0666666666666666e-07,
"logits/chosen": -1.1473548412322998,
"logits/rejected": -1.1390419006347656,
"logps/chosen": -1.5627464056015015,
"logps/rejected": -1.5230021476745605,
"loss": 6.1162,
"rewards/accuracies": 0.4609375,
"rewards/chosen": -15.627466201782227,
"rewards/margins": -0.39744287729263306,
"rewards/rejected": -15.230021476745605,
"step": 6
},
{
"epoch": 0.01583262651964942,
"grad_norm": 39.73625297036525,
"learning_rate": 1.2444444444444443e-07,
"logits/chosen": -1.0994057655334473,
"logits/rejected": -1.0783416032791138,
"logps/chosen": -1.4192826747894287,
"logps/rejected": -1.718736171722412,
"loss": 4.1576,
"rewards/accuracies": 0.59375,
"rewards/chosen": -14.192827224731445,
"rewards/margins": 2.9945356845855713,
"rewards/rejected": -17.187362670898438,
"step": 7
},
{
"epoch": 0.018094430308170765,
"grad_norm": 114.77062454343967,
"learning_rate": 1.4222222222222222e-07,
"logits/chosen": -1.0515432357788086,
"logits/rejected": -1.0535235404968262,
"logps/chosen": -1.4979735612869263,
"logps/rejected": -1.5330562591552734,
"loss": 5.8127,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -14.979734420776367,
"rewards/margins": 0.35082772374153137,
"rewards/rejected": -15.330562591552734,
"step": 8
},
{
"epoch": 0.020356234096692113,
"grad_norm": 96.07872744538972,
"learning_rate": 1.6e-07,
"logits/chosen": -1.1019177436828613,
"logits/rejected": -1.0908172130584717,
"logps/chosen": -1.4399257898330688,
"logps/rejected": -1.5934813022613525,
"loss": 5.4423,
"rewards/accuracies": 0.4921875,
"rewards/chosen": -14.399259567260742,
"rewards/margins": 1.5355541706085205,
"rewards/rejected": -15.934813499450684,
"step": 9
},
{
"epoch": 0.02261803788521346,
"grad_norm": 146.26584785993413,
"learning_rate": 1.7777777777777776e-07,
"logits/chosen": -1.0987049341201782,
"logits/rejected": -1.1177351474761963,
"logps/chosen": -1.589949607849121,
"logps/rejected": -1.5490418672561646,
"loss": 5.7372,
"rewards/accuracies": 0.53125,
"rewards/chosen": -15.899496078491211,
"rewards/margins": -0.40907663106918335,
"rewards/rejected": -15.490419387817383,
"step": 10
},
{
"epoch": 0.024879841673734804,
"grad_norm": 79.11204208374147,
"learning_rate": 1.9555555555555555e-07,
"logits/chosen": -1.1473506689071655,
"logits/rejected": -1.1570088863372803,
"logps/chosen": -1.5604548454284668,
"logps/rejected": -1.726046085357666,
"loss": 5.2868,
"rewards/accuracies": 0.5625,
"rewards/chosen": -15.6045503616333,
"rewards/margins": 1.6559122800827026,
"rewards/rejected": -17.260459899902344,
"step": 11
},
{
"epoch": 0.02714164546225615,
"grad_norm": 123.63916912843932,
"learning_rate": 2.133333333333333e-07,
"logits/chosen": -1.0692572593688965,
"logits/rejected": -1.0530564785003662,
"logps/chosen": -1.6708486080169678,
"logps/rejected": -1.7337902784347534,
"loss": 6.1169,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -16.708486557006836,
"rewards/margins": 0.6294152736663818,
"rewards/rejected": -17.337902069091797,
"step": 12
},
{
"epoch": 0.029403449250777494,
"grad_norm": 139.28796689567102,
"learning_rate": 2.3111111111111107e-07,
"logits/chosen": -1.0673459768295288,
"logits/rejected": -1.053938865661621,
"logps/chosen": -1.6321990489959717,
"logps/rejected": -1.5502426624298096,
"loss": 6.0676,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -16.321990966796875,
"rewards/margins": -0.8195652365684509,
"rewards/rejected": -15.502425193786621,
"step": 13
},
{
"epoch": 0.03166525303929884,
"grad_norm": 44.86523130113871,
"learning_rate": 2.4888888888888886e-07,
"logits/chosen": -1.0727157592773438,
"logits/rejected": -1.0687270164489746,
"logps/chosen": -1.3144315481185913,
"logps/rejected": -1.5097585916519165,
"loss": 4.4316,
"rewards/accuracies": 0.625,
"rewards/chosen": -13.144314765930176,
"rewards/margins": 1.9532725811004639,
"rewards/rejected": -15.097586631774902,
"step": 14
},
{
"epoch": 0.033927056827820185,
"grad_norm": 119.40197659765802,
"learning_rate": 2.666666666666666e-07,
"logits/chosen": -1.072951078414917,
"logits/rejected": -1.0741289854049683,
"logps/chosen": -1.460188865661621,
"logps/rejected": -1.5374088287353516,
"loss": 5.1376,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -14.601886749267578,
"rewards/margins": 0.772199273109436,
"rewards/rejected": -15.374088287353516,
"step": 15
},
{
"epoch": 0.03618886061634153,
"grad_norm": 120.20221027013656,
"learning_rate": 2.8444444444444443e-07,
"logits/chosen": -1.1052724123001099,
"logits/rejected": -1.0987166166305542,
"logps/chosen": -1.544463872909546,
"logps/rejected": -1.5167737007141113,
"loss": 6.1293,
"rewards/accuracies": 0.546875,
"rewards/chosen": -15.444637298583984,
"rewards/margins": -0.27689969539642334,
"rewards/rejected": -15.16773796081543,
"step": 16
},
{
"epoch": 0.038450664404862875,
"grad_norm": 65.69449437246068,
"learning_rate": 3.022222222222222e-07,
"logits/chosen": -1.1224839687347412,
"logits/rejected": -1.0999984741210938,
"logps/chosen": -1.3996049165725708,
"logps/rejected": -1.3617793321609497,
"loss": 6.0917,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -13.996048927307129,
"rewards/margins": -0.37825584411621094,
"rewards/rejected": -13.617794036865234,
"step": 17
},
{
"epoch": 0.04071246819338423,
"grad_norm": 99.76979777598639,
"learning_rate": 3.2e-07,
"logits/chosen": -1.1205981969833374,
"logits/rejected": -1.120253562927246,
"logps/chosen": -1.8338966369628906,
"logps/rejected": -1.8411056995391846,
"loss": 6.0657,
"rewards/accuracies": 0.4609375,
"rewards/chosen": -18.338966369628906,
"rewards/margins": 0.07208935916423798,
"rewards/rejected": -18.411056518554688,
"step": 18
},
{
"epoch": 0.04297427198190557,
"grad_norm": 75.13215709008873,
"learning_rate": 3.3777777777777777e-07,
"logits/chosen": -1.1356143951416016,
"logits/rejected": -1.1374859809875488,
"logps/chosen": -1.5520572662353516,
"logps/rejected": -1.584758996963501,
"loss": 5.3015,
"rewards/accuracies": 0.5625,
"rewards/chosen": -15.5205717086792,
"rewards/margins": 0.327017605304718,
"rewards/rejected": -15.847589492797852,
"step": 19
},
{
"epoch": 0.04523607577042692,
"grad_norm": 44.2748498763636,
"learning_rate": 3.5555555555555553e-07,
"logits/chosen": -1.149074912071228,
"logits/rejected": -1.12880539894104,
"logps/chosen": -1.4242889881134033,
"logps/rejected": -1.6002920866012573,
"loss": 4.4667,
"rewards/accuracies": 0.609375,
"rewards/chosen": -14.242890357971191,
"rewards/margins": 1.760029673576355,
"rewards/rejected": -16.00292205810547,
"step": 20
},
{
"epoch": 0.04749787955894826,
"grad_norm": 69.54836323607135,
"learning_rate": 3.7333333333333334e-07,
"logits/chosen": -1.0379455089569092,
"logits/rejected": -1.0415663719177246,
"logps/chosen": -1.413461446762085,
"logps/rejected": -1.4350874423980713,
"loss": 5.2848,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -14.134614944458008,
"rewards/margins": 0.21626026928424835,
"rewards/rejected": -14.350875854492188,
"step": 21
},
{
"epoch": 0.04975968334746961,
"grad_norm": 101.12732993520753,
"learning_rate": 3.911111111111111e-07,
"logits/chosen": -1.0873618125915527,
"logits/rejected": -1.0630055665969849,
"logps/chosen": -1.478364109992981,
"logps/rejected": -1.5262987613677979,
"loss": 4.7711,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -14.783641815185547,
"rewards/margins": 0.4793458580970764,
"rewards/rejected": -15.262988090515137,
"step": 22
},
{
"epoch": 0.05202148713599095,
"grad_norm": 97.19117838044485,
"learning_rate": 4.0888888888888886e-07,
"logits/chosen": -1.0850725173950195,
"logits/rejected": -1.0555074214935303,
"logps/chosen": -1.4664888381958008,
"logps/rejected": -1.4977301359176636,
"loss": 5.4823,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -14.664888381958008,
"rewards/margins": 0.3124130368232727,
"rewards/rejected": -14.977302551269531,
"step": 23
},
{
"epoch": 0.0542832909245123,
"grad_norm": 86.40698470532111,
"learning_rate": 4.266666666666666e-07,
"logits/chosen": -1.0878976583480835,
"logits/rejected": -1.083939552307129,
"logps/chosen": -1.315569281578064,
"logps/rejected": -1.3708666563034058,
"loss": 5.082,
"rewards/accuracies": 0.5625,
"rewards/chosen": -13.155693054199219,
"rewards/margins": 0.552973210811615,
"rewards/rejected": -13.70866584777832,
"step": 24
},
{
"epoch": 0.05654509471303364,
"grad_norm": 87.79572082532306,
"learning_rate": 4.4444444444444444e-07,
"logits/chosen": -1.0942628383636475,
"logits/rejected": -1.091521978378296,
"logps/chosen": -1.6227660179138184,
"logps/rejected": -1.6117480993270874,
"loss": 5.7014,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -16.2276611328125,
"rewards/margins": -0.1101788878440857,
"rewards/rejected": -16.11747932434082,
"step": 25
},
{
"epoch": 0.05880689850155499,
"grad_norm": 268.4862749626735,
"learning_rate": 4.6222222222222214e-07,
"logits/chosen": -1.1140916347503662,
"logits/rejected": -1.0925090312957764,
"logps/chosen": -1.5979957580566406,
"logps/rejected": -1.5717800855636597,
"loss": 5.6624,
"rewards/accuracies": 0.578125,
"rewards/chosen": -15.97995662689209,
"rewards/margins": -0.2621573209762573,
"rewards/rejected": -15.71780014038086,
"step": 26
},
{
"epoch": 0.061068702290076333,
"grad_norm": 79.4491817324606,
"learning_rate": 4.8e-07,
"logits/chosen": -1.1191110610961914,
"logits/rejected": -1.1132099628448486,
"logps/chosen": -1.6132346391677856,
"logps/rejected": -1.5380040407180786,
"loss": 6.0429,
"rewards/accuracies": 0.53125,
"rewards/chosen": -16.13234519958496,
"rewards/margins": -0.7523058652877808,
"rewards/rejected": -15.380041122436523,
"step": 27
},
{
"epoch": 0.06333050607859768,
"grad_norm": 66.73204327091015,
"learning_rate": 4.977777777777777e-07,
"logits/chosen": -1.1149988174438477,
"logits/rejected": -1.0995606184005737,
"logps/chosen": -1.4549816846847534,
"logps/rejected": -1.5254497528076172,
"loss": 5.5384,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -14.549816131591797,
"rewards/margins": 0.7046794891357422,
"rewards/rejected": -15.254497528076172,
"step": 28
},
{
"epoch": 0.06559230986711903,
"grad_norm": 70.83012837849557,
"learning_rate": 5.155555555555556e-07,
"logits/chosen": -1.1130647659301758,
"logits/rejected": -1.1251792907714844,
"logps/chosen": -1.5109716653823853,
"logps/rejected": -1.526107907295227,
"loss": 5.4054,
"rewards/accuracies": 0.5,
"rewards/chosen": -15.10971736907959,
"rewards/margins": 0.1513628363609314,
"rewards/rejected": -15.261078834533691,
"step": 29
},
{
"epoch": 0.06785411365564037,
"grad_norm": 94.06032585695752,
"learning_rate": 5.333333333333332e-07,
"logits/chosen": -1.0764459371566772,
"logits/rejected": -1.078137993812561,
"logps/chosen": -1.5241881608963013,
"logps/rejected": -1.5383861064910889,
"loss": 5.9267,
"rewards/accuracies": 0.484375,
"rewards/chosen": -15.241884231567383,
"rewards/margins": 0.14197878539562225,
"rewards/rejected": -15.383862495422363,
"step": 30
},
{
"epoch": 0.07011591744416172,
"grad_norm": 75.08995684434343,
"learning_rate": 5.511111111111111e-07,
"logits/chosen": -1.126107096672058,
"logits/rejected": -1.1239315271377563,
"logps/chosen": -1.5170094966888428,
"logps/rejected": -1.4923768043518066,
"loss": 5.6393,
"rewards/accuracies": 0.578125,
"rewards/chosen": -15.170095443725586,
"rewards/margins": -0.24632781744003296,
"rewards/rejected": -14.92376708984375,
"step": 31
},
{
"epoch": 0.07237772123268306,
"grad_norm": 74.95824050979452,
"learning_rate": 5.688888888888889e-07,
"logits/chosen": -1.1436784267425537,
"logits/rejected": -1.1325445175170898,
"logps/chosen": -1.438338041305542,
"logps/rejected": -1.3806811571121216,
"loss": 5.9243,
"rewards/accuracies": 0.53125,
"rewards/chosen": -14.383380889892578,
"rewards/margins": -0.5765687227249146,
"rewards/rejected": -13.80681324005127,
"step": 32
},
{
"epoch": 0.07463952502120441,
"grad_norm": 57.484672836149315,
"learning_rate": 5.866666666666666e-07,
"logits/chosen": -1.0700812339782715,
"logits/rejected": -1.0605463981628418,
"logps/chosen": -1.412937879562378,
"logps/rejected": -1.5639235973358154,
"loss": 4.7378,
"rewards/accuracies": 0.5625,
"rewards/chosen": -14.129378318786621,
"rewards/margins": 1.5098581314086914,
"rewards/rejected": -15.639235496520996,
"step": 33
},
{
"epoch": 0.07690132880972575,
"grad_norm": 53.90732494351107,
"learning_rate": 6.044444444444444e-07,
"logits/chosen": -1.1256736516952515,
"logits/rejected": -1.084123134613037,
"logps/chosen": -1.3476636409759521,
"logps/rejected": -1.4755098819732666,
"loss": 4.5532,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -13.47663688659668,
"rewards/margins": 1.278462290763855,
"rewards/rejected": -14.75510025024414,
"step": 34
},
{
"epoch": 0.0791631325982471,
"grad_norm": 85.39623980816621,
"learning_rate": 6.222222222222223e-07,
"logits/chosen": -1.1434717178344727,
"logits/rejected": -1.1167579889297485,
"logps/chosen": -1.5166611671447754,
"logps/rejected": -1.6307806968688965,
"loss": 5.1443,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -15.166611671447754,
"rewards/margins": 1.1411969661712646,
"rewards/rejected": -16.30780792236328,
"step": 35
},
{
"epoch": 0.08142493638676845,
"grad_norm": 69.28128094162892,
"learning_rate": 6.4e-07,
"logits/chosen": -1.0718586444854736,
"logits/rejected": -1.0694938898086548,
"logps/chosen": -1.4529365301132202,
"logps/rejected": -1.576625108718872,
"loss": 4.986,
"rewards/accuracies": 0.59375,
"rewards/chosen": -14.529365539550781,
"rewards/margins": 1.236886978149414,
"rewards/rejected": -15.766251564025879,
"step": 36
},
{
"epoch": 0.08368674017528979,
"grad_norm": 63.685546020810015,
"learning_rate": 6.577777777777777e-07,
"logits/chosen": -1.0874630212783813,
"logits/rejected": -1.0730936527252197,
"logps/chosen": -1.308869481086731,
"logps/rejected": -1.319458246231079,
"loss": 5.2187,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -13.088695526123047,
"rewards/margins": 0.1058862954378128,
"rewards/rejected": -13.194581031799316,
"step": 37
},
{
"epoch": 0.08594854396381114,
"grad_norm": 85.27763093015075,
"learning_rate": 6.755555555555555e-07,
"logits/chosen": -1.1657055616378784,
"logits/rejected": -1.1689563989639282,
"logps/chosen": -1.4712262153625488,
"logps/rejected": -1.4614089727401733,
"loss": 5.4122,
"rewards/accuracies": 0.546875,
"rewards/chosen": -14.712262153625488,
"rewards/margins": -0.0981736034154892,
"rewards/rejected": -14.614089965820312,
"step": 38
},
{
"epoch": 0.08821034775233248,
"grad_norm": 72.53486427096175,
"learning_rate": 6.933333333333333e-07,
"logits/chosen": -1.158952236175537,
"logits/rejected": -1.1623462438583374,
"logps/chosen": -1.4649841785430908,
"logps/rejected": -1.446244478225708,
"loss": 5.4153,
"rewards/accuracies": 0.515625,
"rewards/chosen": -14.649843215942383,
"rewards/margins": -0.18739792704582214,
"rewards/rejected": -14.462443351745605,
"step": 39
},
{
"epoch": 0.09047215154085383,
"grad_norm": 48.05551639915771,
"learning_rate": 7.111111111111111e-07,
"logits/chosen": -1.1088396310806274,
"logits/rejected": -1.087456464767456,
"logps/chosen": -1.3251829147338867,
"logps/rejected": -1.482697606086731,
"loss": 4.2834,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -13.25182819366455,
"rewards/margins": 1.5751475095748901,
"rewards/rejected": -14.82697582244873,
"step": 40
},
{
"epoch": 0.09273395532937517,
"grad_norm": 41.08333243058533,
"learning_rate": 7.288888888888888e-07,
"logits/chosen": -1.1634094715118408,
"logits/rejected": -1.1473877429962158,
"logps/chosen": -1.2953405380249023,
"logps/rejected": -1.4355030059814453,
"loss": 4.393,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -12.95340633392334,
"rewards/margins": 1.401624321937561,
"rewards/rejected": -14.355029106140137,
"step": 41
},
{
"epoch": 0.09499575911789652,
"grad_norm": 81.6745151173038,
"learning_rate": 7.466666666666667e-07,
"logits/chosen": -1.0549430847167969,
"logits/rejected": -1.0252429246902466,
"logps/chosen": -1.3976647853851318,
"logps/rejected": -1.4610525369644165,
"loss": 5.1624,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -13.97664737701416,
"rewards/margins": 0.6338790059089661,
"rewards/rejected": -14.610527038574219,
"step": 42
},
{
"epoch": 0.09725756290641786,
"grad_norm": 64.66643703071367,
"learning_rate": 7.644444444444444e-07,
"logits/chosen": -1.1331276893615723,
"logits/rejected": -1.1149864196777344,
"logps/chosen": -1.3842030763626099,
"logps/rejected": -1.4057769775390625,
"loss": 5.1358,
"rewards/accuracies": 0.515625,
"rewards/chosen": -13.84203052520752,
"rewards/margins": 0.21573936939239502,
"rewards/rejected": -14.057769775390625,
"step": 43
},
{
"epoch": 0.09951936669493922,
"grad_norm": 50.21628916517668,
"learning_rate": 7.822222222222222e-07,
"logits/chosen": -1.0726534128189087,
"logits/rejected": -1.0697026252746582,
"logps/chosen": -1.2373145818710327,
"logps/rejected": -1.3352696895599365,
"loss": 4.3962,
"rewards/accuracies": 0.578125,
"rewards/chosen": -12.373147010803223,
"rewards/margins": 0.9795514941215515,
"rewards/rejected": -13.35269832611084,
"step": 44
},
{
"epoch": 0.10178117048346055,
"grad_norm": 43.75236482093407,
"learning_rate": 8e-07,
"logits/chosen": -1.1228159666061401,
"logits/rejected": -1.1156741380691528,
"logps/chosen": -1.2973405122756958,
"logps/rejected": -1.3789002895355225,
"loss": 4.6924,
"rewards/accuracies": 0.5625,
"rewards/chosen": -12.973404884338379,
"rewards/margins": 0.8155972957611084,
"rewards/rejected": -13.78900146484375,
"step": 45
},
{
"epoch": 0.1040429742719819,
"grad_norm": 93.9343495458176,
"learning_rate": 7.999874759018868e-07,
"logits/chosen": -1.165191650390625,
"logits/rejected": -1.1482605934143066,
"logps/chosen": -1.5229721069335938,
"logps/rejected": -1.6199113130569458,
"loss": 4.7701,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -15.229720115661621,
"rewards/margins": 0.9693921804428101,
"rewards/rejected": -16.199111938476562,
"step": 46
},
{
"epoch": 0.10630477806050326,
"grad_norm": 66.34152836720595,
"learning_rate": 7.999499043918123e-07,
"logits/chosen": -1.1630305051803589,
"logits/rejected": -1.1713589429855347,
"logps/chosen": -1.3499112129211426,
"logps/rejected": -1.3848986625671387,
"loss": 5.0497,
"rewards/accuracies": 0.4921875,
"rewards/chosen": -13.499112129211426,
"rewards/margins": 0.3498736023902893,
"rewards/rejected": -13.848986625671387,
"step": 47
},
{
"epoch": 0.1085665818490246,
"grad_norm": 73.12468967818121,
"learning_rate": 7.998872878225228e-07,
"logits/chosen": -1.104253888130188,
"logits/rejected": -1.0981318950653076,
"logps/chosen": -1.3970146179199219,
"logps/rejected": -1.4857858419418335,
"loss": 4.6942,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -13.970146179199219,
"rewards/margins": 0.8877115249633789,
"rewards/rejected": -14.857856750488281,
"step": 48
},
{
"epoch": 0.11082838563754595,
"grad_norm": 50.68026321922499,
"learning_rate": 7.997996301150987e-07,
"logits/chosen": -1.095520257949829,
"logits/rejected": -1.0906875133514404,
"logps/chosen": -1.3076212406158447,
"logps/rejected": -1.3888590335845947,
"loss": 4.6915,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -13.076210021972656,
"rewards/margins": 0.8123778700828552,
"rewards/rejected": -13.888589859008789,
"step": 49
},
{
"epoch": 0.11309018942606729,
"grad_norm": 54.48253221284768,
"learning_rate": 7.996869367587088e-07,
"logits/chosen": -1.0804747343063354,
"logits/rejected": -1.0651739835739136,
"logps/chosen": -1.357546329498291,
"logps/rejected": -1.4353337287902832,
"loss": 4.7526,
"rewards/accuracies": 0.609375,
"rewards/chosen": -13.575462341308594,
"rewards/margins": 0.7778746485710144,
"rewards/rejected": -14.353337287902832,
"step": 50
},
{
"epoch": 0.11535199321458864,
"grad_norm": 38.59484455862516,
"learning_rate": 7.99549214810266e-07,
"logits/chosen": -1.088415503501892,
"logits/rejected": -1.088612675666809,
"logps/chosen": -1.3685206174850464,
"logps/rejected": -1.4190219640731812,
"loss": 4.8571,
"rewards/accuracies": 0.4765625,
"rewards/chosen": -13.68520450592041,
"rewards/margins": 0.5050145387649536,
"rewards/rejected": -14.19021987915039,
"step": 51
},
{
"epoch": 0.11761379700310998,
"grad_norm": 47.56303506310153,
"learning_rate": 7.993864728939867e-07,
"logits/chosen": -1.103301763534546,
"logits/rejected": -1.0828572511672974,
"logps/chosen": -1.3142025470733643,
"logps/rejected": -1.4140937328338623,
"loss": 4.8802,
"rewards/accuracies": 0.5625,
"rewards/chosen": -13.1420259475708,
"rewards/margins": 0.9989122748374939,
"rewards/rejected": -14.140937805175781,
"step": 52
},
{
"epoch": 0.11987560079163133,
"grad_norm": 53.48543744291103,
"learning_rate": 7.991987212008491e-07,
"logits/chosen": -1.1189554929733276,
"logits/rejected": -1.1071739196777344,
"logps/chosen": -1.3785709142684937,
"logps/rejected": -1.5353251695632935,
"loss": 4.5596,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -13.785710334777832,
"rewards/margins": 1.5675415992736816,
"rewards/rejected": -15.353252410888672,
"step": 53
},
{
"epoch": 0.12213740458015267,
"grad_norm": 54.3409563468225,
"learning_rate": 7.989859714879565e-07,
"logits/chosen": -1.139965534210205,
"logits/rejected": -1.1204081773757935,
"logps/chosen": -1.3048521280288696,
"logps/rejected": -1.372660756111145,
"loss": 4.7955,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -13.048519134521484,
"rewards/margins": 0.6780871748924255,
"rewards/rejected": -13.726606369018555,
"step": 54
},
{
"epoch": 0.12439920836867402,
"grad_norm": 44.10829510538185,
"learning_rate": 7.987482370778005e-07,
"logits/chosen": -1.1082535982131958,
"logits/rejected": -1.1038706302642822,
"logps/chosen": -1.3513308763504028,
"logps/rejected": -1.4602677822113037,
"loss": 4.725,
"rewards/accuracies": 0.59375,
"rewards/chosen": -13.513306617736816,
"rewards/margins": 1.0893704891204834,
"rewards/rejected": -14.602678298950195,
"step": 55
},
{
"epoch": 0.12666101215719536,
"grad_norm": 77.53742820558867,
"learning_rate": 7.984855328574262e-07,
"logits/chosen": -1.007509708404541,
"logits/rejected": -1.009194254875183,
"logps/chosen": -1.3011356592178345,
"logps/rejected": -1.372280478477478,
"loss": 4.6891,
"rewards/accuracies": 0.53125,
"rewards/chosen": -13.011357307434082,
"rewards/margins": 0.7114498615264893,
"rewards/rejected": -13.722806930541992,
"step": 56
},
{
"epoch": 0.1289228159457167,
"grad_norm": 70.85645949378882,
"learning_rate": 7.981978752775009e-07,
"logits/chosen": -1.0481213331222534,
"logits/rejected": -1.0459202527999878,
"logps/chosen": -1.3659950494766235,
"logps/rejected": -1.4737251996994019,
"loss": 4.7291,
"rewards/accuracies": 0.546875,
"rewards/chosen": -13.659952163696289,
"rewards/margins": 1.0772995948791504,
"rewards/rejected": -14.737251281738281,
"step": 57
},
{
"epoch": 0.13118461973423806,
"grad_norm": 83.8267776038953,
"learning_rate": 7.978852823512833e-07,
"logits/chosen": -1.1209564208984375,
"logits/rejected": -1.0898634195327759,
"logps/chosen": -1.447513461112976,
"logps/rejected": -1.5469727516174316,
"loss": 4.9365,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -14.475133895874023,
"rewards/margins": 0.9945943355560303,
"rewards/rejected": -15.469727516174316,
"step": 58
},
{
"epoch": 0.1334464235227594,
"grad_norm": 72.64611478190926,
"learning_rate": 7.975477736534957e-07,
"logits/chosen": -1.1025452613830566,
"logits/rejected": -1.109392523765564,
"logps/chosen": -1.4008920192718506,
"logps/rejected": -1.5584888458251953,
"loss": 4.51,
"rewards/accuracies": 0.578125,
"rewards/chosen": -14.00892162322998,
"rewards/margins": 1.5759669542312622,
"rewards/rejected": -15.584887504577637,
"step": 59
},
{
"epoch": 0.13570822731128074,
"grad_norm": 66.85917896473248,
"learning_rate": 7.971853703190986e-07,
"logits/chosen": -1.0982723236083984,
"logits/rejected": -1.092232584953308,
"logps/chosen": -1.3805067539215088,
"logps/rejected": -1.521033763885498,
"loss": 4.5318,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -13.805068969726562,
"rewards/margins": 1.4052679538726807,
"rewards/rejected": -15.210335731506348,
"step": 60
},
{
"epoch": 0.1379700310998021,
"grad_norm": 53.210256153607055,
"learning_rate": 7.967980950419664e-07,
"logits/chosen": -1.0485178232192993,
"logits/rejected": -1.037397027015686,
"logps/chosen": -1.2965946197509766,
"logps/rejected": -1.442001223564148,
"loss": 4.4137,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -12.965946197509766,
"rewards/margins": 1.4540655612945557,
"rewards/rejected": -14.420013427734375,
"step": 61
},
{
"epoch": 0.14023183488832344,
"grad_norm": 64.9229435001097,
"learning_rate": 7.963859720734669e-07,
"logits/chosen": -1.1149603128433228,
"logits/rejected": -1.1201238632202148,
"logps/chosen": -1.2722684144973755,
"logps/rejected": -1.4075822830200195,
"loss": 4.5023,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -12.722681999206543,
"rewards/margins": 1.3531394004821777,
"rewards/rejected": -14.075822830200195,
"step": 62
},
{
"epoch": 0.14249363867684478,
"grad_norm": 45.43996335712389,
"learning_rate": 7.959490272209427e-07,
"logits/chosen": -1.1015686988830566,
"logits/rejected": -1.079529047012329,
"logps/chosen": -1.2755954265594482,
"logps/rejected": -1.476697564125061,
"loss": 4.1012,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -12.755952835083008,
"rewards/margins": 2.0110244750976562,
"rewards/rejected": -14.766977310180664,
"step": 63
},
{
"epoch": 0.14475544246536612,
"grad_norm": 45.99686400612689,
"learning_rate": 7.954872878460946e-07,
"logits/chosen": -1.128149390220642,
"logits/rejected": -1.1002622842788696,
"logps/chosen": -1.3429501056671143,
"logps/rejected": -1.5083937644958496,
"loss": 4.2495,
"rewards/accuracies": 0.609375,
"rewards/chosen": -13.429499626159668,
"rewards/margins": 1.6544382572174072,
"rewards/rejected": -15.083937644958496,
"step": 64
},
{
"epoch": 0.14701724625388748,
"grad_norm": 58.04494616379713,
"learning_rate": 7.950007828632691e-07,
"logits/chosen": -1.074033498764038,
"logits/rejected": -1.083252191543579,
"logps/chosen": -1.3732857704162598,
"logps/rejected": -1.602651834487915,
"loss": 4.1484,
"rewards/accuracies": 0.625,
"rewards/chosen": -13.732858657836914,
"rewards/margins": 2.2936599254608154,
"rewards/rejected": -16.026517868041992,
"step": 65
},
{
"epoch": 0.14927905004240882,
"grad_norm": 51.14410021137322,
"learning_rate": 7.944895427376465e-07,
"logits/chosen": -1.092671513557434,
"logits/rejected": -1.084201455116272,
"logps/chosen": -1.3794586658477783,
"logps/rejected": -1.6024820804595947,
"loss": 4.3135,
"rewards/accuracies": 0.578125,
"rewards/chosen": -13.794585227966309,
"rewards/margins": 2.2302355766296387,
"rewards/rejected": -16.024822235107422,
"step": 66
},
{
"epoch": 0.15154085383093016,
"grad_norm": 36.03692322063,
"learning_rate": 7.939535994833345e-07,
"logits/chosen": -1.0611392259597778,
"logits/rejected": -1.0569102764129639,
"logps/chosen": -1.2717268466949463,
"logps/rejected": -1.485985517501831,
"loss": 4.2175,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -12.717269897460938,
"rewards/margins": 2.1425867080688477,
"rewards/rejected": -14.859856605529785,
"step": 67
},
{
"epoch": 0.1538026576194515,
"grad_norm": 65.23896138368357,
"learning_rate": 7.933929866613628e-07,
"logits/chosen": -1.0750938653945923,
"logits/rejected": -1.0748298168182373,
"logps/chosen": -1.3082906007766724,
"logps/rejected": -1.4252880811691284,
"loss": 4.5562,
"rewards/accuracies": 0.625,
"rewards/chosen": -13.082904815673828,
"rewards/margins": 1.1699758768081665,
"rewards/rejected": -14.25288200378418,
"step": 68
},
{
"epoch": 0.15606446140797287,
"grad_norm": 50.89066601450084,
"learning_rate": 7.928077393775808e-07,
"logits/chosen": -1.0674494504928589,
"logits/rejected": -1.0776114463806152,
"logps/chosen": -1.3522337675094604,
"logps/rejected": -1.6143879890441895,
"loss": 3.9809,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -13.522336959838867,
"rewards/margins": 2.621541738510132,
"rewards/rejected": -16.143878936767578,
"step": 69
},
{
"epoch": 0.1583262651964942,
"grad_norm": 126.95874714186452,
"learning_rate": 7.921978942804609e-07,
"logits/chosen": -1.0427839756011963,
"logits/rejected": -1.0468775033950806,
"logps/chosen": -1.3458898067474365,
"logps/rejected": -1.5510884523391724,
"loss": 4.1796,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -13.458898544311523,
"rewards/margins": 2.0519869327545166,
"rewards/rejected": -15.510884284973145,
"step": 70
},
{
"epoch": 0.16058806898501554,
"grad_norm": 55.63953868334949,
"learning_rate": 7.915634895588021e-07,
"logits/chosen": -1.0790458917617798,
"logits/rejected": -1.0586471557617188,
"logps/chosen": -1.4167431592941284,
"logps/rejected": -1.5353124141693115,
"loss": 4.7775,
"rewards/accuracies": 0.59375,
"rewards/chosen": -14.167430877685547,
"rewards/margins": 1.185691475868225,
"rewards/rejected": -15.35312271118164,
"step": 71
},
{
"epoch": 0.1628498727735369,
"grad_norm": 76.32984072024325,
"learning_rate": 7.909045649393394e-07,
"logits/chosen": -1.120489239692688,
"logits/rejected": -1.1161108016967773,
"logps/chosen": -1.3034402132034302,
"logps/rejected": -1.3763903379440308,
"loss": 4.7508,
"rewards/accuracies": 0.59375,
"rewards/chosen": -13.034402847290039,
"rewards/margins": 0.7294999361038208,
"rewards/rejected": -13.76390266418457,
"step": 72
},
{
"epoch": 0.16511167656205825,
"grad_norm": 54.86426522118373,
"learning_rate": 7.902211616842556e-07,
"logits/chosen": -1.08669912815094,
"logits/rejected": -1.0823296308517456,
"logps/chosen": -1.3619005680084229,
"logps/rejected": -1.573593258857727,
"loss": 4.3494,
"rewards/accuracies": 0.5625,
"rewards/chosen": -13.61900520324707,
"rewards/margins": 2.1169278621673584,
"rewards/rejected": -15.735933303833008,
"step": 73
},
{
"epoch": 0.16737348035057958,
"grad_norm": 58.4654706399478,
"learning_rate": 7.89513322588598e-07,
"logits/chosen": -1.0687835216522217,
"logits/rejected": -1.0629172325134277,
"logps/chosen": -1.3168197870254517,
"logps/rejected": -1.4600768089294434,
"loss": 4.1908,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -13.168198585510254,
"rewards/margins": 1.4325703382492065,
"rewards/rejected": -14.600768089294434,
"step": 74
},
{
"epoch": 0.16963528413910092,
"grad_norm": 53.919564210580624,
"learning_rate": 7.887810919775976e-07,
"logits/chosen": -1.0242153406143188,
"logits/rejected": -1.0200350284576416,
"logps/chosen": -1.3797943592071533,
"logps/rejected": -1.5162783861160278,
"loss": 4.4817,
"rewards/accuracies": 0.59375,
"rewards/chosen": -13.797942161560059,
"rewards/margins": 1.3648402690887451,
"rewards/rejected": -15.162782669067383,
"step": 75
},
{
"epoch": 0.1718970879276223,
"grad_norm": 39.76484434551506,
"learning_rate": 7.880245157038949e-07,
"logits/chosen": -1.1036596298217773,
"logits/rejected": -1.0872161388397217,
"logps/chosen": -1.375118613243103,
"logps/rejected": -1.5439777374267578,
"loss": 4.3267,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -13.751185417175293,
"rewards/margins": 1.688592553138733,
"rewards/rejected": -15.439779281616211,
"step": 76
},
{
"epoch": 0.17415889171614363,
"grad_norm": 74.50498370535034,
"learning_rate": 7.872436411446671e-07,
"logits/chosen": -1.1212602853775024,
"logits/rejected": -1.1364926099777222,
"logps/chosen": -1.4108389616012573,
"logps/rejected": -1.5181750059127808,
"loss": 4.8281,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -14.108390808105469,
"rewards/margins": 1.073359727859497,
"rewards/rejected": -15.18174934387207,
"step": 77
},
{
"epoch": 0.17642069550466496,
"grad_norm": 44.774679343796215,
"learning_rate": 7.86438517198662e-07,
"logits/chosen": -1.0375410318374634,
"logits/rejected": -1.0335216522216797,
"logps/chosen": -1.3055564165115356,
"logps/rejected": -1.4685771465301514,
"loss": 4.3614,
"rewards/accuracies": 0.609375,
"rewards/chosen": -13.055564880371094,
"rewards/margins": 1.6302083730697632,
"rewards/rejected": -14.685771942138672,
"step": 78
},
{
"epoch": 0.1786824992931863,
"grad_norm": 37.906288042223274,
"learning_rate": 7.856091942831366e-07,
"logits/chosen": -1.0268999338150024,
"logits/rejected": -1.042458415031433,
"logps/chosen": -1.3059368133544922,
"logps/rejected": -1.482313871383667,
"loss": 4.5709,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -13.059368133544922,
"rewards/margins": 1.7637701034545898,
"rewards/rejected": -14.823138236999512,
"step": 79
},
{
"epoch": 0.18094430308170767,
"grad_norm": 48.0205012678544,
"learning_rate": 7.847557243306982e-07,
"logits/chosen": -1.119336724281311,
"logits/rejected": -1.1097991466522217,
"logps/chosen": -1.3474864959716797,
"logps/rejected": -1.4953199625015259,
"loss": 4.4092,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -13.474865913391113,
"rewards/margins": 1.4783344268798828,
"rewards/rejected": -14.953200340270996,
"step": 80
},
{
"epoch": 0.183206106870229,
"grad_norm": 69.7852307914101,
"learning_rate": 7.838781607860541e-07,
"logits/chosen": -1.0976037979125977,
"logits/rejected": -1.0965137481689453,
"logps/chosen": -1.3701703548431396,
"logps/rejected": -1.5155431032180786,
"loss": 4.1212,
"rewards/accuracies": 0.640625,
"rewards/chosen": -13.701703071594238,
"rewards/margins": 1.4537272453308105,
"rewards/rejected": -15.155430793762207,
"step": 81
},
{
"epoch": 0.18546791065875035,
"grad_norm": 49.10698010184252,
"learning_rate": 7.82976558602664e-07,
"logits/chosen": -1.1253350973129272,
"logits/rejected": -1.1349576711654663,
"logps/chosen": -1.3088488578796387,
"logps/rejected": -1.435444712638855,
"loss": 4.4909,
"rewards/accuracies": 0.625,
"rewards/chosen": -13.088489532470703,
"rewards/margins": 1.2659577131271362,
"rewards/rejected": -14.354446411132812,
"step": 82
},
{
"epoch": 0.1877297144472717,
"grad_norm": 63.38808359107094,
"learning_rate": 7.820509742392988e-07,
"logits/chosen": -1.1047896146774292,
"logits/rejected": -1.1108447313308716,
"logps/chosen": -1.4216302633285522,
"logps/rejected": -1.5547611713409424,
"loss": 4.3087,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -14.216300964355469,
"rewards/margins": 1.3313111066818237,
"rewards/rejected": -15.547612190246582,
"step": 83
},
{
"epoch": 0.18999151823579305,
"grad_norm": 67.89717567725671,
"learning_rate": 7.811014656565054e-07,
"logits/chosen": -1.1079522371292114,
"logits/rejected": -1.0807424783706665,
"logps/chosen": -1.342353105545044,
"logps/rejected": -1.6123462915420532,
"loss": 3.9214,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -13.423532485961914,
"rewards/margins": 2.6999294757843018,
"rewards/rejected": -16.12346076965332,
"step": 84
},
{
"epoch": 0.1922533220243144,
"grad_norm": 59.34836842935611,
"learning_rate": 7.801280923129773e-07,
"logits/chosen": -1.0938023328781128,
"logits/rejected": -1.0850962400436401,
"logps/chosen": -1.3702034950256348,
"logps/rejected": -1.4707939624786377,
"loss": 4.8057,
"rewards/accuracies": 0.578125,
"rewards/chosen": -13.702035903930664,
"rewards/margins": 1.0059046745300293,
"rewards/rejected": -14.707940101623535,
"step": 85
},
{
"epoch": 0.19451512581283573,
"grad_norm": 63.12311188329623,
"learning_rate": 7.791309151618305e-07,
"logits/chosen": -1.1037566661834717,
"logits/rejected": -1.104620099067688,
"logps/chosen": -1.4397916793823242,
"logps/rejected": -1.5834070444107056,
"loss": 4.4345,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -14.397918701171875,
"rewards/margins": 1.4361515045166016,
"rewards/rejected": -15.834070205688477,
"step": 86
},
{
"epoch": 0.1967769296013571,
"grad_norm": 40.6034885299812,
"learning_rate": 7.781099966467874e-07,
"logits/chosen": -1.1172497272491455,
"logits/rejected": -1.1132512092590332,
"logps/chosen": -1.2978553771972656,
"logps/rejected": -1.4062166213989258,
"loss": 4.2868,
"rewards/accuracies": 0.65625,
"rewards/chosen": -12.978553771972656,
"rewards/margins": 1.0836100578308105,
"rewards/rejected": -14.062165260314941,
"step": 87
},
{
"epoch": 0.19903873338987843,
"grad_norm": 82.98673857879416,
"learning_rate": 7.770654006982664e-07,
"logits/chosen": -1.1160707473754883,
"logits/rejected": -1.0892269611358643,
"logps/chosen": -1.4885170459747314,
"logps/rejected": -1.6167935132980347,
"loss": 4.7014,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -14.88516902923584,
"rewards/margins": 1.282766342163086,
"rewards/rejected": -16.16793441772461,
"step": 88
},
{
"epoch": 0.20130053717839977,
"grad_norm": 63.02089340256696,
"learning_rate": 7.759971927293781e-07,
"logits/chosen": -1.1214509010314941,
"logits/rejected": -1.1107975244522095,
"logps/chosen": -1.3759891986846924,
"logps/rejected": -1.5045208930969238,
"loss": 4.603,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -13.759891510009766,
"rewards/margins": 1.2853155136108398,
"rewards/rejected": -15.045208930969238,
"step": 89
},
{
"epoch": 0.2035623409669211,
"grad_norm": 49.39733815100318,
"learning_rate": 7.749054396318297e-07,
"logits/chosen": -1.121274709701538,
"logits/rejected": -1.104023814201355,
"logps/chosen": -1.4384340047836304,
"logps/rejected": -1.5760951042175293,
"loss": 4.4771,
"rewards/accuracies": 0.5625,
"rewards/chosen": -14.38433837890625,
"rewards/margins": 1.3766124248504639,
"rewards/rejected": -15.76095199584961,
"step": 90
},
{
"epoch": 0.20582414475544247,
"grad_norm": 85.82224666214906,
"learning_rate": 7.737902097717356e-07,
"logits/chosen": -1.0971518754959106,
"logits/rejected": -1.111463189125061,
"logps/chosen": -1.4152038097381592,
"logps/rejected": -1.6357877254486084,
"loss": 4.3825,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -14.15203857421875,
"rewards/margins": 2.2058396339416504,
"rewards/rejected": -16.357877731323242,
"step": 91
},
{
"epoch": 0.2080859485439638,
"grad_norm": 61.90199433280952,
"learning_rate": 7.726515729853367e-07,
"logits/chosen": -1.077009677886963,
"logits/rejected": -1.082908034324646,
"logps/chosen": -1.3816239833831787,
"logps/rejected": -1.4853577613830566,
"loss": 4.8364,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -13.816240310668945,
"rewards/margins": 1.037337303161621,
"rewards/rejected": -14.85357666015625,
"step": 92
},
{
"epoch": 0.21034775233248515,
"grad_norm": 76.41393582381788,
"learning_rate": 7.714896005746272e-07,
"logits/chosen": -1.1236391067504883,
"logits/rejected": -1.117820143699646,
"logps/chosen": -1.3681647777557373,
"logps/rejected": -1.590654730796814,
"loss": 3.933,
"rewards/accuracies": 0.671875,
"rewards/chosen": -13.681647300720215,
"rewards/margins": 2.2248995304107666,
"rewards/rejected": -15.906549453735352,
"step": 93
},
{
"epoch": 0.21260955612100652,
"grad_norm": 77.71996625500955,
"learning_rate": 7.703043653028896e-07,
"logits/chosen": -1.1559141874313354,
"logits/rejected": -1.1570885181427002,
"logps/chosen": -1.53440523147583,
"logps/rejected": -1.6532174348831177,
"loss": 4.6511,
"rewards/accuracies": 0.640625,
"rewards/chosen": -15.3440523147583,
"rewards/margins": 1.1881229877471924,
"rewards/rejected": -16.53217315673828,
"step": 94
},
{
"epoch": 0.21487135990952785,
"grad_norm": 91.56807102764826,
"learning_rate": 7.690959413901379e-07,
"logits/chosen": -1.1174224615097046,
"logits/rejected": -1.1035161018371582,
"logps/chosen": -1.41274893283844,
"logps/rejected": -1.5510450601577759,
"loss": 4.3802,
"rewards/accuracies": 0.609375,
"rewards/chosen": -14.12748908996582,
"rewards/margins": 1.3829612731933594,
"rewards/rejected": -15.510449409484863,
"step": 95
},
{
"epoch": 0.2171331636980492,
"grad_norm": 67.88527134817444,
"learning_rate": 7.678644045084704e-07,
"logits/chosen": -1.0640089511871338,
"logits/rejected": -1.0812008380889893,
"logps/chosen": -1.365978479385376,
"logps/rejected": -1.5785081386566162,
"loss": 4.2054,
"rewards/accuracies": 0.59375,
"rewards/chosen": -13.659785270690918,
"rewards/margins": 2.125296115875244,
"rewards/rejected": -15.78508186340332,
"step": 96
},
{
"epoch": 0.21939496748657053,
"grad_norm": 49.30327561622109,
"learning_rate": 7.666098317773308e-07,
"logits/chosen": -1.1126219034194946,
"logits/rejected": -1.1190177202224731,
"logps/chosen": -1.4845547676086426,
"logps/rejected": -1.6363489627838135,
"loss": 4.1985,
"rewards/accuracies": 0.546875,
"rewards/chosen": -14.84554672241211,
"rewards/margins": 1.517941951751709,
"rewards/rejected": -16.363489151000977,
"step": 97
},
{
"epoch": 0.2216567712750919,
"grad_norm": 69.91733204705244,
"learning_rate": 7.653323017586789e-07,
"logits/chosen": -1.140116572380066,
"logits/rejected": -1.1216413974761963,
"logps/chosen": -1.3246794939041138,
"logps/rejected": -1.4348188638687134,
"loss": 4.3538,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -13.246795654296875,
"rewards/margins": 1.101393699645996,
"rewards/rejected": -14.348188400268555,
"step": 98
},
{
"epoch": 0.22391857506361323,
"grad_norm": 86.8761329888255,
"learning_rate": 7.640318944520711e-07,
"logits/chosen": -1.1340844631195068,
"logits/rejected": -1.1252467632293701,
"logps/chosen": -1.5068817138671875,
"logps/rejected": -1.6223900318145752,
"loss": 4.5239,
"rewards/accuracies": 0.609375,
"rewards/chosen": -15.068817138671875,
"rewards/margins": 1.1550840139389038,
"rewards/rejected": -16.223899841308594,
"step": 99
},
{
"epoch": 0.22618037885213457,
"grad_norm": 66.95277093894374,
"learning_rate": 7.627086912896511e-07,
"logits/chosen": -1.0213607549667358,
"logits/rejected": -1.0508267879486084,
"logps/chosen": -1.3892170190811157,
"logps/rejected": -1.5458083152770996,
"loss": 4.1824,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -13.892169952392578,
"rewards/margins": 1.5659123659133911,
"rewards/rejected": -15.45808219909668,
"step": 100
},
{
"epoch": 0.2284421826406559,
"grad_norm": 58.53340577802912,
"learning_rate": 7.613627751310499e-07,
"logits/chosen": -1.1418393850326538,
"logits/rejected": -1.142421007156372,
"logps/chosen": -1.4400460720062256,
"logps/rejected": -1.584843635559082,
"loss": 4.061,
"rewards/accuracies": 0.703125,
"rewards/chosen": -14.40046215057373,
"rewards/margins": 1.4479742050170898,
"rewards/rejected": -15.84843635559082,
"step": 101
},
{
"epoch": 0.23070398642917728,
"grad_norm": 89.18045407035784,
"learning_rate": 7.599942302581977e-07,
"logits/chosen": -1.1121330261230469,
"logits/rejected": -1.1174899339675903,
"logps/chosen": -1.4459974765777588,
"logps/rejected": -1.680160403251648,
"loss": 3.9842,
"rewards/accuracies": 0.6875,
"rewards/chosen": -14.45997428894043,
"rewards/margins": 2.341628313064575,
"rewards/rejected": -16.80160140991211,
"step": 102
},
{
"epoch": 0.23296579021769862,
"grad_norm": 78.88418112061981,
"learning_rate": 7.586031423700457e-07,
"logits/chosen": -1.1099210977554321,
"logits/rejected": -1.1131954193115234,
"logps/chosen": -1.39324152469635,
"logps/rejected": -1.5408098697662354,
"loss": 4.4078,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -13.932414054870605,
"rewards/margins": 1.4756839275360107,
"rewards/rejected": -15.408098220825195,
"step": 103
},
{
"epoch": 0.23522759400621995,
"grad_norm": 78.97593612106348,
"learning_rate": 7.571895985772e-07,
"logits/chosen": -1.0611519813537598,
"logits/rejected": -1.0723930597305298,
"logps/chosen": -1.397983431816101,
"logps/rejected": -1.64972722530365,
"loss": 3.7839,
"rewards/accuracies": 0.65625,
"rewards/chosen": -13.979835510253906,
"rewards/margins": 2.517435073852539,
"rewards/rejected": -16.497272491455078,
"step": 104
},
{
"epoch": 0.23748939779474132,
"grad_norm": 79.0701709292454,
"learning_rate": 7.557536873964661e-07,
"logits/chosen": -1.1425997018814087,
"logits/rejected": -1.1292781829833984,
"logps/chosen": -1.6143113374710083,
"logps/rejected": -1.7253085374832153,
"loss": 4.753,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -16.143112182617188,
"rewards/margins": 1.1099728345870972,
"rewards/rejected": -17.253087997436523,
"step": 105
},
{
"epoch": 0.23975120158326266,
"grad_norm": 70.3983101177752,
"learning_rate": 7.542954987453069e-07,
"logits/chosen": -1.1226975917816162,
"logits/rejected": -1.1262807846069336,
"logps/chosen": -1.510130763053894,
"logps/rejected": -1.6568247079849243,
"loss": 4.1598,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -15.10130786895752,
"rewards/margins": 1.46694016456604,
"rewards/rejected": -16.568248748779297,
"step": 106
},
{
"epoch": 0.242013005371784,
"grad_norm": 63.96492528100626,
"learning_rate": 7.528151239362108e-07,
"logits/chosen": -1.1100159883499146,
"logits/rejected": -1.11635422706604,
"logps/chosen": -1.5105023384094238,
"logps/rejected": -1.7230298519134521,
"loss": 4.028,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -15.105022430419922,
"rewards/margins": 2.1252756118774414,
"rewards/rejected": -17.23029899597168,
"step": 107
},
{
"epoch": 0.24427480916030533,
"grad_norm": 88.26352011162051,
"learning_rate": 7.513126556709748e-07,
"logits/chosen": -1.0872122049331665,
"logits/rejected": -1.0848742723464966,
"logps/chosen": -1.5043668746948242,
"logps/rejected": -1.8191593885421753,
"loss": 3.5723,
"rewards/accuracies": 0.71875,
"rewards/chosen": -15.04366683959961,
"rewards/margins": 3.147927761077881,
"rewards/rejected": -18.191593170166016,
"step": 108
},
{
"epoch": 0.2465366129488267,
"grad_norm": 72.88913803824948,
"learning_rate": 7.497881880348984e-07,
"logits/chosen": -1.053697109222412,
"logits/rejected": -1.0525224208831787,
"logps/chosen": -1.5000402927398682,
"logps/rejected": -1.7036010026931763,
"loss": 3.9378,
"rewards/accuracies": 0.6875,
"rewards/chosen": -15.000402450561523,
"rewards/margins": 2.035606622695923,
"rewards/rejected": -17.0360107421875,
"step": 109
},
{
"epoch": 0.24879841673734804,
"grad_norm": 112.83565654609102,
"learning_rate": 7.482418164908931e-07,
"logits/chosen": -1.0978425741195679,
"logits/rejected": -1.1009314060211182,
"logps/chosen": -1.6112879514694214,
"logps/rejected": -1.7323018312454224,
"loss": 4.4843,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -16.112878799438477,
"rewards/margins": 1.210138201713562,
"rewards/rejected": -17.323017120361328,
"step": 110
},
{
"epoch": 0.2510602205258694,
"grad_norm": 86.85626491737465,
"learning_rate": 7.466736378735035e-07,
"logits/chosen": -1.078384280204773,
"logits/rejected": -1.079357624053955,
"logps/chosen": -1.5716272592544556,
"logps/rejected": -1.7571252584457397,
"loss": 4.0243,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -15.716273307800293,
"rewards/margins": 1.8549789190292358,
"rewards/rejected": -17.571250915527344,
"step": 111
},
{
"epoch": 0.2533220243143907,
"grad_norm": 86.39434107888039,
"learning_rate": 7.450837503828439e-07,
"logits/chosen": -1.0666810274124146,
"logits/rejected": -1.0605652332305908,
"logps/chosen": -1.6759812831878662,
"logps/rejected": -1.9359164237976074,
"loss": 3.9933,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -16.75981330871582,
"rewards/margins": 2.599350690841675,
"rewards/rejected": -19.359163284301758,
"step": 112
},
{
"epoch": 0.2555838281029121,
"grad_norm": 69.9794740418758,
"learning_rate": 7.43472253578449e-07,
"logits/chosen": -1.0734777450561523,
"logits/rejected": -1.0733259916305542,
"logps/chosen": -1.5143762826919556,
"logps/rejected": -1.7324830293655396,
"loss": 4.0494,
"rewards/accuracies": 0.625,
"rewards/chosen": -15.143762588500977,
"rewards/margins": 2.181068181991577,
"rewards/rejected": -17.324831008911133,
"step": 113
},
{
"epoch": 0.2578456318914334,
"grad_norm": 81.18585812420585,
"learning_rate": 7.418392483730389e-07,
"logits/chosen": -1.0903693437576294,
"logits/rejected": -1.0884432792663574,
"logps/chosen": -1.6420326232910156,
"logps/rejected": -1.8970825672149658,
"loss": 3.9537,
"rewards/accuracies": 0.65625,
"rewards/chosen": -16.420326232910156,
"rewards/margins": 2.5504982471466064,
"rewards/rejected": -18.9708251953125,
"step": 114
},
{
"epoch": 0.26010743567995476,
"grad_norm": 79.65683808928875,
"learning_rate": 7.401848370262012e-07,
"logits/chosen": -1.125045657157898,
"logits/rejected": -1.1087158918380737,
"logps/chosen": -1.5817803144454956,
"logps/rejected": -1.7453052997589111,
"loss": 4.0128,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -15.817804336547852,
"rewards/margins": 1.635249376296997,
"rewards/rejected": -17.453052520751953,
"step": 115
},
{
"epoch": 0.2623692394684761,
"grad_norm": 82.03864787816394,
"learning_rate": 7.385091231379856e-07,
"logits/chosen": -1.109777808189392,
"logits/rejected": -1.1198936700820923,
"logps/chosen": -1.65959894657135,
"logps/rejected": -1.9032299518585205,
"loss": 3.9017,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -16.59598731994629,
"rewards/margins": 2.436310291290283,
"rewards/rejected": -19.032299041748047,
"step": 116
},
{
"epoch": 0.26463104325699743,
"grad_norm": 90.60602549341097,
"learning_rate": 7.368122116424182e-07,
"logits/chosen": -1.065422773361206,
"logits/rejected": -1.0693196058273315,
"logps/chosen": -1.7014392614364624,
"logps/rejected": -1.907637119293213,
"loss": 4.0746,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -17.014392852783203,
"rewards/margins": 2.061978340148926,
"rewards/rejected": -19.076370239257812,
"step": 117
},
{
"epoch": 0.2668928470455188,
"grad_norm": 86.03329980337816,
"learning_rate": 7.350942088009289e-07,
"logits/chosen": -1.1139557361602783,
"logits/rejected": -1.1093213558197021,
"logps/chosen": -1.69358229637146,
"logps/rejected": -1.916198492050171,
"loss": 3.5838,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -16.935823440551758,
"rewards/margins": 2.2261621952056885,
"rewards/rejected": -19.161983489990234,
"step": 118
},
{
"epoch": 0.26915465083404017,
"grad_norm": 121.7554160666809,
"learning_rate": 7.333552221956986e-07,
"logits/chosen": -1.2222692966461182,
"logits/rejected": -1.2024728059768677,
"logps/chosen": -1.8368546962738037,
"logps/rejected": -2.0985753536224365,
"loss": 4.0214,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -18.368549346923828,
"rewards/margins": 2.6172049045562744,
"rewards/rejected": -20.98575210571289,
"step": 119
},
{
"epoch": 0.2714164546225615,
"grad_norm": 97.83952989713862,
"learning_rate": 7.315953607229217e-07,
"logits/chosen": -1.109398365020752,
"logits/rejected": -1.1084152460098267,
"logps/chosen": -1.907912254333496,
"logps/rejected": -2.1674695014953613,
"loss": 3.7429,
"rewards/accuracies": 0.65625,
"rewards/chosen": -19.07912254333496,
"rewards/margins": 2.595571517944336,
"rewards/rejected": -21.674694061279297,
"step": 120
},
{
"epoch": 0.27367825841108284,
"grad_norm": 105.07451107315077,
"learning_rate": 7.298147345859869e-07,
"logits/chosen": -1.111659049987793,
"logits/rejected": -1.1200050115585327,
"logps/chosen": -1.807603120803833,
"logps/rejected": -2.0426204204559326,
"loss": 3.8303,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -18.076032638549805,
"rewards/margins": 2.350172519683838,
"rewards/rejected": -20.426204681396484,
"step": 121
},
{
"epoch": 0.2759400621996042,
"grad_norm": 128.53073399523066,
"learning_rate": 7.280134552885762e-07,
"logits/chosen": -1.1038322448730469,
"logits/rejected": -1.0986474752426147,
"logps/chosen": -1.919103980064392,
"logps/rejected": -2.1694791316986084,
"loss": 4.1521,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -19.1910400390625,
"rewards/margins": 2.5037519931793213,
"rewards/rejected": -21.694791793823242,
"step": 122
},
{
"epoch": 0.2782018659881255,
"grad_norm": 124.69931513710597,
"learning_rate": 7.261916356276831e-07,
"logits/chosen": -1.0938494205474854,
"logits/rejected": -1.0867280960083008,
"logps/chosen": -1.9384150505065918,
"logps/rejected": -2.2772631645202637,
"loss": 3.0558,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -19.3841495513916,
"rewards/margins": 3.3884801864624023,
"rewards/rejected": -22.77263069152832,
"step": 123
},
{
"epoch": 0.2804636697766469,
"grad_norm": 112.01228179833498,
"learning_rate": 7.243493896865486e-07,
"logits/chosen": -1.0969176292419434,
"logits/rejected": -1.0823101997375488,
"logps/chosen": -1.8088492155075073,
"logps/rejected": -2.002607583999634,
"loss": 3.8824,
"rewards/accuracies": 0.65625,
"rewards/chosen": -18.088491439819336,
"rewards/margins": 1.9375840425491333,
"rewards/rejected": -20.02607536315918,
"step": 124
},
{
"epoch": 0.2827254735651682,
"grad_norm": 135.1348279059641,
"learning_rate": 7.224868328275169e-07,
"logits/chosen": -1.0725688934326172,
"logits/rejected": -1.06583833694458,
"logps/chosen": -1.9847272634506226,
"logps/rejected": -2.2174899578094482,
"loss": 4.0921,
"rewards/accuracies": 0.6875,
"rewards/chosen": -19.847272872924805,
"rewards/margins": 2.3276259899139404,
"rewards/rejected": -22.174898147583008,
"step": 125
},
{
"epoch": 0.28498727735368956,
"grad_norm": 183.11859236868796,
"learning_rate": 7.206040816848126e-07,
"logits/chosen": -1.0952653884887695,
"logits/rejected": -1.0870287418365479,
"logps/chosen": -2.140598773956299,
"logps/rejected": -2.3416907787323,
"loss": 4.3437,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -21.405988693237305,
"rewards/margins": 2.010920524597168,
"rewards/rejected": -23.416908264160156,
"step": 126
},
{
"epoch": 0.2872490811422109,
"grad_norm": 163.26760277117015,
"learning_rate": 7.187012541572356e-07,
"logits/chosen": -1.124993085861206,
"logits/rejected": -1.126251459121704,
"logps/chosen": -2.2181339263916016,
"logps/rejected": -2.5058629512786865,
"loss": 3.9287,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -22.18134307861328,
"rewards/margins": 2.8772897720336914,
"rewards/rejected": -25.058629989624023,
"step": 127
},
{
"epoch": 0.28951088493073224,
"grad_norm": 150.6861946732737,
"learning_rate": 7.167784694007791e-07,
"logits/chosen": -1.0672762393951416,
"logits/rejected": -1.076225757598877,
"logps/chosen": -2.1517903804779053,
"logps/rejected": -2.513075351715088,
"loss": 3.5207,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -21.517902374267578,
"rewards/margins": 3.61285138130188,
"rewards/rejected": -25.130752563476562,
"step": 128
},
{
"epoch": 0.2917726887192536,
"grad_norm": 166.1334881985271,
"learning_rate": 7.148358478211682e-07,
"logits/chosen": -1.0861376523971558,
"logits/rejected": -1.0827029943466187,
"logps/chosen": -2.336635112762451,
"logps/rejected": -2.585222005844116,
"loss": 3.9166,
"rewards/accuracies": 0.625,
"rewards/chosen": -23.366352081298828,
"rewards/margins": 2.4858686923980713,
"rewards/rejected": -25.85222053527832,
"step": 129
},
{
"epoch": 0.29403449250777497,
"grad_norm": 129.64078380147677,
"learning_rate": 7.128735110663187e-07,
"logits/chosen": -1.0773652791976929,
"logits/rejected": -1.0514805316925049,
"logps/chosen": -2.0527725219726562,
"logps/rejected": -2.4218080043792725,
"loss": 3.2006,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -20.527725219726562,
"rewards/margins": 3.690356969833374,
"rewards/rejected": -24.218082427978516,
"step": 130
},
{
"epoch": 0.2962962962962963,
"grad_norm": 185.67538718660288,
"learning_rate": 7.108915820187211e-07,
"logits/chosen": -1.0431915521621704,
"logits/rejected": -1.0388939380645752,
"logps/chosen": -2.42039155960083,
"logps/rejected": -2.778228998184204,
"loss": 3.9305,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -24.203916549682617,
"rewards/margins": 3.5783729553222656,
"rewards/rejected": -27.782289505004883,
"step": 131
},
{
"epoch": 0.29855810008481765,
"grad_norm": 189.37316298985056,
"learning_rate": 7.088901847877447e-07,
"logits/chosen": -1.0450592041015625,
"logits/rejected": -1.0333521366119385,
"logps/chosen": -2.2116613388061523,
"logps/rejected": -2.4226627349853516,
"loss": 4.5748,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -22.116615295410156,
"rewards/margins": 2.110013961791992,
"rewards/rejected": -24.226627349853516,
"step": 132
},
{
"epoch": 0.300819903873339,
"grad_norm": 148.7215180863099,
"learning_rate": 7.068694447018658e-07,
"logits/chosen": -1.0752545595169067,
"logits/rejected": -1.0728555917739868,
"logps/chosen": -2.185119390487671,
"logps/rejected": -2.5360453128814697,
"loss": 3.5117,
"rewards/accuracies": 0.6875,
"rewards/chosen": -21.851194381713867,
"rewards/margins": 3.5092573165893555,
"rewards/rejected": -25.360450744628906,
"step": 133
},
{
"epoch": 0.3030817076618603,
"grad_norm": 153.23013702965537,
"learning_rate": 7.048294883008199e-07,
"logits/chosen": -1.0524669885635376,
"logits/rejected": -1.052236795425415,
"logps/chosen": -2.0294673442840576,
"logps/rejected": -2.303767442703247,
"loss": 3.5238,
"rewards/accuracies": 0.765625,
"rewards/chosen": -20.294673919677734,
"rewards/margins": 2.7429981231689453,
"rewards/rejected": -23.037673950195312,
"step": 134
},
{
"epoch": 0.3053435114503817,
"grad_norm": 144.28413679223638,
"learning_rate": 7.027704433276776e-07,
"logits/chosen": -1.0182456970214844,
"logits/rejected": -1.0239580869674683,
"logps/chosen": -2.133883237838745,
"logps/rejected": -2.4184799194335938,
"loss": 3.9025,
"rewards/accuracies": 0.640625,
"rewards/chosen": -21.33883285522461,
"rewards/margins": 2.8459646701812744,
"rewards/rejected": -24.184799194335938,
"step": 135
},
{
"epoch": 0.307605315238903,
"grad_norm": 179.86153387997342,
"learning_rate": 7.006924387208452e-07,
"logits/chosen": -1.047975778579712,
"logits/rejected": -1.0314030647277832,
"logps/chosen": -1.9791343212127686,
"logps/rejected": -2.2267491817474365,
"loss": 3.8151,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -19.791345596313477,
"rewards/margins": 2.4761476516723633,
"rewards/rejected": -22.267492294311523,
"step": 136
},
{
"epoch": 0.30986711902742436,
"grad_norm": 121.33379388556347,
"learning_rate": 6.985956046059904e-07,
"logits/chosen": -1.0230764150619507,
"logits/rejected": -1.0363925695419312,
"logps/chosen": -1.897845983505249,
"logps/rejected": -2.19516921043396,
"loss": 3.7707,
"rewards/accuracies": 0.671875,
"rewards/chosen": -18.978456497192383,
"rewards/margins": 2.973233938217163,
"rewards/rejected": -21.951692581176758,
"step": 137
},
{
"epoch": 0.31212892281594573,
"grad_norm": 112.00748336876892,
"learning_rate": 6.964800722878945e-07,
"logits/chosen": -0.996837854385376,
"logits/rejected": -0.9921685457229614,
"logps/chosen": -1.9219303131103516,
"logps/rejected": -2.291210651397705,
"loss": 3.3089,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -19.219301223754883,
"rewards/margins": 3.692802906036377,
"rewards/rejected": -22.912105560302734,
"step": 138
},
{
"epoch": 0.31439072660446704,
"grad_norm": 91.27352193919032,
"learning_rate": 6.943459742422287e-07,
"logits/chosen": -1.022385835647583,
"logits/rejected": -0.9965115189552307,
"logps/chosen": -1.897507667541504,
"logps/rejected": -2.216675043106079,
"loss": 3.4424,
"rewards/accuracies": 0.703125,
"rewards/chosen": -18.975078582763672,
"rewards/margins": 3.1916732788085938,
"rewards/rejected": -22.166751861572266,
"step": 139
},
{
"epoch": 0.3166525303929884,
"grad_norm": 111.30209201573247,
"learning_rate": 6.921934441072597e-07,
"logits/chosen": -1.0870633125305176,
"logits/rejected": -1.074265480041504,
"logps/chosen": -2.0160953998565674,
"logps/rejected": -2.277557373046875,
"loss": 3.9955,
"rewards/accuracies": 0.625,
"rewards/chosen": -20.16095542907715,
"rewards/margins": 2.6146204471588135,
"rewards/rejected": -22.77557373046875,
"step": 140
},
{
"epoch": 0.3189143341815098,
"grad_norm": 127.72259297295662,
"learning_rate": 6.900226166754807e-07,
"logits/chosen": -1.0401194095611572,
"logits/rejected": -1.0414453744888306,
"logps/chosen": -2.111356258392334,
"logps/rejected": -2.3192646503448486,
"loss": 4.2984,
"rewards/accuracies": 0.671875,
"rewards/chosen": -21.113563537597656,
"rewards/margins": 2.0790882110595703,
"rewards/rejected": -23.19264793395996,
"step": 141
},
{
"epoch": 0.3211761379700311,
"grad_norm": 109.41246623179207,
"learning_rate": 6.8783362788517e-07,
"logits/chosen": -1.041169285774231,
"logits/rejected": -1.0405552387237549,
"logps/chosen": -2.0705885887145996,
"logps/rejected": -2.3723855018615723,
"loss": 3.9914,
"rewards/accuracies": 0.65625,
"rewards/chosen": -20.70588493347168,
"rewards/margins": 3.017970561981201,
"rewards/rejected": -23.72385597229004,
"step": 142
},
{
"epoch": 0.32343794175855245,
"grad_norm": 101.80108756422902,
"learning_rate": 6.856266148118796e-07,
"logits/chosen": -1.0391225814819336,
"logits/rejected": -1.0372495651245117,
"logps/chosen": -1.9241631031036377,
"logps/rejected": -2.2478513717651367,
"loss": 3.4991,
"rewards/accuracies": 0.734375,
"rewards/chosen": -19.24163246154785,
"rewards/margins": 3.2368831634521484,
"rewards/rejected": -22.478511810302734,
"step": 143
},
{
"epoch": 0.3256997455470738,
"grad_norm": 126.22301243213639,
"learning_rate": 6.834017156598512e-07,
"logits/chosen": -1.0187711715698242,
"logits/rejected": -1.0043439865112305,
"logps/chosen": -2.097262382507324,
"logps/rejected": -2.361314535140991,
"loss": 3.7219,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -20.972625732421875,
"rewards/margins": 2.6405200958251953,
"rewards/rejected": -23.61314582824707,
"step": 144
},
{
"epoch": 0.3279615493355951,
"grad_norm": 111.89551066310779,
"learning_rate": 6.811590697533607e-07,
"logits/chosen": -1.086721420288086,
"logits/rejected": -1.0971835851669312,
"logps/chosen": -2.035705089569092,
"logps/rejected": -2.3083982467651367,
"loss": 3.568,
"rewards/accuracies": 0.71875,
"rewards/chosen": -20.357051849365234,
"rewards/margins": 2.7269287109375,
"rewards/rejected": -23.083980560302734,
"step": 145
},
{
"epoch": 0.3302233531241165,
"grad_norm": 148.3108673788691,
"learning_rate": 6.788988175279951e-07,
"logits/chosen": -1.0467808246612549,
"logits/rejected": -1.028620958328247,
"logps/chosen": -2.1247239112854004,
"logps/rejected": -2.4022562503814697,
"loss": 3.9603,
"rewards/accuracies": 0.6875,
"rewards/chosen": -21.247238159179688,
"rewards/margins": 2.7753214836120605,
"rewards/rejected": -24.02256202697754,
"step": 146
},
{
"epoch": 0.3324851569126378,
"grad_norm": 125.5433893020898,
"learning_rate": 6.766211005218577e-07,
"logits/chosen": -1.0311849117279053,
"logits/rejected": -1.0179343223571777,
"logps/chosen": -2.089470624923706,
"logps/rejected": -2.500403881072998,
"loss": 3.2191,
"rewards/accuracies": 0.75,
"rewards/chosen": -20.894704818725586,
"rewards/margins": 4.109335422515869,
"rewards/rejected": -25.004039764404297,
"step": 147
},
{
"epoch": 0.33474696070115917,
"grad_norm": 118.88936796279769,
"learning_rate": 6.743260613667047e-07,
"logits/chosen": -1.0592498779296875,
"logits/rejected": -1.045177698135376,
"logps/chosen": -2.1444954872131348,
"logps/rejected": -2.4342246055603027,
"loss": 3.7747,
"rewards/accuracies": 0.65625,
"rewards/chosen": -21.44495391845703,
"rewards/margins": 2.8972933292388916,
"rewards/rejected": -24.342247009277344,
"step": 148
},
{
"epoch": 0.33700876448968053,
"grad_norm": 96.48894963535292,
"learning_rate": 6.720138437790139e-07,
"logits/chosen": -1.0325779914855957,
"logits/rejected": -1.0183120965957642,
"logps/chosen": -2.072295665740967,
"logps/rejected": -2.4342474937438965,
"loss": 3.1761,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -20.72295570373535,
"rewards/margins": 3.6195178031921387,
"rewards/rejected": -24.34247398376465,
"step": 149
},
{
"epoch": 0.33927056827820185,
"grad_norm": 153.98600360046012,
"learning_rate": 6.696845925509848e-07,
"logits/chosen": -1.0699944496154785,
"logits/rejected": -1.0510475635528564,
"logps/chosen": -2.2458243370056152,
"logps/rejected": -2.5218453407287598,
"loss": 3.7126,
"rewards/accuracies": 0.75,
"rewards/chosen": -22.458240509033203,
"rewards/margins": 2.7602086067199707,
"rewards/rejected": -25.21845245361328,
"step": 150
},
{
"epoch": 0.3415323720667232,
"grad_norm": 131.38558339494207,
"learning_rate": 6.673384535414718e-07,
"logits/chosen": -1.0725032091140747,
"logits/rejected": -1.0490397214889526,
"logps/chosen": -2.4079201221466064,
"logps/rejected": -2.6393277645111084,
"loss": 4.1844,
"rewards/accuracies": 0.640625,
"rewards/chosen": -24.079200744628906,
"rewards/margins": 2.3140788078308105,
"rewards/rejected": -26.39327621459961,
"step": 151
},
{
"epoch": 0.3437941758552446,
"grad_norm": 117.04940042869278,
"learning_rate": 6.649755736668511e-07,
"logits/chosen": -0.9817408323287964,
"logits/rejected": -0.9708501100540161,
"logps/chosen": -2.151355743408203,
"logps/rejected": -2.484079599380493,
"loss": 3.3881,
"rewards/accuracies": 0.703125,
"rewards/chosen": -21.51355743408203,
"rewards/margins": 3.327239513397217,
"rewards/rejected": -24.840797424316406,
"step": 152
},
{
"epoch": 0.3460559796437659,
"grad_norm": 117.23212029570279,
"learning_rate": 6.625961008918192e-07,
"logits/chosen": -1.007891297340393,
"logits/rejected": -0.9990096092224121,
"logps/chosen": -2.211634397506714,
"logps/rejected": -2.586132049560547,
"loss": 3.0739,
"rewards/accuracies": 0.765625,
"rewards/chosen": -22.116344451904297,
"rewards/margins": 3.744976043701172,
"rewards/rejected": -25.86132049560547,
"step": 153
},
{
"epoch": 0.34831778343228725,
"grad_norm": 123.97516922142196,
"learning_rate": 6.602001842201289e-07,
"logits/chosen": -1.0259909629821777,
"logits/rejected": -1.0294607877731323,
"logps/chosen": -2.1726152896881104,
"logps/rejected": -2.435748338699341,
"loss": 3.9322,
"rewards/accuracies": 0.671875,
"rewards/chosen": -21.726154327392578,
"rewards/margins": 2.6313281059265137,
"rewards/rejected": -24.357481002807617,
"step": 154
},
{
"epoch": 0.3505795872208086,
"grad_norm": 122.24528395830801,
"learning_rate": 6.577879736852571e-07,
"logits/chosen": -1.0417340993881226,
"logits/rejected": -1.0337560176849365,
"logps/chosen": -2.2181789875030518,
"logps/rejected": -2.4941534996032715,
"loss": 3.6841,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -22.181791305541992,
"rewards/margins": 2.75974440574646,
"rewards/rejected": -24.94153594970703,
"step": 155
},
{
"epoch": 0.35284139100932993,
"grad_norm": 97.83947309290298,
"learning_rate": 6.553596203410112e-07,
"logits/chosen": -1.0390684604644775,
"logits/rejected": -1.0227370262145996,
"logps/chosen": -2.2328336238861084,
"logps/rejected": -2.7284092903137207,
"loss": 2.639,
"rewards/accuracies": 0.796875,
"rewards/chosen": -22.32833480834961,
"rewards/margins": 4.9557576179504395,
"rewards/rejected": -27.284093856811523,
"step": 156
},
{
"epoch": 0.3551031947978513,
"grad_norm": 128.64043273189782,
"learning_rate": 6.529152762520688e-07,
"logits/chosen": -1.0631431341171265,
"logits/rejected": -1.0382766723632812,
"logps/chosen": -2.2645809650421143,
"logps/rejected": -2.6104514598846436,
"loss": 3.3993,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -22.645809173583984,
"rewards/margins": 3.4587042331695557,
"rewards/rejected": -26.104515075683594,
"step": 157
},
{
"epoch": 0.3573649985863726,
"grad_norm": 142.03788600510083,
"learning_rate": 6.504550944844558e-07,
"logits/chosen": -1.0133343935012817,
"logits/rejected": -1.0211207866668701,
"logps/chosen": -2.2236688137054443,
"logps/rejected": -2.5752692222595215,
"loss": 3.4598,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -22.2366886138916,
"rewards/margins": 3.5160036087036133,
"rewards/rejected": -25.7526912689209,
"step": 158
},
{
"epoch": 0.359626802374894,
"grad_norm": 123.13104037411107,
"learning_rate": 6.479792290959613e-07,
"logits/chosen": -1.023786187171936,
"logits/rejected": -1.0143035650253296,
"logps/chosen": -2.2663955688476562,
"logps/rejected": -2.7522597312927246,
"loss": 3.1837,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -22.663957595825195,
"rewards/margins": 4.858642101287842,
"rewards/rejected": -27.52259635925293,
"step": 159
},
{
"epoch": 0.36188860616341534,
"grad_norm": 116.507236487475,
"learning_rate": 6.454878351264906e-07,
"logits/chosen": -0.9985541105270386,
"logits/rejected": -0.9835253357887268,
"logps/chosen": -2.197329044342041,
"logps/rejected": -2.5955209732055664,
"loss": 3.1949,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -21.973289489746094,
"rewards/margins": 3.981919527053833,
"rewards/rejected": -25.955209732055664,
"step": 160
},
{
"epoch": 0.36415040995193665,
"grad_norm": 140.3290000030199,
"learning_rate": 6.429810685883565e-07,
"logits/chosen": -1.025122880935669,
"logits/rejected": -1.015618085861206,
"logps/chosen": -2.3687376976013184,
"logps/rejected": -2.7061593532562256,
"loss": 3.458,
"rewards/accuracies": 0.734375,
"rewards/chosen": -23.687376022338867,
"rewards/margins": 3.3742165565490723,
"rewards/rejected": -27.06159210205078,
"step": 161
},
{
"epoch": 0.366412213740458,
"grad_norm": 165.90557536016226,
"learning_rate": 6.404590864565088e-07,
"logits/chosen": -0.9796350002288818,
"logits/rejected": -0.9656831622123718,
"logps/chosen": -2.3296120166778564,
"logps/rejected": -2.513373851776123,
"loss": 4.2251,
"rewards/accuracies": 0.640625,
"rewards/chosen": -23.296117782592773,
"rewards/margins": 1.8376156091690063,
"rewards/rejected": -25.133737564086914,
"step": 162
},
{
"epoch": 0.3686740175289794,
"grad_norm": 145.66517770093515,
"learning_rate": 6.379220466587063e-07,
"logits/chosen": -1.0169918537139893,
"logits/rejected": -0.9867813587188721,
"logps/chosen": -2.2477433681488037,
"logps/rejected": -2.6195108890533447,
"loss": 3.5149,
"rewards/accuracies": 0.734375,
"rewards/chosen": -22.477432250976562,
"rewards/margins": 3.717676877975464,
"rewards/rejected": -26.195110321044922,
"step": 163
},
{
"epoch": 0.3709358213175007,
"grad_norm": 144.08044730930845,
"learning_rate": 6.353701080656254e-07,
"logits/chosen": -0.9822530150413513,
"logits/rejected": -0.9942737221717834,
"logps/chosen": -2.3591365814208984,
"logps/rejected": -2.7208523750305176,
"loss": 3.378,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -23.591367721557617,
"rewards/margins": 3.6171576976776123,
"rewards/rejected": -27.208526611328125,
"step": 164
},
{
"epoch": 0.37319762510602206,
"grad_norm": 151.33264705138873,
"learning_rate": 6.32803430480913e-07,
"logits/chosen": -0.9973443746566772,
"logits/rejected": -0.9848815202713013,
"logps/chosen": -2.3127224445343018,
"logps/rejected": -2.695197582244873,
"loss": 3.5003,
"rewards/accuracies": 0.703125,
"rewards/chosen": -23.127225875854492,
"rewards/margins": 3.82474946975708,
"rewards/rejected": -26.951976776123047,
"step": 165
},
{
"epoch": 0.3754594288945434,
"grad_norm": 159.23717011250747,
"learning_rate": 6.302221746311782e-07,
"logits/chosen": -1.0382288694381714,
"logits/rejected": -1.0025843381881714,
"logps/chosen": -2.2013797760009766,
"logps/rejected": -2.51949405670166,
"loss": 4.0404,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -22.013797760009766,
"rewards/margins": 3.1811418533325195,
"rewards/rejected": -25.19493865966797,
"step": 166
},
{
"epoch": 0.37772123268306473,
"grad_norm": 136.63350908805106,
"learning_rate": 6.276265021559288e-07,
"logits/chosen": -1.0501428842544556,
"logits/rejected": -1.0255956649780273,
"logps/chosen": -2.2914161682128906,
"logps/rejected": -2.5851945877075195,
"loss": 3.837,
"rewards/accuracies": 0.703125,
"rewards/chosen": -22.914161682128906,
"rewards/margins": 2.9377853870391846,
"rewards/rejected": -25.851943969726562,
"step": 167
},
{
"epoch": 0.3799830364715861,
"grad_norm": 106.76290330410582,
"learning_rate": 6.250165755974487e-07,
"logits/chosen": -0.973567008972168,
"logits/rejected": -0.9691902995109558,
"logps/chosen": -2.191209316253662,
"logps/rejected": -2.55265212059021,
"loss": 3.2144,
"rewards/accuracies": 0.703125,
"rewards/chosen": -21.912094116210938,
"rewards/margins": 3.6144275665283203,
"rewards/rejected": -25.52652359008789,
"step": 168
},
{
"epoch": 0.3822448402601074,
"grad_norm": 123.0165341553547,
"learning_rate": 6.223925583906192e-07,
"logits/chosen": -1.0723838806152344,
"logits/rejected": -1.068512201309204,
"logps/chosen": -2.2144222259521484,
"logps/rejected": -2.5826680660247803,
"loss": 3.2022,
"rewards/accuracies": 0.734375,
"rewards/chosen": -22.144222259521484,
"rewards/margins": 3.682457208633423,
"rewards/rejected": -25.826679229736328,
"step": 169
},
{
"epoch": 0.3845066440486288,
"grad_norm": 106.9659885956079,
"learning_rate": 6.19754614852685e-07,
"logits/chosen": -1.057454228401184,
"logits/rejected": -1.0504707098007202,
"logps/chosen": -2.0870509147644043,
"logps/rejected": -2.4082977771759033,
"loss": 3.341,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -20.870508193969727,
"rewards/margins": 3.212470531463623,
"rewards/rejected": -24.08298110961914,
"step": 170
},
{
"epoch": 0.38676844783715014,
"grad_norm": 116.69942583640601,
"learning_rate": 6.171029101729644e-07,
"logits/chosen": -1.0107990503311157,
"logits/rejected": -0.997688353061676,
"logps/chosen": -2.2821381092071533,
"logps/rejected": -2.6844727993011475,
"loss": 3.3542,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -22.821382522583008,
"rewards/margins": 4.023346900939941,
"rewards/rejected": -26.8447322845459,
"step": 171
},
{
"epoch": 0.38903025162567145,
"grad_norm": 112.64754228209283,
"learning_rate": 6.144376104025055e-07,
"logits/chosen": -1.078917145729065,
"logits/rejected": -1.0517069101333618,
"logps/chosen": -2.097365379333496,
"logps/rejected": -2.4820194244384766,
"loss": 3.0937,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -20.97365379333496,
"rewards/margins": 3.8465404510498047,
"rewards/rejected": -24.820194244384766,
"step": 172
},
{
"epoch": 0.3912920554141928,
"grad_norm": 110.79442729953392,
"learning_rate": 6.117588824436873e-07,
"logits/chosen": -1.0570931434631348,
"logits/rejected": -1.0704282522201538,
"logps/chosen": -2.193882703781128,
"logps/rejected": -2.4734573364257812,
"loss": 3.9527,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -21.938825607299805,
"rewards/margins": 2.795746326446533,
"rewards/rejected": -24.73457145690918,
"step": 173
},
{
"epoch": 0.3935538592027142,
"grad_norm": 137.95490469520266,
"learning_rate": 6.090668940397688e-07,
"logits/chosen": -1.0477267503738403,
"logits/rejected": -1.0384645462036133,
"logps/chosen": -2.2047643661499023,
"logps/rejected": -2.585783004760742,
"loss": 3.3294,
"rewards/accuracies": 0.765625,
"rewards/chosen": -22.047645568847656,
"rewards/margins": 3.810184955596924,
"rewards/rejected": -25.857830047607422,
"step": 174
},
{
"epoch": 0.3958156629912355,
"grad_norm": 127.29644669869833,
"learning_rate": 6.063618137643844e-07,
"logits/chosen": -1.022063970565796,
"logits/rejected": -1.0069043636322021,
"logps/chosen": -2.243839740753174,
"logps/rejected": -2.6155033111572266,
"loss": 3.4077,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -22.438400268554688,
"rewards/margins": 3.7166357040405273,
"rewards/rejected": -26.1550350189209,
"step": 175
},
{
"epoch": 0.39807746677975686,
"grad_norm": 135.11493494004503,
"learning_rate": 6.03643811010988e-07,
"logits/chosen": -1.0559251308441162,
"logits/rejected": -1.063502311706543,
"logps/chosen": -2.3829030990600586,
"logps/rejected": -2.7540674209594727,
"loss": 3.177,
"rewards/accuracies": 0.75,
"rewards/chosen": -23.829029083251953,
"rewards/margins": 3.7116451263427734,
"rewards/rejected": -27.540672302246094,
"step": 176
},
{
"epoch": 0.4003392705682782,
"grad_norm": 140.7638348318017,
"learning_rate": 6.009130559822453e-07,
"logits/chosen": -1.0719250440597534,
"logits/rejected": -1.0558902025222778,
"logps/chosen": -2.440748929977417,
"logps/rejected": -2.732268810272217,
"loss": 3.8044,
"rewards/accuracies": 0.703125,
"rewards/chosen": -24.407489776611328,
"rewards/margins": 2.9151973724365234,
"rewards/rejected": -27.322690963745117,
"step": 177
},
{
"epoch": 0.40260107435679954,
"grad_norm": 145.03797185456023,
"learning_rate": 5.981697196793758e-07,
"logits/chosen": -1.0876305103302002,
"logits/rejected": -1.0896273851394653,
"logps/chosen": -2.5427262783050537,
"logps/rejected": -2.8704357147216797,
"loss": 3.2689,
"rewards/accuracies": 0.71875,
"rewards/chosen": -25.427263259887695,
"rewards/margins": 3.2770934104919434,
"rewards/rejected": -28.704357147216797,
"step": 178
},
{
"epoch": 0.4048628781453209,
"grad_norm": 142.03805172849428,
"learning_rate": 5.954139738914446e-07,
"logits/chosen": -1.064598560333252,
"logits/rejected": -1.0695425271987915,
"logps/chosen": -2.6064581871032715,
"logps/rejected": -2.963006019592285,
"loss": 3.7176,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -26.064579010009766,
"rewards/margins": 3.565481662750244,
"rewards/rejected": -29.630062103271484,
"step": 179
},
{
"epoch": 0.4071246819338422,
"grad_norm": 155.86905489974896,
"learning_rate": 5.92645991184605e-07,
"logits/chosen": -1.0406618118286133,
"logits/rejected": -1.022411584854126,
"logps/chosen": -2.7639245986938477,
"logps/rejected": -3.189260959625244,
"loss": 3.0644,
"rewards/accuracies": 0.765625,
"rewards/chosen": -27.639245986938477,
"rewards/margins": 4.253364562988281,
"rewards/rejected": -31.892608642578125,
"step": 180
},
{
"epoch": 0.4093864857223636,
"grad_norm": 146.00773077090489,
"learning_rate": 5.898659448912917e-07,
"logits/chosen": -1.0187329053878784,
"logits/rejected": -1.0266443490982056,
"logps/chosen": -2.6189653873443604,
"logps/rejected": -3.0183205604553223,
"loss": 3.6161,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -26.18965721130371,
"rewards/margins": 3.9935495853424072,
"rewards/rejected": -30.183202743530273,
"step": 181
},
{
"epoch": 0.41164828951088495,
"grad_norm": 143.88940929849588,
"learning_rate": 5.870740090993676e-07,
"logits/chosen": -1.054551124572754,
"logits/rejected": -1.0553083419799805,
"logps/chosen": -2.855097532272339,
"logps/rejected": -3.358206033706665,
"loss": 2.9768,
"rewards/accuracies": 0.75,
"rewards/chosen": -28.550975799560547,
"rewards/margins": 5.031084060668945,
"rewards/rejected": -33.582061767578125,
"step": 182
},
{
"epoch": 0.41391009329940626,
"grad_norm": 148.85713065388646,
"learning_rate": 5.842703586412214e-07,
"logits/chosen": -1.053299069404602,
"logits/rejected": -1.0449230670928955,
"logps/chosen": -2.9110286235809326,
"logps/rejected": -3.206702470779419,
"loss": 4.0637,
"rewards/accuracies": 0.703125,
"rewards/chosen": -29.110288619995117,
"rewards/margins": 2.956738233566284,
"rewards/rejected": -32.06702423095703,
"step": 183
},
{
"epoch": 0.4161718970879276,
"grad_norm": 167.29318192619908,
"learning_rate": 5.814551690828203e-07,
"logits/chosen": -1.0455509424209595,
"logits/rejected": -1.0376818180084229,
"logps/chosen": -2.828270673751831,
"logps/rejected": -3.220689296722412,
"loss": 3.2985,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -28.28270721435547,
"rewards/margins": 3.924184560775757,
"rewards/rejected": -32.20689010620117,
"step": 184
},
{
"epoch": 0.418433700876449,
"grad_norm": 206.27636903137858,
"learning_rate": 5.786286167127155e-07,
"logits/chosen": -1.0653034448623657,
"logits/rejected": -1.057640552520752,
"logps/chosen": -2.7050979137420654,
"logps/rejected": -3.1350555419921875,
"loss": 3.1992,
"rewards/accuracies": 0.734375,
"rewards/chosen": -27.05097770690918,
"rewards/margins": 4.299577236175537,
"rewards/rejected": -31.350555419921875,
"step": 185
},
{
"epoch": 0.4206955046649703,
"grad_norm": 143.90302309104877,
"learning_rate": 5.757908785310031e-07,
"logits/chosen": -1.033769130706787,
"logits/rejected": -1.0227222442626953,
"logps/chosen": -2.5687739849090576,
"logps/rejected": -2.9693543910980225,
"loss": 3.4184,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -25.687742233276367,
"rewards/margins": 4.005804061889648,
"rewards/rejected": -29.69354248046875,
"step": 186
},
{
"epoch": 0.42295730845349166,
"grad_norm": 131.295957540962,
"learning_rate": 5.729421322382399e-07,
"logits/chosen": -1.0499193668365479,
"logits/rejected": -1.060524344444275,
"logps/chosen": -2.4598476886749268,
"logps/rejected": -2.8906612396240234,
"loss": 3.3257,
"rewards/accuracies": 0.703125,
"rewards/chosen": -24.598472595214844,
"rewards/margins": 4.3081374168396,
"rewards/rejected": -28.906612396240234,
"step": 187
},
{
"epoch": 0.42521911224201303,
"grad_norm": 136.38634729167853,
"learning_rate": 5.700825562243163e-07,
"logits/chosen": -1.0197397470474243,
"logits/rejected": -1.012385368347168,
"logps/chosen": -2.5251848697662354,
"logps/rejected": -2.9892866611480713,
"loss": 2.9243,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -25.251850128173828,
"rewards/margins": 4.641017436981201,
"rewards/rejected": -29.89286994934082,
"step": 188
},
{
"epoch": 0.42748091603053434,
"grad_norm": 141.76629611664433,
"learning_rate": 5.672123295572854e-07,
"logits/chosen": -1.0829524993896484,
"logits/rejected": -1.0842368602752686,
"logps/chosen": -2.5519356727600098,
"logps/rejected": -2.897853136062622,
"loss": 2.8895,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -25.519359588623047,
"rewards/margins": 3.4591751098632812,
"rewards/rejected": -28.978532791137695,
"step": 189
},
{
"epoch": 0.4297427198190557,
"grad_norm": 156.54079418478554,
"learning_rate": 5.643316319721487e-07,
"logits/chosen": -1.044749140739441,
"logits/rejected": -1.0437251329421997,
"logps/chosen": -2.7124738693237305,
"logps/rejected": -3.0288543701171875,
"loss": 3.8523,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -27.124740600585938,
"rewards/margins": 3.163804292678833,
"rewards/rejected": -30.288543701171875,
"step": 190
},
{
"epoch": 0.432004523607577,
"grad_norm": 152.22283290197944,
"learning_rate": 5.614406438596026e-07,
"logits/chosen": -1.0822639465332031,
"logits/rejected": -1.0667061805725098,
"logps/chosen": -2.801711320877075,
"logps/rejected": -3.1665806770324707,
"loss": 3.4512,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -28.017114639282227,
"rewards/margins": 3.6486923694610596,
"rewards/rejected": -31.665807723999023,
"step": 191
},
{
"epoch": 0.4342663273960984,
"grad_norm": 152.11043222529125,
"learning_rate": 5.585395462547406e-07,
"logits/chosen": -1.0402151346206665,
"logits/rejected": -1.0291682481765747,
"logps/chosen": -2.7880678176879883,
"logps/rejected": -3.064667224884033,
"loss": 3.7747,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -27.88067626953125,
"rewards/margins": 2.765998363494873,
"rewards/rejected": -30.64667510986328,
"step": 192
},
{
"epoch": 0.43652813118461975,
"grad_norm": 142.00119985201334,
"learning_rate": 5.55628520825718e-07,
"logits/chosen": -1.0989923477172852,
"logits/rejected": -1.0788824558258057,
"logps/chosen": -2.6404459476470947,
"logps/rejected": -3.0510411262512207,
"loss": 2.9318,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -26.40445899963379,
"rewards/margins": 4.105955123901367,
"rewards/rejected": -30.510412216186523,
"step": 193
},
{
"epoch": 0.43878993497314106,
"grad_norm": 139.7479873967227,
"learning_rate": 5.527077498623752e-07,
"logits/chosen": -1.0548537969589233,
"logits/rejected": -1.0540835857391357,
"logps/chosen": -2.691882610321045,
"logps/rejected": -3.0502240657806396,
"loss": 3.6926,
"rewards/accuracies": 0.703125,
"rewards/chosen": -26.9188289642334,
"rewards/margins": 3.5834131240844727,
"rewards/rejected": -30.502239227294922,
"step": 194
},
{
"epoch": 0.4410517387616624,
"grad_norm": 130.61284433987367,
"learning_rate": 5.497774162648228e-07,
"logits/chosen": -1.0161868333816528,
"logits/rejected": -1.0190304517745972,
"logps/chosen": -2.542428493499756,
"logps/rejected": -3.0524849891662598,
"loss": 3.1532,
"rewards/accuracies": 0.75,
"rewards/chosen": -25.42428207397461,
"rewards/margins": 5.100566387176514,
"rewards/rejected": -30.52484893798828,
"step": 195
},
{
"epoch": 0.4433135425501838,
"grad_norm": 135.14144066906175,
"learning_rate": 5.468377035319882e-07,
"logits/chosen": -1.0638034343719482,
"logits/rejected": -1.0532487630844116,
"logps/chosen": -2.6114158630371094,
"logps/rejected": -3.115168809890747,
"loss": 3.2609,
"rewards/accuracies": 0.734375,
"rewards/chosen": -26.114160537719727,
"rewards/margins": 5.037529468536377,
"rewards/rejected": -31.151687622070312,
"step": 196
},
{
"epoch": 0.4455753463387051,
"grad_norm": 148.720911284201,
"learning_rate": 5.438887957501248e-07,
"logits/chosen": -0.9781689047813416,
"logits/rejected": -0.9678754806518555,
"logps/chosen": -2.449193239212036,
"logps/rejected": -2.8158748149871826,
"loss": 3.6472,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -24.491931915283203,
"rewards/margins": 3.6668155193328857,
"rewards/rejected": -28.15874671936035,
"step": 197
},
{
"epoch": 0.44783715012722647,
"grad_norm": 135.90145162820608,
"learning_rate": 5.409308775812844e-07,
"logits/chosen": -1.0223724842071533,
"logits/rejected": -1.0113955736160278,
"logps/chosen": -2.5878636837005615,
"logps/rejected": -2.9643242359161377,
"loss": 3.6913,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -25.87863540649414,
"rewards/margins": 3.7646050453186035,
"rewards/rejected": -29.643238067626953,
"step": 198
},
{
"epoch": 0.45009895391574783,
"grad_norm": 128.2704382853074,
"learning_rate": 5.379641342517541e-07,
"logits/chosen": -1.0288407802581787,
"logits/rejected": -1.0315312147140503,
"logps/chosen": -2.346052646636963,
"logps/rejected": -2.8313674926757812,
"loss": 3.1107,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -23.460527420043945,
"rewards/margins": 4.853147983551025,
"rewards/rejected": -28.313674926757812,
"step": 199
},
{
"epoch": 0.45236075770426915,
"grad_norm": 138.9294365697912,
"learning_rate": 5.349887515404564e-07,
"logits/chosen": -1.008575677871704,
"logits/rejected": -1.0214340686798096,
"logps/chosen": -2.542800188064575,
"logps/rejected": -3.029541015625,
"loss": 3.1166,
"rewards/accuracies": 0.734375,
"rewards/chosen": -25.428003311157227,
"rewards/margins": 4.867405891418457,
"rewards/rejected": -30.29541015625,
"step": 200
},
{
"epoch": 0.4546225614927905,
"grad_norm": 117.1329832477415,
"learning_rate": 5.320049157673163e-07,
"logits/chosen": -0.9712215662002563,
"logits/rejected": -0.9573394656181335,
"logps/chosen": -2.3807926177978516,
"logps/rejected": -2.774594783782959,
"loss": 3.1467,
"rewards/accuracies": 0.765625,
"rewards/chosen": -23.807926177978516,
"rewards/margins": 3.938020706176758,
"rewards/rejected": -27.745946884155273,
"step": 201
},
{
"epoch": 0.4568843652813118,
"grad_norm": 141.22701148173041,
"learning_rate": 5.290128137815938e-07,
"logits/chosen": -1.0210527181625366,
"logits/rejected": -1.0222499370574951,
"logps/chosen": -2.4406933784484863,
"logps/rejected": -2.9088692665100098,
"loss": 2.8153,
"rewards/accuracies": 0.71875,
"rewards/chosen": -24.40693473815918,
"rewards/margins": 4.681759357452393,
"rewards/rejected": -29.088693618774414,
"step": 202
},
{
"epoch": 0.4591461690698332,
"grad_norm": 104.57362015283528,
"learning_rate": 5.260126329501828e-07,
"logits/chosen": -1.066743016242981,
"logits/rejected": -1.0438530445098877,
"logps/chosen": -2.399411201477051,
"logps/rejected": -2.9573822021484375,
"loss": 2.6011,
"rewards/accuracies": 0.78125,
"rewards/chosen": -23.99411392211914,
"rewards/margins": 5.579708576202393,
"rewards/rejected": -29.573822021484375,
"step": 203
},
{
"epoch": 0.46140797285835455,
"grad_norm": 143.5907755591019,
"learning_rate": 5.230045611458789e-07,
"logits/chosen": -0.9814115762710571,
"logits/rejected": -0.9883652925491333,
"logps/chosen": -2.3635404109954834,
"logps/rejected": -2.725175380706787,
"loss": 3.4507,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -23.635404586791992,
"rewards/margins": 3.6163482666015625,
"rewards/rejected": -27.251752853393555,
"step": 204
},
{
"epoch": 0.46366977664687586,
"grad_norm": 132.51043793956725,
"learning_rate": 5.199887867356143e-07,
"logits/chosen": -0.9847227334976196,
"logits/rejected": -0.9898078441619873,
"logps/chosen": -2.5268359184265137,
"logps/rejected": -2.994638442993164,
"loss": 3.0621,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -25.268360137939453,
"rewards/margins": 4.678021430969238,
"rewards/rejected": -29.946386337280273,
"step": 205
},
{
"epoch": 0.46593158043539723,
"grad_norm": 118.9977147061269,
"learning_rate": 5.16965498568662e-07,
"logits/chosen": -1.0224330425262451,
"logits/rejected": -0.9982988834381104,
"logps/chosen": -2.616161584854126,
"logps/rejected": -3.237916946411133,
"loss": 2.9049,
"rewards/accuracies": 0.71875,
"rewards/chosen": -26.1616153717041,
"rewards/margins": 6.217552661895752,
"rewards/rejected": -32.37916946411133,
"step": 206
},
{
"epoch": 0.4681933842239186,
"grad_norm": 143.69066684123075,
"learning_rate": 5.139348859648098e-07,
"logits/chosen": -1.036367654800415,
"logits/rejected": -1.0168451070785522,
"logps/chosen": -2.4392778873443604,
"logps/rejected": -2.870711088180542,
"loss": 3.0652,
"rewards/accuracies": 0.75,
"rewards/chosen": -24.39278221130371,
"rewards/margins": 4.314330101013184,
"rewards/rejected": -28.707111358642578,
"step": 207
},
{
"epoch": 0.4704551880124399,
"grad_norm": 140.65993366550816,
"learning_rate": 5.10897138702506e-07,
"logits/chosen": -0.9891340732574463,
"logits/rejected": -0.9894376993179321,
"logps/chosen": -2.548025608062744,
"logps/rejected": -2.9896531105041504,
"loss": 3.5331,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -25.48025894165039,
"rewards/margins": 4.4162702560424805,
"rewards/rejected": -29.896530151367188,
"step": 208
},
{
"epoch": 0.4727169918009613,
"grad_norm": 149.36997254654412,
"learning_rate": 5.078524470069743e-07,
"logits/chosen": -1.0524518489837646,
"logits/rejected": -1.0292545557022095,
"logps/chosen": -2.681795120239258,
"logps/rejected": -3.1556873321533203,
"loss": 3.1155,
"rewards/accuracies": 0.78125,
"rewards/chosen": -26.817949295043945,
"rewards/margins": 4.738921642303467,
"rewards/rejected": -31.556873321533203,
"step": 209
},
{
"epoch": 0.47497879558948264,
"grad_norm": 155.2109822525701,
"learning_rate": 5.048010015383021e-07,
"logits/chosen": -0.9646722674369812,
"logits/rejected": -0.956778883934021,
"logps/chosen": -2.597956657409668,
"logps/rejected": -3.2177212238311768,
"loss": 2.8626,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -25.979564666748047,
"rewards/margins": 6.197646617889404,
"rewards/rejected": -32.177215576171875,
"step": 210
},
{
"epoch": 0.47724059937800395,
"grad_norm": 147.3834849484295,
"learning_rate": 5.01742993379502e-07,
"logits/chosen": -1.0122888088226318,
"logits/rejected": -1.0171258449554443,
"logps/chosen": -2.6626272201538086,
"logps/rejected": -3.146449089050293,
"loss": 3.0977,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -26.626270294189453,
"rewards/margins": 4.838218688964844,
"rewards/rejected": -31.464488983154297,
"step": 211
},
{
"epoch": 0.4795024031665253,
"grad_norm": 159.19015701212336,
"learning_rate": 4.986786140245446e-07,
"logits/chosen": -0.9807606339454651,
"logits/rejected": -0.9683344960212708,
"logps/chosen": -2.5756638050079346,
"logps/rejected": -2.922246217727661,
"loss": 3.8934,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -25.756633758544922,
"rewards/margins": 3.4658255577087402,
"rewards/rejected": -29.222461700439453,
"step": 212
},
{
"epoch": 0.4817642069550466,
"grad_norm": 170.92750297621785,
"learning_rate": 4.956080553663687e-07,
"logits/chosen": -1.0279110670089722,
"logits/rejected": -1.0205495357513428,
"logps/chosen": -2.631155252456665,
"logps/rejected": -3.1095268726348877,
"loss": 3.4103,
"rewards/accuracies": 0.734375,
"rewards/chosen": -26.311552047729492,
"rewards/margins": 4.783715724945068,
"rewards/rejected": -31.09527015686035,
"step": 213
},
{
"epoch": 0.484026010743568,
"grad_norm": 154.82473362445955,
"learning_rate": 4.925315096848636e-07,
"logits/chosen": -1.0190399885177612,
"logits/rejected": -1.014163613319397,
"logps/chosen": -2.682346820831299,
"logps/rejected": -3.2099146842956543,
"loss": 3.0444,
"rewards/accuracies": 0.78125,
"rewards/chosen": -26.823469161987305,
"rewards/margins": 5.2756781578063965,
"rewards/rejected": -32.099143981933594,
"step": 214
},
{
"epoch": 0.48628781453208936,
"grad_norm": 138.22471006405996,
"learning_rate": 4.894491696348293e-07,
"logits/chosen": -1.0609517097473145,
"logits/rejected": -1.0692510604858398,
"logps/chosen": -2.570169448852539,
"logps/rejected": -2.9345059394836426,
"loss": 3.3457,
"rewards/accuracies": 0.75,
"rewards/chosen": -25.701698303222656,
"rewards/margins": 3.6433632373809814,
"rewards/rejected": -29.345060348510742,
"step": 215
},
{
"epoch": 0.48854961832061067,
"grad_norm": 132.4676990140604,
"learning_rate": 4.863612282339116e-07,
"logits/chosen": -0.968072235584259,
"logits/rejected": -0.9448983073234558,
"logps/chosen": -2.7336604595184326,
"logps/rejected": -3.2022647857666016,
"loss": 3.3316,
"rewards/accuracies": 0.71875,
"rewards/chosen": -27.336606979370117,
"rewards/margins": 4.6860432624816895,
"rewards/rejected": -32.02265167236328,
"step": 216
},
{
"epoch": 0.49081142210913203,
"grad_norm": 130.33127389382147,
"learning_rate": 4.832678788505161e-07,
"logits/chosen": -0.9838683009147644,
"logits/rejected": -0.9567040801048279,
"logps/chosen": -2.637286901473999,
"logps/rejected": -3.1400976181030273,
"loss": 3.1326,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -26.37286949157715,
"rewards/margins": 5.028109550476074,
"rewards/rejected": -31.40097999572754,
"step": 217
},
{
"epoch": 0.4930732258976534,
"grad_norm": 200.20541042576428,
"learning_rate": 4.801693151916985e-07,
"logits/chosen": -1.0184892416000366,
"logits/rejected": -1.014449954032898,
"logps/chosen": -2.620845079421997,
"logps/rejected": -3.109973430633545,
"loss": 3.0453,
"rewards/accuracies": 0.75,
"rewards/chosen": -26.20844841003418,
"rewards/margins": 4.891287326812744,
"rewards/rejected": -31.09973907470703,
"step": 218
},
{
"epoch": 0.4953350296861747,
"grad_norm": 124.9685928887746,
"learning_rate": 4.770657312910354e-07,
"logits/chosen": -1.0416370630264282,
"logits/rejected": -1.032057523727417,
"logps/chosen": -2.6119141578674316,
"logps/rejected": -3.0717544555664062,
"loss": 3.3908,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -26.119142532348633,
"rewards/margins": 4.5983991622924805,
"rewards/rejected": -30.71754264831543,
"step": 219
},
{
"epoch": 0.4975968334746961,
"grad_norm": 144.4680629722706,
"learning_rate": 4.739573214964729e-07,
"logits/chosen": -1.0071725845336914,
"logits/rejected": -0.979911208152771,
"logps/chosen": -2.5241236686706543,
"logps/rejected": -2.997404098510742,
"loss": 3.1178,
"rewards/accuracies": 0.734375,
"rewards/chosen": -25.241235733032227,
"rewards/margins": 4.732804298400879,
"rewards/rejected": -29.974040985107422,
"step": 220
},
{
"epoch": 0.49985863726321744,
"grad_norm": 222.12373832600284,
"learning_rate": 4.7084428045815733e-07,
"logits/chosen": -0.9902421832084656,
"logits/rejected": -0.9855415225028992,
"logps/chosen": -2.6917271614074707,
"logps/rejected": -3.020761013031006,
"loss": 4.0458,
"rewards/accuracies": 0.671875,
"rewards/chosen": -26.91727066040039,
"rewards/margins": 3.290342330932617,
"rewards/rejected": -30.20760726928711,
"step": 221
},
{
"epoch": 0.5021204410517388,
"grad_norm": 184.680260766697,
"learning_rate": 4.677268031162457e-07,
"logits/chosen": -1.0093257427215576,
"logits/rejected": -0.9966680407524109,
"logps/chosen": -2.5475189685821533,
"logps/rejected": -3.01365065574646,
"loss": 3.4698,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -25.475189208984375,
"rewards/margins": 4.66131591796875,
"rewards/rejected": -30.136505126953125,
"step": 222
},
{
"epoch": 0.5043822448402601,
"grad_norm": 136.79024356574675,
"learning_rate": 4.646050846886985e-07,
"logits/chosen": -0.9515697360038757,
"logits/rejected": -0.9518415331840515,
"logps/chosen": -2.4492499828338623,
"logps/rejected": -2.9192066192626953,
"loss": 3.2096,
"rewards/accuracies": 0.703125,
"rewards/chosen": -24.49249839782715,
"rewards/margins": 4.699567794799805,
"rewards/rejected": -29.19206428527832,
"step": 223
},
{
"epoch": 0.5066440486287814,
"grad_norm": 151.5169744750405,
"learning_rate": 4.6147932065905494e-07,
"logits/chosen": -1.019814133644104,
"logits/rejected": -1.0050256252288818,
"logps/chosen": -2.612035036087036,
"logps/rejected": -2.9731242656707764,
"loss": 3.5691,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -26.12034797668457,
"rewards/margins": 3.6108956336975098,
"rewards/rejected": -29.731239318847656,
"step": 224
},
{
"epoch": 0.5089058524173028,
"grad_norm": 146.9307990920379,
"learning_rate": 4.5834970676419214e-07,
"logits/chosen": -0.9980362057685852,
"logits/rejected": -0.981331467628479,
"logps/chosen": -2.5562005043029785,
"logps/rejected": -2.9508323669433594,
"loss": 3.4484,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -25.562007904052734,
"rewards/margins": 3.946317434310913,
"rewards/rejected": -29.508325576782227,
"step": 225
},
{
"epoch": 0.5111676562058242,
"grad_norm": 178.66761742959343,
"learning_rate": 4.552164389820673e-07,
"logits/chosen": -0.9223219156265259,
"logits/rejected": -0.9146152138710022,
"logps/chosen": -2.5326623916625977,
"logps/rejected": -3.0222249031066895,
"loss": 3.4509,
"rewards/accuracies": 0.734375,
"rewards/chosen": -25.326622009277344,
"rewards/margins": 4.89562463760376,
"rewards/rejected": -30.22224998474121,
"step": 226
},
{
"epoch": 0.5134294599943455,
"grad_norm": 188.01343368426728,
"learning_rate": 4.5207971351944605e-07,
"logits/chosen": -1.0626205205917358,
"logits/rejected": -1.0522465705871582,
"logps/chosen": -2.579554796218872,
"logps/rejected": -2.987112045288086,
"loss": 3.6329,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -25.795547485351562,
"rewards/margins": 4.075572967529297,
"rewards/rejected": -29.87112045288086,
"step": 227
},
{
"epoch": 0.5156912637828668,
"grad_norm": 159.4963136122297,
"learning_rate": 4.489397267996157e-07,
"logits/chosen": -1.0269495248794556,
"logits/rejected": -1.007673740386963,
"logps/chosen": -2.5031819343566895,
"logps/rejected": -2.9473769664764404,
"loss": 3.34,
"rewards/accuracies": 0.71875,
"rewards/chosen": -25.03182029724121,
"rewards/margins": 4.441953182220459,
"rewards/rejected": -29.473773956298828,
"step": 228
},
{
"epoch": 0.5179530675713881,
"grad_norm": 150.8053085996754,
"learning_rate": 4.45796675450085e-07,
"logits/chosen": -1.01454758644104,
"logits/rejected": -1.0115008354187012,
"logps/chosen": -2.4989662170410156,
"logps/rejected": -2.9577999114990234,
"loss": 3.4289,
"rewards/accuracies": 0.71875,
"rewards/chosen": -24.989662170410156,
"rewards/margins": 4.5883378982543945,
"rewards/rejected": -29.578001022338867,
"step": 229
},
{
"epoch": 0.5202148713599095,
"grad_norm": 159.36648564262296,
"learning_rate": 4.4265075629027126e-07,
"logits/chosen": -0.974044919013977,
"logits/rejected": -0.9666758179664612,
"logps/chosen": -2.5280728340148926,
"logps/rejected": -2.9372358322143555,
"loss": 3.1007,
"rewards/accuracies": 0.75,
"rewards/chosen": -25.280729293823242,
"rewards/margins": 4.0916314125061035,
"rewards/rejected": -29.372360229492188,
"step": 230
},
{
"epoch": 0.5224766751484309,
"grad_norm": 226.02995114665168,
"learning_rate": 4.3950216631917563e-07,
"logits/chosen": -1.0299785137176514,
"logits/rejected": -1.032341718673706,
"logps/chosen": -2.4965872764587402,
"logps/rejected": -3.022507429122925,
"loss": 2.8711,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -24.96587371826172,
"rewards/margins": 5.259200572967529,
"rewards/rejected": -30.225072860717773,
"step": 231
},
{
"epoch": 0.5247384789369522,
"grad_norm": 165.05960224221946,
"learning_rate": 4.3635110270304676e-07,
"logits/chosen": -1.042232871055603,
"logits/rejected": -1.0289117097854614,
"logps/chosen": -2.436948537826538,
"logps/rejected": -2.940929889678955,
"loss": 2.3945,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -24.36948585510254,
"rewards/margins": 5.039816856384277,
"rewards/rejected": -29.4093017578125,
"step": 232
},
{
"epoch": 0.5270002827254736,
"grad_norm": 159.51910763447063,
"learning_rate": 4.331977627630339e-07,
"logits/chosen": -0.9937188029289246,
"logits/rejected": -0.9700920581817627,
"logps/chosen": -2.4370651245117188,
"logps/rejected": -2.9327409267425537,
"loss": 2.9749,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -24.370651245117188,
"rewards/margins": 4.956755638122559,
"rewards/rejected": -29.327407836914062,
"step": 233
},
{
"epoch": 0.5292620865139949,
"grad_norm": 139.67122586454525,
"learning_rate": 4.300423439628313e-07,
"logits/chosen": -1.0289289951324463,
"logits/rejected": -1.027557134628296,
"logps/chosen": -2.4969000816345215,
"logps/rejected": -3.048306941986084,
"loss": 2.6142,
"rewards/accuracies": 0.8125,
"rewards/chosen": -24.969003677368164,
"rewards/margins": 5.514064311981201,
"rewards/rejected": -30.48306655883789,
"step": 234
},
{
"epoch": 0.5315238903025162,
"grad_norm": 134.66456207195904,
"learning_rate": 4.268850438963118e-07,
"logits/chosen": -1.0312316417694092,
"logits/rejected": -1.038203239440918,
"logps/chosen": -2.631141424179077,
"logps/rejected": -3.1112723350524902,
"loss": 2.9671,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -26.311412811279297,
"rewards/margins": 4.801308631896973,
"rewards/rejected": -31.112722396850586,
"step": 235
},
{
"epoch": 0.5337856940910376,
"grad_norm": 162.1792467046527,
"learning_rate": 4.2372606027515463e-07,
"logits/chosen": -1.0173921585083008,
"logits/rejected": -1.0191435813903809,
"logps/chosen": -2.651998519897461,
"logps/rejected": -3.0730855464935303,
"loss": 3.3264,
"rewards/accuracies": 0.734375,
"rewards/chosen": -26.519981384277344,
"rewards/margins": 4.210873603820801,
"rewards/rejected": -30.73085594177246,
"step": 236
},
{
"epoch": 0.536047497879559,
"grad_norm": 175.2406482243298,
"learning_rate": 4.2056559091646387e-07,
"logits/chosen": -1.0354706048965454,
"logits/rejected": -1.0305323600769043,
"logps/chosen": -2.6608736515045166,
"logps/rejected": -3.0917770862579346,
"loss": 3.4704,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -26.608734130859375,
"rewards/margins": 4.309037208557129,
"rewards/rejected": -30.917770385742188,
"step": 237
},
{
"epoch": 0.5383093016680803,
"grad_norm": 146.3851634765387,
"learning_rate": 4.1740383373038116e-07,
"logits/chosen": -1.0109094381332397,
"logits/rejected": -1.0172268152236938,
"logps/chosen": -2.564114809036255,
"logps/rejected": -2.9928038120269775,
"loss": 3.4799,
"rewards/accuracies": 0.6875,
"rewards/chosen": -25.641149520874023,
"rewards/margins": 4.28688907623291,
"rewards/rejected": -29.928035736083984,
"step": 238
},
{
"epoch": 0.5405711054566016,
"grad_norm": 169.57296943959173,
"learning_rate": 4.1424098670769255e-07,
"logits/chosen": -1.0650702714920044,
"logits/rejected": -1.0601589679718018,
"logps/chosen": -2.695429563522339,
"logps/rejected": -3.104177713394165,
"loss": 3.3077,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -26.954296112060547,
"rewards/margins": 4.0874810218811035,
"rewards/rejected": -31.041776657104492,
"step": 239
},
{
"epoch": 0.542832909245123,
"grad_norm": 119.6967428650925,
"learning_rate": 4.1107724790743007e-07,
"logits/chosen": -0.9832959175109863,
"logits/rejected": -1.000196933746338,
"logps/chosen": -2.490891933441162,
"logps/rejected": -2.9166793823242188,
"loss": 2.9422,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -24.908920288085938,
"rewards/margins": 4.257873058319092,
"rewards/rejected": -29.166791915893555,
"step": 240
},
{
"epoch": 0.5450947130336443,
"grad_norm": 163.28408702057416,
"learning_rate": 4.0791281544446947e-07,
"logits/chosen": -1.053307294845581,
"logits/rejected": -1.0447359085083008,
"logps/chosen": -2.5901968479156494,
"logps/rejected": -3.1284067630767822,
"loss": 2.6637,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -25.901966094970703,
"rewards/margins": 5.382099628448486,
"rewards/rejected": -31.28406524658203,
"step": 241
},
{
"epoch": 0.5473565168221657,
"grad_norm": 148.41508302533668,
"learning_rate": 4.0474788747712416e-07,
"logits/chosen": -1.057266116142273,
"logits/rejected": -1.047303318977356,
"logps/chosen": -2.5654890537261963,
"logps/rejected": -2.987715244293213,
"loss": 3.2988,
"rewards/accuracies": 0.71875,
"rewards/chosen": -25.654890060424805,
"rewards/margins": 4.222264289855957,
"rewards/rejected": -29.877155303955078,
"step": 242
},
{
"epoch": 0.549618320610687,
"grad_norm": 126.18312951161131,
"learning_rate": 4.0158266219473573e-07,
"logits/chosen": -1.0503863096237183,
"logits/rejected": -1.0432727336883545,
"logps/chosen": -2.4267337322235107,
"logps/rejected": -2.9202306270599365,
"loss": 2.9689,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -24.267337799072266,
"rewards/margins": 4.9349684715271,
"rewards/rejected": -29.202306747436523,
"step": 243
},
{
"epoch": 0.5518801243992084,
"grad_norm": 133.87077060972973,
"learning_rate": 3.984173378052643e-07,
"logits/chosen": -1.0356292724609375,
"logits/rejected": -1.0147960186004639,
"logps/chosen": -2.316190719604492,
"logps/rejected": -2.8582763671875,
"loss": 2.48,
"rewards/accuracies": 0.8125,
"rewards/chosen": -23.161909103393555,
"rewards/margins": 5.420856475830078,
"rewards/rejected": -28.582763671875,
"step": 244
},
{
"epoch": 0.5541419281877297,
"grad_norm": 142.54981649690544,
"learning_rate": 3.9525211252287585e-07,
"logits/chosen": -1.0919835567474365,
"logits/rejected": -1.0901732444763184,
"logps/chosen": -2.596813201904297,
"logps/rejected": -3.177374839782715,
"loss": 2.8792,
"rewards/accuracies": 0.734375,
"rewards/chosen": -25.968130111694336,
"rewards/margins": 5.805619716644287,
"rewards/rejected": -31.77375030517578,
"step": 245
},
{
"epoch": 0.556403731976251,
"grad_norm": 128.27060507121888,
"learning_rate": 3.920871845555305e-07,
"logits/chosen": -1.0442081689834595,
"logits/rejected": -1.0269874334335327,
"logps/chosen": -2.5303421020507812,
"logps/rejected": -2.9623892307281494,
"loss": 2.9162,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -25.303421020507812,
"rewards/margins": 4.320469856262207,
"rewards/rejected": -29.623889923095703,
"step": 246
},
{
"epoch": 0.5586655357647724,
"grad_norm": 153.31496605164523,
"learning_rate": 3.8892275209256984e-07,
"logits/chosen": -1.0571988821029663,
"logits/rejected": -1.0373103618621826,
"logps/chosen": -2.6785757541656494,
"logps/rejected": -3.0567195415496826,
"loss": 3.0226,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -26.78575897216797,
"rewards/margins": 3.7814342975616455,
"rewards/rejected": -30.567195892333984,
"step": 247
},
{
"epoch": 0.5609273395532938,
"grad_norm": 171.8190367681467,
"learning_rate": 3.8575901329230747e-07,
"logits/chosen": -1.0109193325042725,
"logits/rejected": -0.9870609641075134,
"logps/chosen": -2.7195398807525635,
"logps/rejected": -3.1331217288970947,
"loss": 3.6614,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -27.19540023803711,
"rewards/margins": 4.135817527770996,
"rewards/rejected": -31.331218719482422,
"step": 248
},
{
"epoch": 0.5631891433418151,
"grad_norm": 140.4062037075952,
"learning_rate": 3.8259616626961886e-07,
"logits/chosen": -0.9998199939727783,
"logits/rejected": -1.00416898727417,
"logps/chosen": -2.5169920921325684,
"logps/rejected": -2.868487596511841,
"loss": 3.1507,
"rewards/accuracies": 0.765625,
"rewards/chosen": -25.169921875,
"rewards/margins": 3.5149550437927246,
"rewards/rejected": -28.68487548828125,
"step": 249
},
{
"epoch": 0.5654509471303364,
"grad_norm": 166.8389194279534,
"learning_rate": 3.794344090835362e-07,
"logits/chosen": -1.0278959274291992,
"logits/rejected": -1.0074037313461304,
"logps/chosen": -2.7008965015411377,
"logps/rejected": -3.127589225769043,
"loss": 3.4144,
"rewards/accuracies": 0.71875,
"rewards/chosen": -27.00896453857422,
"rewards/margins": 4.26693058013916,
"rewards/rejected": -31.275894165039062,
"step": 250
},
{
"epoch": 0.5677127509188578,
"grad_norm": 143.63770434185832,
"learning_rate": 3.7627393972484534e-07,
"logits/chosen": -1.1122283935546875,
"logits/rejected": -1.0987051725387573,
"logps/chosen": -2.713491439819336,
"logps/rejected": -3.113678216934204,
"loss": 3.2252,
"rewards/accuracies": 0.734375,
"rewards/chosen": -27.134912490844727,
"rewards/margins": 4.001870155334473,
"rewards/rejected": -31.136781692504883,
"step": 251
},
{
"epoch": 0.5699745547073791,
"grad_norm": 136.26224349022138,
"learning_rate": 3.7311495610368823e-07,
"logits/chosen": -1.0734153985977173,
"logits/rejected": -1.0677202939987183,
"logps/chosen": -2.799349784851074,
"logps/rejected": -3.266136646270752,
"loss": 3.0055,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -27.99349594116211,
"rewards/margins": 4.667870998382568,
"rewards/rejected": -32.6613655090332,
"step": 252
},
{
"epoch": 0.5722363584959005,
"grad_norm": 156.68909191083887,
"learning_rate": 3.699576560371689e-07,
"logits/chosen": -1.0107449293136597,
"logits/rejected": -1.012537956237793,
"logps/chosen": -2.8697638511657715,
"logps/rejected": -3.5351531505584717,
"loss": 2.4639,
"rewards/accuracies": 0.78125,
"rewards/chosen": -28.69763946533203,
"rewards/margins": 6.6538920402526855,
"rewards/rejected": -35.351531982421875,
"step": 253
},
{
"epoch": 0.5744981622844219,
"grad_norm": 150.83056174799268,
"learning_rate": 3.66802237236966e-07,
"logits/chosen": -1.0131436586380005,
"logits/rejected": -1.0157915353775024,
"logps/chosen": -2.8984572887420654,
"logps/rejected": -3.4090542793273926,
"loss": 3.0289,
"rewards/accuracies": 0.765625,
"rewards/chosen": -28.984573364257812,
"rewards/margins": 5.105963706970215,
"rewards/rejected": -34.090538024902344,
"step": 254
},
{
"epoch": 0.5767599660729432,
"grad_norm": 167.25008334154396,
"learning_rate": 3.636488972969532e-07,
"logits/chosen": -1.0094777345657349,
"logits/rejected": -1.0090457201004028,
"logps/chosen": -2.8125481605529785,
"logps/rejected": -3.2535414695739746,
"loss": 3.3245,
"rewards/accuracies": 0.71875,
"rewards/chosen": -28.1254825592041,
"rewards/margins": 4.409930229187012,
"rewards/rejected": -32.53541564941406,
"step": 255
},
{
"epoch": 0.5790217698614645,
"grad_norm": 141.29219706418925,
"learning_rate": 3.604978336808244e-07,
"logits/chosen": -1.140702724456787,
"logits/rejected": -1.1400625705718994,
"logps/chosen": -2.848879337310791,
"logps/rejected": -3.357835292816162,
"loss": 2.8014,
"rewards/accuracies": 0.796875,
"rewards/chosen": -28.48879051208496,
"rewards/margins": 5.089555740356445,
"rewards/rejected": -33.57835006713867,
"step": 256
},
{
"epoch": 0.5812835736499858,
"grad_norm": 151.14436444702108,
"learning_rate": 3.5734924370972876e-07,
"logits/chosen": -1.0457602739334106,
"logits/rejected": -1.0472681522369385,
"logps/chosen": -2.700531244277954,
"logps/rejected": -3.1960270404815674,
"loss": 2.7282,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -27.005313873291016,
"rewards/margins": 4.954959392547607,
"rewards/rejected": -31.96027183532715,
"step": 257
},
{
"epoch": 0.5835453774385072,
"grad_norm": 169.2294504356089,
"learning_rate": 3.5420332454991504e-07,
"logits/chosen": -1.0189040899276733,
"logits/rejected": -1.0160531997680664,
"logps/chosen": -2.818633556365967,
"logps/rejected": -3.2412967681884766,
"loss": 3.4916,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -28.186336517333984,
"rewards/margins": 4.226632118225098,
"rewards/rejected": -32.412967681884766,
"step": 258
},
{
"epoch": 0.5858071812270286,
"grad_norm": 168.96562111413766,
"learning_rate": 3.510602732003843e-07,
"logits/chosen": -1.0517528057098389,
"logits/rejected": -1.0637171268463135,
"logps/chosen": -3.009273052215576,
"logps/rejected": -3.4520859718322754,
"loss": 3.2834,
"rewards/accuracies": 0.734375,
"rewards/chosen": -30.092731475830078,
"rewards/margins": 4.428128719329834,
"rewards/rejected": -34.52085876464844,
"step": 259
},
{
"epoch": 0.5880689850155499,
"grad_norm": 154.95870708399784,
"learning_rate": 3.4792028648055396e-07,
"logits/chosen": -1.0257949829101562,
"logits/rejected": -1.0418014526367188,
"logps/chosen": -2.8583335876464844,
"logps/rejected": -3.3227787017822266,
"loss": 2.9362,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -28.583337783813477,
"rewards/margins": 4.644450664520264,
"rewards/rejected": -33.227787017822266,
"step": 260
},
{
"epoch": 0.5903307888040712,
"grad_norm": 152.99013472578545,
"learning_rate": 3.447835610179327e-07,
"logits/chosen": -1.0327140092849731,
"logits/rejected": -1.0097893476486206,
"logps/chosen": -2.8091721534729004,
"logps/rejected": -3.4519004821777344,
"loss": 2.827,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -28.091720581054688,
"rewards/margins": 6.427282333374023,
"rewards/rejected": -34.519004821777344,
"step": 261
},
{
"epoch": 0.5925925925925926,
"grad_norm": 166.47320424747275,
"learning_rate": 3.416502932358079e-07,
"logits/chosen": -1.0839364528656006,
"logits/rejected": -1.0797481536865234,
"logps/chosen": -2.8693737983703613,
"logps/rejected": -3.1928672790527344,
"loss": 3.5401,
"rewards/accuracies": 0.65625,
"rewards/chosen": -28.69373893737793,
"rewards/margins": 3.2349326610565186,
"rewards/rejected": -31.92867088317871,
"step": 262
},
{
"epoch": 0.5948543963811139,
"grad_norm": 175.40553313980547,
"learning_rate": 3.385206793409451e-07,
"logits/chosen": -1.0378577709197998,
"logits/rejected": -1.0370153188705444,
"logps/chosen": -2.7698192596435547,
"logps/rejected": -3.1720666885375977,
"loss": 3.3164,
"rewards/accuracies": 0.765625,
"rewards/chosen": -27.698190689086914,
"rewards/margins": 4.0224761962890625,
"rewards/rejected": -31.72066879272461,
"step": 263
},
{
"epoch": 0.5971162001696353,
"grad_norm": 155.3543000873015,
"learning_rate": 3.3539491531130163e-07,
"logits/chosen": -1.0410065650939941,
"logits/rejected": -1.0408883094787598,
"logps/chosen": -2.8821988105773926,
"logps/rejected": -3.3631529808044434,
"loss": 3.193,
"rewards/accuracies": 0.765625,
"rewards/chosen": -28.82198715209961,
"rewards/margins": 4.809541702270508,
"rewards/rejected": -33.631526947021484,
"step": 264
},
{
"epoch": 0.5993780039581567,
"grad_norm": 149.20108555092546,
"learning_rate": 3.3227319688375426e-07,
"logits/chosen": -1.1061354875564575,
"logits/rejected": -1.0955908298492432,
"logps/chosen": -2.884066581726074,
"logps/rejected": -3.2610855102539062,
"loss": 3.2773,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -28.840667724609375,
"rewards/margins": 3.770188808441162,
"rewards/rejected": -32.61085891723633,
"step": 265
},
{
"epoch": 0.601639807746678,
"grad_norm": 155.70578005343887,
"learning_rate": 3.291557195418427e-07,
"logits/chosen": -1.0915729999542236,
"logits/rejected": -1.0768842697143555,
"logps/chosen": -2.6985390186309814,
"logps/rejected": -3.0585601329803467,
"loss": 3.4593,
"rewards/accuracies": 0.734375,
"rewards/chosen": -26.985389709472656,
"rewards/margins": 3.600210666656494,
"rewards/rejected": -30.58559799194336,
"step": 266
},
{
"epoch": 0.6039016115351993,
"grad_norm": 219.43494306982168,
"learning_rate": 3.260426785035272e-07,
"logits/chosen": -1.11444890499115,
"logits/rejected": -1.1026936769485474,
"logps/chosen": -2.854384422302246,
"logps/rejected": -3.2280983924865723,
"loss": 3.7163,
"rewards/accuracies": 0.6875,
"rewards/chosen": -28.543846130371094,
"rewards/margins": 3.7371411323547363,
"rewards/rejected": -32.28098678588867,
"step": 267
},
{
"epoch": 0.6061634153237206,
"grad_norm": 171.107251475596,
"learning_rate": 3.229342687089646e-07,
"logits/chosen": -1.070772647857666,
"logits/rejected": -1.0768314599990845,
"logps/chosen": -2.7497973442077637,
"logps/rejected": -3.2370479106903076,
"loss": 3.0025,
"rewards/accuracies": 0.78125,
"rewards/chosen": -27.497974395751953,
"rewards/margins": 4.8725080490112305,
"rewards/rejected": -32.370479583740234,
"step": 268
},
{
"epoch": 0.608425219112242,
"grad_norm": 171.5051536968982,
"learning_rate": 3.1983068480830143e-07,
"logits/chosen": -1.0967150926589966,
"logits/rejected": -1.0990163087844849,
"logps/chosen": -2.86303973197937,
"logps/rejected": -3.367943048477173,
"loss": 3.1547,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -28.63039779663086,
"rewards/margins": 5.04902982711792,
"rewards/rejected": -33.67942810058594,
"step": 269
},
{
"epoch": 0.6106870229007634,
"grad_norm": 151.3759521009327,
"learning_rate": 3.1673212114948387e-07,
"logits/chosen": -1.0704870223999023,
"logits/rejected": -1.0655865669250488,
"logps/chosen": -2.7373344898223877,
"logps/rejected": -3.2650370597839355,
"loss": 2.7578,
"rewards/accuracies": 0.75,
"rewards/chosen": -27.37334632873535,
"rewards/margins": 5.277024269104004,
"rewards/rejected": -32.650367736816406,
"step": 270
},
{
"epoch": 0.6129488266892847,
"grad_norm": 183.96062606737704,
"learning_rate": 3.1363877176608845e-07,
"logits/chosen": -1.0521974563598633,
"logits/rejected": -1.0564298629760742,
"logps/chosen": -2.6424999237060547,
"logps/rejected": -3.1260673999786377,
"loss": 3.0961,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -26.42500114440918,
"rewards/margins": 4.835672378540039,
"rewards/rejected": -31.26067352294922,
"step": 271
},
{
"epoch": 0.615210630477806,
"grad_norm": 146.7415619801789,
"learning_rate": 3.1055083036517076e-07,
"logits/chosen": -1.0500925779342651,
"logits/rejected": -1.0320236682891846,
"logps/chosen": -2.6332831382751465,
"logps/rejected": -3.151555299758911,
"loss": 2.8809,
"rewards/accuracies": 0.78125,
"rewards/chosen": -26.33283233642578,
"rewards/margins": 5.182720184326172,
"rewards/rejected": -31.51555061340332,
"step": 272
},
{
"epoch": 0.6174724342663274,
"grad_norm": 176.8059829778894,
"learning_rate": 3.074684903151364e-07,
"logits/chosen": -0.9690829515457153,
"logits/rejected": -0.9445469379425049,
"logps/chosen": -2.4842689037323,
"logps/rejected": -2.8961751461029053,
"loss": 2.9286,
"rewards/accuracies": 0.796875,
"rewards/chosen": -24.842689514160156,
"rewards/margins": 4.1190619468688965,
"rewards/rejected": -28.96175193786621,
"step": 273
},
{
"epoch": 0.6197342380548487,
"grad_norm": 174.11563926665636,
"learning_rate": 3.0439194463363136e-07,
"logits/chosen": -1.03439199924469,
"logits/rejected": -1.011659860610962,
"logps/chosen": -2.580949306488037,
"logps/rejected": -2.947218179702759,
"loss": 3.3772,
"rewards/accuracies": 0.75,
"rewards/chosen": -25.809492111206055,
"rewards/margins": 3.6626861095428467,
"rewards/rejected": -29.47217559814453,
"step": 274
},
{
"epoch": 0.6219960418433701,
"grad_norm": 120.16739284187575,
"learning_rate": 3.0132138597545537e-07,
"logits/chosen": -1.0700979232788086,
"logits/rejected": -1.0875582695007324,
"logps/chosen": -2.760199546813965,
"logps/rejected": -3.3094594478607178,
"loss": 2.8477,
"rewards/accuracies": 0.734375,
"rewards/chosen": -27.601999282836914,
"rewards/margins": 5.4925971031188965,
"rewards/rejected": -33.0945930480957,
"step": 275
},
{
"epoch": 0.6242578456318915,
"grad_norm": 133.7458346644232,
"learning_rate": 2.982570066204981e-07,
"logits/chosen": -1.0688081979751587,
"logits/rejected": -1.0659135580062866,
"logps/chosen": -2.709754705429077,
"logps/rejected": -3.212181329727173,
"loss": 2.8696,
"rewards/accuracies": 0.75,
"rewards/chosen": -27.097549438476562,
"rewards/margins": 5.024271011352539,
"rewards/rejected": -32.12181854248047,
"step": 276
},
{
"epoch": 0.6265196494204128,
"grad_norm": 194.94745882680402,
"learning_rate": 2.951989984616979e-07,
"logits/chosen": -1.0103790760040283,
"logits/rejected": -1.0189611911773682,
"logps/chosen": -2.788445472717285,
"logps/rejected": -3.306720018386841,
"loss": 3.2288,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -27.88445472717285,
"rewards/margins": 5.182745933532715,
"rewards/rejected": -33.06719970703125,
"step": 277
},
{
"epoch": 0.6287814532089341,
"grad_norm": 181.74990018566842,
"learning_rate": 2.9214755299302584e-07,
"logits/chosen": -0.9969690442085266,
"logits/rejected": -0.993459939956665,
"logps/chosen": -2.8557004928588867,
"logps/rejected": -3.3921194076538086,
"loss": 2.7074,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -28.5570068359375,
"rewards/margins": 5.3641886711120605,
"rewards/rejected": -33.92119598388672,
"step": 278
},
{
"epoch": 0.6310432569974554,
"grad_norm": 149.61408474184353,
"learning_rate": 2.89102861297494e-07,
"logits/chosen": -1.0359619855880737,
"logits/rejected": -1.0459537506103516,
"logps/chosen": -2.7723605632781982,
"logps/rejected": -3.2539467811584473,
"loss": 3.182,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -27.72360610961914,
"rewards/margins": 4.815859794616699,
"rewards/rejected": -32.539466857910156,
"step": 279
},
{
"epoch": 0.6333050607859768,
"grad_norm": 195.39013018559334,
"learning_rate": 2.860651140351902e-07,
"logits/chosen": -1.037549614906311,
"logits/rejected": -1.0350432395935059,
"logps/chosen": -2.8769359588623047,
"logps/rejected": -3.420259475708008,
"loss": 2.9818,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -28.769359588623047,
"rewards/margins": 5.433237552642822,
"rewards/rejected": -34.20259475708008,
"step": 280
},
{
"epoch": 0.6355668645744982,
"grad_norm": 155.92948546147593,
"learning_rate": 2.830345014313381e-07,
"logits/chosen": -0.9779025912284851,
"logits/rejected": -0.9906786680221558,
"logps/chosen": -2.9215312004089355,
"logps/rejected": -3.4977033138275146,
"loss": 2.6284,
"rewards/accuracies": 0.8125,
"rewards/chosen": -29.215312957763672,
"rewards/margins": 5.761720180511475,
"rewards/rejected": -34.97703552246094,
"step": 281
},
{
"epoch": 0.6378286683630195,
"grad_norm": 187.88083261637343,
"learning_rate": 2.800112132643856e-07,
"logits/chosen": -1.0564908981323242,
"logits/rejected": -1.0514881610870361,
"logps/chosen": -2.9547762870788574,
"logps/rejected": -3.5844526290893555,
"loss": 2.8202,
"rewards/accuracies": 0.78125,
"rewards/chosen": -29.547761917114258,
"rewards/margins": 6.2967658042907715,
"rewards/rejected": -35.84452819824219,
"step": 282
},
{
"epoch": 0.6400904721515408,
"grad_norm": 156.47358710093474,
"learning_rate": 2.7699543885412105e-07,
"logits/chosen": -1.0226179361343384,
"logits/rejected": -1.0112264156341553,
"logps/chosen": -2.949159622192383,
"logps/rejected": -3.5769615173339844,
"loss": 2.5308,
"rewards/accuracies": 0.8359375,
"rewards/chosen": -29.491596221923828,
"rewards/margins": 6.278021812438965,
"rewards/rejected": -35.76961898803711,
"step": 283
},
{
"epoch": 0.6423522759400622,
"grad_norm": 153.99253553149956,
"learning_rate": 2.7398736704981725e-07,
"logits/chosen": -1.034913182258606,
"logits/rejected": -1.0078083276748657,
"logps/chosen": -3.0782995223999023,
"logps/rejected": -3.6696910858154297,
"loss": 2.6226,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -30.78299331665039,
"rewards/margins": 5.913917541503906,
"rewards/rejected": -36.6969108581543,
"step": 284
},
{
"epoch": 0.6446140797285835,
"grad_norm": 222.1159065771539,
"learning_rate": 2.709871862184063e-07,
"logits/chosen": -1.0211516618728638,
"logits/rejected": -1.0156588554382324,
"logps/chosen": -2.9046590328216553,
"logps/rejected": -3.4058563709259033,
"loss": 3.3323,
"rewards/accuracies": 0.75,
"rewards/chosen": -29.046588897705078,
"rewards/margins": 5.011976718902588,
"rewards/rejected": -34.05856704711914,
"step": 285
},
{
"epoch": 0.6468758835171049,
"grad_norm": 156.58411783215192,
"learning_rate": 2.679950842326837e-07,
"logits/chosen": -1.055216908454895,
"logits/rejected": -1.0356172323226929,
"logps/chosen": -2.966404676437378,
"logps/rejected": -3.566565990447998,
"loss": 2.673,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -29.664047241210938,
"rewards/margins": 6.001612186431885,
"rewards/rejected": -35.66565704345703,
"step": 286
},
{
"epoch": 0.6491376873056263,
"grad_norm": 131.25191153891365,
"learning_rate": 2.6501124845954363e-07,
"logits/chosen": -1.0120482444763184,
"logits/rejected": -1.0116891860961914,
"logps/chosen": -2.8681583404541016,
"logps/rejected": -3.502807140350342,
"loss": 2.3787,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -28.68158531188965,
"rewards/margins": 6.346485137939453,
"rewards/rejected": -35.02806854248047,
"step": 287
},
{
"epoch": 0.6513994910941476,
"grad_norm": 182.00067587268094,
"learning_rate": 2.62035865748246e-07,
"logits/chosen": -0.9981238842010498,
"logits/rejected": -0.9938050508499146,
"logps/chosen": -2.7982468605041504,
"logps/rejected": -3.3067078590393066,
"loss": 2.9436,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -27.982467651367188,
"rewards/margins": 5.084610462188721,
"rewards/rejected": -33.06707763671875,
"step": 288
},
{
"epoch": 0.6536612948826689,
"grad_norm": 158.2959824838079,
"learning_rate": 2.5906912241871554e-07,
"logits/chosen": -1.0856727361679077,
"logits/rejected": -1.0612850189208984,
"logps/chosen": -2.90170955657959,
"logps/rejected": -3.4285690784454346,
"loss": 2.8793,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -29.0170955657959,
"rewards/margins": 5.268592357635498,
"rewards/rejected": -34.28569030761719,
"step": 289
},
{
"epoch": 0.6559230986711903,
"grad_norm": 187.9726365909188,
"learning_rate": 2.561112042498753e-07,
"logits/chosen": -0.992550253868103,
"logits/rejected": -0.9997091293334961,
"logps/chosen": -2.7526485919952393,
"logps/rejected": -3.165712594985962,
"loss": 3.721,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -27.526485443115234,
"rewards/margins": 4.130640029907227,
"rewards/rejected": -31.65712547302246,
"step": 290
},
{
"epoch": 0.6581849024597116,
"grad_norm": 143.79444839334136,
"learning_rate": 2.5316229646801195e-07,
"logits/chosen": -0.9941728115081787,
"logits/rejected": -0.992635190486908,
"logps/chosen": -2.9787821769714355,
"logps/rejected": -3.4489240646362305,
"loss": 2.7863,
"rewards/accuracies": 0.8125,
"rewards/chosen": -29.787822723388672,
"rewards/margins": 4.701417922973633,
"rewards/rejected": -34.48924255371094,
"step": 291
},
{
"epoch": 0.660446706248233,
"grad_norm": 153.0517998301082,
"learning_rate": 2.5022258373517714e-07,
"logits/chosen": -1.104297161102295,
"logits/rejected": -1.0981920957565308,
"logps/chosen": -2.7750794887542725,
"logps/rejected": -3.223928451538086,
"loss": 2.9225,
"rewards/accuracies": 0.78125,
"rewards/chosen": -27.75079345703125,
"rewards/margins": 4.488492012023926,
"rewards/rejected": -32.23928451538086,
"step": 292
},
{
"epoch": 0.6627085100367544,
"grad_norm": 153.94026088368992,
"learning_rate": 2.4729225013762474e-07,
"logits/chosen": -1.143121361732483,
"logits/rejected": -1.142082929611206,
"logps/chosen": -3.0029547214508057,
"logps/rejected": -3.4995949268341064,
"loss": 3.0848,
"rewards/accuracies": 0.75,
"rewards/chosen": -30.0295467376709,
"rewards/margins": 4.966399669647217,
"rewards/rejected": -34.995948791503906,
"step": 293
},
{
"epoch": 0.6649703138252756,
"grad_norm": 181.36537578535228,
"learning_rate": 2.4437147917428203e-07,
"logits/chosen": -1.033583641052246,
"logits/rejected": -1.026897668838501,
"logps/chosen": -2.8641695976257324,
"logps/rejected": -3.3429033756256104,
"loss": 3.1708,
"rewards/accuracies": 0.765625,
"rewards/chosen": -28.641693115234375,
"rewards/margins": 4.787341117858887,
"rewards/rejected": -33.429039001464844,
"step": 294
},
{
"epoch": 0.667232117613797,
"grad_norm": 232.87767557454146,
"learning_rate": 2.414604537452595e-07,
"logits/chosen": -1.0605061054229736,
"logits/rejected": -1.0549243688583374,
"logps/chosen": -2.7509469985961914,
"logps/rejected": -3.1501309871673584,
"loss": 3.3439,
"rewards/accuracies": 0.71875,
"rewards/chosen": -27.509469985961914,
"rewards/margins": 3.9918391704559326,
"rewards/rejected": -31.50130844116211,
"step": 295
},
{
"epoch": 0.6694939214023183,
"grad_norm": 140.3168861772763,
"learning_rate": 2.385593561403974e-07,
"logits/chosen": -1.072545051574707,
"logits/rejected": -1.0590806007385254,
"logps/chosen": -2.5582523345947266,
"logps/rejected": -3.033470392227173,
"loss": 3.001,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -25.58251953125,
"rewards/margins": 4.752185821533203,
"rewards/rejected": -30.33470916748047,
"step": 296
},
{
"epoch": 0.6717557251908397,
"grad_norm": 130.90402883012493,
"learning_rate": 2.3566836802785119e-07,
"logits/chosen": -1.0802139043807983,
"logits/rejected": -1.0805728435516357,
"logps/chosen": -2.5614163875579834,
"logps/rejected": -3.126112461090088,
"loss": 2.3384,
"rewards/accuracies": 0.796875,
"rewards/chosen": -25.614166259765625,
"rewards/margins": 5.6469621658325195,
"rewards/rejected": -31.261127471923828,
"step": 297
},
{
"epoch": 0.6740175289793611,
"grad_norm": 167.16655731623354,
"learning_rate": 2.327876704427146e-07,
"logits/chosen": -1.0259709358215332,
"logits/rejected": -1.0346571207046509,
"logps/chosen": -2.592196226119995,
"logps/rejected": -2.979300022125244,
"loss": 3.2647,
"rewards/accuracies": 0.703125,
"rewards/chosen": -25.92196273803711,
"rewards/margins": 3.871039628982544,
"rewards/rejected": -29.79300308227539,
"step": 298
},
{
"epoch": 0.6762793327678824,
"grad_norm": 260.03040862255443,
"learning_rate": 2.2991744377568358e-07,
"logits/chosen": -1.0623974800109863,
"logits/rejected": -1.0467642545700073,
"logps/chosen": -2.873406410217285,
"logps/rejected": -3.258018970489502,
"loss": 3.5362,
"rewards/accuracies": 0.734375,
"rewards/chosen": -28.73406410217285,
"rewards/margins": 3.8461239337921143,
"rewards/rejected": -32.5801887512207,
"step": 299
},
{
"epoch": 0.6785411365564037,
"grad_norm": 165.28649873322973,
"learning_rate": 2.270578677617601e-07,
"logits/chosen": -1.0998969078063965,
"logits/rejected": -1.0864473581314087,
"logps/chosen": -2.6970479488372803,
"logps/rejected": -3.2025129795074463,
"loss": 3.4805,
"rewards/accuracies": 0.734375,
"rewards/chosen": -26.970483779907227,
"rewards/margins": 5.054651260375977,
"rewards/rejected": -32.0251350402832,
"step": 300
},
{
"epoch": 0.6808029403449251,
"grad_norm": 147.02104564491643,
"learning_rate": 2.242091214689971e-07,
"logits/chosen": -1.080596923828125,
"logits/rejected": -1.0811580419540405,
"logps/chosen": -2.7154738903045654,
"logps/rejected": -3.343048095703125,
"loss": 2.413,
"rewards/accuracies": 0.828125,
"rewards/chosen": -27.154741287231445,
"rewards/margins": 6.27573823928833,
"rewards/rejected": -33.430477142333984,
"step": 301
},
{
"epoch": 0.6830647441334464,
"grad_norm": 168.1353554238078,
"learning_rate": 2.2137138328728456e-07,
"logits/chosen": -1.1214321851730347,
"logits/rejected": -1.1100562810897827,
"logps/chosen": -2.8991968631744385,
"logps/rejected": -3.303884983062744,
"loss": 3.0082,
"rewards/accuracies": 0.78125,
"rewards/chosen": -28.99197006225586,
"rewards/margins": 4.046879768371582,
"rewards/rejected": -33.038848876953125,
"step": 302
},
{
"epoch": 0.6853265479219678,
"grad_norm": 138.49574783119974,
"learning_rate": 2.1854483091717974e-07,
"logits/chosen": -1.1026039123535156,
"logits/rejected": -1.1136388778686523,
"logps/chosen": -2.7164382934570312,
"logps/rejected": -3.249185085296631,
"loss": 2.5052,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -27.16438102722168,
"rewards/margins": 5.32747220993042,
"rewards/rejected": -32.491851806640625,
"step": 303
},
{
"epoch": 0.6875883517104892,
"grad_norm": 154.94371001690558,
"learning_rate": 2.1572964135877863e-07,
"logits/chosen": -1.093347191810608,
"logits/rejected": -1.0927071571350098,
"logps/chosen": -2.7913358211517334,
"logps/rejected": -3.1904070377349854,
"loss": 3.5234,
"rewards/accuracies": 0.734375,
"rewards/chosen": -27.913358688354492,
"rewards/margins": 3.9907102584838867,
"rewards/rejected": -31.904071807861328,
"step": 304
},
{
"epoch": 0.6898501554990104,
"grad_norm": 150.4889483956078,
"learning_rate": 2.1292599090063245e-07,
"logits/chosen": -1.1195977926254272,
"logits/rejected": -1.1199798583984375,
"logps/chosen": -2.6830005645751953,
"logps/rejected": -3.2673914432525635,
"loss": 2.702,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -26.830005645751953,
"rewards/margins": 5.843911170959473,
"rewards/rejected": -32.67391586303711,
"step": 305
},
{
"epoch": 0.6921119592875318,
"grad_norm": 138.37763662950627,
"learning_rate": 2.1013405510870824e-07,
"logits/chosen": -1.0394456386566162,
"logits/rejected": -1.0511260032653809,
"logps/chosen": -2.730743885040283,
"logps/rejected": -3.277578115463257,
"loss": 2.974,
"rewards/accuracies": 0.75,
"rewards/chosen": -27.30743980407715,
"rewards/margins": 5.4683427810668945,
"rewards/rejected": -32.775779724121094,
"step": 306
},
{
"epoch": 0.6943737630760531,
"grad_norm": 148.04801804973624,
"learning_rate": 2.0735400881539494e-07,
"logits/chosen": -1.0334455966949463,
"logits/rejected": -1.0548396110534668,
"logps/chosen": -2.8730499744415283,
"logps/rejected": -3.461742877960205,
"loss": 2.6681,
"rewards/accuracies": 0.796875,
"rewards/chosen": -28.730499267578125,
"rewards/margins": 5.886929988861084,
"rewards/rejected": -34.617431640625,
"step": 307
},
{
"epoch": 0.6966355668645745,
"grad_norm": 221.53992304852852,
"learning_rate": 2.0458602610855536e-07,
"logits/chosen": -1.1213308572769165,
"logits/rejected": -1.1119002103805542,
"logps/chosen": -2.9042418003082275,
"logps/rejected": -3.3705925941467285,
"loss": 2.7571,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -29.042417526245117,
"rewards/margins": 4.663509368896484,
"rewards/rejected": -33.705928802490234,
"step": 308
},
{
"epoch": 0.6988973706530959,
"grad_norm": 136.07823495543695,
"learning_rate": 2.0183028032062422e-07,
"logits/chosen": -1.0783358812332153,
"logits/rejected": -1.0803169012069702,
"logps/chosen": -2.797858238220215,
"logps/rejected": -3.263309955596924,
"loss": 3.0103,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -27.97858238220215,
"rewards/margins": 4.654518127441406,
"rewards/rejected": -32.63310623168945,
"step": 309
},
{
"epoch": 0.7011591744416172,
"grad_norm": 155.65801501138648,
"learning_rate": 1.9908694401775473e-07,
"logits/chosen": -1.1042208671569824,
"logits/rejected": -1.1049745082855225,
"logps/chosen": -2.884843349456787,
"logps/rejected": -3.377685785293579,
"loss": 2.9679,
"rewards/accuracies": 0.765625,
"rewards/chosen": -28.848434448242188,
"rewards/margins": 4.92842435836792,
"rewards/rejected": -33.776859283447266,
"step": 310
},
{
"epoch": 0.7034209782301385,
"grad_norm": 139.09066002924212,
"learning_rate": 1.9635618898901196e-07,
"logits/chosen": -1.0947176218032837,
"logits/rejected": -1.091169834136963,
"logps/chosen": -3.14107608795166,
"logps/rejected": -3.6818785667419434,
"loss": 3.0449,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -31.410762786865234,
"rewards/margins": 5.408024311065674,
"rewards/rejected": -36.81878662109375,
"step": 311
},
{
"epoch": 0.7056827820186599,
"grad_norm": 162.5311793566366,
"learning_rate": 1.9363818623561565e-07,
"logits/chosen": -1.0548720359802246,
"logits/rejected": -1.0498396158218384,
"logps/chosen": -2.948056697845459,
"logps/rejected": -3.4283742904663086,
"loss": 3.2606,
"rewards/accuracies": 0.734375,
"rewards/chosen": -29.480567932128906,
"rewards/margins": 4.803174018859863,
"rewards/rejected": -34.28374099731445,
"step": 312
},
{
"epoch": 0.7079445858071812,
"grad_norm": 144.53325872542644,
"learning_rate": 1.9093310596023108e-07,
"logits/chosen": -1.0383893251419067,
"logits/rejected": -1.0171747207641602,
"logps/chosen": -2.96266770362854,
"logps/rejected": -3.5945122241973877,
"loss": 2.5371,
"rewards/accuracies": 0.8125,
"rewards/chosen": -29.626678466796875,
"rewards/margins": 6.318445205688477,
"rewards/rejected": -35.94512176513672,
"step": 313
},
{
"epoch": 0.7102063895957026,
"grad_norm": 159.839020331253,
"learning_rate": 1.8824111755631274e-07,
"logits/chosen": -1.09993577003479,
"logits/rejected": -1.0983682870864868,
"logps/chosen": -2.958811044692993,
"logps/rejected": -3.3868050575256348,
"loss": 3.5182,
"rewards/accuracies": 0.71875,
"rewards/chosen": -29.588109970092773,
"rewards/margins": 4.279940128326416,
"rewards/rejected": -33.8680534362793,
"step": 314
},
{
"epoch": 0.712468193384224,
"grad_norm": 173.12178435051865,
"learning_rate": 1.8556238959749457e-07,
"logits/chosen": -1.0669752359390259,
"logits/rejected": -1.058374047279358,
"logps/chosen": -3.1393425464630127,
"logps/rejected": -3.544556140899658,
"loss": 3.4766,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -31.3934268951416,
"rewards/margins": 4.052134990692139,
"rewards/rejected": -35.445560455322266,
"step": 315
},
{
"epoch": 0.7147299971727452,
"grad_norm": 163.63036158743895,
"learning_rate": 1.8289708982703562e-07,
"logits/chosen": -1.0433309078216553,
"logits/rejected": -1.026186466217041,
"logps/chosen": -2.943878650665283,
"logps/rejected": -3.444617748260498,
"loss": 3.506,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -29.438785552978516,
"rewards/margins": 5.0073957443237305,
"rewards/rejected": -34.4461784362793,
"step": 316
},
{
"epoch": 0.7169918009612666,
"grad_norm": 174.9260498692663,
"learning_rate": 1.802453851473151e-07,
"logits/chosen": -1.102484107017517,
"logits/rejected": -1.1031461954116821,
"logps/chosen": -3.222090005874634,
"logps/rejected": -3.821988582611084,
"loss": 2.6827,
"rewards/accuracies": 0.78125,
"rewards/chosen": -32.22090148925781,
"rewards/margins": 5.9989824295043945,
"rewards/rejected": -38.219886779785156,
"step": 317
},
{
"epoch": 0.719253604749788,
"grad_norm": 160.37374943304434,
"learning_rate": 1.7760744160938093e-07,
"logits/chosen": -1.0474447011947632,
"logits/rejected": -1.031981348991394,
"logps/chosen": -3.0827102661132812,
"logps/rejected": -3.7794246673583984,
"loss": 2.3634,
"rewards/accuracies": 0.828125,
"rewards/chosen": -30.827098846435547,
"rewards/margins": 6.967146396636963,
"rewards/rejected": -37.794246673583984,
"step": 318
},
{
"epoch": 0.7215154085383093,
"grad_norm": 130.2366582324951,
"learning_rate": 1.7498342440255135e-07,
"logits/chosen": -1.110652208328247,
"logits/rejected": -1.092280626296997,
"logps/chosen": -2.9385170936584473,
"logps/rejected": -3.447098970413208,
"loss": 2.7639,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -29.385169982910156,
"rewards/margins": 5.085820198059082,
"rewards/rejected": -34.47098922729492,
"step": 319
},
{
"epoch": 0.7237772123268307,
"grad_norm": 137.53735259891135,
"learning_rate": 1.7237349784407115e-07,
"logits/chosen": -1.098408818244934,
"logits/rejected": -1.087859869003296,
"logps/chosen": -3.0967419147491455,
"logps/rejected": -3.6063919067382812,
"loss": 3.0231,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -30.967418670654297,
"rewards/margins": 5.096498489379883,
"rewards/rejected": -36.06391525268555,
"step": 320
},
{
"epoch": 0.726039016115352,
"grad_norm": 157.85568588838194,
"learning_rate": 1.6977782536882178e-07,
"logits/chosen": -1.055006742477417,
"logits/rejected": -1.057015299797058,
"logps/chosen": -2.8752079010009766,
"logps/rejected": -3.4953298568725586,
"loss": 2.5631,
"rewards/accuracies": 0.78125,
"rewards/chosen": -28.7520751953125,
"rewards/margins": 6.2012176513671875,
"rewards/rejected": -34.95329284667969,
"step": 321
},
{
"epoch": 0.7283008199038733,
"grad_norm": 167.22358095522708,
"learning_rate": 1.6719656951908708e-07,
"logits/chosen": -1.03927481174469,
"logits/rejected": -1.0443062782287598,
"logps/chosen": -2.6960246562957764,
"logps/rejected": -3.2267005443573,
"loss": 2.8958,
"rewards/accuracies": 0.78125,
"rewards/chosen": -26.960247039794922,
"rewards/margins": 5.306758880615234,
"rewards/rejected": -32.267005920410156,
"step": 322
},
{
"epoch": 0.7305626236923947,
"grad_norm": 141.43318236850038,
"learning_rate": 1.6462989193437453e-07,
"logits/chosen": -1.1290605068206787,
"logits/rejected": -1.1231218576431274,
"logps/chosen": -2.9903295040130615,
"logps/rejected": -3.4813930988311768,
"loss": 3.0688,
"rewards/accuracies": 0.765625,
"rewards/chosen": -29.903291702270508,
"rewards/margins": 4.9106364250183105,
"rewards/rejected": -34.813934326171875,
"step": 323
},
{
"epoch": 0.732824427480916,
"grad_norm": 153.5405476048427,
"learning_rate": 1.6207795334129365e-07,
"logits/chosen": -1.0967392921447754,
"logits/rejected": -1.0955842733383179,
"logps/chosen": -3.0994861125946045,
"logps/rejected": -3.6565637588500977,
"loss": 3.0762,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -30.99485969543457,
"rewards/margins": 5.570777893066406,
"rewards/rejected": -36.56563949584961,
"step": 324
},
{
"epoch": 0.7350862312694374,
"grad_norm": 172.69716391444982,
"learning_rate": 1.5954091354349121e-07,
"logits/chosen": -1.1257866621017456,
"logits/rejected": -1.1157145500183105,
"logps/chosen": -3.0154027938842773,
"logps/rejected": -3.544265031814575,
"loss": 2.8422,
"rewards/accuracies": 0.78125,
"rewards/chosen": -30.154027938842773,
"rewards/margins": 5.288622856140137,
"rewards/rejected": -35.442649841308594,
"step": 325
},
{
"epoch": 0.7373480350579588,
"grad_norm": 177.90906140786845,
"learning_rate": 1.5701893141164364e-07,
"logits/chosen": -1.1034339666366577,
"logits/rejected": -1.10421621799469,
"logps/chosen": -3.1780266761779785,
"logps/rejected": -3.7761831283569336,
"loss": 3.7696,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -31.78026580810547,
"rewards/margins": 5.981565475463867,
"rewards/rejected": -37.7618293762207,
"step": 326
},
{
"epoch": 0.73960983884648,
"grad_norm": 183.54016496777058,
"learning_rate": 1.545121648735093e-07,
"logits/chosen": -1.1025980710983276,
"logits/rejected": -1.0847091674804688,
"logps/chosen": -3.075312614440918,
"logps/rejected": -3.548532009124756,
"loss": 3.2151,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -30.75312614440918,
"rewards/margins": 4.732194423675537,
"rewards/rejected": -35.485321044921875,
"step": 327
},
{
"epoch": 0.7418716426350014,
"grad_norm": 135.25748433449712,
"learning_rate": 1.5202077090403863e-07,
"logits/chosen": -1.1285474300384521,
"logits/rejected": -1.1008970737457275,
"logps/chosen": -2.8346710205078125,
"logps/rejected": -3.3540940284729004,
"loss": 2.5201,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -28.34670639038086,
"rewards/margins": 5.1942338943481445,
"rewards/rejected": -33.54093933105469,
"step": 328
},
{
"epoch": 0.7441334464235227,
"grad_norm": 163.3852037175944,
"learning_rate": 1.495449055155443e-07,
"logits/chosen": -1.11967134475708,
"logits/rejected": -1.1241058111190796,
"logps/chosen": -3.2236409187316895,
"logps/rejected": -3.8717517852783203,
"loss": 2.5778,
"rewards/accuracies": 0.78125,
"rewards/chosen": -32.23640823364258,
"rewards/margins": 6.481108665466309,
"rewards/rejected": -38.7175178527832,
"step": 329
},
{
"epoch": 0.7463952502120441,
"grad_norm": 189.7340951008119,
"learning_rate": 1.4708472374793112e-07,
"logits/chosen": -1.0307663679122925,
"logits/rejected": -1.0201297998428345,
"logps/chosen": -3.2278249263763428,
"logps/rejected": -3.6534264087677,
"loss": 3.6326,
"rewards/accuracies": 0.6875,
"rewards/chosen": -32.27824783325195,
"rewards/margins": 4.256016254425049,
"rewards/rejected": -36.534263610839844,
"step": 330
},
{
"epoch": 0.7486570540005655,
"grad_norm": 162.72584520699186,
"learning_rate": 1.4464037965898878e-07,
"logits/chosen": -1.0099622011184692,
"logits/rejected": -0.9994704127311707,
"logps/chosen": -2.914703369140625,
"logps/rejected": -3.456731081008911,
"loss": 2.8021,
"rewards/accuracies": 0.78125,
"rewards/chosen": -29.147035598754883,
"rewards/margins": 5.4202775955200195,
"rewards/rejected": -34.56731033325195,
"step": 331
},
{
"epoch": 0.7509188577890868,
"grad_norm": 171.09208924948626,
"learning_rate": 1.4221202631474282e-07,
"logits/chosen": -1.036645770072937,
"logits/rejected": -1.047488808631897,
"logps/chosen": -3.04544734954834,
"logps/rejected": -3.528249502182007,
"loss": 3.1024,
"rewards/accuracies": 0.75,
"rewards/chosen": -30.4544677734375,
"rewards/margins": 4.828027725219727,
"rewards/rejected": -35.28249740600586,
"step": 332
},
{
"epoch": 0.7531806615776081,
"grad_norm": 170.6804908176681,
"learning_rate": 1.3979981577987113e-07,
"logits/chosen": -1.0796502828598022,
"logits/rejected": -1.0737544298171997,
"logps/chosen": -2.974057197570801,
"logps/rejected": -3.5673232078552246,
"loss": 2.4609,
"rewards/accuracies": 0.78125,
"rewards/chosen": -29.740570068359375,
"rewards/margins": 5.932661533355713,
"rewards/rejected": -35.6732292175293,
"step": 333
},
{
"epoch": 0.7554424653661295,
"grad_norm": 157.68396676428247,
"learning_rate": 1.374038991081807e-07,
"logits/chosen": -1.0844019651412964,
"logits/rejected": -1.0833029747009277,
"logps/chosen": -3.113481044769287,
"logps/rejected": -3.539191484451294,
"loss": 2.9737,
"rewards/accuracies": 0.75,
"rewards/chosen": -31.134809494018555,
"rewards/margins": 4.257106781005859,
"rewards/rejected": -35.39191436767578,
"step": 334
},
{
"epoch": 0.7577042691546508,
"grad_norm": 159.54123794673237,
"learning_rate": 1.3502442633314882e-07,
"logits/chosen": -1.08342707157135,
"logits/rejected": -1.0705313682556152,
"logps/chosen": -2.7244319915771484,
"logps/rejected": -3.2218587398529053,
"loss": 2.6449,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -27.244319915771484,
"rewards/margins": 4.97426700592041,
"rewards/rejected": -32.218589782714844,
"step": 335
},
{
"epoch": 0.7599660729431722,
"grad_norm": 139.57722381793081,
"learning_rate": 1.3266154645852815e-07,
"logits/chosen": -1.058245062828064,
"logits/rejected": -1.0414983034133911,
"logps/chosen": -3.0073535442352295,
"logps/rejected": -3.5276637077331543,
"loss": 2.7813,
"rewards/accuracies": 0.78125,
"rewards/chosen": -30.073535919189453,
"rewards/margins": 5.20310115814209,
"rewards/rejected": -35.27663803100586,
"step": 336
},
{
"epoch": 0.7622278767316936,
"grad_norm": 174.03982323832386,
"learning_rate": 1.303154074490152e-07,
"logits/chosen": -1.1049643754959106,
"logits/rejected": -1.083824872970581,
"logps/chosen": -3.008976697921753,
"logps/rejected": -3.5545217990875244,
"loss": 3.0701,
"rewards/accuracies": 0.75,
"rewards/chosen": -30.089771270751953,
"rewards/margins": 5.455449104309082,
"rewards/rejected": -35.54521942138672,
"step": 337
},
{
"epoch": 0.7644896805202148,
"grad_norm": 160.9242319744395,
"learning_rate": 1.2798615622098616e-07,
"logits/chosen": -1.118033766746521,
"logits/rejected": -1.104003667831421,
"logps/chosen": -2.9770729541778564,
"logps/rejected": -3.559138536453247,
"loss": 2.7528,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -29.770729064941406,
"rewards/margins": 5.820652961730957,
"rewards/rejected": -35.59138107299805,
"step": 338
},
{
"epoch": 0.7667514843087362,
"grad_norm": 121.06171902418025,
"learning_rate": 1.2567393863329523e-07,
"logits/chosen": -1.0702447891235352,
"logits/rejected": -1.092645287513733,
"logps/chosen": -2.9810526371002197,
"logps/rejected": -3.5818190574645996,
"loss": 2.5609,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -29.810523986816406,
"rewards/margins": 6.00766658782959,
"rewards/rejected": -35.81819152832031,
"step": 339
},
{
"epoch": 0.7690132880972576,
"grad_norm": 170.89823033285558,
"learning_rate": 1.233788994781423e-07,
"logits/chosen": -1.1062794923782349,
"logits/rejected": -1.1038752794265747,
"logps/chosen": -3.008683443069458,
"logps/rejected": -3.5800118446350098,
"loss": 2.6117,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -30.086833953857422,
"rewards/margins": 5.713282585144043,
"rewards/rejected": -35.800113677978516,
"step": 340
},
{
"epoch": 0.7712750918857789,
"grad_norm": 131.00188496624259,
"learning_rate": 1.2110118247200468e-07,
"logits/chosen": -1.093713641166687,
"logits/rejected": -1.0842986106872559,
"logps/chosen": -2.8371694087982178,
"logps/rejected": -3.4009671211242676,
"loss": 2.5132,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -28.371692657470703,
"rewards/margins": 5.637977123260498,
"rewards/rejected": -34.00967025756836,
"step": 341
},
{
"epoch": 0.7735368956743003,
"grad_norm": 193.45548191250376,
"learning_rate": 1.1884093024663933e-07,
"logits/chosen": -1.0978885889053345,
"logits/rejected": -1.096379041671753,
"logps/chosen": -2.7560176849365234,
"logps/rejected": -3.429332971572876,
"loss": 3.0091,
"rewards/accuracies": 0.71875,
"rewards/chosen": -27.560178756713867,
"rewards/margins": 6.733152389526367,
"rewards/rejected": -34.293331146240234,
"step": 342
},
{
"epoch": 0.7757986994628217,
"grad_norm": 179.19045707900517,
"learning_rate": 1.1659828434014886e-07,
"logits/chosen": -1.0911431312561035,
"logits/rejected": -1.0719244480133057,
"logps/chosen": -2.8892617225646973,
"logps/rejected": -3.54764461517334,
"loss": 2.8438,
"rewards/accuracies": 0.75,
"rewards/chosen": -28.892620086669922,
"rewards/margins": 6.583826065063477,
"rewards/rejected": -35.47644805908203,
"step": 343
},
{
"epoch": 0.7780605032513429,
"grad_norm": 161.1666542002096,
"learning_rate": 1.143733851881203e-07,
"logits/chosen": -1.1230119466781616,
"logits/rejected": -1.1065000295639038,
"logps/chosen": -3.1044983863830566,
"logps/rejected": -3.6987762451171875,
"loss": 3.0252,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -31.044984817504883,
"rewards/margins": 5.942776203155518,
"rewards/rejected": -36.987762451171875,
"step": 344
},
{
"epoch": 0.7803223070398643,
"grad_norm": 162.875371762857,
"learning_rate": 1.1216637211483005e-07,
"logits/chosen": -1.0864002704620361,
"logits/rejected": -1.076468586921692,
"logps/chosen": -2.995060920715332,
"logps/rejected": -3.4805409908294678,
"loss": 3.1132,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -29.95060920715332,
"rewards/margins": 4.85480260848999,
"rewards/rejected": -34.80541229248047,
"step": 345
},
{
"epoch": 0.7825841108283856,
"grad_norm": 211.00125843356489,
"learning_rate": 1.0997738332451936e-07,
"logits/chosen": -1.0667786598205566,
"logits/rejected": -1.0570969581604004,
"logps/chosen": -3.2074437141418457,
"logps/rejected": -3.686741352081299,
"loss": 2.8699,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -32.074440002441406,
"rewards/margins": 4.792973041534424,
"rewards/rejected": -36.86741256713867,
"step": 346
},
{
"epoch": 0.784845914616907,
"grad_norm": 157.4005395746414,
"learning_rate": 1.0780655589274031e-07,
"logits/chosen": -1.1231722831726074,
"logits/rejected": -1.101925253868103,
"logps/chosen": -3.014099359512329,
"logps/rejected": -3.558246374130249,
"loss": 2.693,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -30.1409912109375,
"rewards/margins": 5.441472053527832,
"rewards/rejected": -35.582462310791016,
"step": 347
},
{
"epoch": 0.7871077184054284,
"grad_norm": 165.93325016524315,
"learning_rate": 1.056540257577712e-07,
"logits/chosen": -1.0740177631378174,
"logits/rejected": -1.0665497779846191,
"logps/chosen": -3.4591240882873535,
"logps/rejected": -4.057380676269531,
"loss": 2.6869,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -34.59123992919922,
"rewards/margins": 5.982567310333252,
"rewards/rejected": -40.57380676269531,
"step": 348
},
{
"epoch": 0.7893695221939496,
"grad_norm": 147.55749046925172,
"learning_rate": 1.0351992771210554e-07,
"logits/chosen": -1.0476750135421753,
"logits/rejected": -1.0461106300354004,
"logps/chosen": -3.0786919593811035,
"logps/rejected": -3.583596706390381,
"loss": 2.8979,
"rewards/accuracies": 0.78125,
"rewards/chosen": -30.786916732788086,
"rewards/margins": 5.049046993255615,
"rewards/rejected": -35.83596420288086,
"step": 349
},
{
"epoch": 0.791631325982471,
"grad_norm": 193.43103769575845,
"learning_rate": 1.0140439539400953e-07,
"logits/chosen": -1.0643444061279297,
"logits/rejected": -1.0696581602096558,
"logps/chosen": -3.1182804107666016,
"logps/rejected": -3.620661735534668,
"loss": 3.5198,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -31.182804107666016,
"rewards/margins": 5.023812770843506,
"rewards/rejected": -36.20661926269531,
"step": 350
},
{
"epoch": 0.7938931297709924,
"grad_norm": 141.88482331317974,
"learning_rate": 9.930756127915488e-08,
"logits/chosen": -1.0582480430603027,
"logits/rejected": -1.0691368579864502,
"logps/chosen": -2.9359114170074463,
"logps/rejected": -3.5403919219970703,
"loss": 2.7303,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -29.359111785888672,
"rewards/margins": 6.044802665710449,
"rewards/rejected": -35.4039192199707,
"step": 351
},
{
"epoch": 0.7961549335595137,
"grad_norm": 176.14309347354188,
"learning_rate": 9.722955667232242e-08,
"logits/chosen": -1.103529453277588,
"logits/rejected": -1.1015864610671997,
"logps/chosen": -3.259512424468994,
"logps/rejected": -3.5993950366973877,
"loss": 3.9747,
"rewards/accuracies": 0.703125,
"rewards/chosen": -32.59511947631836,
"rewards/margins": 3.398827075958252,
"rewards/rejected": -35.99394989013672,
"step": 352
},
{
"epoch": 0.7984167373480351,
"grad_norm": 161.52973237898686,
"learning_rate": 9.517051169918016e-08,
"logits/chosen": -1.1242177486419678,
"logits/rejected": -1.131679892539978,
"logps/chosen": -3.072510004043579,
"logps/rejected": -3.5460586547851562,
"loss": 3.0567,
"rewards/accuracies": 0.765625,
"rewards/chosen": -30.725101470947266,
"rewards/margins": 4.735486030578613,
"rewards/rejected": -35.46058654785156,
"step": 353
},
{
"epoch": 0.8006785411365565,
"grad_norm": 173.94582839965446,
"learning_rate": 9.313055529813412e-08,
"logits/chosen": -1.0249019861221313,
"logits/rejected": -1.0524556636810303,
"logps/chosen": -2.8515758514404297,
"logps/rejected": -3.4208881855010986,
"loss": 2.6105,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -28.51576042175293,
"rewards/margins": 5.693122863769531,
"rewards/rejected": -34.20888137817383,
"step": 354
},
{
"epoch": 0.8029403449250777,
"grad_norm": 186.23957915125598,
"learning_rate": 9.110981521225532e-08,
"logits/chosen": -1.100682258605957,
"logits/rejected": -1.080102801322937,
"logps/chosen": -3.1016242504119873,
"logps/rejected": -3.5042476654052734,
"loss": 3.6117,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -31.01624298095703,
"rewards/margins": 4.0262370109558105,
"rewards/rejected": -35.04248046875,
"step": 355
},
{
"epoch": 0.8052021487135991,
"grad_norm": 179.43856596279534,
"learning_rate": 8.910841798127884e-08,
"logits/chosen": -1.0597988367080688,
"logits/rejected": -1.0726373195648193,
"logps/chosen": -3.047053813934326,
"logps/rejected": -3.5895328521728516,
"loss": 2.874,
"rewards/accuracies": 0.796875,
"rewards/chosen": -30.470539093017578,
"rewards/margins": 5.424790382385254,
"rewards/rejected": -35.89532470703125,
"step": 356
},
{
"epoch": 0.8074639525021204,
"grad_norm": 161.8027690554887,
"learning_rate": 8.712648893368139e-08,
"logits/chosen": -1.0760971307754517,
"logits/rejected": -1.0981957912445068,
"logps/chosen": -3.037094831466675,
"logps/rejected": -3.6570444107055664,
"loss": 2.6992,
"rewards/accuracies": 0.765625,
"rewards/chosen": -30.370943069458008,
"rewards/margins": 6.199495315551758,
"rewards/rejected": -36.57044219970703,
"step": 357
},
{
"epoch": 0.8097257562906418,
"grad_norm": 136.66722952170457,
"learning_rate": 8.516415217883186e-08,
"logits/chosen": -1.0790215730667114,
"logits/rejected": -1.0539416074752808,
"logps/chosen": -2.8489599227905273,
"logps/rejected": -3.426215648651123,
"loss": 3.0279,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -28.489595413208008,
"rewards/margins": 5.772557735443115,
"rewards/rejected": -34.26215362548828,
"step": 358
},
{
"epoch": 0.8119875600791632,
"grad_norm": 183.0413049106842,
"learning_rate": 8.32215305992209e-08,
"logits/chosen": -1.1424682140350342,
"logits/rejected": -1.1471257209777832,
"logps/chosen": -2.8472583293914795,
"logps/rejected": -3.4035425186157227,
"loss": 2.7101,
"rewards/accuracies": 0.78125,
"rewards/chosen": -28.472583770751953,
"rewards/margins": 5.562839984893799,
"rewards/rejected": -34.035423278808594,
"step": 359
},
{
"epoch": 0.8142493638676844,
"grad_norm": 130.5926688418937,
"learning_rate": 8.129874584276448e-08,
"logits/chosen": -1.0832806825637817,
"logits/rejected": -1.0744616985321045,
"logps/chosen": -2.8213050365448,
"logps/rejected": -3.391476631164551,
"loss": 2.6731,
"rewards/accuracies": 0.765625,
"rewards/chosen": -28.213050842285156,
"rewards/margins": 5.701716423034668,
"rewards/rejected": -33.91476821899414,
"step": 360
},
{
"epoch": 0.8165111676562058,
"grad_norm": 170.2145280077911,
"learning_rate": 7.939591831518746e-08,
"logits/chosen": -1.093398928642273,
"logits/rejected": -1.094681739807129,
"logps/chosen": -2.789719581604004,
"logps/rejected": -3.1508536338806152,
"loss": 3.2504,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -27.897193908691406,
"rewards/margins": 3.611339807510376,
"rewards/rejected": -31.508529663085938,
"step": 361
},
{
"epoch": 0.8187729714447272,
"grad_norm": 148.16847439682329,
"learning_rate": 7.751316717248304e-08,
"logits/chosen": -1.0858169794082642,
"logits/rejected": -1.0891151428222656,
"logps/chosen": -3.0676655769348145,
"logps/rejected": -3.830179214477539,
"loss": 2.2728,
"rewards/accuracies": 0.828125,
"rewards/chosen": -30.676654815673828,
"rewards/margins": 7.625136852264404,
"rewards/rejected": -38.30179214477539,
"step": 362
},
{
"epoch": 0.8210347752332485,
"grad_norm": 161.20986921430406,
"learning_rate": 7.565061031345142e-08,
"logits/chosen": -1.0509235858917236,
"logits/rejected": -1.0490310192108154,
"logps/chosen": -3.1492578983306885,
"logps/rejected": -3.728961944580078,
"loss": 2.884,
"rewards/accuracies": 0.78125,
"rewards/chosen": -31.492576599121094,
"rewards/margins": 5.79704475402832,
"rewards/rejected": -37.28961944580078,
"step": 363
},
{
"epoch": 0.8232965790217699,
"grad_norm": 150.85257421881246,
"learning_rate": 7.380836437231686e-08,
"logits/chosen": -1.0766702890396118,
"logits/rejected": -1.0677735805511475,
"logps/chosen": -2.8776063919067383,
"logps/rejected": -3.4943623542785645,
"loss": 2.4755,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -28.77606201171875,
"rewards/margins": 6.167560577392578,
"rewards/rejected": -34.94362258911133,
"step": 364
},
{
"epoch": 0.8255583828102913,
"grad_norm": 149.72523598721824,
"learning_rate": 7.198654471142371e-08,
"logits/chosen": -1.0955713987350464,
"logits/rejected": -1.0762622356414795,
"logps/chosen": -2.9110095500946045,
"logps/rejected": -3.5544984340667725,
"loss": 2.7029,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -29.11009407043457,
"rewards/margins": 6.43488883972168,
"rewards/rejected": -35.54498291015625,
"step": 365
},
{
"epoch": 0.8278201865988125,
"grad_norm": 245.9741650955417,
"learning_rate": 7.01852654140132e-08,
"logits/chosen": -1.1138908863067627,
"logits/rejected": -1.1113349199295044,
"logps/chosen": -3.091291666030884,
"logps/rejected": -3.6368563175201416,
"loss": 3.0739,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -30.91291618347168,
"rewards/margins": 5.45564603805542,
"rewards/rejected": -36.36856460571289,
"step": 366
},
{
"epoch": 0.8300819903873339,
"grad_norm": 155.74801290743875,
"learning_rate": 6.840463927707833e-08,
"logits/chosen": -1.0664961338043213,
"logits/rejected": -1.080258846282959,
"logps/chosen": -3.084540367126465,
"logps/rejected": -3.6539242267608643,
"loss": 2.5649,
"rewards/accuracies": 0.828125,
"rewards/chosen": -30.84540367126465,
"rewards/margins": 5.6938371658325195,
"rewards/rejected": -36.539241790771484,
"step": 367
},
{
"epoch": 0.8323437941758552,
"grad_norm": 151.84768757368855,
"learning_rate": 6.664477780430138e-08,
"logits/chosen": -1.0625722408294678,
"logits/rejected": -1.0663492679595947,
"logps/chosen": -3.0569539070129395,
"logps/rejected": -3.5362274646759033,
"loss": 3.1163,
"rewards/accuracies": 0.765625,
"rewards/chosen": -30.56954002380371,
"rewards/margins": 4.792736530303955,
"rewards/rejected": -35.36227798461914,
"step": 368
},
{
"epoch": 0.8346055979643766,
"grad_norm": 171.73774525120407,
"learning_rate": 6.49057911990711e-08,
"logits/chosen": -1.0828893184661865,
"logits/rejected": -1.0686218738555908,
"logps/chosen": -3.017665386199951,
"logps/rejected": -3.5277650356292725,
"loss": 3.1351,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -30.176654815673828,
"rewards/margins": 5.1009955406188965,
"rewards/rejected": -35.27764892578125,
"step": 369
},
{
"epoch": 0.836867401752898,
"grad_norm": 145.4377135801225,
"learning_rate": 6.318778835758189e-08,
"logits/chosen": -1.0982502698898315,
"logits/rejected": -1.0948173999786377,
"logps/chosen": -3.06710147857666,
"logps/rejected": -3.6917946338653564,
"loss": 2.3441,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -30.67101287841797,
"rewards/margins": 6.2469329833984375,
"rewards/rejected": -36.917945861816406,
"step": 370
},
{
"epoch": 0.8391292055414192,
"grad_norm": 197.55910952033693,
"learning_rate": 6.149087686201433e-08,
"logits/chosen": -1.1107332706451416,
"logits/rejected": -1.1229714155197144,
"logps/chosen": -2.962777853012085,
"logps/rejected": -3.3359103202819824,
"loss": 3.9932,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -29.62778091430664,
"rewards/margins": 3.731322765350342,
"rewards/rejected": -33.359107971191406,
"step": 371
},
{
"epoch": 0.8413910093299406,
"grad_norm": 155.21912723349024,
"learning_rate": 5.98151629737988e-08,
"logits/chosen": -1.1003910303115845,
"logits/rejected": -1.0755306482315063,
"logps/chosen": -3.033722400665283,
"logps/rejected": -3.64453387260437,
"loss": 2.9219,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -30.33721923828125,
"rewards/margins": 6.10811710357666,
"rewards/rejected": -36.445335388183594,
"step": 372
},
{
"epoch": 0.843652813118462,
"grad_norm": 141.87616811965896,
"learning_rate": 5.816075162696097e-08,
"logits/chosen": -1.1110835075378418,
"logits/rejected": -1.1089603900909424,
"logps/chosen": -2.8409385681152344,
"logps/rejected": -3.3667705059051514,
"loss": 2.3721,
"rewards/accuracies": 0.84375,
"rewards/chosen": -28.40938377380371,
"rewards/margins": 5.2583208084106445,
"rewards/rejected": -33.66770553588867,
"step": 373
},
{
"epoch": 0.8459146169069833,
"grad_norm": 127.68193628951124,
"learning_rate": 5.6527746421551046e-08,
"logits/chosen": -1.0517830848693848,
"logits/rejected": -1.0378973484039307,
"logps/chosen": -2.8996520042419434,
"logps/rejected": -3.4401497840881348,
"loss": 2.9538,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -28.99652099609375,
"rewards/margins": 5.4049787521362305,
"rewards/rejected": -34.40149688720703,
"step": 374
},
{
"epoch": 0.8481764206955047,
"grad_norm": 147.04552648138863,
"learning_rate": 5.4916249617156064e-08,
"logits/chosen": -1.077235460281372,
"logits/rejected": -1.0792851448059082,
"logps/chosen": -2.7157163619995117,
"logps/rejected": -3.2315173149108887,
"loss": 2.8498,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -27.15716552734375,
"rewards/margins": 5.158007621765137,
"rewards/rejected": -32.31517028808594,
"step": 375
},
{
"epoch": 0.8504382244840261,
"grad_norm": 136.93878767480692,
"learning_rate": 5.332636212649646e-08,
"logits/chosen": -1.07993745803833,
"logits/rejected": -1.0671964883804321,
"logps/chosen": -2.8164620399475098,
"logps/rejected": -3.303701877593994,
"loss": 2.8001,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -28.16461944580078,
"rewards/margins": 4.872396469116211,
"rewards/rejected": -33.037017822265625,
"step": 376
},
{
"epoch": 0.8527000282725473,
"grad_norm": 138.27817441233987,
"learning_rate": 5.17581835091069e-08,
"logits/chosen": -1.0522788763046265,
"logits/rejected": -1.0637288093566895,
"logps/chosen": -2.8656744956970215,
"logps/rejected": -3.3920047283172607,
"loss": 3.0323,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -28.65674591064453,
"rewards/margins": 5.263302326202393,
"rewards/rejected": -33.9200439453125,
"step": 377
},
{
"epoch": 0.8549618320610687,
"grad_norm": 136.54322095828988,
"learning_rate": 5.02118119651016e-08,
"logits/chosen": -1.1301465034484863,
"logits/rejected": -1.130077600479126,
"logps/chosen": -2.8762614727020264,
"logps/rejected": -3.4399282932281494,
"loss": 2.6643,
"rewards/accuracies": 0.75,
"rewards/chosen": -28.762617111206055,
"rewards/margins": 5.636669158935547,
"rewards/rejected": -34.39928436279297,
"step": 378
},
{
"epoch": 0.85722363584959,
"grad_norm": 159.79319397102407,
"learning_rate": 4.868734432902526e-08,
"logits/chosen": -1.1377651691436768,
"logits/rejected": -1.1213501691818237,
"logps/chosen": -2.9207286834716797,
"logps/rejected": -3.5463762283325195,
"loss": 3.1626,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -29.207284927368164,
"rewards/margins": 6.256474494934082,
"rewards/rejected": -35.46376037597656,
"step": 379
},
{
"epoch": 0.8594854396381114,
"grad_norm": 206.34365826921854,
"learning_rate": 4.7184876063789134e-08,
"logits/chosen": -1.1172423362731934,
"logits/rejected": -1.118281602859497,
"logps/chosen": -2.6622347831726074,
"logps/rejected": -3.1349706649780273,
"loss": 2.9815,
"rewards/accuracies": 0.8125,
"rewards/chosen": -26.62234878540039,
"rewards/margins": 4.727357864379883,
"rewards/rejected": -31.349702835083008,
"step": 380
},
{
"epoch": 0.8617472434266328,
"grad_norm": 202.3644549275693,
"learning_rate": 4.570450125469314e-08,
"logits/chosen": -1.0802032947540283,
"logits/rejected": -1.0645304918289185,
"logps/chosen": -2.9399185180664062,
"logps/rejected": -3.5755248069763184,
"loss": 2.5091,
"rewards/accuracies": 0.78125,
"rewards/chosen": -29.399185180664062,
"rewards/margins": 6.356063365936279,
"rewards/rejected": -35.7552490234375,
"step": 381
},
{
"epoch": 0.864009047215154,
"grad_norm": 142.61443241131124,
"learning_rate": 4.424631260353378e-08,
"logits/chosen": -1.1050821542739868,
"logits/rejected": -1.0963554382324219,
"logps/chosen": -2.7791900634765625,
"logps/rejected": -3.280860424041748,
"loss": 3.077,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -27.79190444946289,
"rewards/margins": 5.016700744628906,
"rewards/rejected": -32.80860137939453,
"step": 382
},
{
"epoch": 0.8662708510036754,
"grad_norm": 117.70172449775342,
"learning_rate": 4.281040142280008e-08,
"logits/chosen": -1.1450071334838867,
"logits/rejected": -1.1387488842010498,
"logps/chosen": -2.701590061187744,
"logps/rejected": -3.290010929107666,
"loss": 2.0548,
"rewards/accuracies": 0.828125,
"rewards/chosen": -27.015897750854492,
"rewards/margins": 5.884207725524902,
"rewards/rejected": -32.90010452270508,
"step": 383
},
{
"epoch": 0.8685326547921968,
"grad_norm": 204.206206294938,
"learning_rate": 4.1396857629954286e-08,
"logits/chosen": -1.1101422309875488,
"logits/rejected": -1.0988112688064575,
"logps/chosen": -3.2335309982299805,
"logps/rejected": -3.733260154724121,
"loss": 3.0684,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -32.33530807495117,
"rewards/margins": 4.997293472290039,
"rewards/rejected": -37.332603454589844,
"step": 384
},
{
"epoch": 0.8707944585807181,
"grad_norm": 153.54162168949213,
"learning_rate": 4.000576974180232e-08,
"logits/chosen": -1.0964728593826294,
"logits/rejected": -1.1033198833465576,
"logps/chosen": -2.8175177574157715,
"logps/rejected": -3.2958943843841553,
"loss": 3.1353,
"rewards/accuracies": 0.765625,
"rewards/chosen": -28.17517852783203,
"rewards/margins": 4.7837677001953125,
"rewards/rejected": -32.958946228027344,
"step": 385
},
{
"epoch": 0.8730562623692395,
"grad_norm": 172.21965212484756,
"learning_rate": 3.8637224868950066e-08,
"logits/chosen": -1.0778682231903076,
"logits/rejected": -1.0855255126953125,
"logps/chosen": -2.8457837104797363,
"logps/rejected": -3.306011438369751,
"loss": 3.127,
"rewards/accuracies": 0.75,
"rewards/chosen": -28.457834243774414,
"rewards/margins": 4.60227632522583,
"rewards/rejected": -33.060115814208984,
"step": 386
},
{
"epoch": 0.8753180661577609,
"grad_norm": 155.4189818710374,
"learning_rate": 3.729130871034885e-08,
"logits/chosen": -1.1040070056915283,
"logits/rejected": -1.0997991561889648,
"logps/chosen": -2.8627209663391113,
"logps/rejected": -3.4421796798706055,
"loss": 2.5368,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -28.62721061706543,
"rewards/margins": 5.794586181640625,
"rewards/rejected": -34.42179489135742,
"step": 387
},
{
"epoch": 0.8775798699462821,
"grad_norm": 154.8763399842207,
"learning_rate": 3.596810554792888e-08,
"logits/chosen": -1.1059839725494385,
"logits/rejected": -1.1149722337722778,
"logps/chosen": -2.8257861137390137,
"logps/rejected": -3.35601544380188,
"loss": 2.9665,
"rewards/accuracies": 0.734375,
"rewards/chosen": -28.25786018371582,
"rewards/margins": 5.3022918701171875,
"rewards/rejected": -33.560150146484375,
"step": 388
},
{
"epoch": 0.8798416737348035,
"grad_norm": 147.65054864467723,
"learning_rate": 3.466769824132116e-08,
"logits/chosen": -1.0984127521514893,
"logits/rejected": -1.0669302940368652,
"logps/chosen": -2.8397090435028076,
"logps/rejected": -3.374936819076538,
"loss": 2.8977,
"rewards/accuracies": 0.796875,
"rewards/chosen": -28.397090911865234,
"rewards/margins": 5.352276802062988,
"rewards/rejected": -33.749366760253906,
"step": 389
},
{
"epoch": 0.8821034775233249,
"grad_norm": 157.74667917718264,
"learning_rate": 3.339016822266925e-08,
"logits/chosen": -1.0509486198425293,
"logits/rejected": -1.062030553817749,
"logps/chosen": -2.9279890060424805,
"logps/rejected": -3.5987658500671387,
"loss": 2.1703,
"rewards/accuracies": 0.8359375,
"rewards/chosen": -29.279890060424805,
"rewards/margins": 6.707767963409424,
"rewards/rejected": -35.9876594543457,
"step": 390
},
{
"epoch": 0.8843652813118462,
"grad_norm": 146.8349228995068,
"learning_rate": 3.213559549152958e-08,
"logits/chosen": -1.1121222972869873,
"logits/rejected": -1.093271255493164,
"logps/chosen": -2.754055976867676,
"logps/rejected": -3.3426716327667236,
"loss": 2.8307,
"rewards/accuracies": 0.796875,
"rewards/chosen": -27.54056167602539,
"rewards/margins": 5.886153697967529,
"rewards/rejected": -33.42671203613281,
"step": 391
},
{
"epoch": 0.8866270851003676,
"grad_norm": 137.10119291122675,
"learning_rate": 3.090405860986203e-08,
"logits/chosen": -1.1325721740722656,
"logits/rejected": -1.15884268283844,
"logps/chosen": -2.919574022293091,
"logps/rejected": -3.656275987625122,
"loss": 2.3372,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -29.19573974609375,
"rewards/margins": 7.367020130157471,
"rewards/rejected": -36.56275939941406,
"step": 392
},
{
"epoch": 0.8888888888888888,
"grad_norm": 157.3640886436071,
"learning_rate": 2.9695634697110315e-08,
"logits/chosen": -1.056593418121338,
"logits/rejected": -1.0579760074615479,
"logps/chosen": -2.707357883453369,
"logps/rejected": -3.351743459701538,
"loss": 2.8028,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -27.073579788208008,
"rewards/margins": 6.443853378295898,
"rewards/rejected": -33.517433166503906,
"step": 393
},
{
"epoch": 0.8911506926774102,
"grad_norm": 167.08718168939666,
"learning_rate": 2.8510399425372766e-08,
"logits/chosen": -1.0880804061889648,
"logits/rejected": -1.067001223564148,
"logps/chosen": -2.823213815689087,
"logps/rejected": -3.351870536804199,
"loss": 2.7972,
"rewards/accuracies": 0.75,
"rewards/chosen": -28.23213768005371,
"rewards/margins": 5.286569595336914,
"rewards/rejected": -33.518707275390625,
"step": 394
},
{
"epoch": 0.8934124964659316,
"grad_norm": 170.71135881911255,
"learning_rate": 2.734842701466329e-08,
"logits/chosen": -1.1137869358062744,
"logits/rejected": -1.0902773141860962,
"logps/chosen": -3.2399227619171143,
"logps/rejected": -3.800828456878662,
"loss": 2.6217,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -32.39922332763672,
"rewards/margins": 5.609059810638428,
"rewards/rejected": -38.00828552246094,
"step": 395
},
{
"epoch": 0.8956743002544529,
"grad_norm": 155.60384357389907,
"learning_rate": 2.6209790228264438e-08,
"logits/chosen": -1.1179192066192627,
"logits/rejected": -1.1079840660095215,
"logps/chosen": -2.8274612426757812,
"logps/rejected": -3.3131465911865234,
"loss": 2.8649,
"rewards/accuracies": 0.765625,
"rewards/chosen": -28.274608612060547,
"rewards/margins": 4.856854438781738,
"rewards/rejected": -33.131465911865234,
"step": 396
},
{
"epoch": 0.8979361040429743,
"grad_norm": 158.73384494070228,
"learning_rate": 2.5094560368170305e-08,
"logits/chosen": -1.0657453536987305,
"logits/rejected": -1.0789343118667603,
"logps/chosen": -2.905332088470459,
"logps/rejected": -3.3988749980926514,
"loss": 2.7725,
"rewards/accuracies": 0.796875,
"rewards/chosen": -29.053321838378906,
"rewards/margins": 4.935429096221924,
"rewards/rejected": -33.98875427246094,
"step": 397
},
{
"epoch": 0.9001979078314957,
"grad_norm": 132.85292741199737,
"learning_rate": 2.4002807270621893e-08,
"logits/chosen": -1.1158607006072998,
"logits/rejected": -1.1061642169952393,
"logps/chosen": -2.8107986450195312,
"logps/rejected": -3.386343240737915,
"loss": 2.4806,
"rewards/accuracies": 0.796875,
"rewards/chosen": -28.107988357543945,
"rewards/margins": 5.7554426193237305,
"rewards/rejected": -33.863433837890625,
"step": 398
},
{
"epoch": 0.9024597116200169,
"grad_norm": 163.9414902624495,
"learning_rate": 2.293459930173354e-08,
"logits/chosen": -1.1135897636413574,
"logits/rejected": -1.1188251972198486,
"logps/chosen": -2.918743848800659,
"logps/rejected": -3.4281227588653564,
"loss": 2.8561,
"rewards/accuracies": 0.78125,
"rewards/chosen": -29.18743896484375,
"rewards/margins": 5.093789100646973,
"rewards/rejected": -34.281227111816406,
"step": 399
},
{
"epoch": 0.9047215154085383,
"grad_norm": 159.34621159580047,
"learning_rate": 2.189000335321256e-08,
"logits/chosen": -1.0814203023910522,
"logits/rejected": -1.0605090856552124,
"logps/chosen": -2.842669725418091,
"logps/rejected": -3.3467047214508057,
"loss": 3.2137,
"rewards/accuracies": 0.75,
"rewards/chosen": -28.42669677734375,
"rewards/margins": 5.040349006652832,
"rewards/rejected": -33.46704864501953,
"step": 400
},
{
"epoch": 0.9069833191970597,
"grad_norm": 176.47557033405067,
"learning_rate": 2.086908483816954e-08,
"logits/chosen": -1.1017359495162964,
"logits/rejected": -1.0888714790344238,
"logps/chosen": -3.1457722187042236,
"logps/rejected": -3.6218981742858887,
"loss": 3.2885,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -31.457719802856445,
"rewards/margins": 4.76125955581665,
"rewards/rejected": -36.2189826965332,
"step": 401
},
{
"epoch": 0.909245122985581,
"grad_norm": 171.77739705704673,
"learning_rate": 1.9871907687022717e-08,
"logits/chosen": -1.103614091873169,
"logits/rejected": -1.0980620384216309,
"logps/chosen": -2.7797932624816895,
"logps/rejected": -3.2574329376220703,
"loss": 2.9828,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -27.79793357849121,
"rewards/margins": 4.776399612426758,
"rewards/rejected": -32.5743293762207,
"step": 402
},
{
"epoch": 0.9115069267741024,
"grad_norm": 129.71420303866944,
"learning_rate": 1.889853434349451e-08,
"logits/chosen": -1.0566623210906982,
"logits/rejected": -1.0624772310256958,
"logps/chosen": -2.755463123321533,
"logps/rejected": -3.331778049468994,
"loss": 3.0728,
"rewards/accuracies": 0.734375,
"rewards/chosen": -27.55463409423828,
"rewards/margins": 5.76314640045166,
"rewards/rejected": -33.317779541015625,
"step": 403
},
{
"epoch": 0.9137687305626236,
"grad_norm": 146.67389640495995,
"learning_rate": 1.7949025760701164e-08,
"logits/chosen": -1.0643901824951172,
"logits/rejected": -1.0577207803726196,
"logps/chosen": -2.9979794025421143,
"logps/rejected": -3.5035500526428223,
"loss": 2.8652,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -29.979793548583984,
"rewards/margins": 5.0557050704956055,
"rewards/rejected": -35.035499572753906,
"step": 404
},
{
"epoch": 0.916030534351145,
"grad_norm": 145.46946091603365,
"learning_rate": 1.7023441397336023e-08,
"logits/chosen": -1.1220301389694214,
"logits/rejected": -1.1037752628326416,
"logps/chosen": -2.870546340942383,
"logps/rejected": -3.4554359912872314,
"loss": 2.4947,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -28.70546531677246,
"rewards/margins": 5.848890781402588,
"rewards/rejected": -34.55435562133789,
"step": 405
},
{
"epoch": 0.9182923381396664,
"grad_norm": 147.2092751589586,
"learning_rate": 1.6121839213945854e-08,
"logits/chosen": -1.0786685943603516,
"logits/rejected": -1.0783849954605103,
"logps/chosen": -2.9078550338745117,
"logps/rejected": -3.536391258239746,
"loss": 3.0017,
"rewards/accuracies": 0.78125,
"rewards/chosen": -29.078548431396484,
"rewards/margins": 6.28536319732666,
"rewards/rejected": -35.36391067504883,
"step": 406
},
{
"epoch": 0.9205541419281877,
"grad_norm": 186.25880993372996,
"learning_rate": 1.5244275669301777e-08,
"logits/chosen": -1.113109827041626,
"logits/rejected": -1.0987439155578613,
"logps/chosen": -2.944148302078247,
"logps/rejected": -3.5245351791381836,
"loss": 2.8001,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -29.44148063659668,
"rewards/margins": 5.8038716316223145,
"rewards/rejected": -35.24535369873047,
"step": 407
},
{
"epoch": 0.9228159457167091,
"grad_norm": 178.1440969919412,
"learning_rate": 1.4390805716863398e-08,
"logits/chosen": -1.0935332775115967,
"logits/rejected": -1.094548225402832,
"logps/chosen": -2.868260383605957,
"logps/rejected": -3.3665590286254883,
"loss": 3.0518,
"rewards/accuracies": 0.75,
"rewards/chosen": -28.682598114013672,
"rewards/margins": 4.982987403869629,
"rewards/rejected": -33.665584564208984,
"step": 408
},
{
"epoch": 0.9250777495052305,
"grad_norm": 130.74419848686492,
"learning_rate": 1.3561482801337908e-08,
"logits/chosen": -1.0224024057388306,
"logits/rejected": -1.038309097290039,
"logps/chosen": -2.6347591876983643,
"logps/rejected": -3.282350540161133,
"loss": 2.557,
"rewards/accuracies": 0.765625,
"rewards/chosen": -26.347593307495117,
"rewards/margins": 6.475912570953369,
"rewards/rejected": -32.82350540161133,
"step": 409
},
{
"epoch": 0.9273395532937517,
"grad_norm": 191.06201962614162,
"learning_rate": 1.2756358855332904e-08,
"logits/chosen": -1.1101237535476685,
"logits/rejected": -1.1116387844085693,
"logps/chosen": -2.9200122356414795,
"logps/rejected": -3.3554527759552,
"loss": 3.4354,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -29.200119018554688,
"rewards/margins": 4.354408264160156,
"rewards/rejected": -33.55453109741211,
"step": 410
},
{
"epoch": 0.9296013570822731,
"grad_norm": 134.29545605610724,
"learning_rate": 1.1975484296105154e-08,
"logits/chosen": -1.0667734146118164,
"logits/rejected": -1.0599286556243896,
"logps/chosen": -2.877082109451294,
"logps/rejected": -3.443552017211914,
"loss": 2.5938,
"rewards/accuracies": 0.78125,
"rewards/chosen": -28.770824432373047,
"rewards/margins": 5.6646928787231445,
"rewards/rejected": -34.435516357421875,
"step": 411
},
{
"epoch": 0.9318631608707945,
"grad_norm": 174.84962964775417,
"learning_rate": 1.1218908022402374e-08,
"logits/chosen": -1.0731241703033447,
"logits/rejected": -1.0706862211227417,
"logps/chosen": -2.7748520374298096,
"logps/rejected": -3.357201337814331,
"loss": 2.8863,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -27.74852180480957,
"rewards/margins": 5.823493003845215,
"rewards/rejected": -33.57201385498047,
"step": 412
},
{
"epoch": 0.9341249646593158,
"grad_norm": 146.28409435329323,
"learning_rate": 1.0486677411402079e-08,
"logits/chosen": -1.1381806135177612,
"logits/rejected": -1.133017897605896,
"logps/chosen": -2.9692389965057373,
"logps/rejected": -3.5767297744750977,
"loss": 3.1506,
"rewards/accuracies": 0.734375,
"rewards/chosen": -29.69239044189453,
"rewards/margins": 6.074907302856445,
"rewards/rejected": -35.767295837402344,
"step": 413
},
{
"epoch": 0.9363867684478372,
"grad_norm": 159.6750432179662,
"learning_rate": 9.778838315744353e-09,
"logits/chosen": -1.1111695766448975,
"logits/rejected": -1.1009503602981567,
"logps/chosen": -2.921187162399292,
"logps/rejected": -3.49282169342041,
"loss": 2.4905,
"rewards/accuracies": 0.796875,
"rewards/chosen": -29.21187400817871,
"rewards/margins": 5.716343879699707,
"rewards/rejected": -34.92821502685547,
"step": 414
},
{
"epoch": 0.9386485722363584,
"grad_norm": 163.33116990865057,
"learning_rate": 9.095435060660595e-09,
"logits/chosen": -1.0523741245269775,
"logits/rejected": -1.0503299236297607,
"logps/chosen": -2.904160499572754,
"logps/rejected": -3.385972499847412,
"loss": 3.2821,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -29.041603088378906,
"rewards/margins": 4.818119525909424,
"rewards/rejected": -33.85972595214844,
"step": 415
},
{
"epoch": 0.9409103760248798,
"grad_norm": 172.67539274971105,
"learning_rate": 8.436510441197864e-09,
"logits/chosen": -1.0634400844573975,
"logits/rejected": -1.064286470413208,
"logps/chosen": -2.8385376930236816,
"logps/rejected": -3.3240842819213867,
"loss": 3.3055,
"rewards/accuracies": 0.71875,
"rewards/chosen": -28.385377883911133,
"rewards/margins": 4.855468273162842,
"rewards/rejected": -33.2408447265625,
"step": 416
},
{
"epoch": 0.9431721798134012,
"grad_norm": 176.03201940803845,
"learning_rate": 7.802105719539076e-09,
"logits/chosen": -1.0832864046096802,
"logits/rejected": -1.094434380531311,
"logps/chosen": -2.95005464553833,
"logps/rejected": -3.435476541519165,
"loss": 3.4372,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -29.500545501708984,
"rewards/margins": 4.854221820831299,
"rewards/rejected": -34.354766845703125,
"step": 417
},
{
"epoch": 0.9454339836019225,
"grad_norm": 140.1951037917173,
"learning_rate": 7.1922606224192e-09,
"logits/chosen": -1.1131281852722168,
"logits/rejected": -1.0919857025146484,
"logps/chosen": -2.904672861099243,
"logps/rejected": -3.4692986011505127,
"loss": 2.6629,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -29.046730041503906,
"rewards/margins": 5.646256923675537,
"rewards/rejected": -34.69298553466797,
"step": 418
},
{
"epoch": 0.9476957873904439,
"grad_norm": 158.29519351333673,
"learning_rate": 6.6070133386372906e-09,
"logits/chosen": -1.109086513519287,
"logits/rejected": -1.100806713104248,
"logps/chosen": -2.969877243041992,
"logps/rejected": -3.402845859527588,
"loss": 3.1048,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -29.698768615722656,
"rewards/margins": 4.3296895027160645,
"rewards/rejected": -34.02845764160156,
"step": 419
},
{
"epoch": 0.9499575911789653,
"grad_norm": 142.54144986382758,
"learning_rate": 6.046400516665384e-09,
"logits/chosen": -1.072888970375061,
"logits/rejected": -1.0681804418563843,
"logps/chosen": -2.8510801792144775,
"logps/rejected": -3.392737627029419,
"loss": 2.8594,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -28.51080322265625,
"rewards/margins": 5.416579246520996,
"rewards/rejected": -33.9273796081543,
"step": 420
},
{
"epoch": 0.9522193949674865,
"grad_norm": 167.23162652803958,
"learning_rate": 5.510457262353396e-09,
"logits/chosen": -1.0967669486999512,
"logits/rejected": -1.0917611122131348,
"logps/chosen": -2.810840606689453,
"logps/rejected": -3.335181474685669,
"loss": 2.8169,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -28.10840606689453,
"rewards/margins": 5.243409156799316,
"rewards/rejected": -33.35181427001953,
"step": 421
},
{
"epoch": 0.9544811987560079,
"grad_norm": 147.43492313171294,
"learning_rate": 4.9992171367309265e-09,
"logits/chosen": -1.0767607688903809,
"logits/rejected": -1.062929391860962,
"logps/chosen": -2.683845043182373,
"logps/rejected": -3.255852222442627,
"loss": 2.5143,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -26.83845329284668,
"rewards/margins": 5.720067977905273,
"rewards/rejected": -32.55852127075195,
"step": 422
},
{
"epoch": 0.9567430025445293,
"grad_norm": 154.264825757035,
"learning_rate": 4.5127121539052955e-09,
"logits/chosen": -1.1299694776535034,
"logits/rejected": -1.1185402870178223,
"logps/chosen": -2.9707369804382324,
"logps/rejected": -3.532411575317383,
"loss": 2.8685,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -29.70737075805664,
"rewards/margins": 5.616747856140137,
"rewards/rejected": -35.32411575317383,
"step": 423
},
{
"epoch": 0.9590048063330506,
"grad_norm": 140.7802126038013,
"learning_rate": 4.050972779057327e-09,
"logits/chosen": -1.0196279287338257,
"logits/rejected": -1.012292504310608,
"logps/chosen": -2.6845099925994873,
"logps/rejected": -3.226938009262085,
"loss": 3.0667,
"rewards/accuracies": 0.734375,
"rewards/chosen": -26.845102310180664,
"rewards/margins": 5.424278259277344,
"rewards/rejected": -32.269378662109375,
"step": 424
},
{
"epoch": 0.961266610121572,
"grad_norm": 165.39326307614513,
"learning_rate": 3.6140279265330477e-09,
"logits/chosen": -1.066307783126831,
"logits/rejected": -1.0565274953842163,
"logps/chosen": -2.9191229343414307,
"logps/rejected": -3.4463143348693848,
"loss": 3.2063,
"rewards/accuracies": 0.765625,
"rewards/chosen": -29.191232681274414,
"rewards/margins": 5.271913051605225,
"rewards/rejected": -34.46314239501953,
"step": 425
},
{
"epoch": 0.9635284139100933,
"grad_norm": 173.22961492513315,
"learning_rate": 3.2019049580335853e-09,
"logits/chosen": -1.090395212173462,
"logits/rejected": -1.071869969367981,
"logps/chosen": -2.804586410522461,
"logps/rejected": -3.237943649291992,
"loss": 3.3588,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -28.045866012573242,
"rewards/margins": 4.3335723876953125,
"rewards/rejected": -32.37943649291992,
"step": 426
},
{
"epoch": 0.9657902176986146,
"grad_norm": 145.19510434395903,
"learning_rate": 2.814629680901337e-09,
"logits/chosen": -1.1221716403961182,
"logits/rejected": -1.1198689937591553,
"logps/chosen": -2.9432907104492188,
"logps/rejected": -3.467853546142578,
"loss": 2.7511,
"rewards/accuracies": 0.796875,
"rewards/chosen": -29.43290901184082,
"rewards/margins": 5.245627403259277,
"rewards/rejected": -34.67853546142578,
"step": 427
},
{
"epoch": 0.968052021487136,
"grad_norm": 158.1903949721956,
"learning_rate": 2.4522263465041937e-09,
"logits/chosen": -1.0683190822601318,
"logits/rejected": -1.0715701580047607,
"logps/chosen": -2.909548044204712,
"logps/rejected": -3.454073667526245,
"loss": 2.6358,
"rewards/accuracies": 0.796875,
"rewards/chosen": -29.09547996520996,
"rewards/margins": 5.445255756378174,
"rewards/rejected": -34.540733337402344,
"step": 428
},
{
"epoch": 0.9703138252756573,
"grad_norm": 155.32881349911696,
"learning_rate": 2.114717648716713e-09,
"logits/chosen": -1.058469533920288,
"logits/rejected": -1.0475648641586304,
"logps/chosen": -2.8327419757843018,
"logps/rejected": -3.398642063140869,
"loss": 2.8237,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -28.327417373657227,
"rewards/margins": 5.659000396728516,
"rewards/rejected": -33.986419677734375,
"step": 429
},
{
"epoch": 0.9725756290641787,
"grad_norm": 151.9784262009179,
"learning_rate": 1.802124722499121e-09,
"logits/chosen": -1.082242488861084,
"logits/rejected": -1.0863622426986694,
"logps/chosen": -2.9595117568969727,
"logps/rejected": -3.589582920074463,
"loss": 2.8873,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -29.59511947631836,
"rewards/margins": 6.3007121086120605,
"rewards/rejected": -35.89583206176758,
"step": 430
},
{
"epoch": 0.9748374328527001,
"grad_norm": 145.2881020579959,
"learning_rate": 1.5144671425737499e-09,
"logits/chosen": -1.0939596891403198,
"logits/rejected": -1.0839340686798096,
"logps/chosen": -2.7935657501220703,
"logps/rejected": -3.3438332080841064,
"loss": 3.1053,
"rewards/accuracies": 0.71875,
"rewards/chosen": -27.93565559387207,
"rewards/margins": 5.502673149108887,
"rewards/rejected": -33.438331604003906,
"step": 431
},
{
"epoch": 0.9770992366412213,
"grad_norm": 142.85260765400977,
"learning_rate": 1.251762922199484e-09,
"logits/chosen": -1.0255465507507324,
"logits/rejected": -1.0279417037963867,
"logps/chosen": -2.9136672019958496,
"logps/rejected": -3.5454673767089844,
"loss": 2.2466,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -29.13667106628418,
"rewards/margins": 6.318003177642822,
"rewards/rejected": -35.454673767089844,
"step": 432
},
{
"epoch": 0.9793610404297427,
"grad_norm": 140.6251681228611,
"learning_rate": 1.0140285120433744e-09,
"logits/chosen": -1.0883697271347046,
"logits/rejected": -1.079869031906128,
"logps/chosen": -2.923574209213257,
"logps/rejected": -3.472318649291992,
"loss": 2.9511,
"rewards/accuracies": 0.75,
"rewards/chosen": -29.235740661621094,
"rewards/margins": 5.48744535446167,
"rewards/rejected": -34.72319030761719,
"step": 433
},
{
"epoch": 0.9816228442182641,
"grad_norm": 163.4887730874021,
"learning_rate": 8.012787991508396e-10,
"logits/chosen": -1.0686748027801514,
"logits/rejected": -1.0695778131484985,
"logps/chosen": -2.8537275791168213,
"logps/rejected": -3.4509644508361816,
"loss": 2.8354,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -28.537277221679688,
"rewards/margins": 5.972366809844971,
"rewards/rejected": -34.5096435546875,
"step": 434
},
{
"epoch": 0.9838846480067854,
"grad_norm": 159.38860256761092,
"learning_rate": 6.135271060133007e-10,
"logits/chosen": -1.08330500125885,
"logits/rejected": -1.059489130973816,
"logps/chosen": -3.001162528991699,
"logps/rejected": -3.6113970279693604,
"loss": 2.7838,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -30.01162338256836,
"rewards/margins": 6.102348327636719,
"rewards/rejected": -36.11397171020508,
"step": 435
},
{
"epoch": 0.9861464517953068,
"grad_norm": 153.9713059574121,
"learning_rate": 4.50785189733871e-10,
"logits/chosen": -1.0712958574295044,
"logits/rejected": -1.07225501537323,
"logps/chosen": -2.815187454223633,
"logps/rejected": -3.376993179321289,
"loss": 2.5357,
"rewards/accuracies": 0.78125,
"rewards/chosen": -28.151872634887695,
"rewards/margins": 5.61806058883667,
"rewards/rejected": -33.76993179321289,
"step": 436
},
{
"epoch": 0.988408255583828,
"grad_norm": 177.28777588642907,
"learning_rate": 3.1306324129118935e-10,
"logits/chosen": -1.074324369430542,
"logits/rejected": -1.068164348602295,
"logps/chosen": -3.001028299331665,
"logps/rejected": -3.464707612991333,
"loss": 3.0057,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -30.010284423828125,
"rewards/margins": 4.636792182922363,
"rewards/rejected": -34.64707565307617,
"step": 437
},
{
"epoch": 0.9906700593723494,
"grad_norm": 188.49091857123523,
"learning_rate": 2.003698849011748e-10,
"logits/chosen": -1.1169301271438599,
"logits/rejected": -1.1111122369766235,
"logps/chosen": -3.0199649333953857,
"logps/rejected": -3.4195683002471924,
"loss": 3.6605,
"rewards/accuracies": 0.671875,
"rewards/chosen": -30.199649810791016,
"rewards/margins": 3.996029853820801,
"rewards/rejected": -34.1956787109375,
"step": 438
},
{
"epoch": 0.9929318631608708,
"grad_norm": 161.39259146870793,
"learning_rate": 1.1271217747714779e-10,
"logits/chosen": -1.1197105646133423,
"logits/rejected": -1.123986005783081,
"logps/chosen": -2.9404468536376953,
"logps/rejected": -3.3876237869262695,
"loss": 3.4166,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -29.404468536376953,
"rewards/margins": 4.471774101257324,
"rewards/rejected": -33.87623977661133,
"step": 439
},
{
"epoch": 0.9951936669493922,
"grad_norm": 141.54707977256322,
"learning_rate": 5.0095608187739055e-11,
"logits/chosen": -1.0695970058441162,
"logits/rejected": -1.0684033632278442,
"logps/chosen": -2.7523577213287354,
"logps/rejected": -3.2570207118988037,
"loss": 2.9094,
"rewards/accuracies": 0.734375,
"rewards/chosen": -27.523576736450195,
"rewards/margins": 5.046632766723633,
"rewards/rejected": -32.57020950317383,
"step": 440
},
{
"epoch": 0.9974554707379135,
"grad_norm": 164.42489149506028,
"learning_rate": 1.2524098113209092e-11,
"logits/chosen": -1.1133098602294922,
"logits/rejected": -1.1184360980987549,
"logps/chosen": -3.0493438243865967,
"logps/rejected": -3.5772345066070557,
"loss": 2.9557,
"rewards/accuracies": 0.8125,
"rewards/chosen": -30.493436813354492,
"rewards/margins": 5.27890682220459,
"rewards/rejected": -35.77234649658203,
"step": 441
},
{
"epoch": 0.9997172745264349,
"grad_norm": 132.5425140912548,
"learning_rate": 0.0,
"logits/chosen": -1.1178230047225952,
"logits/rejected": -1.1037944555282593,
"logps/chosen": -2.8530402183532715,
"logps/rejected": -3.433281660079956,
"loss": 2.6473,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -28.530399322509766,
"rewards/margins": 5.80241584777832,
"rewards/rejected": -34.33281707763672,
"step": 442
},
{
"epoch": 0.9997172745264349,
"eval_logits/chosen": -1.0814050436019897,
"eval_logits/rejected": -1.0757393836975098,
"eval_logps/chosen": -2.8983895778656006,
"eval_logps/rejected": -3.4255356788635254,
"eval_loss": 2.930593490600586,
"eval_rewards/accuracies": 0.7573529481887817,
"eval_rewards/chosen": -28.983896255493164,
"eval_rewards/margins": 5.271461486816406,
"eval_rewards/rejected": -34.25535583496094,
"eval_runtime": 100.5751,
"eval_samples_per_second": 29.62,
"eval_steps_per_second": 1.859,
"step": 442
},
{
"epoch": 0.9997172745264349,
"step": 442,
"total_flos": 134366991482880.0,
"train_loss": 3.5634871910060695,
"train_runtime": 7608.7168,
"train_samples_per_second": 7.438,
"train_steps_per_second": 0.058
}
],
"logging_steps": 1.0,
"max_steps": 442,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 134366991482880.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}