simpo_r80 / trainer_state.json
lzc0525's picture
Upload folder using huggingface_hub
c9d7a3f verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9979517674264948,
"eval_steps": 500,
"global_step": 472,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002114304592005286,
"grad_norm": 1.5018059015274048,
"learning_rate": 2.083333333333333e-08,
"logits/chosen": -0.3466828167438507,
"logits/rejected": -0.30099987983703613,
"logps/chosen": -0.9345186948776245,
"logps/rejected": -0.9117153882980347,
"loss": 1.4889,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.869037389755249,
"rewards/margins": -0.04560665041208267,
"rewards/rejected": -1.8234307765960693,
"step": 1
},
{
"epoch": 0.004228609184010572,
"grad_norm": 0.8093975186347961,
"learning_rate": 4.166666666666666e-08,
"logits/chosen": -0.4310421049594879,
"logits/rejected": -0.39132067561149597,
"logps/chosen": -0.8198825716972351,
"logps/rejected": -0.8644211888313293,
"loss": 1.376,
"rewards/accuracies": 0.515625,
"rewards/chosen": -1.6397651433944702,
"rewards/margins": 0.08907715976238251,
"rewards/rejected": -1.7288423776626587,
"step": 2
},
{
"epoch": 0.006342913776015857,
"grad_norm": 0.5377389788627625,
"learning_rate": 6.25e-08,
"logits/chosen": -0.46692028641700745,
"logits/rejected": -0.4649256467819214,
"logps/chosen": -0.9087910652160645,
"logps/rejected": -0.9648240804672241,
"loss": 1.3404,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.817582130432129,
"rewards/margins": 0.11206617206335068,
"rewards/rejected": -1.9296481609344482,
"step": 3
},
{
"epoch": 0.008457218368021144,
"grad_norm": 0.3221875727176666,
"learning_rate": 8.333333333333333e-08,
"logits/chosen": -0.416828453540802,
"logits/rejected": -0.3584724962711334,
"logps/chosen": -0.7818898558616638,
"logps/rejected": -0.8170815110206604,
"loss": 1.3806,
"rewards/accuracies": 0.484375,
"rewards/chosen": -1.5637797117233276,
"rewards/margins": 0.07038339227437973,
"rewards/rejected": -1.6341630220413208,
"step": 4
},
{
"epoch": 0.010571522960026428,
"grad_norm": 0.64655601978302,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -0.376886248588562,
"logits/rejected": -0.3516141474246979,
"logps/chosen": -0.8814125061035156,
"logps/rejected": -1.0214396715164185,
"loss": 1.2741,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.7628250122070312,
"rewards/margins": 0.28005433082580566,
"rewards/rejected": -2.042879343032837,
"step": 5
},
{
"epoch": 0.012685827552031714,
"grad_norm": 0.4775894582271576,
"learning_rate": 1.25e-07,
"logits/chosen": -0.4757865369319916,
"logits/rejected": -0.4498941898345947,
"logps/chosen": -0.8962199687957764,
"logps/rejected": -0.9462199807167053,
"loss": 1.364,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.7924399375915527,
"rewards/margins": 0.10000008344650269,
"rewards/rejected": -1.8924399614334106,
"step": 6
},
{
"epoch": 0.014800132144037,
"grad_norm": 1.2459568977355957,
"learning_rate": 1.4583333333333335e-07,
"logits/chosen": -0.38895344734191895,
"logits/rejected": -0.38165366649627686,
"logps/chosen": -0.9025766253471375,
"logps/rejected": -0.9465017318725586,
"loss": 1.3898,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.805153250694275,
"rewards/margins": 0.0878501906991005,
"rewards/rejected": -1.8930034637451172,
"step": 7
},
{
"epoch": 0.016914436736042288,
"grad_norm": 0.6195729374885559,
"learning_rate": 1.6666666666666665e-07,
"logits/chosen": -0.3964853286743164,
"logits/rejected": -0.377862811088562,
"logps/chosen": -0.9054160118103027,
"logps/rejected": -0.9605879187583923,
"loss": 1.3821,
"rewards/accuracies": 0.4765625,
"rewards/chosen": -1.8108320236206055,
"rewards/margins": 0.1103438138961792,
"rewards/rejected": -1.9211758375167847,
"step": 8
},
{
"epoch": 0.019028741328047574,
"grad_norm": 1.2074137926101685,
"learning_rate": 1.875e-07,
"logits/chosen": -0.3729037344455719,
"logits/rejected": -0.38143450021743774,
"logps/chosen": -0.9328653216362,
"logps/rejected": -0.9905799627304077,
"loss": 1.3754,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -1.8657306432724,
"rewards/margins": 0.11542946100234985,
"rewards/rejected": -1.9811599254608154,
"step": 9
},
{
"epoch": 0.021143045920052856,
"grad_norm": 0.2867220640182495,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -0.4263336658477783,
"logits/rejected": -0.42903271317481995,
"logps/chosen": -0.8979260325431824,
"logps/rejected": -0.9078099727630615,
"loss": 1.4438,
"rewards/accuracies": 0.515625,
"rewards/chosen": -1.7958520650863647,
"rewards/margins": 0.019767940044403076,
"rewards/rejected": -1.815619945526123,
"step": 10
},
{
"epoch": 0.023257350512058142,
"grad_norm": 0.8363026976585388,
"learning_rate": 2.2916666666666663e-07,
"logits/chosen": -0.3374914526939392,
"logits/rejected": -0.32399696111679077,
"logps/chosen": -0.8886098861694336,
"logps/rejected": -0.9484556317329407,
"loss": 1.3422,
"rewards/accuracies": 0.546875,
"rewards/chosen": -1.7772197723388672,
"rewards/margins": 0.11969132721424103,
"rewards/rejected": -1.8969112634658813,
"step": 11
},
{
"epoch": 0.025371655104063428,
"grad_norm": 0.5406804084777832,
"learning_rate": 2.5e-07,
"logits/chosen": -0.42844679951667786,
"logits/rejected": -0.37984615564346313,
"logps/chosen": -0.861629843711853,
"logps/rejected": -0.8968492150306702,
"loss": 1.3922,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.723259687423706,
"rewards/margins": 0.07043875753879547,
"rewards/rejected": -1.7936984300613403,
"step": 12
},
{
"epoch": 0.027485959696068714,
"grad_norm": 0.9919329285621643,
"learning_rate": 2.708333333333333e-07,
"logits/chosen": -0.36495402455329895,
"logits/rejected": -0.3249490261077881,
"logps/chosen": -0.8502095937728882,
"logps/rejected": -0.8470643758773804,
"loss": 1.4334,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -1.7004191875457764,
"rewards/margins": -0.006290358491241932,
"rewards/rejected": -1.6941287517547607,
"step": 13
},
{
"epoch": 0.029600264288074,
"grad_norm": 0.5477162003517151,
"learning_rate": 2.916666666666667e-07,
"logits/chosen": -0.4155704081058502,
"logits/rejected": -0.39535820484161377,
"logps/chosen": -1.0430240631103516,
"logps/rejected": -1.1318373680114746,
"loss": 1.3533,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -2.086048126220703,
"rewards/margins": 0.17762640118598938,
"rewards/rejected": -2.263674736022949,
"step": 14
},
{
"epoch": 0.031714568880079286,
"grad_norm": 0.26530712842941284,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -0.4810572564601898,
"logits/rejected": -0.42454615235328674,
"logps/chosen": -0.8741041421890259,
"logps/rejected": -0.9494178295135498,
"loss": 1.3655,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.7482082843780518,
"rewards/margins": 0.15062758326530457,
"rewards/rejected": -1.8988356590270996,
"step": 15
},
{
"epoch": 0.033828873472084575,
"grad_norm": 0.9272629618644714,
"learning_rate": 3.333333333333333e-07,
"logits/chosen": -0.4440098702907562,
"logits/rejected": -0.3930297791957855,
"logps/chosen": -0.8473359942436218,
"logps/rejected": -0.9369213581085205,
"loss": 1.3248,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.6946719884872437,
"rewards/margins": 0.17917080223560333,
"rewards/rejected": -1.873842716217041,
"step": 16
},
{
"epoch": 0.03594317806408986,
"grad_norm": 0.5912418961524963,
"learning_rate": 3.541666666666667e-07,
"logits/chosen": -0.3838099539279938,
"logits/rejected": -0.3507584035396576,
"logps/chosen": -0.8888350129127502,
"logps/rejected": -0.9361770749092102,
"loss": 1.383,
"rewards/accuracies": 0.4921875,
"rewards/chosen": -1.7776700258255005,
"rewards/margins": 0.0946839451789856,
"rewards/rejected": -1.8723541498184204,
"step": 17
},
{
"epoch": 0.03805748265609515,
"grad_norm": 0.6536504030227661,
"learning_rate": 3.75e-07,
"logits/chosen": -0.3581697940826416,
"logits/rejected": -0.3620460629463196,
"logps/chosen": -0.8519617319107056,
"logps/rejected": -0.9022184610366821,
"loss": 1.3841,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -1.7039234638214111,
"rewards/margins": 0.10051343590021133,
"rewards/rejected": -1.8044369220733643,
"step": 18
},
{
"epoch": 0.04017178724810043,
"grad_norm": 0.3433632552623749,
"learning_rate": 3.958333333333333e-07,
"logits/chosen": -0.37887442111968994,
"logits/rejected": -0.37543320655822754,
"logps/chosen": -0.9464104175567627,
"logps/rejected": -1.0017329454421997,
"loss": 1.3649,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.8928208351135254,
"rewards/margins": 0.11064518243074417,
"rewards/rejected": -2.0034658908843994,
"step": 19
},
{
"epoch": 0.04228609184010571,
"grad_norm": 0.9764007329940796,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -0.44110679626464844,
"logits/rejected": -0.4280649721622467,
"logps/chosen": -0.9046768546104431,
"logps/rejected": -1.0464633703231812,
"loss": 1.2592,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.8093537092208862,
"rewards/margins": 0.2835729420185089,
"rewards/rejected": -2.0929267406463623,
"step": 20
},
{
"epoch": 0.044400396432111,
"grad_norm": 1.8563830852508545,
"learning_rate": 4.375e-07,
"logits/chosen": -0.45183491706848145,
"logits/rejected": -0.42935287952423096,
"logps/chosen": -0.9043138027191162,
"logps/rejected": -0.9462392926216125,
"loss": 1.3784,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.8086276054382324,
"rewards/margins": 0.08385094255208969,
"rewards/rejected": -1.892478585243225,
"step": 21
},
{
"epoch": 0.046514701024116284,
"grad_norm": 1.3473299741744995,
"learning_rate": 4.5833333333333327e-07,
"logits/chosen": -0.37855517864227295,
"logits/rejected": -0.34429043531417847,
"logps/chosen": -0.9284683465957642,
"logps/rejected": -0.9454050064086914,
"loss": 1.4346,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -1.8569366931915283,
"rewards/margins": 0.03387312963604927,
"rewards/rejected": -1.8908100128173828,
"step": 22
},
{
"epoch": 0.04862900561612157,
"grad_norm": 0.940831184387207,
"learning_rate": 4.791666666666667e-07,
"logits/chosen": -0.39172160625457764,
"logits/rejected": -0.3695780634880066,
"logps/chosen": -0.9314202666282654,
"logps/rejected": -1.020229697227478,
"loss": 1.3322,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.8628405332565308,
"rewards/margins": 0.17761869728565216,
"rewards/rejected": -2.040459394454956,
"step": 23
},
{
"epoch": 0.050743310208126856,
"grad_norm": 0.5783158540725708,
"learning_rate": 5e-07,
"logits/chosen": -0.4958629608154297,
"logits/rejected": -0.4257377088069916,
"logps/chosen": -0.9379237294197083,
"logps/rejected": -0.9415461421012878,
"loss": 1.441,
"rewards/accuracies": 0.515625,
"rewards/chosen": -1.8758474588394165,
"rewards/margins": 0.0072449808940291405,
"rewards/rejected": -1.8830922842025757,
"step": 24
},
{
"epoch": 0.052857614800132145,
"grad_norm": 1.4209853410720825,
"learning_rate": 5.208333333333334e-07,
"logits/chosen": -0.36407172679901123,
"logits/rejected": -0.3331725299358368,
"logps/chosen": -0.9192589521408081,
"logps/rejected": -0.9595308899879456,
"loss": 1.3994,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -1.8385179042816162,
"rewards/margins": 0.080544114112854,
"rewards/rejected": -1.9190617799758911,
"step": 25
},
{
"epoch": 0.05497191939213743,
"grad_norm": 0.6310216188430786,
"learning_rate": 5.416666666666666e-07,
"logits/chosen": -0.41772690415382385,
"logits/rejected": -0.36565953493118286,
"logps/chosen": -0.8052878379821777,
"logps/rejected": -0.8673746585845947,
"loss": 1.3356,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -1.6105756759643555,
"rewards/margins": 0.12417369335889816,
"rewards/rejected": -1.7347493171691895,
"step": 26
},
{
"epoch": 0.05708622398414272,
"grad_norm": 1.2933462858200073,
"learning_rate": 5.625e-07,
"logits/chosen": -0.4482795000076294,
"logits/rejected": -0.39409321546554565,
"logps/chosen": -0.8339261412620544,
"logps/rejected": -0.8675202131271362,
"loss": 1.3739,
"rewards/accuracies": 0.4921875,
"rewards/chosen": -1.6678522825241089,
"rewards/margins": 0.06718815863132477,
"rewards/rejected": -1.7350404262542725,
"step": 27
},
{
"epoch": 0.059200528576148,
"grad_norm": 0.5808025002479553,
"learning_rate": 5.833333333333334e-07,
"logits/chosen": -0.37116044759750366,
"logits/rejected": -0.3478051722049713,
"logps/chosen": -0.8950318694114685,
"logps/rejected": -0.9756672978401184,
"loss": 1.3505,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.790063738822937,
"rewards/margins": 0.1612708568572998,
"rewards/rejected": -1.9513345956802368,
"step": 28
},
{
"epoch": 0.06131483316815329,
"grad_norm": 1.0569533109664917,
"learning_rate": 6.041666666666666e-07,
"logits/chosen": -0.421148419380188,
"logits/rejected": -0.38443076610565186,
"logps/chosen": -0.8021283745765686,
"logps/rejected": -0.8370179533958435,
"loss": 1.3916,
"rewards/accuracies": 0.46875,
"rewards/chosen": -1.6042567491531372,
"rewards/margins": 0.06977920234203339,
"rewards/rejected": -1.674035906791687,
"step": 29
},
{
"epoch": 0.06342913776015857,
"grad_norm": 0.42577147483825684,
"learning_rate": 6.249999999999999e-07,
"logits/chosen": -0.4429818391799927,
"logits/rejected": -0.3524704575538635,
"logps/chosen": -0.8916822671890259,
"logps/rejected": -0.8985542058944702,
"loss": 1.4321,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -1.7833645343780518,
"rewards/margins": 0.01374388113617897,
"rewards/rejected": -1.7971084117889404,
"step": 30
},
{
"epoch": 0.06554344235216386,
"grad_norm": 1.0056904554367065,
"learning_rate": 6.458333333333333e-07,
"logits/chosen": -0.376451700925827,
"logits/rejected": -0.342519074678421,
"logps/chosen": -0.9038617014884949,
"logps/rejected": -0.953092634677887,
"loss": 1.398,
"rewards/accuracies": 0.4765625,
"rewards/chosen": -1.8077234029769897,
"rewards/margins": 0.09846188127994537,
"rewards/rejected": -1.906185269355774,
"step": 31
},
{
"epoch": 0.06765774694416915,
"grad_norm": 0.5494012236595154,
"learning_rate": 6.666666666666666e-07,
"logits/chosen": -0.3459138870239258,
"logits/rejected": -0.3590989410877228,
"logps/chosen": -0.8274999260902405,
"logps/rejected": -0.8776509761810303,
"loss": 1.363,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.654999852180481,
"rewards/margins": 0.1003020703792572,
"rewards/rejected": -1.7553019523620605,
"step": 32
},
{
"epoch": 0.06977205153617443,
"grad_norm": 0.693267822265625,
"learning_rate": 6.875e-07,
"logits/chosen": -0.40053680539131165,
"logits/rejected": -0.37323904037475586,
"logps/chosen": -0.8255244493484497,
"logps/rejected": -0.8658804893493652,
"loss": 1.3712,
"rewards/accuracies": 0.578125,
"rewards/chosen": -1.6510488986968994,
"rewards/margins": 0.08071210980415344,
"rewards/rejected": -1.7317609786987305,
"step": 33
},
{
"epoch": 0.07188635612817972,
"grad_norm": 2.213238000869751,
"learning_rate": 7.083333333333334e-07,
"logits/chosen": -0.40097948908805847,
"logits/rejected": -0.38190510869026184,
"logps/chosen": -0.9122671484947205,
"logps/rejected": -0.9549552798271179,
"loss": 1.36,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -1.824534296989441,
"rewards/margins": 0.0853763073682785,
"rewards/rejected": -1.9099105596542358,
"step": 34
},
{
"epoch": 0.074000660720185,
"grad_norm": 0.6859830021858215,
"learning_rate": 7.291666666666666e-07,
"logits/chosen": -0.42501094937324524,
"logits/rejected": -0.42549416422843933,
"logps/chosen": -1.0008373260498047,
"logps/rejected": -1.1157118082046509,
"loss": 1.3294,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -2.0016746520996094,
"rewards/margins": 0.22974897921085358,
"rewards/rejected": -2.2314236164093018,
"step": 35
},
{
"epoch": 0.0761149653121903,
"grad_norm": 0.6468721628189087,
"learning_rate": 7.5e-07,
"logits/chosen": -0.36494994163513184,
"logits/rejected": -0.30433908104896545,
"logps/chosen": -0.9062094688415527,
"logps/rejected": -0.920263409614563,
"loss": 1.4312,
"rewards/accuracies": 0.4921875,
"rewards/chosen": -1.8124189376831055,
"rewards/margins": 0.02810765616595745,
"rewards/rejected": -1.840526819229126,
"step": 36
},
{
"epoch": 0.07822926990419557,
"grad_norm": 0.5085556507110596,
"learning_rate": 7.708333333333333e-07,
"logits/chosen": -0.4677881598472595,
"logits/rejected": -0.456132709980011,
"logps/chosen": -1.0101865530014038,
"logps/rejected": -1.0429682731628418,
"loss": 1.4132,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -2.0203731060028076,
"rewards/margins": 0.06556359678506851,
"rewards/rejected": -2.0859365463256836,
"step": 37
},
{
"epoch": 0.08034357449620086,
"grad_norm": 0.23813335597515106,
"learning_rate": 7.916666666666666e-07,
"logits/chosen": -0.3991190791130066,
"logits/rejected": -0.3664044141769409,
"logps/chosen": -0.9578174352645874,
"logps/rejected": -0.9229263067245483,
"loss": 1.4824,
"rewards/accuracies": 0.4609375,
"rewards/chosen": -1.9156348705291748,
"rewards/margins": -0.06978224962949753,
"rewards/rejected": -1.8458526134490967,
"step": 38
},
{
"epoch": 0.08245787908820615,
"grad_norm": 0.587037980556488,
"learning_rate": 8.125e-07,
"logits/chosen": -0.37554049491882324,
"logits/rejected": -0.36305734515190125,
"logps/chosen": -0.8503091931343079,
"logps/rejected": -0.864615261554718,
"loss": 1.4086,
"rewards/accuracies": 0.4296875,
"rewards/chosen": -1.7006183862686157,
"rewards/margins": 0.028611989691853523,
"rewards/rejected": -1.729230523109436,
"step": 39
},
{
"epoch": 0.08457218368021142,
"grad_norm": 0.4172501862049103,
"learning_rate": 8.333333333333333e-07,
"logits/chosen": -0.4405443072319031,
"logits/rejected": -0.41723060607910156,
"logps/chosen": -0.8502858877182007,
"logps/rejected": -0.9114271402359009,
"loss": 1.3446,
"rewards/accuracies": 0.546875,
"rewards/chosen": -1.7005717754364014,
"rewards/margins": 0.12228240817785263,
"rewards/rejected": -1.8228542804718018,
"step": 40
},
{
"epoch": 0.08668648827221671,
"grad_norm": 0.9275372624397278,
"learning_rate": 8.541666666666666e-07,
"logits/chosen": -0.4200601577758789,
"logits/rejected": -0.3478623628616333,
"logps/chosen": -0.892408013343811,
"logps/rejected": -0.9276402592658997,
"loss": 1.3887,
"rewards/accuracies": 0.46875,
"rewards/chosen": -1.784816026687622,
"rewards/margins": 0.07046431303024292,
"rewards/rejected": -1.8552805185317993,
"step": 41
},
{
"epoch": 0.088800792864222,
"grad_norm": 0.7317383289337158,
"learning_rate": 8.75e-07,
"logits/chosen": -0.37675267457962036,
"logits/rejected": -0.33540332317352295,
"logps/chosen": -0.7866061925888062,
"logps/rejected": -0.824250340461731,
"loss": 1.3837,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -1.5732123851776123,
"rewards/margins": 0.07528844475746155,
"rewards/rejected": -1.648500680923462,
"step": 42
},
{
"epoch": 0.09091509745622729,
"grad_norm": 0.9452736973762512,
"learning_rate": 8.958333333333334e-07,
"logits/chosen": -0.4662383198738098,
"logits/rejected": -0.4447881579399109,
"logps/chosen": -0.9490666389465332,
"logps/rejected": -1.0112388134002686,
"loss": 1.3412,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -1.8981332778930664,
"rewards/margins": 0.12434446066617966,
"rewards/rejected": -2.022477626800537,
"step": 43
},
{
"epoch": 0.09302940204823257,
"grad_norm": 0.2848323881626129,
"learning_rate": 9.166666666666665e-07,
"logits/chosen": -0.41404005885124207,
"logits/rejected": -0.3944583535194397,
"logps/chosen": -0.8224930167198181,
"logps/rejected": -0.8416361808776855,
"loss": 1.4027,
"rewards/accuracies": 0.609375,
"rewards/chosen": -1.6449860334396362,
"rewards/margins": 0.038286346942186356,
"rewards/rejected": -1.683272361755371,
"step": 44
},
{
"epoch": 0.09514370664023786,
"grad_norm": 0.7165678143501282,
"learning_rate": 9.374999999999999e-07,
"logits/chosen": -0.40475326776504517,
"logits/rejected": -0.3559921383857727,
"logps/chosen": -0.8070214986801147,
"logps/rejected": -0.8993593454360962,
"loss": 1.3148,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.6140429973602295,
"rewards/margins": 0.18467575311660767,
"rewards/rejected": -1.7987186908721924,
"step": 45
},
{
"epoch": 0.09725801123224315,
"grad_norm": 0.4779021739959717,
"learning_rate": 9.583333333333334e-07,
"logits/chosen": -0.4171525835990906,
"logits/rejected": -0.42166149616241455,
"logps/chosen": -0.7872560024261475,
"logps/rejected": -0.8496187925338745,
"loss": 1.3356,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -1.574512004852295,
"rewards/margins": 0.12472567707300186,
"rewards/rejected": -1.699237585067749,
"step": 46
},
{
"epoch": 0.09937231582424844,
"grad_norm": 0.7870219349861145,
"learning_rate": 9.791666666666667e-07,
"logits/chosen": -0.3734116554260254,
"logits/rejected": -0.32778748869895935,
"logps/chosen": -0.7842286825180054,
"logps/rejected": -0.8161548972129822,
"loss": 1.3647,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -1.5684573650360107,
"rewards/margins": 0.06385258585214615,
"rewards/rejected": -1.6323097944259644,
"step": 47
},
{
"epoch": 0.10148662041625371,
"grad_norm": 0.2597256600856781,
"learning_rate": 1e-06,
"logits/chosen": -0.4355677664279938,
"logits/rejected": -0.38983187079429626,
"logps/chosen": -0.8787693977355957,
"logps/rejected": -0.9383041262626648,
"loss": 1.35,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -1.7575387954711914,
"rewards/margins": 0.11906948685646057,
"rewards/rejected": -1.8766082525253296,
"step": 48
},
{
"epoch": 0.103600925008259,
"grad_norm": 0.9942799210548401,
"learning_rate": 9.999862751990697e-07,
"logits/chosen": -0.4244321882724762,
"logits/rejected": -0.4366786777973175,
"logps/chosen": -0.7910157442092896,
"logps/rejected": -0.8630884885787964,
"loss": 1.3166,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -1.582031488418579,
"rewards/margins": 0.14414538443088531,
"rewards/rejected": -1.7261769771575928,
"step": 49
},
{
"epoch": 0.10571522960026429,
"grad_norm": 0.5333903431892395,
"learning_rate": 9.999451015497595e-07,
"logits/chosen": -0.389942467212677,
"logits/rejected": -0.36674585938453674,
"logps/chosen": -0.7312074899673462,
"logps/rejected": -0.7289648652076721,
"loss": 1.4225,
"rewards/accuracies": 0.4765625,
"rewards/chosen": -1.4624149799346924,
"rewards/margins": -0.004485193639993668,
"rewards/rejected": -1.4579297304153442,
"step": 50
},
{
"epoch": 0.10782953419226958,
"grad_norm": 0.5712242722511292,
"learning_rate": 9.9987648131247e-07,
"logits/chosen": -0.4622853994369507,
"logits/rejected": -0.3728552460670471,
"logps/chosen": -0.8764299750328064,
"logps/rejected": -0.869678795337677,
"loss": 1.4542,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.7528599500656128,
"rewards/margins": -0.013502337038516998,
"rewards/rejected": -1.739357590675354,
"step": 51
},
{
"epoch": 0.10994383878427486,
"grad_norm": 0.2586441934108734,
"learning_rate": 9.99780418254397e-07,
"logits/chosen": -0.37249019742012024,
"logits/rejected": -0.3998304605484009,
"logps/chosen": -0.8435611724853516,
"logps/rejected": -0.9359882473945618,
"loss": 1.3057,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.6871223449707031,
"rewards/margins": 0.18485431373119354,
"rewards/rejected": -1.8719764947891235,
"step": 52
},
{
"epoch": 0.11205814337628014,
"grad_norm": 1.0829113721847534,
"learning_rate": 9.996569176493268e-07,
"logits/chosen": -0.47697725892066956,
"logits/rejected": -0.4208195209503174,
"logps/chosen": -0.8014968037605286,
"logps/rejected": -0.8703804612159729,
"loss": 1.3523,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.6029936075210571,
"rewards/margins": 0.1377674788236618,
"rewards/rejected": -1.7407609224319458,
"step": 53
},
{
"epoch": 0.11417244796828543,
"grad_norm": 0.5523208379745483,
"learning_rate": 9.995059862773438e-07,
"logits/chosen": -0.40533363819122314,
"logits/rejected": -0.36801978945732117,
"logps/chosen": -0.7641825675964355,
"logps/rejected": -0.8168596029281616,
"loss": 1.3692,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.528365135192871,
"rewards/margins": 0.10535416752099991,
"rewards/rejected": -1.6337192058563232,
"step": 54
},
{
"epoch": 0.11628675256029072,
"grad_norm": 0.614101767539978,
"learning_rate": 9.993276324244605e-07,
"logits/chosen": -0.4476906955242157,
"logits/rejected": -0.40396648645401,
"logps/chosen": -0.8706808090209961,
"logps/rejected": -0.9221430420875549,
"loss": 1.3787,
"rewards/accuracies": 0.515625,
"rewards/chosen": -1.7413616180419922,
"rewards/margins": 0.10292442888021469,
"rewards/rejected": -1.8442860841751099,
"step": 55
},
{
"epoch": 0.118401057152296,
"grad_norm": 0.3428778052330017,
"learning_rate": 9.991218658821608e-07,
"logits/chosen": -0.31709593534469604,
"logits/rejected": -0.2760937213897705,
"logps/chosen": -0.842248797416687,
"logps/rejected": -0.8068034648895264,
"loss": 1.498,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.684497594833374,
"rewards/margins": -0.07089066505432129,
"rewards/rejected": -1.6136069297790527,
"step": 56
},
{
"epoch": 0.12051536174430129,
"grad_norm": 0.6877723932266235,
"learning_rate": 9.988886979468643e-07,
"logits/chosen": -0.41800016164779663,
"logits/rejected": -0.4011584222316742,
"logps/chosen": -0.7845420837402344,
"logps/rejected": -0.834447979927063,
"loss": 1.3491,
"rewards/accuracies": 0.546875,
"rewards/chosen": -1.5690841674804688,
"rewards/margins": 0.09981165081262589,
"rewards/rejected": -1.668895959854126,
"step": 57
},
{
"epoch": 0.12262966633630658,
"grad_norm": 0.9649701714515686,
"learning_rate": 9.98628141419305e-07,
"logits/chosen": -0.4253537058830261,
"logits/rejected": -0.4305458962917328,
"logps/chosen": -0.86476731300354,
"logps/rejected": -0.9080386161804199,
"loss": 1.3639,
"rewards/accuracies": 0.515625,
"rewards/chosen": -1.72953462600708,
"rewards/margins": 0.08654248714447021,
"rewards/rejected": -1.8160772323608398,
"step": 58
},
{
"epoch": 0.12474397092831185,
"grad_norm": 1.3779780864715576,
"learning_rate": 9.98340210603829e-07,
"logits/chosen": -0.39970022439956665,
"logits/rejected": -0.441428005695343,
"logps/chosen": -0.8662775158882141,
"logps/rejected": -0.9646260738372803,
"loss": 1.3001,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -1.7325550317764282,
"rewards/margins": 0.19669723510742188,
"rewards/rejected": -1.9292521476745605,
"step": 59
},
{
"epoch": 0.12685827552031714,
"grad_norm": 0.5366966724395752,
"learning_rate": 9.980249213076084e-07,
"logits/chosen": -0.37770116329193115,
"logits/rejected": -0.35231757164001465,
"logps/chosen": -0.8165755867958069,
"logps/rejected": -0.8619179129600525,
"loss": 1.3699,
"rewards/accuracies": 0.578125,
"rewards/chosen": -1.6331511735916138,
"rewards/margins": 0.09068439900875092,
"rewards/rejected": -1.723835825920105,
"step": 60
},
{
"epoch": 0.12897258011232243,
"grad_norm": 0.36810922622680664,
"learning_rate": 9.976822908397748e-07,
"logits/chosen": -0.4224976897239685,
"logits/rejected": -0.41758257150650024,
"logps/chosen": -0.8445641994476318,
"logps/rejected": -0.9393664598464966,
"loss": 1.3193,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -1.6891283988952637,
"rewards/margins": 0.18960458040237427,
"rewards/rejected": -1.8787329196929932,
"step": 61
},
{
"epoch": 0.13108688470432772,
"grad_norm": 0.6838279366493225,
"learning_rate": 9.97312338010468e-07,
"logits/chosen": -0.4168627858161926,
"logits/rejected": -0.36115381121635437,
"logps/chosen": -0.8370552659034729,
"logps/rejected": -0.8352169394493103,
"loss": 1.4284,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.6741105318069458,
"rewards/margins": -0.0036766715347766876,
"rewards/rejected": -1.6704338788986206,
"step": 62
},
{
"epoch": 0.133201189296333,
"grad_norm": 0.39330533146858215,
"learning_rate": 9.969150831298037e-07,
"logits/chosen": -0.4558233618736267,
"logits/rejected": -0.4025765061378479,
"logps/chosen": -0.826255738735199,
"logps/rejected": -0.894213080406189,
"loss": 1.3485,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.652511477470398,
"rewards/margins": 0.13591471314430237,
"rewards/rejected": -1.788426160812378,
"step": 63
},
{
"epoch": 0.1353154938883383,
"grad_norm": 0.6055929660797119,
"learning_rate": 9.964905480067584e-07,
"logits/chosen": -0.459463506937027,
"logits/rejected": -0.42943331599235535,
"logps/chosen": -0.7901928424835205,
"logps/rejected": -0.7964221239089966,
"loss": 1.4057,
"rewards/accuracies": 0.484375,
"rewards/chosen": -1.580385684967041,
"rewards/margins": 0.012458762153983116,
"rewards/rejected": -1.5928442478179932,
"step": 64
},
{
"epoch": 0.13742979848034356,
"grad_norm": 0.37883859872817993,
"learning_rate": 9.960387559479725e-07,
"logits/chosen": -0.4447207450866699,
"logits/rejected": -0.371269553899765,
"logps/chosen": -0.7863065004348755,
"logps/rejected": -0.7983666658401489,
"loss": 1.4202,
"rewards/accuracies": 0.4609375,
"rewards/chosen": -1.572613000869751,
"rewards/margins": 0.02412020042538643,
"rewards/rejected": -1.5967333316802979,
"step": 65
},
{
"epoch": 0.13954410307234885,
"grad_norm": 0.31330156326293945,
"learning_rate": 9.955597317564703e-07,
"logits/chosen": -0.42059677839279175,
"logits/rejected": -0.37605100870132446,
"logps/chosen": -0.7669360637664795,
"logps/rejected": -0.8348797559738159,
"loss": 1.3368,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -1.533872127532959,
"rewards/margins": 0.13588732481002808,
"rewards/rejected": -1.6697595119476318,
"step": 66
},
{
"epoch": 0.14165840766435414,
"grad_norm": 0.4353170096874237,
"learning_rate": 9.950535017302983e-07,
"logits/chosen": -0.3897082805633545,
"logits/rejected": -0.38229796290397644,
"logps/chosen": -0.7249190807342529,
"logps/rejected": -0.7696882486343384,
"loss": 1.3511,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -1.4498381614685059,
"rewards/margins": 0.08953814208507538,
"rewards/rejected": -1.5393764972686768,
"step": 67
},
{
"epoch": 0.14377271225635943,
"grad_norm": 0.6724106669425964,
"learning_rate": 9.94520093661082e-07,
"logits/chosen": -0.3687596023082733,
"logits/rejected": -0.34222811460494995,
"logps/chosen": -0.7845972776412964,
"logps/rejected": -0.8308086395263672,
"loss": 1.3737,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.5691945552825928,
"rewards/margins": 0.09242270141839981,
"rewards/rejected": -1.6616172790527344,
"step": 68
},
{
"epoch": 0.14588701684836472,
"grad_norm": 0.7312172651290894,
"learning_rate": 9.939595368324994e-07,
"logits/chosen": -0.4475817382335663,
"logits/rejected": -0.3975730538368225,
"logps/chosen": -0.7314785718917847,
"logps/rejected": -0.7924487590789795,
"loss": 1.3439,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -1.4629571437835693,
"rewards/margins": 0.12194043397903442,
"rewards/rejected": -1.584897518157959,
"step": 69
},
{
"epoch": 0.14800132144037,
"grad_norm": 0.301097571849823,
"learning_rate": 9.933718620186744e-07,
"logits/chosen": -0.402032732963562,
"logits/rejected": -0.3640722632408142,
"logps/chosen": -0.7727882862091064,
"logps/rejected": -0.8291516304016113,
"loss": 1.358,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -1.545576572418213,
"rewards/margins": 0.11272668838500977,
"rewards/rejected": -1.6583032608032227,
"step": 70
},
{
"epoch": 0.1501156260323753,
"grad_norm": 0.377835750579834,
"learning_rate": 9.92757101482486e-07,
"logits/chosen": -0.316825270652771,
"logits/rejected": -0.3245603144168854,
"logps/chosen": -0.7962774634361267,
"logps/rejected": -0.8610175848007202,
"loss": 1.3464,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -1.5925549268722534,
"rewards/margins": 0.12948019802570343,
"rewards/rejected": -1.7220351696014404,
"step": 71
},
{
"epoch": 0.1522299306243806,
"grad_norm": 0.84058678150177,
"learning_rate": 9.921152889737984e-07,
"logits/chosen": -0.4446060359477997,
"logits/rejected": -0.43160340189933777,
"logps/chosen": -0.7745426297187805,
"logps/rejected": -0.8286185264587402,
"loss": 1.3408,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.549085259437561,
"rewards/margins": 0.10815180093050003,
"rewards/rejected": -1.6572370529174805,
"step": 72
},
{
"epoch": 0.15434423521638585,
"grad_norm": 0.6970808506011963,
"learning_rate": 9.91446459727607e-07,
"logits/chosen": -0.4220297634601593,
"logits/rejected": -0.404453307390213,
"logps/chosen": -0.8769615888595581,
"logps/rejected": -0.9564313292503357,
"loss": 1.3423,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -1.7539231777191162,
"rewards/margins": 0.15893957018852234,
"rewards/rejected": -1.9128626585006714,
"step": 73
},
{
"epoch": 0.15645853980839114,
"grad_norm": 0.5900676250457764,
"learning_rate": 9.90750650462105e-07,
"logits/chosen": -0.41884100437164307,
"logits/rejected": -0.38551777601242065,
"logps/chosen": -0.814996063709259,
"logps/rejected": -0.8892688751220703,
"loss": 1.3325,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -1.629992127418518,
"rewards/margins": 0.14854571223258972,
"rewards/rejected": -1.7785377502441406,
"step": 74
},
{
"epoch": 0.15857284440039643,
"grad_norm": 0.7245749831199646,
"learning_rate": 9.900278993766668e-07,
"logits/chosen": -0.3451727330684662,
"logits/rejected": -0.3348972201347351,
"logps/chosen": -0.8788102865219116,
"logps/rejected": -0.9273182153701782,
"loss": 1.377,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -1.7576205730438232,
"rewards/margins": 0.09701582789421082,
"rewards/rejected": -1.8546364307403564,
"step": 75
},
{
"epoch": 0.16068714899240172,
"grad_norm": 0.3135383725166321,
"learning_rate": 9.89278246149752e-07,
"logits/chosen": -0.4140404760837555,
"logits/rejected": -0.38082340359687805,
"logps/chosen": -0.7513999342918396,
"logps/rejected": -0.7880118489265442,
"loss": 1.3921,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.5027998685836792,
"rewards/margins": 0.07322371751070023,
"rewards/rejected": -1.5760236978530884,
"step": 76
},
{
"epoch": 0.162801453584407,
"grad_norm": 0.8594076633453369,
"learning_rate": 9.885017319367252e-07,
"logits/chosen": -0.35951656103134155,
"logits/rejected": -0.30456626415252686,
"logps/chosen": -0.7989844679832458,
"logps/rejected": -0.8145395517349243,
"loss": 1.4061,
"rewards/accuracies": 0.4765625,
"rewards/chosen": -1.5979689359664917,
"rewards/margins": 0.031110182404518127,
"rewards/rejected": -1.6290791034698486,
"step": 77
},
{
"epoch": 0.1649157581764123,
"grad_norm": 0.4393538534641266,
"learning_rate": 9.876983993675989e-07,
"logits/chosen": -0.33119240403175354,
"logits/rejected": -0.31970253586769104,
"logps/chosen": -0.721772313117981,
"logps/rejected": -0.7868390679359436,
"loss": 1.3325,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -1.443544626235962,
"rewards/margins": 0.13013358414173126,
"rewards/rejected": -1.5736781358718872,
"step": 78
},
{
"epoch": 0.1670300627684176,
"grad_norm": 0.8017925024032593,
"learning_rate": 9.868682925446909e-07,
"logits/chosen": -0.3608989417552948,
"logits/rejected": -0.3497124910354614,
"logps/chosen": -0.8117240071296692,
"logps/rejected": -0.8506529331207275,
"loss": 1.3671,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.6234480142593384,
"rewards/margins": 0.07785768806934357,
"rewards/rejected": -1.701305866241455,
"step": 79
},
{
"epoch": 0.16914436736042285,
"grad_norm": 0.5704994201660156,
"learning_rate": 9.860114570402054e-07,
"logits/chosen": -0.37441548705101013,
"logits/rejected": -0.32188406586647034,
"logps/chosen": -0.745419442653656,
"logps/rejected": -0.816170871257782,
"loss": 1.3315,
"rewards/accuracies": 0.484375,
"rewards/chosen": -1.490838885307312,
"rewards/margins": 0.1415030062198639,
"rewards/rejected": -1.632341742515564,
"step": 80
},
{
"epoch": 0.17125867195242814,
"grad_norm": 3.0005106925964355,
"learning_rate": 9.85127939893729e-07,
"logits/chosen": -0.34791454672813416,
"logits/rejected": -0.32542383670806885,
"logps/chosen": -0.7547991275787354,
"logps/rejected": -0.7868378162384033,
"loss": 1.3742,
"rewards/accuracies": 0.515625,
"rewards/chosen": -1.5095982551574707,
"rewards/margins": 0.06407731771469116,
"rewards/rejected": -1.5736756324768066,
"step": 81
},
{
"epoch": 0.17337297654443343,
"grad_norm": 0.5891271829605103,
"learning_rate": 9.842177896096493e-07,
"logits/chosen": -0.38649702072143555,
"logits/rejected": -0.36892226338386536,
"logps/chosen": -0.7556143999099731,
"logps/rejected": -0.82858806848526,
"loss": 1.3233,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.5112287998199463,
"rewards/margins": 0.14594702422618866,
"rewards/rejected": -1.65717613697052,
"step": 82
},
{
"epoch": 0.17548728113643872,
"grad_norm": 0.27861157059669495,
"learning_rate": 9.832810561544923e-07,
"logits/chosen": -0.38264670968055725,
"logits/rejected": -0.35908499360084534,
"logps/chosen": -0.7858557105064392,
"logps/rejected": -0.8571599721908569,
"loss": 1.3234,
"rewards/accuracies": 0.578125,
"rewards/chosen": -1.5717114210128784,
"rewards/margins": 0.1426086127758026,
"rewards/rejected": -1.7143199443817139,
"step": 83
},
{
"epoch": 0.177601585728444,
"grad_norm": 0.8158763647079468,
"learning_rate": 9.823177909541793e-07,
"logits/chosen": -0.4076104760169983,
"logits/rejected": -0.3934200704097748,
"logps/chosen": -0.8089872002601624,
"logps/rejected": -0.8885407447814941,
"loss": 1.3476,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -1.6179744005203247,
"rewards/margins": 0.15910708904266357,
"rewards/rejected": -1.7770814895629883,
"step": 84
},
{
"epoch": 0.1797158903204493,
"grad_norm": 0.46008333563804626,
"learning_rate": 9.813280468912022e-07,
"logits/chosen": -0.33124151825904846,
"logits/rejected": -0.34535717964172363,
"logps/chosen": -0.733020544052124,
"logps/rejected": -0.8716557621955872,
"loss": 1.2807,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -1.466041088104248,
"rewards/margins": 0.27727028727531433,
"rewards/rejected": -1.7433115243911743,
"step": 85
},
{
"epoch": 0.18183019491245458,
"grad_norm": 0.3784334659576416,
"learning_rate": 9.80311878301722e-07,
"logits/chosen": -0.40713849663734436,
"logits/rejected": -0.3808574080467224,
"logps/chosen": -0.7063947319984436,
"logps/rejected": -0.7589148879051208,
"loss": 1.3501,
"rewards/accuracies": 0.515625,
"rewards/chosen": -1.4127894639968872,
"rewards/margins": 0.10504024475812912,
"rewards/rejected": -1.5178297758102417,
"step": 86
},
{
"epoch": 0.18394449950445987,
"grad_norm": 0.6137813329696655,
"learning_rate": 9.792693409725853e-07,
"logits/chosen": -0.4119255542755127,
"logits/rejected": -0.44221603870391846,
"logps/chosen": -0.795850932598114,
"logps/rejected": -0.8925026059150696,
"loss": 1.2987,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -1.591701865196228,
"rewards/margins": 0.19330324232578278,
"rewards/rejected": -1.7850052118301392,
"step": 87
},
{
"epoch": 0.18605880409646514,
"grad_norm": 0.5354250073432922,
"learning_rate": 9.78200492138261e-07,
"logits/chosen": -0.3792279064655304,
"logits/rejected": -0.3789527714252472,
"logps/chosen": -0.7249161005020142,
"logps/rejected": -0.8088154792785645,
"loss": 1.3304,
"rewards/accuracies": 0.515625,
"rewards/chosen": -1.4498322010040283,
"rewards/margins": 0.16779886186122894,
"rewards/rejected": -1.617630958557129,
"step": 88
},
{
"epoch": 0.18817310868847043,
"grad_norm": 0.6911243796348572,
"learning_rate": 9.771053904776995e-07,
"logits/chosen": -0.38837429881095886,
"logits/rejected": -0.36597418785095215,
"logps/chosen": -0.7528612017631531,
"logps/rejected": -0.7981135249137878,
"loss": 1.3481,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -1.5057224035263062,
"rewards/margins": 0.09050464630126953,
"rewards/rejected": -1.5962270498275757,
"step": 89
},
{
"epoch": 0.19028741328047571,
"grad_norm": 0.37110790610313416,
"learning_rate": 9.759840961111097e-07,
"logits/chosen": -0.3804919421672821,
"logits/rejected": -0.38750600814819336,
"logps/chosen": -0.8673248291015625,
"logps/rejected": -0.9381619691848755,
"loss": 1.3303,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -1.734649658203125,
"rewards/margins": 0.14167429506778717,
"rewards/rejected": -1.876323938369751,
"step": 90
},
{
"epoch": 0.192401717872481,
"grad_norm": 0.8033086657524109,
"learning_rate": 9.748366705966593e-07,
"logits/chosen": -0.3804866075515747,
"logits/rejected": -0.31055447459220886,
"logps/chosen": -0.7535511255264282,
"logps/rejected": -0.7824290990829468,
"loss": 1.3706,
"rewards/accuracies": 0.546875,
"rewards/chosen": -1.5071022510528564,
"rewards/margins": 0.057755980640649796,
"rewards/rejected": -1.5648581981658936,
"step": 91
},
{
"epoch": 0.1945160224644863,
"grad_norm": 0.6934167742729187,
"learning_rate": 9.736631769270957e-07,
"logits/chosen": -0.443461149930954,
"logits/rejected": -0.4398806691169739,
"logps/chosen": -0.8123858571052551,
"logps/rejected": -0.8972252607345581,
"loss": 1.3464,
"rewards/accuracies": 0.578125,
"rewards/chosen": -1.6247717142105103,
"rewards/margins": 0.16967862844467163,
"rewards/rejected": -1.7944505214691162,
"step": 92
},
{
"epoch": 0.19663032705649158,
"grad_norm": 1.140067458152771,
"learning_rate": 9.724636795262866e-07,
"logits/chosen": -0.43793433904647827,
"logits/rejected": -0.4402340352535248,
"logps/chosen": -0.8155819177627563,
"logps/rejected": -0.8659977912902832,
"loss": 1.3621,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.6311638355255127,
"rewards/margins": 0.10083187371492386,
"rewards/rejected": -1.7319955825805664,
"step": 93
},
{
"epoch": 0.19874463164849687,
"grad_norm": 1.8303897380828857,
"learning_rate": 9.712382442456844e-07,
"logits/chosen": -0.34288379549980164,
"logits/rejected": -0.36632782220840454,
"logps/chosen": -0.7338054776191711,
"logps/rejected": -0.8537961840629578,
"loss": 1.2942,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -1.4676109552383423,
"rewards/margins": 0.23998141288757324,
"rewards/rejected": -1.7075923681259155,
"step": 94
},
{
"epoch": 0.20085893624050213,
"grad_norm": 0.34392252564430237,
"learning_rate": 9.6998693836071e-07,
"logits/chosen": -0.4381723999977112,
"logits/rejected": -0.4031081199645996,
"logps/chosen": -0.7130292057991028,
"logps/rejected": -0.7402217388153076,
"loss": 1.3766,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -1.4260584115982056,
"rewards/margins": 0.0543849840760231,
"rewards/rejected": -1.4804434776306152,
"step": 95
},
{
"epoch": 0.20297324083250742,
"grad_norm": 0.4129842519760132,
"learning_rate": 9.687098305670604e-07,
"logits/chosen": -0.39796924591064453,
"logits/rejected": -0.3476859927177429,
"logps/chosen": -0.7520885467529297,
"logps/rejected": -0.8058558702468872,
"loss": 1.3663,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.5041770935058594,
"rewards/margins": 0.10753461718559265,
"rewards/rejected": -1.6117117404937744,
"step": 96
},
{
"epoch": 0.2050875454245127,
"grad_norm": 0.5054985284805298,
"learning_rate": 9.674069909769362e-07,
"logits/chosen": -0.3942393660545349,
"logits/rejected": -0.3627544045448303,
"logps/chosen": -0.733702540397644,
"logps/rejected": -0.781308650970459,
"loss": 1.342,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -1.467405080795288,
"rewards/margins": 0.09521210938692093,
"rewards/rejected": -1.562617301940918,
"step": 97
},
{
"epoch": 0.207201850016518,
"grad_norm": 0.6975870728492737,
"learning_rate": 9.66078491115194e-07,
"logits/chosen": -0.38557127118110657,
"logits/rejected": -0.3581204414367676,
"logps/chosen": -0.7359838485717773,
"logps/rejected": -0.7648134827613831,
"loss": 1.3841,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -1.4719676971435547,
"rewards/margins": 0.057659298181533813,
"rewards/rejected": -1.5296269655227661,
"step": 98
},
{
"epoch": 0.2093161546085233,
"grad_norm": 0.7575029730796814,
"learning_rate": 9.647244039154177e-07,
"logits/chosen": -0.3871467411518097,
"logits/rejected": -0.3941374123096466,
"logps/chosen": -0.6516871452331543,
"logps/rejected": -0.7066073417663574,
"loss": 1.3364,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -1.3033742904663086,
"rewards/margins": 0.10984040796756744,
"rewards/rejected": -1.4132146835327148,
"step": 99
},
{
"epoch": 0.21143045920052858,
"grad_norm": 1.3344250917434692,
"learning_rate": 9.633448037159166e-07,
"logits/chosen": -0.40887755155563354,
"logits/rejected": -0.41733911633491516,
"logps/chosen": -0.6978950500488281,
"logps/rejected": -0.793424129486084,
"loss": 1.3076,
"rewards/accuracies": 0.609375,
"rewards/chosen": -1.3957901000976562,
"rewards/margins": 0.1910584717988968,
"rewards/rejected": -1.586848258972168,
"step": 100
},
{
"epoch": 0.21354476379253387,
"grad_norm": 0.8798456788063049,
"learning_rate": 9.619397662556433e-07,
"logits/chosen": -0.302534282207489,
"logits/rejected": -0.29954588413238525,
"logps/chosen": -0.719552755355835,
"logps/rejected": -0.7628123164176941,
"loss": 1.3699,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -1.43910551071167,
"rewards/margins": 0.08651915192604065,
"rewards/rejected": -1.5256246328353882,
"step": 101
},
{
"epoch": 0.21565906838453916,
"grad_norm": 0.8746365308761597,
"learning_rate": 9.605093686700353e-07,
"logits/chosen": -0.372263640165329,
"logits/rejected": -0.3714321255683899,
"logps/chosen": -0.6665956974029541,
"logps/rejected": -0.7361368536949158,
"loss": 1.3173,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -1.3331913948059082,
"rewards/margins": 0.1390824019908905,
"rewards/rejected": -1.4722737073898315,
"step": 102
},
{
"epoch": 0.21777337297654442,
"grad_norm": 0.8450930714607239,
"learning_rate": 9.590536894867812e-07,
"logits/chosen": -0.37228280305862427,
"logits/rejected": -0.37763556838035583,
"logps/chosen": -0.7425979375839233,
"logps/rejected": -0.7557005882263184,
"loss": 1.4085,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -1.4851958751678467,
"rewards/margins": 0.026205357164144516,
"rewards/rejected": -1.5114011764526367,
"step": 103
},
{
"epoch": 0.2198876775685497,
"grad_norm": 0.5075035691261292,
"learning_rate": 9.575728086215091e-07,
"logits/chosen": -0.4433964788913727,
"logits/rejected": -0.3782787024974823,
"logps/chosen": -0.7308244109153748,
"logps/rejected": -0.8043883442878723,
"loss": 1.3353,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.4616488218307495,
"rewards/margins": 0.14712783694267273,
"rewards/rejected": -1.6087766885757446,
"step": 104
},
{
"epoch": 0.222001982160555,
"grad_norm": 1.0270946025848389,
"learning_rate": 9.560668073733993e-07,
"logits/chosen": -0.3593980073928833,
"logits/rejected": -0.3159312903881073,
"logps/chosen": -0.757469892501831,
"logps/rejected": -0.8256179094314575,
"loss": 1.3289,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -1.514939785003662,
"rewards/margins": 0.13629598915576935,
"rewards/rejected": -1.651235818862915,
"step": 105
},
{
"epoch": 0.2241162867525603,
"grad_norm": 0.8390078544616699,
"learning_rate": 9.54535768420721e-07,
"logits/chosen": -0.3266332149505615,
"logits/rejected": -0.3008713722229004,
"logps/chosen": -0.7286102771759033,
"logps/rejected": -0.7803273797035217,
"loss": 1.3593,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -1.4572205543518066,
"rewards/margins": 0.10343428701162338,
"rewards/rejected": -1.5606547594070435,
"step": 106
},
{
"epoch": 0.22623059134456558,
"grad_norm": 1.130595088005066,
"learning_rate": 9.529797758162934e-07,
"logits/chosen": -0.36109817028045654,
"logits/rejected": -0.34797021746635437,
"logps/chosen": -0.7723361253738403,
"logps/rejected": -0.8873662352561951,
"loss": 1.2956,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.5446722507476807,
"rewards/margins": 0.23006024956703186,
"rewards/rejected": -1.7747324705123901,
"step": 107
},
{
"epoch": 0.22834489593657087,
"grad_norm": 0.9399718642234802,
"learning_rate": 9.513989149828717e-07,
"logits/chosen": -0.3596777021884918,
"logits/rejected": -0.3660539388656616,
"logps/chosen": -0.7130635976791382,
"logps/rejected": -0.7378955483436584,
"loss": 1.3774,
"rewards/accuracies": 0.4765625,
"rewards/chosen": -1.4261271953582764,
"rewards/margins": 0.04966379329562187,
"rewards/rejected": -1.475791096687317,
"step": 108
},
{
"epoch": 0.23045920052857616,
"grad_norm": 1.097594976425171,
"learning_rate": 9.49793272708457e-07,
"logits/chosen": -0.31783169507980347,
"logits/rejected": -0.3008044362068176,
"logps/chosen": -0.6933202147483826,
"logps/rejected": -0.7510000467300415,
"loss": 1.3498,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -1.3866404294967651,
"rewards/margins": 0.1153596043586731,
"rewards/rejected": -1.502000093460083,
"step": 109
},
{
"epoch": 0.23257350512058145,
"grad_norm": 1.096330165863037,
"learning_rate": 9.481629371415313e-07,
"logits/chosen": -0.3582899570465088,
"logits/rejected": -0.3120020031929016,
"logps/chosen": -0.817268431186676,
"logps/rejected": -0.8862374424934387,
"loss": 1.3349,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.634536862373352,
"rewards/margins": 0.13793781399726868,
"rewards/rejected": -1.7724748849868774,
"step": 110
},
{
"epoch": 0.2346878097125867,
"grad_norm": 0.8261978626251221,
"learning_rate": 9.465079977862192e-07,
"logits/chosen": -0.41336673498153687,
"logits/rejected": -0.39544352889060974,
"logps/chosen": -0.7673372030258179,
"logps/rejected": -0.8331737518310547,
"loss": 1.3373,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.5346744060516357,
"rewards/margins": 0.1316729635000229,
"rewards/rejected": -1.6663475036621094,
"step": 111
},
{
"epoch": 0.236802114304592,
"grad_norm": 0.5922806262969971,
"learning_rate": 9.448285454973737e-07,
"logits/chosen": -0.3224758207798004,
"logits/rejected": -0.3118049204349518,
"logps/chosen": -0.7584627866744995,
"logps/rejected": -0.8859898447990417,
"loss": 1.2731,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.516925573348999,
"rewards/margins": 0.2550540566444397,
"rewards/rejected": -1.7719796895980835,
"step": 112
},
{
"epoch": 0.2389164188965973,
"grad_norm": 1.3172541856765747,
"learning_rate": 9.431246724755877e-07,
"logits/chosen": -0.4287208318710327,
"logits/rejected": -0.3984590172767639,
"logps/chosen": -0.7587048411369324,
"logps/rejected": -0.7860502004623413,
"loss": 1.3832,
"rewards/accuracies": 0.4765625,
"rewards/chosen": -1.5174096822738647,
"rewards/margins": 0.05469079315662384,
"rewards/rejected": -1.5721004009246826,
"step": 113
},
{
"epoch": 0.24103072348860258,
"grad_norm": 0.7749882340431213,
"learning_rate": 9.413964722621337e-07,
"logits/chosen": -0.39085906744003296,
"logits/rejected": -0.3316206932067871,
"logps/chosen": -0.7035898566246033,
"logps/rejected": -0.7375759482383728,
"loss": 1.387,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -1.4071797132492065,
"rewards/margins": 0.06797221302986145,
"rewards/rejected": -1.4751518964767456,
"step": 114
},
{
"epoch": 0.24314502808060787,
"grad_norm": 1.0914056301116943,
"learning_rate": 9.396440397338272e-07,
"logits/chosen": -0.38826486468315125,
"logits/rejected": -0.35520774126052856,
"logps/chosen": -0.7385872602462769,
"logps/rejected": -0.7974889278411865,
"loss": 1.3477,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.4771745204925537,
"rewards/margins": 0.11780343949794769,
"rewards/rejected": -1.594977855682373,
"step": 115
},
{
"epoch": 0.24525933267261316,
"grad_norm": 1.2966018915176392,
"learning_rate": 9.378674710978183e-07,
"logits/chosen": -0.36493802070617676,
"logits/rejected": -0.34763696789741516,
"logps/chosen": -0.6731826663017273,
"logps/rejected": -0.7645149827003479,
"loss": 1.3,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -1.3463653326034546,
"rewards/margins": 0.18266455829143524,
"rewards/rejected": -1.5290299654006958,
"step": 116
},
{
"epoch": 0.24737363726461845,
"grad_norm": 0.49401605129241943,
"learning_rate": 9.360668638863109e-07,
"logits/chosen": -0.40416795015335083,
"logits/rejected": -0.3815993070602417,
"logps/chosen": -0.719497799873352,
"logps/rejected": -0.7588324546813965,
"loss": 1.3621,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.438995599746704,
"rewards/margins": 0.07866920530796051,
"rewards/rejected": -1.517664909362793,
"step": 117
},
{
"epoch": 0.2494879418566237,
"grad_norm": 1.0603238344192505,
"learning_rate": 9.342423169512071e-07,
"logits/chosen": -0.3857055604457855,
"logits/rejected": -0.3524513244628906,
"logps/chosen": -0.7373769283294678,
"logps/rejected": -0.7971038818359375,
"loss": 1.3358,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.4747538566589355,
"rewards/margins": 0.11945393681526184,
"rewards/rejected": -1.594207763671875,
"step": 118
},
{
"epoch": 0.251602246448629,
"grad_norm": 0.9880490303039551,
"learning_rate": 9.323939304586804e-07,
"logits/chosen": -0.31455785036087036,
"logits/rejected": -0.3102484941482544,
"logps/chosen": -0.7276102900505066,
"logps/rejected": -0.7446941137313843,
"loss": 1.3928,
"rewards/accuracies": 0.546875,
"rewards/chosen": -1.4552205801010132,
"rewards/margins": 0.034167706966400146,
"rewards/rejected": -1.4893882274627686,
"step": 119
},
{
"epoch": 0.2537165510406343,
"grad_norm": 1.0419566631317139,
"learning_rate": 9.305218058836776e-07,
"logits/chosen": -0.38093918561935425,
"logits/rejected": -0.3588898181915283,
"logps/chosen": -0.715582013130188,
"logps/rejected": -0.8271002769470215,
"loss": 1.2934,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -1.431164026260376,
"rewards/margins": 0.22303667664527893,
"rewards/rejected": -1.654200553894043,
"step": 120
},
{
"epoch": 0.2558308556326396,
"grad_norm": 0.657620370388031,
"learning_rate": 9.286260460043473e-07,
"logits/chosen": -0.45690783858299255,
"logits/rejected": -0.4082674980163574,
"logps/chosen": -0.6932571530342102,
"logps/rejected": -0.7631082534790039,
"loss": 1.3398,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -1.3865143060684204,
"rewards/margins": 0.13970226049423218,
"rewards/rejected": -1.5262165069580078,
"step": 121
},
{
"epoch": 0.25794516022464486,
"grad_norm": 0.983686089515686,
"learning_rate": 9.267067548963974e-07,
"logits/chosen": -0.40266987681388855,
"logits/rejected": -0.37586671113967896,
"logps/chosen": -0.7362720966339111,
"logps/rejected": -0.7538987398147583,
"loss": 1.4066,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -1.4725441932678223,
"rewards/margins": 0.03525342047214508,
"rewards/rejected": -1.5077974796295166,
"step": 122
},
{
"epoch": 0.26005946481665015,
"grad_norm": 1.0076361894607544,
"learning_rate": 9.24764037927381e-07,
"logits/chosen": -0.4461461007595062,
"logits/rejected": -0.40700826048851013,
"logps/chosen": -0.7206646800041199,
"logps/rejected": -0.7489192485809326,
"loss": 1.3759,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.4413293600082397,
"rewards/margins": 0.05650928616523743,
"rewards/rejected": -1.4978384971618652,
"step": 123
},
{
"epoch": 0.26217376940865544,
"grad_norm": 0.933315098285675,
"learning_rate": 9.22798001750913e-07,
"logits/chosen": -0.3966676890850067,
"logits/rejected": -0.3572196960449219,
"logps/chosen": -0.7075096368789673,
"logps/rejected": -0.7406759262084961,
"loss": 1.3667,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -1.4150192737579346,
"rewards/margins": 0.06633266806602478,
"rewards/rejected": -1.4813518524169922,
"step": 124
},
{
"epoch": 0.26428807400066073,
"grad_norm": 0.6277392506599426,
"learning_rate": 9.20808754300814e-07,
"logits/chosen": -0.3555490970611572,
"logits/rejected": -0.35786163806915283,
"logps/chosen": -0.7549921274185181,
"logps/rejected": -0.832869291305542,
"loss": 1.3175,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -1.5099842548370361,
"rewards/margins": 0.15575438737869263,
"rewards/rejected": -1.665738582611084,
"step": 125
},
{
"epoch": 0.266402378592666,
"grad_norm": 0.7172744274139404,
"learning_rate": 9.18796404785185e-07,
"logits/chosen": -0.41230690479278564,
"logits/rejected": -0.39935630559921265,
"logps/chosen": -0.7129833698272705,
"logps/rejected": -0.7888559103012085,
"loss": 1.3167,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.425966739654541,
"rewards/margins": 0.15174514055252075,
"rewards/rejected": -1.577711820602417,
"step": 126
},
{
"epoch": 0.2685166831846713,
"grad_norm": 1.2629508972167969,
"learning_rate": 9.16761063680412e-07,
"logits/chosen": -0.36754000186920166,
"logits/rejected": -0.3541562259197235,
"logps/chosen": -0.6992133855819702,
"logps/rejected": -0.7668892741203308,
"loss": 1.3735,
"rewards/accuracies": 0.578125,
"rewards/chosen": -1.3984267711639404,
"rewards/margins": 0.13535188138484955,
"rewards/rejected": -1.5337785482406616,
"step": 127
},
{
"epoch": 0.2706309877766766,
"grad_norm": 0.7024405598640442,
"learning_rate": 9.147028427251009e-07,
"logits/chosen": -0.4014585018157959,
"logits/rejected": -0.40560898184776306,
"logps/chosen": -0.727234959602356,
"logps/rejected": -0.8070081472396851,
"loss": 1.3138,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.454469919204712,
"rewards/margins": 0.15954652428627014,
"rewards/rejected": -1.6140162944793701,
"step": 128
},
{
"epoch": 0.2727452923686819,
"grad_norm": 1.6173532009124756,
"learning_rate": 9.126218549139433e-07,
"logits/chosen": -0.32572367787361145,
"logits/rejected": -0.3470613956451416,
"logps/chosen": -0.7555541396141052,
"logps/rejected": -0.8856738209724426,
"loss": 1.2461,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -1.5111082792282104,
"rewards/margins": 0.26023951172828674,
"rewards/rejected": -1.7713476419448853,
"step": 129
},
{
"epoch": 0.2748595969606871,
"grad_norm": 0.5878487229347229,
"learning_rate": 9.105182144915129e-07,
"logits/chosen": -0.39267170429229736,
"logits/rejected": -0.3448992967605591,
"logps/chosen": -0.6776289343833923,
"logps/rejected": -0.7530183792114258,
"loss": 1.3242,
"rewards/accuracies": 0.609375,
"rewards/chosen": -1.3552578687667847,
"rewards/margins": 0.15077897906303406,
"rewards/rejected": -1.5060367584228516,
"step": 130
},
{
"epoch": 0.2769739015526924,
"grad_norm": 0.43264809250831604,
"learning_rate": 9.08392036945994e-07,
"logits/chosen": -0.39980950951576233,
"logits/rejected": -0.4247930645942688,
"logps/chosen": -0.7898982167243958,
"logps/rejected": -0.8856299519538879,
"loss": 1.3004,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.5797964334487915,
"rewards/margins": 0.19146347045898438,
"rewards/rejected": -1.7712599039077759,
"step": 131
},
{
"epoch": 0.2790882061446977,
"grad_norm": 1.0348538160324097,
"learning_rate": 9.062434390028407e-07,
"logits/chosen": -0.35729700326919556,
"logits/rejected": -0.3265542984008789,
"logps/chosen": -0.7120587229728699,
"logps/rejected": -0.771691083908081,
"loss": 1.3374,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.4241174459457397,
"rewards/margins": 0.11926469206809998,
"rewards/rejected": -1.543382167816162,
"step": 132
},
{
"epoch": 0.281202510736703,
"grad_norm": 2.0902225971221924,
"learning_rate": 9.04072538618369e-07,
"logits/chosen": -0.4942469298839569,
"logits/rejected": -0.48699846863746643,
"logps/chosen": -0.7882512211799622,
"logps/rejected": -0.8270165920257568,
"loss": 1.3715,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.5765024423599243,
"rewards/margins": 0.07753071188926697,
"rewards/rejected": -1.6540331840515137,
"step": 133
},
{
"epoch": 0.2833168153287083,
"grad_norm": 1.6436113119125366,
"learning_rate": 9.018794549732817e-07,
"logits/chosen": -0.41133156418800354,
"logits/rejected": -0.4146718382835388,
"logps/chosen": -0.779824435710907,
"logps/rejected": -0.9421006441116333,
"loss": 1.2521,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.559648871421814,
"rewards/margins": 0.324552446603775,
"rewards/rejected": -1.8842012882232666,
"step": 134
},
{
"epoch": 0.28543111992071357,
"grad_norm": 0.8831859827041626,
"learning_rate": 8.996643084661244e-07,
"logits/chosen": -0.42452165484428406,
"logits/rejected": -0.3798604905605316,
"logps/chosen": -0.6499216556549072,
"logps/rejected": -0.7796702980995178,
"loss": 1.2581,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.2998433113098145,
"rewards/margins": 0.25949734449386597,
"rewards/rejected": -1.5593405961990356,
"step": 135
},
{
"epoch": 0.28754542451271886,
"grad_norm": 0.8031218647956848,
"learning_rate": 8.974272207066767e-07,
"logits/chosen": -0.38131940364837646,
"logits/rejected": -0.3854255676269531,
"logps/chosen": -0.7026851773262024,
"logps/rejected": -0.762391209602356,
"loss": 1.3333,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.4053703546524048,
"rewards/margins": 0.11941206455230713,
"rewards/rejected": -1.524782419204712,
"step": 136
},
{
"epoch": 0.28965972910472415,
"grad_norm": 1.4455821514129639,
"learning_rate": 8.951683145092748e-07,
"logits/chosen": -0.42824965715408325,
"logits/rejected": -0.4320424795150757,
"logps/chosen": -0.7893270254135132,
"logps/rejected": -0.8517144322395325,
"loss": 1.3652,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -1.5786540508270264,
"rewards/margins": 0.12477481365203857,
"rewards/rejected": -1.703428864479065,
"step": 137
},
{
"epoch": 0.29177403369672944,
"grad_norm": 0.6299450397491455,
"learning_rate": 8.928877138860706e-07,
"logits/chosen": -0.4388589560985565,
"logits/rejected": -0.40156903862953186,
"logps/chosen": -0.7346572875976562,
"logps/rejected": -0.8166492581367493,
"loss": 1.3134,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.4693145751953125,
"rewards/margins": 0.16398391127586365,
"rewards/rejected": -1.6332985162734985,
"step": 138
},
{
"epoch": 0.29388833828873473,
"grad_norm": 2.784437417984009,
"learning_rate": 8.905855440402224e-07,
"logits/chosen": -0.405662477016449,
"logits/rejected": -0.35549795627593994,
"logps/chosen": -0.7482771277427673,
"logps/rejected": -0.795568585395813,
"loss": 1.3656,
"rewards/accuracies": 0.578125,
"rewards/chosen": -1.4965542554855347,
"rewards/margins": 0.09458285570144653,
"rewards/rejected": -1.591137170791626,
"step": 139
},
{
"epoch": 0.29600264288074,
"grad_norm": 0.4958692193031311,
"learning_rate": 8.882619313590212e-07,
"logits/chosen": -0.3814452886581421,
"logits/rejected": -0.35715553164482117,
"logps/chosen": -0.7731542587280273,
"logps/rejected": -0.8285202980041504,
"loss": 1.3776,
"rewards/accuracies": 0.515625,
"rewards/chosen": -1.5463085174560547,
"rewards/margins": 0.11073210835456848,
"rewards/rejected": -1.6570405960083008,
"step": 140
},
{
"epoch": 0.2981169474727453,
"grad_norm": 0.4597362279891968,
"learning_rate": 8.859170034069532e-07,
"logits/chosen": -0.388383150100708,
"logits/rejected": -0.4071737229824066,
"logps/chosen": -0.7263504266738892,
"logps/rejected": -0.769676148891449,
"loss": 1.3712,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.4527008533477783,
"rewards/margins": 0.08665145933628082,
"rewards/rejected": -1.539352297782898,
"step": 141
},
{
"epoch": 0.3002312520647506,
"grad_norm": 0.4914930760860443,
"learning_rate": 8.835508889186956e-07,
"logits/chosen": -0.41084378957748413,
"logits/rejected": -0.3823031187057495,
"logps/chosen": -0.7565821409225464,
"logps/rejected": -0.9084322452545166,
"loss": 1.2717,
"rewards/accuracies": 0.578125,
"rewards/chosen": -1.5131642818450928,
"rewards/margins": 0.3037002384662628,
"rewards/rejected": -1.8168644905090332,
"step": 142
},
{
"epoch": 0.3023455566567559,
"grad_norm": 2.0075581073760986,
"learning_rate": 8.811637177920499e-07,
"logits/chosen": -0.4438302516937256,
"logits/rejected": -0.4916025698184967,
"logps/chosen": -0.800719141960144,
"logps/rejected": -0.8658267855644226,
"loss": 1.358,
"rewards/accuracies": 0.515625,
"rewards/chosen": -1.601438283920288,
"rewards/margins": 0.1302153617143631,
"rewards/rejected": -1.7316535711288452,
"step": 143
},
{
"epoch": 0.3044598612487612,
"grad_norm": 1.1243022680282593,
"learning_rate": 8.7875562108081e-07,
"logits/chosen": -0.40519949793815613,
"logits/rejected": -0.3905750811100006,
"logps/chosen": -0.689585268497467,
"logps/rejected": -0.7312421798706055,
"loss": 1.3503,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.379170536994934,
"rewards/margins": 0.08331384509801865,
"rewards/rejected": -1.462484359741211,
"step": 144
},
{
"epoch": 0.3065741658407664,
"grad_norm": 0.7543137669563293,
"learning_rate": 8.76326730987568e-07,
"logits/chosen": -0.4696752727031708,
"logits/rejected": -0.4357326626777649,
"logps/chosen": -0.7813425660133362,
"logps/rejected": -0.8276973962783813,
"loss": 1.3794,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -1.5626851320266724,
"rewards/margins": 0.09270970523357391,
"rewards/rejected": -1.6553947925567627,
"step": 145
},
{
"epoch": 0.3086884704327717,
"grad_norm": 1.3136053085327148,
"learning_rate": 8.738771808564555e-07,
"logits/chosen": -0.4262731075286865,
"logits/rejected": -0.44038820266723633,
"logps/chosen": -0.697494387626648,
"logps/rejected": -0.8369535803794861,
"loss": 1.2699,
"rewards/accuracies": 0.609375,
"rewards/chosen": -1.394988775253296,
"rewards/margins": 0.2789183557033539,
"rewards/rejected": -1.6739071607589722,
"step": 146
},
{
"epoch": 0.310802775024777,
"grad_norm": 2.221562385559082,
"learning_rate": 8.714071051658245e-07,
"logits/chosen": -0.40089336037635803,
"logits/rejected": -0.37991875410079956,
"logps/chosen": -0.7704445123672485,
"logps/rejected": -0.859091579914093,
"loss": 1.2987,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -1.540889024734497,
"rewards/margins": 0.17729414999485016,
"rewards/rejected": -1.718183159828186,
"step": 147
},
{
"epoch": 0.3129170796167823,
"grad_norm": 1.5049912929534912,
"learning_rate": 8.689166395208636e-07,
"logits/chosen": -0.38984015583992004,
"logits/rejected": -0.35900723934173584,
"logps/chosen": -0.6424779891967773,
"logps/rejected": -0.7145389318466187,
"loss": 1.3261,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.2849559783935547,
"rewards/margins": 0.14412200450897217,
"rewards/rejected": -1.4290778636932373,
"step": 148
},
{
"epoch": 0.31503138420878757,
"grad_norm": 0.36125388741493225,
"learning_rate": 8.664059206461534e-07,
"logits/chosen": -0.3490441143512726,
"logits/rejected": -0.3219914436340332,
"logps/chosen": -0.7200264930725098,
"logps/rejected": -0.7924249768257141,
"loss": 1.3476,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.4400529861450195,
"rewards/margins": 0.1447969526052475,
"rewards/rejected": -1.5848499536514282,
"step": 149
},
{
"epoch": 0.31714568880079286,
"grad_norm": 1.039840579032898,
"learning_rate": 8.638750863781612e-07,
"logits/chosen": -0.40701645612716675,
"logits/rejected": -0.406186580657959,
"logps/chosen": -0.7083575129508972,
"logps/rejected": -0.7766748070716858,
"loss": 1.3263,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -1.4167150259017944,
"rewards/margins": 0.1366347074508667,
"rewards/rejected": -1.5533496141433716,
"step": 150
},
{
"epoch": 0.31925999339279815,
"grad_norm": 0.7128564119338989,
"learning_rate": 8.613242756576728e-07,
"logits/chosen": -0.40932926535606384,
"logits/rejected": -0.4234562814235687,
"logps/chosen": -0.6775843501091003,
"logps/rejected": -0.7866222858428955,
"loss": 1.2834,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -1.3551687002182007,
"rewards/margins": 0.2180757373571396,
"rewards/rejected": -1.573244571685791,
"step": 151
},
{
"epoch": 0.32137429798480344,
"grad_norm": 1.1701059341430664,
"learning_rate": 8.587536285221655e-07,
"logits/chosen": -0.3654797077178955,
"logits/rejected": -0.3181680738925934,
"logps/chosen": -0.6686022877693176,
"logps/rejected": -0.7058504223823547,
"loss": 1.3612,
"rewards/accuracies": 0.546875,
"rewards/chosen": -1.3372045755386353,
"rewards/margins": 0.07449636608362198,
"rewards/rejected": -1.4117008447647095,
"step": 152
},
{
"epoch": 0.3234886025768087,
"grad_norm": 0.8239700794219971,
"learning_rate": 8.561632860981204e-07,
"logits/chosen": -0.42527130246162415,
"logits/rejected": -0.4091627299785614,
"logps/chosen": -0.6969794631004333,
"logps/rejected": -0.8019355535507202,
"loss": 1.2974,
"rewards/accuracies": 0.578125,
"rewards/chosen": -1.3939589262008667,
"rewards/margins": 0.20991206169128418,
"rewards/rejected": -1.6038711071014404,
"step": 153
},
{
"epoch": 0.325602907168814,
"grad_norm": 1.4885636568069458,
"learning_rate": 8.535533905932737e-07,
"logits/chosen": -0.4126192331314087,
"logits/rejected": -0.41548141837120056,
"logps/chosen": -0.7076549530029297,
"logps/rejected": -0.7940821051597595,
"loss": 1.3198,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.4153099060058594,
"rewards/margins": 0.17285437881946564,
"rewards/rejected": -1.588164210319519,
"step": 154
},
{
"epoch": 0.3277172117608193,
"grad_norm": 1.439434289932251,
"learning_rate": 8.509240852888106e-07,
"logits/chosen": -0.3763914704322815,
"logits/rejected": -0.3617165684700012,
"logps/chosen": -0.7189474105834961,
"logps/rejected": -0.827629804611206,
"loss": 1.2816,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.4378948211669922,
"rewards/margins": 0.2173648476600647,
"rewards/rejected": -1.655259609222412,
"step": 155
},
{
"epoch": 0.3298315163528246,
"grad_norm": 1.4505418539047241,
"learning_rate": 8.482755145314985e-07,
"logits/chosen": -0.37879478931427,
"logits/rejected": -0.38689684867858887,
"logps/chosen": -0.7011865973472595,
"logps/rejected": -0.8019431829452515,
"loss": 1.3158,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -1.402373194694519,
"rewards/margins": 0.2015131413936615,
"rewards/rejected": -1.603886365890503,
"step": 156
},
{
"epoch": 0.3319458209448299,
"grad_norm": 2.0968713760375977,
"learning_rate": 8.45607823725763e-07,
"logits/chosen": -0.4366365075111389,
"logits/rejected": -0.41210681200027466,
"logps/chosen": -0.6455651521682739,
"logps/rejected": -0.7228428721427917,
"loss": 1.3247,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -1.2911303043365479,
"rewards/margins": 0.1545555144548416,
"rewards/rejected": -1.4456857442855835,
"step": 157
},
{
"epoch": 0.3340601255368352,
"grad_norm": 0.6716106534004211,
"learning_rate": 8.429211593257052e-07,
"logits/chosen": -0.42992207407951355,
"logits/rejected": -0.4105672836303711,
"logps/chosen": -0.6981461048126221,
"logps/rejected": -0.7909567952156067,
"loss": 1.3128,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -1.3962922096252441,
"rewards/margins": 0.1856214702129364,
"rewards/rejected": -1.5819135904312134,
"step": 158
},
{
"epoch": 0.33617443012884046,
"grad_norm": 2.4430501461029053,
"learning_rate": 8.402156688270612e-07,
"logits/chosen": -0.4184916317462921,
"logits/rejected": -0.3943992257118225,
"logps/chosen": -0.6568948030471802,
"logps/rejected": -0.7506390810012817,
"loss": 1.2992,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -1.3137896060943604,
"rewards/margins": 0.18748846650123596,
"rewards/rejected": -1.5012781620025635,
"step": 159
},
{
"epoch": 0.3382887347208457,
"grad_norm": 2.0322091579437256,
"learning_rate": 8.374915007591052e-07,
"logits/chosen": -0.4713057577610016,
"logits/rejected": -0.42163771390914917,
"logps/chosen": -0.7347853779792786,
"logps/rejected": -0.7770044207572937,
"loss": 1.3801,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -1.4695707559585571,
"rewards/margins": 0.0844380110502243,
"rewards/rejected": -1.5540088415145874,
"step": 160
},
{
"epoch": 0.340403039312851,
"grad_norm": 0.4045500159263611,
"learning_rate": 8.347488046764948e-07,
"logits/chosen": -0.39465126395225525,
"logits/rejected": -0.3961923122406006,
"logps/chosen": -0.601732075214386,
"logps/rejected": -0.694148600101471,
"loss": 1.2859,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -1.203464150428772,
"rewards/margins": 0.18483319878578186,
"rewards/rejected": -1.388297200202942,
"step": 161
},
{
"epoch": 0.3425173439048563,
"grad_norm": 2.79396915435791,
"learning_rate": 8.319877311510612e-07,
"logits/chosen": -0.4311378002166748,
"logits/rejected": -0.4248836636543274,
"logps/chosen": -0.6813413500785828,
"logps/rejected": -0.775830864906311,
"loss": 1.3001,
"rewards/accuracies": 0.578125,
"rewards/chosen": -1.3626827001571655,
"rewards/margins": 0.18897925317287445,
"rewards/rejected": -1.551661729812622,
"step": 162
},
{
"epoch": 0.34463164849686156,
"grad_norm": 0.714146077632904,
"learning_rate": 8.292084317635419e-07,
"logits/chosen": -0.4060715436935425,
"logits/rejected": -0.3770482540130615,
"logps/chosen": -0.7176523208618164,
"logps/rejected": -0.7973593473434448,
"loss": 1.324,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -1.4353046417236328,
"rewards/margins": 0.15941408276557922,
"rewards/rejected": -1.5947186946868896,
"step": 163
},
{
"epoch": 0.34674595308886685,
"grad_norm": 1.6007037162780762,
"learning_rate": 8.264110590952607e-07,
"logits/chosen": -0.49063974618911743,
"logits/rejected": -0.5119628310203552,
"logps/chosen": -0.7263911366462708,
"logps/rejected": -0.9138184785842896,
"loss": 1.2439,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -1.4527822732925415,
"rewards/margins": 0.3748546540737152,
"rewards/rejected": -1.827636957168579,
"step": 164
},
{
"epoch": 0.34886025768087214,
"grad_norm": 1.4566830396652222,
"learning_rate": 8.235957667197494e-07,
"logits/chosen": -0.4681779146194458,
"logits/rejected": -0.46475380659103394,
"logps/chosen": -0.6923782229423523,
"logps/rejected": -0.7901281118392944,
"loss": 1.295,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.3847564458847046,
"rewards/margins": 0.19549959897994995,
"rewards/rejected": -1.5802562236785889,
"step": 165
},
{
"epoch": 0.35097456227287743,
"grad_norm": 3.0825328826904297,
"learning_rate": 8.207627091943177e-07,
"logits/chosen": -0.4294862151145935,
"logits/rejected": -0.42411237955093384,
"logps/chosen": -0.6851246356964111,
"logps/rejected": -0.7844961881637573,
"loss": 1.2871,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3702492713928223,
"rewards/margins": 0.19874317944049835,
"rewards/rejected": -1.5689923763275146,
"step": 166
},
{
"epoch": 0.3530888668648827,
"grad_norm": 1.0783339738845825,
"learning_rate": 8.179120420515675e-07,
"logits/chosen": -0.4528030455112457,
"logits/rejected": -0.4626815617084503,
"logps/chosen": -0.703376293182373,
"logps/rejected": -0.8752757906913757,
"loss": 1.2193,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.406752586364746,
"rewards/margins": 0.34379899501800537,
"rewards/rejected": -1.7505515813827515,
"step": 167
},
{
"epoch": 0.355203171456888,
"grad_norm": 2.6788036823272705,
"learning_rate": 8.150439217908556e-07,
"logits/chosen": -0.44946759939193726,
"logits/rejected": -0.47430264949798584,
"logps/chosen": -0.751136839389801,
"logps/rejected": -0.874577522277832,
"loss": 1.29,
"rewards/accuracies": 0.578125,
"rewards/chosen": -1.502273678779602,
"rewards/margins": 0.24688144028186798,
"rewards/rejected": -1.749155044555664,
"step": 168
},
{
"epoch": 0.3573174760488933,
"grad_norm": 0.9087730050086975,
"learning_rate": 8.121585058696999e-07,
"logits/chosen": -0.47294262051582336,
"logits/rejected": -0.46765226125717163,
"logps/chosen": -0.7291173934936523,
"logps/rejected": -0.7999277114868164,
"loss": 1.3482,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -1.4582347869873047,
"rewards/margins": 0.1416206806898117,
"rewards/rejected": -1.5998554229736328,
"step": 169
},
{
"epoch": 0.3594317806408986,
"grad_norm": 3.392674207687378,
"learning_rate": 8.092559526951374e-07,
"logits/chosen": -0.5026620626449585,
"logits/rejected": -0.46620574593544006,
"logps/chosen": -0.746992290019989,
"logps/rejected": -0.8266301155090332,
"loss": 1.3202,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -1.493984580039978,
"rewards/margins": 0.15927578508853912,
"rewards/rejected": -1.6532602310180664,
"step": 170
},
{
"epoch": 0.3615460852329039,
"grad_norm": 1.27628755569458,
"learning_rate": 8.063364216150256e-07,
"logits/chosen": -0.5211395025253296,
"logits/rejected": -0.5419963598251343,
"logps/chosen": -0.7919114828109741,
"logps/rejected": -0.8731362223625183,
"loss": 1.3228,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.5838229656219482,
"rewards/margins": 0.16244953870773315,
"rewards/rejected": -1.7462724447250366,
"step": 171
},
{
"epoch": 0.36366038982490917,
"grad_norm": 0.8269656896591187,
"learning_rate": 8.034000729092967e-07,
"logits/chosen": -0.49545183777809143,
"logits/rejected": -0.4716613292694092,
"logps/chosen": -0.719520092010498,
"logps/rejected": -0.7876347303390503,
"loss": 1.3367,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.439040184020996,
"rewards/margins": 0.13622930645942688,
"rewards/rejected": -1.5752694606781006,
"step": 172
},
{
"epoch": 0.36577469441691446,
"grad_norm": 0.6049383282661438,
"learning_rate": 8.004470677811559e-07,
"logits/chosen": -0.45276379585266113,
"logits/rejected": -0.42617955803871155,
"logps/chosen": -0.7097947597503662,
"logps/rejected": -0.7606989145278931,
"loss": 1.3909,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.4195895195007324,
"rewards/margins": 0.10180822014808655,
"rewards/rejected": -1.5213978290557861,
"step": 173
},
{
"epoch": 0.36788899900891975,
"grad_norm": 3.980013847351074,
"learning_rate": 7.974775683482337e-07,
"logits/chosen": -0.4783569574356079,
"logits/rejected": -0.43521156907081604,
"logps/chosen": -0.7623491287231445,
"logps/rejected": -0.8719285130500793,
"loss": 1.2838,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -1.524698257446289,
"rewards/margins": 0.2191585898399353,
"rewards/rejected": -1.7438570261001587,
"step": 174
},
{
"epoch": 0.370003303600925,
"grad_norm": 1.024530053138733,
"learning_rate": 7.94491737633684e-07,
"logits/chosen": -0.5009916424751282,
"logits/rejected": -0.48874592781066895,
"logps/chosen": -0.7552992701530457,
"logps/rejected": -0.8485872745513916,
"loss": 1.3153,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -1.5105985403060913,
"rewards/margins": 0.18657605350017548,
"rewards/rejected": -1.6971745491027832,
"step": 175
},
{
"epoch": 0.37211760819293027,
"grad_norm": 1.5952919721603394,
"learning_rate": 7.91489739557236e-07,
"logits/chosen": -0.4424138069152832,
"logits/rejected": -0.4334307312965393,
"logps/chosen": -0.6956002116203308,
"logps/rejected": -0.8018803000450134,
"loss": 1.3011,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.3912004232406616,
"rewards/margins": 0.21256020665168762,
"rewards/rejected": -1.6037606000900269,
"step": 176
},
{
"epoch": 0.37423191278493556,
"grad_norm": 1.8331164121627808,
"learning_rate": 7.884717389261934e-07,
"logits/chosen": -0.4836267828941345,
"logits/rejected": -0.5018677115440369,
"logps/chosen": -0.7895969152450562,
"logps/rejected": -0.927432656288147,
"loss": 1.2467,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.5791938304901123,
"rewards/margins": 0.27567166090011597,
"rewards/rejected": -1.854865312576294,
"step": 177
},
{
"epoch": 0.37634621737694085,
"grad_norm": 2.165984869003296,
"learning_rate": 7.854379014263876e-07,
"logits/chosen": -0.46125832200050354,
"logits/rejected": -0.39802712202072144,
"logps/chosen": -0.8382925391197205,
"logps/rejected": -0.9422982931137085,
"loss": 1.339,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -1.676585078239441,
"rewards/margins": 0.20801125466823578,
"rewards/rejected": -1.884596586227417,
"step": 178
},
{
"epoch": 0.37846052196894614,
"grad_norm": 0.522197425365448,
"learning_rate": 7.823883936130817e-07,
"logits/chosen": -0.4747823476791382,
"logits/rejected": -0.4888593554496765,
"logps/chosen": -0.723059892654419,
"logps/rejected": -0.84626305103302,
"loss": 1.2708,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -1.446119785308838,
"rewards/margins": 0.24640652537345886,
"rewards/rejected": -1.69252610206604,
"step": 179
},
{
"epoch": 0.38057482656095143,
"grad_norm": 1.9690748453140259,
"learning_rate": 7.793233829018262e-07,
"logits/chosen": -0.5430271625518799,
"logits/rejected": -0.5403288006782532,
"logps/chosen": -0.8244275450706482,
"logps/rejected": -0.9133931994438171,
"loss": 1.3306,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.6488550901412964,
"rewards/margins": 0.17793115973472595,
"rewards/rejected": -1.8267863988876343,
"step": 180
},
{
"epoch": 0.3826891311529567,
"grad_norm": 2.9181363582611084,
"learning_rate": 7.762430375592688e-07,
"logits/chosen": -0.4843495786190033,
"logits/rejected": -0.47929176688194275,
"logps/chosen": -0.8097372055053711,
"logps/rejected": -0.8973760008811951,
"loss": 1.3283,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.6194744110107422,
"rewards/margins": 0.17527759075164795,
"rewards/rejected": -1.7947520017623901,
"step": 181
},
{
"epoch": 0.384803435744962,
"grad_norm": 4.227083683013916,
"learning_rate": 7.731475266939158e-07,
"logits/chosen": -0.5047686696052551,
"logits/rejected": -0.4921850264072418,
"logps/chosen": -0.875984787940979,
"logps/rejected": -1.0406755208969116,
"loss": 1.3169,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.751969575881958,
"rewards/margins": 0.32938146591186523,
"rewards/rejected": -2.0813510417938232,
"step": 182
},
{
"epoch": 0.3869177403369673,
"grad_norm": 1.2871490716934204,
"learning_rate": 7.700370202468489e-07,
"logits/chosen": -0.5123783349990845,
"logits/rejected": -0.55179762840271,
"logps/chosen": -0.8869211077690125,
"logps/rejected": -1.1082773208618164,
"loss": 1.216,
"rewards/accuracies": 0.640625,
"rewards/chosen": -1.773842215538025,
"rewards/margins": 0.4427123963832855,
"rewards/rejected": -2.216554641723633,
"step": 183
},
{
"epoch": 0.3890320449289726,
"grad_norm": 1.3015679121017456,
"learning_rate": 7.669116889823954e-07,
"logits/chosen": -0.49182361364364624,
"logits/rejected": -0.5180585384368896,
"logps/chosen": -0.8816227912902832,
"logps/rejected": -0.9516821503639221,
"loss": 1.3449,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -1.7632455825805664,
"rewards/margins": 0.14011862874031067,
"rewards/rejected": -1.9033643007278442,
"step": 184
},
{
"epoch": 0.3911463495209779,
"grad_norm": 4.280956268310547,
"learning_rate": 7.637717044787526e-07,
"logits/chosen": -0.5702117681503296,
"logits/rejected": -0.5475804209709167,
"logps/chosen": -0.9307697415351868,
"logps/rejected": -1.0322346687316895,
"loss": 1.3434,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -1.8615394830703735,
"rewards/margins": 0.20292985439300537,
"rewards/rejected": -2.064469337463379,
"step": 185
},
{
"epoch": 0.39326065411298317,
"grad_norm": 1.3511455059051514,
"learning_rate": 7.606172391185699e-07,
"logits/chosen": -0.5466108322143555,
"logits/rejected": -0.551085352897644,
"logps/chosen": -1.0657893419265747,
"logps/rejected": -1.15786612033844,
"loss": 1.3549,
"rewards/accuracies": 0.4609375,
"rewards/chosen": -2.1315786838531494,
"rewards/margins": 0.18415334820747375,
"rewards/rejected": -2.31573224067688,
"step": 186
},
{
"epoch": 0.39537495870498846,
"grad_norm": 0.7001176476478577,
"learning_rate": 7.574484660794836e-07,
"logits/chosen": -0.4849010407924652,
"logits/rejected": -0.5057946443557739,
"logps/chosen": -1.0784757137298584,
"logps/rejected": -1.2035218477249146,
"loss": 1.3556,
"rewards/accuracies": 0.4921875,
"rewards/chosen": -2.156951427459717,
"rewards/margins": 0.25009211897850037,
"rewards/rejected": -2.407043695449829,
"step": 187
},
{
"epoch": 0.39748926329699374,
"grad_norm": 3.1405649185180664,
"learning_rate": 7.542655593246103e-07,
"logits/chosen": -0.5316596031188965,
"logits/rejected": -0.5658366680145264,
"logps/chosen": -1.0630009174346924,
"logps/rejected": -1.2867177724838257,
"loss": 1.2612,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -2.1260018348693848,
"rewards/margins": 0.447433739900589,
"rewards/rejected": -2.5734355449676514,
"step": 188
},
{
"epoch": 0.39960356788899903,
"grad_norm": 2.142986297607422,
"learning_rate": 7.510686935929962e-07,
"logits/chosen": -0.5959028005599976,
"logits/rejected": -0.5836039781570435,
"logps/chosen": -1.111003041267395,
"logps/rejected": -1.1858208179473877,
"loss": 1.3958,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -2.22200608253479,
"rewards/margins": 0.149635449051857,
"rewards/rejected": -2.3716416358947754,
"step": 189
},
{
"epoch": 0.40171787248100427,
"grad_norm": 1.9227335453033447,
"learning_rate": 7.478580443900246e-07,
"logits/chosen": -0.607532799243927,
"logits/rejected": -0.6102017760276794,
"logps/chosen": -1.3353261947631836,
"logps/rejected": -1.3975369930267334,
"loss": 1.457,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -2.670652389526367,
"rewards/margins": 0.12442154437303543,
"rewards/rejected": -2.795073986053467,
"step": 190
},
{
"epoch": 0.40383217707300956,
"grad_norm": 0.8509105443954468,
"learning_rate": 7.446337879777802e-07,
"logits/chosen": -0.5903070569038391,
"logits/rejected": -0.5728173851966858,
"logps/chosen": -1.27094566822052,
"logps/rejected": -1.3024815320968628,
"loss": 1.4953,
"rewards/accuracies": 0.5,
"rewards/chosen": -2.54189133644104,
"rewards/margins": 0.06307169049978256,
"rewards/rejected": -2.6049630641937256,
"step": 191
},
{
"epoch": 0.40594648166501485,
"grad_norm": 1.1561088562011719,
"learning_rate": 7.413961013653725e-07,
"logits/chosen": -0.5578102469444275,
"logits/rejected": -0.5907329320907593,
"logps/chosen": -1.3817013502120972,
"logps/rejected": -1.419295072555542,
"loss": 1.4865,
"rewards/accuracies": 0.515625,
"rewards/chosen": -2.7634027004241943,
"rewards/margins": 0.07518734782934189,
"rewards/rejected": -2.838590145111084,
"step": 192
},
{
"epoch": 0.40806078625702014,
"grad_norm": 8.165387153625488,
"learning_rate": 7.381451622992183e-07,
"logits/chosen": -0.5213198661804199,
"logits/rejected": -0.5392848253250122,
"logps/chosen": -1.1798306703567505,
"logps/rejected": -1.2692899703979492,
"loss": 1.3971,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.359661340713501,
"rewards/margins": 0.17891867458820343,
"rewards/rejected": -2.5385799407958984,
"step": 193
},
{
"epoch": 0.4101750908490254,
"grad_norm": 1.2850884199142456,
"learning_rate": 7.348811492532839e-07,
"logits/chosen": -0.5382787585258484,
"logits/rejected": -0.5274642705917358,
"logps/chosen": -1.242587685585022,
"logps/rejected": -1.272438645362854,
"loss": 1.4795,
"rewards/accuracies": 0.4921875,
"rewards/chosen": -2.485175371170044,
"rewards/margins": 0.05970197170972824,
"rewards/rejected": -2.544877290725708,
"step": 194
},
{
"epoch": 0.4122893954410307,
"grad_norm": 4.910929203033447,
"learning_rate": 7.316042414192864e-07,
"logits/chosen": -0.6186666488647461,
"logits/rejected": -0.6255884170532227,
"logps/chosen": -1.1743704080581665,
"logps/rejected": -1.2720146179199219,
"loss": 1.4127,
"rewards/accuracies": 0.546875,
"rewards/chosen": -2.348740816116333,
"rewards/margins": 0.19528816640377045,
"rewards/rejected": -2.5440292358398438,
"step": 195
},
{
"epoch": 0.414403700033036,
"grad_norm": 4.270901203155518,
"learning_rate": 7.283146186968565e-07,
"logits/chosen": -0.5861366987228394,
"logits/rejected": -0.6005197763442993,
"logps/chosen": -1.2127022743225098,
"logps/rejected": -1.3036490678787231,
"loss": 1.4067,
"rewards/accuracies": 0.546875,
"rewards/chosen": -2.4254045486450195,
"rewards/margins": 0.18189355731010437,
"rewards/rejected": -2.6072981357574463,
"step": 196
},
{
"epoch": 0.4165180046250413,
"grad_norm": 0.3070116639137268,
"learning_rate": 7.250124616836622e-07,
"logits/chosen": -0.6026022434234619,
"logits/rejected": -0.5920048952102661,
"logps/chosen": -1.0706496238708496,
"logps/rejected": -1.2879594564437866,
"loss": 1.2465,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -2.141299247741699,
"rewards/margins": 0.4346192479133606,
"rewards/rejected": -2.5759189128875732,
"step": 197
},
{
"epoch": 0.4186323092170466,
"grad_norm": 1.160252571105957,
"learning_rate": 7.216979516654943e-07,
"logits/chosen": -0.5808722376823425,
"logits/rejected": -0.5770124197006226,
"logps/chosen": -1.0426011085510254,
"logps/rejected": -1.1295092105865479,
"loss": 1.4244,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.085202217102051,
"rewards/margins": 0.1738162338733673,
"rewards/rejected": -2.2590184211730957,
"step": 198
},
{
"epoch": 0.4207466138090519,
"grad_norm": 4.6966471672058105,
"learning_rate": 7.183712706063132e-07,
"logits/chosen": -0.5958350896835327,
"logits/rejected": -0.6440161466598511,
"logps/chosen": -0.981076717376709,
"logps/rejected": -1.1257147789001465,
"loss": 1.3175,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -1.962153434753418,
"rewards/margins": 0.28927627205848694,
"rewards/rejected": -2.251429557800293,
"step": 199
},
{
"epoch": 0.42286091840105716,
"grad_norm": 2.9395248889923096,
"learning_rate": 7.150326011382603e-07,
"logits/chosen": -0.5647889375686646,
"logits/rejected": -0.5762943625450134,
"logps/chosen": -0.8101261854171753,
"logps/rejected": -1.0001438856124878,
"loss": 1.2135,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.6202523708343506,
"rewards/margins": 0.38003528118133545,
"rewards/rejected": -2.0002877712249756,
"step": 200
},
{
"epoch": 0.42497522299306245,
"grad_norm": 1.2575147151947021,
"learning_rate": 7.116821265516306e-07,
"logits/chosen": -0.5834293961524963,
"logits/rejected": -0.5929508805274963,
"logps/chosen": -0.8768399953842163,
"logps/rejected": -1.0942046642303467,
"loss": 1.219,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -1.7536799907684326,
"rewards/margins": 0.43472927808761597,
"rewards/rejected": -2.1884093284606934,
"step": 201
},
{
"epoch": 0.42708952758506774,
"grad_norm": 1.4035751819610596,
"learning_rate": 7.083200307848115e-07,
"logits/chosen": -0.5424078106880188,
"logits/rejected": -0.5316082239151001,
"logps/chosen": -0.8791903257369995,
"logps/rejected": -0.9323580265045166,
"loss": 1.3675,
"rewards/accuracies": 0.546875,
"rewards/chosen": -1.758380651473999,
"rewards/margins": 0.10633517056703568,
"rewards/rejected": -1.8647160530090332,
"step": 202
},
{
"epoch": 0.42920383217707303,
"grad_norm": 1.8622503280639648,
"learning_rate": 7.049464984141829e-07,
"logits/chosen": -0.5329294204711914,
"logits/rejected": -0.5523126721382141,
"logps/chosen": -0.695776104927063,
"logps/rejected": -0.8400713801383972,
"loss": 1.2285,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -1.391552209854126,
"rewards/margins": 0.28859058022499084,
"rewards/rejected": -1.6801427602767944,
"step": 203
},
{
"epoch": 0.4313181367690783,
"grad_norm": 0.8603182435035706,
"learning_rate": 7.015617146439861e-07,
"logits/chosen": -0.4516752064228058,
"logits/rejected": -0.46907976269721985,
"logps/chosen": -0.6868133544921875,
"logps/rejected": -0.8646677732467651,
"loss": 1.2417,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.373626708984375,
"rewards/margins": 0.355709046125412,
"rewards/rejected": -1.7293355464935303,
"step": 204
},
{
"epoch": 0.43343244136108355,
"grad_norm": 0.6437748670578003,
"learning_rate": 6.981658652961546e-07,
"logits/chosen": -0.6159051656723022,
"logits/rejected": -0.6000130772590637,
"logps/chosen": -0.7715178728103638,
"logps/rejected": -0.8714219331741333,
"loss": 1.3469,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.5430357456207275,
"rewards/margins": 0.19980813562870026,
"rewards/rejected": -1.7428438663482666,
"step": 205
},
{
"epoch": 0.43554674595308884,
"grad_norm": 1.2309322357177734,
"learning_rate": 6.947591368001137e-07,
"logits/chosen": -0.5913614630699158,
"logits/rejected": -0.6128537654876709,
"logps/chosen": -0.7512561678886414,
"logps/rejected": -0.8872793912887573,
"loss": 1.26,
"rewards/accuracies": 0.578125,
"rewards/chosen": -1.5025123357772827,
"rewards/margins": 0.2720465660095215,
"rewards/rejected": -1.7745587825775146,
"step": 206
},
{
"epoch": 0.43766105054509413,
"grad_norm": 0.6153685450553894,
"learning_rate": 6.913417161825449e-07,
"logits/chosen": -0.5976595878601074,
"logits/rejected": -0.6222202181816101,
"logps/chosen": -0.837669849395752,
"logps/rejected": -0.9835771918296814,
"loss": 1.2986,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -1.675339698791504,
"rewards/margins": 0.2918146252632141,
"rewards/rejected": -1.9671543836593628,
"step": 207
},
{
"epoch": 0.4397753551370994,
"grad_norm": 1.9922760725021362,
"learning_rate": 6.87913791057119e-07,
"logits/chosen": -0.6808818578720093,
"logits/rejected": -0.6692708730697632,
"logps/chosen": -0.7088961601257324,
"logps/rejected": -0.8256410360336304,
"loss": 1.281,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -1.4177923202514648,
"rewards/margins": 0.23348984122276306,
"rewards/rejected": -1.6512820720672607,
"step": 208
},
{
"epoch": 0.4418896597291047,
"grad_norm": 1.9562067985534668,
"learning_rate": 6.844755496141961e-07,
"logits/chosen": -0.5282632112503052,
"logits/rejected": -0.5692226886749268,
"logps/chosen": -0.7235382795333862,
"logps/rejected": -0.801092803478241,
"loss": 1.3227,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.4470765590667725,
"rewards/margins": 0.1551089584827423,
"rewards/rejected": -1.602185606956482,
"step": 209
},
{
"epoch": 0.44400396432111,
"grad_norm": 0.8182584047317505,
"learning_rate": 6.81027180610493e-07,
"logits/chosen": -0.6418904662132263,
"logits/rejected": -0.5941328406333923,
"logps/chosen": -0.820648729801178,
"logps/rejected": -0.8864803910255432,
"loss": 1.3498,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.641297459602356,
"rewards/margins": 0.13166317343711853,
"rewards/rejected": -1.7729607820510864,
"step": 210
},
{
"epoch": 0.4461182689131153,
"grad_norm": 3.075260877609253,
"learning_rate": 6.775688733587227e-07,
"logits/chosen": -0.5926809906959534,
"logits/rejected": -0.5844541788101196,
"logps/chosen": -0.7822425365447998,
"logps/rejected": -0.8866626024246216,
"loss": 1.2884,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -1.5644850730895996,
"rewards/margins": 0.20884013175964355,
"rewards/rejected": -1.7733252048492432,
"step": 211
},
{
"epoch": 0.4482325735051206,
"grad_norm": 0.8032744526863098,
"learning_rate": 6.741008177171993e-07,
"logits/chosen": -0.579971432685852,
"logits/rejected": -0.5978566408157349,
"logps/chosen": -0.721234917640686,
"logps/rejected": -0.8368514180183411,
"loss": 1.2781,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -1.442469835281372,
"rewards/margins": 0.23123310506343842,
"rewards/rejected": -1.6737028360366821,
"step": 212
},
{
"epoch": 0.45034687809712587,
"grad_norm": 0.6680911779403687,
"learning_rate": 6.706232040794161e-07,
"logits/chosen": -0.6748596429824829,
"logits/rejected": -0.6615546941757202,
"logps/chosen": -0.7931480407714844,
"logps/rejected": -0.8879257440567017,
"loss": 1.337,
"rewards/accuracies": 0.546875,
"rewards/chosen": -1.5862960815429688,
"rewards/margins": 0.1895553171634674,
"rewards/rejected": -1.7758514881134033,
"step": 213
},
{
"epoch": 0.45246118268913116,
"grad_norm": 2.5107688903808594,
"learning_rate": 6.671362233635925e-07,
"logits/chosen": -0.6460363268852234,
"logits/rejected": -0.6273557543754578,
"logps/chosen": -0.823783814907074,
"logps/rejected": -0.87412428855896,
"loss": 1.3756,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.647567629814148,
"rewards/margins": 0.10068092495203018,
"rewards/rejected": -1.74824857711792,
"step": 214
},
{
"epoch": 0.45457548728113645,
"grad_norm": 2.2206740379333496,
"learning_rate": 6.636400670021933e-07,
"logits/chosen": -0.6295229196548462,
"logits/rejected": -0.6330893039703369,
"logps/chosen": -0.807812511920929,
"logps/rejected": -0.9784457683563232,
"loss": 1.2259,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -1.615625023841858,
"rewards/margins": 0.3412665128707886,
"rewards/rejected": -1.9568915367126465,
"step": 215
},
{
"epoch": 0.45668979187314174,
"grad_norm": 1.2925803661346436,
"learning_rate": 6.601349269314187e-07,
"logits/chosen": -0.6001027822494507,
"logits/rejected": -0.6305864453315735,
"logps/chosen": -0.7216315865516663,
"logps/rejected": -0.8616191744804382,
"loss": 1.269,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.4432631731033325,
"rewards/margins": 0.2799749970436096,
"rewards/rejected": -1.7232383489608765,
"step": 216
},
{
"epoch": 0.458804096465147,
"grad_norm": 4.863992214202881,
"learning_rate": 6.566209955806679e-07,
"logits/chosen": -0.5307935476303101,
"logits/rejected": -0.5385264754295349,
"logps/chosen": -0.8053566813468933,
"logps/rejected": -0.9241464734077454,
"loss": 1.3325,
"rewards/accuracies": 0.546875,
"rewards/chosen": -1.6107133626937866,
"rewards/margins": 0.23757943511009216,
"rewards/rejected": -1.8482929468154907,
"step": 217
},
{
"epoch": 0.4609184010571523,
"grad_norm": 1.0189604759216309,
"learning_rate": 6.530984658619733e-07,
"logits/chosen": -0.7031885385513306,
"logits/rejected": -0.7072005867958069,
"logps/chosen": -0.8382629752159119,
"logps/rejected": -0.9468755722045898,
"loss": 1.3276,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -1.6765259504318237,
"rewards/margins": 0.21722503006458282,
"rewards/rejected": -1.8937511444091797,
"step": 218
},
{
"epoch": 0.4630327056491576,
"grad_norm": 1.1178699731826782,
"learning_rate": 6.495675311594122e-07,
"logits/chosen": -0.5736142992973328,
"logits/rejected": -0.5926069021224976,
"logps/chosen": -0.7676032781600952,
"logps/rejected": -0.9179919958114624,
"loss": 1.278,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -1.5352065563201904,
"rewards/margins": 0.3007773756980896,
"rewards/rejected": -1.8359839916229248,
"step": 219
},
{
"epoch": 0.4651470102411629,
"grad_norm": 2.4985287189483643,
"learning_rate": 6.460283853184879e-07,
"logits/chosen": -0.6372602581977844,
"logits/rejected": -0.6313104033470154,
"logps/chosen": -0.8754556179046631,
"logps/rejected": -0.9803894758224487,
"loss": 1.3166,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -1.7509112358093262,
"rewards/margins": 0.2098677009344101,
"rewards/rejected": -1.9607789516448975,
"step": 220
},
{
"epoch": 0.46726131483316813,
"grad_norm": 1.5675435066223145,
"learning_rate": 6.424812226354889e-07,
"logits/chosen": -0.6377983093261719,
"logits/rejected": -0.6666730642318726,
"logps/chosen": -0.7556843757629395,
"logps/rejected": -0.9096466302871704,
"loss": 1.2397,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -1.511368751525879,
"rewards/margins": 0.30792441964149475,
"rewards/rejected": -1.8192932605743408,
"step": 221
},
{
"epoch": 0.4693756194251734,
"grad_norm": 2.853426218032837,
"learning_rate": 6.389262378468219e-07,
"logits/chosen": -0.6055567860603333,
"logits/rejected": -0.612144947052002,
"logps/chosen": -0.8588352203369141,
"logps/rejected": -0.8928595185279846,
"loss": 1.4022,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -1.7176704406738281,
"rewards/margins": 0.06804870069026947,
"rewards/rejected": -1.7857190370559692,
"step": 222
},
{
"epoch": 0.4714899240171787,
"grad_norm": 0.528042733669281,
"learning_rate": 6.353636261183213e-07,
"logits/chosen": -0.6543641090393066,
"logits/rejected": -0.6635830402374268,
"logps/chosen": -0.7858147621154785,
"logps/rejected": -0.9400445222854614,
"loss": 1.2446,
"rewards/accuracies": 0.640625,
"rewards/chosen": -1.571629524230957,
"rewards/margins": 0.3084595203399658,
"rewards/rejected": -1.8800890445709229,
"step": 223
},
{
"epoch": 0.473604228609184,
"grad_norm": 1.1155768632888794,
"learning_rate": 6.317935830345338e-07,
"logits/chosen": -0.5700349807739258,
"logits/rejected": -0.6560614705085754,
"logps/chosen": -0.8426170945167542,
"logps/rejected": -0.9983471035957336,
"loss": 1.3204,
"rewards/accuracies": 0.578125,
"rewards/chosen": -1.6852341890335083,
"rewards/margins": 0.3114599883556366,
"rewards/rejected": -1.9966942071914673,
"step": 224
},
{
"epoch": 0.4757185332011893,
"grad_norm": 0.802669107913971,
"learning_rate": 6.282163045879823e-07,
"logits/chosen": -0.6912901401519775,
"logits/rejected": -0.7201069593429565,
"logps/chosen": -0.8135342597961426,
"logps/rejected": -0.9537283182144165,
"loss": 1.2961,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -1.6270685195922852,
"rewards/margins": 0.2803882658481598,
"rewards/rejected": -1.907456636428833,
"step": 225
},
{
"epoch": 0.4778328377931946,
"grad_norm": 1.709757924079895,
"learning_rate": 6.246319871684047e-07,
"logits/chosen": -0.7573816776275635,
"logits/rejected": -0.8028420209884644,
"logps/chosen": -0.891952633857727,
"logps/rejected": -1.0168029069900513,
"loss": 1.333,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -1.783905267715454,
"rewards/margins": 0.24970072507858276,
"rewards/rejected": -2.0336058139801025,
"step": 226
},
{
"epoch": 0.47994714238519987,
"grad_norm": 2.170957326889038,
"learning_rate": 6.210408275519734e-07,
"logits/chosen": -0.6915597915649414,
"logits/rejected": -0.7027997970581055,
"logps/chosen": -0.9063036441802979,
"logps/rejected": -1.0104373693466187,
"loss": 1.3388,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -1.8126072883605957,
"rewards/margins": 0.20826762914657593,
"rewards/rejected": -2.0208747386932373,
"step": 227
},
{
"epoch": 0.48206144697720515,
"grad_norm": 1.8802261352539062,
"learning_rate": 6.174430228904919e-07,
"logits/chosen": -0.689726710319519,
"logits/rejected": -0.7143282890319824,
"logps/chosen": -0.7480812072753906,
"logps/rejected": -0.8698041439056396,
"loss": 1.2836,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -1.4961624145507812,
"rewards/margins": 0.24344584345817566,
"rewards/rejected": -1.7396082878112793,
"step": 228
},
{
"epoch": 0.48417575156921044,
"grad_norm": 2.5202934741973877,
"learning_rate": 6.13838770700571e-07,
"logits/chosen": -0.6858299374580383,
"logits/rejected": -0.7115206122398376,
"logps/chosen": -0.8575515151023865,
"logps/rejected": -0.9657347202301025,
"loss": 1.3046,
"rewards/accuracies": 0.546875,
"rewards/chosen": -1.715103030204773,
"rewards/margins": 0.21636635065078735,
"rewards/rejected": -1.931469440460205,
"step": 229
},
{
"epoch": 0.48629005616121573,
"grad_norm": 1.268512487411499,
"learning_rate": 6.102282688527859e-07,
"logits/chosen": -0.7078689932823181,
"logits/rejected": -0.7254161238670349,
"logps/chosen": -0.8850880861282349,
"logps/rejected": -1.031385898590088,
"loss": 1.2816,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.7701761722564697,
"rewards/margins": 0.29259535670280457,
"rewards/rejected": -2.062771797180176,
"step": 230
},
{
"epoch": 0.488404360753221,
"grad_norm": 1.7285584211349487,
"learning_rate": 6.066117155608135e-07,
"logits/chosen": -0.7325868606567383,
"logits/rejected": -0.7433226108551025,
"logps/chosen": -0.8014956116676331,
"logps/rejected": -0.9653260111808777,
"loss": 1.2429,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -1.6029912233352661,
"rewards/margins": 0.32766085863113403,
"rewards/rejected": -1.9306520223617554,
"step": 231
},
{
"epoch": 0.4905186653452263,
"grad_norm": 0.6270304322242737,
"learning_rate": 6.029893093705491e-07,
"logits/chosen": -0.692166805267334,
"logits/rejected": -0.6799293756484985,
"logps/chosen": -0.7850213646888733,
"logps/rejected": -0.8839574456214905,
"loss": 1.2967,
"rewards/accuracies": 0.640625,
"rewards/chosen": -1.5700427293777466,
"rewards/margins": 0.19787229597568512,
"rewards/rejected": -1.767914891242981,
"step": 232
},
{
"epoch": 0.4926329699372316,
"grad_norm": 1.0160484313964844,
"learning_rate": 5.993612491492087e-07,
"logits/chosen": -0.7095844149589539,
"logits/rejected": -0.71524578332901,
"logps/chosen": -0.7063854336738586,
"logps/rejected": -0.8855549097061157,
"loss": 1.2176,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -1.4127708673477173,
"rewards/margins": 0.3583390712738037,
"rewards/rejected": -1.7711098194122314,
"step": 233
},
{
"epoch": 0.4947472745292369,
"grad_norm": 2.225841999053955,
"learning_rate": 5.957277340744094e-07,
"logits/chosen": -0.7488946318626404,
"logits/rejected": -0.7588428854942322,
"logps/chosen": -0.9203822612762451,
"logps/rejected": -1.0089298486709595,
"loss": 1.355,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -1.8407645225524902,
"rewards/margins": 0.17709502577781677,
"rewards/rejected": -2.017859697341919,
"step": 234
},
{
"epoch": 0.4968615791212422,
"grad_norm": 1.9577795267105103,
"learning_rate": 5.920889636232351e-07,
"logits/chosen": -0.8078997731208801,
"logits/rejected": -0.8064825534820557,
"logps/chosen": -0.8004480004310608,
"logps/rejected": -0.9856831431388855,
"loss": 1.2273,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -1.6008960008621216,
"rewards/margins": 0.3704703152179718,
"rewards/rejected": -1.971366286277771,
"step": 235
},
{
"epoch": 0.4989758837132474,
"grad_norm": 2.5050246715545654,
"learning_rate": 5.884451375612865e-07,
"logits/chosen": -0.7499472498893738,
"logits/rejected": -0.7421904802322388,
"logps/chosen": -0.8363584876060486,
"logps/rejected": -0.9543781876564026,
"loss": 1.3002,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -1.6727169752120972,
"rewards/margins": 0.23603934049606323,
"rewards/rejected": -1.9087563753128052,
"step": 236
},
{
"epoch": 0.5010901883052528,
"grad_norm": 0.585436224937439,
"learning_rate": 5.847964559317128e-07,
"logits/chosen": -0.730015218257904,
"logits/rejected": -0.7154791355133057,
"logps/chosen": -0.8828849196434021,
"logps/rejected": -0.9897070527076721,
"loss": 1.347,
"rewards/accuracies": 0.578125,
"rewards/chosen": -1.7657698392868042,
"rewards/margins": 0.21364440023899078,
"rewards/rejected": -1.9794141054153442,
"step": 237
},
{
"epoch": 0.503204492897258,
"grad_norm": 0.9204092621803284,
"learning_rate": 5.8114311904423e-07,
"logits/chosen": -0.759974479675293,
"logits/rejected": -0.7793674468994141,
"logps/chosen": -0.8321584463119507,
"logps/rejected": -1.0809751749038696,
"loss": 1.2185,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.6643168926239014,
"rewards/margins": 0.4976334273815155,
"rewards/rejected": -2.1619503498077393,
"step": 238
},
{
"epoch": 0.5053187974892633,
"grad_norm": 5.147011756896973,
"learning_rate": 5.774853274641243e-07,
"logits/chosen": -0.7148956060409546,
"logits/rejected": -0.7363921403884888,
"logps/chosen": -0.8623124361038208,
"logps/rejected": -1.0681498050689697,
"loss": 1.2353,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -1.7246248722076416,
"rewards/margins": 0.4116746187210083,
"rewards/rejected": -2.1362996101379395,
"step": 239
},
{
"epoch": 0.5074331020812686,
"grad_norm": 1.9065529108047485,
"learning_rate": 5.738232820012407e-07,
"logits/chosen": -0.7158540487289429,
"logits/rejected": -0.7083900570869446,
"logps/chosen": -0.981558620929718,
"logps/rejected": -1.054612636566162,
"loss": 1.3594,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.963117241859436,
"rewards/margins": 0.14610806107521057,
"rewards/rejected": -2.109225273132324,
"step": 240
},
{
"epoch": 0.5095474066732739,
"grad_norm": 2.4411256313323975,
"learning_rate": 5.701571836989591e-07,
"logits/chosen": -0.8441444039344788,
"logits/rejected": -0.8529233336448669,
"logps/chosen": -0.8665949702262878,
"logps/rejected": -1.030572772026062,
"loss": 1.2477,
"rewards/accuracies": 0.671875,
"rewards/chosen": -1.7331899404525757,
"rewards/margins": 0.3279556334018707,
"rewards/rejected": -2.061145544052124,
"step": 241
},
{
"epoch": 0.5116617112652792,
"grad_norm": 2.461113214492798,
"learning_rate": 5.664872338231571e-07,
"logits/chosen": -0.7463312149047852,
"logits/rejected": -0.7725105285644531,
"logps/chosen": -0.9185941815376282,
"logps/rejected": -1.1244423389434814,
"loss": 1.2404,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -1.8371883630752563,
"rewards/margins": 0.411696195602417,
"rewards/rejected": -2.248884677886963,
"step": 242
},
{
"epoch": 0.5137760158572844,
"grad_norm": 3.5861761569976807,
"learning_rate": 5.628136338511607e-07,
"logits/chosen": -0.8432914018630981,
"logits/rejected": -0.85801100730896,
"logps/chosen": -0.8873915672302246,
"logps/rejected": -1.0090795755386353,
"loss": 1.3072,
"rewards/accuracies": 0.609375,
"rewards/chosen": -1.7747831344604492,
"rewards/margins": 0.24337637424468994,
"rewards/rejected": -2.0181591510772705,
"step": 243
},
{
"epoch": 0.5158903204492897,
"grad_norm": 2.109071969985962,
"learning_rate": 5.591365854606829e-07,
"logits/chosen": -0.7899532318115234,
"logits/rejected": -0.7548331618309021,
"logps/chosen": -0.9333330392837524,
"logps/rejected": -1.00949227809906,
"loss": 1.3749,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.8666660785675049,
"rewards/margins": 0.1523183286190033,
"rewards/rejected": -2.01898455619812,
"step": 244
},
{
"epoch": 0.518004625041295,
"grad_norm": 2.2017955780029297,
"learning_rate": 5.554562905187527e-07,
"logits/chosen": -0.7569047212600708,
"logits/rejected": -0.7679808735847473,
"logps/chosen": -0.9779613614082336,
"logps/rejected": -1.1713427305221558,
"loss": 1.2628,
"rewards/accuracies": 0.609375,
"rewards/chosen": -1.9559227228164673,
"rewards/margins": 0.3867628276348114,
"rewards/rejected": -2.3426854610443115,
"step": 245
},
{
"epoch": 0.5201189296333003,
"grad_norm": 4.651991367340088,
"learning_rate": 5.517729510706315e-07,
"logits/chosen": -0.8546395301818848,
"logits/rejected": -0.8609369397163391,
"logps/chosen": -0.9926605224609375,
"logps/rejected": -1.1553713083267212,
"loss": 1.2812,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -1.985321044921875,
"rewards/margins": 0.32542160153388977,
"rewards/rejected": -2.3107426166534424,
"step": 246
},
{
"epoch": 0.5222332342253055,
"grad_norm": 2.6384060382843018,
"learning_rate": 5.480867693287223e-07,
"logits/chosen": -0.7734386324882507,
"logits/rejected": -0.7963250875473022,
"logps/chosen": -0.8996341824531555,
"logps/rejected": -1.0466523170471191,
"loss": 1.2849,
"rewards/accuracies": 0.578125,
"rewards/chosen": -1.799268364906311,
"rewards/margins": 0.2940361201763153,
"rewards/rejected": -2.0933046340942383,
"step": 247
},
{
"epoch": 0.5243475388173109,
"grad_norm": 1.3608977794647217,
"learning_rate": 5.443979476614674e-07,
"logits/chosen": -0.7350472807884216,
"logits/rejected": -0.7215992212295532,
"logps/chosen": -0.8887076377868652,
"logps/rejected": -1.0147045850753784,
"loss": 1.3182,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -1.7774152755737305,
"rewards/margins": 0.25199398398399353,
"rewards/rejected": -2.029409170150757,
"step": 248
},
{
"epoch": 0.5264618434093161,
"grad_norm": 3.017115354537964,
"learning_rate": 5.407066885822391e-07,
"logits/chosen": -0.827782154083252,
"logits/rejected": -0.8471929430961609,
"logps/chosen": -0.9262440800666809,
"logps/rejected": -1.1658306121826172,
"loss": 1.1882,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -1.8524881601333618,
"rewards/margins": 0.47917306423187256,
"rewards/rejected": -2.3316612243652344,
"step": 249
},
{
"epoch": 0.5285761480013215,
"grad_norm": 0.7805312275886536,
"learning_rate": 5.370131947382214e-07,
"logits/chosen": -0.7815499305725098,
"logits/rejected": -0.8279274702072144,
"logps/chosen": -0.968708872795105,
"logps/rejected": -1.2697322368621826,
"loss": 1.2092,
"rewards/accuracies": 0.671875,
"rewards/chosen": -1.93741774559021,
"rewards/margins": 0.6020466685295105,
"rewards/rejected": -2.5394644737243652,
"step": 250
},
{
"epoch": 0.5306904525933267,
"grad_norm": 2.229363441467285,
"learning_rate": 5.333176688992855e-07,
"logits/chosen": -0.7824153900146484,
"logits/rejected": -0.8154900074005127,
"logps/chosen": -1.0211957693099976,
"logps/rejected": -1.2145965099334717,
"loss": 1.3074,
"rewards/accuracies": 0.609375,
"rewards/chosen": -2.042391538619995,
"rewards/margins": 0.3868010938167572,
"rewards/rejected": -2.4291930198669434,
"step": 251
},
{
"epoch": 0.532804757185332,
"grad_norm": 1.1359837055206299,
"learning_rate": 5.296203139468571e-07,
"logits/chosen": -0.7467613220214844,
"logits/rejected": -0.7548531889915466,
"logps/chosen": -1.0614902973175049,
"logps/rejected": -1.2674376964569092,
"loss": 1.2512,
"rewards/accuracies": 0.578125,
"rewards/chosen": -2.1229805946350098,
"rewards/margins": 0.4118950664997101,
"rewards/rejected": -2.5348753929138184,
"step": 252
},
{
"epoch": 0.5349190617773373,
"grad_norm": 3.0548548698425293,
"learning_rate": 5.259213328627792e-07,
"logits/chosen": -0.7868636250495911,
"logits/rejected": -0.8130850791931152,
"logps/chosen": -1.0743666887283325,
"logps/rejected": -1.2010191679000854,
"loss": 1.3275,
"rewards/accuracies": 0.578125,
"rewards/chosen": -2.148733377456665,
"rewards/margins": 0.2533051669597626,
"rewards/rejected": -2.402038335800171,
"step": 253
},
{
"epoch": 0.5370333663693426,
"grad_norm": 1.7205246686935425,
"learning_rate": 5.222209287181676e-07,
"logits/chosen": -0.81404709815979,
"logits/rejected": -0.8481613397598267,
"logps/chosen": -1.1599587202072144,
"logps/rejected": -1.4234716892242432,
"loss": 1.2894,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -2.3199174404144287,
"rewards/margins": 0.5270256400108337,
"rewards/rejected": -2.8469433784484863,
"step": 254
},
{
"epoch": 0.5391476709613479,
"grad_norm": 2.2516112327575684,
"learning_rate": 5.185193046622634e-07,
"logits/chosen": -0.8112510442733765,
"logits/rejected": -0.8310728073120117,
"logps/chosen": -1.1263186931610107,
"logps/rejected": -1.3256827592849731,
"loss": 1.3552,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -2.2526373863220215,
"rewards/margins": 0.39872825145721436,
"rewards/rejected": -2.6513655185699463,
"step": 255
},
{
"epoch": 0.5412619755533532,
"grad_norm": 2.8379359245300293,
"learning_rate": 5.148166639112799e-07,
"logits/chosen": -0.8202102184295654,
"logits/rejected": -0.845209002494812,
"logps/chosen": -1.264180302619934,
"logps/rejected": -1.6190590858459473,
"loss": 1.2083,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.528360605239868,
"rewards/margins": 0.7097575068473816,
"rewards/rejected": -3.2381181716918945,
"step": 256
},
{
"epoch": 0.5433762801453584,
"grad_norm": 4.676355838775635,
"learning_rate": 5.111132097372459e-07,
"logits/chosen": -0.8866451978683472,
"logits/rejected": -0.8642281889915466,
"logps/chosen": -1.3194389343261719,
"logps/rejected": -1.4506916999816895,
"loss": 1.4002,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.6388778686523438,
"rewards/margins": 0.2625058591365814,
"rewards/rejected": -2.901383399963379,
"step": 257
},
{
"epoch": 0.5454905847373638,
"grad_norm": 2.55251407623291,
"learning_rate": 5.074091454568463e-07,
"logits/chosen": -0.7903708815574646,
"logits/rejected": -0.8010709881782532,
"logps/chosen": -1.3550961017608643,
"logps/rejected": -1.661428451538086,
"loss": 1.2131,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.7101922035217285,
"rewards/margins": 0.6126645803451538,
"rewards/rejected": -3.322856903076172,
"step": 258
},
{
"epoch": 0.547604889329369,
"grad_norm": 4.116572856903076,
"learning_rate": 5.037046744202611e-07,
"logits/chosen": -0.7501232624053955,
"logits/rejected": -0.7825176119804382,
"logps/chosen": -1.2111856937408447,
"logps/rejected": -1.5176191329956055,
"loss": 1.1345,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.4223713874816895,
"rewards/margins": 0.6128667593002319,
"rewards/rejected": -3.035238265991211,
"step": 259
},
{
"epoch": 0.5497191939213742,
"grad_norm": 2.0285205841064453,
"learning_rate": 5e-07,
"logits/chosen": -0.8355445861816406,
"logits/rejected": -0.8497716784477234,
"logps/chosen": -1.1876304149627686,
"logps/rejected": -1.4788450002670288,
"loss": 1.1559,
"rewards/accuracies": 0.671875,
"rewards/chosen": -2.375260829925537,
"rewards/margins": 0.5824294090270996,
"rewards/rejected": -2.9576900005340576,
"step": 260
},
{
"epoch": 0.5518334985133796,
"grad_norm": 4.681185245513916,
"learning_rate": 4.962953255797389e-07,
"logits/chosen": -0.8240503072738647,
"logits/rejected": -0.8016488552093506,
"logps/chosen": -1.2238959074020386,
"logps/rejected": -1.4727882146835327,
"loss": 1.2914,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -2.447791814804077,
"rewards/margins": 0.49778467416763306,
"rewards/rejected": -2.9455764293670654,
"step": 261
},
{
"epoch": 0.5539478031053848,
"grad_norm": 5.15679931640625,
"learning_rate": 4.925908545431537e-07,
"logits/chosen": -0.728940486907959,
"logits/rejected": -0.7355924248695374,
"logps/chosen": -1.3356778621673584,
"logps/rejected": -1.6726096868515015,
"loss": 1.1434,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -2.671355724334717,
"rewards/margins": 0.6738637685775757,
"rewards/rejected": -3.345219373703003,
"step": 262
},
{
"epoch": 0.5560621076973902,
"grad_norm": 2.481048345565796,
"learning_rate": 4.888867902627543e-07,
"logits/chosen": -0.8311591148376465,
"logits/rejected": -0.8191719055175781,
"logps/chosen": -1.2743335962295532,
"logps/rejected": -1.5339927673339844,
"loss": 1.2164,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -2.5486671924591064,
"rewards/margins": 0.5193185210227966,
"rewards/rejected": -3.0679855346679688,
"step": 263
},
{
"epoch": 0.5581764122893954,
"grad_norm": 3.6758291721343994,
"learning_rate": 4.851833360887201e-07,
"logits/chosen": -0.6787989735603333,
"logits/rejected": -0.668928325176239,
"logps/chosen": -1.2278664112091064,
"logps/rejected": -1.4955706596374512,
"loss": 1.1942,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -2.455732822418213,
"rewards/margins": 0.535408616065979,
"rewards/rejected": -2.9911413192749023,
"step": 264
},
{
"epoch": 0.5602907168814008,
"grad_norm": 2.7282023429870605,
"learning_rate": 4.814806953377365e-07,
"logits/chosen": -0.7772133350372314,
"logits/rejected": -0.7689889073371887,
"logps/chosen": -1.1954048871994019,
"logps/rejected": -1.444943904876709,
"loss": 1.2686,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.3908097743988037,
"rewards/margins": 0.4990782141685486,
"rewards/rejected": -2.889887809753418,
"step": 265
},
{
"epoch": 0.562405021473406,
"grad_norm": 2.8753116130828857,
"learning_rate": 4.777790712818323e-07,
"logits/chosen": -0.6946043968200684,
"logits/rejected": -0.7001516819000244,
"logps/chosen": -1.2844620943069458,
"logps/rejected": -1.486103892326355,
"loss": 1.284,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -2.5689241886138916,
"rewards/margins": 0.4032836854457855,
"rewards/rejected": -2.97220778465271,
"step": 266
},
{
"epoch": 0.5645193260654113,
"grad_norm": 1.5583593845367432,
"learning_rate": 4.740786671372209e-07,
"logits/chosen": -0.7396820187568665,
"logits/rejected": -0.7129873037338257,
"logps/chosen": -1.410097599029541,
"logps/rejected": -1.6091456413269043,
"loss": 1.3158,
"rewards/accuracies": 0.640625,
"rewards/chosen": -2.820195198059082,
"rewards/margins": 0.3980959951877594,
"rewards/rejected": -3.2182912826538086,
"step": 267
},
{
"epoch": 0.5666336306574166,
"grad_norm": 3.5984952449798584,
"learning_rate": 4.703796860531429e-07,
"logits/chosen": -0.7031830549240112,
"logits/rejected": -0.700330376625061,
"logps/chosen": -1.633664608001709,
"logps/rejected": -1.9186874628067017,
"loss": 1.2479,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -3.267329216003418,
"rewards/margins": 0.5700456500053406,
"rewards/rejected": -3.8373749256134033,
"step": 268
},
{
"epoch": 0.5687479352494219,
"grad_norm": 6.295733451843262,
"learning_rate": 4.666823311007144e-07,
"logits/chosen": -0.8001950979232788,
"logits/rejected": -0.8042099475860596,
"logps/chosen": -1.5675832033157349,
"logps/rejected": -1.9247075319290161,
"loss": 1.1759,
"rewards/accuracies": 0.625,
"rewards/chosen": -3.1351664066314697,
"rewards/margins": 0.7142485976219177,
"rewards/rejected": -3.8494150638580322,
"step": 269
},
{
"epoch": 0.5708622398414271,
"grad_norm": 3.6349036693573,
"learning_rate": 4.6298680526177855e-07,
"logits/chosen": -0.8108068704605103,
"logits/rejected": -0.8030902147293091,
"logps/chosen": -1.8205997943878174,
"logps/rejected": -2.195197105407715,
"loss": 1.1864,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -3.6411995887756348,
"rewards/margins": 0.7491948008537292,
"rewards/rejected": -4.39039421081543,
"step": 270
},
{
"epoch": 0.5729765444334325,
"grad_norm": 4.786395072937012,
"learning_rate": 4.59293311417761e-07,
"logits/chosen": -0.798182487487793,
"logits/rejected": -0.7736828327178955,
"logps/chosen": -1.8617057800292969,
"logps/rejected": -2.08984637260437,
"loss": 1.3947,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -3.7234115600585938,
"rewards/margins": 0.4562810957431793,
"rewards/rejected": -4.17969274520874,
"step": 271
},
{
"epoch": 0.5750908490254377,
"grad_norm": 6.7946457862854,
"learning_rate": 4.556020523385326e-07,
"logits/chosen": -0.7530428171157837,
"logits/rejected": -0.7395590543746948,
"logps/chosen": -1.8709862232208252,
"logps/rejected": -2.3599390983581543,
"loss": 1.1025,
"rewards/accuracies": 0.6875,
"rewards/chosen": -3.7419724464416504,
"rewards/margins": 0.9779053926467896,
"rewards/rejected": -4.719878196716309,
"step": 272
},
{
"epoch": 0.5772051536174431,
"grad_norm": 4.877624988555908,
"learning_rate": 4.5191323067127773e-07,
"logits/chosen": -0.7732480764389038,
"logits/rejected": -0.7835702300071716,
"logps/chosen": -2.0340800285339355,
"logps/rejected": -2.330742835998535,
"loss": 1.3198,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -4.068160057067871,
"rewards/margins": 0.5933258533477783,
"rewards/rejected": -4.66148567199707,
"step": 273
},
{
"epoch": 0.5793194582094483,
"grad_norm": 9.001680374145508,
"learning_rate": 4.482270489293685e-07,
"logits/chosen": -0.9062263369560242,
"logits/rejected": -0.9105854630470276,
"logps/chosen": -2.1364972591400146,
"logps/rejected": -2.4467523097991943,
"loss": 1.3464,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -4.272994518280029,
"rewards/margins": 0.6205099821090698,
"rewards/rejected": -4.893504619598389,
"step": 274
},
{
"epoch": 0.5814337628014535,
"grad_norm": 2.811025619506836,
"learning_rate": 4.445437094812475e-07,
"logits/chosen": -0.8593579530715942,
"logits/rejected": -0.8343831896781921,
"logps/chosen": -2.452843189239502,
"logps/rejected": -2.7551848888397217,
"loss": 1.3536,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -4.905686378479004,
"rewards/margins": 0.6046838760375977,
"rewards/rejected": -5.510369777679443,
"step": 275
},
{
"epoch": 0.5835480673934589,
"grad_norm": 2.2030158042907715,
"learning_rate": 4.4086341453931714e-07,
"logits/chosen": -0.8991417288780212,
"logits/rejected": -0.8766486644744873,
"logps/chosen": -2.30641508102417,
"logps/rejected": -2.7606654167175293,
"loss": 1.1708,
"rewards/accuracies": 0.6875,
"rewards/chosen": -4.61283016204834,
"rewards/margins": 0.9085015654563904,
"rewards/rejected": -5.521330833435059,
"step": 276
},
{
"epoch": 0.5856623719854641,
"grad_norm": 5.5185227394104,
"learning_rate": 4.371863661488393e-07,
"logits/chosen": -0.8738227486610413,
"logits/rejected": -0.8665530681610107,
"logps/chosen": -2.29125714302063,
"logps/rejected": -2.7014153003692627,
"loss": 1.1883,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -4.58251428604126,
"rewards/margins": 0.8203167915344238,
"rewards/rejected": -5.402830600738525,
"step": 277
},
{
"epoch": 0.5877766765774695,
"grad_norm": 2.0779521465301514,
"learning_rate": 4.3351276617684285e-07,
"logits/chosen": -0.958415150642395,
"logits/rejected": -0.9585077166557312,
"logps/chosen": -2.4368410110473633,
"logps/rejected": -2.798506736755371,
"loss": 1.1749,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -4.873682022094727,
"rewards/margins": 0.7233313322067261,
"rewards/rejected": -5.597013473510742,
"step": 278
},
{
"epoch": 0.5898909811694747,
"grad_norm": 2.884877920150757,
"learning_rate": 4.29842816301041e-07,
"logits/chosen": -0.9413051605224609,
"logits/rejected": -0.9224691987037659,
"logps/chosen": -2.485034942626953,
"logps/rejected": -2.911332368850708,
"loss": 1.2035,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -4.970069885253906,
"rewards/margins": 0.8525944948196411,
"rewards/rejected": -5.822664737701416,
"step": 279
},
{
"epoch": 0.59200528576148,
"grad_norm": 5.203248500823975,
"learning_rate": 4.2617671799875944e-07,
"logits/chosen": -0.9359334111213684,
"logits/rejected": -0.9387660026550293,
"logps/chosen": -2.378349542617798,
"logps/rejected": -2.730886936187744,
"loss": 1.2253,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -4.756699085235596,
"rewards/margins": 0.7050745487213135,
"rewards/rejected": -5.461773872375488,
"step": 280
},
{
"epoch": 0.5941195903534853,
"grad_norm": 6.818525314331055,
"learning_rate": 4.225146725358758e-07,
"logits/chosen": -0.8864554166793823,
"logits/rejected": -0.8813320398330688,
"logps/chosen": -2.4233975410461426,
"logps/rejected": -2.8188178539276123,
"loss": 1.2281,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -4.846795082092285,
"rewards/margins": 0.7908411622047424,
"rewards/rejected": -5.637635707855225,
"step": 281
},
{
"epoch": 0.5962338949454906,
"grad_norm": 2.529154062271118,
"learning_rate": 4.1885688095577e-07,
"logits/chosen": -0.8420325517654419,
"logits/rejected": -0.8822402954101562,
"logps/chosen": -2.626488447189331,
"logps/rejected": -3.1887192726135254,
"loss": 1.0561,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -5.252976894378662,
"rewards/margins": 1.1244611740112305,
"rewards/rejected": -6.377438545227051,
"step": 282
},
{
"epoch": 0.5983481995374959,
"grad_norm": 3.0739686489105225,
"learning_rate": 4.152035440682873e-07,
"logits/chosen": -0.8550993204116821,
"logits/rejected": -0.8528580665588379,
"logps/chosen": -2.6387887001037598,
"logps/rejected": -2.9952192306518555,
"loss": 1.3409,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -5.2775774002075195,
"rewards/margins": 0.7128612399101257,
"rewards/rejected": -5.990438461303711,
"step": 283
},
{
"epoch": 0.6004625041295012,
"grad_norm": 3.6649062633514404,
"learning_rate": 4.1155486243871363e-07,
"logits/chosen": -0.8643282651901245,
"logits/rejected": -0.9175342321395874,
"logps/chosen": -2.929072618484497,
"logps/rejected": -3.105940580368042,
"loss": 1.5121,
"rewards/accuracies": 0.578125,
"rewards/chosen": -5.858145236968994,
"rewards/margins": 0.3537355065345764,
"rewards/rejected": -6.211881160736084,
"step": 284
},
{
"epoch": 0.6025768087215064,
"grad_norm": 2.5071723461151123,
"learning_rate": 4.0791103637676486e-07,
"logits/chosen": -0.8368631601333618,
"logits/rejected": -0.819808304309845,
"logps/chosen": -3.0672600269317627,
"logps/rejected": -3.4685003757476807,
"loss": 1.3236,
"rewards/accuracies": 0.65625,
"rewards/chosen": -6.134520053863525,
"rewards/margins": 0.8024805784225464,
"rewards/rejected": -6.937000751495361,
"step": 285
},
{
"epoch": 0.6046911133135118,
"grad_norm": 8.780280113220215,
"learning_rate": 4.042722659255906e-07,
"logits/chosen": -0.8249569535255432,
"logits/rejected": -0.8442113995552063,
"logps/chosen": -3.3199872970581055,
"logps/rejected": -3.7276291847229004,
"loss": 1.322,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -6.639974594116211,
"rewards/margins": 0.8152831792831421,
"rewards/rejected": -7.455258369445801,
"step": 286
},
{
"epoch": 0.606805417905517,
"grad_norm": 3.4388678073883057,
"learning_rate": 4.006387508507914e-07,
"logits/chosen": -0.7224047780036926,
"logits/rejected": -0.7616450786590576,
"logps/chosen": -2.9411330223083496,
"logps/rejected": -3.32680082321167,
"loss": 1.2868,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -5.882266044616699,
"rewards/margins": 0.7713361978530884,
"rewards/rejected": -6.65360164642334,
"step": 287
},
{
"epoch": 0.6089197224975224,
"grad_norm": 5.095273971557617,
"learning_rate": 3.970106906294509e-07,
"logits/chosen": -0.7394692897796631,
"logits/rejected": -0.7316830158233643,
"logps/chosen": -2.9902045726776123,
"logps/rejected": -3.469916820526123,
"loss": 1.1694,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -5.980409145355225,
"rewards/margins": 0.9594244360923767,
"rewards/rejected": -6.939833641052246,
"step": 288
},
{
"epoch": 0.6110340270895276,
"grad_norm": 2.1398613452911377,
"learning_rate": 3.933882844391866e-07,
"logits/chosen": -0.8181312084197998,
"logits/rejected": -0.833306610584259,
"logps/chosen": -3.0137529373168945,
"logps/rejected": -3.4241840839385986,
"loss": 1.2453,
"rewards/accuracies": 0.609375,
"rewards/chosen": -6.027505874633789,
"rewards/margins": 0.8208625316619873,
"rewards/rejected": -6.848368167877197,
"step": 289
},
{
"epoch": 0.6131483316815328,
"grad_norm": 4.185284614562988,
"learning_rate": 3.89771731147214e-07,
"logits/chosen": -0.7805104851722717,
"logits/rejected": -0.8086984753608704,
"logps/chosen": -2.984957218170166,
"logps/rejected": -3.430112361907959,
"loss": 1.2671,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -5.969914436340332,
"rewards/margins": 0.890310525894165,
"rewards/rejected": -6.860224723815918,
"step": 290
},
{
"epoch": 0.6152626362735382,
"grad_norm": 7.104829788208008,
"learning_rate": 3.861612292994292e-07,
"logits/chosen": -0.7788286209106445,
"logits/rejected": -0.8027424216270447,
"logps/chosen": -2.896563768386841,
"logps/rejected": -3.1082046031951904,
"loss": 1.4853,
"rewards/accuracies": 0.578125,
"rewards/chosen": -5.793127536773682,
"rewards/margins": 0.42328107357025146,
"rewards/rejected": -6.216409206390381,
"step": 291
},
{
"epoch": 0.6173769408655434,
"grad_norm": 3.795579433441162,
"learning_rate": 3.825569771095082e-07,
"logits/chosen": -0.8044757843017578,
"logits/rejected": -0.7828265428543091,
"logps/chosen": -2.8059256076812744,
"logps/rejected": -3.3121094703674316,
"loss": 1.1299,
"rewards/accuracies": 0.6875,
"rewards/chosen": -5.611851215362549,
"rewards/margins": 1.0123679637908936,
"rewards/rejected": -6.624218940734863,
"step": 292
},
{
"epoch": 0.6194912454575487,
"grad_norm": 4.486142158508301,
"learning_rate": 3.7895917244802655e-07,
"logits/chosen": -0.7511788606643677,
"logits/rejected": -0.7885503768920898,
"logps/chosen": -2.927251100540161,
"logps/rejected": -3.1605303287506104,
"loss": 1.426,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -5.854502201080322,
"rewards/margins": 0.466558575630188,
"rewards/rejected": -6.321060657501221,
"step": 293
},
{
"epoch": 0.621605550049554,
"grad_norm": 3.3942787647247314,
"learning_rate": 3.753680128315952e-07,
"logits/chosen": -0.8230300545692444,
"logits/rejected": -0.8042524456977844,
"logps/chosen": -2.524353504180908,
"logps/rejected": -2.8687357902526855,
"loss": 1.2653,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -5.048707008361816,
"rewards/margins": 0.6887640953063965,
"rewards/rejected": -5.737471580505371,
"step": 294
},
{
"epoch": 0.6237198546415593,
"grad_norm": 4.326812744140625,
"learning_rate": 3.717836954120178e-07,
"logits/chosen": -0.7763381004333496,
"logits/rejected": -0.7852378487586975,
"logps/chosen": -2.4861948490142822,
"logps/rejected": -2.8822267055511475,
"loss": 1.124,
"rewards/accuracies": 0.75,
"rewards/chosen": -4.9723896980285645,
"rewards/margins": 0.7920635938644409,
"rewards/rejected": -5.764453411102295,
"step": 295
},
{
"epoch": 0.6258341592335646,
"grad_norm": 3.886293649673462,
"learning_rate": 3.6820641696546627e-07,
"logits/chosen": -0.8350138664245605,
"logits/rejected": -0.8594292998313904,
"logps/chosen": -2.1301493644714355,
"logps/rejected": -2.3678014278411865,
"loss": 1.3532,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -4.260298728942871,
"rewards/margins": 0.4753049314022064,
"rewards/rejected": -4.735602855682373,
"step": 296
},
{
"epoch": 0.6279484638255699,
"grad_norm": 1.9318888187408447,
"learning_rate": 3.6463637388167875e-07,
"logits/chosen": -0.812870979309082,
"logits/rejected": -0.8393633961677551,
"logps/chosen": -2.0607728958129883,
"logps/rejected": -2.4457521438598633,
"loss": 1.2317,
"rewards/accuracies": 0.609375,
"rewards/chosen": -4.121545791625977,
"rewards/margins": 0.76995849609375,
"rewards/rejected": -4.891504287719727,
"step": 297
},
{
"epoch": 0.6300627684175751,
"grad_norm": 2.731139898300171,
"learning_rate": 3.610737621531781e-07,
"logits/chosen": -0.7860711216926575,
"logits/rejected": -0.8006534576416016,
"logps/chosen": -1.9324530363082886,
"logps/rejected": -2.2838711738586426,
"loss": 1.2986,
"rewards/accuracies": 0.65625,
"rewards/chosen": -3.864906072616577,
"rewards/margins": 0.7028359174728394,
"rewards/rejected": -4.567742347717285,
"step": 298
},
{
"epoch": 0.6321770730095805,
"grad_norm": 3.118441581726074,
"learning_rate": 3.575187773645112e-07,
"logits/chosen": -0.6946629285812378,
"logits/rejected": -0.6832380294799805,
"logps/chosen": -2.2569775581359863,
"logps/rejected": -2.6153128147125244,
"loss": 1.2166,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -4.513955116271973,
"rewards/margins": 0.7166703343391418,
"rewards/rejected": -5.230625629425049,
"step": 299
},
{
"epoch": 0.6342913776015857,
"grad_norm": 4.998100757598877,
"learning_rate": 3.5397161468151214e-07,
"logits/chosen": -0.7972643375396729,
"logits/rejected": -0.7864660620689392,
"logps/chosen": -2.227022886276245,
"logps/rejected": -2.57175350189209,
"loss": 1.2796,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -4.45404577255249,
"rewards/margins": 0.6894608736038208,
"rewards/rejected": -5.14350700378418,
"step": 300
},
{
"epoch": 0.6364056821935911,
"grad_norm": 6.259451866149902,
"learning_rate": 3.5043246884058777e-07,
"logits/chosen": -0.6282143592834473,
"logits/rejected": -0.6314865350723267,
"logps/chosen": -2.4372308254241943,
"logps/rejected": -2.8582205772399902,
"loss": 1.1592,
"rewards/accuracies": 0.625,
"rewards/chosen": -4.874461650848389,
"rewards/margins": 0.8419792056083679,
"rewards/rejected": -5.7164411544799805,
"step": 301
},
{
"epoch": 0.6385199867855963,
"grad_norm": 2.577531337738037,
"learning_rate": 3.4690153413802653e-07,
"logits/chosen": -0.658220648765564,
"logits/rejected": -0.6330516934394836,
"logps/chosen": -2.6647050380706787,
"logps/rejected": -3.1917996406555176,
"loss": 1.2609,
"rewards/accuracies": 0.671875,
"rewards/chosen": -5.329410076141357,
"rewards/margins": 1.0541892051696777,
"rewards/rejected": -6.383599281311035,
"step": 302
},
{
"epoch": 0.6406342913776016,
"grad_norm": 4.733935356140137,
"learning_rate": 3.4337900441933227e-07,
"logits/chosen": -0.5048555731773376,
"logits/rejected": -0.45112305879592896,
"logps/chosen": -2.5193920135498047,
"logps/rejected": -3.1279971599578857,
"loss": 1.0648,
"rewards/accuracies": 0.765625,
"rewards/chosen": -5.038784027099609,
"rewards/margins": 1.2172104120254517,
"rewards/rejected": -6.2559943199157715,
"step": 303
},
{
"epoch": 0.6427485959696069,
"grad_norm": 5.54962158203125,
"learning_rate": 3.3986507306858125e-07,
"logits/chosen": -0.5305406451225281,
"logits/rejected": -0.5246613025665283,
"logps/chosen": -2.8851962089538574,
"logps/rejected": -3.248018264770508,
"loss": 1.4329,
"rewards/accuracies": 0.625,
"rewards/chosen": -5.770392417907715,
"rewards/margins": 0.7256444692611694,
"rewards/rejected": -6.496036529541016,
"step": 304
},
{
"epoch": 0.6448629005616121,
"grad_norm": 2.827944278717041,
"learning_rate": 3.363599329978066e-07,
"logits/chosen": -0.4795135259628296,
"logits/rejected": -0.4911767244338989,
"logps/chosen": -3.0268373489379883,
"logps/rejected": -3.4411511421203613,
"loss": 1.4083,
"rewards/accuracies": 0.65625,
"rewards/chosen": -6.053674697875977,
"rewards/margins": 0.8286278247833252,
"rewards/rejected": -6.882302284240723,
"step": 305
},
{
"epoch": 0.6469772051536175,
"grad_norm": 5.35672664642334,
"learning_rate": 3.328637766364075e-07,
"logits/chosen": -0.4823904037475586,
"logits/rejected": -0.48555058240890503,
"logps/chosen": -2.990793466567993,
"logps/rejected": -3.529240846633911,
"loss": 1.1417,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -5.981586933135986,
"rewards/margins": 1.0768945217132568,
"rewards/rejected": -7.058481693267822,
"step": 306
},
{
"epoch": 0.6490915097456227,
"grad_norm": 2.8072359561920166,
"learning_rate": 3.2937679592058396e-07,
"logits/chosen": -0.4903571605682373,
"logits/rejected": -0.46411609649658203,
"logps/chosen": -2.8665530681610107,
"logps/rejected": -3.542123556137085,
"loss": 1.2485,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -5.7331061363220215,
"rewards/margins": 1.3511409759521484,
"rewards/rejected": -7.08424711227417,
"step": 307
},
{
"epoch": 0.651205814337628,
"grad_norm": 6.341434478759766,
"learning_rate": 3.2589918228280066e-07,
"logits/chosen": -0.4496378004550934,
"logits/rejected": -0.35389459133148193,
"logps/chosen": -2.8208916187286377,
"logps/rejected": -3.326601505279541,
"loss": 1.3089,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -5.641783237457275,
"rewards/margins": 1.011419653892517,
"rewards/rejected": -6.653203010559082,
"step": 308
},
{
"epoch": 0.6533201189296333,
"grad_norm": 2.5416784286499023,
"learning_rate": 3.2243112664127723e-07,
"logits/chosen": -0.44504135847091675,
"logits/rejected": -0.42088568210601807,
"logps/chosen": -2.7710533142089844,
"logps/rejected": -3.4406185150146484,
"loss": 1.2213,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -5.542106628417969,
"rewards/margins": 1.3391309976577759,
"rewards/rejected": -6.881237030029297,
"step": 309
},
{
"epoch": 0.6554344235216386,
"grad_norm": 4.573229789733887,
"learning_rate": 3.189728193895069e-07,
"logits/chosen": -0.31100764870643616,
"logits/rejected": -0.32552966475486755,
"logps/chosen": -3.099289655685425,
"logps/rejected": -3.5152204036712646,
"loss": 1.3571,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -6.19857931137085,
"rewards/margins": 0.8318620324134827,
"rewards/rejected": -7.030440807342529,
"step": 310
},
{
"epoch": 0.6575487281136438,
"grad_norm": 3.7587928771972656,
"learning_rate": 3.155244503858041e-07,
"logits/chosen": -0.4225979447364807,
"logits/rejected": -0.43882372975349426,
"logps/chosen": -2.9082608222961426,
"logps/rejected": -3.2239482402801514,
"loss": 1.3415,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -5.816521644592285,
"rewards/margins": 0.6313749551773071,
"rewards/rejected": -6.447896480560303,
"step": 311
},
{
"epoch": 0.6596630327056492,
"grad_norm": 5.79728889465332,
"learning_rate": 3.12086208942881e-07,
"logits/chosen": -0.48076939582824707,
"logits/rejected": -0.41990721225738525,
"logps/chosen": -2.7089650630950928,
"logps/rejected": -3.29990291595459,
"loss": 1.1423,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -5.4179301261901855,
"rewards/margins": 1.181876540184021,
"rewards/rejected": -6.59980583190918,
"step": 312
},
{
"epoch": 0.6617773372976544,
"grad_norm": 7.405224800109863,
"learning_rate": 3.086582838174551e-07,
"logits/chosen": -0.48003631830215454,
"logits/rejected": -0.40571871399879456,
"logps/chosen": -2.53741455078125,
"logps/rejected": -3.0145747661590576,
"loss": 1.3247,
"rewards/accuracies": 0.609375,
"rewards/chosen": -5.0748291015625,
"rewards/margins": 0.9543203115463257,
"rewards/rejected": -6.029149532318115,
"step": 313
},
{
"epoch": 0.6638916418896598,
"grad_norm": 6.371465682983398,
"learning_rate": 3.052408631998863e-07,
"logits/chosen": -0.42537638545036316,
"logits/rejected": -0.39384835958480835,
"logps/chosen": -3.006593942642212,
"logps/rejected": -3.4665465354919434,
"loss": 1.2648,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -6.013187885284424,
"rewards/margins": 0.919904887676239,
"rewards/rejected": -6.933093070983887,
"step": 314
},
{
"epoch": 0.666005946481665,
"grad_norm": 4.65411376953125,
"learning_rate": 3.018341347038453e-07,
"logits/chosen": -0.38848310708999634,
"logits/rejected": -0.3435167670249939,
"logps/chosen": -2.9562084674835205,
"logps/rejected": -3.5491316318511963,
"loss": 1.1353,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -5.912416934967041,
"rewards/margins": 1.1858452558517456,
"rewards/rejected": -7.098263263702393,
"step": 315
},
{
"epoch": 0.6681202510736703,
"grad_norm": 5.089771747589111,
"learning_rate": 2.9843828535601397e-07,
"logits/chosen": -0.3452882170677185,
"logits/rejected": -0.29303884506225586,
"logps/chosen": -2.5367987155914307,
"logps/rejected": -3.172724723815918,
"loss": 1.2002,
"rewards/accuracies": 0.6875,
"rewards/chosen": -5.073597431182861,
"rewards/margins": 1.2718524932861328,
"rewards/rejected": -6.345449447631836,
"step": 316
},
{
"epoch": 0.6702345556656756,
"grad_norm": 4.480255603790283,
"learning_rate": 2.9505350158581697e-07,
"logits/chosen": -0.47401517629623413,
"logits/rejected": -0.45950815081596375,
"logps/chosen": -2.45076322555542,
"logps/rejected": -2.998079299926758,
"loss": 1.2545,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -4.90152645111084,
"rewards/margins": 1.0946320295333862,
"rewards/rejected": -5.996158599853516,
"step": 317
},
{
"epoch": 0.6723488602576809,
"grad_norm": 3.6318399906158447,
"learning_rate": 2.916799692151884e-07,
"logits/chosen": -0.20774951577186584,
"logits/rejected": -0.21114808320999146,
"logps/chosen": -2.8932981491088867,
"logps/rejected": -3.613022565841675,
"loss": 1.1187,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -5.786596298217773,
"rewards/margins": 1.4394491910934448,
"rewards/rejected": -7.22604513168335,
"step": 318
},
{
"epoch": 0.6744631648496862,
"grad_norm": 6.601771831512451,
"learning_rate": 2.883178734483692e-07,
"logits/chosen": -0.3821495473384857,
"logits/rejected": -0.35181915760040283,
"logps/chosen": -2.5047662258148193,
"logps/rejected": -3.074918270111084,
"loss": 1.1545,
"rewards/accuracies": 0.71875,
"rewards/chosen": -5.009532451629639,
"rewards/margins": 1.1403042078018188,
"rewards/rejected": -6.149836540222168,
"step": 319
},
{
"epoch": 0.6765774694416914,
"grad_norm": 3.077775716781616,
"learning_rate": 2.849673988617399e-07,
"logits/chosen": -0.4517952799797058,
"logits/rejected": -0.3880998194217682,
"logps/chosen": -2.5404443740844727,
"logps/rejected": -3.007855176925659,
"loss": 1.2441,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -5.080888748168945,
"rewards/margins": 0.9348208904266357,
"rewards/rejected": -6.015710353851318,
"step": 320
},
{
"epoch": 0.6786917740336967,
"grad_norm": 4.130971908569336,
"learning_rate": 2.8162872939368674e-07,
"logits/chosen": -0.3455219566822052,
"logits/rejected": -0.3199109137058258,
"logps/chosen": -2.5115320682525635,
"logps/rejected": -3.0809438228607178,
"loss": 1.1814,
"rewards/accuracies": 0.71875,
"rewards/chosen": -5.023064136505127,
"rewards/margins": 1.1388237476348877,
"rewards/rejected": -6.1618876457214355,
"step": 321
},
{
"epoch": 0.680806078625702,
"grad_norm": 6.414750099182129,
"learning_rate": 2.783020483345057e-07,
"logits/chosen": -0.500693142414093,
"logits/rejected": -0.43053722381591797,
"logps/chosen": -2.627499580383301,
"logps/rejected": -3.176882266998291,
"loss": 1.2207,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -5.254999160766602,
"rewards/margins": 1.0987658500671387,
"rewards/rejected": -6.353764533996582,
"step": 322
},
{
"epoch": 0.6829203832177073,
"grad_norm": 3.8955185413360596,
"learning_rate": 2.749875383163377e-07,
"logits/chosen": -0.3386150896549225,
"logits/rejected": -0.3456903100013733,
"logps/chosen": -2.5545601844787598,
"logps/rejected": -3.0574111938476562,
"loss": 1.2667,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -5.1091203689575195,
"rewards/margins": 1.0057018995285034,
"rewards/rejected": -6.1148223876953125,
"step": 323
},
{
"epoch": 0.6850346878097126,
"grad_norm": 4.244959831237793,
"learning_rate": 2.7168538130314345e-07,
"logits/chosen": -0.4657687246799469,
"logits/rejected": -0.41878795623779297,
"logps/chosen": -2.3406989574432373,
"logps/rejected": -2.74613094329834,
"loss": 1.2982,
"rewards/accuracies": 0.640625,
"rewards/chosen": -4.681397914886475,
"rewards/margins": 0.8108637928962708,
"rewards/rejected": -5.49226188659668,
"step": 324
},
{
"epoch": 0.6871489924017179,
"grad_norm": 8.914139747619629,
"learning_rate": 2.683957585807136e-07,
"logits/chosen": -0.42120760679244995,
"logits/rejected": -0.34997111558914185,
"logps/chosen": -2.4362924098968506,
"logps/rejected": -2.8844237327575684,
"loss": 1.3185,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -4.872584819793701,
"rewards/margins": 0.8962627649307251,
"rewards/rejected": -5.768847465515137,
"step": 325
},
{
"epoch": 0.6892632969937231,
"grad_norm": 2.8318073749542236,
"learning_rate": 2.651188507467161e-07,
"logits/chosen": -0.4435175657272339,
"logits/rejected": -0.43688836693763733,
"logps/chosen": -2.316673994064331,
"logps/rejected": -2.6802306175231934,
"loss": 1.2727,
"rewards/accuracies": 0.65625,
"rewards/chosen": -4.633347988128662,
"rewards/margins": 0.7271134853363037,
"rewards/rejected": -5.360461235046387,
"step": 326
},
{
"epoch": 0.6913776015857285,
"grad_norm": 9.15845012664795,
"learning_rate": 2.618548377007817e-07,
"logits/chosen": -0.4659804105758667,
"logits/rejected": -0.43525823950767517,
"logps/chosen": -2.3177073001861572,
"logps/rejected": -2.674837350845337,
"loss": 1.3204,
"rewards/accuracies": 0.65625,
"rewards/chosen": -4.6354146003723145,
"rewards/margins": 0.7142605781555176,
"rewards/rejected": -5.349674701690674,
"step": 327
},
{
"epoch": 0.6934919061777337,
"grad_norm": 8.41653060913086,
"learning_rate": 2.5860389863462763e-07,
"logits/chosen": -0.42244386672973633,
"logits/rejected": -0.3488731384277344,
"logps/chosen": -2.3063669204711914,
"logps/rejected": -2.8124496936798096,
"loss": 1.2621,
"rewards/accuracies": 0.671875,
"rewards/chosen": -4.612733840942383,
"rewards/margins": 1.0121653079986572,
"rewards/rejected": -5.624899387359619,
"step": 328
},
{
"epoch": 0.695606210769739,
"grad_norm": 8.558746337890625,
"learning_rate": 2.5536621202221986e-07,
"logits/chosen": -0.4081762433052063,
"logits/rejected": -0.3913821578025818,
"logps/chosen": -2.331026554107666,
"logps/rejected": -2.799482583999634,
"loss": 1.2435,
"rewards/accuracies": 0.59375,
"rewards/chosen": -4.662053108215332,
"rewards/margins": 0.9369123578071594,
"rewards/rejected": -5.598965167999268,
"step": 329
},
{
"epoch": 0.6977205153617443,
"grad_norm": 7.550137519836426,
"learning_rate": 2.521419556099754e-07,
"logits/chosen": -0.5334538221359253,
"logits/rejected": -0.5046267509460449,
"logps/chosen": -2.3662197589874268,
"logps/rejected": -2.8178446292877197,
"loss": 1.2172,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -4.7324395179748535,
"rewards/margins": 0.9032500386238098,
"rewards/rejected": -5.6356892585754395,
"step": 330
},
{
"epoch": 0.6998348199537496,
"grad_norm": 4.939478397369385,
"learning_rate": 2.4893130640700364e-07,
"logits/chosen": -0.5103824138641357,
"logits/rejected": -0.49076637625694275,
"logps/chosen": -2.0302557945251465,
"logps/rejected": -2.4443471431732178,
"loss": 1.1939,
"rewards/accuracies": 0.65625,
"rewards/chosen": -4.060511589050293,
"rewards/margins": 0.8281831741333008,
"rewards/rejected": -4.8886942863464355,
"step": 331
},
{
"epoch": 0.7019491245457549,
"grad_norm": 5.584677219390869,
"learning_rate": 2.4573444067538985e-07,
"logits/chosen": -0.46035417914390564,
"logits/rejected": -0.4546043574810028,
"logps/chosen": -2.1907548904418945,
"logps/rejected": -2.4913454055786133,
"loss": 1.4253,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -4.381509780883789,
"rewards/margins": 0.6011807322502136,
"rewards/rejected": -4.982690811157227,
"step": 332
},
{
"epoch": 0.7040634291377602,
"grad_norm": 3.398441791534424,
"learning_rate": 2.425515339205165e-07,
"logits/chosen": -0.5569466352462769,
"logits/rejected": -0.5756793022155762,
"logps/chosen": -2.037411689758301,
"logps/rejected": -2.3700244426727295,
"loss": 1.3425,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -4.074823379516602,
"rewards/margins": 0.6652255654335022,
"rewards/rejected": -4.740048885345459,
"step": 333
},
{
"epoch": 0.7061777337297654,
"grad_norm": 8.54529094696045,
"learning_rate": 2.3938276088143e-07,
"logits/chosen": -0.5746757388114929,
"logits/rejected": -0.5874296426773071,
"logps/chosen": -2.1479601860046387,
"logps/rejected": -2.584625244140625,
"loss": 1.2366,
"rewards/accuracies": 0.71875,
"rewards/chosen": -4.295920372009277,
"rewards/margins": 0.8733301758766174,
"rewards/rejected": -5.16925048828125,
"step": 334
},
{
"epoch": 0.7082920383217707,
"grad_norm": 5.141815662384033,
"learning_rate": 2.362282955212473e-07,
"logits/chosen": -0.6492913961410522,
"logits/rejected": -0.5812432765960693,
"logps/chosen": -1.9753435850143433,
"logps/rejected": -2.340383768081665,
"loss": 1.2197,
"rewards/accuracies": 0.640625,
"rewards/chosen": -3.9506871700286865,
"rewards/margins": 0.7300805449485779,
"rewards/rejected": -4.68076753616333,
"step": 335
},
{
"epoch": 0.710406342913776,
"grad_norm": 5.991698265075684,
"learning_rate": 2.3308831101760483e-07,
"logits/chosen": -0.6887751221656799,
"logits/rejected": -0.6923843622207642,
"logps/chosen": -1.577715277671814,
"logps/rejected": -1.861379623413086,
"loss": 1.2608,
"rewards/accuracies": 0.640625,
"rewards/chosen": -3.155430555343628,
"rewards/margins": 0.5673283338546753,
"rewards/rejected": -3.722759246826172,
"step": 336
},
{
"epoch": 0.7125206475057813,
"grad_norm": 1.5719850063323975,
"learning_rate": 2.2996297975315097e-07,
"logits/chosen": -0.6095813512802124,
"logits/rejected": -0.5842909216880798,
"logps/chosen": -1.6973541975021362,
"logps/rejected": -2.1261086463928223,
"loss": 1.2424,
"rewards/accuracies": 0.609375,
"rewards/chosen": -3.3947083950042725,
"rewards/margins": 0.857509195804596,
"rewards/rejected": -4.2522172927856445,
"step": 337
},
{
"epoch": 0.7146349520977866,
"grad_norm": 4.785243511199951,
"learning_rate": 2.2685247330608414e-07,
"logits/chosen": -0.7062411308288574,
"logits/rejected": -0.6849475502967834,
"logps/chosen": -1.6659798622131348,
"logps/rejected": -1.980202555656433,
"loss": 1.2512,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -3.3319597244262695,
"rewards/margins": 0.6284454464912415,
"rewards/rejected": -3.960405111312866,
"step": 338
},
{
"epoch": 0.7167492566897918,
"grad_norm": 4.3183674812316895,
"learning_rate": 2.2375696244073123e-07,
"logits/chosen": -0.6655697822570801,
"logits/rejected": -0.6642571687698364,
"logps/chosen": -1.615012764930725,
"logps/rejected": -1.9022549390792847,
"loss": 1.398,
"rewards/accuracies": 0.625,
"rewards/chosen": -3.23002552986145,
"rewards/margins": 0.5744845867156982,
"rewards/rejected": -3.8045098781585693,
"step": 339
},
{
"epoch": 0.7188635612817972,
"grad_norm": 3.458740472793579,
"learning_rate": 2.2067661709817382e-07,
"logits/chosen": -0.6138105988502502,
"logits/rejected": -0.6241220235824585,
"logps/chosen": -1.5244299173355103,
"logps/rejected": -1.8252849578857422,
"loss": 1.2257,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -3.0488598346710205,
"rewards/margins": 0.6017097234725952,
"rewards/rejected": -3.6505699157714844,
"step": 340
},
{
"epoch": 0.7209778658738024,
"grad_norm": 3.3990859985351562,
"learning_rate": 2.1761160638691838e-07,
"logits/chosen": -0.596839964389801,
"logits/rejected": -0.5929630398750305,
"logps/chosen": -1.4333155155181885,
"logps/rejected": -1.820554494857788,
"loss": 1.1124,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -2.866631031036377,
"rewards/margins": 0.7744779586791992,
"rewards/rejected": -3.641108989715576,
"step": 341
},
{
"epoch": 0.7230921704658078,
"grad_norm": 2.742016315460205,
"learning_rate": 2.1456209857361246e-07,
"logits/chosen": -0.6483213901519775,
"logits/rejected": -0.6418218612670898,
"logps/chosen": -1.4174959659576416,
"logps/rejected": -1.831233263015747,
"loss": 1.1372,
"rewards/accuracies": 0.703125,
"rewards/chosen": -2.834991931915283,
"rewards/margins": 0.8274745941162109,
"rewards/rejected": -3.662466526031494,
"step": 342
},
{
"epoch": 0.725206475057813,
"grad_norm": 2.5489015579223633,
"learning_rate": 2.1152826107380651e-07,
"logits/chosen": -0.599895179271698,
"logits/rejected": -0.6154446005821228,
"logps/chosen": -1.4996072053909302,
"logps/rejected": -1.7961615324020386,
"loss": 1.2288,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -2.9992144107818604,
"rewards/margins": 0.5931087732315063,
"rewards/rejected": -3.592323064804077,
"step": 343
},
{
"epoch": 0.7273207796498183,
"grad_norm": 2.8836190700531006,
"learning_rate": 2.0851026044276405e-07,
"logits/chosen": -0.7359989285469055,
"logits/rejected": -0.7111036777496338,
"logps/chosen": -1.32615065574646,
"logps/rejected": -1.6067696809768677,
"loss": 1.2088,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.65230131149292,
"rewards/margins": 0.5612384080886841,
"rewards/rejected": -3.2135393619537354,
"step": 344
},
{
"epoch": 0.7294350842418236,
"grad_norm": 3.1838135719299316,
"learning_rate": 2.0550826236631596e-07,
"logits/chosen": -0.6709272265434265,
"logits/rejected": -0.6708023548126221,
"logps/chosen": -1.2859303951263428,
"logps/rejected": -1.6929675340652466,
"loss": 1.1446,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.5718607902526855,
"rewards/margins": 0.8140743374824524,
"rewards/rejected": -3.385935068130493,
"step": 345
},
{
"epoch": 0.7315493888338289,
"grad_norm": 2.4209675788879395,
"learning_rate": 2.025224316517663e-07,
"logits/chosen": -0.7540403604507446,
"logits/rejected": -0.7601196765899658,
"logps/chosen": -1.3634543418884277,
"logps/rejected": -1.6112797260284424,
"loss": 1.2561,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.7269086837768555,
"rewards/margins": 0.4956510066986084,
"rewards/rejected": -3.2225594520568848,
"step": 346
},
{
"epoch": 0.7336636934258342,
"grad_norm": 5.405437469482422,
"learning_rate": 1.9955293221884402e-07,
"logits/chosen": -0.7241419553756714,
"logits/rejected": -0.7224253416061401,
"logps/chosen": -1.2650585174560547,
"logps/rejected": -1.639666199684143,
"loss": 1.1565,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.5301170349121094,
"rewards/margins": 0.7492151856422424,
"rewards/rejected": -3.279332399368286,
"step": 347
},
{
"epoch": 0.7357779980178395,
"grad_norm": 1.5863631963729858,
"learning_rate": 1.9659992709070344e-07,
"logits/chosen": -0.7479431629180908,
"logits/rejected": -0.7219806909561157,
"logps/chosen": -1.294840931892395,
"logps/rejected": -1.6082017421722412,
"loss": 1.1693,
"rewards/accuracies": 0.640625,
"rewards/chosen": -2.58968186378479,
"rewards/margins": 0.6267215013504028,
"rewards/rejected": -3.2164034843444824,
"step": 348
},
{
"epoch": 0.7378923026098447,
"grad_norm": 1.7051454782485962,
"learning_rate": 1.936635783849742e-07,
"logits/chosen": -0.6940132975578308,
"logits/rejected": -0.7377297878265381,
"logps/chosen": -1.1897408962249756,
"logps/rejected": -1.631073236465454,
"loss": 1.1069,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -2.379481792449951,
"rewards/margins": 0.8826643228530884,
"rewards/rejected": -3.262146472930908,
"step": 349
},
{
"epoch": 0.74000660720185,
"grad_norm": 2.704514980316162,
"learning_rate": 1.907440473048626e-07,
"logits/chosen": -0.6926394104957581,
"logits/rejected": -0.7064180374145508,
"logps/chosen": -1.1691362857818604,
"logps/rejected": -1.511006236076355,
"loss": 1.1541,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -2.3382725715637207,
"rewards/margins": 0.6837398409843445,
"rewards/rejected": -3.02201247215271,
"step": 350
},
{
"epoch": 0.7421209117938553,
"grad_norm": 2.3685505390167236,
"learning_rate": 1.8784149413030004e-07,
"logits/chosen": -0.7785338759422302,
"logits/rejected": -0.7802280187606812,
"logps/chosen": -1.267012119293213,
"logps/rejected": -1.5235991477966309,
"loss": 1.177,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.534024238586426,
"rewards/margins": 0.5131738781929016,
"rewards/rejected": -3.0471982955932617,
"step": 351
},
{
"epoch": 0.7442352163858605,
"grad_norm": 2.8642280101776123,
"learning_rate": 1.849560782091445e-07,
"logits/chosen": -0.8269493579864502,
"logits/rejected": -0.8431333899497986,
"logps/chosen": -1.228893518447876,
"logps/rejected": -1.5784943103790283,
"loss": 1.1764,
"rewards/accuracies": 0.640625,
"rewards/chosen": -2.457787036895752,
"rewards/margins": 0.6992017030715942,
"rewards/rejected": -3.1569886207580566,
"step": 352
},
{
"epoch": 0.7463495209778659,
"grad_norm": 4.742166996002197,
"learning_rate": 1.8208795794843246e-07,
"logits/chosen": -0.764488160610199,
"logits/rejected": -0.7553139925003052,
"logps/chosen": -1.3095338344573975,
"logps/rejected": -1.6771752834320068,
"loss": 1.0957,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -2.619067668914795,
"rewards/margins": 0.7352830171585083,
"rewards/rejected": -3.3543505668640137,
"step": 353
},
{
"epoch": 0.7484638255698711,
"grad_norm": 3.543769359588623,
"learning_rate": 1.7923729080568239e-07,
"logits/chosen": -0.7355642914772034,
"logits/rejected": -0.7744429707527161,
"logps/chosen": -1.3419017791748047,
"logps/rejected": -1.591749668121338,
"loss": 1.2579,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -2.6838035583496094,
"rewards/margins": 0.4996955990791321,
"rewards/rejected": -3.183499336242676,
"step": 354
},
{
"epoch": 0.7505781301618765,
"grad_norm": 4.187947750091553,
"learning_rate": 1.764042332802506e-07,
"logits/chosen": -0.7009099721908569,
"logits/rejected": -0.6947562098503113,
"logps/chosen": -1.3167665004730225,
"logps/rejected": -1.640596866607666,
"loss": 1.2269,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -2.633533000946045,
"rewards/margins": 0.6476608514785767,
"rewards/rejected": -3.281193733215332,
"step": 355
},
{
"epoch": 0.7526924347538817,
"grad_norm": 1.7813458442687988,
"learning_rate": 1.7358894090473924e-07,
"logits/chosen": -0.7276792526245117,
"logits/rejected": -0.7536065578460693,
"logps/chosen": -1.401429295539856,
"logps/rejected": -1.7458314895629883,
"loss": 1.1934,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.802858591079712,
"rewards/margins": 0.6888045072555542,
"rewards/rejected": -3.4916629791259766,
"step": 356
},
{
"epoch": 0.754806739345887,
"grad_norm": 2.3192296028137207,
"learning_rate": 1.7079156823645801e-07,
"logits/chosen": -0.6756848096847534,
"logits/rejected": -0.6988381743431091,
"logps/chosen": -1.36654531955719,
"logps/rejected": -1.6672351360321045,
"loss": 1.1928,
"rewards/accuracies": 0.671875,
"rewards/chosen": -2.73309063911438,
"rewards/margins": 0.6013798117637634,
"rewards/rejected": -3.334470272064209,
"step": 357
},
{
"epoch": 0.7569210439378923,
"grad_norm": 2.7722420692443848,
"learning_rate": 1.6801226884893893e-07,
"logits/chosen": -0.6857397556304932,
"logits/rejected": -0.7169467806816101,
"logps/chosen": -1.4047114849090576,
"logps/rejected": -1.733205795288086,
"loss": 1.16,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -2.8094229698181152,
"rewards/margins": 0.6569885015487671,
"rewards/rejected": -3.466411590576172,
"step": 358
},
{
"epoch": 0.7590353485298976,
"grad_norm": 6.300495624542236,
"learning_rate": 1.6525119532350506e-07,
"logits/chosen": -0.7457281947135925,
"logits/rejected": -0.7319377660751343,
"logps/chosen": -1.282365083694458,
"logps/rejected": -1.6675825119018555,
"loss": 1.0742,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -2.564730167388916,
"rewards/margins": 0.7704350352287292,
"rewards/rejected": -3.335165023803711,
"step": 359
},
{
"epoch": 0.7611496531219029,
"grad_norm": 3.5068228244781494,
"learning_rate": 1.6250849924089482e-07,
"logits/chosen": -0.7112680077552795,
"logits/rejected": -0.7166794538497925,
"logps/chosen": -1.3996254205703735,
"logps/rejected": -1.6635833978652954,
"loss": 1.2438,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.799250841140747,
"rewards/margins": 0.5279159545898438,
"rewards/rejected": -3.327166795730591,
"step": 360
},
{
"epoch": 0.7632639577139082,
"grad_norm": 1.421538233757019,
"learning_rate": 1.5978433117293883e-07,
"logits/chosen": -0.7009663581848145,
"logits/rejected": -0.6878695487976074,
"logps/chosen": -1.4174691438674927,
"logps/rejected": -1.802457332611084,
"loss": 1.0885,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -2.8349382877349854,
"rewards/margins": 0.7699761986732483,
"rewards/rejected": -3.604914665222168,
"step": 361
},
{
"epoch": 0.7653782623059134,
"grad_norm": 3.2645766735076904,
"learning_rate": 1.5707884067429471e-07,
"logits/chosen": -0.6865817904472351,
"logits/rejected": -0.7084690928459167,
"logps/chosen": -1.377517819404602,
"logps/rejected": -1.7079989910125732,
"loss": 1.2371,
"rewards/accuracies": 0.640625,
"rewards/chosen": -2.755035638809204,
"rewards/margins": 0.660962700843811,
"rewards/rejected": -3.4159979820251465,
"step": 362
},
{
"epoch": 0.7674925668979188,
"grad_norm": 1.973783254623413,
"learning_rate": 1.5439217627423695e-07,
"logits/chosen": -0.7317283153533936,
"logits/rejected": -0.7571225166320801,
"logps/chosen": -1.63040030002594,
"logps/rejected": -2.027442216873169,
"loss": 1.1614,
"rewards/accuracies": 0.671875,
"rewards/chosen": -3.26080060005188,
"rewards/margins": 0.7940834760665894,
"rewards/rejected": -4.054884433746338,
"step": 363
},
{
"epoch": 0.769606871489924,
"grad_norm": 4.545448303222656,
"learning_rate": 1.5172448546850163e-07,
"logits/chosen": -0.6746503710746765,
"logits/rejected": -0.7073549628257751,
"logps/chosen": -1.321073055267334,
"logps/rejected": -1.6741642951965332,
"loss": 1.1609,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -2.642146110534668,
"rewards/margins": 0.7061826586723328,
"rewards/rejected": -3.3483285903930664,
"step": 364
},
{
"epoch": 0.7717211760819292,
"grad_norm": 8.678997039794922,
"learning_rate": 1.490759147111894e-07,
"logits/chosen": -0.6089351773262024,
"logits/rejected": -0.6172072291374207,
"logps/chosen": -1.6598318815231323,
"logps/rejected": -1.9151239395141602,
"loss": 1.2762,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -3.3196637630462646,
"rewards/margins": 0.5105838775634766,
"rewards/rejected": -3.8302478790283203,
"step": 365
},
{
"epoch": 0.7738354806739346,
"grad_norm": 3.29367733001709,
"learning_rate": 1.4644660940672627e-07,
"logits/chosen": -0.6255152821540833,
"logits/rejected": -0.6178345680236816,
"logps/chosen": -1.7635339498519897,
"logps/rejected": -2.02409029006958,
"loss": 1.4469,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -3.5270678997039795,
"rewards/margins": 0.5211121439933777,
"rewards/rejected": -4.04818058013916,
"step": 366
},
{
"epoch": 0.7759497852659398,
"grad_norm": 3.500715732574463,
"learning_rate": 1.438367139018796e-07,
"logits/chosen": -0.6738446354866028,
"logits/rejected": -0.671849250793457,
"logps/chosen": -1.603959560394287,
"logps/rejected": -2.140998363494873,
"loss": 0.9771,
"rewards/accuracies": 0.703125,
"rewards/chosen": -3.207919120788574,
"rewards/margins": 1.0740783214569092,
"rewards/rejected": -4.281996726989746,
"step": 367
},
{
"epoch": 0.7780640898579452,
"grad_norm": 2.8842501640319824,
"learning_rate": 1.412463714778343e-07,
"logits/chosen": -0.6544129252433777,
"logits/rejected": -0.6667245030403137,
"logps/chosen": -1.7409751415252686,
"logps/rejected": -2.1441538333892822,
"loss": 1.1043,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -3.481950283050537,
"rewards/margins": 0.806357741355896,
"rewards/rejected": -4.2883076667785645,
"step": 368
},
{
"epoch": 0.7801783944499504,
"grad_norm": 3.7606077194213867,
"learning_rate": 1.3867572434232728e-07,
"logits/chosen": -0.6620441675186157,
"logits/rejected": -0.6536539793014526,
"logps/chosen": -1.6755543947219849,
"logps/rejected": -2.012425184249878,
"loss": 1.2249,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -3.3511087894439697,
"rewards/margins": 0.6737421154975891,
"rewards/rejected": -4.024850368499756,
"step": 369
},
{
"epoch": 0.7822926990419558,
"grad_norm": 3.284456729888916,
"learning_rate": 1.3612491362183887e-07,
"logits/chosen": -0.6353476047515869,
"logits/rejected": -0.6363587975502014,
"logps/chosen": -1.6001538038253784,
"logps/rejected": -2.0670526027679443,
"loss": 1.0746,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -3.200307607650757,
"rewards/margins": 0.9337971210479736,
"rewards/rejected": -4.134105205535889,
"step": 370
},
{
"epoch": 0.784407003633961,
"grad_norm": 1.9063444137573242,
"learning_rate": 1.3359407935384642e-07,
"logits/chosen": -0.6120063662528992,
"logits/rejected": -0.5794797539710999,
"logps/chosen": -1.4489734172821045,
"logps/rejected": -1.9216854572296143,
"loss": 1.0928,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.897946834564209,
"rewards/margins": 0.9454240798950195,
"rewards/rejected": -3.8433709144592285,
"step": 371
},
{
"epoch": 0.7865213082259663,
"grad_norm": 6.973724842071533,
"learning_rate": 1.3108336047913633e-07,
"logits/chosen": -0.6082984209060669,
"logits/rejected": -0.6162828207015991,
"logps/chosen": -1.7623229026794434,
"logps/rejected": -2.239130735397339,
"loss": 1.2665,
"rewards/accuracies": 0.59375,
"rewards/chosen": -3.5246458053588867,
"rewards/margins": 0.9536150693893433,
"rewards/rejected": -4.478261470794678,
"step": 372
},
{
"epoch": 0.7886356128179716,
"grad_norm": 3.874128580093384,
"learning_rate": 1.2859289483417557e-07,
"logits/chosen": -0.5540960431098938,
"logits/rejected": -0.5091680884361267,
"logps/chosen": -1.85587739944458,
"logps/rejected": -2.3959312438964844,
"loss": 1.0672,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -3.71175479888916,
"rewards/margins": 1.0801074504852295,
"rewards/rejected": -4.791862487792969,
"step": 373
},
{
"epoch": 0.7907499174099769,
"grad_norm": 13.771154403686523,
"learning_rate": 1.261228191435445e-07,
"logits/chosen": -0.599963903427124,
"logits/rejected": -0.5765703916549683,
"logps/chosen": -1.7974251508712769,
"logps/rejected": -2.2272088527679443,
"loss": 1.1994,
"rewards/accuracies": 0.640625,
"rewards/chosen": -3.5948503017425537,
"rewards/margins": 0.8595672249794006,
"rewards/rejected": -4.454417705535889,
"step": 374
},
{
"epoch": 0.7928642220019821,
"grad_norm": 2.5084969997406006,
"learning_rate": 1.2367326901243214e-07,
"logits/chosen": -0.5945304036140442,
"logits/rejected": -0.6021737456321716,
"logps/chosen": -1.9855573177337646,
"logps/rejected": -2.3953022956848145,
"loss": 1.2576,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -3.9711146354675293,
"rewards/margins": 0.8194906711578369,
"rewards/rejected": -4.790604591369629,
"step": 375
},
{
"epoch": 0.7949785265939875,
"grad_norm": 4.571497440338135,
"learning_rate": 1.2124437891918993e-07,
"logits/chosen": -0.5888144373893738,
"logits/rejected": -0.5575076937675476,
"logps/chosen": -1.8334908485412598,
"logps/rejected": -2.153212070465088,
"loss": 1.2104,
"rewards/accuracies": 0.640625,
"rewards/chosen": -3.6669816970825195,
"rewards/margins": 0.639442503452301,
"rewards/rejected": -4.306424140930176,
"step": 376
},
{
"epoch": 0.7970928311859927,
"grad_norm": 5.023235321044922,
"learning_rate": 1.1883628220795005e-07,
"logits/chosen": -0.632038414478302,
"logits/rejected": -0.6368271708488464,
"logps/chosen": -1.8573570251464844,
"logps/rejected": -2.291320323944092,
"loss": 1.1719,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -3.7147140502929688,
"rewards/margins": 0.8679270148277283,
"rewards/rejected": -4.582640647888184,
"step": 377
},
{
"epoch": 0.7992071357779981,
"grad_norm": 4.98567533493042,
"learning_rate": 1.1644911108130434e-07,
"logits/chosen": -0.5647228360176086,
"logits/rejected": -0.5541558265686035,
"logps/chosen": -1.8232372999191284,
"logps/rejected": -2.2992348670959473,
"loss": 1.1476,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -3.646474599838257,
"rewards/margins": 0.9519950747489929,
"rewards/rejected": -4.5984697341918945,
"step": 378
},
{
"epoch": 0.8013214403700033,
"grad_norm": 9.514540672302246,
"learning_rate": 1.1408299659304682e-07,
"logits/chosen": -0.5385195016860962,
"logits/rejected": -0.5475942492485046,
"logps/chosen": -2.077877998352051,
"logps/rejected": -2.4877052307128906,
"loss": 1.1605,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -4.155755996704102,
"rewards/margins": 0.8196545243263245,
"rewards/rejected": -4.975410461425781,
"step": 379
},
{
"epoch": 0.8034357449620085,
"grad_norm": 7.652558326721191,
"learning_rate": 1.1173806864097884e-07,
"logits/chosen": -0.5651392936706543,
"logits/rejected": -0.5097556114196777,
"logps/chosen": -1.9452521800994873,
"logps/rejected": -2.376047134399414,
"loss": 1.2004,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -3.8905043601989746,
"rewards/margins": 0.8615895509719849,
"rewards/rejected": -4.752094268798828,
"step": 380
},
{
"epoch": 0.8055500495540139,
"grad_norm": 6.184218406677246,
"learning_rate": 1.0941445595977766e-07,
"logits/chosen": -0.5738644599914551,
"logits/rejected": -0.570101797580719,
"logps/chosen": -2.0233359336853027,
"logps/rejected": -2.5829384326934814,
"loss": 1.1539,
"rewards/accuracies": 0.71875,
"rewards/chosen": -4.0466718673706055,
"rewards/margins": 1.1192048788070679,
"rewards/rejected": -5.165876865386963,
"step": 381
},
{
"epoch": 0.8076643541460191,
"grad_norm": 4.697547435760498,
"learning_rate": 1.0711228611392936e-07,
"logits/chosen": -0.5766915082931519,
"logits/rejected": -0.5619411468505859,
"logps/chosen": -2.0546395778656006,
"logps/rejected": -2.4459054470062256,
"loss": 1.2723,
"rewards/accuracies": 0.59375,
"rewards/chosen": -4.109279155731201,
"rewards/margins": 0.7825320959091187,
"rewards/rejected": -4.891810894012451,
"step": 382
},
{
"epoch": 0.8097786587380245,
"grad_norm": 5.595128536224365,
"learning_rate": 1.0483168549072518e-07,
"logits/chosen": -0.6808648109436035,
"logits/rejected": -0.6518751382827759,
"logps/chosen": -1.9909974336624146,
"logps/rejected": -2.3775596618652344,
"loss": 1.2501,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -3.981994867324829,
"rewards/margins": 0.7731241583824158,
"rewards/rejected": -4.755119323730469,
"step": 383
},
{
"epoch": 0.8118929633300297,
"grad_norm": 3.6460607051849365,
"learning_rate": 1.0257277929332331e-07,
"logits/chosen": -0.6901826858520508,
"logits/rejected": -0.703309953212738,
"logps/chosen": -1.9317903518676758,
"logps/rejected": -2.322279930114746,
"loss": 1.1945,
"rewards/accuracies": 0.65625,
"rewards/chosen": -3.8635807037353516,
"rewards/margins": 0.780979335308075,
"rewards/rejected": -4.644559860229492,
"step": 384
},
{
"epoch": 0.814007267922035,
"grad_norm": 8.366463661193848,
"learning_rate": 1.0033569153387561e-07,
"logits/chosen": -0.5720599293708801,
"logits/rejected": -0.5666248798370361,
"logps/chosen": -1.9946173429489136,
"logps/rejected": -2.3951826095581055,
"loss": 1.3349,
"rewards/accuracies": 0.546875,
"rewards/chosen": -3.989234685897827,
"rewards/margins": 0.8011305332183838,
"rewards/rejected": -4.790365219116211,
"step": 385
},
{
"epoch": 0.8161215725140403,
"grad_norm": 1.8578377962112427,
"learning_rate": 9.812054502671834e-08,
"logits/chosen": -0.6122175455093384,
"logits/rejected": -0.5665942430496216,
"logps/chosen": -2.1414878368377686,
"logps/rejected": -2.646432399749756,
"loss": 1.1834,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -4.282975673675537,
"rewards/margins": 1.009889006614685,
"rewards/rejected": -5.292864799499512,
"step": 386
},
{
"epoch": 0.8182358771060456,
"grad_norm": 4.7323408126831055,
"learning_rate": 9.592746138163093e-08,
"logits/chosen": -0.5390607118606567,
"logits/rejected": -0.5227072834968567,
"logps/chosen": -2.1249067783355713,
"logps/rejected": -2.688115119934082,
"loss": 1.2211,
"rewards/accuracies": 0.671875,
"rewards/chosen": -4.249813556671143,
"rewards/margins": 1.1264164447784424,
"rewards/rejected": -5.376230239868164,
"step": 387
},
{
"epoch": 0.8203501816980509,
"grad_norm": 2.5557284355163574,
"learning_rate": 9.375656099715934e-08,
"logits/chosen": -0.5654515027999878,
"logits/rejected": -0.5636597275733948,
"logps/chosen": -2.126107692718506,
"logps/rejected": -2.606684684753418,
"loss": 1.1995,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -4.252215385437012,
"rewards/margins": 0.9611539244651794,
"rewards/rejected": -5.213369369506836,
"step": 388
},
{
"epoch": 0.8224644862900562,
"grad_norm": 4.177574634552002,
"learning_rate": 9.160796305400609e-08,
"logits/chosen": -0.6432445645332336,
"logits/rejected": -0.6587055921554565,
"logps/chosen": -2.0785441398620605,
"logps/rejected": -2.4507219791412354,
"loss": 1.2339,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -4.157088279724121,
"rewards/margins": 0.7443561553955078,
"rewards/rejected": -4.901443958282471,
"step": 389
},
{
"epoch": 0.8245787908820614,
"grad_norm": 5.901131629943848,
"learning_rate": 8.9481785508487e-08,
"logits/chosen": -0.588135302066803,
"logits/rejected": -0.5850880742073059,
"logps/chosen": -2.225466251373291,
"logps/rejected": -2.638160467147827,
"loss": 1.2255,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -4.450932502746582,
"rewards/margins": 0.8253881335258484,
"rewards/rejected": -5.276320934295654,
"step": 390
},
{
"epoch": 0.8266930954740668,
"grad_norm": 2.727555751800537,
"learning_rate": 8.737814508605673e-08,
"logits/chosen": -0.5863823294639587,
"logits/rejected": -0.590294599533081,
"logps/chosen": -1.9851064682006836,
"logps/rejected": -2.579831600189209,
"loss": 1.0113,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -3.970212936401367,
"rewards/margins": 1.1894500255584717,
"rewards/rejected": -5.159663200378418,
"step": 391
},
{
"epoch": 0.828807400066072,
"grad_norm": 9.048048973083496,
"learning_rate": 8.529715727489912e-08,
"logits/chosen": -0.5600543022155762,
"logits/rejected": -0.5537065267562866,
"logps/chosen": -1.9846975803375244,
"logps/rejected": -2.2676990032196045,
"loss": 1.3045,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -3.969395160675049,
"rewards/margins": 0.5660032629966736,
"rewards/rejected": -4.535398006439209,
"step": 392
},
{
"epoch": 0.8309217046580774,
"grad_norm": 3.4390201568603516,
"learning_rate": 8.323893631958806e-08,
"logits/chosen": -0.6335893273353577,
"logits/rejected": -0.6190727949142456,
"logps/chosen": -1.908363938331604,
"logps/rejected": -2.510305166244507,
"loss": 1.0262,
"rewards/accuracies": 0.6875,
"rewards/chosen": -3.816727876663208,
"rewards/margins": 1.2038825750350952,
"rewards/rejected": -5.020610332489014,
"step": 393
},
{
"epoch": 0.8330360092500826,
"grad_norm": 5.347372531890869,
"learning_rate": 8.120359521481501e-08,
"logits/chosen": -0.6408874988555908,
"logits/rejected": -0.643690288066864,
"logps/chosen": -2.019606828689575,
"logps/rejected": -2.3068103790283203,
"loss": 1.3028,
"rewards/accuracies": 0.640625,
"rewards/chosen": -4.03921365737915,
"rewards/margins": 0.574406623840332,
"rewards/rejected": -4.613620758056641,
"step": 394
},
{
"epoch": 0.8351503138420878,
"grad_norm": 2.2186920642852783,
"learning_rate": 7.9191245699186e-08,
"logits/chosen": -0.7156819105148315,
"logits/rejected": -0.6814436316490173,
"logps/chosen": -2.108549118041992,
"logps/rejected": -2.608646869659424,
"loss": 1.2948,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -4.217098236083984,
"rewards/margins": 1.0001959800720215,
"rewards/rejected": -5.217293739318848,
"step": 395
},
{
"epoch": 0.8372646184340932,
"grad_norm": 2.6448726654052734,
"learning_rate": 7.720199824908691e-08,
"logits/chosen": -0.5753149390220642,
"logits/rejected": -0.6065633296966553,
"logps/chosen": -2.2337100505828857,
"logps/rejected": -2.6677160263061523,
"loss": 1.2273,
"rewards/accuracies": 0.625,
"rewards/chosen": -4.4674201011657715,
"rewards/margins": 0.868012011051178,
"rewards/rejected": -5.335432052612305,
"step": 396
},
{
"epoch": 0.8393789230260984,
"grad_norm": 6.596648216247559,
"learning_rate": 7.523596207261907e-08,
"logits/chosen": -0.5432775616645813,
"logits/rejected": -0.4928567409515381,
"logps/chosen": -2.1113924980163574,
"logps/rejected": -2.482846975326538,
"loss": 1.319,
"rewards/accuracies": 0.625,
"rewards/chosen": -4.222784996032715,
"rewards/margins": 0.7429092526435852,
"rewards/rejected": -4.965693950653076,
"step": 397
},
{
"epoch": 0.8414932276181037,
"grad_norm": 3.9646811485290527,
"learning_rate": 7.329324510360269e-08,
"logits/chosen": -0.5816119909286499,
"logits/rejected": -0.564030110836029,
"logps/chosen": -2.0296411514282227,
"logps/rejected": -2.5152456760406494,
"loss": 1.1645,
"rewards/accuracies": 0.703125,
"rewards/chosen": -4.059282302856445,
"rewards/margins": 0.9712092876434326,
"rewards/rejected": -5.030491352081299,
"step": 398
},
{
"epoch": 0.843607532210109,
"grad_norm": 2.7787463665008545,
"learning_rate": 7.137395399565249e-08,
"logits/chosen": -0.6342184543609619,
"logits/rejected": -0.6318203210830688,
"logps/chosen": -2.0209803581237793,
"logps/rejected": -2.5250658988952637,
"loss": 1.1822,
"rewards/accuracies": 0.671875,
"rewards/chosen": -4.041960716247559,
"rewards/margins": 1.0081708431243896,
"rewards/rejected": -5.050131797790527,
"step": 399
},
{
"epoch": 0.8457218368021143,
"grad_norm": 4.476524353027344,
"learning_rate": 6.947819411632222e-08,
"logits/chosen": -0.5809480547904968,
"logits/rejected": -0.5740150213241577,
"logps/chosen": -1.9072691202163696,
"logps/rejected": -2.2942898273468018,
"loss": 1.3214,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -3.8145382404327393,
"rewards/margins": 0.7740417718887329,
"rewards/rejected": -4.5885796546936035,
"step": 400
},
{
"epoch": 0.8478361413941196,
"grad_norm": 2.47866153717041,
"learning_rate": 6.760606954131965e-08,
"logits/chosen": -0.5540263652801514,
"logits/rejected": -0.5378059148788452,
"logps/chosen": -1.8337305784225464,
"logps/rejected": -2.264974594116211,
"loss": 1.2396,
"rewards/accuracies": 0.671875,
"rewards/chosen": -3.6674611568450928,
"rewards/margins": 0.8624882698059082,
"rewards/rejected": -4.529949188232422,
"step": 401
},
{
"epoch": 0.8499504459861249,
"grad_norm": 2.800645112991333,
"learning_rate": 6.575768304879292e-08,
"logits/chosen": -0.6384072303771973,
"logits/rejected": -0.6310533285140991,
"logps/chosen": -1.9723026752471924,
"logps/rejected": -2.3342039585113525,
"loss": 1.2746,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -3.9446053504943848,
"rewards/margins": 0.7238021492958069,
"rewards/rejected": -4.668407917022705,
"step": 402
},
{
"epoch": 0.8520647505781301,
"grad_norm": 2.794485092163086,
"learning_rate": 6.3933136113689e-08,
"logits/chosen": -0.7269207239151001,
"logits/rejected": -0.7003817558288574,
"logps/chosen": -1.8535553216934204,
"logps/rejected": -2.2630820274353027,
"loss": 1.1774,
"rewards/accuracies": 0.765625,
"rewards/chosen": -3.707110643386841,
"rewards/margins": 0.8190534114837646,
"rewards/rejected": -4.5261640548706055,
"step": 403
},
{
"epoch": 0.8541790551701355,
"grad_norm": 12.197257041931152,
"learning_rate": 6.213252890218162e-08,
"logits/chosen": -0.5296715497970581,
"logits/rejected": -0.5422269105911255,
"logps/chosen": -1.8217012882232666,
"logps/rejected": -2.3873071670532227,
"loss": 1.1467,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -3.643402576446533,
"rewards/margins": 1.1312119960784912,
"rewards/rejected": -4.774614334106445,
"step": 404
},
{
"epoch": 0.8562933597621407,
"grad_norm": 2.396972179412842,
"learning_rate": 6.03559602661729e-08,
"logits/chosen": -0.6527739763259888,
"logits/rejected": -0.645389199256897,
"logps/chosen": -1.9720454216003418,
"logps/rejected": -2.2900233268737793,
"loss": 1.3423,
"rewards/accuracies": 0.671875,
"rewards/chosen": -3.9440908432006836,
"rewards/margins": 0.6359554529190063,
"rewards/rejected": -4.580046653747559,
"step": 405
},
{
"epoch": 0.8584076643541461,
"grad_norm": 3.5759809017181396,
"learning_rate": 5.8603527737866307e-08,
"logits/chosen": -0.5955278277397156,
"logits/rejected": -0.583007276058197,
"logps/chosen": -1.835761547088623,
"logps/rejected": -2.2889809608459473,
"loss": 1.1015,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -3.671523094177246,
"rewards/margins": 0.9064393639564514,
"rewards/rejected": -4.5779619216918945,
"step": 406
},
{
"epoch": 0.8605219689461513,
"grad_norm": 8.514383316040039,
"learning_rate": 5.687532752441232e-08,
"logits/chosen": -0.6325979828834534,
"logits/rejected": -0.5895124077796936,
"logps/chosen": -2.0668628215789795,
"logps/rejected": -2.4919605255126953,
"loss": 1.2469,
"rewards/accuracies": 0.640625,
"rewards/chosen": -4.133725643157959,
"rewards/margins": 0.8501947522163391,
"rewards/rejected": -4.983921051025391,
"step": 407
},
{
"epoch": 0.8626362735381566,
"grad_norm": 2.7234861850738525,
"learning_rate": 5.517145450262639e-08,
"logits/chosen": -0.5355826616287231,
"logits/rejected": -0.5421631932258606,
"logps/chosen": -1.8649351596832275,
"logps/rejected": -2.5664312839508057,
"loss": 1.0119,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -3.729870319366455,
"rewards/margins": 1.4029927253723145,
"rewards/rejected": -5.132862567901611,
"step": 408
},
{
"epoch": 0.8647505781301619,
"grad_norm": 3.1693661212921143,
"learning_rate": 5.3492002213780754e-08,
"logits/chosen": -0.5687247514724731,
"logits/rejected": -0.5579267740249634,
"logps/chosen": -2.0369410514831543,
"logps/rejected": -2.4640278816223145,
"loss": 1.311,
"rewards/accuracies": 0.640625,
"rewards/chosen": -4.073882102966309,
"rewards/margins": 0.8541740775108337,
"rewards/rejected": -4.928055763244629,
"step": 409
},
{
"epoch": 0.8668648827221671,
"grad_norm": 1.8922606706619263,
"learning_rate": 5.183706285846873e-08,
"logits/chosen": -0.6247987151145935,
"logits/rejected": -0.6043509244918823,
"logps/chosen": -1.8121845722198486,
"logps/rejected": -2.2492425441741943,
"loss": 1.1291,
"rewards/accuracies": 0.671875,
"rewards/chosen": -3.6243691444396973,
"rewards/margins": 0.8741158843040466,
"rewards/rejected": -4.498485088348389,
"step": 410
},
{
"epoch": 0.8689791873141725,
"grad_norm": 5.305470943450928,
"learning_rate": 5.020672729154307e-08,
"logits/chosen": -0.5554785132408142,
"logits/rejected": -0.565819501876831,
"logps/chosen": -1.9100950956344604,
"logps/rejected": -2.4060237407684326,
"loss": 1.1576,
"rewards/accuracies": 0.640625,
"rewards/chosen": -3.820190191268921,
"rewards/margins": 0.9918570518493652,
"rewards/rejected": -4.812047481536865,
"step": 411
},
{
"epoch": 0.8710934919061777,
"grad_norm": 3.2367563247680664,
"learning_rate": 4.860108501712823e-08,
"logits/chosen": -0.6536320447921753,
"logits/rejected": -0.6901589035987854,
"logps/chosen": -1.9213619232177734,
"logps/rejected": -2.270475387573242,
"loss": 1.2711,
"rewards/accuracies": 0.703125,
"rewards/chosen": -3.842723846435547,
"rewards/margins": 0.6982269287109375,
"rewards/rejected": -4.540950775146484,
"step": 412
},
{
"epoch": 0.873207796498183,
"grad_norm": 4.2919135093688965,
"learning_rate": 4.7020224183706715e-08,
"logits/chosen": -0.7220910787582397,
"logits/rejected": -0.7015137672424316,
"logps/chosen": -1.7745577096939087,
"logps/rejected": -2.2485008239746094,
"loss": 1.0773,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -3.5491154193878174,
"rewards/margins": 0.9478861093521118,
"rewards/rejected": -4.497001647949219,
"step": 413
},
{
"epoch": 0.8753221010901883,
"grad_norm": 6.373754501342773,
"learning_rate": 4.54642315792792e-08,
"logits/chosen": -0.6177189946174622,
"logits/rejected": -0.6176800727844238,
"logps/chosen": -1.8406522274017334,
"logps/rejected": -2.368619441986084,
"loss": 1.1582,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -3.681304454803467,
"rewards/margins": 1.0559337139129639,
"rewards/rejected": -4.737238883972168,
"step": 414
},
{
"epoch": 0.8774364056821936,
"grad_norm": 4.120994567871094,
"learning_rate": 4.3933192626600725e-08,
"logits/chosen": -0.5981518626213074,
"logits/rejected": -0.5846447348594666,
"logps/chosen": -1.9437062740325928,
"logps/rejected": -2.4175901412963867,
"loss": 1.1865,
"rewards/accuracies": 0.6875,
"rewards/chosen": -3.8874125480651855,
"rewards/margins": 0.9477680921554565,
"rewards/rejected": -4.835180282592773,
"step": 415
},
{
"epoch": 0.8795507102741988,
"grad_norm": 3.618441104888916,
"learning_rate": 4.242719137849077e-08,
"logits/chosen": -0.544365644454956,
"logits/rejected": -0.5385901927947998,
"logps/chosen": -1.8662028312683105,
"logps/rejected": -2.2550435066223145,
"loss": 1.2125,
"rewards/accuracies": 0.625,
"rewards/chosen": -3.732405662536621,
"rewards/margins": 0.7776816487312317,
"rewards/rejected": -4.510087013244629,
"step": 416
},
{
"epoch": 0.8816650148662042,
"grad_norm": 8.518675804138184,
"learning_rate": 4.0946310513218726e-08,
"logits/chosen": -0.6048115491867065,
"logits/rejected": -0.5681714415550232,
"logps/chosen": -2.020745038986206,
"logps/rejected": -2.5642106533050537,
"loss": 1.1682,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -4.041490077972412,
"rewards/margins": 1.0869308710098267,
"rewards/rejected": -5.128421306610107,
"step": 417
},
{
"epoch": 0.8837793194582094,
"grad_norm": 4.693824768066406,
"learning_rate": 3.9490631329964554e-08,
"logits/chosen": -0.5653468370437622,
"logits/rejected": -0.5610933303833008,
"logps/chosen": -1.8477216958999634,
"logps/rejected": -2.280613660812378,
"loss": 1.2177,
"rewards/accuracies": 0.640625,
"rewards/chosen": -3.6954433917999268,
"rewards/margins": 0.8657836318016052,
"rewards/rejected": -4.561227321624756,
"step": 418
},
{
"epoch": 0.8858936240502148,
"grad_norm": 4.910251617431641,
"learning_rate": 3.806023374435663e-08,
"logits/chosen": -0.6456243991851807,
"logits/rejected": -0.6571968197822571,
"logps/chosen": -1.8414027690887451,
"logps/rejected": -2.2380261421203613,
"loss": 1.2081,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -3.6828055381774902,
"rewards/margins": 0.7932465076446533,
"rewards/rejected": -4.476052284240723,
"step": 419
},
{
"epoch": 0.88800792864222,
"grad_norm": 2.260300636291504,
"learning_rate": 3.665519628408331e-08,
"logits/chosen": -0.6023683547973633,
"logits/rejected": -0.6400430798530579,
"logps/chosen": -2.039283275604248,
"logps/rejected": -2.520536184310913,
"loss": 1.1629,
"rewards/accuracies": 0.671875,
"rewards/chosen": -4.078566551208496,
"rewards/margins": 0.962505042552948,
"rewards/rejected": -5.041072368621826,
"step": 420
},
{
"epoch": 0.8901222332342253,
"grad_norm": 2.411315679550171,
"learning_rate": 3.527559608458225e-08,
"logits/chosen": -0.6408150792121887,
"logits/rejected": -0.6065229177474976,
"logps/chosen": -1.91830313205719,
"logps/rejected": -2.378871440887451,
"loss": 1.1848,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -3.83660626411438,
"rewards/margins": 0.9211370944976807,
"rewards/rejected": -4.757742881774902,
"step": 421
},
{
"epoch": 0.8922365378262306,
"grad_norm": 8.43724250793457,
"learning_rate": 3.39215088848061e-08,
"logits/chosen": -0.5962439179420471,
"logits/rejected": -0.5975909233093262,
"logps/chosen": -1.9837861061096191,
"logps/rejected": -2.319769859313965,
"loss": 1.2026,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -3.9675722122192383,
"rewards/margins": 0.6719677448272705,
"rewards/rejected": -4.63953971862793,
"step": 422
},
{
"epoch": 0.8943508424182359,
"grad_norm": 2.261178731918335,
"learning_rate": 3.259300902306367e-08,
"logits/chosen": -0.6858331561088562,
"logits/rejected": -0.7034648060798645,
"logps/chosen": -1.8496602773666382,
"logps/rejected": -2.3583877086639404,
"loss": 1.1137,
"rewards/accuracies": 0.734375,
"rewards/chosen": -3.6993205547332764,
"rewards/margins": 1.0174546241760254,
"rewards/rejected": -4.716775417327881,
"step": 423
},
{
"epoch": 0.8964651470102412,
"grad_norm": 7.621473789215088,
"learning_rate": 3.129016943293955e-08,
"logits/chosen": -0.6037753224372864,
"logits/rejected": -0.5865834355354309,
"logps/chosen": -1.902024507522583,
"logps/rejected": -2.3152518272399902,
"loss": 1.2577,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -3.804049015045166,
"rewards/margins": 0.8264546394348145,
"rewards/rejected": -4.6305036544799805,
"step": 424
},
{
"epoch": 0.8985794516022465,
"grad_norm": 2.954953908920288,
"learning_rate": 3.001306163928985e-08,
"logits/chosen": -0.6682695746421814,
"logits/rejected": -0.6516857147216797,
"logps/chosen": -2.0923025608062744,
"logps/rejected": -2.4602210521698,
"loss": 1.3758,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -4.184605121612549,
"rewards/margins": 0.7358372211456299,
"rewards/rejected": -4.9204421043396,
"step": 425
},
{
"epoch": 0.9006937561942517,
"grad_norm": 4.746059894561768,
"learning_rate": 2.8761755754315663e-08,
"logits/chosen": -0.6213058829307556,
"logits/rejected": -0.6071665287017822,
"logps/chosen": -1.9309402704238892,
"logps/rejected": -2.3048858642578125,
"loss": 1.2216,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -3.8618805408477783,
"rewards/margins": 0.7478916049003601,
"rewards/rejected": -4.609771728515625,
"step": 426
},
{
"epoch": 0.902808060786257,
"grad_norm": 3.4567902088165283,
"learning_rate": 2.753632047371335e-08,
"logits/chosen": -0.5602300763130188,
"logits/rejected": -0.5994393825531006,
"logps/chosen": -2.0382192134857178,
"logps/rejected": -2.4620015621185303,
"loss": 1.1534,
"rewards/accuracies": 0.6875,
"rewards/chosen": -4.0764384269714355,
"rewards/margins": 0.8475649952888489,
"rewards/rejected": -4.9240031242370605,
"step": 427
},
{
"epoch": 0.9049223653782623,
"grad_norm": 8.650147438049316,
"learning_rate": 2.63368230729043e-08,
"logits/chosen": -0.6574521660804749,
"logits/rejected": -0.6474560499191284,
"logps/chosen": -2.01283860206604,
"logps/rejected": -2.3451762199401855,
"loss": 1.3337,
"rewards/accuracies": 0.625,
"rewards/chosen": -4.02567720413208,
"rewards/margins": 0.6646751165390015,
"rewards/rejected": -4.690352439880371,
"step": 428
},
{
"epoch": 0.9070366699702676,
"grad_norm": 4.965768337249756,
"learning_rate": 2.5163329403340593e-08,
"logits/chosen": -0.632398784160614,
"logits/rejected": -0.6226595640182495,
"logps/chosen": -1.9954252243041992,
"logps/rejected": -2.415121555328369,
"loss": 1.1249,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -3.9908504486083984,
"rewards/margins": 0.8393926024436951,
"rewards/rejected": -4.830243110656738,
"step": 429
},
{
"epoch": 0.9091509745622729,
"grad_norm": 4.165818214416504,
"learning_rate": 2.4015903888890242e-08,
"logits/chosen": -0.6372086405754089,
"logits/rejected": -0.6573516130447388,
"logps/chosen": -1.9238042831420898,
"logps/rejected": -2.3672964572906494,
"loss": 1.1372,
"rewards/accuracies": 0.65625,
"rewards/chosen": -3.8476085662841797,
"rewards/margins": 0.8869843482971191,
"rewards/rejected": -4.734592914581299,
"step": 430
},
{
"epoch": 0.9112652791542781,
"grad_norm": 4.025818347930908,
"learning_rate": 2.289460952230038e-08,
"logits/chosen": -0.6017577648162842,
"logits/rejected": -0.5835919380187988,
"logps/chosen": -1.9263951778411865,
"logps/rejected": -2.364337921142578,
"loss": 1.1519,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -3.852790355682373,
"rewards/margins": 0.875885009765625,
"rewards/rejected": -4.728675842285156,
"step": 431
},
{
"epoch": 0.9133795837462835,
"grad_norm": 2.232624053955078,
"learning_rate": 2.1799507861738788e-08,
"logits/chosen": -0.697775661945343,
"logits/rejected": -0.7254015803337097,
"logps/chosen": -1.8258415460586548,
"logps/rejected": -2.089477777481079,
"loss": 1.3136,
"rewards/accuracies": 0.578125,
"rewards/chosen": -3.6516830921173096,
"rewards/margins": 0.5272722244262695,
"rewards/rejected": -4.178955554962158,
"step": 432
},
{
"epoch": 0.9154938883382887,
"grad_norm": 5.815128326416016,
"learning_rate": 2.073065902741472e-08,
"logits/chosen": -0.5873744487762451,
"logits/rejected": -0.5638723969459534,
"logps/chosen": -1.9891620874404907,
"logps/rejected": -2.4962096214294434,
"loss": 1.1379,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -3.9783241748809814,
"rewards/margins": 1.0140951871871948,
"rewards/rejected": -4.992419242858887,
"step": 433
},
{
"epoch": 0.917608192930294,
"grad_norm": 5.057411193847656,
"learning_rate": 1.9688121698277993e-08,
"logits/chosen": -0.607324481010437,
"logits/rejected": -0.5964059829711914,
"logps/chosen": -1.8643240928649902,
"logps/rejected": -2.2751855850219727,
"loss": 1.2388,
"rewards/accuracies": 0.625,
"rewards/chosen": -3.7286481857299805,
"rewards/margins": 0.8217229843139648,
"rewards/rejected": -4.550371170043945,
"step": 434
},
{
"epoch": 0.9197224975222993,
"grad_norm": 2.25390362739563,
"learning_rate": 1.8671953108797823e-08,
"logits/chosen": -0.6268022656440735,
"logits/rejected": -0.6332954168319702,
"logps/chosen": -1.945924997329712,
"logps/rejected": -2.330981731414795,
"loss": 1.1455,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.891849994659424,
"rewards/margins": 0.7701136469841003,
"rewards/rejected": -4.66196346282959,
"step": 435
},
{
"epoch": 0.9218368021143046,
"grad_norm": 3.9572856426239014,
"learning_rate": 1.7682209045820684e-08,
"logits/chosen": -0.6522207856178284,
"logits/rejected": -0.6930267810821533,
"logps/chosen": -1.8152984380722046,
"logps/rejected": -2.0800223350524902,
"loss": 1.2978,
"rewards/accuracies": 0.6875,
"rewards/chosen": -3.630596876144409,
"rewards/margins": 0.5294479727745056,
"rewards/rejected": -4.1600446701049805,
"step": 436
},
{
"epoch": 0.9239511067063099,
"grad_norm": 1.733438491821289,
"learning_rate": 1.671894384550743e-08,
"logits/chosen": -0.5977643728256226,
"logits/rejected": -0.5842040777206421,
"logps/chosen": -1.8794972896575928,
"logps/rejected": -2.413329601287842,
"loss": 1.0233,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.7589945793151855,
"rewards/margins": 1.0676649808883667,
"rewards/rejected": -4.826659202575684,
"step": 437
},
{
"epoch": 0.9260654112983152,
"grad_norm": 2.8760743141174316,
"learning_rate": 1.5782210390350713e-08,
"logits/chosen": -0.5813508033752441,
"logits/rejected": -0.5602753758430481,
"logps/chosen": -1.7892794609069824,
"logps/rejected": -2.32309627532959,
"loss": 1.0836,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.578558921813965,
"rewards/margins": 1.0676335096359253,
"rewards/rejected": -4.64619255065918,
"step": 438
},
{
"epoch": 0.9281797158903204,
"grad_norm": 5.760490894317627,
"learning_rate": 1.4872060106271179e-08,
"logits/chosen": -0.5673117637634277,
"logits/rejected": -0.5580011606216431,
"logps/chosen": -1.943117618560791,
"logps/rejected": -2.4581894874572754,
"loss": 1.1229,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.886235237121582,
"rewards/margins": 1.030144453048706,
"rewards/rejected": -4.916378974914551,
"step": 439
},
{
"epoch": 0.9302940204823258,
"grad_norm": 5.213393211364746,
"learning_rate": 1.3988542959794625e-08,
"logits/chosen": -0.5715171098709106,
"logits/rejected": -0.5791775584220886,
"logps/chosen": -1.961305022239685,
"logps/rejected": -2.4485957622528076,
"loss": 1.0877,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -3.92261004447937,
"rewards/margins": 0.9745810627937317,
"rewards/rejected": -4.897191524505615,
"step": 440
},
{
"epoch": 0.932408325074331,
"grad_norm": 2.670029878616333,
"learning_rate": 1.3131707455309004e-08,
"logits/chosen": -0.6612206101417542,
"logits/rejected": -0.569149374961853,
"logps/chosen": -1.9947882890701294,
"logps/rejected": -2.41544771194458,
"loss": 1.2501,
"rewards/accuracies": 0.65625,
"rewards/chosen": -3.989576578140259,
"rewards/margins": 0.8413184881210327,
"rewards/rejected": -4.83089542388916,
"step": 441
},
{
"epoch": 0.9345226296663363,
"grad_norm": 2.0773093700408936,
"learning_rate": 1.230160063240121e-08,
"logits/chosen": -0.5475001335144043,
"logits/rejected": -0.6024526953697205,
"logps/chosen": -1.9972546100616455,
"logps/rejected": -2.2212231159210205,
"loss": 1.2857,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -3.994509220123291,
"rewards/margins": 0.4479368031024933,
"rewards/rejected": -4.442446231842041,
"step": 442
},
{
"epoch": 0.9366369342583416,
"grad_norm": 2.6185569763183594,
"learning_rate": 1.1498268063274697e-08,
"logits/chosen": -0.6600778102874756,
"logits/rejected": -0.6794160604476929,
"logps/chosen": -1.7303975820541382,
"logps/rejected": -2.0589568614959717,
"loss": 1.183,
"rewards/accuracies": 0.6875,
"rewards/chosen": -3.4607951641082764,
"rewards/margins": 0.6571190357208252,
"rewards/rejected": -4.117913722991943,
"step": 443
},
{
"epoch": 0.9387512388503468,
"grad_norm": 2.7385923862457275,
"learning_rate": 1.0721753850247984e-08,
"logits/chosen": -0.6136504411697388,
"logits/rejected": -0.5926402807235718,
"logps/chosen": -1.9593303203582764,
"logps/rejected": -2.446382999420166,
"loss": 1.161,
"rewards/accuracies": 0.640625,
"rewards/chosen": -3.9186606407165527,
"rewards/margins": 0.9741055965423584,
"rewards/rejected": -4.892765998840332,
"step": 444
},
{
"epoch": 0.9408655434423522,
"grad_norm": 2.006077527999878,
"learning_rate": 9.972100623333035e-09,
"logits/chosen": -0.5911227464675903,
"logits/rejected": -0.5988056063652039,
"logps/chosen": -1.9767932891845703,
"logps/rejected": -2.307847499847412,
"loss": 1.2698,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -3.9535865783691406,
"rewards/margins": 0.6621084809303284,
"rewards/rejected": -4.615694999694824,
"step": 445
},
{
"epoch": 0.9429798480343574,
"grad_norm": 3.775676965713501,
"learning_rate": 9.249349537894968e-09,
"logits/chosen": -0.5951496958732605,
"logits/rejected": -0.5602840185165405,
"logps/chosen": -2.01466965675354,
"logps/rejected": -2.404120922088623,
"loss": 1.3551,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -4.02933931350708,
"rewards/margins": 0.7789022922515869,
"rewards/rejected": -4.808241844177246,
"step": 446
},
{
"epoch": 0.9450941526263628,
"grad_norm": 10.657898902893066,
"learning_rate": 8.553540272392967e-09,
"logits/chosen": -0.616013765335083,
"logits/rejected": -0.6068493127822876,
"logps/chosen": -1.9523563385009766,
"logps/rejected": -2.3371798992156982,
"loss": 1.2264,
"rewards/accuracies": 0.640625,
"rewards/chosen": -3.904712677001953,
"rewards/margins": 0.7696471810340881,
"rewards/rejected": -4.6743597984313965,
"step": 447
},
{
"epoch": 0.947208457218368,
"grad_norm": 5.239955902099609,
"learning_rate": 7.884711026201584e-09,
"logits/chosen": -0.5559091567993164,
"logits/rejected": -0.5499454140663147,
"logps/chosen": -1.9888339042663574,
"logps/rejected": -2.5645201206207275,
"loss": 1.1615,
"rewards/accuracies": 0.703125,
"rewards/chosen": -3.977667808532715,
"rewards/margins": 1.1513725519180298,
"rewards/rejected": -5.129040241241455,
"step": 448
},
{
"epoch": 0.9493227618103733,
"grad_norm": 4.970836162567139,
"learning_rate": 7.242898517513863e-09,
"logits/chosen": -0.6270098686218262,
"logits/rejected": -0.5990616083145142,
"logps/chosen": -2.0393564701080322,
"logps/rejected": -2.6450533866882324,
"loss": 1.0316,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -4.0787129402160645,
"rewards/margins": 1.2113933563232422,
"rewards/rejected": -5.290106773376465,
"step": 449
},
{
"epoch": 0.9514370664023786,
"grad_norm": 4.401031494140625,
"learning_rate": 6.62813798132561e-09,
"logits/chosen": -0.6103833913803101,
"logits/rejected": -0.6355498433113098,
"logps/chosen": -1.9900306463241577,
"logps/rejected": -2.4799742698669434,
"loss": 1.1272,
"rewards/accuracies": 0.734375,
"rewards/chosen": -3.9800612926483154,
"rewards/margins": 0.979887843132019,
"rewards/rejected": -4.959948539733887,
"step": 450
},
{
"epoch": 0.9535513709943839,
"grad_norm": 5.162088871002197,
"learning_rate": 6.040463167500509e-09,
"logits/chosen": -0.6351377367973328,
"logits/rejected": -0.6445170044898987,
"logps/chosen": -2.017266035079956,
"logps/rejected": -2.4103317260742188,
"loss": 1.2591,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -4.034532070159912,
"rewards/margins": 0.7861310243606567,
"rewards/rejected": -4.8206634521484375,
"step": 451
},
{
"epoch": 0.9556656755863892,
"grad_norm": 3.158773422241211,
"learning_rate": 5.4799063389179834e-09,
"logits/chosen": -0.6216992139816284,
"logits/rejected": -0.6317836046218872,
"logps/chosen": -1.9916179180145264,
"logps/rejected": -2.476783275604248,
"loss": 1.192,
"rewards/accuracies": 0.671875,
"rewards/chosen": -3.9832358360290527,
"rewards/margins": 0.970331072807312,
"rewards/rejected": -4.953566551208496,
"step": 452
},
{
"epoch": 0.9577799801783945,
"grad_norm": 4.7540435791015625,
"learning_rate": 4.946498269701616e-09,
"logits/chosen": -0.652457594871521,
"logits/rejected": -0.6148388385772705,
"logps/chosen": -2.0300891399383545,
"logps/rejected": -2.5610132217407227,
"loss": 1.0769,
"rewards/accuracies": 0.734375,
"rewards/chosen": -4.060178279876709,
"rewards/margins": 1.061848759651184,
"rewards/rejected": -5.122026443481445,
"step": 453
},
{
"epoch": 0.9598942847703997,
"grad_norm": 4.686556339263916,
"learning_rate": 4.440268243529666e-09,
"logits/chosen": -0.5588012337684631,
"logits/rejected": -0.5526341199874878,
"logps/chosen": -1.8666988611221313,
"logps/rejected": -2.3390815258026123,
"loss": 1.1768,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -3.7333977222442627,
"rewards/margins": 0.9447645545005798,
"rewards/rejected": -4.678163051605225,
"step": 454
},
{
"epoch": 0.9620085893624051,
"grad_norm": 2.740269422531128,
"learning_rate": 3.961244052027413e-09,
"logits/chosen": -0.6438521146774292,
"logits/rejected": -0.6682748198509216,
"logps/chosen": -2.0076475143432617,
"logps/rejected": -2.388810396194458,
"loss": 1.2689,
"rewards/accuracies": 0.640625,
"rewards/chosen": -4.015295028686523,
"rewards/margins": 0.7623259425163269,
"rewards/rejected": -4.777620792388916,
"step": 455
},
{
"epoch": 0.9641228939544103,
"grad_norm": 2.9197144508361816,
"learning_rate": 3.509451993241541e-09,
"logits/chosen": -0.5822494029998779,
"logits/rejected": -0.5853508114814758,
"logps/chosen": -1.8848122358322144,
"logps/rejected": -2.4192898273468018,
"loss": 1.0924,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.7696244716644287,
"rewards/margins": 1.0689555406570435,
"rewards/rejected": -4.8385796546936035,
"step": 456
},
{
"epoch": 0.9662371985464155,
"grad_norm": 4.501737117767334,
"learning_rate": 3.084916870196297e-09,
"logits/chosen": -0.5652188658714294,
"logits/rejected": -0.5740686655044556,
"logps/chosen": -1.9216543436050415,
"logps/rejected": -2.23102068901062,
"loss": 1.2907,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -3.843308687210083,
"rewards/margins": 0.618732750415802,
"rewards/rejected": -4.46204137802124,
"step": 457
},
{
"epoch": 0.9683515031384209,
"grad_norm": 3.512376070022583,
"learning_rate": 2.687661989531964e-09,
"logits/chosen": -0.6515664458274841,
"logits/rejected": -0.6550417542457581,
"logps/chosen": -1.9334843158721924,
"logps/rejected": -2.2688543796539307,
"loss": 1.2578,
"rewards/accuracies": 0.578125,
"rewards/chosen": -3.8669686317443848,
"rewards/margins": 0.67074054479599,
"rewards/rejected": -4.537708759307861,
"step": 458
},
{
"epoch": 0.9704658077304261,
"grad_norm": 2.165844678878784,
"learning_rate": 2.3177091602251675e-09,
"logits/chosen": -0.6218724250793457,
"logits/rejected": -0.5920112729072571,
"logps/chosen": -1.8584281206130981,
"logps/rejected": -2.366225242614746,
"loss": 1.1553,
"rewards/accuracies": 0.6875,
"rewards/chosen": -3.7168562412261963,
"rewards/margins": 1.0155941247940063,
"rewards/rejected": -4.732450485229492,
"step": 459
},
{
"epoch": 0.9725801123224315,
"grad_norm": 1.7227884531021118,
"learning_rate": 1.975078692391552e-09,
"logits/chosen": -0.5791985988616943,
"logits/rejected": -0.5785022974014282,
"logps/chosen": -1.8981022834777832,
"logps/rejected": -2.3716633319854736,
"loss": 1.1642,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -3.7962045669555664,
"rewards/margins": 0.9471220970153809,
"rewards/rejected": -4.743326663970947,
"step": 460
},
{
"epoch": 0.9746944169144367,
"grad_norm": 3.150090217590332,
"learning_rate": 1.659789396171063e-09,
"logits/chosen": -0.6548072695732117,
"logits/rejected": -0.6290433406829834,
"logps/chosen": -2.0168204307556152,
"logps/rejected": -2.520479679107666,
"loss": 1.0736,
"rewards/accuracies": 0.765625,
"rewards/chosen": -4.0336408615112305,
"rewards/margins": 1.0073186159133911,
"rewards/rejected": -5.040959358215332,
"step": 461
},
{
"epoch": 0.976808721506442,
"grad_norm": 1.256157636642456,
"learning_rate": 1.37185858069494e-09,
"logits/chosen": -0.7094852328300476,
"logits/rejected": -0.7226460576057434,
"logps/chosen": -1.8896048069000244,
"logps/rejected": -2.4871973991394043,
"loss": 1.0536,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -3.779209613800049,
"rewards/margins": 1.1951854228973389,
"rewards/rejected": -4.974394798278809,
"step": 462
},
{
"epoch": 0.9789230260984473,
"grad_norm": 2.8358895778656006,
"learning_rate": 1.1113020531357541e-09,
"logits/chosen": -0.6778469085693359,
"logits/rejected": -0.6957201957702637,
"logps/chosen": -2.0275380611419678,
"logps/rejected": -2.470618963241577,
"loss": 1.1801,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -4.0550761222839355,
"rewards/margins": 0.886161208152771,
"rewards/rejected": -4.941237926483154,
"step": 463
},
{
"epoch": 0.9810373306904526,
"grad_norm": 2.8881914615631104,
"learning_rate": 8.781341178393242e-10,
"logits/chosen": -0.5639821887016296,
"logits/rejected": -0.5891467928886414,
"logps/chosen": -2.0047199726104736,
"logps/rejected": -2.522782802581787,
"loss": 1.1948,
"rewards/accuracies": 0.640625,
"rewards/chosen": -4.009439945220947,
"rewards/margins": 1.0361257791519165,
"rewards/rejected": -5.045565605163574,
"step": 464
},
{
"epoch": 0.9831516352824579,
"grad_norm": 3.421194314956665,
"learning_rate": 6.723675755396229e-10,
"logits/chosen": -0.540326714515686,
"logits/rejected": -0.5159227252006531,
"logps/chosen": -1.88228178024292,
"logps/rejected": -2.2003138065338135,
"loss": 1.2191,
"rewards/accuracies": 0.59375,
"rewards/chosen": -3.76456356048584,
"rewards/margins": 0.6360642313957214,
"rewards/rejected": -4.400627613067627,
"step": 465
},
{
"epoch": 0.9852659398744632,
"grad_norm": 4.243066310882568,
"learning_rate": 4.940137226560615e-10,
"logits/chosen": -0.6175463795661926,
"logits/rejected": -0.6400432586669922,
"logps/chosen": -1.9547748565673828,
"logps/rejected": -2.4598965644836426,
"loss": 1.2589,
"rewards/accuracies": 0.640625,
"rewards/chosen": -3.9095497131347656,
"rewards/margins": 1.0102434158325195,
"rewards/rejected": -4.919793128967285,
"step": 466
},
{
"epoch": 0.9873802444664684,
"grad_norm": 3.3425028324127197,
"learning_rate": 3.430823506730962e-10,
"logits/chosen": -0.5236034393310547,
"logits/rejected": -0.48699086904525757,
"logps/chosen": -2.167372465133667,
"logps/rejected": -2.687620162963867,
"loss": 1.2024,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -4.334744930267334,
"rewards/margins": 1.0404952764511108,
"rewards/rejected": -5.375240325927734,
"step": 467
},
{
"epoch": 0.9894945490584738,
"grad_norm": 3.1803112030029297,
"learning_rate": 2.1958174560282594e-10,
"logits/chosen": -0.6515716910362244,
"logits/rejected": -0.6526726484298706,
"logps/chosen": -2.0350496768951416,
"logps/rejected": -2.4857177734375,
"loss": 1.1524,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -4.070099353790283,
"rewards/margins": 0.9013361930847168,
"rewards/rejected": -4.971435546875,
"step": 468
},
{
"epoch": 0.991608853650479,
"grad_norm": 2.8402769565582275,
"learning_rate": 1.2351868753018858e-10,
"logits/chosen": -0.5555111765861511,
"logits/rejected": -0.5084383487701416,
"logps/chosen": -1.9741497039794922,
"logps/rejected": -2.5360653400421143,
"loss": 1.0956,
"rewards/accuracies": 0.703125,
"rewards/chosen": -3.9482994079589844,
"rewards/margins": 1.1238315105438232,
"rewards/rejected": -5.0721306800842285,
"step": 469
},
{
"epoch": 0.9937231582424844,
"grad_norm": 14.110418319702148,
"learning_rate": 5.4898450240536964e-11,
"logits/chosen": -0.6210866570472717,
"logits/rejected": -0.614806056022644,
"logps/chosen": -2.0763094425201416,
"logps/rejected": -2.5026116371154785,
"loss": 1.2184,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -4.152618885040283,
"rewards/margins": 0.8526046276092529,
"rewards/rejected": -5.005223274230957,
"step": 470
},
{
"epoch": 0.9958374628344896,
"grad_norm": 2.8393566608428955,
"learning_rate": 1.3724800930314805e-11,
"logits/chosen": -0.5895847678184509,
"logits/rejected": -0.6269129514694214,
"logps/chosen": -1.8787530660629272,
"logps/rejected": -2.4467647075653076,
"loss": 1.0714,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -3.7575061321258545,
"rewards/margins": 1.1360235214233398,
"rewards/rejected": -4.893529415130615,
"step": 471
},
{
"epoch": 0.9979517674264948,
"grad_norm": 3.9959075450897217,
"learning_rate": 0.0,
"logits/chosen": -0.6461910009384155,
"logits/rejected": -0.6503991484642029,
"logps/chosen": -1.798724889755249,
"logps/rejected": -2.3589823246002197,
"loss": 1.0133,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -3.597449779510498,
"rewards/margins": 1.1205153465270996,
"rewards/rejected": -4.7179646492004395,
"step": 472
},
{
"epoch": 0.9979517674264948,
"step": 472,
"total_flos": 0.0,
"train_loss": 1.280224425307775,
"train_runtime": 38087.5267,
"train_samples_per_second": 1.589,
"train_steps_per_second": 0.012
}
],
"logging_steps": 1,
"max_steps": 472,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 64,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}