RobustRDP / trainer_state.json
Jingcz's picture
Upload folder using huggingface_hub
e3fe7f5 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.998165137614679,
"eval_steps": 500,
"global_step": 221,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004516584333098095,
"grad_norm": 17.491543776149005,
"learning_rate": 4.285714285714285e-08,
"logits/chosen": -3.703737258911133,
"logits/rejected": -3.642177104949951,
"logps/chosen": -230.21658325195312,
"logps/rejected": -213.08389282226562,
"loss": 0.8161,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.00903316866619619,
"grad_norm": 16.5951979105037,
"learning_rate": 8.57142857142857e-08,
"logits/chosen": -3.811067819595337,
"logits/rejected": -3.761146306991577,
"logps/chosen": -186.53829956054688,
"logps/rejected": -172.09280395507812,
"loss": 0.818,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 2
},
{
"epoch": 0.013549752999294284,
"grad_norm": 16.426043598595893,
"learning_rate": 1.2857142857142855e-07,
"logits/chosen": -3.673379898071289,
"logits/rejected": -3.574615955352783,
"logps/chosen": -185.69815063476562,
"logps/rejected": -167.53453063964844,
"loss": 0.8218,
"rewards/accuracies": 0.421875,
"rewards/chosen": -0.00822072196751833,
"rewards/margins": -0.011108924634754658,
"rewards/rejected": 0.0028882024344056845,
"step": 3
},
{
"epoch": 0.01806633733239238,
"grad_norm": 17.182455484491946,
"learning_rate": 1.714285714285714e-07,
"logits/chosen": -3.6595711708068848,
"logits/rejected": -3.664757490158081,
"logps/chosen": -233.35186767578125,
"logps/rejected": -220.01324462890625,
"loss": 0.8183,
"rewards/accuracies": 0.453125,
"rewards/chosen": -0.0016412150580435991,
"rewards/margins": -0.0031568286940455437,
"rewards/rejected": 0.0015156148001551628,
"step": 4
},
{
"epoch": 0.022582921665490474,
"grad_norm": 15.69002599516555,
"learning_rate": 2.1428571428571428e-07,
"logits/chosen": -3.727100133895874,
"logits/rejected": -3.737985134124756,
"logps/chosen": -190.59376525878906,
"logps/rejected": -175.35711669921875,
"loss": 0.8179,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.007597364019602537,
"rewards/margins": -0.0018714312463998795,
"rewards/rejected": -0.005725932773202658,
"step": 5
},
{
"epoch": 0.02709950599858857,
"grad_norm": 15.612644086790919,
"learning_rate": 2.571428571428571e-07,
"logits/chosen": -3.6620445251464844,
"logits/rejected": -3.6660032272338867,
"logps/chosen": -184.26095581054688,
"logps/rejected": -162.46971130371094,
"loss": 0.8186,
"rewards/accuracies": 0.546875,
"rewards/chosen": -0.008165668696165085,
"rewards/margins": -0.008361553773283958,
"rewards/rejected": 0.00019588530994951725,
"step": 6
},
{
"epoch": 0.031616090331686664,
"grad_norm": 13.683214125166593,
"learning_rate": 3e-07,
"logits/chosen": -3.690432071685791,
"logits/rejected": -3.5761916637420654,
"logps/chosen": -170.06161499023438,
"logps/rejected": -156.4132080078125,
"loss": 0.8176,
"rewards/accuracies": 0.359375,
"rewards/chosen": -0.00132226780988276,
"rewards/margins": -0.01818685233592987,
"rewards/rejected": 0.016864586621522903,
"step": 7
},
{
"epoch": 0.03613267466478476,
"grad_norm": 16.4402451983829,
"learning_rate": 2.999838368626891e-07,
"logits/chosen": -3.741443157196045,
"logits/rejected": -3.670546293258667,
"logps/chosen": -196.43753051757812,
"logps/rejected": -175.54696655273438,
"loss": 0.8229,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.003386292140930891,
"rewards/margins": -0.005730946082621813,
"rewards/rejected": 0.002344653941690922,
"step": 8
},
{
"epoch": 0.04064925899788285,
"grad_norm": 17.452034360978363,
"learning_rate": 2.9993535093404974e-07,
"logits/chosen": -3.6631717681884766,
"logits/rejected": -3.5741701126098633,
"logps/chosen": -229.43096923828125,
"logps/rejected": -198.76205444335938,
"loss": 0.8287,
"rewards/accuracies": 0.421875,
"rewards/chosen": -0.007437766529619694,
"rewards/margins": -0.012869942933321,
"rewards/rejected": 0.005432176869362593,
"step": 9
},
{
"epoch": 0.04516584333098095,
"grad_norm": 18.322329811535944,
"learning_rate": 2.998545526632117e-07,
"logits/chosen": -3.725301504135132,
"logits/rejected": -3.667342185974121,
"logps/chosen": -203.80462646484375,
"logps/rejected": -184.80245971679688,
"loss": 0.8243,
"rewards/accuracies": 0.390625,
"rewards/chosen": -0.009436231106519699,
"rewards/margins": -0.01625261828303337,
"rewards/rejected": 0.0068163881078362465,
"step": 10
},
{
"epoch": 0.04968242766407904,
"grad_norm": 18.170319693328302,
"learning_rate": 2.9974145946288874e-07,
"logits/chosen": -3.6613407135009766,
"logits/rejected": -3.6700329780578613,
"logps/chosen": -222.37948608398438,
"logps/rejected": -197.45956420898438,
"loss": 0.8201,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.00792229175567627,
"rewards/margins": -0.00381114287301898,
"rewards/rejected": -0.004111149813979864,
"step": 11
},
{
"epoch": 0.05419901199717714,
"grad_norm": 16.73406724526876,
"learning_rate": 2.9959609570562665e-07,
"logits/chosen": -3.6817235946655273,
"logits/rejected": -3.6180002689361572,
"logps/chosen": -207.94598388671875,
"logps/rejected": -185.84361267089844,
"loss": 0.8223,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0013277027755975723,
"rewards/margins": -0.012387244962155819,
"rewards/rejected": 0.011059543117880821,
"step": 12
},
{
"epoch": 0.05871559633027523,
"grad_norm": 14.830646302858986,
"learning_rate": 2.994184927185504e-07,
"logits/chosen": -3.5986547470092773,
"logits/rejected": -3.653524875640869,
"logps/chosen": -195.15597534179688,
"logps/rejected": -176.31295776367188,
"loss": 0.8152,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.0033681748900562525,
"rewards/margins": 0.006227727048099041,
"rewards/rejected": -0.002859552390873432,
"step": 13
},
{
"epoch": 0.06323218066337333,
"grad_norm": 14.922261930025773,
"learning_rate": 2.9920868877661274e-07,
"logits/chosen": -3.749242067337036,
"logits/rejected": -3.669080972671509,
"logps/chosen": -187.9488525390625,
"logps/rejected": -171.089111328125,
"loss": 0.817,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.010204151272773743,
"rewards/margins": -0.0027150483801960945,
"rewards/rejected": -0.007489103823900223,
"step": 14
},
{
"epoch": 0.06774876499647142,
"grad_norm": 16.851808937739815,
"learning_rate": 2.9896672909434605e-07,
"logits/chosen": -3.7500953674316406,
"logits/rejected": -3.6712448596954346,
"logps/chosen": -207.16380310058594,
"logps/rejected": -186.6913299560547,
"loss": 0.8188,
"rewards/accuracies": 0.484375,
"rewards/chosen": -0.008293930441141129,
"rewards/margins": -0.005823222920298576,
"rewards/rejected": -0.0024707079865038395,
"step": 15
},
{
"epoch": 0.07226534932956952,
"grad_norm": 14.552003861923032,
"learning_rate": 2.986926658161179e-07,
"logits/chosen": -3.640725612640381,
"logits/rejected": -3.612764835357666,
"logps/chosen": -192.30868530273438,
"logps/rejected": -175.70521545410156,
"loss": 0.8232,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0005644555203616619,
"rewards/margins": -0.005334064364433289,
"rewards/rejected": 0.005898520816117525,
"step": 16
},
{
"epoch": 0.07678193366266761,
"grad_norm": 16.088064283526844,
"learning_rate": 2.9838655800489354e-07,
"logits/chosen": -3.6310815811157227,
"logits/rejected": -3.6059412956237793,
"logps/chosen": -201.4388427734375,
"logps/rejected": -183.0606689453125,
"loss": 0.8124,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.0131416916847229,
"rewards/margins": 0.013241738080978394,
"rewards/rejected": -0.00010004616342484951,
"step": 17
},
{
"epoch": 0.0812985179957657,
"grad_norm": 14.992244007771152,
"learning_rate": 2.980484716295075e-07,
"logits/chosen": -3.733997344970703,
"logits/rejected": -3.698491096496582,
"logps/chosen": -188.22225952148438,
"logps/rejected": -173.397705078125,
"loss": 0.8156,
"rewards/accuracies": 0.484375,
"rewards/chosen": 0.005783616099506617,
"rewards/margins": 0.006108994595706463,
"rewards/rejected": -0.00032537919469177723,
"step": 18
},
{
"epoch": 0.0858151023288638,
"grad_norm": 19.12841459122326,
"learning_rate": 2.976784795504466e-07,
"logits/chosen": -3.6905295848846436,
"logits/rejected": -3.5900840759277344,
"logps/chosen": -203.93548583984375,
"logps/rejected": -177.45327758789062,
"loss": 0.8194,
"rewards/accuracies": 0.546875,
"rewards/chosen": -0.008039581589400768,
"rewards/margins": 0.008988430723547935,
"rewards/rejected": -0.01702801324427128,
"step": 19
},
{
"epoch": 0.0903316866619619,
"grad_norm": 16.487134694988207,
"learning_rate": 2.972766615041477e-07,
"logits/chosen": -3.625434398651123,
"logits/rejected": -3.5620625019073486,
"logps/chosen": -230.413818359375,
"logps/rejected": -209.59951782226562,
"loss": 0.8031,
"rewards/accuracies": 0.640625,
"rewards/chosen": 0.0009453308302909136,
"rewards/margins": 0.023260660469532013,
"rewards/rejected": -0.02231532707810402,
"step": 20
},
{
"epoch": 0.09484827099505999,
"grad_norm": 18.422012266521907,
"learning_rate": 2.968431040858144e-07,
"logits/chosen": -3.6120433807373047,
"logits/rejected": -3.5835161209106445,
"logps/chosen": -179.35166931152344,
"logps/rejected": -174.52101135253906,
"loss": 0.8095,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.007007395848631859,
"rewards/margins": 0.009391836822032928,
"rewards/rejected": -0.016399234533309937,
"step": 21
},
{
"epoch": 0.09936485532815809,
"grad_norm": 19.19498923375192,
"learning_rate": 2.963779007307544e-07,
"logits/chosen": -3.5445475578308105,
"logits/rejected": -3.5850906372070312,
"logps/chosen": -232.3138427734375,
"logps/rejected": -210.61709594726562,
"loss": 0.8196,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0029985038563609123,
"rewards/margins": 0.020012138411402702,
"rewards/rejected": -0.017013631761074066,
"step": 22
},
{
"epoch": 0.10388143966125618,
"grad_norm": 15.614857590390317,
"learning_rate": 2.958811516942438e-07,
"logits/chosen": -3.697209358215332,
"logits/rejected": -3.624351978302002,
"logps/chosen": -197.09347534179688,
"logps/rejected": -177.77401733398438,
"loss": 0.804,
"rewards/accuracies": 0.609375,
"rewards/chosen": 0.0026644528843462467,
"rewards/margins": 0.028430193662643433,
"rewards/rejected": -0.025765739381313324,
"step": 23
},
{
"epoch": 0.10839802399435428,
"grad_norm": 15.89381246255073,
"learning_rate": 2.953529640299211e-07,
"logits/chosen": -3.600754737854004,
"logits/rejected": -3.5322327613830566,
"logps/chosen": -228.12118530273438,
"logps/rejected": -209.07611083984375,
"loss": 0.8188,
"rewards/accuracies": 0.453125,
"rewards/chosen": -0.006520797498524189,
"rewards/margins": -0.0006878629792481661,
"rewards/rejected": -0.0058329347521066666,
"step": 24
},
{
"epoch": 0.11291460832745237,
"grad_norm": 15.396893791413314,
"learning_rate": 2.947934515667162e-07,
"logits/chosen": -3.555856466293335,
"logits/rejected": -3.548595905303955,
"logps/chosen": -209.21588134765625,
"logps/rejected": -191.61837768554688,
"loss": 0.8118,
"rewards/accuracies": 0.578125,
"rewards/chosen": -0.0008083764696493745,
"rewards/margins": 0.019747722893953323,
"rewards/rejected": -0.020556099712848663,
"step": 25
},
{
"epoch": 0.11743119266055047,
"grad_norm": 17.20718944198085,
"learning_rate": 2.9420273488431933e-07,
"logits/chosen": -3.8108675479888916,
"logits/rejected": -3.727308750152588,
"logps/chosen": -208.67276000976562,
"logps/rejected": -187.67626953125,
"loss": 0.8029,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.009361563250422478,
"rewards/margins": 0.012206509709358215,
"rewards/rejected": -0.021568071097135544,
"step": 26
},
{
"epoch": 0.12194777699364856,
"grad_norm": 15.82300857880186,
"learning_rate": 2.9358094128719524e-07,
"logits/chosen": -3.725928544998169,
"logits/rejected": -3.6373510360717773,
"logps/chosen": -184.15992736816406,
"logps/rejected": -163.3759307861328,
"loss": 0.8106,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0011545311426743865,
"rewards/margins": 0.01487318892031908,
"rewards/rejected": -0.016027718782424927,
"step": 27
},
{
"epoch": 0.12646436132674665,
"grad_norm": 17.170961024910188,
"learning_rate": 2.929282047771477e-07,
"logits/chosen": -3.7685747146606445,
"logits/rejected": -3.627628803253174,
"logps/chosen": -180.48487854003906,
"logps/rejected": -159.71482849121094,
"loss": 0.8028,
"rewards/accuracies": 0.546875,
"rewards/chosen": -0.006781768519431353,
"rewards/margins": 0.012908656150102615,
"rewards/rejected": -0.01969042420387268,
"step": 28
},
{
"epoch": 0.13098094565984475,
"grad_norm": 16.359167675199306,
"learning_rate": 2.9224466602444125e-07,
"logits/chosen": -3.7441000938415527,
"logits/rejected": -3.6602091789245605,
"logps/chosen": -188.38925170898438,
"logps/rejected": -167.985595703125,
"loss": 0.8037,
"rewards/accuracies": 0.578125,
"rewards/chosen": -0.003988177981227636,
"rewards/margins": 0.01803458295762539,
"rewards/rejected": -0.022022761404514313,
"step": 29
},
{
"epoch": 0.13549752999294284,
"grad_norm": 17.676336950925887,
"learning_rate": 2.9153047233748554e-07,
"logits/chosen": -3.643461227416992,
"logits/rejected": -3.617875814437866,
"logps/chosen": -225.23348999023438,
"logps/rejected": -212.8236541748047,
"loss": 0.8048,
"rewards/accuracies": 0.640625,
"rewards/chosen": 0.004974519833922386,
"rewards/margins": 0.03113599866628647,
"rewards/rejected": -0.026161476969718933,
"step": 30
},
{
"epoch": 0.14001411432604094,
"grad_norm": 14.414082201816626,
"learning_rate": 2.907857776310889e-07,
"logits/chosen": -3.666928291320801,
"logits/rejected": -3.638302803039551,
"logps/chosen": -191.73712158203125,
"logps/rejected": -166.03549194335938,
"loss": 0.8045,
"rewards/accuracies": 0.609375,
"rewards/chosen": 0.004222148098051548,
"rewards/margins": 0.035250477492809296,
"rewards/rejected": -0.031028330326080322,
"step": 31
},
{
"epoch": 0.14453069865913903,
"grad_norm": 15.20568282502077,
"learning_rate": 2.9001074239328855e-07,
"logits/chosen": -3.678450345993042,
"logits/rejected": -3.6584630012512207,
"logps/chosen": -189.22283935546875,
"logps/rejected": -172.16836547851562,
"loss": 0.8085,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.005335791036486626,
"rewards/margins": 0.0169003177434206,
"rewards/rejected": -0.022236105054616928,
"step": 32
},
{
"epoch": 0.14904728299223713,
"grad_norm": 17.666411807362163,
"learning_rate": 2.892055336507641e-07,
"logits/chosen": -3.6488301753997803,
"logits/rejected": -3.6809892654418945,
"logps/chosen": -221.47088623046875,
"logps/rejected": -208.6617431640625,
"loss": 0.7961,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.0049254028126597404,
"rewards/margins": 0.04181712493300438,
"rewards/rejected": -0.03689172491431236,
"step": 33
},
{
"epoch": 0.15356386732533522,
"grad_norm": 15.904931812764481,
"learning_rate": 2.883703249328419e-07,
"logits/chosen": -3.7053463459014893,
"logits/rejected": -3.6859936714172363,
"logps/chosen": -193.99462890625,
"logps/rejected": -168.01174926757812,
"loss": 0.7982,
"rewards/accuracies": 0.703125,
"rewards/chosen": 0.005228930618613958,
"rewards/margins": 0.04429711773991585,
"rewards/rejected": -0.03906818851828575,
"step": 34
},
{
"epoch": 0.15808045165843332,
"grad_norm": 17.574931019060468,
"learning_rate": 2.8750529623409767e-07,
"logits/chosen": -3.74350643157959,
"logits/rejected": -3.688340902328491,
"logps/chosen": -227.44448852539062,
"logps/rejected": -207.76531982421875,
"loss": 0.8001,
"rewards/accuracies": 0.609375,
"rewards/chosen": -0.013922490179538727,
"rewards/margins": 0.041697580367326736,
"rewards/rejected": -0.05562007054686546,
"step": 35
},
{
"epoch": 0.1625970359915314,
"grad_norm": 16.144138203267183,
"learning_rate": 2.866106339755666e-07,
"logits/chosen": -3.6101019382476807,
"logits/rejected": -3.516798496246338,
"logps/chosen": -225.3680877685547,
"logps/rejected": -200.65811157226562,
"loss": 0.7982,
"rewards/accuracies": 0.703125,
"rewards/chosen": 0.00023476500064134598,
"rewards/margins": 0.043288152664899826,
"rewards/rejected": -0.043053388595581055,
"step": 36
},
{
"epoch": 0.1671136203246295,
"grad_norm": 18.051062496264226,
"learning_rate": 2.856865309645679e-07,
"logits/chosen": -3.668738603591919,
"logits/rejected": -3.5682475566864014,
"logps/chosen": -216.2345733642578,
"logps/rejected": -190.3118896484375,
"loss": 0.7908,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.005171060096472502,
"rewards/margins": 0.06707943975925446,
"rewards/rejected": -0.06190839037299156,
"step": 37
},
{
"epoch": 0.1716302046577276,
"grad_norm": 15.250674643466992,
"learning_rate": 2.847331863531529e-07,
"logits/chosen": -3.6172351837158203,
"logits/rejected": -3.513965129852295,
"logps/chosen": -207.12973022460938,
"logps/rejected": -186.56016540527344,
"loss": 0.7973,
"rewards/accuracies": 0.640625,
"rewards/chosen": -0.00749462703242898,
"rewards/margins": 0.036814432591199875,
"rewards/rejected": -0.04430905729532242,
"step": 38
},
{
"epoch": 0.1761467889908257,
"grad_norm": 13.727288814848803,
"learning_rate": 2.8375080559518633e-07,
"logits/chosen": -3.677856922149658,
"logits/rejected": -3.651683807373047,
"logps/chosen": -173.89752197265625,
"logps/rejected": -161.67294311523438,
"loss": 0.8033,
"rewards/accuracies": 0.640625,
"rewards/chosen": -0.005101449321955442,
"rewards/margins": 0.03479147329926491,
"rewards/rejected": -0.03989291936159134,
"step": 39
},
{
"epoch": 0.1806633733239238,
"grad_norm": 16.031051528638532,
"learning_rate": 2.827396004020694e-07,
"logits/chosen": -3.657525062561035,
"logits/rejected": -3.6614298820495605,
"logps/chosen": -187.38975524902344,
"logps/rejected": -174.07693481445312,
"loss": 0.7971,
"rewards/accuracies": 0.640625,
"rewards/chosen": -0.006134797818958759,
"rewards/margins": 0.04029170051217079,
"rewards/rejected": -0.046426497399806976,
"step": 40
},
{
"epoch": 0.1851799576570219,
"grad_norm": 14.72860095286963,
"learning_rate": 2.8169978869711385e-07,
"logits/chosen": -3.7374589443206787,
"logits/rejected": -3.656147003173828,
"logps/chosen": -187.12936401367188,
"logps/rejected": -161.41815185546875,
"loss": 0.787,
"rewards/accuracies": 0.703125,
"rewards/chosen": 0.004431622102856636,
"rewards/margins": 0.05767218768596649,
"rewards/rejected": -0.053240567445755005,
"step": 41
},
{
"epoch": 0.18969654199011998,
"grad_norm": 17.441579066359495,
"learning_rate": 2.806315945685779e-07,
"logits/chosen": -3.6515302658081055,
"logits/rejected": -3.551766872406006,
"logps/chosen": -250.11190795898438,
"logps/rejected": -221.34317016601562,
"loss": 0.7699,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.006256776861846447,
"rewards/margins": 0.09309859573841095,
"rewards/rejected": -0.08684182167053223,
"step": 42
},
{
"epoch": 0.19421312632321808,
"grad_norm": 17.048744298556095,
"learning_rate": 2.7953524822137317e-07,
"logits/chosen": -3.7105650901794434,
"logits/rejected": -3.6019039154052734,
"logps/chosen": -208.5770263671875,
"logps/rejected": -180.16976928710938,
"loss": 0.8027,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.018564769998192787,
"rewards/margins": 0.04158995673060417,
"rewards/rejected": -0.06015472859144211,
"step": 43
},
{
"epoch": 0.19872971065631617,
"grad_norm": 16.16815408757425,
"learning_rate": 2.784109859274537e-07,
"logits/chosen": -3.597439765930176,
"logits/rejected": -3.6136202812194824,
"logps/chosen": -211.610107421875,
"logps/rejected": -191.22201538085938,
"loss": 0.7829,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.003241416532546282,
"rewards/margins": 0.07393845170736313,
"rewards/rejected": -0.07717986404895782,
"step": 44
},
{
"epoch": 0.20324629498941427,
"grad_norm": 13.017137061508738,
"learning_rate": 2.7725904997489726e-07,
"logits/chosen": -3.613895893096924,
"logits/rejected": -3.6305315494537354,
"logps/chosen": -189.8942108154297,
"logps/rejected": -180.846923828125,
"loss": 0.7937,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.023781713098287582,
"rewards/margins": 0.04976597800850868,
"rewards/rejected": -0.07354769110679626,
"step": 45
},
{
"epoch": 0.20776287932251236,
"grad_norm": 15.730598564063932,
"learning_rate": 2.760796886156901e-07,
"logits/chosen": -3.5601043701171875,
"logits/rejected": -3.5674729347229004,
"logps/chosen": -202.303466796875,
"logps/rejected": -192.0014190673828,
"loss": 0.7865,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.017426731064915657,
"rewards/margins": 0.05039919540286064,
"rewards/rejected": -0.06782592833042145,
"step": 46
},
{
"epoch": 0.21227946365561046,
"grad_norm": 15.472698059079649,
"learning_rate": 2.748731560122267e-07,
"logits/chosen": -3.6035664081573486,
"logits/rejected": -3.539640426635742,
"logps/chosen": -221.04888916015625,
"logps/rejected": -198.54339599609375,
"loss": 0.7848,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.005871212109923363,
"rewards/margins": 0.07652122527360916,
"rewards/rejected": -0.08239243924617767,
"step": 47
},
{
"epoch": 0.21679604798870855,
"grad_norm": 15.777479890984548,
"learning_rate": 2.7363971218253573e-07,
"logits/chosen": -3.6367340087890625,
"logits/rejected": -3.5720105171203613,
"logps/chosen": -206.6673583984375,
"logps/rejected": -191.620361328125,
"loss": 0.7834,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.0010449562687426805,
"rewards/margins": 0.07644650340080261,
"rewards/rejected": -0.07540154457092285,
"step": 48
},
{
"epoch": 0.22131263232180665,
"grad_norm": 14.647917769515534,
"learning_rate": 2.7237962294424354e-07,
"logits/chosen": -3.742436408996582,
"logits/rejected": -3.583324432373047,
"logps/chosen": -216.07376098632812,
"logps/rejected": -191.45822143554688,
"loss": 0.7637,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.0027800833340734243,
"rewards/margins": 0.09453913569450378,
"rewards/rejected": -0.09175905585289001,
"step": 49
},
{
"epoch": 0.22582921665490474,
"grad_norm": 14.379641829000743,
"learning_rate": 2.7109315985728866e-07,
"logits/chosen": -3.4936299324035645,
"logits/rejected": -3.461601734161377,
"logps/chosen": -222.20791625976562,
"logps/rejected": -198.30593872070312,
"loss": 0.7946,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.01759425923228264,
"rewards/margins": 0.06354185938835144,
"rewards/rejected": -0.08113611489534378,
"step": 50
},
{
"epoch": 0.23034580098800284,
"grad_norm": 15.536214345554205,
"learning_rate": 2.697806001653979e-07,
"logits/chosen": -3.656852960586548,
"logits/rejected": -3.5931718349456787,
"logps/chosen": -215.4611053466797,
"logps/rejected": -195.17086791992188,
"loss": 0.7702,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.012303248047828674,
"rewards/margins": 0.09312085807323456,
"rewards/rejected": -0.10542410612106323,
"step": 51
},
{
"epoch": 0.23486238532110093,
"grad_norm": 15.948190715341653,
"learning_rate": 2.684422267363384e-07,
"logits/chosen": -3.658729076385498,
"logits/rejected": -3.5965871810913086,
"logps/chosen": -227.6315460205078,
"logps/rejected": -216.8878936767578,
"loss": 0.7749,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.02002471685409546,
"rewards/margins": 0.08053679019212723,
"rewards/rejected": -0.10056150704622269,
"step": 52
},
{
"epoch": 0.23937896965419903,
"grad_norm": 13.3354410986771,
"learning_rate": 2.670783280009569e-07,
"logits/chosen": -3.5585010051727295,
"logits/rejected": -3.542996406555176,
"logps/chosen": -200.6866912841797,
"logps/rejected": -176.0266876220703,
"loss": 0.7969,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.016675246879458427,
"rewards/margins": 0.0597347766160965,
"rewards/rejected": -0.07641002535820007,
"step": 53
},
{
"epoch": 0.24389555398729712,
"grad_norm": 15.37006073629648,
"learning_rate": 2.656891978910205e-07,
"logits/chosen": -3.5804214477539062,
"logits/rejected": -3.580320358276367,
"logps/chosen": -199.03363037109375,
"logps/rejected": -175.6981201171875,
"loss": 0.7826,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.022621821612119675,
"rewards/margins": 0.07916627079248428,
"rewards/rejected": -0.10178809612989426,
"step": 54
},
{
"epoch": 0.24841213832039521,
"grad_norm": 13.977675083898662,
"learning_rate": 2.642751357758722e-07,
"logits/chosen": -3.6277871131896973,
"logits/rejected": -3.576045274734497,
"logps/chosen": -198.56690979003906,
"logps/rejected": -173.84307861328125,
"loss": 0.7798,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.014770936220884323,
"rewards/margins": 0.08849596232175827,
"rewards/rejected": -0.10326690226793289,
"step": 55
},
{
"epoch": 0.2529287226534933,
"grad_norm": 15.0141286536347,
"learning_rate": 2.628364463979135e-07,
"logits/chosen": -3.591761589050293,
"logits/rejected": -3.6051506996154785,
"logps/chosen": -220.2593231201172,
"logps/rejected": -202.11480712890625,
"loss": 0.7609,
"rewards/accuracies": 0.890625,
"rewards/chosen": -0.004766255617141724,
"rewards/margins": 0.10991345345973969,
"rewards/rejected": -0.11467970907688141,
"step": 56
},
{
"epoch": 0.2574453069865914,
"grad_norm": 16.32019401181327,
"learning_rate": 2.613734398069308e-07,
"logits/chosen": -3.6448159217834473,
"logits/rejected": -3.6120004653930664,
"logps/chosen": -218.3624267578125,
"logps/rejected": -206.45156860351562,
"loss": 0.7646,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.0081629678606987,
"rewards/margins": 0.10102861374616623,
"rewards/rejected": -0.10919158160686493,
"step": 57
},
{
"epoch": 0.2619618913196895,
"grad_norm": 14.250044872308086,
"learning_rate": 2.598864312932762e-07,
"logits/chosen": -3.5635013580322266,
"logits/rejected": -3.560309886932373,
"logps/chosen": -199.04324340820312,
"logps/rejected": -181.06304931640625,
"loss": 0.7902,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.025674719363451004,
"rewards/margins": 0.06853548437356949,
"rewards/rejected": -0.09421020746231079,
"step": 58
},
{
"epoch": 0.2664784756527876,
"grad_norm": 15.467967190395154,
"learning_rate": 2.5837574131992034e-07,
"logits/chosen": -3.590390205383301,
"logits/rejected": -3.6538496017456055,
"logps/chosen": -211.0101776123047,
"logps/rejected": -200.92840576171875,
"loss": 0.7626,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.022280972450971603,
"rewards/margins": 0.10610129684209824,
"rewards/rejected": -0.12838226556777954,
"step": 59
},
{
"epoch": 0.2709950599858857,
"grad_norm": 15.203009992593772,
"learning_rate": 2.568416954533894e-07,
"logits/chosen": -3.687786102294922,
"logits/rejected": -3.6774373054504395,
"logps/chosen": -186.29849243164062,
"logps/rejected": -169.62168884277344,
"loss": 0.7785,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.018803317099809647,
"rewards/margins": 0.08361957967281342,
"rewards/rejected": -0.10242290794849396,
"step": 60
},
{
"epoch": 0.2755116443189838,
"grad_norm": 14.57683761120013,
"learning_rate": 2.552846242936032e-07,
"logits/chosen": -3.638808250427246,
"logits/rejected": -3.6172327995300293,
"logps/chosen": -202.23802185058594,
"logps/rejected": -183.21786499023438,
"loss": 0.7644,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.009251074865460396,
"rewards/margins": 0.11388231813907623,
"rewards/rejected": -0.12313339859247208,
"step": 61
},
{
"epoch": 0.2800282286520819,
"grad_norm": 14.071215109151332,
"learning_rate": 2.537048634026279e-07,
"logits/chosen": -3.652895927429199,
"logits/rejected": -3.5790305137634277,
"logps/chosen": -200.75323486328125,
"logps/rejected": -180.8776092529297,
"loss": 0.7716,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.02093246765434742,
"rewards/margins": 0.09524297714233398,
"rewards/rejected": -0.11617545038461685,
"step": 62
},
{
"epoch": 0.28454481298518,
"grad_norm": 13.217637221004559,
"learning_rate": 2.521027532323594e-07,
"logits/chosen": -3.59419584274292,
"logits/rejected": -3.5415499210357666,
"logps/chosen": -196.58969116210938,
"logps/rejected": -181.43600463867188,
"loss": 0.7674,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.023986171931028366,
"rewards/margins": 0.10850539058446884,
"rewards/rejected": -0.1324915587902069,
"step": 63
},
{
"epoch": 0.28906139731827807,
"grad_norm": 14.497006980804226,
"learning_rate": 2.5047863905115337e-07,
"logits/chosen": -3.5735766887664795,
"logits/rejected": -3.5254015922546387,
"logps/chosen": -198.43698120117188,
"logps/rejected": -177.77783203125,
"loss": 0.7642,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.018381193280220032,
"rewards/margins": 0.12361248582601547,
"rewards/rejected": -0.1419936716556549,
"step": 64
},
{
"epoch": 0.29357798165137616,
"grad_norm": 14.857860546032539,
"learning_rate": 2.4883287086941666e-07,
"logits/chosen": -3.592167854309082,
"logits/rejected": -3.554619073867798,
"logps/chosen": -209.84280395507812,
"logps/rejected": -195.99557495117188,
"loss": 0.7711,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.04406602308154106,
"rewards/margins": 0.11150160431861877,
"rewards/rejected": -0.15556763112545013,
"step": 65
},
{
"epoch": 0.29809456598447426,
"grad_norm": 15.7289147220562,
"learning_rate": 2.4716580336417735e-07,
"logits/chosen": -3.730616569519043,
"logits/rejected": -3.611698627471924,
"logps/chosen": -213.7239990234375,
"logps/rejected": -191.99624633789062,
"loss": 0.7641,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.04750160127878189,
"rewards/margins": 0.11192238330841064,
"rewards/rejected": -0.15942397713661194,
"step": 66
},
{
"epoch": 0.30261115031757235,
"grad_norm": 16.553215826780214,
"learning_rate": 2.4547779580264873e-07,
"logits/chosen": -3.6770639419555664,
"logits/rejected": -3.6511921882629395,
"logps/chosen": -229.4193115234375,
"logps/rejected": -219.91082763671875,
"loss": 0.7634,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.07060261070728302,
"rewards/margins": 0.10048267990350723,
"rewards/rejected": -0.17108528316020966,
"step": 67
},
{
"epoch": 0.30712773465067045,
"grad_norm": 14.535708064025933,
"learning_rate": 2.4376921196480405e-07,
"logits/chosen": -3.643744945526123,
"logits/rejected": -3.6282317638397217,
"logps/chosen": -215.79782104492188,
"logps/rejected": -195.50967407226562,
"loss": 0.7459,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.030901005491614342,
"rewards/margins": 0.15577057003974915,
"rewards/rejected": -0.18667156994342804,
"step": 68
},
{
"epoch": 0.31164431898376854,
"grad_norm": 12.826587337871429,
"learning_rate": 2.420404200649791e-07,
"logits/chosen": -3.6386120319366455,
"logits/rejected": -3.6321802139282227,
"logps/chosen": -197.25970458984375,
"logps/rejected": -188.24197387695312,
"loss": 0.7671,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.08027191460132599,
"rewards/margins": 0.09604091197252274,
"rewards/rejected": -0.17631281912326813,
"step": 69
},
{
"epoch": 0.31616090331686664,
"grad_norm": 16.418214717599835,
"learning_rate": 2.402917926725185e-07,
"logits/chosen": -3.6721415519714355,
"logits/rejected": -3.6725897789001465,
"logps/chosen": -216.7154541015625,
"logps/rejected": -201.8681640625,
"loss": 0.7291,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.029750004410743713,
"rewards/margins": 0.1923179030418396,
"rewards/rejected": -0.22206789255142212,
"step": 70
},
{
"epoch": 0.32067748764996473,
"grad_norm": 13.215008697180764,
"learning_rate": 2.385237066314845e-07,
"logits/chosen": -3.565258026123047,
"logits/rejected": -3.553618907928467,
"logps/chosen": -197.5516815185547,
"logps/rejected": -184.250732421875,
"loss": 0.7722,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.050527218729257584,
"rewards/margins": 0.11302457749843597,
"rewards/rejected": -0.16355180740356445,
"step": 71
},
{
"epoch": 0.3251940719830628,
"grad_norm": 13.956180499179757,
"learning_rate": 2.3673654297944303e-07,
"logits/chosen": -3.6860711574554443,
"logits/rejected": -3.6514902114868164,
"logps/chosen": -226.09010314941406,
"logps/rejected": -204.78941345214844,
"loss": 0.753,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.06019885092973709,
"rewards/margins": 0.1402389109134674,
"rewards/rejected": -0.2004377692937851,
"step": 72
},
{
"epoch": 0.3297106563161609,
"grad_norm": 12.632161370394448,
"learning_rate": 2.3493068686534757e-07,
"logits/chosen": -3.6133785247802734,
"logits/rejected": -3.573826789855957,
"logps/chosen": -208.22662353515625,
"logps/rejected": -189.77066040039062,
"loss": 0.7492,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.055879779160022736,
"rewards/margins": 0.1519559770822525,
"rewards/rejected": -0.20783576369285583,
"step": 73
},
{
"epoch": 0.334227240649259,
"grad_norm": 14.261595430312116,
"learning_rate": 2.3310652746653585e-07,
"logits/chosen": -3.6738648414611816,
"logits/rejected": -3.6415371894836426,
"logps/chosen": -185.10997009277344,
"logps/rejected": -167.96487426757812,
"loss": 0.7588,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.052835769951343536,
"rewards/margins": 0.1488420069217682,
"rewards/rejected": -0.20167775452136993,
"step": 74
},
{
"epoch": 0.3387438249823571,
"grad_norm": 14.822381543756691,
"learning_rate": 2.312644579048592e-07,
"logits/chosen": -3.7603840827941895,
"logits/rejected": -3.660132646560669,
"logps/chosen": -209.13519287109375,
"logps/rejected": -186.57931518554688,
"loss": 0.748,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.07213892042636871,
"rewards/margins": 0.1440207064151764,
"rewards/rejected": -0.2161596417427063,
"step": 75
},
{
"epoch": 0.3432604093154552,
"grad_norm": 14.275023304603453,
"learning_rate": 2.29404875161961e-07,
"logits/chosen": -3.6284379959106445,
"logits/rejected": -3.5983853340148926,
"logps/chosen": -222.89759826660156,
"logps/rejected": -200.91384887695312,
"loss": 0.7395,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.06927837431430817,
"rewards/margins": 0.18208304047584534,
"rewards/rejected": -0.2513614296913147,
"step": 76
},
{
"epoch": 0.3477769936485533,
"grad_norm": 12.109768727947394,
"learning_rate": 2.2752817999372408e-07,
"logits/chosen": -3.717994213104248,
"logits/rejected": -3.6696221828460693,
"logps/chosen": -179.09075927734375,
"logps/rejected": -165.41148376464844,
"loss": 0.7619,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.07646910846233368,
"rewards/margins": 0.10589072108268738,
"rewards/rejected": -0.18235982954502106,
"step": 77
},
{
"epoch": 0.3522935779816514,
"grad_norm": 12.334847197016632,
"learning_rate": 2.2563477684390454e-07,
"logits/chosen": -3.6596970558166504,
"logits/rejected": -3.7064521312713623,
"logps/chosen": -183.98486328125,
"logps/rejected": -175.17532348632812,
"loss": 0.7407,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.0660928264260292,
"rewards/margins": 0.15099170804023743,
"rewards/rejected": -0.21708452701568604,
"step": 78
},
{
"epoch": 0.3568101623147495,
"grad_norm": 14.527926172073567,
"learning_rate": 2.2372507375697016e-07,
"logits/chosen": -3.6260461807250977,
"logits/rejected": -3.56558895111084,
"logps/chosen": -215.37002563476562,
"logps/rejected": -191.77987670898438,
"loss": 0.7351,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.04920428246259689,
"rewards/margins": 0.18218760192394257,
"rewards/rejected": -0.23139187693595886,
"step": 79
},
{
"epoch": 0.3613267466478476,
"grad_norm": 13.953022020448985,
"learning_rate": 2.217994822901639e-07,
"logits/chosen": -3.6040546894073486,
"logits/rejected": -3.6221628189086914,
"logps/chosen": -223.6805419921875,
"logps/rejected": -201.2560577392578,
"loss": 0.7415,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.053364675492048264,
"rewards/margins": 0.1852804273366928,
"rewards/rejected": -0.23864510655403137,
"step": 80
},
{
"epoch": 0.3658433309809457,
"grad_norm": 13.78052722074895,
"learning_rate": 2.1985841742480954e-07,
"logits/chosen": -3.5805296897888184,
"logits/rejected": -3.5260043144226074,
"logps/chosen": -215.5792236328125,
"logps/rejected": -199.66006469726562,
"loss": 0.7274,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.06636206805706024,
"rewards/margins": 0.1942358762025833,
"rewards/rejected": -0.26059794425964355,
"step": 81
},
{
"epoch": 0.3703599153140438,
"grad_norm": 14.881670177581055,
"learning_rate": 2.1790229747687971e-07,
"logits/chosen": -3.739069938659668,
"logits/rejected": -3.669661283493042,
"logps/chosen": -239.0128173828125,
"logps/rejected": -217.56625366210938,
"loss": 0.7162,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.06281246244907379,
"rewards/margins": 0.22767522931098938,
"rewards/rejected": -0.290487676858902,
"step": 82
},
{
"epoch": 0.37487649964714187,
"grad_norm": 13.702491862774862,
"learning_rate": 2.1593154400684523e-07,
"logits/chosen": -3.6899726390838623,
"logits/rejected": -3.5771546363830566,
"logps/chosen": -202.50872802734375,
"logps/rejected": -176.08013916015625,
"loss": 0.7315,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.08337001502513885,
"rewards/margins": 0.18461519479751587,
"rewards/rejected": -0.2679852247238159,
"step": 83
},
{
"epoch": 0.37939308398023996,
"grad_norm": 13.984029281056696,
"learning_rate": 2.139465817288254e-07,
"logits/chosen": -3.7421321868896484,
"logits/rejected": -3.6175167560577393,
"logps/chosen": -190.1082000732422,
"logps/rejected": -175.02052307128906,
"loss": 0.7337,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.08382508903741837,
"rewards/margins": 0.17669573426246643,
"rewards/rejected": -0.2605208456516266,
"step": 84
},
{
"epoch": 0.38390966831333806,
"grad_norm": 19.996335370988827,
"learning_rate": 2.1194783841905826e-07,
"logits/chosen": -3.5821313858032227,
"logits/rejected": -3.4785704612731934,
"logps/chosen": -230.62477111816406,
"logps/rejected": -208.86831665039062,
"loss": 0.7245,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.11116647720336914,
"rewards/margins": 0.19649800658226013,
"rewards/rejected": -0.3076644837856293,
"step": 85
},
{
"epoch": 0.38842625264643615,
"grad_norm": 11.794852107515213,
"learning_rate": 2.0993574482371138e-07,
"logits/chosen": -3.557180643081665,
"logits/rejected": -3.5188608169555664,
"logps/chosen": -200.02206420898438,
"logps/rejected": -186.23666381835938,
"loss": 0.7401,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.09516202658414841,
"rewards/margins": 0.18562006950378418,
"rewards/rejected": -0.2807821035385132,
"step": 86
},
{
"epoch": 0.39294283697953425,
"grad_norm": 15.110048854614613,
"learning_rate": 2.0791073456605222e-07,
"logits/chosen": -3.6332998275756836,
"logits/rejected": -3.6093335151672363,
"logps/chosen": -247.05874633789062,
"logps/rejected": -224.82608032226562,
"loss": 0.6953,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.08461566269397736,
"rewards/margins": 0.2697482705116272,
"rewards/rejected": -0.35436391830444336,
"step": 87
},
{
"epoch": 0.39745942131263234,
"grad_norm": 12.45488028971463,
"learning_rate": 2.058732440529989e-07,
"logits/chosen": -3.7330398559570312,
"logits/rejected": -3.5911219120025635,
"logps/chosen": -211.71954345703125,
"logps/rejected": -190.85597229003906,
"loss": 0.733,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.11091723293066025,
"rewards/margins": 0.17457152903079987,
"rewards/rejected": -0.2854887545108795,
"step": 88
},
{
"epoch": 0.40197600564573044,
"grad_norm": 13.342737944026238,
"learning_rate": 2.0382371238107038e-07,
"logits/chosen": -3.7038931846618652,
"logits/rejected": -3.630575656890869,
"logps/chosen": -217.16465759277344,
"logps/rejected": -195.84320068359375,
"loss": 0.717,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.08662399649620056,
"rewards/margins": 0.21940693259239197,
"rewards/rejected": -0.30603092908859253,
"step": 89
},
{
"epoch": 0.40649258997882853,
"grad_norm": 12.805765983733862,
"learning_rate": 2.0176258124175791e-07,
"logits/chosen": -3.5431618690490723,
"logits/rejected": -3.5177979469299316,
"logps/chosen": -208.36294555664062,
"logps/rejected": -196.33544921875,
"loss": 0.7397,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.1321878284215927,
"rewards/margins": 0.17401957511901855,
"rewards/rejected": -0.30620741844177246,
"step": 90
},
{
"epoch": 0.41100917431192663,
"grad_norm": 14.324921342449116,
"learning_rate": 1.996902948263364e-07,
"logits/chosen": -3.6215481758117676,
"logits/rejected": -3.5871336460113525,
"logps/chosen": -218.2590789794922,
"logps/rejected": -202.82135009765625,
"loss": 0.7184,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.06889334321022034,
"rewards/margins": 0.2213161736726761,
"rewards/rejected": -0.2902095317840576,
"step": 91
},
{
"epoch": 0.4155257586450247,
"grad_norm": 13.522825965617656,
"learning_rate": 1.9760729973013756e-07,
"logits/chosen": -3.5652565956115723,
"logits/rejected": -3.559969902038574,
"logps/chosen": -216.3070068359375,
"logps/rejected": -207.52525329589844,
"loss": 0.7279,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.12052971869707108,
"rewards/margins": 0.19747133553028107,
"rewards/rejected": -0.31800103187561035,
"step": 92
},
{
"epoch": 0.4200423429781228,
"grad_norm": 14.17056179163305,
"learning_rate": 1.9551404485630487e-07,
"logits/chosen": -3.648214101791382,
"logits/rejected": -3.633265972137451,
"logps/chosen": -228.32022094726562,
"logps/rejected": -216.98306274414062,
"loss": 0.712,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.09251593053340912,
"rewards/margins": 0.24657899141311646,
"rewards/rejected": -0.33909493684768677,
"step": 93
},
{
"epoch": 0.4245589273112209,
"grad_norm": 12.41603788591784,
"learning_rate": 1.9341098131905102e-07,
"logits/chosen": -3.563978672027588,
"logits/rejected": -3.575545310974121,
"logps/chosen": -197.626708984375,
"logps/rejected": -183.598876953125,
"loss": 0.7199,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.06666558235883713,
"rewards/margins": 0.22736752033233643,
"rewards/rejected": -0.29403308033943176,
"step": 94
},
{
"epoch": 0.429075511644319,
"grad_norm": 13.43017680491703,
"learning_rate": 1.91298562346439e-07,
"logits/chosen": -3.5415198802948,
"logits/rejected": -3.423177480697632,
"logps/chosen": -210.36868286132812,
"logps/rejected": -193.19737243652344,
"loss": 0.7208,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.13729974627494812,
"rewards/margins": 0.20369592308998108,
"rewards/rejected": -0.3409956693649292,
"step": 95
},
{
"epoch": 0.4335920959774171,
"grad_norm": 13.192586910237464,
"learning_rate": 1.8917724318270764e-07,
"logits/chosen": -3.6500909328460693,
"logits/rejected": -3.641021728515625,
"logps/chosen": -212.2454833984375,
"logps/rejected": -195.04710388183594,
"loss": 0.7359,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.13967543840408325,
"rewards/margins": 0.19579245150089264,
"rewards/rejected": -0.3354678750038147,
"step": 96
},
{
"epoch": 0.4381086803105152,
"grad_norm": 14.631006890045864,
"learning_rate": 1.8704748099016263e-07,
"logits/chosen": -3.5358424186706543,
"logits/rejected": -3.492499828338623,
"logps/chosen": -226.14268493652344,
"logps/rejected": -209.61166381835938,
"loss": 0.7276,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.12765762209892273,
"rewards/margins": 0.21874003112316132,
"rewards/rejected": -0.34639766812324524,
"step": 97
},
{
"epoch": 0.4426252646436133,
"grad_norm": 12.620333608491627,
"learning_rate": 1.8490973475065407e-07,
"logits/chosen": -3.62363862991333,
"logits/rejected": -3.558384656906128,
"logps/chosen": -201.33468627929688,
"logps/rejected": -187.42794799804688,
"loss": 0.735,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.10830561816692352,
"rewards/margins": 0.19010263681411743,
"rewards/rejected": -0.29840826988220215,
"step": 98
},
{
"epoch": 0.4471418489767114,
"grad_norm": 11.634459384049228,
"learning_rate": 1.8276446516666194e-07,
"logits/chosen": -3.564702033996582,
"logits/rejected": -3.4371743202209473,
"logps/chosen": -203.50912475585938,
"logps/rejected": -186.2063446044922,
"loss": 0.7276,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.12979203462600708,
"rewards/margins": 0.19052280485630035,
"rewards/rejected": -0.32031482458114624,
"step": 99
},
{
"epoch": 0.4516584333098095,
"grad_norm": 14.33489341500007,
"learning_rate": 1.806121345620111e-07,
"logits/chosen": -3.5418810844421387,
"logits/rejected": -3.465827703475952,
"logps/chosen": -222.30589294433594,
"logps/rejected": -204.18182373046875,
"loss": 0.7318,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.15671955049037933,
"rewards/margins": 0.2025194764137268,
"rewards/rejected": -0.35923904180526733,
"step": 100
},
{
"epoch": 0.4561750176429076,
"grad_norm": 12.379106375796471,
"learning_rate": 1.7845320678223614e-07,
"logits/chosen": -3.572160482406616,
"logits/rejected": -3.4780170917510986,
"logps/chosen": -204.60537719726562,
"logps/rejected": -188.61119079589844,
"loss": 0.7315,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.11631282418966293,
"rewards/margins": 0.19585317373275757,
"rewards/rejected": -0.3121660053730011,
"step": 101
},
{
"epoch": 0.46069160197600567,
"grad_norm": 13.428841184771063,
"learning_rate": 1.7628814709461914e-07,
"logits/chosen": -3.4720003604888916,
"logits/rejected": -3.541748523712158,
"logps/chosen": -225.80712890625,
"logps/rejected": -213.91180419921875,
"loss": 0.7084,
"rewards/accuracies": 0.921875,
"rewards/chosen": -0.12924836575984955,
"rewards/margins": 0.276883989572525,
"rewards/rejected": -0.40613240003585815,
"step": 102
},
{
"epoch": 0.46520818630910377,
"grad_norm": 12.642202001549919,
"learning_rate": 1.7411742208792024e-07,
"logits/chosen": -3.6850690841674805,
"logits/rejected": -3.545577049255371,
"logps/chosen": -225.94509887695312,
"logps/rejected": -193.059326171875,
"loss": 0.7084,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.11674878001213074,
"rewards/margins": 0.25400400161743164,
"rewards/rejected": -0.3707527816295624,
"step": 103
},
{
"epoch": 0.46972477064220186,
"grad_norm": 11.73048532713264,
"learning_rate": 1.7194149957182414e-07,
"logits/chosen": -3.5629310607910156,
"logits/rejected": -3.5668156147003174,
"logps/chosen": -171.92477416992188,
"logps/rejected": -165.56927490234375,
"loss": 0.7396,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.1383664906024933,
"rewards/margins": 0.1683700531721115,
"rewards/rejected": -0.3067365288734436,
"step": 104
},
{
"epoch": 0.47424135497529996,
"grad_norm": 12.673432432328502,
"learning_rate": 1.6976084847612282e-07,
"logits/chosen": -3.525435447692871,
"logits/rejected": -3.4939839839935303,
"logps/chosen": -206.1438751220703,
"logps/rejected": -188.24546813964844,
"loss": 0.722,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.13165442645549774,
"rewards/margins": 0.2146846055984497,
"rewards/rejected": -0.34633904695510864,
"step": 105
},
{
"epoch": 0.47875793930839805,
"grad_norm": 12.032890874796571,
"learning_rate": 1.6757593874965754e-07,
"logits/chosen": -3.5656533241271973,
"logits/rejected": -3.527076005935669,
"logps/chosen": -197.67202758789062,
"logps/rejected": -181.7797088623047,
"loss": 0.7251,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.0992819219827652,
"rewards/margins": 0.225886732339859,
"rewards/rejected": -0.325168639421463,
"step": 106
},
{
"epoch": 0.48327452364149615,
"grad_norm": 12.310976191149708,
"learning_rate": 1.6538724125904051e-07,
"logits/chosen": -3.686993360519409,
"logits/rejected": -3.6414313316345215,
"logps/chosen": -204.51181030273438,
"logps/rejected": -196.01507568359375,
"loss": 0.7234,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.12974955141544342,
"rewards/margins": 0.21487677097320557,
"rewards/rejected": -0.3446263074874878,
"step": 107
},
{
"epoch": 0.48779110797459424,
"grad_norm": 12.199852142739852,
"learning_rate": 1.6319522768717944e-07,
"logits/chosen": -3.6431784629821777,
"logits/rejected": -3.561030864715576,
"logps/chosen": -199.9854278564453,
"logps/rejected": -181.25628662109375,
"loss": 0.7371,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.11536161601543427,
"rewards/margins": 0.2128218412399292,
"rewards/rejected": -0.32818344235420227,
"step": 108
},
{
"epoch": 0.49230769230769234,
"grad_norm": 13.488082140342616,
"learning_rate": 1.610003704316256e-07,
"logits/chosen": -3.7115769386291504,
"logits/rejected": -3.6150527000427246,
"logps/chosen": -206.2103271484375,
"logps/rejected": -184.147705078125,
"loss": 0.6968,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.07892445474863052,
"rewards/margins": 0.2777579128742218,
"rewards/rejected": -0.35668236017227173,
"step": 109
},
{
"epoch": 0.49682427664079043,
"grad_norm": 12.799344796933894,
"learning_rate": 1.5880314250276833e-07,
"logits/chosen": -3.6075048446655273,
"logits/rejected": -3.451253652572632,
"logps/chosen": -212.31695556640625,
"logps/rejected": -189.007080078125,
"loss": 0.723,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.1649230718612671,
"rewards/margins": 0.23563840985298157,
"rewards/rejected": -0.40056151151657104,
"step": 110
},
{
"epoch": 0.5013408609738885,
"grad_norm": 11.195885489420343,
"learning_rate": 1.5660401742189716e-07,
"logits/chosen": -3.6232829093933105,
"logits/rejected": -3.517642021179199,
"logps/chosen": -196.2974090576172,
"logps/rejected": -181.78778076171875,
"loss": 0.7163,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.11609620600938797,
"rewards/margins": 0.24031955003738403,
"rewards/rejected": -0.3564157783985138,
"step": 111
},
{
"epoch": 0.5058574453069866,
"grad_norm": 13.462764100267743,
"learning_rate": 1.5440346911915413e-07,
"logits/chosen": -3.5703439712524414,
"logits/rejected": -3.5258054733276367,
"logps/chosen": -197.42391967773438,
"logps/rejected": -182.4862060546875,
"loss": 0.707,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.13110296428203583,
"rewards/margins": 0.25373634696006775,
"rewards/rejected": -0.3848392963409424,
"step": 112
},
{
"epoch": 0.5103740296400847,
"grad_norm": 11.216776131932745,
"learning_rate": 1.522019718313975e-07,
"logits/chosen": -3.5310792922973633,
"logits/rejected": -3.565998077392578,
"logps/chosen": -209.51132202148438,
"logps/rejected": -189.17047119140625,
"loss": 0.7095,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.12987811863422394,
"rewards/margins": 0.2518337368965149,
"rewards/rejected": -0.38171184062957764,
"step": 113
},
{
"epoch": 0.5148906139731828,
"grad_norm": 13.293761430898734,
"learning_rate": 1.5e-07,
"logits/chosen": -3.7002620697021484,
"logits/rejected": -3.6795387268066406,
"logps/chosen": -213.72909545898438,
"logps/rejected": -196.23594665527344,
"loss": 0.713,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.13611683249473572,
"rewards/margins": 0.2516591548919678,
"rewards/rejected": -0.3877760171890259,
"step": 114
},
{
"epoch": 0.5194071983062809,
"grad_norm": 11.607463785941228,
"learning_rate": 1.4779802816860252e-07,
"logits/chosen": -3.5893545150756836,
"logits/rejected": -3.5178956985473633,
"logps/chosen": -208.91015625,
"logps/rejected": -186.68402099609375,
"loss": 0.6884,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.06911865621805191,
"rewards/margins": 0.3234255909919739,
"rewards/rejected": -0.3925442397594452,
"step": 115
},
{
"epoch": 0.523923782639379,
"grad_norm": 11.17744384668911,
"learning_rate": 1.4559653088084589e-07,
"logits/chosen": -3.5923399925231934,
"logits/rejected": -3.5601184368133545,
"logps/chosen": -197.11102294921875,
"logps/rejected": -182.57537841796875,
"loss": 0.7113,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.17773228883743286,
"rewards/margins": 0.2376946210861206,
"rewards/rejected": -0.41542690992355347,
"step": 116
},
{
"epoch": 0.5284403669724771,
"grad_norm": 12.324943566684535,
"learning_rate": 1.4339598257810283e-07,
"logits/chosen": -3.4791109561920166,
"logits/rejected": -3.528164863586426,
"logps/chosen": -203.9499969482422,
"logps/rejected": -191.4123077392578,
"loss": 0.7141,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.1312122344970703,
"rewards/margins": 0.24087485671043396,
"rewards/rejected": -0.3720870912075043,
"step": 117
},
{
"epoch": 0.5329569513055752,
"grad_norm": 13.355110060788324,
"learning_rate": 1.411968574972317e-07,
"logits/chosen": -3.4863524436950684,
"logits/rejected": -3.5186939239501953,
"logps/chosen": -211.86410522460938,
"logps/rejected": -196.56219482421875,
"loss": 0.7007,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.09455064684152603,
"rewards/margins": 0.29401230812072754,
"rewards/rejected": -0.38856297731399536,
"step": 118
},
{
"epoch": 0.5374735356386733,
"grad_norm": 11.843733502428305,
"learning_rate": 1.3899962956837443e-07,
"logits/chosen": -3.5447893142700195,
"logits/rejected": -3.5373752117156982,
"logps/chosen": -217.02601623535156,
"logps/rejected": -195.690185546875,
"loss": 0.6888,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.11182112246751785,
"rewards/margins": 0.3210294544696808,
"rewards/rejected": -0.43285059928894043,
"step": 119
},
{
"epoch": 0.5419901199717714,
"grad_norm": 12.77724705920504,
"learning_rate": 1.3680477231282058e-07,
"logits/chosen": -3.6904451847076416,
"logits/rejected": -3.648146629333496,
"logps/chosen": -174.260009765625,
"logps/rejected": -160.5956573486328,
"loss": 0.7273,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.14862585067749023,
"rewards/margins": 0.2124842405319214,
"rewards/rejected": -0.3611100912094116,
"step": 120
},
{
"epoch": 0.5465067043048695,
"grad_norm": 11.150050819573732,
"learning_rate": 1.346127587409595e-07,
"logits/chosen": -3.6496615409851074,
"logits/rejected": -3.544325828552246,
"logps/chosen": -199.09524536132812,
"logps/rejected": -178.0385284423828,
"loss": 0.6964,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.13068127632141113,
"rewards/margins": 0.2756379246711731,
"rewards/rejected": -0.40631920099258423,
"step": 121
},
{
"epoch": 0.5510232886379676,
"grad_norm": 12.080724661826311,
"learning_rate": 1.3242406125034247e-07,
"logits/chosen": -3.6039113998413086,
"logits/rejected": -3.4897897243499756,
"logps/chosen": -230.91986083984375,
"logps/rejected": -215.7164764404297,
"loss": 0.7039,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.20738250017166138,
"rewards/margins": 0.24289953708648682,
"rewards/rejected": -0.4502820372581482,
"step": 122
},
{
"epoch": 0.5555398729710657,
"grad_norm": 11.919256579608561,
"learning_rate": 1.302391515238772e-07,
"logits/chosen": -3.5706183910369873,
"logits/rejected": -3.552396774291992,
"logps/chosen": -213.68630981445312,
"logps/rejected": -193.32818603515625,
"loss": 0.6788,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.1757589429616928,
"rewards/margins": 0.32006677985191345,
"rewards/rejected": -0.49582570791244507,
"step": 123
},
{
"epoch": 0.5600564573041638,
"grad_norm": 12.629389606369296,
"learning_rate": 1.280585004281759e-07,
"logits/chosen": -3.539670944213867,
"logits/rejected": -3.535710334777832,
"logps/chosen": -206.27227783203125,
"logps/rejected": -193.86167907714844,
"loss": 0.7106,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.16979312896728516,
"rewards/margins": 0.2596694827079773,
"rewards/rejected": -0.42946261167526245,
"step": 124
},
{
"epoch": 0.5645730416372619,
"grad_norm": 11.591449990368316,
"learning_rate": 1.2588257791207977e-07,
"logits/chosen": -3.59249210357666,
"logits/rejected": -3.588090419769287,
"logps/chosen": -210.10736083984375,
"logps/rejected": -194.91561889648438,
"loss": 0.6973,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.17436593770980835,
"rewards/margins": 0.2836093306541443,
"rewards/rejected": -0.45797526836395264,
"step": 125
},
{
"epoch": 0.56908962597036,
"grad_norm": 12.618504079208098,
"learning_rate": 1.2371185290538087e-07,
"logits/chosen": -3.608921527862549,
"logits/rejected": -3.516284942626953,
"logps/chosen": -215.42098999023438,
"logps/rejected": -192.1516571044922,
"loss": 0.6996,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.18459659814834595,
"rewards/margins": 0.27924641966819763,
"rewards/rejected": -0.4638429880142212,
"step": 126
},
{
"epoch": 0.573606210303458,
"grad_norm": 12.749831427337236,
"learning_rate": 1.2154679321776385e-07,
"logits/chosen": -3.5367257595062256,
"logits/rejected": -3.458606719970703,
"logps/chosen": -225.93020629882812,
"logps/rejected": -214.3812713623047,
"loss": 0.721,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.2491096556186676,
"rewards/margins": 0.22173485159873962,
"rewards/rejected": -0.4708445072174072,
"step": 127
},
{
"epoch": 0.5781227946365561,
"grad_norm": 12.444761165337477,
"learning_rate": 1.193878654379889e-07,
"logits/chosen": -3.5870304107666016,
"logits/rejected": -3.5706138610839844,
"logps/chosen": -206.72052001953125,
"logps/rejected": -196.97564697265625,
"loss": 0.7064,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.17630356550216675,
"rewards/margins": 0.2867695987224579,
"rewards/rejected": -0.463073194026947,
"step": 128
},
{
"epoch": 0.5826393789696542,
"grad_norm": 14.412718695628323,
"learning_rate": 1.1723553483333806e-07,
"logits/chosen": -3.6507744789123535,
"logits/rejected": -3.536320209503174,
"logps/chosen": -192.77645874023438,
"logps/rejected": -172.97238159179688,
"loss": 0.7102,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.17945942282676697,
"rewards/margins": 0.24124844372272491,
"rewards/rejected": -0.4207078814506531,
"step": 129
},
{
"epoch": 0.5871559633027523,
"grad_norm": 11.81162755328704,
"learning_rate": 1.1509026524934596e-07,
"logits/chosen": -3.531088352203369,
"logits/rejected": -3.4827826023101807,
"logps/chosen": -208.96688842773438,
"logps/rejected": -188.86569213867188,
"loss": 0.7152,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.21842709183692932,
"rewards/margins": 0.2507480978965759,
"rewards/rejected": -0.46917521953582764,
"step": 130
},
{
"epoch": 0.5916725476358504,
"grad_norm": 12.914405491596224,
"learning_rate": 1.129525190098374e-07,
"logits/chosen": -3.6707763671875,
"logits/rejected": -3.636361598968506,
"logps/chosen": -213.6358642578125,
"logps/rejected": -204.06399536132812,
"loss": 0.7035,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.18059837818145752,
"rewards/margins": 0.2868345379829407,
"rewards/rejected": -0.4674329161643982,
"step": 131
},
{
"epoch": 0.5961891319689485,
"grad_norm": 11.557057022060595,
"learning_rate": 1.1082275681729236e-07,
"logits/chosen": -3.6127572059631348,
"logits/rejected": -3.531533718109131,
"logps/chosen": -183.26527404785156,
"logps/rejected": -165.15826416015625,
"loss": 0.7145,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.17764925956726074,
"rewards/margins": 0.2194777876138687,
"rewards/rejected": -0.39712706208229065,
"step": 132
},
{
"epoch": 0.6007057163020466,
"grad_norm": 12.328325193031683,
"learning_rate": 1.0870143765356105e-07,
"logits/chosen": -3.634964942932129,
"logits/rejected": -3.566643714904785,
"logps/chosen": -209.58245849609375,
"logps/rejected": -185.32574462890625,
"loss": 0.7069,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.1629945933818817,
"rewards/margins": 0.2675268054008484,
"rewards/rejected": -0.4305214285850525,
"step": 133
},
{
"epoch": 0.6052223006351447,
"grad_norm": 12.054973441313555,
"learning_rate": 1.0658901868094899e-07,
"logits/chosen": -3.571657657623291,
"logits/rejected": -3.5290191173553467,
"logps/chosen": -208.86460876464844,
"logps/rejected": -198.53517150878906,
"loss": 0.722,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.20138764381408691,
"rewards/margins": 0.22342219948768616,
"rewards/rejected": -0.42480987310409546,
"step": 134
},
{
"epoch": 0.6097388849682428,
"grad_norm": 11.358121818992657,
"learning_rate": 1.0448595514369515e-07,
"logits/chosen": -3.5903096199035645,
"logits/rejected": -3.4630379676818848,
"logps/chosen": -195.21963500976562,
"logps/rejected": -178.4676055908203,
"loss": 0.738,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.21748274564743042,
"rewards/margins": 0.20364663004875183,
"rewards/rejected": -0.42112940549850464,
"step": 135
},
{
"epoch": 0.6142554693013409,
"grad_norm": 12.076223242455574,
"learning_rate": 1.0239270026986241e-07,
"logits/chosen": -3.641045093536377,
"logits/rejected": -3.6035234928131104,
"logps/chosen": -210.5784454345703,
"logps/rejected": -193.73934936523438,
"loss": 0.6854,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.14184671640396118,
"rewards/margins": 0.324258953332901,
"rewards/rejected": -0.4661056697368622,
"step": 136
},
{
"epoch": 0.618772053634439,
"grad_norm": 11.692752596565882,
"learning_rate": 1.0030970517366362e-07,
"logits/chosen": -3.563607931137085,
"logits/rejected": -3.428741216659546,
"logps/chosen": -219.72509765625,
"logps/rejected": -196.94625854492188,
"loss": 0.6911,
"rewards/accuracies": 0.890625,
"rewards/chosen": -0.1963091939687729,
"rewards/margins": 0.31136542558670044,
"rewards/rejected": -0.5076746344566345,
"step": 137
},
{
"epoch": 0.6232886379675371,
"grad_norm": 12.345411080526995,
"learning_rate": 9.82374187582421e-08,
"logits/chosen": -3.528062343597412,
"logits/rejected": -3.505977153778076,
"logps/chosen": -218.63848876953125,
"logps/rejected": -206.9732666015625,
"loss": 0.7132,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.23708635568618774,
"rewards/margins": 0.25329455733299255,
"rewards/rejected": -0.4903808832168579,
"step": 138
},
{
"epoch": 0.6278052223006352,
"grad_norm": 11.674703919156808,
"learning_rate": 9.617628761892963e-08,
"logits/chosen": -3.6489012241363525,
"logits/rejected": -3.6186487674713135,
"logps/chosen": -189.92733764648438,
"logps/rejected": -176.64971923828125,
"loss": 0.7119,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.18905529379844666,
"rewards/margins": 0.25132396817207336,
"rewards/rejected": -0.4403792917728424,
"step": 139
},
{
"epoch": 0.6323218066337333,
"grad_norm": 13.29691200714144,
"learning_rate": 9.412675594700113e-08,
"logits/chosen": -3.5062429904937744,
"logits/rejected": -3.4647328853607178,
"logps/chosen": -201.7200927734375,
"logps/rejected": -181.33375549316406,
"loss": 0.681,
"rewards/accuracies": 0.890625,
"rewards/chosen": -0.08604306727647781,
"rewards/margins": 0.33681678771972656,
"rewards/rejected": -0.4228598475456238,
"step": 140
},
{
"epoch": 0.6368383909668314,
"grad_norm": 12.691599989680086,
"learning_rate": 9.208926543394776e-08,
"logits/chosen": -3.5145082473754883,
"logits/rejected": -3.4705848693847656,
"logps/chosen": -226.04798889160156,
"logps/rejected": -208.7861328125,
"loss": 0.6872,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.20757007598876953,
"rewards/margins": 0.32437780499458313,
"rewards/rejected": -0.5319478511810303,
"step": 141
},
{
"epoch": 0.6413549752999295,
"grad_norm": 12.77311070940504,
"learning_rate": 9.006425517628863e-08,
"logits/chosen": -3.5832412242889404,
"logits/rejected": -3.532606601715088,
"logps/chosen": -218.02249145507812,
"logps/rejected": -205.59286499023438,
"loss": 0.6794,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.21441030502319336,
"rewards/margins": 0.32625484466552734,
"rewards/rejected": -0.5406651496887207,
"step": 142
},
{
"epoch": 0.6458715596330276,
"grad_norm": 11.7323843348539,
"learning_rate": 8.805216158094177e-08,
"logits/chosen": -3.5744497776031494,
"logits/rejected": -3.5260581970214844,
"logps/chosen": -193.29339599609375,
"logps/rejected": -183.2548828125,
"loss": 0.7311,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.23033544421195984,
"rewards/margins": 0.2078953981399536,
"rewards/rejected": -0.43823081254959106,
"step": 143
},
{
"epoch": 0.6503881439661257,
"grad_norm": 11.68741246706069,
"learning_rate": 8.605341827117462e-08,
"logits/chosen": -3.584440231323242,
"logits/rejected": -3.5315933227539062,
"logps/chosen": -206.68771362304688,
"logps/rejected": -191.7264862060547,
"loss": 0.6958,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.183091938495636,
"rewards/margins": 0.28822189569473267,
"rewards/rejected": -0.47131383419036865,
"step": 144
},
{
"epoch": 0.6549047282992237,
"grad_norm": 12.84057054734779,
"learning_rate": 8.406845599315482e-08,
"logits/chosen": -3.5925729274749756,
"logits/rejected": -3.5170035362243652,
"logps/chosen": -220.13719177246094,
"logps/rejected": -200.61856079101562,
"loss": 0.6551,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.2075750231742859,
"rewards/margins": 0.38476982712745667,
"rewards/rejected": -0.5923448204994202,
"step": 145
},
{
"epoch": 0.6594213126323218,
"grad_norm": 12.946963405257083,
"learning_rate": 8.20977025231203e-08,
"logits/chosen": -3.5441179275512695,
"logits/rejected": -3.5702133178710938,
"logps/chosen": -245.39036560058594,
"logps/rejected": -230.05406188964844,
"loss": 0.6539,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.2259531170129776,
"rewards/margins": 0.41027867794036865,
"rewards/rejected": -0.6362317800521851,
"step": 146
},
{
"epoch": 0.6639378969654199,
"grad_norm": 11.995200470883699,
"learning_rate": 8.014158257519046e-08,
"logits/chosen": -3.5933175086975098,
"logits/rejected": -3.598524808883667,
"logps/chosen": -213.013427734375,
"logps/rejected": -197.81692504882812,
"loss": 0.674,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.15267431735992432,
"rewards/margins": 0.3510599732398987,
"rewards/rejected": -0.5037343502044678,
"step": 147
},
{
"epoch": 0.668454481298518,
"grad_norm": 11.025028923299288,
"learning_rate": 7.820051770983612e-08,
"logits/chosen": -3.578158378601074,
"logits/rejected": -3.4002773761749268,
"logps/chosen": -207.0416259765625,
"logps/rejected": -182.08555603027344,
"loss": 0.7096,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.1736908257007599,
"rewards/margins": 0.26846104860305786,
"rewards/rejected": -0.44215184450149536,
"step": 148
},
{
"epoch": 0.6729710656316161,
"grad_norm": 14.342920455609963,
"learning_rate": 7.627492624302986e-08,
"logits/chosen": -3.5777783393859863,
"logits/rejected": -3.579009532928467,
"logps/chosen": -221.22406005859375,
"logps/rejected": -209.30577087402344,
"loss": 0.6937,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.2393387109041214,
"rewards/margins": 0.28950440883636475,
"rewards/rejected": -0.5288431644439697,
"step": 149
},
{
"epoch": 0.6774876499647142,
"grad_norm": 11.16447507239811,
"learning_rate": 7.436522315609545e-08,
"logits/chosen": -3.6183109283447266,
"logits/rejected": -3.6173715591430664,
"logps/chosen": -191.01394653320312,
"logps/rejected": -181.1483612060547,
"loss": 0.6995,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.21332278847694397,
"rewards/margins": 0.2814997434616089,
"rewards/rejected": -0.49482250213623047,
"step": 150
},
{
"epoch": 0.6820042342978123,
"grad_norm": 12.13300661848618,
"learning_rate": 7.247182000627588e-08,
"logits/chosen": -3.4436144828796387,
"logits/rejected": -3.4601621627807617,
"logps/chosen": -206.80313110351562,
"logps/rejected": -197.01959228515625,
"loss": 0.7062,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.219729483127594,
"rewards/margins": 0.2750515937805176,
"rewards/rejected": -0.4947810769081116,
"step": 151
},
{
"epoch": 0.6865208186309104,
"grad_norm": 12.27938156296404,
"learning_rate": 7.059512483803904e-08,
"logits/chosen": -3.528646945953369,
"logits/rejected": -3.4958152770996094,
"logps/chosen": -227.61428833007812,
"logps/rejected": -216.52243041992188,
"loss": 0.6608,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.23144832253456116,
"rewards/margins": 0.3668804168701172,
"rewards/rejected": -0.598328709602356,
"step": 152
},
{
"epoch": 0.6910374029640085,
"grad_norm": 10.934485175214164,
"learning_rate": 6.873554209514085e-08,
"logits/chosen": -3.5069892406463623,
"logits/rejected": -3.464691638946533,
"logps/chosen": -182.59202575683594,
"logps/rejected": -175.9390106201172,
"loss": 0.7086,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.1632474958896637,
"rewards/margins": 0.27051842212677,
"rewards/rejected": -0.43376588821411133,
"step": 153
},
{
"epoch": 0.6955539872971066,
"grad_norm": 12.823137287258696,
"learning_rate": 6.689347253346412e-08,
"logits/chosen": -3.5779876708984375,
"logits/rejected": -3.5178349018096924,
"logps/chosen": -232.82090759277344,
"logps/rejected": -217.25289916992188,
"loss": 0.6864,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.28119853138923645,
"rewards/margins": 0.29815465211868286,
"rewards/rejected": -0.5793532133102417,
"step": 154
},
{
"epoch": 0.7000705716302047,
"grad_norm": 13.681445102824224,
"learning_rate": 6.506931313465244e-08,
"logits/chosen": -3.563887119293213,
"logits/rejected": -3.5259387493133545,
"logps/chosen": -235.2152099609375,
"logps/rejected": -218.48611450195312,
"loss": 0.6864,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.2641199231147766,
"rewards/margins": 0.32186007499694824,
"rewards/rejected": -0.5859800577163696,
"step": 155
},
{
"epoch": 0.7045871559633028,
"grad_norm": 11.872530426234707,
"learning_rate": 6.326345702055698e-08,
"logits/chosen": -3.666203737258911,
"logits/rejected": -3.594811201095581,
"logps/chosen": -196.93624877929688,
"logps/rejected": -176.68557739257812,
"loss": 0.7163,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.1758304387331009,
"rewards/margins": 0.2576484978199005,
"rewards/rejected": -0.4334789514541626,
"step": 156
},
{
"epoch": 0.7091037402964009,
"grad_norm": 12.799664975071849,
"learning_rate": 6.147629336851552e-08,
"logits/chosen": -3.624176025390625,
"logits/rejected": -3.557004451751709,
"logps/chosen": -218.7604522705078,
"logps/rejected": -206.44232177734375,
"loss": 0.7112,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.2525786757469177,
"rewards/margins": 0.24881505966186523,
"rewards/rejected": -0.5013936758041382,
"step": 157
},
{
"epoch": 0.713620324629499,
"grad_norm": 10.877958914597729,
"learning_rate": 5.970820732748143e-08,
"logits/chosen": -3.4155774116516113,
"logits/rejected": -3.501858711242676,
"logps/chosen": -199.32589721679688,
"logps/rejected": -196.3961181640625,
"loss": 0.6945,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.22878919541835785,
"rewards/margins": 0.28728824853897095,
"rewards/rejected": -0.5160773992538452,
"step": 158
},
{
"epoch": 0.7181369089625971,
"grad_norm": 12.955814203296498,
"learning_rate": 5.795957993502092e-08,
"logits/chosen": -3.540942668914795,
"logits/rejected": -3.4938039779663086,
"logps/chosen": -210.17218017578125,
"logps/rejected": -208.64927673339844,
"loss": 0.7032,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.2663414180278778,
"rewards/margins": 0.27345699071884155,
"rewards/rejected": -0.539798378944397,
"step": 159
},
{
"epoch": 0.7226534932956952,
"grad_norm": 11.004852801575502,
"learning_rate": 5.623078803519595e-08,
"logits/chosen": -3.6684892177581787,
"logits/rejected": -3.609145402908325,
"logps/chosen": -197.72994995117188,
"logps/rejected": -178.94186401367188,
"loss": 0.699,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.17543372511863708,
"rewards/margins": 0.2890481650829315,
"rewards/rejected": -0.4644818603992462,
"step": 160
},
{
"epoch": 0.7271700776287933,
"grad_norm": 11.267055827604478,
"learning_rate": 5.4522204197351294e-08,
"logits/chosen": -3.6161813735961914,
"logits/rejected": -3.5283775329589844,
"logps/chosen": -217.74359130859375,
"logps/rejected": -197.9161376953125,
"loss": 0.6485,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.13655489683151245,
"rewards/margins": 0.43066203594207764,
"rewards/rejected": -0.5672169923782349,
"step": 161
},
{
"epoch": 0.7316866619618914,
"grad_norm": 11.455186989092018,
"learning_rate": 5.2834196635822626e-08,
"logits/chosen": -3.6274447441101074,
"logits/rejected": -3.5645861625671387,
"logps/chosen": -192.35226440429688,
"logps/rejected": -178.86270141601562,
"loss": 0.7223,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.23346485197544098,
"rewards/margins": 0.22727568447589874,
"rewards/rejected": -0.4607405364513397,
"step": 162
},
{
"epoch": 0.7362032462949895,
"grad_norm": 12.833815191659758,
"learning_rate": 5.1167129130583346e-08,
"logits/chosen": -3.5601654052734375,
"logits/rejected": -3.5595359802246094,
"logps/chosen": -231.76150512695312,
"logps/rejected": -218.86770629882812,
"loss": 0.6694,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.27048397064208984,
"rewards/margins": 0.35143792629241943,
"rewards/rejected": -0.6219218969345093,
"step": 163
},
{
"epoch": 0.7407198306280875,
"grad_norm": 11.087097762035608,
"learning_rate": 4.952136094884666e-08,
"logits/chosen": -3.6404900550842285,
"logits/rejected": -3.5640687942504883,
"logps/chosen": -188.69154357910156,
"logps/rejected": -169.15692138671875,
"loss": 0.6994,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.18032759428024292,
"rewards/margins": 0.27624213695526123,
"rewards/rejected": -0.45656976103782654,
"step": 164
},
{
"epoch": 0.7452364149611856,
"grad_norm": 11.909284446478502,
"learning_rate": 4.789724676764062e-08,
"logits/chosen": -3.5401947498321533,
"logits/rejected": -3.585221767425537,
"logps/chosen": -204.66531372070312,
"logps/rejected": -201.26177978515625,
"loss": 0.7003,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.2440950870513916,
"rewards/margins": 0.27386218309402466,
"rewards/rejected": -0.5179572701454163,
"step": 165
},
{
"epoch": 0.7497529992942837,
"grad_norm": 11.847238391062595,
"learning_rate": 4.629513659737209e-08,
"logits/chosen": -3.520542860031128,
"logits/rejected": -3.4925966262817383,
"logps/chosen": -226.33909606933594,
"logps/rejected": -211.7904815673828,
"loss": 0.68,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.25608351826667786,
"rewards/margins": 0.3440699577331543,
"rewards/rejected": -0.6001534461975098,
"step": 166
},
{
"epoch": 0.7542695836273818,
"grad_norm": 11.52418594856635,
"learning_rate": 4.471537570639676e-08,
"logits/chosen": -3.586939811706543,
"logits/rejected": -3.5489325523376465,
"logps/chosen": -209.86761474609375,
"logps/rejected": -194.3120880126953,
"loss": 0.6738,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.18899373710155487,
"rewards/margins": 0.36132901906967163,
"rewards/rejected": -0.5503227710723877,
"step": 167
},
{
"epoch": 0.7587861679604799,
"grad_norm": 11.746177386790478,
"learning_rate": 4.315830454661059e-08,
"logits/chosen": -3.519134521484375,
"logits/rejected": -3.450460910797119,
"logps/chosen": -224.33236694335938,
"logps/rejected": -205.32876586914062,
"loss": 0.6724,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.14168740808963776,
"rewards/margins": 0.36835241317749023,
"rewards/rejected": -0.5100398063659668,
"step": 168
},
{
"epoch": 0.763302752293578,
"grad_norm": 13.727528925392352,
"learning_rate": 4.1624258680079695e-08,
"logits/chosen": -3.5889594554901123,
"logits/rejected": -3.6074798107147217,
"logps/chosen": -190.32859802246094,
"logps/rejected": -181.37701416015625,
"loss": 0.7041,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.21288928389549255,
"rewards/margins": 0.2832014262676239,
"rewards/rejected": -0.49609071016311646,
"step": 169
},
{
"epoch": 0.7678193366266761,
"grad_norm": 12.36532095556768,
"learning_rate": 4.0113568706723745e-08,
"logits/chosen": -3.5663981437683105,
"logits/rejected": -3.5320568084716797,
"logps/chosen": -205.47369384765625,
"logps/rejected": -189.5297393798828,
"loss": 0.6821,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.21529924869537354,
"rewards/margins": 0.3193380832672119,
"rewards/rejected": -0.5346373319625854,
"step": 170
},
{
"epoch": 0.7723359209597742,
"grad_norm": 11.211886652040736,
"learning_rate": 3.8626560193069194e-08,
"logits/chosen": -3.6578316688537598,
"logits/rejected": -3.5587844848632812,
"logps/chosen": -184.16693115234375,
"logps/rejected": -164.69442749023438,
"loss": 0.7124,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.1814623475074768,
"rewards/margins": 0.24063682556152344,
"rewards/rejected": -0.42209917306900024,
"step": 171
},
{
"epoch": 0.7768525052928723,
"grad_norm": 11.991981526284254,
"learning_rate": 3.71635536020865e-08,
"logits/chosen": -3.687999725341797,
"logits/rejected": -3.5739190578460693,
"logps/chosen": -187.41111755371094,
"logps/rejected": -170.08901977539062,
"loss": 0.6982,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.1899150013923645,
"rewards/margins": 0.27486538887023926,
"rewards/rejected": -0.46478039026260376,
"step": 172
},
{
"epoch": 0.7813690896259704,
"grad_norm": 11.682125153864412,
"learning_rate": 3.572486422412786e-08,
"logits/chosen": -3.541208028793335,
"logits/rejected": -3.4319119453430176,
"logps/chosen": -232.7732696533203,
"logps/rejected": -207.6559295654297,
"loss": 0.6788,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.19109418988227844,
"rewards/margins": 0.37378770112991333,
"rewards/rejected": -0.5648818612098694,
"step": 173
},
{
"epoch": 0.7858856739590685,
"grad_norm": 11.96356466385332,
"learning_rate": 3.4310802108979456e-08,
"logits/chosen": -3.6099984645843506,
"logits/rejected": -3.588571071624756,
"logps/chosen": -221.05567932128906,
"logps/rejected": -211.12063598632812,
"loss": 0.6749,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.2735821604728699,
"rewards/margins": 0.365002304315567,
"rewards/rejected": -0.6385844945907593,
"step": 174
},
{
"epoch": 0.7904022582921666,
"grad_norm": 12.839800224543241,
"learning_rate": 3.292167199904311e-08,
"logits/chosen": -3.5515999794006348,
"logits/rejected": -3.5092904567718506,
"logps/chosen": -238.809814453125,
"logps/rejected": -215.16036987304688,
"loss": 0.6457,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.21585515141487122,
"rewards/margins": 0.43380266427993774,
"rewards/rejected": -0.6496578454971313,
"step": 175
},
{
"epoch": 0.7949188426252647,
"grad_norm": 10.718656806306809,
"learning_rate": 3.1557773263661604e-08,
"logits/chosen": -3.477273941040039,
"logits/rejected": -3.4155681133270264,
"logps/chosen": -229.67233276367188,
"logps/rejected": -217.80172729492188,
"loss": 0.681,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.3297360837459564,
"rewards/margins": 0.32702451944351196,
"rewards/rejected": -0.6567606329917908,
"step": 176
},
{
"epoch": 0.7994354269583628,
"grad_norm": 12.333864621322466,
"learning_rate": 3.02193998346021e-08,
"logits/chosen": -3.5071773529052734,
"logits/rejected": -3.4976847171783447,
"logps/chosen": -221.59481811523438,
"logps/rejected": -209.19186401367188,
"loss": 0.6713,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.26982712745666504,
"rewards/margins": 0.3569854199886322,
"rewards/rejected": -0.6268125176429749,
"step": 177
},
{
"epoch": 0.8039520112914609,
"grad_norm": 12.039934107046923,
"learning_rate": 2.8906840142711338e-08,
"logits/chosen": -3.606412410736084,
"logits/rejected": -3.5920205116271973,
"logps/chosen": -235.42181396484375,
"logps/rejected": -216.65164184570312,
"loss": 0.6652,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.23246988654136658,
"rewards/margins": 0.3717581629753113,
"rewards/rejected": -0.6042280197143555,
"step": 178
},
{
"epoch": 0.808468595624559,
"grad_norm": 11.030662943317653,
"learning_rate": 2.7620377055756423e-08,
"logits/chosen": -3.624559164047241,
"logits/rejected": -3.638848066329956,
"logps/chosen": -191.0684814453125,
"logps/rejected": -183.99212646484375,
"loss": 0.6806,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.1932627409696579,
"rewards/margins": 0.34079742431640625,
"rewards/rejected": -0.5340601205825806,
"step": 179
},
{
"epoch": 0.8129851799576571,
"grad_norm": 11.319505715592419,
"learning_rate": 2.6360287817464256e-08,
"logits/chosen": -3.5405383110046387,
"logits/rejected": -3.5048179626464844,
"logps/chosen": -207.15835571289062,
"logps/rejected": -188.7417449951172,
"loss": 0.6835,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.19087350368499756,
"rewards/margins": 0.3352086842060089,
"rewards/rejected": -0.5260821580886841,
"step": 180
},
{
"epoch": 0.8175017642907552,
"grad_norm": 13.430513620252457,
"learning_rate": 2.512684398777329e-08,
"logits/chosen": -3.5324528217315674,
"logits/rejected": -3.4295010566711426,
"logps/chosen": -213.90878295898438,
"logps/rejected": -195.8572998046875,
"loss": 0.6963,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.23799797892570496,
"rewards/margins": 0.2941531538963318,
"rewards/rejected": -0.5321511626243591,
"step": 181
},
{
"epoch": 0.8220183486238533,
"grad_norm": 11.575530525209127,
"learning_rate": 2.3920311384309914e-08,
"logits/chosen": -3.601444721221924,
"logits/rejected": -3.5109052658081055,
"logps/chosen": -199.96237182617188,
"logps/rejected": -183.93069458007812,
"loss": 0.7104,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.22000840306282043,
"rewards/margins": 0.2704961895942688,
"rewards/rejected": -0.4905046224594116,
"step": 182
},
{
"epoch": 0.8265349329569514,
"grad_norm": 11.049495139232015,
"learning_rate": 2.2740950025102763e-08,
"logits/chosen": -3.5267105102539062,
"logits/rejected": -3.5060598850250244,
"logps/chosen": -204.97425842285156,
"logps/rejected": -196.4848175048828,
"loss": 0.7116,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.25764691829681396,
"rewards/margins": 0.26898401975631714,
"rewards/rejected": -0.5266309976577759,
"step": 183
},
{
"epoch": 0.8310515172900494,
"grad_norm": 12.243629851598072,
"learning_rate": 2.158901407254629e-08,
"logits/chosen": -3.6085617542266846,
"logits/rejected": -3.5918960571289062,
"logps/chosen": -207.69467163085938,
"logps/rejected": -201.7321014404297,
"loss": 0.6776,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.26026493310928345,
"rewards/margins": 0.3179108500480652,
"rewards/rejected": -0.5781757235527039,
"step": 184
},
{
"epoch": 0.8355681016231475,
"grad_norm": 10.504114021037811,
"learning_rate": 2.0464751778626836e-08,
"logits/chosen": -3.489086151123047,
"logits/rejected": -3.5122947692871094,
"logps/chosen": -227.21865844726562,
"logps/rejected": -225.1961669921875,
"loss": 0.6854,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.29528743028640747,
"rewards/margins": 0.34715089201927185,
"rewards/rejected": -0.6424383521080017,
"step": 185
},
{
"epoch": 0.8400846859562456,
"grad_norm": 12.604767051843007,
"learning_rate": 1.9368405431422102e-08,
"logits/chosen": -3.492341995239258,
"logits/rejected": -3.5087356567382812,
"logps/chosen": -239.1469268798828,
"logps/rejected": -227.90689086914062,
"loss": 0.6872,
"rewards/accuracies": 0.890625,
"rewards/chosen": -0.3633117079734802,
"rewards/margins": 0.31293728947639465,
"rewards/rejected": -0.6762489676475525,
"step": 186
},
{
"epoch": 0.8446012702893437,
"grad_norm": 13.007777850694795,
"learning_rate": 1.8300211302886137e-08,
"logits/chosen": -3.5985684394836426,
"logits/rejected": -3.560959815979004,
"logps/chosen": -233.37771606445312,
"logps/rejected": -218.78062438964844,
"loss": 0.6679,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.31275439262390137,
"rewards/margins": 0.35552066564559937,
"rewards/rejected": -0.6682751178741455,
"step": 187
},
{
"epoch": 0.8491178546224418,
"grad_norm": 12.224849381089012,
"learning_rate": 1.726039959793059e-08,
"logits/chosen": -3.6801674365997314,
"logits/rejected": -3.643399953842163,
"logps/chosen": -201.95065307617188,
"logps/rejected": -191.9815673828125,
"loss": 0.7021,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.23352208733558655,
"rewards/margins": 0.2819408178329468,
"rewards/rejected": -0.5154628753662109,
"step": 188
},
{
"epoch": 0.8536344389555399,
"grad_norm": 11.06103238426834,
"learning_rate": 1.6249194404813633e-08,
"logits/chosen": -3.583648204803467,
"logits/rejected": -3.536038398742676,
"logps/chosen": -205.93267822265625,
"logps/rejected": -188.04385375976562,
"loss": 0.6819,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.20944997668266296,
"rewards/margins": 0.33364248275756836,
"rewards/rejected": -0.5430924892425537,
"step": 189
},
{
"epoch": 0.858151023288638,
"grad_norm": 11.515369124606178,
"learning_rate": 1.526681364684707e-08,
"logits/chosen": -3.544914722442627,
"logits/rejected": -3.477529525756836,
"logps/chosen": -246.21878051757812,
"logps/rejected": -227.75767517089844,
"loss": 0.6284,
"rewards/accuracies": 0.890625,
"rewards/chosen": -0.25970178842544556,
"rewards/margins": 0.4616071283817291,
"rewards/rejected": -0.7213089466094971,
"step": 190
},
{
"epoch": 0.8626676076217361,
"grad_norm": 11.17899475785984,
"learning_rate": 1.4313469035432053e-08,
"logits/chosen": -3.539396286010742,
"logits/rejected": -3.491584062576294,
"logps/chosen": -217.02066040039062,
"logps/rejected": -194.50637817382812,
"loss": 0.6783,
"rewards/accuracies": 0.890625,
"rewards/chosen": -0.23896610736846924,
"rewards/margins": 0.36122721433639526,
"rewards/rejected": -0.6001933813095093,
"step": 191
},
{
"epoch": 0.8671841919548342,
"grad_norm": 11.452045965709495,
"learning_rate": 1.3389366024433346e-08,
"logits/chosen": -3.508542776107788,
"logits/rejected": -3.4577219486236572,
"logps/chosen": -209.47579956054688,
"logps/rejected": -191.61953735351562,
"loss": 0.6856,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.23392406105995178,
"rewards/margins": 0.3334580063819885,
"rewards/rejected": -0.5673820972442627,
"step": 192
},
{
"epoch": 0.8717007762879323,
"grad_norm": 11.051060681758146,
"learning_rate": 1.2494703765902337e-08,
"logits/chosen": -3.545461654663086,
"logits/rejected": -3.5021471977233887,
"logps/chosen": -221.29922485351562,
"logps/rejected": -205.3109588623047,
"loss": 0.6884,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.26481908559799194,
"rewards/margins": 0.3176526129245758,
"rewards/rejected": -0.5824716687202454,
"step": 193
},
{
"epoch": 0.8762173606210304,
"grad_norm": 11.263698377490607,
"learning_rate": 1.1629675067158119e-08,
"logits/chosen": -3.59144926071167,
"logits/rejected": -3.596311569213867,
"logps/chosen": -223.28732299804688,
"logps/rejected": -207.96267700195312,
"loss": 0.6421,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.16717705130577087,
"rewards/margins": 0.46462714672088623,
"rewards/rejected": -0.6318042278289795,
"step": 194
},
{
"epoch": 0.8807339449541285,
"grad_norm": 11.60107390050444,
"learning_rate": 1.0794466349235865e-08,
"logits/chosen": -3.6060633659362793,
"logits/rejected": -3.517813205718994,
"logps/chosen": -202.70123291015625,
"logps/rejected": -183.82339477539062,
"loss": 0.6759,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.16405262053012848,
"rewards/margins": 0.36374321579933167,
"rewards/rejected": -0.5277957916259766,
"step": 195
},
{
"epoch": 0.8852505292872266,
"grad_norm": 11.879162180318056,
"learning_rate": 9.989257606711438e-09,
"logits/chosen": -3.59369158744812,
"logits/rejected": -3.5690078735351562,
"logps/chosen": -226.8720703125,
"logps/rejected": -210.02069091796875,
"loss": 0.6661,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.2107160985469818,
"rewards/margins": 0.3873599171638489,
"rewards/rejected": -0.5980759859085083,
"step": 196
},
{
"epoch": 0.8897671136203247,
"grad_norm": 11.434668917143895,
"learning_rate": 9.214222368911112e-09,
"logits/chosen": -3.5294957160949707,
"logits/rejected": -3.504406452178955,
"logps/chosen": -210.91481018066406,
"logps/rejected": -190.58346557617188,
"loss": 0.6694,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.17558521032333374,
"rewards/margins": 0.37856271862983704,
"rewards/rejected": -0.5541479587554932,
"step": 197
},
{
"epoch": 0.8942836979534228,
"grad_norm": 11.442820506378803,
"learning_rate": 8.469527662514425e-09,
"logits/chosen": -3.6074564456939697,
"logits/rejected": -3.5393269062042236,
"logps/chosen": -206.037109375,
"logps/rejected": -184.319580078125,
"loss": 0.6837,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.2739471197128296,
"rewards/margins": 0.31510984897613525,
"rewards/rejected": -0.5890569686889648,
"step": 198
},
{
"epoch": 0.8988002822865209,
"grad_norm": 10.48141977546952,
"learning_rate": 7.755333975558703e-09,
"logits/chosen": -3.534519910812378,
"logits/rejected": -3.546844005584717,
"logps/chosen": -209.56192016601562,
"logps/rejected": -192.3565673828125,
"loss": 0.656,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.1646820604801178,
"rewards/margins": 0.422355055809021,
"rewards/rejected": -0.5870370864868164,
"step": 199
},
{
"epoch": 0.903316866619619,
"grad_norm": 11.512112387427932,
"learning_rate": 7.071795222852295e-09,
"logits/chosen": -3.5276589393615723,
"logits/rejected": -3.4519450664520264,
"logps/chosen": -203.3861083984375,
"logps/rejected": -186.114013671875,
"loss": 0.7095,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.2570461928844452,
"rewards/margins": 0.2705690562725067,
"rewards/rejected": -0.5276152491569519,
"step": 200
},
{
"epoch": 0.9078334509527171,
"grad_norm": 11.664115289787672,
"learning_rate": 6.41905871280477e-09,
"logits/chosen": -3.5618152618408203,
"logits/rejected": -3.5523681640625,
"logps/chosen": -208.64450073242188,
"logps/rejected": -194.45249938964844,
"loss": 0.7034,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.2307577133178711,
"rewards/margins": 0.2903120219707489,
"rewards/rejected": -0.5210697650909424,
"step": 201
},
{
"epoch": 0.9123500352858152,
"grad_norm": 12.221452533932299,
"learning_rate": 5.797265115680649e-09,
"logits/chosen": -3.560934543609619,
"logits/rejected": -3.5254406929016113,
"logps/chosen": -212.36541748046875,
"logps/rejected": -200.04124450683594,
"loss": 0.7023,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.3073262572288513,
"rewards/margins": 0.28462159633636475,
"rewards/rejected": -0.5919477939605713,
"step": 202
},
{
"epoch": 0.9168666196189132,
"grad_norm": 11.449688321668301,
"learning_rate": 5.206548433283803e-09,
"logits/chosen": -3.5118632316589355,
"logits/rejected": -3.5008668899536133,
"logps/chosen": -216.9571990966797,
"logps/rejected": -194.16934204101562,
"loss": 0.6675,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.248696431517601,
"rewards/margins": 0.396094411611557,
"rewards/rejected": -0.6447908282279968,
"step": 203
},
{
"epoch": 0.9213832039520113,
"grad_norm": 12.04856516660961,
"learning_rate": 4.6470359700788995e-09,
"logits/chosen": -3.621587038040161,
"logits/rejected": -3.5827927589416504,
"logps/chosen": -221.77987670898438,
"logps/rejected": -201.95237731933594,
"loss": 0.6732,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.26525014638900757,
"rewards/margins": 0.36969608068466187,
"rewards/rejected": -0.6349462270736694,
"step": 204
},
{
"epoch": 0.9258997882851094,
"grad_norm": 11.722034459630727,
"learning_rate": 4.118848305756173e-09,
"logits/chosen": -3.595360279083252,
"logits/rejected": -3.5677242279052734,
"logps/chosen": -223.11941528320312,
"logps/rejected": -205.64205932617188,
"loss": 0.6758,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.2749682664871216,
"rewards/margins": 0.3367688059806824,
"rewards/rejected": -0.611737072467804,
"step": 205
},
{
"epoch": 0.9304163726182075,
"grad_norm": 11.503388221795708,
"learning_rate": 3.622099269245571e-09,
"logits/chosen": -3.60404109954834,
"logits/rejected": -3.5229146480560303,
"logps/chosen": -223.0252685546875,
"logps/rejected": -200.0838623046875,
"loss": 0.6655,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.18073460459709167,
"rewards/margins": 0.37948232889175415,
"rewards/rejected": -0.5602169036865234,
"step": 206
},
{
"epoch": 0.9349329569513056,
"grad_norm": 10.985620131645993,
"learning_rate": 3.156895914185581e-09,
"logits/chosen": -3.526765823364258,
"logits/rejected": -3.5020856857299805,
"logps/chosen": -209.4959259033203,
"logps/rejected": -197.26101684570312,
"loss": 0.6902,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.24708959460258484,
"rewards/margins": 0.3107626438140869,
"rewards/rejected": -0.5578522086143494,
"step": 207
},
{
"epoch": 0.9394495412844037,
"grad_norm": 12.99867871857846,
"learning_rate": 2.7233384958522676e-09,
"logits/chosen": -3.6278443336486816,
"logits/rejected": -3.6109533309936523,
"logps/chosen": -207.57948303222656,
"logps/rejected": -194.7212677001953,
"loss": 0.6843,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.23499882221221924,
"rewards/margins": 0.3160548508167267,
"rewards/rejected": -0.5510536432266235,
"step": 208
},
{
"epoch": 0.9439661256175018,
"grad_norm": 11.2046360254521,
"learning_rate": 2.321520449553421e-09,
"logits/chosen": -3.5132226943969727,
"logits/rejected": -3.491821527481079,
"logps/chosen": -206.74038696289062,
"logps/rejected": -185.68463134765625,
"loss": 0.6676,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.1989631950855255,
"rewards/margins": 0.39246267080307007,
"rewards/rejected": -0.591425895690918,
"step": 209
},
{
"epoch": 0.9484827099505999,
"grad_norm": 13.729648643233745,
"learning_rate": 1.9515283704924667e-09,
"logits/chosen": -3.6648130416870117,
"logits/rejected": -3.5529017448425293,
"logps/chosen": -232.48114013671875,
"logps/rejected": -210.60609436035156,
"loss": 0.6581,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.2861940860748291,
"rewards/margins": 0.377960741519928,
"rewards/rejected": -0.6641547679901123,
"step": 210
},
{
"epoch": 0.952999294283698,
"grad_norm": 12.04630458243632,
"learning_rate": 1.6134419951064404e-09,
"logits/chosen": -3.586575984954834,
"logits/rejected": -3.577587127685547,
"logps/chosen": -201.98065185546875,
"logps/rejected": -184.5204620361328,
"loss": 0.6845,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.21713638305664062,
"rewards/margins": 0.32104575634002686,
"rewards/rejected": -0.5381821393966675,
"step": 211
},
{
"epoch": 0.9575158786167961,
"grad_norm": 11.591398022496064,
"learning_rate": 1.3073341838821028e-09,
"logits/chosen": -3.606295585632324,
"logits/rejected": -3.6138882637023926,
"logps/chosen": -222.92762756347656,
"logps/rejected": -213.224365234375,
"loss": 0.6756,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.2295455038547516,
"rewards/margins": 0.37904244661331177,
"rewards/rejected": -0.608587920665741,
"step": 212
},
{
"epoch": 0.9620324629498942,
"grad_norm": 14.166283230715495,
"learning_rate": 1.033270905653949e-09,
"logits/chosen": -3.615994453430176,
"logits/rejected": -3.5957651138305664,
"logps/chosen": -197.43399047851562,
"logps/rejected": -187.89100646972656,
"loss": 0.7109,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.2787552773952484,
"rewards/margins": 0.2632961571216583,
"rewards/rejected": -0.5420514345169067,
"step": 213
},
{
"epoch": 0.9665490472829923,
"grad_norm": 12.160467402707475,
"learning_rate": 7.913112233872476e-10,
"logits/chosen": -3.5975520610809326,
"logits/rejected": -3.5754852294921875,
"logps/chosen": -212.54818725585938,
"logps/rejected": -193.0272216796875,
"loss": 0.666,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.23875494301319122,
"rewards/margins": 0.3742338716983795,
"rewards/rejected": -0.6129888296127319,
"step": 214
},
{
"epoch": 0.9710656316160904,
"grad_norm": 11.288257675305129,
"learning_rate": 5.815072814496225e-10,
"logits/chosen": -3.5631093978881836,
"logits/rejected": -3.5513646602630615,
"logps/chosen": -179.36865234375,
"logps/rejected": -169.8910675048828,
"loss": 0.6992,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.21426759660243988,
"rewards/margins": 0.2879588007926941,
"rewards/rejected": -0.5022264122962952,
"step": 215
},
{
"epoch": 0.9755822159491885,
"grad_norm": 11.189287909041054,
"learning_rate": 4.0390429437332505e-10,
"logits/chosen": -3.49703311920166,
"logits/rejected": -3.45841908454895,
"logps/chosen": -210.85260009765625,
"logps/rejected": -196.04306030273438,
"loss": 0.6895,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.2567548155784607,
"rewards/margins": 0.3082759380340576,
"rewards/rejected": -0.5650308132171631,
"step": 216
},
{
"epoch": 0.9800988002822866,
"grad_norm": 11.496270664039283,
"learning_rate": 2.585405371112459e-10,
"logits/chosen": -3.5845980644226074,
"logits/rejected": -3.5456676483154297,
"logps/chosen": -199.40249633789062,
"logps/rejected": -191.75473022460938,
"loss": 0.733,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.319938600063324,
"rewards/margins": 0.20854635536670685,
"rewards/rejected": -0.5284849405288696,
"step": 217
},
{
"epoch": 0.9846153846153847,
"grad_norm": 12.656855394825056,
"learning_rate": 1.454473367883291e-10,
"logits/chosen": -3.478101968765259,
"logits/rejected": -3.501936435699463,
"logps/chosen": -222.27523803710938,
"logps/rejected": -206.86306762695312,
"loss": 0.6763,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.26684999465942383,
"rewards/margins": 0.34992286562919617,
"rewards/rejected": -0.6167728900909424,
"step": 218
},
{
"epoch": 0.9891319689484828,
"grad_norm": 13.110720385558672,
"learning_rate": 6.464906595023967e-11,
"logits/chosen": -3.5070900917053223,
"logits/rejected": -3.489541530609131,
"logps/chosen": -221.77259826660156,
"logps/rejected": -207.49349975585938,
"loss": 0.6686,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.22918352484703064,
"rewards/margins": 0.3775954246520996,
"rewards/rejected": -0.6067789793014526,
"step": 219
},
{
"epoch": 0.9936485532815809,
"grad_norm": 10.689364544141856,
"learning_rate": 1.616313731091501e-11,
"logits/chosen": -3.472511053085327,
"logits/rejected": -3.4571049213409424,
"logps/chosen": -218.83880615234375,
"logps/rejected": -205.46530151367188,
"loss": 0.6523,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.24689728021621704,
"rewards/margins": 0.4170025587081909,
"rewards/rejected": -0.663899838924408,
"step": 220
},
{
"epoch": 0.998165137614679,
"grad_norm": 11.564165675567537,
"learning_rate": 0.0,
"logits/chosen": -3.556617259979248,
"logits/rejected": -3.5278592109680176,
"logps/chosen": -206.43194580078125,
"logps/rejected": -186.3187713623047,
"loss": 0.6886,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.1849951595067978,
"rewards/margins": 0.3221530318260193,
"rewards/rejected": -0.5071482062339783,
"step": 221
}
],
"logging_steps": 1,
"max_steps": 221,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 316050221826048.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}