{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998165137614679, "eval_steps": 500, "global_step": 221, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004516584333098095, "grad_norm": 17.491543776149005, "learning_rate": 4.285714285714285e-08, "logits/chosen": -3.703737258911133, "logits/rejected": -3.642177104949951, "logps/chosen": -230.21658325195312, "logps/rejected": -213.08389282226562, "loss": 0.8161, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.00903316866619619, "grad_norm": 16.5951979105037, "learning_rate": 8.57142857142857e-08, "logits/chosen": -3.811067819595337, "logits/rejected": -3.761146306991577, "logps/chosen": -186.53829956054688, "logps/rejected": -172.09280395507812, "loss": 0.818, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.013549752999294284, "grad_norm": 16.426043598595893, "learning_rate": 1.2857142857142855e-07, "logits/chosen": -3.673379898071289, "logits/rejected": -3.574615955352783, "logps/chosen": -185.69815063476562, "logps/rejected": -167.53453063964844, "loss": 0.8218, "rewards/accuracies": 0.421875, "rewards/chosen": -0.00822072196751833, "rewards/margins": -0.011108924634754658, "rewards/rejected": 0.0028882024344056845, "step": 3 }, { "epoch": 0.01806633733239238, "grad_norm": 17.182455484491946, "learning_rate": 1.714285714285714e-07, "logits/chosen": -3.6595711708068848, "logits/rejected": -3.664757490158081, "logps/chosen": -233.35186767578125, "logps/rejected": -220.01324462890625, "loss": 0.8183, "rewards/accuracies": 0.453125, "rewards/chosen": -0.0016412150580435991, "rewards/margins": -0.0031568286940455437, "rewards/rejected": 0.0015156148001551628, "step": 4 }, { "epoch": 0.022582921665490474, "grad_norm": 15.69002599516555, "learning_rate": 2.1428571428571428e-07, "logits/chosen": -3.727100133895874, "logits/rejected": -3.737985134124756, "logps/chosen": -190.59376525878906, "logps/rejected": -175.35711669921875, "loss": 0.8179, "rewards/accuracies": 0.46875, "rewards/chosen": -0.007597364019602537, "rewards/margins": -0.0018714312463998795, "rewards/rejected": -0.005725932773202658, "step": 5 }, { "epoch": 0.02709950599858857, "grad_norm": 15.612644086790919, "learning_rate": 2.571428571428571e-07, "logits/chosen": -3.6620445251464844, "logits/rejected": -3.6660032272338867, "logps/chosen": -184.26095581054688, "logps/rejected": -162.46971130371094, "loss": 0.8186, "rewards/accuracies": 0.546875, "rewards/chosen": -0.008165668696165085, "rewards/margins": -0.008361553773283958, "rewards/rejected": 0.00019588530994951725, "step": 6 }, { "epoch": 0.031616090331686664, "grad_norm": 13.683214125166593, "learning_rate": 3e-07, "logits/chosen": -3.690432071685791, "logits/rejected": -3.5761916637420654, "logps/chosen": -170.06161499023438, "logps/rejected": -156.4132080078125, "loss": 0.8176, "rewards/accuracies": 0.359375, "rewards/chosen": -0.00132226780988276, "rewards/margins": -0.01818685233592987, "rewards/rejected": 0.016864586621522903, "step": 7 }, { "epoch": 0.03613267466478476, "grad_norm": 16.4402451983829, "learning_rate": 2.999838368626891e-07, "logits/chosen": -3.741443157196045, "logits/rejected": -3.670546293258667, "logps/chosen": -196.43753051757812, "logps/rejected": -175.54696655273438, "loss": 0.8229, "rewards/accuracies": 0.46875, "rewards/chosen": -0.003386292140930891, "rewards/margins": -0.005730946082621813, "rewards/rejected": 0.002344653941690922, "step": 8 }, { "epoch": 0.04064925899788285, "grad_norm": 17.452034360978363, "learning_rate": 2.9993535093404974e-07, "logits/chosen": -3.6631717681884766, "logits/rejected": -3.5741701126098633, "logps/chosen": -229.43096923828125, "logps/rejected": -198.76205444335938, "loss": 0.8287, "rewards/accuracies": 0.421875, "rewards/chosen": -0.007437766529619694, "rewards/margins": -0.012869942933321, "rewards/rejected": 0.005432176869362593, "step": 9 }, { "epoch": 0.04516584333098095, "grad_norm": 18.322329811535944, "learning_rate": 2.998545526632117e-07, "logits/chosen": -3.725301504135132, "logits/rejected": -3.667342185974121, "logps/chosen": -203.80462646484375, "logps/rejected": -184.80245971679688, "loss": 0.8243, "rewards/accuracies": 0.390625, "rewards/chosen": -0.009436231106519699, "rewards/margins": -0.01625261828303337, "rewards/rejected": 0.0068163881078362465, "step": 10 }, { "epoch": 0.04968242766407904, "grad_norm": 18.170319693328302, "learning_rate": 2.9974145946288874e-07, "logits/chosen": -3.6613407135009766, "logits/rejected": -3.6700329780578613, "logps/chosen": -222.37948608398438, "logps/rejected": -197.45956420898438, "loss": 0.8201, "rewards/accuracies": 0.5, "rewards/chosen": -0.00792229175567627, "rewards/margins": -0.00381114287301898, "rewards/rejected": -0.004111149813979864, "step": 11 }, { "epoch": 0.05419901199717714, "grad_norm": 16.73406724526876, "learning_rate": 2.9959609570562665e-07, "logits/chosen": -3.6817235946655273, "logits/rejected": -3.6180002689361572, "logps/chosen": -207.94598388671875, "logps/rejected": -185.84361267089844, "loss": 0.8223, "rewards/accuracies": 0.5, "rewards/chosen": -0.0013277027755975723, "rewards/margins": -0.012387244962155819, "rewards/rejected": 0.011059543117880821, "step": 12 }, { "epoch": 0.05871559633027523, "grad_norm": 14.830646302858986, "learning_rate": 2.994184927185504e-07, "logits/chosen": -3.5986547470092773, "logits/rejected": -3.653524875640869, "logps/chosen": -195.15597534179688, "logps/rejected": -176.31295776367188, "loss": 0.8152, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0033681748900562525, "rewards/margins": 0.006227727048099041, "rewards/rejected": -0.002859552390873432, "step": 13 }, { "epoch": 0.06323218066337333, "grad_norm": 14.922261930025773, "learning_rate": 2.9920868877661274e-07, "logits/chosen": -3.749242067337036, "logits/rejected": -3.669080972671509, "logps/chosen": -187.9488525390625, "logps/rejected": -171.089111328125, "loss": 0.817, "rewards/accuracies": 0.5, "rewards/chosen": -0.010204151272773743, "rewards/margins": -0.0027150483801960945, "rewards/rejected": -0.007489103823900223, "step": 14 }, { "epoch": 0.06774876499647142, "grad_norm": 16.851808937739815, "learning_rate": 2.9896672909434605e-07, "logits/chosen": -3.7500953674316406, "logits/rejected": -3.6712448596954346, "logps/chosen": -207.16380310058594, "logps/rejected": -186.6913299560547, "loss": 0.8188, "rewards/accuracies": 0.484375, "rewards/chosen": -0.008293930441141129, "rewards/margins": -0.005823222920298576, "rewards/rejected": -0.0024707079865038395, "step": 15 }, { "epoch": 0.07226534932956952, "grad_norm": 14.552003861923032, "learning_rate": 2.986926658161179e-07, "logits/chosen": -3.640725612640381, "logits/rejected": -3.612764835357666, "logps/chosen": -192.30868530273438, "logps/rejected": -175.70521545410156, "loss": 0.8232, "rewards/accuracies": 0.5, "rewards/chosen": 0.0005644555203616619, "rewards/margins": -0.005334064364433289, "rewards/rejected": 0.005898520816117525, "step": 16 }, { "epoch": 0.07678193366266761, "grad_norm": 16.088064283526844, "learning_rate": 2.9838655800489354e-07, "logits/chosen": -3.6310815811157227, "logits/rejected": -3.6059412956237793, "logps/chosen": -201.4388427734375, "logps/rejected": -183.0606689453125, "loss": 0.8124, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0131416916847229, "rewards/margins": 0.013241738080978394, "rewards/rejected": -0.00010004616342484951, "step": 17 }, { "epoch": 0.0812985179957657, "grad_norm": 14.992244007771152, "learning_rate": 2.980484716295075e-07, "logits/chosen": -3.733997344970703, "logits/rejected": -3.698491096496582, "logps/chosen": -188.22225952148438, "logps/rejected": -173.397705078125, "loss": 0.8156, "rewards/accuracies": 0.484375, "rewards/chosen": 0.005783616099506617, "rewards/margins": 0.006108994595706463, "rewards/rejected": -0.00032537919469177723, "step": 18 }, { "epoch": 0.0858151023288638, "grad_norm": 19.12841459122326, "learning_rate": 2.976784795504466e-07, "logits/chosen": -3.6905295848846436, "logits/rejected": -3.5900840759277344, "logps/chosen": -203.93548583984375, "logps/rejected": -177.45327758789062, "loss": 0.8194, "rewards/accuracies": 0.546875, "rewards/chosen": -0.008039581589400768, "rewards/margins": 0.008988430723547935, "rewards/rejected": -0.01702801324427128, "step": 19 }, { "epoch": 0.0903316866619619, "grad_norm": 16.487134694988207, "learning_rate": 2.972766615041477e-07, "logits/chosen": -3.625434398651123, "logits/rejected": -3.5620625019073486, "logps/chosen": -230.413818359375, "logps/rejected": -209.59951782226562, "loss": 0.8031, "rewards/accuracies": 0.640625, "rewards/chosen": 0.0009453308302909136, "rewards/margins": 0.023260660469532013, "rewards/rejected": -0.02231532707810402, "step": 20 }, { "epoch": 0.09484827099505999, "grad_norm": 18.422012266521907, "learning_rate": 2.968431040858144e-07, "logits/chosen": -3.6120433807373047, "logits/rejected": -3.5835161209106445, "logps/chosen": -179.35166931152344, "logps/rejected": -174.52101135253906, "loss": 0.8095, "rewards/accuracies": 0.5625, "rewards/chosen": -0.007007395848631859, "rewards/margins": 0.009391836822032928, "rewards/rejected": -0.016399234533309937, "step": 21 }, { "epoch": 0.09936485532815809, "grad_norm": 19.19498923375192, "learning_rate": 2.963779007307544e-07, "logits/chosen": -3.5445475578308105, "logits/rejected": -3.5850906372070312, "logps/chosen": -232.3138427734375, "logps/rejected": -210.61709594726562, "loss": 0.8196, "rewards/accuracies": 0.625, "rewards/chosen": 0.0029985038563609123, "rewards/margins": 0.020012138411402702, "rewards/rejected": -0.017013631761074066, "step": 22 }, { "epoch": 0.10388143966125618, "grad_norm": 15.614857590390317, "learning_rate": 2.958811516942438e-07, "logits/chosen": -3.697209358215332, "logits/rejected": -3.624351978302002, "logps/chosen": -197.09347534179688, "logps/rejected": -177.77401733398438, "loss": 0.804, "rewards/accuracies": 0.609375, "rewards/chosen": 0.0026644528843462467, "rewards/margins": 0.028430193662643433, "rewards/rejected": -0.025765739381313324, "step": 23 }, { "epoch": 0.10839802399435428, "grad_norm": 15.89381246255073, "learning_rate": 2.953529640299211e-07, "logits/chosen": -3.600754737854004, "logits/rejected": -3.5322327613830566, "logps/chosen": -228.12118530273438, "logps/rejected": -209.07611083984375, "loss": 0.8188, "rewards/accuracies": 0.453125, "rewards/chosen": -0.006520797498524189, "rewards/margins": -0.0006878629792481661, "rewards/rejected": -0.0058329347521066666, "step": 24 }, { "epoch": 0.11291460832745237, "grad_norm": 15.396893791413314, "learning_rate": 2.947934515667162e-07, "logits/chosen": -3.555856466293335, "logits/rejected": -3.548595905303955, "logps/chosen": -209.21588134765625, "logps/rejected": -191.61837768554688, "loss": 0.8118, "rewards/accuracies": 0.578125, "rewards/chosen": -0.0008083764696493745, "rewards/margins": 0.019747722893953323, "rewards/rejected": -0.020556099712848663, "step": 25 }, { "epoch": 0.11743119266055047, "grad_norm": 17.20718944198085, "learning_rate": 2.9420273488431933e-07, "logits/chosen": -3.8108675479888916, "logits/rejected": -3.727308750152588, "logps/chosen": -208.67276000976562, "logps/rejected": -187.67626953125, "loss": 0.8029, "rewards/accuracies": 0.515625, "rewards/chosen": -0.009361563250422478, "rewards/margins": 0.012206509709358215, "rewards/rejected": -0.021568071097135544, "step": 26 }, { "epoch": 0.12194777699364856, "grad_norm": 15.82300857880186, "learning_rate": 2.9358094128719524e-07, "logits/chosen": -3.725928544998169, "logits/rejected": -3.6373510360717773, "logps/chosen": -184.15992736816406, "logps/rejected": -163.3759307861328, "loss": 0.8106, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0011545311426743865, "rewards/margins": 0.01487318892031908, "rewards/rejected": -0.016027718782424927, "step": 27 }, { "epoch": 0.12646436132674665, "grad_norm": 17.170961024910188, "learning_rate": 2.929282047771477e-07, "logits/chosen": -3.7685747146606445, "logits/rejected": -3.627628803253174, "logps/chosen": -180.48487854003906, "logps/rejected": -159.71482849121094, "loss": 0.8028, "rewards/accuracies": 0.546875, "rewards/chosen": -0.006781768519431353, "rewards/margins": 0.012908656150102615, "rewards/rejected": -0.01969042420387268, "step": 28 }, { "epoch": 0.13098094565984475, "grad_norm": 16.359167675199306, "learning_rate": 2.9224466602444125e-07, "logits/chosen": -3.7441000938415527, "logits/rejected": -3.6602091789245605, "logps/chosen": -188.38925170898438, "logps/rejected": -167.985595703125, "loss": 0.8037, "rewards/accuracies": 0.578125, "rewards/chosen": -0.003988177981227636, "rewards/margins": 0.01803458295762539, "rewards/rejected": -0.022022761404514313, "step": 29 }, { "epoch": 0.13549752999294284, "grad_norm": 17.676336950925887, "learning_rate": 2.9153047233748554e-07, "logits/chosen": -3.643461227416992, "logits/rejected": -3.617875814437866, "logps/chosen": -225.23348999023438, "logps/rejected": -212.8236541748047, "loss": 0.8048, "rewards/accuracies": 0.640625, "rewards/chosen": 0.004974519833922386, "rewards/margins": 0.03113599866628647, "rewards/rejected": -0.026161476969718933, "step": 30 }, { "epoch": 0.14001411432604094, "grad_norm": 14.414082201816626, "learning_rate": 2.907857776310889e-07, "logits/chosen": -3.666928291320801, "logits/rejected": -3.638302803039551, "logps/chosen": -191.73712158203125, "logps/rejected": -166.03549194335938, "loss": 0.8045, "rewards/accuracies": 0.609375, "rewards/chosen": 0.004222148098051548, "rewards/margins": 0.035250477492809296, "rewards/rejected": -0.031028330326080322, "step": 31 }, { "epoch": 0.14453069865913903, "grad_norm": 15.20568282502077, "learning_rate": 2.9001074239328855e-07, "logits/chosen": -3.678450345993042, "logits/rejected": -3.6584630012512207, "logps/chosen": -189.22283935546875, "logps/rejected": -172.16836547851562, "loss": 0.8085, "rewards/accuracies": 0.59375, "rewards/chosen": -0.005335791036486626, "rewards/margins": 0.0169003177434206, "rewards/rejected": -0.022236105054616928, "step": 32 }, { "epoch": 0.14904728299223713, "grad_norm": 17.666411807362163, "learning_rate": 2.892055336507641e-07, "logits/chosen": -3.6488301753997803, "logits/rejected": -3.6809892654418945, "logps/chosen": -221.47088623046875, "logps/rejected": -208.6617431640625, "loss": 0.7961, "rewards/accuracies": 0.765625, "rewards/chosen": 0.0049254028126597404, "rewards/margins": 0.04181712493300438, "rewards/rejected": -0.03689172491431236, "step": 33 }, { "epoch": 0.15356386732533522, "grad_norm": 15.904931812764481, "learning_rate": 2.883703249328419e-07, "logits/chosen": -3.7053463459014893, "logits/rejected": -3.6859936714172363, "logps/chosen": -193.99462890625, "logps/rejected": -168.01174926757812, "loss": 0.7982, "rewards/accuracies": 0.703125, "rewards/chosen": 0.005228930618613958, "rewards/margins": 0.04429711773991585, "rewards/rejected": -0.03906818851828575, "step": 34 }, { "epoch": 0.15808045165843332, "grad_norm": 17.574931019060468, "learning_rate": 2.8750529623409767e-07, "logits/chosen": -3.74350643157959, "logits/rejected": -3.688340902328491, "logps/chosen": -227.44448852539062, "logps/rejected": -207.76531982421875, "loss": 0.8001, "rewards/accuracies": 0.609375, "rewards/chosen": -0.013922490179538727, "rewards/margins": 0.041697580367326736, "rewards/rejected": -0.05562007054686546, "step": 35 }, { "epoch": 0.1625970359915314, "grad_norm": 16.144138203267183, "learning_rate": 2.866106339755666e-07, "logits/chosen": -3.6101019382476807, "logits/rejected": -3.516798496246338, "logps/chosen": -225.3680877685547, "logps/rejected": -200.65811157226562, "loss": 0.7982, "rewards/accuracies": 0.703125, "rewards/chosen": 0.00023476500064134598, "rewards/margins": 0.043288152664899826, "rewards/rejected": -0.043053388595581055, "step": 36 }, { "epoch": 0.1671136203246295, "grad_norm": 18.051062496264226, "learning_rate": 2.856865309645679e-07, "logits/chosen": -3.668738603591919, "logits/rejected": -3.5682475566864014, "logps/chosen": -216.2345733642578, "logps/rejected": -190.3118896484375, "loss": 0.7908, "rewards/accuracies": 0.71875, "rewards/chosen": 0.005171060096472502, "rewards/margins": 0.06707943975925446, "rewards/rejected": -0.06190839037299156, "step": 37 }, { "epoch": 0.1716302046577276, "grad_norm": 15.250674643466992, "learning_rate": 2.847331863531529e-07, "logits/chosen": -3.6172351837158203, "logits/rejected": -3.513965129852295, "logps/chosen": -207.12973022460938, "logps/rejected": -186.56016540527344, "loss": 0.7973, "rewards/accuracies": 0.640625, "rewards/chosen": -0.00749462703242898, "rewards/margins": 0.036814432591199875, "rewards/rejected": -0.04430905729532242, "step": 38 }, { "epoch": 0.1761467889908257, "grad_norm": 13.727288814848803, "learning_rate": 2.8375080559518633e-07, "logits/chosen": -3.677856922149658, "logits/rejected": -3.651683807373047, "logps/chosen": -173.89752197265625, "logps/rejected": -161.67294311523438, "loss": 0.8033, "rewards/accuracies": 0.640625, "rewards/chosen": -0.005101449321955442, "rewards/margins": 0.03479147329926491, "rewards/rejected": -0.03989291936159134, "step": 39 }, { "epoch": 0.1806633733239238, "grad_norm": 16.031051528638532, "learning_rate": 2.827396004020694e-07, "logits/chosen": -3.657525062561035, "logits/rejected": -3.6614298820495605, "logps/chosen": -187.38975524902344, "logps/rejected": -174.07693481445312, "loss": 0.7971, "rewards/accuracies": 0.640625, "rewards/chosen": -0.006134797818958759, "rewards/margins": 0.04029170051217079, "rewards/rejected": -0.046426497399806976, "step": 40 }, { "epoch": 0.1851799576570219, "grad_norm": 14.72860095286963, "learning_rate": 2.8169978869711385e-07, "logits/chosen": -3.7374589443206787, "logits/rejected": -3.656147003173828, "logps/chosen": -187.12936401367188, "logps/rejected": -161.41815185546875, "loss": 0.787, "rewards/accuracies": 0.703125, "rewards/chosen": 0.004431622102856636, "rewards/margins": 0.05767218768596649, "rewards/rejected": -0.053240567445755005, "step": 41 }, { "epoch": 0.18969654199011998, "grad_norm": 17.441579066359495, "learning_rate": 2.806315945685779e-07, "logits/chosen": -3.6515302658081055, "logits/rejected": -3.551766872406006, "logps/chosen": -250.11190795898438, "logps/rejected": -221.34317016601562, "loss": 0.7699, "rewards/accuracies": 0.796875, "rewards/chosen": 0.006256776861846447, "rewards/margins": 0.09309859573841095, "rewards/rejected": -0.08684182167053223, "step": 42 }, { "epoch": 0.19421312632321808, "grad_norm": 17.048744298556095, "learning_rate": 2.7953524822137317e-07, "logits/chosen": -3.7105650901794434, "logits/rejected": -3.6019039154052734, "logps/chosen": -208.5770263671875, "logps/rejected": -180.16976928710938, "loss": 0.8027, "rewards/accuracies": 0.65625, "rewards/chosen": -0.018564769998192787, "rewards/margins": 0.04158995673060417, "rewards/rejected": -0.06015472859144211, "step": 43 }, { "epoch": 0.19872971065631617, "grad_norm": 16.16815408757425, "learning_rate": 2.784109859274537e-07, "logits/chosen": -3.597439765930176, "logits/rejected": -3.6136202812194824, "logps/chosen": -211.610107421875, "logps/rejected": -191.22201538085938, "loss": 0.7829, "rewards/accuracies": 0.765625, "rewards/chosen": -0.003241416532546282, "rewards/margins": 0.07393845170736313, "rewards/rejected": -0.07717986404895782, "step": 44 }, { "epoch": 0.20324629498941427, "grad_norm": 13.017137061508738, "learning_rate": 2.7725904997489726e-07, "logits/chosen": -3.613895893096924, "logits/rejected": -3.6305315494537354, "logps/chosen": -189.8942108154297, "logps/rejected": -180.846923828125, "loss": 0.7937, "rewards/accuracies": 0.65625, "rewards/chosen": -0.023781713098287582, "rewards/margins": 0.04976597800850868, "rewards/rejected": -0.07354769110679626, "step": 45 }, { "epoch": 0.20776287932251236, "grad_norm": 15.730598564063932, "learning_rate": 2.760796886156901e-07, "logits/chosen": -3.5601043701171875, "logits/rejected": -3.5674729347229004, "logps/chosen": -202.303466796875, "logps/rejected": -192.0014190673828, "loss": 0.7865, "rewards/accuracies": 0.65625, "rewards/chosen": -0.017426731064915657, "rewards/margins": 0.05039919540286064, "rewards/rejected": -0.06782592833042145, "step": 46 }, { "epoch": 0.21227946365561046, "grad_norm": 15.472698059079649, "learning_rate": 2.748731560122267e-07, "logits/chosen": -3.6035664081573486, "logits/rejected": -3.539640426635742, "logps/chosen": -221.04888916015625, "logps/rejected": -198.54339599609375, "loss": 0.7848, "rewards/accuracies": 0.8125, "rewards/chosen": -0.005871212109923363, "rewards/margins": 0.07652122527360916, "rewards/rejected": -0.08239243924617767, "step": 47 }, { "epoch": 0.21679604798870855, "grad_norm": 15.777479890984548, "learning_rate": 2.7363971218253573e-07, "logits/chosen": -3.6367340087890625, "logits/rejected": -3.5720105171203613, "logps/chosen": -206.6673583984375, "logps/rejected": -191.620361328125, "loss": 0.7834, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0010449562687426805, "rewards/margins": 0.07644650340080261, "rewards/rejected": -0.07540154457092285, "step": 48 }, { "epoch": 0.22131263232180665, "grad_norm": 14.647917769515534, "learning_rate": 2.7237962294424354e-07, "logits/chosen": -3.742436408996582, "logits/rejected": -3.583324432373047, "logps/chosen": -216.07376098632812, "logps/rejected": -191.45822143554688, "loss": 0.7637, "rewards/accuracies": 0.84375, "rewards/chosen": 0.0027800833340734243, "rewards/margins": 0.09453913569450378, "rewards/rejected": -0.09175905585289001, "step": 49 }, { "epoch": 0.22582921665490474, "grad_norm": 14.379641829000743, "learning_rate": 2.7109315985728866e-07, "logits/chosen": -3.4936299324035645, "logits/rejected": -3.461601734161377, "logps/chosen": -222.20791625976562, "logps/rejected": -198.30593872070312, "loss": 0.7946, "rewards/accuracies": 0.71875, "rewards/chosen": -0.01759425923228264, "rewards/margins": 0.06354185938835144, "rewards/rejected": -0.08113611489534378, "step": 50 }, { "epoch": 0.23034580098800284, "grad_norm": 15.536214345554205, "learning_rate": 2.697806001653979e-07, "logits/chosen": -3.656852960586548, "logits/rejected": -3.5931718349456787, "logps/chosen": -215.4611053466797, "logps/rejected": -195.17086791992188, "loss": 0.7702, "rewards/accuracies": 0.71875, "rewards/chosen": -0.012303248047828674, "rewards/margins": 0.09312085807323456, "rewards/rejected": -0.10542410612106323, "step": 51 }, { "epoch": 0.23486238532110093, "grad_norm": 15.948190715341653, "learning_rate": 2.684422267363384e-07, "logits/chosen": -3.658729076385498, "logits/rejected": -3.5965871810913086, "logps/chosen": -227.6315460205078, "logps/rejected": -216.8878936767578, "loss": 0.7749, "rewards/accuracies": 0.8125, "rewards/chosen": -0.02002471685409546, "rewards/margins": 0.08053679019212723, "rewards/rejected": -0.10056150704622269, "step": 52 }, { "epoch": 0.23937896965419903, "grad_norm": 13.3354410986771, "learning_rate": 2.670783280009569e-07, "logits/chosen": -3.5585010051727295, "logits/rejected": -3.542996406555176, "logps/chosen": -200.6866912841797, "logps/rejected": -176.0266876220703, "loss": 0.7969, "rewards/accuracies": 0.78125, "rewards/chosen": -0.016675246879458427, "rewards/margins": 0.0597347766160965, "rewards/rejected": -0.07641002535820007, "step": 53 }, { "epoch": 0.24389555398729712, "grad_norm": 15.37006073629648, "learning_rate": 2.656891978910205e-07, "logits/chosen": -3.5804214477539062, "logits/rejected": -3.580320358276367, "logps/chosen": -199.03363037109375, "logps/rejected": -175.6981201171875, "loss": 0.7826, "rewards/accuracies": 0.734375, "rewards/chosen": -0.022621821612119675, "rewards/margins": 0.07916627079248428, "rewards/rejected": -0.10178809612989426, "step": 54 }, { "epoch": 0.24841213832039521, "grad_norm": 13.977675083898662, "learning_rate": 2.642751357758722e-07, "logits/chosen": -3.6277871131896973, "logits/rejected": -3.576045274734497, "logps/chosen": -198.56690979003906, "logps/rejected": -173.84307861328125, "loss": 0.7798, "rewards/accuracies": 0.734375, "rewards/chosen": -0.014770936220884323, "rewards/margins": 0.08849596232175827, "rewards/rejected": -0.10326690226793289, "step": 55 }, { "epoch": 0.2529287226534933, "grad_norm": 15.0141286536347, "learning_rate": 2.628364463979135e-07, "logits/chosen": -3.591761589050293, "logits/rejected": -3.6051506996154785, "logps/chosen": -220.2593231201172, "logps/rejected": -202.11480712890625, "loss": 0.7609, "rewards/accuracies": 0.890625, "rewards/chosen": -0.004766255617141724, "rewards/margins": 0.10991345345973969, "rewards/rejected": -0.11467970907688141, "step": 56 }, { "epoch": 0.2574453069865914, "grad_norm": 16.32019401181327, "learning_rate": 2.613734398069308e-07, "logits/chosen": -3.6448159217834473, "logits/rejected": -3.6120004653930664, "logps/chosen": -218.3624267578125, "logps/rejected": -206.45156860351562, "loss": 0.7646, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0081629678606987, "rewards/margins": 0.10102861374616623, "rewards/rejected": -0.10919158160686493, "step": 57 }, { "epoch": 0.2619618913196895, "grad_norm": 14.250044872308086, "learning_rate": 2.598864312932762e-07, "logits/chosen": -3.5635013580322266, "logits/rejected": -3.560309886932373, "logps/chosen": -199.04324340820312, "logps/rejected": -181.06304931640625, "loss": 0.7902, "rewards/accuracies": 0.65625, "rewards/chosen": -0.025674719363451004, "rewards/margins": 0.06853548437356949, "rewards/rejected": -0.09421020746231079, "step": 58 }, { "epoch": 0.2664784756527876, "grad_norm": 15.467967190395154, "learning_rate": 2.5837574131992034e-07, "logits/chosen": -3.590390205383301, "logits/rejected": -3.6538496017456055, "logps/chosen": -211.0101776123047, "logps/rejected": -200.92840576171875, "loss": 0.7626, "rewards/accuracies": 0.765625, "rewards/chosen": -0.022280972450971603, "rewards/margins": 0.10610129684209824, "rewards/rejected": -0.12838226556777954, "step": 59 }, { "epoch": 0.2709950599858857, "grad_norm": 15.203009992593772, "learning_rate": 2.568416954533894e-07, "logits/chosen": -3.687786102294922, "logits/rejected": -3.6774373054504395, "logps/chosen": -186.29849243164062, "logps/rejected": -169.62168884277344, "loss": 0.7785, "rewards/accuracies": 0.75, "rewards/chosen": -0.018803317099809647, "rewards/margins": 0.08361957967281342, "rewards/rejected": -0.10242290794849396, "step": 60 }, { "epoch": 0.2755116443189838, "grad_norm": 14.57683761120013, "learning_rate": 2.552846242936032e-07, "logits/chosen": -3.638808250427246, "logits/rejected": -3.6172327995300293, "logps/chosen": -202.23802185058594, "logps/rejected": -183.21786499023438, "loss": 0.7644, "rewards/accuracies": 0.765625, "rewards/chosen": -0.009251074865460396, "rewards/margins": 0.11388231813907623, "rewards/rejected": -0.12313339859247208, "step": 61 }, { "epoch": 0.2800282286520819, "grad_norm": 14.071215109151332, "learning_rate": 2.537048634026279e-07, "logits/chosen": -3.652895927429199, "logits/rejected": -3.5790305137634277, "logps/chosen": -200.75323486328125, "logps/rejected": -180.8776092529297, "loss": 0.7716, "rewards/accuracies": 0.75, "rewards/chosen": -0.02093246765434742, "rewards/margins": 0.09524297714233398, "rewards/rejected": -0.11617545038461685, "step": 62 }, { "epoch": 0.28454481298518, "grad_norm": 13.217637221004559, "learning_rate": 2.521027532323594e-07, "logits/chosen": -3.59419584274292, "logits/rejected": -3.5415499210357666, "logps/chosen": -196.58969116210938, "logps/rejected": -181.43600463867188, "loss": 0.7674, "rewards/accuracies": 0.75, "rewards/chosen": -0.023986171931028366, "rewards/margins": 0.10850539058446884, "rewards/rejected": -0.1324915587902069, "step": 63 }, { "epoch": 0.28906139731827807, "grad_norm": 14.497006980804226, "learning_rate": 2.5047863905115337e-07, "logits/chosen": -3.5735766887664795, "logits/rejected": -3.5254015922546387, "logps/chosen": -198.43698120117188, "logps/rejected": -177.77783203125, "loss": 0.7642, "rewards/accuracies": 0.8125, "rewards/chosen": -0.018381193280220032, "rewards/margins": 0.12361248582601547, "rewards/rejected": -0.1419936716556549, "step": 64 }, { "epoch": 0.29357798165137616, "grad_norm": 14.857860546032539, "learning_rate": 2.4883287086941666e-07, "logits/chosen": -3.592167854309082, "logits/rejected": -3.554619073867798, "logps/chosen": -209.84280395507812, "logps/rejected": -195.99557495117188, "loss": 0.7711, "rewards/accuracies": 0.75, "rewards/chosen": -0.04406602308154106, "rewards/margins": 0.11150160431861877, "rewards/rejected": -0.15556763112545013, "step": 65 }, { "epoch": 0.29809456598447426, "grad_norm": 15.7289147220562, "learning_rate": 2.4716580336417735e-07, "logits/chosen": -3.730616569519043, "logits/rejected": -3.611698627471924, "logps/chosen": -213.7239990234375, "logps/rejected": -191.99624633789062, "loss": 0.7641, "rewards/accuracies": 0.828125, "rewards/chosen": -0.04750160127878189, "rewards/margins": 0.11192238330841064, "rewards/rejected": -0.15942397713661194, "step": 66 }, { "epoch": 0.30261115031757235, "grad_norm": 16.553215826780214, "learning_rate": 2.4547779580264873e-07, "logits/chosen": -3.6770639419555664, "logits/rejected": -3.6511921882629395, "logps/chosen": -229.4193115234375, "logps/rejected": -219.91082763671875, "loss": 0.7634, "rewards/accuracies": 0.796875, "rewards/chosen": -0.07060261070728302, "rewards/margins": 0.10048267990350723, "rewards/rejected": -0.17108528316020966, "step": 67 }, { "epoch": 0.30712773465067045, "grad_norm": 14.535708064025933, "learning_rate": 2.4376921196480405e-07, "logits/chosen": -3.643744945526123, "logits/rejected": -3.6282317638397217, "logps/chosen": -215.79782104492188, "logps/rejected": -195.50967407226562, "loss": 0.7459, "rewards/accuracies": 0.859375, "rewards/chosen": -0.030901005491614342, "rewards/margins": 0.15577057003974915, "rewards/rejected": -0.18667156994342804, "step": 68 }, { "epoch": 0.31164431898376854, "grad_norm": 12.826587337871429, "learning_rate": 2.420404200649791e-07, "logits/chosen": -3.6386120319366455, "logits/rejected": -3.6321802139282227, "logps/chosen": -197.25970458984375, "logps/rejected": -188.24197387695312, "loss": 0.7671, "rewards/accuracies": 0.765625, "rewards/chosen": -0.08027191460132599, "rewards/margins": 0.09604091197252274, "rewards/rejected": -0.17631281912326813, "step": 69 }, { "epoch": 0.31616090331686664, "grad_norm": 16.418214717599835, "learning_rate": 2.402917926725185e-07, "logits/chosen": -3.6721415519714355, "logits/rejected": -3.6725897789001465, "logps/chosen": -216.7154541015625, "logps/rejected": -201.8681640625, "loss": 0.7291, "rewards/accuracies": 0.859375, "rewards/chosen": -0.029750004410743713, "rewards/margins": 0.1923179030418396, "rewards/rejected": -0.22206789255142212, "step": 70 }, { "epoch": 0.32067748764996473, "grad_norm": 13.215008697180764, "learning_rate": 2.385237066314845e-07, "logits/chosen": -3.565258026123047, "logits/rejected": -3.553618907928467, "logps/chosen": -197.5516815185547, "logps/rejected": -184.250732421875, "loss": 0.7722, "rewards/accuracies": 0.84375, "rewards/chosen": -0.050527218729257584, "rewards/margins": 0.11302457749843597, "rewards/rejected": -0.16355180740356445, "step": 71 }, { "epoch": 0.3251940719830628, "grad_norm": 13.956180499179757, "learning_rate": 2.3673654297944303e-07, "logits/chosen": -3.6860711574554443, "logits/rejected": -3.6514902114868164, "logps/chosen": -226.09010314941406, "logps/rejected": -204.78941345214844, "loss": 0.753, "rewards/accuracies": 0.765625, "rewards/chosen": -0.06019885092973709, "rewards/margins": 0.1402389109134674, "rewards/rejected": -0.2004377692937851, "step": 72 }, { "epoch": 0.3297106563161609, "grad_norm": 12.632161370394448, "learning_rate": 2.3493068686534757e-07, "logits/chosen": -3.6133785247802734, "logits/rejected": -3.573826789855957, "logps/chosen": -208.22662353515625, "logps/rejected": -189.77066040039062, "loss": 0.7492, "rewards/accuracies": 0.8125, "rewards/chosen": -0.055879779160022736, "rewards/margins": 0.1519559770822525, "rewards/rejected": -0.20783576369285583, "step": 73 }, { "epoch": 0.334227240649259, "grad_norm": 14.261595430312116, "learning_rate": 2.3310652746653585e-07, "logits/chosen": -3.6738648414611816, "logits/rejected": -3.6415371894836426, "logps/chosen": -185.10997009277344, "logps/rejected": -167.96487426757812, "loss": 0.7588, "rewards/accuracies": 0.828125, "rewards/chosen": -0.052835769951343536, "rewards/margins": 0.1488420069217682, "rewards/rejected": -0.20167775452136993, "step": 74 }, { "epoch": 0.3387438249823571, "grad_norm": 14.822381543756691, "learning_rate": 2.312644579048592e-07, "logits/chosen": -3.7603840827941895, "logits/rejected": -3.660132646560669, "logps/chosen": -209.13519287109375, "logps/rejected": -186.57931518554688, "loss": 0.748, "rewards/accuracies": 0.796875, "rewards/chosen": -0.07213892042636871, "rewards/margins": 0.1440207064151764, "rewards/rejected": -0.2161596417427063, "step": 75 }, { "epoch": 0.3432604093154552, "grad_norm": 14.275023304603453, "learning_rate": 2.29404875161961e-07, "logits/chosen": -3.6284379959106445, "logits/rejected": -3.5983853340148926, "logps/chosen": -222.89759826660156, "logps/rejected": -200.91384887695312, "loss": 0.7395, "rewards/accuracies": 0.78125, "rewards/chosen": -0.06927837431430817, "rewards/margins": 0.18208304047584534, "rewards/rejected": -0.2513614296913147, "step": 76 }, { "epoch": 0.3477769936485533, "grad_norm": 12.109768727947394, "learning_rate": 2.2752817999372408e-07, "logits/chosen": -3.717994213104248, "logits/rejected": -3.6696221828460693, "logps/chosen": -179.09075927734375, "logps/rejected": -165.41148376464844, "loss": 0.7619, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07646910846233368, "rewards/margins": 0.10589072108268738, "rewards/rejected": -0.18235982954502106, "step": 77 }, { "epoch": 0.3522935779816514, "grad_norm": 12.334847197016632, "learning_rate": 2.2563477684390454e-07, "logits/chosen": -3.6596970558166504, "logits/rejected": -3.7064521312713623, "logps/chosen": -183.98486328125, "logps/rejected": -175.17532348632812, "loss": 0.7407, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0660928264260292, "rewards/margins": 0.15099170804023743, "rewards/rejected": -0.21708452701568604, "step": 78 }, { "epoch": 0.3568101623147495, "grad_norm": 14.527926172073567, "learning_rate": 2.2372507375697016e-07, "logits/chosen": -3.6260461807250977, "logits/rejected": -3.56558895111084, "logps/chosen": -215.37002563476562, "logps/rejected": -191.77987670898438, "loss": 0.7351, "rewards/accuracies": 0.8125, "rewards/chosen": -0.04920428246259689, "rewards/margins": 0.18218760192394257, "rewards/rejected": -0.23139187693595886, "step": 79 }, { "epoch": 0.3613267466478476, "grad_norm": 13.953022020448985, "learning_rate": 2.217994822901639e-07, "logits/chosen": -3.6040546894073486, "logits/rejected": -3.6221628189086914, "logps/chosen": -223.6805419921875, "logps/rejected": -201.2560577392578, "loss": 0.7415, "rewards/accuracies": 0.84375, "rewards/chosen": -0.053364675492048264, "rewards/margins": 0.1852804273366928, "rewards/rejected": -0.23864510655403137, "step": 80 }, { "epoch": 0.3658433309809457, "grad_norm": 13.78052722074895, "learning_rate": 2.1985841742480954e-07, "logits/chosen": -3.5805296897888184, "logits/rejected": -3.5260043144226074, "logps/chosen": -215.5792236328125, "logps/rejected": -199.66006469726562, "loss": 0.7274, "rewards/accuracies": 0.859375, "rewards/chosen": -0.06636206805706024, "rewards/margins": 0.1942358762025833, "rewards/rejected": -0.26059794425964355, "step": 81 }, { "epoch": 0.3703599153140438, "grad_norm": 14.881670177581055, "learning_rate": 2.1790229747687971e-07, "logits/chosen": -3.739069938659668, "logits/rejected": -3.669661283493042, "logps/chosen": -239.0128173828125, "logps/rejected": -217.56625366210938, "loss": 0.7162, "rewards/accuracies": 0.84375, "rewards/chosen": -0.06281246244907379, "rewards/margins": 0.22767522931098938, "rewards/rejected": -0.290487676858902, "step": 82 }, { "epoch": 0.37487649964714187, "grad_norm": 13.702491862774862, "learning_rate": 2.1593154400684523e-07, "logits/chosen": -3.6899726390838623, "logits/rejected": -3.5771546363830566, "logps/chosen": -202.50872802734375, "logps/rejected": -176.08013916015625, "loss": 0.7315, "rewards/accuracies": 0.765625, "rewards/chosen": -0.08337001502513885, "rewards/margins": 0.18461519479751587, "rewards/rejected": -0.2679852247238159, "step": 83 }, { "epoch": 0.37939308398023996, "grad_norm": 13.984029281056696, "learning_rate": 2.139465817288254e-07, "logits/chosen": -3.7421321868896484, "logits/rejected": -3.6175167560577393, "logps/chosen": -190.1082000732422, "logps/rejected": -175.02052307128906, "loss": 0.7337, "rewards/accuracies": 0.796875, "rewards/chosen": -0.08382508903741837, "rewards/margins": 0.17669573426246643, "rewards/rejected": -0.2605208456516266, "step": 84 }, { "epoch": 0.38390966831333806, "grad_norm": 19.996335370988827, "learning_rate": 2.1194783841905826e-07, "logits/chosen": -3.5821313858032227, "logits/rejected": -3.4785704612731934, "logps/chosen": -230.62477111816406, "logps/rejected": -208.86831665039062, "loss": 0.7245, "rewards/accuracies": 0.859375, "rewards/chosen": -0.11116647720336914, "rewards/margins": 0.19649800658226013, "rewards/rejected": -0.3076644837856293, "step": 85 }, { "epoch": 0.38842625264643615, "grad_norm": 11.794852107515213, "learning_rate": 2.0993574482371138e-07, "logits/chosen": -3.557180643081665, "logits/rejected": -3.5188608169555664, "logps/chosen": -200.02206420898438, "logps/rejected": -186.23666381835938, "loss": 0.7401, "rewards/accuracies": 0.796875, "rewards/chosen": -0.09516202658414841, "rewards/margins": 0.18562006950378418, "rewards/rejected": -0.2807821035385132, "step": 86 }, { "epoch": 0.39294283697953425, "grad_norm": 15.110048854614613, "learning_rate": 2.0791073456605222e-07, "logits/chosen": -3.6332998275756836, "logits/rejected": -3.6093335151672363, "logps/chosen": -247.05874633789062, "logps/rejected": -224.82608032226562, "loss": 0.6953, "rewards/accuracies": 0.90625, "rewards/chosen": -0.08461566269397736, "rewards/margins": 0.2697482705116272, "rewards/rejected": -0.35436391830444336, "step": 87 }, { "epoch": 0.39745942131263234, "grad_norm": 12.45488028971463, "learning_rate": 2.058732440529989e-07, "logits/chosen": -3.7330398559570312, "logits/rejected": -3.5911219120025635, "logps/chosen": -211.71954345703125, "logps/rejected": -190.85597229003906, "loss": 0.733, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11091723293066025, "rewards/margins": 0.17457152903079987, "rewards/rejected": -0.2854887545108795, "step": 88 }, { "epoch": 0.40197600564573044, "grad_norm": 13.342737944026238, "learning_rate": 2.0382371238107038e-07, "logits/chosen": -3.7038931846618652, "logits/rejected": -3.630575656890869, "logps/chosen": -217.16465759277344, "logps/rejected": -195.84320068359375, "loss": 0.717, "rewards/accuracies": 0.859375, "rewards/chosen": -0.08662399649620056, "rewards/margins": 0.21940693259239197, "rewards/rejected": -0.30603092908859253, "step": 89 }, { "epoch": 0.40649258997882853, "grad_norm": 12.805765983733862, "learning_rate": 2.0176258124175791e-07, "logits/chosen": -3.5431618690490723, "logits/rejected": -3.5177979469299316, "logps/chosen": -208.36294555664062, "logps/rejected": -196.33544921875, "loss": 0.7397, "rewards/accuracies": 0.734375, "rewards/chosen": -0.1321878284215927, "rewards/margins": 0.17401957511901855, "rewards/rejected": -0.30620741844177246, "step": 90 }, { "epoch": 0.41100917431192663, "grad_norm": 14.324921342449116, "learning_rate": 1.996902948263364e-07, "logits/chosen": -3.6215481758117676, "logits/rejected": -3.5871336460113525, "logps/chosen": -218.2590789794922, "logps/rejected": -202.82135009765625, "loss": 0.7184, "rewards/accuracies": 0.796875, "rewards/chosen": -0.06889334321022034, "rewards/margins": 0.2213161736726761, "rewards/rejected": -0.2902095317840576, "step": 91 }, { "epoch": 0.4155257586450247, "grad_norm": 13.522825965617656, "learning_rate": 1.9760729973013756e-07, "logits/chosen": -3.5652565956115723, "logits/rejected": -3.559969902038574, "logps/chosen": -216.3070068359375, "logps/rejected": -207.52525329589844, "loss": 0.7279, "rewards/accuracies": 0.8125, "rewards/chosen": -0.12052971869707108, "rewards/margins": 0.19747133553028107, "rewards/rejected": -0.31800103187561035, "step": 92 }, { "epoch": 0.4200423429781228, "grad_norm": 14.17056179163305, "learning_rate": 1.9551404485630487e-07, "logits/chosen": -3.648214101791382, "logits/rejected": -3.633265972137451, "logps/chosen": -228.32022094726562, "logps/rejected": -216.98306274414062, "loss": 0.712, "rewards/accuracies": 0.8125, "rewards/chosen": -0.09251593053340912, "rewards/margins": 0.24657899141311646, "rewards/rejected": -0.33909493684768677, "step": 93 }, { "epoch": 0.4245589273112209, "grad_norm": 12.41603788591784, "learning_rate": 1.9341098131905102e-07, "logits/chosen": -3.563978672027588, "logits/rejected": -3.575545310974121, "logps/chosen": -197.626708984375, "logps/rejected": -183.598876953125, "loss": 0.7199, "rewards/accuracies": 0.828125, "rewards/chosen": -0.06666558235883713, "rewards/margins": 0.22736752033233643, "rewards/rejected": -0.29403308033943176, "step": 94 }, { "epoch": 0.429075511644319, "grad_norm": 13.43017680491703, "learning_rate": 1.91298562346439e-07, "logits/chosen": -3.5415198802948, "logits/rejected": -3.423177480697632, "logps/chosen": -210.36868286132812, "logps/rejected": -193.19737243652344, "loss": 0.7208, "rewards/accuracies": 0.859375, "rewards/chosen": -0.13729974627494812, "rewards/margins": 0.20369592308998108, "rewards/rejected": -0.3409956693649292, "step": 95 }, { "epoch": 0.4335920959774171, "grad_norm": 13.192586910237464, "learning_rate": 1.8917724318270764e-07, "logits/chosen": -3.6500909328460693, "logits/rejected": -3.641021728515625, "logps/chosen": -212.2454833984375, "logps/rejected": -195.04710388183594, "loss": 0.7359, "rewards/accuracies": 0.765625, "rewards/chosen": -0.13967543840408325, "rewards/margins": 0.19579245150089264, "rewards/rejected": -0.3354678750038147, "step": 96 }, { "epoch": 0.4381086803105152, "grad_norm": 14.631006890045864, "learning_rate": 1.8704748099016263e-07, "logits/chosen": -3.5358424186706543, "logits/rejected": -3.492499828338623, "logps/chosen": -226.14268493652344, "logps/rejected": -209.61166381835938, "loss": 0.7276, "rewards/accuracies": 0.78125, "rewards/chosen": -0.12765762209892273, "rewards/margins": 0.21874003112316132, "rewards/rejected": -0.34639766812324524, "step": 97 }, { "epoch": 0.4426252646436133, "grad_norm": 12.620333608491627, "learning_rate": 1.8490973475065407e-07, "logits/chosen": -3.62363862991333, "logits/rejected": -3.558384656906128, "logps/chosen": -201.33468627929688, "logps/rejected": -187.42794799804688, "loss": 0.735, "rewards/accuracies": 0.75, "rewards/chosen": -0.10830561816692352, "rewards/margins": 0.19010263681411743, "rewards/rejected": -0.29840826988220215, "step": 98 }, { "epoch": 0.4471418489767114, "grad_norm": 11.634459384049228, "learning_rate": 1.8276446516666194e-07, "logits/chosen": -3.564702033996582, "logits/rejected": -3.4371743202209473, "logps/chosen": -203.50912475585938, "logps/rejected": -186.2063446044922, "loss": 0.7276, "rewards/accuracies": 0.796875, "rewards/chosen": -0.12979203462600708, "rewards/margins": 0.19052280485630035, "rewards/rejected": -0.32031482458114624, "step": 99 }, { "epoch": 0.4516584333098095, "grad_norm": 14.33489341500007, "learning_rate": 1.806121345620111e-07, "logits/chosen": -3.5418810844421387, "logits/rejected": -3.465827703475952, "logps/chosen": -222.30589294433594, "logps/rejected": -204.18182373046875, "loss": 0.7318, "rewards/accuracies": 0.765625, "rewards/chosen": -0.15671955049037933, "rewards/margins": 0.2025194764137268, "rewards/rejected": -0.35923904180526733, "step": 100 }, { "epoch": 0.4561750176429076, "grad_norm": 12.379106375796471, "learning_rate": 1.7845320678223614e-07, "logits/chosen": -3.572160482406616, "logits/rejected": -3.4780170917510986, "logps/chosen": -204.60537719726562, "logps/rejected": -188.61119079589844, "loss": 0.7315, "rewards/accuracies": 0.828125, "rewards/chosen": -0.11631282418966293, "rewards/margins": 0.19585317373275757, "rewards/rejected": -0.3121660053730011, "step": 101 }, { "epoch": 0.46069160197600567, "grad_norm": 13.428841184771063, "learning_rate": 1.7628814709461914e-07, "logits/chosen": -3.4720003604888916, "logits/rejected": -3.541748523712158, "logps/chosen": -225.80712890625, "logps/rejected": -213.91180419921875, "loss": 0.7084, "rewards/accuracies": 0.921875, "rewards/chosen": -0.12924836575984955, "rewards/margins": 0.276883989572525, "rewards/rejected": -0.40613240003585815, "step": 102 }, { "epoch": 0.46520818630910377, "grad_norm": 12.642202001549919, "learning_rate": 1.7411742208792024e-07, "logits/chosen": -3.6850690841674805, "logits/rejected": -3.545577049255371, "logps/chosen": -225.94509887695312, "logps/rejected": -193.059326171875, "loss": 0.7084, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11674878001213074, "rewards/margins": 0.25400400161743164, "rewards/rejected": -0.3707527816295624, "step": 103 }, { "epoch": 0.46972477064220186, "grad_norm": 11.73048532713264, "learning_rate": 1.7194149957182414e-07, "logits/chosen": -3.5629310607910156, "logits/rejected": -3.5668156147003174, "logps/chosen": -171.92477416992188, "logps/rejected": -165.56927490234375, "loss": 0.7396, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1383664906024933, "rewards/margins": 0.1683700531721115, "rewards/rejected": -0.3067365288734436, "step": 104 }, { "epoch": 0.47424135497529996, "grad_norm": 12.673432432328502, "learning_rate": 1.6976084847612282e-07, "logits/chosen": -3.525435447692871, "logits/rejected": -3.4939839839935303, "logps/chosen": -206.1438751220703, "logps/rejected": -188.24546813964844, "loss": 0.722, "rewards/accuracies": 0.75, "rewards/chosen": -0.13165442645549774, "rewards/margins": 0.2146846055984497, "rewards/rejected": -0.34633904695510864, "step": 105 }, { "epoch": 0.47875793930839805, "grad_norm": 12.032890874796571, "learning_rate": 1.6757593874965754e-07, "logits/chosen": -3.5656533241271973, "logits/rejected": -3.527076005935669, "logps/chosen": -197.67202758789062, "logps/rejected": -181.7797088623047, "loss": 0.7251, "rewards/accuracies": 0.796875, "rewards/chosen": -0.0992819219827652, "rewards/margins": 0.225886732339859, "rewards/rejected": -0.325168639421463, "step": 106 }, { "epoch": 0.48327452364149615, "grad_norm": 12.310976191149708, "learning_rate": 1.6538724125904051e-07, "logits/chosen": -3.686993360519409, "logits/rejected": -3.6414313316345215, "logps/chosen": -204.51181030273438, "logps/rejected": -196.01507568359375, "loss": 0.7234, "rewards/accuracies": 0.828125, "rewards/chosen": -0.12974955141544342, "rewards/margins": 0.21487677097320557, "rewards/rejected": -0.3446263074874878, "step": 107 }, { "epoch": 0.48779110797459424, "grad_norm": 12.199852142739852, "learning_rate": 1.6319522768717944e-07, "logits/chosen": -3.6431784629821777, "logits/rejected": -3.561030864715576, "logps/chosen": -199.9854278564453, "logps/rejected": -181.25628662109375, "loss": 0.7371, "rewards/accuracies": 0.71875, "rewards/chosen": -0.11536161601543427, "rewards/margins": 0.2128218412399292, "rewards/rejected": -0.32818344235420227, "step": 108 }, { "epoch": 0.49230769230769234, "grad_norm": 13.488082140342616, "learning_rate": 1.610003704316256e-07, "logits/chosen": -3.7115769386291504, "logits/rejected": -3.6150527000427246, "logps/chosen": -206.2103271484375, "logps/rejected": -184.147705078125, "loss": 0.6968, "rewards/accuracies": 0.859375, "rewards/chosen": -0.07892445474863052, "rewards/margins": 0.2777579128742218, "rewards/rejected": -0.35668236017227173, "step": 109 }, { "epoch": 0.49682427664079043, "grad_norm": 12.799344796933894, "learning_rate": 1.5880314250276833e-07, "logits/chosen": -3.6075048446655273, "logits/rejected": -3.451253652572632, "logps/chosen": -212.31695556640625, "logps/rejected": -189.007080078125, "loss": 0.723, "rewards/accuracies": 0.859375, "rewards/chosen": -0.1649230718612671, "rewards/margins": 0.23563840985298157, "rewards/rejected": -0.40056151151657104, "step": 110 }, { "epoch": 0.5013408609738885, "grad_norm": 11.195885489420343, "learning_rate": 1.5660401742189716e-07, "logits/chosen": -3.6232829093933105, "logits/rejected": -3.517642021179199, "logps/chosen": -196.2974090576172, "logps/rejected": -181.78778076171875, "loss": 0.7163, "rewards/accuracies": 0.78125, "rewards/chosen": -0.11609620600938797, "rewards/margins": 0.24031955003738403, "rewards/rejected": -0.3564157783985138, "step": 111 }, { "epoch": 0.5058574453069866, "grad_norm": 13.462764100267743, "learning_rate": 1.5440346911915413e-07, "logits/chosen": -3.5703439712524414, "logits/rejected": -3.5258054733276367, "logps/chosen": -197.42391967773438, "logps/rejected": -182.4862060546875, "loss": 0.707, "rewards/accuracies": 0.84375, "rewards/chosen": -0.13110296428203583, "rewards/margins": 0.25373634696006775, "rewards/rejected": -0.3848392963409424, "step": 112 }, { "epoch": 0.5103740296400847, "grad_norm": 11.216776131932745, "learning_rate": 1.522019718313975e-07, "logits/chosen": -3.5310792922973633, "logits/rejected": -3.565998077392578, "logps/chosen": -209.51132202148438, "logps/rejected": -189.17047119140625, "loss": 0.7095, "rewards/accuracies": 0.828125, "rewards/chosen": -0.12987811863422394, "rewards/margins": 0.2518337368965149, "rewards/rejected": -0.38171184062957764, "step": 113 }, { "epoch": 0.5148906139731828, "grad_norm": 13.293761430898734, "learning_rate": 1.5e-07, "logits/chosen": -3.7002620697021484, "logits/rejected": -3.6795387268066406, "logps/chosen": -213.72909545898438, "logps/rejected": -196.23594665527344, "loss": 0.713, "rewards/accuracies": 0.8125, "rewards/chosen": -0.13611683249473572, "rewards/margins": 0.2516591548919678, "rewards/rejected": -0.3877760171890259, "step": 114 }, { "epoch": 0.5194071983062809, "grad_norm": 11.607463785941228, "learning_rate": 1.4779802816860252e-07, "logits/chosen": -3.5893545150756836, "logits/rejected": -3.5178956985473633, "logps/chosen": -208.91015625, "logps/rejected": -186.68402099609375, "loss": 0.6884, "rewards/accuracies": 0.84375, "rewards/chosen": -0.06911865621805191, "rewards/margins": 0.3234255909919739, "rewards/rejected": -0.3925442397594452, "step": 115 }, { "epoch": 0.523923782639379, "grad_norm": 11.17744384668911, "learning_rate": 1.4559653088084589e-07, "logits/chosen": -3.5923399925231934, "logits/rejected": -3.5601184368133545, "logps/chosen": -197.11102294921875, "logps/rejected": -182.57537841796875, "loss": 0.7113, "rewards/accuracies": 0.8125, "rewards/chosen": -0.17773228883743286, "rewards/margins": 0.2376946210861206, "rewards/rejected": -0.41542690992355347, "step": 116 }, { "epoch": 0.5284403669724771, "grad_norm": 12.324943566684535, "learning_rate": 1.4339598257810283e-07, "logits/chosen": -3.4791109561920166, "logits/rejected": -3.528164863586426, "logps/chosen": -203.9499969482422, "logps/rejected": -191.4123077392578, "loss": 0.7141, "rewards/accuracies": 0.859375, "rewards/chosen": -0.1312122344970703, "rewards/margins": 0.24087485671043396, "rewards/rejected": -0.3720870912075043, "step": 117 }, { "epoch": 0.5329569513055752, "grad_norm": 13.355110060788324, "learning_rate": 1.411968574972317e-07, "logits/chosen": -3.4863524436950684, "logits/rejected": -3.5186939239501953, "logps/chosen": -211.86410522460938, "logps/rejected": -196.56219482421875, "loss": 0.7007, "rewards/accuracies": 0.765625, "rewards/chosen": -0.09455064684152603, "rewards/margins": 0.29401230812072754, "rewards/rejected": -0.38856297731399536, "step": 118 }, { "epoch": 0.5374735356386733, "grad_norm": 11.843733502428305, "learning_rate": 1.3899962956837443e-07, "logits/chosen": -3.5447893142700195, "logits/rejected": -3.5373752117156982, "logps/chosen": -217.02601623535156, "logps/rejected": -195.690185546875, "loss": 0.6888, "rewards/accuracies": 0.75, "rewards/chosen": -0.11182112246751785, "rewards/margins": 0.3210294544696808, "rewards/rejected": -0.43285059928894043, "step": 119 }, { "epoch": 0.5419901199717714, "grad_norm": 12.77724705920504, "learning_rate": 1.3680477231282058e-07, "logits/chosen": -3.6904451847076416, "logits/rejected": -3.648146629333496, "logps/chosen": -174.260009765625, "logps/rejected": -160.5956573486328, "loss": 0.7273, "rewards/accuracies": 0.71875, "rewards/chosen": -0.14862585067749023, "rewards/margins": 0.2124842405319214, "rewards/rejected": -0.3611100912094116, "step": 120 }, { "epoch": 0.5465067043048695, "grad_norm": 11.150050819573732, "learning_rate": 1.346127587409595e-07, "logits/chosen": -3.6496615409851074, "logits/rejected": -3.544325828552246, "logps/chosen": -199.09524536132812, "logps/rejected": -178.0385284423828, "loss": 0.6964, "rewards/accuracies": 0.8125, "rewards/chosen": -0.13068127632141113, "rewards/margins": 0.2756379246711731, "rewards/rejected": -0.40631920099258423, "step": 121 }, { "epoch": 0.5510232886379676, "grad_norm": 12.080724661826311, "learning_rate": 1.3242406125034247e-07, "logits/chosen": -3.6039113998413086, "logits/rejected": -3.4897897243499756, "logps/chosen": -230.91986083984375, "logps/rejected": -215.7164764404297, "loss": 0.7039, "rewards/accuracies": 0.828125, "rewards/chosen": -0.20738250017166138, "rewards/margins": 0.24289953708648682, "rewards/rejected": -0.4502820372581482, "step": 122 }, { "epoch": 0.5555398729710657, "grad_norm": 11.919256579608561, "learning_rate": 1.302391515238772e-07, "logits/chosen": -3.5706183910369873, "logits/rejected": -3.552396774291992, "logps/chosen": -213.68630981445312, "logps/rejected": -193.32818603515625, "loss": 0.6788, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1757589429616928, "rewards/margins": 0.32006677985191345, "rewards/rejected": -0.49582570791244507, "step": 123 }, { "epoch": 0.5600564573041638, "grad_norm": 12.629389606369296, "learning_rate": 1.280585004281759e-07, "logits/chosen": -3.539670944213867, "logits/rejected": -3.535710334777832, "logps/chosen": -206.27227783203125, "logps/rejected": -193.86167907714844, "loss": 0.7106, "rewards/accuracies": 0.78125, "rewards/chosen": -0.16979312896728516, "rewards/margins": 0.2596694827079773, "rewards/rejected": -0.42946261167526245, "step": 124 }, { "epoch": 0.5645730416372619, "grad_norm": 11.591449990368316, "learning_rate": 1.2588257791207977e-07, "logits/chosen": -3.59249210357666, "logits/rejected": -3.588090419769287, "logps/chosen": -210.10736083984375, "logps/rejected": -194.91561889648438, "loss": 0.6973, "rewards/accuracies": 0.765625, "rewards/chosen": -0.17436593770980835, "rewards/margins": 0.2836093306541443, "rewards/rejected": -0.45797526836395264, "step": 125 }, { "epoch": 0.56908962597036, "grad_norm": 12.618504079208098, "learning_rate": 1.2371185290538087e-07, "logits/chosen": -3.608921527862549, "logits/rejected": -3.516284942626953, "logps/chosen": -215.42098999023438, "logps/rejected": -192.1516571044922, "loss": 0.6996, "rewards/accuracies": 0.796875, "rewards/chosen": -0.18459659814834595, "rewards/margins": 0.27924641966819763, "rewards/rejected": -0.4638429880142212, "step": 126 }, { "epoch": 0.573606210303458, "grad_norm": 12.749831427337236, "learning_rate": 1.2154679321776385e-07, "logits/chosen": -3.5367257595062256, "logits/rejected": -3.458606719970703, "logps/chosen": -225.93020629882812, "logps/rejected": -214.3812713623047, "loss": 0.721, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2491096556186676, "rewards/margins": 0.22173485159873962, "rewards/rejected": -0.4708445072174072, "step": 127 }, { "epoch": 0.5781227946365561, "grad_norm": 12.444761165337477, "learning_rate": 1.193878654379889e-07, "logits/chosen": -3.5870304107666016, "logits/rejected": -3.5706138610839844, "logps/chosen": -206.72052001953125, "logps/rejected": -196.97564697265625, "loss": 0.7064, "rewards/accuracies": 0.75, "rewards/chosen": -0.17630356550216675, "rewards/margins": 0.2867695987224579, "rewards/rejected": -0.463073194026947, "step": 128 }, { "epoch": 0.5826393789696542, "grad_norm": 14.412718695628323, "learning_rate": 1.1723553483333806e-07, "logits/chosen": -3.6507744789123535, "logits/rejected": -3.536320209503174, "logps/chosen": -192.77645874023438, "logps/rejected": -172.97238159179688, "loss": 0.7102, "rewards/accuracies": 0.828125, "rewards/chosen": -0.17945942282676697, "rewards/margins": 0.24124844372272491, "rewards/rejected": -0.4207078814506531, "step": 129 }, { "epoch": 0.5871559633027523, "grad_norm": 11.81162755328704, "learning_rate": 1.1509026524934596e-07, "logits/chosen": -3.531088352203369, "logits/rejected": -3.4827826023101807, "logps/chosen": -208.96688842773438, "logps/rejected": -188.86569213867188, "loss": 0.7152, "rewards/accuracies": 0.828125, "rewards/chosen": -0.21842709183692932, "rewards/margins": 0.2507480978965759, "rewards/rejected": -0.46917521953582764, "step": 130 }, { "epoch": 0.5916725476358504, "grad_norm": 12.914405491596224, "learning_rate": 1.129525190098374e-07, "logits/chosen": -3.6707763671875, "logits/rejected": -3.636361598968506, "logps/chosen": -213.6358642578125, "logps/rejected": -204.06399536132812, "loss": 0.7035, "rewards/accuracies": 0.75, "rewards/chosen": -0.18059837818145752, "rewards/margins": 0.2868345379829407, "rewards/rejected": -0.4674329161643982, "step": 131 }, { "epoch": 0.5961891319689485, "grad_norm": 11.557057022060595, "learning_rate": 1.1082275681729236e-07, "logits/chosen": -3.6127572059631348, "logits/rejected": -3.531533718109131, "logps/chosen": -183.26527404785156, "logps/rejected": -165.15826416015625, "loss": 0.7145, "rewards/accuracies": 0.71875, "rewards/chosen": -0.17764925956726074, "rewards/margins": 0.2194777876138687, "rewards/rejected": -0.39712706208229065, "step": 132 }, { "epoch": 0.6007057163020466, "grad_norm": 12.328325193031683, "learning_rate": 1.0870143765356105e-07, "logits/chosen": -3.634964942932129, "logits/rejected": -3.566643714904785, "logps/chosen": -209.58245849609375, "logps/rejected": -185.32574462890625, "loss": 0.7069, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1629945933818817, "rewards/margins": 0.2675268054008484, "rewards/rejected": -0.4305214285850525, "step": 133 }, { "epoch": 0.6052223006351447, "grad_norm": 12.054973441313555, "learning_rate": 1.0658901868094899e-07, "logits/chosen": -3.571657657623291, "logits/rejected": -3.5290191173553467, "logps/chosen": -208.86460876464844, "logps/rejected": -198.53517150878906, "loss": 0.722, "rewards/accuracies": 0.84375, "rewards/chosen": -0.20138764381408691, "rewards/margins": 0.22342219948768616, "rewards/rejected": -0.42480987310409546, "step": 134 }, { "epoch": 0.6097388849682428, "grad_norm": 11.358121818992657, "learning_rate": 1.0448595514369515e-07, "logits/chosen": -3.5903096199035645, "logits/rejected": -3.4630379676818848, "logps/chosen": -195.21963500976562, "logps/rejected": -178.4676055908203, "loss": 0.738, "rewards/accuracies": 0.671875, "rewards/chosen": -0.21748274564743042, "rewards/margins": 0.20364663004875183, "rewards/rejected": -0.42112940549850464, "step": 135 }, { "epoch": 0.6142554693013409, "grad_norm": 12.076223242455574, "learning_rate": 1.0239270026986241e-07, "logits/chosen": -3.641045093536377, "logits/rejected": -3.6035234928131104, "logps/chosen": -210.5784454345703, "logps/rejected": -193.73934936523438, "loss": 0.6854, "rewards/accuracies": 0.859375, "rewards/chosen": -0.14184671640396118, "rewards/margins": 0.324258953332901, "rewards/rejected": -0.4661056697368622, "step": 136 }, { "epoch": 0.618772053634439, "grad_norm": 11.692752596565882, "learning_rate": 1.0030970517366362e-07, "logits/chosen": -3.563607931137085, "logits/rejected": -3.428741216659546, "logps/chosen": -219.72509765625, "logps/rejected": -196.94625854492188, "loss": 0.6911, "rewards/accuracies": 0.890625, "rewards/chosen": -0.1963091939687729, "rewards/margins": 0.31136542558670044, "rewards/rejected": -0.5076746344566345, "step": 137 }, { "epoch": 0.6232886379675371, "grad_norm": 12.345411080526995, "learning_rate": 9.82374187582421e-08, "logits/chosen": -3.528062343597412, "logits/rejected": -3.505977153778076, "logps/chosen": -218.63848876953125, "logps/rejected": -206.9732666015625, "loss": 0.7132, "rewards/accuracies": 0.734375, "rewards/chosen": -0.23708635568618774, "rewards/margins": 0.25329455733299255, "rewards/rejected": -0.4903808832168579, "step": 138 }, { "epoch": 0.6278052223006352, "grad_norm": 11.674703919156808, "learning_rate": 9.617628761892963e-08, "logits/chosen": -3.6489012241363525, "logits/rejected": -3.6186487674713135, "logps/chosen": -189.92733764648438, "logps/rejected": -176.64971923828125, "loss": 0.7119, "rewards/accuracies": 0.796875, "rewards/chosen": -0.18905529379844666, "rewards/margins": 0.25132396817207336, "rewards/rejected": -0.4403792917728424, "step": 139 }, { "epoch": 0.6323218066337333, "grad_norm": 13.29691200714144, "learning_rate": 9.412675594700113e-08, "logits/chosen": -3.5062429904937744, "logits/rejected": -3.4647328853607178, "logps/chosen": -201.7200927734375, "logps/rejected": -181.33375549316406, "loss": 0.681, "rewards/accuracies": 0.890625, "rewards/chosen": -0.08604306727647781, "rewards/margins": 0.33681678771972656, "rewards/rejected": -0.4228598475456238, "step": 140 }, { "epoch": 0.6368383909668314, "grad_norm": 12.691599989680086, "learning_rate": 9.208926543394776e-08, "logits/chosen": -3.5145082473754883, "logits/rejected": -3.4705848693847656, "logps/chosen": -226.04798889160156, "logps/rejected": -208.7861328125, "loss": 0.6872, "rewards/accuracies": 0.84375, "rewards/chosen": -0.20757007598876953, "rewards/margins": 0.32437780499458313, "rewards/rejected": -0.5319478511810303, "step": 141 }, { "epoch": 0.6413549752999295, "grad_norm": 12.77311070940504, "learning_rate": 9.006425517628863e-08, "logits/chosen": -3.5832412242889404, "logits/rejected": -3.532606601715088, "logps/chosen": -218.02249145507812, "logps/rejected": -205.59286499023438, "loss": 0.6794, "rewards/accuracies": 0.828125, "rewards/chosen": -0.21441030502319336, "rewards/margins": 0.32625484466552734, "rewards/rejected": -0.5406651496887207, "step": 142 }, { "epoch": 0.6458715596330276, "grad_norm": 11.7323843348539, "learning_rate": 8.805216158094177e-08, "logits/chosen": -3.5744497776031494, "logits/rejected": -3.5260581970214844, "logps/chosen": -193.29339599609375, "logps/rejected": -183.2548828125, "loss": 0.7311, "rewards/accuracies": 0.71875, "rewards/chosen": -0.23033544421195984, "rewards/margins": 0.2078953981399536, "rewards/rejected": -0.43823081254959106, "step": 143 }, { "epoch": 0.6503881439661257, "grad_norm": 11.68741246706069, "learning_rate": 8.605341827117462e-08, "logits/chosen": -3.584440231323242, "logits/rejected": -3.5315933227539062, "logps/chosen": -206.68771362304688, "logps/rejected": -191.7264862060547, "loss": 0.6958, "rewards/accuracies": 0.8125, "rewards/chosen": -0.183091938495636, "rewards/margins": 0.28822189569473267, "rewards/rejected": -0.47131383419036865, "step": 144 }, { "epoch": 0.6549047282992237, "grad_norm": 12.84057054734779, "learning_rate": 8.406845599315482e-08, "logits/chosen": -3.5925729274749756, "logits/rejected": -3.5170035362243652, "logps/chosen": -220.13719177246094, "logps/rejected": -200.61856079101562, "loss": 0.6551, "rewards/accuracies": 0.875, "rewards/chosen": -0.2075750231742859, "rewards/margins": 0.38476982712745667, "rewards/rejected": -0.5923448204994202, "step": 145 }, { "epoch": 0.6594213126323218, "grad_norm": 12.946963405257083, "learning_rate": 8.20977025231203e-08, "logits/chosen": -3.5441179275512695, "logits/rejected": -3.5702133178710938, "logps/chosen": -245.39036560058594, "logps/rejected": -230.05406188964844, "loss": 0.6539, "rewards/accuracies": 0.828125, "rewards/chosen": -0.2259531170129776, "rewards/margins": 0.41027867794036865, "rewards/rejected": -0.6362317800521851, "step": 146 }, { "epoch": 0.6639378969654199, "grad_norm": 11.995200470883699, "learning_rate": 8.014158257519046e-08, "logits/chosen": -3.5933175086975098, "logits/rejected": -3.598524808883667, "logps/chosen": -213.013427734375, "logps/rejected": -197.81692504882812, "loss": 0.674, "rewards/accuracies": 0.828125, "rewards/chosen": -0.15267431735992432, "rewards/margins": 0.3510599732398987, "rewards/rejected": -0.5037343502044678, "step": 147 }, { "epoch": 0.668454481298518, "grad_norm": 11.025028923299288, "learning_rate": 7.820051770983612e-08, "logits/chosen": -3.578158378601074, "logits/rejected": -3.4002773761749268, "logps/chosen": -207.0416259765625, "logps/rejected": -182.08555603027344, "loss": 0.7096, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1736908257007599, "rewards/margins": 0.26846104860305786, "rewards/rejected": -0.44215184450149536, "step": 148 }, { "epoch": 0.6729710656316161, "grad_norm": 14.342920455609963, "learning_rate": 7.627492624302986e-08, "logits/chosen": -3.5777783393859863, "logits/rejected": -3.579009532928467, "logps/chosen": -221.22406005859375, "logps/rejected": -209.30577087402344, "loss": 0.6937, "rewards/accuracies": 0.859375, "rewards/chosen": -0.2393387109041214, "rewards/margins": 0.28950440883636475, "rewards/rejected": -0.5288431644439697, "step": 149 }, { "epoch": 0.6774876499647142, "grad_norm": 11.16447507239811, "learning_rate": 7.436522315609545e-08, "logits/chosen": -3.6183109283447266, "logits/rejected": -3.6173715591430664, "logps/chosen": -191.01394653320312, "logps/rejected": -181.1483612060547, "loss": 0.6995, "rewards/accuracies": 0.875, "rewards/chosen": -0.21332278847694397, "rewards/margins": 0.2814997434616089, "rewards/rejected": -0.49482250213623047, "step": 150 }, { "epoch": 0.6820042342978123, "grad_norm": 12.13300661848618, "learning_rate": 7.247182000627588e-08, "logits/chosen": -3.4436144828796387, "logits/rejected": -3.4601621627807617, "logps/chosen": -206.80313110351562, "logps/rejected": -197.01959228515625, "loss": 0.7062, "rewards/accuracies": 0.828125, "rewards/chosen": -0.219729483127594, "rewards/margins": 0.2750515937805176, "rewards/rejected": -0.4947810769081116, "step": 151 }, { "epoch": 0.6865208186309104, "grad_norm": 12.27938156296404, "learning_rate": 7.059512483803904e-08, "logits/chosen": -3.528646945953369, "logits/rejected": -3.4958152770996094, "logps/chosen": -227.61428833007812, "logps/rejected": -216.52243041992188, "loss": 0.6608, "rewards/accuracies": 0.9375, "rewards/chosen": -0.23144832253456116, "rewards/margins": 0.3668804168701172, "rewards/rejected": -0.598328709602356, "step": 152 }, { "epoch": 0.6910374029640085, "grad_norm": 10.934485175214164, "learning_rate": 6.873554209514085e-08, "logits/chosen": -3.5069892406463623, "logits/rejected": -3.464691638946533, "logps/chosen": -182.59202575683594, "logps/rejected": -175.9390106201172, "loss": 0.7086, "rewards/accuracies": 0.734375, "rewards/chosen": -0.1632474958896637, "rewards/margins": 0.27051842212677, "rewards/rejected": -0.43376588821411133, "step": 153 }, { "epoch": 0.6955539872971066, "grad_norm": 12.823137287258696, "learning_rate": 6.689347253346412e-08, "logits/chosen": -3.5779876708984375, "logits/rejected": -3.5178349018096924, "logps/chosen": -232.82090759277344, "logps/rejected": -217.25289916992188, "loss": 0.6864, "rewards/accuracies": 0.875, "rewards/chosen": -0.28119853138923645, "rewards/margins": 0.29815465211868286, "rewards/rejected": -0.5793532133102417, "step": 154 }, { "epoch": 0.7000705716302047, "grad_norm": 13.681445102824224, "learning_rate": 6.506931313465244e-08, "logits/chosen": -3.563887119293213, "logits/rejected": -3.5259387493133545, "logps/chosen": -235.2152099609375, "logps/rejected": -218.48611450195312, "loss": 0.6864, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2641199231147766, "rewards/margins": 0.32186007499694824, "rewards/rejected": -0.5859800577163696, "step": 155 }, { "epoch": 0.7045871559633028, "grad_norm": 11.872530426234707, "learning_rate": 6.326345702055698e-08, "logits/chosen": -3.666203737258911, "logits/rejected": -3.594811201095581, "logps/chosen": -196.93624877929688, "logps/rejected": -176.68557739257812, "loss": 0.7163, "rewards/accuracies": 0.859375, "rewards/chosen": -0.1758304387331009, "rewards/margins": 0.2576484978199005, "rewards/rejected": -0.4334789514541626, "step": 156 }, { "epoch": 0.7091037402964009, "grad_norm": 12.799664975071849, "learning_rate": 6.147629336851552e-08, "logits/chosen": -3.624176025390625, "logits/rejected": -3.557004451751709, "logps/chosen": -218.7604522705078, "logps/rejected": -206.44232177734375, "loss": 0.7112, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2525786757469177, "rewards/margins": 0.24881505966186523, "rewards/rejected": -0.5013936758041382, "step": 157 }, { "epoch": 0.713620324629499, "grad_norm": 10.877958914597729, "learning_rate": 5.970820732748143e-08, "logits/chosen": -3.4155774116516113, "logits/rejected": -3.501858711242676, "logps/chosen": -199.32589721679688, "logps/rejected": -196.3961181640625, "loss": 0.6945, "rewards/accuracies": 0.859375, "rewards/chosen": -0.22878919541835785, "rewards/margins": 0.28728824853897095, "rewards/rejected": -0.5160773992538452, "step": 158 }, { "epoch": 0.7181369089625971, "grad_norm": 12.955814203296498, "learning_rate": 5.795957993502092e-08, "logits/chosen": -3.540942668914795, "logits/rejected": -3.4938039779663086, "logps/chosen": -210.17218017578125, "logps/rejected": -208.64927673339844, "loss": 0.7032, "rewards/accuracies": 0.859375, "rewards/chosen": -0.2663414180278778, "rewards/margins": 0.27345699071884155, "rewards/rejected": -0.539798378944397, "step": 159 }, { "epoch": 0.7226534932956952, "grad_norm": 11.004852801575502, "learning_rate": 5.623078803519595e-08, "logits/chosen": -3.6684892177581787, "logits/rejected": -3.609145402908325, "logps/chosen": -197.72994995117188, "logps/rejected": -178.94186401367188, "loss": 0.699, "rewards/accuracies": 0.765625, "rewards/chosen": -0.17543372511863708, "rewards/margins": 0.2890481650829315, "rewards/rejected": -0.4644818603992462, "step": 160 }, { "epoch": 0.7271700776287933, "grad_norm": 11.267055827604478, "learning_rate": 5.4522204197351294e-08, "logits/chosen": -3.6161813735961914, "logits/rejected": -3.5283775329589844, "logps/chosen": -217.74359130859375, "logps/rejected": -197.9161376953125, "loss": 0.6485, "rewards/accuracies": 0.828125, "rewards/chosen": -0.13655489683151245, "rewards/margins": 0.43066203594207764, "rewards/rejected": -0.5672169923782349, "step": 161 }, { "epoch": 0.7316866619618914, "grad_norm": 11.455186989092018, "learning_rate": 5.2834196635822626e-08, "logits/chosen": -3.6274447441101074, "logits/rejected": -3.5645861625671387, "logps/chosen": -192.35226440429688, "logps/rejected": -178.86270141601562, "loss": 0.7223, "rewards/accuracies": 0.828125, "rewards/chosen": -0.23346485197544098, "rewards/margins": 0.22727568447589874, "rewards/rejected": -0.4607405364513397, "step": 162 }, { "epoch": 0.7362032462949895, "grad_norm": 12.833815191659758, "learning_rate": 5.1167129130583346e-08, "logits/chosen": -3.5601654052734375, "logits/rejected": -3.5595359802246094, "logps/chosen": -231.76150512695312, "logps/rejected": -218.86770629882812, "loss": 0.6694, "rewards/accuracies": 0.84375, "rewards/chosen": -0.27048397064208984, "rewards/margins": 0.35143792629241943, "rewards/rejected": -0.6219218969345093, "step": 163 }, { "epoch": 0.7407198306280875, "grad_norm": 11.087097762035608, "learning_rate": 4.952136094884666e-08, "logits/chosen": -3.6404900550842285, "logits/rejected": -3.5640687942504883, "logps/chosen": -188.69154357910156, "logps/rejected": -169.15692138671875, "loss": 0.6994, "rewards/accuracies": 0.84375, "rewards/chosen": -0.18032759428024292, "rewards/margins": 0.27624213695526123, "rewards/rejected": -0.45656976103782654, "step": 164 }, { "epoch": 0.7452364149611856, "grad_norm": 11.909284446478502, "learning_rate": 4.789724676764062e-08, "logits/chosen": -3.5401947498321533, "logits/rejected": -3.585221767425537, "logps/chosen": -204.66531372070312, "logps/rejected": -201.26177978515625, "loss": 0.7003, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2440950870513916, "rewards/margins": 0.27386218309402466, "rewards/rejected": -0.5179572701454163, "step": 165 }, { "epoch": 0.7497529992942837, "grad_norm": 11.847238391062595, "learning_rate": 4.629513659737209e-08, "logits/chosen": -3.520542860031128, "logits/rejected": -3.4925966262817383, "logps/chosen": -226.33909606933594, "logps/rejected": -211.7904815673828, "loss": 0.68, "rewards/accuracies": 0.828125, "rewards/chosen": -0.25608351826667786, "rewards/margins": 0.3440699577331543, "rewards/rejected": -0.6001534461975098, "step": 166 }, { "epoch": 0.7542695836273818, "grad_norm": 11.52418594856635, "learning_rate": 4.471537570639676e-08, "logits/chosen": -3.586939811706543, "logits/rejected": -3.5489325523376465, "logps/chosen": -209.86761474609375, "logps/rejected": -194.3120880126953, "loss": 0.6738, "rewards/accuracies": 0.859375, "rewards/chosen": -0.18899373710155487, "rewards/margins": 0.36132901906967163, "rewards/rejected": -0.5503227710723877, "step": 167 }, { "epoch": 0.7587861679604799, "grad_norm": 11.746177386790478, "learning_rate": 4.315830454661059e-08, "logits/chosen": -3.519134521484375, "logits/rejected": -3.450460910797119, "logps/chosen": -224.33236694335938, "logps/rejected": -205.32876586914062, "loss": 0.6724, "rewards/accuracies": 0.859375, "rewards/chosen": -0.14168740808963776, "rewards/margins": 0.36835241317749023, "rewards/rejected": -0.5100398063659668, "step": 168 }, { "epoch": 0.763302752293578, "grad_norm": 13.727528925392352, "learning_rate": 4.1624258680079695e-08, "logits/chosen": -3.5889594554901123, "logits/rejected": -3.6074798107147217, "logps/chosen": -190.32859802246094, "logps/rejected": -181.37701416015625, "loss": 0.7041, "rewards/accuracies": 0.75, "rewards/chosen": -0.21288928389549255, "rewards/margins": 0.2832014262676239, "rewards/rejected": -0.49609071016311646, "step": 169 }, { "epoch": 0.7678193366266761, "grad_norm": 12.36532095556768, "learning_rate": 4.0113568706723745e-08, "logits/chosen": -3.5663981437683105, "logits/rejected": -3.5320568084716797, "logps/chosen": -205.47369384765625, "logps/rejected": -189.5297393798828, "loss": 0.6821, "rewards/accuracies": 0.8125, "rewards/chosen": -0.21529924869537354, "rewards/margins": 0.3193380832672119, "rewards/rejected": -0.5346373319625854, "step": 170 }, { "epoch": 0.7723359209597742, "grad_norm": 11.211886652040736, "learning_rate": 3.8626560193069194e-08, "logits/chosen": -3.6578316688537598, "logits/rejected": -3.5587844848632812, "logps/chosen": -184.16693115234375, "logps/rejected": -164.69442749023438, "loss": 0.7124, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1814623475074768, "rewards/margins": 0.24063682556152344, "rewards/rejected": -0.42209917306900024, "step": 171 }, { "epoch": 0.7768525052928723, "grad_norm": 11.991981526284254, "learning_rate": 3.71635536020865e-08, "logits/chosen": -3.687999725341797, "logits/rejected": -3.5739190578460693, "logps/chosen": -187.41111755371094, "logps/rejected": -170.08901977539062, "loss": 0.6982, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1899150013923645, "rewards/margins": 0.27486538887023926, "rewards/rejected": -0.46478039026260376, "step": 172 }, { "epoch": 0.7813690896259704, "grad_norm": 11.682125153864412, "learning_rate": 3.572486422412786e-08, "logits/chosen": -3.541208028793335, "logits/rejected": -3.4319119453430176, "logps/chosen": -232.7732696533203, "logps/rejected": -207.6559295654297, "loss": 0.6788, "rewards/accuracies": 0.796875, "rewards/chosen": -0.19109418988227844, "rewards/margins": 0.37378770112991333, "rewards/rejected": -0.5648818612098694, "step": 173 }, { "epoch": 0.7858856739590685, "grad_norm": 11.96356466385332, "learning_rate": 3.4310802108979456e-08, "logits/chosen": -3.6099984645843506, "logits/rejected": -3.588571071624756, "logps/chosen": -221.05567932128906, "logps/rejected": -211.12063598632812, "loss": 0.6749, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2735821604728699, "rewards/margins": 0.365002304315567, "rewards/rejected": -0.6385844945907593, "step": 174 }, { "epoch": 0.7904022582921666, "grad_norm": 12.839800224543241, "learning_rate": 3.292167199904311e-08, "logits/chosen": -3.5515999794006348, "logits/rejected": -3.5092904567718506, "logps/chosen": -238.809814453125, "logps/rejected": -215.16036987304688, "loss": 0.6457, "rewards/accuracies": 0.84375, "rewards/chosen": -0.21585515141487122, "rewards/margins": 0.43380266427993774, "rewards/rejected": -0.6496578454971313, "step": 175 }, { "epoch": 0.7949188426252647, "grad_norm": 10.718656806306809, "learning_rate": 3.1557773263661604e-08, "logits/chosen": -3.477273941040039, "logits/rejected": -3.4155681133270264, "logps/chosen": -229.67233276367188, "logps/rejected": -217.80172729492188, "loss": 0.681, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3297360837459564, "rewards/margins": 0.32702451944351196, "rewards/rejected": -0.6567606329917908, "step": 176 }, { "epoch": 0.7994354269583628, "grad_norm": 12.333864621322466, "learning_rate": 3.02193998346021e-08, "logits/chosen": -3.5071773529052734, "logits/rejected": -3.4976847171783447, "logps/chosen": -221.59481811523438, "logps/rejected": -209.19186401367188, "loss": 0.6713, "rewards/accuracies": 0.859375, "rewards/chosen": -0.26982712745666504, "rewards/margins": 0.3569854199886322, "rewards/rejected": -0.6268125176429749, "step": 177 }, { "epoch": 0.8039520112914609, "grad_norm": 12.039934107046923, "learning_rate": 2.8906840142711338e-08, "logits/chosen": -3.606412410736084, "logits/rejected": -3.5920205116271973, "logps/chosen": -235.42181396484375, "logps/rejected": -216.65164184570312, "loss": 0.6652, "rewards/accuracies": 0.796875, "rewards/chosen": -0.23246988654136658, "rewards/margins": 0.3717581629753113, "rewards/rejected": -0.6042280197143555, "step": 178 }, { "epoch": 0.808468595624559, "grad_norm": 11.030662943317653, "learning_rate": 2.7620377055756423e-08, "logits/chosen": -3.624559164047241, "logits/rejected": -3.638848066329956, "logps/chosen": -191.0684814453125, "logps/rejected": -183.99212646484375, "loss": 0.6806, "rewards/accuracies": 0.828125, "rewards/chosen": -0.1932627409696579, "rewards/margins": 0.34079742431640625, "rewards/rejected": -0.5340601205825806, "step": 179 }, { "epoch": 0.8129851799576571, "grad_norm": 11.319505715592419, "learning_rate": 2.6360287817464256e-08, "logits/chosen": -3.5405383110046387, "logits/rejected": -3.5048179626464844, "logps/chosen": -207.15835571289062, "logps/rejected": -188.7417449951172, "loss": 0.6835, "rewards/accuracies": 0.828125, "rewards/chosen": -0.19087350368499756, "rewards/margins": 0.3352086842060089, "rewards/rejected": -0.5260821580886841, "step": 180 }, { "epoch": 0.8175017642907552, "grad_norm": 13.430513620252457, "learning_rate": 2.512684398777329e-08, "logits/chosen": -3.5324528217315674, "logits/rejected": -3.4295010566711426, "logps/chosen": -213.90878295898438, "logps/rejected": -195.8572998046875, "loss": 0.6963, "rewards/accuracies": 0.828125, "rewards/chosen": -0.23799797892570496, "rewards/margins": 0.2941531538963318, "rewards/rejected": -0.5321511626243591, "step": 181 }, { "epoch": 0.8220183486238533, "grad_norm": 11.575530525209127, "learning_rate": 2.3920311384309914e-08, "logits/chosen": -3.601444721221924, "logits/rejected": -3.5109052658081055, "logps/chosen": -199.96237182617188, "logps/rejected": -183.93069458007812, "loss": 0.7104, "rewards/accuracies": 0.703125, "rewards/chosen": -0.22000840306282043, "rewards/margins": 0.2704961895942688, "rewards/rejected": -0.4905046224594116, "step": 182 }, { "epoch": 0.8265349329569514, "grad_norm": 11.049495139232015, "learning_rate": 2.2740950025102763e-08, "logits/chosen": -3.5267105102539062, "logits/rejected": -3.5060598850250244, "logps/chosen": -204.97425842285156, "logps/rejected": -196.4848175048828, "loss": 0.7116, "rewards/accuracies": 0.765625, "rewards/chosen": -0.25764691829681396, "rewards/margins": 0.26898401975631714, "rewards/rejected": -0.5266309976577759, "step": 183 }, { "epoch": 0.8310515172900494, "grad_norm": 12.243629851598072, "learning_rate": 2.158901407254629e-08, "logits/chosen": -3.6085617542266846, "logits/rejected": -3.5918960571289062, "logps/chosen": -207.69467163085938, "logps/rejected": -201.7321014404297, "loss": 0.6776, "rewards/accuracies": 0.84375, "rewards/chosen": -0.26026493310928345, "rewards/margins": 0.3179108500480652, "rewards/rejected": -0.5781757235527039, "step": 184 }, { "epoch": 0.8355681016231475, "grad_norm": 10.504114021037811, "learning_rate": 2.0464751778626836e-08, "logits/chosen": -3.489086151123047, "logits/rejected": -3.5122947692871094, "logps/chosen": -227.21865844726562, "logps/rejected": -225.1961669921875, "loss": 0.6854, "rewards/accuracies": 0.765625, "rewards/chosen": -0.29528743028640747, "rewards/margins": 0.34715089201927185, "rewards/rejected": -0.6424383521080017, "step": 185 }, { "epoch": 0.8400846859562456, "grad_norm": 12.604767051843007, "learning_rate": 1.9368405431422102e-08, "logits/chosen": -3.492341995239258, "logits/rejected": -3.5087356567382812, "logps/chosen": -239.1469268798828, "logps/rejected": -227.90689086914062, "loss": 0.6872, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3633117079734802, "rewards/margins": 0.31293728947639465, "rewards/rejected": -0.6762489676475525, "step": 186 }, { "epoch": 0.8446012702893437, "grad_norm": 13.007777850694795, "learning_rate": 1.8300211302886137e-08, "logits/chosen": -3.5985684394836426, "logits/rejected": -3.560959815979004, "logps/chosen": -233.37771606445312, "logps/rejected": -218.78062438964844, "loss": 0.6679, "rewards/accuracies": 0.8125, "rewards/chosen": -0.31275439262390137, "rewards/margins": 0.35552066564559937, "rewards/rejected": -0.6682751178741455, "step": 187 }, { "epoch": 0.8491178546224418, "grad_norm": 12.224849381089012, "learning_rate": 1.726039959793059e-08, "logits/chosen": -3.6801674365997314, "logits/rejected": -3.643399953842163, "logps/chosen": -201.95065307617188, "logps/rejected": -191.9815673828125, "loss": 0.7021, "rewards/accuracies": 0.765625, "rewards/chosen": -0.23352208733558655, "rewards/margins": 0.2819408178329468, "rewards/rejected": -0.5154628753662109, "step": 188 }, { "epoch": 0.8536344389555399, "grad_norm": 11.06103238426834, "learning_rate": 1.6249194404813633e-08, "logits/chosen": -3.583648204803467, "logits/rejected": -3.536038398742676, "logps/chosen": -205.93267822265625, "logps/rejected": -188.04385375976562, "loss": 0.6819, "rewards/accuracies": 0.8125, "rewards/chosen": -0.20944997668266296, "rewards/margins": 0.33364248275756836, "rewards/rejected": -0.5430924892425537, "step": 189 }, { "epoch": 0.858151023288638, "grad_norm": 11.515369124606178, "learning_rate": 1.526681364684707e-08, "logits/chosen": -3.544914722442627, "logits/rejected": -3.477529525756836, "logps/chosen": -246.21878051757812, "logps/rejected": -227.75767517089844, "loss": 0.6284, "rewards/accuracies": 0.890625, "rewards/chosen": -0.25970178842544556, "rewards/margins": 0.4616071283817291, "rewards/rejected": -0.7213089466094971, "step": 190 }, { "epoch": 0.8626676076217361, "grad_norm": 11.17899475785984, "learning_rate": 1.4313469035432053e-08, "logits/chosen": -3.539396286010742, "logits/rejected": -3.491584062576294, "logps/chosen": -217.02066040039062, "logps/rejected": -194.50637817382812, "loss": 0.6783, "rewards/accuracies": 0.890625, "rewards/chosen": -0.23896610736846924, "rewards/margins": 0.36122721433639526, "rewards/rejected": -0.6001933813095093, "step": 191 }, { "epoch": 0.8671841919548342, "grad_norm": 11.452045965709495, "learning_rate": 1.3389366024433346e-08, "logits/chosen": -3.508542776107788, "logits/rejected": -3.4577219486236572, "logps/chosen": -209.47579956054688, "logps/rejected": -191.61953735351562, "loss": 0.6856, "rewards/accuracies": 0.78125, "rewards/chosen": -0.23392406105995178, "rewards/margins": 0.3334580063819885, "rewards/rejected": -0.5673820972442627, "step": 192 }, { "epoch": 0.8717007762879323, "grad_norm": 11.051060681758146, "learning_rate": 1.2494703765902337e-08, "logits/chosen": -3.545461654663086, "logits/rejected": -3.5021471977233887, "logps/chosen": -221.29922485351562, "logps/rejected": -205.3109588623047, "loss": 0.6884, "rewards/accuracies": 0.8125, "rewards/chosen": -0.26481908559799194, "rewards/margins": 0.3176526129245758, "rewards/rejected": -0.5824716687202454, "step": 193 }, { "epoch": 0.8762173606210304, "grad_norm": 11.263698377490607, "learning_rate": 1.1629675067158119e-08, "logits/chosen": -3.59144926071167, "logits/rejected": -3.596311569213867, "logps/chosen": -223.28732299804688, "logps/rejected": -207.96267700195312, "loss": 0.6421, "rewards/accuracies": 0.90625, "rewards/chosen": -0.16717705130577087, "rewards/margins": 0.46462714672088623, "rewards/rejected": -0.6318042278289795, "step": 194 }, { "epoch": 0.8807339449541285, "grad_norm": 11.60107390050444, "learning_rate": 1.0794466349235865e-08, "logits/chosen": -3.6060633659362793, "logits/rejected": -3.517813205718994, "logps/chosen": -202.70123291015625, "logps/rejected": -183.82339477539062, "loss": 0.6759, "rewards/accuracies": 0.8125, "rewards/chosen": -0.16405262053012848, "rewards/margins": 0.36374321579933167, "rewards/rejected": -0.5277957916259766, "step": 195 }, { "epoch": 0.8852505292872266, "grad_norm": 11.879162180318056, "learning_rate": 9.989257606711438e-09, "logits/chosen": -3.59369158744812, "logits/rejected": -3.5690078735351562, "logps/chosen": -226.8720703125, "logps/rejected": -210.02069091796875, "loss": 0.6661, "rewards/accuracies": 0.875, "rewards/chosen": -0.2107160985469818, "rewards/margins": 0.3873599171638489, "rewards/rejected": -0.5980759859085083, "step": 196 }, { "epoch": 0.8897671136203247, "grad_norm": 11.434668917143895, "learning_rate": 9.214222368911112e-09, "logits/chosen": -3.5294957160949707, "logits/rejected": -3.504406452178955, "logps/chosen": -210.91481018066406, "logps/rejected": -190.58346557617188, "loss": 0.6694, "rewards/accuracies": 0.859375, "rewards/chosen": -0.17558521032333374, "rewards/margins": 0.37856271862983704, "rewards/rejected": -0.5541479587554932, "step": 197 }, { "epoch": 0.8942836979534228, "grad_norm": 11.442820506378803, "learning_rate": 8.469527662514425e-09, "logits/chosen": -3.6074564456939697, "logits/rejected": -3.5393269062042236, "logps/chosen": -206.037109375, "logps/rejected": -184.319580078125, "loss": 0.6837, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2739471197128296, "rewards/margins": 0.31510984897613525, "rewards/rejected": -0.5890569686889648, "step": 198 }, { "epoch": 0.8988002822865209, "grad_norm": 10.48141977546952, "learning_rate": 7.755333975558703e-09, "logits/chosen": -3.534519910812378, "logits/rejected": -3.546844005584717, "logps/chosen": -209.56192016601562, "logps/rejected": -192.3565673828125, "loss": 0.656, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1646820604801178, "rewards/margins": 0.422355055809021, "rewards/rejected": -0.5870370864868164, "step": 199 }, { "epoch": 0.903316866619619, "grad_norm": 11.512112387427932, "learning_rate": 7.071795222852295e-09, "logits/chosen": -3.5276589393615723, "logits/rejected": -3.4519450664520264, "logps/chosen": -203.3861083984375, "logps/rejected": -186.114013671875, "loss": 0.7095, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2570461928844452, "rewards/margins": 0.2705690562725067, "rewards/rejected": -0.5276152491569519, "step": 200 }, { "epoch": 0.9078334509527171, "grad_norm": 11.664115289787672, "learning_rate": 6.41905871280477e-09, "logits/chosen": -3.5618152618408203, "logits/rejected": -3.5523681640625, "logps/chosen": -208.64450073242188, "logps/rejected": -194.45249938964844, "loss": 0.7034, "rewards/accuracies": 0.828125, "rewards/chosen": -0.2307577133178711, "rewards/margins": 0.2903120219707489, "rewards/rejected": -0.5210697650909424, "step": 201 }, { "epoch": 0.9123500352858152, "grad_norm": 12.221452533932299, "learning_rate": 5.797265115680649e-09, "logits/chosen": -3.560934543609619, "logits/rejected": -3.5254406929016113, "logps/chosen": -212.36541748046875, "logps/rejected": -200.04124450683594, "loss": 0.7023, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3073262572288513, "rewards/margins": 0.28462159633636475, "rewards/rejected": -0.5919477939605713, "step": 202 }, { "epoch": 0.9168666196189132, "grad_norm": 11.449688321668301, "learning_rate": 5.206548433283803e-09, "logits/chosen": -3.5118632316589355, "logits/rejected": -3.5008668899536133, "logps/chosen": -216.9571990966797, "logps/rejected": -194.16934204101562, "loss": 0.6675, "rewards/accuracies": 0.828125, "rewards/chosen": -0.248696431517601, "rewards/margins": 0.396094411611557, "rewards/rejected": -0.6447908282279968, "step": 203 }, { "epoch": 0.9213832039520113, "grad_norm": 12.04856516660961, "learning_rate": 4.6470359700788995e-09, "logits/chosen": -3.621587038040161, "logits/rejected": -3.5827927589416504, "logps/chosen": -221.77987670898438, "logps/rejected": -201.95237731933594, "loss": 0.6732, "rewards/accuracies": 0.828125, "rewards/chosen": -0.26525014638900757, "rewards/margins": 0.36969608068466187, "rewards/rejected": -0.6349462270736694, "step": 204 }, { "epoch": 0.9258997882851094, "grad_norm": 11.722034459630727, "learning_rate": 4.118848305756173e-09, "logits/chosen": -3.595360279083252, "logits/rejected": -3.5677242279052734, "logps/chosen": -223.11941528320312, "logps/rejected": -205.64205932617188, "loss": 0.6758, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2749682664871216, "rewards/margins": 0.3367688059806824, "rewards/rejected": -0.611737072467804, "step": 205 }, { "epoch": 0.9304163726182075, "grad_norm": 11.503388221795708, "learning_rate": 3.622099269245571e-09, "logits/chosen": -3.60404109954834, "logits/rejected": -3.5229146480560303, "logps/chosen": -223.0252685546875, "logps/rejected": -200.0838623046875, "loss": 0.6655, "rewards/accuracies": 0.859375, "rewards/chosen": -0.18073460459709167, "rewards/margins": 0.37948232889175415, "rewards/rejected": -0.5602169036865234, "step": 206 }, { "epoch": 0.9349329569513056, "grad_norm": 10.985620131645993, "learning_rate": 3.156895914185581e-09, "logits/chosen": -3.526765823364258, "logits/rejected": -3.5020856857299805, "logps/chosen": -209.4959259033203, "logps/rejected": -197.26101684570312, "loss": 0.6902, "rewards/accuracies": 0.859375, "rewards/chosen": -0.24708959460258484, "rewards/margins": 0.3107626438140869, "rewards/rejected": -0.5578522086143494, "step": 207 }, { "epoch": 0.9394495412844037, "grad_norm": 12.99867871857846, "learning_rate": 2.7233384958522676e-09, "logits/chosen": -3.6278443336486816, "logits/rejected": -3.6109533309936523, "logps/chosen": -207.57948303222656, "logps/rejected": -194.7212677001953, "loss": 0.6843, "rewards/accuracies": 0.828125, "rewards/chosen": -0.23499882221221924, "rewards/margins": 0.3160548508167267, "rewards/rejected": -0.5510536432266235, "step": 208 }, { "epoch": 0.9439661256175018, "grad_norm": 11.2046360254521, "learning_rate": 2.321520449553421e-09, "logits/chosen": -3.5132226943969727, "logits/rejected": -3.491821527481079, "logps/chosen": -206.74038696289062, "logps/rejected": -185.68463134765625, "loss": 0.6676, "rewards/accuracies": 0.875, "rewards/chosen": -0.1989631950855255, "rewards/margins": 0.39246267080307007, "rewards/rejected": -0.591425895690918, "step": 209 }, { "epoch": 0.9484827099505999, "grad_norm": 13.729648643233745, "learning_rate": 1.9515283704924667e-09, "logits/chosen": -3.6648130416870117, "logits/rejected": -3.5529017448425293, "logps/chosen": -232.48114013671875, "logps/rejected": -210.60609436035156, "loss": 0.6581, "rewards/accuracies": 0.828125, "rewards/chosen": -0.2861940860748291, "rewards/margins": 0.377960741519928, "rewards/rejected": -0.6641547679901123, "step": 210 }, { "epoch": 0.952999294283698, "grad_norm": 12.04630458243632, "learning_rate": 1.6134419951064404e-09, "logits/chosen": -3.586575984954834, "logits/rejected": -3.577587127685547, "logps/chosen": -201.98065185546875, "logps/rejected": -184.5204620361328, "loss": 0.6845, "rewards/accuracies": 0.828125, "rewards/chosen": -0.21713638305664062, "rewards/margins": 0.32104575634002686, "rewards/rejected": -0.5381821393966675, "step": 211 }, { "epoch": 0.9575158786167961, "grad_norm": 11.591398022496064, "learning_rate": 1.3073341838821028e-09, "logits/chosen": -3.606295585632324, "logits/rejected": -3.6138882637023926, "logps/chosen": -222.92762756347656, "logps/rejected": -213.224365234375, "loss": 0.6756, "rewards/accuracies": 0.828125, "rewards/chosen": -0.2295455038547516, "rewards/margins": 0.37904244661331177, "rewards/rejected": -0.608587920665741, "step": 212 }, { "epoch": 0.9620324629498942, "grad_norm": 14.166283230715495, "learning_rate": 1.033270905653949e-09, "logits/chosen": -3.615994453430176, "logits/rejected": -3.5957651138305664, "logps/chosen": -197.43399047851562, "logps/rejected": -187.89100646972656, "loss": 0.7109, "rewards/accuracies": 0.75, "rewards/chosen": -0.2787552773952484, "rewards/margins": 0.2632961571216583, "rewards/rejected": -0.5420514345169067, "step": 213 }, { "epoch": 0.9665490472829923, "grad_norm": 12.160467402707475, "learning_rate": 7.913112233872476e-10, "logits/chosen": -3.5975520610809326, "logits/rejected": -3.5754852294921875, "logps/chosen": -212.54818725585938, "logps/rejected": -193.0272216796875, "loss": 0.666, "rewards/accuracies": 0.90625, "rewards/chosen": -0.23875494301319122, "rewards/margins": 0.3742338716983795, "rewards/rejected": -0.6129888296127319, "step": 214 }, { "epoch": 0.9710656316160904, "grad_norm": 11.288257675305129, "learning_rate": 5.815072814496225e-10, "logits/chosen": -3.5631093978881836, "logits/rejected": -3.5513646602630615, "logps/chosen": -179.36865234375, "logps/rejected": -169.8910675048828, "loss": 0.6992, "rewards/accuracies": 0.828125, "rewards/chosen": -0.21426759660243988, "rewards/margins": 0.2879588007926941, "rewards/rejected": -0.5022264122962952, "step": 215 }, { "epoch": 0.9755822159491885, "grad_norm": 11.189287909041054, "learning_rate": 4.0390429437332505e-10, "logits/chosen": -3.49703311920166, "logits/rejected": -3.45841908454895, "logps/chosen": -210.85260009765625, "logps/rejected": -196.04306030273438, "loss": 0.6895, "rewards/accuracies": 0.765625, "rewards/chosen": -0.2567548155784607, "rewards/margins": 0.3082759380340576, "rewards/rejected": -0.5650308132171631, "step": 216 }, { "epoch": 0.9800988002822866, "grad_norm": 11.496270664039283, "learning_rate": 2.585405371112459e-10, "logits/chosen": -3.5845980644226074, "logits/rejected": -3.5456676483154297, "logps/chosen": -199.40249633789062, "logps/rejected": -191.75473022460938, "loss": 0.733, "rewards/accuracies": 0.828125, "rewards/chosen": -0.319938600063324, "rewards/margins": 0.20854635536670685, "rewards/rejected": -0.5284849405288696, "step": 217 }, { "epoch": 0.9846153846153847, "grad_norm": 12.656855394825056, "learning_rate": 1.454473367883291e-10, "logits/chosen": -3.478101968765259, "logits/rejected": -3.501936435699463, "logps/chosen": -222.27523803710938, "logps/rejected": -206.86306762695312, "loss": 0.6763, "rewards/accuracies": 0.796875, "rewards/chosen": -0.26684999465942383, "rewards/margins": 0.34992286562919617, "rewards/rejected": -0.6167728900909424, "step": 218 }, { "epoch": 0.9891319689484828, "grad_norm": 13.110720385558672, "learning_rate": 6.464906595023967e-11, "logits/chosen": -3.5070900917053223, "logits/rejected": -3.489541530609131, "logps/chosen": -221.77259826660156, "logps/rejected": -207.49349975585938, "loss": 0.6686, "rewards/accuracies": 0.859375, "rewards/chosen": -0.22918352484703064, "rewards/margins": 0.3775954246520996, "rewards/rejected": -0.6067789793014526, "step": 219 }, { "epoch": 0.9936485532815809, "grad_norm": 10.689364544141856, "learning_rate": 1.616313731091501e-11, "logits/chosen": -3.472511053085327, "logits/rejected": -3.4571049213409424, "logps/chosen": -218.83880615234375, "logps/rejected": -205.46530151367188, "loss": 0.6523, "rewards/accuracies": 0.90625, "rewards/chosen": -0.24689728021621704, "rewards/margins": 0.4170025587081909, "rewards/rejected": -0.663899838924408, "step": 220 }, { "epoch": 0.998165137614679, "grad_norm": 11.564165675567537, "learning_rate": 0.0, "logits/chosen": -3.556617259979248, "logits/rejected": -3.5278592109680176, "logps/chosen": -206.43194580078125, "logps/rejected": -186.3187713623047, "loss": 0.6886, "rewards/accuracies": 0.828125, "rewards/chosen": -0.1849951595067978, "rewards/margins": 0.3221530318260193, "rewards/rejected": -0.5071482062339783, "step": 221 } ], "logging_steps": 1, "max_steps": 221, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 316050221826048.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }