{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00016666666666666666, "grad_norm": 153.08250427246094, "learning_rate": 1.111111111111111e-09, "logits/chosen": 2.337772846221924, "logits/rejected": 2.4827752113342285, "logps/chosen": -37.39937973022461, "logps/rejected": -59.04928970336914, "loss": 1.6654, "nll_loss": 0.8499860167503357, "rewards/accuracies": 1.0, "rewards/chosen": -0.015680694952607155, "rewards/margins": 0.03295592963695526, "rewards/rejected": -0.048636626452207565, "step": 1 }, { "epoch": 0.0003333333333333333, "grad_norm": 186.69790649414062, "learning_rate": 2.222222222222222e-09, "logits/chosen": 2.6817171573638916, "logits/rejected": 2.6163880825042725, "logps/chosen": -74.80304718017578, "logps/rejected": -69.57865905761719, "loss": 1.8234, "nll_loss": 1.0686149597167969, "rewards/accuracies": 1.0, "rewards/chosen": 0.06782150268554688, "rewards/margins": 0.15963363647460938, "rewards/rejected": -0.0918121337890625, "step": 2 }, { "epoch": 0.0005, "grad_norm": 141.1098175048828, "learning_rate": 3.333333333333333e-09, "logits/chosen": 3.0970442295074463, "logits/rejected": 3.0161945819854736, "logps/chosen": -69.21766662597656, "logps/rejected": -39.7952766418457, "loss": 1.7894, "nll_loss": 0.9353739619255066, "rewards/accuracies": 0.0, "rewards/chosen": 0.03846130520105362, "rewards/margins": -0.0438179075717926, "rewards/rejected": 0.08227921277284622, "step": 3 }, { "epoch": 0.0006666666666666666, "grad_norm": 301.296142578125, "learning_rate": 4.444444444444444e-09, "logits/chosen": 2.2307980060577393, "logits/rejected": 2.635477304458618, "logps/chosen": -172.05064392089844, "logps/rejected": -388.5400695800781, "loss": 1.9673, "nll_loss": 1.068637490272522, "rewards/accuracies": 0.0, "rewards/chosen": -0.08633270114660263, "rewards/margins": -0.12985381484031677, "rewards/rejected": 0.04352111741900444, "step": 4 }, { "epoch": 0.0008333333333333334, "grad_norm": 213.5386505126953, "learning_rate": 5.555555555555555e-09, "logits/chosen": 2.461300849914551, "logits/rejected": 2.9732072353363037, "logps/chosen": -84.26083374023438, "logps/rejected": -95.67826080322266, "loss": 1.9287, "nll_loss": 1.040257215499878, "rewards/accuracies": 0.0, "rewards/chosen": 0.006495667155832052, "rewards/margins": -0.11032257229089737, "rewards/rejected": 0.11681824177503586, "step": 5 }, { "epoch": 0.001, "grad_norm": 255.81744384765625, "learning_rate": 6.666666666666666e-09, "logits/chosen": 3.097999334335327, "logits/rejected": 3.2171127796173096, "logps/chosen": -82.84479522705078, "logps/rejected": -146.22189331054688, "loss": 1.8466, "nll_loss": 1.0759062767028809, "rewards/accuracies": 1.0, "rewards/chosen": 0.004840850830078125, "rewards/margins": 0.12609481811523438, "rewards/rejected": -0.12125396728515625, "step": 6 }, { "epoch": 0.0011666666666666668, "grad_norm": 200.57354736328125, "learning_rate": 7.777777777777777e-09, "logits/chosen": 3.3226444721221924, "logits/rejected": 3.838266134262085, "logps/chosen": -37.94790267944336, "logps/rejected": -34.00613784790039, "loss": 1.5863, "nll_loss": 0.6542741656303406, "rewards/accuracies": 0.0, "rewards/chosen": -0.2117103636264801, "rewards/margins": -0.19120827317237854, "rewards/rejected": -0.020502090454101562, "step": 7 }, { "epoch": 0.0013333333333333333, "grad_norm": 206.7501983642578, "learning_rate": 8.888888888888889e-09, "logits/chosen": 1.5488975048065186, "logits/rejected": 1.991809606552124, "logps/chosen": -99.19429016113281, "logps/rejected": -170.251953125, "loss": 1.9461, "nll_loss": 1.1401642560958862, "rewards/accuracies": 1.0, "rewards/chosen": 0.01559906080365181, "rewards/margins": 0.05232696607708931, "rewards/rejected": -0.0367279052734375, "step": 8 }, { "epoch": 0.0015, "grad_norm": 166.1602783203125, "learning_rate": 1e-08, "logits/chosen": 2.351259231567383, "logits/rejected": 2.347055196762085, "logps/chosen": -96.54353332519531, "logps/rejected": -88.32405090332031, "loss": 1.9581, "nll_loss": 1.1773602962493896, "rewards/accuracies": 1.0, "rewards/chosen": 0.05176544189453125, "rewards/margins": 0.10458222031593323, "rewards/rejected": -0.05281677469611168, "step": 9 }, { "epoch": 0.0016666666666666668, "grad_norm": 159.87405395507812, "learning_rate": 1.111111111111111e-08, "logits/chosen": 2.4483752250671387, "logits/rejected": 2.438070058822632, "logps/chosen": -32.85239791870117, "logps/rejected": -39.72311019897461, "loss": 1.9161, "nll_loss": 1.1328414678573608, "rewards/accuracies": 1.0, "rewards/chosen": 0.061244964599609375, "rewards/margins": 0.09922103583812714, "rewards/rejected": -0.03797607496380806, "step": 10 }, { "epoch": 0.0018333333333333333, "grad_norm": 167.0572052001953, "learning_rate": 1.2222222222222222e-08, "logits/chosen": 3.7844955921173096, "logits/rejected": 3.857954263687134, "logps/chosen": -35.93043899536133, "logps/rejected": -16.24526023864746, "loss": 2.7336, "nll_loss": 1.8910757303237915, "rewards/accuracies": 0.0, "rewards/chosen": 0.05355339124798775, "rewards/margins": -0.020830344408750534, "rewards/rejected": 0.07438373565673828, "step": 11 }, { "epoch": 0.002, "grad_norm": 241.86907958984375, "learning_rate": 1.3333333333333332e-08, "logits/chosen": 3.156634569168091, "logits/rejected": 3.102428436279297, "logps/chosen": -93.36886596679688, "logps/rejected": -79.07455444335938, "loss": 2.3155, "nll_loss": 1.353171944618225, "rewards/accuracies": 0.0, "rewards/chosen": -0.22738800942897797, "rewards/margins": -0.24654771387577057, "rewards/rejected": 0.019159698858857155, "step": 12 }, { "epoch": 0.0021666666666666666, "grad_norm": 208.2474365234375, "learning_rate": 1.4444444444444442e-08, "logits/chosen": 2.4764864444732666, "logits/rejected": 2.2257120609283447, "logps/chosen": -125.87950134277344, "logps/rejected": -63.801177978515625, "loss": 2.3795, "nll_loss": 1.5166206359863281, "rewards/accuracies": 0.0, "rewards/chosen": -0.04237213730812073, "rewards/margins": -0.061440665274858475, "rewards/rejected": 0.019068527966737747, "step": 13 }, { "epoch": 0.0023333333333333335, "grad_norm": 139.7952880859375, "learning_rate": 1.5555555555555554e-08, "logits/chosen": 2.4761850833892822, "logits/rejected": 2.3051764965057373, "logps/chosen": -33.05902099609375, "logps/rejected": -19.47052001953125, "loss": 2.3386, "nll_loss": 1.5026828050613403, "rewards/accuracies": 0.0, "rewards/chosen": 0.01623382605612278, "rewards/margins": -0.008192252367734909, "rewards/rejected": 0.02442607842385769, "step": 14 }, { "epoch": 0.0025, "grad_norm": 258.3049621582031, "learning_rate": 1.6666666666666664e-08, "logits/chosen": 2.4898147583007812, "logits/rejected": 2.770441770553589, "logps/chosen": -156.22154235839844, "logps/rejected": -210.60528564453125, "loss": 2.234, "nll_loss": 1.2805047035217285, "rewards/accuracies": 0.0, "rewards/chosen": 0.21683502197265625, "rewards/margins": -0.22244417667388916, "rewards/rejected": 0.4392791986465454, "step": 15 }, { "epoch": 0.0026666666666666666, "grad_norm": 185.6620635986328, "learning_rate": 1.7777777777777777e-08, "logits/chosen": 2.6887400150299072, "logits/rejected": 2.4091827869415283, "logps/chosen": -59.6091423034668, "logps/rejected": -19.739818572998047, "loss": 2.9461, "nll_loss": 2.055487871170044, "rewards/accuracies": 0.0, "rewards/chosen": -0.036325838416814804, "rewards/margins": -0.11476364731788635, "rewards/rejected": 0.07843780517578125, "step": 16 }, { "epoch": 0.0028333333333333335, "grad_norm": 207.3758544921875, "learning_rate": 1.8888888888888887e-08, "logits/chosen": 2.0547831058502197, "logits/rejected": 2.0222151279449463, "logps/chosen": -101.24158477783203, "logps/rejected": -98.55430603027344, "loss": 2.0008, "nll_loss": 1.1636962890625, "rewards/accuracies": 0.0, "rewards/chosen": -0.03954926133155823, "rewards/margins": -0.010460669174790382, "rewards/rejected": -0.029088592156767845, "step": 17 }, { "epoch": 0.003, "grad_norm": 241.206787109375, "learning_rate": 2e-08, "logits/chosen": 2.805539608001709, "logits/rejected": 2.9929583072662354, "logps/chosen": -61.53700637817383, "logps/rejected": -108.35177612304688, "loss": 1.9316, "nll_loss": 1.0795966386795044, "rewards/accuracies": 0.0, "rewards/chosen": -0.010014343075454235, "rewards/margins": -0.040120698511600494, "rewards/rejected": 0.030106354504823685, "step": 18 }, { "epoch": 0.0031666666666666666, "grad_norm": 226.41168212890625, "learning_rate": 2.111111111111111e-08, "logits/chosen": 1.5191223621368408, "logits/rejected": 2.9830057621002197, "logps/chosen": -57.89079284667969, "logps/rejected": -227.59719848632812, "loss": 1.7077, "nll_loss": 0.9337223768234253, "rewards/accuracies": 1.0, "rewards/chosen": -0.0063953404314816, "rewards/margins": 0.11926689743995667, "rewards/rejected": -0.12566223740577698, "step": 19 }, { "epoch": 0.0033333333333333335, "grad_norm": 197.8740692138672, "learning_rate": 2.222222222222222e-08, "logits/chosen": 3.1541945934295654, "logits/rejected": 3.3096816539764404, "logps/chosen": -95.64491271972656, "logps/rejected": -183.86660766601562, "loss": 1.6658, "nll_loss": 0.9564491510391235, "rewards/accuracies": 1.0, "rewards/chosen": -0.11648255586624146, "rewards/margins": 0.2667389214038849, "rewards/rejected": -0.38322147727012634, "step": 20 }, { "epoch": 0.0035, "grad_norm": 168.50338745117188, "learning_rate": 2.3333333333333334e-08, "logits/chosen": 2.5207319259643555, "logits/rejected": 2.1851301193237305, "logps/chosen": -83.29013061523438, "logps/rejected": -35.01243209838867, "loss": 1.8059, "nll_loss": 1.028273105621338, "rewards/accuracies": 1.0, "rewards/chosen": -0.02760009840130806, "rewards/margins": 0.11161576211452484, "rewards/rejected": -0.1392158567905426, "step": 21 }, { "epoch": 0.0036666666666666666, "grad_norm": 216.2969970703125, "learning_rate": 2.4444444444444444e-08, "logits/chosen": 2.015916347503662, "logits/rejected": 2.400761842727661, "logps/chosen": -67.39773559570312, "logps/rejected": -87.9083023071289, "loss": 2.0434, "nll_loss": 1.0870604515075684, "rewards/accuracies": 0.0, "rewards/chosen": -0.033087920397520065, "rewards/margins": -0.23580704629421234, "rewards/rejected": 0.20271912217140198, "step": 22 }, { "epoch": 0.003833333333333333, "grad_norm": 231.1486358642578, "learning_rate": 2.5555555555555554e-08, "logits/chosen": 3.019589424133301, "logits/rejected": 3.217968463897705, "logps/chosen": -108.0939712524414, "logps/rejected": -357.65679931640625, "loss": 1.599, "nll_loss": 0.8379378318786621, "rewards/accuracies": 1.0, "rewards/chosen": 0.08991318941116333, "rewards/margins": 0.1462608426809311, "rewards/rejected": -0.05634765699505806, "step": 23 }, { "epoch": 0.004, "grad_norm": 282.3943176269531, "learning_rate": 2.6666666666666664e-08, "logits/chosen": 3.039409637451172, "logits/rejected": 3.2221741676330566, "logps/chosen": -64.61595153808594, "logps/rejected": -221.41775512695312, "loss": 1.7875, "nll_loss": 1.0256497859954834, "rewards/accuracies": 1.0, "rewards/chosen": 0.032891083508729935, "rewards/margins": 0.14461593329906464, "rewards/rejected": -0.111724853515625, "step": 24 }, { "epoch": 0.004166666666666667, "grad_norm": 175.44248962402344, "learning_rate": 2.7777777777777777e-08, "logits/chosen": 2.002192497253418, "logits/rejected": 1.663749098777771, "logps/chosen": -75.57332611083984, "logps/rejected": -38.05303192138672, "loss": 1.7314, "nll_loss": 0.8491385579109192, "rewards/accuracies": 0.0, "rewards/chosen": -0.049925997853279114, "rewards/margins": -0.09871827065944672, "rewards/rejected": 0.048792269080877304, "step": 25 }, { "epoch": 0.004333333333333333, "grad_norm": 194.08187866210938, "learning_rate": 2.8888888888888884e-08, "logits/chosen": 2.7771434783935547, "logits/rejected": 2.6261022090911865, "logps/chosen": -109.53401947021484, "logps/rejected": -76.3560562133789, "loss": 1.9133, "nll_loss": 1.1176942586898804, "rewards/accuracies": 1.0, "rewards/chosen": 0.20231857895851135, "rewards/margins": 0.07631149888038635, "rewards/rejected": 0.126007080078125, "step": 26 }, { "epoch": 0.0045, "grad_norm": 231.50027465820312, "learning_rate": 3e-08, "logits/chosen": 2.521339178085327, "logits/rejected": 2.702133893966675, "logps/chosen": -72.98348236083984, "logps/rejected": -120.30801391601562, "loss": 1.8856, "nll_loss": 0.9238415360450745, "rewards/accuracies": 0.0, "rewards/chosen": -0.07568206638097763, "rewards/margins": -0.24610671401023865, "rewards/rejected": 0.1704246550798416, "step": 27 }, { "epoch": 0.004666666666666667, "grad_norm": 155.0140838623047, "learning_rate": 3.111111111111111e-08, "logits/chosen": 3.05564546585083, "logits/rejected": 3.1009738445281982, "logps/chosen": -33.15594482421875, "logps/rejected": -23.815095901489258, "loss": 1.7719, "nll_loss": 1.004725456237793, "rewards/accuracies": 1.0, "rewards/chosen": 0.08814316242933273, "rewards/margins": 0.13320675492286682, "rewards/rejected": -0.04506359249353409, "step": 28 }, { "epoch": 0.004833333333333334, "grad_norm": 152.20993041992188, "learning_rate": 3.2222222222222224e-08, "logits/chosen": 2.7657313346862793, "logits/rejected": 3.043818950653076, "logps/chosen": -78.63709259033203, "logps/rejected": -63.8984489440918, "loss": 3.261, "nll_loss": 2.6212360858917236, "rewards/accuracies": 1.0, "rewards/chosen": 0.036043550819158554, "rewards/margins": 0.4278728663921356, "rewards/rejected": -0.39182931184768677, "step": 29 }, { "epoch": 0.005, "grad_norm": 169.27374267578125, "learning_rate": 3.333333333333333e-08, "logits/chosen": 2.505549669265747, "logits/rejected": 2.8577237129211426, "logps/chosen": -51.23637390136719, "logps/rejected": -177.71023559570312, "loss": 1.5952, "nll_loss": 0.8539394736289978, "rewards/accuracies": 1.0, "rewards/chosen": 0.019415665417909622, "rewards/margins": 0.1895267516374588, "rewards/rejected": -0.17011108994483948, "step": 30 }, { "epoch": 0.005166666666666667, "grad_norm": 280.47967529296875, "learning_rate": 3.4444444444444444e-08, "logits/chosen": 3.18043851852417, "logits/rejected": 3.325166940689087, "logps/chosen": -92.90371704101562, "logps/rejected": -160.77647399902344, "loss": 1.907, "nll_loss": 0.9479970335960388, "rewards/accuracies": 0.0, "rewards/chosen": 0.09777221828699112, "rewards/margins": -0.23714753985404968, "rewards/rejected": 0.3349197506904602, "step": 31 }, { "epoch": 0.005333333333333333, "grad_norm": 214.8060760498047, "learning_rate": 3.5555555555555554e-08, "logits/chosen": 2.5925705432891846, "logits/rejected": 2.7316219806671143, "logps/chosen": -100.91081237792969, "logps/rejected": -170.0988006591797, "loss": 1.8212, "nll_loss": 1.1733816862106323, "rewards/accuracies": 1.0, "rewards/chosen": 0.028192901983857155, "rewards/margins": 0.40802842378616333, "rewards/rejected": -0.3798355162143707, "step": 32 }, { "epoch": 0.0055, "grad_norm": 185.21701049804688, "learning_rate": 3.6666666666666664e-08, "logits/chosen": 3.2050602436065674, "logits/rejected": 3.297877311706543, "logps/chosen": -79.685302734375, "logps/rejected": -173.41860961914062, "loss": 2.5251, "nll_loss": 1.6601104736328125, "rewards/accuracies": 0.0, "rewards/chosen": -0.05666504055261612, "rewards/margins": -0.06541595607995987, "rewards/rejected": 0.00875091552734375, "step": 33 }, { "epoch": 0.005666666666666667, "grad_norm": 253.46820068359375, "learning_rate": 3.7777777777777774e-08, "logits/chosen": 3.011021375656128, "logits/rejected": 2.901956081390381, "logps/chosen": -117.71678161621094, "logps/rejected": -37.98039627075195, "loss": 2.0859, "nll_loss": 1.1890584230422974, "rewards/accuracies": 0.0, "rewards/chosen": -0.2073570340871811, "rewards/margins": -0.12470932304859161, "rewards/rejected": -0.08264771103858948, "step": 34 }, { "epoch": 0.005833333333333334, "grad_norm": 200.3080596923828, "learning_rate": 3.888888888888889e-08, "logits/chosen": 1.7869467735290527, "logits/rejected": 2.8905069828033447, "logps/chosen": -48.95701599121094, "logps/rejected": -161.82220458984375, "loss": 1.5178, "nll_loss": 0.611962616443634, "rewards/accuracies": 0.0, "rewards/chosen": -0.0272369384765625, "rewards/margins": -0.14314576983451843, "rewards/rejected": 0.11590882390737534, "step": 35 }, { "epoch": 0.006, "grad_norm": 260.6243591308594, "learning_rate": 4e-08, "logits/chosen": 2.166992425918579, "logits/rejected": 2.534665107727051, "logps/chosen": -39.96133041381836, "logps/rejected": -441.6473388671875, "loss": 3.1425, "nll_loss": 2.3506662845611572, "rewards/accuracies": 1.0, "rewards/chosen": 0.025893403217196465, "rewards/margins": 0.08133164048194885, "rewards/rejected": -0.05543823540210724, "step": 36 }, { "epoch": 0.006166666666666667, "grad_norm": 142.2858428955078, "learning_rate": 4.1111111111111104e-08, "logits/chosen": 2.9700474739074707, "logits/rejected": 2.9130308628082275, "logps/chosen": -22.65979766845703, "logps/rejected": -11.829797744750977, "loss": 2.0219, "nll_loss": 1.1926209926605225, "rewards/accuracies": 1.0, "rewards/chosen": 0.005849075503647327, "rewards/margins": 0.004925346467643976, "rewards/rejected": 0.0009237290360033512, "step": 37 }, { "epoch": 0.006333333333333333, "grad_norm": 288.2447509765625, "learning_rate": 4.222222222222222e-08, "logits/chosen": 2.8603522777557373, "logits/rejected": 3.180778741836548, "logps/chosen": -99.25495910644531, "logps/rejected": -351.75439453125, "loss": 2.2869, "nll_loss": 1.4596319198608398, "rewards/accuracies": 1.0, "rewards/chosen": 0.1330009549856186, "rewards/margins": 0.010866552591323853, "rewards/rejected": 0.12213440239429474, "step": 38 }, { "epoch": 0.0065, "grad_norm": 236.0635986328125, "learning_rate": 4.333333333333333e-08, "logits/chosen": 3.576676845550537, "logits/rejected": 3.724644184112549, "logps/chosen": -120.8401870727539, "logps/rejected": -177.63478088378906, "loss": 2.1801, "nll_loss": 1.313480257987976, "rewards/accuracies": 0.0, "rewards/chosen": -0.15712814033031464, "rewards/margins": -0.06736831367015839, "rewards/rejected": -0.08975982666015625, "step": 39 }, { "epoch": 0.006666666666666667, "grad_norm": 112.04328155517578, "learning_rate": 4.444444444444444e-08, "logits/chosen": 1.945117712020874, "logits/rejected": 2.0083694458007812, "logps/chosen": -20.639511108398438, "logps/rejected": -11.927247047424316, "loss": 1.5405, "nll_loss": 0.6879836916923523, "rewards/accuracies": 0.0, "rewards/chosen": -0.028591729700565338, "rewards/margins": -0.04111070930957794, "rewards/rejected": 0.01251897867769003, "step": 40 }, { "epoch": 0.006833333333333334, "grad_norm": 232.28395080566406, "learning_rate": 4.555555555555555e-08, "logits/chosen": 2.984529972076416, "logits/rejected": 2.8054134845733643, "logps/chosen": -161.3368377685547, "logps/rejected": -44.903717041015625, "loss": 1.6437, "nll_loss": 0.8491412401199341, "rewards/accuracies": 1.0, "rewards/chosen": -0.0042327879928052425, "rewards/margins": 0.07593917846679688, "rewards/rejected": -0.08017196506261826, "step": 41 }, { "epoch": 0.007, "grad_norm": 202.77276611328125, "learning_rate": 4.666666666666667e-08, "logits/chosen": 2.564053535461426, "logits/rejected": 2.325129270553589, "logps/chosen": -123.55107879638672, "logps/rejected": -62.30998229980469, "loss": 2.3109, "nll_loss": 1.4885672330856323, "rewards/accuracies": 1.0, "rewards/chosen": 0.1904701292514801, "rewards/margins": 0.022282034158706665, "rewards/rejected": 0.16818809509277344, "step": 42 }, { "epoch": 0.007166666666666667, "grad_norm": 117.3135757446289, "learning_rate": 4.777777777777778e-08, "logits/chosen": 2.6161255836486816, "logits/rejected": 2.5946872234344482, "logps/chosen": -28.58790397644043, "logps/rejected": -46.11754608154297, "loss": 1.6342, "nll_loss": 0.8167973160743713, "rewards/accuracies": 1.0, "rewards/chosen": -0.024985123425722122, "rewards/margins": 0.029111098498106003, "rewards/rejected": -0.054096221923828125, "step": 43 }, { "epoch": 0.007333333333333333, "grad_norm": 393.46832275390625, "learning_rate": 4.888888888888889e-08, "logits/chosen": 2.9191834926605225, "logits/rejected": 2.9039618968963623, "logps/chosen": -91.03946685791016, "logps/rejected": -120.32989501953125, "loss": 1.684, "nll_loss": 1.0710524320602417, "rewards/accuracies": 1.0, "rewards/chosen": -0.021338656544685364, "rewards/margins": 0.5000015497207642, "rewards/rejected": -0.5213401913642883, "step": 44 }, { "epoch": 0.0075, "grad_norm": 277.556640625, "learning_rate": 5e-08, "logits/chosen": 1.6976217031478882, "logits/rejected": 1.9335415363311768, "logps/chosen": -138.96316528320312, "logps/rejected": -134.65330505371094, "loss": 1.9552, "nll_loss": 1.0607876777648926, "rewards/accuracies": 0.0, "rewards/chosen": 0.14563599228858948, "rewards/margins": -0.11773985624313354, "rewards/rejected": 0.263375848531723, "step": 45 }, { "epoch": 0.007666666666666666, "grad_norm": 117.48649597167969, "learning_rate": 5.111111111111111e-08, "logits/chosen": 2.4614570140838623, "logits/rejected": 2.247962474822998, "logps/chosen": -32.15388107299805, "logps/rejected": -34.03590774536133, "loss": 1.5534, "nll_loss": 0.6989974975585938, "rewards/accuracies": 0.0, "rewards/chosen": -0.03829498216509819, "rewards/margins": -0.04480629041790962, "rewards/rejected": 0.00651130685582757, "step": 46 }, { "epoch": 0.007833333333333333, "grad_norm": 276.57275390625, "learning_rate": 5.2222222222222224e-08, "logits/chosen": 3.201129674911499, "logits/rejected": 3.203169345855713, "logps/chosen": -46.44024658203125, "logps/rejected": -91.53710174560547, "loss": 1.8209, "nll_loss": 0.9675052165985107, "rewards/accuracies": 0.0, "rewards/chosen": -0.05610504746437073, "rewards/margins": -0.0428161695599556, "rewards/rejected": -0.01328887976706028, "step": 47 }, { "epoch": 0.008, "grad_norm": 229.49099731445312, "learning_rate": 5.333333333333333e-08, "logits/chosen": 2.214189291000366, "logits/rejected": 2.892519950866699, "logps/chosen": -110.546142578125, "logps/rejected": -248.78091430664062, "loss": 1.8949, "nll_loss": 1.004964828491211, "rewards/accuracies": 0.0, "rewards/chosen": 0.04725876450538635, "rewards/margins": -0.11231157183647156, "rewards/rejected": 0.1595703363418579, "step": 48 }, { "epoch": 0.008166666666666666, "grad_norm": 151.86407470703125, "learning_rate": 5.444444444444444e-08, "logits/chosen": 3.156470537185669, "logits/rejected": 3.124194622039795, "logps/chosen": -24.03297996520996, "logps/rejected": -31.637859344482422, "loss": 2.0302, "nll_loss": 1.1444278955459595, "rewards/accuracies": 0.0, "rewards/chosen": -0.027631569653749466, "rewards/margins": -0.10533466935157776, "rewards/rejected": 0.077703095972538, "step": 49 }, { "epoch": 0.008333333333333333, "grad_norm": 267.846923828125, "learning_rate": 5.5555555555555555e-08, "logits/chosen": 2.620399236679077, "logits/rejected": 2.7185497283935547, "logps/chosen": -47.057498931884766, "logps/rejected": -87.27135467529297, "loss": 1.8503, "nll_loss": 1.147743821144104, "rewards/accuracies": 1.0, "rewards/chosen": 0.11878395080566406, "rewards/margins": 0.27556419372558594, "rewards/rejected": -0.15678024291992188, "step": 50 }, { "epoch": 0.0085, "grad_norm": 134.5943603515625, "learning_rate": 5.6666666666666665e-08, "logits/chosen": 3.216923236846924, "logits/rejected": 3.119548797607422, "logps/chosen": -51.64862823486328, "logps/rejected": -15.907742500305176, "loss": 1.7027, "nll_loss": 0.9564561247825623, "rewards/accuracies": 1.0, "rewards/chosen": 0.1601196527481079, "rewards/margins": 0.1787419617176056, "rewards/rejected": -0.018622303381562233, "step": 51 }, { "epoch": 0.008666666666666666, "grad_norm": 212.8755340576172, "learning_rate": 5.777777777777777e-08, "logits/chosen": 3.0656898021698, "logits/rejected": 3.0704853534698486, "logps/chosen": -38.53984069824219, "logps/rejected": -104.42578887939453, "loss": 2.3038, "nll_loss": 1.4823014736175537, "rewards/accuracies": 1.0, "rewards/chosen": 0.019573213532567024, "rewards/margins": 0.02065124735236168, "rewards/rejected": -0.0010780334705486894, "step": 52 }, { "epoch": 0.008833333333333334, "grad_norm": 232.81642150878906, "learning_rate": 5.888888888888889e-08, "logits/chosen": 2.463435649871826, "logits/rejected": 2.2170066833496094, "logps/chosen": -70.42146301269531, "logps/rejected": -57.685302734375, "loss": 1.9248, "nll_loss": 0.9780758619308472, "rewards/accuracies": 0.0, "rewards/chosen": -0.0005119323614053428, "rewards/margins": -0.2179737240076065, "rewards/rejected": 0.21746179461479187, "step": 53 }, { "epoch": 0.009, "grad_norm": 167.91909790039062, "learning_rate": 6e-08, "logits/chosen": 3.256500244140625, "logits/rejected": 3.2144644260406494, "logps/chosen": -55.45177459716797, "logps/rejected": -65.93903350830078, "loss": 1.5292, "nll_loss": 0.7201528549194336, "rewards/accuracies": 1.0, "rewards/chosen": 0.01788940466940403, "rewards/margins": 0.045871734619140625, "rewards/rejected": -0.027982329949736595, "step": 54 }, { "epoch": 0.009166666666666667, "grad_norm": 194.2391357421875, "learning_rate": 6.111111111111111e-08, "logits/chosen": 3.247015953063965, "logits/rejected": 3.5081820487976074, "logps/chosen": -51.10954284667969, "logps/rejected": -167.77410888671875, "loss": 1.6156, "nll_loss": 0.8243473172187805, "rewards/accuracies": 1.0, "rewards/chosen": 0.02784271538257599, "rewards/margins": 0.08267365396022797, "rewards/rejected": -0.05483093857765198, "step": 55 }, { "epoch": 0.009333333333333334, "grad_norm": 271.230224609375, "learning_rate": 6.222222222222221e-08, "logits/chosen": 1.5327719449996948, "logits/rejected": 2.907160520553589, "logps/chosen": -88.69013214111328, "logps/rejected": -394.5796813964844, "loss": 1.7182, "nll_loss": 0.8527896404266357, "rewards/accuracies": 0.0, "rewards/chosen": 0.040679167956113815, "rewards/margins": -0.06576310098171234, "rewards/rejected": 0.10644226521253586, "step": 56 }, { "epoch": 0.0095, "grad_norm": 239.40213012695312, "learning_rate": 6.333333333333333e-08, "logits/chosen": 1.2255479097366333, "logits/rejected": 3.050295829772949, "logps/chosen": -47.48216247558594, "logps/rejected": -417.1323547363281, "loss": 1.6972, "nll_loss": 0.8792992830276489, "rewards/accuracies": 1.0, "rewards/chosen": -0.025505831465125084, "rewards/margins": 0.028134917840361595, "rewards/rejected": -0.05364074930548668, "step": 57 }, { "epoch": 0.009666666666666667, "grad_norm": 169.29771423339844, "learning_rate": 6.444444444444445e-08, "logits/chosen": 2.773747682571411, "logits/rejected": 2.0231316089630127, "logps/chosen": -106.29412078857422, "logps/rejected": -51.86308670043945, "loss": 2.0351, "nll_loss": 1.250519037246704, "rewards/accuracies": 1.0, "rewards/chosen": 0.1336662322282791, "rewards/margins": 0.09724883735179901, "rewards/rejected": 0.036417391151189804, "step": 58 }, { "epoch": 0.009833333333333333, "grad_norm": 176.5576629638672, "learning_rate": 6.555555555555555e-08, "logits/chosen": 3.255034923553467, "logits/rejected": 3.0686745643615723, "logps/chosen": -97.35493469238281, "logps/rejected": -60.09547424316406, "loss": 1.9002, "nll_loss": 1.024788737297058, "rewards/accuracies": 0.0, "rewards/chosen": -0.009600830264389515, "rewards/margins": -0.08560753613710403, "rewards/rejected": 0.0760067030787468, "step": 59 }, { "epoch": 0.01, "grad_norm": 209.61688232421875, "learning_rate": 6.666666666666665e-08, "logits/chosen": 2.589618444442749, "logits/rejected": 2.626617431640625, "logps/chosen": -90.06629943847656, "logps/rejected": -122.640380859375, "loss": 1.7898, "nll_loss": 0.9285186529159546, "rewards/accuracies": 0.0, "rewards/chosen": -0.01835784874856472, "rewards/margins": -0.05831756442785263, "rewards/rejected": 0.03995971754193306, "step": 60 }, { "epoch": 0.010166666666666666, "grad_norm": 135.76480102539062, "learning_rate": 6.777777777777778e-08, "logits/chosen": 3.258664608001709, "logits/rejected": 3.2263896465301514, "logps/chosen": -47.74639129638672, "logps/rejected": -35.63920211791992, "loss": 1.9463, "nll_loss": 1.1103812456130981, "rewards/accuracies": 0.0, "rewards/chosen": 0.04995880275964737, "rewards/margins": -0.007923506200313568, "rewards/rejected": 0.05788230895996094, "step": 61 }, { "epoch": 0.010333333333333333, "grad_norm": 169.6089324951172, "learning_rate": 6.888888888888889e-08, "logits/chosen": 2.301638126373291, "logits/rejected": 2.6921496391296387, "logps/chosen": -83.30186462402344, "logps/rejected": -80.855712890625, "loss": 1.8617, "nll_loss": 1.0679727792739868, "rewards/accuracies": 1.0, "rewards/chosen": 0.02286377176642418, "rewards/margins": 0.07739105820655823, "rewards/rejected": -0.05452728271484375, "step": 62 }, { "epoch": 0.0105, "grad_norm": 214.21759033203125, "learning_rate": 6.999999999999999e-08, "logits/chosen": 3.0224545001983643, "logits/rejected": 3.278524160385132, "logps/chosen": -66.2686996459961, "logps/rejected": -141.45840454101562, "loss": 1.6922, "nll_loss": 0.8955230712890625, "rewards/accuracies": 1.0, "rewards/chosen": -0.013559723272919655, "rewards/margins": 0.07167588174343109, "rewards/rejected": -0.0852356031537056, "step": 63 }, { "epoch": 0.010666666666666666, "grad_norm": 242.8707733154297, "learning_rate": 7.111111111111111e-08, "logits/chosen": 2.733693838119507, "logits/rejected": 3.105349540710449, "logps/chosen": -64.84800720214844, "logps/rejected": -306.9703369140625, "loss": 1.9943, "nll_loss": 1.2235474586486816, "rewards/accuracies": 1.0, "rewards/chosen": 0.07231750339269638, "rewards/margins": 0.12564392387866974, "rewards/rejected": -0.05332641676068306, "step": 64 }, { "epoch": 0.010833333333333334, "grad_norm": 867.9107055664062, "learning_rate": 7.222222222222221e-08, "logits/chosen": 2.0995521545410156, "logits/rejected": 2.664541482925415, "logps/chosen": -108.47883605957031, "logps/rejected": -338.28521728515625, "loss": 2.0124, "nll_loss": 1.1069267988204956, "rewards/accuracies": 0.0, "rewards/chosen": 0.04344482347369194, "rewards/margins": -0.1417541652917862, "rewards/rejected": 0.18519899249076843, "step": 65 }, { "epoch": 0.011, "grad_norm": 175.87530517578125, "learning_rate": 7.333333333333333e-08, "logits/chosen": 2.42016339302063, "logits/rejected": 2.1156764030456543, "logps/chosen": -95.10767364501953, "logps/rejected": -11.217387199401855, "loss": 1.8121, "nll_loss": 1.0117837190628052, "rewards/accuracies": 1.0, "rewards/chosen": 0.032445527613162994, "rewards/margins": 0.06379175186157227, "rewards/rejected": -0.03134622797369957, "step": 66 }, { "epoch": 0.011166666666666667, "grad_norm": 145.50172424316406, "learning_rate": 7.444444444444444e-08, "logits/chosen": 2.6021857261657715, "logits/rejected": 2.260584831237793, "logps/chosen": -36.722408294677734, "logps/rejected": -8.43846607208252, "loss": 2.0576, "nll_loss": 1.1475753784179688, "rewards/accuracies": 0.0, "rewards/chosen": -0.060387421399354935, "rewards/margins": -0.15132589638233185, "rewards/rejected": 0.09093847125768661, "step": 67 }, { "epoch": 0.011333333333333334, "grad_norm": 181.60580444335938, "learning_rate": 7.555555555555555e-08, "logits/chosen": 2.7372963428497314, "logits/rejected": 3.038727045059204, "logps/chosen": -122.35691833496094, "logps/rejected": -165.6352996826172, "loss": 1.946, "nll_loss": 1.1543105840682983, "rewards/accuracies": 1.0, "rewards/chosen": 0.02885284647345543, "rewards/margins": 0.08175355195999146, "rewards/rejected": -0.05290070176124573, "step": 68 }, { "epoch": 0.0115, "grad_norm": 167.68885803222656, "learning_rate": 7.666666666666666e-08, "logits/chosen": 2.4517030715942383, "logits/rejected": 2.515162229537964, "logps/chosen": -38.59302520751953, "logps/rejected": -52.457069396972656, "loss": 2.0609, "nll_loss": 1.286434292793274, "rewards/accuracies": 1.0, "rewards/chosen": 0.0146484375, "rewards/margins": 0.1179020032286644, "rewards/rejected": -0.1032535657286644, "step": 69 }, { "epoch": 0.011666666666666667, "grad_norm": 235.77745056152344, "learning_rate": 7.777777777777778e-08, "logits/chosen": 2.416735887527466, "logits/rejected": 3.0721657276153564, "logps/chosen": -74.47631072998047, "logps/rejected": -151.45529174804688, "loss": 1.5253, "nll_loss": 0.7839610576629639, "rewards/accuracies": 1.0, "rewards/chosen": 0.09101792424917221, "rewards/margins": 0.18897172808647156, "rewards/rejected": -0.09795379638671875, "step": 70 }, { "epoch": 0.011833333333333333, "grad_norm": 181.74412536621094, "learning_rate": 7.888888888888889e-08, "logits/chosen": 3.5104193687438965, "logits/rejected": 3.514213800430298, "logps/chosen": -31.519020080566406, "logps/rejected": -93.0535888671875, "loss": 1.5358, "nll_loss": 0.7004227042198181, "rewards/accuracies": 0.0, "rewards/chosen": -0.006056404206901789, "rewards/margins": -0.007242012303322554, "rewards/rejected": 0.001185607980005443, "step": 71 }, { "epoch": 0.012, "grad_norm": 185.22471618652344, "learning_rate": 8e-08, "logits/chosen": 2.608513593673706, "logits/rejected": 1.9609630107879639, "logps/chosen": -90.37042236328125, "logps/rejected": -13.783026695251465, "loss": 1.8755, "nll_loss": 1.026936650276184, "rewards/accuracies": 0.0, "rewards/chosen": 0.02074890211224556, "rewards/margins": -0.033127497881650925, "rewards/rejected": 0.053876399993896484, "step": 72 }, { "epoch": 0.012166666666666666, "grad_norm": 238.00843811035156, "learning_rate": 8.11111111111111e-08, "logits/chosen": 2.455630302429199, "logits/rejected": 2.9150664806365967, "logps/chosen": -82.27204895019531, "logps/rejected": -159.08372497558594, "loss": 2.1918, "nll_loss": 1.3269686698913574, "rewards/accuracies": 0.0, "rewards/chosen": -0.0017784119118005037, "rewards/margins": -0.0651756301522255, "rewards/rejected": 0.06339722126722336, "step": 73 }, { "epoch": 0.012333333333333333, "grad_norm": 246.30258178710938, "learning_rate": 8.222222222222221e-08, "logits/chosen": 2.920456647872925, "logits/rejected": 3.0246081352233887, "logps/chosen": -83.34988403320312, "logps/rejected": -59.50188446044922, "loss": 3.1508, "nll_loss": 2.315274477005005, "rewards/accuracies": 0.0, "rewards/chosen": -0.037824250757694244, "rewards/margins": -0.007318498566746712, "rewards/rejected": -0.030505752190947533, "step": 74 }, { "epoch": 0.0125, "grad_norm": 218.02627563476562, "learning_rate": 8.333333333333334e-08, "logits/chosen": 0.6974374651908875, "logits/rejected": 2.311278820037842, "logps/chosen": -52.73231506347656, "logps/rejected": -192.2121124267578, "loss": 2.5078, "nll_loss": 1.7577437162399292, "rewards/accuracies": 1.0, "rewards/chosen": -0.0266494769603014, "rewards/margins": 0.1712799072265625, "rewards/rejected": -0.19792938232421875, "step": 75 }, { "epoch": 0.012666666666666666, "grad_norm": 170.86270141601562, "learning_rate": 8.444444444444444e-08, "logits/chosen": 2.1462464332580566, "logits/rejected": 1.7793136835098267, "logps/chosen": -75.47627258300781, "logps/rejected": -65.03053283691406, "loss": 1.5012, "nll_loss": 0.6289690732955933, "rewards/accuracies": 0.0, "rewards/chosen": 0.02855071984231472, "rewards/margins": -0.07898864895105362, "rewards/rejected": 0.10753937065601349, "step": 76 }, { "epoch": 0.012833333333333334, "grad_norm": 282.6992492675781, "learning_rate": 8.555555555555555e-08, "logits/chosen": 3.0818750858306885, "logits/rejected": 3.4587831497192383, "logps/chosen": -35.17906951904297, "logps/rejected": -250.65235900878906, "loss": 3.7961, "nll_loss": 2.931589365005493, "rewards/accuracies": 0.0, "rewards/chosen": -0.06251907348632812, "rewards/margins": -0.06436081230640411, "rewards/rejected": 0.001841735909692943, "step": 77 }, { "epoch": 0.013, "grad_norm": 184.07701110839844, "learning_rate": 8.666666666666666e-08, "logits/chosen": 2.554391384124756, "logits/rejected": 2.560436248779297, "logps/chosen": -57.18570327758789, "logps/rejected": -23.40809440612793, "loss": 3.027, "nll_loss": 2.1994500160217285, "rewards/accuracies": 1.0, "rewards/chosen": 0.028501130640506744, "rewards/margins": 0.008498763665556908, "rewards/rejected": 0.020002366974949837, "step": 78 }, { "epoch": 0.013166666666666667, "grad_norm": 327.97271728515625, "learning_rate": 8.777777777777778e-08, "logits/chosen": 0.8347261548042297, "logits/rejected": 2.0137133598327637, "logps/chosen": -83.75889587402344, "logps/rejected": -303.6317443847656, "loss": 1.918, "nll_loss": 1.087778091430664, "rewards/accuracies": 1.0, "rewards/chosen": -0.1947929561138153, "rewards/margins": 0.006974011659622192, "rewards/rejected": -0.2017669677734375, "step": 79 }, { "epoch": 0.013333333333333334, "grad_norm": 191.96640014648438, "learning_rate": 8.888888888888888e-08, "logits/chosen": 1.9949308633804321, "logits/rejected": 2.71652889251709, "logps/chosen": -19.09813117980957, "logps/rejected": -80.04209899902344, "loss": 2.3314, "nll_loss": 1.4690868854522705, "rewards/accuracies": 0.0, "rewards/chosen": -0.04024658352136612, "rewards/margins": -0.06022949516773224, "rewards/rejected": 0.01998291164636612, "step": 80 }, { "epoch": 0.0135, "grad_norm": 141.84214782714844, "learning_rate": 9e-08, "logits/chosen": 3.0251002311706543, "logits/rejected": 2.5951991081237793, "logps/chosen": -69.50809478759766, "logps/rejected": -71.02533721923828, "loss": 1.7132, "nll_loss": 0.9392985701560974, "rewards/accuracies": 1.0, "rewards/chosen": 0.10336380451917648, "rewards/margins": 0.11906281113624573, "rewards/rejected": -0.015699004754424095, "step": 81 }, { "epoch": 0.013666666666666667, "grad_norm": 256.6083068847656, "learning_rate": 9.11111111111111e-08, "logits/chosen": 3.3780384063720703, "logits/rejected": 3.438462257385254, "logps/chosen": -106.84634399414062, "logps/rejected": -139.2332305908203, "loss": 1.9922, "nll_loss": 1.1366631984710693, "rewards/accuracies": 0.0, "rewards/chosen": -0.05157165601849556, "rewards/margins": -0.04692993313074112, "rewards/rejected": -0.004641723819077015, "step": 82 }, { "epoch": 0.013833333333333333, "grad_norm": 193.1544952392578, "learning_rate": 9.222222222222222e-08, "logits/chosen": 2.9858343601226807, "logits/rejected": 3.268052339553833, "logps/chosen": -104.731689453125, "logps/rejected": -209.25071716308594, "loss": 1.8952, "nll_loss": 1.13838791847229, "rewards/accuracies": 1.0, "rewards/chosen": -0.06202392652630806, "rewards/margins": 0.15740816295146942, "rewards/rejected": -0.21943208575248718, "step": 83 }, { "epoch": 0.014, "grad_norm": 253.61160278320312, "learning_rate": 9.333333333333334e-08, "logits/chosen": 2.8024284839630127, "logits/rejected": 3.021498203277588, "logps/chosen": -38.2917366027832, "logps/rejected": -372.9683532714844, "loss": 2.0809, "nll_loss": 1.4727591276168823, "rewards/accuracies": 1.0, "rewards/chosen": 0.035192109644412994, "rewards/margins": 0.5098259449005127, "rewards/rejected": -0.4746338129043579, "step": 84 }, { "epoch": 0.014166666666666666, "grad_norm": 130.2939910888672, "learning_rate": 9.444444444444444e-08, "logits/chosen": 2.6673855781555176, "logits/rejected": 2.235053777694702, "logps/chosen": -41.8231201171875, "logps/rejected": -48.37190628051758, "loss": 1.7248, "nll_loss": 1.0200761556625366, "rewards/accuracies": 1.0, "rewards/chosen": 0.17696154117584229, "rewards/margins": 0.2707100212574005, "rewards/rejected": -0.09374848008155823, "step": 85 }, { "epoch": 0.014333333333333333, "grad_norm": 222.80853271484375, "learning_rate": 9.555555555555556e-08, "logits/chosen": 2.357757329940796, "logits/rejected": 2.3068268299102783, "logps/chosen": -190.26283264160156, "logps/rejected": -196.93002319335938, "loss": 1.7616, "nll_loss": 0.9560947418212891, "rewards/accuracies": 1.0, "rewards/chosen": -0.03838043287396431, "rewards/margins": 0.05356598272919655, "rewards/rejected": -0.09194641560316086, "step": 86 }, { "epoch": 0.0145, "grad_norm": 175.67701721191406, "learning_rate": 9.666666666666666e-08, "logits/chosen": 3.0488550662994385, "logits/rejected": 3.1595451831817627, "logps/chosen": -66.31192016601562, "logps/rejected": -199.8951416015625, "loss": 1.576, "nll_loss": 0.8611937761306763, "rewards/accuracies": 1.0, "rewards/chosen": -0.07753600925207138, "rewards/margins": 0.2520752251148224, "rewards/rejected": -0.32961124181747437, "step": 87 }, { "epoch": 0.014666666666666666, "grad_norm": 187.7196044921875, "learning_rate": 9.777777777777778e-08, "logits/chosen": 2.704042434692383, "logits/rejected": 2.8199167251586914, "logps/chosen": -96.08419799804688, "logps/rejected": -114.35616302490234, "loss": 2.0663, "nll_loss": 1.2811226844787598, "rewards/accuracies": 1.0, "rewards/chosen": -0.004177856724709272, "rewards/margins": 0.09546585381031036, "rewards/rejected": -0.09964370727539062, "step": 88 }, { "epoch": 0.014833333333333334, "grad_norm": 197.9832000732422, "learning_rate": 9.888888888888889e-08, "logits/chosen": 4.056828022003174, "logits/rejected": 4.3839592933654785, "logps/chosen": -56.992095947265625, "logps/rejected": -177.77735900878906, "loss": 1.7892, "nll_loss": 0.9659677147865295, "rewards/accuracies": 1.0, "rewards/chosen": 0.02671661600470543, "rewards/margins": 0.01712951809167862, "rewards/rejected": 0.009587096981704235, "step": 89 }, { "epoch": 0.015, "grad_norm": 230.8881072998047, "learning_rate": 1e-07, "logits/chosen": 2.7862634658813477, "logits/rejected": 2.9252092838287354, "logps/chosen": -79.44670867919922, "logps/rejected": -123.33193969726562, "loss": 1.7402, "nll_loss": 0.8926596641540527, "rewards/accuracies": 0.0, "rewards/chosen": -0.06865005940198898, "rewards/margins": -0.031087499111890793, "rewards/rejected": -0.03756256029009819, "step": 90 }, { "epoch": 0.015166666666666667, "grad_norm": 224.1750030517578, "learning_rate": 1.011111111111111e-07, "logits/chosen": 2.2100412845611572, "logits/rejected": 2.3485186100006104, "logps/chosen": -93.11880493164062, "logps/rejected": -209.0218963623047, "loss": 1.7346, "nll_loss": 0.9311881065368652, "rewards/accuracies": 1.0, "rewards/chosen": -0.05854645371437073, "rewards/margins": 0.05808716267347336, "rewards/rejected": -0.11663361638784409, "step": 91 }, { "epoch": 0.015333333333333332, "grad_norm": 154.50111389160156, "learning_rate": 1.0222222222222222e-07, "logits/chosen": 3.23179030418396, "logits/rejected": 3.0962846279144287, "logps/chosen": -58.969329833984375, "logps/rejected": -38.89338302612305, "loss": 1.6629, "nll_loss": 0.8305540084838867, "rewards/accuracies": 0.0, "rewards/chosen": -0.08133430778980255, "rewards/margins": -0.0005271881818771362, "rewards/rejected": -0.08080711960792542, "step": 92 }, { "epoch": 0.0155, "grad_norm": 204.7209014892578, "learning_rate": 1.0333333333333335e-07, "logits/chosen": 2.816871404647827, "logits/rejected": 3.1686620712280273, "logps/chosen": -76.14012145996094, "logps/rejected": -327.57208251953125, "loss": 1.7808, "nll_loss": 1.0723960399627686, "rewards/accuracies": 1.0, "rewards/chosen": 0.02704162895679474, "rewards/margins": 0.263174444437027, "rewards/rejected": -0.23613281548023224, "step": 93 }, { "epoch": 0.015666666666666666, "grad_norm": 148.2394256591797, "learning_rate": 1.0444444444444445e-07, "logits/chosen": 2.4352753162384033, "logits/rejected": 2.588228225708008, "logps/chosen": -43.437198638916016, "logps/rejected": -56.518741607666016, "loss": 1.6422, "nll_loss": 0.7620559334754944, "rewards/accuracies": 0.0, "rewards/chosen": -0.10495338588953018, "rewards/margins": -0.09447135776281357, "rewards/rejected": -0.01048202533274889, "step": 94 }, { "epoch": 0.015833333333333335, "grad_norm": 234.82534790039062, "learning_rate": 1.0555555555555555e-07, "logits/chosen": 2.5385966300964355, "logits/rejected": 2.936291217803955, "logps/chosen": -117.51573181152344, "logps/rejected": -92.32499694824219, "loss": 2.7664, "nll_loss": 1.9917924404144287, "rewards/accuracies": 1.0, "rewards/chosen": 0.17159578204154968, "rewards/margins": 0.11898653209209442, "rewards/rejected": 0.05260925367474556, "step": 95 }, { "epoch": 0.016, "grad_norm": 277.9039611816406, "learning_rate": 1.0666666666666666e-07, "logits/chosen": 2.334294557571411, "logits/rejected": 2.4970107078552246, "logps/chosen": -137.77394104003906, "logps/rejected": -218.20401000976562, "loss": 2.0212, "nll_loss": 1.0848342180252075, "rewards/accuracies": 0.0, "rewards/chosen": -0.12204743176698685, "rewards/margins": -0.20007935166358948, "rewards/rejected": 0.07803191989660263, "step": 96 }, { "epoch": 0.016166666666666666, "grad_norm": 224.6792449951172, "learning_rate": 1.0777777777777777e-07, "logits/chosen": 1.944317102432251, "logits/rejected": 2.7380850315093994, "logps/chosen": -30.42295265197754, "logps/rejected": -111.02812194824219, "loss": 1.777, "nll_loss": 0.8692272901535034, "rewards/accuracies": 0.0, "rewards/chosen": -0.014980508014559746, "rewards/margins": -0.1467393934726715, "rewards/rejected": 0.1317588835954666, "step": 97 }, { "epoch": 0.01633333333333333, "grad_norm": 300.9002380371094, "learning_rate": 1.0888888888888888e-07, "logits/chosen": 3.120178461074829, "logits/rejected": 2.8527908325195312, "logps/chosen": -280.5830383300781, "logps/rejected": -34.04075241088867, "loss": 2.3166, "nll_loss": 1.594221830368042, "rewards/accuracies": 1.0, "rewards/chosen": 0.25531005859375, "rewards/margins": 0.2331714630126953, "rewards/rejected": 0.022138597443699837, "step": 98 }, { "epoch": 0.0165, "grad_norm": 160.23902893066406, "learning_rate": 1.1e-07, "logits/chosen": 2.9979147911071777, "logits/rejected": 2.9521877765655518, "logps/chosen": -72.85929870605469, "logps/rejected": -61.17735290527344, "loss": 1.7482, "nll_loss": 0.8472011685371399, "rewards/accuracies": 0.0, "rewards/chosen": -0.007905579172074795, "rewards/margins": -0.13395270705223083, "rewards/rejected": 0.12604713439941406, "step": 99 }, { "epoch": 0.016666666666666666, "grad_norm": 209.86618041992188, "learning_rate": 1.1111111111111111e-07, "logits/chosen": 2.6933813095092773, "logits/rejected": 2.6759674549102783, "logps/chosen": -47.84461212158203, "logps/rejected": -79.12385559082031, "loss": 1.6967, "nll_loss": 0.88601154088974, "rewards/accuracies": 1.0, "rewards/chosen": 0.04061966389417648, "rewards/margins": 0.0427192747592926, "rewards/rejected": -0.0020996094681322575, "step": 100 }, { "epoch": 0.016833333333333332, "grad_norm": 192.91920471191406, "learning_rate": 1.1222222222222221e-07, "logits/chosen": 2.350532293319702, "logits/rejected": 2.5621070861816406, "logps/chosen": -117.78085327148438, "logps/rejected": -115.28996276855469, "loss": 1.5936, "nll_loss": 0.9274082779884338, "rewards/accuracies": 1.0, "rewards/chosen": 0.005488586612045765, "rewards/margins": 0.36361387372016907, "rewards/rejected": -0.3581252992153168, "step": 101 }, { "epoch": 0.017, "grad_norm": 207.51119995117188, "learning_rate": 1.1333333333333333e-07, "logits/chosen": 2.254540205001831, "logits/rejected": 2.4512200355529785, "logps/chosen": -51.581302642822266, "logps/rejected": -129.30335998535156, "loss": 1.6381, "nll_loss": 0.8059578537940979, "rewards/accuracies": 0.0, "rewards/chosen": -0.011678695678710938, "rewards/margins": -0.0006969450041651726, "rewards/rejected": -0.010981750674545765, "step": 102 }, { "epoch": 0.017166666666666667, "grad_norm": 255.06887817382812, "learning_rate": 1.1444444444444443e-07, "logits/chosen": 2.8688080310821533, "logits/rejected": 3.040680408477783, "logps/chosen": -123.7669448852539, "logps/rejected": -395.1297607421875, "loss": 1.9423, "nll_loss": 1.2254152297973633, "rewards/accuracies": 1.0, "rewards/chosen": 0.0870361328125, "rewards/margins": 0.24316711723804474, "rewards/rejected": -0.15613098442554474, "step": 103 }, { "epoch": 0.017333333333333333, "grad_norm": 158.1673126220703, "learning_rate": 1.1555555555555554e-07, "logits/chosen": 2.4570021629333496, "logits/rejected": 2.0813372135162354, "logps/chosen": -94.41679382324219, "logps/rejected": -26.855344772338867, "loss": 1.8376, "nll_loss": 1.0490756034851074, "rewards/accuracies": 1.0, "rewards/chosen": 0.06471862643957138, "rewards/margins": 0.08836765587329865, "rewards/rejected": -0.023649025708436966, "step": 104 }, { "epoch": 0.0175, "grad_norm": 166.7901611328125, "learning_rate": 1.1666666666666667e-07, "logits/chosen": 2.8350958824157715, "logits/rejected": 3.076018810272217, "logps/chosen": -55.144317626953125, "logps/rejected": -74.58719635009766, "loss": 2.332, "nll_loss": 1.490386962890625, "rewards/accuracies": 0.0, "rewards/chosen": -0.06534653156995773, "rewards/margins": -0.01933899149298668, "rewards/rejected": -0.046007540076971054, "step": 105 }, { "epoch": 0.017666666666666667, "grad_norm": 166.458740234375, "learning_rate": 1.1777777777777778e-07, "logits/chosen": 3.352876901626587, "logits/rejected": 3.258270263671875, "logps/chosen": -101.99755096435547, "logps/rejected": -9.14191722869873, "loss": 1.9179, "nll_loss": 1.1208521127700806, "rewards/accuracies": 1.0, "rewards/chosen": 0.05531616136431694, "rewards/margins": 0.07056131213903427, "rewards/rejected": -0.01524515263736248, "step": 106 }, { "epoch": 0.017833333333333333, "grad_norm": 203.51199340820312, "learning_rate": 1.1888888888888889e-07, "logits/chosen": 1.3737096786499023, "logits/rejected": 2.4763104915618896, "logps/chosen": -78.51432800292969, "logps/rejected": -289.78460693359375, "loss": 1.5916, "nll_loss": 0.8264666199684143, "rewards/accuracies": 1.0, "rewards/chosen": 0.087005615234375, "rewards/margins": 0.13756103813648224, "rewards/rejected": -0.05055542290210724, "step": 107 }, { "epoch": 0.018, "grad_norm": 180.8767547607422, "learning_rate": 1.2e-07, "logits/chosen": 1.296702265739441, "logits/rejected": 3.004390001296997, "logps/chosen": -33.4600715637207, "logps/rejected": -90.07830810546875, "loss": 1.4525, "nll_loss": 0.6434628367424011, "rewards/accuracies": 1.0, "rewards/chosen": -0.018836213275790215, "rewards/margins": 0.046115875244140625, "rewards/rejected": -0.06495209038257599, "step": 108 }, { "epoch": 0.018166666666666668, "grad_norm": 287.6692810058594, "learning_rate": 1.211111111111111e-07, "logits/chosen": 3.301391124725342, "logits/rejected": 3.3830697536468506, "logps/chosen": -86.9949722290039, "logps/rejected": -83.65708923339844, "loss": 2.5721, "nll_loss": 1.6110178232192993, "rewards/accuracies": 0.0, "rewards/chosen": -0.2156120389699936, "rewards/margins": -0.24457474052906036, "rewards/rejected": 0.02896270714700222, "step": 109 }, { "epoch": 0.018333333333333333, "grad_norm": 291.41827392578125, "learning_rate": 1.2222222222222222e-07, "logits/chosen": 3.4571824073791504, "logits/rejected": 3.574305534362793, "logps/chosen": -167.81692504882812, "logps/rejected": -215.07772827148438, "loss": 2.0117, "nll_loss": 1.0688974857330322, "rewards/accuracies": 0.0, "rewards/chosen": -0.03705139085650444, "rewards/margins": -0.21134035289287567, "rewards/rejected": 0.17428895831108093, "step": 110 }, { "epoch": 0.0185, "grad_norm": 182.7056884765625, "learning_rate": 1.2333333333333333e-07, "logits/chosen": 2.599581003189087, "logits/rejected": 2.4407737255096436, "logps/chosen": -123.51608276367188, "logps/rejected": -42.00950622558594, "loss": 2.1041, "nll_loss": 1.3573194742202759, "rewards/accuracies": 1.0, "rewards/chosen": 0.17186281085014343, "rewards/margins": 0.1779811978340149, "rewards/rejected": -0.006118393037468195, "step": 111 }, { "epoch": 0.018666666666666668, "grad_norm": 258.1807861328125, "learning_rate": 1.2444444444444443e-07, "logits/chosen": 3.1825742721557617, "logits/rejected": 3.4712705612182617, "logps/chosen": -102.58448028564453, "logps/rejected": -248.62960815429688, "loss": 1.7505, "nll_loss": 0.8843489289283752, "rewards/accuracies": 0.0, "rewards/chosen": -0.08778076618909836, "rewards/margins": -0.06756897270679474, "rewards/rejected": -0.02021179348230362, "step": 112 }, { "epoch": 0.018833333333333334, "grad_norm": 151.3474578857422, "learning_rate": 1.2555555555555556e-07, "logits/chosen": 2.9121665954589844, "logits/rejected": 2.6976215839385986, "logps/chosen": -92.17813873291016, "logps/rejected": -41.44212341308594, "loss": 1.8969, "nll_loss": 1.1668120622634888, "rewards/accuracies": 1.0, "rewards/chosen": 0.13871918618679047, "rewards/margins": 0.2138618528842926, "rewards/rejected": -0.07514267414808273, "step": 113 }, { "epoch": 0.019, "grad_norm": 302.83111572265625, "learning_rate": 1.2666666666666666e-07, "logits/chosen": 3.479769706726074, "logits/rejected": 3.7271201610565186, "logps/chosen": -143.0454864501953, "logps/rejected": -225.03656005859375, "loss": 2.4954, "nll_loss": 1.6442010402679443, "rewards/accuracies": 0.0, "rewards/chosen": -0.03789520263671875, "rewards/margins": -0.03842468187212944, "rewards/rejected": 0.0005294799921102822, "step": 114 }, { "epoch": 0.019166666666666665, "grad_norm": 147.59107971191406, "learning_rate": 1.2777777777777777e-07, "logits/chosen": 2.544308662414551, "logits/rejected": 2.275979995727539, "logps/chosen": -72.79188537597656, "logps/rejected": -48.3606071472168, "loss": 2.292, "nll_loss": 1.485548496246338, "rewards/accuracies": 1.0, "rewards/chosen": 0.09999313205480576, "rewards/margins": 0.05188369378447533, "rewards/rejected": 0.04810943827033043, "step": 115 }, { "epoch": 0.019333333333333334, "grad_norm": 203.14378356933594, "learning_rate": 1.288888888888889e-07, "logits/chosen": 3.1030311584472656, "logits/rejected": 3.10040545463562, "logps/chosen": -119.37541961669922, "logps/rejected": -175.0606689453125, "loss": 1.8945, "nll_loss": 1.230674386024475, "rewards/accuracies": 1.0, "rewards/chosen": 0.25719985365867615, "rewards/margins": 0.36673659086227417, "rewards/rejected": -0.10953675210475922, "step": 116 }, { "epoch": 0.0195, "grad_norm": 255.1413116455078, "learning_rate": 1.3e-07, "logits/chosen": 2.7919974327087402, "logits/rejected": 2.857170581817627, "logps/chosen": -115.02532958984375, "logps/rejected": -221.59234619140625, "loss": 1.8292, "nll_loss": 0.991597592830658, "rewards/accuracies": 0.0, "rewards/chosen": 0.04861908033490181, "rewards/margins": -0.01130371168255806, "rewards/rejected": 0.05992279201745987, "step": 117 }, { "epoch": 0.019666666666666666, "grad_norm": 779.9506225585938, "learning_rate": 1.311111111111111e-07, "logits/chosen": 2.115520715713501, "logits/rejected": 1.6509724855422974, "logps/chosen": -81.06991577148438, "logps/rejected": -47.966278076171875, "loss": 1.7766, "nll_loss": 1.0133737325668335, "rewards/accuracies": 1.0, "rewards/chosen": 0.05148468166589737, "rewards/margins": 0.14167939126491547, "rewards/rejected": -0.0901947095990181, "step": 118 }, { "epoch": 0.019833333333333335, "grad_norm": 303.2100524902344, "learning_rate": 1.322222222222222e-07, "logits/chosen": 0.41759374737739563, "logits/rejected": 2.4753966331481934, "logps/chosen": -95.40348815917969, "logps/rejected": -358.45166015625, "loss": 1.735, "nll_loss": 0.9173412322998047, "rewards/accuracies": 1.0, "rewards/chosen": 0.008213805966079235, "rewards/margins": 0.02842865139245987, "rewards/rejected": -0.02021484449505806, "step": 119 }, { "epoch": 0.02, "grad_norm": 238.27621459960938, "learning_rate": 1.333333333333333e-07, "logits/chosen": 2.645465850830078, "logits/rejected": 2.879045009613037, "logps/chosen": -74.17674255371094, "logps/rejected": -281.259033203125, "loss": 1.8967, "nll_loss": 0.9890232086181641, "rewards/accuracies": 0.0, "rewards/chosen": -0.009905243292450905, "rewards/margins": -0.14647141098976135, "rewards/rejected": 0.136566162109375, "step": 120 }, { "epoch": 0.020166666666666666, "grad_norm": 164.12246704101562, "learning_rate": 1.3444444444444444e-07, "logits/chosen": 2.2345967292785645, "logits/rejected": 2.262946367263794, "logps/chosen": -47.6595573425293, "logps/rejected": -79.4146957397461, "loss": 1.5486, "nll_loss": 0.7446805834770203, "rewards/accuracies": 1.0, "rewards/chosen": -0.05768165737390518, "rewards/margins": 0.05700799077749252, "rewards/rejected": -0.1146896481513977, "step": 121 }, { "epoch": 0.02033333333333333, "grad_norm": 252.57415771484375, "learning_rate": 1.3555555555555557e-07, "logits/chosen": 2.2850050926208496, "logits/rejected": 2.59147310256958, "logps/chosen": -106.8981704711914, "logps/rejected": -157.10177612304688, "loss": 1.8707, "nll_loss": 1.0480214357376099, "rewards/accuracies": 1.0, "rewards/chosen": -0.06794891506433487, "rewards/margins": 0.018740855157375336, "rewards/rejected": -0.0866897702217102, "step": 122 }, { "epoch": 0.0205, "grad_norm": 235.2695770263672, "learning_rate": 1.3666666666666667e-07, "logits/chosen": 3.1303467750549316, "logits/rejected": 2.87715744972229, "logps/chosen": -93.09073638916016, "logps/rejected": -48.617576599121094, "loss": 1.7467, "nll_loss": 0.8865785002708435, "rewards/accuracies": 0.0, "rewards/chosen": -0.013149261474609375, "rewards/margins": -0.05593986436724663, "rewards/rejected": 0.04279060289263725, "step": 123 }, { "epoch": 0.020666666666666667, "grad_norm": 368.3924560546875, "learning_rate": 1.3777777777777778e-07, "logits/chosen": 3.031297445297241, "logits/rejected": 2.9553232192993164, "logps/chosen": -94.20911407470703, "logps/rejected": -75.94937133789062, "loss": 1.9055, "nll_loss": 1.0954550504684448, "rewards/accuracies": 1.0, "rewards/chosen": -0.033953096717596054, "rewards/margins": 0.04415435716509819, "rewards/rejected": -0.07810745388269424, "step": 124 }, { "epoch": 0.020833333333333332, "grad_norm": 234.5783233642578, "learning_rate": 1.3888888888888888e-07, "logits/chosen": 2.1138856410980225, "logits/rejected": 3.0797741413116455, "logps/chosen": -20.82793426513672, "logps/rejected": -131.7994384765625, "loss": 1.6219, "nll_loss": 0.6125863194465637, "rewards/accuracies": 0.0, "rewards/chosen": 0.018537331372499466, "rewards/margins": -0.3274633288383484, "rewards/rejected": 0.34600067138671875, "step": 125 }, { "epoch": 0.021, "grad_norm": 189.0839385986328, "learning_rate": 1.3999999999999998e-07, "logits/chosen": 3.6236073970794678, "logits/rejected": 3.548736572265625, "logps/chosen": -71.3276138305664, "logps/rejected": -87.03208923339844, "loss": 1.6698, "nll_loss": 0.8491383790969849, "rewards/accuracies": 1.0, "rewards/chosen": -0.1588997095823288, "rewards/margins": 0.025012195110321045, "rewards/rejected": -0.18391190469264984, "step": 126 }, { "epoch": 0.021166666666666667, "grad_norm": 180.25027465820312, "learning_rate": 1.411111111111111e-07, "logits/chosen": 2.815687894821167, "logits/rejected": 2.930884599685669, "logps/chosen": -79.96846771240234, "logps/rejected": -138.72189331054688, "loss": 1.8437, "nll_loss": 1.052216649055481, "rewards/accuracies": 1.0, "rewards/chosen": 0.12166748940944672, "rewards/margins": 0.08295594155788422, "rewards/rejected": 0.0387115478515625, "step": 127 }, { "epoch": 0.021333333333333333, "grad_norm": 173.16424560546875, "learning_rate": 1.4222222222222222e-07, "logits/chosen": 2.401409864425659, "logits/rejected": 2.284654140472412, "logps/chosen": -61.4034538269043, "logps/rejected": -80.098876953125, "loss": 1.7753, "nll_loss": 0.9446684718132019, "rewards/accuracies": 1.0, "rewards/chosen": 0.029619598761200905, "rewards/margins": 0.002475738525390625, "rewards/rejected": 0.02714386023581028, "step": 128 }, { "epoch": 0.0215, "grad_norm": 136.74008178710938, "learning_rate": 1.4333333333333332e-07, "logits/chosen": 3.2652339935302734, "logits/rejected": 3.1386218070983887, "logps/chosen": -16.628395080566406, "logps/rejected": -21.339582443237305, "loss": 1.4664, "nll_loss": 0.6651358604431152, "rewards/accuracies": 1.0, "rewards/chosen": 0.0642026886343956, "rewards/margins": 0.062076374888420105, "rewards/rejected": 0.0021263123489916325, "step": 129 }, { "epoch": 0.021666666666666667, "grad_norm": 231.60308837890625, "learning_rate": 1.4444444444444442e-07, "logits/chosen": 2.337888717651367, "logits/rejected": 2.7157094478607178, "logps/chosen": -33.65115737915039, "logps/rejected": -188.24891662597656, "loss": 2.6707, "nll_loss": 1.8695087432861328, "rewards/accuracies": 1.0, "rewards/chosen": -0.020175933837890625, "rewards/margins": 0.06231308728456497, "rewards/rejected": -0.0824890211224556, "step": 130 }, { "epoch": 0.021833333333333333, "grad_norm": 212.48265075683594, "learning_rate": 1.4555555555555555e-07, "logits/chosen": 2.8797879219055176, "logits/rejected": 3.1261484622955322, "logps/chosen": -41.85201644897461, "logps/rejected": -124.36868286132812, "loss": 1.9379, "nll_loss": 1.1311355829238892, "rewards/accuracies": 1.0, "rewards/chosen": 0.019745638594031334, "rewards/margins": 0.05060654133558273, "rewards/rejected": -0.03086090087890625, "step": 131 }, { "epoch": 0.022, "grad_norm": 227.21119689941406, "learning_rate": 1.4666666666666666e-07, "logits/chosen": 1.5056912899017334, "logits/rejected": 2.3021843433380127, "logps/chosen": -114.91908264160156, "logps/rejected": -148.4170684814453, "loss": 1.8887, "nll_loss": 1.1726436614990234, "rewards/accuracies": 1.0, "rewards/chosen": 0.6758880615234375, "rewards/margins": 0.2771652042865753, "rewards/rejected": 0.3987228572368622, "step": 132 }, { "epoch": 0.022166666666666668, "grad_norm": 171.69158935546875, "learning_rate": 1.4777777777777779e-07, "logits/chosen": 2.716111660003662, "logits/rejected": 2.521406888961792, "logps/chosen": -67.73246002197266, "logps/rejected": -36.209991455078125, "loss": 1.7917, "nll_loss": 0.8796424269676208, "rewards/accuracies": 0.0, "rewards/chosen": -0.09123153984546661, "rewards/margins": -0.1551845669746399, "rewards/rejected": 0.06395301967859268, "step": 133 }, { "epoch": 0.022333333333333334, "grad_norm": 354.13507080078125, "learning_rate": 1.488888888888889e-07, "logits/chosen": 2.801795721054077, "logits/rejected": 2.904346466064453, "logps/chosen": -105.08779907226562, "logps/rejected": -157.96385192871094, "loss": 3.4771, "nll_loss": 2.69455885887146, "rewards/accuracies": 1.0, "rewards/chosen": 0.2073928862810135, "rewards/margins": 0.103363037109375, "rewards/rejected": 0.10402984917163849, "step": 134 }, { "epoch": 0.0225, "grad_norm": 185.2747344970703, "learning_rate": 1.5e-07, "logits/chosen": 2.128427028656006, "logits/rejected": 1.693103551864624, "logps/chosen": -98.91627502441406, "logps/rejected": -42.594940185546875, "loss": 1.6704, "nll_loss": 0.9074888229370117, "rewards/accuracies": 1.0, "rewards/chosen": 0.1316482573747635, "rewards/margins": 0.14259567856788635, "rewards/rejected": -0.01094741839915514, "step": 135 }, { "epoch": 0.02266666666666667, "grad_norm": 232.74986267089844, "learning_rate": 1.511111111111111e-07, "logits/chosen": 2.4453089237213135, "logits/rejected": 2.447125196456909, "logps/chosen": -207.17083740234375, "logps/rejected": -218.0897674560547, "loss": 1.7312, "nll_loss": 0.8891451358795166, "rewards/accuracies": 0.0, "rewards/chosen": -0.01262512244284153, "rewards/margins": -0.02055206336081028, "rewards/rejected": 0.00792694091796875, "step": 136 }, { "epoch": 0.022833333333333334, "grad_norm": 200.71316528320312, "learning_rate": 1.522222222222222e-07, "logits/chosen": 2.393763542175293, "logits/rejected": 2.866176128387451, "logps/chosen": -44.41714096069336, "logps/rejected": -132.07737731933594, "loss": 1.4869, "nll_loss": 0.807584285736084, "rewards/accuracies": 1.0, "rewards/chosen": 0.028973007574677467, "rewards/margins": 0.3313060998916626, "rewards/rejected": -0.3023330867290497, "step": 137 }, { "epoch": 0.023, "grad_norm": 152.4671173095703, "learning_rate": 1.5333333333333333e-07, "logits/chosen": 3.1072633266448975, "logits/rejected": 2.9919261932373047, "logps/chosen": -90.77371215820312, "logps/rejected": -28.772445678710938, "loss": 2.1936, "nll_loss": 1.5128955841064453, "rewards/accuracies": 1.0, "rewards/chosen": 0.2567543089389801, "rewards/margins": 0.3269214630126953, "rewards/rejected": -0.0701671615242958, "step": 138 }, { "epoch": 0.023166666666666665, "grad_norm": 255.24232482910156, "learning_rate": 1.5444444444444443e-07, "logits/chosen": 3.1642792224884033, "logits/rejected": 3.5606770515441895, "logps/chosen": -49.28935241699219, "logps/rejected": -246.01712036132812, "loss": 1.8734, "nll_loss": 0.9857869744300842, "rewards/accuracies": 0.0, "rewards/chosen": 0.14831772446632385, "rewards/margins": -0.10527116060256958, "rewards/rejected": 0.25358888506889343, "step": 139 }, { "epoch": 0.023333333333333334, "grad_norm": 216.766357421875, "learning_rate": 1.5555555555555556e-07, "logits/chosen": 3.5168814659118652, "logits/rejected": 3.491330862045288, "logps/chosen": -72.66914367675781, "logps/rejected": -84.9541015625, "loss": 1.5203, "nll_loss": 0.719496488571167, "rewards/accuracies": 1.0, "rewards/chosen": 0.038408663123846054, "rewards/margins": 0.0628456175327301, "rewards/rejected": -0.0244369525462389, "step": 140 }, { "epoch": 0.0235, "grad_norm": 135.7557830810547, "learning_rate": 1.5666666666666667e-07, "logits/chosen": 2.368246555328369, "logits/rejected": 2.2794103622436523, "logps/chosen": -78.61595153808594, "logps/rejected": -69.495849609375, "loss": 1.9365, "nll_loss": 1.034420371055603, "rewards/accuracies": 0.0, "rewards/chosen": -0.05683746561408043, "rewards/margins": -0.136444091796875, "rewards/rejected": 0.07960662990808487, "step": 141 }, { "epoch": 0.023666666666666666, "grad_norm": 157.73184204101562, "learning_rate": 1.5777777777777777e-07, "logits/chosen": 2.5058953762054443, "logits/rejected": 2.640967845916748, "logps/chosen": -48.17010498046875, "logps/rejected": -44.39349365234375, "loss": 2.3656, "nll_loss": 1.5538743734359741, "rewards/accuracies": 1.0, "rewards/chosen": 0.02669677883386612, "rewards/margins": 0.04053650051355362, "rewards/rejected": -0.013839722611010075, "step": 142 }, { "epoch": 0.023833333333333335, "grad_norm": 262.0166015625, "learning_rate": 1.5888888888888887e-07, "logits/chosen": 2.4449808597564697, "logits/rejected": 2.6937153339385986, "logps/chosen": -48.2454833984375, "logps/rejected": -326.4071960449219, "loss": 1.7138, "nll_loss": 0.9102920889854431, "rewards/accuracies": 1.0, "rewards/chosen": 0.045796968042850494, "rewards/margins": 0.05733871832489967, "rewards/rejected": -0.011541749350726604, "step": 143 }, { "epoch": 0.024, "grad_norm": 172.7244415283203, "learning_rate": 1.6e-07, "logits/chosen": 2.528625249862671, "logits/rejected": 2.7098426818847656, "logps/chosen": -69.22085571289062, "logps/rejected": -61.54661178588867, "loss": 2.2178, "nll_loss": 1.3844172954559326, "rewards/accuracies": 0.0, "rewards/chosen": 0.02966614067554474, "rewards/margins": -0.003200910985469818, "rewards/rejected": 0.03286705166101456, "step": 144 }, { "epoch": 0.024166666666666666, "grad_norm": 138.75196838378906, "learning_rate": 1.611111111111111e-07, "logits/chosen": 2.2797648906707764, "logits/rejected": 1.6726337671279907, "logps/chosen": -78.48419189453125, "logps/rejected": -31.249431610107422, "loss": 1.7382, "nll_loss": 0.9810525178909302, "rewards/accuracies": 1.0, "rewards/chosen": 0.08338012546300888, "rewards/margins": 0.15458469092845917, "rewards/rejected": -0.07120456546545029, "step": 145 }, { "epoch": 0.024333333333333332, "grad_norm": 186.93801879882812, "learning_rate": 1.622222222222222e-07, "logits/chosen": 2.5829720497131348, "logits/rejected": 2.3122315406799316, "logps/chosen": -98.66552734375, "logps/rejected": -56.428733825683594, "loss": 1.7381, "nll_loss": 0.8969593644142151, "rewards/accuracies": 0.0, "rewards/chosen": 0.02562256157398224, "rewards/margins": -0.01845093071460724, "rewards/rejected": 0.04407349228858948, "step": 146 }, { "epoch": 0.0245, "grad_norm": 201.8964385986328, "learning_rate": 1.6333333333333331e-07, "logits/chosen": 1.897786021232605, "logits/rejected": 2.0148160457611084, "logps/chosen": -90.38471984863281, "logps/rejected": -79.39930725097656, "loss": 1.8005, "nll_loss": 0.971878707408905, "rewards/accuracies": 1.0, "rewards/chosen": 0.04436340555548668, "rewards/margins": 0.006488040089607239, "rewards/rejected": 0.03787536546587944, "step": 147 }, { "epoch": 0.024666666666666667, "grad_norm": 161.30714416503906, "learning_rate": 1.6444444444444442e-07, "logits/chosen": 2.8314383029937744, "logits/rejected": 2.8829872608184814, "logps/chosen": -29.542255401611328, "logps/rejected": -85.27389526367188, "loss": 2.7671, "nll_loss": 1.9694838523864746, "rewards/accuracies": 1.0, "rewards/chosen": -0.012903595343232155, "rewards/margins": 0.06963882595300674, "rewards/rejected": -0.08254241943359375, "step": 148 }, { "epoch": 0.024833333333333332, "grad_norm": 177.8955078125, "learning_rate": 1.6555555555555555e-07, "logits/chosen": 3.027317762374878, "logits/rejected": 2.9578490257263184, "logps/chosen": -43.54201126098633, "logps/rejected": -128.37628173828125, "loss": 1.447, "nll_loss": 0.6911430954933167, "rewards/accuracies": 1.0, "rewards/chosen": 0.08522912114858627, "rewards/margins": 0.15746194124221802, "rewards/rejected": -0.07223282009363174, "step": 149 }, { "epoch": 0.025, "grad_norm": 257.46990966796875, "learning_rate": 1.6666666666666668e-07, "logits/chosen": 1.2256499528884888, "logits/rejected": 1.9756964445114136, "logps/chosen": -31.392974853515625, "logps/rejected": -240.4392547607422, "loss": 2.4917, "nll_loss": 1.6522618532180786, "rewards/accuracies": 0.0, "rewards/chosen": 0.05472717434167862, "rewards/margins": -0.01489105075597763, "rewards/rejected": 0.06961822509765625, "step": 150 }, { "epoch": 0.025166666666666667, "grad_norm": 127.7153549194336, "learning_rate": 1.6777777777777778e-07, "logits/chosen": 2.4734177589416504, "logits/rejected": 2.3218135833740234, "logps/chosen": -50.648780822753906, "logps/rejected": -40.383460998535156, "loss": 1.628, "nll_loss": 0.8303079605102539, "rewards/accuracies": 1.0, "rewards/chosen": 0.06852646172046661, "rewards/margins": 0.0693584457039833, "rewards/rejected": -0.0008319854969158769, "step": 151 }, { "epoch": 0.025333333333333333, "grad_norm": 193.34019470214844, "learning_rate": 1.6888888888888888e-07, "logits/chosen": 3.195734739303589, "logits/rejected": 3.0931217670440674, "logps/chosen": -59.024925231933594, "logps/rejected": -80.7229232788086, "loss": 1.4908, "nll_loss": 0.7287028431892395, "rewards/accuracies": 1.0, "rewards/chosen": 0.23061446845531464, "rewards/margins": 0.14664916694164276, "rewards/rejected": 0.08396530151367188, "step": 152 }, { "epoch": 0.0255, "grad_norm": 147.99951171875, "learning_rate": 1.7e-07, "logits/chosen": 2.868004322052002, "logits/rejected": 2.7977523803710938, "logps/chosen": -62.459190368652344, "logps/rejected": -38.98863220214844, "loss": 1.9495, "nll_loss": 1.1566516160964966, "rewards/accuracies": 1.0, "rewards/chosen": 0.018749618902802467, "rewards/margins": 0.07942352443933487, "rewards/rejected": -0.06067390739917755, "step": 153 }, { "epoch": 0.025666666666666667, "grad_norm": 167.36216735839844, "learning_rate": 1.711111111111111e-07, "logits/chosen": 3.0371484756469727, "logits/rejected": 2.9233579635620117, "logps/chosen": -78.11137390136719, "logps/rejected": -79.2865219116211, "loss": 1.6035, "nll_loss": 0.781113862991333, "rewards/accuracies": 1.0, "rewards/chosen": 0.13607025146484375, "rewards/margins": 0.02045135200023651, "rewards/rejected": 0.11561889946460724, "step": 154 }, { "epoch": 0.025833333333333333, "grad_norm": 257.66192626953125, "learning_rate": 1.7222222222222222e-07, "logits/chosen": 1.687878966331482, "logits/rejected": 2.5036392211914062, "logps/chosen": -60.747169494628906, "logps/rejected": -211.42379760742188, "loss": 3.0974, "nll_loss": 2.3364298343658447, "rewards/accuracies": 1.0, "rewards/chosen": 0.041043855249881744, "rewards/margins": 0.14662399888038635, "rewards/rejected": -0.1055801510810852, "step": 155 }, { "epoch": 0.026, "grad_norm": 174.04747009277344, "learning_rate": 1.7333333333333332e-07, "logits/chosen": 3.4877331256866455, "logits/rejected": 3.5294625759124756, "logps/chosen": -83.71026611328125, "logps/rejected": -82.40531158447266, "loss": 1.6001, "nll_loss": 0.797240674495697, "rewards/accuracies": 1.0, "rewards/chosen": 0.11395110934972763, "rewards/margins": 0.05916595086455345, "rewards/rejected": 0.05478515848517418, "step": 156 }, { "epoch": 0.026166666666666668, "grad_norm": 148.89244079589844, "learning_rate": 1.7444444444444443e-07, "logits/chosen": 3.0560896396636963, "logits/rejected": 2.9167473316192627, "logps/chosen": -50.83985137939453, "logps/rejected": -81.01025390625, "loss": 1.5544, "nll_loss": 0.7476449608802795, "rewards/accuracies": 1.0, "rewards/chosen": 0.02409362979233265, "rewards/margins": 0.05067749321460724, "rewards/rejected": -0.02658386528491974, "step": 157 }, { "epoch": 0.026333333333333334, "grad_norm": 187.8508758544922, "learning_rate": 1.7555555555555556e-07, "logits/chosen": 3.5319714546203613, "logits/rejected": 3.6570510864257812, "logps/chosen": -120.65570831298828, "logps/rejected": -49.875213623046875, "loss": 2.1992, "nll_loss": 1.4363775253295898, "rewards/accuracies": 1.0, "rewards/chosen": 0.2733299434185028, "rewards/margins": 0.1465706080198288, "rewards/rejected": 0.126759335398674, "step": 158 }, { "epoch": 0.0265, "grad_norm": 160.36019897460938, "learning_rate": 1.7666666666666666e-07, "logits/chosen": 2.422980546951294, "logits/rejected": 2.5019640922546387, "logps/chosen": -68.07955169677734, "logps/rejected": -46.49641036987305, "loss": 1.8527, "nll_loss": 1.031508207321167, "rewards/accuracies": 1.0, "rewards/chosen": 0.047719575464725494, "rewards/margins": 0.021274948492646217, "rewards/rejected": 0.026444626972079277, "step": 159 }, { "epoch": 0.02666666666666667, "grad_norm": 233.7633056640625, "learning_rate": 1.7777777777777776e-07, "logits/chosen": 2.867396593093872, "logits/rejected": 3.0253541469573975, "logps/chosen": -115.89584350585938, "logps/rejected": -256.0360412597656, "loss": 2.5606, "nll_loss": 1.8108724355697632, "rewards/accuracies": 1.0, "rewards/chosen": 0.11274109780788422, "rewards/margins": 0.17068177461624146, "rewards/rejected": -0.05794067308306694, "step": 160 }, { "epoch": 0.026833333333333334, "grad_norm": 135.08734130859375, "learning_rate": 1.788888888888889e-07, "logits/chosen": 2.957408905029297, "logits/rejected": 2.816516399383545, "logps/chosen": -78.39338684082031, "logps/rejected": -15.716646194458008, "loss": 1.8439, "nll_loss": 1.0738818645477295, "rewards/accuracies": 1.0, "rewards/chosen": 0.08591079711914062, "rewards/margins": 0.12721338868141174, "rewards/rejected": -0.04130258783698082, "step": 161 }, { "epoch": 0.027, "grad_norm": 186.6488494873047, "learning_rate": 1.8e-07, "logits/chosen": 2.5823097229003906, "logits/rejected": 2.5391366481781006, "logps/chosen": -71.2135009765625, "logps/rejected": -81.90970611572266, "loss": 1.678, "nll_loss": 0.9014368057250977, "rewards/accuracies": 1.0, "rewards/chosen": -0.01855163648724556, "rewards/margins": 0.11397019028663635, "rewards/rejected": -0.1325218230485916, "step": 162 }, { "epoch": 0.027166666666666665, "grad_norm": 160.27394104003906, "learning_rate": 1.811111111111111e-07, "logits/chosen": 2.592761993408203, "logits/rejected": 1.758857250213623, "logps/chosen": -66.56914520263672, "logps/rejected": -20.12500762939453, "loss": 1.6464, "nll_loss": 0.9245714545249939, "rewards/accuracies": 1.0, "rewards/chosen": 0.24763794243335724, "rewards/margins": 0.23378829658031464, "rewards/rejected": 0.01384964119642973, "step": 163 }, { "epoch": 0.027333333333333334, "grad_norm": 336.0793151855469, "learning_rate": 1.822222222222222e-07, "logits/chosen": 2.4192421436309814, "logits/rejected": 1.4315072298049927, "logps/chosen": -293.1036071777344, "logps/rejected": -130.55545043945312, "loss": 2.0095, "nll_loss": 1.0775866508483887, "rewards/accuracies": 0.0, "rewards/chosen": -0.05886535719037056, "rewards/margins": -0.19190064072608948, "rewards/rejected": 0.13303528726100922, "step": 164 }, { "epoch": 0.0275, "grad_norm": 262.7001037597656, "learning_rate": 1.833333333333333e-07, "logits/chosen": 1.943320870399475, "logits/rejected": 2.0520448684692383, "logps/chosen": -38.70958709716797, "logps/rejected": -121.93741607666016, "loss": 1.4331, "nll_loss": 0.6560948491096497, "rewards/accuracies": 1.0, "rewards/chosen": 0.16996192932128906, "rewards/margins": 0.11365928500890732, "rewards/rejected": 0.056302644312381744, "step": 165 }, { "epoch": 0.027666666666666666, "grad_norm": 130.6446075439453, "learning_rate": 1.8444444444444444e-07, "logits/chosen": 3.0766453742980957, "logits/rejected": 2.985400915145874, "logps/chosen": -112.00189208984375, "logps/rejected": -13.293543815612793, "loss": 2.0053, "nll_loss": 1.2444654703140259, "rewards/accuracies": 1.0, "rewards/chosen": 0.09601974487304688, "rewards/margins": 0.14680233597755432, "rewards/rejected": -0.050782591104507446, "step": 166 }, { "epoch": 0.027833333333333335, "grad_norm": 246.020751953125, "learning_rate": 1.8555555555555557e-07, "logits/chosen": 1.6093804836273193, "logits/rejected": 2.609999179840088, "logps/chosen": -45.759273529052734, "logps/rejected": -259.4237365722656, "loss": 1.7861, "nll_loss": 1.2367370128631592, "rewards/accuracies": 1.0, "rewards/chosen": 0.11038704961538315, "rewards/margins": 0.6690265536308289, "rewards/rejected": -0.5586395263671875, "step": 167 }, { "epoch": 0.028, "grad_norm": 212.70742797851562, "learning_rate": 1.8666666666666667e-07, "logits/chosen": 2.6932694911956787, "logits/rejected": 2.521127700805664, "logps/chosen": -117.46273803710938, "logps/rejected": -35.34483337402344, "loss": 1.7327, "nll_loss": 0.8390195369720459, "rewards/accuracies": 0.0, "rewards/chosen": -0.144786074757576, "rewards/margins": -0.11974945664405823, "rewards/rejected": -0.02503662183880806, "step": 168 }, { "epoch": 0.028166666666666666, "grad_norm": 149.21080017089844, "learning_rate": 1.8777777777777777e-07, "logits/chosen": 2.9692044258117676, "logits/rejected": 2.7418673038482666, "logps/chosen": -109.19945526123047, "logps/rejected": -27.051481246948242, "loss": 1.8258, "nll_loss": 1.0205556154251099, "rewards/accuracies": 1.0, "rewards/chosen": 0.036106109619140625, "rewards/margins": 0.05366554111242294, "rewards/rejected": -0.017559433355927467, "step": 169 }, { "epoch": 0.028333333333333332, "grad_norm": 209.21302795410156, "learning_rate": 1.8888888888888888e-07, "logits/chosen": 3.18906307220459, "logits/rejected": 3.2893147468566895, "logps/chosen": -70.31578063964844, "logps/rejected": -232.6517791748047, "loss": 1.8488, "nll_loss": 1.0653903484344482, "rewards/accuracies": 1.0, "rewards/chosen": 0.15278930962085724, "rewards/margins": 0.09996948391199112, "rewards/rejected": 0.05281982570886612, "step": 170 }, { "epoch": 0.0285, "grad_norm": 162.55209350585938, "learning_rate": 1.8999999999999998e-07, "logits/chosen": 3.0813934803009033, "logits/rejected": 3.1830883026123047, "logps/chosen": -77.65685272216797, "logps/rejected": -160.25924682617188, "loss": 1.5885, "nll_loss": 0.9356246590614319, "rewards/accuracies": 1.0, "rewards/chosen": 0.2477317899465561, "rewards/margins": 0.39241257309913635, "rewards/rejected": -0.14468078315258026, "step": 171 }, { "epoch": 0.028666666666666667, "grad_norm": 199.78404235839844, "learning_rate": 1.911111111111111e-07, "logits/chosen": 2.892326593399048, "logits/rejected": 2.944324493408203, "logps/chosen": -52.40480041503906, "logps/rejected": -232.23782348632812, "loss": 1.4222, "nll_loss": 0.738095760345459, "rewards/accuracies": 1.0, "rewards/chosen": 0.24530602991580963, "rewards/margins": 0.31879693269729614, "rewards/rejected": -0.07349091023206711, "step": 172 }, { "epoch": 0.028833333333333332, "grad_norm": 108.41622161865234, "learning_rate": 1.9222222222222221e-07, "logits/chosen": 3.2930314540863037, "logits/rejected": 3.2091028690338135, "logps/chosen": -18.538063049316406, "logps/rejected": -14.968109130859375, "loss": 1.231, "nll_loss": 0.4634515643119812, "rewards/accuracies": 1.0, "rewards/chosen": 0.17656442523002625, "rewards/margins": 0.13358622789382935, "rewards/rejected": 0.0429781936109066, "step": 173 }, { "epoch": 0.029, "grad_norm": 197.77291870117188, "learning_rate": 1.9333333333333332e-07, "logits/chosen": 2.534821033477783, "logits/rejected": 1.8751698732376099, "logps/chosen": -158.70782470703125, "logps/rejected": -62.68010330200195, "loss": 1.9031, "nll_loss": 1.2302930355072021, "rewards/accuracies": 1.0, "rewards/chosen": 0.38233643770217896, "rewards/margins": 0.34924355149269104, "rewards/rejected": 0.03309288248419762, "step": 174 }, { "epoch": 0.029166666666666667, "grad_norm": 176.09046936035156, "learning_rate": 1.9444444444444442e-07, "logits/chosen": 2.474961042404175, "logits/rejected": 2.2057926654815674, "logps/chosen": -86.79010772705078, "logps/rejected": -67.9026870727539, "loss": 1.9168, "nll_loss": 1.3150018453598022, "rewards/accuracies": 1.0, "rewards/chosen": 0.4544014036655426, "rewards/margins": 0.5244377255439758, "rewards/rejected": -0.07003631442785263, "step": 175 }, { "epoch": 0.029333333333333333, "grad_norm": 276.22589111328125, "learning_rate": 1.9555555555555555e-07, "logits/chosen": 2.5071401596069336, "logits/rejected": 2.6763155460357666, "logps/chosen": -115.34390258789062, "logps/rejected": -322.928955078125, "loss": 1.6408, "nll_loss": 1.0582009553909302, "rewards/accuracies": 1.0, "rewards/chosen": 0.12283172458410263, "rewards/margins": 0.5746933221817017, "rewards/rejected": -0.45186159014701843, "step": 176 }, { "epoch": 0.0295, "grad_norm": 176.5428466796875, "learning_rate": 1.9666666666666665e-07, "logits/chosen": 2.7155237197875977, "logits/rejected": 2.6834194660186768, "logps/chosen": -132.00531005859375, "logps/rejected": -97.32435607910156, "loss": 1.9988, "nll_loss": 1.306983470916748, "rewards/accuracies": 1.0, "rewards/chosen": 0.24385225772857666, "rewards/margins": 0.30097123980522156, "rewards/rejected": -0.057118989527225494, "step": 177 }, { "epoch": 0.029666666666666668, "grad_norm": 196.0032958984375, "learning_rate": 1.9777777777777778e-07, "logits/chosen": 2.8670260906219482, "logits/rejected": 2.9097533226013184, "logps/chosen": -126.72874450683594, "logps/rejected": -123.98394775390625, "loss": 2.1921, "nll_loss": 1.3774864673614502, "rewards/accuracies": 1.0, "rewards/chosen": 0.04586182162165642, "rewards/margins": 0.0346679724752903, "rewards/rejected": 0.01119384914636612, "step": 178 }, { "epoch": 0.029833333333333333, "grad_norm": 225.99559020996094, "learning_rate": 1.988888888888889e-07, "logits/chosen": 2.4643285274505615, "logits/rejected": 2.9549758434295654, "logps/chosen": -40.19816970825195, "logps/rejected": -311.5015563964844, "loss": 3.0989, "nll_loss": 2.364598512649536, "rewards/accuracies": 1.0, "rewards/chosen": 0.061400603502988815, "rewards/margins": 0.20456162095069885, "rewards/rejected": -0.14316101372241974, "step": 179 }, { "epoch": 0.03, "grad_norm": 158.80059814453125, "learning_rate": 2e-07, "logits/chosen": 2.45416522026062, "logits/rejected": 2.5600900650024414, "logps/chosen": -79.06828308105469, "logps/rejected": -93.67478942871094, "loss": 1.4552, "nll_loss": 0.8501965999603271, "rewards/accuracies": 1.0, "rewards/chosen": 0.20516128838062286, "rewards/margins": 0.5124794244766235, "rewards/rejected": -0.3073181211948395, "step": 180 }, { "epoch": 0.030166666666666668, "grad_norm": 158.47488403320312, "learning_rate": 1.9999998543120038e-07, "logits/chosen": 2.705528974533081, "logits/rejected": 2.795889377593994, "logps/chosen": -73.32592010498047, "logps/rejected": -34.06602478027344, "loss": 1.9578, "nll_loss": 1.0184155702590942, "rewards/accuracies": 0.0, "rewards/chosen": 0.094232939183712, "rewards/margins": -0.20260199904441833, "rewards/rejected": 0.29683494567871094, "step": 181 }, { "epoch": 0.030333333333333334, "grad_norm": 203.58453369140625, "learning_rate": 1.9999994172480576e-07, "logits/chosen": 3.167670726776123, "logits/rejected": 2.9663143157958984, "logps/chosen": -280.0538635253906, "logps/rejected": -35.11732864379883, "loss": 2.244, "nll_loss": 1.5912150144577026, "rewards/accuracies": 1.0, "rewards/chosen": 0.3082275390625, "rewards/margins": 0.393746554851532, "rewards/rejected": -0.08551903069019318, "step": 182 }, { "epoch": 0.0305, "grad_norm": 198.48204040527344, "learning_rate": 1.9999986888082892e-07, "logits/chosen": 2.8758063316345215, "logits/rejected": 3.0892932415008545, "logps/chosen": -45.44416046142578, "logps/rejected": -149.6801300048828, "loss": 1.7349, "nll_loss": 1.0098702907562256, "rewards/accuracies": 1.0, "rewards/chosen": 0.3698471188545227, "rewards/margins": 0.23145295679569244, "rewards/rejected": 0.13839416205883026, "step": 183 }, { "epoch": 0.030666666666666665, "grad_norm": 158.17799377441406, "learning_rate": 1.9999976689929106e-07, "logits/chosen": 2.389695882797241, "logits/rejected": 2.4463770389556885, "logps/chosen": -87.20854187011719, "logps/rejected": -98.01803588867188, "loss": 1.9884, "nll_loss": 1.2458362579345703, "rewards/accuracies": 1.0, "rewards/chosen": 0.3417007625102997, "rewards/margins": 0.19208833575248718, "rewards/rejected": 0.1496124267578125, "step": 184 }, { "epoch": 0.030833333333333334, "grad_norm": 202.22665405273438, "learning_rate": 1.9999963578022186e-07, "logits/chosen": 1.797419548034668, "logits/rejected": 2.481313467025757, "logps/chosen": -23.5584716796875, "logps/rejected": -80.34188842773438, "loss": 2.6143, "nll_loss": 1.812190294265747, "rewards/accuracies": 1.0, "rewards/chosen": -0.019036484882235527, "rewards/margins": 0.06061591953039169, "rewards/rejected": -0.07965240627527237, "step": 185 }, { "epoch": 0.031, "grad_norm": 178.35841369628906, "learning_rate": 1.9999947552365958e-07, "logits/chosen": 2.6354875564575195, "logits/rejected": 4.248212814331055, "logps/chosen": -56.01299285888672, "logps/rejected": -37.94375228881836, "loss": 2.1254, "nll_loss": 0.8486817479133606, "rewards/accuracies": 0.0, "rewards/chosen": 0.13125954568386078, "rewards/margins": -0.7447612881660461, "rewards/rejected": 0.8760208487510681, "step": 186 }, { "epoch": 0.031166666666666665, "grad_norm": 105.9174575805664, "learning_rate": 1.9999928612965088e-07, "logits/chosen": 3.1052298545837402, "logits/rejected": 2.982059955596924, "logps/chosen": -13.072588920593262, "logps/rejected": -6.16080379486084, "loss": 1.4764, "nll_loss": 0.688031017780304, "rewards/accuracies": 1.0, "rewards/chosen": 0.02956085279583931, "rewards/margins": 0.0886409729719162, "rewards/rejected": -0.05908012390136719, "step": 187 }, { "epoch": 0.03133333333333333, "grad_norm": 258.8608703613281, "learning_rate": 1.9999906759825095e-07, "logits/chosen": 2.5518805980682373, "logits/rejected": 3.284672975540161, "logps/chosen": -57.45061111450195, "logps/rejected": -97.07876586914062, "loss": 1.5084, "nll_loss": 0.7272229790687561, "rewards/accuracies": 1.0, "rewards/chosen": -0.08106918632984161, "rewards/margins": 0.10583877563476562, "rewards/rejected": -0.18690796196460724, "step": 188 }, { "epoch": 0.0315, "grad_norm": 214.438720703125, "learning_rate": 1.999988199295235e-07, "logits/chosen": 2.658306837081909, "logits/rejected": 2.4892783164978027, "logps/chosen": -137.43814086914062, "logps/rejected": -33.556053161621094, "loss": 1.7756, "nll_loss": 1.1549423933029175, "rewards/accuracies": 1.0, "rewards/chosen": 0.3936966061592102, "rewards/margins": 0.47447893023490906, "rewards/rejected": -0.08078231662511826, "step": 189 }, { "epoch": 0.03166666666666667, "grad_norm": 210.17596435546875, "learning_rate": 1.9999854312354063e-07, "logits/chosen": 2.1609463691711426, "logits/rejected": 2.6048810482025146, "logps/chosen": -96.49158477783203, "logps/rejected": -157.5196533203125, "loss": 1.741, "nll_loss": 1.0051207542419434, "rewards/accuracies": 1.0, "rewards/chosen": 0.23946991562843323, "rewards/margins": 0.20244598388671875, "rewards/rejected": 0.03702392801642418, "step": 190 }, { "epoch": 0.03183333333333333, "grad_norm": 153.11399841308594, "learning_rate": 1.9999823718038305e-07, "logits/chosen": 2.4977681636810303, "logits/rejected": 1.8709007501602173, "logps/chosen": -71.3812255859375, "logps/rejected": -27.75798225402832, "loss": 1.7712, "nll_loss": 1.0653914213180542, "rewards/accuracies": 1.0, "rewards/chosen": 0.3621475398540497, "rewards/margins": 0.2729606628417969, "rewards/rejected": 0.08918686956167221, "step": 191 }, { "epoch": 0.032, "grad_norm": 138.04644775390625, "learning_rate": 1.9999790210013987e-07, "logits/chosen": 2.4402530193328857, "logits/rejected": 2.1042633056640625, "logps/chosen": -79.2791748046875, "logps/rejected": -86.04535675048828, "loss": 1.7505, "nll_loss": 1.043147087097168, "rewards/accuracies": 1.0, "rewards/chosen": 0.14946213364601135, "rewards/margins": 0.2644813656806946, "rewards/rejected": -0.11501922458410263, "step": 192 }, { "epoch": 0.03216666666666667, "grad_norm": 200.40274047851562, "learning_rate": 1.9999753788290874e-07, "logits/chosen": 2.423832654953003, "logits/rejected": 2.282092332839966, "logps/chosen": -230.8970184326172, "logps/rejected": -73.15625, "loss": 1.6386, "nll_loss": 0.9825403690338135, "rewards/accuracies": 1.0, "rewards/chosen": 0.3273361623287201, "rewards/margins": 0.38605350255966187, "rewards/rejected": -0.05871735140681267, "step": 193 }, { "epoch": 0.03233333333333333, "grad_norm": 119.77589416503906, "learning_rate": 1.999971445287958e-07, "logits/chosen": 3.0629734992980957, "logits/rejected": 2.8624203205108643, "logps/chosen": -47.112491607666016, "logps/rejected": -55.685665130615234, "loss": 1.8291, "nll_loss": 1.0956393480300903, "rewards/accuracies": 1.0, "rewards/chosen": 0.16991348564624786, "rewards/margins": 0.2063644528388977, "rewards/rejected": -0.036450959742069244, "step": 194 }, { "epoch": 0.0325, "grad_norm": 227.929443359375, "learning_rate": 1.9999672203791561e-07, "logits/chosen": 3.272948741912842, "logits/rejected": 3.1113479137420654, "logps/chosen": -205.63211059570312, "logps/rejected": -79.41266632080078, "loss": 2.0385, "nll_loss": 1.2313300371170044, "rewards/accuracies": 1.0, "rewards/chosen": 0.18808594346046448, "rewards/margins": 0.05186691880226135, "rewards/rejected": 0.13621902465820312, "step": 195 }, { "epoch": 0.03266666666666666, "grad_norm": 174.9930877685547, "learning_rate": 1.9999627041039134e-07, "logits/chosen": 2.4192376136779785, "logits/rejected": 2.2033298015594482, "logps/chosen": -128.857666015625, "logps/rejected": -12.126338958740234, "loss": 1.378, "nll_loss": 0.6442884802818298, "rewards/accuracies": 1.0, "rewards/chosen": 0.20987243950366974, "rewards/margins": 0.20656433701515198, "rewards/rejected": 0.0033081055153161287, "step": 196 }, { "epoch": 0.03283333333333333, "grad_norm": 165.29571533203125, "learning_rate": 1.9999578964635453e-07, "logits/chosen": 2.7547619342803955, "logits/rejected": 2.878737688064575, "logps/chosen": -53.944358825683594, "logps/rejected": -166.90713500976562, "loss": 1.4219, "nll_loss": 0.7818023562431335, "rewards/accuracies": 1.0, "rewards/chosen": 0.2651863098144531, "rewards/margins": 0.4236763119697571, "rewards/rejected": -0.15849000215530396, "step": 197 }, { "epoch": 0.033, "grad_norm": 186.7834930419922, "learning_rate": 1.999952797459453e-07, "logits/chosen": 2.5778679847717285, "logits/rejected": 2.898524522781372, "logps/chosen": -34.63543701171875, "logps/rejected": -235.32211303710938, "loss": 1.7507, "nll_loss": 0.9895839095115662, "rewards/accuracies": 1.0, "rewards/chosen": 0.23312263190746307, "rewards/margins": 0.1481708586215973, "rewards/rejected": 0.08495178073644638, "step": 198 }, { "epoch": 0.033166666666666664, "grad_norm": 172.34048461914062, "learning_rate": 1.9999474070931224e-07, "logits/chosen": 2.4529542922973633, "logits/rejected": 2.7514545917510986, "logps/chosen": -69.71153259277344, "logps/rejected": -156.08758544921875, "loss": 1.5706, "nll_loss": 0.8713940382003784, "rewards/accuracies": 1.0, "rewards/chosen": 0.1510360687971115, "rewards/margins": 0.2832443118095398, "rewards/rejected": -0.13220825791358948, "step": 199 }, { "epoch": 0.03333333333333333, "grad_norm": 194.7084503173828, "learning_rate": 1.9999417253661234e-07, "logits/chosen": 2.33841609954834, "logits/rejected": 2.7246739864349365, "logps/chosen": -68.69033813476562, "logps/rejected": -189.30828857421875, "loss": 1.6833, "nll_loss": 0.9812906384468079, "rewards/accuracies": 1.0, "rewards/chosen": 0.11598816514015198, "rewards/margins": 0.2769577205181122, "rewards/rejected": -0.1609695553779602, "step": 200 }, { "epoch": 0.0335, "grad_norm": 153.58377075195312, "learning_rate": 1.9999357522801122e-07, "logits/chosen": 2.4840750694274902, "logits/rejected": 2.1368887424468994, "logps/chosen": -92.15505981445312, "logps/rejected": -56.572696685791016, "loss": 1.8386, "nll_loss": 1.0841771364212036, "rewards/accuracies": 1.0, "rewards/chosen": 0.14174194633960724, "rewards/margins": 0.1607593595981598, "rewards/rejected": -0.019017411395907402, "step": 201 }, { "epoch": 0.033666666666666664, "grad_norm": 386.85260009765625, "learning_rate": 1.9999294878368286e-07, "logits/chosen": 2.411672830581665, "logits/rejected": 2.5730795860290527, "logps/chosen": -38.179931640625, "logps/rejected": -42.00851821899414, "loss": 2.2629, "nll_loss": 1.5271975994110107, "rewards/accuracies": 1.0, "rewards/chosen": -0.007227325811982155, "rewards/margins": 0.20310592651367188, "rewards/rejected": -0.21033325791358948, "step": 202 }, { "epoch": 0.03383333333333333, "grad_norm": 176.54942321777344, "learning_rate": 1.9999229320380982e-07, "logits/chosen": 2.738999366760254, "logits/rejected": 2.5860462188720703, "logps/chosen": -31.15791893005371, "logps/rejected": -24.32840919494629, "loss": 3.0092, "nll_loss": 2.2255656719207764, "rewards/accuracies": 1.0, "rewards/chosen": -0.007356834597885609, "rewards/margins": 0.09897308051586151, "rewards/rejected": -0.10632991790771484, "step": 203 }, { "epoch": 0.034, "grad_norm": 108.76744842529297, "learning_rate": 1.9999160848858318e-07, "logits/chosen": 3.402966260910034, "logits/rejected": 3.320018768310547, "logps/chosen": -57.58336639404297, "logps/rejected": -39.44041442871094, "loss": 1.3755, "nll_loss": 0.7022361159324646, "rewards/accuracies": 1.0, "rewards/chosen": 0.3004436492919922, "rewards/margins": 0.3448108732700348, "rewards/rejected": -0.044367220252752304, "step": 204 }, { "epoch": 0.034166666666666665, "grad_norm": 214.66360473632812, "learning_rate": 1.9999089463820237e-07, "logits/chosen": 2.593994140625, "logits/rejected": 2.7731821537017822, "logps/chosen": -98.40230560302734, "logps/rejected": -230.2439422607422, "loss": 2.5393, "nll_loss": 1.856647253036499, "rewards/accuracies": 1.0, "rewards/chosen": 0.13180923461914062, "rewards/margins": 0.3215996026992798, "rewards/rejected": -0.18979035317897797, "step": 205 }, { "epoch": 0.034333333333333334, "grad_norm": 139.5031280517578, "learning_rate": 1.9999015165287542e-07, "logits/chosen": 3.5146307945251465, "logits/rejected": 3.5452804565429688, "logps/chosen": -48.468597412109375, "logps/rejected": -57.02947235107422, "loss": 1.6988, "nll_loss": 1.0770798921585083, "rewards/accuracies": 1.0, "rewards/chosen": 0.30665817856788635, "rewards/margins": 0.4695785641670227, "rewards/rejected": -0.16292038559913635, "step": 206 }, { "epoch": 0.0345, "grad_norm": 123.22521209716797, "learning_rate": 1.9998937953281877e-07, "logits/chosen": 2.2373247146606445, "logits/rejected": 2.720377206802368, "logps/chosen": -53.54522705078125, "logps/rejected": -116.63272094726562, "loss": 1.2845, "nll_loss": 0.6864772439002991, "rewards/accuracies": 1.0, "rewards/chosen": 0.3502960205078125, "rewards/margins": 0.5306350588798523, "rewards/rejected": -0.180339053273201, "step": 207 }, { "epoch": 0.034666666666666665, "grad_norm": 217.81260681152344, "learning_rate": 1.999885782782575e-07, "logits/chosen": 3.920170307159424, "logits/rejected": 4.02524995803833, "logps/chosen": -80.16388702392578, "logps/rejected": -81.30840301513672, "loss": 2.7362, "nll_loss": 1.8642762899398804, "rewards/accuracies": 0.0, "rewards/chosen": 0.1261650174856186, "rewards/margins": -0.07688598334789276, "rewards/rejected": 0.20305100083351135, "step": 208 }, { "epoch": 0.034833333333333334, "grad_norm": 195.88275146484375, "learning_rate": 1.9998774788942497e-07, "logits/chosen": 2.4019358158111572, "logits/rejected": 2.6555325984954834, "logps/chosen": -51.548583984375, "logps/rejected": -148.92596435546875, "loss": 1.4885, "nll_loss": 0.8887686729431152, "rewards/accuracies": 1.0, "rewards/chosen": 0.2561046779155731, "rewards/margins": 0.5256710052490234, "rewards/rejected": -0.2695663571357727, "step": 209 }, { "epoch": 0.035, "grad_norm": 154.4641571044922, "learning_rate": 1.9998688836656322e-07, "logits/chosen": 2.2931647300720215, "logits/rejected": 2.295586585998535, "logps/chosen": -52.656959533691406, "logps/rejected": -86.2108154296875, "loss": 1.4894, "nll_loss": 0.7978326678276062, "rewards/accuracies": 1.0, "rewards/chosen": 0.3020271360874176, "rewards/margins": 0.3025306761264801, "rewards/rejected": -0.0005035400390625, "step": 210 }, { "epoch": 0.035166666666666666, "grad_norm": 111.22073364257812, "learning_rate": 1.9998599970992264e-07, "logits/chosen": 2.9773123264312744, "logits/rejected": 2.788119077682495, "logps/chosen": -75.45098876953125, "logps/rejected": -16.582643508911133, "loss": 1.6406, "nll_loss": 1.033575177192688, "rewards/accuracies": 1.0, "rewards/chosen": 0.3801506459712982, "rewards/margins": 0.5080529451370239, "rewards/rejected": -0.1279023140668869, "step": 211 }, { "epoch": 0.035333333333333335, "grad_norm": 114.67243194580078, "learning_rate": 1.9998508191976216e-07, "logits/chosen": 2.8370327949523926, "logits/rejected": 2.78326416015625, "logps/chosen": -122.66146850585938, "logps/rejected": -2.8009631633758545, "loss": 1.6854, "nll_loss": 1.0395039319992065, "rewards/accuracies": 1.0, "rewards/chosen": 0.40940627455711365, "rewards/margins": 0.4132680296897888, "rewards/rejected": -0.0038617614191025496, "step": 212 }, { "epoch": 0.0355, "grad_norm": 138.48451232910156, "learning_rate": 1.9998413499634922e-07, "logits/chosen": 3.0686886310577393, "logits/rejected": 3.0612423419952393, "logps/chosen": -94.03397369384766, "logps/rejected": -76.83776092529297, "loss": 1.6523, "nll_loss": 0.9694223999977112, "rewards/accuracies": 1.0, "rewards/chosen": 0.31674885749816895, "rewards/margins": 0.3228248953819275, "rewards/rejected": -0.006076050456613302, "step": 213 }, { "epoch": 0.035666666666666666, "grad_norm": 161.10792541503906, "learning_rate": 1.9998315893995976e-07, "logits/chosen": 2.569852828979492, "logits/rejected": 2.3053951263427734, "logps/chosen": -61.691558837890625, "logps/rejected": -93.7528076171875, "loss": 1.3329, "nll_loss": 0.6854618787765503, "rewards/accuracies": 1.0, "rewards/chosen": 0.47387391328811646, "rewards/margins": 0.4126853942871094, "rewards/rejected": 0.061188507825136185, "step": 214 }, { "epoch": 0.035833333333333335, "grad_norm": 221.76776123046875, "learning_rate": 1.9998215375087813e-07, "logits/chosen": 2.239941120147705, "logits/rejected": 0.9291749596595764, "logps/chosen": -331.7875061035156, "logps/rejected": -115.06280517578125, "loss": 1.502, "nll_loss": 0.8991531133651733, "rewards/accuracies": 1.0, "rewards/chosen": 0.29228517413139343, "rewards/margins": 0.517529308795929, "rewards/rejected": -0.22524414956569672, "step": 215 }, { "epoch": 0.036, "grad_norm": 230.6516876220703, "learning_rate": 1.9998111942939727e-07, "logits/chosen": 3.861752986907959, "logits/rejected": 3.882394313812256, "logps/chosen": -56.355934143066406, "logps/rejected": -93.54817962646484, "loss": 3.5249, "nll_loss": 2.8177967071533203, "rewards/accuracies": 1.0, "rewards/chosen": 0.2417289912700653, "rewards/margins": 0.26586076617240906, "rewards/rejected": -0.0241317767649889, "step": 216 }, { "epoch": 0.036166666666666666, "grad_norm": 164.20692443847656, "learning_rate": 1.999800559758185e-07, "logits/chosen": 0.9181820750236511, "logits/rejected": 2.3053011894226074, "logps/chosen": -102.84687805175781, "logps/rejected": -345.95037841796875, "loss": 1.4538, "nll_loss": 0.9985134601593018, "rewards/accuracies": 1.0, "rewards/chosen": 0.6090362668037415, "rewards/margins": 0.9507660269737244, "rewards/rejected": -0.3417297601699829, "step": 217 }, { "epoch": 0.036333333333333336, "grad_norm": 328.2340393066406, "learning_rate": 1.9997896339045171e-07, "logits/chosen": 2.172964572906494, "logits/rejected": 2.3851382732391357, "logps/chosen": -248.05575561523438, "logps/rejected": -249.75088500976562, "loss": 1.9227, "nll_loss": 1.15374755859375, "rewards/accuracies": 1.0, "rewards/chosen": -0.06391754001379013, "rewards/margins": 0.13183745741844177, "rewards/rejected": -0.1957550048828125, "step": 218 }, { "epoch": 0.0365, "grad_norm": 231.0224151611328, "learning_rate": 1.9997784167361523e-07, "logits/chosen": 2.4957966804504395, "logits/rejected": 2.806208372116089, "logps/chosen": -28.559864044189453, "logps/rejected": -103.24486541748047, "loss": 2.7145, "nll_loss": 1.9039908647537231, "rewards/accuracies": 1.0, "rewards/chosen": -0.0056594847701489925, "rewards/margins": 0.04329300671815872, "rewards/rejected": -0.04895249009132385, "step": 219 }, { "epoch": 0.03666666666666667, "grad_norm": 280.4942626953125, "learning_rate": 1.9997669082563595e-07, "logits/chosen": 2.6029129028320312, "logits/rejected": 2.9084396362304688, "logps/chosen": -127.95736694335938, "logps/rejected": -283.6067810058594, "loss": 2.0391, "nll_loss": 1.2423045635223389, "rewards/accuracies": 1.0, "rewards/chosen": 0.22555695474147797, "rewards/margins": 0.07397308945655823, "rewards/rejected": 0.15158386528491974, "step": 220 }, { "epoch": 0.036833333333333336, "grad_norm": 143.58840942382812, "learning_rate": 1.9997551084684918e-07, "logits/chosen": 2.1269190311431885, "logits/rejected": 2.234853744506836, "logps/chosen": -68.09034729003906, "logps/rejected": -93.92404174804688, "loss": 1.5984, "nll_loss": 0.9456993937492371, "rewards/accuracies": 1.0, "rewards/chosen": 0.2410019040107727, "rewards/margins": 0.3927902579307556, "rewards/rejected": -0.15178833901882172, "step": 221 }, { "epoch": 0.037, "grad_norm": 230.7613067626953, "learning_rate": 1.9997430173759872e-07, "logits/chosen": 2.9444046020507812, "logits/rejected": 2.9431772232055664, "logps/chosen": -25.633136749267578, "logps/rejected": -110.39595031738281, "loss": 2.8548, "nll_loss": 2.1360950469970703, "rewards/accuracies": 1.0, "rewards/chosen": 0.12360382080078125, "rewards/margins": 0.23907470703125, "rewards/rejected": -0.11547088623046875, "step": 222 }, { "epoch": 0.03716666666666667, "grad_norm": 177.8938751220703, "learning_rate": 1.999730634982369e-07, "logits/chosen": 2.279269218444824, "logits/rejected": 2.05206036567688, "logps/chosen": -214.73330688476562, "logps/rejected": -174.314208984375, "loss": 1.44, "nll_loss": 0.876462459564209, "rewards/accuracies": 1.0, "rewards/chosen": 0.5378952026367188, "rewards/margins": 0.6276885867118835, "rewards/rejected": -0.08979339897632599, "step": 223 }, { "epoch": 0.037333333333333336, "grad_norm": 148.08241271972656, "learning_rate": 1.9997179612912452e-07, "logits/chosen": 3.2244856357574463, "logits/rejected": 3.1315200328826904, "logps/chosen": -60.25410079956055, "logps/rejected": -18.035621643066406, "loss": 1.5644, "nll_loss": 0.9564144611358643, "rewards/accuracies": 1.0, "rewards/chosen": 0.21587295830249786, "rewards/margins": 0.5047421455383301, "rewards/rejected": -0.2888692021369934, "step": 224 }, { "epoch": 0.0375, "grad_norm": 216.64547729492188, "learning_rate": 1.999704996306308e-07, "logits/chosen": 3.3778586387634277, "logits/rejected": 3.2372207641601562, "logps/chosen": -89.43605041503906, "logps/rejected": -121.13446044921875, "loss": 1.6885, "nll_loss": 0.9220211505889893, "rewards/accuracies": 1.0, "rewards/chosen": 0.4376968443393707, "rewards/margins": 0.1467239260673523, "rewards/rejected": 0.29097291827201843, "step": 225 }, { "epoch": 0.03766666666666667, "grad_norm": 116.67481231689453, "learning_rate": 1.9996917400313355e-07, "logits/chosen": 2.6217262744903564, "logits/rejected": 2.610738754272461, "logps/chosen": -62.986412048339844, "logps/rejected": -38.18379592895508, "loss": 1.4216, "nll_loss": 0.8180053234100342, "rewards/accuracies": 1.0, "rewards/chosen": 0.3833732604980469, "rewards/margins": 0.5168007016181946, "rewards/rejected": -0.1334274262189865, "step": 226 }, { "epoch": 0.03783333333333333, "grad_norm": 215.63746643066406, "learning_rate": 1.9996781924701907e-07, "logits/chosen": 2.598376750946045, "logits/rejected": 2.7211477756500244, "logps/chosen": -44.55501174926758, "logps/rejected": -219.89181518554688, "loss": 1.8508, "nll_loss": 1.204189419746399, "rewards/accuracies": 1.0, "rewards/chosen": 0.04154396057128906, "rewards/margins": 0.411929726600647, "rewards/rejected": -0.3703857660293579, "step": 227 }, { "epoch": 0.038, "grad_norm": 137.6041717529297, "learning_rate": 1.99966435362682e-07, "logits/chosen": 1.589249610900879, "logits/rejected": 2.0979018211364746, "logps/chosen": -50.02370071411133, "logps/rejected": -144.1227264404297, "loss": 1.3127, "nll_loss": 0.7940269112586975, "rewards/accuracies": 1.0, "rewards/chosen": 0.6617023944854736, "rewards/margins": 0.7595905661582947, "rewards/rejected": -0.09788818657398224, "step": 228 }, { "epoch": 0.03816666666666667, "grad_norm": 193.49337768554688, "learning_rate": 1.999650223505257e-07, "logits/chosen": 2.9302310943603516, "logits/rejected": 3.0956594944000244, "logps/chosen": -48.30166244506836, "logps/rejected": -96.32463073730469, "loss": 2.3108, "nll_loss": 1.3800474405288696, "rewards/accuracies": 0.0, "rewards/chosen": -0.0011524200672283769, "rewards/margins": -0.18945962190628052, "rewards/rejected": 0.18830719590187073, "step": 229 }, { "epoch": 0.03833333333333333, "grad_norm": 157.9708251953125, "learning_rate": 1.9996358021096172e-07, "logits/chosen": 2.117868661880493, "logits/rejected": 2.5862584114074707, "logps/chosen": -75.86122131347656, "logps/rejected": -94.6449966430664, "loss": 1.6219, "nll_loss": 0.9139904975891113, "rewards/accuracies": 1.0, "rewards/chosen": 0.34158021211624146, "rewards/margins": 0.26684877276420593, "rewards/rejected": 0.07473144680261612, "step": 230 }, { "epoch": 0.0385, "grad_norm": 161.79888916015625, "learning_rate": 1.9996210894441044e-07, "logits/chosen": 3.1587841510772705, "logits/rejected": 3.159788131713867, "logps/chosen": -40.09031677246094, "logps/rejected": -14.768795013427734, "loss": 3.0903, "nll_loss": 2.1100165843963623, "rewards/accuracies": 0.0, "rewards/chosen": -0.05274009704589844, "rewards/margins": -0.2791265845298767, "rewards/rejected": 0.22638647258281708, "step": 231 }, { "epoch": 0.03866666666666667, "grad_norm": 215.8321533203125, "learning_rate": 1.9996060855130045e-07, "logits/chosen": 2.3413424491882324, "logits/rejected": 2.913527011871338, "logps/chosen": -53.46649932861328, "logps/rejected": -157.56552124023438, "loss": 2.0121, "nll_loss": 1.1881444454193115, "rewards/accuracies": 1.0, "rewards/chosen": -0.048615265637636185, "rewards/margins": 0.016460422426462173, "rewards/rejected": -0.06507568806409836, "step": 232 }, { "epoch": 0.03883333333333333, "grad_norm": 162.57810974121094, "learning_rate": 1.9995907903206898e-07, "logits/chosen": 2.7528252601623535, "logits/rejected": 2.830514669418335, "logps/chosen": -71.80857849121094, "logps/rejected": -144.40493774414062, "loss": 1.4141, "nll_loss": 0.8160064816474915, "rewards/accuracies": 1.0, "rewards/chosen": 0.15679551661014557, "rewards/margins": 0.5321769714355469, "rewards/rejected": -0.3753814697265625, "step": 233 }, { "epoch": 0.039, "grad_norm": 155.2031707763672, "learning_rate": 1.9995752038716164e-07, "logits/chosen": 2.9418141841888428, "logits/rejected": 3.0159757137298584, "logps/chosen": -96.52220153808594, "logps/rejected": -230.4770050048828, "loss": 1.5128, "nll_loss": 1.0054396390914917, "rewards/accuracies": 1.0, "rewards/chosen": 0.4782348573207855, "rewards/margins": 0.7843551635742188, "rewards/rejected": -0.3061203062534332, "step": 234 }, { "epoch": 0.03916666666666667, "grad_norm": 162.75753784179688, "learning_rate": 1.999559326170326e-07, "logits/chosen": 3.259455919265747, "logits/rejected": 3.2420856952667236, "logps/chosen": -169.01002502441406, "logps/rejected": -113.74189758300781, "loss": 2.0664, "nll_loss": 1.4445301294326782, "rewards/accuracies": 1.0, "rewards/chosen": 0.3582046627998352, "rewards/margins": 0.4699081480503082, "rewards/rejected": -0.11170349270105362, "step": 235 }, { "epoch": 0.03933333333333333, "grad_norm": 222.65139770507812, "learning_rate": 1.9995431572214453e-07, "logits/chosen": 2.8730878829956055, "logits/rejected": 3.3446221351623535, "logps/chosen": -90.85618591308594, "logps/rejected": -157.0183868408203, "loss": 2.5839, "nll_loss": 1.8928371667861938, "rewards/accuracies": 1.0, "rewards/chosen": 0.06883849948644638, "rewards/margins": 0.3033126890659332, "rewards/rejected": -0.23447418212890625, "step": 236 }, { "epoch": 0.0395, "grad_norm": 127.49321746826172, "learning_rate": 1.9995266970296853e-07, "logits/chosen": 2.7456605434417725, "logits/rejected": 2.6235337257385254, "logps/chosen": -62.342315673828125, "logps/rejected": -21.451162338256836, "loss": 1.5216, "nll_loss": 0.9740986824035645, "rewards/accuracies": 1.0, "rewards/chosen": 0.5268226861953735, "rewards/margins": 0.6707766056060791, "rewards/rejected": -0.14395390450954437, "step": 237 }, { "epoch": 0.03966666666666667, "grad_norm": 136.60769653320312, "learning_rate": 1.9995099455998422e-07, "logits/chosen": 2.959263563156128, "logits/rejected": 3.036580801010132, "logps/chosen": -96.14225769042969, "logps/rejected": -114.06158447265625, "loss": 1.595, "nll_loss": 1.0565085411071777, "rewards/accuracies": 1.0, "rewards/chosen": 0.4662437438964844, "rewards/margins": 0.6938080191612244, "rewards/rejected": -0.2275642603635788, "step": 238 }, { "epoch": 0.03983333333333333, "grad_norm": 148.75881958007812, "learning_rate": 1.9994929029367967e-07, "logits/chosen": 2.5913214683532715, "logits/rejected": 2.3422608375549316, "logps/chosen": -67.56815338134766, "logps/rejected": -30.925708770751953, "loss": 2.3876, "nll_loss": 1.8261662721633911, "rewards/accuracies": 1.0, "rewards/chosen": 0.2114921659231186, "rewards/margins": 0.6303858160972595, "rewards/rejected": -0.4188936650753021, "step": 239 }, { "epoch": 0.04, "grad_norm": 205.3869171142578, "learning_rate": 1.999475569045515e-07, "logits/chosen": 2.617767810821533, "logits/rejected": 3.0386364459991455, "logps/chosen": -97.75382232666016, "logps/rejected": -465.83709716796875, "loss": 1.5068, "nll_loss": 0.8728019595146179, "rewards/accuracies": 1.0, "rewards/chosen": 0.4410568177700043, "rewards/margins": 0.4429580569267273, "rewards/rejected": -0.0019012450939044356, "step": 240 }, { "epoch": 0.04016666666666667, "grad_norm": 233.8216094970703, "learning_rate": 1.9994579439310474e-07, "logits/chosen": 2.4774413108825684, "logits/rejected": 2.58439040184021, "logps/chosen": -46.39244079589844, "logps/rejected": -233.68675231933594, "loss": 2.2721, "nll_loss": 1.4058319330215454, "rewards/accuracies": 0.0, "rewards/chosen": 0.059545136988162994, "rewards/margins": -0.06757431477308273, "rewards/rejected": 0.12711945176124573, "step": 241 }, { "epoch": 0.04033333333333333, "grad_norm": 127.90225219726562, "learning_rate": 1.9994400275985294e-07, "logits/chosen": 2.8753113746643066, "logits/rejected": 3.0040926933288574, "logps/chosen": -17.36386489868164, "logps/rejected": -83.25592041015625, "loss": 1.0896, "nll_loss": 0.48232951760292053, "rewards/accuracies": 1.0, "rewards/chosen": 0.37883782386779785, "rewards/margins": 0.5071505308151245, "rewards/rejected": -0.12831269204616547, "step": 242 }, { "epoch": 0.0405, "grad_norm": 163.6046142578125, "learning_rate": 1.999421820053182e-07, "logits/chosen": 2.145427703857422, "logits/rejected": 3.2411718368530273, "logps/chosen": -62.15194320678711, "logps/rejected": -248.59800720214844, "loss": 1.619, "nll_loss": 1.018884301185608, "rewards/accuracies": 1.0, "rewards/chosen": 0.580264687538147, "rewards/margins": 0.5346637964248657, "rewards/rejected": 0.04560089111328125, "step": 243 }, { "epoch": 0.04066666666666666, "grad_norm": 176.5882568359375, "learning_rate": 1.99940332130031e-07, "logits/chosen": 3.2477681636810303, "logits/rejected": 3.4291203022003174, "logps/chosen": -50.3884162902832, "logps/rejected": -182.60284423828125, "loss": 1.5522, "nll_loss": 0.854040801525116, "rewards/accuracies": 1.0, "rewards/chosen": 0.281747430562973, "rewards/margins": 0.28677672147750854, "rewards/rejected": -0.005029297433793545, "step": 244 }, { "epoch": 0.04083333333333333, "grad_norm": 192.9368896484375, "learning_rate": 1.9993845313453038e-07, "logits/chosen": 3.1344733238220215, "logits/rejected": 3.16422963142395, "logps/chosen": -50.47273254394531, "logps/rejected": -139.99545288085938, "loss": 1.599, "nll_loss": 0.9896615147590637, "rewards/accuracies": 1.0, "rewards/chosen": 0.3628990352153778, "rewards/margins": 0.5016960501670837, "rewards/rejected": -0.13879700005054474, "step": 245 }, { "epoch": 0.041, "grad_norm": 131.6648712158203, "learning_rate": 1.9993654501936377e-07, "logits/chosen": 2.1505751609802246, "logits/rejected": 2.579749345779419, "logps/chosen": -34.40559768676758, "logps/rejected": -52.20121765136719, "loss": 1.5245, "nll_loss": 0.6881120204925537, "rewards/accuracies": 1.0, "rewards/chosen": 0.4391513764858246, "rewards/margins": 0.006831318140029907, "rewards/rejected": 0.4323200583457947, "step": 246 }, { "epoch": 0.041166666666666664, "grad_norm": 149.65390014648438, "learning_rate": 1.999346077850872e-07, "logits/chosen": 2.954211711883545, "logits/rejected": 2.9079651832580566, "logps/chosen": -111.34147644042969, "logps/rejected": -89.78364562988281, "loss": 2.009, "nll_loss": 1.4459930658340454, "rewards/accuracies": 1.0, "rewards/chosen": 0.4965873956680298, "rewards/margins": 0.6268715262413025, "rewards/rejected": -0.1302841156721115, "step": 247 }, { "epoch": 0.04133333333333333, "grad_norm": 119.60041809082031, "learning_rate": 1.9993264143226512e-07, "logits/chosen": 3.3861374855041504, "logits/rejected": 3.4040658473968506, "logps/chosen": -57.88346481323242, "logps/rejected": -83.68788146972656, "loss": 1.3653, "nll_loss": 0.7420957684516907, "rewards/accuracies": 1.0, "rewards/chosen": 0.46401214599609375, "rewards/margins": 0.4702621400356293, "rewards/rejected": -0.006250000558793545, "step": 248 }, { "epoch": 0.0415, "grad_norm": 307.68798828125, "learning_rate": 1.999306459614705e-07, "logits/chosen": 2.73176646232605, "logits/rejected": 2.781907081604004, "logps/chosen": -138.2235107421875, "logps/rejected": -151.3164825439453, "loss": 2.2175, "nll_loss": 1.2341383695602417, "rewards/accuracies": 0.0, "rewards/chosen": -0.4439941346645355, "rewards/margins": -0.2748351991176605, "rewards/rejected": -0.169158935546875, "step": 249 }, { "epoch": 0.041666666666666664, "grad_norm": 108.97005462646484, "learning_rate": 1.999286213732847e-07, "logits/chosen": 3.151655435562134, "logits/rejected": 3.1073176860809326, "logps/chosen": -79.28973388671875, "logps/rejected": -8.801301956176758, "loss": 1.3141, "nll_loss": 0.843507707118988, "rewards/accuracies": 1.0, "rewards/chosen": 0.7968529462814331, "rewards/margins": 0.9131712913513184, "rewards/rejected": -0.11631832271814346, "step": 250 }, { "epoch": 0.041833333333333333, "grad_norm": 200.6004180908203, "learning_rate": 1.9992656766829775e-07, "logits/chosen": 3.069796562194824, "logits/rejected": 3.291718006134033, "logps/chosen": -30.433713912963867, "logps/rejected": -142.494140625, "loss": 1.7407, "nll_loss": 1.0144572257995605, "rewards/accuracies": 1.0, "rewards/chosen": 0.18755702674388885, "rewards/margins": 0.2223852127790451, "rewards/rejected": -0.03482818603515625, "step": 251 }, { "epoch": 0.042, "grad_norm": 177.6026611328125, "learning_rate": 1.9992448484710796e-07, "logits/chosen": 1.6591688394546509, "logits/rejected": 2.7367918491363525, "logps/chosen": -80.24656677246094, "logps/rejected": -263.564453125, "loss": 1.6529, "nll_loss": 1.0421631336212158, "rewards/accuracies": 1.0, "rewards/chosen": 0.31733018159866333, "rewards/margins": 0.4972343444824219, "rewards/rejected": -0.17990417778491974, "step": 252 }, { "epoch": 0.042166666666666665, "grad_norm": 194.4388427734375, "learning_rate": 1.9992237291032222e-07, "logits/chosen": 1.338205099105835, "logits/rejected": 2.2182819843292236, "logps/chosen": -126.05793762207031, "logps/rejected": -275.8018798828125, "loss": 1.5985, "nll_loss": 1.0682876110076904, "rewards/accuracies": 1.0, "rewards/chosen": 0.6874512434005737, "rewards/margins": 0.7281098365783691, "rewards/rejected": -0.04065857082605362, "step": 253 }, { "epoch": 0.042333333333333334, "grad_norm": 123.48579406738281, "learning_rate": 1.9992023185855598e-07, "logits/chosen": 2.961251735687256, "logits/rejected": 2.803619146347046, "logps/chosen": -163.78379821777344, "logps/rejected": -75.27043151855469, "loss": 1.2506, "nll_loss": 0.7874221801757812, "rewards/accuracies": 1.0, "rewards/chosen": 0.7082901000976562, "rewards/margins": 0.9294479489326477, "rewards/rejected": -0.22115783393383026, "step": 254 }, { "epoch": 0.0425, "grad_norm": 115.94007873535156, "learning_rate": 1.9991806169243298e-07, "logits/chosen": 2.965787172317505, "logits/rejected": 3.147144079208374, "logps/chosen": -57.239784240722656, "logps/rejected": -180.13589477539062, "loss": 1.0589, "nll_loss": 0.7154973149299622, "rewards/accuracies": 1.0, "rewards/chosen": 0.7863747477531433, "rewards/margins": 1.3752785921096802, "rewards/rejected": -0.5889038443565369, "step": 255 }, { "epoch": 0.042666666666666665, "grad_norm": 139.3114013671875, "learning_rate": 1.9991586241258562e-07, "logits/chosen": 2.417722463607788, "logits/rejected": 1.7152292728424072, "logps/chosen": -148.74388122558594, "logps/rejected": -99.19156646728516, "loss": 1.2888, "nll_loss": 0.7954220175743103, "rewards/accuracies": 1.0, "rewards/chosen": 0.8373245596885681, "rewards/margins": 0.8474975824356079, "rewards/rejected": -0.01017303578555584, "step": 256 }, { "epoch": 0.042833333333333334, "grad_norm": 146.1566162109375, "learning_rate": 1.9991363401965473e-07, "logits/chosen": 2.391545534133911, "logits/rejected": 2.8395190238952637, "logps/chosen": -49.16947555541992, "logps/rejected": -127.63742065429688, "loss": 1.9014, "nll_loss": 1.2939335107803345, "rewards/accuracies": 1.0, "rewards/chosen": 0.36980971693992615, "rewards/margins": 0.5063537359237671, "rewards/rejected": -0.13654403388500214, "step": 257 }, { "epoch": 0.043, "grad_norm": 136.58680725097656, "learning_rate": 1.9991137651428956e-07, "logits/chosen": 3.2054665088653564, "logits/rejected": 3.3094699382781982, "logps/chosen": -112.64015197753906, "logps/rejected": -271.91497802734375, "loss": 1.4684, "nll_loss": 1.072763204574585, "rewards/accuracies": 1.0, "rewards/chosen": 0.9655144214630127, "rewards/margins": 1.181093692779541, "rewards/rejected": -0.21557924151420593, "step": 258 }, { "epoch": 0.043166666666666666, "grad_norm": 150.3721466064453, "learning_rate": 1.9990908989714792e-07, "logits/chosen": 2.9980006217956543, "logits/rejected": 3.0791707038879395, "logps/chosen": -34.720794677734375, "logps/rejected": -12.362446784973145, "loss": 2.7627, "nll_loss": 2.0423998832702637, "rewards/accuracies": 1.0, "rewards/chosen": 0.1391860991716385, "rewards/margins": 0.2354201376438141, "rewards/rejected": -0.0962340384721756, "step": 259 }, { "epoch": 0.043333333333333335, "grad_norm": 135.27984619140625, "learning_rate": 1.9990677416889606e-07, "logits/chosen": 2.404792547225952, "logits/rejected": 2.693066120147705, "logps/chosen": -67.29322814941406, "logps/rejected": -157.2587127685547, "loss": 1.3976, "nll_loss": 0.8411653637886047, "rewards/accuracies": 1.0, "rewards/chosen": 0.3928665518760681, "rewards/margins": 0.6421875357627869, "rewards/rejected": -0.24932099878787994, "step": 260 }, { "epoch": 0.0435, "grad_norm": 166.3841552734375, "learning_rate": 1.9990442933020874e-07, "logits/chosen": 2.6597542762756348, "logits/rejected": 2.532191038131714, "logps/chosen": -211.61648559570312, "logps/rejected": -118.61386108398438, "loss": 1.5172, "nll_loss": 1.032275676727295, "rewards/accuracies": 1.0, "rewards/chosen": 0.6590698957443237, "rewards/margins": 0.8584892749786377, "rewards/rejected": -0.19941940903663635, "step": 261 }, { "epoch": 0.043666666666666666, "grad_norm": 149.97097778320312, "learning_rate": 1.999020553817692e-07, "logits/chosen": 3.0388340950012207, "logits/rejected": 2.869393825531006, "logps/chosen": -55.53657913208008, "logps/rejected": -12.645319938659668, "loss": 1.8303, "nll_loss": 1.133399486541748, "rewards/accuracies": 1.0, "rewards/chosen": 0.21759149432182312, "rewards/margins": 0.288436621427536, "rewards/rejected": -0.07084512710571289, "step": 262 }, { "epoch": 0.043833333333333335, "grad_norm": 146.5283203125, "learning_rate": 1.9989965232426916e-07, "logits/chosen": 2.9178335666656494, "logits/rejected": 3.1878468990325928, "logps/chosen": -48.3065299987793, "logps/rejected": -231.97738647460938, "loss": 1.1544, "nll_loss": 0.7791374921798706, "rewards/accuracies": 1.0, "rewards/chosen": 0.3420223295688629, "rewards/margins": 1.2606197595596313, "rewards/rejected": -0.9185974597930908, "step": 263 }, { "epoch": 0.044, "grad_norm": 220.4069061279297, "learning_rate": 1.9989722015840878e-07, "logits/chosen": 3.229496479034424, "logits/rejected": 3.2788474559783936, "logps/chosen": -107.78274536132812, "logps/rejected": -269.2496643066406, "loss": 1.8264, "nll_loss": 1.1715515851974487, "rewards/accuracies": 1.0, "rewards/chosen": 0.583514392375946, "rewards/margins": 0.40131527185440063, "rewards/rejected": 0.18219910562038422, "step": 264 }, { "epoch": 0.04416666666666667, "grad_norm": 160.9208984375, "learning_rate": 1.9989475888489672e-07, "logits/chosen": 2.55778169631958, "logits/rejected": 2.5134127140045166, "logps/chosen": -52.36540985107422, "logps/rejected": -69.37086486816406, "loss": 2.2921, "nll_loss": 1.689206600189209, "rewards/accuracies": 1.0, "rewards/chosen": 0.40543824434280396, "rewards/margins": 0.5188583731651306, "rewards/rejected": -0.11342010647058487, "step": 265 }, { "epoch": 0.044333333333333336, "grad_norm": 162.1368865966797, "learning_rate": 1.9989226850445017e-07, "logits/chosen": 1.2828750610351562, "logits/rejected": 2.2940189838409424, "logps/chosen": -98.517333984375, "logps/rejected": -228.2860565185547, "loss": 1.5689, "nll_loss": 1.0826083421707153, "rewards/accuracies": 1.0, "rewards/chosen": 0.5960060358047485, "rewards/margins": 0.8511894941329956, "rewards/rejected": -0.2551834285259247, "step": 266 }, { "epoch": 0.0445, "grad_norm": 170.64169311523438, "learning_rate": 1.998897490177948e-07, "logits/chosen": 2.7609686851501465, "logits/rejected": 1.9254100322723389, "logps/chosen": -69.78482055664062, "logps/rejected": -24.050527572631836, "loss": 1.6886, "nll_loss": 1.0736125707626343, "rewards/accuracies": 1.0, "rewards/chosen": 0.3195221424102783, "rewards/margins": 0.4865885376930237, "rewards/rejected": -0.16706638038158417, "step": 267 }, { "epoch": 0.04466666666666667, "grad_norm": 182.87156677246094, "learning_rate": 1.9988720042566466e-07, "logits/chosen": 1.896323800086975, "logits/rejected": 2.834848165512085, "logps/chosen": -96.79179382324219, "logps/rejected": -281.8890380859375, "loss": 1.5478, "nll_loss": 0.8642125129699707, "rewards/accuracies": 1.0, "rewards/chosen": 0.36683350801467896, "rewards/margins": 0.3221893310546875, "rewards/rejected": 0.04464416950941086, "step": 268 }, { "epoch": 0.044833333333333336, "grad_norm": 220.34190368652344, "learning_rate": 1.998846227288024e-07, "logits/chosen": 2.57102108001709, "logits/rejected": 2.9065489768981934, "logps/chosen": -31.010866165161133, "logps/rejected": -332.7291259765625, "loss": 1.0837, "nll_loss": 0.689130425453186, "rewards/accuracies": 1.0, "rewards/chosen": 0.5108698010444641, "rewards/margins": 1.1678247451782227, "rewards/rejected": -0.6569549441337585, "step": 269 }, { "epoch": 0.045, "grad_norm": 234.33889770507812, "learning_rate": 1.9988201592795905e-07, "logits/chosen": 3.1155593395233154, "logits/rejected": 3.0441441535949707, "logps/chosen": -82.92965698242188, "logps/rejected": -24.729469299316406, "loss": 2.718, "nll_loss": 1.658592939376831, "rewards/accuracies": 0.0, "rewards/chosen": -0.3784500062465668, "rewards/margins": -0.41260698437690735, "rewards/rejected": 0.03415698930621147, "step": 270 }, { "epoch": 0.04516666666666667, "grad_norm": 127.02069854736328, "learning_rate": 1.998793800238942e-07, "logits/chosen": 3.1742992401123047, "logits/rejected": 3.2938387393951416, "logps/chosen": -79.6528549194336, "logps/rejected": -197.45840454101562, "loss": 1.1102, "nll_loss": 0.7658929228782654, "rewards/accuracies": 1.0, "rewards/chosen": 0.5033653378486633, "rewards/margins": 1.3803398609161377, "rewards/rejected": -0.8769745230674744, "step": 271 }, { "epoch": 0.04533333333333334, "grad_norm": 143.4978485107422, "learning_rate": 1.998767150173759e-07, "logits/chosen": 2.6812591552734375, "logits/rejected": 2.7863993644714355, "logps/chosen": -79.69171905517578, "logps/rejected": -99.90152740478516, "loss": 1.2067, "nll_loss": 0.7589688301086426, "rewards/accuracies": 1.0, "rewards/chosen": 0.6300209760665894, "rewards/margins": 0.9761627912521362, "rewards/rejected": -0.3461418151855469, "step": 272 }, { "epoch": 0.0455, "grad_norm": 164.6746063232422, "learning_rate": 1.9987402090918065e-07, "logits/chosen": 2.9573352336883545, "logits/rejected": 3.204045534133911, "logps/chosen": -126.87345123291016, "logps/rejected": -208.5772705078125, "loss": 1.6122, "nll_loss": 1.1032472848892212, "rewards/accuracies": 1.0, "rewards/chosen": 0.636115312576294, "rewards/margins": 0.7851113080978394, "rewards/rejected": -0.14899598062038422, "step": 273 }, { "epoch": 0.04566666666666667, "grad_norm": 203.7967529296875, "learning_rate": 1.9987129770009348e-07, "logits/chosen": 1.3553879261016846, "logits/rejected": 1.9691200256347656, "logps/chosen": -49.49406814575195, "logps/rejected": -133.06124877929688, "loss": 2.2867, "nll_loss": 1.5965828895568848, "rewards/accuracies": 1.0, "rewards/chosen": 0.3045368194580078, "rewards/margins": 0.3054996430873871, "rewards/rejected": -0.0009628296829760075, "step": 274 }, { "epoch": 0.04583333333333333, "grad_norm": 150.80946350097656, "learning_rate": 1.9986854539090781e-07, "logits/chosen": 2.3577399253845215, "logits/rejected": 2.3963048458099365, "logps/chosen": -16.74890899658203, "logps/rejected": -179.42005920410156, "loss": 1.2494, "nll_loss": 0.6978710293769836, "rewards/accuracies": 1.0, "rewards/chosen": 0.3425472676753998, "rewards/margins": 0.6558666825294495, "rewards/rejected": -0.3133194148540497, "step": 275 }, { "epoch": 0.046, "grad_norm": 167.7863006591797, "learning_rate": 1.9986576398242562e-07, "logits/chosen": 3.408329725265503, "logits/rejected": 3.694420099258423, "logps/chosen": -92.3779525756836, "logps/rejected": -105.59661865234375, "loss": 2.2711, "nll_loss": 1.649606466293335, "rewards/accuracies": 1.0, "rewards/chosen": 0.20937120914459229, "rewards/margins": 0.4704796075820923, "rewards/rejected": -0.2611083984375, "step": 276 }, { "epoch": 0.04616666666666667, "grad_norm": 189.12139892578125, "learning_rate": 1.9986295347545736e-07, "logits/chosen": 2.7288568019866943, "logits/rejected": 2.995051622390747, "logps/chosen": -46.380611419677734, "logps/rejected": -242.56863403320312, "loss": 1.5912, "nll_loss": 1.0082741975784302, "rewards/accuracies": 1.0, "rewards/chosen": 0.22612038254737854, "rewards/margins": 0.5713138580322266, "rewards/rejected": -0.345193475484848, "step": 277 }, { "epoch": 0.04633333333333333, "grad_norm": 129.56991577148438, "learning_rate": 1.9986011387082195e-07, "logits/chosen": 1.7504082918167114, "logits/rejected": 1.5612655878067017, "logps/chosen": -163.50286865234375, "logps/rejected": -124.85884857177734, "loss": 1.2171, "nll_loss": 0.6673586964607239, "rewards/accuracies": 1.0, "rewards/chosen": 0.48803406953811646, "rewards/margins": 0.662609875202179, "rewards/rejected": -0.1745758056640625, "step": 278 }, { "epoch": 0.0465, "grad_norm": 157.24595642089844, "learning_rate": 1.9985724516934677e-07, "logits/chosen": 2.5407488346099854, "logits/rejected": 2.4931154251098633, "logps/chosen": -67.1334228515625, "logps/rejected": -24.820110321044922, "loss": 3.1612, "nll_loss": 2.582054615020752, "rewards/accuracies": 1.0, "rewards/chosen": 0.37004244327545166, "rewards/margins": 0.5802463889122009, "rewards/rejected": -0.21020394563674927, "step": 279 }, { "epoch": 0.04666666666666667, "grad_norm": 97.13182830810547, "learning_rate": 1.9985434737186769e-07, "logits/chosen": 2.5957722663879395, "logits/rejected": 2.544113874435425, "logps/chosen": -41.179874420166016, "logps/rejected": -50.886932373046875, "loss": 1.0676, "nll_loss": 0.6641915440559387, "rewards/accuracies": 1.0, "rewards/chosen": 0.6483616828918457, "rewards/margins": 1.132458209991455, "rewards/rejected": -0.4840965270996094, "step": 280 }, { "epoch": 0.04683333333333333, "grad_norm": 84.76709747314453, "learning_rate": 1.9985142047922906e-07, "logits/chosen": 2.3824424743652344, "logits/rejected": 2.0484066009521484, "logps/chosen": -18.100482940673828, "logps/rejected": -19.94441032409668, "loss": 1.3276, "nll_loss": 0.6961724162101746, "rewards/accuracies": 1.0, "rewards/chosen": 0.3941909968852997, "rewards/margins": 0.4468996226787567, "rewards/rejected": -0.05270862579345703, "step": 281 }, { "epoch": 0.047, "grad_norm": 204.18658447265625, "learning_rate": 1.9984846449228368e-07, "logits/chosen": 3.519028425216675, "logits/rejected": 3.395259380340576, "logps/chosen": -211.5786590576172, "logps/rejected": -137.09512329101562, "loss": 2.528, "nll_loss": 1.8083646297454834, "rewards/accuracies": 1.0, "rewards/chosen": -0.01966094970703125, "rewards/margins": 0.24060669541358948, "rewards/rejected": -0.2602676451206207, "step": 282 }, { "epoch": 0.04716666666666667, "grad_norm": 118.96526336669922, "learning_rate": 1.9984547941189292e-07, "logits/chosen": 2.535226821899414, "logits/rejected": 2.9023094177246094, "logps/chosen": -38.75307083129883, "logps/rejected": -210.61965942382812, "loss": 1.0928, "nll_loss": 0.7452514171600342, "rewards/accuracies": 1.0, "rewards/chosen": 0.32057762145996094, "rewards/margins": 1.3895046710968018, "rewards/rejected": -1.0689270496368408, "step": 283 }, { "epoch": 0.04733333333333333, "grad_norm": 125.63201141357422, "learning_rate": 1.9984246523892646e-07, "logits/chosen": 2.9259912967681885, "logits/rejected": 2.9328866004943848, "logps/chosen": -60.28314208984375, "logps/rejected": -51.04917907714844, "loss": 1.8214, "nll_loss": 1.0575988292694092, "rewards/accuracies": 1.0, "rewards/chosen": -0.06201591715216637, "rewards/margins": 0.14360696077346802, "rewards/rejected": -0.20562288165092468, "step": 284 }, { "epoch": 0.0475, "grad_norm": 168.14865112304688, "learning_rate": 1.9983942197426268e-07, "logits/chosen": 2.9398601055145264, "logits/rejected": 3.0956661701202393, "logps/chosen": -81.74857330322266, "logps/rejected": -36.65345764160156, "loss": 2.3687, "nll_loss": 1.8579223155975342, "rewards/accuracies": 1.0, "rewards/chosen": 0.7694976925849915, "rewards/margins": 0.7896644473075867, "rewards/rejected": -0.020166780799627304, "step": 285 }, { "epoch": 0.04766666666666667, "grad_norm": 103.53581237792969, "learning_rate": 1.9983634961878823e-07, "logits/chosen": 2.859218120574951, "logits/rejected": 2.7029898166656494, "logps/chosen": -71.34165954589844, "logps/rejected": -32.60145568847656, "loss": 1.3754, "nll_loss": 0.8295540809631348, "rewards/accuracies": 1.0, "rewards/chosen": 0.2479187250137329, "rewards/margins": 0.6738344430923462, "rewards/rejected": -0.4259157180786133, "step": 286 }, { "epoch": 0.04783333333333333, "grad_norm": 154.49647521972656, "learning_rate": 1.9983324817339832e-07, "logits/chosen": 2.744440793991089, "logits/rejected": 2.916905164718628, "logps/chosen": -81.6702880859375, "logps/rejected": -127.8006820678711, "loss": 1.5428, "nll_loss": 1.1036524772644043, "rewards/accuracies": 1.0, "rewards/chosen": 0.5899589657783508, "rewards/margins": 1.0042037963867188, "rewards/rejected": -0.4142448604106903, "step": 287 }, { "epoch": 0.048, "grad_norm": 125.63018798828125, "learning_rate": 1.9983011763899672e-07, "logits/chosen": 2.8581724166870117, "logits/rejected": 2.7810702323913574, "logps/chosen": -327.9617614746094, "logps/rejected": -206.68212890625, "loss": 1.3399, "nll_loss": 1.0579413175582886, "rewards/accuracies": 1.0, "rewards/chosen": 0.7732453346252441, "rewards/margins": 1.6731812953948975, "rewards/rejected": -0.8999359011650085, "step": 288 }, { "epoch": 0.04816666666666667, "grad_norm": 171.96368408203125, "learning_rate": 1.9982695801649548e-07, "logits/chosen": 2.521393060684204, "logits/rejected": 2.637784481048584, "logps/chosen": -22.929094314575195, "logps/rejected": -348.532470703125, "loss": 0.9648, "nll_loss": 0.5211158394813538, "rewards/accuracies": 1.0, "rewards/chosen": 0.3446314036846161, "rewards/margins": 0.9942225217819214, "rewards/rejected": -0.6495910882949829, "step": 289 }, { "epoch": 0.04833333333333333, "grad_norm": 173.88136291503906, "learning_rate": 1.998237693068153e-07, "logits/chosen": 1.3684390783309937, "logits/rejected": 2.2902626991271973, "logps/chosen": -45.48534393310547, "logps/rejected": -208.01065063476562, "loss": 2.9176, "nll_loss": 2.393965244293213, "rewards/accuracies": 1.0, "rewards/chosen": 0.04862060770392418, "rewards/margins": 0.7545791268348694, "rewards/rejected": -0.7059585452079773, "step": 290 }, { "epoch": 0.0485, "grad_norm": 127.35739135742188, "learning_rate": 1.998205515108853e-07, "logits/chosen": 2.871626615524292, "logits/rejected": 3.124549150466919, "logps/chosen": -82.76315307617188, "logps/rejected": -127.74143981933594, "loss": 1.2148, "nll_loss": 0.7324172854423523, "rewards/accuracies": 1.0, "rewards/chosen": 0.6110793948173523, "rewards/margins": 0.8641853332519531, "rewards/rejected": -0.25310593843460083, "step": 291 }, { "epoch": 0.048666666666666664, "grad_norm": 215.01527404785156, "learning_rate": 1.99817304629643e-07, "logits/chosen": 1.9338805675506592, "logits/rejected": 2.8477795124053955, "logps/chosen": -82.04563903808594, "logps/rejected": -33.249855041503906, "loss": 3.599, "nll_loss": 2.9302010536193848, "rewards/accuracies": 1.0, "rewards/chosen": 0.22089233994483948, "rewards/margins": 0.35401612520217896, "rewards/rejected": -0.13312378525733948, "step": 292 }, { "epoch": 0.04883333333333333, "grad_norm": 125.57191467285156, "learning_rate": 1.998140286640346e-07, "logits/chosen": 2.681962490081787, "logits/rejected": 2.333914279937744, "logps/chosen": -34.932498931884766, "logps/rejected": -46.968360900878906, "loss": 1.5212, "nll_loss": 0.9192762970924377, "rewards/accuracies": 1.0, "rewards/chosen": 0.5791912078857422, "rewards/margins": 0.5294666290283203, "rewards/rejected": 0.049724578857421875, "step": 293 }, { "epoch": 0.049, "grad_norm": 144.14639282226562, "learning_rate": 1.9981072361501447e-07, "logits/chosen": 2.528625726699829, "logits/rejected": 3.002506971359253, "logps/chosen": -92.79833221435547, "logps/rejected": -162.553955078125, "loss": 1.7602, "nll_loss": 1.174662470817566, "rewards/accuracies": 1.0, "rewards/chosen": 0.5083274841308594, "rewards/margins": 0.5672706365585327, "rewards/rejected": -0.05894317850470543, "step": 294 }, { "epoch": 0.049166666666666664, "grad_norm": 223.14208984375, "learning_rate": 1.9980738948354574e-07, "logits/chosen": 2.81960391998291, "logits/rejected": 2.992738723754883, "logps/chosen": -77.48622131347656, "logps/rejected": -138.6261749267578, "loss": 2.8144, "nll_loss": 1.9868258237838745, "rewards/accuracies": 1.0, "rewards/chosen": 0.4723259210586548, "rewards/margins": 0.02590864896774292, "rewards/rejected": 0.44641727209091187, "step": 295 }, { "epoch": 0.04933333333333333, "grad_norm": 143.33767700195312, "learning_rate": 1.9980402627059986e-07, "logits/chosen": 2.03086519241333, "logits/rejected": 2.8118364810943604, "logps/chosen": -17.922256469726562, "logps/rejected": -40.62417984008789, "loss": 2.031, "nll_loss": 1.2801610231399536, "rewards/accuracies": 1.0, "rewards/chosen": 0.09190960973501205, "rewards/margins": 0.1683763563632965, "rewards/rejected": -0.07646675407886505, "step": 296 }, { "epoch": 0.0495, "grad_norm": 154.460693359375, "learning_rate": 1.9980063397715682e-07, "logits/chosen": 2.3931350708007812, "logits/rejected": 2.328108549118042, "logps/chosen": -156.3479461669922, "logps/rejected": -36.21399688720703, "loss": 1.6309, "nll_loss": 0.9958468079566956, "rewards/accuracies": 1.0, "rewards/chosen": 0.6005569696426392, "rewards/margins": 0.44952166080474854, "rewards/rejected": 0.15103530883789062, "step": 297 }, { "epoch": 0.049666666666666665, "grad_norm": 549.8364868164062, "learning_rate": 1.9979721260420499e-07, "logits/chosen": 3.523954391479492, "logits/rejected": 3.7349820137023926, "logps/chosen": -59.098793029785156, "logps/rejected": -131.9599609375, "loss": 2.1695, "nll_loss": 1.5153536796569824, "rewards/accuracies": 1.0, "rewards/chosen": 0.46444398164749146, "rewards/margins": 0.39501649141311646, "rewards/rejected": 0.069427490234375, "step": 298 }, { "epoch": 0.049833333333333334, "grad_norm": 129.5803985595703, "learning_rate": 1.997937621527413e-07, "logits/chosen": 4.04953145980835, "logits/rejected": 4.089148998260498, "logps/chosen": -41.77289581298828, "logps/rejected": -134.47506713867188, "loss": 1.5907, "nll_loss": 1.0710997581481934, "rewards/accuracies": 1.0, "rewards/chosen": 0.5234687924385071, "rewards/margins": 0.748957097530365, "rewards/rejected": -0.2254883050918579, "step": 299 }, { "epoch": 0.05, "grad_norm": 108.064208984375, "learning_rate": 1.9979028262377116e-07, "logits/chosen": 3.025224208831787, "logits/rejected": 2.9718596935272217, "logps/chosen": -104.8935546875, "logps/rejected": -53.493656158447266, "loss": 1.3914, "nll_loss": 1.0813769102096558, "rewards/accuracies": 1.0, "rewards/chosen": 1.1405304670333862, "rewards/margins": 1.5499043464660645, "rewards/rejected": -0.4093738794326782, "step": 300 }, { "epoch": 0.050166666666666665, "grad_norm": 146.5094757080078, "learning_rate": 1.997867740183084e-07, "logits/chosen": 1.2590690851211548, "logits/rejected": 2.4243178367614746, "logps/chosen": -62.929473876953125, "logps/rejected": -204.67723083496094, "loss": 1.0738, "nll_loss": 0.6915326118469238, "rewards/accuracies": 1.0, "rewards/chosen": 0.4360824525356293, "rewards/margins": 1.2216049432754517, "rewards/rejected": -0.7855224609375, "step": 301 }, { "epoch": 0.050333333333333334, "grad_norm": 130.4385223388672, "learning_rate": 1.9978323633737533e-07, "logits/chosen": 2.082620620727539, "logits/rejected": 1.3812984228134155, "logps/chosen": -183.99630737304688, "logps/rejected": -31.161602020263672, "loss": 1.5654, "nll_loss": 1.2025901079177856, "rewards/accuracies": 1.0, "rewards/chosen": 1.316992163658142, "rewards/margins": 1.3575960397720337, "rewards/rejected": -0.04060383141040802, "step": 302 }, { "epoch": 0.0505, "grad_norm": 103.46880340576172, "learning_rate": 1.9977966958200274e-07, "logits/chosen": 2.4756510257720947, "logits/rejected": 2.2474963665008545, "logps/chosen": -224.75088500976562, "logps/rejected": -78.64306640625, "loss": 1.2627, "nll_loss": 0.9563867449760437, "rewards/accuracies": 1.0, "rewards/chosen": 0.9419494867324829, "rewards/margins": 1.5493484735488892, "rewards/rejected": -0.6073989868164062, "step": 303 }, { "epoch": 0.050666666666666665, "grad_norm": 74.86602783203125, "learning_rate": 1.9977607375322995e-07, "logits/chosen": 2.733337640762329, "logits/rejected": 2.7038347721099854, "logps/chosen": -61.493289947509766, "logps/rejected": -34.319244384765625, "loss": 0.9622, "nll_loss": 0.627482533454895, "rewards/accuracies": 1.0, "rewards/chosen": 1.0710567235946655, "rewards/margins": 1.4317333698272705, "rewards/rejected": -0.3606766164302826, "step": 304 }, { "epoch": 0.050833333333333335, "grad_norm": 158.20639038085938, "learning_rate": 1.9977244885210464e-07, "logits/chosen": 1.6222033500671387, "logits/rejected": 2.136300563812256, "logps/chosen": -108.00369262695312, "logps/rejected": -226.3303985595703, "loss": 1.4725, "nll_loss": 0.9730062484741211, "rewards/accuracies": 1.0, "rewards/chosen": 0.14528046548366547, "rewards/margins": 0.8221650123596191, "rewards/rejected": -0.6768845319747925, "step": 305 }, { "epoch": 0.051, "grad_norm": 153.8753662109375, "learning_rate": 1.9976879487968306e-07, "logits/chosen": 2.2519962787628174, "logits/rejected": 2.7838563919067383, "logps/chosen": -68.85362243652344, "logps/rejected": -308.9859313964844, "loss": 2.4079, "nll_loss": 2.025106906890869, "rewards/accuracies": 1.0, "rewards/chosen": 0.08344727754592896, "rewards/margins": 1.27555251121521, "rewards/rejected": -1.1921051740646362, "step": 306 }, { "epoch": 0.051166666666666666, "grad_norm": 96.89569854736328, "learning_rate": 1.9976511183702985e-07, "logits/chosen": 2.778887987136841, "logits/rejected": 2.2202274799346924, "logps/chosen": -91.38068389892578, "logps/rejected": -41.82392883300781, "loss": 1.4337, "nll_loss": 1.0503528118133545, "rewards/accuracies": 1.0, "rewards/chosen": 1.1022300720214844, "rewards/margins": 1.2427222728729248, "rewards/rejected": -0.140492245554924, "step": 307 }, { "epoch": 0.051333333333333335, "grad_norm": 86.83275604248047, "learning_rate": 1.997613997252182e-07, "logits/chosen": 3.110210657119751, "logits/rejected": 3.0473361015319824, "logps/chosen": -80.01530456542969, "logps/rejected": -30.588603973388672, "loss": 1.6426, "nll_loss": 1.3335883617401123, "rewards/accuracies": 1.0, "rewards/chosen": 1.3325952291488647, "rewards/margins": 1.5843782424926758, "rewards/rejected": -0.25178298354148865, "step": 308 }, { "epoch": 0.0515, "grad_norm": 171.94015502929688, "learning_rate": 1.9975765854532972e-07, "logits/chosen": 3.0371034145355225, "logits/rejected": 3.0404930114746094, "logps/chosen": -46.46971130371094, "logps/rejected": -16.910655975341797, "loss": 2.7213, "nll_loss": 1.8587881326675415, "rewards/accuracies": 0.0, "rewards/chosen": -0.2167331874370575, "rewards/margins": -0.05520249903202057, "rewards/rejected": -0.16153068840503693, "step": 309 }, { "epoch": 0.051666666666666666, "grad_norm": 119.42511749267578, "learning_rate": 1.9975388829845446e-07, "logits/chosen": 2.578035593032837, "logits/rejected": 2.5472214221954346, "logps/chosen": -73.85579681396484, "logps/rejected": -67.09759521484375, "loss": 1.6618, "nll_loss": 1.1362429857254028, "rewards/accuracies": 1.0, "rewards/chosen": 0.3058120906352997, "rewards/margins": 0.7315444946289062, "rewards/rejected": -0.42573243379592896, "step": 310 }, { "epoch": 0.051833333333333335, "grad_norm": 114.60469818115234, "learning_rate": 1.9975008898569102e-07, "logits/chosen": 2.9802613258361816, "logits/rejected": 3.3838679790496826, "logps/chosen": -55.88592529296875, "logps/rejected": -142.64337158203125, "loss": 1.2922, "nll_loss": 0.8467563986778259, "rewards/accuracies": 1.0, "rewards/chosen": 0.6249732971191406, "rewards/margins": 0.9834206104278564, "rewards/rejected": -0.35844728350639343, "step": 311 }, { "epoch": 0.052, "grad_norm": 102.88074493408203, "learning_rate": 1.9974626060814645e-07, "logits/chosen": 2.4444403648376465, "logits/rejected": 1.8174068927764893, "logps/chosen": -132.5846405029297, "logps/rejected": -9.0869140625, "loss": 1.5608, "nll_loss": 1.1235986948013306, "rewards/accuracies": 1.0, "rewards/chosen": 0.7137497663497925, "rewards/margins": 1.0145742893218994, "rewards/rejected": -0.30082449316978455, "step": 312 }, { "epoch": 0.05216666666666667, "grad_norm": 166.12254333496094, "learning_rate": 1.9974240316693621e-07, "logits/chosen": 2.2842800617218018, "logits/rejected": 1.7166783809661865, "logps/chosen": -147.33612060546875, "logps/rejected": -91.69163513183594, "loss": 1.7456, "nll_loss": 1.124703049659729, "rewards/accuracies": 1.0, "rewards/chosen": 0.390542596578598, "rewards/margins": 0.4728546142578125, "rewards/rejected": -0.08231201022863388, "step": 313 }, { "epoch": 0.052333333333333336, "grad_norm": 119.29399871826172, "learning_rate": 1.9973851666318424e-07, "logits/chosen": 2.7596378326416016, "logits/rejected": 2.6993963718414307, "logps/chosen": -40.90799331665039, "logps/rejected": -21.498676300048828, "loss": 2.2076, "nll_loss": 1.6363199949264526, "rewards/accuracies": 1.0, "rewards/chosen": 0.28838998079299927, "rewards/margins": 0.6016778945922852, "rewards/rejected": -0.3132879436016083, "step": 314 }, { "epoch": 0.0525, "grad_norm": 92.88170623779297, "learning_rate": 1.9973460109802304e-07, "logits/chosen": 2.1305291652679443, "logits/rejected": 1.7107523679733276, "logps/chosen": -143.82858276367188, "logps/rejected": -100.849365234375, "loss": 1.096, "nll_loss": 0.769136905670166, "rewards/accuracies": 1.0, "rewards/chosen": 1.3288544416427612, "rewards/margins": 1.5048073530197144, "rewards/rejected": -0.17595291137695312, "step": 315 }, { "epoch": 0.05266666666666667, "grad_norm": 134.56591796875, "learning_rate": 1.9973065647259346e-07, "logits/chosen": 1.729424238204956, "logits/rejected": 2.1593010425567627, "logps/chosen": -57.208702087402344, "logps/rejected": -222.44287109375, "loss": 2.5042, "nll_loss": 2.200334310531616, "rewards/accuracies": 1.0, "rewards/chosen": 0.39489060640335083, "rewards/margins": 1.6023781299591064, "rewards/rejected": -1.2074874639511108, "step": 316 }, { "epoch": 0.052833333333333336, "grad_norm": 164.53704833984375, "learning_rate": 1.997266827880449e-07, "logits/chosen": 1.731918215751648, "logits/rejected": 2.4375650882720947, "logps/chosen": -125.45236206054688, "logps/rejected": -225.15762329101562, "loss": 1.4938, "nll_loss": 0.9878138899803162, "rewards/accuracies": 1.0, "rewards/chosen": 0.5938782095909119, "rewards/margins": 0.7914704084396362, "rewards/rejected": -0.19759216904640198, "step": 317 }, { "epoch": 0.053, "grad_norm": 223.68917846679688, "learning_rate": 1.997226800455352e-07, "logits/chosen": 3.4080164432525635, "logits/rejected": 3.3450405597686768, "logps/chosen": -303.07940673828125, "logps/rejected": -150.76963806152344, "loss": 1.6485, "nll_loss": 0.9745319485664368, "rewards/accuracies": 1.0, "rewards/chosen": 0.4699462950229645, "rewards/margins": 0.3488143980503082, "rewards/rejected": 0.12113190442323685, "step": 318 }, { "epoch": 0.05316666666666667, "grad_norm": 154.855224609375, "learning_rate": 1.997186482462306e-07, "logits/chosen": 3.1251869201660156, "logits/rejected": 2.7920103073120117, "logps/chosen": -96.84759521484375, "logps/rejected": -14.36243724822998, "loss": 1.3881, "nll_loss": 0.7336938977241516, "rewards/accuracies": 1.0, "rewards/chosen": 0.29966431856155396, "rewards/margins": 0.3890012800693512, "rewards/rejected": -0.08933696895837784, "step": 319 }, { "epoch": 0.05333333333333334, "grad_norm": 112.58473205566406, "learning_rate": 1.9971458739130595e-07, "logits/chosen": 2.941950559616089, "logits/rejected": 2.9408586025238037, "logps/chosen": -28.9217529296875, "logps/rejected": -83.19314575195312, "loss": 1.1116, "nll_loss": 0.4986508786678314, "rewards/accuracies": 1.0, "rewards/chosen": 0.30622807145118713, "rewards/margins": 0.49161168932914734, "rewards/rejected": -0.1853836178779602, "step": 320 }, { "epoch": 0.0535, "grad_norm": 73.6568374633789, "learning_rate": 1.9971049748194444e-07, "logits/chosen": 2.417340040206909, "logits/rejected": 1.75364089012146, "logps/chosen": -55.56190490722656, "logps/rejected": -22.133037567138672, "loss": 1.0923, "nll_loss": 0.7716930508613586, "rewards/accuracies": 1.0, "rewards/chosen": 1.3483619689941406, "rewards/margins": 1.5353152751922607, "rewards/rejected": -0.1869533658027649, "step": 321 }, { "epoch": 0.05366666666666667, "grad_norm": 181.9224395751953, "learning_rate": 1.9970637851933781e-07, "logits/chosen": 2.6753909587860107, "logits/rejected": 2.678154945373535, "logps/chosen": -48.555885314941406, "logps/rejected": -86.971923828125, "loss": 1.5539, "nll_loss": 0.7831594944000244, "rewards/accuracies": 1.0, "rewards/chosen": 0.5308006405830383, "rewards/margins": 0.14303207397460938, "rewards/rejected": 0.38776856660842896, "step": 322 }, { "epoch": 0.05383333333333333, "grad_norm": 110.43831634521484, "learning_rate": 1.9970223050468617e-07, "logits/chosen": 0.806276261806488, "logits/rejected": 1.6601519584655762, "logps/chosen": -53.982933044433594, "logps/rejected": -225.9707794189453, "loss": 0.9601, "nll_loss": 0.6350933313369751, "rewards/accuracies": 1.0, "rewards/chosen": 0.6516319513320923, "rewards/margins": 1.4608010053634644, "rewards/rejected": -0.8091690540313721, "step": 323 }, { "epoch": 0.054, "grad_norm": 130.77891540527344, "learning_rate": 1.9969805343919818e-07, "logits/chosen": 3.4555351734161377, "logits/rejected": 3.5911102294921875, "logps/chosen": -67.64453887939453, "logps/rejected": -93.62144470214844, "loss": 1.7481, "nll_loss": 1.2079381942749023, "rewards/accuracies": 1.0, "rewards/chosen": 0.417562872171402, "rewards/margins": 0.6879104375839233, "rewards/rejected": -0.27034759521484375, "step": 324 }, { "epoch": 0.05416666666666667, "grad_norm": 136.6653289794922, "learning_rate": 1.9969384732409097e-07, "logits/chosen": 2.0399491786956787, "logits/rejected": 1.7544610500335693, "logps/chosen": -158.93875122070312, "logps/rejected": -67.1219253540039, "loss": 1.4351, "nll_loss": 0.9030608534812927, "rewards/accuracies": 1.0, "rewards/chosen": 0.8762481808662415, "rewards/margins": 0.7397300601005554, "rewards/rejected": 0.13651810586452484, "step": 325 }, { "epoch": 0.05433333333333333, "grad_norm": 97.14109802246094, "learning_rate": 1.9968961216059004e-07, "logits/chosen": 2.470367670059204, "logits/rejected": 2.385061025619507, "logps/chosen": -74.04069519042969, "logps/rejected": -16.046119689941406, "loss": 1.5391, "nll_loss": 1.042826533317566, "rewards/accuracies": 1.0, "rewards/chosen": 0.7788803577423096, "rewards/margins": 0.8320059180259705, "rewards/rejected": -0.05312557518482208, "step": 326 }, { "epoch": 0.0545, "grad_norm": 67.59920501708984, "learning_rate": 1.9968534794992947e-07, "logits/chosen": 2.560077428817749, "logits/rejected": 1.6668503284454346, "logps/chosen": -55.04454040527344, "logps/rejected": -23.55713653564453, "loss": 1.0453, "nll_loss": 0.764507532119751, "rewards/accuracies": 1.0, "rewards/chosen": 1.4000985622406006, "rewards/margins": 1.729461908340454, "rewards/rejected": -0.32936328649520874, "step": 327 }, { "epoch": 0.05466666666666667, "grad_norm": 100.13758850097656, "learning_rate": 1.996810546933517e-07, "logits/chosen": 2.7404978275299072, "logits/rejected": 2.9617552757263184, "logps/chosen": -108.00614929199219, "logps/rejected": -118.58502197265625, "loss": 1.3407, "nll_loss": 1.0094033479690552, "rewards/accuracies": 1.0, "rewards/chosen": 0.5782226920127869, "rewards/margins": 1.4359245300292969, "rewards/rejected": -0.8577018976211548, "step": 328 }, { "epoch": 0.05483333333333333, "grad_norm": 236.17312622070312, "learning_rate": 1.9967673239210773e-07, "logits/chosen": 2.320255994796753, "logits/rejected": 2.454930305480957, "logps/chosen": -112.96765899658203, "logps/rejected": -205.08059692382812, "loss": 2.2211, "nll_loss": 1.5475023984909058, "rewards/accuracies": 1.0, "rewards/chosen": 0.04000702500343323, "rewards/margins": 0.3469040095806122, "rewards/rejected": -0.30689698457717896, "step": 329 }, { "epoch": 0.055, "grad_norm": 144.03173828125, "learning_rate": 1.9967238104745695e-07, "logits/chosen": 2.881380796432495, "logits/rejected": 2.9497249126434326, "logps/chosen": -37.23208236694336, "logps/rejected": -192.17413330078125, "loss": 1.2838, "nll_loss": 0.6648585200309753, "rewards/accuracies": 1.0, "rewards/chosen": 0.6094009280204773, "rewards/margins": 0.4881759583950043, "rewards/rejected": 0.12122497707605362, "step": 330 }, { "epoch": 0.05516666666666667, "grad_norm": 108.2416763305664, "learning_rate": 1.996680006606672e-07, "logits/chosen": 2.25424861907959, "logits/rejected": 2.1550652980804443, "logps/chosen": -43.58298110961914, "logps/rejected": -18.15372085571289, "loss": 1.6705, "nll_loss": 1.1175124645233154, "rewards/accuracies": 1.0, "rewards/chosen": 0.2452220916748047, "rewards/margins": 0.6540879011154175, "rewards/rejected": -0.4088657796382904, "step": 331 }, { "epoch": 0.05533333333333333, "grad_norm": 161.261474609375, "learning_rate": 1.996635912330149e-07, "logits/chosen": 1.3279776573181152, "logits/rejected": 2.6674821376800537, "logps/chosen": -31.608379364013672, "logps/rejected": -177.79043579101562, "loss": 2.4623, "nll_loss": 2.1072256565093994, "rewards/accuracies": 1.0, "rewards/chosen": 0.06588973850011826, "rewards/margins": 1.4162956476211548, "rewards/rejected": -1.3504059314727783, "step": 332 }, { "epoch": 0.0555, "grad_norm": 98.29866027832031, "learning_rate": 1.9965915276578478e-07, "logits/chosen": 2.650416135787964, "logits/rejected": 2.972867012023926, "logps/chosen": -28.226978302001953, "logps/rejected": -240.57891845703125, "loss": 1.165, "nll_loss": 0.8064850568771362, "rewards/accuracies": 1.0, "rewards/chosen": 0.8739685416221619, "rewards/margins": 1.314697265625, "rewards/rejected": -0.4407287836074829, "step": 333 }, { "epoch": 0.05566666666666667, "grad_norm": 163.3273162841797, "learning_rate": 1.9965468526027014e-07, "logits/chosen": 1.9807605743408203, "logits/rejected": 1.7448172569274902, "logps/chosen": -187.13311767578125, "logps/rejected": -163.58677673339844, "loss": 1.453, "nll_loss": 0.8744538426399231, "rewards/accuracies": 1.0, "rewards/chosen": 0.9309112429618835, "rewards/margins": 0.6232467889785767, "rewards/rejected": 0.3076644837856293, "step": 334 }, { "epoch": 0.05583333333333333, "grad_norm": 102.54463958740234, "learning_rate": 1.9965018871777271e-07, "logits/chosen": 3.165900707244873, "logits/rejected": 2.887441873550415, "logps/chosen": -126.51751708984375, "logps/rejected": -36.907073974609375, "loss": 1.6573, "nll_loss": 1.139797568321228, "rewards/accuracies": 1.0, "rewards/chosen": 0.6878936886787415, "rewards/margins": 0.7627593874931335, "rewards/rejected": -0.07486572116613388, "step": 335 }, { "epoch": 0.056, "grad_norm": 136.91879272460938, "learning_rate": 1.9964566313960263e-07, "logits/chosen": 2.644335985183716, "logits/rejected": 2.718883752822876, "logps/chosen": -71.5215072631836, "logps/rejected": -161.33172607421875, "loss": 1.9081, "nll_loss": 1.4596227407455444, "rewards/accuracies": 1.0, "rewards/chosen": 0.6140213012695312, "rewards/margins": 0.9725906848907471, "rewards/rejected": -0.35856935381889343, "step": 336 }, { "epoch": 0.05616666666666666, "grad_norm": 248.06942749023438, "learning_rate": 1.9964110852707861e-07, "logits/chosen": 2.694972276687622, "logits/rejected": 2.6651105880737305, "logps/chosen": -125.64576721191406, "logps/rejected": -53.73416519165039, "loss": 2.6063, "nll_loss": 1.4277927875518799, "rewards/accuracies": 0.0, "rewards/chosen": 0.195831298828125, "rewards/margins": -0.5961406826972961, "rewards/rejected": 0.7919719815254211, "step": 337 }, { "epoch": 0.05633333333333333, "grad_norm": 93.4765396118164, "learning_rate": 1.9963652488152773e-07, "logits/chosen": 3.364166021347046, "logits/rejected": 3.3385403156280518, "logps/chosen": -59.91341781616211, "logps/rejected": -154.03929138183594, "loss": 1.1439, "nll_loss": 0.8207314610481262, "rewards/accuracies": 1.0, "rewards/chosen": 0.8128475546836853, "rewards/margins": 1.4656963348388672, "rewards/rejected": -0.6528488397598267, "step": 338 }, { "epoch": 0.0565, "grad_norm": 229.19491577148438, "learning_rate": 1.9963191220428547e-07, "logits/chosen": 0.8070737719535828, "logits/rejected": 2.567190647125244, "logps/chosen": -30.001874923706055, "logps/rejected": -313.6348876953125, "loss": 1.7234, "nll_loss": 0.9091476798057556, "rewards/accuracies": 1.0, "rewards/chosen": -0.12856446206569672, "rewards/margins": 0.03965453803539276, "rewards/rejected": -0.16821900010108948, "step": 339 }, { "epoch": 0.056666666666666664, "grad_norm": 119.94745635986328, "learning_rate": 1.9962727049669598e-07, "logits/chosen": 3.3597471714019775, "logits/rejected": 3.4186534881591797, "logps/chosen": -55.34349060058594, "logps/rejected": -72.82855987548828, "loss": 1.2624, "nll_loss": 0.7379131317138672, "rewards/accuracies": 1.0, "rewards/chosen": 1.0134018659591675, "rewards/margins": 0.7784096598625183, "rewards/rejected": 0.23499222099781036, "step": 340 }, { "epoch": 0.05683333333333333, "grad_norm": 158.31256103515625, "learning_rate": 1.9962259976011168e-07, "logits/chosen": 2.358287811279297, "logits/rejected": 2.6421830654144287, "logps/chosen": -88.80545806884766, "logps/rejected": -85.23809814453125, "loss": 2.0101, "nll_loss": 1.3875852823257446, "rewards/accuracies": 1.0, "rewards/chosen": 0.5393959283828735, "rewards/margins": 0.47457581758499146, "rewards/rejected": 0.06482010334730148, "step": 341 }, { "epoch": 0.057, "grad_norm": 192.9686279296875, "learning_rate": 1.9961789999589355e-07, "logits/chosen": 2.9953935146331787, "logits/rejected": 3.1339919567108154, "logps/chosen": -41.21373748779297, "logps/rejected": -166.57447814941406, "loss": 3.0329, "nll_loss": 2.424337387084961, "rewards/accuracies": 1.0, "rewards/chosen": 0.24146807193756104, "rewards/margins": 0.5032524466514587, "rewards/rejected": -0.2617843747138977, "step": 342 }, { "epoch": 0.057166666666666664, "grad_norm": 109.36775970458984, "learning_rate": 1.996131712054109e-07, "logits/chosen": 2.8560075759887695, "logits/rejected": 2.8493878841400146, "logps/chosen": -60.8373908996582, "logps/rejected": -38.670257568359375, "loss": 1.39, "nll_loss": 0.8568647503852844, "rewards/accuracies": 1.0, "rewards/chosen": 0.48087504506111145, "rewards/margins": 0.7084770202636719, "rewards/rejected": -0.2276020050048828, "step": 343 }, { "epoch": 0.05733333333333333, "grad_norm": 194.55599975585938, "learning_rate": 1.996084133900417e-07, "logits/chosen": 2.8589367866516113, "logits/rejected": 2.9994029998779297, "logps/chosen": -66.12206268310547, "logps/rejected": -230.88092041015625, "loss": 1.9099, "nll_loss": 1.4693787097930908, "rewards/accuracies": 1.0, "rewards/chosen": 0.9098045825958252, "rewards/margins": 1.0178108215332031, "rewards/rejected": -0.10800629109144211, "step": 344 }, { "epoch": 0.0575, "grad_norm": 160.3729705810547, "learning_rate": 1.9960362655117216e-07, "logits/chosen": 3.374506711959839, "logits/rejected": 3.555192470550537, "logps/chosen": -38.12871170043945, "logps/rejected": -261.863037109375, "loss": 1.2967, "nll_loss": 0.8473049402236938, "rewards/accuracies": 1.0, "rewards/chosen": 0.5338073968887329, "rewards/margins": 0.96881103515625, "rewards/rejected": -0.4350036680698395, "step": 345 }, { "epoch": 0.057666666666666665, "grad_norm": 76.08953094482422, "learning_rate": 1.9959881069019713e-07, "logits/chosen": 2.489497184753418, "logits/rejected": 2.5051212310791016, "logps/chosen": -27.750362396240234, "logps/rejected": -42.7049560546875, "loss": 0.8507, "nll_loss": 0.44758644700050354, "rewards/accuracies": 1.0, "rewards/chosen": 1.0220798254013062, "rewards/margins": 1.1583656072616577, "rewards/rejected": -0.13628578186035156, "step": 346 }, { "epoch": 0.057833333333333334, "grad_norm": 84.54146575927734, "learning_rate": 1.995939658085198e-07, "logits/chosen": 3.0857441425323486, "logits/rejected": 3.2895874977111816, "logps/chosen": -97.87481689453125, "logps/rejected": -333.8968505859375, "loss": 1.3087, "nll_loss": 1.0874980688095093, "rewards/accuracies": 1.0, "rewards/chosen": 1.0705482959747314, "rewards/margins": 2.038205862045288, "rewards/rejected": -0.9676575064659119, "step": 347 }, { "epoch": 0.058, "grad_norm": 118.123291015625, "learning_rate": 1.9958909190755185e-07, "logits/chosen": 3.0094001293182373, "logits/rejected": 3.1854159832000732, "logps/chosen": -45.32835388183594, "logps/rejected": -192.2648468017578, "loss": 1.2064, "nll_loss": 0.7430878281593323, "rewards/accuracies": 1.0, "rewards/chosen": 0.6520252823829651, "rewards/margins": 0.9251987934112549, "rewards/rejected": -0.2731735408306122, "step": 348 }, { "epoch": 0.058166666666666665, "grad_norm": 153.85501098632812, "learning_rate": 1.995841889887134e-07, "logits/chosen": 2.153491497039795, "logits/rejected": 2.4617674350738525, "logps/chosen": -13.771252632141113, "logps/rejected": -205.9180145263672, "loss": 1.0686, "nll_loss": 0.36240139603614807, "rewards/accuracies": 1.0, "rewards/chosen": 0.2865775227546692, "rewards/margins": 0.2681296467781067, "rewards/rejected": 0.0184478759765625, "step": 349 }, { "epoch": 0.058333333333333334, "grad_norm": 119.88876342773438, "learning_rate": 1.995792570534331e-07, "logits/chosen": 2.72731351852417, "logits/rejected": 3.000556230545044, "logps/chosen": -70.7576675415039, "logps/rejected": -156.89901733398438, "loss": 1.1812, "nll_loss": 0.7691050171852112, "rewards/accuracies": 1.0, "rewards/chosen": 1.1200706958770752, "rewards/margins": 1.140210747718811, "rewards/rejected": -0.02014007605612278, "step": 350 }, { "epoch": 0.0585, "grad_norm": 101.02230072021484, "learning_rate": 1.9957429610314797e-07, "logits/chosen": 2.105956554412842, "logits/rejected": 1.6234536170959473, "logps/chosen": -84.96725463867188, "logps/rejected": -16.226022720336914, "loss": 1.5046, "nll_loss": 0.9440804123878479, "rewards/accuracies": 1.0, "rewards/chosen": 0.3240997791290283, "rewards/margins": 0.6310728788375854, "rewards/rejected": -0.30697306990623474, "step": 351 }, { "epoch": 0.058666666666666666, "grad_norm": 123.79422760009766, "learning_rate": 1.9956930613930347e-07, "logits/chosen": 2.223222494125366, "logits/rejected": 2.1955628395080566, "logps/chosen": -59.69174575805664, "logps/rejected": -50.553470611572266, "loss": 1.6861, "nll_loss": 0.9627702236175537, "rewards/accuracies": 1.0, "rewards/chosen": 0.6707256436347961, "rewards/margins": 0.254037469625473, "rewards/rejected": 0.4166881740093231, "step": 352 }, { "epoch": 0.058833333333333335, "grad_norm": 147.2931671142578, "learning_rate": 1.995642871633536e-07, "logits/chosen": 3.161029100418091, "logits/rejected": 3.1082468032836914, "logps/chosen": -204.62518310546875, "logps/rejected": -334.6922607421875, "loss": 1.3227, "nll_loss": 0.9652131199836731, "rewards/accuracies": 1.0, "rewards/chosen": 0.551165759563446, "rewards/margins": 1.319488525390625, "rewards/rejected": -0.7683228254318237, "step": 353 }, { "epoch": 0.059, "grad_norm": 145.2660675048828, "learning_rate": 1.9955923917676078e-07, "logits/chosen": 2.769843816757202, "logits/rejected": 3.2361080646514893, "logps/chosen": -43.47734451293945, "logps/rejected": -385.277587890625, "loss": 1.2824, "nll_loss": 0.8695467114448547, "rewards/accuracies": 1.0, "rewards/chosen": 0.8978199362754822, "rewards/margins": 1.110069751739502, "rewards/rejected": -0.212249755859375, "step": 354 }, { "epoch": 0.059166666666666666, "grad_norm": 64.00053405761719, "learning_rate": 1.9955416218099585e-07, "logits/chosen": 2.472156286239624, "logits/rejected": 3.0210440158843994, "logps/chosen": -66.28229522705078, "logps/rejected": -104.96131896972656, "loss": 0.9506, "nll_loss": 0.669518232345581, "rewards/accuracies": 1.0, "rewards/chosen": 0.7589424252510071, "rewards/margins": 1.6802818775177002, "rewards/rejected": -0.9213394522666931, "step": 355 }, { "epoch": 0.059333333333333335, "grad_norm": 123.54872131347656, "learning_rate": 1.9954905617753813e-07, "logits/chosen": 2.562762498855591, "logits/rejected": 2.7800378799438477, "logps/chosen": -86.87954711914062, "logps/rejected": -94.3391342163086, "loss": 1.3308, "nll_loss": 0.8687953948974609, "rewards/accuracies": 1.0, "rewards/chosen": 0.8444870114326477, "rewards/margins": 0.9417091608047485, "rewards/rejected": -0.09722214192152023, "step": 356 }, { "epoch": 0.0595, "grad_norm": 92.13542175292969, "learning_rate": 1.9954392116787537e-07, "logits/chosen": 2.9696645736694336, "logits/rejected": 2.849153995513916, "logps/chosen": -31.737686157226562, "logps/rejected": -18.932924270629883, "loss": 1.083, "nll_loss": 0.6347537040710449, "rewards/accuracies": 1.0, "rewards/chosen": 0.5566467642784119, "rewards/margins": 0.9726426601409912, "rewards/rejected": -0.41599589586257935, "step": 357 }, { "epoch": 0.059666666666666666, "grad_norm": 273.71527099609375, "learning_rate": 1.995387571535038e-07, "logits/chosen": 1.1810790300369263, "logits/rejected": 1.6322187185287476, "logps/chosen": -138.45388793945312, "logps/rejected": -137.5558624267578, "loss": 2.1654, "nll_loss": 1.1256414651870728, "rewards/accuracies": 0.0, "rewards/chosen": 0.8249955177307129, "rewards/margins": -0.31353139877319336, "rewards/rejected": 1.1385269165039062, "step": 358 }, { "epoch": 0.059833333333333336, "grad_norm": 104.78315734863281, "learning_rate": 1.9953356413592811e-07, "logits/chosen": 2.79010272026062, "logits/rejected": 2.773037910461426, "logps/chosen": -20.340770721435547, "logps/rejected": -51.28319549560547, "loss": 1.111, "nll_loss": 0.6356491446495056, "rewards/accuracies": 1.0, "rewards/chosen": 0.7074684500694275, "rewards/margins": 0.8891487717628479, "rewards/rejected": -0.18168030679225922, "step": 359 }, { "epoch": 0.06, "grad_norm": 90.08673858642578, "learning_rate": 1.9952834211666138e-07, "logits/chosen": 4.03428840637207, "logits/rejected": 4.608205318450928, "logps/chosen": -102.33854675292969, "logps/rejected": -342.6847839355469, "loss": 1.326, "nll_loss": 1.1370952129364014, "rewards/accuracies": 1.0, "rewards/chosen": 0.9715927243232727, "rewards/margins": 2.2983109951019287, "rewards/rejected": -1.3267182111740112, "step": 360 }, { "epoch": 0.06016666666666667, "grad_norm": 98.75691986083984, "learning_rate": 1.9952309109722524e-07, "logits/chosen": 2.315281391143799, "logits/rejected": 1.943560242652893, "logps/chosen": -86.83897399902344, "logps/rejected": -88.13228607177734, "loss": 1.291, "nll_loss": 0.9337524175643921, "rewards/accuracies": 1.0, "rewards/chosen": 0.47245335578918457, "rewards/margins": 1.3278160095214844, "rewards/rejected": -0.8553627133369446, "step": 361 }, { "epoch": 0.060333333333333336, "grad_norm": 109.89427947998047, "learning_rate": 1.9951781107914965e-07, "logits/chosen": 1.5941458940505981, "logits/rejected": 2.5654349327087402, "logps/chosen": -81.36940002441406, "logps/rejected": -192.50302124023438, "loss": 1.748, "nll_loss": 1.4275330305099487, "rewards/accuracies": 1.0, "rewards/chosen": 0.8886253237724304, "rewards/margins": 1.4795372486114502, "rewards/rejected": -0.590911865234375, "step": 362 }, { "epoch": 0.0605, "grad_norm": 88.05857849121094, "learning_rate": 1.995125020639731e-07, "logits/chosen": 2.6040234565734863, "logits/rejected": 2.4595839977264404, "logps/chosen": -94.06962585449219, "logps/rejected": -78.4049301147461, "loss": 1.1443, "nll_loss": 0.8874491453170776, "rewards/accuracies": 1.0, "rewards/chosen": 0.9886955618858337, "rewards/margins": 1.8119981288909912, "rewards/rejected": -0.8233025074005127, "step": 363 }, { "epoch": 0.06066666666666667, "grad_norm": 128.37901306152344, "learning_rate": 1.9950716405324253e-07, "logits/chosen": 3.193981885910034, "logits/rejected": 3.175732135772705, "logps/chosen": -129.60267639160156, "logps/rejected": -169.00851440429688, "loss": 1.5693, "nll_loss": 1.126979947090149, "rewards/accuracies": 1.0, "rewards/chosen": 0.9508454203605652, "rewards/margins": 1.0156067609786987, "rewards/rejected": -0.06476135551929474, "step": 364 }, { "epoch": 0.060833333333333336, "grad_norm": 81.71685791015625, "learning_rate": 1.995017970485133e-07, "logits/chosen": 3.1710281372070312, "logits/rejected": 3.16813063621521, "logps/chosen": -69.1532211303711, "logps/rejected": -35.23093795776367, "loss": 1.276, "nll_loss": 0.8537434935569763, "rewards/accuracies": 1.0, "rewards/chosen": 0.8221138119697571, "rewards/margins": 1.0711463689804077, "rewards/rejected": -0.24903260171413422, "step": 365 }, { "epoch": 0.061, "grad_norm": 114.5416030883789, "learning_rate": 1.9949640105134918e-07, "logits/chosen": 2.674818277359009, "logits/rejected": 2.4508261680603027, "logps/chosen": -68.76225280761719, "logps/rejected": -64.3272933959961, "loss": 1.3012, "nll_loss": 0.8595280647277832, "rewards/accuracies": 1.0, "rewards/chosen": 0.5017265677452087, "rewards/margins": 0.9956508874893188, "rewards/rejected": -0.4939243495464325, "step": 366 }, { "epoch": 0.06116666666666667, "grad_norm": 246.0567169189453, "learning_rate": 1.994909760633225e-07, "logits/chosen": 2.581164598464966, "logits/rejected": 2.40787410736084, "logps/chosen": -78.03057861328125, "logps/rejected": -17.43083381652832, "loss": 2.2064, "nll_loss": 1.21922767162323, "rewards/accuracies": 0.0, "rewards/chosen": -0.5115761160850525, "rewards/margins": -0.27465593814849854, "rewards/rejected": -0.23692017793655396, "step": 367 }, { "epoch": 0.06133333333333333, "grad_norm": 103.45446014404297, "learning_rate": 1.9948552208601398e-07, "logits/chosen": 3.43192458152771, "logits/rejected": 3.450472354888916, "logps/chosen": -53.652923583984375, "logps/rejected": -73.47074127197266, "loss": 1.1693, "nll_loss": 0.7153723239898682, "rewards/accuracies": 1.0, "rewards/chosen": 1.182458519935608, "rewards/margins": 1.0116844177246094, "rewards/rejected": 0.17077408730983734, "step": 368 }, { "epoch": 0.0615, "grad_norm": 197.9720458984375, "learning_rate": 1.994800391210127e-07, "logits/chosen": 3.330366611480713, "logits/rejected": 3.145982503890991, "logps/chosen": -168.35287475585938, "logps/rejected": -103.43284606933594, "loss": 2.0508, "nll_loss": 1.379941701889038, "rewards/accuracies": 1.0, "rewards/chosen": 0.25616455078125, "rewards/margins": 0.3490447998046875, "rewards/rejected": -0.0928802490234375, "step": 369 }, { "epoch": 0.06166666666666667, "grad_norm": 113.40699768066406, "learning_rate": 1.994745271699163e-07, "logits/chosen": 2.7605361938476562, "logits/rejected": 2.8309147357940674, "logps/chosen": -29.607580184936523, "logps/rejected": -52.28527069091797, "loss": 1.1772, "nll_loss": 0.6042363047599792, "rewards/accuracies": 1.0, "rewards/chosen": 0.5914193987846375, "rewards/margins": 0.6030526757240295, "rewards/rejected": -0.01163330115377903, "step": 370 }, { "epoch": 0.06183333333333333, "grad_norm": 148.4364776611328, "learning_rate": 1.9946898623433087e-07, "logits/chosen": 3.575756788253784, "logits/rejected": 3.663820743560791, "logps/chosen": -117.83389282226562, "logps/rejected": -64.15972137451172, "loss": 2.7827, "nll_loss": 2.3104684352874756, "rewards/accuracies": 1.0, "rewards/chosen": 0.5491302609443665, "rewards/margins": 0.8937622308731079, "rewards/rejected": -0.34463196992874146, "step": 371 }, { "epoch": 0.062, "grad_norm": 156.72349548339844, "learning_rate": 1.9946341631587086e-07, "logits/chosen": 2.695106267929077, "logits/rejected": 1.7143659591674805, "logps/chosen": -141.8616485595703, "logps/rejected": -58.35625076293945, "loss": 2.0198, "nll_loss": 1.3772977590560913, "rewards/accuracies": 1.0, "rewards/chosen": 0.2440536618232727, "rewards/margins": 0.4176620841026306, "rewards/rejected": -0.1736084222793579, "step": 372 }, { "epoch": 0.06216666666666667, "grad_norm": 151.02215576171875, "learning_rate": 1.9945781741615918e-07, "logits/chosen": 2.468052864074707, "logits/rejected": 2.8843212127685547, "logps/chosen": -76.95917510986328, "logps/rejected": -236.9021759033203, "loss": 2.1665, "nll_loss": 1.789747953414917, "rewards/accuracies": 1.0, "rewards/chosen": 0.5453285574913025, "rewards/margins": 1.2385978698730469, "rewards/rejected": -0.6932693719863892, "step": 373 }, { "epoch": 0.06233333333333333, "grad_norm": 130.21343994140625, "learning_rate": 1.9945218953682733e-07, "logits/chosen": 2.7368037700653076, "logits/rejected": 2.3724024295806885, "logps/chosen": -54.22690963745117, "logps/rejected": -17.525972366333008, "loss": 2.35, "nll_loss": 1.8698933124542236, "rewards/accuracies": 1.0, "rewards/chosen": 0.44452667236328125, "rewards/margins": 0.868702232837677, "rewards/rejected": -0.42417556047439575, "step": 374 }, { "epoch": 0.0625, "grad_norm": 80.56707000732422, "learning_rate": 1.9944653267951503e-07, "logits/chosen": 2.9238803386688232, "logits/rejected": 3.0475902557373047, "logps/chosen": -69.93478393554688, "logps/rejected": -85.40813446044922, "loss": 1.027, "nll_loss": 0.6993478536605835, "rewards/accuracies": 1.0, "rewards/chosen": 0.9537292718887329, "rewards/margins": 1.4502716064453125, "rewards/rejected": -0.49654239416122437, "step": 375 }, { "epoch": 0.06266666666666666, "grad_norm": 109.77435302734375, "learning_rate": 1.994408468458706e-07, "logits/chosen": 2.2516634464263916, "logits/rejected": 2.4647133350372314, "logps/chosen": -124.14452362060547, "logps/rejected": -129.472900390625, "loss": 1.9875, "nll_loss": 1.6334805488586426, "rewards/accuracies": 1.0, "rewards/chosen": 1.201851725578308, "rewards/margins": 1.3667298555374146, "rewards/rejected": -0.16487808525562286, "step": 376 }, { "epoch": 0.06283333333333334, "grad_norm": 73.67166137695312, "learning_rate": 1.9943513203755073e-07, "logits/chosen": 2.914538621902466, "logits/rejected": 3.019098997116089, "logps/chosen": -20.59687042236328, "logps/rejected": -80.359619140625, "loss": 0.7398, "nll_loss": 0.4577081501483917, "rewards/accuracies": 1.0, "rewards/chosen": 1.2872800827026367, "rewards/margins": 1.7005064487457275, "rewards/rejected": -0.41322633624076843, "step": 377 }, { "epoch": 0.063, "grad_norm": 96.78551483154297, "learning_rate": 1.9942938825622062e-07, "logits/chosen": 3.434746742248535, "logits/rejected": 3.5542778968811035, "logps/chosen": -60.298492431640625, "logps/rejected": -98.70533752441406, "loss": 1.0012, "nll_loss": 0.7444258332252502, "rewards/accuracies": 1.0, "rewards/chosen": 0.4755867123603821, "rewards/margins": 1.8815484046936035, "rewards/rejected": -1.4059616327285767, "step": 378 }, { "epoch": 0.06316666666666666, "grad_norm": 148.1284942626953, "learning_rate": 1.9942361550355384e-07, "logits/chosen": 2.3582799434661865, "logits/rejected": 2.6860830783843994, "logps/chosen": -21.525728225708008, "logps/rejected": -126.54707336425781, "loss": 1.966, "nll_loss": 1.655825138092041, "rewards/accuracies": 1.0, "rewards/chosen": 0.588399350643158, "rewards/margins": 1.5408835411071777, "rewards/rejected": -0.952484130859375, "step": 379 }, { "epoch": 0.06333333333333334, "grad_norm": 211.43556213378906, "learning_rate": 1.994178137812324e-07, "logits/chosen": 2.1895272731781006, "logits/rejected": 1.9584767818450928, "logps/chosen": -79.28166961669922, "logps/rejected": -61.399208068847656, "loss": 2.2099, "nll_loss": 0.8524911403656006, "rewards/accuracies": 0.0, "rewards/chosen": 0.3225822448730469, "rewards/margins": -0.8500549793243408, "rewards/rejected": 1.1726372241973877, "step": 380 }, { "epoch": 0.0635, "grad_norm": 132.4114990234375, "learning_rate": 1.9941198309094686e-07, "logits/chosen": 2.0712904930114746, "logits/rejected": 1.6902592182159424, "logps/chosen": -126.10711669921875, "logps/rejected": -81.08753204345703, "loss": 1.6667, "nll_loss": 1.077838659286499, "rewards/accuracies": 1.0, "rewards/chosen": 0.5853821039199829, "rewards/margins": 0.5611274838447571, "rewards/rejected": 0.024254608899354935, "step": 381 }, { "epoch": 0.06366666666666666, "grad_norm": 238.40623474121094, "learning_rate": 1.9940612343439608e-07, "logits/chosen": 2.9974749088287354, "logits/rejected": 3.3635342121124268, "logps/chosen": -182.44775390625, "logps/rejected": -520.38916015625, "loss": 2.1352, "nll_loss": 1.5203980207443237, "rewards/accuracies": 1.0, "rewards/chosen": 0.6051849722862244, "rewards/margins": 0.4968353509902954, "rewards/rejected": 0.10834961384534836, "step": 382 }, { "epoch": 0.06383333333333334, "grad_norm": 65.10319519042969, "learning_rate": 1.9940023481328745e-07, "logits/chosen": 2.306185007095337, "logits/rejected": 1.6797399520874023, "logps/chosen": -170.57269287109375, "logps/rejected": -80.88714599609375, "loss": 1.26, "nll_loss": 1.0529178380966187, "rewards/accuracies": 1.0, "rewards/chosen": 1.6214935779571533, "rewards/margins": 2.195974826812744, "rewards/rejected": -0.574481189250946, "step": 383 }, { "epoch": 0.064, "grad_norm": 139.45904541015625, "learning_rate": 1.9939431722933677e-07, "logits/chosen": 2.3496789932250977, "logits/rejected": 2.747783899307251, "logps/chosen": -62.0966682434082, "logps/rejected": -314.0292663574219, "loss": 2.4509, "nll_loss": 2.0698888301849365, "rewards/accuracies": 1.0, "rewards/chosen": 0.07330741733312607, "rewards/margins": 1.2933483123779297, "rewards/rejected": -1.2200409173965454, "step": 384 }, { "epoch": 0.06416666666666666, "grad_norm": 64.11956787109375, "learning_rate": 1.993883706842683e-07, "logits/chosen": 3.1351866722106934, "logits/rejected": 3.116614818572998, "logps/chosen": -70.16484069824219, "logps/rejected": -30.915584564208984, "loss": 1.2599, "nll_loss": 0.9355311989784241, "rewards/accuracies": 1.0, "rewards/chosen": 1.0542144775390625, "rewards/margins": 1.4723060131072998, "rewards/rejected": -0.4180915951728821, "step": 385 }, { "epoch": 0.06433333333333334, "grad_norm": 100.110595703125, "learning_rate": 1.9938239517981462e-07, "logits/chosen": 2.769657611846924, "logits/rejected": 2.95363450050354, "logps/chosen": -76.30790710449219, "logps/rejected": -110.95252227783203, "loss": 1.1904, "nll_loss": 0.7481167316436768, "rewards/accuracies": 1.0, "rewards/chosen": 0.7565155029296875, "rewards/margins": 0.9985100030899048, "rewards/rejected": -0.2419944852590561, "step": 386 }, { "epoch": 0.0645, "grad_norm": 147.18959045410156, "learning_rate": 1.9937639071771702e-07, "logits/chosen": 2.9401867389678955, "logits/rejected": 2.9127464294433594, "logps/chosen": -85.94068145751953, "logps/rejected": -4.97836446762085, "loss": 1.8801, "nll_loss": 1.3428232669830322, "rewards/accuracies": 1.0, "rewards/chosen": 0.6759971976280212, "rewards/margins": 0.7048250436782837, "rewards/rejected": -0.028827859088778496, "step": 387 }, { "epoch": 0.06466666666666666, "grad_norm": 143.2528533935547, "learning_rate": 1.993703572997249e-07, "logits/chosen": 2.42533016204834, "logits/rejected": 2.5203816890716553, "logps/chosen": -177.17916870117188, "logps/rejected": -334.36712646484375, "loss": 1.2511, "nll_loss": 0.9276395440101624, "rewards/accuracies": 1.0, "rewards/chosen": 1.4268968105316162, "rewards/margins": 1.5342302322387695, "rewards/rejected": -0.10733337700366974, "step": 388 }, { "epoch": 0.06483333333333334, "grad_norm": 92.70568084716797, "learning_rate": 1.9936429492759633e-07, "logits/chosen": 3.1987218856811523, "logits/rejected": 3.0981287956237793, "logps/chosen": -24.204885482788086, "logps/rejected": -130.96099853515625, "loss": 1.2194, "nll_loss": 0.8964771628379822, "rewards/accuracies": 1.0, "rewards/chosen": 0.3577198088169098, "rewards/margins": 1.5123707056045532, "rewards/rejected": -1.1546509265899658, "step": 389 }, { "epoch": 0.065, "grad_norm": 158.40847778320312, "learning_rate": 1.9935820360309775e-07, "logits/chosen": 2.7172625064849854, "logits/rejected": 2.7666776180267334, "logps/chosen": -51.11508560180664, "logps/rejected": -19.496366500854492, "loss": 2.4331, "nll_loss": 1.7625895738601685, "rewards/accuracies": 1.0, "rewards/chosen": -0.00487785367295146, "rewards/margins": 0.35765552520751953, "rewards/rejected": -0.3625333905220032, "step": 390 }, { "epoch": 0.06516666666666666, "grad_norm": 60.03987121582031, "learning_rate": 1.99352083328004e-07, "logits/chosen": 3.269973039627075, "logits/rejected": 3.1886513233184814, "logps/chosen": -33.1342658996582, "logps/rejected": -140.2228546142578, "loss": 0.8716, "nll_loss": 0.6902971863746643, "rewards/accuracies": 1.0, "rewards/chosen": 1.594089150428772, "rewards/margins": 2.377063512802124, "rewards/rejected": -0.7829743027687073, "step": 391 }, { "epoch": 0.06533333333333333, "grad_norm": 98.85523986816406, "learning_rate": 1.9934593410409838e-07, "logits/chosen": 3.357156276702881, "logits/rejected": 3.2953269481658936, "logps/chosen": -42.861351013183594, "logps/rejected": -87.14947509765625, "loss": 1.1493, "nll_loss": 0.7026451230049133, "rewards/accuracies": 1.0, "rewards/chosen": 0.7471501231193542, "rewards/margins": 0.9832577705383301, "rewards/rejected": -0.23610763251781464, "step": 392 }, { "epoch": 0.0655, "grad_norm": 175.71311950683594, "learning_rate": 1.993397559331726e-07, "logits/chosen": 2.6103272438049316, "logits/rejected": 2.786426544189453, "logps/chosen": -41.98625564575195, "logps/rejected": -262.96630859375, "loss": 1.7378, "nll_loss": 1.1347635984420776, "rewards/accuracies": 1.0, "rewards/chosen": 0.48169368505477905, "rewards/margins": 0.5197338461875916, "rewards/rejected": -0.0380401611328125, "step": 393 }, { "epoch": 0.06566666666666666, "grad_norm": 100.1485595703125, "learning_rate": 1.9933354881702692e-07, "logits/chosen": 1.6763249635696411, "logits/rejected": 2.506119966506958, "logps/chosen": -214.5516357421875, "logps/rejected": -325.82769775390625, "loss": 1.4256, "nll_loss": 1.1724133491516113, "rewards/accuracies": 1.0, "rewards/chosen": 0.2887420654296875, "rewards/margins": 1.981011986732483, "rewards/rejected": -1.6922699213027954, "step": 394 }, { "epoch": 0.06583333333333333, "grad_norm": 60.38763427734375, "learning_rate": 1.9932731275746984e-07, "logits/chosen": 3.0013163089752197, "logits/rejected": 2.862039566040039, "logps/chosen": -76.81616973876953, "logps/rejected": -65.90629577636719, "loss": 1.2076, "nll_loss": 0.972356379032135, "rewards/accuracies": 1.0, "rewards/chosen": 1.4872841835021973, "rewards/margins": 1.989802598953247, "rewards/rejected": -0.5025184750556946, "step": 395 }, { "epoch": 0.066, "grad_norm": 87.74413299560547, "learning_rate": 1.9932104775631845e-07, "logits/chosen": 2.6384365558624268, "logits/rejected": 2.697638750076294, "logps/chosen": -73.6531753540039, "logps/rejected": -53.2362060546875, "loss": 1.2605, "nll_loss": 0.8564322590827942, "rewards/accuracies": 1.0, "rewards/chosen": 0.87215656042099, "rewards/margins": 1.1383514404296875, "rewards/rejected": -0.2661949098110199, "step": 396 }, { "epoch": 0.06616666666666667, "grad_norm": 98.82099151611328, "learning_rate": 1.993147538153982e-07, "logits/chosen": 2.280897855758667, "logits/rejected": 2.250339984893799, "logps/chosen": -79.41810607910156, "logps/rejected": -62.391937255859375, "loss": 1.4048, "nll_loss": 1.0181807279586792, "rewards/accuracies": 1.0, "rewards/chosen": 0.49872055649757385, "rewards/margins": 1.201642632484436, "rewards/rejected": -0.7029220461845398, "step": 397 }, { "epoch": 0.06633333333333333, "grad_norm": 63.394065856933594, "learning_rate": 1.9930843093654305e-07, "logits/chosen": 2.3908376693725586, "logits/rejected": 2.2657809257507324, "logps/chosen": -128.88494873046875, "logps/rejected": -134.32345581054688, "loss": 1.3596, "nll_loss": 1.161125659942627, "rewards/accuracies": 1.0, "rewards/chosen": 1.4260284900665283, "rewards/margins": 2.2207565307617188, "rewards/rejected": -0.7947281002998352, "step": 398 }, { "epoch": 0.0665, "grad_norm": 88.01782989501953, "learning_rate": 1.9930207912159527e-07, "logits/chosen": 2.050607442855835, "logits/rejected": 2.8206658363342285, "logps/chosen": -64.85485076904297, "logps/rejected": -491.09857177734375, "loss": 1.0347, "nll_loss": 0.7909127473831177, "rewards/accuracies": 1.0, "rewards/chosen": 0.4499496519565582, "rewards/margins": 1.986384630203247, "rewards/rejected": -1.5364350080490112, "step": 399 }, { "epoch": 0.06666666666666667, "grad_norm": 383.01348876953125, "learning_rate": 1.9929569837240564e-07, "logits/chosen": 3.162980079650879, "logits/rejected": 3.3049159049987793, "logps/chosen": -21.197566986083984, "logps/rejected": -88.6671371459961, "loss": 1.5042, "nll_loss": 0.7309505343437195, "rewards/accuracies": 1.0, "rewards/chosen": 0.6114303469657898, "rewards/margins": 0.1430777907371521, "rewards/rejected": 0.4683525562286377, "step": 400 }, { "epoch": 0.06683333333333333, "grad_norm": 101.09073638916016, "learning_rate": 1.9928928869083337e-07, "logits/chosen": 3.173279285430908, "logits/rejected": 3.224757194519043, "logps/chosen": -60.41118621826172, "logps/rejected": -19.98590850830078, "loss": 2.5244, "nll_loss": 2.2374515533447266, "rewards/accuracies": 1.0, "rewards/chosen": 0.7505241632461548, "rewards/margins": 1.64933180809021, "rewards/rejected": -0.8988077044487, "step": 401 }, { "epoch": 0.067, "grad_norm": 66.39128875732422, "learning_rate": 1.9928285007874607e-07, "logits/chosen": 2.7738633155822754, "logits/rejected": 2.9362547397613525, "logps/chosen": -64.8071517944336, "logps/rejected": -186.71707153320312, "loss": 0.8837, "nll_loss": 0.7044256329536438, "rewards/accuracies": 1.0, "rewards/chosen": 1.6563043594360352, "rewards/margins": 2.4053690433502197, "rewards/rejected": -0.7490646243095398, "step": 402 }, { "epoch": 0.06716666666666667, "grad_norm": 138.9801788330078, "learning_rate": 1.9927638253801984e-07, "logits/chosen": 2.2211296558380127, "logits/rejected": 2.0988805294036865, "logps/chosen": -65.98419952392578, "logps/rejected": -24.030336380004883, "loss": 1.6793, "nll_loss": 0.9997606873512268, "rewards/accuracies": 1.0, "rewards/chosen": -0.1699112057685852, "rewards/margins": 0.3491056561470032, "rewards/rejected": -0.5190168619155884, "step": 403 }, { "epoch": 0.06733333333333333, "grad_norm": 80.13654327392578, "learning_rate": 1.9926988607053912e-07, "logits/chosen": 2.024400472640991, "logits/rejected": 2.4223151206970215, "logps/chosen": -27.87089729309082, "logps/rejected": -142.80459594726562, "loss": 0.9272, "nll_loss": 0.7146384119987488, "rewards/accuracies": 1.0, "rewards/chosen": 0.8759275674819946, "rewards/margins": 2.116943359375, "rewards/rejected": -1.2410156726837158, "step": 404 }, { "epoch": 0.0675, "grad_norm": 124.83365631103516, "learning_rate": 1.9926336067819682e-07, "logits/chosen": 1.6399729251861572, "logits/rejected": 2.8095502853393555, "logps/chosen": -71.31146240234375, "logps/rejected": -52.769561767578125, "loss": 1.6582, "nll_loss": 1.2734190225601196, "rewards/accuracies": 1.0, "rewards/chosen": 0.44353944063186646, "rewards/margins": 1.2134606838226318, "rewards/rejected": -0.7699211835861206, "step": 405 }, { "epoch": 0.06766666666666667, "grad_norm": 77.50898742675781, "learning_rate": 1.9925680636289436e-07, "logits/chosen": 3.0865166187286377, "logits/rejected": 3.0872480869293213, "logps/chosen": -52.26709747314453, "logps/rejected": -69.36461639404297, "loss": 1.1351, "nll_loss": 0.7574940919876099, "rewards/accuracies": 1.0, "rewards/chosen": 1.037514567375183, "rewards/margins": 1.2522011995315552, "rewards/rejected": -0.21468660235404968, "step": 406 }, { "epoch": 0.06783333333333333, "grad_norm": 208.6669921875, "learning_rate": 1.992502231265414e-07, "logits/chosen": 2.744439125061035, "logits/rejected": 2.6930317878723145, "logps/chosen": -91.3368911743164, "logps/rejected": -73.10089111328125, "loss": 2.3463, "nll_loss": 1.5480830669403076, "rewards/accuracies": 1.0, "rewards/chosen": -0.6491341590881348, "rewards/margins": 0.13129264116287231, "rewards/rejected": -0.7804268002510071, "step": 407 }, { "epoch": 0.068, "grad_norm": 142.47901916503906, "learning_rate": 1.9924361097105622e-07, "logits/chosen": 2.29974102973938, "logits/rejected": 2.5466296672821045, "logps/chosen": -134.47210693359375, "logps/rejected": -204.36492919921875, "loss": 1.6642, "nll_loss": 1.2224738597869873, "rewards/accuracies": 1.0, "rewards/chosen": 0.2815795838832855, "rewards/margins": 1.008815050125122, "rewards/rejected": -0.7272354364395142, "step": 408 }, { "epoch": 0.06816666666666667, "grad_norm": 162.089599609375, "learning_rate": 1.992369698983654e-07, "logits/chosen": 1.65287446975708, "logits/rejected": 2.5916388034820557, "logps/chosen": -32.62274932861328, "logps/rejected": -154.25308227539062, "loss": 1.9606, "nll_loss": 1.359281063079834, "rewards/accuracies": 1.0, "rewards/chosen": 0.36220818758010864, "rewards/margins": 0.521580159664154, "rewards/rejected": -0.15937195718288422, "step": 409 }, { "epoch": 0.06833333333333333, "grad_norm": 191.4104461669922, "learning_rate": 1.99230299910404e-07, "logits/chosen": 3.2372865676879883, "logits/rejected": 3.563476324081421, "logps/chosen": -250.76263427734375, "logps/rejected": -413.7361755371094, "loss": 1.6545, "nll_loss": 1.1941076517105103, "rewards/accuracies": 1.0, "rewards/chosen": -0.187225341796875, "rewards/margins": 1.024499535560608, "rewards/rejected": -1.211724877357483, "step": 410 }, { "epoch": 0.0685, "grad_norm": 96.38447570800781, "learning_rate": 1.992236010091155e-07, "logits/chosen": 1.769289255142212, "logits/rejected": 2.366713762283325, "logps/chosen": -70.9959487915039, "logps/rejected": -105.51760864257812, "loss": 2.025, "nll_loss": 1.8204092979431152, "rewards/accuracies": 1.0, "rewards/chosen": 0.9110504388809204, "rewards/margins": 2.174734592437744, "rewards/rejected": -1.2636841535568237, "step": 411 }, { "epoch": 0.06866666666666667, "grad_norm": 74.0869140625, "learning_rate": 1.9921687319645181e-07, "logits/chosen": 2.3114047050476074, "logits/rejected": 2.356903553009033, "logps/chosen": -83.19039154052734, "logps/rejected": -103.42311096191406, "loss": 1.0999, "nll_loss": 0.8756884336471558, "rewards/accuracies": 1.0, "rewards/chosen": 1.0671089887619019, "rewards/margins": 2.0180389881134033, "rewards/rejected": -0.9509300589561462, "step": 412 }, { "epoch": 0.06883333333333333, "grad_norm": 140.9281768798828, "learning_rate": 1.9921011647437325e-07, "logits/chosen": 2.469684362411499, "logits/rejected": 2.2318828105926514, "logps/chosen": -69.36248779296875, "logps/rejected": -34.084720611572266, "loss": 2.2092, "nll_loss": 1.3087259531021118, "rewards/accuracies": 0.0, "rewards/chosen": 0.42787325382232666, "rewards/margins": -0.11712265014648438, "rewards/rejected": 0.544995903968811, "step": 413 }, { "epoch": 0.069, "grad_norm": 71.55677032470703, "learning_rate": 1.9920333084484855e-07, "logits/chosen": 2.0150675773620605, "logits/rejected": 2.1064393520355225, "logps/chosen": -39.44782638549805, "logps/rejected": -34.63311004638672, "loss": 0.9856, "nll_loss": 0.6261560916900635, "rewards/accuracies": 1.0, "rewards/chosen": 0.8241264820098877, "rewards/margins": 1.3076740503311157, "rewards/rejected": -0.4835475981235504, "step": 414 }, { "epoch": 0.06916666666666667, "grad_norm": 129.33123779296875, "learning_rate": 1.9919651630985487e-07, "logits/chosen": 3.794229507446289, "logits/rejected": 3.731818675994873, "logps/chosen": -48.277427673339844, "logps/rejected": -95.69009399414062, "loss": 1.2982, "nll_loss": 0.7314761877059937, "rewards/accuracies": 1.0, "rewards/chosen": 0.9294193387031555, "rewards/margins": 0.6512592434883118, "rewards/rejected": 0.27816009521484375, "step": 415 }, { "epoch": 0.06933333333333333, "grad_norm": 158.98439025878906, "learning_rate": 1.991896728713778e-07, "logits/chosen": 2.8943567276000977, "logits/rejected": 2.939800977706909, "logps/chosen": -15.286381721496582, "logps/rejected": -161.60818481445312, "loss": 1.047, "nll_loss": 0.3821595013141632, "rewards/accuracies": 1.0, "rewards/chosen": 0.3846295475959778, "rewards/margins": 0.3654431402683258, "rewards/rejected": 0.01918640173971653, "step": 416 }, { "epoch": 0.0695, "grad_norm": 79.38163757324219, "learning_rate": 1.991828005314114e-07, "logits/chosen": 0.3286121189594269, "logits/rejected": 2.0046355724334717, "logps/chosen": -105.57286071777344, "logps/rejected": -444.487060546875, "loss": 1.4271, "nll_loss": 1.213481068611145, "rewards/accuracies": 1.0, "rewards/chosen": 0.3981674313545227, "rewards/margins": 2.273289442062378, "rewards/rejected": -1.8751220703125, "step": 417 }, { "epoch": 0.06966666666666667, "grad_norm": 110.30870819091797, "learning_rate": 1.9917589929195808e-07, "logits/chosen": 1.80719792842865, "logits/rejected": 2.5662190914154053, "logps/chosen": -43.733421325683594, "logps/rejected": -136.94876098632812, "loss": 1.2999, "nll_loss": 0.8575180172920227, "rewards/accuracies": 1.0, "rewards/chosen": 0.26102447509765625, "rewards/margins": 1.0082199573516846, "rewards/rejected": -0.7471954226493835, "step": 418 }, { "epoch": 0.06983333333333333, "grad_norm": 75.88169860839844, "learning_rate": 1.9916896915502868e-07, "logits/chosen": 2.1289825439453125, "logits/rejected": 2.5611660480499268, "logps/chosen": -70.42594909667969, "logps/rejected": -247.67257690429688, "loss": 0.9154, "nll_loss": 0.6904504895210266, "rewards/accuracies": 1.0, "rewards/chosen": 0.8394088745117188, "rewards/margins": 2.028242588043213, "rewards/rejected": -1.1888335943222046, "step": 419 }, { "epoch": 0.07, "grad_norm": 97.70183563232422, "learning_rate": 1.991620101226425e-07, "logits/chosen": 2.2258129119873047, "logits/rejected": 2.54664945602417, "logps/chosen": -43.77426528930664, "logps/rejected": -142.20669555664062, "loss": 0.9593, "nll_loss": 0.6344096660614014, "rewards/accuracies": 1.0, "rewards/chosen": 0.8518585562705994, "rewards/margins": 1.4582138061523438, "rewards/rejected": -0.6063553094863892, "step": 420 }, { "epoch": 0.07016666666666667, "grad_norm": 128.22865295410156, "learning_rate": 1.9915502219682723e-07, "logits/chosen": 0.8482425212860107, "logits/rejected": 2.9160776138305664, "logps/chosen": -44.988311767578125, "logps/rejected": -367.09979248046875, "loss": 2.0902, "nll_loss": 1.799532413482666, "rewards/accuracies": 1.0, "rewards/chosen": 0.34467774629592896, "rewards/margins": 1.6973786354064941, "rewards/rejected": -1.35270094871521, "step": 421 }, { "epoch": 0.07033333333333333, "grad_norm": 145.43280029296875, "learning_rate": 1.9914800537961893e-07, "logits/chosen": 3.0297443866729736, "logits/rejected": 3.0538690090179443, "logps/chosen": -44.71299362182617, "logps/rejected": -48.80281066894531, "loss": 1.4104, "nll_loss": 0.5589123964309692, "rewards/accuracies": 1.0, "rewards/chosen": 0.853919267654419, "rewards/margins": 0.019278347492218018, "rewards/rejected": 0.8346409201622009, "step": 422 }, { "epoch": 0.0705, "grad_norm": 92.84748077392578, "learning_rate": 1.991409596730622e-07, "logits/chosen": 2.618774175643921, "logits/rejected": 2.7674405574798584, "logps/chosen": -34.01647186279297, "logps/rejected": -119.61368560791016, "loss": 0.8563, "nll_loss": 0.5315074324607849, "rewards/accuracies": 1.0, "rewards/chosen": 0.8592125177383423, "rewards/margins": 1.4590377807617188, "rewards/rejected": -0.5998253226280212, "step": 423 }, { "epoch": 0.07066666666666667, "grad_norm": 49.87283706665039, "learning_rate": 1.9913388507921002e-07, "logits/chosen": 2.2655820846557617, "logits/rejected": 1.9575968980789185, "logps/chosen": -66.85295104980469, "logps/rejected": -55.54737091064453, "loss": 0.9373, "nll_loss": 0.7773599624633789, "rewards/accuracies": 1.0, "rewards/chosen": 1.5650131702423096, "rewards/margins": 2.553896427154541, "rewards/rejected": -0.9888832569122314, "step": 424 }, { "epoch": 0.07083333333333333, "grad_norm": 76.94860076904297, "learning_rate": 1.9912678160012364e-07, "logits/chosen": 2.4240963459014893, "logits/rejected": 2.365434169769287, "logps/chosen": -57.347511291503906, "logps/rejected": -17.31131362915039, "loss": 1.2169, "nll_loss": 0.8433457612991333, "rewards/accuracies": 1.0, "rewards/chosen": 0.915270984172821, "rewards/margins": 1.256496787071228, "rewards/rejected": -0.34122583270072937, "step": 425 }, { "epoch": 0.071, "grad_norm": 94.39221954345703, "learning_rate": 1.9911964923787292e-07, "logits/chosen": 3.0732951164245605, "logits/rejected": 3.175832509994507, "logps/chosen": -70.48291015625, "logps/rejected": -293.2926330566406, "loss": 1.2795, "nll_loss": 1.0068989992141724, "rewards/accuracies": 1.0, "rewards/chosen": 0.4617759883403778, "rewards/margins": 1.7791130542755127, "rewards/rejected": -1.3173370361328125, "step": 426 }, { "epoch": 0.07116666666666667, "grad_norm": 75.8165283203125, "learning_rate": 1.9911248799453607e-07, "logits/chosen": 3.2901229858398438, "logits/rejected": 3.3583261966705322, "logps/chosen": -31.188644409179688, "logps/rejected": -19.44209861755371, "loss": 0.9266, "nll_loss": 0.5775675177574158, "rewards/accuracies": 1.0, "rewards/chosen": 0.7344902157783508, "rewards/margins": 1.3501255512237549, "rewards/rejected": -0.615635335445404, "step": 427 }, { "epoch": 0.07133333333333333, "grad_norm": 103.00367736816406, "learning_rate": 1.9910529787219967e-07, "logits/chosen": 2.820185661315918, "logits/rejected": 2.7961907386779785, "logps/chosen": -119.82521057128906, "logps/rejected": -188.02984619140625, "loss": 1.5357, "nll_loss": 1.1982518434524536, "rewards/accuracies": 1.0, "rewards/chosen": 0.9301101565361023, "rewards/margins": 1.4057632684707642, "rewards/rejected": -0.4756530821323395, "step": 428 }, { "epoch": 0.0715, "grad_norm": 208.22177124023438, "learning_rate": 1.9909807887295877e-07, "logits/chosen": 2.78694748878479, "logits/rejected": 2.72029185295105, "logps/chosen": -108.65910339355469, "logps/rejected": -54.922691345214844, "loss": 2.0301, "nll_loss": 0.9878100752830505, "rewards/accuracies": 0.0, "rewards/chosen": 0.00504150427877903, "rewards/margins": -0.3860839903354645, "rewards/rejected": 0.39112550020217896, "step": 429 }, { "epoch": 0.07166666666666667, "grad_norm": 115.5654296875, "learning_rate": 1.990908309989168e-07, "logits/chosen": 2.8322300910949707, "logits/rejected": 3.609400987625122, "logps/chosen": -63.152164459228516, "logps/rejected": -218.4949951171875, "loss": 1.7858, "nll_loss": 1.40338134765625, "rewards/accuracies": 1.0, "rewards/chosen": 0.3377353847026825, "rewards/margins": 1.2342045307159424, "rewards/rejected": -0.8964691162109375, "step": 430 }, { "epoch": 0.07183333333333333, "grad_norm": 113.9919204711914, "learning_rate": 1.990835542521856e-07, "logits/chosen": 3.073394298553467, "logits/rejected": 3.0597081184387207, "logps/chosen": -58.799068450927734, "logps/rejected": -67.83763885498047, "loss": 1.4239, "nll_loss": 0.8281558752059937, "rewards/accuracies": 1.0, "rewards/chosen": 0.3076145350933075, "rewards/margins": 0.5360264182090759, "rewards/rejected": -0.22841186821460724, "step": 431 }, { "epoch": 0.072, "grad_norm": 86.5669174194336, "learning_rate": 1.9907624863488547e-07, "logits/chosen": 3.387197256088257, "logits/rejected": 3.472726345062256, "logps/chosen": -86.64368438720703, "logps/rejected": -151.10479736328125, "loss": 1.6143, "nll_loss": 1.3538074493408203, "rewards/accuracies": 1.0, "rewards/chosen": 0.8113563656806946, "rewards/margins": 1.7959572076797485, "rewards/rejected": -0.984600841999054, "step": 432 }, { "epoch": 0.07216666666666667, "grad_norm": 93.99266815185547, "learning_rate": 1.9906891414914508e-07, "logits/chosen": 2.875256299972534, "logits/rejected": 2.853537082672119, "logps/chosen": -70.42756652832031, "logps/rejected": -134.04888916015625, "loss": 1.0286, "nll_loss": 0.7913209795951843, "rewards/accuracies": 1.0, "rewards/chosen": 0.8332642316818237, "rewards/margins": 1.9425218105316162, "rewards/rejected": -1.1092575788497925, "step": 433 }, { "epoch": 0.07233333333333333, "grad_norm": 167.0312957763672, "learning_rate": 1.9906155079710154e-07, "logits/chosen": 3.0018234252929688, "logits/rejected": 2.845228672027588, "logps/chosen": -80.5176010131836, "logps/rejected": -71.2723617553711, "loss": 1.7676, "nll_loss": 0.8475536704063416, "rewards/accuracies": 0.0, "rewards/chosen": 0.6061286926269531, "rewards/margins": -0.13644111156463623, "rewards/rejected": 0.7425698041915894, "step": 434 }, { "epoch": 0.0725, "grad_norm": 90.24458312988281, "learning_rate": 1.9905415858090032e-07, "logits/chosen": 2.8027279376983643, "logits/rejected": 2.877140998840332, "logps/chosen": -142.51434326171875, "logps/rejected": -92.22039031982422, "loss": 0.9565, "nll_loss": 0.6722375154495239, "rewards/accuracies": 1.0, "rewards/chosen": 1.1535857915878296, "rewards/margins": 1.6723426580429077, "rewards/rejected": -0.5187568664550781, "step": 435 }, { "epoch": 0.07266666666666667, "grad_norm": 73.29553985595703, "learning_rate": 1.9904673750269536e-07, "logits/chosen": 2.6032614707946777, "logits/rejected": 2.7415354251861572, "logps/chosen": -94.97499084472656, "logps/rejected": -171.9806365966797, "loss": 1.3366, "nll_loss": 1.1725307703018188, "rewards/accuracies": 1.0, "rewards/chosen": 0.7949432730674744, "rewards/margins": 2.6092913150787354, "rewards/rejected": -1.8143479824066162, "step": 436 }, { "epoch": 0.07283333333333333, "grad_norm": 89.28350067138672, "learning_rate": 1.99039287564649e-07, "logits/chosen": 3.2641475200653076, "logits/rejected": 3.254258871078491, "logps/chosen": -23.08012580871582, "logps/rejected": -149.15159606933594, "loss": 0.7946, "nll_loss": 0.5367471575737, "rewards/accuracies": 1.0, "rewards/chosen": 0.7278234362602234, "rewards/margins": 1.820619821548462, "rewards/rejected": -1.0927963256835938, "step": 437 }, { "epoch": 0.073, "grad_norm": 119.34525299072266, "learning_rate": 1.990318087689319e-07, "logits/chosen": 1.1474239826202393, "logits/rejected": 2.5850954055786133, "logps/chosen": -96.21746826171875, "logps/rejected": -152.08419799804688, "loss": 1.7905, "nll_loss": 1.4802685976028442, "rewards/accuracies": 1.0, "rewards/chosen": 0.3060615658760071, "rewards/margins": 1.5926902294158936, "rewards/rejected": -1.2866287231445312, "step": 438 }, { "epoch": 0.07316666666666667, "grad_norm": 93.9219741821289, "learning_rate": 1.990243011177233e-07, "logits/chosen": 2.1973977088928223, "logits/rejected": 2.161865234375, "logps/chosen": -161.0986328125, "logps/rejected": -209.6383514404297, "loss": 1.0646, "nll_loss": 0.7458270192146301, "rewards/accuracies": 1.0, "rewards/chosen": 0.801318347454071, "rewards/margins": 1.4861037731170654, "rewards/rejected": -0.6847854852676392, "step": 439 }, { "epoch": 0.07333333333333333, "grad_norm": 109.6817626953125, "learning_rate": 1.9901676461321067e-07, "logits/chosen": 2.740464925765991, "logits/rejected": 2.6643025875091553, "logps/chosen": -117.78298950195312, "logps/rejected": -19.977426528930664, "loss": 1.8734, "nll_loss": 1.3384431600570679, "rewards/accuracies": 1.0, "rewards/chosen": 0.9778686761856079, "rewards/margins": 0.7431325912475586, "rewards/rejected": 0.23473607003688812, "step": 440 }, { "epoch": 0.0735, "grad_norm": 117.66997528076172, "learning_rate": 1.9900919925758998e-07, "logits/chosen": 2.7142419815063477, "logits/rejected": 2.919539213180542, "logps/chosen": -97.75965881347656, "logps/rejected": -270.4588623046875, "loss": 1.1307, "nll_loss": 0.8575407266616821, "rewards/accuracies": 1.0, "rewards/chosen": 0.6505325436592102, "rewards/margins": 1.7376420497894287, "rewards/rejected": -1.0871094465255737, "step": 441 }, { "epoch": 0.07366666666666667, "grad_norm": 73.71847534179688, "learning_rate": 1.9900160505306566e-07, "logits/chosen": 2.8571298122406006, "logits/rejected": 3.000324010848999, "logps/chosen": -85.98734283447266, "logps/rejected": -118.01527404785156, "loss": 1.3357, "nll_loss": 1.1464979648590088, "rewards/accuracies": 1.0, "rewards/chosen": 1.3259797096252441, "rewards/margins": 2.2813539505004883, "rewards/rejected": -0.9553741812705994, "step": 442 }, { "epoch": 0.07383333333333333, "grad_norm": 90.60680389404297, "learning_rate": 1.9899398200185037e-07, "logits/chosen": 3.2927675247192383, "logits/rejected": 3.2265796661376953, "logps/chosen": -54.304534912109375, "logps/rejected": -105.82667541503906, "loss": 1.1122, "nll_loss": 0.7542297840118408, "rewards/accuracies": 1.0, "rewards/chosen": 1.0794814825057983, "rewards/margins": 1.3342254161834717, "rewards/rejected": -0.2547439634799957, "step": 443 }, { "epoch": 0.074, "grad_norm": 115.38491821289062, "learning_rate": 1.989863301061654e-07, "logits/chosen": 1.9570443630218506, "logits/rejected": 2.045306921005249, "logps/chosen": -40.12623596191406, "logps/rejected": -139.80809020996094, "loss": 2.0046, "nll_loss": 1.7446190118789673, "rewards/accuracies": 1.0, "rewards/chosen": 1.2262753248214722, "rewards/margins": 1.8087384700775146, "rewards/rejected": -0.5824630856513977, "step": 444 }, { "epoch": 0.07416666666666667, "grad_norm": 69.26205444335938, "learning_rate": 1.9897864936824024e-07, "logits/chosen": 3.2829294204711914, "logits/rejected": 3.30301833152771, "logps/chosen": -45.25738525390625, "logps/rejected": -15.12937068939209, "loss": 1.3284, "nll_loss": 1.0057196617126465, "rewards/accuracies": 1.0, "rewards/chosen": 1.1478585004806519, "rewards/margins": 1.490565538406372, "rewards/rejected": -0.3427070677280426, "step": 445 }, { "epoch": 0.07433333333333333, "grad_norm": 37.47370910644531, "learning_rate": 1.989709397903129e-07, "logits/chosen": 2.592937469482422, "logits/rejected": 2.675494909286499, "logps/chosen": -184.35073852539062, "logps/rejected": -165.36279296875, "loss": 0.7556, "nll_loss": 0.6631321310997009, "rewards/accuracies": 1.0, "rewards/chosen": 1.3974411487579346, "rewards/margins": 3.4992265701293945, "rewards/rejected": -2.10178542137146, "step": 446 }, { "epoch": 0.0745, "grad_norm": 151.867431640625, "learning_rate": 1.989632013746298e-07, "logits/chosen": 3.0224838256835938, "logits/rejected": 3.0453009605407715, "logps/chosen": -81.57206726074219, "logps/rejected": -15.659663200378418, "loss": 2.1534, "nll_loss": 1.4566441774368286, "rewards/accuracies": 1.0, "rewards/chosen": 0.01092224195599556, "rewards/margins": 0.2935298979282379, "rewards/rejected": -0.28260764479637146, "step": 447 }, { "epoch": 0.07466666666666667, "grad_norm": 48.6195068359375, "learning_rate": 1.9895543412344567e-07, "logits/chosen": 2.6741669178009033, "logits/rejected": 2.7692840099334717, "logps/chosen": -79.02174377441406, "logps/rejected": -162.27862548828125, "loss": 0.9901, "nll_loss": 0.8683707118034363, "rewards/accuracies": 1.0, "rewards/chosen": 0.9989982843399048, "rewards/margins": 3.129819393157959, "rewards/rejected": -2.1308212280273438, "step": 448 }, { "epoch": 0.07483333333333334, "grad_norm": 136.39678955078125, "learning_rate": 1.9894763803902373e-07, "logits/chosen": 3.4095659255981445, "logits/rejected": 3.3051397800445557, "logps/chosen": -79.93569946289062, "logps/rejected": -45.5258903503418, "loss": 1.5452, "nll_loss": 0.7201413512229919, "rewards/accuracies": 1.0, "rewards/chosen": 0.9411316514015198, "rewards/margins": 0.0821804404258728, "rewards/rejected": 0.858951210975647, "step": 449 }, { "epoch": 0.075, "grad_norm": 223.73081970214844, "learning_rate": 1.989398131236356e-07, "logits/chosen": 2.4299283027648926, "logits/rejected": 2.4378011226654053, "logps/chosen": -56.33473205566406, "logps/rejected": -89.36936950683594, "loss": 0.8957, "nll_loss": 0.7130979895591736, "rewards/accuracies": 1.0, "rewards/chosen": 1.469325304031372, "rewards/margins": 2.347813606262207, "rewards/rejected": -0.8784881830215454, "step": 450 }, { "epoch": 0.07516666666666667, "grad_norm": 60.7216682434082, "learning_rate": 1.9893195937956122e-07, "logits/chosen": 2.4216060638427734, "logits/rejected": 2.3117258548736572, "logps/chosen": -52.03632736206055, "logps/rejected": -49.24159240722656, "loss": 0.9657, "nll_loss": 0.7329059839248657, "rewards/accuracies": 1.0, "rewards/chosen": 0.8944774866104126, "rewards/margins": 1.967081904411316, "rewards/rejected": -1.0726044178009033, "step": 451 }, { "epoch": 0.07533333333333334, "grad_norm": 104.67797088623047, "learning_rate": 1.98924076809089e-07, "logits/chosen": 4.038281440734863, "logits/rejected": 4.22868537902832, "logps/chosen": -21.412864685058594, "logps/rejected": -103.76444244384766, "loss": 1.1087, "nll_loss": 0.6691519618034363, "rewards/accuracies": 1.0, "rewards/chosen": 0.8824473023414612, "rewards/margins": 1.017199158668518, "rewards/rejected": -0.13475190103054047, "step": 452 }, { "epoch": 0.0755, "grad_norm": 34.46847915649414, "learning_rate": 1.9891616541451576e-07, "logits/chosen": 3.1372270584106445, "logits/rejected": 3.1905503273010254, "logps/chosen": -73.6513671875, "logps/rejected": -217.60894775390625, "loss": 0.7936, "nll_loss": 0.7081862688064575, "rewards/accuracies": 1.0, "rewards/chosen": 1.1035140752792358, "rewards/margins": 3.9955430030822754, "rewards/rejected": -2.892029047012329, "step": 453 }, { "epoch": 0.07566666666666666, "grad_norm": 64.61624145507812, "learning_rate": 1.9890822519814665e-07, "logits/chosen": 2.13973331451416, "logits/rejected": 2.6673924922943115, "logps/chosen": -59.30845642089844, "logps/rejected": -135.71189880371094, "loss": 1.0828, "nll_loss": 0.8595430254936218, "rewards/accuracies": 1.0, "rewards/chosen": 0.5707947015762329, "rewards/margins": 2.104872226715088, "rewards/rejected": -1.534077525138855, "step": 454 }, { "epoch": 0.07583333333333334, "grad_norm": 168.9178009033203, "learning_rate": 1.9890025616229526e-07, "logits/chosen": 1.3369532823562622, "logits/rejected": 3.5298666954040527, "logps/chosen": -31.76618194580078, "logps/rejected": -280.97210693359375, "loss": 0.9208, "nll_loss": 0.4813058078289032, "rewards/accuracies": 1.0, "rewards/chosen": 0.7251038551330566, "rewards/margins": 1.0063294172286987, "rewards/rejected": -0.2812255918979645, "step": 455 }, { "epoch": 0.076, "grad_norm": 54.12069320678711, "learning_rate": 1.988922583092836e-07, "logits/chosen": 2.121899366378784, "logits/rejected": 2.0852110385894775, "logps/chosen": -68.60812377929688, "logps/rejected": -129.69424438476562, "loss": 0.9976, "nll_loss": 0.8266040086746216, "rewards/accuracies": 1.0, "rewards/chosen": 1.0325149297714233, "rewards/margins": 2.455807685852051, "rewards/rejected": -1.423292636871338, "step": 456 }, { "epoch": 0.07616666666666666, "grad_norm": 159.9742431640625, "learning_rate": 1.988842316414421e-07, "logits/chosen": 3.2221083641052246, "logits/rejected": 3.3228752613067627, "logps/chosen": -30.80352783203125, "logps/rejected": -114.164306640625, "loss": 2.06, "nll_loss": 1.5401763916015625, "rewards/accuracies": 1.0, "rewards/chosen": 0.16467438638210297, "rewards/margins": 0.7578476071357727, "rewards/rejected": -0.5931732058525085, "step": 457 }, { "epoch": 0.07633333333333334, "grad_norm": 82.50293731689453, "learning_rate": 1.9887617616110944e-07, "logits/chosen": 2.352020025253296, "logits/rejected": 1.9703311920166016, "logps/chosen": -85.3687515258789, "logps/rejected": -60.878761291503906, "loss": 1.2266, "nll_loss": 0.8892578482627869, "rewards/accuracies": 1.0, "rewards/chosen": 0.6002235412597656, "rewards/margins": 1.4068272113800049, "rewards/rejected": -0.8066036701202393, "step": 458 }, { "epoch": 0.0765, "grad_norm": 112.79603576660156, "learning_rate": 1.9886809187063284e-07, "logits/chosen": 3.391003131866455, "logits/rejected": 3.430558443069458, "logps/chosen": -44.60591125488281, "logps/rejected": -97.783935546875, "loss": 1.5673, "nll_loss": 1.0620454549789429, "rewards/accuracies": 1.0, "rewards/chosen": 0.5321712493896484, "rewards/margins": 0.7911128997802734, "rewards/rejected": -0.258941650390625, "step": 459 }, { "epoch": 0.07666666666666666, "grad_norm": 70.92733764648438, "learning_rate": 1.9885997877236786e-07, "logits/chosen": 2.677464246749878, "logits/rejected": 2.5848021507263184, "logps/chosen": -64.60681915283203, "logps/rejected": -21.190153121948242, "loss": 1.242, "nll_loss": 0.8730651140213013, "rewards/accuracies": 1.0, "rewards/chosen": 1.2042922973632812, "rewards/margins": 1.3096132278442383, "rewards/rejected": -0.10532093048095703, "step": 460 }, { "epoch": 0.07683333333333334, "grad_norm": 125.91451263427734, "learning_rate": 1.9885183686867845e-07, "logits/chosen": 2.767807960510254, "logits/rejected": 2.894044876098633, "logps/chosen": -97.58515930175781, "logps/rejected": -95.87811279296875, "loss": 1.6574, "nll_loss": 1.0723645687103271, "rewards/accuracies": 1.0, "rewards/chosen": 0.7093720436096191, "rewards/margins": 0.5804826021194458, "rewards/rejected": 0.12888947129249573, "step": 461 }, { "epoch": 0.077, "grad_norm": 72.00745391845703, "learning_rate": 1.9884366616193704e-07, "logits/chosen": 2.2632229328155518, "logits/rejected": 2.9955735206604004, "logps/chosen": -23.058025360107422, "logps/rejected": -100.1103744506836, "loss": 1.1097, "nll_loss": 0.8868472576141357, "rewards/accuracies": 1.0, "rewards/chosen": 1.0445060729980469, "rewards/margins": 2.0274810791015625, "rewards/rejected": -0.9829750061035156, "step": 462 }, { "epoch": 0.07716666666666666, "grad_norm": 47.83548355102539, "learning_rate": 1.9883546665452427e-07, "logits/chosen": 1.9478243589401245, "logits/rejected": 1.3946830034255981, "logps/chosen": -56.38369369506836, "logps/rejected": -44.64006042480469, "loss": 0.8963, "nll_loss": 0.6793215274810791, "rewards/accuracies": 1.0, "rewards/chosen": 1.5319347381591797, "rewards/margins": 2.1133601665496826, "rewards/rejected": -0.5814254879951477, "step": 463 }, { "epoch": 0.07733333333333334, "grad_norm": 116.92327117919922, "learning_rate": 1.9882723834882932e-07, "logits/chosen": 2.6115448474884033, "logits/rejected": 2.829887866973877, "logps/chosen": -103.53538513183594, "logps/rejected": -246.2332305908203, "loss": 2.4749, "nll_loss": 2.202880620956421, "rewards/accuracies": 1.0, "rewards/chosen": -0.1967620849609375, "rewards/margins": 2.147722005844116, "rewards/rejected": -2.3444840908050537, "step": 464 }, { "epoch": 0.0775, "grad_norm": 73.64630889892578, "learning_rate": 1.9881898124724978e-07, "logits/chosen": 2.253957748413086, "logits/rejected": 2.266573905944824, "logps/chosen": -71.81448364257812, "logps/rejected": -18.404144287109375, "loss": 1.3784, "nll_loss": 1.0114717483520508, "rewards/accuracies": 1.0, "rewards/chosen": 1.001501441001892, "rewards/margins": 1.2904294729232788, "rewards/rejected": -0.2889280319213867, "step": 465 }, { "epoch": 0.07766666666666666, "grad_norm": 101.58184051513672, "learning_rate": 1.9881069535219148e-07, "logits/chosen": 2.3990049362182617, "logits/rejected": 2.1991045475006104, "logps/chosen": -112.29803466796875, "logps/rejected": -48.17557144165039, "loss": 1.4153, "nll_loss": 1.1118618249893188, "rewards/accuracies": 1.0, "rewards/chosen": 0.5065650939941406, "rewards/margins": 1.5870860815048218, "rewards/rejected": -1.0805209875106812, "step": 466 }, { "epoch": 0.07783333333333334, "grad_norm": 233.45584106445312, "learning_rate": 1.9880238066606882e-07, "logits/chosen": 3.0438404083251953, "logits/rejected": 3.0586555004119873, "logps/chosen": -62.95942687988281, "logps/rejected": -24.55706024169922, "loss": 2.9468, "nll_loss": 1.1447169780731201, "rewards/accuracies": 0.0, "rewards/chosen": -0.606632649898529, "rewards/margins": -1.4822399616241455, "rewards/rejected": 0.8756073713302612, "step": 467 }, { "epoch": 0.078, "grad_norm": 98.62518310546875, "learning_rate": 1.9879403719130438e-07, "logits/chosen": 2.572173595428467, "logits/rejected": 3.024492025375366, "logps/chosen": -57.599151611328125, "logps/rejected": -245.425537109375, "loss": 2.1896, "nll_loss": 1.9861773252487183, "rewards/accuracies": 1.0, "rewards/chosen": 0.3392539918422699, "rewards/margins": 2.4180045127868652, "rewards/rejected": -2.0787506103515625, "step": 468 }, { "epoch": 0.07816666666666666, "grad_norm": 87.93802642822266, "learning_rate": 1.987856649303294e-07, "logits/chosen": 2.856599807739258, "logits/rejected": 2.9276294708251953, "logps/chosen": -86.54827117919922, "logps/rejected": -121.84282684326172, "loss": 1.2508, "nll_loss": 0.9948077201843262, "rewards/accuracies": 1.0, "rewards/chosen": 0.926630437374115, "rewards/margins": 1.8169381618499756, "rewards/rejected": -0.8903076648712158, "step": 469 }, { "epoch": 0.07833333333333334, "grad_norm": 92.46974182128906, "learning_rate": 1.9877726388558322e-07, "logits/chosen": 2.952493190765381, "logits/rejected": 2.9872710704803467, "logps/chosen": -47.69475173950195, "logps/rejected": -91.94694519042969, "loss": 1.0495, "nll_loss": 0.619412362575531, "rewards/accuracies": 1.0, "rewards/chosen": 0.6102378964424133, "rewards/margins": 1.0355125665664673, "rewards/rejected": -0.42527467012405396, "step": 470 }, { "epoch": 0.0785, "grad_norm": 42.19978713989258, "learning_rate": 1.9876883405951376e-07, "logits/chosen": 1.8931840658187866, "logits/rejected": 1.3845605850219727, "logps/chosen": -172.4434356689453, "logps/rejected": -155.11012268066406, "loss": 0.8267, "nll_loss": 0.7067353129386902, "rewards/accuracies": 1.0, "rewards/chosen": 1.1630722284317017, "rewards/margins": 3.0678300857543945, "rewards/rejected": -1.9047577381134033, "step": 471 }, { "epoch": 0.07866666666666666, "grad_norm": 68.09273529052734, "learning_rate": 1.987603754545773e-07, "logits/chosen": 2.9173429012298584, "logits/rejected": 2.996516227722168, "logps/chosen": -12.560306549072266, "logps/rejected": -7.801944732666016, "loss": 1.0377, "nll_loss": 0.5024122595787048, "rewards/accuracies": 1.0, "rewards/chosen": 0.4159824550151825, "rewards/margins": 0.7018672227859497, "rewards/rejected": -0.2858847677707672, "step": 472 }, { "epoch": 0.07883333333333334, "grad_norm": 92.937255859375, "learning_rate": 1.987518880732384e-07, "logits/chosen": 3.1169192790985107, "logits/rejected": 3.016918659210205, "logps/chosen": -31.678844451904297, "logps/rejected": -15.360312461853027, "loss": 1.3183, "nll_loss": 0.7039743065834045, "rewards/accuracies": 1.0, "rewards/chosen": 0.11132850497961044, "rewards/margins": 0.49281179904937744, "rewards/rejected": -0.3814832866191864, "step": 473 }, { "epoch": 0.079, "grad_norm": 116.49576568603516, "learning_rate": 1.9874337191797017e-07, "logits/chosen": 2.9276413917541504, "logits/rejected": 2.988596200942993, "logps/chosen": -56.873294830322266, "logps/rejected": -89.91249084472656, "loss": 1.9551, "nll_loss": 1.5798139572143555, "rewards/accuracies": 1.0, "rewards/chosen": 0.32240602374076843, "rewards/margins": 1.2660439014434814, "rewards/rejected": -0.9436378479003906, "step": 474 }, { "epoch": 0.07916666666666666, "grad_norm": 113.58451080322266, "learning_rate": 1.9873482699125393e-07, "logits/chosen": 3.3792271614074707, "logits/rejected": 3.340907096862793, "logps/chosen": -119.46018981933594, "logps/rejected": -173.38772583007812, "loss": 1.9081, "nll_loss": 1.5718443393707275, "rewards/accuracies": 1.0, "rewards/chosen": 0.4988288879394531, "rewards/margins": 1.4207558631896973, "rewards/rejected": -0.9219269752502441, "step": 475 }, { "epoch": 0.07933333333333334, "grad_norm": 92.05630493164062, "learning_rate": 1.9872625329557953e-07, "logits/chosen": 2.097156524658203, "logits/rejected": 2.2122254371643066, "logps/chosen": -54.71734619140625, "logps/rejected": -107.33419799804688, "loss": 1.2016, "nll_loss": 0.8290507793426514, "rewards/accuracies": 1.0, "rewards/chosen": 0.5843502283096313, "rewards/margins": 1.2540478706359863, "rewards/rejected": -0.6696975827217102, "step": 476 }, { "epoch": 0.0795, "grad_norm": 43.04352569580078, "learning_rate": 1.9871765083344508e-07, "logits/chosen": 2.268688678741455, "logits/rejected": 2.831210136413574, "logps/chosen": -75.31553649902344, "logps/rejected": -257.6057434082031, "loss": 1.2044, "nll_loss": 1.1241124868392944, "rewards/accuracies": 1.0, "rewards/chosen": 0.8842361569404602, "rewards/margins": 5.087133884429932, "rewards/rejected": -4.202897548675537, "step": 477 }, { "epoch": 0.07966666666666666, "grad_norm": 65.92811584472656, "learning_rate": 1.9870901960735717e-07, "logits/chosen": 2.93652081489563, "logits/rejected": 3.1793792247772217, "logps/chosen": -34.327388763427734, "logps/rejected": -274.44708251953125, "loss": 1.0351, "nll_loss": 0.8372533917427063, "rewards/accuracies": 1.0, "rewards/chosen": 0.3966549038887024, "rewards/margins": 2.434133529663086, "rewards/rejected": -2.0374786853790283, "step": 478 }, { "epoch": 0.07983333333333334, "grad_norm": 89.68827056884766, "learning_rate": 1.987003596198307e-07, "logits/chosen": 3.5624983310699463, "logits/rejected": 3.570770740509033, "logps/chosen": -41.18531036376953, "logps/rejected": -78.9588851928711, "loss": 1.5797, "nll_loss": 1.248039722442627, "rewards/accuracies": 1.0, "rewards/chosen": 0.5980430841445923, "rewards/margins": 1.4327912330627441, "rewards/rejected": -0.8347480893135071, "step": 479 }, { "epoch": 0.08, "grad_norm": 92.2607421875, "learning_rate": 1.9869167087338905e-07, "logits/chosen": 3.0119519233703613, "logits/rejected": 2.9774370193481445, "logps/chosen": -21.481103897094727, "logps/rejected": -8.364031791687012, "loss": 1.3726, "nll_loss": 0.933961033821106, "rewards/accuracies": 1.0, "rewards/chosen": 0.610583484172821, "rewards/margins": 1.005987524986267, "rewards/rejected": -0.39540407061576843, "step": 480 }, { "epoch": 0.08016666666666666, "grad_norm": 35.26662826538086, "learning_rate": 1.9868295337056386e-07, "logits/chosen": 2.6135590076446533, "logits/rejected": 2.595407009124756, "logps/chosen": -85.19733428955078, "logps/rejected": -65.25315856933594, "loss": 0.9785, "nll_loss": 0.8605790734291077, "rewards/accuracies": 1.0, "rewards/chosen": 2.3999321460723877, "rewards/margins": 3.236044406890869, "rewards/rejected": -0.8361122012138367, "step": 481 }, { "epoch": 0.08033333333333334, "grad_norm": 151.77572631835938, "learning_rate": 1.986742071138952e-07, "logits/chosen": 2.729708671569824, "logits/rejected": 2.7908477783203125, "logps/chosen": -80.30999755859375, "logps/rejected": -39.612266540527344, "loss": 2.6214, "nll_loss": 1.638979434967041, "rewards/accuracies": 0.0, "rewards/chosen": 0.5429893732070923, "rewards/margins": -0.2506115436553955, "rewards/rejected": 0.7936009168624878, "step": 482 }, { "epoch": 0.0805, "grad_norm": 102.19202423095703, "learning_rate": 1.9866543210593152e-07, "logits/chosen": 2.9992406368255615, "logits/rejected": 3.1105740070343018, "logps/chosen": -34.84341812133789, "logps/rejected": -93.79362487792969, "loss": 0.9759, "nll_loss": 0.5807235836982727, "rewards/accuracies": 1.0, "rewards/chosen": 1.4484825134277344, "rewards/margins": 1.2639518976211548, "rewards/rejected": 0.18453064560890198, "step": 483 }, { "epoch": 0.08066666666666666, "grad_norm": 43.626197814941406, "learning_rate": 1.9865662834922968e-07, "logits/chosen": 2.3519058227539062, "logits/rejected": 2.0059309005737305, "logps/chosen": -23.87891387939453, "logps/rejected": -21.77129364013672, "loss": 0.6791, "nll_loss": 0.4682140052318573, "rewards/accuracies": 1.0, "rewards/chosen": 1.443907618522644, "rewards/margins": 2.13883113861084, "rewards/rejected": -0.6949236392974854, "step": 484 }, { "epoch": 0.08083333333333333, "grad_norm": 45.606178283691406, "learning_rate": 1.9864779584635484e-07, "logits/chosen": 2.643995761871338, "logits/rejected": 2.541954517364502, "logps/chosen": -61.7417106628418, "logps/rejected": -21.819673538208008, "loss": 0.9714, "nll_loss": 0.8018404245376587, "rewards/accuracies": 1.0, "rewards/chosen": 1.777194619178772, "rewards/margins": 2.5154857635498047, "rewards/rejected": -0.7382912039756775, "step": 485 }, { "epoch": 0.081, "grad_norm": 133.04505920410156, "learning_rate": 1.986389345998806e-07, "logits/chosen": 2.27359938621521, "logits/rejected": 2.8337018489837646, "logps/chosen": -90.70292663574219, "logps/rejected": -183.00411987304688, "loss": 1.6664, "nll_loss": 1.2597627639770508, "rewards/accuracies": 1.0, "rewards/chosen": 0.10376129299402237, "rewards/margins": 1.1702485084533691, "rewards/rejected": -1.066487193107605, "step": 486 }, { "epoch": 0.08116666666666666, "grad_norm": 109.63984680175781, "learning_rate": 1.986300446123889e-07, "logits/chosen": 4.263433456420898, "logits/rejected": 4.468450546264648, "logps/chosen": -45.873435974121094, "logps/rejected": -284.01788330078125, "loss": 1.669, "nll_loss": 1.3492186069488525, "rewards/accuracies": 1.0, "rewards/chosen": -0.17593078315258026, "rewards/margins": 1.7321045398712158, "rewards/rejected": -1.9080352783203125, "step": 487 }, { "epoch": 0.08133333333333333, "grad_norm": 85.8838119506836, "learning_rate": 1.986211258864701e-07, "logits/chosen": 3.2789738178253174, "logits/rejected": 3.4211692810058594, "logps/chosen": -68.08654022216797, "logps/rejected": -162.6693572998047, "loss": 1.2433, "nll_loss": 1.063852071762085, "rewards/accuracies": 1.0, "rewards/chosen": 0.6422958374023438, "rewards/margins": 2.4963440895080566, "rewards/rejected": -1.854048252105713, "step": 488 }, { "epoch": 0.0815, "grad_norm": 101.25920867919922, "learning_rate": 1.9861217842472288e-07, "logits/chosen": 1.0288316011428833, "logits/rejected": 2.345654249191284, "logps/chosen": -36.72187805175781, "logps/rejected": -248.81008911132812, "loss": 1.1306, "nll_loss": 0.8743304014205933, "rewards/accuracies": 1.0, "rewards/chosen": 0.882697343826294, "rewards/margins": 1.816835880279541, "rewards/rejected": -0.9341385364532471, "step": 489 }, { "epoch": 0.08166666666666667, "grad_norm": 86.74808502197266, "learning_rate": 1.986032022297543e-07, "logits/chosen": 2.4996697902679443, "logits/rejected": 2.6927497386932373, "logps/chosen": -194.26185607910156, "logps/rejected": -211.88453674316406, "loss": 1.1117, "nll_loss": 0.879012942314148, "rewards/accuracies": 1.0, "rewards/chosen": 0.6092727780342102, "rewards/margins": 2.014726400375366, "rewards/rejected": -1.4054535627365112, "step": 490 }, { "epoch": 0.08183333333333333, "grad_norm": 62.59606170654297, "learning_rate": 1.9859419730417987e-07, "logits/chosen": 2.675943613052368, "logits/rejected": 2.7484683990478516, "logps/chosen": -49.34668731689453, "logps/rejected": -23.12220001220703, "loss": 1.0067, "nll_loss": 0.7256865501403809, "rewards/accuracies": 1.0, "rewards/chosen": 1.4927445650100708, "rewards/margins": 1.748125433921814, "rewards/rejected": -0.2553808391094208, "step": 491 }, { "epoch": 0.082, "grad_norm": 117.70201110839844, "learning_rate": 1.9858516365062333e-07, "logits/chosen": 2.8624894618988037, "logits/rejected": 3.0045695304870605, "logps/chosen": -65.86125946044922, "logps/rejected": -83.66790771484375, "loss": 1.4061, "nll_loss": 0.9022090435028076, "rewards/accuracies": 1.0, "rewards/chosen": 1.254109263420105, "rewards/margins": 0.8779686689376831, "rewards/rejected": 0.3761405944824219, "step": 492 }, { "epoch": 0.08216666666666667, "grad_norm": 86.0505142211914, "learning_rate": 1.985761012717169e-07, "logits/chosen": 2.750565528869629, "logits/rejected": 3.1790318489074707, "logps/chosen": -64.03753662109375, "logps/rejected": -264.91583251953125, "loss": 0.9977, "nll_loss": 0.7360637187957764, "rewards/accuracies": 1.0, "rewards/chosen": 0.47370225191116333, "rewards/margins": 1.8449654579162598, "rewards/rejected": -1.3712631464004517, "step": 493 }, { "epoch": 0.08233333333333333, "grad_norm": 115.99758911132812, "learning_rate": 1.9856701017010115e-07, "logits/chosen": 2.6273629665374756, "logits/rejected": 2.5494515895843506, "logps/chosen": -212.09103393554688, "logps/rejected": -95.19606018066406, "loss": 1.1222, "nll_loss": 0.7467994093894958, "rewards/accuracies": 1.0, "rewards/chosen": 0.9026428461074829, "rewards/margins": 1.2492973804473877, "rewards/rejected": -0.3466545045375824, "step": 494 }, { "epoch": 0.0825, "grad_norm": 95.90945434570312, "learning_rate": 1.9855789034842504e-07, "logits/chosen": 3.2425389289855957, "logits/rejected": 3.3140060901641846, "logps/chosen": -55.60527801513672, "logps/rejected": -52.14281463623047, "loss": 1.1997, "nll_loss": 0.5505472421646118, "rewards/accuracies": 1.0, "rewards/chosen": 1.3358726501464844, "rewards/margins": 0.523912787437439, "rewards/rejected": 0.8119598627090454, "step": 495 }, { "epoch": 0.08266666666666667, "grad_norm": 110.08880615234375, "learning_rate": 1.9854874180934578e-07, "logits/chosen": 3.0928843021392822, "logits/rejected": 3.0134270191192627, "logps/chosen": -17.71768569946289, "logps/rejected": -42.47433853149414, "loss": 1.2408, "nll_loss": 0.4788563847541809, "rewards/accuracies": 1.0, "rewards/chosen": 0.7705860733985901, "rewards/margins": 0.18489080667495728, "rewards/rejected": 0.5856952667236328, "step": 496 }, { "epoch": 0.08283333333333333, "grad_norm": 84.64491271972656, "learning_rate": 1.9853956455552908e-07, "logits/chosen": 2.139543056488037, "logits/rejected": 1.6182490587234497, "logps/chosen": -75.1727066040039, "logps/rejected": -39.9799919128418, "loss": 1.2913, "nll_loss": 0.9396589398384094, "rewards/accuracies": 1.0, "rewards/chosen": 0.41452866792678833, "rewards/margins": 1.3587892055511475, "rewards/rejected": -0.9442605972290039, "step": 497 }, { "epoch": 0.083, "grad_norm": 366.7554016113281, "learning_rate": 1.9853035858964904e-07, "logits/chosen": 1.9566097259521484, "logits/rejected": 1.715766429901123, "logps/chosen": -284.427734375, "logps/rejected": -247.16969299316406, "loss": 2.1255, "nll_loss": 1.0733120441436768, "rewards/accuracies": 0.0, "rewards/chosen": -0.9359710812568665, "rewards/margins": -0.3486419916152954, "rewards/rejected": -0.587329089641571, "step": 498 }, { "epoch": 0.08316666666666667, "grad_norm": 82.56185150146484, "learning_rate": 1.9852112391438795e-07, "logits/chosen": 3.078974723815918, "logits/rejected": 3.1259617805480957, "logps/chosen": -78.742919921875, "logps/rejected": -16.006595611572266, "loss": 1.2224, "nll_loss": 0.9721350073814392, "rewards/accuracies": 1.0, "rewards/chosen": 0.745607852935791, "rewards/margins": 1.8655110597610474, "rewards/rejected": -1.1199032068252563, "step": 499 }, { "epoch": 0.08333333333333333, "grad_norm": 53.57255554199219, "learning_rate": 1.9851186053243664e-07, "logits/chosen": 2.994640350341797, "logits/rejected": 2.9868216514587402, "logps/chosen": -47.96277618408203, "logps/rejected": -146.2497100830078, "loss": 0.7889, "nll_loss": 0.6481456160545349, "rewards/accuracies": 1.0, "rewards/chosen": 0.9123226404190063, "rewards/margins": 2.858863592147827, "rewards/rejected": -1.9465409517288208, "step": 500 }, { "epoch": 0.0835, "grad_norm": 91.66716766357422, "learning_rate": 1.985025684464942e-07, "logits/chosen": 2.582078695297241, "logits/rejected": 3.093843936920166, "logps/chosen": -47.819358825683594, "logps/rejected": -622.3004760742188, "loss": 1.2743, "nll_loss": 1.0174331665039062, "rewards/accuracies": 1.0, "rewards/chosen": 0.31712037324905396, "rewards/margins": 1.93206787109375, "rewards/rejected": -1.6149475574493408, "step": 501 }, { "epoch": 0.08366666666666667, "grad_norm": 87.9129409790039, "learning_rate": 1.9849324765926813e-07, "logits/chosen": 3.1159555912017822, "logits/rejected": 3.0619847774505615, "logps/chosen": -76.0555191040039, "logps/rejected": -19.642751693725586, "loss": 1.2912, "nll_loss": 0.8642672896385193, "rewards/accuracies": 1.0, "rewards/chosen": 1.057381510734558, "rewards/margins": 1.0814815759658813, "rewards/rejected": -0.024100113660097122, "step": 502 }, { "epoch": 0.08383333333333333, "grad_norm": 45.61384582519531, "learning_rate": 1.984838981734743e-07, "logits/chosen": 2.9344146251678467, "logits/rejected": 3.0230207443237305, "logps/chosen": -95.97235107421875, "logps/rejected": -182.0185089111328, "loss": 1.1012, "nll_loss": 0.9894057512283325, "rewards/accuracies": 1.0, "rewards/chosen": 2.5975069999694824, "rewards/margins": 3.402827739715576, "rewards/rejected": -0.8053207397460938, "step": 503 }, { "epoch": 0.084, "grad_norm": 59.07085418701172, "learning_rate": 1.9847451999183692e-07, "logits/chosen": 1.5310611724853516, "logits/rejected": 1.9748756885528564, "logps/chosen": -105.10317993164062, "logps/rejected": -243.69583129882812, "loss": 1.1124, "nll_loss": 0.9468756318092346, "rewards/accuracies": 1.0, "rewards/chosen": 0.4353317320346832, "rewards/margins": 2.848759651184082, "rewards/rejected": -2.4134278297424316, "step": 504 }, { "epoch": 0.08416666666666667, "grad_norm": 241.64166259765625, "learning_rate": 1.9846511311708855e-07, "logits/chosen": 2.9037158489227295, "logits/rejected": 3.079937219619751, "logps/chosen": -88.439208984375, "logps/rejected": -62.46528625488281, "loss": 3.4157, "nll_loss": 2.4566447734832764, "rewards/accuracies": 0.0, "rewards/chosen": -0.5467567443847656, "rewards/margins": -0.21991080045700073, "rewards/rejected": -0.3268459439277649, "step": 505 }, { "epoch": 0.08433333333333333, "grad_norm": 38.56328201293945, "learning_rate": 1.984556775519701e-07, "logits/chosen": 2.147376537322998, "logits/rejected": 1.6784240007400513, "logps/chosen": -172.6741485595703, "logps/rejected": -85.26042175292969, "loss": 0.7529, "nll_loss": 0.6419112682342529, "rewards/accuracies": 1.0, "rewards/chosen": 1.7539689540863037, "rewards/margins": 3.1252641677856445, "rewards/rejected": -1.3712953329086304, "step": 506 }, { "epoch": 0.0845, "grad_norm": 76.07540130615234, "learning_rate": 1.9844621329923097e-07, "logits/chosen": 3.4982643127441406, "logits/rejected": 3.2937099933624268, "logps/chosen": -83.11691284179688, "logps/rejected": -68.07743835449219, "loss": 1.1744, "nll_loss": 0.8937301635742188, "rewards/accuracies": 1.0, "rewards/chosen": 0.9859657287597656, "rewards/margins": 1.679650902748108, "rewards/rejected": -0.6936851739883423, "step": 507 }, { "epoch": 0.08466666666666667, "grad_norm": 118.36328887939453, "learning_rate": 1.9843672036162876e-07, "logits/chosen": 3.490320920944214, "logits/rejected": 3.5831494331359863, "logps/chosen": -40.30556106567383, "logps/rejected": -10.988733291625977, "loss": 2.6647, "nll_loss": 1.8320708274841309, "rewards/accuracies": 1.0, "rewards/chosen": 0.4522407650947571, "rewards/margins": 0.013427734375, "rewards/rejected": 0.4388130307197571, "step": 508 }, { "epoch": 0.08483333333333333, "grad_norm": 116.2778091430664, "learning_rate": 1.9842719874192943e-07, "logits/chosen": 2.8755974769592285, "logits/rejected": 3.047351360321045, "logps/chosen": -72.07056427001953, "logps/rejected": -135.27877807617188, "loss": 1.3233, "nll_loss": 0.9739265441894531, "rewards/accuracies": 1.0, "rewards/chosen": 1.0353507995605469, "rewards/margins": 1.3656227588653564, "rewards/rejected": -0.33027195930480957, "step": 509 }, { "epoch": 0.085, "grad_norm": 70.8017349243164, "learning_rate": 1.9841764844290743e-07, "logits/chosen": 2.2490944862365723, "logits/rejected": 1.7047075033187866, "logps/chosen": -87.19988250732422, "logps/rejected": -51.193870544433594, "loss": 1.2799, "nll_loss": 1.0139522552490234, "rewards/accuracies": 1.0, "rewards/chosen": 1.4732109308242798, "rewards/margins": 1.8196022510528564, "rewards/rejected": -0.34639132022857666, "step": 510 }, { "epoch": 0.08516666666666667, "grad_norm": 91.31776428222656, "learning_rate": 1.9840806946734542e-07, "logits/chosen": 2.8516879081726074, "logits/rejected": 2.8827707767486572, "logps/chosen": -103.01322174072266, "logps/rejected": -227.89773559570312, "loss": 1.2072, "nll_loss": 0.9197608232498169, "rewards/accuracies": 1.0, "rewards/chosen": 1.267382025718689, "rewards/margins": 1.673052191734314, "rewards/rejected": -0.405670166015625, "step": 511 }, { "epoch": 0.08533333333333333, "grad_norm": 66.12789154052734, "learning_rate": 1.9839846181803454e-07, "logits/chosen": 3.9502575397491455, "logits/rejected": 4.3957061767578125, "logps/chosen": -59.27171325683594, "logps/rejected": -384.0660095214844, "loss": 0.8297, "nll_loss": 0.6735420823097229, "rewards/accuracies": 1.0, "rewards/chosen": 1.6434845924377441, "rewards/margins": 2.602935791015625, "rewards/rejected": -0.9594513177871704, "step": 512 }, { "epoch": 0.0855, "grad_norm": 131.40005493164062, "learning_rate": 1.9838882549777423e-07, "logits/chosen": 2.5055856704711914, "logits/rejected": 2.5986249446868896, "logps/chosen": -50.621299743652344, "logps/rejected": -76.17340087890625, "loss": 1.3774, "nll_loss": 1.1249178647994995, "rewards/accuracies": 1.0, "rewards/chosen": 0.8479938507080078, "rewards/margins": 1.8419818878173828, "rewards/rejected": -0.993988037109375, "step": 513 }, { "epoch": 0.08566666666666667, "grad_norm": 65.04849243164062, "learning_rate": 1.9837916050937221e-07, "logits/chosen": 2.3824074268341064, "logits/rejected": 2.7276158332824707, "logps/chosen": -15.274910926818848, "logps/rejected": -116.6122817993164, "loss": 0.7005, "nll_loss": 0.3818727433681488, "rewards/accuracies": 1.0, "rewards/chosen": 0.6204302906990051, "rewards/margins": 1.4941234588623047, "rewards/rejected": -0.8736931681632996, "step": 514 }, { "epoch": 0.08583333333333333, "grad_norm": 160.03623962402344, "learning_rate": 1.983694668556447e-07, "logits/chosen": 2.697319746017456, "logits/rejected": 2.6095783710479736, "logps/chosen": -101.20221710205078, "logps/rejected": -46.047035217285156, "loss": 1.8626, "nll_loss": 1.1500252485275269, "rewards/accuracies": 1.0, "rewards/chosen": 1.0435470342636108, "rewards/margins": 0.3275828957557678, "rewards/rejected": 0.715964138507843, "step": 515 }, { "epoch": 0.086, "grad_norm": 36.995697021484375, "learning_rate": 1.9835974453941619e-07, "logits/chosen": 1.3177047967910767, "logits/rejected": 1.6995038986206055, "logps/chosen": -100.04898834228516, "logps/rejected": -212.6888885498047, "loss": 1.1235, "nll_loss": 1.0531471967697144, "rewards/accuracies": 1.0, "rewards/chosen": 1.0824699401855469, "rewards/margins": 4.999763011932373, "rewards/rejected": -3.917293071746826, "step": 516 }, { "epoch": 0.08616666666666667, "grad_norm": 52.84809112548828, "learning_rate": 1.9834999356351948e-07, "logits/chosen": 2.641021966934204, "logits/rejected": 2.8658666610717773, "logps/chosen": -32.883026123046875, "logps/rejected": -248.2943115234375, "loss": 0.7369, "nll_loss": 0.5871968269348145, "rewards/accuracies": 1.0, "rewards/chosen": 0.9013328552246094, "rewards/margins": 2.736971378326416, "rewards/rejected": -1.835638403892517, "step": 517 }, { "epoch": 0.08633333333333333, "grad_norm": 163.8793487548828, "learning_rate": 1.9834021393079584e-07, "logits/chosen": 2.2471306324005127, "logits/rejected": 2.2384696006774902, "logps/chosen": -105.65460205078125, "logps/rejected": -104.78650665283203, "loss": 1.876, "nll_loss": 1.3545461893081665, "rewards/accuracies": 1.0, "rewards/chosen": 1.0990524291992188, "rewards/margins": 0.8005836009979248, "rewards/rejected": 0.29846879839897156, "step": 518 }, { "epoch": 0.0865, "grad_norm": 67.7530517578125, "learning_rate": 1.9833040564409475e-07, "logits/chosen": 3.107112407684326, "logits/rejected": 3.074004650115967, "logps/chosen": -34.72883605957031, "logps/rejected": -36.943443298339844, "loss": 1.0651, "nll_loss": 0.7235174775123596, "rewards/accuracies": 1.0, "rewards/chosen": 0.8766723871231079, "rewards/margins": 1.3857018947601318, "rewards/rejected": -0.5090295672416687, "step": 519 }, { "epoch": 0.08666666666666667, "grad_norm": 72.45250701904297, "learning_rate": 1.9832056870627416e-07, "logits/chosen": 2.9558629989624023, "logits/rejected": 2.9934282302856445, "logps/chosen": -48.01634216308594, "logps/rejected": -102.46390533447266, "loss": 1.0076, "nll_loss": 0.8730244636535645, "rewards/accuracies": 1.0, "rewards/chosen": 0.618731677532196, "rewards/margins": 3.230576515197754, "rewards/rejected": -2.611844778060913, "step": 520 }, { "epoch": 0.08683333333333333, "grad_norm": 106.43130493164062, "learning_rate": 1.9831070312020032e-07, "logits/chosen": 2.9432713985443115, "logits/rejected": 2.95149827003479, "logps/chosen": -54.467041015625, "logps/rejected": -72.62947845458984, "loss": 1.0909, "nll_loss": 0.6260579228401184, "rewards/accuracies": 1.0, "rewards/chosen": 0.24163055419921875, "rewards/margins": 0.9292831420898438, "rewards/rejected": -0.687652587890625, "step": 521 }, { "epoch": 0.087, "grad_norm": 76.1220703125, "learning_rate": 1.9830080888874777e-07, "logits/chosen": 3.696434259414673, "logits/rejected": 3.648312568664551, "logps/chosen": -29.30810546875, "logps/rejected": -34.12981414794922, "loss": 0.9467, "nll_loss": 0.574668824672699, "rewards/accuracies": 1.0, "rewards/chosen": 0.8938740491867065, "rewards/margins": 1.262035846710205, "rewards/rejected": -0.36816176772117615, "step": 522 }, { "epoch": 0.08716666666666667, "grad_norm": 45.53923416137695, "learning_rate": 1.982908860147995e-07, "logits/chosen": 1.504003882408142, "logits/rejected": 2.045558452606201, "logps/chosen": -93.9454116821289, "logps/rejected": -292.17266845703125, "loss": 1.0196, "nll_loss": 0.9120914340019226, "rewards/accuracies": 1.0, "rewards/chosen": 1.1780052185058594, "rewards/margins": 3.2851312160491943, "rewards/rejected": -2.107125997543335, "step": 523 }, { "epoch": 0.08733333333333333, "grad_norm": 99.37480926513672, "learning_rate": 1.9828093450124675e-07, "logits/chosen": 2.915926456451416, "logits/rejected": 3.182119131088257, "logps/chosen": -62.13477325439453, "logps/rejected": -410.3134460449219, "loss": 2.2308, "nll_loss": 2.0711591243743896, "rewards/accuracies": 1.0, "rewards/chosen": 0.39911580085754395, "rewards/margins": 2.9951913356781006, "rewards/rejected": -2.5960755348205566, "step": 524 }, { "epoch": 0.0875, "grad_norm": 74.84679412841797, "learning_rate": 1.9827095435098923e-07, "logits/chosen": 2.3100221157073975, "logits/rejected": 2.4704182147979736, "logps/chosen": -29.841596603393555, "logps/rejected": -169.9243621826172, "loss": 1.128, "nll_loss": 0.9325498342514038, "rewards/accuracies": 1.0, "rewards/chosen": 0.7321733832359314, "rewards/margins": 2.291015863418579, "rewards/rejected": -1.5588425397872925, "step": 525 }, { "epoch": 0.08766666666666667, "grad_norm": 167.4678192138672, "learning_rate": 1.9826094556693486e-07, "logits/chosen": 2.402073621749878, "logits/rejected": 2.088289976119995, "logps/chosen": -98.97483825683594, "logps/rejected": -52.597991943359375, "loss": 2.08, "nll_loss": 1.064245581626892, "rewards/accuracies": 0.0, "rewards/chosen": 1.0186675786972046, "rewards/margins": -0.24226462841033936, "rewards/rejected": 1.260932207107544, "step": 526 }, { "epoch": 0.08783333333333333, "grad_norm": 128.24905395507812, "learning_rate": 1.9825090815199999e-07, "logits/chosen": 3.7394819259643555, "logits/rejected": 3.6317059993743896, "logps/chosen": -34.36882781982422, "logps/rejected": -102.36676025390625, "loss": 2.1228, "nll_loss": 1.808885931968689, "rewards/accuracies": 1.0, "rewards/chosen": -0.10445290058851242, "rewards/margins": 1.7273740768432617, "rewards/rejected": -1.83182692527771, "step": 527 }, { "epoch": 0.088, "grad_norm": 154.2330322265625, "learning_rate": 1.9824084210910924e-07, "logits/chosen": 2.569394588470459, "logits/rejected": 2.4473116397857666, "logps/chosen": -245.541748046875, "logps/rejected": -121.46348571777344, "loss": 1.9179, "nll_loss": 1.5156899690628052, "rewards/accuracies": 1.0, "rewards/chosen": 0.09374085068702698, "rewards/margins": 1.1890289783477783, "rewards/rejected": -1.0952881574630737, "step": 528 }, { "epoch": 0.08816666666666667, "grad_norm": 203.33108520507812, "learning_rate": 1.9823074744119564e-07, "logits/chosen": 2.9159867763519287, "logits/rejected": 2.8035836219787598, "logps/chosen": -85.92863464355469, "logps/rejected": -5.651129245758057, "loss": 1.8563, "nll_loss": 1.342634916305542, "rewards/accuracies": 1.0, "rewards/chosen": 0.6772018671035767, "rewards/margins": 0.7733061909675598, "rewards/rejected": -0.09610433876514435, "step": 529 }, { "epoch": 0.08833333333333333, "grad_norm": 86.1764907836914, "learning_rate": 1.9822062415120051e-07, "logits/chosen": 2.493516683578491, "logits/rejected": 2.420422315597534, "logps/chosen": -73.75189208984375, "logps/rejected": -38.568275451660156, "loss": 1.1724, "nll_loss": 0.8994133472442627, "rewards/accuracies": 1.0, "rewards/chosen": 1.20506751537323, "rewards/margins": 1.7382385730743408, "rewards/rejected": -0.5331711173057556, "step": 530 }, { "epoch": 0.0885, "grad_norm": 63.094417572021484, "learning_rate": 1.982104722420736e-07, "logits/chosen": 1.5740963220596313, "logits/rejected": 2.692451238632202, "logps/chosen": -97.13116455078125, "logps/rejected": -230.4757843017578, "loss": 1.3752, "nll_loss": 1.2295081615447998, "rewards/accuracies": 1.0, "rewards/chosen": 0.8540313839912415, "rewards/margins": 2.8132522106170654, "rewards/rejected": -1.9592208862304688, "step": 531 }, { "epoch": 0.08866666666666667, "grad_norm": 98.38809204101562, "learning_rate": 1.9820029171677284e-07, "logits/chosen": 3.1637282371520996, "logits/rejected": 3.2132606506347656, "logps/chosen": -72.983642578125, "logps/rejected": -109.63313293457031, "loss": 1.327, "nll_loss": 0.9997759461402893, "rewards/accuracies": 1.0, "rewards/chosen": 0.6862778067588806, "rewards/margins": 1.4483444690704346, "rewards/rejected": -0.7620667219161987, "step": 532 }, { "epoch": 0.08883333333333333, "grad_norm": 35.914798736572266, "learning_rate": 1.9819008257826466e-07, "logits/chosen": 2.5422306060791016, "logits/rejected": 2.832447052001953, "logps/chosen": -92.53648376464844, "logps/rejected": -456.78497314453125, "loss": 1.0317, "nll_loss": 0.9740683436393738, "rewards/accuracies": 1.0, "rewards/chosen": 1.664923906326294, "rewards/margins": 4.418763160705566, "rewards/rejected": -2.7538392543792725, "step": 533 }, { "epoch": 0.089, "grad_norm": 85.92554473876953, "learning_rate": 1.9817984482952375e-07, "logits/chosen": 2.3523786067962646, "logits/rejected": 2.245728015899658, "logps/chosen": -57.840576171875, "logps/rejected": -47.56568145751953, "loss": 1.2836, "nll_loss": 0.8763723969459534, "rewards/accuracies": 1.0, "rewards/chosen": 1.0716171264648438, "rewards/margins": 1.152099609375, "rewards/rejected": -0.08048248291015625, "step": 534 }, { "epoch": 0.08916666666666667, "grad_norm": 63.45906448364258, "learning_rate": 1.981695784735331e-07, "logits/chosen": 3.6731555461883545, "logits/rejected": 3.7939813137054443, "logps/chosen": -28.50271987915039, "logps/rejected": -168.0089874267578, "loss": 0.7668, "nll_loss": 0.6064409017562866, "rewards/accuracies": 1.0, "rewards/chosen": 1.7283226251602173, "rewards/margins": 2.5836663246154785, "rewards/rejected": -0.8553436398506165, "step": 535 }, { "epoch": 0.08933333333333333, "grad_norm": 92.24909973144531, "learning_rate": 1.9815928351328411e-07, "logits/chosen": 3.30771541595459, "logits/rejected": 3.113111972808838, "logps/chosen": -55.925254821777344, "logps/rejected": -46.474517822265625, "loss": 1.2136, "nll_loss": 0.6283736824989319, "rewards/accuracies": 1.0, "rewards/chosen": 0.8727874755859375, "rewards/margins": 0.5991714000701904, "rewards/rejected": 0.2736160457134247, "step": 536 }, { "epoch": 0.0895, "grad_norm": 68.26935577392578, "learning_rate": 1.981489599517765e-07, "logits/chosen": 1.8038142919540405, "logits/rejected": 1.4934486150741577, "logps/chosen": -45.190032958984375, "logps/rejected": -39.083595275878906, "loss": 0.9784, "nll_loss": 0.6549279689788818, "rewards/accuracies": 1.0, "rewards/chosen": 0.8596439361572266, "rewards/margins": 1.4654362201690674, "rewards/rejected": -0.605792224407196, "step": 537 }, { "epoch": 0.08966666666666667, "grad_norm": 131.12506103515625, "learning_rate": 1.9813860779201832e-07, "logits/chosen": 2.7630558013916016, "logits/rejected": 2.8869833946228027, "logps/chosen": -81.39436340332031, "logps/rejected": -106.39321899414062, "loss": 1.5401, "nll_loss": 1.08525812625885, "rewards/accuracies": 1.0, "rewards/chosen": 0.7796837091445923, "rewards/margins": 0.9603943228721619, "rewards/rejected": -0.18071061372756958, "step": 538 }, { "epoch": 0.08983333333333333, "grad_norm": 68.62084197998047, "learning_rate": 1.9812822703702586e-07, "logits/chosen": 2.9506914615631104, "logits/rejected": 2.9714722633361816, "logps/chosen": -68.17652130126953, "logps/rejected": -184.3612060546875, "loss": 1.2979, "nll_loss": 1.065258264541626, "rewards/accuracies": 1.0, "rewards/chosen": 0.5871574282646179, "rewards/margins": 2.0201973915100098, "rewards/rejected": -1.433039903640747, "step": 539 }, { "epoch": 0.09, "grad_norm": 57.03388214111328, "learning_rate": 1.981178176898239e-07, "logits/chosen": 2.468904972076416, "logits/rejected": 2.4568653106689453, "logps/chosen": -66.958251953125, "logps/rejected": -75.26042175292969, "loss": 1.2254, "nll_loss": 1.030126929283142, "rewards/accuracies": 1.0, "rewards/chosen": 0.9955665469169617, "rewards/margins": 2.237581729888916, "rewards/rejected": -1.2420151233673096, "step": 540 }, { "epoch": 0.09016666666666667, "grad_norm": 27.19868278503418, "learning_rate": 1.9810737975344543e-07, "logits/chosen": 1.0699636936187744, "logits/rejected": 2.294290065765381, "logps/chosen": -62.04894256591797, "logps/rejected": -295.51690673828125, "loss": 0.7141, "nll_loss": 0.6600952744483948, "rewards/accuracies": 1.0, "rewards/chosen": 2.6688380241394043, "rewards/margins": 4.424160480499268, "rewards/rejected": -1.7553223371505737, "step": 541 }, { "epoch": 0.09033333333333333, "grad_norm": 170.23416137695312, "learning_rate": 1.9809691323093184e-07, "logits/chosen": 2.8700547218322754, "logits/rejected": 3.099970579147339, "logps/chosen": -48.763221740722656, "logps/rejected": -81.18647766113281, "loss": 4.5009, "nll_loss": 4.063601970672607, "rewards/accuracies": 1.0, "rewards/chosen": -0.6700271964073181, "rewards/margins": 1.3186612129211426, "rewards/rejected": -1.9886884689331055, "step": 542 }, { "epoch": 0.0905, "grad_norm": 55.98674011230469, "learning_rate": 1.9808641812533282e-07, "logits/chosen": 3.6891028881073, "logits/rejected": 3.605743885040283, "logps/chosen": -51.189170837402344, "logps/rejected": -123.61236572265625, "loss": 0.7782, "nll_loss": 0.5816951394081116, "rewards/accuracies": 1.0, "rewards/chosen": 1.364973545074463, "rewards/margins": 2.2311019897460938, "rewards/rejected": -0.8661285638809204, "step": 543 }, { "epoch": 0.09066666666666667, "grad_norm": 31.986486434936523, "learning_rate": 1.9807589443970642e-07, "logits/chosen": 2.9920170307159424, "logits/rejected": 2.9736807346343994, "logps/chosen": -233.93173217773438, "logps/rejected": -195.0313720703125, "loss": 0.9939, "nll_loss": 0.9319989681243896, "rewards/accuracies": 1.0, "rewards/chosen": 1.6056365966796875, "rewards/margins": 4.280940532684326, "rewards/rejected": -2.6753039360046387, "step": 544 }, { "epoch": 0.09083333333333334, "grad_norm": 83.63236236572266, "learning_rate": 1.9806534217711892e-07, "logits/chosen": 2.5214099884033203, "logits/rejected": 2.702329397201538, "logps/chosen": -36.173988342285156, "logps/rejected": -133.9276123046875, "loss": 1.0041, "nll_loss": 0.7696592211723328, "rewards/accuracies": 1.0, "rewards/chosen": 1.0720462799072266, "rewards/margins": 1.9505481719970703, "rewards/rejected": -0.8785018920898438, "step": 545 }, { "epoch": 0.091, "grad_norm": 88.85358428955078, "learning_rate": 1.9805476134064506e-07, "logits/chosen": 3.195573091506958, "logits/rejected": 3.1281087398529053, "logps/chosen": -104.91864776611328, "logps/rejected": -40.87842559814453, "loss": 1.3612, "nll_loss": 0.9714690446853638, "rewards/accuracies": 1.0, "rewards/chosen": 1.4680137634277344, "rewards/margins": 1.2903305292129517, "rewards/rejected": 0.1776832640171051, "step": 546 }, { "epoch": 0.09116666666666666, "grad_norm": 161.5996551513672, "learning_rate": 1.9804415193336777e-07, "logits/chosen": 3.153449296951294, "logits/rejected": 3.221519947052002, "logps/chosen": -89.71845245361328, "logps/rejected": -126.40878295898438, "loss": 1.8513, "nll_loss": 1.2816919088363647, "rewards/accuracies": 1.0, "rewards/chosen": 1.6291581392288208, "rewards/margins": 0.7896759510040283, "rewards/rejected": 0.8394821882247925, "step": 547 }, { "epoch": 0.09133333333333334, "grad_norm": 37.02498245239258, "learning_rate": 1.9803351395837846e-07, "logits/chosen": 3.0337328910827637, "logits/rejected": 3.17802357673645, "logps/chosen": -14.73823070526123, "logps/rejected": -87.1783218383789, "loss": 0.4528, "nll_loss": 0.32751625776290894, "rewards/accuracies": 1.0, "rewards/chosen": 1.8731437921524048, "rewards/margins": 2.968240261077881, "rewards/rejected": -1.0950965881347656, "step": 548 }, { "epoch": 0.0915, "grad_norm": 51.18918991088867, "learning_rate": 1.9802284741877672e-07, "logits/chosen": 2.637768507003784, "logits/rejected": 2.7748360633850098, "logps/chosen": -92.62995910644531, "logps/rejected": -233.44500732421875, "loss": 1.3584, "nll_loss": 1.2350660562515259, "rewards/accuracies": 1.0, "rewards/chosen": 0.5491325259208679, "rewards/margins": 3.6648125648498535, "rewards/rejected": -3.115679979324341, "step": 549 }, { "epoch": 0.09166666666666666, "grad_norm": 101.3015365600586, "learning_rate": 1.9801215231767056e-07, "logits/chosen": 2.2823736667633057, "logits/rejected": 2.4312572479248047, "logps/chosen": -238.2293243408203, "logps/rejected": -310.04656982421875, "loss": 1.4491, "nll_loss": 1.1508662700653076, "rewards/accuracies": 1.0, "rewards/chosen": -0.180003359913826, "rewards/margins": 1.8874099254608154, "rewards/rejected": -2.067413330078125, "step": 550 }, { "epoch": 0.09183333333333334, "grad_norm": 228.7339324951172, "learning_rate": 1.9800142865817625e-07, "logits/chosen": 4.363297939300537, "logits/rejected": 4.302156925201416, "logps/chosen": -80.75334167480469, "logps/rejected": -47.791500091552734, "loss": 3.1997, "nll_loss": 2.447071075439453, "rewards/accuracies": 1.0, "rewards/chosen": -0.10782089084386826, "rewards/margins": 0.1705196499824524, "rewards/rejected": -0.27834054827690125, "step": 551 }, { "epoch": 0.092, "grad_norm": 89.39916229248047, "learning_rate": 1.9799067644341842e-07, "logits/chosen": 1.8592290878295898, "logits/rejected": 2.37752628326416, "logps/chosen": -67.00482177734375, "logps/rejected": -243.744384765625, "loss": 0.9828, "nll_loss": 0.6907714605331421, "rewards/accuracies": 1.0, "rewards/chosen": 0.545642077922821, "rewards/margins": 1.6409485340118408, "rewards/rejected": -1.095306396484375, "step": 552 }, { "epoch": 0.09216666666666666, "grad_norm": 148.4570770263672, "learning_rate": 1.9797989567653003e-07, "logits/chosen": 1.6218175888061523, "logits/rejected": 1.448315978050232, "logps/chosen": -83.00779724121094, "logps/rejected": -39.49325942993164, "loss": 1.8367, "nll_loss": 1.0780231952667236, "rewards/accuracies": 1.0, "rewards/chosen": 0.5461395382881165, "rewards/margins": 0.16973382234573364, "rewards/rejected": 0.3764057159423828, "step": 553 }, { "epoch": 0.09233333333333334, "grad_norm": 123.9259033203125, "learning_rate": 1.979690863606523e-07, "logits/chosen": 2.3797013759613037, "logits/rejected": 2.385244369506836, "logps/chosen": -34.82954788208008, "logps/rejected": -19.734935760498047, "loss": 1.6749, "nll_loss": 1.2439125776290894, "rewards/accuracies": 1.0, "rewards/chosen": 0.6882225871086121, "rewards/margins": 1.0346134901046753, "rewards/rejected": -0.3463909327983856, "step": 554 }, { "epoch": 0.0925, "grad_norm": 89.77994537353516, "learning_rate": 1.9795824849893478e-07, "logits/chosen": 2.6698312759399414, "logits/rejected": 2.8592958450317383, "logps/chosen": -75.99376678466797, "logps/rejected": -231.7147216796875, "loss": 0.9186, "nll_loss": 0.6495193243026733, "rewards/accuracies": 1.0, "rewards/chosen": 0.6183220148086548, "rewards/margins": 1.7639350891113281, "rewards/rejected": -1.1456130743026733, "step": 555 }, { "epoch": 0.09266666666666666, "grad_norm": 69.97000885009766, "learning_rate": 1.9794738209453544e-07, "logits/chosen": 2.989119529724121, "logits/rejected": 3.08648943901062, "logps/chosen": -88.053466796875, "logps/rejected": -84.8667984008789, "loss": 1.2204, "nll_loss": 0.9468116164207458, "rewards/accuracies": 1.0, "rewards/chosen": 1.2952325344085693, "rewards/margins": 1.7487452030181885, "rewards/rejected": -0.45351260900497437, "step": 556 }, { "epoch": 0.09283333333333334, "grad_norm": 96.9338150024414, "learning_rate": 1.9793648715062043e-07, "logits/chosen": 3.03145694732666, "logits/rejected": 3.306777238845825, "logps/chosen": -62.698970794677734, "logps/rejected": -142.04855346679688, "loss": 2.6181, "nll_loss": 2.4114990234375, "rewards/accuracies": 1.0, "rewards/chosen": 0.14196434617042542, "rewards/margins": 2.5394468307495117, "rewards/rejected": -2.397482395172119, "step": 557 }, { "epoch": 0.093, "grad_norm": 42.820003509521484, "learning_rate": 1.979255636703643e-07, "logits/chosen": 3.2206573486328125, "logits/rejected": 3.3282949924468994, "logps/chosen": -39.066864013671875, "logps/rejected": -288.3263244628906, "loss": 0.9965, "nll_loss": 0.9301634430885315, "rewards/accuracies": 1.0, "rewards/chosen": 1.3763068914413452, "rewards/margins": 4.367557525634766, "rewards/rejected": -2.99125075340271, "step": 558 }, { "epoch": 0.09316666666666666, "grad_norm": 131.709716796875, "learning_rate": 1.979146116569499e-07, "logits/chosen": 2.814453125, "logits/rejected": 2.6305601596832275, "logps/chosen": -77.42059326171875, "logps/rejected": -40.698482513427734, "loss": 2.3656, "nll_loss": 1.580012321472168, "rewards/accuracies": 1.0, "rewards/chosen": 0.8319298028945923, "rewards/margins": 0.14695054292678833, "rewards/rejected": 0.684979259967804, "step": 559 }, { "epoch": 0.09333333333333334, "grad_norm": 75.51110076904297, "learning_rate": 1.9790363111356836e-07, "logits/chosen": 2.3237409591674805, "logits/rejected": 2.515554904937744, "logps/chosen": -141.61610412597656, "logps/rejected": -288.06219482421875, "loss": 1.4012, "nll_loss": 1.1703810691833496, "rewards/accuracies": 1.0, "rewards/chosen": -0.2651931941509247, "rewards/margins": 2.743708848953247, "rewards/rejected": -3.008902072906494, "step": 560 }, { "epoch": 0.0935, "grad_norm": 124.04908752441406, "learning_rate": 1.9789262204341914e-07, "logits/chosen": 1.2947051525115967, "logits/rejected": 3.072399854660034, "logps/chosen": -65.25656127929688, "logps/rejected": -445.59326171875, "loss": 2.2248, "nll_loss": 1.8644733428955078, "rewards/accuracies": 1.0, "rewards/chosen": 0.21584777534008026, "rewards/margins": 1.3491668701171875, "rewards/rejected": -1.1333191394805908, "step": 561 }, { "epoch": 0.09366666666666666, "grad_norm": 40.37671661376953, "learning_rate": 1.9788158444971007e-07, "logits/chosen": 3.0534799098968506, "logits/rejected": 3.144803762435913, "logps/chosen": -104.0009994506836, "logps/rejected": -273.2464599609375, "loss": 1.046, "nll_loss": 0.9454637169837952, "rewards/accuracies": 1.0, "rewards/chosen": 1.0125938653945923, "rewards/margins": 3.5748391151428223, "rewards/rejected": -2.5622451305389404, "step": 562 }, { "epoch": 0.09383333333333334, "grad_norm": 42.929386138916016, "learning_rate": 1.9787051833565714e-07, "logits/chosen": 3.9300286769866943, "logits/rejected": 3.9830734729766846, "logps/chosen": -66.85301208496094, "logps/rejected": -122.14541625976562, "loss": 1.0164, "nll_loss": 0.9034190773963928, "rewards/accuracies": 1.0, "rewards/chosen": 1.7219613790512085, "rewards/margins": 3.096348762512207, "rewards/rejected": -1.374387502670288, "step": 563 }, { "epoch": 0.094, "grad_norm": 45.13384246826172, "learning_rate": 1.9785942370448487e-07, "logits/chosen": 3.046375274658203, "logits/rejected": 2.9052746295928955, "logps/chosen": -76.5621337890625, "logps/rejected": -37.541114807128906, "loss": 1.1758, "nll_loss": 1.063362956047058, "rewards/accuracies": 1.0, "rewards/chosen": 1.060821533203125, "rewards/margins": 3.2488720417022705, "rewards/rejected": -2.1880505084991455, "step": 564 }, { "epoch": 0.09416666666666666, "grad_norm": 51.54033279418945, "learning_rate": 1.978483005594259e-07, "logits/chosen": 2.188575506210327, "logits/rejected": 2.2380568981170654, "logps/chosen": -53.20161819458008, "logps/rejected": -19.030977249145508, "loss": 0.9672, "nll_loss": 0.718940794467926, "rewards/accuracies": 1.0, "rewards/chosen": 1.3106693029403687, "rewards/margins": 1.8897864818572998, "rewards/rejected": -0.5791171193122864, "step": 565 }, { "epoch": 0.09433333333333334, "grad_norm": 37.33366775512695, "learning_rate": 1.9783714890372124e-07, "logits/chosen": 2.680741548538208, "logits/rejected": 2.711505174636841, "logps/chosen": -54.25426483154297, "logps/rejected": -181.06661987304688, "loss": 0.7931, "nll_loss": 0.6781783103942871, "rewards/accuracies": 1.0, "rewards/chosen": 0.8393257260322571, "rewards/margins": 3.396505117416382, "rewards/rejected": -2.5571794509887695, "step": 566 }, { "epoch": 0.0945, "grad_norm": 39.783878326416016, "learning_rate": 1.9782596874062025e-07, "logits/chosen": 1.8663017749786377, "logits/rejected": 2.722533702850342, "logps/chosen": -22.56342887878418, "logps/rejected": -206.73016357421875, "loss": 0.5598, "nll_loss": 0.470071405172348, "rewards/accuracies": 1.0, "rewards/chosen": 1.5412867069244385, "rewards/margins": 3.4928858280181885, "rewards/rejected": -1.95159912109375, "step": 567 }, { "epoch": 0.09466666666666666, "grad_norm": 140.62161254882812, "learning_rate": 1.9781476007338056e-07, "logits/chosen": 2.266312837600708, "logits/rejected": 2.1286938190460205, "logps/chosen": -28.03472900390625, "logps/rejected": -14.248624801635742, "loss": 2.7669, "nll_loss": 2.156517744064331, "rewards/accuracies": 1.0, "rewards/chosen": -0.017602158710360527, "rewards/margins": 0.5102550983428955, "rewards/rejected": -0.52785724401474, "step": 568 }, { "epoch": 0.09483333333333334, "grad_norm": 53.56691360473633, "learning_rate": 1.9780352290526808e-07, "logits/chosen": 3.140866279602051, "logits/rejected": 3.1623597145080566, "logps/chosen": -63.538204193115234, "logps/rejected": -138.07505798339844, "loss": 1.0089, "nll_loss": 0.8360289335250854, "rewards/accuracies": 1.0, "rewards/chosen": 1.4075489044189453, "rewards/margins": 2.4248876571655273, "rewards/rejected": -1.0173386335372925, "step": 569 }, { "epoch": 0.095, "grad_norm": 66.67207336425781, "learning_rate": 1.9779225723955707e-07, "logits/chosen": 2.478698253631592, "logits/rejected": 2.5539934635162354, "logps/chosen": -15.352152824401855, "logps/rejected": -48.268707275390625, "loss": 0.8731, "nll_loss": 0.5685982704162598, "rewards/accuracies": 1.0, "rewards/chosen": 0.5649155974388123, "rewards/margins": 1.569007158279419, "rewards/rejected": -1.0040916204452515, "step": 570 }, { "epoch": 0.09516666666666666, "grad_norm": 57.7244873046875, "learning_rate": 1.9778096307953006e-07, "logits/chosen": 2.8515126705169678, "logits/rejected": 3.138058662414551, "logps/chosen": -24.900766372680664, "logps/rejected": -533.3846435546875, "loss": 0.805, "nll_loss": 0.6916879415512085, "rewards/accuracies": 1.0, "rewards/chosen": 1.1881582736968994, "rewards/margins": 3.160546064376831, "rewards/rejected": -1.9723877906799316, "step": 571 }, { "epoch": 0.09533333333333334, "grad_norm": 87.62401580810547, "learning_rate": 1.9776964042847788e-07, "logits/chosen": 2.8304848670959473, "logits/rejected": 3.3029518127441406, "logps/chosen": -50.937889099121094, "logps/rejected": -41.35448455810547, "loss": 0.9741, "nll_loss": 0.6977792978286743, "rewards/accuracies": 1.0, "rewards/chosen": 0.9671459197998047, "rewards/margins": 1.7026432752609253, "rewards/rejected": -0.7354973554611206, "step": 572 }, { "epoch": 0.0955, "grad_norm": 72.87964630126953, "learning_rate": 1.9775828928969973e-07, "logits/chosen": 2.541417360305786, "logits/rejected": 2.897764205932617, "logps/chosen": -60.57882308959961, "logps/rejected": -44.18194580078125, "loss": 1.1873, "nll_loss": 0.9930953979492188, "rewards/accuracies": 1.0, "rewards/chosen": 1.262412667274475, "rewards/margins": 2.240386486053467, "rewards/rejected": -0.9779739379882812, "step": 573 }, { "epoch": 0.09566666666666666, "grad_norm": 57.24178695678711, "learning_rate": 1.9774690966650303e-07, "logits/chosen": 2.0901784896850586, "logits/rejected": 2.6292054653167725, "logps/chosen": -80.6123046875, "logps/rejected": -103.22735595703125, "loss": 1.2249, "nll_loss": 1.0334908962249756, "rewards/accuracies": 1.0, "rewards/chosen": 0.2918197810649872, "rewards/margins": 2.5835113525390625, "rewards/rejected": -2.291691541671753, "step": 574 }, { "epoch": 0.09583333333333334, "grad_norm": 44.38501739501953, "learning_rate": 1.9773550156220354e-07, "logits/chosen": 2.5754172801971436, "logits/rejected": 2.587902545928955, "logps/chosen": -54.34849166870117, "logps/rejected": -47.45262145996094, "loss": 0.8915, "nll_loss": 0.7058247327804565, "rewards/accuracies": 1.0, "rewards/chosen": 1.247165322303772, "rewards/margins": 2.3074753284454346, "rewards/rejected": -1.0603100061416626, "step": 575 }, { "epoch": 0.096, "grad_norm": 55.9158935546875, "learning_rate": 1.9772406498012527e-07, "logits/chosen": 1.660274624824524, "logits/rejected": 2.745544195175171, "logps/chosen": -24.393653869628906, "logps/rejected": -225.45018005371094, "loss": 0.901, "nll_loss": 0.6969614624977112, "rewards/accuracies": 1.0, "rewards/chosen": 0.705760657787323, "rewards/margins": 2.215691089630127, "rewards/rejected": -1.5099304914474487, "step": 576 }, { "epoch": 0.09616666666666666, "grad_norm": 137.3025360107422, "learning_rate": 1.9771259992360064e-07, "logits/chosen": 1.9805152416229248, "logits/rejected": 1.9995348453521729, "logps/chosen": -119.97492980957031, "logps/rejected": -122.74493408203125, "loss": 1.6902, "nll_loss": 1.2497388124465942, "rewards/accuracies": 1.0, "rewards/chosen": 0.6828567385673523, "rewards/margins": 1.0026085376739502, "rewards/rejected": -0.3197517693042755, "step": 577 }, { "epoch": 0.09633333333333334, "grad_norm": 88.05399322509766, "learning_rate": 1.9770110639597024e-07, "logits/chosen": 1.4732247591018677, "logits/rejected": 1.120802402496338, "logps/chosen": -108.59101867675781, "logps/rejected": -59.7728157043457, "loss": 1.2738, "nll_loss": 0.9871910810470581, "rewards/accuracies": 1.0, "rewards/chosen": 0.7690322995185852, "rewards/margins": 1.6481938362121582, "rewards/rejected": -0.8791614770889282, "step": 578 }, { "epoch": 0.0965, "grad_norm": 32.13365936279297, "learning_rate": 1.97689584400583e-07, "logits/chosen": 1.884521245956421, "logits/rejected": 2.578847646713257, "logps/chosen": -42.54056167602539, "logps/rejected": -151.92320251464844, "loss": 0.6201, "nll_loss": 0.5524747967720032, "rewards/accuracies": 1.0, "rewards/chosen": 1.5992481708526611, "rewards/margins": 4.056480884552002, "rewards/rejected": -2.457232713699341, "step": 579 }, { "epoch": 0.09666666666666666, "grad_norm": 60.66727066040039, "learning_rate": 1.9767803394079615e-07, "logits/chosen": 3.1422231197357178, "logits/rejected": 3.1530818939208984, "logps/chosen": -51.31452178955078, "logps/rejected": -110.40324401855469, "loss": 0.9299, "nll_loss": 0.712701678276062, "rewards/accuracies": 1.0, "rewards/chosen": 1.3784828186035156, "rewards/margins": 2.090883731842041, "rewards/rejected": -0.7124009132385254, "step": 580 }, { "epoch": 0.09683333333333333, "grad_norm": 36.550941467285156, "learning_rate": 1.976664550199753e-07, "logits/chosen": 3.0405468940734863, "logits/rejected": 3.4501283168792725, "logps/chosen": -62.62760925292969, "logps/rejected": -251.86468505859375, "loss": 0.8689, "nll_loss": 0.7927544713020325, "rewards/accuracies": 1.0, "rewards/chosen": 1.6747758388519287, "rewards/margins": 3.769505500793457, "rewards/rejected": -2.0947296619415283, "step": 581 }, { "epoch": 0.097, "grad_norm": 109.13319396972656, "learning_rate": 1.976548476414941e-07, "logits/chosen": 2.8100969791412354, "logits/rejected": 2.866755723953247, "logps/chosen": -64.77899932861328, "logps/rejected": -89.79733276367188, "loss": 1.6436, "nll_loss": 1.2955800294876099, "rewards/accuracies": 1.0, "rewards/chosen": 0.6752815246582031, "rewards/margins": 1.3544456958770752, "rewards/rejected": -0.6791641712188721, "step": 582 }, { "epoch": 0.09716666666666667, "grad_norm": 47.695247650146484, "learning_rate": 1.9764321180873484e-07, "logits/chosen": 1.3498647212982178, "logits/rejected": 2.3236544132232666, "logps/chosen": -75.58629608154297, "logps/rejected": -242.82733154296875, "loss": 1.1289, "nll_loss": 0.9945564270019531, "rewards/accuracies": 1.0, "rewards/chosen": 0.8812103271484375, "rewards/margins": 2.9626739025115967, "rewards/rejected": -2.081463575363159, "step": 583 }, { "epoch": 0.09733333333333333, "grad_norm": 59.34674072265625, "learning_rate": 1.9763154752508785e-07, "logits/chosen": 2.237910032272339, "logits/rejected": 1.9324716329574585, "logps/chosen": -192.11587524414062, "logps/rejected": -111.25411987304688, "loss": 1.2134, "nll_loss": 1.0441080331802368, "rewards/accuracies": 1.0, "rewards/chosen": 0.5784500241279602, "rewards/margins": 2.644740343093872, "rewards/rejected": -2.0662903785705566, "step": 584 }, { "epoch": 0.0975, "grad_norm": 76.45232391357422, "learning_rate": 1.9761985479395177e-07, "logits/chosen": 2.885737657546997, "logits/rejected": 2.8720993995666504, "logps/chosen": -74.40553283691406, "logps/rejected": -122.44004821777344, "loss": 1.0257, "nll_loss": 0.7592401504516602, "rewards/accuracies": 1.0, "rewards/chosen": 1.405428409576416, "rewards/margins": 1.8082101345062256, "rewards/rejected": -0.4027816951274872, "step": 585 }, { "epoch": 0.09766666666666667, "grad_norm": 75.75363159179688, "learning_rate": 1.9760813361873368e-07, "logits/chosen": 2.63240909576416, "logits/rejected": 2.819828748703003, "logps/chosen": -169.8399658203125, "logps/rejected": -193.4849395751953, "loss": 1.6472, "nll_loss": 1.4898242950439453, "rewards/accuracies": 1.0, "rewards/chosen": 0.8019668459892273, "rewards/margins": 2.6710052490234375, "rewards/rejected": -1.869038462638855, "step": 586 }, { "epoch": 0.09783333333333333, "grad_norm": 46.85858917236328, "learning_rate": 1.975963840028488e-07, "logits/chosen": 2.50242018699646, "logits/rejected": 2.4910998344421387, "logps/chosen": -68.82202911376953, "logps/rejected": -69.0047607421875, "loss": 1.0417, "nll_loss": 0.8937926888465881, "rewards/accuracies": 1.0, "rewards/chosen": 1.585767388343811, "rewards/margins": 2.677102565765381, "rewards/rejected": -1.0913352966308594, "step": 587 }, { "epoch": 0.098, "grad_norm": 111.95850372314453, "learning_rate": 1.9758460594972066e-07, "logits/chosen": 1.8209782838821411, "logits/rejected": 1.7131673097610474, "logps/chosen": -121.26087188720703, "logps/rejected": -195.2490997314453, "loss": 1.3936, "nll_loss": 1.0189989805221558, "rewards/accuracies": 1.0, "rewards/chosen": 1.2099770307540894, "rewards/margins": 1.2952691316604614, "rewards/rejected": -0.08529205620288849, "step": 588 }, { "epoch": 0.09816666666666667, "grad_norm": 226.8843994140625, "learning_rate": 1.9757279946278117e-07, "logits/chosen": 3.0648930072784424, "logits/rejected": 2.9134039878845215, "logps/chosen": -256.05865478515625, "logps/rejected": -82.98637390136719, "loss": 2.0049, "nll_loss": 1.1854567527770996, "rewards/accuracies": 1.0, "rewards/chosen": -1.2565292119979858, "rewards/margins": 0.2147674560546875, "rewards/rejected": -1.4712966680526733, "step": 589 }, { "epoch": 0.09833333333333333, "grad_norm": 45.81262969970703, "learning_rate": 1.9756096454547038e-07, "logits/chosen": 3.1461381912231445, "logits/rejected": 3.104917049407959, "logps/chosen": -86.31925964355469, "logps/rejected": -198.87738037109375, "loss": 0.9315, "nll_loss": 0.8380510807037354, "rewards/accuracies": 1.0, "rewards/chosen": 0.766316294670105, "rewards/margins": 4.365825176239014, "rewards/rejected": -3.599508762359619, "step": 590 }, { "epoch": 0.0985, "grad_norm": 86.12709045410156, "learning_rate": 1.9754910120123672e-07, "logits/chosen": 1.7615458965301514, "logits/rejected": 2.3203001022338867, "logps/chosen": -23.830440521240234, "logps/rejected": -76.39287567138672, "loss": 1.0313, "nll_loss": 0.5957609415054321, "rewards/accuracies": 1.0, "rewards/chosen": 0.45056042075157166, "rewards/margins": 1.0174579620361328, "rewards/rejected": -0.5668975710868835, "step": 591 }, { "epoch": 0.09866666666666667, "grad_norm": 66.08325958251953, "learning_rate": 1.9753720943353692e-07, "logits/chosen": 2.180893659591675, "logits/rejected": 2.7935895919799805, "logps/chosen": -25.02439308166504, "logps/rejected": -89.66486358642578, "loss": 0.7448, "nll_loss": 0.5560975670814514, "rewards/accuracies": 1.0, "rewards/chosen": 1.5095983743667603, "rewards/margins": 2.313288688659668, "rewards/rejected": -0.8036904335021973, "step": 592 }, { "epoch": 0.09883333333333333, "grad_norm": 72.14125061035156, "learning_rate": 1.9752528924583594e-07, "logits/chosen": 1.6489589214324951, "logits/rejected": 1.8734756708145142, "logps/chosen": -97.72418212890625, "logps/rejected": -106.26255798339844, "loss": 1.0916, "nll_loss": 0.8572296500205994, "rewards/accuracies": 1.0, "rewards/chosen": 0.7008072137832642, "rewards/margins": 1.9761933088302612, "rewards/rejected": -1.275386095046997, "step": 593 }, { "epoch": 0.099, "grad_norm": 217.99256896972656, "learning_rate": 1.9751334064160704e-07, "logits/chosen": 2.3519530296325684, "logits/rejected": 2.551687479019165, "logps/chosen": -64.71829986572266, "logps/rejected": -226.46697998046875, "loss": 1.4868, "nll_loss": 0.924547016620636, "rewards/accuracies": 1.0, "rewards/chosen": 0.044539641588926315, "rewards/margins": 0.6380195617675781, "rewards/rejected": -0.5934799313545227, "step": 594 }, { "epoch": 0.09916666666666667, "grad_norm": 82.63080596923828, "learning_rate": 1.9750136362433174e-07, "logits/chosen": 1.4708709716796875, "logits/rejected": 1.4558409452438354, "logps/chosen": -205.27481079101562, "logps/rejected": -277.4117431640625, "loss": 1.2392, "nll_loss": 0.9821761250495911, "rewards/accuracies": 1.0, "rewards/chosen": -0.24101562798023224, "rewards/margins": 2.3131134510040283, "rewards/rejected": -2.554129123687744, "step": 595 }, { "epoch": 0.09933333333333333, "grad_norm": 36.01694869995117, "learning_rate": 1.9748935819749984e-07, "logits/chosen": 2.7673866748809814, "logits/rejected": 3.0589587688446045, "logps/chosen": -60.83509826660156, "logps/rejected": -191.66659545898438, "loss": 1.0313, "nll_loss": 0.9812111854553223, "rewards/accuracies": 1.0, "rewards/chosen": 1.8383477926254272, "rewards/margins": 4.6029229164123535, "rewards/rejected": -2.764575242996216, "step": 596 }, { "epoch": 0.0995, "grad_norm": 36.49534606933594, "learning_rate": 1.9747732436460951e-07, "logits/chosen": 1.4344197511672974, "logits/rejected": 2.039682626724243, "logps/chosen": -69.69383239746094, "logps/rejected": -220.1802520751953, "loss": 0.7576, "nll_loss": 0.6701329946517944, "rewards/accuracies": 1.0, "rewards/chosen": 1.1102126836776733, "rewards/margins": 3.838977336883545, "rewards/rejected": -2.728764533996582, "step": 597 }, { "epoch": 0.09966666666666667, "grad_norm": 115.61087036132812, "learning_rate": 1.9746526212916704e-07, "logits/chosen": 2.865194797515869, "logits/rejected": 3.025463342666626, "logps/chosen": -13.513863563537598, "logps/rejected": -234.184326171875, "loss": 1.4755, "nll_loss": 1.2285330295562744, "rewards/accuracies": 1.0, "rewards/chosen": -0.3183615803718567, "rewards/margins": 2.538646936416626, "rewards/rejected": -2.857008457183838, "step": 598 }, { "epoch": 0.09983333333333333, "grad_norm": 39.77694320678711, "learning_rate": 1.9745317149468707e-07, "logits/chosen": 2.760631799697876, "logits/rejected": 2.7707812786102295, "logps/chosen": -65.29861450195312, "logps/rejected": -92.63037872314453, "loss": 0.9464, "nll_loss": 0.8480339646339417, "rewards/accuracies": 1.0, "rewards/chosen": 1.0610198974609375, "rewards/margins": 3.558096408843994, "rewards/rejected": -2.4970765113830566, "step": 599 }, { "epoch": 0.1, "grad_norm": 52.570518493652344, "learning_rate": 1.974410524646926e-07, "logits/chosen": 2.54494047164917, "logits/rejected": 2.3604393005371094, "logps/chosen": -321.93377685546875, "logps/rejected": -239.06246948242188, "loss": 0.9378, "nll_loss": 0.8233599662780762, "rewards/accuracies": 1.0, "rewards/chosen": 1.0302246809005737, "rewards/margins": 3.2148146629333496, "rewards/rejected": -2.1845901012420654, "step": 600 }, { "epoch": 0.10016666666666667, "grad_norm": 66.10369110107422, "learning_rate": 1.9742890504271475e-07, "logits/chosen": 3.3568146228790283, "logits/rejected": 3.877699375152588, "logps/chosen": -53.02088928222656, "logps/rejected": -80.75869750976562, "loss": 1.228, "nll_loss": 1.0196325778961182, "rewards/accuracies": 1.0, "rewards/chosen": 0.3326553404331207, "rewards/margins": 2.3346517086029053, "rewards/rejected": -2.0019962787628174, "step": 601 }, { "epoch": 0.10033333333333333, "grad_norm": 202.9335479736328, "learning_rate": 1.97416729232293e-07, "logits/chosen": 2.71126127243042, "logits/rejected": 2.6847801208496094, "logps/chosen": -38.66828155517578, "logps/rejected": -28.389698028564453, "loss": 2.9936, "nll_loss": 1.7576491832733154, "rewards/accuracies": 0.0, "rewards/chosen": -0.7776211500167847, "rewards/margins": -0.6851136088371277, "rewards/rejected": -0.09250755608081818, "step": 602 }, { "epoch": 0.1005, "grad_norm": 58.14347457885742, "learning_rate": 1.9740452503697514e-07, "logits/chosen": 2.6041040420532227, "logits/rejected": 2.65421199798584, "logps/chosen": -163.58822631835938, "logps/rejected": -152.7443389892578, "loss": 1.279, "nll_loss": 1.1204673051834106, "rewards/accuracies": 1.0, "rewards/chosen": 0.4921570122241974, "rewards/margins": 2.8618149757385254, "rewards/rejected": -2.3696579933166504, "step": 603 }, { "epoch": 0.10066666666666667, "grad_norm": 83.0643081665039, "learning_rate": 1.9739229246031717e-07, "logits/chosen": 2.7221317291259766, "logits/rejected": 2.6497764587402344, "logps/chosen": -14.765554428100586, "logps/rejected": -15.940401077270508, "loss": 1.2385, "nll_loss": 0.6711615920066833, "rewards/accuracies": 1.0, "rewards/chosen": 0.11595498025417328, "rewards/margins": 0.6176247000694275, "rewards/rejected": -0.501669704914093, "step": 604 }, { "epoch": 0.10083333333333333, "grad_norm": 41.98073959350586, "learning_rate": 1.973800315058833e-07, "logits/chosen": 1.987784743309021, "logits/rejected": 3.1793341636657715, "logps/chosen": -48.67395782470703, "logps/rejected": -175.64224243164062, "loss": 0.9173, "nll_loss": 0.8249824047088623, "rewards/accuracies": 1.0, "rewards/chosen": 0.8485599756240845, "rewards/margins": 4.110936641693115, "rewards/rejected": -3.262376546859741, "step": 605 }, { "epoch": 0.101, "grad_norm": 88.97056579589844, "learning_rate": 1.9736774217724613e-07, "logits/chosen": 2.488992691040039, "logits/rejected": 2.985753059387207, "logps/chosen": -57.64252853393555, "logps/rejected": -255.74972534179688, "loss": 2.1301, "nll_loss": 1.9876735210418701, "rewards/accuracies": 1.0, "rewards/chosen": 0.334916353225708, "rewards/margins": 3.4460859298706055, "rewards/rejected": -3.1111695766448975, "step": 606 }, { "epoch": 0.10116666666666667, "grad_norm": 50.416690826416016, "learning_rate": 1.9735542447798642e-07, "logits/chosen": 2.920072317123413, "logits/rejected": 2.982069253921509, "logps/chosen": -76.84403991699219, "logps/rejected": -128.03831481933594, "loss": 1.1719, "nll_loss": 1.0245871543884277, "rewards/accuracies": 1.0, "rewards/chosen": 0.6899719834327698, "rewards/margins": 2.8680832386016846, "rewards/rejected": -2.1781113147735596, "step": 607 }, { "epoch": 0.10133333333333333, "grad_norm": 52.18882751464844, "learning_rate": 1.9734307841169336e-07, "logits/chosen": 3.3760619163513184, "logits/rejected": 3.670646905899048, "logps/chosen": -125.91181945800781, "logps/rejected": -239.34783935546875, "loss": 1.3418, "nll_loss": 1.222444772720337, "rewards/accuracies": 1.0, "rewards/chosen": 0.7646408081054688, "rewards/margins": 3.3365585803985596, "rewards/rejected": -2.571917772293091, "step": 608 }, { "epoch": 0.1015, "grad_norm": 46.67845916748047, "learning_rate": 1.973307039819642e-07, "logits/chosen": 2.6785287857055664, "logits/rejected": 2.4342093467712402, "logps/chosen": -62.06285095214844, "logps/rejected": -73.06819152832031, "loss": 0.9133, "nll_loss": 0.8060110807418823, "rewards/accuracies": 1.0, "rewards/chosen": 2.276846408843994, "rewards/margins": 3.3303980827331543, "rewards/rejected": -1.0535515546798706, "step": 609 }, { "epoch": 0.10166666666666667, "grad_norm": 71.2285385131836, "learning_rate": 1.9731830119240462e-07, "logits/chosen": 1.1920706033706665, "logits/rejected": 2.642101526260376, "logps/chosen": -61.32777404785156, "logps/rejected": -389.1809387207031, "loss": 1.234, "nll_loss": 1.0221295356750488, "rewards/accuracies": 1.0, "rewards/chosen": 0.20067331194877625, "rewards/margins": 2.3828632831573486, "rewards/rejected": -2.18218994140625, "step": 610 }, { "epoch": 0.10183333333333333, "grad_norm": 77.54747772216797, "learning_rate": 1.9730587004662843e-07, "logits/chosen": 3.1243817806243896, "logits/rejected": 2.884408950805664, "logps/chosen": -182.78822326660156, "logps/rejected": -124.08360290527344, "loss": 1.0294, "nll_loss": 0.8423421382904053, "rewards/accuracies": 1.0, "rewards/chosen": 0.6833053827285767, "rewards/margins": 2.373049259185791, "rewards/rejected": -1.6897438764572144, "step": 611 }, { "epoch": 0.102, "grad_norm": 182.119140625, "learning_rate": 1.9729341054825782e-07, "logits/chosen": 2.186150312423706, "logits/rejected": 2.4423508644104004, "logps/chosen": -49.132301330566406, "logps/rejected": -209.9788818359375, "loss": 1.4341, "nll_loss": 1.0453683137893677, "rewards/accuracies": 1.0, "rewards/chosen": 0.2086647003889084, "rewards/margins": 1.2169806957244873, "rewards/rejected": -1.0083160400390625, "step": 612 }, { "epoch": 0.10216666666666667, "grad_norm": 89.16496276855469, "learning_rate": 1.9728092270092316e-07, "logits/chosen": 2.8338987827301025, "logits/rejected": 2.699284791946411, "logps/chosen": -21.16209602355957, "logps/rejected": -15.4059476852417, "loss": 1.0069, "nll_loss": 0.5290524959564209, "rewards/accuracies": 1.0, "rewards/chosen": 0.31575721502304077, "rewards/margins": 0.8780493140220642, "rewards/rejected": -0.5622920989990234, "step": 613 }, { "epoch": 0.10233333333333333, "grad_norm": 71.765625, "learning_rate": 1.9726840650826308e-07, "logits/chosen": 1.8710192441940308, "logits/rejected": 1.535117506980896, "logps/chosen": -225.1287078857422, "logps/rejected": -159.79266357421875, "loss": 1.3842, "nll_loss": 1.2438050508499146, "rewards/accuracies": 1.0, "rewards/chosen": 1.299098253250122, "rewards/margins": 2.745091438293457, "rewards/rejected": -1.4459930658340454, "step": 614 }, { "epoch": 0.1025, "grad_norm": 64.58187103271484, "learning_rate": 1.972558619739246e-07, "logits/chosen": 3.4959990978240967, "logits/rejected": 3.7552335262298584, "logps/chosen": -45.824790954589844, "logps/rejected": -132.53836059570312, "loss": 1.8061, "nll_loss": 1.6972144842147827, "rewards/accuracies": 1.0, "rewards/chosen": 1.18089759349823, "rewards/margins": 3.2292556762695312, "rewards/rejected": -2.048358201980591, "step": 615 }, { "epoch": 0.10266666666666667, "grad_norm": 78.32469177246094, "learning_rate": 1.9724328910156278e-07, "logits/chosen": 2.2365989685058594, "logits/rejected": 2.534569263458252, "logps/chosen": -33.828853607177734, "logps/rejected": -43.206382751464844, "loss": 1.2275, "nll_loss": 0.7867177128791809, "rewards/accuracies": 1.0, "rewards/chosen": 0.7945259809494019, "rewards/margins": 1.0113297700881958, "rewards/rejected": -0.21680375933647156, "step": 616 }, { "epoch": 0.10283333333333333, "grad_norm": 108.48190307617188, "learning_rate": 1.9723068789484113e-07, "logits/chosen": 2.797304630279541, "logits/rejected": 3.134697914123535, "logps/chosen": -50.848358154296875, "logps/rejected": -140.8223876953125, "loss": 2.108, "nll_loss": 1.8160125017166138, "rewards/accuracies": 1.0, "rewards/chosen": 0.05438767001032829, "rewards/margins": 1.772123098373413, "rewards/rejected": -1.7177354097366333, "step": 617 }, { "epoch": 0.103, "grad_norm": 31.61306381225586, "learning_rate": 1.972180583574313e-07, "logits/chosen": 2.2559423446655273, "logits/rejected": 3.0480198860168457, "logps/chosen": -36.07120895385742, "logps/rejected": -364.643310546875, "loss": 0.6459, "nll_loss": 0.5636126399040222, "rewards/accuracies": 1.0, "rewards/chosen": 0.8550716638565063, "rewards/margins": 4.700649738311768, "rewards/rejected": -3.845578193664551, "step": 618 }, { "epoch": 0.10316666666666667, "grad_norm": 59.469600677490234, "learning_rate": 1.9720540049301327e-07, "logits/chosen": 1.427805781364441, "logits/rejected": 2.367037057876587, "logps/chosen": -70.70558166503906, "logps/rejected": -136.10414123535156, "loss": 1.4372, "nll_loss": 1.334067702293396, "rewards/accuracies": 1.0, "rewards/chosen": 0.9285759329795837, "rewards/margins": 3.5506203174591064, "rewards/rejected": -2.622044324874878, "step": 619 }, { "epoch": 0.10333333333333333, "grad_norm": 66.51069641113281, "learning_rate": 1.9719271430527518e-07, "logits/chosen": 3.1811904907226562, "logits/rejected": 3.1138439178466797, "logps/chosen": -94.1441650390625, "logps/rejected": -29.03786849975586, "loss": 1.3739, "nll_loss": 1.1075786352157593, "rewards/accuracies": 1.0, "rewards/chosen": 0.8176674842834473, "rewards/margins": 1.7577853202819824, "rewards/rejected": -0.9401178359985352, "step": 620 }, { "epoch": 0.1035, "grad_norm": 53.33623123168945, "learning_rate": 1.9717999979791353e-07, "logits/chosen": 2.049928903579712, "logits/rejected": 2.485633611679077, "logps/chosen": -34.64119338989258, "logps/rejected": -119.86228942871094, "loss": 0.8786, "nll_loss": 0.7216915488243103, "rewards/accuracies": 1.0, "rewards/chosen": 1.577798843383789, "rewards/margins": 2.595006227493286, "rewards/rejected": -1.017207384109497, "step": 621 }, { "epoch": 0.10366666666666667, "grad_norm": 40.237937927246094, "learning_rate": 1.9716725697463302e-07, "logits/chosen": 2.455389976501465, "logits/rejected": 2.8307385444641113, "logps/chosen": -40.146907806396484, "logps/rejected": -149.1456298828125, "loss": 0.8172, "nll_loss": 0.7043316960334778, "rewards/accuracies": 1.0, "rewards/chosen": 0.6948680877685547, "rewards/margins": 3.6029324531555176, "rewards/rejected": -2.908064365386963, "step": 622 }, { "epoch": 0.10383333333333333, "grad_norm": 30.547710418701172, "learning_rate": 1.9715448583914657e-07, "logits/chosen": 3.0252015590667725, "logits/rejected": 3.1517391204833984, "logps/chosen": -41.84136962890625, "logps/rejected": -136.6087646484375, "loss": 0.6256, "nll_loss": 0.5296375155448914, "rewards/accuracies": 1.0, "rewards/chosen": 1.2422279119491577, "rewards/margins": 3.4616174697875977, "rewards/rejected": -2.2193894386291504, "step": 623 }, { "epoch": 0.104, "grad_norm": 129.7845916748047, "learning_rate": 1.9714168639517542e-07, "logits/chosen": 2.3540518283843994, "logits/rejected": 2.23537540435791, "logps/chosen": -16.23128890991211, "logps/rejected": -105.38604736328125, "loss": 1.1637, "nll_loss": 0.33815184235572815, "rewards/accuracies": 1.0, "rewards/chosen": -0.2660662829875946, "rewards/margins": 0.019920825958251953, "rewards/rejected": -0.28598710894584656, "step": 624 }, { "epoch": 0.10416666666666667, "grad_norm": 86.58470916748047, "learning_rate": 1.9712885864644899e-07, "logits/chosen": 2.5137736797332764, "logits/rejected": 2.4335761070251465, "logps/chosen": -53.754085540771484, "logps/rejected": -64.24254608154297, "loss": 1.3484, "nll_loss": 1.119876742362976, "rewards/accuracies": 1.0, "rewards/chosen": 0.9110393524169922, "rewards/margins": 1.9902362823486328, "rewards/rejected": -1.0791969299316406, "step": 625 }, { "epoch": 0.10433333333333333, "grad_norm": 76.11297607421875, "learning_rate": 1.97116002596705e-07, "logits/chosen": 2.9358484745025635, "logits/rejected": 2.9261515140533447, "logps/chosen": -44.39657974243164, "logps/rejected": -77.71202087402344, "loss": 0.87, "nll_loss": 0.522312581539154, "rewards/accuracies": 1.0, "rewards/chosen": 0.8453510999679565, "rewards/margins": 1.361471176147461, "rewards/rejected": -0.5161201357841492, "step": 626 }, { "epoch": 0.1045, "grad_norm": 50.187255859375, "learning_rate": 1.971031182496894e-07, "logits/chosen": 1.9540504217147827, "logits/rejected": 2.0719919204711914, "logps/chosen": -59.05278778076172, "logps/rejected": -91.65994262695312, "loss": 0.8921, "nll_loss": 0.7381598949432373, "rewards/accuracies": 1.0, "rewards/chosen": 0.6932449340820312, "rewards/margins": 2.757502794265747, "rewards/rejected": -2.064257860183716, "step": 627 }, { "epoch": 0.10466666666666667, "grad_norm": 133.13377380371094, "learning_rate": 1.9709020560915635e-07, "logits/chosen": 3.4303693771362305, "logits/rejected": 3.4761877059936523, "logps/chosen": -33.5452880859375, "logps/rejected": -46.272605895996094, "loss": 1.697, "nll_loss": 0.599023163318634, "rewards/accuracies": 0.0, "rewards/chosen": 1.0325263738632202, "rewards/margins": -0.36366963386535645, "rewards/rejected": 1.3961960077285767, "step": 628 }, { "epoch": 0.10483333333333333, "grad_norm": 45.521080017089844, "learning_rate": 1.9707726467886828e-07, "logits/chosen": 2.2211523056030273, "logits/rejected": 2.79487943649292, "logps/chosen": -215.67965698242188, "logps/rejected": -201.00576782226562, "loss": 1.4346, "nll_loss": 1.3564757108688354, "rewards/accuracies": 1.0, "rewards/chosen": 1.3807891607284546, "rewards/margins": 3.8424863815307617, "rewards/rejected": -2.4616973400115967, "step": 629 }, { "epoch": 0.105, "grad_norm": 68.28943634033203, "learning_rate": 1.970642954625959e-07, "logits/chosen": 2.404728412628174, "logits/rejected": 2.4388723373413086, "logps/chosen": -52.134117126464844, "logps/rejected": -39.8917236328125, "loss": 1.0283, "nll_loss": 0.714165985584259, "rewards/accuracies": 1.0, "rewards/chosen": 0.7117290496826172, "rewards/margins": 1.5088156461715698, "rewards/rejected": -0.7970865964889526, "step": 630 }, { "epoch": 0.10516666666666667, "grad_norm": 70.04417419433594, "learning_rate": 1.9705129796411813e-07, "logits/chosen": 1.6668236255645752, "logits/rejected": 2.5466580390930176, "logps/chosen": -68.24249267578125, "logps/rejected": -235.55120849609375, "loss": 2.1262, "nll_loss": 1.9497851133346558, "rewards/accuracies": 1.0, "rewards/chosen": 0.42487335205078125, "rewards/margins": 2.635754346847534, "rewards/rejected": -2.210880994796753, "step": 631 }, { "epoch": 0.10533333333333333, "grad_norm": 32.496971130371094, "learning_rate": 1.9703827218722208e-07, "logits/chosen": 1.0918298959732056, "logits/rejected": 2.710334539413452, "logps/chosen": -28.54875946044922, "logps/rejected": -365.964111328125, "loss": 0.6233, "nll_loss": 0.5709752440452576, "rewards/accuracies": 1.0, "rewards/chosen": 1.586848497390747, "rewards/margins": 4.74376106262207, "rewards/rejected": -3.156912326812744, "step": 632 }, { "epoch": 0.1055, "grad_norm": 61.36091995239258, "learning_rate": 1.970252181357032e-07, "logits/chosen": 2.526848793029785, "logits/rejected": 2.383028745651245, "logps/chosen": -39.225669860839844, "logps/rejected": -73.5413818359375, "loss": 0.8983, "nll_loss": 0.6648419499397278, "rewards/accuracies": 1.0, "rewards/chosen": 0.5692524313926697, "rewards/margins": 2.0028083324432373, "rewards/rejected": -1.4335559606552124, "step": 633 }, { "epoch": 0.10566666666666667, "grad_norm": 63.19691467285156, "learning_rate": 1.970121358133651e-07, "logits/chosen": 2.688387870788574, "logits/rejected": 2.9564876556396484, "logps/chosen": -115.0325927734375, "logps/rejected": -192.57379150390625, "loss": 1.3343, "nll_loss": 1.150325894355774, "rewards/accuracies": 1.0, "rewards/chosen": 1.4093719720840454, "rewards/margins": 2.3394196033477783, "rewards/rejected": -0.9300476312637329, "step": 634 }, { "epoch": 0.10583333333333333, "grad_norm": 82.27018737792969, "learning_rate": 1.969990252240197e-07, "logits/chosen": 1.932505488395691, "logits/rejected": 1.6533656120300293, "logps/chosen": -156.51060485839844, "logps/rejected": -74.71498107910156, "loss": 1.161, "nll_loss": 0.8892648816108704, "rewards/accuracies": 1.0, "rewards/chosen": 1.1190627813339233, "rewards/margins": 1.7418502569198608, "rewards/rejected": -0.6227874755859375, "step": 635 }, { "epoch": 0.106, "grad_norm": 27.631431579589844, "learning_rate": 1.96985886371487e-07, "logits/chosen": 2.61468768119812, "logits/rejected": 3.2083423137664795, "logps/chosen": -38.42708206176758, "logps/rejected": -309.6973876953125, "loss": 0.5974, "nll_loss": 0.556914210319519, "rewards/accuracies": 1.0, "rewards/chosen": 2.5421178340911865, "rewards/margins": 4.845188140869141, "rewards/rejected": -2.303070068359375, "step": 636 }, { "epoch": 0.10616666666666667, "grad_norm": 56.87316131591797, "learning_rate": 1.969727192595955e-07, "logits/chosen": 2.678138494491577, "logits/rejected": 2.764958620071411, "logps/chosen": -76.10822296142578, "logps/rejected": -156.413818359375, "loss": 1.3029, "nll_loss": 1.1891908645629883, "rewards/accuracies": 1.0, "rewards/chosen": 1.182643175125122, "rewards/margins": 3.137962579727173, "rewards/rejected": -1.9553194046020508, "step": 637 }, { "epoch": 0.10633333333333334, "grad_norm": 71.48970794677734, "learning_rate": 1.9695952389218166e-07, "logits/chosen": 2.7780909538269043, "logits/rejected": 2.472419500350952, "logps/chosen": -89.47604370117188, "logps/rejected": -50.548404693603516, "loss": 1.1549, "nll_loss": 0.8947604298591614, "rewards/accuracies": 1.0, "rewards/chosen": 0.9373504519462585, "rewards/margins": 1.7931804656982422, "rewards/rejected": -0.8558300137519836, "step": 638 }, { "epoch": 0.1065, "grad_norm": 43.99238204956055, "learning_rate": 1.969463002730903e-07, "logits/chosen": 3.0072388648986816, "logits/rejected": 2.730834484100342, "logps/chosen": -116.25904083251953, "logps/rejected": -62.470306396484375, "loss": 0.9027, "nll_loss": 0.7648621201515198, "rewards/accuracies": 1.0, "rewards/chosen": 1.416140079498291, "rewards/margins": 2.7723746299743652, "rewards/rejected": -1.3562344312667847, "step": 639 }, { "epoch": 0.10666666666666667, "grad_norm": 58.779415130615234, "learning_rate": 1.9693304840617456e-07, "logits/chosen": 2.7758851051330566, "logits/rejected": 2.6169776916503906, "logps/chosen": -94.14604949951172, "logps/rejected": -29.47895050048828, "loss": 1.356, "nll_loss": 1.1342896223068237, "rewards/accuracies": 1.0, "rewards/chosen": 2.0874016284942627, "rewards/margins": 2.2818686962127686, "rewards/rejected": -0.19446717202663422, "step": 640 }, { "epoch": 0.10683333333333334, "grad_norm": 80.446533203125, "learning_rate": 1.9691976829529562e-07, "logits/chosen": 3.0609116554260254, "logits/rejected": 3.2023231983184814, "logps/chosen": -99.17677307128906, "logps/rejected": -221.74588012695312, "loss": 1.0388, "nll_loss": 0.7809194922447205, "rewards/accuracies": 1.0, "rewards/chosen": 1.127008080482483, "rewards/margins": 1.8179504871368408, "rewards/rejected": -0.6909424066543579, "step": 641 }, { "epoch": 0.107, "grad_norm": 44.38124084472656, "learning_rate": 1.9690645994432305e-07, "logits/chosen": 2.729496955871582, "logits/rejected": 2.7365808486938477, "logps/chosen": -21.51150894165039, "logps/rejected": -69.10759735107422, "loss": 0.6348, "nll_loss": 0.500267744064331, "rewards/accuracies": 1.0, "rewards/chosen": 1.1207942962646484, "rewards/margins": 2.8418831825256348, "rewards/rejected": -1.7210887670516968, "step": 642 }, { "epoch": 0.10716666666666666, "grad_norm": 68.63825225830078, "learning_rate": 1.9689312335713454e-07, "logits/chosen": 3.7903504371643066, "logits/rejected": 3.9314770698547363, "logps/chosen": -50.949745178222656, "logps/rejected": -75.2257308959961, "loss": 0.8763, "nll_loss": 0.5789743661880493, "rewards/accuracies": 1.0, "rewards/chosen": 0.9626858234405518, "rewards/margins": 1.597527027130127, "rewards/rejected": -0.6348412036895752, "step": 643 }, { "epoch": 0.10733333333333334, "grad_norm": 47.51137161254883, "learning_rate": 1.9687975853761602e-07, "logits/chosen": 2.0977776050567627, "logits/rejected": 1.7106866836547852, "logps/chosen": -36.09075164794922, "logps/rejected": -36.80650329589844, "loss": 0.6857, "nll_loss": 0.5552424192428589, "rewards/accuracies": 1.0, "rewards/chosen": 1.5644608736038208, "rewards/margins": 2.8661022186279297, "rewards/rejected": -1.3016412258148193, "step": 644 }, { "epoch": 0.1075, "grad_norm": 439.1903076171875, "learning_rate": 1.9686636548966176e-07, "logits/chosen": 3.077425003051758, "logits/rejected": 2.8132858276367188, "logps/chosen": -376.55670166015625, "logps/rejected": -79.18240356445312, "loss": 2.1125, "nll_loss": 1.1075196266174316, "rewards/accuracies": 1.0, "rewards/chosen": -2.580761671066284, "rewards/margins": 0.23979783058166504, "rewards/rejected": -2.820559501647949, "step": 645 }, { "epoch": 0.10766666666666666, "grad_norm": 46.258941650390625, "learning_rate": 1.9685294421717414e-07, "logits/chosen": 3.131951332092285, "logits/rejected": 3.072481632232666, "logps/chosen": -87.82429504394531, "logps/rejected": -67.67881774902344, "loss": 1.0645, "nll_loss": 0.9148364067077637, "rewards/accuracies": 1.0, "rewards/chosen": 0.8500229716300964, "rewards/margins": 2.7313039302825928, "rewards/rejected": -1.8812810182571411, "step": 646 }, { "epoch": 0.10783333333333334, "grad_norm": 36.91995620727539, "learning_rate": 1.9683949472406377e-07, "logits/chosen": 3.351391077041626, "logits/rejected": 3.3806655406951904, "logps/chosen": -54.78907775878906, "logps/rejected": -87.08936309814453, "loss": 0.7724, "nll_loss": 0.6764083504676819, "rewards/accuracies": 1.0, "rewards/chosen": 1.205714464187622, "rewards/margins": 3.4783248901367188, "rewards/rejected": -2.2726104259490967, "step": 647 }, { "epoch": 0.108, "grad_norm": 61.555912017822266, "learning_rate": 1.9682601701424956e-07, "logits/chosen": 2.5911450386047363, "logits/rejected": 2.7166543006896973, "logps/chosen": -48.12207794189453, "logps/rejected": -111.60503387451172, "loss": 0.7949, "nll_loss": 0.5661420822143555, "rewards/accuracies": 1.0, "rewards/chosen": 1.0454349517822266, "rewards/margins": 1.9885776042938232, "rewards/rejected": -0.9431427121162415, "step": 648 }, { "epoch": 0.10816666666666666, "grad_norm": 77.37020874023438, "learning_rate": 1.9681251109165858e-07, "logits/chosen": 3.6009066104888916, "logits/rejected": 3.6804840564727783, "logps/chosen": -19.235448837280273, "logps/rejected": -81.7182388305664, "loss": 0.8492, "nll_loss": 0.4932166337966919, "rewards/accuracies": 1.0, "rewards/chosen": 1.051505446434021, "rewards/margins": 1.349045753479004, "rewards/rejected": -0.2975402772426605, "step": 649 }, { "epoch": 0.10833333333333334, "grad_norm": 62.32012939453125, "learning_rate": 1.9679897696022607e-07, "logits/chosen": 2.8519177436828613, "logits/rejected": 2.95387601852417, "logps/chosen": -175.0659942626953, "logps/rejected": -227.8751678466797, "loss": 1.4623, "nll_loss": 1.3466614484786987, "rewards/accuracies": 1.0, "rewards/chosen": 0.7047226428985596, "rewards/margins": 3.4918367862701416, "rewards/rejected": -2.787114143371582, "step": 650 }, { "epoch": 0.1085, "grad_norm": 164.8250732421875, "learning_rate": 1.967854146238956e-07, "logits/chosen": 3.1989710330963135, "logits/rejected": 3.0197222232818604, "logps/chosen": -226.07354736328125, "logps/rejected": -56.754676818847656, "loss": 1.4219, "nll_loss": 0.8250857591629028, "rewards/accuracies": 1.0, "rewards/chosen": 0.423013299703598, "rewards/margins": 0.5366809964179993, "rewards/rejected": -0.11366768926382065, "step": 651 }, { "epoch": 0.10866666666666666, "grad_norm": 88.35371398925781, "learning_rate": 1.9677182408661892e-07, "logits/chosen": 2.290876865386963, "logits/rejected": 2.2161998748779297, "logps/chosen": -60.765899658203125, "logps/rejected": -29.95786476135254, "loss": 1.4823, "nll_loss": 1.0127651691436768, "rewards/accuracies": 1.0, "rewards/chosen": 1.0790176391601562, "rewards/margins": 0.9594404101371765, "rewards/rejected": 0.11957722902297974, "step": 652 }, { "epoch": 0.10883333333333334, "grad_norm": 91.10638427734375, "learning_rate": 1.9675820535235596e-07, "logits/chosen": 2.906330108642578, "logits/rejected": 3.0388245582580566, "logps/chosen": -35.0716552734375, "logps/rejected": -28.376283645629883, "loss": 1.4795, "nll_loss": 1.0315192937850952, "rewards/accuracies": 1.0, "rewards/chosen": 0.018460845574736595, "rewards/margins": 1.009641408920288, "rewards/rejected": -0.9911805987358093, "step": 653 }, { "epoch": 0.109, "grad_norm": 55.877933502197266, "learning_rate": 1.9674455842507493e-07, "logits/chosen": 2.8481791019439697, "logits/rejected": 2.9379522800445557, "logps/chosen": -15.741754531860352, "logps/rejected": -105.67080688476562, "loss": 0.7313, "nll_loss": 0.5622054934501648, "rewards/accuracies": 1.0, "rewards/chosen": 0.8983535766601562, "rewards/margins": 2.487017869949341, "rewards/rejected": -1.5886642932891846, "step": 654 }, { "epoch": 0.10916666666666666, "grad_norm": 84.32633209228516, "learning_rate": 1.9673088330875215e-07, "logits/chosen": 1.4660240411758423, "logits/rejected": 2.3315324783325195, "logps/chosen": -74.51422119140625, "logps/rejected": -171.4427490234375, "loss": 1.9416, "nll_loss": 1.8174200057983398, "rewards/accuracies": 1.0, "rewards/chosen": 0.5120712518692017, "rewards/margins": 3.575227737426758, "rewards/rejected": -3.0631563663482666, "step": 655 }, { "epoch": 0.10933333333333334, "grad_norm": 47.50465774536133, "learning_rate": 1.9671718000737228e-07, "logits/chosen": 3.395066738128662, "logits/rejected": 3.4810283184051514, "logps/chosen": -22.796077728271484, "logps/rejected": -152.17835998535156, "loss": 0.8685, "nll_loss": 0.7353573441505432, "rewards/accuracies": 1.0, "rewards/chosen": 0.6522018313407898, "rewards/margins": 3.1353089809417725, "rewards/rejected": -2.483107089996338, "step": 656 }, { "epoch": 0.1095, "grad_norm": 74.34750366210938, "learning_rate": 1.967034485249281e-07, "logits/chosen": 3.0620851516723633, "logits/rejected": 3.3346755504608154, "logps/chosen": -26.63906478881836, "logps/rejected": -290.678466796875, "loss": 0.6617, "nll_loss": 0.4756976068019867, "rewards/accuracies": 1.0, "rewards/chosen": 1.425471544265747, "rewards/margins": 2.3269882202148438, "rewards/rejected": -0.9015167355537415, "step": 657 }, { "epoch": 0.10966666666666666, "grad_norm": 119.3974609375, "learning_rate": 1.9668968886542069e-07, "logits/chosen": 2.0587706565856934, "logits/rejected": 1.703250527381897, "logps/chosen": -25.52828598022461, "logps/rejected": -47.880393981933594, "loss": 1.2528, "nll_loss": 0.46415066719055176, "rewards/accuracies": 1.0, "rewards/chosen": 0.41633379459381104, "rewards/margins": 0.10301363468170166, "rewards/rejected": 0.3133201599121094, "step": 658 }, { "epoch": 0.10983333333333334, "grad_norm": 47.12540817260742, "learning_rate": 1.9667590103285918e-07, "logits/chosen": 3.0376393795013428, "logits/rejected": 3.1467840671539307, "logps/chosen": -109.99440002441406, "logps/rejected": -137.80636596679688, "loss": 1.3416, "nll_loss": 1.2499364614486694, "rewards/accuracies": 1.0, "rewards/chosen": 0.8910965323448181, "rewards/margins": 3.9798715114593506, "rewards/rejected": -3.0887749195098877, "step": 659 }, { "epoch": 0.11, "grad_norm": 42.24985122680664, "learning_rate": 1.966620850312611e-07, "logits/chosen": 2.4485890865325928, "logits/rejected": 2.800755023956299, "logps/chosen": -42.77794647216797, "logps/rejected": -556.667724609375, "loss": 1.0043, "nll_loss": 0.9506210088729858, "rewards/accuracies": 1.0, "rewards/chosen": 1.3491116762161255, "rewards/margins": 5.242983818054199, "rewards/rejected": -3.8938722610473633, "step": 660 }, { "epoch": 0.11016666666666666, "grad_norm": 50.86042022705078, "learning_rate": 1.966482408646521e-07, "logits/chosen": 2.7930073738098145, "logits/rejected": 3.267974615097046, "logps/chosen": -115.72996520996094, "logps/rejected": -432.76544189453125, "loss": 1.1773, "nll_loss": 1.0426123142242432, "rewards/accuracies": 1.0, "rewards/chosen": 0.7071807980537415, "rewards/margins": 3.0480194091796875, "rewards/rejected": -2.340838670730591, "step": 661 }, { "epoch": 0.11033333333333334, "grad_norm": 44.897377014160156, "learning_rate": 1.9663436853706598e-07, "logits/chosen": 2.8141839504241943, "logits/rejected": 2.6846868991851807, "logps/chosen": -109.96615600585938, "logps/rejected": -104.86846923828125, "loss": 1.5301, "nll_loss": 1.4281319379806519, "rewards/accuracies": 1.0, "rewards/chosen": 0.9550735354423523, "rewards/margins": 3.5316154956817627, "rewards/rejected": -2.5765419006347656, "step": 662 }, { "epoch": 0.1105, "grad_norm": 59.326698303222656, "learning_rate": 1.9662046805254487e-07, "logits/chosen": 3.1660609245300293, "logits/rejected": 3.0644829273223877, "logps/chosen": -73.03226470947266, "logps/rejected": -78.2164077758789, "loss": 0.9627, "nll_loss": 0.7938289642333984, "rewards/accuracies": 1.0, "rewards/chosen": 1.1481269598007202, "rewards/margins": 2.454474687576294, "rewards/rejected": -1.3063477277755737, "step": 663 }, { "epoch": 0.11066666666666666, "grad_norm": 47.91972732543945, "learning_rate": 1.9660653941513898e-07, "logits/chosen": 1.4009242057800293, "logits/rejected": 2.7198703289031982, "logps/chosen": -30.24726104736328, "logps/rejected": -173.3546600341797, "loss": 0.9401, "nll_loss": 0.86420738697052, "rewards/accuracies": 1.0, "rewards/chosen": 1.0880317687988281, "rewards/margins": 4.294234275817871, "rewards/rejected": -3.206202745437622, "step": 664 }, { "epoch": 0.11083333333333334, "grad_norm": 45.998477935791016, "learning_rate": 1.965925826289068e-07, "logits/chosen": 2.463432550430298, "logits/rejected": 2.263134002685547, "logps/chosen": -148.5611572265625, "logps/rejected": -108.1851806640625, "loss": 0.7476, "nll_loss": 0.6088573336601257, "rewards/accuracies": 1.0, "rewards/chosen": 0.5505599975585938, "rewards/margins": 3.126034736633301, "rewards/rejected": -2.575474739074707, "step": 665 }, { "epoch": 0.111, "grad_norm": 151.3238983154297, "learning_rate": 1.9657859769791502e-07, "logits/chosen": 3.3912649154663086, "logits/rejected": 3.41337251663208, "logps/chosen": -76.33619689941406, "logps/rejected": -163.26559448242188, "loss": 2.1114, "nll_loss": 1.659482717514038, "rewards/accuracies": 1.0, "rewards/chosen": -0.5507164001464844, "rewards/margins": 1.1520912647247314, "rewards/rejected": -1.7028076648712158, "step": 666 }, { "epoch": 0.11116666666666666, "grad_norm": 71.28016662597656, "learning_rate": 1.9656458462623852e-07, "logits/chosen": 3.256993055343628, "logits/rejected": 3.279916286468506, "logps/chosen": -48.026634216308594, "logps/rejected": -140.67953491210938, "loss": 1.8247, "nll_loss": 1.7787643671035767, "rewards/accuracies": 1.0, "rewards/chosen": 2.313636064529419, "rewards/margins": 4.611786842346191, "rewards/rejected": -2.2981507778167725, "step": 667 }, { "epoch": 0.11133333333333334, "grad_norm": 191.47889709472656, "learning_rate": 1.9655054341796037e-07, "logits/chosen": 3.154186725616455, "logits/rejected": 3.153108596801758, "logps/chosen": -29.241979598999023, "logps/rejected": -108.48265838623047, "loss": 1.9188, "nll_loss": 0.913811981678009, "rewards/accuracies": 0.0, "rewards/chosen": 0.21788635849952698, "rewards/margins": -0.30939027667045593, "rewards/rejected": 0.5272766351699829, "step": 668 }, { "epoch": 0.1115, "grad_norm": 56.68727493286133, "learning_rate": 1.9653647407717177e-07, "logits/chosen": 1.874986171722412, "logits/rejected": 2.5440080165863037, "logps/chosen": -18.10767364501953, "logps/rejected": -133.5245361328125, "loss": 0.721, "nll_loss": 0.5658648014068604, "rewards/accuracies": 1.0, "rewards/chosen": 0.5083206295967102, "rewards/margins": 2.864523410797119, "rewards/rejected": -2.3562028408050537, "step": 669 }, { "epoch": 0.11166666666666666, "grad_norm": 38.52712631225586, "learning_rate": 1.9652237660797225e-07, "logits/chosen": 2.2194879055023193, "logits/rejected": 2.5573155879974365, "logps/chosen": -77.73204040527344, "logps/rejected": -228.50421142578125, "loss": 0.873, "nll_loss": 0.8013613224029541, "rewards/accuracies": 1.0, "rewards/chosen": 2.908952474594116, "rewards/margins": 4.205406188964844, "rewards/rejected": -1.2964539527893066, "step": 670 }, { "epoch": 0.11183333333333334, "grad_norm": 36.37068557739258, "learning_rate": 1.965082510144695e-07, "logits/chosen": 2.8200559616088867, "logits/rejected": 2.8771729469299316, "logps/chosen": -56.144744873046875, "logps/rejected": -139.98019409179688, "loss": 0.8386, "nll_loss": 0.7587125897407532, "rewards/accuracies": 1.0, "rewards/chosen": 1.4353845119476318, "rewards/margins": 3.7432398796081543, "rewards/rejected": -2.3078553676605225, "step": 671 }, { "epoch": 0.112, "grad_norm": 72.94679260253906, "learning_rate": 1.9649409730077933e-07, "logits/chosen": 2.617711067199707, "logits/rejected": 3.003760576248169, "logps/chosen": -129.6357421875, "logps/rejected": -246.27444458007812, "loss": 1.3501, "nll_loss": 1.1678895950317383, "rewards/accuracies": 1.0, "rewards/chosen": 0.2672576904296875, "rewards/margins": 2.6775176525115967, "rewards/rejected": -2.410259962081909, "step": 672 }, { "epoch": 0.11216666666666666, "grad_norm": 199.27548217773438, "learning_rate": 1.9647991547102582e-07, "logits/chosen": 3.244837522506714, "logits/rejected": 3.239287853240967, "logps/chosen": -156.13723754882812, "logps/rejected": -34.24007034301758, "loss": 2.3133, "nll_loss": 1.5771437883377075, "rewards/accuracies": 1.0, "rewards/chosen": 0.8853134512901306, "rewards/margins": 0.2658916711807251, "rewards/rejected": 0.6194217801094055, "step": 673 }, { "epoch": 0.11233333333333333, "grad_norm": 139.82135009765625, "learning_rate": 1.964657055293412e-07, "logits/chosen": 2.9097518920898438, "logits/rejected": 2.964240074157715, "logps/chosen": -35.649070739746094, "logps/rejected": -93.78688049316406, "loss": 1.2775, "nll_loss": 0.5401374697685242, "rewards/accuracies": 1.0, "rewards/chosen": 1.9286072254180908, "rewards/margins": 0.5018386840820312, "rewards/rejected": 1.4267685413360596, "step": 674 }, { "epoch": 0.1125, "grad_norm": 54.270938873291016, "learning_rate": 1.9645146747986589e-07, "logits/chosen": 3.0356764793395996, "logits/rejected": 3.1473612785339355, "logps/chosen": -32.35062026977539, "logps/rejected": -44.63285827636719, "loss": 0.8782, "nll_loss": 0.6739711761474609, "rewards/accuracies": 1.0, "rewards/chosen": 0.42727622389793396, "rewards/margins": 2.2998125553131104, "rewards/rejected": -1.8725364208221436, "step": 675 }, { "epoch": 0.11266666666666666, "grad_norm": 76.38179016113281, "learning_rate": 1.9643720132674853e-07, "logits/chosen": 3.0394601821899414, "logits/rejected": 3.253394842147827, "logps/chosen": -58.19161605834961, "logps/rejected": -246.16468811035156, "loss": 2.3273, "nll_loss": 2.2381386756896973, "rewards/accuracies": 1.0, "rewards/chosen": 0.9930225610733032, "rewards/margins": 3.8777379989624023, "rewards/rejected": -2.8847155570983887, "step": 676 }, { "epoch": 0.11283333333333333, "grad_norm": 32.85429000854492, "learning_rate": 1.9642290707414595e-07, "logits/chosen": 1.0750256776809692, "logits/rejected": 1.9663407802581787, "logps/chosen": -34.04509735107422, "logps/rejected": -196.94439697265625, "loss": 0.6363, "nll_loss": 0.540398359298706, "rewards/accuracies": 1.0, "rewards/chosen": 0.5046802759170532, "rewards/margins": 5.753624439239502, "rewards/rejected": -5.248944282531738, "step": 677 }, { "epoch": 0.113, "grad_norm": 195.01609802246094, "learning_rate": 1.9640858472622315e-07, "logits/chosen": 0.7368265986442566, "logits/rejected": 3.732675075531006, "logps/chosen": -20.840116500854492, "logps/rejected": -133.87628173828125, "loss": 1.1595, "nll_loss": 0.8015429973602295, "rewards/accuracies": 1.0, "rewards/chosen": 0.6418666839599609, "rewards/margins": 1.3120007514953613, "rewards/rejected": -0.6701340675354004, "step": 678 }, { "epoch": 0.11316666666666667, "grad_norm": 70.86003112792969, "learning_rate": 1.963942342871533e-07, "logits/chosen": 2.0807149410247803, "logits/rejected": 2.042428493499756, "logps/chosen": -39.322479248046875, "logps/rejected": -155.16648864746094, "loss": 1.8053, "nll_loss": 1.7096728086471558, "rewards/accuracies": 1.0, "rewards/chosen": 1.3066511154174805, "rewards/margins": 3.4249541759490967, "rewards/rejected": -2.118303060531616, "step": 679 }, { "epoch": 0.11333333333333333, "grad_norm": 61.37153625488281, "learning_rate": 1.9637985576111778e-07, "logits/chosen": 2.0461721420288086, "logits/rejected": 2.3149044513702393, "logps/chosen": -72.94136810302734, "logps/rejected": -86.70587921142578, "loss": 1.0613, "nll_loss": 0.8481555581092834, "rewards/accuracies": 1.0, "rewards/chosen": 0.7872940301895142, "rewards/margins": 2.110170841217041, "rewards/rejected": -1.3228768110275269, "step": 680 }, { "epoch": 0.1135, "grad_norm": 128.43890380859375, "learning_rate": 1.9636544915230617e-07, "logits/chosen": 1.6248464584350586, "logits/rejected": 2.772091865539551, "logps/chosen": -97.45367431640625, "logps/rejected": -161.39952087402344, "loss": 1.8999, "nll_loss": 1.4992873668670654, "rewards/accuracies": 1.0, "rewards/chosen": -0.4370384216308594, "rewards/margins": 1.344438910484314, "rewards/rejected": -1.7814773321151733, "step": 681 }, { "epoch": 0.11366666666666667, "grad_norm": 136.67662048339844, "learning_rate": 1.9635101446491617e-07, "logits/chosen": 2.541362762451172, "logits/rejected": 2.6428773403167725, "logps/chosen": -38.43993377685547, "logps/rejected": -92.93193054199219, "loss": 2.1477, "nll_loss": 1.671301245689392, "rewards/accuracies": 1.0, "rewards/chosen": -0.012165069580078125, "rewards/margins": 0.9087188839912415, "rewards/rejected": -0.9208839535713196, "step": 682 }, { "epoch": 0.11383333333333333, "grad_norm": 53.50474548339844, "learning_rate": 1.9633655170315374e-07, "logits/chosen": 2.2219176292419434, "logits/rejected": 2.4859261512756348, "logps/chosen": -82.8534164428711, "logps/rejected": -360.15118408203125, "loss": 1.2663, "nll_loss": 1.134978175163269, "rewards/accuracies": 1.0, "rewards/chosen": 0.4582328796386719, "rewards/margins": 3.4269587993621826, "rewards/rejected": -2.9687259197235107, "step": 683 }, { "epoch": 0.114, "grad_norm": 77.88947296142578, "learning_rate": 1.9632206087123295e-07, "logits/chosen": 2.1072888374328613, "logits/rejected": 2.1920981407165527, "logps/chosen": -111.28675842285156, "logps/rejected": -201.39859008789062, "loss": 0.9876, "nll_loss": 0.7949054837226868, "rewards/accuracies": 1.0, "rewards/chosen": 0.6547486186027527, "rewards/margins": 2.314598321914673, "rewards/rejected": -1.6598496437072754, "step": 684 }, { "epoch": 0.11416666666666667, "grad_norm": 41.84270477294922, "learning_rate": 1.9630754197337608e-07, "logits/chosen": 0.5081257224082947, "logits/rejected": 1.3864223957061768, "logps/chosen": -49.85379409790039, "logps/rejected": -210.8743896484375, "loss": 0.8923, "nll_loss": 0.8172752857208252, "rewards/accuracies": 1.0, "rewards/chosen": 1.5312618017196655, "rewards/margins": 3.8266048431396484, "rewards/rejected": -2.2953431606292725, "step": 685 }, { "epoch": 0.11433333333333333, "grad_norm": 102.74933624267578, "learning_rate": 1.9629299501381363e-07, "logits/chosen": 2.41782546043396, "logits/rejected": 2.1422436237335205, "logps/chosen": -24.846614837646484, "logps/rejected": -74.50930786132812, "loss": 1.0112, "nll_loss": 0.637092649936676, "rewards/accuracies": 1.0, "rewards/chosen": 0.07009220123291016, "rewards/margins": 1.2995065450668335, "rewards/rejected": -1.2294143438339233, "step": 686 }, { "epoch": 0.1145, "grad_norm": 58.123077392578125, "learning_rate": 1.9627841999678418e-07, "logits/chosen": 2.7596445083618164, "logits/rejected": 2.831678628921509, "logps/chosen": -63.00472640991211, "logps/rejected": -332.7370300292969, "loss": 1.1815, "nll_loss": 1.0162051916122437, "rewards/accuracies": 1.0, "rewards/chosen": 0.1246616393327713, "rewards/margins": 3.177203893661499, "rewards/rejected": -3.052542209625244, "step": 687 }, { "epoch": 0.11466666666666667, "grad_norm": 57.67237854003906, "learning_rate": 1.962638169265346e-07, "logits/chosen": 1.7887264490127563, "logits/rejected": 2.5282206535339355, "logps/chosen": -32.110084533691406, "logps/rejected": -139.43026733398438, "loss": 0.9368, "nll_loss": 0.7467460632324219, "rewards/accuracies": 1.0, "rewards/chosen": -0.23283863067626953, "rewards/margins": 3.412972927093506, "rewards/rejected": -3.6458115577697754, "step": 688 }, { "epoch": 0.11483333333333333, "grad_norm": 42.81077575683594, "learning_rate": 1.9624918580731982e-07, "logits/chosen": 2.875124454498291, "logits/rejected": 3.075397253036499, "logps/chosen": -42.77349090576172, "logps/rejected": -256.3672180175781, "loss": 0.7345, "nll_loss": 0.6384104490280151, "rewards/accuracies": 1.0, "rewards/chosen": 1.4790246486663818, "rewards/margins": 3.364396095275879, "rewards/rejected": -1.885371446609497, "step": 689 }, { "epoch": 0.115, "grad_norm": 60.02245330810547, "learning_rate": 1.9623452664340303e-07, "logits/chosen": 2.859724998474121, "logits/rejected": 2.9606988430023193, "logps/chosen": -77.05831146240234, "logps/rejected": -391.8104248046875, "loss": 1.6872, "nll_loss": 1.6053813695907593, "rewards/accuracies": 1.0, "rewards/chosen": 0.8914688229560852, "rewards/margins": 4.423015117645264, "rewards/rejected": -3.5315463542938232, "step": 690 }, { "epoch": 0.11516666666666667, "grad_norm": 73.91586303710938, "learning_rate": 1.9621983943905552e-07, "logits/chosen": 2.3534748554229736, "logits/rejected": 2.406822681427002, "logps/chosen": -56.127197265625, "logps/rejected": -31.112821578979492, "loss": 1.0501, "nll_loss": 0.7195793390274048, "rewards/accuracies": 1.0, "rewards/chosen": 0.20563852787017822, "rewards/margins": 1.4802786111831665, "rewards/rejected": -1.2746400833129883, "step": 691 }, { "epoch": 0.11533333333333333, "grad_norm": 103.69051361083984, "learning_rate": 1.962051241985568e-07, "logits/chosen": 2.5839431285858154, "logits/rejected": 2.551471471786499, "logps/chosen": -95.814208984375, "logps/rejected": -74.38233947753906, "loss": 2.7782, "nll_loss": 2.52142596244812, "rewards/accuracies": 1.0, "rewards/chosen": 0.44117432832717896, "rewards/margins": 1.8598601818084717, "rewards/rejected": -1.4186859130859375, "step": 692 }, { "epoch": 0.1155, "grad_norm": 57.8751220703125, "learning_rate": 1.9619038092619462e-07, "logits/chosen": 2.01705265045166, "logits/rejected": 2.6698198318481445, "logps/chosen": -48.56561279296875, "logps/rejected": -202.475830078125, "loss": 1.5018, "nll_loss": 1.4284003973007202, "rewards/accuracies": 1.0, "rewards/chosen": 1.5034435987472534, "rewards/margins": 3.8828673362731934, "rewards/rejected": -2.3794236183166504, "step": 693 }, { "epoch": 0.11566666666666667, "grad_norm": 49.246952056884766, "learning_rate": 1.961756096262647e-07, "logits/chosen": 2.4653427600860596, "logits/rejected": 2.5933778285980225, "logps/chosen": -54.051902770996094, "logps/rejected": -136.51681518554688, "loss": 0.9807, "nll_loss": 0.8579667806625366, "rewards/accuracies": 1.0, "rewards/chosen": 0.6244010925292969, "rewards/margins": 3.367351770401001, "rewards/rejected": -2.742950677871704, "step": 694 }, { "epoch": 0.11583333333333333, "grad_norm": 51.088645935058594, "learning_rate": 1.961608103030711e-07, "logits/chosen": 2.8245232105255127, "logits/rejected": 2.2365214824676514, "logps/chosen": -39.849395751953125, "logps/rejected": -18.45970916748047, "loss": 1.0625, "nll_loss": 0.8301957249641418, "rewards/accuracies": 1.0, "rewards/chosen": 1.3061192035675049, "rewards/margins": 1.9965803623199463, "rewards/rejected": -0.6904611587524414, "step": 695 }, { "epoch": 0.116, "grad_norm": 98.30490112304688, "learning_rate": 1.96145982960926e-07, "logits/chosen": 1.5663083791732788, "logits/rejected": 2.610750198364258, "logps/chosen": -7.528344631195068, "logps/rejected": -129.17112731933594, "loss": 0.7646, "nll_loss": 0.627362072467804, "rewards/accuracies": 1.0, "rewards/chosen": 0.47044628858566284, "rewards/margins": 3.2339885234832764, "rewards/rejected": -2.7635421752929688, "step": 696 }, { "epoch": 0.11616666666666667, "grad_norm": 270.5205993652344, "learning_rate": 1.961311276041497e-07, "logits/chosen": 2.348507881164551, "logits/rejected": 2.3014843463897705, "logps/chosen": -79.22423553466797, "logps/rejected": -218.3371124267578, "loss": 2.7022, "nll_loss": 1.5844846963882446, "rewards/accuracies": 0.0, "rewards/chosen": -2.219639539718628, "rewards/margins": -0.19474172592163086, "rewards/rejected": -2.024897813796997, "step": 697 }, { "epoch": 0.11633333333333333, "grad_norm": 54.92570495605469, "learning_rate": 1.9611624423707067e-07, "logits/chosen": 1.9696680307388306, "logits/rejected": 2.545074939727783, "logps/chosen": -57.33584976196289, "logps/rejected": -124.48976135253906, "loss": 1.144, "nll_loss": 0.9555975794792175, "rewards/accuracies": 1.0, "rewards/chosen": 1.0022121667861938, "rewards/margins": 2.2869715690612793, "rewards/rejected": -1.284759521484375, "step": 698 }, { "epoch": 0.1165, "grad_norm": 33.50812911987305, "learning_rate": 1.9610133286402562e-07, "logits/chosen": 1.9391698837280273, "logits/rejected": 1.6883630752563477, "logps/chosen": -56.30569839477539, "logps/rejected": -70.74813842773438, "loss": 0.8844, "nll_loss": 0.8280250430107117, "rewards/accuracies": 1.0, "rewards/chosen": 1.7542552947998047, "rewards/margins": 4.326296806335449, "rewards/rejected": -2.5720415115356445, "step": 699 }, { "epoch": 0.11666666666666667, "grad_norm": 38.354736328125, "learning_rate": 1.9608639348935936e-07, "logits/chosen": 2.3858914375305176, "logits/rejected": 2.906830072402954, "logps/chosen": -45.59532165527344, "logps/rejected": -289.06634521484375, "loss": 0.8421, "nll_loss": 0.7728021144866943, "rewards/accuracies": 1.0, "rewards/chosen": 0.8993396759033203, "rewards/margins": 5.780257225036621, "rewards/rejected": -4.880917549133301, "step": 700 }, { "epoch": 0.11683333333333333, "grad_norm": 267.43701171875, "learning_rate": 1.9607142611742483e-07, "logits/chosen": 2.9898934364318848, "logits/rejected": 3.2833411693573, "logps/chosen": -87.5421371459961, "logps/rejected": -153.76016235351562, "loss": 2.0727, "nll_loss": 1.2158631086349487, "rewards/accuracies": 1.0, "rewards/chosen": 0.777722954750061, "rewards/margins": 0.01576310396194458, "rewards/rejected": 0.7619598507881165, "step": 701 }, { "epoch": 0.117, "grad_norm": 64.7607650756836, "learning_rate": 1.960564307525832e-07, "logits/chosen": 2.5190038681030273, "logits/rejected": 2.6453349590301514, "logps/chosen": -25.113290786743164, "logps/rejected": -180.8678436279297, "loss": 0.8426, "nll_loss": 0.6608760952949524, "rewards/accuracies": 1.0, "rewards/chosen": 0.6597270965576172, "rewards/margins": 2.4156627655029297, "rewards/rejected": -1.7559356689453125, "step": 702 }, { "epoch": 0.11716666666666667, "grad_norm": 178.7213134765625, "learning_rate": 1.9604140739920374e-07, "logits/chosen": 2.8727352619171143, "logits/rejected": 3.1204822063446045, "logps/chosen": -61.08985137939453, "logps/rejected": -112.56736755371094, "loss": 2.0061, "nll_loss": 1.420694351196289, "rewards/accuracies": 1.0, "rewards/chosen": -0.05585785210132599, "rewards/margins": 0.5742889642715454, "rewards/rejected": -0.6301468014717102, "step": 703 }, { "epoch": 0.11733333333333333, "grad_norm": 78.96505737304688, "learning_rate": 1.960263560616639e-07, "logits/chosen": 2.6243979930877686, "logits/rejected": 2.671049118041992, "logps/chosen": -47.11851119995117, "logps/rejected": -210.57127380371094, "loss": 1.3699, "nll_loss": 1.2734732627868652, "rewards/accuracies": 1.0, "rewards/chosen": 0.46376991271972656, "rewards/margins": 5.891010284423828, "rewards/rejected": -5.427240371704102, "step": 704 }, { "epoch": 0.1175, "grad_norm": 33.96166229248047, "learning_rate": 1.9601127674434928e-07, "logits/chosen": 2.6915299892425537, "logits/rejected": 3.055335760116577, "logps/chosen": -58.864349365234375, "logps/rejected": -296.828369140625, "loss": 0.7723, "nll_loss": 0.7092090249061584, "rewards/accuracies": 1.0, "rewards/chosen": 1.110761284828186, "rewards/margins": 5.070108890533447, "rewards/rejected": -3.959347724914551, "step": 705 }, { "epoch": 0.11766666666666667, "grad_norm": 79.44143676757812, "learning_rate": 1.959961694516536e-07, "logits/chosen": 2.7263059616088867, "logits/rejected": 2.835773468017578, "logps/chosen": -58.78361511230469, "logps/rejected": -73.86531829833984, "loss": 1.4903, "nll_loss": 1.1756724119186401, "rewards/accuracies": 1.0, "rewards/chosen": 0.09802743047475815, "rewards/margins": 1.5908771753311157, "rewards/rejected": -1.4928497076034546, "step": 706 }, { "epoch": 0.11783333333333333, "grad_norm": 58.08846664428711, "learning_rate": 1.959810341879788e-07, "logits/chosen": 2.157357692718506, "logits/rejected": 2.6815106868743896, "logps/chosen": -65.7442626953125, "logps/rejected": -235.88693237304688, "loss": 1.7946, "nll_loss": 1.7301119565963745, "rewards/accuracies": 1.0, "rewards/chosen": 1.046179175376892, "rewards/margins": 5.241931438446045, "rewards/rejected": -4.195752143859863, "step": 707 }, { "epoch": 0.118, "grad_norm": 69.01889038085938, "learning_rate": 1.9596587095773494e-07, "logits/chosen": 2.6580240726470947, "logits/rejected": 2.787498950958252, "logps/chosen": -31.214862823486328, "logps/rejected": -85.84120178222656, "loss": 1.5404, "nll_loss": 1.3571678400039673, "rewards/accuracies": 1.0, "rewards/chosen": 0.12198295444250107, "rewards/margins": 2.784442663192749, "rewards/rejected": -2.6624596118927, "step": 708 }, { "epoch": 0.11816666666666667, "grad_norm": 62.48591613769531, "learning_rate": 1.959506797653402e-07, "logits/chosen": 1.6326509714126587, "logits/rejected": 2.6034412384033203, "logps/chosen": -51.67730712890625, "logps/rejected": -325.1628723144531, "loss": 1.2096, "nll_loss": 1.0546388626098633, "rewards/accuracies": 1.0, "rewards/chosen": -0.16628913581371307, "rewards/margins": 5.050843238830566, "rewards/rejected": -5.217132568359375, "step": 709 }, { "epoch": 0.11833333333333333, "grad_norm": 87.09532165527344, "learning_rate": 1.959354606152209e-07, "logits/chosen": 2.663668155670166, "logits/rejected": 2.3860743045806885, "logps/chosen": -32.23796463012695, "logps/rejected": -21.085124969482422, "loss": 1.4865, "nll_loss": 1.1513557434082031, "rewards/accuracies": 1.0, "rewards/chosen": 0.9473809003829956, "rewards/margins": 1.428790807723999, "rewards/rejected": -0.48140984773635864, "step": 710 }, { "epoch": 0.1185, "grad_norm": 95.25872802734375, "learning_rate": 1.9592021351181161e-07, "logits/chosen": 2.160428762435913, "logits/rejected": 2.0689847469329834, "logps/chosen": -40.671051025390625, "logps/rejected": -74.65705871582031, "loss": 1.0479, "nll_loss": 0.8300216197967529, "rewards/accuracies": 1.0, "rewards/chosen": 0.8021801114082336, "rewards/margins": 2.068636894226074, "rewards/rejected": -1.2664567232131958, "step": 711 }, { "epoch": 0.11866666666666667, "grad_norm": 39.226806640625, "learning_rate": 1.9590493845955489e-07, "logits/chosen": 2.699012517929077, "logits/rejected": 2.7189199924468994, "logps/chosen": -14.286391258239746, "logps/rejected": -186.861083984375, "loss": 0.4376, "nll_loss": 0.3105736970901489, "rewards/accuracies": 1.0, "rewards/chosen": 1.2510298490524292, "rewards/margins": 2.90887451171875, "rewards/rejected": -1.6578446626663208, "step": 712 }, { "epoch": 0.11883333333333333, "grad_norm": 56.68980407714844, "learning_rate": 1.958896354629016e-07, "logits/chosen": 1.5424904823303223, "logits/rejected": 3.252929210662842, "logps/chosen": -90.61934661865234, "logps/rejected": -471.12835693359375, "loss": 1.05, "nll_loss": 1.006881594657898, "rewards/accuracies": 1.0, "rewards/chosen": 1.6247849464416504, "rewards/margins": 5.292625427246094, "rewards/rejected": -3.6678407192230225, "step": 713 }, { "epoch": 0.119, "grad_norm": 39.98688507080078, "learning_rate": 1.9587430452631058e-07, "logits/chosen": 1.1406372785568237, "logits/rejected": 2.2132153511047363, "logps/chosen": -95.41976928710938, "logps/rejected": -260.813232421875, "loss": 1.1285, "nll_loss": 1.048568844795227, "rewards/accuracies": 1.0, "rewards/chosen": 0.9057625532150269, "rewards/margins": 4.413663864135742, "rewards/rejected": -3.507901191711426, "step": 714 }, { "epoch": 0.11916666666666667, "grad_norm": 67.59465026855469, "learning_rate": 1.9585894565424898e-07, "logits/chosen": 3.24902606010437, "logits/rejected": 3.160057306289673, "logps/chosen": -102.41394805908203, "logps/rejected": -39.15880584716797, "loss": 1.2196, "nll_loss": 0.9226479530334473, "rewards/accuracies": 1.0, "rewards/chosen": 0.8836174011230469, "rewards/margins": 1.5982334613800049, "rewards/rejected": -0.714616060256958, "step": 715 }, { "epoch": 0.11933333333333333, "grad_norm": 34.460296630859375, "learning_rate": 1.9584355885119196e-07, "logits/chosen": 2.45088791847229, "logits/rejected": 2.648477077484131, "logps/chosen": -26.46753692626953, "logps/rejected": -209.59652709960938, "loss": 0.7129, "nll_loss": 0.6786548495292664, "rewards/accuracies": 1.0, "rewards/chosen": 1.6351227760314941, "rewards/margins": 7.631631851196289, "rewards/rejected": -5.996509075164795, "step": 716 }, { "epoch": 0.1195, "grad_norm": 178.94775390625, "learning_rate": 1.9582814412162288e-07, "logits/chosen": 2.3961098194122314, "logits/rejected": 2.8812670707702637, "logps/chosen": -47.3997802734375, "logps/rejected": -324.8383483886719, "loss": 3.3301, "nll_loss": 2.788222074508667, "rewards/accuracies": 1.0, "rewards/chosen": -0.6587605476379395, "rewards/margins": 0.8180797100067139, "rewards/rejected": -1.4768402576446533, "step": 717 }, { "epoch": 0.11966666666666667, "grad_norm": 25.68305206298828, "learning_rate": 1.958127014700332e-07, "logits/chosen": 1.573682188987732, "logits/rejected": 2.9134304523468018, "logps/chosen": -36.942928314208984, "logps/rejected": -200.21731567382812, "loss": 0.6407, "nll_loss": 0.6157154440879822, "rewards/accuracies": 1.0, "rewards/chosen": 2.0086307525634766, "rewards/margins": 7.3465962409973145, "rewards/rejected": -5.337965488433838, "step": 718 }, { "epoch": 0.11983333333333333, "grad_norm": 77.43218994140625, "learning_rate": 1.9579723090092254e-07, "logits/chosen": 1.7220396995544434, "logits/rejected": 2.6222646236419678, "logps/chosen": -21.424251556396484, "logps/rejected": -128.60430908203125, "loss": 0.8843, "nll_loss": 0.549339771270752, "rewards/accuracies": 1.0, "rewards/chosen": 0.13296012580394745, "rewards/margins": 1.466233491897583, "rewards/rejected": -1.3332734107971191, "step": 719 }, { "epoch": 0.12, "grad_norm": 49.58785629272461, "learning_rate": 1.957817324187987e-07, "logits/chosen": 2.3936731815338135, "logits/rejected": 2.2738325595855713, "logps/chosen": -26.278623580932617, "logps/rejected": -58.16779327392578, "loss": 0.7193, "nll_loss": 0.5362985134124756, "rewards/accuracies": 1.0, "rewards/chosen": 1.180777907371521, "rewards/margins": 2.3311986923217773, "rewards/rejected": -1.1504207849502563, "step": 720 }, { "epoch": 0.12016666666666667, "grad_norm": 87.06597137451172, "learning_rate": 1.9576620602817752e-07, "logits/chosen": 2.828827142715454, "logits/rejected": 3.9125466346740723, "logps/chosen": -99.83454895019531, "logps/rejected": -32.6550407409668, "loss": 1.4015, "nll_loss": 1.0620696544647217, "rewards/accuracies": 1.0, "rewards/chosen": 0.48315736651420593, "rewards/margins": 1.3955167531967163, "rewards/rejected": -0.9123594164848328, "step": 721 }, { "epoch": 0.12033333333333333, "grad_norm": 32.95079040527344, "learning_rate": 1.9575065173358304e-07, "logits/chosen": 2.935507297515869, "logits/rejected": 3.0629866123199463, "logps/chosen": -113.64250183105469, "logps/rejected": -256.3868408203125, "loss": 0.9492, "nll_loss": 0.88783198595047, "rewards/accuracies": 1.0, "rewards/chosen": 1.2654221057891846, "rewards/margins": 4.634651184082031, "rewards/rejected": -3.3692290782928467, "step": 722 }, { "epoch": 0.1205, "grad_norm": 34.60264205932617, "learning_rate": 1.957350695395474e-07, "logits/chosen": 1.1707051992416382, "logits/rejected": 1.3407164812088013, "logps/chosen": -42.73780059814453, "logps/rejected": -98.60359191894531, "loss": 0.7446, "nll_loss": 0.6475423574447632, "rewards/accuracies": 1.0, "rewards/chosen": 1.206450343132019, "rewards/margins": 3.417023181915283, "rewards/rejected": -2.2105729579925537, "step": 723 }, { "epoch": 0.12066666666666667, "grad_norm": 68.00506591796875, "learning_rate": 1.9571945945061087e-07, "logits/chosen": 2.5805273056030273, "logits/rejected": 2.3312745094299316, "logps/chosen": -35.92936325073242, "logps/rejected": -36.84382629394531, "loss": 0.9683, "nll_loss": 0.8355667591094971, "rewards/accuracies": 1.0, "rewards/chosen": 1.0267330408096313, "rewards/margins": 2.8720688819885254, "rewards/rejected": -1.845335841178894, "step": 724 }, { "epoch": 0.12083333333333333, "grad_norm": 49.738502502441406, "learning_rate": 1.9570382147132185e-07, "logits/chosen": 2.765507698059082, "logits/rejected": 2.8097615242004395, "logps/chosen": -105.80596923828125, "logps/rejected": -253.89224243164062, "loss": 1.1509, "nll_loss": 1.0373133420944214, "rewards/accuracies": 1.0, "rewards/chosen": 1.2991868257522583, "rewards/margins": 3.0928211212158203, "rewards/rejected": -1.7936341762542725, "step": 725 }, { "epoch": 0.121, "grad_norm": 86.8982925415039, "learning_rate": 1.956881556062369e-07, "logits/chosen": 2.3047354221343994, "logits/rejected": 2.6229584217071533, "logps/chosen": -63.171607971191406, "logps/rejected": -333.445556640625, "loss": 2.2829, "nll_loss": 2.105720043182373, "rewards/accuracies": 1.0, "rewards/chosen": -0.034186553210020065, "rewards/margins": 3.127483367919922, "rewards/rejected": -3.161669969558716, "step": 726 }, { "epoch": 0.12116666666666667, "grad_norm": 115.15472412109375, "learning_rate": 1.9567246185992064e-07, "logits/chosen": 2.1087028980255127, "logits/rejected": 2.518803834915161, "logps/chosen": -19.09510612487793, "logps/rejected": -48.394500732421875, "loss": 1.8629, "nll_loss": 1.3639363050460815, "rewards/accuracies": 1.0, "rewards/chosen": -0.0253753662109375, "rewards/margins": 0.8281235098838806, "rewards/rejected": -0.8534988760948181, "step": 727 }, { "epoch": 0.12133333333333333, "grad_norm": 85.6344985961914, "learning_rate": 1.9565674023694587e-07, "logits/chosen": 2.798529624938965, "logits/rejected": 3.110671281814575, "logps/chosen": -45.06611633300781, "logps/rejected": -283.1528015136719, "loss": 2.2476, "nll_loss": 2.048460006713867, "rewards/accuracies": 1.0, "rewards/chosen": -0.6128708124160767, "rewards/margins": 6.600297927856445, "rewards/rejected": -7.213168621063232, "step": 728 }, { "epoch": 0.1215, "grad_norm": 31.81283950805664, "learning_rate": 1.956409907418935e-07, "logits/chosen": 1.9381215572357178, "logits/rejected": 2.4813232421875, "logps/chosen": -75.6917724609375, "logps/rejected": -215.48231506347656, "loss": 0.9622, "nll_loss": 0.9010926485061646, "rewards/accuracies": 1.0, "rewards/chosen": 1.1667053699493408, "rewards/margins": 4.905400276184082, "rewards/rejected": -3.7386951446533203, "step": 729 }, { "epoch": 0.12166666666666667, "grad_norm": 56.983848571777344, "learning_rate": 1.9562521337935253e-07, "logits/chosen": 2.970113515853882, "logits/rejected": 3.023343563079834, "logps/chosen": -85.86524963378906, "logps/rejected": -396.8670654296875, "loss": 1.9575, "nll_loss": 1.9081166982650757, "rewards/accuracies": 1.0, "rewards/chosen": 1.4151901006698608, "rewards/margins": 5.203010559082031, "rewards/rejected": -3.78782057762146, "step": 730 }, { "epoch": 0.12183333333333334, "grad_norm": 74.92829132080078, "learning_rate": 1.9560940815392014e-07, "logits/chosen": 2.3404624462127686, "logits/rejected": 2.1051511764526367, "logps/chosen": -155.38702392578125, "logps/rejected": -100.78598022460938, "loss": 1.0771, "nll_loss": 0.8399298787117004, "rewards/accuracies": 1.0, "rewards/chosen": 1.2609742879867554, "rewards/margins": 1.967065691947937, "rewards/rejected": -0.7060914039611816, "step": 731 }, { "epoch": 0.122, "grad_norm": 68.12564849853516, "learning_rate": 1.955935750702016e-07, "logits/chosen": 3.136226177215576, "logits/rejected": 3.3011186122894287, "logps/chosen": -119.69326782226562, "logps/rejected": -34.99237823486328, "loss": 2.5783, "nll_loss": 2.4427196979522705, "rewards/accuracies": 1.0, "rewards/chosen": 1.528279185295105, "rewards/margins": 2.814807891845703, "rewards/rejected": -1.2865285873413086, "step": 732 }, { "epoch": 0.12216666666666667, "grad_norm": 79.7674560546875, "learning_rate": 1.9557771413281024e-07, "logits/chosen": 2.0639219284057617, "logits/rejected": 2.327427864074707, "logps/chosen": -59.81327819824219, "logps/rejected": -112.11145782470703, "loss": 1.1769, "nll_loss": 0.8796069025993347, "rewards/accuracies": 1.0, "rewards/chosen": 1.2838013172149658, "rewards/margins": 1.6560783386230469, "rewards/rejected": -0.37227708101272583, "step": 733 }, { "epoch": 0.12233333333333334, "grad_norm": 37.03799819946289, "learning_rate": 1.9556182534636762e-07, "logits/chosen": 3.1101794242858887, "logits/rejected": 3.4206955432891846, "logps/chosen": -36.955440521240234, "logps/rejected": -309.94525146484375, "loss": 0.9163, "nll_loss": 0.8798914551734924, "rewards/accuracies": 1.0, "rewards/chosen": 1.587449312210083, "rewards/margins": 6.740592956542969, "rewards/rejected": -5.153143405914307, "step": 734 }, { "epoch": 0.1225, "grad_norm": 207.8078155517578, "learning_rate": 1.9554590871550327e-07, "logits/chosen": 2.804957389831543, "logits/rejected": 2.7275023460388184, "logps/chosen": -78.82386016845703, "logps/rejected": -41.543312072753906, "loss": 2.2122, "nll_loss": 1.0105623006820679, "rewards/accuracies": 0.0, "rewards/chosen": -0.11600952595472336, "rewards/margins": -0.6403701901435852, "rewards/rejected": 0.5243606567382812, "step": 735 }, { "epoch": 0.12266666666666666, "grad_norm": 126.68962097167969, "learning_rate": 1.9552996424485494e-07, "logits/chosen": 2.9926953315734863, "logits/rejected": 3.0634195804595947, "logps/chosen": -37.291046142578125, "logps/rejected": -26.405454635620117, "loss": 2.363, "nll_loss": 1.9626864194869995, "rewards/accuracies": 1.0, "rewards/chosen": 0.058689117431640625, "rewards/margins": 1.1767137050628662, "rewards/rejected": -1.1180245876312256, "step": 736 }, { "epoch": 0.12283333333333334, "grad_norm": 43.74540710449219, "learning_rate": 1.9551399193906854e-07, "logits/chosen": 2.4228687286376953, "logits/rejected": 2.195789337158203, "logps/chosen": -186.690185546875, "logps/rejected": -161.7968292236328, "loss": 0.7833, "nll_loss": 0.6691405177116394, "rewards/accuracies": 1.0, "rewards/chosen": 0.6468353271484375, "rewards/margins": 3.4977340698242188, "rewards/rejected": -2.8508987426757812, "step": 737 }, { "epoch": 0.123, "grad_norm": 55.8447380065918, "learning_rate": 1.954979918027979e-07, "logits/chosen": 2.9058468341827393, "logits/rejected": 3.0275497436523438, "logps/chosen": -65.79424285888672, "logps/rejected": -145.1661376953125, "loss": 1.012, "nll_loss": 0.8891113996505737, "rewards/accuracies": 1.0, "rewards/chosen": 1.6629829406738281, "rewards/margins": 2.9819908142089844, "rewards/rejected": -1.3190078735351562, "step": 738 }, { "epoch": 0.12316666666666666, "grad_norm": 42.24879837036133, "learning_rate": 1.9548196384070522e-07, "logits/chosen": 3.212768316268921, "logits/rejected": 3.156442403793335, "logps/chosen": -51.15953826904297, "logps/rejected": -118.74147033691406, "loss": 0.8345, "nll_loss": 0.7105491161346436, "rewards/accuracies": 1.0, "rewards/chosen": 1.3939812183380127, "rewards/margins": 2.9402048587799072, "rewards/rejected": -1.5462236404418945, "step": 739 }, { "epoch": 0.12333333333333334, "grad_norm": 57.70600128173828, "learning_rate": 1.9546590805746051e-07, "logits/chosen": 2.5220205783843994, "logits/rejected": 2.591118335723877, "logps/chosen": -44.980587005615234, "logps/rejected": -158.4715576171875, "loss": 1.2759, "nll_loss": 1.1836997270584106, "rewards/accuracies": 1.0, "rewards/chosen": 0.788698673248291, "rewards/margins": 4.0086565017700195, "rewards/rejected": -3.2199578285217285, "step": 740 }, { "epoch": 0.1235, "grad_norm": 34.852203369140625, "learning_rate": 1.9544982445774215e-07, "logits/chosen": 1.8220374584197998, "logits/rejected": 2.585620403289795, "logps/chosen": -67.15728759765625, "logps/rejected": -250.72903442382812, "loss": 1.0143, "nll_loss": 0.9732939004898071, "rewards/accuracies": 1.0, "rewards/chosen": 1.8343292474746704, "rewards/margins": 5.041595458984375, "rewards/rejected": -3.207266330718994, "step": 741 }, { "epoch": 0.12366666666666666, "grad_norm": 34.23876953125, "learning_rate": 1.9543371304623642e-07, "logits/chosen": 2.6406776905059814, "logits/rejected": 2.7478747367858887, "logps/chosen": -70.3019027709961, "logps/rejected": -192.068603515625, "loss": 0.9119, "nll_loss": 0.857340395450592, "rewards/accuracies": 1.0, "rewards/chosen": 1.173804521560669, "rewards/margins": 5.689374923706055, "rewards/rejected": -4.515570163726807, "step": 742 }, { "epoch": 0.12383333333333334, "grad_norm": 128.71417236328125, "learning_rate": 1.954175738276379e-07, "logits/chosen": 2.980475425720215, "logits/rejected": 2.817131996154785, "logps/chosen": -44.24115753173828, "logps/rejected": -64.33116149902344, "loss": 1.3874, "nll_loss": 0.641176164150238, "rewards/accuracies": 1.0, "rewards/chosen": 0.9033646583557129, "rewards/margins": 0.25934839248657227, "rewards/rejected": 0.6440162658691406, "step": 743 }, { "epoch": 0.124, "grad_norm": 33.118831634521484, "learning_rate": 1.9540140680664912e-07, "logits/chosen": 4.227691173553467, "logits/rejected": 4.256126880645752, "logps/chosen": -45.79784393310547, "logps/rejected": -235.62557983398438, "loss": 0.7554, "nll_loss": 0.6939067840576172, "rewards/accuracies": 1.0, "rewards/chosen": 1.0905662775039673, "rewards/margins": 5.157595157623291, "rewards/rejected": -4.067028999328613, "step": 744 }, { "epoch": 0.12416666666666666, "grad_norm": 57.3322868347168, "learning_rate": 1.9538521198798076e-07, "logits/chosen": 2.207109212875366, "logits/rejected": 2.7824230194091797, "logps/chosen": -41.3306999206543, "logps/rejected": -340.7677917480469, "loss": 1.2318, "nll_loss": 1.1170457601547241, "rewards/accuracies": 1.0, "rewards/chosen": 0.4089107811450958, "rewards/margins": 3.994539976119995, "rewards/rejected": -3.5856292247772217, "step": 745 }, { "epoch": 0.12433333333333334, "grad_norm": 45.01563262939453, "learning_rate": 1.953689893763516e-07, "logits/chosen": 3.5610809326171875, "logits/rejected": 3.503323793411255, "logps/chosen": -33.948509216308594, "logps/rejected": -195.43606567382812, "loss": 0.9096, "nll_loss": 0.8487126231193542, "rewards/accuracies": 1.0, "rewards/chosen": 1.0366954803466797, "rewards/margins": 5.588657855987549, "rewards/rejected": -4.551962375640869, "step": 746 }, { "epoch": 0.1245, "grad_norm": 55.63166809082031, "learning_rate": 1.9535273897648853e-07, "logits/chosen": 2.8641438484191895, "logits/rejected": 2.7027270793914795, "logps/chosen": -86.9053955078125, "logps/rejected": -66.25556182861328, "loss": 1.0819, "nll_loss": 0.9550043344497681, "rewards/accuracies": 1.0, "rewards/chosen": 1.3918397426605225, "rewards/margins": 2.9039831161499023, "rewards/rejected": -1.5121434926986694, "step": 747 }, { "epoch": 0.12466666666666666, "grad_norm": 57.85581970214844, "learning_rate": 1.9533646079312652e-07, "logits/chosen": 0.7562713027000427, "logits/rejected": 2.8775718212127686, "logps/chosen": -107.97815704345703, "logps/rejected": -333.21533203125, "loss": 1.7884, "nll_loss": 1.6612026691436768, "rewards/accuracies": 1.0, "rewards/chosen": 0.12274475395679474, "rewards/margins": 4.642886161804199, "rewards/rejected": -4.5201416015625, "step": 748 }, { "epoch": 0.12483333333333334, "grad_norm": 69.24007415771484, "learning_rate": 1.9532015483100865e-07, "logits/chosen": 2.2732605934143066, "logits/rejected": 2.4577996730804443, "logps/chosen": -24.94005012512207, "logps/rejected": -144.18603515625, "loss": 0.786, "nll_loss": 0.5421749949455261, "rewards/accuracies": 1.0, "rewards/chosen": 0.051227763295173645, "rewards/margins": 2.0902087688446045, "rewards/rejected": -2.0389809608459473, "step": 749 }, { "epoch": 0.125, "grad_norm": 285.0784912109375, "learning_rate": 1.9530382109488609e-07, "logits/chosen": 2.582618474960327, "logits/rejected": 2.9599690437316895, "logps/chosen": -117.02783203125, "logps/rejected": -168.78573608398438, "loss": 2.7452, "nll_loss": 1.3767979145050049, "rewards/accuracies": 0.0, "rewards/chosen": -2.358572483062744, "rewards/margins": -0.6599701642990112, "rewards/rejected": -1.698602318763733, "step": 750 }, { "epoch": 0.12516666666666668, "grad_norm": 85.58812713623047, "learning_rate": 1.9528745958951806e-07, "logits/chosen": 2.7689688205718994, "logits/rejected": 2.804339647293091, "logps/chosen": -44.21519470214844, "logps/rejected": -107.99295043945312, "loss": 1.3459, "nll_loss": 1.1950054168701172, "rewards/accuracies": 1.0, "rewards/chosen": 0.47977757453918457, "rewards/margins": 2.885045051574707, "rewards/rejected": -2.4052674770355225, "step": 751 }, { "epoch": 0.12533333333333332, "grad_norm": 250.07406616210938, "learning_rate": 1.9527107031967197e-07, "logits/chosen": 1.0250009298324585, "logits/rejected": 2.810511827468872, "logps/chosen": -70.91849517822266, "logps/rejected": -216.88612365722656, "loss": 2.5388, "nll_loss": 1.6885358095169067, "rewards/accuracies": 0.0, "rewards/chosen": 0.07432708889245987, "rewards/margins": -0.03264312446117401, "rewards/rejected": 0.10697021335363388, "step": 752 }, { "epoch": 0.1255, "grad_norm": 38.85531997680664, "learning_rate": 1.952546532901232e-07, "logits/chosen": 3.0788321495056152, "logits/rejected": 3.102905035018921, "logps/chosen": -41.66408920288086, "logps/rejected": -112.59162902832031, "loss": 0.6973, "nll_loss": 0.595201313495636, "rewards/accuracies": 1.0, "rewards/chosen": 1.3089399337768555, "rewards/margins": 3.274244546890259, "rewards/rejected": -1.9653046131134033, "step": 753 }, { "epoch": 0.12566666666666668, "grad_norm": 155.2676544189453, "learning_rate": 1.9523820850565533e-07, "logits/chosen": 3.4392385482788086, "logits/rejected": 3.4572627544403076, "logps/chosen": -112.18785858154297, "logps/rejected": -111.57310485839844, "loss": 2.5628, "nll_loss": 2.003354787826538, "rewards/accuracies": 1.0, "rewards/chosen": -0.9119720458984375, "rewards/margins": 0.8303375244140625, "rewards/rejected": -1.7423095703125, "step": 754 }, { "epoch": 0.12583333333333332, "grad_norm": 38.37605667114258, "learning_rate": 1.9522173597105995e-07, "logits/chosen": 1.8522993326187134, "logits/rejected": 1.699377179145813, "logps/chosen": -42.50154495239258, "logps/rejected": -84.22142028808594, "loss": 0.6786, "nll_loss": 0.5448916554450989, "rewards/accuracies": 1.0, "rewards/chosen": 0.6681728363037109, "rewards/margins": 3.0205516815185547, "rewards/rejected": -2.3523788452148438, "step": 755 }, { "epoch": 0.126, "grad_norm": 58.06755828857422, "learning_rate": 1.9520523569113676e-07, "logits/chosen": 2.1874449253082275, "logits/rejected": 2.2742953300476074, "logps/chosen": -37.57670593261719, "logps/rejected": -107.20097351074219, "loss": 1.0465, "nll_loss": 0.8946833610534668, "rewards/accuracies": 1.0, "rewards/chosen": 1.395180106163025, "rewards/margins": 2.6324329376220703, "rewards/rejected": -1.2372528314590454, "step": 756 }, { "epoch": 0.12616666666666668, "grad_norm": 39.80746841430664, "learning_rate": 1.9518870767069354e-07, "logits/chosen": 1.9340648651123047, "logits/rejected": 2.1290621757507324, "logps/chosen": -106.97697448730469, "logps/rejected": -355.7149658203125, "loss": 0.9767, "nll_loss": 0.9302344918251038, "rewards/accuracies": 1.0, "rewards/chosen": 1.2617920637130737, "rewards/margins": 6.999487400054932, "rewards/rejected": -5.737695217132568, "step": 757 }, { "epoch": 0.12633333333333333, "grad_norm": 30.14207649230957, "learning_rate": 1.9517215191454617e-07, "logits/chosen": 3.1719913482666016, "logits/rejected": 3.23980712890625, "logps/chosen": -61.49022674560547, "logps/rejected": -149.8299560546875, "loss": 0.883, "nll_loss": 0.8090819716453552, "rewards/accuracies": 1.0, "rewards/chosen": 1.6123466491699219, "rewards/margins": 3.8051750659942627, "rewards/rejected": -2.192828416824341, "step": 758 }, { "epoch": 0.1265, "grad_norm": 107.7535171508789, "learning_rate": 1.951555684275186e-07, "logits/chosen": 2.398561954498291, "logits/rejected": 2.583157777786255, "logps/chosen": -74.6574478149414, "logps/rejected": -136.67361450195312, "loss": 2.8527, "nll_loss": 2.666337728500366, "rewards/accuracies": 1.0, "rewards/chosen": -0.4013938903808594, "rewards/margins": 3.985389232635498, "rewards/rejected": -4.386783123016357, "step": 759 }, { "epoch": 0.12666666666666668, "grad_norm": 49.96342468261719, "learning_rate": 1.9513895721444284e-07, "logits/chosen": 2.962740421295166, "logits/rejected": 3.0281827449798584, "logps/chosen": -56.002044677734375, "logps/rejected": -145.50775146484375, "loss": 0.9199, "nll_loss": 0.767151415348053, "rewards/accuracies": 1.0, "rewards/chosen": 0.6125866174697876, "rewards/margins": 2.754972457885742, "rewards/rejected": -2.142385959625244, "step": 760 }, { "epoch": 0.12683333333333333, "grad_norm": 100.85626220703125, "learning_rate": 1.9512231828015904e-07, "logits/chosen": 1.911260724067688, "logits/rejected": 1.6589970588684082, "logps/chosen": -104.6365737915039, "logps/rejected": -111.88276672363281, "loss": 1.2502, "nll_loss": 0.8943297266960144, "rewards/accuracies": 1.0, "rewards/chosen": 0.9881263971328735, "rewards/margins": 1.3559197187423706, "rewards/rejected": -0.3677932918071747, "step": 761 }, { "epoch": 0.127, "grad_norm": 27.74998664855957, "learning_rate": 1.9510565162951537e-07, "logits/chosen": 2.322695016860962, "logits/rejected": 2.650299549102783, "logps/chosen": -95.50729370117188, "logps/rejected": -490.4749450683594, "loss": 1.0102, "nll_loss": 0.9948676228523254, "rewards/accuracies": 1.0, "rewards/chosen": 2.7193679809570312, "rewards/margins": 6.9521074295043945, "rewards/rejected": -4.232739448547363, "step": 762 }, { "epoch": 0.12716666666666668, "grad_norm": 111.69232940673828, "learning_rate": 1.9508895726736805e-07, "logits/chosen": 3.1117353439331055, "logits/rejected": 3.0476653575897217, "logps/chosen": -16.290834426879883, "logps/rejected": -37.20144271850586, "loss": 1.4371, "nll_loss": 0.4402928352355957, "rewards/accuracies": 0.0, "rewards/chosen": 0.9132711887359619, "rewards/margins": -0.19971370697021484, "rewards/rejected": 1.1129848957061768, "step": 763 }, { "epoch": 0.12733333333333333, "grad_norm": 48.24898910522461, "learning_rate": 1.9507223519858144e-07, "logits/chosen": 2.98283314704895, "logits/rejected": 3.2691380977630615, "logps/chosen": -34.540096282958984, "logps/rejected": -90.5579605102539, "loss": 0.8034, "nll_loss": 0.67725670337677, "rewards/accuracies": 1.0, "rewards/chosen": 0.7960842847824097, "rewards/margins": 3.0594091415405273, "rewards/rejected": -2.263324737548828, "step": 764 }, { "epoch": 0.1275, "grad_norm": 90.47776794433594, "learning_rate": 1.9505548542802802e-07, "logits/chosen": 2.6605372428894043, "logits/rejected": 2.5941171646118164, "logps/chosen": -46.10110092163086, "logps/rejected": -30.807125091552734, "loss": 1.8786, "nll_loss": 1.7074480056762695, "rewards/accuracies": 1.0, "rewards/chosen": 1.41386878490448, "rewards/margins": 2.4616518020629883, "rewards/rejected": -1.0477828979492188, "step": 765 }, { "epoch": 0.12766666666666668, "grad_norm": 53.65630340576172, "learning_rate": 1.9503870796058817e-07, "logits/chosen": 2.724006414413452, "logits/rejected": 2.684735059738159, "logps/chosen": -18.968658447265625, "logps/rejected": -36.06466293334961, "loss": 0.8508, "nll_loss": 0.72956383228302, "rewards/accuracies": 1.0, "rewards/chosen": 1.3181896209716797, "rewards/margins": 2.9772636890411377, "rewards/rejected": -1.659074068069458, "step": 766 }, { "epoch": 0.12783333333333333, "grad_norm": 257.0987548828125, "learning_rate": 1.9502190280115047e-07, "logits/chosen": 2.1190221309661865, "logits/rejected": 2.257235527038574, "logps/chosen": -62.570579528808594, "logps/rejected": -70.00403594970703, "loss": 2.9334, "nll_loss": 0.8232972621917725, "rewards/accuracies": 0.0, "rewards/chosen": 1.2325851917266846, "rewards/margins": -1.6429648399353027, "rewards/rejected": 2.8755500316619873, "step": 767 }, { "epoch": 0.128, "grad_norm": 62.14600372314453, "learning_rate": 1.9500506995461156e-07, "logits/chosen": 2.6969454288482666, "logits/rejected": 2.800635814666748, "logps/chosen": -41.50356674194336, "logps/rejected": -73.92092895507812, "loss": 1.388, "nll_loss": 1.2576839923858643, "rewards/accuracies": 1.0, "rewards/chosen": 1.3621968030929565, "rewards/margins": 2.8621022701263428, "rewards/rejected": -1.4999054670333862, "step": 768 }, { "epoch": 0.12816666666666668, "grad_norm": 67.66310119628906, "learning_rate": 1.9498820942587614e-07, "logits/chosen": 2.908273458480835, "logits/rejected": 2.8707845211029053, "logps/chosen": -37.355709075927734, "logps/rejected": -32.55696105957031, "loss": 1.0351, "nll_loss": 0.7948022484779358, "rewards/accuracies": 1.0, "rewards/chosen": 1.196832299232483, "rewards/margins": 1.941890001296997, "rewards/rejected": -0.7450577020645142, "step": 769 }, { "epoch": 0.12833333333333333, "grad_norm": 113.30207061767578, "learning_rate": 1.9497132121985694e-07, "logits/chosen": 4.887753486633301, "logits/rejected": 4.923312187194824, "logps/chosen": -77.69972229003906, "logps/rejected": -64.66644287109375, "loss": 1.6432, "nll_loss": 0.9475575685501099, "rewards/accuracies": 1.0, "rewards/chosen": 1.588234782218933, "rewards/margins": 0.5222816467285156, "rewards/rejected": 1.0659531354904175, "step": 770 }, { "epoch": 0.1285, "grad_norm": 26.87917137145996, "learning_rate": 1.9495440534147477e-07, "logits/chosen": 2.242781639099121, "logits/rejected": 2.56561279296875, "logps/chosen": -108.27780151367188, "logps/rejected": -261.3929443359375, "loss": 0.7709, "nll_loss": 0.7076979875564575, "rewards/accuracies": 1.0, "rewards/chosen": 1.475144386291504, "rewards/margins": 4.202380180358887, "rewards/rejected": -2.7272355556488037, "step": 771 }, { "epoch": 0.12866666666666668, "grad_norm": 62.27250289916992, "learning_rate": 1.9493746179565852e-07, "logits/chosen": 2.3050079345703125, "logits/rejected": 2.0593886375427246, "logps/chosen": -19.188640594482422, "logps/rejected": -30.31855583190918, "loss": 0.9247, "nll_loss": 0.5996449589729309, "rewards/accuracies": 1.0, "rewards/chosen": 0.753064751625061, "rewards/margins": 1.4607425928115845, "rewards/rejected": -0.7076778411865234, "step": 772 }, { "epoch": 0.12883333333333333, "grad_norm": 114.35568237304688, "learning_rate": 1.9492049058734512e-07, "logits/chosen": 3.018078327178955, "logits/rejected": 3.0092573165893555, "logps/chosen": -47.050262451171875, "logps/rejected": -48.16339111328125, "loss": 1.7497, "nll_loss": 1.1475673913955688, "rewards/accuracies": 1.0, "rewards/chosen": 0.1362968534231186, "rewards/margins": 0.5193912386894226, "rewards/rejected": -0.3830944001674652, "step": 773 }, { "epoch": 0.129, "grad_norm": 64.17587280273438, "learning_rate": 1.9490349172147963e-07, "logits/chosen": 1.7036843299865723, "logits/rejected": 2.052279233932495, "logps/chosen": -53.40577697753906, "logps/rejected": -141.01495361328125, "loss": 1.1717, "nll_loss": 1.0681155920028687, "rewards/accuracies": 1.0, "rewards/chosen": 0.7538742423057556, "rewards/margins": 3.619744300842285, "rewards/rejected": -2.8658699989318848, "step": 774 }, { "epoch": 0.12916666666666668, "grad_norm": 59.51513671875, "learning_rate": 1.94886465203015e-07, "logits/chosen": 2.6773200035095215, "logits/rejected": 2.786562204360962, "logps/chosen": -28.804080963134766, "logps/rejected": -92.80828094482422, "loss": 1.3755, "nll_loss": 1.2523514032363892, "rewards/accuracies": 1.0, "rewards/chosen": 0.36306115984916687, "rewards/margins": 3.722228765487671, "rewards/rejected": -3.3591675758361816, "step": 775 }, { "epoch": 0.12933333333333333, "grad_norm": 93.9781723022461, "learning_rate": 1.9486941103691246e-07, "logits/chosen": 3.0297369956970215, "logits/rejected": 3.1067142486572266, "logps/chosen": -59.356468200683594, "logps/rejected": -179.1953582763672, "loss": 2.3963, "nll_loss": 2.2829408645629883, "rewards/accuracies": 1.0, "rewards/chosen": 0.35541534423828125, "rewards/margins": 4.20648193359375, "rewards/rejected": -3.8510665893554688, "step": 776 }, { "epoch": 0.1295, "grad_norm": 35.382606506347656, "learning_rate": 1.9485232922814116e-07, "logits/chosen": 2.5673599243164062, "logits/rejected": 2.896362781524658, "logps/chosen": -111.91587829589844, "logps/rejected": -384.9849548339844, "loss": 1.0954, "nll_loss": 1.026751160621643, "rewards/accuracies": 1.0, "rewards/chosen": 1.1471108198165894, "rewards/margins": 4.360081195831299, "rewards/rejected": -3.21297025680542, "step": 777 }, { "epoch": 0.12966666666666668, "grad_norm": 61.66047286987305, "learning_rate": 1.9483521978167828e-07, "logits/chosen": 2.448777198791504, "logits/rejected": 2.524543523788452, "logps/chosen": -67.8018798828125, "logps/rejected": -109.7366943359375, "loss": 1.2123, "nll_loss": 1.0273011922836304, "rewards/accuracies": 1.0, "rewards/chosen": 0.6740218997001648, "rewards/margins": 2.3566582202911377, "rewards/rejected": -1.6826362609863281, "step": 778 }, { "epoch": 0.12983333333333333, "grad_norm": 37.86282730102539, "learning_rate": 1.948180827025091e-07, "logits/chosen": 2.841663122177124, "logits/rejected": 2.8032066822052, "logps/chosen": -14.904903411865234, "logps/rejected": -86.40115356445312, "loss": 0.549, "nll_loss": 0.43837955594062805, "rewards/accuracies": 1.0, "rewards/chosen": 1.4830856323242188, "rewards/margins": 3.1237449645996094, "rewards/rejected": -1.6406594514846802, "step": 779 }, { "epoch": 0.13, "grad_norm": 80.37630462646484, "learning_rate": 1.9480091799562703e-07, "logits/chosen": 2.6709861755371094, "logits/rejected": 2.530437469482422, "logps/chosen": -45.1212272644043, "logps/rejected": -139.26278686523438, "loss": 0.9977, "nll_loss": 0.8057363629341125, "rewards/accuracies": 1.0, "rewards/chosen": 1.4995425939559937, "rewards/margins": 2.3184313774108887, "rewards/rejected": -0.8188888430595398, "step": 780 }, { "epoch": 0.13016666666666668, "grad_norm": 75.96260833740234, "learning_rate": 1.947837256660334e-07, "logits/chosen": 2.083517551422119, "logits/rejected": 2.1598479747772217, "logps/chosen": -82.1605453491211, "logps/rejected": -113.48823547363281, "loss": 1.2675, "nll_loss": 1.0270068645477295, "rewards/accuracies": 1.0, "rewards/chosen": 0.9197776913642883, "rewards/margins": 1.9129799604415894, "rewards/rejected": -0.993202269077301, "step": 781 }, { "epoch": 0.13033333333333333, "grad_norm": 66.54057312011719, "learning_rate": 1.9476650571873765e-07, "logits/chosen": 2.3261520862579346, "logits/rejected": 2.112438678741455, "logps/chosen": -51.84087371826172, "logps/rejected": -25.034883499145508, "loss": 1.0954, "nll_loss": 0.7301531434059143, "rewards/accuracies": 1.0, "rewards/chosen": 0.8684784173965454, "rewards/margins": 1.3026313781738281, "rewards/rejected": -0.4341530203819275, "step": 782 }, { "epoch": 0.1305, "grad_norm": 158.78170776367188, "learning_rate": 1.9474925815875727e-07, "logits/chosen": 3.1021196842193604, "logits/rejected": 2.9283342361450195, "logps/chosen": -90.7071304321289, "logps/rejected": -35.530975341796875, "loss": 2.0475, "nll_loss": 0.9649696350097656, "rewards/accuracies": 0.0, "rewards/chosen": 0.43792957067489624, "rewards/margins": -0.408208429813385, "rewards/rejected": 0.8461380004882812, "step": 783 }, { "epoch": 0.13066666666666665, "grad_norm": 59.3419075012207, "learning_rate": 1.9473198299111777e-07, "logits/chosen": 1.9822978973388672, "logits/rejected": 2.337106943130493, "logps/chosen": -85.54232025146484, "logps/rejected": -214.05003356933594, "loss": 1.389, "nll_loss": 1.2960957288742065, "rewards/accuracies": 1.0, "rewards/chosen": 0.477451354265213, "rewards/margins": 5.378993988037109, "rewards/rejected": -4.901542663574219, "step": 784 }, { "epoch": 0.13083333333333333, "grad_norm": 32.47792434692383, "learning_rate": 1.947146802208527e-07, "logits/chosen": 2.8309006690979004, "logits/rejected": 2.75573468208313, "logps/chosen": -144.98394775390625, "logps/rejected": -64.8744125366211, "loss": 0.8409, "nll_loss": 0.7630733847618103, "rewards/accuracies": 1.0, "rewards/chosen": 1.6310561895370483, "rewards/margins": 3.7082977294921875, "rewards/rejected": -2.0772416591644287, "step": 785 }, { "epoch": 0.131, "grad_norm": 90.82164764404297, "learning_rate": 1.946973498530037e-07, "logits/chosen": 1.9194341897964478, "logits/rejected": 1.7202599048614502, "logps/chosen": -85.4022216796875, "logps/rejected": -48.4498291015625, "loss": 1.4032, "nll_loss": 1.0047321319580078, "rewards/accuracies": 1.0, "rewards/chosen": 0.20877611637115479, "rewards/margins": 1.1613869667053223, "rewards/rejected": -0.9526108503341675, "step": 786 }, { "epoch": 0.13116666666666665, "grad_norm": 176.67019653320312, "learning_rate": 1.9467999189262043e-07, "logits/chosen": 1.0689384937286377, "logits/rejected": 2.790313959121704, "logps/chosen": -69.2262954711914, "logps/rejected": -222.84091186523438, "loss": 2.1731, "nll_loss": 1.6482449769973755, "rewards/accuracies": 1.0, "rewards/chosen": 0.24354708194732666, "rewards/margins": 0.7320556640625, "rewards/rejected": -0.4885086119174957, "step": 787 }, { "epoch": 0.13133333333333333, "grad_norm": 20.491546630859375, "learning_rate": 1.946626063447606e-07, "logits/chosen": 2.345266103744507, "logits/rejected": 2.550828456878662, "logps/chosen": -37.4339599609375, "logps/rejected": -148.9090118408203, "loss": 0.4765, "nll_loss": 0.45651164650917053, "rewards/accuracies": 1.0, "rewards/chosen": 2.2181496620178223, "rewards/margins": 7.489657878875732, "rewards/rejected": -5.27150821685791, "step": 788 }, { "epoch": 0.1315, "grad_norm": 27.87025260925293, "learning_rate": 1.9464519321448986e-07, "logits/chosen": 2.186779499053955, "logits/rejected": 2.780639410018921, "logps/chosen": -73.90364074707031, "logps/rejected": -145.97129821777344, "loss": 0.8502, "nll_loss": 0.7862088680267334, "rewards/accuracies": 1.0, "rewards/chosen": 2.457991123199463, "rewards/margins": 4.171536445617676, "rewards/rejected": -1.7135452032089233, "step": 789 }, { "epoch": 0.13166666666666665, "grad_norm": 37.93517303466797, "learning_rate": 1.9462775250688205e-07, "logits/chosen": 2.99314546585083, "logits/rejected": 2.9498422145843506, "logps/chosen": -36.9061393737793, "logps/rejected": -129.60650634765625, "loss": 0.8873, "nll_loss": 0.8201364874839783, "rewards/accuracies": 1.0, "rewards/chosen": 1.5028934478759766, "rewards/margins": 4.033806800842285, "rewards/rejected": -2.5309131145477295, "step": 790 }, { "epoch": 0.13183333333333333, "grad_norm": 79.6259994506836, "learning_rate": 1.9461028422701898e-07, "logits/chosen": 3.119626522064209, "logits/rejected": 2.9801137447357178, "logps/chosen": -53.767974853515625, "logps/rejected": -32.075504302978516, "loss": 1.5762, "nll_loss": 1.168868899345398, "rewards/accuracies": 1.0, "rewards/chosen": 1.2865440845489502, "rewards/margins": 1.2259712219238281, "rewards/rejected": 0.06057281419634819, "step": 791 }, { "epoch": 0.132, "grad_norm": 69.49896240234375, "learning_rate": 1.9459278837999044e-07, "logits/chosen": 3.1708571910858154, "logits/rejected": 3.1306047439575195, "logps/chosen": -55.44553756713867, "logps/rejected": -125.78190612792969, "loss": 1.2475, "nll_loss": 1.046142339706421, "rewards/accuracies": 1.0, "rewards/chosen": 0.2338821440935135, "rewards/margins": 2.3843910694122314, "rewards/rejected": -2.1505088806152344, "step": 792 }, { "epoch": 0.13216666666666665, "grad_norm": 91.34194946289062, "learning_rate": 1.9457526497089435e-07, "logits/chosen": 2.5906801223754883, "logits/rejected": 2.2566092014312744, "logps/chosen": -64.4002685546875, "logps/rejected": -61.959999084472656, "loss": 1.1874, "nll_loss": 0.9200037121772766, "rewards/accuracies": 1.0, "rewards/chosen": 0.4597206115722656, "rewards/margins": 1.7724900245666504, "rewards/rejected": -1.3127694129943848, "step": 793 }, { "epoch": 0.13233333333333333, "grad_norm": 49.083404541015625, "learning_rate": 1.9455771400483654e-07, "logits/chosen": 2.023775577545166, "logits/rejected": 2.3538260459899902, "logps/chosen": -64.22769165039062, "logps/rejected": -152.70831298828125, "loss": 0.9627, "nll_loss": 0.8563691973686218, "rewards/accuracies": 1.0, "rewards/chosen": 1.544893741607666, "rewards/margins": 3.190246105194092, "rewards/rejected": -1.6453522443771362, "step": 794 }, { "epoch": 0.1325, "grad_norm": 93.34910583496094, "learning_rate": 1.94540135486931e-07, "logits/chosen": 1.9973835945129395, "logits/rejected": 2.8489737510681152, "logps/chosen": -13.940225601196289, "logps/rejected": -177.50350952148438, "loss": 1.3601, "nll_loss": 0.9957303404808044, "rewards/accuracies": 1.0, "rewards/chosen": -0.22448551654815674, "rewards/margins": 1.4056872129440308, "rewards/rejected": -1.6301727294921875, "step": 795 }, { "epoch": 0.13266666666666665, "grad_norm": 94.52047729492188, "learning_rate": 1.9452252942229967e-07, "logits/chosen": 0.7604236006736755, "logits/rejected": 3.6062285900115967, "logps/chosen": -23.82342529296875, "logps/rejected": -331.6452941894531, "loss": 1.0714, "nll_loss": 0.9529369473457336, "rewards/accuracies": 1.0, "rewards/chosen": 0.794818103313446, "rewards/margins": 3.2004547119140625, "rewards/rejected": -2.4056365489959717, "step": 796 }, { "epoch": 0.13283333333333333, "grad_norm": 114.47265625, "learning_rate": 1.945048958160725e-07, "logits/chosen": 2.134498119354248, "logits/rejected": 2.6877782344818115, "logps/chosen": -33.79677963256836, "logps/rejected": -215.18116760253906, "loss": 2.5377, "nll_loss": 2.414055585861206, "rewards/accuracies": 1.0, "rewards/chosen": 0.10705529153347015, "rewards/margins": 4.9849348068237305, "rewards/rejected": -4.877879619598389, "step": 797 }, { "epoch": 0.133, "grad_norm": 627.42724609375, "learning_rate": 1.944872346733876e-07, "logits/chosen": 2.8953027725219727, "logits/rejected": 3.1238412857055664, "logps/chosen": -187.47482299804688, "logps/rejected": -159.10061645507812, "loss": 3.4335, "nll_loss": 1.241555094718933, "rewards/accuracies": 0.0, "rewards/chosen": -4.285986423492432, "rewards/margins": -1.4751007556915283, "rewards/rejected": -2.8108856678009033, "step": 798 }, { "epoch": 0.13316666666666666, "grad_norm": 50.234710693359375, "learning_rate": 1.9446954599939092e-07, "logits/chosen": 1.8678079843521118, "logits/rejected": 2.038673162460327, "logps/chosen": -71.00910949707031, "logps/rejected": -137.65121459960938, "loss": 0.9497, "nll_loss": 0.8354012966156006, "rewards/accuracies": 1.0, "rewards/chosen": 0.48845523595809937, "rewards/margins": 3.704211473464966, "rewards/rejected": -3.2157561779022217, "step": 799 }, { "epoch": 0.13333333333333333, "grad_norm": 53.366695404052734, "learning_rate": 1.9445182979923653e-07, "logits/chosen": 2.460494041442871, "logits/rejected": 2.5443837642669678, "logps/chosen": -103.82748413085938, "logps/rejected": -139.3462677001953, "loss": 1.5142, "nll_loss": 1.4420483112335205, "rewards/accuracies": 1.0, "rewards/chosen": 0.989105224609375, "rewards/margins": 4.4751176834106445, "rewards/rejected": -3.4860122203826904, "step": 800 }, { "epoch": 0.1335, "grad_norm": 54.49576950073242, "learning_rate": 1.9443408607808649e-07, "logits/chosen": 3.712445020675659, "logits/rejected": 3.6781017780303955, "logps/chosen": -174.91400146484375, "logps/rejected": -149.3397216796875, "loss": 1.1992, "nll_loss": 1.0289058685302734, "rewards/accuracies": 1.0, "rewards/chosen": -0.3320404291152954, "rewards/margins": 4.435439586639404, "rewards/rejected": -4.76747989654541, "step": 801 }, { "epoch": 0.13366666666666666, "grad_norm": 40.87813949584961, "learning_rate": 1.9441631484111093e-07, "logits/chosen": 3.071214437484741, "logits/rejected": 3.1520986557006836, "logps/chosen": -43.74760437011719, "logps/rejected": -216.36471557617188, "loss": 0.8217, "nll_loss": 0.7171739339828491, "rewards/accuracies": 1.0, "rewards/chosen": 0.8101001977920532, "rewards/margins": 3.4932608604431152, "rewards/rejected": -2.6831605434417725, "step": 802 }, { "epoch": 0.13383333333333333, "grad_norm": 140.50425720214844, "learning_rate": 1.943985160934879e-07, "logits/chosen": 2.3547730445861816, "logits/rejected": 2.355648994445801, "logps/chosen": -125.47970581054688, "logps/rejected": -188.30816650390625, "loss": 1.605, "nll_loss": 1.1104398965835571, "rewards/accuracies": 1.0, "rewards/chosen": -0.01850128173828125, "rewards/margins": 0.8360794186592102, "rewards/rejected": -0.8545807003974915, "step": 803 }, { "epoch": 0.134, "grad_norm": 59.580482482910156, "learning_rate": 1.9438068984040363e-07, "logits/chosen": 1.0869457721710205, "logits/rejected": 3.1612257957458496, "logps/chosen": -86.28825378417969, "logps/rejected": -362.7419128417969, "loss": 1.0218, "nll_loss": 0.8988359570503235, "rewards/accuracies": 1.0, "rewards/chosen": 1.1895004510879517, "rewards/margins": 2.962336778640747, "rewards/rejected": -1.7728363275527954, "step": 804 }, { "epoch": 0.13416666666666666, "grad_norm": 198.2568359375, "learning_rate": 1.943628360870522e-07, "logits/chosen": 3.8218491077423096, "logits/rejected": 3.8145627975463867, "logps/chosen": -50.326744079589844, "logps/rejected": -45.48955535888672, "loss": 2.4385, "nll_loss": 0.7088274955749512, "rewards/accuracies": 0.0, "rewards/chosen": 1.213372826576233, "rewards/margins": -1.184672474861145, "rewards/rejected": 2.398045301437378, "step": 805 }, { "epoch": 0.13433333333333333, "grad_norm": 49.04609680175781, "learning_rate": 1.9434495483863573e-07, "logits/chosen": 2.109097719192505, "logits/rejected": 2.8511226177215576, "logps/chosen": -81.08830261230469, "logps/rejected": -830.6884155273438, "loss": 1.1561, "nll_loss": 1.0395936965942383, "rewards/accuracies": 1.0, "rewards/chosen": 0.239369198679924, "rewards/margins": 4.4866414070129395, "rewards/rejected": -4.24727201461792, "step": 806 }, { "epoch": 0.1345, "grad_norm": 193.36666870117188, "learning_rate": 1.9432704610036446e-07, "logits/chosen": 2.7986538410186768, "logits/rejected": 2.6432957649230957, "logps/chosen": -73.03831481933594, "logps/rejected": -26.478424072265625, "loss": 2.432, "nll_loss": 1.014420986175537, "rewards/accuracies": 0.0, "rewards/chosen": 0.12299346923828125, "rewards/margins": -0.9326015710830688, "rewards/rejected": 1.05559504032135, "step": 807 }, { "epoch": 0.13466666666666666, "grad_norm": 43.83087158203125, "learning_rate": 1.943091098774565e-07, "logits/chosen": 1.9369269609451294, "logits/rejected": 1.3925584554672241, "logps/chosen": -93.84434509277344, "logps/rejected": -31.162233352661133, "loss": 0.9162, "nll_loss": 0.7568092942237854, "rewards/accuracies": 1.0, "rewards/chosen": 1.2098610401153564, "rewards/margins": 2.545746326446533, "rewards/rejected": -1.3358854055404663, "step": 808 }, { "epoch": 0.13483333333333333, "grad_norm": 68.89736938476562, "learning_rate": 1.9429114617513812e-07, "logits/chosen": 1.4242196083068848, "logits/rejected": 2.5430221557617188, "logps/chosen": -59.903404235839844, "logps/rejected": -169.23297119140625, "loss": 2.4043, "nll_loss": 2.218644380569458, "rewards/accuracies": 1.0, "rewards/chosen": 0.018779754638671875, "rewards/margins": 2.777799606323242, "rewards/rejected": -2.7590198516845703, "step": 809 }, { "epoch": 0.135, "grad_norm": 37.008975982666016, "learning_rate": 1.942731549986434e-07, "logits/chosen": 1.2103348970413208, "logits/rejected": 2.9584054946899414, "logps/chosen": -37.859397888183594, "logps/rejected": -489.68841552734375, "loss": 0.88, "nll_loss": 0.8413199782371521, "rewards/accuracies": 1.0, "rewards/chosen": 1.4670286178588867, "rewards/margins": 6.68149471282959, "rewards/rejected": -5.214466094970703, "step": 810 }, { "epoch": 0.13516666666666666, "grad_norm": 39.19898986816406, "learning_rate": 1.9425513635321465e-07, "logits/chosen": 2.68296217918396, "logits/rejected": 3.0134549140930176, "logps/chosen": -34.85658645629883, "logps/rejected": -288.3824157714844, "loss": 0.6911, "nll_loss": 0.5622029304504395, "rewards/accuracies": 1.0, "rewards/chosen": 1.0664924383163452, "rewards/margins": 2.8998513221740723, "rewards/rejected": -1.833358883857727, "step": 811 }, { "epoch": 0.13533333333333333, "grad_norm": 31.239643096923828, "learning_rate": 1.9423709024410195e-07, "logits/chosen": 2.385369300842285, "logits/rejected": 2.3936219215393066, "logps/chosen": -86.12417602539062, "logps/rejected": -124.56512451171875, "loss": 0.8372, "nll_loss": 0.7901301383972168, "rewards/accuracies": 1.0, "rewards/chosen": 1.3560723066329956, "rewards/margins": 5.5067524909973145, "rewards/rejected": -4.150680065155029, "step": 812 }, { "epoch": 0.1355, "grad_norm": 58.868804931640625, "learning_rate": 1.9421901667656363e-07, "logits/chosen": 1.6041805744171143, "logits/rejected": 2.2764089107513428, "logps/chosen": -87.06784057617188, "logps/rejected": -211.65882873535156, "loss": 1.8467, "nll_loss": 1.813913345336914, "rewards/accuracies": 1.0, "rewards/chosen": 1.641017198562622, "rewards/margins": 6.968774795532227, "rewards/rejected": -5.327757835388184, "step": 813 }, { "epoch": 0.13566666666666666, "grad_norm": 162.31884765625, "learning_rate": 1.942009156558658e-07, "logits/chosen": 2.642611026763916, "logits/rejected": 2.7393457889556885, "logps/chosen": -101.6343765258789, "logps/rejected": -89.4151382446289, "loss": 2.4015, "nll_loss": 1.7226165533065796, "rewards/accuracies": 1.0, "rewards/chosen": -1.6788827180862427, "rewards/margins": 0.7329689264297485, "rewards/rejected": -2.411851644515991, "step": 814 }, { "epoch": 0.13583333333333333, "grad_norm": 74.96925354003906, "learning_rate": 1.9418278718728272e-07, "logits/chosen": 2.5393080711364746, "logits/rejected": 2.6635963916778564, "logps/chosen": -17.678918838500977, "logps/rejected": -90.16339111328125, "loss": 0.9765, "nll_loss": 0.6799584031105042, "rewards/accuracies": 1.0, "rewards/chosen": 0.811793327331543, "rewards/margins": 1.6006124019622803, "rewards/rejected": -0.7888191342353821, "step": 815 }, { "epoch": 0.136, "grad_norm": 153.45516967773438, "learning_rate": 1.9416463127609653e-07, "logits/chosen": 2.963009834289551, "logits/rejected": 2.9746646881103516, "logps/chosen": -104.61042022705078, "logps/rejected": -93.7176284790039, "loss": 1.7156, "nll_loss": 1.2307108640670776, "rewards/accuracies": 1.0, "rewards/chosen": 0.42145538330078125, "rewards/margins": 0.856208086013794, "rewards/rejected": -0.4347526729106903, "step": 816 }, { "epoch": 0.13616666666666666, "grad_norm": 43.57034683227539, "learning_rate": 1.9414644792759748e-07, "logits/chosen": 1.9352443218231201, "logits/rejected": 2.5160067081451416, "logps/chosen": -59.04070281982422, "logps/rejected": -149.80958557128906, "loss": 0.9679, "nll_loss": 0.8556623458862305, "rewards/accuracies": 1.0, "rewards/chosen": 0.5975700616836548, "rewards/margins": 3.5414161682128906, "rewards/rejected": -2.9438462257385254, "step": 817 }, { "epoch": 0.13633333333333333, "grad_norm": 53.46684646606445, "learning_rate": 1.9412823714708377e-07, "logits/chosen": 1.9780607223510742, "logits/rejected": 2.536531448364258, "logps/chosen": -38.694602966308594, "logps/rejected": -112.85182189941406, "loss": 1.041, "nll_loss": 0.8998743891716003, "rewards/accuracies": 1.0, "rewards/chosen": 1.823248267173767, "rewards/margins": 2.844226121902466, "rewards/rejected": -1.0209778547286987, "step": 818 }, { "epoch": 0.1365, "grad_norm": 91.9419937133789, "learning_rate": 1.9410999893986154e-07, "logits/chosen": 2.434880495071411, "logits/rejected": 2.4123120307922363, "logps/chosen": -22.982540130615234, "logps/rejected": -34.744930267333984, "loss": 1.6226, "nll_loss": 1.2768079042434692, "rewards/accuracies": 1.0, "rewards/chosen": 0.3536476194858551, "rewards/margins": 1.3704185485839844, "rewards/rejected": -1.0167709589004517, "step": 819 }, { "epoch": 0.13666666666666666, "grad_norm": 74.97418212890625, "learning_rate": 1.9409173331124498e-07, "logits/chosen": 2.711360454559326, "logits/rejected": 2.244093179702759, "logps/chosen": -46.540687561035156, "logps/rejected": -21.994125366210938, "loss": 1.8235, "nll_loss": 1.604851245880127, "rewards/accuracies": 1.0, "rewards/chosen": 1.2131489515304565, "rewards/margins": 2.084139823913574, "rewards/rejected": -0.8709908723831177, "step": 820 }, { "epoch": 0.13683333333333333, "grad_norm": 28.098649978637695, "learning_rate": 1.9407344026655627e-07, "logits/chosen": 2.7768845558166504, "logits/rejected": 2.9643537998199463, "logps/chosen": -25.79187774658203, "logps/rejected": -171.56857299804688, "loss": 0.5544, "nll_loss": 0.4959976077079773, "rewards/accuracies": 1.0, "rewards/chosen": 1.2157398462295532, "rewards/margins": 4.798232078552246, "rewards/rejected": -3.5824921131134033, "step": 821 }, { "epoch": 0.137, "grad_norm": 46.06572341918945, "learning_rate": 1.940551198111255e-07, "logits/chosen": 2.2218291759490967, "logits/rejected": 2.286914348602295, "logps/chosen": -14.733388900756836, "logps/rejected": -38.217288970947266, "loss": 0.7283, "nll_loss": 0.6138911843299866, "rewards/accuracies": 1.0, "rewards/chosen": 0.9879751205444336, "rewards/margins": 3.1476855278015137, "rewards/rejected": -2.15971040725708, "step": 822 }, { "epoch": 0.13716666666666666, "grad_norm": 118.1383056640625, "learning_rate": 1.9403677195029094e-07, "logits/chosen": 1.7345929145812988, "logits/rejected": 2.4828412532806396, "logps/chosen": -27.039390563964844, "logps/rejected": -170.6180419921875, "loss": 0.6351, "nll_loss": 0.6008752584457397, "rewards/accuracies": 1.0, "rewards/chosen": 1.8341622352600098, "rewards/margins": 5.575785160064697, "rewards/rejected": -3.7416229248046875, "step": 823 }, { "epoch": 0.13733333333333334, "grad_norm": 34.926849365234375, "learning_rate": 1.940183966893986e-07, "logits/chosen": 1.9815737009048462, "logits/rejected": 2.6879422664642334, "logps/chosen": -10.205791473388672, "logps/rejected": -108.47713470458984, "loss": 0.4552, "nll_loss": 0.3189309537410736, "rewards/accuracies": 1.0, "rewards/chosen": 0.9495209455490112, "rewards/margins": 2.8261940479278564, "rewards/rejected": -1.8766731023788452, "step": 824 }, { "epoch": 0.1375, "grad_norm": 80.34368896484375, "learning_rate": 1.9399999403380262e-07, "logits/chosen": 2.8973541259765625, "logits/rejected": 2.9242820739746094, "logps/chosen": -50.24308776855469, "logps/rejected": -106.79418182373047, "loss": 1.0924, "nll_loss": 0.7850483059883118, "rewards/accuracies": 1.0, "rewards/chosen": 0.9415470957756042, "rewards/margins": 1.5616154670715332, "rewards/rejected": -0.620068371295929, "step": 825 }, { "epoch": 0.13766666666666666, "grad_norm": 147.19215393066406, "learning_rate": 1.9398156398886512e-07, "logits/chosen": 1.524454116821289, "logits/rejected": 1.3445559740066528, "logps/chosen": -77.95238494873047, "logps/rejected": -32.5175895690918, "loss": 1.9248, "nll_loss": 1.0123686790466309, "rewards/accuracies": 0.0, "rewards/chosen": 1.0516808032989502, "rewards/margins": -0.022291898727416992, "rewards/rejected": 1.0739727020263672, "step": 826 }, { "epoch": 0.13783333333333334, "grad_norm": 174.5767822265625, "learning_rate": 1.9396310655995616e-07, "logits/chosen": 2.367107629776001, "logits/rejected": 2.4062891006469727, "logps/chosen": -81.1633071899414, "logps/rejected": -21.84956932067871, "loss": 2.4118, "nll_loss": 1.3527218103408813, "rewards/accuracies": 0.0, "rewards/chosen": 0.6398003101348877, "rewards/margins": -0.3405408263206482, "rewards/rejected": 0.9803411364555359, "step": 827 }, { "epoch": 0.138, "grad_norm": 54.87144470214844, "learning_rate": 1.9394462175245379e-07, "logits/chosen": 2.5166356563568115, "logits/rejected": 2.6156961917877197, "logps/chosen": -52.496280670166016, "logps/rejected": -164.1553497314453, "loss": 0.8582, "nll_loss": 0.709409236907959, "rewards/accuracies": 1.0, "rewards/chosen": 0.7653102874755859, "rewards/margins": 2.720731735229492, "rewards/rejected": -1.9554214477539062, "step": 828 }, { "epoch": 0.13816666666666666, "grad_norm": 102.53290557861328, "learning_rate": 1.9392610957174402e-07, "logits/chosen": 3.4076120853424072, "logits/rejected": 3.472885847091675, "logps/chosen": -36.41432571411133, "logps/rejected": -9.916378021240234, "loss": 2.3834, "nll_loss": 1.6551967859268188, "rewards/accuracies": 1.0, "rewards/chosen": 0.841364324092865, "rewards/margins": 0.2953157424926758, "rewards/rejected": 0.5460485816001892, "step": 829 }, { "epoch": 0.13833333333333334, "grad_norm": 26.57851219177246, "learning_rate": 1.939075700232209e-07, "logits/chosen": 2.238297462463379, "logits/rejected": 2.2311277389526367, "logps/chosen": -149.02972412109375, "logps/rejected": -186.30543518066406, "loss": 0.8625, "nll_loss": 0.8188446164131165, "rewards/accuracies": 1.0, "rewards/chosen": 2.2445709705352783, "rewards/margins": 4.702676773071289, "rewards/rejected": -2.4581055641174316, "step": 830 }, { "epoch": 0.1385, "grad_norm": 30.07573699951172, "learning_rate": 1.9388900311228635e-07, "logits/chosen": 1.951352834701538, "logits/rejected": 2.008800983428955, "logps/chosen": -63.72543716430664, "logps/rejected": -40.97800827026367, "loss": 0.6235, "nll_loss": 0.5310452580451965, "rewards/accuracies": 1.0, "rewards/chosen": 1.2307270765304565, "rewards/margins": 3.476245403289795, "rewards/rejected": -2.245518207550049, "step": 831 }, { "epoch": 0.13866666666666666, "grad_norm": 73.2099838256836, "learning_rate": 1.9387040884435034e-07, "logits/chosen": 1.4937189817428589, "logits/rejected": 1.534436821937561, "logps/chosen": -53.21989059448242, "logps/rejected": -39.100807189941406, "loss": 1.6201, "nll_loss": 1.40052330493927, "rewards/accuracies": 1.0, "rewards/chosen": -0.006576538551598787, "rewards/margins": 2.3387584686279297, "rewards/rejected": -2.345335006713867, "step": 832 }, { "epoch": 0.13883333333333334, "grad_norm": 66.8720703125, "learning_rate": 1.9385178722483084e-07, "logits/chosen": 2.9436697959899902, "logits/rejected": 3.101608991622925, "logps/chosen": -91.38910675048828, "logps/rejected": -248.5679473876953, "loss": 1.7434, "nll_loss": 1.6319481134414673, "rewards/accuracies": 1.0, "rewards/chosen": 0.5468398928642273, "rewards/margins": 3.650583028793335, "rewards/rejected": -3.103743076324463, "step": 833 }, { "epoch": 0.139, "grad_norm": 38.68128204345703, "learning_rate": 1.9383313825915368e-07, "logits/chosen": 2.900454044342041, "logits/rejected": 2.9734115600585938, "logps/chosen": -73.88069152832031, "logps/rejected": -132.623046875, "loss": 0.9464, "nll_loss": 0.820896565914154, "rewards/accuracies": 1.0, "rewards/chosen": 0.7546936869621277, "rewards/margins": 3.0776751041412354, "rewards/rejected": -2.322981357574463, "step": 834 }, { "epoch": 0.13916666666666666, "grad_norm": 117.96598815917969, "learning_rate": 1.9381446195275277e-07, "logits/chosen": 2.761685848236084, "logits/rejected": 2.7530112266540527, "logps/chosen": -66.03484344482422, "logps/rejected": -108.12950897216797, "loss": 1.3922, "nll_loss": 0.8923628330230713, "rewards/accuracies": 1.0, "rewards/chosen": 0.17957687377929688, "rewards/margins": 0.8077698349952698, "rewards/rejected": -0.6281929612159729, "step": 835 }, { "epoch": 0.13933333333333334, "grad_norm": 39.410888671875, "learning_rate": 1.9379575831106993e-07, "logits/chosen": 3.140123128890991, "logits/rejected": 3.022742509841919, "logps/chosen": -86.21487426757812, "logps/rejected": -28.166162490844727, "loss": 1.0629, "nll_loss": 0.9371181726455688, "rewards/accuracies": 1.0, "rewards/chosen": 1.6620880365371704, "rewards/margins": 2.9611105918884277, "rewards/rejected": -1.2990225553512573, "step": 836 }, { "epoch": 0.1395, "grad_norm": 45.34018325805664, "learning_rate": 1.937770273395549e-07, "logits/chosen": 2.7395005226135254, "logits/rejected": 2.950392484664917, "logps/chosen": -51.65168762207031, "logps/rejected": -79.18269348144531, "loss": 0.9987, "nll_loss": 0.8754522800445557, "rewards/accuracies": 1.0, "rewards/chosen": 0.9913890957832336, "rewards/margins": 2.9988033771514893, "rewards/rejected": -2.0074143409729004, "step": 837 }, { "epoch": 0.13966666666666666, "grad_norm": 36.343231201171875, "learning_rate": 1.9375826904366553e-07, "logits/chosen": 2.804802417755127, "logits/rejected": 2.9428551197052, "logps/chosen": -40.971778869628906, "logps/rejected": -41.62166976928711, "loss": 0.7586, "nll_loss": 0.6025261878967285, "rewards/accuracies": 1.0, "rewards/chosen": 1.2648259401321411, "rewards/margins": 2.581991672515869, "rewards/rejected": -1.3171656131744385, "step": 838 }, { "epoch": 0.13983333333333334, "grad_norm": 53.88981246948242, "learning_rate": 1.9373948342886744e-07, "logits/chosen": 2.9452614784240723, "logits/rejected": 3.0211141109466553, "logps/chosen": -52.41952896118164, "logps/rejected": -99.24342346191406, "loss": 1.5176, "nll_loss": 1.4167442321777344, "rewards/accuracies": 1.0, "rewards/chosen": 0.8785068392753601, "rewards/margins": 3.503694534301758, "rewards/rejected": -2.625187635421753, "step": 839 }, { "epoch": 0.14, "grad_norm": 48.070438385009766, "learning_rate": 1.9372067050063436e-07, "logits/chosen": 1.1617164611816406, "logits/rejected": 1.96815824508667, "logps/chosen": -75.67073822021484, "logps/rejected": -274.6498107910156, "loss": 1.1053, "nll_loss": 0.9956675171852112, "rewards/accuracies": 1.0, "rewards/chosen": 0.5708702206611633, "rewards/margins": 3.661269426345825, "rewards/rejected": -3.0903992652893066, "step": 840 }, { "epoch": 0.14016666666666666, "grad_norm": 55.712100982666016, "learning_rate": 1.937018302644479e-07, "logits/chosen": 3.1331570148468018, "logits/rejected": 3.271599054336548, "logps/chosen": -60.43173599243164, "logps/rejected": -160.11367797851562, "loss": 1.1309, "nll_loss": 1.0242667198181152, "rewards/accuracies": 1.0, "rewards/chosen": 0.7691219449043274, "rewards/margins": 3.4658565521240234, "rewards/rejected": -2.696734666824341, "step": 841 }, { "epoch": 0.14033333333333334, "grad_norm": 51.88179397583008, "learning_rate": 1.936829627257977e-07, "logits/chosen": 2.4717860221862793, "logits/rejected": 2.8216495513916016, "logps/chosen": -29.047788619995117, "logps/rejected": -462.0076904296875, "loss": 1.1218, "nll_loss": 1.0374209880828857, "rewards/accuracies": 1.0, "rewards/chosen": 1.4510096311569214, "rewards/margins": 3.583822250366211, "rewards/rejected": -2.1328125, "step": 842 }, { "epoch": 0.1405, "grad_norm": 78.60468292236328, "learning_rate": 1.9366406789018124e-07, "logits/chosen": 3.2160274982452393, "logits/rejected": 3.467496633529663, "logps/chosen": -30.37533187866211, "logps/rejected": -531.9591674804688, "loss": 1.4933, "nll_loss": 1.3806968927383423, "rewards/accuracies": 1.0, "rewards/chosen": 0.3491542935371399, "rewards/margins": 4.115084648132324, "rewards/rejected": -3.76593017578125, "step": 843 }, { "epoch": 0.14066666666666666, "grad_norm": 92.0506362915039, "learning_rate": 1.9364514576310406e-07, "logits/chosen": 4.592746257781982, "logits/rejected": 4.640986442565918, "logps/chosen": -58.04045104980469, "logps/rejected": -69.44947052001953, "loss": 1.4052, "nll_loss": 1.2897881269454956, "rewards/accuracies": 1.0, "rewards/chosen": 1.4450409412384033, "rewards/margins": 3.056804656982422, "rewards/rejected": -1.611763834953308, "step": 844 }, { "epoch": 0.14083333333333334, "grad_norm": 54.54743957519531, "learning_rate": 1.9362619635007963e-07, "logits/chosen": 2.6295864582061768, "logits/rejected": 2.727853536605835, "logps/chosen": -92.1379623413086, "logps/rejected": -178.524169921875, "loss": 0.995, "nll_loss": 0.8153802752494812, "rewards/accuracies": 1.0, "rewards/chosen": 1.885927677154541, "rewards/margins": 2.5439858436584473, "rewards/rejected": -0.6580581665039062, "step": 845 }, { "epoch": 0.141, "grad_norm": 31.451807022094727, "learning_rate": 1.936072196566293e-07, "logits/chosen": 1.8165117502212524, "logits/rejected": 1.8842014074325562, "logps/chosen": -206.93377685546875, "logps/rejected": -147.81515502929688, "loss": 1.117, "nll_loss": 1.0557847023010254, "rewards/accuracies": 1.0, "rewards/chosen": 1.1669052839279175, "rewards/margins": 4.693855285644531, "rewards/rejected": -3.5269501209259033, "step": 846 }, { "epoch": 0.14116666666666666, "grad_norm": 35.9335823059082, "learning_rate": 1.935882156882825e-07, "logits/chosen": 3.546168327331543, "logits/rejected": 3.6299757957458496, "logps/chosen": -58.211761474609375, "logps/rejected": -197.31520080566406, "loss": 1.0051, "nll_loss": 0.9239962697029114, "rewards/accuracies": 1.0, "rewards/chosen": 0.6323379874229431, "rewards/margins": 5.261247634887695, "rewards/rejected": -4.628909587860107, "step": 847 }, { "epoch": 0.14133333333333334, "grad_norm": 108.75224304199219, "learning_rate": 1.9356918445057646e-07, "logits/chosen": 2.088618755340576, "logits/rejected": 2.552356481552124, "logps/chosen": -48.82377243041992, "logps/rejected": -216.5547332763672, "loss": 1.2924, "nll_loss": 1.0388036966323853, "rewards/accuracies": 1.0, "rewards/chosen": 0.23951759934425354, "rewards/margins": 1.9054187536239624, "rewards/rejected": -1.6659011840820312, "step": 848 }, { "epoch": 0.1415, "grad_norm": 54.23193359375, "learning_rate": 1.9355012594905644e-07, "logits/chosen": 2.2613213062286377, "logits/rejected": 2.9000141620635986, "logps/chosen": -13.706625938415527, "logps/rejected": -107.68541717529297, "loss": 0.6002, "nll_loss": 0.507652759552002, "rewards/accuracies": 1.0, "rewards/chosen": 0.9586936235427856, "rewards/margins": 3.6447906494140625, "rewards/rejected": -2.6860971450805664, "step": 849 }, { "epoch": 0.14166666666666666, "grad_norm": 54.3088493347168, "learning_rate": 1.9353104018927565e-07, "logits/chosen": 1.8391679525375366, "logits/rejected": 1.7359541654586792, "logps/chosen": -19.913162231445312, "logps/rejected": -36.49081039428711, "loss": 0.735, "nll_loss": 0.4630967974662781, "rewards/accuracies": 1.0, "rewards/chosen": 0.36775168776512146, "rewards/margins": 1.7537193298339844, "rewards/rejected": -1.3859676122665405, "step": 850 }, { "epoch": 0.14183333333333334, "grad_norm": 28.28121566772461, "learning_rate": 1.9351192717679522e-07, "logits/chosen": 1.167639970779419, "logits/rejected": 1.9669851064682007, "logps/chosen": -74.22318267822266, "logps/rejected": -201.80096435546875, "loss": 0.925, "nll_loss": 0.8942552208900452, "rewards/accuracies": 1.0, "rewards/chosen": 1.7311677932739258, "rewards/margins": 6.644388198852539, "rewards/rejected": -4.913220405578613, "step": 851 }, { "epoch": 0.142, "grad_norm": 159.95204162597656, "learning_rate": 1.9349278691718426e-07, "logits/chosen": 2.308133840560913, "logits/rejected": 2.3641467094421387, "logps/chosen": -27.701473236083984, "logps/rejected": -48.900657653808594, "loss": 2.9663, "nll_loss": 2.7701475620269775, "rewards/accuracies": 1.0, "rewards/chosen": -0.04216728359460831, "rewards/margins": 2.6575260162353516, "rewards/rejected": -2.699693202972412, "step": 852 }, { "epoch": 0.14216666666666666, "grad_norm": 28.94318389892578, "learning_rate": 1.9347361941601967e-07, "logits/chosen": 2.9594311714172363, "logits/rejected": 2.834878921508789, "logps/chosen": -101.45237731933594, "logps/rejected": -84.69546508789062, "loss": 0.7625, "nll_loss": 0.6948793530464172, "rewards/accuracies": 1.0, "rewards/chosen": 1.8525116443634033, "rewards/margins": 3.941704273223877, "rewards/rejected": -2.0891926288604736, "step": 853 }, { "epoch": 0.14233333333333334, "grad_norm": 140.822998046875, "learning_rate": 1.9345442467888651e-07, "logits/chosen": 2.6159472465515137, "logits/rejected": 2.734757661819458, "logps/chosen": -103.75483703613281, "logps/rejected": -27.33927345275879, "loss": 2.4048, "nll_loss": 1.4613356590270996, "rewards/accuracies": 1.0, "rewards/chosen": 1.7300294637680054, "rewards/margins": 0.08092498779296875, "rewards/rejected": 1.6491044759750366, "step": 854 }, { "epoch": 0.1425, "grad_norm": 44.99922180175781, "learning_rate": 1.934352027113776e-07, "logits/chosen": 2.2307851314544678, "logits/rejected": 2.698507308959961, "logps/chosen": -88.31298828125, "logps/rejected": -525.8665161132812, "loss": 1.1588, "nll_loss": 1.0640116930007935, "rewards/accuracies": 1.0, "rewards/chosen": 0.3332367241382599, "rewards/margins": 9.646438598632812, "rewards/rejected": -9.313201904296875, "step": 855 }, { "epoch": 0.14266666666666666, "grad_norm": 27.865903854370117, "learning_rate": 1.9341595351909382e-07, "logits/chosen": 2.3426990509033203, "logits/rejected": 3.438854455947876, "logps/chosen": -64.86758422851562, "logps/rejected": -231.90081787109375, "loss": 0.8247, "nll_loss": 0.8008342981338501, "rewards/accuracies": 1.0, "rewards/chosen": 2.421588182449341, "rewards/margins": 5.899810791015625, "rewards/rejected": -3.4782228469848633, "step": 856 }, { "epoch": 0.14283333333333334, "grad_norm": 43.80131149291992, "learning_rate": 1.9339667710764387e-07, "logits/chosen": 2.809286117553711, "logits/rejected": 2.9693617820739746, "logps/chosen": -39.93403625488281, "logps/rejected": -117.14999389648438, "loss": 0.7294, "nll_loss": 0.5872653722763062, "rewards/accuracies": 1.0, "rewards/chosen": 0.8465862274169922, "rewards/margins": 2.7726757526397705, "rewards/rejected": -1.9260895252227783, "step": 857 }, { "epoch": 0.143, "grad_norm": 57.179508209228516, "learning_rate": 1.9337737348264446e-07, "logits/chosen": 2.8087430000305176, "logits/rejected": 2.7096409797668457, "logps/chosen": -19.558664321899414, "logps/rejected": -81.51902770996094, "loss": 0.6422, "nll_loss": 0.4161418080329895, "rewards/accuracies": 1.0, "rewards/chosen": 0.6882624626159668, "rewards/margins": 2.01106595993042, "rewards/rejected": -1.3228034973144531, "step": 858 }, { "epoch": 0.14316666666666666, "grad_norm": 82.65800476074219, "learning_rate": 1.9335804264972017e-07, "logits/chosen": 1.022181749343872, "logits/rejected": 1.6099108457565308, "logps/chosen": -79.89018249511719, "logps/rejected": -277.92913818359375, "loss": 2.8571, "nll_loss": 2.754833936691284, "rewards/accuracies": 1.0, "rewards/chosen": 1.3012253046035767, "rewards/margins": 3.2631640434265137, "rewards/rejected": -1.9619386196136475, "step": 859 }, { "epoch": 0.14333333333333334, "grad_norm": 37.705604553222656, "learning_rate": 1.9333868461450358e-07, "logits/chosen": 3.1834919452667236, "logits/rejected": 3.2919981479644775, "logps/chosen": -22.88802719116211, "logps/rejected": -133.6888427734375, "loss": 0.647, "nll_loss": 0.6185954809188843, "rewards/accuracies": 1.0, "rewards/chosen": 2.060939073562622, "rewards/margins": 5.802656650543213, "rewards/rejected": -3.741717576980591, "step": 860 }, { "epoch": 0.1435, "grad_norm": 28.28911018371582, "learning_rate": 1.9331929938263513e-07, "logits/chosen": 2.258747100830078, "logits/rejected": 2.4532291889190674, "logps/chosen": -64.55319213867188, "logps/rejected": -210.0051727294922, "loss": 0.8891, "nll_loss": 0.8607093095779419, "rewards/accuracies": 1.0, "rewards/chosen": 1.7587776184082031, "rewards/margins": 7.392984867095947, "rewards/rejected": -5.634207248687744, "step": 861 }, { "epoch": 0.14366666666666666, "grad_norm": 111.966064453125, "learning_rate": 1.932998869597632e-07, "logits/chosen": 1.8255583047866821, "logits/rejected": 1.7352168560028076, "logps/chosen": -79.63044738769531, "logps/rejected": -58.621246337890625, "loss": 1.5899, "nll_loss": 1.061739206314087, "rewards/accuracies": 1.0, "rewards/chosen": 1.4122451543807983, "rewards/margins": 0.9012954235076904, "rewards/rejected": 0.5109497308731079, "step": 862 }, { "epoch": 0.14383333333333334, "grad_norm": 48.920654296875, "learning_rate": 1.932804473515442e-07, "logits/chosen": 1.3184705972671509, "logits/rejected": 2.0975120067596436, "logps/chosen": -41.27919006347656, "logps/rejected": -199.11105346679688, "loss": 0.9626, "nll_loss": 0.8255838751792908, "rewards/accuracies": 1.0, "rewards/chosen": 1.7160921096801758, "rewards/margins": 2.858328342437744, "rewards/rejected": -1.142236351966858, "step": 863 }, { "epoch": 0.144, "grad_norm": 52.10555648803711, "learning_rate": 1.9326098056364223e-07, "logits/chosen": 3.0004384517669678, "logits/rejected": 2.9622981548309326, "logps/chosen": -29.848623275756836, "logps/rejected": -56.33694076538086, "loss": 0.8234, "nll_loss": 0.6488832235336304, "rewards/accuracies": 1.0, "rewards/chosen": 1.0022557973861694, "rewards/margins": 2.402493476867676, "rewards/rejected": -1.4002376794815063, "step": 864 }, { "epoch": 0.14416666666666667, "grad_norm": 215.34567260742188, "learning_rate": 1.9324148660172952e-07, "logits/chosen": 1.1521916389465332, "logits/rejected": 2.315528392791748, "logps/chosen": -77.69978332519531, "logps/rejected": -282.900146484375, "loss": 2.7835, "nll_loss": 1.7659039497375488, "rewards/accuracies": 1.0, "rewards/chosen": -2.498802661895752, "rewards/margins": 0.09661984443664551, "rewards/rejected": -2.5954225063323975, "step": 865 }, { "epoch": 0.14433333333333334, "grad_norm": 62.638065338134766, "learning_rate": 1.9322196547148613e-07, "logits/chosen": 3.0803823471069336, "logits/rejected": 3.2352488040924072, "logps/chosen": -40.625099182128906, "logps/rejected": -160.8894500732422, "loss": 1.6055, "nll_loss": 1.5625038146972656, "rewards/accuracies": 1.0, "rewards/chosen": 1.5761440992355347, "rewards/margins": 5.172611713409424, "rewards/rejected": -3.5964677333831787, "step": 866 }, { "epoch": 0.1445, "grad_norm": 26.96259307861328, "learning_rate": 1.9320241717860004e-07, "logits/chosen": 2.1231353282928467, "logits/rejected": 2.2626447677612305, "logps/chosen": -57.59638595581055, "logps/rejected": -239.5122833251953, "loss": 0.7518, "nll_loss": 0.7290682792663574, "rewards/accuracies": 1.0, "rewards/chosen": 2.1297314167022705, "rewards/margins": 6.612069129943848, "rewards/rejected": -4.482337951660156, "step": 867 }, { "epoch": 0.14466666666666667, "grad_norm": 35.711708068847656, "learning_rate": 1.9318284172876719e-07, "logits/chosen": 2.830336570739746, "logits/rejected": 2.9018287658691406, "logps/chosen": -32.200408935546875, "logps/rejected": -101.28897857666016, "loss": 0.5651, "nll_loss": 0.473535418510437, "rewards/accuracies": 1.0, "rewards/chosen": 0.7643380165100098, "rewards/margins": 3.9187991619110107, "rewards/rejected": -3.154461145401001, "step": 868 }, { "epoch": 0.14483333333333334, "grad_norm": 69.65438842773438, "learning_rate": 1.931632391276913e-07, "logits/chosen": 1.5949333906173706, "logits/rejected": 1.4220749139785767, "logps/chosen": -51.57673645019531, "logps/rejected": -38.96164321899414, "loss": 1.5509, "nll_loss": 1.357282280921936, "rewards/accuracies": 1.0, "rewards/chosen": 0.15773889422416687, "rewards/margins": 2.4891576766967773, "rewards/rejected": -2.331418752670288, "step": 869 }, { "epoch": 0.145, "grad_norm": 66.47762298583984, "learning_rate": 1.9314360938108424e-07, "logits/chosen": 2.510237216949463, "logits/rejected": 2.493928909301758, "logps/chosen": -122.40743255615234, "logps/rejected": -138.41366577148438, "loss": 1.0394, "nll_loss": 0.8160495162010193, "rewards/accuracies": 1.0, "rewards/chosen": 0.8669136762619019, "rewards/margins": 2.0242416858673096, "rewards/rejected": -1.1573280096054077, "step": 870 }, { "epoch": 0.14516666666666667, "grad_norm": 75.220703125, "learning_rate": 1.931239524946655e-07, "logits/chosen": 1.4851738214492798, "logits/rejected": 2.3619160652160645, "logps/chosen": -243.7685089111328, "logps/rejected": -543.9088134765625, "loss": 1.398, "nll_loss": 1.1391053199768066, "rewards/accuracies": 1.0, "rewards/chosen": -1.1296554803848267, "rewards/margins": 5.451777935028076, "rewards/rejected": -6.581433296203613, "step": 871 }, { "epoch": 0.14533333333333334, "grad_norm": 39.58372116088867, "learning_rate": 1.9310426847416274e-07, "logits/chosen": 2.108128070831299, "logits/rejected": 3.0655558109283447, "logps/chosen": -68.49615478515625, "logps/rejected": -45.12657928466797, "loss": 0.9205, "nll_loss": 0.8252546787261963, "rewards/accuracies": 1.0, "rewards/chosen": 1.7845786809921265, "rewards/margins": 3.3995957374572754, "rewards/rejected": -1.6150171756744385, "step": 872 }, { "epoch": 0.1455, "grad_norm": 199.7241668701172, "learning_rate": 1.9308455732531136e-07, "logits/chosen": 2.628021240234375, "logits/rejected": 3.069669723510742, "logps/chosen": -9.108118057250977, "logps/rejected": -37.12855911254883, "loss": 1.1058, "nll_loss": 0.3036039471626282, "rewards/accuracies": 1.0, "rewards/chosen": -0.42380648851394653, "rewards/margins": 0.06776458024978638, "rewards/rejected": -0.4915710687637329, "step": 873 }, { "epoch": 0.14566666666666667, "grad_norm": 114.25769805908203, "learning_rate": 1.9306481905385472e-07, "logits/chosen": 2.727583408355713, "logits/rejected": 2.761021375656128, "logps/chosen": -29.995332717895508, "logps/rejected": -20.377361297607422, "loss": 1.5394, "nll_loss": 0.7141745090484619, "rewards/accuracies": 1.0, "rewards/chosen": 0.2728685438632965, "rewards/margins": 0.03227043151855469, "rewards/rejected": 0.24059811234474182, "step": 874 }, { "epoch": 0.14583333333333334, "grad_norm": 65.57100677490234, "learning_rate": 1.930450536655441e-07, "logits/chosen": 1.548802375793457, "logits/rejected": 3.0467820167541504, "logps/chosen": -62.62100601196289, "logps/rejected": -49.61408233642578, "loss": 2.2529, "nll_loss": 2.1593453884124756, "rewards/accuracies": 1.0, "rewards/chosen": 1.4532849788665771, "rewards/margins": 3.398519515991211, "rewards/rejected": -1.9452346563339233, "step": 875 }, { "epoch": 0.146, "grad_norm": 33.39441680908203, "learning_rate": 1.9302526116613862e-07, "logits/chosen": 1.8608176708221436, "logits/rejected": 2.4604992866516113, "logps/chosen": -105.11590576171875, "logps/rejected": -143.3682861328125, "loss": 0.889, "nll_loss": 0.8276843428611755, "rewards/accuracies": 1.0, "rewards/chosen": 1.2719833850860596, "rewards/margins": 4.437941074371338, "rewards/rejected": -3.1659576892852783, "step": 876 }, { "epoch": 0.14616666666666667, "grad_norm": 73.77005004882812, "learning_rate": 1.9300544156140535e-07, "logits/chosen": 2.0616095066070557, "logits/rejected": 1.569871187210083, "logps/chosen": -64.58305358886719, "logps/rejected": -25.726133346557617, "loss": 1.1153, "nll_loss": 0.7509658336639404, "rewards/accuracies": 1.0, "rewards/chosen": 0.22902755439281464, "rewards/margins": 1.2958171367645264, "rewards/rejected": -1.0667896270751953, "step": 877 }, { "epoch": 0.14633333333333334, "grad_norm": 139.33958435058594, "learning_rate": 1.9298559485711924e-07, "logits/chosen": 2.6023058891296387, "logits/rejected": 2.6400227546691895, "logps/chosen": -95.70376586914062, "logps/rejected": -59.511783599853516, "loss": 1.6597, "nll_loss": 1.0402582883834839, "rewards/accuracies": 1.0, "rewards/chosen": -0.01549987867474556, "rewards/margins": 0.47564470767974854, "rewards/rejected": -0.491144597530365, "step": 878 }, { "epoch": 0.1465, "grad_norm": 44.48097610473633, "learning_rate": 1.929657210590632e-07, "logits/chosen": 2.6948232650756836, "logits/rejected": 2.647944450378418, "logps/chosen": -62.73974609375, "logps/rejected": -74.5736312866211, "loss": 0.764, "nll_loss": 0.6337347030639648, "rewards/accuracies": 1.0, "rewards/chosen": 1.2619651556015015, "rewards/margins": 2.861691951751709, "rewards/rejected": -1.599726915359497, "step": 879 }, { "epoch": 0.14666666666666667, "grad_norm": 30.6712646484375, "learning_rate": 1.9294582017302794e-07, "logits/chosen": 0.6653977632522583, "logits/rejected": 2.152104139328003, "logps/chosen": -53.033119201660156, "logps/rejected": -406.769775390625, "loss": 0.7884, "nll_loss": 0.7469453811645508, "rewards/accuracies": 1.0, "rewards/chosen": 1.3401817083358765, "rewards/margins": 6.711770534515381, "rewards/rejected": -5.371588706970215, "step": 880 }, { "epoch": 0.14683333333333334, "grad_norm": 34.86634063720703, "learning_rate": 1.9292589220481209e-07, "logits/chosen": 2.8416316509246826, "logits/rejected": 2.8169052600860596, "logps/chosen": -72.05223083496094, "logps/rejected": -50.01399612426758, "loss": 1.0476, "nll_loss": 1.000725269317627, "rewards/accuracies": 1.0, "rewards/chosen": 1.5118119716644287, "rewards/margins": 4.947150707244873, "rewards/rejected": -3.4353387355804443, "step": 881 }, { "epoch": 0.147, "grad_norm": 333.2150573730469, "learning_rate": 1.9290593716022214e-07, "logits/chosen": 2.9157209396362305, "logits/rejected": 2.7859513759613037, "logps/chosen": -229.52581787109375, "logps/rejected": -90.89231872558594, "loss": 2.3212, "nll_loss": 0.9808794856071472, "rewards/accuracies": 0.0, "rewards/chosen": 0.06633605808019638, "rewards/margins": -0.8258339762687683, "rewards/rejected": 0.8921700119972229, "step": 882 }, { "epoch": 0.14716666666666667, "grad_norm": 25.559574127197266, "learning_rate": 1.928859550450726e-07, "logits/chosen": 2.325585126876831, "logits/rejected": 2.30830979347229, "logps/chosen": -172.046142578125, "logps/rejected": -147.92323303222656, "loss": 0.7525, "nll_loss": 0.6827227473258972, "rewards/accuracies": 1.0, "rewards/chosen": 1.4071564674377441, "rewards/margins": 3.9681544303894043, "rewards/rejected": -2.56099796295166, "step": 883 }, { "epoch": 0.14733333333333334, "grad_norm": 41.71619415283203, "learning_rate": 1.9286594586518575e-07, "logits/chosen": 2.129622459411621, "logits/rejected": 1.9327441453933716, "logps/chosen": -29.80219078063965, "logps/rejected": -22.916770935058594, "loss": 0.7247, "nll_loss": 0.6082078814506531, "rewards/accuracies": 1.0, "rewards/chosen": 1.4405603408813477, "rewards/margins": 3.0449633598327637, "rewards/rejected": -1.604403018951416, "step": 884 }, { "epoch": 0.1475, "grad_norm": 241.04664611816406, "learning_rate": 1.9284590962639176e-07, "logits/chosen": 2.930720090866089, "logits/rejected": 2.910407543182373, "logps/chosen": -59.276763916015625, "logps/rejected": -23.579256057739258, "loss": 3.6317, "nll_loss": 2.371070623397827, "rewards/accuracies": 0.0, "rewards/chosen": -1.4974384307861328, "rewards/margins": -0.6690477132797241, "rewards/rejected": -0.8283907175064087, "step": 885 }, { "epoch": 0.14766666666666667, "grad_norm": 93.05884552001953, "learning_rate": 1.928258463345287e-07, "logits/chosen": 2.8714022636413574, "logits/rejected": 2.88643217086792, "logps/chosen": -52.21889114379883, "logps/rejected": -38.98759841918945, "loss": 2.3483, "nll_loss": 2.0887556076049805, "rewards/accuracies": 1.0, "rewards/chosen": -0.10099678486585617, "rewards/margins": 1.9978506565093994, "rewards/rejected": -2.0988473892211914, "step": 886 }, { "epoch": 0.14783333333333334, "grad_norm": 127.62291717529297, "learning_rate": 1.928057559954426e-07, "logits/chosen": 2.943608045578003, "logits/rejected": 2.8438220024108887, "logps/chosen": -42.507652282714844, "logps/rejected": -48.629276275634766, "loss": 2.2054, "nll_loss": 1.574357509613037, "rewards/accuracies": 1.0, "rewards/chosen": -0.4028259217739105, "rewards/margins": 0.47071996331214905, "rewards/rejected": -0.8735458850860596, "step": 887 }, { "epoch": 0.148, "grad_norm": 39.343910217285156, "learning_rate": 1.927856386149872e-07, "logits/chosen": 3.338428020477295, "logits/rejected": 3.4494850635528564, "logps/chosen": -22.68454360961914, "logps/rejected": -112.78886413574219, "loss": 0.5695, "nll_loss": 0.44479501247406006, "rewards/accuracies": 1.0, "rewards/chosen": 1.440277099609375, "rewards/margins": 2.940962314605713, "rewards/rejected": -1.5006850957870483, "step": 888 }, { "epoch": 0.14816666666666667, "grad_norm": 98.47403717041016, "learning_rate": 1.9276549419902435e-07, "logits/chosen": 2.5389280319213867, "logits/rejected": 2.5131492614746094, "logps/chosen": -23.121715545654297, "logps/rejected": -74.579345703125, "loss": 1.3258, "nll_loss": 0.9248685240745544, "rewards/accuracies": 1.0, "rewards/chosen": -0.35451453924179077, "rewards/margins": 1.2500765323638916, "rewards/rejected": -1.6045910120010376, "step": 889 }, { "epoch": 0.14833333333333334, "grad_norm": 40.25075912475586, "learning_rate": 1.9274532275342352e-07, "logits/chosen": 1.4465699195861816, "logits/rejected": 2.3436810970306396, "logps/chosen": -64.80384826660156, "logps/rejected": -155.8837127685547, "loss": 0.9933, "nll_loss": 0.9000533223152161, "rewards/accuracies": 1.0, "rewards/chosen": 0.5979011654853821, "rewards/margins": 4.207815647125244, "rewards/rejected": -3.609914541244507, "step": 890 }, { "epoch": 0.1485, "grad_norm": 27.005008697509766, "learning_rate": 1.9272512428406227e-07, "logits/chosen": 2.8992550373077393, "logits/rejected": 2.907550811767578, "logps/chosen": -84.15554809570312, "logps/rejected": -38.14471435546875, "loss": 0.5751, "nll_loss": 0.5039253830909729, "rewards/accuracies": 1.0, "rewards/chosen": 2.0577392578125, "rewards/margins": 3.900589942932129, "rewards/rejected": -1.8428505659103394, "step": 891 }, { "epoch": 0.14866666666666667, "grad_norm": 43.706302642822266, "learning_rate": 1.927048987968259e-07, "logits/chosen": 2.9196488857269287, "logits/rejected": 2.6257307529449463, "logps/chosen": -65.45215606689453, "logps/rejected": -32.236915588378906, "loss": 1.2668, "nll_loss": 1.1482833623886108, "rewards/accuracies": 1.0, "rewards/chosen": 1.3033020496368408, "rewards/margins": 3.0125668048858643, "rewards/rejected": -1.7092647552490234, "step": 892 }, { "epoch": 0.14883333333333335, "grad_norm": 26.858552932739258, "learning_rate": 1.9268464629760767e-07, "logits/chosen": 2.24056339263916, "logits/rejected": 2.2660038471221924, "logps/chosen": -77.5462417602539, "logps/rejected": -125.56796264648438, "loss": 0.8628, "nll_loss": 0.8162761330604553, "rewards/accuracies": 1.0, "rewards/chosen": 1.6315239667892456, "rewards/margins": 4.796939373016357, "rewards/rejected": -3.1654152870178223, "step": 893 }, { "epoch": 0.149, "grad_norm": 161.00662231445312, "learning_rate": 1.9266436679230865e-07, "logits/chosen": 1.5887914896011353, "logits/rejected": 2.394421100616455, "logps/chosen": -54.16963577270508, "logps/rejected": -167.17721557617188, "loss": 1.3473, "nll_loss": 0.8207520246505737, "rewards/accuracies": 1.0, "rewards/chosen": 1.1548649072647095, "rewards/margins": 0.8459293842315674, "rewards/rejected": 0.3089355528354645, "step": 894 }, { "epoch": 0.14916666666666667, "grad_norm": 78.94393157958984, "learning_rate": 1.9264406028683778e-07, "logits/chosen": 4.591374397277832, "logits/rejected": 4.608766555786133, "logps/chosen": -59.569252014160156, "logps/rejected": -72.43396759033203, "loss": 1.4295, "nll_loss": 1.3237611055374146, "rewards/accuracies": 1.0, "rewards/chosen": 1.2921608686447144, "rewards/margins": 3.2023744583129883, "rewards/rejected": -1.9102134704589844, "step": 895 }, { "epoch": 0.14933333333333335, "grad_norm": 29.835121154785156, "learning_rate": 1.9262372678711193e-07, "logits/chosen": 2.893984079360962, "logits/rejected": 3.021364450454712, "logps/chosen": -41.72154235839844, "logps/rejected": -260.42388916015625, "loss": 0.6857, "nll_loss": 0.6321445107460022, "rewards/accuracies": 1.0, "rewards/chosen": 1.1174331903457642, "rewards/margins": 5.600062370300293, "rewards/rejected": -4.482629299163818, "step": 896 }, { "epoch": 0.1495, "grad_norm": 83.99952697753906, "learning_rate": 1.9260336629905578e-07, "logits/chosen": 3.2104012966156006, "logits/rejected": 3.283376455307007, "logps/chosen": -38.26190185546875, "logps/rejected": -108.32245635986328, "loss": 1.1419, "nll_loss": 1.0341055393218994, "rewards/accuracies": 1.0, "rewards/chosen": 0.45079347491264343, "rewards/margins": 3.9120635986328125, "rewards/rejected": -3.4612700939178467, "step": 897 }, { "epoch": 0.14966666666666667, "grad_norm": 22.13212013244629, "learning_rate": 1.9258297882860188e-07, "logits/chosen": 2.844395399093628, "logits/rejected": 2.7972114086151123, "logps/chosen": -155.24560546875, "logps/rejected": -194.2588653564453, "loss": 0.6472, "nll_loss": 0.6064282059669495, "rewards/accuracies": 1.0, "rewards/chosen": 1.5148193836212158, "rewards/margins": 5.536374092102051, "rewards/rejected": -4.021554470062256, "step": 898 }, { "epoch": 0.14983333333333335, "grad_norm": 52.00752639770508, "learning_rate": 1.9256256438169068e-07, "logits/chosen": 2.834062337875366, "logits/rejected": 2.8574345111846924, "logps/chosen": -123.84577941894531, "logps/rejected": -276.8976745605469, "loss": 1.566, "nll_loss": 1.440067172050476, "rewards/accuracies": 1.0, "rewards/chosen": 0.3707153797149658, "rewards/margins": 3.4478275775909424, "rewards/rejected": -3.0771121978759766, "step": 899 }, { "epoch": 0.15, "grad_norm": 98.23392486572266, "learning_rate": 1.925421229642704e-07, "logits/chosen": 2.7664403915405273, "logits/rejected": 2.618661880493164, "logps/chosen": -173.4996795654297, "logps/rejected": -95.12811279296875, "loss": 1.7059, "nll_loss": 1.3346126079559326, "rewards/accuracies": 1.0, "rewards/chosen": -0.8980209231376648, "rewards/margins": 1.7080612182617188, "rewards/rejected": -2.6060822010040283, "step": 900 }, { "epoch": 0.15016666666666667, "grad_norm": 37.349449157714844, "learning_rate": 1.9252165458229724e-07, "logits/chosen": 2.241053581237793, "logits/rejected": 2.119229316711426, "logps/chosen": -120.29418182373047, "logps/rejected": -125.80047607421875, "loss": 1.2169, "nll_loss": 1.1679048538208008, "rewards/accuracies": 1.0, "rewards/chosen": 1.3857628107070923, "rewards/margins": 5.019458293914795, "rewards/rejected": -3.633695602416992, "step": 901 }, { "epoch": 0.15033333333333335, "grad_norm": 46.348384857177734, "learning_rate": 1.9250115924173516e-07, "logits/chosen": 1.8776402473449707, "logits/rejected": 1.4411648511886597, "logps/chosen": -66.10533905029297, "logps/rejected": -41.42719268798828, "loss": 0.8676, "nll_loss": 0.7598314881324768, "rewards/accuracies": 1.0, "rewards/chosen": 0.9745025634765625, "rewards/margins": 3.2537152767181396, "rewards/rejected": -2.279212713241577, "step": 902 }, { "epoch": 0.1505, "grad_norm": 33.68196487426758, "learning_rate": 1.92480636948556e-07, "logits/chosen": 1.9373091459274292, "logits/rejected": 2.3986568450927734, "logps/chosen": -60.71513366699219, "logps/rejected": -195.33839416503906, "loss": 0.7894, "nll_loss": 0.7495694756507874, "rewards/accuracies": 1.0, "rewards/chosen": 1.4087761640548706, "rewards/margins": 6.302283763885498, "rewards/rejected": -4.893507480621338, "step": 903 }, { "epoch": 0.15066666666666667, "grad_norm": 23.87565803527832, "learning_rate": 1.9246008770873948e-07, "logits/chosen": 3.1241936683654785, "logits/rejected": 3.2164456844329834, "logps/chosen": -195.08267211914062, "logps/rejected": -241.9648895263672, "loss": 0.9116, "nll_loss": 0.8670339584350586, "rewards/accuracies": 1.0, "rewards/chosen": 2.1196670532226562, "rewards/margins": 4.665998935699463, "rewards/rejected": -2.5463318824768066, "step": 904 }, { "epoch": 0.15083333333333335, "grad_norm": 74.71215057373047, "learning_rate": 1.924395115282732e-07, "logits/chosen": 3.0003178119659424, "logits/rejected": 2.996852159500122, "logps/chosen": -63.45416259765625, "logps/rejected": -39.79555130004883, "loss": 1.3599, "nll_loss": 1.0234543085098267, "rewards/accuracies": 1.0, "rewards/chosen": 1.1129196882247925, "rewards/margins": 1.4709874391555786, "rewards/rejected": -0.35806772112846375, "step": 905 }, { "epoch": 0.151, "grad_norm": 30.416059494018555, "learning_rate": 1.9241890841315248e-07, "logits/chosen": 2.4506094455718994, "logits/rejected": 2.660682201385498, "logps/chosen": -57.003211975097656, "logps/rejected": -233.77537536621094, "loss": 0.7417, "nll_loss": 0.695160984992981, "rewards/accuracies": 1.0, "rewards/chosen": 1.927838921546936, "rewards/margins": 4.6081719398498535, "rewards/rejected": -2.680332899093628, "step": 906 }, { "epoch": 0.15116666666666667, "grad_norm": 100.05924987792969, "learning_rate": 1.9239827836938064e-07, "logits/chosen": 2.8526108264923096, "logits/rejected": 2.7501020431518555, "logps/chosen": -123.60260772705078, "logps/rejected": -28.054601669311523, "loss": 2.6199, "nll_loss": 2.332125186920166, "rewards/accuracies": 1.0, "rewards/chosen": 0.3498581051826477, "rewards/margins": 1.657670259475708, "rewards/rejected": -1.307812213897705, "step": 907 }, { "epoch": 0.15133333333333332, "grad_norm": 38.49941635131836, "learning_rate": 1.923776214029687e-07, "logits/chosen": 1.709793210029602, "logits/rejected": 2.484198808670044, "logps/chosen": -27.40430450439453, "logps/rejected": -147.26193237304688, "loss": 0.7325, "nll_loss": 0.6851075887680054, "rewards/accuracies": 1.0, "rewards/chosen": 2.193753480911255, "rewards/margins": 4.561946868896484, "rewards/rejected": -2.3681931495666504, "step": 908 }, { "epoch": 0.1515, "grad_norm": 89.3094711303711, "learning_rate": 1.9235693751993568e-07, "logits/chosen": 2.7084786891937256, "logits/rejected": 2.7722344398498535, "logps/chosen": -43.90788650512695, "logps/rejected": -31.43053436279297, "loss": 1.7734, "nll_loss": 1.6262179613113403, "rewards/accuracies": 1.0, "rewards/chosen": 1.6331901550292969, "rewards/margins": 2.743314027786255, "rewards/rejected": -1.110123872756958, "step": 909 }, { "epoch": 0.15166666666666667, "grad_norm": 43.8275146484375, "learning_rate": 1.923362267263084e-07, "logits/chosen": 2.2067854404449463, "logits/rejected": 2.820242166519165, "logps/chosen": -71.71871948242188, "logps/rejected": -183.88076782226562, "loss": 1.1247, "nll_loss": 1.0704286098480225, "rewards/accuracies": 1.0, "rewards/chosen": 1.0164642333984375, "rewards/margins": 6.418885707855225, "rewards/rejected": -5.402421474456787, "step": 910 }, { "epoch": 0.15183333333333332, "grad_norm": 65.95820617675781, "learning_rate": 1.9231548902812137e-07, "logits/chosen": 2.7027251720428467, "logits/rejected": 2.4030911922454834, "logps/chosen": -15.329314231872559, "logps/rejected": -30.964778900146484, "loss": 0.9412, "nll_loss": 0.4944940209388733, "rewards/accuracies": 1.0, "rewards/chosen": 0.7963590621948242, "rewards/margins": 1.0161688327789307, "rewards/rejected": -0.21980972588062286, "step": 911 }, { "epoch": 0.152, "grad_norm": 104.36202239990234, "learning_rate": 1.9229472443141717e-07, "logits/chosen": 2.9358668327331543, "logits/rejected": 2.7830216884613037, "logps/chosen": -95.02732849121094, "logps/rejected": -21.262821197509766, "loss": 1.5828, "nll_loss": 1.0329056978225708, "rewards/accuracies": 1.0, "rewards/chosen": 1.5193437337875366, "rewards/margins": 0.8806561827659607, "rewards/rejected": 0.6386875510215759, "step": 912 }, { "epoch": 0.15216666666666667, "grad_norm": 74.5613021850586, "learning_rate": 1.92273932942246e-07, "logits/chosen": 1.6795282363891602, "logits/rejected": 2.631415843963623, "logps/chosen": -42.31619644165039, "logps/rejected": -141.44662475585938, "loss": 1.7408, "nll_loss": 1.6926480531692505, "rewards/accuracies": 1.0, "rewards/chosen": 1.5494434833526611, "rewards/margins": 4.781573295593262, "rewards/rejected": -3.2321298122406006, "step": 913 }, { "epoch": 0.15233333333333332, "grad_norm": 26.8193359375, "learning_rate": 1.9225311456666608e-07, "logits/chosen": 1.70542311668396, "logits/rejected": 2.810274124145508, "logps/chosen": -33.555625915527344, "logps/rejected": -239.2801971435547, "loss": 0.5902, "nll_loss": 0.5500921010971069, "rewards/accuracies": 1.0, "rewards/chosen": 1.3635926246643066, "rewards/margins": 6.700859069824219, "rewards/rejected": -5.337266445159912, "step": 914 }, { "epoch": 0.1525, "grad_norm": 61.53122329711914, "learning_rate": 1.922322693107434e-07, "logits/chosen": 2.627723455429077, "logits/rejected": 2.6108856201171875, "logps/chosen": -30.074724197387695, "logps/rejected": -40.69820785522461, "loss": 1.1946, "nll_loss": 1.0740973949432373, "rewards/accuracies": 1.0, "rewards/chosen": 0.8774527311325073, "rewards/margins": 3.0687527656555176, "rewards/rejected": -2.1912999153137207, "step": 915 }, { "epoch": 0.15266666666666667, "grad_norm": 41.17625427246094, "learning_rate": 1.9221139718055167e-07, "logits/chosen": 2.764674425125122, "logits/rejected": 2.8785083293914795, "logps/chosen": -34.64848327636719, "logps/rejected": -248.267578125, "loss": 0.7672, "nll_loss": 0.6793820858001709, "rewards/accuracies": 1.0, "rewards/chosen": 1.2372902631759644, "rewards/margins": 3.5547690391540527, "rewards/rejected": -2.317478895187378, "step": 916 }, { "epoch": 0.15283333333333332, "grad_norm": 74.33259582519531, "learning_rate": 1.921904981821726e-07, "logits/chosen": 3.1513924598693848, "logits/rejected": 3.0571227073669434, "logps/chosen": -106.73072814941406, "logps/rejected": -81.4193115234375, "loss": 1.5189, "nll_loss": 1.3015943765640259, "rewards/accuracies": 1.0, "rewards/chosen": 0.44033128023147583, "rewards/margins": 2.1117255687713623, "rewards/rejected": -1.6713943481445312, "step": 917 }, { "epoch": 0.153, "grad_norm": 22.345996856689453, "learning_rate": 1.9216957232169566e-07, "logits/chosen": 3.1250593662261963, "logits/rejected": 3.2876217365264893, "logps/chosen": -62.773719787597656, "logps/rejected": -157.80555725097656, "loss": 0.584, "nll_loss": 0.5506467223167419, "rewards/accuracies": 1.0, "rewards/chosen": 1.5985612869262695, "rewards/margins": 6.540426254272461, "rewards/rejected": -4.941864967346191, "step": 918 }, { "epoch": 0.15316666666666667, "grad_norm": 51.48362731933594, "learning_rate": 1.9214861960521812e-07, "logits/chosen": 2.216261863708496, "logits/rejected": 1.8161571025848389, "logps/chosen": -72.16975402832031, "logps/rejected": -32.85265350341797, "loss": 1.2851, "nll_loss": 1.0771605968475342, "rewards/accuracies": 1.0, "rewards/chosen": 0.5285980105400085, "rewards/margins": 2.166752815246582, "rewards/rejected": -1.6381548643112183, "step": 919 }, { "epoch": 0.15333333333333332, "grad_norm": 42.540184020996094, "learning_rate": 1.921276400388451e-07, "logits/chosen": 2.7026402950286865, "logits/rejected": 1.878431797027588, "logps/chosen": -35.5615119934082, "logps/rejected": -23.35469627380371, "loss": 0.7078, "nll_loss": 0.49390989542007446, "rewards/accuracies": 1.0, "rewards/chosen": 1.2645171880722046, "rewards/margins": 2.134395122528076, "rewards/rejected": -0.869877815246582, "step": 920 }, { "epoch": 0.1535, "grad_norm": 24.84566307067871, "learning_rate": 1.9210663362868954e-07, "logits/chosen": 1.9918454885482788, "logits/rejected": 2.6030569076538086, "logps/chosen": -74.22486877441406, "logps/rejected": -332.21356201171875, "loss": 0.8809, "nll_loss": 0.8732337355613708, "rewards/accuracies": 1.0, "rewards/chosen": 3.181105136871338, "rewards/margins": 8.784395217895508, "rewards/rejected": -5.60329008102417, "step": 921 }, { "epoch": 0.15366666666666667, "grad_norm": 45.33732986450195, "learning_rate": 1.920856003808722e-07, "logits/chosen": 2.9427316188812256, "logits/rejected": 3.0266189575195312, "logps/chosen": -71.7091293334961, "logps/rejected": -88.53618621826172, "loss": 1.1835, "nll_loss": 1.1204551458358765, "rewards/accuracies": 1.0, "rewards/chosen": 1.8209587335586548, "rewards/margins": 4.060362339019775, "rewards/rejected": -2.239403486251831, "step": 922 }, { "epoch": 0.15383333333333332, "grad_norm": 98.33944702148438, "learning_rate": 1.920645403015217e-07, "logits/chosen": 4.348410129547119, "logits/rejected": 3.9796979427337646, "logps/chosen": -92.25606536865234, "logps/rejected": -20.586944580078125, "loss": 1.7233, "nll_loss": 1.1532008647918701, "rewards/accuracies": 1.0, "rewards/chosen": 1.0905746221542358, "rewards/margins": 0.7197356820106506, "rewards/rejected": 0.3708389401435852, "step": 923 }, { "epoch": 0.154, "grad_norm": 39.43292999267578, "learning_rate": 1.920434533967744e-07, "logits/chosen": 2.5751895904541016, "logits/rejected": 2.717874050140381, "logps/chosen": -90.36521911621094, "logps/rejected": -262.8824462890625, "loss": 1.2699, "nll_loss": 1.2048697471618652, "rewards/accuracies": 1.0, "rewards/chosen": 0.7756065726280212, "rewards/margins": 6.835031032562256, "rewards/rejected": -6.05942440032959, "step": 924 }, { "epoch": 0.15416666666666667, "grad_norm": 36.17152404785156, "learning_rate": 1.9202233967277454e-07, "logits/chosen": 2.536327600479126, "logits/rejected": 2.3523194789886475, "logps/chosen": -69.79473114013672, "logps/rejected": -115.96065521240234, "loss": 0.8789, "nll_loss": 0.7931219339370728, "rewards/accuracies": 1.0, "rewards/chosen": 1.0377380847930908, "rewards/margins": 3.7239646911621094, "rewards/rejected": -2.6862266063690186, "step": 925 }, { "epoch": 0.15433333333333332, "grad_norm": 95.58018493652344, "learning_rate": 1.9200119913567412e-07, "logits/chosen": 2.442641258239746, "logits/rejected": 2.8157975673675537, "logps/chosen": -23.376224517822266, "logps/rejected": -267.00628662109375, "loss": 1.7494, "nll_loss": 1.5584149360656738, "rewards/accuracies": 1.0, "rewards/chosen": 1.1332066059112549, "rewards/margins": 2.2753820419311523, "rewards/rejected": -1.142175316810608, "step": 926 }, { "epoch": 0.1545, "grad_norm": 156.15843200683594, "learning_rate": 1.9198003179163306e-07, "logits/chosen": 1.6166528463363647, "logits/rejected": 1.8373785018920898, "logps/chosen": -42.15227127075195, "logps/rejected": -85.19161224365234, "loss": 3.2574, "nll_loss": 2.634516716003418, "rewards/accuracies": 1.0, "rewards/chosen": -0.7647614479064941, "rewards/margins": 0.5515433549880981, "rewards/rejected": -1.3163048028945923, "step": 927 }, { "epoch": 0.15466666666666667, "grad_norm": 33.551246643066406, "learning_rate": 1.919588376468189e-07, "logits/chosen": 2.4307520389556885, "logits/rejected": 2.831509828567505, "logps/chosen": -42.666526794433594, "logps/rejected": -297.5931701660156, "loss": 0.7687, "nll_loss": 0.723161518573761, "rewards/accuracies": 1.0, "rewards/chosen": 1.1922191381454468, "rewards/margins": 6.925819396972656, "rewards/rejected": -5.73360013961792, "step": 928 }, { "epoch": 0.15483333333333332, "grad_norm": 57.67573165893555, "learning_rate": 1.919376167074072e-07, "logits/chosen": 1.6611961126327515, "logits/rejected": 2.6281936168670654, "logps/chosen": -28.890426635742188, "logps/rejected": -159.31422424316406, "loss": 0.8701, "nll_loss": 0.7222606539726257, "rewards/accuracies": 1.0, "rewards/chosen": 0.8161686658859253, "rewards/margins": 2.6979939937591553, "rewards/rejected": -1.88182532787323, "step": 929 }, { "epoch": 0.155, "grad_norm": 230.0287322998047, "learning_rate": 1.919163689795812e-07, "logits/chosen": 2.400799036026001, "logits/rejected": 2.5052859783172607, "logps/chosen": -79.69233703613281, "logps/rejected": -39.68695831298828, "loss": 2.4969, "nll_loss": 1.1719462871551514, "rewards/accuracies": 0.0, "rewards/chosen": 0.37229615449905396, "rewards/margins": -0.768959105014801, "rewards/rejected": 1.141255259513855, "step": 930 }, { "epoch": 0.15516666666666667, "grad_norm": 63.04716491699219, "learning_rate": 1.9189509446953197e-07, "logits/chosen": 2.238562822341919, "logits/rejected": 3.351097583770752, "logps/chosen": -22.180673599243164, "logps/rejected": -171.9266357421875, "loss": 1.0183, "nll_loss": 0.9241946339607239, "rewards/accuracies": 1.0, "rewards/chosen": 1.0206362009048462, "rewards/margins": 3.5141053199768066, "rewards/rejected": -2.49346923828125, "step": 931 }, { "epoch": 0.15533333333333332, "grad_norm": 146.57005310058594, "learning_rate": 1.9187379318345845e-07, "logits/chosen": 2.0946433544158936, "logits/rejected": 1.920538306236267, "logps/chosen": -52.88913345336914, "logps/rejected": -47.01058578491211, "loss": 1.7883, "nll_loss": 0.9118815660476685, "rewards/accuracies": 0.0, "rewards/chosen": 0.7005543112754822, "rewards/margins": -0.00838625431060791, "rewards/rejected": 0.7089405655860901, "step": 932 }, { "epoch": 0.1555, "grad_norm": 28.797597885131836, "learning_rate": 1.9185246512756725e-07, "logits/chosen": 3.1637284755706787, "logits/rejected": 3.1760189533233643, "logps/chosen": -25.197044372558594, "logps/rejected": -172.34060668945312, "loss": 0.5842, "nll_loss": 0.5599343776702881, "rewards/accuracies": 1.0, "rewards/chosen": 2.2642745971679688, "rewards/margins": 5.963475227355957, "rewards/rejected": -3.699200391769409, "step": 933 }, { "epoch": 0.15566666666666668, "grad_norm": 81.66947937011719, "learning_rate": 1.9183111030807287e-07, "logits/chosen": 2.2856335639953613, "logits/rejected": 2.72107195854187, "logps/chosen": -47.30840301513672, "logps/rejected": -117.26620483398438, "loss": 2.0371, "nll_loss": 1.7521631717681885, "rewards/accuracies": 1.0, "rewards/chosen": 1.4413236379623413, "rewards/margins": 1.7847989797592163, "rewards/rejected": -0.343475341796875, "step": 934 }, { "epoch": 0.15583333333333332, "grad_norm": 110.31451416015625, "learning_rate": 1.9180972873119758e-07, "logits/chosen": 1.7859547138214111, "logits/rejected": 2.70363712310791, "logps/chosen": -27.294496536254883, "logps/rejected": -95.30994415283203, "loss": 2.0154, "nll_loss": 1.81963312625885, "rewards/accuracies": 1.0, "rewards/chosen": 0.4336439073085785, "rewards/margins": 2.3007795810699463, "rewards/rejected": -1.8671356439590454, "step": 935 }, { "epoch": 0.156, "grad_norm": 38.019012451171875, "learning_rate": 1.9178832040317153e-07, "logits/chosen": 2.7538869380950928, "logits/rejected": 2.9773902893066406, "logps/chosen": -57.411094665527344, "logps/rejected": -229.8173370361328, "loss": 0.8954, "nll_loss": 0.8086069822311401, "rewards/accuracies": 1.0, "rewards/chosen": 0.6289932131767273, "rewards/margins": 4.422830104827881, "rewards/rejected": -3.793837070465088, "step": 936 }, { "epoch": 0.15616666666666668, "grad_norm": 92.28284454345703, "learning_rate": 1.917668853302325e-07, "logits/chosen": 2.8589720726013184, "logits/rejected": 2.7674951553344727, "logps/chosen": -18.305877685546875, "logps/rejected": -81.74857330322266, "loss": 1.1441, "nll_loss": 0.6537812948226929, "rewards/accuracies": 1.0, "rewards/chosen": 1.3959823846817017, "rewards/margins": 1.015049695968628, "rewards/rejected": 0.38093265891075134, "step": 937 }, { "epoch": 0.15633333333333332, "grad_norm": 34.920494079589844, "learning_rate": 1.917454235186262e-07, "logits/chosen": 1.8960416316986084, "logits/rejected": 1.9975109100341797, "logps/chosen": -49.39678955078125, "logps/rejected": -276.52752685546875, "loss": 0.718, "nll_loss": 0.6332922577857971, "rewards/accuracies": 1.0, "rewards/chosen": 0.8988983035087585, "rewards/margins": 3.907501220703125, "rewards/rejected": -3.0086028575897217, "step": 938 }, { "epoch": 0.1565, "grad_norm": 130.00082397460938, "learning_rate": 1.917239349746061e-07, "logits/chosen": 2.820828437805176, "logits/rejected": 2.694587469100952, "logps/chosen": -92.09318542480469, "logps/rejected": -25.362897872924805, "loss": 1.5148, "nll_loss": 1.0010128021240234, "rewards/accuracies": 1.0, "rewards/chosen": 0.02474060095846653, "rewards/margins": 0.7680120468139648, "rewards/rejected": -0.7432714700698853, "step": 939 }, { "epoch": 0.15666666666666668, "grad_norm": 48.97792434692383, "learning_rate": 1.917024197044334e-07, "logits/chosen": 2.716283082962036, "logits/rejected": 2.4077181816101074, "logps/chosen": -118.20893859863281, "logps/rejected": -46.16336441040039, "loss": 1.3313, "nll_loss": 1.170385718345642, "rewards/accuracies": 1.0, "rewards/chosen": 1.6234893798828125, "rewards/margins": 2.6202292442321777, "rewards/rejected": -0.9967399835586548, "step": 940 }, { "epoch": 0.15683333333333332, "grad_norm": 36.2784538269043, "learning_rate": 1.916808777143772e-07, "logits/chosen": 1.2913845777511597, "logits/rejected": 2.906432867050171, "logps/chosen": -55.27769470214844, "logps/rejected": -404.3973693847656, "loss": 0.7903, "nll_loss": 0.7273380756378174, "rewards/accuracies": 1.0, "rewards/chosen": 0.9616684317588806, "rewards/margins": 5.065245151519775, "rewards/rejected": -4.10357666015625, "step": 941 }, { "epoch": 0.157, "grad_norm": 34.16549301147461, "learning_rate": 1.9165930901071427e-07, "logits/chosen": 0.9141554832458496, "logits/rejected": 2.270465850830078, "logps/chosen": -16.462461471557617, "logps/rejected": -235.49903869628906, "loss": 0.5765, "nll_loss": 0.5310471653938293, "rewards/accuracies": 1.0, "rewards/chosen": 1.3635425567626953, "rewards/margins": 5.405681133270264, "rewards/rejected": -4.042138576507568, "step": 942 }, { "epoch": 0.15716666666666668, "grad_norm": 70.75831604003906, "learning_rate": 1.9163771359972921e-07, "logits/chosen": 2.657078742980957, "logits/rejected": 2.7635884284973145, "logps/chosen": -90.3284683227539, "logps/rejected": -161.75881958007812, "loss": 1.2172, "nll_loss": 0.9609412550926208, "rewards/accuracies": 1.0, "rewards/chosen": 0.7620911598205566, "rewards/margins": 1.8165086507797241, "rewards/rejected": -1.0544174909591675, "step": 943 }, { "epoch": 0.15733333333333333, "grad_norm": 25.31389045715332, "learning_rate": 1.9161609148771442e-07, "logits/chosen": 1.5549012422561646, "logits/rejected": 2.46799635887146, "logps/chosen": -59.280277252197266, "logps/rejected": -224.6536865234375, "loss": 0.8347, "nll_loss": 0.8120586276054382, "rewards/accuracies": 1.0, "rewards/chosen": 2.0876948833465576, "rewards/margins": 6.653689384460449, "rewards/rejected": -4.5659942626953125, "step": 944 }, { "epoch": 0.1575, "grad_norm": 40.630489349365234, "learning_rate": 1.915944426809701e-07, "logits/chosen": 0.17113612592220306, "logits/rejected": 2.3762073516845703, "logps/chosen": -52.80259323120117, "logps/rejected": -599.2525634765625, "loss": 0.7766, "nll_loss": 0.6947709321975708, "rewards/accuracies": 1.0, "rewards/chosen": 0.4630485475063324, "rewards/margins": 10.690228462219238, "rewards/rejected": -10.227179527282715, "step": 945 }, { "epoch": 0.15766666666666668, "grad_norm": 58.378604888916016, "learning_rate": 1.915727671858041e-07, "logits/chosen": 2.2789995670318604, "logits/rejected": 2.414254903793335, "logps/chosen": -16.067291259765625, "logps/rejected": -36.598567962646484, "loss": 0.8381, "nll_loss": 0.617972731590271, "rewards/accuracies": 1.0, "rewards/chosen": 0.7933094501495361, "rewards/margins": 2.045656681060791, "rewards/rejected": -1.2523471117019653, "step": 946 }, { "epoch": 0.15783333333333333, "grad_norm": 24.472970962524414, "learning_rate": 1.9155106500853223e-07, "logits/chosen": 2.457615852355957, "logits/rejected": 2.6631979942321777, "logps/chosen": -77.05096435546875, "logps/rejected": -165.00534057617188, "loss": 0.8345, "nll_loss": 0.8110628724098206, "rewards/accuracies": 1.0, "rewards/chosen": 2.1275527477264404, "rewards/margins": 6.316162109375, "rewards/rejected": -4.188609600067139, "step": 947 }, { "epoch": 0.158, "grad_norm": 243.7894744873047, "learning_rate": 1.9152933615547795e-07, "logits/chosen": 2.9205124378204346, "logits/rejected": 2.9316062927246094, "logps/chosen": -62.52593994140625, "logps/rejected": -30.14577865600586, "loss": 3.6859, "nll_loss": 2.404843807220459, "rewards/accuracies": 0.0, "rewards/chosen": -1.178977608680725, "rewards/margins": -0.7438686490058899, "rewards/rejected": -0.4351089596748352, "step": 948 }, { "epoch": 0.15816666666666668, "grad_norm": 21.8325252532959, "learning_rate": 1.915075806329725e-07, "logits/chosen": 3.086277723312378, "logits/rejected": 3.1459813117980957, "logps/chosen": -61.17449951171875, "logps/rejected": -187.54830932617188, "loss": 0.6827, "nll_loss": 0.6649402976036072, "rewards/accuracies": 1.0, "rewards/chosen": 2.7807841300964355, "rewards/margins": 6.382157325744629, "rewards/rejected": -3.6013734340667725, "step": 949 }, { "epoch": 0.15833333333333333, "grad_norm": 78.79121398925781, "learning_rate": 1.9148579844735495e-07, "logits/chosen": 3.398890972137451, "logits/rejected": 3.6610524654388428, "logps/chosen": -24.564773559570312, "logps/rejected": -247.54295349121094, "loss": 1.3107, "nll_loss": 1.228238821029663, "rewards/accuracies": 1.0, "rewards/chosen": 0.6283625364303589, "rewards/margins": 4.708619594573975, "rewards/rejected": -4.080256938934326, "step": 950 }, { "epoch": 0.1585, "grad_norm": 165.04319763183594, "learning_rate": 1.914639896049721e-07, "logits/chosen": 3.151677131652832, "logits/rejected": 3.2975263595581055, "logps/chosen": -73.46217346191406, "logps/rejected": -55.44200134277344, "loss": 2.1866, "nll_loss": 0.8958801627159119, "rewards/accuracies": 0.0, "rewards/chosen": 1.650670051574707, "rewards/margins": -0.4864013195037842, "rewards/rejected": 2.137071371078491, "step": 951 }, { "epoch": 0.15866666666666668, "grad_norm": 116.4824447631836, "learning_rate": 1.914421541121785e-07, "logits/chosen": 3.3583812713623047, "logits/rejected": 3.263930320739746, "logps/chosen": -215.96434020996094, "logps/rejected": -154.598388671875, "loss": 2.1941, "nll_loss": 1.8458486795425415, "rewards/accuracies": 1.0, "rewards/chosen": -0.45822906494140625, "rewards/margins": 1.5523650646209717, "rewards/rejected": -2.010594129562378, "step": 952 }, { "epoch": 0.15883333333333333, "grad_norm": 33.69325637817383, "learning_rate": 1.9142029197533652e-07, "logits/chosen": -1.4577150344848633, "logits/rejected": -2.140639543533325, "logps/chosen": -119.36250305175781, "logps/rejected": -42.7254753112793, "loss": 0.7405, "nll_loss": 0.6249344348907471, "rewards/accuracies": 1.0, "rewards/chosen": 1.4429353475570679, "rewards/margins": 3.0607590675354004, "rewards/rejected": -1.617823600769043, "step": 953 }, { "epoch": 0.159, "grad_norm": 41.954627990722656, "learning_rate": 1.9139840320081628e-07, "logits/chosen": 2.3403468132019043, "logits/rejected": 2.981942653656006, "logps/chosen": -117.03253936767578, "logps/rejected": -379.4914245605469, "loss": 1.6748, "nll_loss": 1.6483454704284668, "rewards/accuracies": 1.0, "rewards/chosen": 1.7311761379241943, "rewards/margins": 11.376363754272461, "rewards/rejected": -9.645187377929688, "step": 954 }, { "epoch": 0.15916666666666668, "grad_norm": 32.15553283691406, "learning_rate": 1.9137648779499558e-07, "logits/chosen": 2.0557806491851807, "logits/rejected": 2.7914369106292725, "logps/chosen": -83.22193145751953, "logps/rejected": -152.49420166015625, "loss": 0.8139, "nll_loss": 0.7430527806282043, "rewards/accuracies": 1.0, "rewards/chosen": 1.8055886030197144, "rewards/margins": 3.8664283752441406, "rewards/rejected": -2.060839891433716, "step": 955 }, { "epoch": 0.15933333333333333, "grad_norm": 53.53400802612305, "learning_rate": 1.9135454576426007e-07, "logits/chosen": 3.49454402923584, "logits/rejected": 3.9270458221435547, "logps/chosen": -36.12620162963867, "logps/rejected": -279.73516845703125, "loss": 0.9423, "nll_loss": 0.7853522300720215, "rewards/accuracies": 1.0, "rewards/chosen": 0.6687183380126953, "rewards/margins": 2.627321243286133, "rewards/rejected": -1.9586029052734375, "step": 956 }, { "epoch": 0.1595, "grad_norm": 88.08792877197266, "learning_rate": 1.9133257711500316e-07, "logits/chosen": 2.3910927772521973, "logits/rejected": 2.8012354373931885, "logps/chosen": -21.915449142456055, "logps/rejected": -267.7134704589844, "loss": 1.6281, "nll_loss": 1.4610298871994019, "rewards/accuracies": 1.0, "rewards/chosen": 1.2792842388153076, "rewards/margins": 2.492177963256836, "rewards/rejected": -1.2128937244415283, "step": 957 }, { "epoch": 0.15966666666666668, "grad_norm": 41.49901580810547, "learning_rate": 1.9131058185362596e-07, "logits/chosen": 2.4258530139923096, "logits/rejected": 2.8414902687072754, "logps/chosen": -120.22148132324219, "logps/rejected": -181.32859802246094, "loss": 1.0655, "nll_loss": 0.9854218363761902, "rewards/accuracies": 1.0, "rewards/chosen": 0.7714470624923706, "rewards/margins": 4.339633941650391, "rewards/rejected": -3.5681869983673096, "step": 958 }, { "epoch": 0.15983333333333333, "grad_norm": 44.795135498046875, "learning_rate": 1.9128855998653734e-07, "logits/chosen": 1.6477059125900269, "logits/rejected": 2.007624626159668, "logps/chosen": -221.40640258789062, "logps/rejected": -232.47265625, "loss": 0.956, "nll_loss": 0.8483003973960876, "rewards/accuracies": 1.0, "rewards/chosen": 1.048518419265747, "rewards/margins": 3.214381456375122, "rewards/rejected": -2.165863037109375, "step": 959 }, { "epoch": 0.16, "grad_norm": 73.02716064453125, "learning_rate": 1.91266511520154e-07, "logits/chosen": 1.5290380716323853, "logits/rejected": 2.6871304512023926, "logps/chosen": -33.96788787841797, "logps/rejected": -168.57080078125, "loss": 1.4689, "nll_loss": 1.358715534210205, "rewards/accuracies": 1.0, "rewards/chosen": 0.35999298095703125, "rewards/margins": 3.9589128494262695, "rewards/rejected": -3.5989198684692383, "step": 960 }, { "epoch": 0.16016666666666668, "grad_norm": 41.580718994140625, "learning_rate": 1.9124443646090028e-07, "logits/chosen": 2.274111032485962, "logits/rejected": 2.1215338706970215, "logps/chosen": -81.60418701171875, "logps/rejected": -41.65415954589844, "loss": 1.1412, "nll_loss": 1.020052194595337, "rewards/accuracies": 1.0, "rewards/chosen": 1.3336381912231445, "rewards/margins": 2.9798338413238525, "rewards/rejected": -1.646195650100708, "step": 961 }, { "epoch": 0.16033333333333333, "grad_norm": 102.08993530273438, "learning_rate": 1.9122233481520837e-07, "logits/chosen": 2.9639437198638916, "logits/rejected": 3.1052026748657227, "logps/chosen": -64.50486755371094, "logps/rejected": -41.86174011230469, "loss": 1.3884, "nll_loss": 0.8959009647369385, "rewards/accuracies": 1.0, "rewards/chosen": 0.9531906247138977, "rewards/margins": 0.9075554013252258, "rewards/rejected": 0.045635223388671875, "step": 962 }, { "epoch": 0.1605, "grad_norm": 29.950679779052734, "learning_rate": 1.9120020658951812e-07, "logits/chosen": 2.376465320587158, "logits/rejected": 2.138085126876831, "logps/chosen": -152.18173217773438, "logps/rejected": -237.3118438720703, "loss": 0.9966, "nll_loss": 0.9393935799598694, "rewards/accuracies": 1.0, "rewards/chosen": 3.5009403228759766, "rewards/margins": 5.047609329223633, "rewards/rejected": -1.5466690063476562, "step": 963 }, { "epoch": 0.16066666666666668, "grad_norm": 33.81017303466797, "learning_rate": 1.911780517902772e-07, "logits/chosen": 2.2088539600372314, "logits/rejected": 2.5540361404418945, "logps/chosen": -58.241493225097656, "logps/rejected": -220.51181030273438, "loss": 0.8004, "nll_loss": 0.7190307974815369, "rewards/accuracies": 1.0, "rewards/chosen": 0.5015377402305603, "rewards/margins": 6.094900131225586, "rewards/rejected": -5.593362331390381, "step": 964 }, { "epoch": 0.16083333333333333, "grad_norm": 180.31492614746094, "learning_rate": 1.9115587042394092e-07, "logits/chosen": 2.466073989868164, "logits/rejected": 2.434138298034668, "logps/chosen": -110.11656951904297, "logps/rejected": -49.438087463378906, "loss": 2.1732, "nll_loss": 1.069092869758606, "rewards/accuracies": 0.0, "rewards/chosen": 0.7295951843261719, "rewards/margins": -0.386654257774353, "rewards/rejected": 1.116249442100525, "step": 965 }, { "epoch": 0.161, "grad_norm": 71.76189422607422, "learning_rate": 1.9113366249697248e-07, "logits/chosen": 2.8619048595428467, "logits/rejected": 2.6888961791992188, "logps/chosen": -30.530092239379883, "logps/rejected": -24.955053329467773, "loss": 0.8733, "nll_loss": 0.6360436677932739, "rewards/accuracies": 1.0, "rewards/chosen": 0.7633382678031921, "rewards/margins": 1.931943416595459, "rewards/rejected": -1.1686052083969116, "step": 966 }, { "epoch": 0.16116666666666668, "grad_norm": 43.355308532714844, "learning_rate": 1.9111142801584265e-07, "logits/chosen": 1.4922071695327759, "logits/rejected": 2.820347309112549, "logps/chosen": -42.60993576049805, "logps/rejected": -193.6942596435547, "loss": 0.8921, "nll_loss": 0.8194218873977661, "rewards/accuracies": 1.0, "rewards/chosen": 0.6067207455635071, "rewards/margins": 7.477322101593018, "rewards/rejected": -6.870601177215576, "step": 967 }, { "epoch": 0.16133333333333333, "grad_norm": 30.16254234313965, "learning_rate": 1.9108916698703012e-07, "logits/chosen": 3.0562288761138916, "logits/rejected": 3.075580358505249, "logps/chosen": -75.6922378540039, "logps/rejected": -68.01243591308594, "loss": 0.8567, "nll_loss": 0.7723698616027832, "rewards/accuracies": 1.0, "rewards/chosen": 1.6021080017089844, "rewards/margins": 3.5701708793640137, "rewards/rejected": -1.9680629968643188, "step": 968 }, { "epoch": 0.1615, "grad_norm": 64.56319427490234, "learning_rate": 1.9106687941702114e-07, "logits/chosen": 3.4878487586975098, "logits/rejected": 3.6896512508392334, "logps/chosen": -79.10018920898438, "logps/rejected": -133.83758544921875, "loss": 1.2768, "nll_loss": 1.0835641622543335, "rewards/accuracies": 1.0, "rewards/chosen": 0.8541428446769714, "rewards/margins": 2.245837450027466, "rewards/rejected": -1.3916946649551392, "step": 969 }, { "epoch": 0.16166666666666665, "grad_norm": 114.29508209228516, "learning_rate": 1.9104456531230985e-07, "logits/chosen": 2.6075947284698486, "logits/rejected": 2.4631786346435547, "logps/chosen": -36.5301513671875, "logps/rejected": -28.287425994873047, "loss": 2.2601, "nll_loss": 1.4612059593200684, "rewards/accuracies": 1.0, "rewards/chosen": -0.44887009263038635, "rewards/margins": 0.0734226405620575, "rewards/rejected": -0.5222927331924438, "step": 970 }, { "epoch": 0.16183333333333333, "grad_norm": 73.94898986816406, "learning_rate": 1.9102222467939795e-07, "logits/chosen": 2.9218358993530273, "logits/rejected": 2.79146671295166, "logps/chosen": -164.54428100585938, "logps/rejected": -124.03477478027344, "loss": 1.2534, "nll_loss": 1.1043239831924438, "rewards/accuracies": 1.0, "rewards/chosen": -0.16631470620632172, "rewards/margins": 4.0829057693481445, "rewards/rejected": -4.249220371246338, "step": 971 }, { "epoch": 0.162, "grad_norm": 35.803104400634766, "learning_rate": 1.9099985752479503e-07, "logits/chosen": 3.9587717056274414, "logits/rejected": 4.588257789611816, "logps/chosen": -93.45977020263672, "logps/rejected": -388.2991027832031, "loss": 1.0627, "nll_loss": 1.0384418964385986, "rewards/accuracies": 1.0, "rewards/chosen": 1.8594704866409302, "rewards/margins": 7.747620582580566, "rewards/rejected": -5.888150215148926, "step": 972 }, { "epoch": 0.16216666666666665, "grad_norm": 27.533475875854492, "learning_rate": 1.9097746385501834e-07, "logits/chosen": 0.4400869905948639, "logits/rejected": 2.2239229679107666, "logps/chosen": -20.053802490234375, "logps/rejected": -334.1724853515625, "loss": 0.5063, "nll_loss": 0.4557682275772095, "rewards/accuracies": 1.0, "rewards/chosen": 1.175188660621643, "rewards/margins": 5.45259952545166, "rewards/rejected": -4.277410984039307, "step": 973 }, { "epoch": 0.16233333333333333, "grad_norm": 36.91716003417969, "learning_rate": 1.9095504367659282e-07, "logits/chosen": 1.1505897045135498, "logits/rejected": 2.1026270389556885, "logps/chosen": -88.62090301513672, "logps/rejected": -226.64804077148438, "loss": 1.1677, "nll_loss": 1.1217834949493408, "rewards/accuracies": 1.0, "rewards/chosen": 1.2865829467773438, "rewards/margins": 5.570069789886475, "rewards/rejected": -4.283486843109131, "step": 974 }, { "epoch": 0.1625, "grad_norm": 63.782135009765625, "learning_rate": 1.909325969960512e-07, "logits/chosen": 2.9325270652770996, "logits/rejected": 3.062809944152832, "logps/chosen": -115.81776428222656, "logps/rejected": -99.67840576171875, "loss": 1.6554, "nll_loss": 1.5442367792129517, "rewards/accuracies": 1.0, "rewards/chosen": 0.5481948852539062, "rewards/margins": 3.5141029357910156, "rewards/rejected": -2.9659080505371094, "step": 975 }, { "epoch": 0.16266666666666665, "grad_norm": 23.075349807739258, "learning_rate": 1.909101238199339e-07, "logits/chosen": 1.3202000856399536, "logits/rejected": 2.1648480892181396, "logps/chosen": -72.3131103515625, "logps/rejected": -279.37249755859375, "loss": 0.7403, "nll_loss": 0.7231312394142151, "rewards/accuracies": 1.0, "rewards/chosen": 2.1770217418670654, "rewards/margins": 10.53978157043457, "rewards/rejected": -8.362759590148926, "step": 976 }, { "epoch": 0.16283333333333333, "grad_norm": 46.926353454589844, "learning_rate": 1.9088762415478907e-07, "logits/chosen": 2.7847867012023926, "logits/rejected": 2.747899293899536, "logps/chosen": -59.076141357421875, "logps/rejected": -170.99310302734375, "loss": 0.7362, "nll_loss": 0.5791777968406677, "rewards/accuracies": 1.0, "rewards/chosen": 1.0428390502929688, "rewards/margins": 2.5664076805114746, "rewards/rejected": -1.5235687494277954, "step": 977 }, { "epoch": 0.163, "grad_norm": 30.62248420715332, "learning_rate": 1.9086509800717256e-07, "logits/chosen": 1.1512705087661743, "logits/rejected": 2.388773202896118, "logps/chosen": -62.555458068847656, "logps/rejected": -334.9063720703125, "loss": 0.7857, "nll_loss": 0.7359464168548584, "rewards/accuracies": 1.0, "rewards/chosen": 1.1126976013183594, "rewards/margins": 5.993850231170654, "rewards/rejected": -4.881152629852295, "step": 978 }, { "epoch": 0.16316666666666665, "grad_norm": 67.71681213378906, "learning_rate": 1.9084254538364796e-07, "logits/chosen": 3.065814733505249, "logits/rejected": 3.1479177474975586, "logps/chosen": -59.29609680175781, "logps/rejected": -371.804443359375, "loss": 2.2833, "nll_loss": 2.1961517333984375, "rewards/accuracies": 1.0, "rewards/chosen": 0.6162155270576477, "rewards/margins": 4.330840110778809, "rewards/rejected": -3.7146244049072266, "step": 979 }, { "epoch": 0.16333333333333333, "grad_norm": 76.00386810302734, "learning_rate": 1.9081996629078654e-07, "logits/chosen": 2.81626033782959, "logits/rejected": 2.6948745250701904, "logps/chosen": -27.16176986694336, "logps/rejected": -650.1240234375, "loss": 0.9069, "nll_loss": 0.8761861324310303, "rewards/accuracies": 1.0, "rewards/chosen": 1.553995132446289, "rewards/margins": 9.468997955322266, "rewards/rejected": -7.915002822875977, "step": 980 }, { "epoch": 0.1635, "grad_norm": 32.68090057373047, "learning_rate": 1.9079736073516734e-07, "logits/chosen": 3.091496706008911, "logits/rejected": 3.0707504749298096, "logps/chosen": -69.68441772460938, "logps/rejected": -184.34654235839844, "loss": 0.7352, "nll_loss": 0.7038829326629639, "rewards/accuracies": 1.0, "rewards/chosen": 2.12068247795105, "rewards/margins": 5.368495941162109, "rewards/rejected": -3.2478137016296387, "step": 981 }, { "epoch": 0.16366666666666665, "grad_norm": 35.560546875, "learning_rate": 1.9077472872337705e-07, "logits/chosen": 2.6077563762664795, "logits/rejected": 2.3528618812561035, "logps/chosen": -121.88949584960938, "logps/rejected": -56.649688720703125, "loss": 1.0779, "nll_loss": 1.0242815017700195, "rewards/accuracies": 1.0, "rewards/chosen": 1.9485610723495483, "rewards/margins": 4.338706970214844, "rewards/rejected": -2.390146017074585, "step": 982 }, { "epoch": 0.16383333333333333, "grad_norm": 74.31008911132812, "learning_rate": 1.907520702620101e-07, "logits/chosen": 2.8049204349517822, "logits/rejected": 2.7484869956970215, "logps/chosen": -71.05013275146484, "logps/rejected": -40.21131134033203, "loss": 1.7015, "nll_loss": 1.511704921722412, "rewards/accuracies": 1.0, "rewards/chosen": 0.17518463730812073, "rewards/margins": 2.4721474647521973, "rewards/rejected": -2.2969627380371094, "step": 983 }, { "epoch": 0.164, "grad_norm": 27.382160186767578, "learning_rate": 1.9072938535766862e-07, "logits/chosen": 2.0209667682647705, "logits/rejected": 2.5498669147491455, "logps/chosen": -69.1974105834961, "logps/rejected": -251.71023559570312, "loss": 0.8343, "nll_loss": 0.786334216594696, "rewards/accuracies": 1.0, "rewards/chosen": 1.221577525138855, "rewards/margins": 5.521135330200195, "rewards/rejected": -4.299557685852051, "step": 984 }, { "epoch": 0.16416666666666666, "grad_norm": 57.163631439208984, "learning_rate": 1.9070667401696247e-07, "logits/chosen": 3.0881693363189697, "logits/rejected": 3.00824236869812, "logps/chosen": -112.7350845336914, "logps/rejected": -97.40730285644531, "loss": 1.5979, "nll_loss": 1.5031343698501587, "rewards/accuracies": 1.0, "rewards/chosen": 0.8564629554748535, "rewards/margins": 3.5952608585357666, "rewards/rejected": -2.738797903060913, "step": 985 }, { "epoch": 0.16433333333333333, "grad_norm": 113.6233901977539, "learning_rate": 1.9068393624650913e-07, "logits/chosen": 2.2111191749572754, "logits/rejected": 2.711801767349243, "logps/chosen": -36.81804275512695, "logps/rejected": -165.4913330078125, "loss": 2.8976, "nll_loss": 2.454536199569702, "rewards/accuracies": 1.0, "rewards/chosen": -1.265373706817627, "rewards/margins": 1.4893381595611572, "rewards/rejected": -2.754711866378784, "step": 986 }, { "epoch": 0.1645, "grad_norm": 43.73469543457031, "learning_rate": 1.906611720529339e-07, "logits/chosen": 2.5775110721588135, "logits/rejected": 2.5425400733947754, "logps/chosen": -101.75394439697266, "logps/rejected": -51.16038131713867, "loss": 0.8446, "nll_loss": 0.7268139123916626, "rewards/accuracies": 1.0, "rewards/chosen": 1.4260934591293335, "rewards/margins": 3.032684803009033, "rewards/rejected": -1.6065914630889893, "step": 987 }, { "epoch": 0.16466666666666666, "grad_norm": 31.292346954345703, "learning_rate": 1.9063838144286972e-07, "logits/chosen": 2.227926015853882, "logits/rejected": 2.4298477172851562, "logps/chosen": -13.990507125854492, "logps/rejected": -79.60100555419922, "loss": 0.4617, "nll_loss": 0.35873091220855713, "rewards/accuracies": 1.0, "rewards/chosen": 1.2889342308044434, "rewards/margins": 3.242548704147339, "rewards/rejected": -1.9536144733428955, "step": 988 }, { "epoch": 0.16483333333333333, "grad_norm": 31.327714920043945, "learning_rate": 1.906155644229572e-07, "logits/chosen": 2.6820569038391113, "logits/rejected": 2.663149833679199, "logps/chosen": -108.97933959960938, "logps/rejected": -226.73660278320312, "loss": 0.9932, "nll_loss": 0.9644189476966858, "rewards/accuracies": 1.0, "rewards/chosen": 1.7046631574630737, "rewards/margins": 6.827762603759766, "rewards/rejected": -5.123099327087402, "step": 989 }, { "epoch": 0.165, "grad_norm": 23.267728805541992, "learning_rate": 1.9059272099984466e-07, "logits/chosen": 2.313472032546997, "logits/rejected": 2.257956027984619, "logps/chosen": -32.650428771972656, "logps/rejected": -83.94366455078125, "loss": 0.4757, "nll_loss": 0.41859525442123413, "rewards/accuracies": 1.0, "rewards/chosen": 1.7459850311279297, "rewards/margins": 4.2396039962768555, "rewards/rejected": -2.4936187267303467, "step": 990 }, { "epoch": 0.16516666666666666, "grad_norm": 35.78553009033203, "learning_rate": 1.905698511801882e-07, "logits/chosen": 2.9154155254364014, "logits/rejected": 2.9910378456115723, "logps/chosen": -54.3543815612793, "logps/rejected": -214.048828125, "loss": 0.7428, "nll_loss": 0.6794298887252808, "rewards/accuracies": 1.0, "rewards/chosen": 1.7155064344406128, "rewards/margins": 4.05308723449707, "rewards/rejected": -2.337580919265747, "step": 991 }, { "epoch": 0.16533333333333333, "grad_norm": 32.21837615966797, "learning_rate": 1.905469549706514e-07, "logits/chosen": 1.2013702392578125, "logits/rejected": 2.180912494659424, "logps/chosen": -70.49900817871094, "logps/rejected": -258.21112060546875, "loss": 0.975, "nll_loss": 0.9276185035705566, "rewards/accuracies": 1.0, "rewards/chosen": 1.3899391889572144, "rewards/margins": 5.009781837463379, "rewards/rejected": -3.619842529296875, "step": 992 }, { "epoch": 0.1655, "grad_norm": 24.10542106628418, "learning_rate": 1.905240323779058e-07, "logits/chosen": 1.4420548677444458, "logits/rejected": 2.3188557624816895, "logps/chosen": -66.45068359375, "logps/rejected": -197.92190551757812, "loss": 0.8726, "nll_loss": 0.8519317507743835, "rewards/accuracies": 1.0, "rewards/chosen": 2.236288547515869, "rewards/margins": 6.505393981933594, "rewards/rejected": -4.269105434417725, "step": 993 }, { "epoch": 0.16566666666666666, "grad_norm": 121.27458953857422, "learning_rate": 1.9050108340863043e-07, "logits/chosen": 2.28999400138855, "logits/rejected": 2.3232791423797607, "logps/chosen": -23.276338577270508, "logps/rejected": -19.150785446166992, "loss": 2.426, "nll_loss": 1.939694881439209, "rewards/accuracies": 1.0, "rewards/chosen": 0.11836281418800354, "rewards/margins": 0.849320650100708, "rewards/rejected": -0.7309578657150269, "step": 994 }, { "epoch": 0.16583333333333333, "grad_norm": 30.928678512573242, "learning_rate": 1.9047810806951206e-07, "logits/chosen": 0.4643592834472656, "logits/rejected": 1.134140968322754, "logps/chosen": -55.5565299987793, "logps/rejected": -330.27777099609375, "loss": 0.805, "nll_loss": 0.7824863791465759, "rewards/accuracies": 1.0, "rewards/chosen": 1.9804500341415405, "rewards/margins": 7.080773830413818, "rewards/rejected": -5.100323677062988, "step": 995 }, { "epoch": 0.166, "grad_norm": 39.89521026611328, "learning_rate": 1.9045510636724517e-07, "logits/chosen": 2.5000338554382324, "logits/rejected": 2.5342764854431152, "logps/chosen": -72.3454818725586, "logps/rejected": -187.20550537109375, "loss": 0.8045, "nll_loss": 0.7307624220848083, "rewards/accuracies": 1.0, "rewards/chosen": 1.8210029602050781, "rewards/margins": 3.8084969520568848, "rewards/rejected": -1.9874939918518066, "step": 996 }, { "epoch": 0.16616666666666666, "grad_norm": 31.891550064086914, "learning_rate": 1.9043207830853195e-07, "logits/chosen": 0.7308049201965332, "logits/rejected": 3.083414077758789, "logps/chosen": -8.509577751159668, "logps/rejected": -342.5794677734375, "loss": 0.4019, "nll_loss": 0.35456565022468567, "rewards/accuracies": 1.0, "rewards/chosen": 1.2103122472763062, "rewards/margins": 5.638580799102783, "rewards/rejected": -4.4282684326171875, "step": 997 }, { "epoch": 0.16633333333333333, "grad_norm": 91.93390655517578, "learning_rate": 1.9040902390008214e-07, "logits/chosen": 2.5640370845794678, "logits/rejected": 2.6529886722564697, "logps/chosen": -20.035120010375977, "logps/rejected": -57.90857696533203, "loss": 1.1947, "nll_loss": 0.37102073431015015, "rewards/accuracies": 1.0, "rewards/chosen": 2.8215692043304443, "rewards/margins": 0.7021408081054688, "rewards/rejected": 2.1194283962249756, "step": 998 }, { "epoch": 0.1665, "grad_norm": 37.716678619384766, "learning_rate": 1.9038594314861328e-07, "logits/chosen": 2.4959049224853516, "logits/rejected": 2.6747195720672607, "logps/chosen": -92.72034454345703, "logps/rejected": -167.3793487548828, "loss": 1.382, "nll_loss": 1.3437730073928833, "rewards/accuracies": 1.0, "rewards/chosen": 1.395424723625183, "rewards/margins": 6.363404273986816, "rewards/rejected": -4.967979431152344, "step": 999 }, { "epoch": 0.16666666666666666, "grad_norm": 47.62810516357422, "learning_rate": 1.9036283606085054e-07, "logits/chosen": 2.534977912902832, "logits/rejected": 3.123628854751587, "logps/chosen": -17.71222686767578, "logps/rejected": -50.498538970947266, "loss": 0.7068, "nll_loss": 0.5535070896148682, "rewards/accuracies": 1.0, "rewards/chosen": 1.010549783706665, "rewards/margins": 2.604719400405884, "rewards/rejected": -1.5941696166992188, "step": 1000 }, { "epoch": 0.16683333333333333, "grad_norm": 25.313899993896484, "learning_rate": 1.9033970264352677e-07, "logits/chosen": 2.9705235958099365, "logits/rejected": 3.0998241901397705, "logps/chosen": -98.19183349609375, "logps/rejected": -179.88467407226562, "loss": 0.9132, "nll_loss": 0.8767128586769104, "rewards/accuracies": 1.0, "rewards/chosen": 1.446542501449585, "rewards/margins": 6.424522399902344, "rewards/rejected": -4.977980136871338, "step": 1001 }, { "epoch": 0.167, "grad_norm": 115.0064926147461, "learning_rate": 1.9031654290338253e-07, "logits/chosen": 2.9918198585510254, "logits/rejected": 2.972290515899658, "logps/chosen": -129.4072723388672, "logps/rejected": -127.21280670166016, "loss": 1.6215, "nll_loss": 1.2686985731124878, "rewards/accuracies": 1.0, "rewards/chosen": -0.3079238831996918, "rewards/margins": 1.4484962224960327, "rewards/rejected": -1.7564201354980469, "step": 1002 }, { "epoch": 0.16716666666666666, "grad_norm": 32.66685485839844, "learning_rate": 1.9029335684716595e-07, "logits/chosen": 1.9062490463256836, "logits/rejected": 2.742823600769043, "logps/chosen": -32.060203552246094, "logps/rejected": -395.6708068847656, "loss": 0.8465, "nll_loss": 0.8015050888061523, "rewards/accuracies": 1.0, "rewards/chosen": 1.1916424036026, "rewards/margins": 6.332172870635986, "rewards/rejected": -5.140530586242676, "step": 1003 }, { "epoch": 0.16733333333333333, "grad_norm": 31.351224899291992, "learning_rate": 1.9027014448163295e-07, "logits/chosen": 1.0774930715560913, "logits/rejected": 1.891707181930542, "logps/chosen": -125.9433364868164, "logps/rejected": -333.02496337890625, "loss": 1.2444, "nll_loss": 1.1994603872299194, "rewards/accuracies": 1.0, "rewards/chosen": 1.174695611000061, "rewards/margins": 6.5614333152771, "rewards/rejected": -5.386737823486328, "step": 1004 }, { "epoch": 0.1675, "grad_norm": 45.31418228149414, "learning_rate": 1.9024690581354698e-07, "logits/chosen": 1.3633545637130737, "logits/rejected": 2.2865614891052246, "logps/chosen": -51.2432975769043, "logps/rejected": -207.35940551757812, "loss": 0.8146, "nll_loss": 0.7883585095405579, "rewards/accuracies": 1.0, "rewards/chosen": 1.8619999885559082, "rewards/margins": 6.511441707611084, "rewards/rejected": -4.649441719055176, "step": 1005 }, { "epoch": 0.16766666666666666, "grad_norm": 340.56201171875, "learning_rate": 1.902236408496793e-07, "logits/chosen": 2.68613338470459, "logits/rejected": 2.727433204650879, "logps/chosen": -129.70614624023438, "logps/rejected": -81.4923095703125, "loss": 2.889, "nll_loss": 1.425342321395874, "rewards/accuracies": 0.0, "rewards/chosen": 0.6437745094299316, "rewards/margins": -0.9150536060333252, "rewards/rejected": 1.5588281154632568, "step": 1006 }, { "epoch": 0.16783333333333333, "grad_norm": 55.46914291381836, "learning_rate": 1.9020034959680874e-07, "logits/chosen": 0.4619339406490326, "logits/rejected": 2.1785786151885986, "logps/chosen": -105.808349609375, "logps/rejected": -425.9510498046875, "loss": 1.5085, "nll_loss": 1.4107780456542969, "rewards/accuracies": 1.0, "rewards/chosen": 0.22061920166015625, "rewards/margins": 6.442668914794922, "rewards/rejected": -6.222049713134766, "step": 1007 }, { "epoch": 0.168, "grad_norm": 60.54523468017578, "learning_rate": 1.9017703206172184e-07, "logits/chosen": 2.3521082401275635, "logits/rejected": 2.429508924484253, "logps/chosen": -16.293413162231445, "logps/rejected": -79.65888977050781, "loss": 0.8308, "nll_loss": 0.6266697645187378, "rewards/accuracies": 1.0, "rewards/chosen": 1.1681950092315674, "rewards/margins": 2.193204164505005, "rewards/rejected": -1.0250091552734375, "step": 1008 }, { "epoch": 0.16816666666666666, "grad_norm": 32.3486213684082, "learning_rate": 1.9015368825121272e-07, "logits/chosen": 3.019773483276367, "logits/rejected": 3.026442527770996, "logps/chosen": -57.26177978515625, "logps/rejected": -76.09194946289062, "loss": 0.8581, "nll_loss": 0.8065038919448853, "rewards/accuracies": 1.0, "rewards/chosen": 1.8757638931274414, "rewards/margins": 4.409622669219971, "rewards/rejected": -2.5338587760925293, "step": 1009 }, { "epoch": 0.16833333333333333, "grad_norm": 29.222835540771484, "learning_rate": 1.9013031817208322e-07, "logits/chosen": 1.4326577186584473, "logits/rejected": 2.6776092052459717, "logps/chosen": -79.60234832763672, "logps/rejected": -244.47122192382812, "loss": 0.9031, "nll_loss": 0.874751091003418, "rewards/accuracies": 1.0, "rewards/chosen": 1.7215843200683594, "rewards/margins": 6.730632781982422, "rewards/rejected": -5.0090484619140625, "step": 1010 }, { "epoch": 0.1685, "grad_norm": 35.579532623291016, "learning_rate": 1.9010692183114282e-07, "logits/chosen": 2.9927945137023926, "logits/rejected": 3.1044726371765137, "logps/chosen": -122.69429016113281, "logps/rejected": -193.91732788085938, "loss": 1.1258, "nll_loss": 1.0669070482254028, "rewards/accuracies": 1.0, "rewards/chosen": 1.6416840553283691, "rewards/margins": 4.197327136993408, "rewards/rejected": -2.555643081665039, "step": 1011 }, { "epoch": 0.16866666666666666, "grad_norm": 50.291194915771484, "learning_rate": 1.9008349923520867e-07, "logits/chosen": 3.0378217697143555, "logits/rejected": 2.944371223449707, "logps/chosen": -77.300048828125, "logps/rejected": -62.646331787109375, "loss": 1.1221, "nll_loss": 0.9094124436378479, "rewards/accuracies": 1.0, "rewards/chosen": 1.4521576166152954, "rewards/margins": 2.2034544944763184, "rewards/rejected": -0.7512969970703125, "step": 1012 }, { "epoch": 0.16883333333333334, "grad_norm": 85.63945770263672, "learning_rate": 1.9006005039110554e-07, "logits/chosen": 1.4182559251785278, "logits/rejected": 1.7477891445159912, "logps/chosen": -133.75460815429688, "logps/rejected": -100.22203063964844, "loss": 1.2202, "nll_loss": 0.9288515448570251, "rewards/accuracies": 1.0, "rewards/chosen": 0.9737014770507812, "rewards/margins": 1.6587226390838623, "rewards/rejected": -0.6850212216377258, "step": 1013 }, { "epoch": 0.169, "grad_norm": 61.91297149658203, "learning_rate": 1.9003657530566588e-07, "logits/chosen": -0.11642400175333023, "logits/rejected": 2.4303667545318604, "logps/chosen": -30.2205867767334, "logps/rejected": -636.55224609375, "loss": 1.2958, "nll_loss": 1.2088234424591064, "rewards/accuracies": 1.0, "rewards/chosen": 0.5435419082641602, "rewards/margins": 4.527593612670898, "rewards/rejected": -3.9840517044067383, "step": 1014 }, { "epoch": 0.16916666666666666, "grad_norm": 29.095712661743164, "learning_rate": 1.9001307398572972e-07, "logits/chosen": 2.2253055572509766, "logits/rejected": 2.2657690048217773, "logps/chosen": -122.33560180664062, "logps/rejected": -97.93197631835938, "loss": 1.0153, "nll_loss": 0.9557469487190247, "rewards/accuracies": 1.0, "rewards/chosen": 1.769989013671875, "rewards/margins": 4.157591342926025, "rewards/rejected": -2.3876023292541504, "step": 1015 }, { "epoch": 0.16933333333333334, "grad_norm": 38.8934211730957, "learning_rate": 1.8998954643814483e-07, "logits/chosen": 2.9971225261688232, "logits/rejected": 2.8360753059387207, "logps/chosen": -161.505126953125, "logps/rejected": -81.37892150878906, "loss": 0.944, "nll_loss": 0.8115834593772888, "rewards/accuracies": 1.0, "rewards/chosen": 2.44976806640625, "rewards/margins": 3.2800049781799316, "rewards/rejected": -0.8302369117736816, "step": 1016 }, { "epoch": 0.1695, "grad_norm": 32.41066360473633, "learning_rate": 1.8996599266976655e-07, "logits/chosen": 2.186717987060547, "logits/rejected": 2.4706804752349854, "logps/chosen": -70.18384552001953, "logps/rejected": -167.1309814453125, "loss": 0.6781, "nll_loss": 0.6156477928161621, "rewards/accuracies": 1.0, "rewards/chosen": 0.8864967823028564, "rewards/margins": 5.234513282775879, "rewards/rejected": -4.348016262054443, "step": 1017 }, { "epoch": 0.16966666666666666, "grad_norm": 82.18083190917969, "learning_rate": 1.8994241268745785e-07, "logits/chosen": 2.7538506984710693, "logits/rejected": 2.7926344871520996, "logps/chosen": -28.434391021728516, "logps/rejected": -98.57521057128906, "loss": 1.4584, "nll_loss": 1.3540185689926147, "rewards/accuracies": 1.0, "rewards/chosen": 1.2413276433944702, "rewards/margins": 3.222759962081909, "rewards/rejected": -1.981432318687439, "step": 1018 }, { "epoch": 0.16983333333333334, "grad_norm": 223.78125, "learning_rate": 1.8991880649808942e-07, "logits/chosen": 1.7122317552566528, "logits/rejected": 1.7291607856750488, "logps/chosen": -46.464019775390625, "logps/rejected": -71.59584045410156, "loss": 2.0746, "nll_loss": 0.5105937123298645, "rewards/accuracies": 0.0, "rewards/chosen": 0.6001937985420227, "rewards/margins": -1.0540244579315186, "rewards/rejected": 1.654218316078186, "step": 1019 }, { "epoch": 0.17, "grad_norm": 36.52116394042969, "learning_rate": 1.8989517410853953e-07, "logits/chosen": 2.703688383102417, "logits/rejected": 2.73464298248291, "logps/chosen": -171.0850372314453, "logps/rejected": -175.73927307128906, "loss": 1.066, "nll_loss": 0.9776287078857422, "rewards/accuracies": 1.0, "rewards/chosen": 0.6672470569610596, "rewards/margins": 4.066886901855469, "rewards/rejected": -3.3996400833129883, "step": 1020 }, { "epoch": 0.17016666666666666, "grad_norm": 29.948009490966797, "learning_rate": 1.8987151552569407e-07, "logits/chosen": 2.9763731956481934, "logits/rejected": 2.9033639430999756, "logps/chosen": -72.81841278076172, "logps/rejected": -73.60604095458984, "loss": 0.92, "nll_loss": 0.8467256426811218, "rewards/accuracies": 1.0, "rewards/chosen": 2.3004860877990723, "rewards/margins": 3.978294849395752, "rewards/rejected": -1.6778087615966797, "step": 1021 }, { "epoch": 0.17033333333333334, "grad_norm": 43.71226119995117, "learning_rate": 1.8984783075644662e-07, "logits/chosen": 2.7455880641937256, "logits/rejected": 2.9477760791778564, "logps/chosen": -58.09639358520508, "logps/rejected": -104.01637268066406, "loss": 0.768, "nll_loss": 0.645515501499176, "rewards/accuracies": 1.0, "rewards/chosen": 1.0503323078155518, "rewards/margins": 2.9713268280029297, "rewards/rejected": -1.9209946393966675, "step": 1022 }, { "epoch": 0.1705, "grad_norm": 50.71647644042969, "learning_rate": 1.8982411980769827e-07, "logits/chosen": 2.7770638465881348, "logits/rejected": 2.928516387939453, "logps/chosen": -83.59587097167969, "logps/rejected": -258.1645202636719, "loss": 0.8873, "nll_loss": 0.7531160116195679, "rewards/accuracies": 1.0, "rewards/chosen": 0.6005393862724304, "rewards/margins": 2.9530937671661377, "rewards/rejected": -2.3525543212890625, "step": 1023 }, { "epoch": 0.17066666666666666, "grad_norm": 31.5563907623291, "learning_rate": 1.8980038268635795e-07, "logits/chosen": 2.3689029216766357, "logits/rejected": 1.8551230430603027, "logps/chosen": -57.69034957885742, "logps/rejected": -39.69997787475586, "loss": 0.9182, "nll_loss": 0.8610501289367676, "rewards/accuracies": 1.0, "rewards/chosen": 3.0953941345214844, "rewards/margins": 4.774832248687744, "rewards/rejected": -1.6794382333755493, "step": 1024 }, { "epoch": 0.17083333333333334, "grad_norm": 45.127586364746094, "learning_rate": 1.8977661939934198e-07, "logits/chosen": 2.3931655883789062, "logits/rejected": 3.049067497253418, "logps/chosen": -43.802894592285156, "logps/rejected": -150.05186462402344, "loss": 0.9793, "nll_loss": 0.8939366340637207, "rewards/accuracies": 1.0, "rewards/chosen": 0.4407837390899658, "rewards/margins": 5.42801570892334, "rewards/rejected": -4.987231731414795, "step": 1025 }, { "epoch": 0.171, "grad_norm": 29.071996688842773, "learning_rate": 1.8975282995357445e-07, "logits/chosen": 2.9515788555145264, "logits/rejected": 2.9345757961273193, "logps/chosen": -72.80184936523438, "logps/rejected": -74.92134094238281, "loss": 0.9131, "nll_loss": 0.8465331196784973, "rewards/accuracies": 1.0, "rewards/chosen": 2.302142381668091, "rewards/margins": 4.111481189727783, "rewards/rejected": -1.8093388080596924, "step": 1026 }, { "epoch": 0.17116666666666666, "grad_norm": 71.98246765136719, "learning_rate": 1.8972901435598703e-07, "logits/chosen": 2.6551077365875244, "logits/rejected": 2.3340296745300293, "logps/chosen": -136.11083984375, "logps/rejected": -18.66733169555664, "loss": 0.8973, "nll_loss": 0.5488340258598328, "rewards/accuracies": 1.0, "rewards/chosen": 1.3231353759765625, "rewards/margins": 1.4903957843780518, "rewards/rejected": -0.16726037859916687, "step": 1027 }, { "epoch": 0.17133333333333334, "grad_norm": 60.99420166015625, "learning_rate": 1.89705172613519e-07, "logits/chosen": 2.52893328666687, "logits/rejected": 3.421461582183838, "logps/chosen": -90.26884460449219, "logps/rejected": -471.75213623046875, "loss": 1.415, "nll_loss": 1.3082441091537476, "rewards/accuracies": 1.0, "rewards/chosen": 0.5158790946006775, "rewards/margins": 3.6537272930145264, "rewards/rejected": -3.137848138809204, "step": 1028 }, { "epoch": 0.1715, "grad_norm": 67.82925415039062, "learning_rate": 1.896813047331173e-07, "logits/chosen": 1.1238563060760498, "logits/rejected": 2.472385883331299, "logps/chosen": -59.766571044921875, "logps/rejected": -373.6268310546875, "loss": 1.5373, "nll_loss": 1.4230138063430786, "rewards/accuracies": 1.0, "rewards/chosen": 0.5614174008369446, "rewards/margins": 3.3782694339752197, "rewards/rejected": -2.81685209274292, "step": 1029 }, { "epoch": 0.17166666666666666, "grad_norm": 22.72157859802246, "learning_rate": 1.8965741072173644e-07, "logits/chosen": 2.8678596019744873, "logits/rejected": 2.9361891746520996, "logps/chosen": -95.005615234375, "logps/rejected": -158.02609252929688, "loss": 0.7062, "nll_loss": 0.6834936141967773, "rewards/accuracies": 1.0, "rewards/chosen": 1.9814598560333252, "rewards/margins": 6.9048566818237305, "rewards/rejected": -4.923396587371826, "step": 1030 }, { "epoch": 0.17183333333333334, "grad_norm": 23.72401237487793, "learning_rate": 1.896334905863386e-07, "logits/chosen": 2.803133249282837, "logits/rejected": 2.9048655033111572, "logps/chosen": -63.86552810668945, "logps/rejected": -174.18601989746094, "loss": 0.6995, "nll_loss": 0.6652659773826599, "rewards/accuracies": 1.0, "rewards/chosen": 1.852352499961853, "rewards/margins": 5.3438873291015625, "rewards/rejected": -3.49153470993042, "step": 1031 }, { "epoch": 0.172, "grad_norm": 24.195697784423828, "learning_rate": 1.8960954433389345e-07, "logits/chosen": 3.00626277923584, "logits/rejected": 3.0366334915161133, "logps/chosen": -18.052906036376953, "logps/rejected": -49.847137451171875, "loss": 0.4072, "nll_loss": 0.3112569749355316, "rewards/accuracies": 1.0, "rewards/chosen": 0.9717289209365845, "rewards/margins": 3.465399742126465, "rewards/rejected": -2.49367094039917, "step": 1032 }, { "epoch": 0.17216666666666666, "grad_norm": 68.67825317382812, "learning_rate": 1.895855719713784e-07, "logits/chosen": 2.8506031036376953, "logits/rejected": 2.9638988971710205, "logps/chosen": -33.7744255065918, "logps/rejected": -131.4040069580078, "loss": 1.301, "nll_loss": 1.0554509162902832, "rewards/accuracies": 1.0, "rewards/chosen": -0.3090406358242035, "rewards/margins": 2.2121853828430176, "rewards/rejected": -2.521225929260254, "step": 1033 }, { "epoch": 0.17233333333333334, "grad_norm": 34.6162109375, "learning_rate": 1.8956157350577846e-07, "logits/chosen": 0.59429532289505, "logits/rejected": 2.315622568130493, "logps/chosen": -31.104650497436523, "logps/rejected": -393.3369140625, "loss": 0.6263, "nll_loss": 0.5760120153427124, "rewards/accuracies": 1.0, "rewards/chosen": 0.9850039482116699, "rewards/margins": 8.312531471252441, "rewards/rejected": -7.3275275230407715, "step": 1034 }, { "epoch": 0.1725, "grad_norm": 32.99842834472656, "learning_rate": 1.8953754894408616e-07, "logits/chosen": 2.828554630279541, "logits/rejected": 2.865349292755127, "logps/chosen": -29.06735610961914, "logps/rejected": -199.96994018554688, "loss": 0.5197, "nll_loss": 0.46882832050323486, "rewards/accuracies": 1.0, "rewards/chosen": 1.1565430164337158, "rewards/margins": 5.303799629211426, "rewards/rejected": -4.147256374359131, "step": 1035 }, { "epoch": 0.17266666666666666, "grad_norm": 33.06748962402344, "learning_rate": 1.8951349829330167e-07, "logits/chosen": 2.0058703422546387, "logits/rejected": 2.701395034790039, "logps/chosen": -31.262662887573242, "logps/rejected": -256.9765625, "loss": 0.6731, "nll_loss": 0.601205050945282, "rewards/accuracies": 1.0, "rewards/chosen": 1.0431784391403198, "rewards/margins": 4.124544620513916, "rewards/rejected": -3.0813660621643066, "step": 1036 }, { "epoch": 0.17283333333333334, "grad_norm": 28.856136322021484, "learning_rate": 1.894894215604328e-07, "logits/chosen": 4.07179594039917, "logits/rejected": 4.378378391265869, "logps/chosen": -41.90806579589844, "logps/rejected": -246.9132537841797, "loss": 0.6693, "nll_loss": 0.6349707245826721, "rewards/accuracies": 1.0, "rewards/chosen": 1.4795440435409546, "rewards/margins": 6.67534065246582, "rewards/rejected": -5.195796489715576, "step": 1037 }, { "epoch": 0.173, "grad_norm": 56.23735427856445, "learning_rate": 1.894653187524949e-07, "logits/chosen": 2.8235576152801514, "logits/rejected": 2.7411246299743652, "logps/chosen": -23.362993240356445, "logps/rejected": -33.7877082824707, "loss": 0.9376, "nll_loss": 0.778766393661499, "rewards/accuracies": 1.0, "rewards/chosen": 0.9785609245300293, "rewards/margins": 2.548534393310547, "rewards/rejected": -1.569973349571228, "step": 1038 }, { "epoch": 0.17316666666666666, "grad_norm": 30.41145896911621, "learning_rate": 1.89441189876511e-07, "logits/chosen": 2.015023708343506, "logits/rejected": 2.0143375396728516, "logps/chosen": -161.60043334960938, "logps/rejected": -119.88058471679688, "loss": 0.7372, "nll_loss": 0.6622968912124634, "rewards/accuracies": 1.0, "rewards/chosen": 1.293257236480713, "rewards/margins": 3.82804274559021, "rewards/rejected": -2.534785509109497, "step": 1039 }, { "epoch": 0.17333333333333334, "grad_norm": 44.32216262817383, "learning_rate": 1.8941703493951163e-07, "logits/chosen": 3.073668956756592, "logits/rejected": 3.200277805328369, "logps/chosen": -46.626224517822266, "logps/rejected": -140.32464599609375, "loss": 1.0355, "nll_loss": 0.9713796973228455, "rewards/accuracies": 1.0, "rewards/chosen": 1.1154381036758423, "rewards/margins": 4.37911319732666, "rewards/rejected": -3.2636749744415283, "step": 1040 }, { "epoch": 0.1735, "grad_norm": 29.815750122070312, "learning_rate": 1.89392853948535e-07, "logits/chosen": 1.2316973209381104, "logits/rejected": 2.2668991088867188, "logps/chosen": -68.95769500732422, "logps/rejected": -258.9392395019531, "loss": 0.948, "nll_loss": 0.9073382019996643, "rewards/accuracies": 1.0, "rewards/chosen": 1.5440704822540283, "rewards/margins": 5.236724853515625, "rewards/rejected": -3.6926543712615967, "step": 1041 }, { "epoch": 0.17366666666666666, "grad_norm": 72.11442565917969, "learning_rate": 1.8936864691062682e-07, "logits/chosen": 1.9827160835266113, "logits/rejected": 2.3413124084472656, "logps/chosen": -10.962726593017578, "logps/rejected": -218.04978942871094, "loss": 0.5539, "nll_loss": 0.2884927988052368, "rewards/accuracies": 1.0, "rewards/chosen": 0.5674301385879517, "rewards/margins": 1.762159824371338, "rewards/rejected": -1.1947296857833862, "step": 1042 }, { "epoch": 0.17383333333333334, "grad_norm": 39.7937126159668, "learning_rate": 1.8934441383284048e-07, "logits/chosen": 2.4189767837524414, "logits/rejected": 2.602006673812866, "logps/chosen": -78.02076721191406, "logps/rejected": -131.91595458984375, "loss": 0.9363, "nll_loss": 0.848051905632019, "rewards/accuracies": 1.0, "rewards/chosen": 1.5062134265899658, "rewards/margins": 3.4940733909606934, "rewards/rejected": -1.987860083580017, "step": 1043 }, { "epoch": 0.174, "grad_norm": 41.10580825805664, "learning_rate": 1.893201547222369e-07, "logits/chosen": 3.0125157833099365, "logits/rejected": 2.901085615158081, "logps/chosen": -32.223934173583984, "logps/rejected": -45.555908203125, "loss": 0.763, "nll_loss": 0.671332061290741, "rewards/accuracies": 1.0, "rewards/chosen": 1.4364705085754395, "rewards/margins": 3.430558681488037, "rewards/rejected": -1.9940881729125977, "step": 1044 }, { "epoch": 0.17416666666666666, "grad_norm": 18.14354133605957, "learning_rate": 1.8929586958588462e-07, "logits/chosen": 2.304004430770874, "logits/rejected": 2.299171209335327, "logps/chosen": -166.27540588378906, "logps/rejected": -152.3287811279297, "loss": 0.6978, "nll_loss": 0.6598230004310608, "rewards/accuracies": 1.0, "rewards/chosen": 1.9842301607131958, "rewards/margins": 4.985782623291016, "rewards/rejected": -3.0015525817871094, "step": 1045 }, { "epoch": 0.17433333333333334, "grad_norm": 308.0206604003906, "learning_rate": 1.892715584308597e-07, "logits/chosen": 2.971033811569214, "logits/rejected": 3.019376754760742, "logps/chosen": -97.60433197021484, "logps/rejected": -23.205089569091797, "loss": 4.2184, "nll_loss": 1.9520866870880127, "rewards/accuracies": 0.0, "rewards/chosen": -1.8459175825119019, "rewards/margins": -2.032512664794922, "rewards/rejected": 0.18659496307373047, "step": 1046 }, { "epoch": 0.1745, "grad_norm": 40.248538970947266, "learning_rate": 1.8924722126424589e-07, "logits/chosen": 2.4195754528045654, "logits/rejected": 2.6544153690338135, "logps/chosen": -46.18922805786133, "logps/rejected": -288.0115051269531, "loss": 0.8391, "nll_loss": 0.7698204517364502, "rewards/accuracies": 1.0, "rewards/chosen": 0.8896583914756775, "rewards/margins": 4.537182331085205, "rewards/rejected": -3.647523880004883, "step": 1047 }, { "epoch": 0.17466666666666666, "grad_norm": 29.823657989501953, "learning_rate": 1.892228580931344e-07, "logits/chosen": 1.7980132102966309, "logits/rejected": 2.2741928100585938, "logps/chosen": -33.35880661010742, "logps/rejected": -157.0328826904297, "loss": 0.5798, "nll_loss": 0.5054365396499634, "rewards/accuracies": 1.0, "rewards/chosen": 1.0459625720977783, "rewards/margins": 4.0191545486450195, "rewards/rejected": -2.973191976547241, "step": 1048 }, { "epoch": 0.17483333333333334, "grad_norm": 173.6779327392578, "learning_rate": 1.8919846892462413e-07, "logits/chosen": 3.0012717247009277, "logits/rejected": 2.9199304580688477, "logps/chosen": -39.18021011352539, "logps/rejected": -86.95240020751953, "loss": 1.6328, "nll_loss": 0.8336215615272522, "rewards/accuracies": 1.0, "rewards/chosen": -0.4055057466030121, "rewards/margins": 0.06882289052009583, "rewards/rejected": -0.4743286371231079, "step": 1049 }, { "epoch": 0.175, "grad_norm": 30.518728256225586, "learning_rate": 1.8917405376582142e-07, "logits/chosen": 2.4753057956695557, "logits/rejected": 2.301678419113159, "logps/chosen": -147.02430725097656, "logps/rejected": -81.18826293945312, "loss": 1.1409, "nll_loss": 1.1138205528259277, "rewards/accuracies": 1.0, "rewards/chosen": 1.9067214727401733, "rewards/margins": 6.0827317237854, "rewards/rejected": -4.1760101318359375, "step": 1050 }, { "epoch": 0.17516666666666666, "grad_norm": 35.29458236694336, "learning_rate": 1.8914961262384034e-07, "logits/chosen": 2.3772783279418945, "logits/rejected": 2.343191623687744, "logps/chosen": -19.433696746826172, "logps/rejected": -68.12138366699219, "loss": 0.5741, "nll_loss": 0.4416749179363251, "rewards/accuracies": 1.0, "rewards/chosen": 1.0458459854125977, "rewards/margins": 2.8382723331451416, "rewards/rejected": -1.792426347732544, "step": 1051 }, { "epoch": 0.17533333333333334, "grad_norm": 21.644681930541992, "learning_rate": 1.891251455058024e-07, "logits/chosen": 1.3872599601745605, "logits/rejected": 1.3242088556289673, "logps/chosen": -258.5147705078125, "logps/rejected": -290.694091796875, "loss": 0.7234, "nll_loss": 0.687539279460907, "rewards/accuracies": 1.0, "rewards/chosen": 1.3892364501953125, "rewards/margins": 7.117771148681641, "rewards/rejected": -5.728534698486328, "step": 1052 }, { "epoch": 0.1755, "grad_norm": 34.49214553833008, "learning_rate": 1.8910065241883678e-07, "logits/chosen": 1.436484456062317, "logits/rejected": 1.5650019645690918, "logps/chosen": -42.89957809448242, "logps/rejected": -70.79782104492188, "loss": 0.6408, "nll_loss": 0.5430325865745544, "rewards/accuracies": 1.0, "rewards/chosen": 0.7357742786407471, "rewards/margins": 3.597691535949707, "rewards/rejected": -2.86191725730896, "step": 1053 }, { "epoch": 0.17566666666666667, "grad_norm": 116.35106658935547, "learning_rate": 1.8907613337008013e-07, "logits/chosen": 3.000005006790161, "logits/rejected": 3.208859920501709, "logps/chosen": -33.367576599121094, "logps/rejected": -217.3006134033203, "loss": 2.3396, "nll_loss": 2.2245051860809326, "rewards/accuracies": 1.0, "rewards/chosen": 0.1424354612827301, "rewards/margins": 4.3035759925842285, "rewards/rejected": -4.161140441894531, "step": 1054 }, { "epoch": 0.17583333333333334, "grad_norm": 37.99162673950195, "learning_rate": 1.8905158836667673e-07, "logits/chosen": 3.204169511795044, "logits/rejected": 3.289006471633911, "logps/chosen": -96.84446716308594, "logps/rejected": -175.74058532714844, "loss": 1.0699, "nll_loss": 1.0194154977798462, "rewards/accuracies": 1.0, "rewards/chosen": 1.7611252069473267, "rewards/margins": 4.4667253494262695, "rewards/rejected": -2.7056000232696533, "step": 1055 }, { "epoch": 0.176, "grad_norm": 43.539703369140625, "learning_rate": 1.890270174157784e-07, "logits/chosen": 2.832749366760254, "logits/rejected": 2.8076958656311035, "logps/chosen": -72.82170867919922, "logps/rejected": -194.1104736328125, "loss": 1.0899, "nll_loss": 0.9840771555900574, "rewards/accuracies": 1.0, "rewards/chosen": 0.10912628471851349, "rewards/margins": 5.830931186676025, "rewards/rejected": -5.721805095672607, "step": 1056 }, { "epoch": 0.17616666666666667, "grad_norm": 21.07243537902832, "learning_rate": 1.8900242052454453e-07, "logits/chosen": 2.7930352687835693, "logits/rejected": 2.711665153503418, "logps/chosen": -150.67962646484375, "logps/rejected": -110.13124084472656, "loss": 0.7524, "nll_loss": 0.7244213819503784, "rewards/accuracies": 1.0, "rewards/chosen": 2.018707275390625, "rewards/margins": 5.725946426391602, "rewards/rejected": -3.7072391510009766, "step": 1057 }, { "epoch": 0.17633333333333334, "grad_norm": 53.49617385864258, "learning_rate": 1.8897779770014207e-07, "logits/chosen": 2.5542776584625244, "logits/rejected": 2.4211668968200684, "logps/chosen": -132.0274658203125, "logps/rejected": -19.34422492980957, "loss": 0.8062, "nll_loss": 0.5323688983917236, "rewards/accuracies": 1.0, "rewards/chosen": 1.7314729690551758, "rewards/margins": 1.9664226770401, "rewards/rejected": -0.23494970798492432, "step": 1058 }, { "epoch": 0.1765, "grad_norm": 32.60660171508789, "learning_rate": 1.889531489497455e-07, "logits/chosen": 2.6851918697357178, "logits/rejected": 2.5138564109802246, "logps/chosen": -182.9290771484375, "logps/rejected": -139.6507568359375, "loss": 0.8474, "nll_loss": 0.7851032614707947, "rewards/accuracies": 1.0, "rewards/chosen": 1.3867462873458862, "rewards/margins": 4.175059795379639, "rewards/rejected": -2.788313388824463, "step": 1059 }, { "epoch": 0.17666666666666667, "grad_norm": 38.55118179321289, "learning_rate": 1.889284742805369e-07, "logits/chosen": 2.9630801677703857, "logits/rejected": 3.1163864135742188, "logps/chosen": -84.29753875732422, "logps/rejected": -215.09283447265625, "loss": 0.918, "nll_loss": 0.818422794342041, "rewards/accuracies": 1.0, "rewards/chosen": 0.4980247914791107, "rewards/margins": 3.902101993560791, "rewards/rejected": -3.4040772914886475, "step": 1060 }, { "epoch": 0.17683333333333334, "grad_norm": 35.123966217041016, "learning_rate": 1.8890377369970584e-07, "logits/chosen": 3.1662511825561523, "logits/rejected": 3.292344331741333, "logps/chosen": -72.26889038085938, "logps/rejected": -264.61639404296875, "loss": 1.068, "nll_loss": 1.017871618270874, "rewards/accuracies": 1.0, "rewards/chosen": 1.2346595525741577, "rewards/margins": 5.058902740478516, "rewards/rejected": -3.8242433071136475, "step": 1061 }, { "epoch": 0.177, "grad_norm": 55.094486236572266, "learning_rate": 1.8887904721444952e-07, "logits/chosen": 3.0379130840301514, "logits/rejected": 2.955899715423584, "logps/chosen": -46.610389709472656, "logps/rejected": -114.33473205566406, "loss": 0.8262, "nll_loss": 0.6564842462539673, "rewards/accuracies": 1.0, "rewards/chosen": 1.0077232122421265, "rewards/margins": 2.447927951812744, "rewards/rejected": -1.4402046203613281, "step": 1062 }, { "epoch": 0.17716666666666667, "grad_norm": 28.147493362426758, "learning_rate": 1.8885429483197263e-07, "logits/chosen": 1.5454466342926025, "logits/rejected": 2.1970083713531494, "logps/chosen": -108.64176177978516, "logps/rejected": -408.27874755859375, "loss": 0.9044, "nll_loss": 0.8554468750953674, "rewards/accuracies": 1.0, "rewards/chosen": 1.1325501203536987, "rewards/margins": 5.623632907867432, "rewards/rejected": -4.491082668304443, "step": 1063 }, { "epoch": 0.17733333333333334, "grad_norm": 139.8994903564453, "learning_rate": 1.888295165594874e-07, "logits/chosen": 2.3411543369293213, "logits/rejected": 2.1883482933044434, "logps/chosen": -63.59952926635742, "logps/rejected": -29.91054916381836, "loss": 2.0875, "nll_loss": 1.19999098777771, "rewards/accuracies": 1.0, "rewards/chosen": 1.0041691064834595, "rewards/margins": 0.04175609350204468, "rewards/rejected": 0.9624130129814148, "step": 1064 }, { "epoch": 0.1775, "grad_norm": 38.28388214111328, "learning_rate": 1.8880471240421364e-07, "logits/chosen": 1.3550151586532593, "logits/rejected": 2.1027536392211914, "logps/chosen": -50.86219787597656, "logps/rejected": -140.54217529296875, "loss": 0.854, "nll_loss": 0.8203579187393188, "rewards/accuracies": 1.0, "rewards/chosen": 1.5801124572753906, "rewards/margins": 6.014656066894531, "rewards/rejected": -4.434543609619141, "step": 1065 }, { "epoch": 0.17766666666666667, "grad_norm": 35.36601638793945, "learning_rate": 1.8877988237337868e-07, "logits/chosen": 1.6215254068374634, "logits/rejected": 2.1620066165924072, "logps/chosen": -43.70394515991211, "logps/rejected": -288.2537536621094, "loss": 0.6371, "nll_loss": 0.6155485510826111, "rewards/accuracies": 1.0, "rewards/chosen": 1.9437848329544067, "rewards/margins": 7.623560905456543, "rewards/rejected": -5.679776191711426, "step": 1066 }, { "epoch": 0.17783333333333334, "grad_norm": 42.58735656738281, "learning_rate": 1.887550264742174e-07, "logits/chosen": 2.0913450717926025, "logits/rejected": 2.177475929260254, "logps/chosen": -120.31710052490234, "logps/rejected": -261.3146667480469, "loss": 1.3139, "nll_loss": 1.227725625038147, "rewards/accuracies": 1.0, "rewards/chosen": 0.4560203552246094, "rewards/margins": 4.913188934326172, "rewards/rejected": -4.4571685791015625, "step": 1067 }, { "epoch": 0.178, "grad_norm": 43.20032501220703, "learning_rate": 1.8873014471397221e-07, "logits/chosen": 2.6648402214050293, "logits/rejected": 2.6985538005828857, "logps/chosen": -113.57406616210938, "logps/rejected": -121.99005126953125, "loss": 1.1402, "nll_loss": 1.0614399909973145, "rewards/accuracies": 1.0, "rewards/chosen": 1.0736297369003296, "rewards/margins": 3.8318581581115723, "rewards/rejected": -2.7582285404205322, "step": 1068 }, { "epoch": 0.17816666666666667, "grad_norm": 102.5638656616211, "learning_rate": 1.8870523709989306e-07, "logits/chosen": 2.855215549468994, "logits/rejected": 3.2222073078155518, "logps/chosen": -28.972450256347656, "logps/rejected": -292.6793212890625, "loss": 2.1573, "nll_loss": 2.0694611072540283, "rewards/accuracies": 1.0, "rewards/chosen": 0.39278340339660645, "rewards/margins": 5.193698883056641, "rewards/rejected": -4.800915718078613, "step": 1069 }, { "epoch": 0.17833333333333334, "grad_norm": 92.6780776977539, "learning_rate": 1.8868030363923743e-07, "logits/chosen": 2.613614559173584, "logits/rejected": 2.967963933944702, "logps/chosen": -47.96114730834961, "logps/rejected": -203.44491577148438, "loss": 1.7898, "nll_loss": 1.7128978967666626, "rewards/accuracies": 1.0, "rewards/chosen": 0.8846885561943054, "rewards/margins": 4.109648704528809, "rewards/rejected": -3.2249603271484375, "step": 1070 }, { "epoch": 0.1785, "grad_norm": 34.515869140625, "learning_rate": 1.8865534433927033e-07, "logits/chosen": 2.2992188930511475, "logits/rejected": 2.33240008354187, "logps/chosen": -66.87294006347656, "logps/rejected": -57.835174560546875, "loss": 0.8924, "nll_loss": 0.8255918622016907, "rewards/accuracies": 1.0, "rewards/chosen": 1.7599091529846191, "rewards/margins": 3.9630696773529053, "rewards/rejected": -2.203160524368286, "step": 1071 }, { "epoch": 0.17866666666666667, "grad_norm": 29.205223083496094, "learning_rate": 1.886303592072643e-07, "logits/chosen": 2.130986452102661, "logits/rejected": 2.5494210720062256, "logps/chosen": -75.59349060058594, "logps/rejected": -225.4141387939453, "loss": 0.6536, "nll_loss": 0.5859960913658142, "rewards/accuracies": 1.0, "rewards/chosen": 0.8753364682197571, "rewards/margins": 4.644265174865723, "rewards/rejected": -3.7689285278320312, "step": 1072 }, { "epoch": 0.17883333333333334, "grad_norm": 44.00325393676758, "learning_rate": 1.886053482504994e-07, "logits/chosen": 2.044729471206665, "logits/rejected": 2.8772339820861816, "logps/chosen": -37.664955139160156, "logps/rejected": -388.81134033203125, "loss": 1.3294, "nll_loss": 1.298791766166687, "rewards/accuracies": 1.0, "rewards/chosen": 1.5695244073867798, "rewards/margins": 6.959075450897217, "rewards/rejected": -5.389551162719727, "step": 1073 }, { "epoch": 0.179, "grad_norm": 28.089475631713867, "learning_rate": 1.8858031147626323e-07, "logits/chosen": 2.8444526195526123, "logits/rejected": 2.6786742210388184, "logps/chosen": -35.28279495239258, "logps/rejected": -82.69473266601562, "loss": 0.6437, "nll_loss": 0.6189964413642883, "rewards/accuracies": 1.0, "rewards/chosen": 2.453831911087036, "rewards/margins": 5.7419915199279785, "rewards/rejected": -3.2881596088409424, "step": 1074 }, { "epoch": 0.17916666666666667, "grad_norm": 33.47237777709961, "learning_rate": 1.8855524889185093e-07, "logits/chosen": 2.3546359539031982, "logits/rejected": 2.701179027557373, "logps/chosen": -46.04350280761719, "logps/rejected": -179.41864013671875, "loss": 0.8446, "nll_loss": 0.8222053647041321, "rewards/accuracies": 1.0, "rewards/chosen": 2.1429078578948975, "rewards/margins": 6.286791801452637, "rewards/rejected": -4.143884181976318, "step": 1075 }, { "epoch": 0.17933333333333334, "grad_norm": 58.7552604675293, "learning_rate": 1.885301605045651e-07, "logits/chosen": 2.738802194595337, "logits/rejected": 2.799330472946167, "logps/chosen": -119.21965026855469, "logps/rejected": -57.20488739013672, "loss": 1.7035, "nll_loss": 1.4718475341796875, "rewards/accuracies": 1.0, "rewards/chosen": 1.3895981311798096, "rewards/margins": 2.0795204639434814, "rewards/rejected": -0.6899223327636719, "step": 1076 }, { "epoch": 0.1795, "grad_norm": 72.16417694091797, "learning_rate": 1.8850504632171589e-07, "logits/chosen": 2.3471145629882812, "logits/rejected": 2.7120814323425293, "logps/chosen": -27.410383224487305, "logps/rejected": -204.7562255859375, "loss": 1.1663, "nll_loss": 1.0542454719543457, "rewards/accuracies": 1.0, "rewards/chosen": 0.8485075235366821, "rewards/margins": 3.1833267211914062, "rewards/rejected": -2.3348190784454346, "step": 1077 }, { "epoch": 0.17966666666666667, "grad_norm": 51.27765655517578, "learning_rate": 1.8847990635062096e-07, "logits/chosen": 2.153233528137207, "logits/rejected": 2.4369351863861084, "logps/chosen": -59.603511810302734, "logps/rejected": -229.496826171875, "loss": 1.2099, "nll_loss": 1.0276466608047485, "rewards/accuracies": 1.0, "rewards/chosen": -0.49221497774124146, "rewards/margins": 3.6845614910125732, "rewards/rejected": -4.17677640914917, "step": 1078 }, { "epoch": 0.17983333333333335, "grad_norm": 38.58438491821289, "learning_rate": 1.8845474059860558e-07, "logits/chosen": 0.7133589386940002, "logits/rejected": 2.211242198944092, "logps/chosen": -20.81273651123047, "logps/rejected": -345.04248046875, "loss": 0.6224, "nll_loss": 0.5477035641670227, "rewards/accuracies": 1.0, "rewards/chosen": 0.48292484879493713, "rewards/margins": 11.153250694274902, "rewards/rejected": -10.670326232910156, "step": 1079 }, { "epoch": 0.18, "grad_norm": 48.99235534667969, "learning_rate": 1.8842954907300233e-07, "logits/chosen": 1.173560380935669, "logits/rejected": 2.248270034790039, "logps/chosen": -71.727783203125, "logps/rejected": -267.7385559082031, "loss": 1.3799, "nll_loss": 1.3282923698425293, "rewards/accuracies": 1.0, "rewards/chosen": 0.9548508524894714, "rewards/margins": 6.660991191864014, "rewards/rejected": -5.706140518188477, "step": 1080 }, { "epoch": 0.18016666666666667, "grad_norm": 42.48307800292969, "learning_rate": 1.8840433178115153e-07, "logits/chosen": 2.940560817718506, "logits/rejected": 3.1008427143096924, "logps/chosen": -44.04608917236328, "logps/rejected": -298.303466796875, "loss": 0.8959, "nll_loss": 0.8156681656837463, "rewards/accuracies": 1.0, "rewards/chosen": 0.5231094360351562, "rewards/margins": 5.037086486816406, "rewards/rejected": -4.51397705078125, "step": 1081 }, { "epoch": 0.18033333333333335, "grad_norm": 44.83076477050781, "learning_rate": 1.8837908873040077e-07, "logits/chosen": 1.4263267517089844, "logits/rejected": 2.59391713142395, "logps/chosen": -77.22128295898438, "logps/rejected": -251.42196655273438, "loss": 1.5112, "nll_loss": 1.4850246906280518, "rewards/accuracies": 1.0, "rewards/chosen": 1.8246934413909912, "rewards/margins": 6.468742370605469, "rewards/rejected": -4.644049167633057, "step": 1082 }, { "epoch": 0.1805, "grad_norm": 788.0984497070312, "learning_rate": 1.883538199281054e-07, "logits/chosen": 3.6602485179901123, "logits/rejected": 3.5581278800964355, "logps/chosen": -79.84246826171875, "logps/rejected": -108.13594818115234, "loss": 5.4918, "nll_loss": 2.8515169620513916, "rewards/accuracies": 0.0, "rewards/chosen": -1.2936525344848633, "rewards/margins": -2.471015453338623, "rewards/rejected": 1.1773627996444702, "step": 1083 }, { "epoch": 0.18066666666666667, "grad_norm": 71.411376953125, "learning_rate": 1.88328525381628e-07, "logits/chosen": 2.5709009170532227, "logits/rejected": 3.115217685699463, "logps/chosen": -71.35951232910156, "logps/rejected": -363.7798767089844, "loss": 1.5281, "nll_loss": 1.4866565465927124, "rewards/accuracies": 1.0, "rewards/chosen": 1.5326988697052002, "rewards/margins": 5.126952171325684, "rewards/rejected": -3.5942535400390625, "step": 1084 }, { "epoch": 0.18083333333333335, "grad_norm": 559.8289184570312, "learning_rate": 1.8830320509833895e-07, "logits/chosen": 4.190971851348877, "logits/rejected": 3.1855854988098145, "logps/chosen": -178.09202575683594, "logps/rejected": -116.32451629638672, "loss": 1.7084, "nll_loss": 1.2905218601226807, "rewards/accuracies": 1.0, "rewards/chosen": -0.5963546633720398, "rewards/margins": 1.2110276222229004, "rewards/rejected": -1.8073822259902954, "step": 1085 }, { "epoch": 0.181, "grad_norm": 48.82075500488281, "learning_rate": 1.8827785908561582e-07, "logits/chosen": 0.6978514790534973, "logits/rejected": 2.1125221252441406, "logps/chosen": -58.81232452392578, "logps/rejected": -234.66189575195312, "loss": 0.9746, "nll_loss": 0.8523523807525635, "rewards/accuracies": 1.0, "rewards/chosen": 0.4127868711948395, "rewards/margins": 3.3059847354888916, "rewards/rejected": -2.893197774887085, "step": 1086 }, { "epoch": 0.18116666666666667, "grad_norm": 43.524810791015625, "learning_rate": 1.8825248735084394e-07, "logits/chosen": 2.1863479614257812, "logits/rejected": 2.3756818771362305, "logps/chosen": -25.383743286132812, "logps/rejected": -52.57654571533203, "loss": 0.781, "nll_loss": 0.6508652567863464, "rewards/accuracies": 1.0, "rewards/chosen": 1.167833685874939, "rewards/margins": 2.8643558025360107, "rewards/rejected": -1.6965221166610718, "step": 1087 }, { "epoch": 0.18133333333333335, "grad_norm": 84.71383666992188, "learning_rate": 1.8822708990141597e-07, "logits/chosen": 3.2022407054901123, "logits/rejected": 2.867647886276245, "logps/chosen": -181.34716796875, "logps/rejected": -80.90681457519531, "loss": 1.4043, "nll_loss": 1.1624819040298462, "rewards/accuracies": 1.0, "rewards/chosen": 0.621234118938446, "rewards/margins": 1.902787685394287, "rewards/rejected": -1.2815536260604858, "step": 1088 }, { "epoch": 0.1815, "grad_norm": 52.20988464355469, "learning_rate": 1.8820166674473216e-07, "logits/chosen": 3.0129337310791016, "logits/rejected": 3.0687367916107178, "logps/chosen": -20.561695098876953, "logps/rejected": -101.10670471191406, "loss": 1.0455, "nll_loss": 0.8939867615699768, "rewards/accuracies": 1.0, "rewards/chosen": 0.7063609957695007, "rewards/margins": 2.6545681953430176, "rewards/rejected": -1.948207139968872, "step": 1089 }, { "epoch": 0.18166666666666667, "grad_norm": 62.369590759277344, "learning_rate": 1.8817621788820015e-07, "logits/chosen": 2.052117109298706, "logits/rejected": 2.1333601474761963, "logps/chosen": -35.45427703857422, "logps/rejected": -86.80135345458984, "loss": 0.7984, "nll_loss": 0.7235568165779114, "rewards/accuracies": 1.0, "rewards/chosen": 1.3238575458526611, "rewards/margins": 3.804743766784668, "rewards/rejected": -2.480886220932007, "step": 1090 }, { "epoch": 0.18183333333333335, "grad_norm": 21.847137451171875, "learning_rate": 1.8815074333923518e-07, "logits/chosen": 3.0900869369506836, "logits/rejected": 3.2629377841949463, "logps/chosen": -69.45606994628906, "logps/rejected": -234.3754425048828, "loss": 0.7213, "nll_loss": 0.708735466003418, "rewards/accuracies": 1.0, "rewards/chosen": 2.4425370693206787, "rewards/margins": 9.467514038085938, "rewards/rejected": -7.024977207183838, "step": 1091 }, { "epoch": 0.182, "grad_norm": 57.76626968383789, "learning_rate": 1.881252431052599e-07, "logits/chosen": 2.485621929168701, "logits/rejected": 2.6095595359802246, "logps/chosen": -38.81774139404297, "logps/rejected": -88.83041381835938, "loss": 1.533, "nll_loss": 1.4929898977279663, "rewards/accuracies": 1.0, "rewards/chosen": 1.4167530536651611, "rewards/margins": 5.545302391052246, "rewards/rejected": -4.128549098968506, "step": 1092 }, { "epoch": 0.18216666666666667, "grad_norm": 165.55494689941406, "learning_rate": 1.8809971719370442e-07, "logits/chosen": 2.44240140914917, "logits/rejected": 2.4643611907958984, "logps/chosen": -61.30376052856445, "logps/rejected": -71.51508331298828, "loss": 1.8727, "nll_loss": 1.226075291633606, "rewards/accuracies": 1.0, "rewards/chosen": -0.8040550351142883, "rewards/margins": 0.4695282578468323, "rewards/rejected": -1.2735832929611206, "step": 1093 }, { "epoch": 0.18233333333333332, "grad_norm": 28.831472396850586, "learning_rate": 1.8807416561200643e-07, "logits/chosen": 1.9875661134719849, "logits/rejected": 2.289980411529541, "logps/chosen": -71.5226058959961, "logps/rejected": -218.67242431640625, "loss": 0.8245, "nll_loss": 0.7946957349777222, "rewards/accuracies": 1.0, "rewards/chosen": 1.5992332696914673, "rewards/margins": 6.781449317932129, "rewards/rejected": -5.182216167449951, "step": 1094 }, { "epoch": 0.1825, "grad_norm": 137.51966857910156, "learning_rate": 1.8804858836761106e-07, "logits/chosen": 2.5928454399108887, "logits/rejected": 2.6219379901885986, "logps/chosen": -43.0706787109375, "logps/rejected": -31.95735740661621, "loss": 2.0117, "nll_loss": 1.1640722751617432, "rewards/accuracies": 1.0, "rewards/chosen": 0.2610969543457031, "rewards/margins": 0.0015790760517120361, "rewards/rejected": 0.2595178782939911, "step": 1095 }, { "epoch": 0.18266666666666667, "grad_norm": 24.999767303466797, "learning_rate": 1.880229854679709e-07, "logits/chosen": 2.489121675491333, "logits/rejected": 2.499250650405884, "logps/chosen": -85.93136596679688, "logps/rejected": -31.541187286376953, "loss": 0.779, "nll_loss": 0.7043554186820984, "rewards/accuracies": 1.0, "rewards/chosen": 2.4361464977264404, "rewards/margins": 4.048501491546631, "rewards/rejected": -1.61235511302948, "step": 1096 }, { "epoch": 0.18283333333333332, "grad_norm": 163.99044799804688, "learning_rate": 1.87997356920546e-07, "logits/chosen": 2.2830159664154053, "logits/rejected": 2.3153483867645264, "logps/chosen": -58.690704345703125, "logps/rejected": -65.19129943847656, "loss": 2.0118, "nll_loss": 1.0119086503982544, "rewards/accuracies": 0.0, "rewards/chosen": 1.5491615533828735, "rewards/margins": -0.02508544921875, "rewards/rejected": 1.5742470026016235, "step": 1097 }, { "epoch": 0.183, "grad_norm": 35.33065414428711, "learning_rate": 1.8797170273280387e-07, "logits/chosen": 2.0712687969207764, "logits/rejected": 2.580073356628418, "logps/chosen": -62.451786041259766, "logps/rejected": -508.07489013671875, "loss": 0.9615, "nll_loss": 0.9050983786582947, "rewards/accuracies": 1.0, "rewards/chosen": 0.8964642286300659, "rewards/margins": 5.805174350738525, "rewards/rejected": -4.90871000289917, "step": 1098 }, { "epoch": 0.18316666666666667, "grad_norm": 82.97079467773438, "learning_rate": 1.879460229122196e-07, "logits/chosen": 2.859807252883911, "logits/rejected": 2.9611902236938477, "logps/chosen": -50.21166229248047, "logps/rejected": -44.13186264038086, "loss": 2.1801, "nll_loss": 2.0084667205810547, "rewards/accuracies": 1.0, "rewards/chosen": 0.099726103246212, "rewards/margins": 2.7130000591278076, "rewards/rejected": -2.613273859024048, "step": 1099 }, { "epoch": 0.18333333333333332, "grad_norm": 41.054603576660156, "learning_rate": 1.879203174662756e-07, "logits/chosen": 0.5668736696243286, "logits/rejected": 2.52490496635437, "logps/chosen": -23.045345306396484, "logps/rejected": -291.93426513671875, "loss": 0.6618, "nll_loss": 0.5909063220024109, "rewards/accuracies": 1.0, "rewards/chosen": 0.6236820220947266, "rewards/margins": 5.51310920715332, "rewards/rejected": -4.889427185058594, "step": 1100 }, { "epoch": 0.1835, "grad_norm": 66.23294830322266, "learning_rate": 1.878945864024619e-07, "logits/chosen": 1.8133116960525513, "logits/rejected": 1.4689500331878662, "logps/chosen": -41.405120849609375, "logps/rejected": -21.494882583618164, "loss": 0.9774, "nll_loss": 0.6572240591049194, "rewards/accuracies": 1.0, "rewards/chosen": 0.7166435718536377, "rewards/margins": 1.4971389770507812, "rewards/rejected": -0.7804954648017883, "step": 1101 }, { "epoch": 0.18366666666666667, "grad_norm": 55.93202209472656, "learning_rate": 1.8786882972827585e-07, "logits/chosen": 2.7542331218719482, "logits/rejected": 2.846161365509033, "logps/chosen": -32.07527160644531, "logps/rejected": -72.86295318603516, "loss": 0.8875, "nll_loss": 0.6682348251342773, "rewards/accuracies": 1.0, "rewards/chosen": 0.853090763092041, "rewards/margins": 2.057372570037842, "rewards/rejected": -1.2042816877365112, "step": 1102 }, { "epoch": 0.18383333333333332, "grad_norm": 32.306861877441406, "learning_rate": 1.8784304745122235e-07, "logits/chosen": 3.447744846343994, "logits/rejected": 3.656477928161621, "logps/chosen": -31.20652198791504, "logps/rejected": -395.5555725097656, "loss": 0.5842, "nll_loss": 0.5474829077720642, "rewards/accuracies": 1.0, "rewards/chosen": 1.2892191410064697, "rewards/margins": 8.438728332519531, "rewards/rejected": -7.149509429931641, "step": 1103 }, { "epoch": 0.184, "grad_norm": 167.46832275390625, "learning_rate": 1.878172395788137e-07, "logits/chosen": 4.296903133392334, "logits/rejected": 4.284821510314941, "logps/chosen": -85.40229797363281, "logps/rejected": -59.25031280517578, "loss": 3.0947, "nll_loss": 2.5879483222961426, "rewards/accuracies": 1.0, "rewards/chosen": -0.5727165341377258, "rewards/margins": 0.8515053391456604, "rewards/rejected": -1.4242218732833862, "step": 1104 }, { "epoch": 0.18416666666666667, "grad_norm": 85.4263916015625, "learning_rate": 1.8779140611856977e-07, "logits/chosen": 2.3469157218933105, "logits/rejected": 2.36175274848938, "logps/chosen": -94.58213806152344, "logps/rejected": -32.699676513671875, "loss": 1.5504, "nll_loss": 0.9956014752388, "rewards/accuracies": 1.0, "rewards/chosen": 2.604884624481201, "rewards/margins": 1.3291198015213013, "rewards/rejected": 1.2757648229599, "step": 1105 }, { "epoch": 0.18433333333333332, "grad_norm": 181.9091796875, "learning_rate": 1.8776554707801776e-07, "logits/chosen": 2.4908721446990967, "logits/rejected": 2.2135493755340576, "logps/chosen": -76.48265075683594, "logps/rejected": -25.79474639892578, "loss": 1.7326, "nll_loss": 1.195041298866272, "rewards/accuracies": 1.0, "rewards/chosen": -0.35678330063819885, "rewards/margins": 0.7165281772613525, "rewards/rejected": -1.073311448097229, "step": 1106 }, { "epoch": 0.1845, "grad_norm": 18.76651954650879, "learning_rate": 1.8773966246469234e-07, "logits/chosen": 2.411151885986328, "logits/rejected": 2.322998046875, "logps/chosen": -173.74249267578125, "logps/rejected": -214.20675659179688, "loss": 0.7028, "nll_loss": 0.6734204888343811, "rewards/accuracies": 1.0, "rewards/chosen": 1.9684480428695679, "rewards/margins": 5.6038970947265625, "rewards/rejected": -3.635449171066284, "step": 1107 }, { "epoch": 0.18466666666666667, "grad_norm": 36.791019439697266, "learning_rate": 1.8771375228613576e-07, "logits/chosen": 2.5601398944854736, "logits/rejected": 2.7300002574920654, "logps/chosen": -77.57083129882812, "logps/rejected": -322.9771423339844, "loss": 1.2474, "nll_loss": 1.212044358253479, "rewards/accuracies": 1.0, "rewards/chosen": 1.6828416585922241, "rewards/margins": 5.405125617980957, "rewards/rejected": -3.7222840785980225, "step": 1108 }, { "epoch": 0.18483333333333332, "grad_norm": 445.113037109375, "learning_rate": 1.8768781654989755e-07, "logits/chosen": 2.123173475265503, "logits/rejected": 2.3354098796844482, "logps/chosen": -161.04901123046875, "logps/rejected": -212.97454833984375, "loss": 3.9536, "nll_loss": 2.3340439796447754, "rewards/accuracies": 0.0, "rewards/chosen": -3.05035400390625, "rewards/margins": -0.9649627208709717, "rewards/rejected": -2.0853912830352783, "step": 1109 }, { "epoch": 0.185, "grad_norm": 53.87984085083008, "learning_rate": 1.8766185526353477e-07, "logits/chosen": 3.119615077972412, "logits/rejected": 3.16460919380188, "logps/chosen": -31.430248260498047, "logps/rejected": -56.26207733154297, "loss": 0.9043, "nll_loss": 0.7309360504150391, "rewards/accuracies": 1.0, "rewards/chosen": 1.1102962493896484, "rewards/margins": 2.428335189819336, "rewards/rejected": -1.3180389404296875, "step": 1110 }, { "epoch": 0.18516666666666667, "grad_norm": 46.34815979003906, "learning_rate": 1.8763586843461194e-07, "logits/chosen": 2.7329864501953125, "logits/rejected": 2.7197000980377197, "logps/chosen": -89.67647552490234, "logps/rejected": -86.3139877319336, "loss": 1.0467, "nll_loss": 0.9058229327201843, "rewards/accuracies": 1.0, "rewards/chosen": 1.281930685043335, "rewards/margins": 2.75576114654541, "rewards/rejected": -1.4738304615020752, "step": 1111 }, { "epoch": 0.18533333333333332, "grad_norm": 41.32980728149414, "learning_rate": 1.8760985607070097e-07, "logits/chosen": 1.8121660947799683, "logits/rejected": 1.9790613651275635, "logps/chosen": -32.58624267578125, "logps/rejected": -69.43951416015625, "loss": 0.7423, "nll_loss": 0.6788800358772278, "rewards/accuracies": 1.0, "rewards/chosen": 1.0669952630996704, "rewards/margins": 4.418748378753662, "rewards/rejected": -3.351752996444702, "step": 1112 }, { "epoch": 0.1855, "grad_norm": 36.18080139160156, "learning_rate": 1.8758381817938125e-07, "logits/chosen": 2.843653917312622, "logits/rejected": 2.9799907207489014, "logps/chosen": -93.5645980834961, "logps/rejected": -139.45753479003906, "loss": 1.0375, "nll_loss": 0.9746312499046326, "rewards/accuracies": 1.0, "rewards/chosen": 1.581613302230835, "rewards/margins": 4.074079990386963, "rewards/rejected": -2.492466688156128, "step": 1113 }, { "epoch": 0.18566666666666667, "grad_norm": 156.8282470703125, "learning_rate": 1.875577547682396e-07, "logits/chosen": 2.996530055999756, "logits/rejected": 2.9918859004974365, "logps/chosen": -129.28030395507812, "logps/rejected": -104.5056381225586, "loss": 1.8546, "nll_loss": 1.2800029516220093, "rewards/accuracies": 1.0, "rewards/chosen": -1.5454620122909546, "rewards/margins": 0.9666301012039185, "rewards/rejected": -2.512092113494873, "step": 1114 }, { "epoch": 0.18583333333333332, "grad_norm": 195.83670043945312, "learning_rate": 1.8753166584487028e-07, "logits/chosen": 2.3186380863189697, "logits/rejected": 2.4114105701446533, "logps/chosen": -40.67744827270508, "logps/rejected": -35.593231201171875, "loss": 3.8599, "nll_loss": 2.5423405170440674, "rewards/accuracies": 0.0, "rewards/chosen": 0.28014373779296875, "rewards/margins": -0.7553818225860596, "rewards/rejected": 1.0355255603790283, "step": 1115 }, { "epoch": 0.186, "grad_norm": 43.99269104003906, "learning_rate": 1.87505551416875e-07, "logits/chosen": 2.7941527366638184, "logits/rejected": 3.0793426036834717, "logps/chosen": -48.712581634521484, "logps/rejected": -195.1910858154297, "loss": 1.0777, "nll_loss": 1.014845371246338, "rewards/accuracies": 1.0, "rewards/chosen": 0.9823722839355469, "rewards/margins": 4.614902496337891, "rewards/rejected": -3.6325302124023438, "step": 1116 }, { "epoch": 0.18616666666666667, "grad_norm": 41.07536697387695, "learning_rate": 1.8747941149186278e-07, "logits/chosen": 2.222187042236328, "logits/rejected": 2.8681490421295166, "logps/chosen": -87.87196350097656, "logps/rejected": -254.7787322998047, "loss": 1.1947, "nll_loss": 1.1265636682510376, "rewards/accuracies": 1.0, "rewards/chosen": 1.1588172912597656, "rewards/margins": 4.096207618713379, "rewards/rejected": -2.9373903274536133, "step": 1117 }, { "epoch": 0.18633333333333332, "grad_norm": 46.34823989868164, "learning_rate": 1.8745324607745028e-07, "logits/chosen": 2.685610771179199, "logits/rejected": 3.3717262744903564, "logps/chosen": -103.92939758300781, "logps/rejected": -208.64202880859375, "loss": 1.1618, "nll_loss": 1.0939937829971313, "rewards/accuracies": 1.0, "rewards/chosen": 1.9081192016601562, "rewards/margins": 3.9689910411834717, "rewards/rejected": -2.0608718395233154, "step": 1118 }, { "epoch": 0.1865, "grad_norm": 44.66832733154297, "learning_rate": 1.874270551812614e-07, "logits/chosen": 2.383763551712036, "logits/rejected": 2.5736427307128906, "logps/chosen": -17.498809814453125, "logps/rejected": -54.59518051147461, "loss": 0.581, "nll_loss": 0.43747028708457947, "rewards/accuracies": 1.0, "rewards/chosen": 0.6243357062339783, "rewards/margins": 2.775888681411743, "rewards/rejected": -2.15155291557312, "step": 1119 }, { "epoch": 0.18666666666666668, "grad_norm": 34.91358184814453, "learning_rate": 1.8740083881092755e-07, "logits/chosen": 2.3345935344696045, "logits/rejected": 2.6829473972320557, "logps/chosen": -59.28105163574219, "logps/rejected": -206.44302368164062, "loss": 0.967, "nll_loss": 0.9120160937309265, "rewards/accuracies": 1.0, "rewards/chosen": 0.9349403381347656, "rewards/margins": 5.661231517791748, "rewards/rejected": -4.726291179656982, "step": 1120 }, { "epoch": 0.18683333333333332, "grad_norm": 84.6291275024414, "learning_rate": 1.8737459697408758e-07, "logits/chosen": 2.8019490242004395, "logits/rejected": 2.7728731632232666, "logps/chosen": -136.44796752929688, "logps/rejected": -90.06995391845703, "loss": 1.3672, "nll_loss": 1.1466217041015625, "rewards/accuracies": 1.0, "rewards/chosen": 0.35170289874076843, "rewards/margins": 2.076763868331909, "rewards/rejected": -1.725061058998108, "step": 1121 }, { "epoch": 0.187, "grad_norm": 36.492279052734375, "learning_rate": 1.8734832967838773e-07, "logits/chosen": 2.5475265979766846, "logits/rejected": 2.7028422355651855, "logps/chosen": -23.463361740112305, "logps/rejected": -45.19118118286133, "loss": 0.6521, "nll_loss": 0.5865840911865234, "rewards/accuracies": 1.0, "rewards/chosen": 1.3572131395339966, "rewards/margins": 4.0573859214782715, "rewards/rejected": -2.7001726627349854, "step": 1122 }, { "epoch": 0.18716666666666668, "grad_norm": 44.888336181640625, "learning_rate": 1.8732203693148164e-07, "logits/chosen": 2.5166139602661133, "logits/rejected": 2.8372724056243896, "logps/chosen": -73.96031188964844, "logps/rejected": -132.87124633789062, "loss": 1.2471, "nll_loss": 1.1556298732757568, "rewards/accuracies": 1.0, "rewards/chosen": 1.439978003501892, "rewards/margins": 3.4348227977752686, "rewards/rejected": -1.9948447942733765, "step": 1123 }, { "epoch": 0.18733333333333332, "grad_norm": 224.05618286132812, "learning_rate": 1.872957187410304e-07, "logits/chosen": 1.2427098751068115, "logits/rejected": 2.3453686237335205, "logps/chosen": -29.99040412902832, "logps/rejected": -132.67343139648438, "loss": 2.3201, "nll_loss": 1.3039305210113525, "rewards/accuracies": 0.0, "rewards/chosen": -0.19077090919017792, "rewards/margins": -0.33439120650291443, "rewards/rejected": 0.1436202973127365, "step": 1124 }, { "epoch": 0.1875, "grad_norm": 42.04939270019531, "learning_rate": 1.8726937511470244e-07, "logits/chosen": 2.800631046295166, "logits/rejected": 2.9687814712524414, "logps/chosen": -45.11580276489258, "logps/rejected": -150.89910888671875, "loss": 0.7763, "nll_loss": 0.6634677648544312, "rewards/accuracies": 1.0, "rewards/chosen": 1.397856593132019, "rewards/margins": 3.104689598083496, "rewards/rejected": -1.7068328857421875, "step": 1125 }, { "epoch": 0.18766666666666668, "grad_norm": 43.275394439697266, "learning_rate": 1.8724300606017375e-07, "logits/chosen": 3.3185057640075684, "logits/rejected": 3.578129291534424, "logps/chosen": -23.73236083984375, "logps/rejected": -90.43626403808594, "loss": 0.7658, "nll_loss": 0.6592321991920471, "rewards/accuracies": 1.0, "rewards/chosen": 1.0618301630020142, "rewards/margins": 3.203120708465576, "rewards/rejected": -2.1412904262542725, "step": 1126 }, { "epoch": 0.18783333333333332, "grad_norm": 43.12870407104492, "learning_rate": 1.872166115851276e-07, "logits/chosen": 2.4639639854431152, "logits/rejected": 2.7376744747161865, "logps/chosen": -114.85894775390625, "logps/rejected": -397.4447937011719, "loss": 1.2097, "nll_loss": 1.172029972076416, "rewards/accuracies": 1.0, "rewards/chosen": 1.6149063110351562, "rewards/margins": 5.290220737457275, "rewards/rejected": -3.675314426422119, "step": 1127 }, { "epoch": 0.188, "grad_norm": 25.870502471923828, "learning_rate": 1.871901916972547e-07, "logits/chosen": 1.2087360620498657, "logits/rejected": 2.1806650161743164, "logps/chosen": -102.68193054199219, "logps/rejected": -278.71734619140625, "loss": 0.8723, "nll_loss": 0.8280799984931946, "rewards/accuracies": 1.0, "rewards/chosen": 1.3841943740844727, "rewards/margins": 5.132827281951904, "rewards/rejected": -3.7486329078674316, "step": 1128 }, { "epoch": 0.18816666666666668, "grad_norm": 35.47182846069336, "learning_rate": 1.8716374640425317e-07, "logits/chosen": 2.6514835357666016, "logits/rejected": 2.8920602798461914, "logps/chosen": -25.094635009765625, "logps/rejected": -232.697998046875, "loss": 0.6424, "nll_loss": 0.5835962295532227, "rewards/accuracies": 1.0, "rewards/chosen": 0.942487359046936, "rewards/margins": 5.061077117919922, "rewards/rejected": -4.118589878082275, "step": 1129 }, { "epoch": 0.18833333333333332, "grad_norm": 26.639240264892578, "learning_rate": 1.8713727571382853e-07, "logits/chosen": 3.1539947986602783, "logits/rejected": 3.5148653984069824, "logps/chosen": -40.83618927001953, "logps/rejected": -229.4964599609375, "loss": 0.5815, "nll_loss": 0.5593997240066528, "rewards/accuracies": 1.0, "rewards/chosen": 2.2974860668182373, "rewards/margins": 6.06310510635376, "rewards/rejected": -3.7656190395355225, "step": 1130 }, { "epoch": 0.1885, "grad_norm": 127.58232116699219, "learning_rate": 1.8711077963369373e-07, "logits/chosen": 2.7564845085144043, "logits/rejected": 2.6543896198272705, "logps/chosen": -32.50121307373047, "logps/rejected": -17.932819366455078, "loss": 2.8307, "nll_loss": 2.1667473316192627, "rewards/accuracies": 1.0, "rewards/chosen": -0.6316936612129211, "rewards/margins": 0.39445632696151733, "rewards/rejected": -1.0261499881744385, "step": 1131 }, { "epoch": 0.18866666666666668, "grad_norm": 23.19657325744629, "learning_rate": 1.8708425817156908e-07, "logits/chosen": 2.3630690574645996, "logits/rejected": 2.7579476833343506, "logps/chosen": -108.45155334472656, "logps/rejected": -303.1287841796875, "loss": 0.8098, "nll_loss": 0.7802270650863647, "rewards/accuracies": 1.0, "rewards/chosen": 1.5336976051330566, "rewards/margins": 7.752216339111328, "rewards/rejected": -6.2185187339782715, "step": 1132 }, { "epoch": 0.18883333333333333, "grad_norm": 138.9523162841797, "learning_rate": 1.8705771133518226e-07, "logits/chosen": 2.3195934295654297, "logits/rejected": 2.2211482524871826, "logps/chosen": -46.21538162231445, "logps/rejected": -36.58396530151367, "loss": 1.5695, "nll_loss": 0.502341091632843, "rewards/accuracies": 0.0, "rewards/chosen": 1.4384281635284424, "rewards/margins": -0.16204047203063965, "rewards/rejected": 1.600468635559082, "step": 1133 }, { "epoch": 0.189, "grad_norm": 176.40289306640625, "learning_rate": 1.8703113913226843e-07, "logits/chosen": 3.2730603218078613, "logits/rejected": 3.0130274295806885, "logps/chosen": -87.0044937133789, "logps/rejected": -33.29766082763672, "loss": 2.5644, "nll_loss": 1.160059928894043, "rewards/accuracies": 0.0, "rewards/chosen": 1.5522384643554688, "rewards/margins": -0.6470253467559814, "rewards/rejected": 2.19926381111145, "step": 1134 }, { "epoch": 0.18916666666666668, "grad_norm": 33.44631576538086, "learning_rate": 1.8700454157057008e-07, "logits/chosen": 2.2348506450653076, "logits/rejected": 2.344529867172241, "logps/chosen": -102.86430358886719, "logps/rejected": -95.85159301757812, "loss": 1.0637, "nll_loss": 0.9986826181411743, "rewards/accuracies": 1.0, "rewards/chosen": 1.347266435623169, "rewards/margins": 4.077823638916016, "rewards/rejected": -2.730557441711426, "step": 1135 }, { "epoch": 0.18933333333333333, "grad_norm": 70.01184844970703, "learning_rate": 1.869779186578371e-07, "logits/chosen": 3.171102285385132, "logits/rejected": 3.1654293537139893, "logps/chosen": -78.04978942871094, "logps/rejected": -51.21139907836914, "loss": 1.459, "nll_loss": 1.182572603225708, "rewards/accuracies": 1.0, "rewards/chosen": 1.613532304763794, "rewards/margins": 1.9243885278701782, "rewards/rejected": -0.31085625290870667, "step": 1136 }, { "epoch": 0.1895, "grad_norm": 27.530418395996094, "learning_rate": 1.8695127040182676e-07, "logits/chosen": 2.5280704498291016, "logits/rejected": 2.766767740249634, "logps/chosen": -30.348251342773438, "logps/rejected": -292.93231201171875, "loss": 0.5836, "nll_loss": 0.5419331192970276, "rewards/accuracies": 1.0, "rewards/chosen": 1.1548103094100952, "rewards/margins": 7.454248905181885, "rewards/rejected": -6.2994384765625, "step": 1137 }, { "epoch": 0.18966666666666668, "grad_norm": 115.22634887695312, "learning_rate": 1.869245968103037e-07, "logits/chosen": 3.1550779342651367, "logits/rejected": 3.140551805496216, "logps/chosen": -78.64397430419922, "logps/rejected": -43.01604461669922, "loss": 1.6529, "nll_loss": 1.3329488039016724, "rewards/accuracies": 1.0, "rewards/chosen": 0.4536628723144531, "rewards/margins": 1.4810622930526733, "rewards/rejected": -1.0273994207382202, "step": 1138 }, { "epoch": 0.18983333333333333, "grad_norm": 51.76725769042969, "learning_rate": 1.8689789789104e-07, "logits/chosen": 2.489798069000244, "logits/rejected": 2.657970905303955, "logps/chosen": -60.09551239013672, "logps/rejected": -103.50643920898438, "loss": 1.302, "nll_loss": 1.1783432960510254, "rewards/accuracies": 1.0, "rewards/chosen": 0.5877693295478821, "rewards/margins": 3.101672410964966, "rewards/rejected": -2.5139031410217285, "step": 1139 }, { "epoch": 0.19, "grad_norm": 79.98934173583984, "learning_rate": 1.8687117365181512e-07, "logits/chosen": 3.578239917755127, "logits/rejected": 3.5005321502685547, "logps/chosen": -25.704730987548828, "logps/rejected": -26.574832916259766, "loss": 1.0452, "nll_loss": 0.5040143132209778, "rewards/accuracies": 1.0, "rewards/chosen": 1.2542115449905396, "rewards/margins": 0.866875171661377, "rewards/rejected": 0.3873363435268402, "step": 1140 }, { "epoch": 0.19016666666666668, "grad_norm": 20.519012451171875, "learning_rate": 1.8684442410041576e-07, "logits/chosen": 2.2092230319976807, "logits/rejected": 1.9865894317626953, "logps/chosen": -135.1603546142578, "logps/rejected": -97.14875030517578, "loss": 0.8092, "nll_loss": 0.7812738418579102, "rewards/accuracies": 1.0, "rewards/chosen": 2.6737842559814453, "rewards/margins": 5.545633316040039, "rewards/rejected": -2.871849298477173, "step": 1141 }, { "epoch": 0.19033333333333333, "grad_norm": 22.92801284790039, "learning_rate": 1.8681764924463616e-07, "logits/chosen": 2.617558717727661, "logits/rejected": 2.7989304065704346, "logps/chosen": -97.2159194946289, "logps/rejected": -250.76771545410156, "loss": 0.7045, "nll_loss": 0.6704546213150024, "rewards/accuracies": 1.0, "rewards/chosen": 1.6085548400878906, "rewards/margins": 5.707931041717529, "rewards/rejected": -4.099376201629639, "step": 1142 }, { "epoch": 0.1905, "grad_norm": 40.33667755126953, "learning_rate": 1.8679084909227786e-07, "logits/chosen": 1.806179165840149, "logits/rejected": 1.603706955909729, "logps/chosen": -68.9869384765625, "logps/rejected": -54.399330139160156, "loss": 0.9577, "nll_loss": 0.8623368144035339, "rewards/accuracies": 1.0, "rewards/chosen": 1.033105492591858, "rewards/margins": 3.419300079345703, "rewards/rejected": -2.3861944675445557, "step": 1143 }, { "epoch": 0.19066666666666668, "grad_norm": 35.579124450683594, "learning_rate": 1.867640236511498e-07, "logits/chosen": 1.186073899269104, "logits/rejected": 2.9958221912384033, "logps/chosen": -35.487648010253906, "logps/rejected": -502.78912353515625, "loss": 0.8137, "nll_loss": 0.7886142134666443, "rewards/accuracies": 1.0, "rewards/chosen": 1.704203486442566, "rewards/margins": 8.228739738464355, "rewards/rejected": -6.5245361328125, "step": 1144 }, { "epoch": 0.19083333333333333, "grad_norm": 21.440418243408203, "learning_rate": 1.8673717292906828e-07, "logits/chosen": 2.1558728218078613, "logits/rejected": 2.1553666591644287, "logps/chosen": -97.56222534179688, "logps/rejected": -146.41921997070312, "loss": 0.8386, "nll_loss": 0.8267984986305237, "rewards/accuracies": 1.0, "rewards/chosen": 2.682485342025757, "rewards/margins": 7.66831111907959, "rewards/rejected": -4.985826015472412, "step": 1145 }, { "epoch": 0.191, "grad_norm": 68.36588287353516, "learning_rate": 1.867102969338569e-07, "logits/chosen": 2.308746337890625, "logits/rejected": 1.600696086883545, "logps/chosen": -211.22161865234375, "logps/rejected": -139.02847290039062, "loss": 1.17, "nll_loss": 0.9689066410064697, "rewards/accuracies": 1.0, "rewards/chosen": 1.103704810142517, "rewards/margins": 2.2156524658203125, "rewards/rejected": -1.1119476556777954, "step": 1146 }, { "epoch": 0.19116666666666668, "grad_norm": 67.9195556640625, "learning_rate": 1.866833956733467e-07, "logits/chosen": 2.200082778930664, "logits/rejected": 2.4827828407287598, "logps/chosen": -26.10957908630371, "logps/rejected": -265.78643798828125, "loss": 1.171, "nll_loss": 1.0878992080688477, "rewards/accuracies": 1.0, "rewards/chosen": 0.954793393611908, "rewards/margins": 3.7659873962402344, "rewards/rejected": -2.8111939430236816, "step": 1147 }, { "epoch": 0.19133333333333333, "grad_norm": 39.845218658447266, "learning_rate": 1.8665646915537608e-07, "logits/chosen": 1.96956467628479, "logits/rejected": 2.5561683177948, "logps/chosen": -87.71709442138672, "logps/rejected": -159.636474609375, "loss": 1.0582, "nll_loss": 0.9746342301368713, "rewards/accuracies": 1.0, "rewards/chosen": 1.0947922468185425, "rewards/margins": 3.661041259765625, "rewards/rejected": -2.566249132156372, "step": 1148 }, { "epoch": 0.1915, "grad_norm": 32.545936584472656, "learning_rate": 1.8662951738779076e-07, "logits/chosen": 1.2609219551086426, "logits/rejected": 2.198174476623535, "logps/chosen": -69.93619537353516, "logps/rejected": -318.97369384765625, "loss": 0.8361, "nll_loss": 0.7770688533782959, "rewards/accuracies": 1.0, "rewards/chosen": 0.9934120774269104, "rewards/margins": 4.841410160064697, "rewards/rejected": -3.8479981422424316, "step": 1149 }, { "epoch": 0.19166666666666668, "grad_norm": 28.80196762084961, "learning_rate": 1.8660254037844388e-07, "logits/chosen": 1.9318867921829224, "logits/rejected": 2.381225109100342, "logps/chosen": -95.82599639892578, "logps/rejected": -246.7410888671875, "loss": 1.0679, "nll_loss": 1.0303869247436523, "rewards/accuracies": 1.0, "rewards/chosen": 1.7990188598632812, "rewards/margins": 5.084102153778076, "rewards/rejected": -3.285083293914795, "step": 1150 }, { "epoch": 0.19183333333333333, "grad_norm": 139.8633575439453, "learning_rate": 1.865755381351958e-07, "logits/chosen": 2.7831027507781982, "logits/rejected": 2.916414260864258, "logps/chosen": -74.07648468017578, "logps/rejected": -124.18424987792969, "loss": 2.1134, "nll_loss": 1.7637256383895874, "rewards/accuracies": 1.0, "rewards/chosen": -0.1602630615234375, "rewards/margins": 1.399427056312561, "rewards/rejected": -1.5596901178359985, "step": 1151 }, { "epoch": 0.192, "grad_norm": 26.4075870513916, "learning_rate": 1.8654851066591447e-07, "logits/chosen": 2.1382317543029785, "logits/rejected": 2.9897818565368652, "logps/chosen": -54.88469696044922, "logps/rejected": -350.587158203125, "loss": 0.7823, "nll_loss": 0.7622874975204468, "rewards/accuracies": 1.0, "rewards/chosen": 2.031660556793213, "rewards/margins": 7.211705207824707, "rewards/rejected": -5.180044651031494, "step": 1152 }, { "epoch": 0.19216666666666668, "grad_norm": 40.99347686767578, "learning_rate": 1.8652145797847488e-07, "logits/chosen": 2.247826099395752, "logits/rejected": 2.5117685794830322, "logps/chosen": -41.27236557006836, "logps/rejected": -36.35860061645508, "loss": 0.9324, "nll_loss": 0.8598408699035645, "rewards/accuracies": 1.0, "rewards/chosen": 1.4545776844024658, "rewards/margins": 3.829371452331543, "rewards/rejected": -2.374793767929077, "step": 1153 }, { "epoch": 0.19233333333333333, "grad_norm": 38.7778434753418, "learning_rate": 1.8649438008075967e-07, "logits/chosen": 2.4262073040008545, "logits/rejected": 2.8011462688446045, "logps/chosen": -69.72364807128906, "logps/rejected": -123.98111724853516, "loss": 0.8083, "nll_loss": 0.6835651993751526, "rewards/accuracies": 1.0, "rewards/chosen": 1.414941430091858, "rewards/margins": 2.9597954750061035, "rewards/rejected": -1.5448540449142456, "step": 1154 }, { "epoch": 0.1925, "grad_norm": 35.53172302246094, "learning_rate": 1.8646727698065863e-07, "logits/chosen": 1.4717302322387695, "logits/rejected": 1.7191555500030518, "logps/chosen": -68.69619750976562, "logps/rejected": -79.36080932617188, "loss": 0.8029, "nll_loss": 0.6801602244377136, "rewards/accuracies": 1.0, "rewards/chosen": 1.5170501470565796, "rewards/margins": 3.0055441856384277, "rewards/rejected": -1.4884941577911377, "step": 1155 }, { "epoch": 0.19266666666666668, "grad_norm": 111.62886810302734, "learning_rate": 1.8644014868606895e-07, "logits/chosen": 2.8693768978118896, "logits/rejected": 2.969001531600952, "logps/chosen": -39.226097106933594, "logps/rejected": -63.517704010009766, "loss": 1.2008, "nll_loss": 0.5684940218925476, "rewards/accuracies": 1.0, "rewards/chosen": 1.404870629310608, "rewards/margins": 0.679508626461029, "rewards/rejected": 0.7253620028495789, "step": 1156 }, { "epoch": 0.19283333333333333, "grad_norm": 33.0346794128418, "learning_rate": 1.8641299520489518e-07, "logits/chosen": 2.5809738636016846, "logits/rejected": 2.906243085861206, "logps/chosen": -86.37466430664062, "logps/rejected": -491.1263427734375, "loss": 1.1097, "nll_loss": 1.0533496141433716, "rewards/accuracies": 1.0, "rewards/chosen": 1.166236162185669, "rewards/margins": 4.635425567626953, "rewards/rejected": -3.469189405441284, "step": 1157 }, { "epoch": 0.193, "grad_norm": 34.625179290771484, "learning_rate": 1.8638581654504916e-07, "logits/chosen": 2.219891309738159, "logits/rejected": 2.775686502456665, "logps/chosen": -60.16353225708008, "logps/rejected": -125.02510070800781, "loss": 0.7602, "nll_loss": 0.6836764812469482, "rewards/accuracies": 1.0, "rewards/chosen": 1.1163936853408813, "rewards/margins": 3.840691089630127, "rewards/rejected": -2.724297285079956, "step": 1158 }, { "epoch": 0.19316666666666665, "grad_norm": 202.80935668945312, "learning_rate": 1.8635861271445014e-07, "logits/chosen": 3.3489205837249756, "logits/rejected": 3.3917930126190186, "logps/chosen": -32.61958312988281, "logps/rejected": -66.66290283203125, "loss": 1.9101, "nll_loss": 0.61546391248703, "rewards/accuracies": 0.0, "rewards/chosen": 0.8969661593437195, "rewards/margins": -0.6246114373207092, "rewards/rejected": 1.5215775966644287, "step": 1159 }, { "epoch": 0.19333333333333333, "grad_norm": 43.186885833740234, "learning_rate": 1.8633138372102466e-07, "logits/chosen": 2.8961658477783203, "logits/rejected": 3.0099377632141113, "logps/chosen": -14.257071495056152, "logps/rejected": -160.5648193359375, "loss": 0.5703, "nll_loss": 0.528039813041687, "rewards/accuracies": 1.0, "rewards/chosen": 1.7227444648742676, "rewards/margins": 4.846227645874023, "rewards/rejected": -3.123483419418335, "step": 1160 }, { "epoch": 0.1935, "grad_norm": 44.684654235839844, "learning_rate": 1.8630412957270657e-07, "logits/chosen": 2.929509162902832, "logits/rejected": 2.9940106868743896, "logps/chosen": -52.92201232910156, "logps/rejected": -143.82664489746094, "loss": 1.1464, "nll_loss": 1.0584402084350586, "rewards/accuracies": 1.0, "rewards/chosen": 0.49742740392684937, "rewards/margins": 4.3615617752075195, "rewards/rejected": -3.8641343116760254, "step": 1161 }, { "epoch": 0.19366666666666665, "grad_norm": 36.603755950927734, "learning_rate": 1.862768502774371e-07, "logits/chosen": 3.332035541534424, "logits/rejected": 3.1288435459136963, "logps/chosen": -69.71638488769531, "logps/rejected": -48.509220123291016, "loss": 0.9221, "nll_loss": 0.850199818611145, "rewards/accuracies": 1.0, "rewards/chosen": 1.1347633600234985, "rewards/margins": 3.9747304916381836, "rewards/rejected": -2.8399672508239746, "step": 1162 }, { "epoch": 0.19383333333333333, "grad_norm": 99.15402221679688, "learning_rate": 1.8624954584316474e-07, "logits/chosen": 3.676069974899292, "logits/rejected": 3.681849956512451, "logps/chosen": -35.0175666809082, "logps/rejected": -49.67227554321289, "loss": 1.9767, "nll_loss": 1.6675031185150146, "rewards/accuracies": 1.0, "rewards/chosen": 0.3536495268344879, "rewards/margins": 1.5333912372589111, "rewards/rejected": -1.1797417402267456, "step": 1163 }, { "epoch": 0.194, "grad_norm": 44.045658111572266, "learning_rate": 1.862222162778454e-07, "logits/chosen": 3.435364246368408, "logits/rejected": 3.4820311069488525, "logps/chosen": -85.04228210449219, "logps/rejected": -188.55409240722656, "loss": 1.1402, "nll_loss": 1.0764844417572021, "rewards/accuracies": 1.0, "rewards/chosen": 0.8192794919013977, "rewards/margins": 5.0478291511535645, "rewards/rejected": -4.228549480438232, "step": 1164 }, { "epoch": 0.19416666666666665, "grad_norm": 40.91101837158203, "learning_rate": 1.861948615894422e-07, "logits/chosen": 2.522033929824829, "logits/rejected": 2.357710361480713, "logps/chosen": -40.13514709472656, "logps/rejected": -87.29106140136719, "loss": 0.6984, "nll_loss": 0.5816687345504761, "rewards/accuracies": 1.0, "rewards/chosen": 0.9661049246788025, "rewards/margins": 3.0568597316741943, "rewards/rejected": -2.090754747390747, "step": 1165 }, { "epoch": 0.19433333333333333, "grad_norm": 68.72229766845703, "learning_rate": 1.8616748178592571e-07, "logits/chosen": 2.5841147899627686, "logits/rejected": 2.7287817001342773, "logps/chosen": -46.029930114746094, "logps/rejected": -170.89544677734375, "loss": 1.4851, "nll_loss": 1.3948463201522827, "rewards/accuracies": 1.0, "rewards/chosen": 1.0059818029403687, "rewards/margins": 3.5401463508605957, "rewards/rejected": -2.5341644287109375, "step": 1166 }, { "epoch": 0.1945, "grad_norm": 44.2335205078125, "learning_rate": 1.8614007687527372e-07, "logits/chosen": 1.8700186014175415, "logits/rejected": 2.820981025695801, "logps/chosen": -43.38838577270508, "logps/rejected": -185.3074188232422, "loss": 0.9508, "nll_loss": 0.8854772448539734, "rewards/accuracies": 1.0, "rewards/chosen": 1.4665422439575195, "rewards/margins": 4.023088455200195, "rewards/rejected": -2.5565459728240967, "step": 1167 }, { "epoch": 0.19466666666666665, "grad_norm": 35.28886413574219, "learning_rate": 1.8611264686547133e-07, "logits/chosen": 0.8927727937698364, "logits/rejected": 2.17726469039917, "logps/chosen": -44.070823669433594, "logps/rejected": -171.40032958984375, "loss": 0.7078, "nll_loss": 0.6295831799507141, "rewards/accuracies": 1.0, "rewards/chosen": 0.7643830180168152, "rewards/margins": 4.183004856109619, "rewards/rejected": -3.418621778488159, "step": 1168 }, { "epoch": 0.19483333333333333, "grad_norm": 30.600337982177734, "learning_rate": 1.86085191764511e-07, "logits/chosen": 2.435983657836914, "logits/rejected": 2.4205515384674072, "logps/chosen": -67.12997436523438, "logps/rejected": -84.4012222290039, "loss": 0.9242, "nll_loss": 0.871817946434021, "rewards/accuracies": 1.0, "rewards/chosen": 1.7549728155136108, "rewards/margins": 4.3859543800354, "rewards/rejected": -2.6309814453125, "step": 1169 }, { "epoch": 0.195, "grad_norm": 31.080881118774414, "learning_rate": 1.8605771158039253e-07, "logits/chosen": 1.539946436882019, "logits/rejected": 2.244515895843506, "logps/chosen": -57.38043975830078, "logps/rejected": -235.02362060546875, "loss": 0.7004, "nll_loss": 0.6447240710258484, "rewards/accuracies": 1.0, "rewards/chosen": 1.0500274896621704, "rewards/margins": 4.945045471191406, "rewards/rejected": -3.8950181007385254, "step": 1170 }, { "epoch": 0.19516666666666665, "grad_norm": 27.829133987426758, "learning_rate": 1.860302063211229e-07, "logits/chosen": 2.7272541522979736, "logits/rejected": 2.732032060623169, "logps/chosen": -97.92543029785156, "logps/rejected": -154.4124755859375, "loss": 0.8781, "nll_loss": 0.8369695544242859, "rewards/accuracies": 1.0, "rewards/chosen": 1.8448243141174316, "rewards/margins": 4.842108249664307, "rewards/rejected": -2.997283935546875, "step": 1171 }, { "epoch": 0.19533333333333333, "grad_norm": 31.78695297241211, "learning_rate": 1.860026759947166e-07, "logits/chosen": 2.145359754562378, "logits/rejected": 2.202291250228882, "logps/chosen": -81.01042175292969, "logps/rejected": -178.3079376220703, "loss": 0.9211, "nll_loss": 0.8902245163917542, "rewards/accuracies": 1.0, "rewards/chosen": 1.5707184076309204, "rewards/margins": 6.458320617675781, "rewards/rejected": -4.88760232925415, "step": 1172 }, { "epoch": 0.1955, "grad_norm": 21.862409591674805, "learning_rate": 1.859751206091952e-07, "logits/chosen": 1.2266566753387451, "logits/rejected": 2.399538040161133, "logps/chosen": -124.02960205078125, "logps/rejected": -416.24981689453125, "loss": 0.9668, "nll_loss": 0.954073965549469, "rewards/accuracies": 1.0, "rewards/chosen": 2.3837342262268066, "rewards/margins": 10.461688995361328, "rewards/rejected": -8.077954292297363, "step": 1173 }, { "epoch": 0.19566666666666666, "grad_norm": 30.582542419433594, "learning_rate": 1.8594754017258774e-07, "logits/chosen": 0.17923197150230408, "logits/rejected": 3.4730424880981445, "logps/chosen": -117.91815948486328, "logps/rejected": -83.24705505371094, "loss": 1.1065, "nll_loss": 1.0719832181930542, "rewards/accuracies": 1.0, "rewards/chosen": 1.5822153091430664, "rewards/margins": 5.67888879776001, "rewards/rejected": -4.096673488616943, "step": 1174 }, { "epoch": 0.19583333333333333, "grad_norm": 28.263179779052734, "learning_rate": 1.8591993469293046e-07, "logits/chosen": 2.74362850189209, "logits/rejected": 2.8774712085723877, "logps/chosen": -33.72999954223633, "logps/rejected": -215.18673706054688, "loss": 0.5923, "nll_loss": 0.5529508590698242, "rewards/accuracies": 1.0, "rewards/chosen": 1.3502674102783203, "rewards/margins": 5.798590660095215, "rewards/rejected": -4.4483232498168945, "step": 1175 }, { "epoch": 0.196, "grad_norm": 33.56565856933594, "learning_rate": 1.8589230417826695e-07, "logits/chosen": 1.7793940305709839, "logits/rejected": 2.63667893409729, "logps/chosen": -124.68480682373047, "logps/rejected": -357.410400390625, "loss": 1.0271, "nll_loss": 0.9974784255027771, "rewards/accuracies": 1.0, "rewards/chosen": 1.5722817182540894, "rewards/margins": 6.8300700187683105, "rewards/rejected": -5.257788181304932, "step": 1176 }, { "epoch": 0.19616666666666666, "grad_norm": 45.381927490234375, "learning_rate": 1.858646486366481e-07, "logits/chosen": 2.315305709838867, "logits/rejected": 1.8238580226898193, "logps/chosen": -75.60842895507812, "logps/rejected": -73.70162963867188, "loss": 1.1353, "nll_loss": 1.0357320308685303, "rewards/accuracies": 1.0, "rewards/chosen": 0.8255791068077087, "rewards/margins": 3.4236416816711426, "rewards/rejected": -2.598062515258789, "step": 1177 }, { "epoch": 0.19633333333333333, "grad_norm": 120.40264129638672, "learning_rate": 1.8583696807613205e-07, "logits/chosen": 2.8617146015167236, "logits/rejected": 2.81929612159729, "logps/chosen": -18.625089645385742, "logps/rejected": -38.988285064697266, "loss": 1.8412, "nll_loss": 1.4326990842819214, "rewards/accuracies": 1.0, "rewards/chosen": -0.08707218617200851, "rewards/margins": 1.12850821018219, "rewards/rejected": -1.2155803442001343, "step": 1178 }, { "epoch": 0.1965, "grad_norm": 25.86678695678711, "learning_rate": 1.8580926250478425e-07, "logits/chosen": 1.6937354803085327, "logits/rejected": 2.421729326248169, "logps/chosen": -39.54097366333008, "logps/rejected": -407.0440673828125, "loss": 0.6026, "nll_loss": 0.5730576515197754, "rewards/accuracies": 1.0, "rewards/chosen": 1.7593400478363037, "rewards/margins": 5.894233703613281, "rewards/rejected": -4.134893894195557, "step": 1179 }, { "epoch": 0.19666666666666666, "grad_norm": 21.539140701293945, "learning_rate": 1.8578153193067743e-07, "logits/chosen": 1.1209162473678589, "logits/rejected": 2.88464617729187, "logps/chosen": -7.262392520904541, "logps/rejected": -444.5472412109375, "loss": 0.256, "nll_loss": 0.19628088176250458, "rewards/accuracies": 1.0, "rewards/chosen": 0.7571747303009033, "rewards/margins": 6.2445831298828125, "rewards/rejected": -5.487408638000488, "step": 1180 }, { "epoch": 0.19683333333333333, "grad_norm": 38.06412887573242, "learning_rate": 1.8575377636189162e-07, "logits/chosen": 1.638434648513794, "logits/rejected": 1.597527027130127, "logps/chosen": -93.67786407470703, "logps/rejected": -111.41805267333984, "loss": 1.0814, "nll_loss": 0.9965728521347046, "rewards/accuracies": 1.0, "rewards/chosen": 1.2201988697052002, "rewards/margins": 3.581601142883301, "rewards/rejected": -2.3614022731781006, "step": 1181 }, { "epoch": 0.197, "grad_norm": 72.47123718261719, "learning_rate": 1.8572599580651413e-07, "logits/chosen": 2.568525791168213, "logits/rejected": 2.5175626277923584, "logps/chosen": -36.52397537231445, "logps/rejected": -28.622968673706055, "loss": 1.2806, "nll_loss": 1.0742346048355103, "rewards/accuracies": 1.0, "rewards/chosen": 0.5276805758476257, "rewards/margins": 2.1552789211273193, "rewards/rejected": -1.6275982856750488, "step": 1182 }, { "epoch": 0.19716666666666666, "grad_norm": 48.051509857177734, "learning_rate": 1.8569819027263955e-07, "logits/chosen": 2.931309700012207, "logits/rejected": 3.0484507083892822, "logps/chosen": -99.68531799316406, "logps/rejected": -120.81407165527344, "loss": 1.2726, "nll_loss": 1.1727685928344727, "rewards/accuracies": 1.0, "rewards/chosen": 0.5378555655479431, "rewards/margins": 3.7023911476135254, "rewards/rejected": -3.1645355224609375, "step": 1183 }, { "epoch": 0.19733333333333333, "grad_norm": 35.09181213378906, "learning_rate": 1.8567035976836974e-07, "logits/chosen": 1.252990961074829, "logits/rejected": 2.3718013763427734, "logps/chosen": -31.604022979736328, "logps/rejected": -191.52059936523438, "loss": 0.6935, "nll_loss": 0.6449800133705139, "rewards/accuracies": 1.0, "rewards/chosen": 0.9387760758399963, "rewards/margins": 9.436052322387695, "rewards/rejected": -8.497276306152344, "step": 1184 }, { "epoch": 0.1975, "grad_norm": 28.37973403930664, "learning_rate": 1.8564250430181384e-07, "logits/chosen": 2.245990037918091, "logits/rejected": 2.4106085300445557, "logps/chosen": -68.57709503173828, "logps/rejected": -148.57798767089844, "loss": 0.8173, "nll_loss": 0.7882424592971802, "rewards/accuracies": 1.0, "rewards/chosen": 1.5936295986175537, "rewards/margins": 6.78721809387207, "rewards/rejected": -5.1935882568359375, "step": 1185 }, { "epoch": 0.19766666666666666, "grad_norm": 40.297550201416016, "learning_rate": 1.8561462388108827e-07, "logits/chosen": 0.9811115264892578, "logits/rejected": 2.718416452407837, "logps/chosen": -51.49604034423828, "logps/rejected": -309.04248046875, "loss": 1.0129, "nll_loss": 0.9536304473876953, "rewards/accuracies": 1.0, "rewards/chosen": 0.7387199401855469, "rewards/margins": 6.700924396514893, "rewards/rejected": -5.962204456329346, "step": 1186 }, { "epoch": 0.19783333333333333, "grad_norm": 97.32675170898438, "learning_rate": 1.855867185143167e-07, "logits/chosen": 2.358275890350342, "logits/rejected": 2.5363047122955322, "logps/chosen": -74.9918212890625, "logps/rejected": -154.1259765625, "loss": 2.8276, "nll_loss": 2.678279161453247, "rewards/accuracies": 1.0, "rewards/chosen": -0.4348312318325043, "rewards/margins": 5.697187900543213, "rewards/rejected": -6.13201904296875, "step": 1187 }, { "epoch": 0.198, "grad_norm": 26.137500762939453, "learning_rate": 1.8555878820963012e-07, "logits/chosen": 2.328575372695923, "logits/rejected": 2.4061036109924316, "logps/chosen": -77.2228012084961, "logps/rejected": -115.61127471923828, "loss": 0.8619, "nll_loss": 0.8303527235984802, "rewards/accuracies": 1.0, "rewards/chosen": 1.676326870918274, "rewards/margins": 5.784928798675537, "rewards/rejected": -4.108602046966553, "step": 1188 }, { "epoch": 0.19816666666666666, "grad_norm": 170.7095184326172, "learning_rate": 1.855308329751667e-07, "logits/chosen": 3.1525940895080566, "logits/rejected": 3.0734376907348633, "logps/chosen": -111.1483383178711, "logps/rejected": -47.394508361816406, "loss": 2.8898, "nll_loss": 2.2229666709899902, "rewards/accuracies": 1.0, "rewards/chosen": -1.253048062324524, "rewards/margins": 0.5106003284454346, "rewards/rejected": -1.7636483907699585, "step": 1189 }, { "epoch": 0.19833333333333333, "grad_norm": 150.08999633789062, "learning_rate": 1.8550285281907198e-07, "logits/chosen": 3.4301257133483887, "logits/rejected": 3.392421245574951, "logps/chosen": -97.4663314819336, "logps/rejected": -39.41728210449219, "loss": 2.0965, "nll_loss": 1.048025131225586, "rewards/accuracies": 0.0, "rewards/chosen": 1.7177391052246094, "rewards/margins": -0.052765727043151855, "rewards/rejected": 1.7705048322677612, "step": 1190 }, { "epoch": 0.1985, "grad_norm": 66.9994888305664, "learning_rate": 1.8547484774949865e-07, "logits/chosen": 2.1783995628356934, "logits/rejected": 2.1519665718078613, "logps/chosen": -271.605224609375, "logps/rejected": -242.19288635253906, "loss": 1.364, "nll_loss": 1.1757802963256836, "rewards/accuracies": 1.0, "rewards/chosen": -0.7957153916358948, "rewards/margins": 6.159947395324707, "rewards/rejected": -6.955662727355957, "step": 1191 }, { "epoch": 0.19866666666666666, "grad_norm": 193.53921508789062, "learning_rate": 1.8544681777460674e-07, "logits/chosen": 3.0466175079345703, "logits/rejected": 2.992840051651001, "logps/chosen": -27.632259368896484, "logps/rejected": -24.1024112701416, "loss": 1.5178, "nll_loss": 0.6007013916969299, "rewards/accuracies": 1.0, "rewards/chosen": 1.5498764514923096, "rewards/margins": 0.13456523418426514, "rewards/rejected": 1.4153112173080444, "step": 1192 }, { "epoch": 0.19883333333333333, "grad_norm": 51.3208122253418, "learning_rate": 1.8541876290256354e-07, "logits/chosen": 2.6775026321411133, "logits/rejected": 2.9273648262023926, "logps/chosen": -69.66710662841797, "logps/rejected": -143.71847534179688, "loss": 1.1461, "nll_loss": 1.009668231010437, "rewards/accuracies": 1.0, "rewards/chosen": 1.832283854484558, "rewards/margins": 2.9746742248535156, "rewards/rejected": -1.142390489578247, "step": 1193 }, { "epoch": 0.199, "grad_norm": 284.5718688964844, "learning_rate": 1.853906831415435e-07, "logits/chosen": 1.876865267753601, "logits/rejected": 2.0605480670928955, "logps/chosen": -22.115081787109375, "logps/rejected": -79.77393341064453, "loss": 3.3459, "nll_loss": 0.374831885099411, "rewards/accuracies": 0.0, "rewards/chosen": 1.8294124603271484, "rewards/margins": -2.4432387351989746, "rewards/rejected": 4.272651195526123, "step": 1194 }, { "epoch": 0.19916666666666666, "grad_norm": 41.1924934387207, "learning_rate": 1.8536257849972844e-07, "logits/chosen": 1.6373095512390137, "logits/rejected": 1.7879608869552612, "logps/chosen": -71.54147338867188, "logps/rejected": -77.40376281738281, "loss": 1.0896, "nll_loss": 0.9936314821243286, "rewards/accuracies": 1.0, "rewards/chosen": 0.596173882484436, "rewards/margins": 3.731827735900879, "rewards/rejected": -3.1356537342071533, "step": 1195 }, { "epoch": 0.19933333333333333, "grad_norm": 44.1147575378418, "learning_rate": 1.853344489853074e-07, "logits/chosen": 2.303968906402588, "logits/rejected": 2.3735873699188232, "logps/chosen": -87.82489013671875, "logps/rejected": -101.64007568359375, "loss": 1.3693, "nll_loss": 1.3108192682266235, "rewards/accuracies": 1.0, "rewards/chosen": 1.15008544921875, "rewards/margins": 4.500823497772217, "rewards/rejected": -3.350738048553467, "step": 1196 }, { "epoch": 0.1995, "grad_norm": 80.85449981689453, "learning_rate": 1.8530629460647656e-07, "logits/chosen": 2.7657456398010254, "logits/rejected": 2.6236040592193604, "logps/chosen": -17.38751983642578, "logps/rejected": -13.809563636779785, "loss": 1.5176, "nll_loss": 0.620982825756073, "rewards/accuracies": 0.0, "rewards/chosen": 0.4121505916118622, "rewards/margins": -0.06541013717651367, "rewards/rejected": 0.47756072878837585, "step": 1197 }, { "epoch": 0.19966666666666666, "grad_norm": 102.25452423095703, "learning_rate": 1.852781153714395e-07, "logits/chosen": 2.5156567096710205, "logits/rejected": 2.5492610931396484, "logps/chosen": -93.92587280273438, "logps/rejected": -97.51820373535156, "loss": 1.5177, "nll_loss": 1.1740734577178955, "rewards/accuracies": 1.0, "rewards/chosen": 0.5735405087471008, "rewards/margins": 1.3848273754119873, "rewards/rejected": -0.8112869262695312, "step": 1198 }, { "epoch": 0.19983333333333334, "grad_norm": 23.749574661254883, "learning_rate": 1.8524991128840698e-07, "logits/chosen": 3.3313755989074707, "logits/rejected": 3.4010491371154785, "logps/chosen": -101.50631713867188, "logps/rejected": -143.3438720703125, "loss": 0.9601, "nll_loss": 0.939873218536377, "rewards/accuracies": 1.0, "rewards/chosen": 2.503358840942383, "rewards/margins": 6.138182640075684, "rewards/rejected": -3.6348235607147217, "step": 1199 }, { "epoch": 0.2, "grad_norm": 23.696468353271484, "learning_rate": 1.8522168236559692e-07, "logits/chosen": 2.5688364505767822, "logits/rejected": 2.795426607131958, "logps/chosen": -129.89678955078125, "logps/rejected": -246.52279663085938, "loss": 1.0067, "nll_loss": 0.9840666055679321, "rewards/accuracies": 1.0, "rewards/chosen": 1.9448273181915283, "rewards/margins": 6.65423583984375, "rewards/rejected": -4.709408760070801, "step": 1200 }, { "epoch": 0.20016666666666666, "grad_norm": 40.033180236816406, "learning_rate": 1.8519342861123462e-07, "logits/chosen": 1.3306281566619873, "logits/rejected": 2.366098165512085, "logps/chosen": -22.455421447753906, "logps/rejected": -188.2357635498047, "loss": 0.6723, "nll_loss": 0.5909321308135986, "rewards/accuracies": 1.0, "rewards/chosen": 0.779788613319397, "rewards/margins": 3.9862632751464844, "rewards/rejected": -3.206474542617798, "step": 1201 }, { "epoch": 0.20033333333333334, "grad_norm": 200.3350372314453, "learning_rate": 1.851651500335525e-07, "logits/chosen": 3.242062568664551, "logits/rejected": 3.165055990219116, "logps/chosen": -37.79738998413086, "logps/rejected": -166.17648315429688, "loss": 1.5101, "nll_loss": 0.7713753581047058, "rewards/accuracies": 1.0, "rewards/chosen": 0.5024952292442322, "rewards/margins": 0.25368237495422363, "rewards/rejected": 0.24881286919116974, "step": 1202 }, { "epoch": 0.2005, "grad_norm": 38.31722640991211, "learning_rate": 1.8513684664079032e-07, "logits/chosen": 2.407048225402832, "logits/rejected": 2.681756019592285, "logps/chosen": -23.940322875976562, "logps/rejected": -155.39999389648438, "loss": 0.5403, "nll_loss": 0.5093685984611511, "rewards/accuracies": 1.0, "rewards/chosen": 2.29541277885437, "rewards/margins": 5.321152687072754, "rewards/rejected": -3.025740146636963, "step": 1203 }, { "epoch": 0.20066666666666666, "grad_norm": 159.9829864501953, "learning_rate": 1.8510851844119492e-07, "logits/chosen": 2.727328300476074, "logits/rejected": 3.012747049331665, "logps/chosen": -83.29747009277344, "logps/rejected": -85.9979248046875, "loss": 1.9223, "nll_loss": 1.3435075283050537, "rewards/accuracies": 1.0, "rewards/chosen": -1.2795411348342896, "rewards/margins": 0.8062864542007446, "rewards/rejected": -2.085827589035034, "step": 1204 }, { "epoch": 0.20083333333333334, "grad_norm": 48.29962158203125, "learning_rate": 1.8508016544302055e-07, "logits/chosen": 1.847987174987793, "logits/rejected": 1.9241399765014648, "logps/chosen": -13.969255447387695, "logps/rejected": -55.4255256652832, "loss": 0.6021, "nll_loss": 0.517379879951477, "rewards/accuracies": 1.0, "rewards/chosen": 0.8836321830749512, "rewards/margins": 3.7530643939971924, "rewards/rejected": -2.869432210922241, "step": 1205 }, { "epoch": 0.201, "grad_norm": 46.99458312988281, "learning_rate": 1.8505178765452851e-07, "logits/chosen": 2.811919689178467, "logits/rejected": 2.7391958236694336, "logps/chosen": -69.77316284179688, "logps/rejected": -91.93364715576172, "loss": 0.947, "nll_loss": 0.820860743522644, "rewards/accuracies": 1.0, "rewards/chosen": 1.0473419427871704, "rewards/margins": 2.9139761924743652, "rewards/rejected": -1.8666343688964844, "step": 1206 }, { "epoch": 0.20116666666666666, "grad_norm": 44.15155029296875, "learning_rate": 1.8502338508398747e-07, "logits/chosen": 3.0906643867492676, "logits/rejected": 3.0959808826446533, "logps/chosen": -66.16311645507812, "logps/rejected": -85.24400329589844, "loss": 0.878, "nll_loss": 0.7518537044525146, "rewards/accuracies": 1.0, "rewards/chosen": 1.351100206375122, "rewards/margins": 2.936279535293579, "rewards/rejected": -1.585179328918457, "step": 1207 }, { "epoch": 0.20133333333333334, "grad_norm": 36.532833099365234, "learning_rate": 1.8499495773967324e-07, "logits/chosen": 0.13457678258419037, "logits/rejected": 2.3508002758026123, "logps/chosen": -90.7850570678711, "logps/rejected": -439.9713134765625, "loss": 0.9451, "nll_loss": 0.8729333877563477, "rewards/accuracies": 1.0, "rewards/chosen": 0.47005695104599, "rewards/margins": 8.642236709594727, "rewards/rejected": -8.17218017578125, "step": 1208 }, { "epoch": 0.2015, "grad_norm": 42.593544006347656, "learning_rate": 1.8496650562986884e-07, "logits/chosen": 4.037491321563721, "logits/rejected": 3.6024391651153564, "logps/chosen": -13.23377799987793, "logps/rejected": -17.838783264160156, "loss": 0.7357, "nll_loss": 0.11507634073495865, "rewards/accuracies": 1.0, "rewards/chosen": -0.24779148399829865, "rewards/margins": 0.4738094210624695, "rewards/rejected": -0.7216008901596069, "step": 1209 }, { "epoch": 0.20166666666666666, "grad_norm": 33.06840896606445, "learning_rate": 1.8493802876286456e-07, "logits/chosen": 1.7431163787841797, "logits/rejected": 2.343919277191162, "logps/chosen": -55.1398811340332, "logps/rejected": -322.9701232910156, "loss": 0.8426, "nll_loss": 0.787712574005127, "rewards/accuracies": 1.0, "rewards/chosen": 0.7939667105674744, "rewards/margins": 7.973132610321045, "rewards/rejected": -7.179165840148926, "step": 1210 }, { "epoch": 0.20183333333333334, "grad_norm": 25.908016204833984, "learning_rate": 1.8490952714695784e-07, "logits/chosen": 2.500565767288208, "logits/rejected": 2.618295669555664, "logps/chosen": -115.82032012939453, "logps/rejected": -244.1031951904297, "loss": 1.0375, "nll_loss": 1.015967845916748, "rewards/accuracies": 1.0, "rewards/chosen": 1.9437065124511719, "rewards/margins": 6.970149040222168, "rewards/rejected": -5.026442527770996, "step": 1211 }, { "epoch": 0.202, "grad_norm": 74.80113983154297, "learning_rate": 1.8488100079045342e-07, "logits/chosen": 2.6473867893218994, "logits/rejected": 2.7786219120025635, "logps/chosen": -40.63929748535156, "logps/rejected": -346.58172607421875, "loss": 1.6604, "nll_loss": 1.6255720853805542, "rewards/accuracies": 1.0, "rewards/chosen": 1.4199905395507812, "rewards/margins": 6.227626323699951, "rewards/rejected": -4.80763578414917, "step": 1212 }, { "epoch": 0.20216666666666666, "grad_norm": 71.76296997070312, "learning_rate": 1.8485244970166317e-07, "logits/chosen": 0.7215235829353333, "logits/rejected": 1.2700144052505493, "logps/chosen": -29.840518951416016, "logps/rejected": -390.4790954589844, "loss": 1.4538, "nll_loss": 1.2974138259887695, "rewards/accuracies": 1.0, "rewards/chosen": -0.06232795864343643, "rewards/margins": 3.117222309112549, "rewards/rejected": -3.1795501708984375, "step": 1213 }, { "epoch": 0.20233333333333334, "grad_norm": 60.223697662353516, "learning_rate": 1.8482387388890618e-07, "logits/chosen": 1.4226435422897339, "logits/rejected": 2.3403501510620117, "logps/chosen": -84.44667053222656, "logps/rejected": -226.382080078125, "loss": 1.7793, "nll_loss": 1.7593055963516235, "rewards/accuracies": 1.0, "rewards/chosen": 1.9031343460083008, "rewards/margins": 8.703217506408691, "rewards/rejected": -6.800083160400391, "step": 1214 }, { "epoch": 0.2025, "grad_norm": 29.367753982543945, "learning_rate": 1.8479527336050878e-07, "logits/chosen": 3.0059099197387695, "logits/rejected": 3.014448642730713, "logps/chosen": -84.05104064941406, "logps/rejected": -126.63804626464844, "loss": 1.0377, "nll_loss": 1.0006076097488403, "rewards/accuracies": 1.0, "rewards/chosen": 1.7979538440704346, "rewards/margins": 5.0885114669799805, "rewards/rejected": -3.290557384490967, "step": 1215 }, { "epoch": 0.20266666666666666, "grad_norm": 17.693254470825195, "learning_rate": 1.8476664812480445e-07, "logits/chosen": 1.529766321182251, "logits/rejected": 0.6691815853118896, "logps/chosen": -160.5328369140625, "logps/rejected": -99.06887817382812, "loss": 0.6759, "nll_loss": 0.6525724530220032, "rewards/accuracies": 1.0, "rewards/chosen": 2.2545547485351562, "rewards/margins": 5.927501678466797, "rewards/rejected": -3.6729469299316406, "step": 1216 }, { "epoch": 0.20283333333333334, "grad_norm": 147.7724609375, "learning_rate": 1.8473799819013392e-07, "logits/chosen": 2.8086063861846924, "logits/rejected": 2.782032012939453, "logps/chosen": -64.61161041259766, "logps/rejected": -71.64651489257812, "loss": 1.5438, "nll_loss": 0.7601364850997925, "rewards/accuracies": 1.0, "rewards/chosen": 0.5460724234580994, "rewards/margins": 0.1703232228755951, "rewards/rejected": 0.3757492005825043, "step": 1217 }, { "epoch": 0.203, "grad_norm": 27.270631790161133, "learning_rate": 1.8470932356484506e-07, "logits/chosen": 2.7570931911468506, "logits/rejected": 2.857296943664551, "logps/chosen": -59.64812469482422, "logps/rejected": -153.639892578125, "loss": 0.8341, "nll_loss": 0.8060557246208191, "rewards/accuracies": 1.0, "rewards/chosen": 1.5974931716918945, "rewards/margins": 7.018393039703369, "rewards/rejected": -5.420899868011475, "step": 1218 }, { "epoch": 0.20316666666666666, "grad_norm": 291.3513488769531, "learning_rate": 1.84680624257293e-07, "logits/chosen": 1.440641164779663, "logits/rejected": 1.9386894702911377, "logps/chosen": -72.59208679199219, "logps/rejected": -83.18222045898438, "loss": 4.5784, "nll_loss": 1.3198562860488892, "rewards/accuracies": 0.0, "rewards/chosen": -0.24449920654296875, "rewards/margins": -3.061997413635254, "rewards/rejected": 2.817498207092285, "step": 1219 }, { "epoch": 0.20333333333333334, "grad_norm": 20.820960998535156, "learning_rate": 1.8465190027584002e-07, "logits/chosen": 1.0357204675674438, "logits/rejected": 1.9095563888549805, "logps/chosen": -131.65707397460938, "logps/rejected": -508.0299072265625, "loss": 0.8591, "nll_loss": 0.8439555168151855, "rewards/accuracies": 1.0, "rewards/chosen": 2.1785340309143066, "rewards/margins": 10.2158203125, "rewards/rejected": -8.037286758422852, "step": 1220 }, { "epoch": 0.2035, "grad_norm": 87.97158813476562, "learning_rate": 1.846231516288556e-07, "logits/chosen": 2.7332923412323, "logits/rejected": 2.7810404300689697, "logps/chosen": -12.121784210205078, "logps/rejected": -66.46861267089844, "loss": 1.46, "nll_loss": 1.0101487636566162, "rewards/accuracies": 1.0, "rewards/chosen": 0.09020061790943146, "rewards/margins": 0.9674569964408875, "rewards/rejected": -0.8772563934326172, "step": 1221 }, { "epoch": 0.20366666666666666, "grad_norm": 30.377138137817383, "learning_rate": 1.845943783247164e-07, "logits/chosen": 0.5497713088989258, "logits/rejected": 2.7774322032928467, "logps/chosen": -19.79402732849121, "logps/rejected": -278.6640930175781, "loss": 0.5729, "nll_loss": 0.5208954811096191, "rewards/accuracies": 1.0, "rewards/chosen": 1.1594663858413696, "rewards/margins": 4.8800811767578125, "rewards/rejected": -3.7206146717071533, "step": 1222 }, { "epoch": 0.20383333333333334, "grad_norm": 62.33089828491211, "learning_rate": 1.8456558037180628e-07, "logits/chosen": 2.801854133605957, "logits/rejected": 2.777099370956421, "logps/chosen": -68.01239013671875, "logps/rejected": -44.49393844604492, "loss": 1.568, "nll_loss": 1.4470722675323486, "rewards/accuracies": 1.0, "rewards/chosen": 0.4789589047431946, "rewards/margins": 3.2041842937469482, "rewards/rejected": -2.7252254486083984, "step": 1223 }, { "epoch": 0.204, "grad_norm": 92.5699462890625, "learning_rate": 1.8453675777851624e-07, "logits/chosen": 2.754059076309204, "logits/rejected": 2.362964391708374, "logps/chosen": -135.64292907714844, "logps/rejected": -14.322375297546387, "loss": 1.5903, "nll_loss": 1.0597102642059326, "rewards/accuracies": 1.0, "rewards/chosen": 2.373173475265503, "rewards/margins": 1.3195421695709229, "rewards/rejected": 1.05363130569458, "step": 1224 }, { "epoch": 0.20416666666666666, "grad_norm": 34.91926193237305, "learning_rate": 1.8450791055324454e-07, "logits/chosen": 3.037034034729004, "logits/rejected": 3.0046305656433105, "logps/chosen": -127.10818481445312, "logps/rejected": -41.61201858520508, "loss": 1.1588, "nll_loss": 1.0863944292068481, "rewards/accuracies": 1.0, "rewards/chosen": 1.5476808547973633, "rewards/margins": 3.824631452560425, "rewards/rejected": -2.2769505977630615, "step": 1225 }, { "epoch": 0.20433333333333334, "grad_norm": 27.475238800048828, "learning_rate": 1.8447903870439654e-07, "logits/chosen": 3.521242380142212, "logits/rejected": 3.7197954654693604, "logps/chosen": -70.86798095703125, "logps/rejected": -163.49496459960938, "loss": 0.8974, "nll_loss": 0.8749132752418518, "rewards/accuracies": 1.0, "rewards/chosen": 1.8344802856445312, "rewards/margins": 7.363372802734375, "rewards/rejected": -5.528892517089844, "step": 1226 }, { "epoch": 0.2045, "grad_norm": 68.12130737304688, "learning_rate": 1.8445014224038483e-07, "logits/chosen": 2.4400556087493896, "logits/rejected": 2.685715675354004, "logps/chosen": -25.11741065979004, "logps/rejected": -160.7448272705078, "loss": 0.5601, "nll_loss": 0.5344129800796509, "rewards/accuracies": 1.0, "rewards/chosen": 2.177704095840454, "rewards/margins": 5.737927436828613, "rewards/rejected": -3.560223340988159, "step": 1227 }, { "epoch": 0.20466666666666666, "grad_norm": 28.619892120361328, "learning_rate": 1.8442122116962908e-07, "logits/chosen": 1.6222035884857178, "logits/rejected": 1.3335570096969604, "logps/chosen": -80.98908996582031, "logps/rejected": -90.49517059326172, "loss": 0.8461, "nll_loss": 0.8180716633796692, "rewards/accuracies": 1.0, "rewards/chosen": 1.763819932937622, "rewards/margins": 6.054872512817383, "rewards/rejected": -4.29105281829834, "step": 1228 }, { "epoch": 0.20483333333333334, "grad_norm": 26.467498779296875, "learning_rate": 1.8439227550055628e-07, "logits/chosen": 0.8828450441360474, "logits/rejected": 1.6543267965316772, "logps/chosen": -55.536842346191406, "logps/rejected": -216.70401000976562, "loss": 0.6317, "nll_loss": 0.5785087943077087, "rewards/accuracies": 1.0, "rewards/chosen": 0.9781365394592285, "rewards/margins": 5.362215518951416, "rewards/rejected": -4.3840789794921875, "step": 1229 }, { "epoch": 0.205, "grad_norm": 28.073461532592773, "learning_rate": 1.8436330524160044e-07, "logits/chosen": 2.7899348735809326, "logits/rejected": 2.715296745300293, "logps/chosen": -63.852054595947266, "logps/rejected": -93.0589599609375, "loss": 0.7433, "nll_loss": 0.7016708254814148, "rewards/accuracies": 1.0, "rewards/chosen": 2.113973617553711, "rewards/margins": 4.792104721069336, "rewards/rejected": -2.678131103515625, "step": 1230 }, { "epoch": 0.20516666666666666, "grad_norm": 326.13897705078125, "learning_rate": 1.8433431040120285e-07, "logits/chosen": 3.294065237045288, "logits/rejected": 3.331303358078003, "logps/chosen": -193.35415649414062, "logps/rejected": -298.3199157714844, "loss": 1.5305, "nll_loss": 1.007053017616272, "rewards/accuracies": 1.0, "rewards/chosen": -1.4824646711349487, "rewards/margins": 1.124038815498352, "rewards/rejected": -2.606503486633301, "step": 1231 }, { "epoch": 0.20533333333333334, "grad_norm": 117.30386352539062, "learning_rate": 1.8430529098781186e-07, "logits/chosen": 2.5891947746276855, "logits/rejected": 2.63035249710083, "logps/chosen": -92.18843841552734, "logps/rejected": -93.32711791992188, "loss": 1.5591, "nll_loss": 1.0130598545074463, "rewards/accuracies": 1.0, "rewards/chosen": 1.2490440607070923, "rewards/margins": 0.8650550842285156, "rewards/rejected": 0.3839889466762543, "step": 1232 }, { "epoch": 0.2055, "grad_norm": 100.13888549804688, "learning_rate": 1.8427624700988305e-07, "logits/chosen": 2.201814651489258, "logits/rejected": 2.0296237468719482, "logps/chosen": -107.78884887695312, "logps/rejected": -101.96289825439453, "loss": 1.3218, "nll_loss": 1.0364311933517456, "rewards/accuracies": 1.0, "rewards/chosen": 1.9509385824203491, "rewards/margins": 2.0430338382720947, "rewards/rejected": -0.09209519624710083, "step": 1233 }, { "epoch": 0.20566666666666666, "grad_norm": 30.37326431274414, "learning_rate": 1.8424717847587913e-07, "logits/chosen": 2.786041021347046, "logits/rejected": 2.7850942611694336, "logps/chosen": -63.27752685546875, "logps/rejected": -125.99124145507812, "loss": 0.8714, "nll_loss": 0.821786105632782, "rewards/accuracies": 1.0, "rewards/chosen": 1.364966630935669, "rewards/margins": 4.698078155517578, "rewards/rejected": -3.3331117630004883, "step": 1234 }, { "epoch": 0.20583333333333334, "grad_norm": 124.04951477050781, "learning_rate": 1.8421808539427002e-07, "logits/chosen": 2.193247079849243, "logits/rejected": 1.8927056789398193, "logps/chosen": -53.402828216552734, "logps/rejected": -51.992122650146484, "loss": 1.3408, "nll_loss": 0.6000317931175232, "rewards/accuracies": 1.0, "rewards/chosen": 1.2938381433486938, "rewards/margins": 0.41628074645996094, "rewards/rejected": 0.8775573968887329, "step": 1235 }, { "epoch": 0.206, "grad_norm": 30.912132263183594, "learning_rate": 1.841889677735327e-07, "logits/chosen": 2.871992349624634, "logits/rejected": 2.9701075553894043, "logps/chosen": -87.6461410522461, "logps/rejected": -126.92583465576172, "loss": 0.9781, "nll_loss": 0.9225910902023315, "rewards/accuracies": 1.0, "rewards/chosen": 1.557824730873108, "rewards/margins": 4.306182861328125, "rewards/rejected": -2.7483582496643066, "step": 1236 }, { "epoch": 0.20616666666666666, "grad_norm": 30.23142433166504, "learning_rate": 1.8415982562215134e-07, "logits/chosen": 2.6899936199188232, "logits/rejected": 2.510286569595337, "logps/chosen": -76.36764526367188, "logps/rejected": -134.0675048828125, "loss": 0.8111, "nll_loss": 0.7561153173446655, "rewards/accuracies": 1.0, "rewards/chosen": 1.2065902948379517, "rewards/margins": 4.592770576477051, "rewards/rejected": -3.3861801624298096, "step": 1237 }, { "epoch": 0.20633333333333334, "grad_norm": 27.775388717651367, "learning_rate": 1.841306589486173e-07, "logits/chosen": 2.9663734436035156, "logits/rejected": 3.614696741104126, "logps/chosen": -16.81576156616211, "logps/rejected": -174.76889038085938, "loss": 0.4181, "nll_loss": 0.37368354201316833, "rewards/accuracies": 1.0, "rewards/chosen": 1.9191395044326782, "rewards/margins": 4.6705145835876465, "rewards/rejected": -2.751375198364258, "step": 1238 }, { "epoch": 0.2065, "grad_norm": 24.835769653320312, "learning_rate": 1.8410146776142898e-07, "logits/chosen": 2.0114309787750244, "logits/rejected": 2.678927183151245, "logps/chosen": -16.504009246826172, "logps/rejected": -113.01873779296875, "loss": 0.3948, "nll_loss": 0.36675581336021423, "rewards/accuracies": 1.0, "rewards/chosen": 2.3616368770599365, "rewards/margins": 5.500715255737305, "rewards/rejected": -3.139078140258789, "step": 1239 }, { "epoch": 0.20666666666666667, "grad_norm": 74.06168365478516, "learning_rate": 1.8407225206909208e-07, "logits/chosen": 2.5073440074920654, "logits/rejected": 2.6291675567626953, "logps/chosen": -67.8999252319336, "logps/rejected": -147.029296875, "loss": 2.2826, "nll_loss": 2.1903204917907715, "rewards/accuracies": 1.0, "rewards/chosen": 0.5654037594795227, "rewards/margins": 3.887263536453247, "rewards/rejected": -3.321859836578369, "step": 1240 }, { "epoch": 0.20683333333333334, "grad_norm": 23.80601692199707, "learning_rate": 1.840430118801193e-07, "logits/chosen": 2.6652467250823975, "logits/rejected": 2.6726391315460205, "logps/chosen": -142.1317901611328, "logps/rejected": -214.48980712890625, "loss": 1.1342, "nll_loss": 1.11040461063385, "rewards/accuracies": 1.0, "rewards/chosen": 2.4764633178710938, "rewards/margins": 5.803865432739258, "rewards/rejected": -3.327401876449585, "step": 1241 }, { "epoch": 0.207, "grad_norm": 261.0464782714844, "learning_rate": 1.8401374720303055e-07, "logits/chosen": 2.3033697605133057, "logits/rejected": 2.4736499786376953, "logps/chosen": -37.28623580932617, "logps/rejected": -317.4537353515625, "loss": 1.5032, "nll_loss": 0.7035138607025146, "rewards/accuracies": 1.0, "rewards/chosen": 1.1417217254638672, "rewards/margins": 0.2579174041748047, "rewards/rejected": 0.8838043212890625, "step": 1242 }, { "epoch": 0.20716666666666667, "grad_norm": 38.12748336791992, "learning_rate": 1.839844580463528e-07, "logits/chosen": 2.9687893390655518, "logits/rejected": 2.978611946105957, "logps/chosen": -31.495941162109375, "logps/rejected": -56.95630645751953, "loss": 0.7087, "nll_loss": 0.5726535320281982, "rewards/accuracies": 1.0, "rewards/chosen": 1.2504892349243164, "rewards/margins": 2.8104639053344727, "rewards/rejected": -1.5599746704101562, "step": 1243 }, { "epoch": 0.20733333333333334, "grad_norm": 40.225677490234375, "learning_rate": 1.8395514441862025e-07, "logits/chosen": 2.5509302616119385, "logits/rejected": 2.5370800495147705, "logps/chosen": -81.40375518798828, "logps/rejected": -42.834861755371094, "loss": 0.96, "nll_loss": 0.8479557633399963, "rewards/accuracies": 1.0, "rewards/chosen": 0.905341386795044, "rewards/margins": 3.137096881866455, "rewards/rejected": -2.231755495071411, "step": 1244 }, { "epoch": 0.2075, "grad_norm": 32.841922760009766, "learning_rate": 1.8392580632837423e-07, "logits/chosen": 2.483603000640869, "logits/rejected": 2.4151790142059326, "logps/chosen": -114.28495788574219, "logps/rejected": -53.76874542236328, "loss": 1.2121, "nll_loss": 1.131534218788147, "rewards/accuracies": 1.0, "rewards/chosen": 2.015887498855591, "rewards/margins": 3.773165702819824, "rewards/rejected": -1.7572782039642334, "step": 1245 }, { "epoch": 0.20766666666666667, "grad_norm": 30.93092155456543, "learning_rate": 1.8389644378416308e-07, "logits/chosen": 3.1866703033447266, "logits/rejected": 3.3014578819274902, "logps/chosen": -70.60526275634766, "logps/rejected": -268.0360107421875, "loss": 1.0332, "nll_loss": 0.9944401979446411, "rewards/accuracies": 1.0, "rewards/chosen": 1.4010224342346191, "rewards/margins": 5.567227363586426, "rewards/rejected": -4.166204929351807, "step": 1246 }, { "epoch": 0.20783333333333334, "grad_norm": 27.420190811157227, "learning_rate": 1.838670567945424e-07, "logits/chosen": 0.3052615523338318, "logits/rejected": 1.6447516679763794, "logps/chosen": -75.44917297363281, "logps/rejected": -412.66748046875, "loss": 0.7011, "nll_loss": 0.6560797095298767, "rewards/accuracies": 1.0, "rewards/chosen": 1.0160163640975952, "rewards/margins": 7.6678385734558105, "rewards/rejected": -6.651822090148926, "step": 1247 }, { "epoch": 0.208, "grad_norm": 39.320308685302734, "learning_rate": 1.8383764536807482e-07, "logits/chosen": 2.20314359664917, "logits/rejected": 1.527191400527954, "logps/chosen": -136.8060760498047, "logps/rejected": -59.61457443237305, "loss": 1.3733, "nll_loss": 1.2785615921020508, "rewards/accuracies": 1.0, "rewards/chosen": 0.7897934317588806, "rewards/margins": 3.5413200855255127, "rewards/rejected": -2.7515265941619873, "step": 1248 }, { "epoch": 0.20816666666666667, "grad_norm": 358.7321472167969, "learning_rate": 1.8380820951333012e-07, "logits/chosen": 4.278208255767822, "logits/rejected": 4.086760997772217, "logps/chosen": -224.23721313476562, "logps/rejected": -102.69920349121094, "loss": 3.7115, "nll_loss": 1.2740750312805176, "rewards/accuracies": 0.0, "rewards/chosen": -0.9324555397033691, "rewards/margins": -2.2305803298950195, "rewards/rejected": 1.2981246709823608, "step": 1249 }, { "epoch": 0.20833333333333334, "grad_norm": 30.824275970458984, "learning_rate": 1.837787492388852e-07, "logits/chosen": 2.3611652851104736, "logits/rejected": 2.6598525047302246, "logps/chosen": -55.82953643798828, "logps/rejected": -236.91290283203125, "loss": 0.8322, "nll_loss": 0.7975648045539856, "rewards/accuracies": 1.0, "rewards/chosen": 1.4020683765411377, "rewards/margins": 6.32349967956543, "rewards/rejected": -4.921431064605713, "step": 1250 }, { "epoch": 0.2085, "grad_norm": 135.49472045898438, "learning_rate": 1.8374926455332408e-07, "logits/chosen": 2.5662119388580322, "logits/rejected": 2.5035994052886963, "logps/chosen": -64.78483581542969, "logps/rejected": -29.002742767333984, "loss": 1.3727, "nll_loss": 0.8098104596138, "rewards/accuracies": 1.0, "rewards/chosen": -0.6051079034805298, "rewards/margins": 0.6654174327850342, "rewards/rejected": -1.270525336265564, "step": 1251 }, { "epoch": 0.20866666666666667, "grad_norm": 82.82237243652344, "learning_rate": 1.8371975546523792e-07, "logits/chosen": 2.731727361679077, "logits/rejected": 2.6298954486846924, "logps/chosen": -29.229522705078125, "logps/rejected": -23.47713279724121, "loss": 1.3137, "nll_loss": 0.5412874817848206, "rewards/accuracies": 1.0, "rewards/chosen": 1.559273600578308, "rewards/margins": 0.42406749725341797, "rewards/rejected": 1.1352061033248901, "step": 1252 }, { "epoch": 0.20883333333333334, "grad_norm": 66.34580993652344, "learning_rate": 1.8369022198322495e-07, "logits/chosen": 2.6548893451690674, "logits/rejected": 2.7842767238616943, "logps/chosen": -25.091083526611328, "logps/rejected": -82.71527099609375, "loss": 0.8314, "nll_loss": 0.6272770166397095, "rewards/accuracies": 1.0, "rewards/chosen": 0.7259129285812378, "rewards/margins": 2.1611366271972656, "rewards/rejected": -1.4352235794067383, "step": 1253 }, { "epoch": 0.209, "grad_norm": 83.61400604248047, "learning_rate": 1.8366066411589048e-07, "logits/chosen": 1.4582690000534058, "logits/rejected": 2.1146326065063477, "logps/chosen": -101.27116394042969, "logps/rejected": -284.57293701171875, "loss": 1.4355, "nll_loss": 1.3152098655700684, "rewards/accuracies": 1.0, "rewards/chosen": 0.017082978039979935, "rewards/margins": 4.126345157623291, "rewards/rejected": -4.109261989593506, "step": 1254 }, { "epoch": 0.20916666666666667, "grad_norm": 24.558034896850586, "learning_rate": 1.83631081871847e-07, "logits/chosen": 2.724722385406494, "logits/rejected": 3.058391571044922, "logps/chosen": -92.3705825805664, "logps/rejected": -394.7401123046875, "loss": 0.9227, "nll_loss": 0.896801769733429, "rewards/accuracies": 1.0, "rewards/chosen": 2.1124870777130127, "rewards/margins": 5.759200096130371, "rewards/rejected": -3.6467132568359375, "step": 1255 }, { "epoch": 0.20933333333333334, "grad_norm": 33.15571212768555, "learning_rate": 1.8360147525971402e-07, "logits/chosen": 2.8239481449127197, "logits/rejected": 2.9107627868652344, "logps/chosen": -59.10812759399414, "logps/rejected": -156.20445251464844, "loss": 0.8429, "nll_loss": 0.7987585067749023, "rewards/accuracies": 1.0, "rewards/chosen": 2.331594467163086, "rewards/margins": 4.754433631896973, "rewards/rejected": -2.422839403152466, "step": 1256 }, { "epoch": 0.2095, "grad_norm": 26.54585075378418, "learning_rate": 1.8357184428811827e-07, "logits/chosen": 3.329861879348755, "logits/rejected": 3.792081117630005, "logps/chosen": -68.84100341796875, "logps/rejected": -162.75331115722656, "loss": 0.8686, "nll_loss": 0.8498889207839966, "rewards/accuracies": 1.0, "rewards/chosen": 2.0371780395507812, "rewards/margins": 7.4919047355651855, "rewards/rejected": -5.454726696014404, "step": 1257 }, { "epoch": 0.20966666666666667, "grad_norm": 83.16120147705078, "learning_rate": 1.8354218896569343e-07, "logits/chosen": 2.538025140762329, "logits/rejected": 1.9907989501953125, "logps/chosen": -172.934326171875, "logps/rejected": -55.72890853881836, "loss": 1.7175, "nll_loss": 1.5037767887115479, "rewards/accuracies": 1.0, "rewards/chosen": 0.02085571363568306, "rewards/margins": 2.231424331665039, "rewards/rejected": -2.21056866645813, "step": 1258 }, { "epoch": 0.20983333333333334, "grad_norm": 31.812538146972656, "learning_rate": 1.835125093010804e-07, "logits/chosen": 2.719154119491577, "logits/rejected": 2.9723684787750244, "logps/chosen": -70.77642822265625, "logps/rejected": -367.1039123535156, "loss": 0.9487, "nll_loss": 0.9191743731498718, "rewards/accuracies": 1.0, "rewards/chosen": 1.4768364429473877, "rewards/margins": 8.039767265319824, "rewards/rejected": -6.562931060791016, "step": 1259 }, { "epoch": 0.21, "grad_norm": 30.918821334838867, "learning_rate": 1.834828053029271e-07, "logits/chosen": 1.5133073329925537, "logits/rejected": 2.432446241378784, "logps/chosen": -33.77916717529297, "logps/rejected": -278.93212890625, "loss": 0.6008, "nll_loss": 0.5629861354827881, "rewards/accuracies": 1.0, "rewards/chosen": 1.2758903503417969, "rewards/margins": 6.398029327392578, "rewards/rejected": -5.122138977050781, "step": 1260 }, { "epoch": 0.21016666666666667, "grad_norm": 32.99858093261719, "learning_rate": 1.834530769798886e-07, "logits/chosen": 2.368879795074463, "logits/rejected": 2.3877482414245605, "logps/chosen": -21.456729888916016, "logps/rejected": -30.320045471191406, "loss": 0.5671, "nll_loss": 0.45652616024017334, "rewards/accuracies": 1.0, "rewards/chosen": 1.3954572677612305, "rewards/margins": 3.14286208152771, "rewards/rejected": -1.7474048137664795, "step": 1261 }, { "epoch": 0.21033333333333334, "grad_norm": 41.98788833618164, "learning_rate": 1.8342332434062694e-07, "logits/chosen": 2.719733476638794, "logits/rejected": 2.7667200565338135, "logps/chosen": -35.91524124145508, "logps/rejected": -150.47613525390625, "loss": 0.7587, "nll_loss": 0.6776461601257324, "rewards/accuracies": 1.0, "rewards/chosen": 0.6117206811904907, "rewards/margins": 4.284903526306152, "rewards/rejected": -3.673182725906372, "step": 1262 }, { "epoch": 0.2105, "grad_norm": 29.62425994873047, "learning_rate": 1.8339354739381138e-07, "logits/chosen": 2.511087656021118, "logits/rejected": 2.861074209213257, "logps/chosen": -41.35838317871094, "logps/rejected": -148.4781951904297, "loss": 0.6619, "nll_loss": 0.6266421675682068, "rewards/accuracies": 1.0, "rewards/chosen": 1.8080048561096191, "rewards/margins": 5.1939592361450195, "rewards/rejected": -3.3859543800354004, "step": 1263 }, { "epoch": 0.21066666666666667, "grad_norm": 61.47077178955078, "learning_rate": 1.8336374614811817e-07, "logits/chosen": 2.36053204536438, "logits/rejected": 2.5319972038269043, "logps/chosen": -27.368925094604492, "logps/rejected": -277.10687255859375, "loss": 0.6835, "nll_loss": 0.5163947939872742, "rewards/accuracies": 1.0, "rewards/chosen": 1.5786017179489136, "rewards/margins": 2.6122992038726807, "rewards/rejected": -1.033697485923767, "step": 1264 }, { "epoch": 0.21083333333333334, "grad_norm": 26.355072021484375, "learning_rate": 1.8333392061223078e-07, "logits/chosen": 1.4334073066711426, "logits/rejected": 2.2074294090270996, "logps/chosen": -100.22465515136719, "logps/rejected": -342.2063903808594, "loss": 0.8354, "nll_loss": 0.814834475517273, "rewards/accuracies": 1.0, "rewards/chosen": 1.8433563709259033, "rewards/margins": 9.872897148132324, "rewards/rejected": -8.029541015625, "step": 1265 }, { "epoch": 0.211, "grad_norm": 54.79262161254883, "learning_rate": 1.833040707948395e-07, "logits/chosen": 2.2232048511505127, "logits/rejected": 2.332502841949463, "logps/chosen": -115.52899169921875, "logps/rejected": -108.31369018554688, "loss": 1.6579, "nll_loss": 1.5003764629364014, "rewards/accuracies": 1.0, "rewards/chosen": 0.22672197222709656, "rewards/margins": 2.7612953186035156, "rewards/rejected": -2.5345733165740967, "step": 1266 }, { "epoch": 0.21116666666666667, "grad_norm": 48.63263702392578, "learning_rate": 1.8327419670464197e-07, "logits/chosen": 1.0820215940475464, "logits/rejected": 1.935468316078186, "logps/chosen": -37.668113708496094, "logps/rejected": -186.67617797851562, "loss": 1.2161, "nll_loss": 1.177128553390503, "rewards/accuracies": 1.0, "rewards/chosen": 1.1567634344100952, "rewards/margins": 8.397497177124023, "rewards/rejected": -7.240733623504639, "step": 1267 }, { "epoch": 0.21133333333333335, "grad_norm": 56.868675231933594, "learning_rate": 1.8324429835034275e-07, "logits/chosen": 3.200230121612549, "logits/rejected": 3.242689371109009, "logps/chosen": -9.056822776794434, "logps/rejected": -37.23834228515625, "loss": 0.656, "nll_loss": 0.3937748968601227, "rewards/accuracies": 1.0, "rewards/chosen": 0.38381171226501465, "rewards/margins": 1.7837767601013184, "rewards/rejected": -1.3999650478363037, "step": 1268 }, { "epoch": 0.2115, "grad_norm": 119.021484375, "learning_rate": 1.8321437574065347e-07, "logits/chosen": 2.1818721294403076, "logits/rejected": 2.1433675289154053, "logps/chosen": -43.380245208740234, "logps/rejected": -41.72624588012695, "loss": 1.4459, "nll_loss": 0.5561569333076477, "rewards/accuracies": 0.0, "rewards/chosen": 0.19326402246952057, "rewards/margins": -0.0765514224767685, "rewards/rejected": 0.26981544494628906, "step": 1269 }, { "epoch": 0.21166666666666667, "grad_norm": 29.6614990234375, "learning_rate": 1.831844288842929e-07, "logits/chosen": 1.6777533292770386, "logits/rejected": 2.192955255508423, "logps/chosen": -67.01399230957031, "logps/rejected": -136.17037963867188, "loss": 0.8525, "nll_loss": 0.8073974847793579, "rewards/accuracies": 1.0, "rewards/chosen": 1.2263031005859375, "rewards/margins": 5.304110050201416, "rewards/rejected": -4.0778069496154785, "step": 1270 }, { "epoch": 0.21183333333333335, "grad_norm": 57.99751281738281, "learning_rate": 1.831544577899868e-07, "logits/chosen": 2.7500243186950684, "logits/rejected": 2.888859748840332, "logps/chosen": -347.0763854980469, "logps/rejected": -466.4100646972656, "loss": 1.6837, "nll_loss": 1.5920932292938232, "rewards/accuracies": 1.0, "rewards/chosen": 0.1998687982559204, "rewards/margins": 5.749704360961914, "rewards/rejected": -5.549835681915283, "step": 1271 }, { "epoch": 0.212, "grad_norm": 28.067607879638672, "learning_rate": 1.8312446246646806e-07, "logits/chosen": -0.2497275173664093, "logits/rejected": 0.9281541705131531, "logps/chosen": -50.8085823059082, "logps/rejected": -529.794921875, "loss": 0.8051, "nll_loss": 0.7471851706504822, "rewards/accuracies": 1.0, "rewards/chosen": 0.6929874420166016, "rewards/margins": 11.960247993469238, "rewards/rejected": -11.267260551452637, "step": 1272 }, { "epoch": 0.21216666666666667, "grad_norm": 33.296363830566406, "learning_rate": 1.830944429224766e-07, "logits/chosen": 2.7780728340148926, "logits/rejected": 2.8925654888153076, "logps/chosen": -51.05293273925781, "logps/rejected": -281.2727355957031, "loss": 0.8178, "nll_loss": 0.7507783770561218, "rewards/accuracies": 1.0, "rewards/chosen": 0.7380806803703308, "rewards/margins": 4.8258490562438965, "rewards/rejected": -4.0877685546875, "step": 1273 }, { "epoch": 0.21233333333333335, "grad_norm": 29.306867599487305, "learning_rate": 1.8306439916675934e-07, "logits/chosen": 2.2312276363372803, "logits/rejected": 2.864410400390625, "logps/chosen": -90.18598937988281, "logps/rejected": -462.5078125, "loss": 0.8998, "nll_loss": 0.8589141964912415, "rewards/accuracies": 1.0, "rewards/chosen": 1.178688883781433, "rewards/margins": 6.2486114501953125, "rewards/rejected": -5.06992244720459, "step": 1274 }, { "epoch": 0.2125, "grad_norm": 32.654720306396484, "learning_rate": 1.830343312080704e-07, "logits/chosen": 2.97117018699646, "logits/rejected": 3.1363277435302734, "logps/chosen": -34.30048370361328, "logps/rejected": -258.7946472167969, "loss": 0.851, "nll_loss": 0.8166783452033997, "rewards/accuracies": 1.0, "rewards/chosen": 1.379250407218933, "rewards/margins": 6.449991703033447, "rewards/rejected": -5.070741176605225, "step": 1275 }, { "epoch": 0.21266666666666667, "grad_norm": 55.81217575073242, "learning_rate": 1.8300423905517079e-07, "logits/chosen": 2.7966525554656982, "logits/rejected": 2.8828723430633545, "logps/chosen": -71.36394500732422, "logps/rejected": -49.323219299316406, "loss": 1.1749, "nll_loss": 1.0651335716247559, "rewards/accuracies": 1.0, "rewards/chosen": 1.7353456020355225, "rewards/margins": 3.2474467754364014, "rewards/rejected": -1.512101173400879, "step": 1276 }, { "epoch": 0.21283333333333335, "grad_norm": 88.1567153930664, "learning_rate": 1.8297412271682863e-07, "logits/chosen": 2.7453362941741943, "logits/rejected": 2.6607799530029297, "logps/chosen": -82.70470428466797, "logps/rejected": -100.8961410522461, "loss": 0.9251, "nll_loss": 0.9088429808616638, "rewards/accuracies": 1.0, "rewards/chosen": 2.5057663917541504, "rewards/margins": 6.671255111694336, "rewards/rejected": -4.1654887199401855, "step": 1277 }, { "epoch": 0.213, "grad_norm": 44.044471740722656, "learning_rate": 1.8294398220181914e-07, "logits/chosen": 1.177834153175354, "logits/rejected": 2.025325059890747, "logps/chosen": -19.36499786376953, "logps/rejected": -384.29083251953125, "loss": 0.6557, "nll_loss": 0.6051561236381531, "rewards/accuracies": 1.0, "rewards/chosen": 1.2971398830413818, "rewards/margins": 4.701925277709961, "rewards/rejected": -3.40478515625, "step": 1278 }, { "epoch": 0.21316666666666667, "grad_norm": 30.082138061523438, "learning_rate": 1.8291381751892456e-07, "logits/chosen": 2.706108808517456, "logits/rejected": 2.4066097736358643, "logps/chosen": -140.13311767578125, "logps/rejected": -97.32064819335938, "loss": 0.9103, "nll_loss": 0.844175398349762, "rewards/accuracies": 1.0, "rewards/chosen": 1.8814986944198608, "rewards/margins": 4.0201096534729, "rewards/rejected": -2.13861083984375, "step": 1279 }, { "epoch": 0.21333333333333335, "grad_norm": 40.38299560546875, "learning_rate": 1.8288362867693413e-07, "logits/chosen": 2.4301981925964355, "logits/rejected": 2.753838300704956, "logps/chosen": -30.988449096679688, "logps/rejected": -251.08633422851562, "loss": 0.6548, "nll_loss": 0.5959316492080688, "rewards/accuracies": 1.0, "rewards/chosen": 0.7658486366271973, "rewards/margins": 5.723587989807129, "rewards/rejected": -4.957739353179932, "step": 1280 }, { "epoch": 0.2135, "grad_norm": 47.100589752197266, "learning_rate": 1.8285341568464413e-07, "logits/chosen": 1.1354023218154907, "logits/rejected": 3.0366899967193604, "logps/chosen": -17.445127487182617, "logps/rejected": -458.13568115234375, "loss": 0.6428, "nll_loss": 0.5286401510238647, "rewards/accuracies": 1.0, "rewards/chosen": 0.6655405759811401, "rewards/margins": 3.1862282752990723, "rewards/rejected": -2.5206878185272217, "step": 1281 }, { "epoch": 0.21366666666666667, "grad_norm": 39.08161926269531, "learning_rate": 1.828231785508579e-07, "logits/chosen": 3.0299317836761475, "logits/rejected": 3.1805713176727295, "logps/chosen": -80.50509643554688, "logps/rejected": -170.85989379882812, "loss": 1.1259, "nll_loss": 1.0734013319015503, "rewards/accuracies": 1.0, "rewards/chosen": 0.9369301199913025, "rewards/margins": 5.556071758270264, "rewards/rejected": -4.619141578674316, "step": 1282 }, { "epoch": 0.21383333333333332, "grad_norm": 28.132020950317383, "learning_rate": 1.8279291728438586e-07, "logits/chosen": 2.181165933609009, "logits/rejected": 1.87375807762146, "logps/chosen": -96.05940246582031, "logps/rejected": -96.68441009521484, "loss": 1.0498, "nll_loss": 1.011151671409607, "rewards/accuracies": 1.0, "rewards/chosen": 2.535914897918701, "rewards/margins": 5.035854339599609, "rewards/rejected": -2.4999396800994873, "step": 1283 }, { "epoch": 0.214, "grad_norm": 39.090946197509766, "learning_rate": 1.8276263189404539e-07, "logits/chosen": 2.723254919052124, "logits/rejected": 2.576782703399658, "logps/chosen": -21.160953521728516, "logps/rejected": -82.73432922363281, "loss": 0.5452, "nll_loss": 0.45023313164711, "rewards/accuracies": 1.0, "rewards/chosen": 1.205788016319275, "rewards/margins": 3.3745784759521484, "rewards/rejected": -2.168790578842163, "step": 1284 }, { "epoch": 0.21416666666666667, "grad_norm": 37.21101379394531, "learning_rate": 1.827323223886609e-07, "logits/chosen": 2.353172540664673, "logits/rejected": 2.7347521781921387, "logps/chosen": -14.21304702758789, "logps/rejected": -99.41584777832031, "loss": 0.5668, "nll_loss": 0.5076088309288025, "rewards/accuracies": 1.0, "rewards/chosen": 1.3533271551132202, "rewards/margins": 4.244241714477539, "rewards/rejected": -2.8909144401550293, "step": 1285 }, { "epoch": 0.21433333333333332, "grad_norm": 97.0829849243164, "learning_rate": 1.827019887770639e-07, "logits/chosen": 2.9466967582702637, "logits/rejected": 3.0002245903015137, "logps/chosen": -53.47061538696289, "logps/rejected": -63.44062042236328, "loss": 1.1062, "nll_loss": 0.7531072497367859, "rewards/accuracies": 1.0, "rewards/chosen": 1.390183687210083, "rewards/margins": 1.5346943140029907, "rewards/rejected": -0.1445106565952301, "step": 1286 }, { "epoch": 0.2145, "grad_norm": 40.07606506347656, "learning_rate": 1.8267163106809287e-07, "logits/chosen": 2.7234230041503906, "logits/rejected": 2.712562322616577, "logps/chosen": -36.76805114746094, "logps/rejected": -134.7193603515625, "loss": 0.8259, "nll_loss": 0.7822988629341125, "rewards/accuracies": 1.0, "rewards/chosen": 1.7252670526504517, "rewards/margins": 4.74157190322876, "rewards/rejected": -3.0163047313690186, "step": 1287 }, { "epoch": 0.21466666666666667, "grad_norm": 173.45652770996094, "learning_rate": 1.826412492705933e-07, "logits/chosen": 1.0703901052474976, "logits/rejected": 1.9635815620422363, "logps/chosen": -130.67408752441406, "logps/rejected": -306.0728759765625, "loss": 2.2609, "nll_loss": 1.719395637512207, "rewards/accuracies": 1.0, "rewards/chosen": -2.419865369796753, "rewards/margins": 1.9175307750701904, "rewards/rejected": -4.337396144866943, "step": 1288 }, { "epoch": 0.21483333333333332, "grad_norm": 160.34588623046875, "learning_rate": 1.826108433934177e-07, "logits/chosen": 2.520132303237915, "logits/rejected": 2.3965461254119873, "logps/chosen": -64.90052795410156, "logps/rejected": -58.7589225769043, "loss": 1.6856, "nll_loss": 0.9140918254852295, "rewards/accuracies": 1.0, "rewards/chosen": -0.8399563431739807, "rewards/margins": 0.15137016773223877, "rewards/rejected": -0.9913265109062195, "step": 1289 }, { "epoch": 0.215, "grad_norm": 50.92367172241211, "learning_rate": 1.8258041344542562e-07, "logits/chosen": 2.0105035305023193, "logits/rejected": 2.170348644256592, "logps/chosen": -80.48091888427734, "logps/rejected": -74.39032745361328, "loss": 1.252, "nll_loss": 1.1497273445129395, "rewards/accuracies": 1.0, "rewards/chosen": 0.42639774084091187, "rewards/margins": 3.7172977924346924, "rewards/rejected": -3.2908999919891357, "step": 1290 }, { "epoch": 0.21516666666666667, "grad_norm": 82.32613372802734, "learning_rate": 1.8254995943548366e-07, "logits/chosen": 2.0115346908569336, "logits/rejected": 2.1027166843414307, "logps/chosen": -22.744964599609375, "logps/rejected": -32.05428695678711, "loss": 1.3993, "nll_loss": 1.1971033811569214, "rewards/accuracies": 1.0, "rewards/chosen": 0.8147659301757812, "rewards/margins": 2.1806576251983643, "rewards/rejected": -1.365891695022583, "step": 1291 }, { "epoch": 0.21533333333333332, "grad_norm": 23.127601623535156, "learning_rate": 1.8251948137246537e-07, "logits/chosen": 2.604746103286743, "logits/rejected": 2.691070318222046, "logps/chosen": -126.62838745117188, "logps/rejected": -292.20452880859375, "loss": 0.721, "nll_loss": 0.6735551357269287, "rewards/accuracies": 1.0, "rewards/chosen": 2.7980592250823975, "rewards/margins": 4.90362548828125, "rewards/rejected": -2.1055665016174316, "step": 1292 }, { "epoch": 0.2155, "grad_norm": 44.119815826416016, "learning_rate": 1.8248897926525127e-07, "logits/chosen": 2.5302112102508545, "logits/rejected": 2.6081721782684326, "logps/chosen": -24.59720230102539, "logps/rejected": -173.9739227294922, "loss": 0.7402, "nll_loss": 0.6647891998291016, "rewards/accuracies": 1.0, "rewards/chosen": 0.38511011004447937, "rewards/margins": 7.36187219619751, "rewards/rejected": -6.976762294769287, "step": 1293 }, { "epoch": 0.21566666666666667, "grad_norm": 55.23116683959961, "learning_rate": 1.82458453122729e-07, "logits/chosen": 2.711538076400757, "logits/rejected": 2.7468338012695312, "logps/chosen": -79.24179077148438, "logps/rejected": -121.9312744140625, "loss": 1.1982, "nll_loss": 1.056557297706604, "rewards/accuracies": 1.0, "rewards/chosen": 0.9949409365653992, "rewards/margins": 2.729457139968872, "rewards/rejected": -1.7345161437988281, "step": 1294 }, { "epoch": 0.21583333333333332, "grad_norm": 160.35704040527344, "learning_rate": 1.8242790295379312e-07, "logits/chosen": 1.1040980815887451, "logits/rejected": 2.0154800415039062, "logps/chosen": -127.96180725097656, "logps/rejected": -307.0529479980469, "loss": 2.1447, "nll_loss": 1.6837078332901, "rewards/accuracies": 1.0, "rewards/chosen": -2.148637533187866, "rewards/margins": 2.2867658138275146, "rewards/rejected": -4.435403347015381, "step": 1295 }, { "epoch": 0.216, "grad_norm": 136.47557067871094, "learning_rate": 1.8239732876734524e-07, "logits/chosen": 2.646968126296997, "logits/rejected": 2.463693857192993, "logps/chosen": -66.24151611328125, "logps/rejected": -104.50520324707031, "loss": 1.6743, "nll_loss": 1.0859265327453613, "rewards/accuracies": 1.0, "rewards/chosen": -0.7546879053115845, "rewards/margins": 0.609605073928833, "rewards/rejected": -1.3642929792404175, "step": 1296 }, { "epoch": 0.21616666666666667, "grad_norm": 45.19565200805664, "learning_rate": 1.8236673057229393e-07, "logits/chosen": 1.7021311521530151, "logits/rejected": 1.952477216720581, "logps/chosen": -26.161609649658203, "logps/rejected": -172.32846069335938, "loss": 0.812, "nll_loss": 0.7267113924026489, "rewards/accuracies": 1.0, "rewards/chosen": 1.1835041046142578, "rewards/margins": 3.567105531692505, "rewards/rejected": -2.383601427078247, "step": 1297 }, { "epoch": 0.21633333333333332, "grad_norm": 23.847431182861328, "learning_rate": 1.8233610837755477e-07, "logits/chosen": 1.1782183647155762, "logits/rejected": 2.3370378017425537, "logps/chosen": -54.801815032958984, "logps/rejected": -258.2695007324219, "loss": 0.6381, "nll_loss": 0.6022177338600159, "rewards/accuracies": 1.0, "rewards/chosen": 1.24884831905365, "rewards/margins": 7.3935980796813965, "rewards/rejected": -6.144749641418457, "step": 1298 }, { "epoch": 0.2165, "grad_norm": 35.82067108154297, "learning_rate": 1.823054621920503e-07, "logits/chosen": 3.0414254665374756, "logits/rejected": 3.038787841796875, "logps/chosen": -49.63114929199219, "logps/rejected": -73.67144775390625, "loss": 0.7668, "nll_loss": 0.6990302801132202, "rewards/accuracies": 1.0, "rewards/chosen": 0.8524837493896484, "rewards/margins": 4.411097526550293, "rewards/rejected": -3.5586137771606445, "step": 1299 }, { "epoch": 0.21666666666666667, "grad_norm": 74.0166244506836, "learning_rate": 1.8227479202471012e-07, "logits/chosen": 1.9125936031341553, "logits/rejected": 2.2287209033966064, "logps/chosen": -68.83259582519531, "logps/rejected": -156.5018768310547, "loss": 1.6196, "nll_loss": 1.298728108406067, "rewards/accuracies": 1.0, "rewards/chosen": -1.4388519525527954, "rewards/margins": 2.738369941711426, "rewards/rejected": -4.177221775054932, "step": 1300 }, { "epoch": 0.21683333333333332, "grad_norm": 56.46044921875, "learning_rate": 1.8224409788447078e-07, "logits/chosen": 2.200295925140381, "logits/rejected": 2.523200273513794, "logps/chosen": -21.746294021606445, "logps/rejected": -337.3548889160156, "loss": 0.7955, "nll_loss": 0.7014934420585632, "rewards/accuracies": 1.0, "rewards/chosen": 0.44931527972221375, "rewards/margins": 3.9699268341064453, "rewards/rejected": -3.520611524581909, "step": 1301 }, { "epoch": 0.217, "grad_norm": 90.93794250488281, "learning_rate": 1.822133797802758e-07, "logits/chosen": 2.6562652587890625, "logits/rejected": 2.7512872219085693, "logps/chosen": -28.262149810791016, "logps/rejected": -123.2339096069336, "loss": 0.9697, "nll_loss": 0.6893206834793091, "rewards/accuracies": 1.0, "rewards/chosen": 0.43767285346984863, "rewards/margins": 1.6794766187667847, "rewards/rejected": -1.241803765296936, "step": 1302 }, { "epoch": 0.21716666666666667, "grad_norm": 75.16671752929688, "learning_rate": 1.8218263772107572e-07, "logits/chosen": 2.916318416595459, "logits/rejected": 3.2569587230682373, "logps/chosen": -20.27646255493164, "logps/rejected": -326.21783447265625, "loss": 1.6303, "nll_loss": 1.5597277879714966, "rewards/accuracies": 1.0, "rewards/chosen": 0.7882431149482727, "rewards/margins": 4.3788957595825195, "rewards/rejected": -3.5906527042388916, "step": 1303 }, { "epoch": 0.21733333333333332, "grad_norm": 61.98628234863281, "learning_rate": 1.82151871715828e-07, "logits/chosen": 1.2123677730560303, "logits/rejected": 2.2291765213012695, "logps/chosen": -103.53263092041016, "logps/rejected": -363.91217041015625, "loss": 1.7707, "nll_loss": 1.6176973581314087, "rewards/accuracies": 1.0, "rewards/chosen": -0.5606796145439148, "rewards/margins": 11.342846870422363, "rewards/rejected": -11.903526306152344, "step": 1304 }, { "epoch": 0.2175, "grad_norm": 37.73351287841797, "learning_rate": 1.821210817734972e-07, "logits/chosen": 2.9243483543395996, "logits/rejected": 2.9133200645446777, "logps/chosen": -59.76313781738281, "logps/rejected": -234.727294921875, "loss": 0.8553, "nll_loss": 0.7761446833610535, "rewards/accuracies": 1.0, "rewards/chosen": 0.5773422122001648, "rewards/margins": 4.3901686668396, "rewards/rejected": -3.812826633453369, "step": 1305 }, { "epoch": 0.21766666666666667, "grad_norm": 45.11479949951172, "learning_rate": 1.8209026790305465e-07, "logits/chosen": 0.7866069674491882, "logits/rejected": 1.7889983654022217, "logps/chosen": -68.75909423828125, "logps/rejected": -259.3452453613281, "loss": 1.3891, "nll_loss": 1.3482173681259155, "rewards/accuracies": 1.0, "rewards/chosen": 1.2052100896835327, "rewards/margins": 5.878416061401367, "rewards/rejected": -4.673205852508545, "step": 1306 }, { "epoch": 0.21783333333333332, "grad_norm": 26.03003692626953, "learning_rate": 1.8205943011347883e-07, "logits/chosen": 1.9306246042251587, "logits/rejected": 1.8626813888549805, "logps/chosen": -161.97955322265625, "logps/rejected": -130.22763061523438, "loss": 0.9754, "nll_loss": 0.9417415857315063, "rewards/accuracies": 1.0, "rewards/chosen": 1.7361085414886475, "rewards/margins": 5.354885101318359, "rewards/rejected": -3.618776798248291, "step": 1307 }, { "epoch": 0.218, "grad_norm": 84.95336151123047, "learning_rate": 1.8202856841375515e-07, "logits/chosen": 2.44496488571167, "logits/rejected": 2.1529903411865234, "logps/chosen": -66.35478973388672, "logps/rejected": -37.36372756958008, "loss": 1.5896, "nll_loss": 1.327095627784729, "rewards/accuracies": 1.0, "rewards/chosen": -0.13890457153320312, "rewards/margins": 1.8812601566314697, "rewards/rejected": -2.020164728164673, "step": 1308 }, { "epoch": 0.21816666666666668, "grad_norm": 30.232357025146484, "learning_rate": 1.8199768281287597e-07, "logits/chosen": 2.726951837539673, "logits/rejected": 2.6987743377685547, "logps/chosen": -78.47327423095703, "logps/rejected": -85.8726806640625, "loss": 1.0237, "nll_loss": 0.9569911360740662, "rewards/accuracies": 1.0, "rewards/chosen": 1.6167786121368408, "rewards/margins": 3.964097738265991, "rewards/rejected": -2.3473191261291504, "step": 1309 }, { "epoch": 0.21833333333333332, "grad_norm": 28.8843936920166, "learning_rate": 1.819667733198406e-07, "logits/chosen": 2.4123964309692383, "logits/rejected": 1.913050889968872, "logps/chosen": -80.97343444824219, "logps/rejected": -61.4882926940918, "loss": 0.9909, "nll_loss": 0.9307292103767395, "rewards/accuracies": 1.0, "rewards/chosen": 2.1429550647735596, "rewards/margins": 4.249883651733398, "rewards/rejected": -2.106928586959839, "step": 1310 }, { "epoch": 0.2185, "grad_norm": 38.2808837890625, "learning_rate": 1.8193583994365529e-07, "logits/chosen": 1.6700940132141113, "logits/rejected": 1.63444185256958, "logps/chosen": -93.8662338256836, "logps/rejected": -72.79547119140625, "loss": 0.9487, "nll_loss": 0.8772545456886292, "rewards/accuracies": 1.0, "rewards/chosen": 1.065271019935608, "rewards/margins": 3.9798412322998047, "rewards/rejected": -2.9145703315734863, "step": 1311 }, { "epoch": 0.21866666666666668, "grad_norm": 24.72816276550293, "learning_rate": 1.8190488269333332e-07, "logits/chosen": 2.1356937885284424, "logits/rejected": 2.281972646713257, "logps/chosen": -93.52685546875, "logps/rejected": -238.68923950195312, "loss": 1.0326, "nll_loss": 1.0165961980819702, "rewards/accuracies": 1.0, "rewards/chosen": 2.075425863265991, "rewards/margins": 9.506383895874023, "rewards/rejected": -7.430957794189453, "step": 1312 }, { "epoch": 0.21883333333333332, "grad_norm": 67.91783905029297, "learning_rate": 1.818739015778949e-07, "logits/chosen": 2.946669816970825, "logits/rejected": 2.8803417682647705, "logps/chosen": -113.70491790771484, "logps/rejected": -62.10905456542969, "loss": 1.3628, "nll_loss": 1.2359230518341064, "rewards/accuracies": 1.0, "rewards/chosen": 1.0216530561447144, "rewards/margins": 2.903158187866211, "rewards/rejected": -1.8815052509307861, "step": 1313 }, { "epoch": 0.219, "grad_norm": 47.300289154052734, "learning_rate": 1.8184289660636713e-07, "logits/chosen": 2.6649515628814697, "logits/rejected": 2.924929618835449, "logps/chosen": -57.777008056640625, "logps/rejected": -195.53260803222656, "loss": 0.7816, "nll_loss": 0.6349121928215027, "rewards/accuracies": 1.0, "rewards/chosen": 1.5259950160980225, "rewards/margins": 2.7849626541137695, "rewards/rejected": -1.258967638015747, "step": 1314 }, { "epoch": 0.21916666666666668, "grad_norm": 31.38109016418457, "learning_rate": 1.8181186778778416e-07, "logits/chosen": 1.7062065601348877, "logits/rejected": 1.6345535516738892, "logps/chosen": -70.37319946289062, "logps/rejected": -72.27015686035156, "loss": 0.8133, "nll_loss": 0.7649260759353638, "rewards/accuracies": 1.0, "rewards/chosen": 1.2346314191818237, "rewards/margins": 4.906153202056885, "rewards/rejected": -3.6715219020843506, "step": 1315 }, { "epoch": 0.21933333333333332, "grad_norm": 36.61269760131836, "learning_rate": 1.8178081513118703e-07, "logits/chosen": 2.912170648574829, "logits/rejected": 3.0466153621673584, "logps/chosen": -12.847599029541016, "logps/rejected": -165.33499145507812, "loss": 0.4652, "nll_loss": 0.42825326323509216, "rewards/accuracies": 1.0, "rewards/chosen": 1.3405101299285889, "rewards/margins": 5.895442962646484, "rewards/rejected": -4.554933071136475, "step": 1316 }, { "epoch": 0.2195, "grad_norm": 14.52726936340332, "learning_rate": 1.8174973864562378e-07, "logits/chosen": 2.3179218769073486, "logits/rejected": 2.337799310684204, "logps/chosen": -160.91517639160156, "logps/rejected": -266.5331115722656, "loss": 0.6578, "nll_loss": 0.6385524272918701, "rewards/accuracies": 1.0, "rewards/chosen": 2.5975770950317383, "rewards/margins": 6.2015228271484375, "rewards/rejected": -3.6039459705352783, "step": 1317 }, { "epoch": 0.21966666666666668, "grad_norm": 26.553014755249023, "learning_rate": 1.8171863834014927e-07, "logits/chosen": 1.2637219429016113, "logits/rejected": 2.675496816635132, "logps/chosen": -41.567237854003906, "logps/rejected": -211.98605346679688, "loss": 0.5803, "nll_loss": 0.5195904970169067, "rewards/accuracies": 1.0, "rewards/chosen": 0.7117409110069275, "rewards/margins": 5.612216949462891, "rewards/rejected": -4.900475978851318, "step": 1318 }, { "epoch": 0.21983333333333333, "grad_norm": 106.2247543334961, "learning_rate": 1.8168751422382545e-07, "logits/chosen": 1.6392009258270264, "logits/rejected": 1.8983508348464966, "logps/chosen": -26.385936737060547, "logps/rejected": -80.61377716064453, "loss": 2.0437, "nll_loss": 1.8847097158432007, "rewards/accuracies": 1.0, "rewards/chosen": 0.29044076800346375, "rewards/margins": 2.6765077114105225, "rewards/rejected": -2.3860669136047363, "step": 1319 }, { "epoch": 0.22, "grad_norm": 124.83856964111328, "learning_rate": 1.8165636630572108e-07, "logits/chosen": 2.858497381210327, "logits/rejected": 2.6298415660858154, "logps/chosen": -72.3200912475586, "logps/rejected": -36.49530792236328, "loss": 1.3661, "nll_loss": 0.7090205550193787, "rewards/accuracies": 1.0, "rewards/chosen": 1.1449470520019531, "rewards/margins": 0.575182318687439, "rewards/rejected": 0.5697647333145142, "step": 1320 }, { "epoch": 0.22016666666666668, "grad_norm": 41.442325592041016, "learning_rate": 1.8162519459491198e-07, "logits/chosen": 2.312331199645996, "logits/rejected": 2.496847629547119, "logps/chosen": -26.982702255249023, "logps/rejected": -283.9732360839844, "loss": 0.6099, "nll_loss": 0.5091075897216797, "rewards/accuracies": 1.0, "rewards/chosen": 1.617224097251892, "rewards/margins": 3.3375582695007324, "rewards/rejected": -1.7203340530395508, "step": 1321 }, { "epoch": 0.22033333333333333, "grad_norm": 35.08711242675781, "learning_rate": 1.8159399910048082e-07, "logits/chosen": 2.3887102603912354, "logits/rejected": 2.5799367427825928, "logps/chosen": -106.2298583984375, "logps/rejected": -38.71721649169922, "loss": 1.428, "nll_loss": 1.3796085119247437, "rewards/accuracies": 1.0, "rewards/chosen": 1.379033088684082, "rewards/margins": 4.7037739753723145, "rewards/rejected": -3.3247408866882324, "step": 1322 }, { "epoch": 0.2205, "grad_norm": 52.438148498535156, "learning_rate": 1.8156277983151717e-07, "logits/chosen": 2.791900873184204, "logits/rejected": 2.958930253982544, "logps/chosen": -39.0998649597168, "logps/rejected": -415.0570068359375, "loss": 1.4141, "nll_loss": 1.396423578262329, "rewards/accuracies": 1.0, "rewards/chosen": 2.577698230743408, "rewards/margins": 6.389349937438965, "rewards/rejected": -3.8116517066955566, "step": 1323 }, { "epoch": 0.22066666666666668, "grad_norm": 60.514122009277344, "learning_rate": 1.815315367971176e-07, "logits/chosen": 2.5874059200286865, "logits/rejected": 2.711658000946045, "logps/chosen": -44.244789123535156, "logps/rejected": -404.98626708984375, "loss": 1.7218, "nll_loss": 1.7017226219177246, "rewards/accuracies": 1.0, "rewards/chosen": 1.825278878211975, "rewards/margins": 9.874938011169434, "rewards/rejected": -8.04965877532959, "step": 1324 }, { "epoch": 0.22083333333333333, "grad_norm": 52.339717864990234, "learning_rate": 1.8150027000638563e-07, "logits/chosen": 3.454038619995117, "logits/rejected": 3.929215908050537, "logps/chosen": -31.297470092773438, "logps/rejected": -70.93429565429688, "loss": 0.9531, "nll_loss": 0.7451777458190918, "rewards/accuracies": 1.0, "rewards/chosen": 0.7900554537773132, "rewards/margins": 2.139132499694824, "rewards/rejected": -1.3490769863128662, "step": 1325 }, { "epoch": 0.221, "grad_norm": 37.6610221862793, "learning_rate": 1.814689794684316e-07, "logits/chosen": 3.5616261959075928, "logits/rejected": 3.473214626312256, "logps/chosen": -81.53775787353516, "logps/rejected": -147.51483154296875, "loss": 1.3327, "nll_loss": 1.3151251077651978, "rewards/accuracies": 1.0, "rewards/chosen": 3.260915517807007, "rewards/margins": 6.519428253173828, "rewards/rejected": -3.2585129737854004, "step": 1326 }, { "epoch": 0.22116666666666668, "grad_norm": 181.3647003173828, "learning_rate": 1.8143766519237284e-07, "logits/chosen": 1.195797324180603, "logits/rejected": 3.115872621536255, "logps/chosen": -69.68472290039062, "logps/rejected": -476.2358703613281, "loss": 3.4176, "nll_loss": 2.787388801574707, "rewards/accuracies": 1.0, "rewards/chosen": -1.9405796527862549, "rewards/margins": 0.9071040153503418, "rewards/rejected": -2.8476836681365967, "step": 1327 }, { "epoch": 0.22133333333333333, "grad_norm": 36.362300872802734, "learning_rate": 1.814063271873336e-07, "logits/chosen": 2.856701612472534, "logits/rejected": 2.9211788177490234, "logps/chosen": -103.75633239746094, "logps/rejected": -84.96826934814453, "loss": 1.013, "nll_loss": 0.934741735458374, "rewards/accuracies": 1.0, "rewards/chosen": 1.1047958135604858, "rewards/margins": 3.750319480895996, "rewards/rejected": -2.6455237865448, "step": 1328 }, { "epoch": 0.2215, "grad_norm": 80.70616149902344, "learning_rate": 1.8137496546244497e-07, "logits/chosen": 3.231929063796997, "logits/rejected": 3.2347798347473145, "logps/chosen": -48.62877655029297, "logps/rejected": -90.26116180419922, "loss": 1.1078, "nll_loss": 0.8531363606452942, "rewards/accuracies": 1.0, "rewards/chosen": 0.03876380994915962, "rewards/margins": 1.8778620958328247, "rewards/rejected": -1.839098334312439, "step": 1329 }, { "epoch": 0.22166666666666668, "grad_norm": 30.828561782836914, "learning_rate": 1.81343580026845e-07, "logits/chosen": 1.19609534740448, "logits/rejected": 2.4896137714385986, "logps/chosen": -77.36653900146484, "logps/rejected": -306.2362060546875, "loss": 0.8492, "nll_loss": 0.7975930571556091, "rewards/accuracies": 1.0, "rewards/chosen": 0.87663733959198, "rewards/margins": 5.997774124145508, "rewards/rejected": -5.121136665344238, "step": 1330 }, { "epoch": 0.22183333333333333, "grad_norm": 47.939517974853516, "learning_rate": 1.8131217088967874e-07, "logits/chosen": 2.602783679962158, "logits/rejected": 2.9520227909088135, "logps/chosen": -45.55309295654297, "logps/rejected": -335.83819580078125, "loss": 1.7063, "nll_loss": 1.6871517896652222, "rewards/accuracies": 1.0, "rewards/chosen": 2.469733476638794, "rewards/margins": 6.24665641784668, "rewards/rejected": -3.7769227027893066, "step": 1331 }, { "epoch": 0.222, "grad_norm": 39.81605911254883, "learning_rate": 1.81280738060098e-07, "logits/chosen": 2.4963786602020264, "logits/rejected": 2.4339494705200195, "logps/chosen": -55.917938232421875, "logps/rejected": -44.946327209472656, "loss": 0.8427, "nll_loss": 0.7455726265907288, "rewards/accuracies": 1.0, "rewards/chosen": 1.163732886314392, "rewards/margins": 3.337498188018799, "rewards/rejected": -2.173765182495117, "step": 1332 }, { "epoch": 0.22216666666666668, "grad_norm": 31.163572311401367, "learning_rate": 1.812492815472615e-07, "logits/chosen": 3.2246508598327637, "logits/rejected": 3.3517374992370605, "logps/chosen": -86.84927368164062, "logps/rejected": -102.94163513183594, "loss": 0.9728, "nll_loss": 0.923928439617157, "rewards/accuracies": 1.0, "rewards/chosen": 2.0212249755859375, "rewards/margins": 4.524738311767578, "rewards/rejected": -2.5035133361816406, "step": 1333 }, { "epoch": 0.22233333333333333, "grad_norm": 57.444732666015625, "learning_rate": 1.81217801360335e-07, "logits/chosen": 3.2122302055358887, "logits/rejected": 3.404675245285034, "logps/chosen": -53.8698844909668, "logps/rejected": -110.88956451416016, "loss": 1.2555, "nll_loss": 1.1461677551269531, "rewards/accuracies": 1.0, "rewards/chosen": 0.17212258279323578, "rewards/margins": 3.944779634475708, "rewards/rejected": -3.7726571559906006, "step": 1334 }, { "epoch": 0.2225, "grad_norm": 26.20624351501465, "learning_rate": 1.8118629750849104e-07, "logits/chosen": 1.653908371925354, "logits/rejected": 1.6603937149047852, "logps/chosen": -38.42250442504883, "logps/rejected": -97.30817413330078, "loss": 0.5473, "nll_loss": 0.49259620904922485, "rewards/accuracies": 1.0, "rewards/chosen": 1.076076865196228, "rewards/margins": 4.737131118774414, "rewards/rejected": -3.6610543727874756, "step": 1335 }, { "epoch": 0.22266666666666668, "grad_norm": 43.95100021362305, "learning_rate": 1.8115477000090907e-07, "logits/chosen": 2.0220563411712646, "logits/rejected": 2.171217441558838, "logps/chosen": -57.184814453125, "logps/rejected": -123.58346557617188, "loss": 0.9615, "nll_loss": 0.8409532904624939, "rewards/accuracies": 1.0, "rewards/chosen": 1.5466476678848267, "rewards/margins": 3.0661253929138184, "rewards/rejected": -1.5194778442382812, "step": 1336 }, { "epoch": 0.22283333333333333, "grad_norm": 41.87136459350586, "learning_rate": 1.8112321884677547e-07, "logits/chosen": 3.035370349884033, "logits/rejected": 3.300283670425415, "logps/chosen": -59.38621520996094, "logps/rejected": -335.21881103515625, "loss": 1.1887, "nll_loss": 1.164435625076294, "rewards/accuracies": 1.0, "rewards/chosen": 1.6520538330078125, "rewards/margins": 7.852914810180664, "rewards/rejected": -6.200860977172852, "step": 1337 }, { "epoch": 0.223, "grad_norm": 21.394542694091797, "learning_rate": 1.8109164405528348e-07, "logits/chosen": 2.661688804626465, "logits/rejected": 3.0083940029144287, "logps/chosen": -58.63296890258789, "logps/rejected": -319.1470947265625, "loss": 0.6581, "nll_loss": 0.6373148560523987, "rewards/accuracies": 1.0, "rewards/chosen": 1.8622006177902222, "rewards/margins": 7.418194770812988, "rewards/rejected": -5.555994033813477, "step": 1338 }, { "epoch": 0.22316666666666668, "grad_norm": 26.01576042175293, "learning_rate": 1.8106004563563323e-07, "logits/chosen": 2.442147970199585, "logits/rejected": 2.5482053756713867, "logps/chosen": -68.27732849121094, "logps/rejected": -104.65210723876953, "loss": 0.6923, "nll_loss": 0.6441256403923035, "rewards/accuracies": 1.0, "rewards/chosen": 1.5297410488128662, "rewards/margins": 4.593658447265625, "rewards/rejected": -3.063917636871338, "step": 1339 }, { "epoch": 0.22333333333333333, "grad_norm": 34.84745407104492, "learning_rate": 1.8102842359703175e-07, "logits/chosen": 2.0595200061798096, "logits/rejected": 2.180896043777466, "logps/chosen": -37.391937255859375, "logps/rejected": -110.64189910888672, "loss": 0.6887, "nll_loss": 0.6129826307296753, "rewards/accuracies": 1.0, "rewards/chosen": 0.912148654460907, "rewards/margins": 3.9492452144622803, "rewards/rejected": -3.0370965003967285, "step": 1340 }, { "epoch": 0.2235, "grad_norm": 32.9025764465332, "learning_rate": 1.8099677794869294e-07, "logits/chosen": 3.76883864402771, "logits/rejected": 3.6518170833587646, "logps/chosen": -124.81076049804688, "logps/rejected": -124.08268737792969, "loss": 0.8427, "nll_loss": 0.7949730157852173, "rewards/accuracies": 1.0, "rewards/chosen": 1.0116500854492188, "rewards/margins": 5.679390907287598, "rewards/rejected": -4.667740821838379, "step": 1341 }, { "epoch": 0.22366666666666668, "grad_norm": 4.715241432189941, "learning_rate": 1.809651086998376e-07, "logits/chosen": 3.206164598464966, "logits/rejected": -0.4921785593032837, "logps/chosen": -68.31656646728516, "logps/rejected": -186.008056640625, "loss": 0.0841, "nll_loss": 0.07763247936964035, "rewards/accuracies": 1.0, "rewards/chosen": 3.07806396484375, "rewards/margins": 9.427055358886719, "rewards/rejected": -6.348991394042969, "step": 1342 }, { "epoch": 0.22383333333333333, "grad_norm": 24.39424705505371, "learning_rate": 1.8093341585969336e-07, "logits/chosen": 1.9208322763442993, "logits/rejected": 2.676901340484619, "logps/chosen": -61.82447052001953, "logps/rejected": -121.93608093261719, "loss": 0.6477, "nll_loss": 0.6121235489845276, "rewards/accuracies": 1.0, "rewards/chosen": 1.583937168121338, "rewards/margins": 5.36244010925293, "rewards/rejected": -3.7785027027130127, "step": 1343 }, { "epoch": 0.224, "grad_norm": 39.293663024902344, "learning_rate": 1.8090169943749475e-07, "logits/chosen": 2.855050563812256, "logits/rejected": 2.815809726715088, "logps/chosen": -72.29576110839844, "logps/rejected": -62.88804244995117, "loss": 1.0458, "nll_loss": 0.9151362180709839, "rewards/accuracies": 1.0, "rewards/chosen": 1.6508735418319702, "rewards/margins": 2.992124080657959, "rewards/rejected": -1.3412504196166992, "step": 1344 }, { "epoch": 0.22416666666666665, "grad_norm": 80.66913604736328, "learning_rate": 1.8086995944248317e-07, "logits/chosen": 1.7516582012176514, "logits/rejected": 1.6087543964385986, "logps/chosen": -36.439632415771484, "logps/rejected": -32.03071594238281, "loss": 1.819, "nll_loss": 1.6563467979431152, "rewards/accuracies": 1.0, "rewards/chosen": 0.7470332980155945, "rewards/margins": 2.5134072303771973, "rewards/rejected": -1.7663739919662476, "step": 1345 }, { "epoch": 0.22433333333333333, "grad_norm": 32.7989501953125, "learning_rate": 1.8083819588390695e-07, "logits/chosen": 1.6811784505844116, "logits/rejected": 1.6786483526229858, "logps/chosen": -101.38996887207031, "logps/rejected": -95.41743469238281, "loss": 0.894, "nll_loss": 0.8243086934089661, "rewards/accuracies": 1.0, "rewards/chosen": 1.5252022743225098, "rewards/margins": 3.888665199279785, "rewards/rejected": -2.3634629249572754, "step": 1346 }, { "epoch": 0.2245, "grad_norm": 22.866724014282227, "learning_rate": 1.8080640877102117e-07, "logits/chosen": 2.9584147930145264, "logits/rejected": 2.844911813735962, "logps/chosen": -181.98684692382812, "logps/rejected": -202.31759643554688, "loss": 0.8323, "nll_loss": 0.8052515983581543, "rewards/accuracies": 1.0, "rewards/chosen": 1.5276062488555908, "rewards/margins": 7.844149589538574, "rewards/rejected": -6.3165435791015625, "step": 1347 }, { "epoch": 0.22466666666666665, "grad_norm": 96.67202758789062, "learning_rate": 1.8077459811308785e-07, "logits/chosen": 2.670785665512085, "logits/rejected": 2.7941536903381348, "logps/chosen": -75.0459213256836, "logps/rejected": -123.09188079833984, "loss": 2.817, "nll_loss": 2.6802115440368652, "rewards/accuracies": 1.0, "rewards/chosen": 0.37836000323295593, "rewards/margins": 2.9418106079101562, "rewards/rejected": -2.563450574874878, "step": 1348 }, { "epoch": 0.22483333333333333, "grad_norm": 35.921531677246094, "learning_rate": 1.8074276391937587e-07, "logits/chosen": 0.8137407302856445, "logits/rejected": 2.0847432613372803, "logps/chosen": -41.34952926635742, "logps/rejected": -251.56008911132812, "loss": 0.7966, "nll_loss": 0.7254303097724915, "rewards/accuracies": 1.0, "rewards/chosen": 0.5164081454277039, "rewards/margins": 5.340179920196533, "rewards/rejected": -4.823771953582764, "step": 1349 }, { "epoch": 0.225, "grad_norm": 73.10659790039062, "learning_rate": 1.807109061991609e-07, "logits/chosen": 3.127490997314453, "logits/rejected": 3.232419013977051, "logps/chosen": -29.986392974853516, "logps/rejected": -122.7371826171875, "loss": 1.8388, "nll_loss": 1.7639052867889404, "rewards/accuracies": 1.0, "rewards/chosen": 0.706607460975647, "rewards/margins": 4.260854244232178, "rewards/rejected": -3.554246664047241, "step": 1350 }, { "epoch": 0.22516666666666665, "grad_norm": 37.387115478515625, "learning_rate": 1.806790249617256e-07, "logits/chosen": 3.1033060550689697, "logits/rejected": 3.1539924144744873, "logps/chosen": -66.1983642578125, "logps/rejected": -184.91302490234375, "loss": 0.8281, "nll_loss": 0.752254068851471, "rewards/accuracies": 1.0, "rewards/chosen": 0.9038421511650085, "rewards/margins": 3.947262763977051, "rewards/rejected": -3.0434205532073975, "step": 1351 }, { "epoch": 0.22533333333333333, "grad_norm": 196.91925048828125, "learning_rate": 1.806471202163593e-07, "logits/chosen": 2.795102119445801, "logits/rejected": 2.877901315689087, "logps/chosen": -65.89222717285156, "logps/rejected": -59.80146026611328, "loss": 0.8191, "nll_loss": 0.6863773465156555, "rewards/accuracies": 1.0, "rewards/chosen": 1.5502939224243164, "rewards/margins": 2.935868263244629, "rewards/rejected": -1.3855743408203125, "step": 1352 }, { "epoch": 0.2255, "grad_norm": 43.00166702270508, "learning_rate": 1.8061519197235834e-07, "logits/chosen": 2.753525972366333, "logits/rejected": 2.6130404472351074, "logps/chosen": -20.58588981628418, "logps/rejected": -32.93891906738281, "loss": 0.6906, "nll_loss": 0.3742889165878296, "rewards/accuracies": 1.0, "rewards/chosen": 2.190192699432373, "rewards/margins": 2.048849105834961, "rewards/rejected": 0.14134369790554047, "step": 1353 }, { "epoch": 0.22566666666666665, "grad_norm": 101.0867691040039, "learning_rate": 1.805832402390258e-07, "logits/chosen": 2.9077529907226562, "logits/rejected": 2.973029851913452, "logps/chosen": -54.276939392089844, "logps/rejected": -41.55474090576172, "loss": 2.0027, "nll_loss": 1.5076929330825806, "rewards/accuracies": 1.0, "rewards/chosen": -0.9228695034980774, "rewards/margins": 0.956882894039154, "rewards/rejected": -1.8797523975372314, "step": 1354 }, { "epoch": 0.22583333333333333, "grad_norm": 32.71537780761719, "learning_rate": 1.805512650256717e-07, "logits/chosen": 2.868079900741577, "logits/rejected": 2.8162190914154053, "logps/chosen": -150.63751220703125, "logps/rejected": -30.063274383544922, "loss": 0.8257, "nll_loss": 0.717321515083313, "rewards/accuracies": 1.0, "rewards/chosen": 1.9406251907348633, "rewards/margins": 3.365687608718872, "rewards/rejected": -1.4250624179840088, "step": 1355 }, { "epoch": 0.226, "grad_norm": 38.97807693481445, "learning_rate": 1.805192663416128e-07, "logits/chosen": 2.3377935886383057, "logits/rejected": 2.4027483463287354, "logps/chosen": -46.463470458984375, "logps/rejected": -129.51580810546875, "loss": 0.7512, "nll_loss": 0.683286190032959, "rewards/accuracies": 1.0, "rewards/chosen": 1.1524628400802612, "rewards/margins": 4.026690483093262, "rewards/rejected": -2.874227523803711, "step": 1356 }, { "epoch": 0.22616666666666665, "grad_norm": 21.268898010253906, "learning_rate": 1.804872441961728e-07, "logits/chosen": 2.9202516078948975, "logits/rejected": 3.0250871181488037, "logps/chosen": -44.43111038208008, "logps/rejected": -206.52593994140625, "loss": 0.6151, "nll_loss": 0.6004204750061035, "rewards/accuracies": 1.0, "rewards/chosen": 2.170199155807495, "rewards/margins": 8.762188911437988, "rewards/rejected": -6.591989517211914, "step": 1357 }, { "epoch": 0.22633333333333333, "grad_norm": 109.42243194580078, "learning_rate": 1.8045519859868213e-07, "logits/chosen": 2.9261136054992676, "logits/rejected": 2.8603687286376953, "logps/chosen": -37.27280044555664, "logps/rejected": -18.552715301513672, "loss": 2.5314, "nll_loss": 2.1925177574157715, "rewards/accuracies": 1.0, "rewards/chosen": 0.05376396328210831, "rewards/margins": 1.4025492668151855, "rewards/rejected": -1.3487852811813354, "step": 1358 }, { "epoch": 0.2265, "grad_norm": 28.458053588867188, "learning_rate": 1.8042312955847817e-07, "logits/chosen": 2.7821407318115234, "logits/rejected": 3.115729570388794, "logps/chosen": -13.288753509521484, "logps/rejected": -106.09649658203125, "loss": 0.4266, "nll_loss": 0.3908456861972809, "rewards/accuracies": 1.0, "rewards/chosen": 1.644700527191162, "rewards/margins": 5.254894256591797, "rewards/rejected": -3.6101937294006348, "step": 1359 }, { "epoch": 0.22666666666666666, "grad_norm": 31.112619400024414, "learning_rate": 1.80391037084905e-07, "logits/chosen": 1.781049370765686, "logits/rejected": 2.1320488452911377, "logps/chosen": -84.04722595214844, "logps/rejected": -186.84719848632812, "loss": 0.9291, "nll_loss": 0.8754920363426208, "rewards/accuracies": 1.0, "rewards/chosen": 1.4839057922363281, "rewards/margins": 4.379636764526367, "rewards/rejected": -2.89573073387146, "step": 1360 }, { "epoch": 0.22683333333333333, "grad_norm": 32.868255615234375, "learning_rate": 1.8035892118731365e-07, "logits/chosen": 2.7424476146698, "logits/rejected": 2.074902057647705, "logps/chosen": -97.974609375, "logps/rejected": -28.797645568847656, "loss": 1.059, "nll_loss": 0.96053546667099, "rewards/accuracies": 1.0, "rewards/chosen": 1.5012298822402954, "rewards/margins": 3.346039295196533, "rewards/rejected": -1.8448095321655273, "step": 1361 }, { "epoch": 0.227, "grad_norm": 40.549293518066406, "learning_rate": 1.8032678187506185e-07, "logits/chosen": 3.048665761947632, "logits/rejected": 3.103107213973999, "logps/chosen": -35.59517288208008, "logps/rejected": -169.27540588378906, "loss": 1.1103, "nll_loss": 1.0786415338516235, "rewards/accuracies": 1.0, "rewards/chosen": 1.8063992261886597, "rewards/margins": 5.42929220199585, "rewards/rejected": -3.6228928565979004, "step": 1362 }, { "epoch": 0.22716666666666666, "grad_norm": 88.81818389892578, "learning_rate": 1.802946191575143e-07, "logits/chosen": 2.2377493381500244, "logits/rejected": 2.518778085708618, "logps/chosen": -75.52156066894531, "logps/rejected": -187.33200073242188, "loss": 2.7099, "nll_loss": 2.604192018508911, "rewards/accuracies": 1.0, "rewards/chosen": -0.0743865966796875, "rewards/margins": 7.0318121910095215, "rewards/rejected": -7.106198787689209, "step": 1363 }, { "epoch": 0.22733333333333333, "grad_norm": 29.090116500854492, "learning_rate": 1.8026243304404242e-07, "logits/chosen": 3.115963935852051, "logits/rejected": 3.2066574096679688, "logps/chosen": -62.433319091796875, "logps/rejected": -183.85696411132812, "loss": 0.7107, "nll_loss": 0.6436424851417542, "rewards/accuracies": 1.0, "rewards/chosen": 1.5678246021270752, "rewards/margins": 3.954653263092041, "rewards/rejected": -2.386828660964966, "step": 1364 }, { "epoch": 0.2275, "grad_norm": 48.593910217285156, "learning_rate": 1.802302235440245e-07, "logits/chosen": 1.918513298034668, "logits/rejected": 1.8164997100830078, "logps/chosen": -52.431339263916016, "logps/rejected": -83.81051635742188, "loss": 0.8263, "nll_loss": 0.7710491418838501, "rewards/accuracies": 1.0, "rewards/chosen": 0.9508907198905945, "rewards/margins": 4.962327003479004, "rewards/rejected": -4.011436462402344, "step": 1365 }, { "epoch": 0.22766666666666666, "grad_norm": 35.520015716552734, "learning_rate": 1.8019799066684553e-07, "logits/chosen": 2.3466145992279053, "logits/rejected": 2.692188024520874, "logps/chosen": -54.638885498046875, "logps/rejected": -94.3235092163086, "loss": 0.882, "nll_loss": 0.8035129904747009, "rewards/accuracies": 1.0, "rewards/chosen": 1.1852692365646362, "rewards/margins": 3.7134718894958496, "rewards/rejected": -2.528202772140503, "step": 1366 }, { "epoch": 0.22783333333333333, "grad_norm": 36.27534484863281, "learning_rate": 1.8016573442189746e-07, "logits/chosen": 2.877394676208496, "logits/rejected": 3.0601918697357178, "logps/chosen": -78.41621398925781, "logps/rejected": -247.47467041015625, "loss": 1.0907, "nll_loss": 1.0455493927001953, "rewards/accuracies": 1.0, "rewards/chosen": 1.0225327014923096, "rewards/margins": 6.06993293762207, "rewards/rejected": -5.047399997711182, "step": 1367 }, { "epoch": 0.228, "grad_norm": 51.58590316772461, "learning_rate": 1.80133454818579e-07, "logits/chosen": 3.339524030685425, "logits/rejected": 3.149364471435547, "logps/chosen": -112.41349029541016, "logps/rejected": -43.92471694946289, "loss": 1.1013, "nll_loss": 1.0408657789230347, "rewards/accuracies": 1.0, "rewards/chosen": 1.574042558670044, "rewards/margins": 4.13321590423584, "rewards/rejected": -2.559173345565796, "step": 1368 }, { "epoch": 0.22816666666666666, "grad_norm": 39.87889099121094, "learning_rate": 1.8010115186629559e-07, "logits/chosen": 2.5656166076660156, "logits/rejected": 3.037818193435669, "logps/chosen": -55.28086471557617, "logps/rejected": -197.1737060546875, "loss": 0.7241, "nll_loss": 0.6074819564819336, "rewards/accuracies": 1.0, "rewards/chosen": 1.7756092548370361, "rewards/margins": 3.1986865997314453, "rewards/rejected": -1.4230774641036987, "step": 1369 }, { "epoch": 0.22833333333333333, "grad_norm": 24.63811683654785, "learning_rate": 1.8006882557445962e-07, "logits/chosen": 2.1645267009735107, "logits/rejected": 2.3743679523468018, "logps/chosen": -90.68209075927734, "logps/rejected": -399.0162353515625, "loss": 1.0064, "nll_loss": 0.99650639295578, "rewards/accuracies": 1.0, "rewards/chosen": 2.559957265853882, "rewards/margins": 9.8631010055542, "rewards/rejected": -7.303143501281738, "step": 1370 }, { "epoch": 0.2285, "grad_norm": 36.89509963989258, "learning_rate": 1.8003647595249012e-07, "logits/chosen": 2.9367945194244385, "logits/rejected": 2.949923276901245, "logps/chosen": -8.867412567138672, "logps/rejected": -57.250160217285156, "loss": 0.4688, "nll_loss": 0.42225784063339233, "rewards/accuracies": 1.0, "rewards/chosen": 1.2496309280395508, "rewards/margins": 4.97265625, "rewards/rejected": -3.72302508354187, "step": 1371 }, { "epoch": 0.22866666666666666, "grad_norm": 16.318702697753906, "learning_rate": 1.8000410300981301e-07, "logits/chosen": 1.8884700536727905, "logits/rejected": 1.6161178350448608, "logps/chosen": -158.67105102539062, "logps/rejected": -101.2762680053711, "loss": 0.6124, "nll_loss": 0.5898551940917969, "rewards/accuracies": 1.0, "rewards/chosen": 3.1542787551879883, "rewards/margins": 6.1271586418151855, "rewards/rejected": -2.9728798866271973, "step": 1372 }, { "epoch": 0.22883333333333333, "grad_norm": 190.069580078125, "learning_rate": 1.79971706755861e-07, "logits/chosen": 1.7663625478744507, "logits/rejected": 2.1773455142974854, "logps/chosen": -54.79693603515625, "logps/rejected": -154.7896728515625, "loss": 5.4838, "nll_loss": 4.981538772583008, "rewards/accuracies": 1.0, "rewards/chosen": -2.472811698913574, "rewards/margins": 2.3778324127197266, "rewards/rejected": -4.850644111633301, "step": 1373 }, { "epoch": 0.229, "grad_norm": 44.230804443359375, "learning_rate": 1.799392872000736e-07, "logits/chosen": 1.3502092361450195, "logits/rejected": 2.0569489002227783, "logps/chosen": -63.57249069213867, "logps/rejected": -237.844970703125, "loss": 1.4687, "nll_loss": 1.4448292255401611, "rewards/accuracies": 1.0, "rewards/chosen": 1.7771236896514893, "rewards/margins": 6.628606796264648, "rewards/rejected": -4.851483345031738, "step": 1374 }, { "epoch": 0.22916666666666666, "grad_norm": 43.30186080932617, "learning_rate": 1.7990684435189705e-07, "logits/chosen": 2.204467296600342, "logits/rejected": 2.4303500652313232, "logps/chosen": -54.33638381958008, "logps/rejected": -393.77545166015625, "loss": 1.2139, "nll_loss": 1.207475185394287, "rewards/accuracies": 1.0, "rewards/chosen": 3.0010571479797363, "rewards/margins": 10.654529571533203, "rewards/rejected": -7.653472900390625, "step": 1375 }, { "epoch": 0.22933333333333333, "grad_norm": 28.42243766784668, "learning_rate": 1.798743782207844e-07, "logits/chosen": 3.086956024169922, "logits/rejected": 3.1554813385009766, "logps/chosen": -18.79192543029785, "logps/rejected": -158.01593017578125, "loss": 0.493, "nll_loss": 0.4697982370853424, "rewards/accuracies": 1.0, "rewards/chosen": 1.8481330871582031, "rewards/margins": 6.5043463706970215, "rewards/rejected": -4.656213283538818, "step": 1376 }, { "epoch": 0.2295, "grad_norm": 39.39359664916992, "learning_rate": 1.7984188881619562e-07, "logits/chosen": 0.8734198808670044, "logits/rejected": 3.009009599685669, "logps/chosen": -58.423927307128906, "logps/rejected": -72.58076477050781, "loss": 1.0146, "nll_loss": 0.9423213601112366, "rewards/accuracies": 1.0, "rewards/chosen": 0.6334426999092102, "rewards/margins": 4.567380428314209, "rewards/rejected": -3.9339377880096436, "step": 1377 }, { "epoch": 0.22966666666666666, "grad_norm": 30.638917922973633, "learning_rate": 1.7980937614759722e-07, "logits/chosen": 2.3437674045562744, "logits/rejected": 2.2349910736083984, "logps/chosen": -87.33582305908203, "logps/rejected": -77.60667419433594, "loss": 0.99, "nll_loss": 0.959734320640564, "rewards/accuracies": 1.0, "rewards/chosen": 1.9253610372543335, "rewards/margins": 5.4466962814331055, "rewards/rejected": -3.5213353633880615, "step": 1378 }, { "epoch": 0.22983333333333333, "grad_norm": 46.742828369140625, "learning_rate": 1.7977684022446266e-07, "logits/chosen": 0.7745881676673889, "logits/rejected": 3.0949268341064453, "logps/chosen": -20.92621612548828, "logps/rejected": -398.9416809082031, "loss": 0.9078, "nll_loss": 0.8370486497879028, "rewards/accuracies": 1.0, "rewards/chosen": 0.8145235776901245, "rewards/margins": 4.247729778289795, "rewards/rejected": -3.43320631980896, "step": 1379 }, { "epoch": 0.23, "grad_norm": 33.23981475830078, "learning_rate": 1.7974428105627206e-07, "logits/chosen": 1.4453943967819214, "logits/rejected": 2.6215109825134277, "logps/chosen": -32.29267883300781, "logps/rejected": -332.9227294921875, "loss": 0.5984, "nll_loss": 0.5382114052772522, "rewards/accuracies": 1.0, "rewards/chosen": 0.6288661956787109, "rewards/margins": 6.472665309906006, "rewards/rejected": -5.843799114227295, "step": 1380 }, { "epoch": 0.23016666666666666, "grad_norm": 41.24601745605469, "learning_rate": 1.797116986525125e-07, "logits/chosen": 3.6999223232269287, "logits/rejected": 3.8635990619659424, "logps/chosen": -57.6954345703125, "logps/rejected": -44.13277053833008, "loss": 1.2298, "nll_loss": 1.2019881010055542, "rewards/accuracies": 1.0, "rewards/chosen": 2.324390411376953, "rewards/margins": 5.515135288238525, "rewards/rejected": -3.1907448768615723, "step": 1381 }, { "epoch": 0.23033333333333333, "grad_norm": 23.813583374023438, "learning_rate": 1.7967909302267764e-07, "logits/chosen": 1.1363327503204346, "logits/rejected": 2.3312973976135254, "logps/chosen": -80.42730712890625, "logps/rejected": -281.1141357421875, "loss": 0.8906, "nll_loss": 0.8742098808288574, "rewards/accuracies": 1.0, "rewards/chosen": 2.1154191493988037, "rewards/margins": 7.599327087402344, "rewards/rejected": -5.483908176422119, "step": 1382 }, { "epoch": 0.2305, "grad_norm": 30.955720901489258, "learning_rate": 1.7964646417626796e-07, "logits/chosen": 2.3148996829986572, "logits/rejected": 2.5582878589630127, "logps/chosen": -41.83149337768555, "logps/rejected": -109.64997100830078, "loss": 0.7652, "nll_loss": 0.7212326526641846, "rewards/accuracies": 1.0, "rewards/chosen": 2.540750503540039, "rewards/margins": 4.886608600616455, "rewards/rejected": -2.345858097076416, "step": 1383 }, { "epoch": 0.23066666666666666, "grad_norm": 132.80421447753906, "learning_rate": 1.7961381212279076e-07, "logits/chosen": 2.958348512649536, "logits/rejected": 2.8858275413513184, "logps/chosen": -29.517236709594727, "logps/rejected": -72.45428466796875, "loss": 1.2545, "nll_loss": 0.44723081588745117, "rewards/accuracies": 1.0, "rewards/chosen": 0.9367033243179321, "rewards/margins": 0.2145940065383911, "rewards/rejected": 0.722109317779541, "step": 1384 }, { "epoch": 0.23083333333333333, "grad_norm": 28.660938262939453, "learning_rate": 1.7958113687176005e-07, "logits/chosen": 2.3460147380828857, "logits/rejected": 2.285533905029297, "logps/chosen": -64.68954467773438, "logps/rejected": -105.90827941894531, "loss": 0.9298, "nll_loss": 0.8984658718109131, "rewards/accuracies": 1.0, "rewards/chosen": 1.925035834312439, "rewards/margins": 5.361809730529785, "rewards/rejected": -3.4367737770080566, "step": 1385 }, { "epoch": 0.231, "grad_norm": 58.663543701171875, "learning_rate": 1.7954843843269663e-07, "logits/chosen": 2.6640589237213135, "logits/rejected": 2.8023996353149414, "logps/chosen": -44.77002716064453, "logps/rejected": -102.5877685546875, "loss": 1.0693, "nll_loss": 0.9327089190483093, "rewards/accuracies": 1.0, "rewards/chosen": -0.1578422635793686, "rewards/margins": 3.709453582763672, "rewards/rejected": -3.867295742034912, "step": 1386 }, { "epoch": 0.23116666666666666, "grad_norm": 28.49257469177246, "learning_rate": 1.79515716815128e-07, "logits/chosen": 4.29617977142334, "logits/rejected": 4.3677778244018555, "logps/chosen": -29.440570831298828, "logps/rejected": -141.17678833007812, "loss": 0.5607, "nll_loss": 0.5352830290794373, "rewards/accuracies": 1.0, "rewards/chosen": 1.829321265220642, "rewards/margins": 6.109597206115723, "rewards/rejected": -4.280275821685791, "step": 1387 }, { "epoch": 0.23133333333333334, "grad_norm": 28.242721557617188, "learning_rate": 1.794829720285885e-07, "logits/chosen": 2.6921417713165283, "logits/rejected": 2.802511215209961, "logps/chosen": -248.4964141845703, "logps/rejected": -357.7795715332031, "loss": 1.0922, "nll_loss": 1.0711051225662231, "rewards/accuracies": 1.0, "rewards/chosen": 1.74095618724823, "rewards/margins": 9.798484802246094, "rewards/rejected": -8.057528495788574, "step": 1388 }, { "epoch": 0.2315, "grad_norm": 35.968204498291016, "learning_rate": 1.7945020408261915e-07, "logits/chosen": 2.8009262084960938, "logits/rejected": 2.8019227981567383, "logps/chosen": -8.343162536621094, "logps/rejected": -58.3889045715332, "loss": 0.4403, "nll_loss": 0.39729344844818115, "rewards/accuracies": 1.0, "rewards/chosen": 1.302056074142456, "rewards/margins": 5.138955116271973, "rewards/rejected": -3.8368992805480957, "step": 1389 }, { "epoch": 0.23166666666666666, "grad_norm": 29.76601219177246, "learning_rate": 1.7941741298676774e-07, "logits/chosen": 2.699261426925659, "logits/rejected": 2.8940744400024414, "logps/chosen": -87.59136962890625, "logps/rejected": -322.722900390625, "loss": 0.9948, "nll_loss": 0.9732376933097839, "rewards/accuracies": 1.0, "rewards/chosen": 1.8072128295898438, "rewards/margins": 7.236064910888672, "rewards/rejected": -5.428852081298828, "step": 1390 }, { "epoch": 0.23183333333333334, "grad_norm": 34.87151336669922, "learning_rate": 1.7938459875058884e-07, "logits/chosen": 1.708386778831482, "logits/rejected": 2.363804578781128, "logps/chosen": -44.35676193237305, "logps/rejected": -149.584228515625, "loss": 0.8467, "nll_loss": 0.7920851707458496, "rewards/accuracies": 1.0, "rewards/chosen": 1.1033748388290405, "rewards/margins": 4.641117572784424, "rewards/rejected": -3.537742853164673, "step": 1391 }, { "epoch": 0.232, "grad_norm": 131.9807586669922, "learning_rate": 1.7935176138364368e-07, "logits/chosen": 2.4880826473236084, "logits/rejected": 2.27901554107666, "logps/chosen": -72.95697021484375, "logps/rejected": -19.583969116210938, "loss": 1.545, "nll_loss": 0.8685354590415955, "rewards/accuracies": 1.0, "rewards/chosen": 1.7638969421386719, "rewards/margins": 0.7264434099197388, "rewards/rejected": 1.037453532218933, "step": 1392 }, { "epoch": 0.23216666666666666, "grad_norm": 121.50273895263672, "learning_rate": 1.7931890089550033e-07, "logits/chosen": 2.0956175327301025, "logits/rejected": 2.6781952381134033, "logps/chosen": -40.712669372558594, "logps/rejected": -111.18665313720703, "loss": 1.4725, "nll_loss": 1.1003422737121582, "rewards/accuracies": 1.0, "rewards/chosen": 0.7232219576835632, "rewards/margins": 1.3056461811065674, "rewards/rejected": -0.5824241638183594, "step": 1393 }, { "epoch": 0.23233333333333334, "grad_norm": 85.02567291259766, "learning_rate": 1.792860172957335e-07, "logits/chosen": 1.708556056022644, "logits/rejected": 2.6962480545043945, "logps/chosen": -74.52384948730469, "logps/rejected": -169.86988830566406, "loss": 2.7312, "nll_loss": 2.6615660190582275, "rewards/accuracies": 1.0, "rewards/chosen": 0.5239166617393494, "rewards/margins": 5.350102424621582, "rewards/rejected": -4.826185703277588, "step": 1394 }, { "epoch": 0.2325, "grad_norm": 35.483734130859375, "learning_rate": 1.792531105939247e-07, "logits/chosen": 2.078310966491699, "logits/rejected": 2.1644229888916016, "logps/chosen": -23.07575035095215, "logps/rejected": -58.04488754272461, "loss": 0.6724, "nll_loss": 0.5916858315467834, "rewards/accuracies": 1.0, "rewards/chosen": 1.3986330032348633, "rewards/margins": 3.6419894695281982, "rewards/rejected": -2.243356466293335, "step": 1395 }, { "epoch": 0.23266666666666666, "grad_norm": 67.12770080566406, "learning_rate": 1.7922018079966218e-07, "logits/chosen": 2.8667421340942383, "logits/rejected": 2.813857078552246, "logps/chosen": -63.59005355834961, "logps/rejected": -89.7005386352539, "loss": 1.1221, "nll_loss": 0.8152570724487305, "rewards/accuracies": 1.0, "rewards/chosen": 1.2874317169189453, "rewards/margins": 1.7140270471572876, "rewards/rejected": -0.4265953302383423, "step": 1396 }, { "epoch": 0.23283333333333334, "grad_norm": 34.84318542480469, "learning_rate": 1.7918722792254086e-07, "logits/chosen": 1.0382579565048218, "logits/rejected": 1.85264253616333, "logps/chosen": -96.45600128173828, "logps/rejected": -210.06997680664062, "loss": 1.1066, "nll_loss": 1.0371614694595337, "rewards/accuracies": 1.0, "rewards/chosen": 0.5522972345352173, "rewards/margins": 5.172057151794434, "rewards/rejected": -4.619760036468506, "step": 1397 }, { "epoch": 0.233, "grad_norm": 34.65964889526367, "learning_rate": 1.7915425197216243e-07, "logits/chosen": 0.9597716331481934, "logits/rejected": 2.7104594707489014, "logps/chosen": -75.14119720458984, "logps/rejected": -364.4417724609375, "loss": 1.2322, "nll_loss": 1.1740812063217163, "rewards/accuracies": 1.0, "rewards/chosen": 1.0172569751739502, "rewards/margins": 4.568251609802246, "rewards/rejected": -3.550994873046875, "step": 1398 }, { "epoch": 0.23316666666666666, "grad_norm": 28.628339767456055, "learning_rate": 1.7912125295813526e-07, "logits/chosen": 2.2629497051239014, "logits/rejected": 2.0960347652435303, "logps/chosen": -116.7128677368164, "logps/rejected": -86.01470947265625, "loss": 1.051, "nll_loss": 0.9890920519828796, "rewards/accuracies": 1.0, "rewards/chosen": 1.4198784828186035, "rewards/margins": 4.107372760772705, "rewards/rejected": -2.6874942779541016, "step": 1399 }, { "epoch": 0.23333333333333334, "grad_norm": 51.076107025146484, "learning_rate": 1.7908823089007456e-07, "logits/chosen": 2.469482421875, "logits/rejected": 2.364349365234375, "logps/chosen": -38.252960205078125, "logps/rejected": -33.00440216064453, "loss": 1.1936, "nll_loss": 1.0929417610168457, "rewards/accuracies": 1.0, "rewards/chosen": 0.8153733015060425, "rewards/margins": 3.3456788063049316, "rewards/rejected": -2.5303056240081787, "step": 1400 }, { "epoch": 0.2335, "grad_norm": 43.814453125, "learning_rate": 1.7905518577760205e-07, "logits/chosen": 2.2509384155273438, "logits/rejected": 2.1044325828552246, "logps/chosen": -23.496294021606445, "logps/rejected": -59.47146987915039, "loss": 0.5967, "nll_loss": 0.534006655216217, "rewards/accuracies": 1.0, "rewards/chosen": 1.420204758644104, "rewards/margins": 4.083168029785156, "rewards/rejected": -2.662963390350342, "step": 1401 }, { "epoch": 0.23366666666666666, "grad_norm": 127.576904296875, "learning_rate": 1.7902211763034636e-07, "logits/chosen": 2.893948554992676, "logits/rejected": 2.9817514419555664, "logps/chosen": -120.29295349121094, "logps/rejected": -158.92868041992188, "loss": 2.4321, "nll_loss": 1.8795772790908813, "rewards/accuracies": 1.0, "rewards/chosen": -3.0362353324890137, "rewards/margins": 3.500122547149658, "rewards/rejected": -6.536357879638672, "step": 1402 }, { "epoch": 0.23383333333333334, "grad_norm": 103.62757110595703, "learning_rate": 1.7898902645794273e-07, "logits/chosen": 2.897465944290161, "logits/rejected": 2.8920390605926514, "logps/chosen": -42.70643615722656, "logps/rejected": -21.412025451660156, "loss": 2.0286, "nll_loss": 1.6425551176071167, "rewards/accuracies": 1.0, "rewards/chosen": 0.1860649138689041, "rewards/margins": 1.1990526914596558, "rewards/rejected": -1.012987732887268, "step": 1403 }, { "epoch": 0.234, "grad_norm": 62.461761474609375, "learning_rate": 1.7895591227003313e-07, "logits/chosen": 3.063258409500122, "logits/rejected": 2.981848955154419, "logps/chosen": -15.221662521362305, "logps/rejected": -27.01279067993164, "loss": 0.9082, "nll_loss": 0.6918937563896179, "rewards/accuracies": 1.0, "rewards/chosen": 0.9893354773521423, "rewards/margins": 2.116140365600586, "rewards/rejected": -1.1268049478530884, "step": 1404 }, { "epoch": 0.23416666666666666, "grad_norm": 98.00849151611328, "learning_rate": 1.7892277507626626e-07, "logits/chosen": 2.57110595703125, "logits/rejected": 2.9088785648345947, "logps/chosen": -32.693607330322266, "logps/rejected": -153.5968780517578, "loss": 1.2367, "nll_loss": 1.0897868871688843, "rewards/accuracies": 1.0, "rewards/chosen": 1.1956112384796143, "rewards/margins": 2.704880952835083, "rewards/rejected": -1.5092697143554688, "step": 1405 }, { "epoch": 0.23433333333333334, "grad_norm": 29.589527130126953, "learning_rate": 1.788896148862975e-07, "logits/chosen": 2.777562141418457, "logits/rejected": 2.8699724674224854, "logps/chosen": -66.034423828125, "logps/rejected": -96.26925659179688, "loss": 0.8598, "nll_loss": 0.8152397871017456, "rewards/accuracies": 1.0, "rewards/chosen": 1.951165795326233, "rewards/margins": 4.67041540145874, "rewards/rejected": -2.719249725341797, "step": 1406 }, { "epoch": 0.2345, "grad_norm": 21.033323287963867, "learning_rate": 1.7885643170978889e-07, "logits/chosen": 0.5344043970108032, "logits/rejected": 1.366322636604309, "logps/chosen": -24.314796447753906, "logps/rejected": -330.42840576171875, "loss": 0.4536, "nll_loss": 0.41922059655189514, "rewards/accuracies": 1.0, "rewards/chosen": 1.2649853229522705, "rewards/margins": 6.926435470581055, "rewards/rejected": -5.661450386047363, "step": 1407 }, { "epoch": 0.23466666666666666, "grad_norm": 36.30241394042969, "learning_rate": 1.7882322555640923e-07, "logits/chosen": 1.7659599781036377, "logits/rejected": 1.6836529970169067, "logps/chosen": -35.635658264160156, "logps/rejected": -49.85586166381836, "loss": 0.6188, "nll_loss": 0.4949396848678589, "rewards/accuracies": 1.0, "rewards/chosen": 0.8460914492607117, "rewards/margins": 2.952195167541504, "rewards/rejected": -2.1061036586761475, "step": 1408 }, { "epoch": 0.23483333333333334, "grad_norm": 29.37489891052246, "learning_rate": 1.7878999643583404e-07, "logits/chosen": 2.587681770324707, "logits/rejected": 2.6931982040405273, "logps/chosen": -56.87559509277344, "logps/rejected": -193.29251098632812, "loss": 0.7385, "nll_loss": 0.7021678686141968, "rewards/accuracies": 1.0, "rewards/chosen": 1.4936531782150269, "rewards/margins": 5.387295246124268, "rewards/rejected": -3.893641948699951, "step": 1409 }, { "epoch": 0.235, "grad_norm": 29.824155807495117, "learning_rate": 1.7875674435774544e-07, "logits/chosen": 2.4322619438171387, "logits/rejected": 2.4248828887939453, "logps/chosen": -80.01091003417969, "logps/rejected": -78.22105407714844, "loss": 0.7685, "nll_loss": 0.7208189368247986, "rewards/accuracies": 1.0, "rewards/chosen": 1.3314460515975952, "rewards/margins": 4.745746612548828, "rewards/rejected": -3.4143006801605225, "step": 1410 }, { "epoch": 0.23516666666666666, "grad_norm": 64.88482666015625, "learning_rate": 1.787234693318323e-07, "logits/chosen": 2.7554409503936768, "logits/rejected": 2.7070846557617188, "logps/chosen": -173.79507446289062, "logps/rejected": -114.65784454345703, "loss": 1.3138, "nll_loss": 1.1664098501205444, "rewards/accuracies": 1.0, "rewards/chosen": 0.549407958984375, "rewards/margins": 2.704914093017578, "rewards/rejected": -2.155506134033203, "step": 1411 }, { "epoch": 0.23533333333333334, "grad_norm": 118.83024597167969, "learning_rate": 1.7869017136779018e-07, "logits/chosen": 2.537147283554077, "logits/rejected": 2.4681220054626465, "logps/chosen": -46.649810791015625, "logps/rejected": -16.3431453704834, "loss": 1.6362, "nll_loss": 0.5424396395683289, "rewards/accuracies": 0.0, "rewards/chosen": 0.8150588870048523, "rewards/margins": -0.31140297651290894, "rewards/rejected": 1.1264618635177612, "step": 1412 }, { "epoch": 0.2355, "grad_norm": 378.4134826660156, "learning_rate": 1.786568504753213e-07, "logits/chosen": 3.370037078857422, "logits/rejected": 3.3004603385925293, "logps/chosen": -117.57927703857422, "logps/rejected": -29.53466796875, "loss": 3.4925, "nll_loss": 1.9928690195083618, "rewards/accuracies": 0.0, "rewards/chosen": -1.3404655456542969, "rewards/margins": -1.07577645778656, "rewards/rejected": -0.26468905806541443, "step": 1413 }, { "epoch": 0.23566666666666666, "grad_norm": 35.120723724365234, "learning_rate": 1.7862350666413453e-07, "logits/chosen": 3.1510732173919678, "logits/rejected": 3.079610824584961, "logps/chosen": -90.69403076171875, "logps/rejected": -57.3544807434082, "loss": 1.1401, "nll_loss": 1.0545817613601685, "rewards/accuracies": 1.0, "rewards/chosen": 2.2507407665252686, "rewards/margins": 3.8510379791259766, "rewards/rejected": -1.600297212600708, "step": 1414 }, { "epoch": 0.23583333333333334, "grad_norm": 205.570068359375, "learning_rate": 1.785901399439455e-07, "logits/chosen": 2.0841798782348633, "logits/rejected": 2.1657726764678955, "logps/chosen": -64.57452392578125, "logps/rejected": -46.96647644042969, "loss": 2.7503, "nll_loss": 0.9358627796173096, "rewards/accuracies": 0.0, "rewards/chosen": 0.24880829453468323, "rewards/margins": -1.3907212018966675, "rewards/rejected": 1.6395294666290283, "step": 1415 }, { "epoch": 0.236, "grad_norm": 30.0272159576416, "learning_rate": 1.7855675032447645e-07, "logits/chosen": 2.3921587467193604, "logits/rejected": 2.217526435852051, "logps/chosen": -137.20123291015625, "logps/rejected": -163.0843505859375, "loss": 1.1303, "nll_loss": 1.0888986587524414, "rewards/accuracies": 1.0, "rewards/chosen": 1.1929397583007812, "rewards/margins": 5.620912075042725, "rewards/rejected": -4.427972316741943, "step": 1416 }, { "epoch": 0.23616666666666666, "grad_norm": 45.37027359008789, "learning_rate": 1.7852333781545635e-07, "logits/chosen": 2.5596814155578613, "logits/rejected": 2.5721242427825928, "logps/chosen": -30.931289672851562, "logps/rejected": -191.7162628173828, "loss": 0.8191, "nll_loss": 0.7364594340324402, "rewards/accuracies": 1.0, "rewards/chosen": 0.730797290802002, "rewards/margins": 3.8685216903686523, "rewards/rejected": -3.1377243995666504, "step": 1417 }, { "epoch": 0.23633333333333334, "grad_norm": 26.1283016204834, "learning_rate": 1.7848990242662074e-07, "logits/chosen": 2.356567144393921, "logits/rejected": 2.380328893661499, "logps/chosen": -76.27102661132812, "logps/rejected": -119.4922866821289, "loss": 0.8454, "nll_loss": 0.8201186656951904, "rewards/accuracies": 1.0, "rewards/chosen": 1.771504282951355, "rewards/margins": 6.268208026885986, "rewards/rejected": -4.496703624725342, "step": 1418 }, { "epoch": 0.2365, "grad_norm": 41.301265716552734, "learning_rate": 1.7845644416771198e-07, "logits/chosen": 2.4246163368225098, "logits/rejected": 2.4273154735565186, "logps/chosen": -53.89204025268555, "logps/rejected": -146.60006713867188, "loss": 0.8391, "nll_loss": 0.8043588399887085, "rewards/accuracies": 1.0, "rewards/chosen": 1.2625302076339722, "rewards/margins": 6.762088298797607, "rewards/rejected": -5.499557971954346, "step": 1419 }, { "epoch": 0.23666666666666666, "grad_norm": 66.15780639648438, "learning_rate": 1.784229630484789e-07, "logits/chosen": 2.623230457305908, "logits/rejected": 2.7892560958862305, "logps/chosen": -84.72117614746094, "logps/rejected": -209.39633178710938, "loss": 2.1878, "nll_loss": 2.172337770462036, "rewards/accuracies": 1.0, "rewards/chosen": 2.2440552711486816, "rewards/margins": 7.283274173736572, "rewards/rejected": -5.039218902587891, "step": 1420 }, { "epoch": 0.23683333333333334, "grad_norm": 54.723968505859375, "learning_rate": 1.7838945907867715e-07, "logits/chosen": 2.9782416820526123, "logits/rejected": 3.2148993015289307, "logps/chosen": -59.48360061645508, "logps/rejected": -225.59344482421875, "loss": 2.1601, "nll_loss": 2.1244144439697266, "rewards/accuracies": 1.0, "rewards/chosen": 1.2558804750442505, "rewards/margins": 6.4500932693481445, "rewards/rejected": -5.194212913513184, "step": 1421 }, { "epoch": 0.237, "grad_norm": 367.0162353515625, "learning_rate": 1.78355932268069e-07, "logits/chosen": 2.9452240467071533, "logits/rejected": 2.673006057739258, "logps/chosen": -400.2810974121094, "logps/rejected": -109.00775909423828, "loss": 2.3606, "nll_loss": 1.1772973537445068, "rewards/accuracies": 1.0, "rewards/chosen": -4.9532012939453125, "rewards/margins": 0.8498935699462891, "rewards/rejected": -5.803094863891602, "step": 1422 }, { "epoch": 0.23716666666666666, "grad_norm": 30.984731674194336, "learning_rate": 1.783223826264233e-07, "logits/chosen": 2.0439136028289795, "logits/rejected": 2.424036979675293, "logps/chosen": -97.8607406616211, "logps/rejected": -221.162353515625, "loss": 1.0819, "nll_loss": 1.0637037754058838, "rewards/accuracies": 1.0, "rewards/chosen": 1.9064722061157227, "rewards/margins": 8.651939392089844, "rewards/rejected": -6.745466709136963, "step": 1423 }, { "epoch": 0.23733333333333334, "grad_norm": 24.631528854370117, "learning_rate": 1.782888101635157e-07, "logits/chosen": 3.029728651046753, "logits/rejected": 3.362114906311035, "logps/chosen": -40.94443893432617, "logps/rejected": -175.44248962402344, "loss": 0.5501, "nll_loss": 0.5054868459701538, "rewards/accuracies": 1.0, "rewards/chosen": 0.9223682284355164, "rewards/margins": 7.876416206359863, "rewards/rejected": -6.954048156738281, "step": 1424 }, { "epoch": 0.2375, "grad_norm": 78.40756225585938, "learning_rate": 1.782552148891283e-07, "logits/chosen": 2.1250486373901367, "logits/rejected": 2.1501872539520264, "logps/chosen": -49.13861846923828, "logps/rejected": -7.467960357666016, "loss": 2.0327, "nll_loss": 1.445253610610962, "rewards/accuracies": 1.0, "rewards/chosen": 1.7997711896896362, "rewards/margins": 0.966644823551178, "rewards/rejected": 0.8331263661384583, "step": 1425 }, { "epoch": 0.23766666666666666, "grad_norm": 143.1824951171875, "learning_rate": 1.7822159681305e-07, "logits/chosen": 2.7863097190856934, "logits/rejected": 2.8258817195892334, "logps/chosen": -62.68943786621094, "logps/rejected": -70.36767578125, "loss": 1.5155, "nll_loss": 0.7375227808952332, "rewards/accuracies": 1.0, "rewards/chosen": 0.7382896542549133, "rewards/margins": 0.23465651273727417, "rewards/rejected": 0.5036331415176392, "step": 1426 }, { "epoch": 0.23783333333333334, "grad_norm": 50.10219192504883, "learning_rate": 1.7818795594507635e-07, "logits/chosen": 2.576899290084839, "logits/rejected": 2.6436874866485596, "logps/chosen": -16.857807159423828, "logps/rejected": -31.400449752807617, "loss": 0.7999, "nll_loss": 0.7024085521697998, "rewards/accuracies": 1.0, "rewards/chosen": 1.4229589700698853, "rewards/margins": 3.351595401763916, "rewards/rejected": -1.9286363124847412, "step": 1427 }, { "epoch": 0.238, "grad_norm": 179.6209259033203, "learning_rate": 1.7815429229500945e-07, "logits/chosen": 1.0205094814300537, "logits/rejected": 2.324472427368164, "logps/chosen": -83.13461303710938, "logps/rejected": -296.4140625, "loss": 2.6948, "nll_loss": 1.8894230127334595, "rewards/accuracies": 1.0, "rewards/chosen": -3.042285680770874, "rewards/margins": 0.9045286178588867, "rewards/rejected": -3.9468142986297607, "step": 1428 }, { "epoch": 0.23816666666666667, "grad_norm": 125.64283752441406, "learning_rate": 1.7812060587265806e-07, "logits/chosen": 3.090088129043579, "logits/rejected": 2.92108154296875, "logps/chosen": -23.358495712280273, "logps/rejected": -23.02206039428711, "loss": 1.3433, "nll_loss": 0.5077933073043823, "rewards/accuracies": 1.0, "rewards/chosen": 1.9772528409957886, "rewards/margins": 0.4539065361022949, "rewards/rejected": 1.5233463048934937, "step": 1429 }, { "epoch": 0.23833333333333334, "grad_norm": 506.110107421875, "learning_rate": 1.780868966878376e-07, "logits/chosen": 2.7579801082611084, "logits/rejected": 2.7201433181762695, "logps/chosen": -228.72665405273438, "logps/rejected": -101.30418395996094, "loss": 3.8648, "nll_loss": 1.5559635162353516, "rewards/accuracies": 0.0, "rewards/chosen": -1.4187805652618408, "rewards/margins": -2.0953476428985596, "rewards/rejected": 0.6765670776367188, "step": 1430 }, { "epoch": 0.2385, "grad_norm": 31.141660690307617, "learning_rate": 1.7805316475037016e-07, "logits/chosen": 2.9259984493255615, "logits/rejected": 3.077481985092163, "logps/chosen": -62.088836669921875, "logps/rejected": -196.23617553710938, "loss": 0.9141, "nll_loss": 0.886983335018158, "rewards/accuracies": 1.0, "rewards/chosen": 1.5813865661621094, "rewards/margins": 6.616689205169678, "rewards/rejected": -5.035302639007568, "step": 1431 }, { "epoch": 0.23866666666666667, "grad_norm": 35.395606994628906, "learning_rate": 1.7801941007008439e-07, "logits/chosen": 2.988008975982666, "logits/rejected": 3.0565812587738037, "logps/chosen": -21.746368408203125, "logps/rejected": -151.1200714111328, "loss": 0.5644, "nll_loss": 0.5303992033004761, "rewards/accuracies": 1.0, "rewards/chosen": 1.6962947845458984, "rewards/margins": 5.3056793212890625, "rewards/rejected": -3.609384298324585, "step": 1432 }, { "epoch": 0.23883333333333334, "grad_norm": 35.1031608581543, "learning_rate": 1.7798563265681556e-07, "logits/chosen": 2.0845251083374023, "logits/rejected": 2.0381104946136475, "logps/chosen": -117.60602569580078, "logps/rejected": -113.47539520263672, "loss": 1.2478, "nll_loss": 1.1879395246505737, "rewards/accuracies": 1.0, "rewards/chosen": 1.4290367364883423, "rewards/margins": 4.167553901672363, "rewards/rejected": -2.7385172843933105, "step": 1433 }, { "epoch": 0.239, "grad_norm": 36.72195816040039, "learning_rate": 1.7795183252040566e-07, "logits/chosen": 1.5792815685272217, "logits/rejected": 1.824326992034912, "logps/chosen": -60.07805633544922, "logps/rejected": -111.92636108398438, "loss": 0.9046, "nll_loss": 0.8461697697639465, "rewards/accuracies": 1.0, "rewards/chosen": 1.626922607421875, "rewards/margins": 4.191245079040527, "rewards/rejected": -2.5643227100372314, "step": 1434 }, { "epoch": 0.23916666666666667, "grad_norm": 272.68389892578125, "learning_rate": 1.779180096707032e-07, "logits/chosen": 2.956753730773926, "logits/rejected": 2.8600080013275146, "logps/chosen": -137.16671752929688, "logps/rejected": -45.840450286865234, "loss": 3.1385, "nll_loss": 0.8965142965316772, "rewards/accuracies": 0.0, "rewards/chosen": 0.8923889398574829, "rewards/margins": -1.79096257686615, "rewards/rejected": 2.683351516723633, "step": 1435 }, { "epoch": 0.23933333333333334, "grad_norm": 25.287525177001953, "learning_rate": 1.7788416411756337e-07, "logits/chosen": 2.8939578533172607, "logits/rejected": 2.9426651000976562, "logps/chosen": -30.314208984375, "logps/rejected": -197.62799072265625, "loss": 0.4849, "nll_loss": 0.473659485578537, "rewards/accuracies": 1.0, "rewards/chosen": 2.9540505409240723, "rewards/margins": 7.264573097229004, "rewards/rejected": -4.310522556304932, "step": 1436 }, { "epoch": 0.2395, "grad_norm": 62.952171325683594, "learning_rate": 1.7785029587084792e-07, "logits/chosen": 2.165107488632202, "logits/rejected": 2.314436912536621, "logps/chosen": -33.03532028198242, "logps/rejected": -275.53021240234375, "loss": 1.5181, "nll_loss": 1.50160551071167, "rewards/accuracies": 1.0, "rewards/chosen": 2.2220981121063232, "rewards/margins": 6.976666450500488, "rewards/rejected": -4.754568576812744, "step": 1437 }, { "epoch": 0.23966666666666667, "grad_norm": 28.9776611328125, "learning_rate": 1.7781640494042526e-07, "logits/chosen": 2.003448486328125, "logits/rejected": 2.988286018371582, "logps/chosen": -25.270687103271484, "logps/rejected": -228.10028076171875, "loss": 0.5037, "nll_loss": 0.45126229524612427, "rewards/accuracies": 1.0, "rewards/chosen": 1.8035154342651367, "rewards/margins": 4.383477687835693, "rewards/rejected": -2.5799622535705566, "step": 1438 }, { "epoch": 0.23983333333333334, "grad_norm": 29.870925903320312, "learning_rate": 1.777824913361704e-07, "logits/chosen": 1.476686716079712, "logits/rejected": 2.064898729324341, "logps/chosen": -81.8135757446289, "logps/rejected": -136.23924255371094, "loss": 0.8801, "nll_loss": 0.834832489490509, "rewards/accuracies": 1.0, "rewards/chosen": 0.9889625906944275, "rewards/margins": 6.0706682205200195, "rewards/rejected": -5.081705570220947, "step": 1439 }, { "epoch": 0.24, "grad_norm": 41.055030822753906, "learning_rate": 1.7774855506796493e-07, "logits/chosen": 2.8784260749816895, "logits/rejected": 2.881587505340576, "logps/chosen": -31.95197296142578, "logps/rejected": -102.3816909790039, "loss": 0.9598, "nll_loss": 0.9129136204719543, "rewards/accuracies": 1.0, "rewards/chosen": 1.9342201948165894, "rewards/margins": 4.584765911102295, "rewards/rejected": -2.650545597076416, "step": 1440 }, { "epoch": 0.24016666666666667, "grad_norm": 29.110918045043945, "learning_rate": 1.7771459614569707e-07, "logits/chosen": 2.0177741050720215, "logits/rejected": 1.810640573501587, "logps/chosen": -58.9582633972168, "logps/rejected": -65.7197265625, "loss": 0.8073, "nll_loss": 0.7656917572021484, "rewards/accuracies": 1.0, "rewards/chosen": 1.9911617040634155, "rewards/margins": 4.790306568145752, "rewards/rejected": -2.799144744873047, "step": 1441 }, { "epoch": 0.24033333333333334, "grad_norm": 24.157136917114258, "learning_rate": 1.7768061457926166e-07, "logits/chosen": 2.703503131866455, "logits/rejected": 3.009830951690674, "logps/chosen": -36.878631591796875, "logps/rejected": -275.4146728515625, "loss": 0.5835, "nll_loss": 0.5587671399116516, "rewards/accuracies": 1.0, "rewards/chosen": 1.60172438621521, "rewards/margins": 7.583432197570801, "rewards/rejected": -5.98170804977417, "step": 1442 }, { "epoch": 0.2405, "grad_norm": 35.71104049682617, "learning_rate": 1.776466103785601e-07, "logits/chosen": 1.8835235834121704, "logits/rejected": 2.2750861644744873, "logps/chosen": -71.75841522216797, "logps/rejected": -185.88917541503906, "loss": 0.9495, "nll_loss": 0.8969801068305969, "rewards/accuracies": 1.0, "rewards/chosen": 0.7398239374160767, "rewards/margins": 7.401657581329346, "rewards/rejected": -6.661833763122559, "step": 1443 }, { "epoch": 0.24066666666666667, "grad_norm": 27.33095932006836, "learning_rate": 1.7761258355350036e-07, "logits/chosen": 0.8689449429512024, "logits/rejected": 1.6861441135406494, "logps/chosen": -75.51588439941406, "logps/rejected": -280.1049499511719, "loss": 0.8322, "nll_loss": 0.8119986057281494, "rewards/accuracies": 1.0, "rewards/chosen": 1.7793197631835938, "rewards/margins": 8.850658416748047, "rewards/rejected": -7.071338176727295, "step": 1444 }, { "epoch": 0.24083333333333334, "grad_norm": 23.563274383544922, "learning_rate": 1.7757853411399712e-07, "logits/chosen": 3.091911554336548, "logits/rejected": 2.9937167167663574, "logps/chosen": -238.30014038085938, "logps/rejected": -208.7741241455078, "loss": 0.9641, "nll_loss": 0.9308599829673767, "rewards/accuracies": 1.0, "rewards/chosen": 1.998870849609375, "rewards/margins": 5.199118137359619, "rewards/rejected": -3.200247287750244, "step": 1445 }, { "epoch": 0.241, "grad_norm": 32.645591735839844, "learning_rate": 1.7754446206997149e-07, "logits/chosen": 2.7322068214416504, "logits/rejected": 2.64463210105896, "logps/chosen": -79.40432739257812, "logps/rejected": -58.545745849609375, "loss": 0.8651, "nll_loss": 0.7861814498901367, "rewards/accuracies": 1.0, "rewards/chosen": 1.4518203735351562, "rewards/margins": 3.684439182281494, "rewards/rejected": -2.232618808746338, "step": 1446 }, { "epoch": 0.24116666666666667, "grad_norm": 158.57955932617188, "learning_rate": 1.775103674313513e-07, "logits/chosen": 1.783045768737793, "logits/rejected": 1.779317021369934, "logps/chosen": -114.14848327636719, "logps/rejected": -118.9078140258789, "loss": 1.8898, "nll_loss": 1.1890467405319214, "rewards/accuracies": 1.0, "rewards/chosen": 1.4817641973495483, "rewards/margins": 0.5862869024276733, "rewards/rejected": 0.895477294921875, "step": 1447 }, { "epoch": 0.24133333333333334, "grad_norm": 38.477352142333984, "learning_rate": 1.774762502080709e-07, "logits/chosen": 1.1505417823791504, "logits/rejected": 2.6143345832824707, "logps/chosen": -26.480514526367188, "logps/rejected": -274.4019470214844, "loss": 0.9394, "nll_loss": 0.9131211042404175, "rewards/accuracies": 1.0, "rewards/chosen": 2.041104555130005, "rewards/margins": 5.701204299926758, "rewards/rejected": -3.660099744796753, "step": 1448 }, { "epoch": 0.2415, "grad_norm": 34.50722122192383, "learning_rate": 1.7744211041007119e-07, "logits/chosen": 3.101473808288574, "logits/rejected": 3.19659686088562, "logps/chosen": -16.302818298339844, "logps/rejected": -111.35466003417969, "loss": 0.5179, "nll_loss": 0.4940248429775238, "rewards/accuracies": 1.0, "rewards/chosen": 2.309140920639038, "rewards/margins": 5.807762145996094, "rewards/rejected": -3.4986214637756348, "step": 1449 }, { "epoch": 0.24166666666666667, "grad_norm": 39.247676849365234, "learning_rate": 1.774079480472997e-07, "logits/chosen": 2.609675407409668, "logits/rejected": 2.839735269546509, "logps/chosen": -60.88074493408203, "logps/rejected": -224.95059204101562, "loss": 1.0331, "nll_loss": 0.9663610458374023, "rewards/accuracies": 1.0, "rewards/chosen": 0.833040714263916, "rewards/margins": 4.373748302459717, "rewards/rejected": -3.540707588195801, "step": 1450 }, { "epoch": 0.24183333333333334, "grad_norm": 25.385848999023438, "learning_rate": 1.7737376312971052e-07, "logits/chosen": 2.3240153789520264, "logits/rejected": 1.7342183589935303, "logps/chosen": -96.27102661132812, "logps/rejected": -99.99395751953125, "loss": 1.0452, "nll_loss": 1.0133793354034424, "rewards/accuracies": 1.0, "rewards/chosen": 2.514752149581909, "rewards/margins": 5.345646858215332, "rewards/rejected": -2.8308944702148438, "step": 1451 }, { "epoch": 0.242, "grad_norm": 50.78303909301758, "learning_rate": 1.7733955566726436e-07, "logits/chosen": 2.176711082458496, "logits/rejected": 2.0207390785217285, "logps/chosen": -128.90122985839844, "logps/rejected": -128.82106018066406, "loss": 1.4059, "nll_loss": 1.3020325899124146, "rewards/accuracies": 1.0, "rewards/chosen": 0.11228180676698685, "rewards/margins": 4.267739772796631, "rewards/rejected": -4.155457973480225, "step": 1452 }, { "epoch": 0.24216666666666667, "grad_norm": 89.00477600097656, "learning_rate": 1.7730532566992844e-07, "logits/chosen": 2.7516539096832275, "logits/rejected": 2.6114933490753174, "logps/chosen": -40.152835845947266, "logps/rejected": -90.46179962158203, "loss": 1.0212, "nll_loss": 0.6273881196975708, "rewards/accuracies": 1.0, "rewards/chosen": 1.4395084381103516, "rewards/margins": 1.4264720678329468, "rewards/rejected": 0.01303634699434042, "step": 1453 }, { "epoch": 0.24233333333333335, "grad_norm": 220.61090087890625, "learning_rate": 1.7727107314767654e-07, "logits/chosen": 1.749694585800171, "logits/rejected": 1.6024692058563232, "logps/chosen": -102.37953186035156, "logps/rejected": -27.656003952026367, "loss": 3.7717, "nll_loss": 1.62507164478302, "rewards/accuracies": 0.0, "rewards/chosen": 1.0714409351348877, "rewards/margins": -1.6489946842193604, "rewards/rejected": 2.720435619354248, "step": 1454 }, { "epoch": 0.2425, "grad_norm": 23.757444381713867, "learning_rate": 1.77236798110489e-07, "logits/chosen": 2.911569595336914, "logits/rejected": 3.4873220920562744, "logps/chosen": -14.025599479675293, "logps/rejected": -179.23353576660156, "loss": 0.3414, "nll_loss": 0.31168004870414734, "rewards/accuracies": 1.0, "rewards/chosen": 2.198155641555786, "rewards/margins": 5.395995140075684, "rewards/rejected": -3.1978394985198975, "step": 1455 }, { "epoch": 0.24266666666666667, "grad_norm": 79.06619262695312, "learning_rate": 1.772025005683528e-07, "logits/chosen": 2.100679397583008, "logits/rejected": 1.925212025642395, "logps/chosen": -84.72427368164062, "logps/rejected": -24.950626373291016, "loss": 1.0713, "nll_loss": 0.9013220071792603, "rewards/accuracies": 1.0, "rewards/chosen": 1.0707855224609375, "rewards/margins": 2.4754557609558105, "rewards/rejected": -1.404670238494873, "step": 1456 }, { "epoch": 0.24283333333333335, "grad_norm": 30.79473304748535, "learning_rate": 1.7716818053126137e-07, "logits/chosen": 2.6417465209960938, "logits/rejected": 2.9261910915374756, "logps/chosen": -37.141056060791016, "logps/rejected": -278.5193786621094, "loss": 0.755, "nll_loss": 0.7282559871673584, "rewards/accuracies": 1.0, "rewards/chosen": 2.0122973918914795, "rewards/margins": 5.67966365814209, "rewards/rejected": -3.6673662662506104, "step": 1457 }, { "epoch": 0.243, "grad_norm": 77.19857025146484, "learning_rate": 1.7713383800921476e-07, "logits/chosen": 2.3349685668945312, "logits/rejected": 3.1331112384796143, "logps/chosen": -41.70245361328125, "logps/rejected": -61.601654052734375, "loss": 1.5514, "nll_loss": 1.4380156993865967, "rewards/accuracies": 1.0, "rewards/chosen": 0.9362411499023438, "rewards/margins": 3.0903396606445312, "rewards/rejected": -2.1540985107421875, "step": 1458 }, { "epoch": 0.24316666666666667, "grad_norm": 38.40167236328125, "learning_rate": 1.7709947301221957e-07, "logits/chosen": 1.9457961320877075, "logits/rejected": 1.8158622980117798, "logps/chosen": -69.54671478271484, "logps/rejected": -127.08524322509766, "loss": 0.9175, "nll_loss": 0.8379121422767639, "rewards/accuracies": 1.0, "rewards/chosen": 0.5469779968261719, "rewards/margins": 4.2840728759765625, "rewards/rejected": -3.7370948791503906, "step": 1459 }, { "epoch": 0.24333333333333335, "grad_norm": 18.425905227661133, "learning_rate": 1.7706508555028893e-07, "logits/chosen": 2.755596160888672, "logits/rejected": 2.72247052192688, "logps/chosen": -127.95484161376953, "logps/rejected": -112.65973663330078, "loss": 0.6405, "nll_loss": 0.6035606861114502, "rewards/accuracies": 1.0, "rewards/chosen": 2.6095361709594727, "rewards/margins": 5.17222785949707, "rewards/rejected": -2.5626916885375977, "step": 1460 }, { "epoch": 0.2435, "grad_norm": 52.23133850097656, "learning_rate": 1.770306756334425e-07, "logits/chosen": 2.085733652114868, "logits/rejected": 2.0915682315826416, "logps/chosen": -85.15133666992188, "logps/rejected": -73.60969543457031, "loss": 1.3114, "nll_loss": 1.1664568185806274, "rewards/accuracies": 1.0, "rewards/chosen": 0.5462043881416321, "rewards/margins": 2.7329437732696533, "rewards/rejected": -2.186739444732666, "step": 1461 }, { "epoch": 0.24366666666666667, "grad_norm": 46.62310028076172, "learning_rate": 1.7699624327170653e-07, "logits/chosen": 3.1259024143218994, "logits/rejected": 3.097709894180298, "logps/chosen": -52.902835845947266, "logps/rejected": -96.93785095214844, "loss": 1.462, "nll_loss": 1.4298063516616821, "rewards/accuracies": 1.0, "rewards/chosen": 1.9213230609893799, "rewards/margins": 5.289987564086914, "rewards/rejected": -3.3686647415161133, "step": 1462 }, { "epoch": 0.24383333333333335, "grad_norm": 30.510334014892578, "learning_rate": 1.7696178847511377e-07, "logits/chosen": 2.508333683013916, "logits/rejected": 2.6087512969970703, "logps/chosen": -59.30683135986328, "logps/rejected": -179.32577514648438, "loss": 0.764, "nll_loss": 0.7232541441917419, "rewards/accuracies": 1.0, "rewards/chosen": 1.981225609779358, "rewards/margins": 4.826684474945068, "rewards/rejected": -2.845458984375, "step": 1463 }, { "epoch": 0.244, "grad_norm": 30.1180419921875, "learning_rate": 1.7692731125370352e-07, "logits/chosen": 2.6484570503234863, "logits/rejected": 2.7079906463623047, "logps/chosen": -22.051237106323242, "logps/rejected": -88.36097717285156, "loss": 0.5247, "nll_loss": 0.4594007432460785, "rewards/accuracies": 1.0, "rewards/chosen": 1.495788812637329, "rewards/margins": 3.9994723796844482, "rewards/rejected": -2.503683567047119, "step": 1464 }, { "epoch": 0.24416666666666667, "grad_norm": 28.518020629882812, "learning_rate": 1.7689281161752164e-07, "logits/chosen": 2.022277593612671, "logits/rejected": 1.9789220094680786, "logps/chosen": -38.692649841308594, "logps/rejected": -118.0483627319336, "loss": 0.6963, "nll_loss": 0.655807614326477, "rewards/accuracies": 1.0, "rewards/chosen": 1.6595063209533691, "rewards/margins": 4.896204471588135, "rewards/rejected": -3.2366981506347656, "step": 1465 }, { "epoch": 0.24433333333333335, "grad_norm": 175.7654266357422, "learning_rate": 1.7685828957662046e-07, "logits/chosen": 2.7757680416107178, "logits/rejected": 2.7809131145477295, "logps/chosen": -51.94095230102539, "logps/rejected": -29.80649185180664, "loss": 2.8134, "nll_loss": 0.7527675032615662, "rewards/accuracies": 0.0, "rewards/chosen": 0.8467651605606079, "rewards/margins": -1.5887805223464966, "rewards/rejected": 2.4355456829071045, "step": 1466 }, { "epoch": 0.2445, "grad_norm": 22.47551155090332, "learning_rate": 1.7682374514105886e-07, "logits/chosen": 2.229177951812744, "logits/rejected": 2.400144577026367, "logps/chosen": -12.740667343139648, "logps/rejected": -153.68960571289062, "loss": 0.3202, "nll_loss": 0.2831259071826935, "rewards/accuracies": 1.0, "rewards/chosen": 2.8945560455322266, "rewards/margins": 5.331892967224121, "rewards/rejected": -2.4373366832733154, "step": 1467 }, { "epoch": 0.24466666666666667, "grad_norm": 39.267051696777344, "learning_rate": 1.7678917832090232e-07, "logits/chosen": 0.8958791494369507, "logits/rejected": 2.7623343467712402, "logps/chosen": -40.47943115234375, "logps/rejected": -187.34072875976562, "loss": 0.9098, "nll_loss": 0.82611083984375, "rewards/accuracies": 1.0, "rewards/chosen": 0.17902833223342896, "rewards/margins": 7.667901039123535, "rewards/rejected": -7.488872528076172, "step": 1468 }, { "epoch": 0.24483333333333332, "grad_norm": 175.78054809570312, "learning_rate": 1.7675458912622277e-07, "logits/chosen": 2.485020637512207, "logits/rejected": 2.7846193313598633, "logps/chosen": -118.3006820678711, "logps/rejected": -185.65914916992188, "loss": 2.1009, "nll_loss": 1.3917728662490845, "rewards/accuracies": 1.0, "rewards/chosen": -2.4858574867248535, "rewards/margins": 0.9000864028930664, "rewards/rejected": -3.38594388961792, "step": 1469 }, { "epoch": 0.245, "grad_norm": 33.31284713745117, "learning_rate": 1.767199775670986e-07, "logits/chosen": 2.645824432373047, "logits/rejected": 2.8090832233428955, "logps/chosen": -32.32477569580078, "logps/rejected": -346.20489501953125, "loss": 0.7797, "nll_loss": 0.751738965511322, "rewards/accuracies": 1.0, "rewards/chosen": 1.4080612659454346, "rewards/margins": 12.40760612487793, "rewards/rejected": -10.999545097351074, "step": 1470 }, { "epoch": 0.24516666666666667, "grad_norm": 43.99984359741211, "learning_rate": 1.7668534365361488e-07, "logits/chosen": 1.1662503480911255, "logits/rejected": 2.193142890930176, "logps/chosen": -34.95105743408203, "logps/rejected": -268.65948486328125, "loss": 1.1855, "nll_loss": 1.1650352478027344, "rewards/accuracies": 1.0, "rewards/chosen": 2.3577771186828613, "rewards/margins": 6.120843410491943, "rewards/rejected": -3.763066291809082, "step": 1471 }, { "epoch": 0.24533333333333332, "grad_norm": 57.47602081298828, "learning_rate": 1.7665068739586303e-07, "logits/chosen": 2.5287959575653076, "logits/rejected": 2.4350712299346924, "logps/chosen": -42.555824279785156, "logps/rejected": -37.07230758666992, "loss": 1.7206, "nll_loss": 1.6367624998092651, "rewards/accuracies": 1.0, "rewards/chosen": 2.8278026580810547, "rewards/margins": 4.263226509094238, "rewards/rejected": -1.4354238510131836, "step": 1472 }, { "epoch": 0.2455, "grad_norm": 30.089317321777344, "learning_rate": 1.766160088039411e-07, "logits/chosen": 3.0181782245635986, "logits/rejected": 3.064635992050171, "logps/chosen": -52.05796813964844, "logps/rejected": -146.64324951171875, "loss": 0.6684, "nll_loss": 0.6197376847267151, "rewards/accuracies": 1.0, "rewards/chosen": 0.894818902015686, "rewards/margins": 6.016811847686768, "rewards/rejected": -5.121993064880371, "step": 1473 }, { "epoch": 0.24566666666666667, "grad_norm": 214.03811645507812, "learning_rate": 1.7658130788795355e-07, "logits/chosen": 2.4343807697296143, "logits/rejected": 2.294271945953369, "logps/chosen": -187.05233764648438, "logps/rejected": -77.41559600830078, "loss": 1.8789, "nll_loss": 1.0277600288391113, "rewards/accuracies": 1.0, "rewards/chosen": 0.5875458121299744, "rewards/margins": 0.06619954109191895, "rewards/rejected": 0.5213462710380554, "step": 1474 }, { "epoch": 0.24583333333333332, "grad_norm": 33.05157470703125, "learning_rate": 1.7654658465801146e-07, "logits/chosen": 3.1944096088409424, "logits/rejected": 3.2043814659118652, "logps/chosen": -74.80043029785156, "logps/rejected": -217.56861877441406, "loss": 0.8678, "nll_loss": 0.821982741355896, "rewards/accuracies": 1.0, "rewards/chosen": 1.5458892583847046, "rewards/margins": 4.6752400398254395, "rewards/rejected": -3.1293506622314453, "step": 1475 }, { "epoch": 0.246, "grad_norm": 36.764827728271484, "learning_rate": 1.7651183912423227e-07, "logits/chosen": 2.9736881256103516, "logits/rejected": 2.894543409347534, "logps/chosen": -102.67366790771484, "logps/rejected": -115.4754638671875, "loss": 1.0597, "nll_loss": 1.0267367362976074, "rewards/accuracies": 1.0, "rewards/chosen": 1.408547282218933, "rewards/margins": 6.041730880737305, "rewards/rejected": -4.633183479309082, "step": 1476 }, { "epoch": 0.24616666666666667, "grad_norm": 107.99727630615234, "learning_rate": 1.7647707129674007e-07, "logits/chosen": 2.968900680541992, "logits/rejected": 3.240586757659912, "logps/chosen": -58.018531799316406, "logps/rejected": -319.9224853515625, "loss": 2.8512, "nll_loss": 2.6372058391571045, "rewards/accuracies": 1.0, "rewards/chosen": -1.0293480157852173, "rewards/margins": 4.153635501861572, "rewards/rejected": -5.1829833984375, "step": 1477 }, { "epoch": 0.24633333333333332, "grad_norm": 22.902238845825195, "learning_rate": 1.764422811856653e-07, "logits/chosen": 1.7764923572540283, "logits/rejected": 2.2067549228668213, "logps/chosen": -183.71458435058594, "logps/rejected": -336.8435974121094, "loss": 1.0811, "nll_loss": 1.0558310747146606, "rewards/accuracies": 1.0, "rewards/chosen": 1.5520858764648438, "rewards/margins": 7.872023105621338, "rewards/rejected": -6.319937229156494, "step": 1478 }, { "epoch": 0.2465, "grad_norm": 39.12397003173828, "learning_rate": 1.7640746880114502e-07, "logits/chosen": 3.0119080543518066, "logits/rejected": 3.2751636505126953, "logps/chosen": -26.70255470275879, "logps/rejected": -263.6339111328125, "loss": 0.7659, "nll_loss": 0.7216905951499939, "rewards/accuracies": 1.0, "rewards/chosen": 1.1152563095092773, "rewards/margins": 5.442514896392822, "rewards/rejected": -4.327258586883545, "step": 1479 }, { "epoch": 0.24666666666666667, "grad_norm": 19.053617477416992, "learning_rate": 1.763726341533227e-07, "logits/chosen": 2.803640127182007, "logits/rejected": 2.5826921463012695, "logps/chosen": -135.65863037109375, "logps/rejected": -76.85112762451172, "loss": 0.7375, "nll_loss": 0.7139927744865417, "rewards/accuracies": 1.0, "rewards/chosen": 2.5635881423950195, "rewards/margins": 5.8385009765625, "rewards/rejected": -3.2749130725860596, "step": 1480 }, { "epoch": 0.24683333333333332, "grad_norm": 40.043453216552734, "learning_rate": 1.763377772523483e-07, "logits/chosen": 2.116549491882324, "logits/rejected": 2.202205181121826, "logps/chosen": -17.41640281677246, "logps/rejected": -301.60205078125, "loss": 0.6345, "nll_loss": 0.5805467367172241, "rewards/accuracies": 1.0, "rewards/chosen": 0.8273869156837463, "rewards/margins": 5.446161270141602, "rewards/rejected": -4.6187744140625, "step": 1481 }, { "epoch": 0.247, "grad_norm": 20.481019973754883, "learning_rate": 1.7630289810837833e-07, "logits/chosen": 2.0934674739837646, "logits/rejected": 2.586416721343994, "logps/chosen": -151.36741638183594, "logps/rejected": -449.7158203125, "loss": 0.9573, "nll_loss": 0.940170407295227, "rewards/accuracies": 1.0, "rewards/chosen": 1.9819902181625366, "rewards/margins": 8.056044578552246, "rewards/rejected": -6.07405424118042, "step": 1482 }, { "epoch": 0.24716666666666667, "grad_norm": 46.65192413330078, "learning_rate": 1.7626799673157567e-07, "logits/chosen": 2.6529648303985596, "logits/rejected": 2.6256444454193115, "logps/chosen": -39.24758529663086, "logps/rejected": -45.275978088378906, "loss": 1.004, "nll_loss": 0.8919906616210938, "rewards/accuracies": 1.0, "rewards/chosen": 1.1708786487579346, "rewards/margins": 3.1080737113952637, "rewards/rejected": -1.9371951818466187, "step": 1483 }, { "epoch": 0.24733333333333332, "grad_norm": 79.1517105102539, "learning_rate": 1.762330731321098e-07, "logits/chosen": 1.9698134660720825, "logits/rejected": 1.6991245746612549, "logps/chosen": -87.34776306152344, "logps/rejected": -59.522003173828125, "loss": 1.1293, "nll_loss": 0.7402351498603821, "rewards/accuracies": 1.0, "rewards/chosen": 1.419804573059082, "rewards/margins": 1.4403955936431885, "rewards/rejected": -0.020590974017977715, "step": 1484 }, { "epoch": 0.2475, "grad_norm": 54.85307312011719, "learning_rate": 1.7619812732015663e-07, "logits/chosen": 1.6029436588287354, "logits/rejected": 2.7659666538238525, "logps/chosen": -87.81770324707031, "logps/rejected": -182.14529418945312, "loss": 1.4293, "nll_loss": 1.3510414361953735, "rewards/accuracies": 1.0, "rewards/chosen": 0.5265586972236633, "rewards/margins": 4.382613658905029, "rewards/rejected": -3.8560547828674316, "step": 1485 }, { "epoch": 0.24766666666666667, "grad_norm": 59.64723205566406, "learning_rate": 1.7616315930589847e-07, "logits/chosen": 2.7660012245178223, "logits/rejected": 2.701921224594116, "logps/chosen": -46.98394012451172, "logps/rejected": -70.72785949707031, "loss": 1.5208, "nll_loss": 1.381880521774292, "rewards/accuracies": 1.0, "rewards/chosen": 0.8774479627609253, "rewards/margins": 2.7594640254974365, "rewards/rejected": -1.8820160627365112, "step": 1486 }, { "epoch": 0.24783333333333332, "grad_norm": 22.058691024780273, "learning_rate": 1.7612816909952418e-07, "logits/chosen": 1.140848159790039, "logits/rejected": 3.2195231914520264, "logps/chosen": -87.86405944824219, "logps/rejected": -460.6427001953125, "loss": 0.8659, "nll_loss": 0.8530492782592773, "rewards/accuracies": 1.0, "rewards/chosen": 2.221827745437622, "rewards/margins": 10.955156326293945, "rewards/rejected": -8.733328819274902, "step": 1487 }, { "epoch": 0.248, "grad_norm": 47.339229583740234, "learning_rate": 1.760931567112291e-07, "logits/chosen": 2.327775239944458, "logits/rejected": 2.543720006942749, "logps/chosen": -20.300365447998047, "logps/rejected": -278.4613342285156, "loss": 0.9084, "nll_loss": 0.8826245665550232, "rewards/accuracies": 1.0, "rewards/chosen": 1.5546903610229492, "rewards/margins": 7.281778335571289, "rewards/rejected": -5.72708797454834, "step": 1488 }, { "epoch": 0.24816666666666667, "grad_norm": 44.0145149230957, "learning_rate": 1.7605812215121496e-07, "logits/chosen": 2.5672731399536133, "logits/rejected": 2.6639504432678223, "logps/chosen": -62.10350036621094, "logps/rejected": -185.48193359375, "loss": 0.9729, "nll_loss": 0.8871928453445435, "rewards/accuracies": 1.0, "rewards/chosen": 0.3876999020576477, "rewards/margins": 4.337346076965332, "rewards/rejected": -3.94964599609375, "step": 1489 }, { "epoch": 0.24833333333333332, "grad_norm": 22.20098304748535, "learning_rate": 1.7602306542969003e-07, "logits/chosen": 1.4400546550750732, "logits/rejected": 1.884067177772522, "logps/chosen": -67.23303985595703, "logps/rejected": -107.4052505493164, "loss": 0.6161, "nll_loss": 0.5846350789070129, "rewards/accuracies": 1.0, "rewards/chosen": 2.090475559234619, "rewards/margins": 5.294480323791504, "rewards/rejected": -3.2040047645568848, "step": 1490 }, { "epoch": 0.2485, "grad_norm": 36.05405044555664, "learning_rate": 1.7598798655686898e-07, "logits/chosen": 3.1060686111450195, "logits/rejected": 3.068558931350708, "logps/chosen": -45.039493560791016, "logps/rejected": -53.027347564697266, "loss": 0.822, "nll_loss": 0.7264434695243835, "rewards/accuracies": 1.0, "rewards/chosen": 1.4035500288009644, "rewards/margins": 3.381922721862793, "rewards/rejected": -1.978372573852539, "step": 1491 }, { "epoch": 0.24866666666666667, "grad_norm": 25.915302276611328, "learning_rate": 1.7595288554297292e-07, "logits/chosen": 2.871579647064209, "logits/rejected": 2.9250731468200684, "logps/chosen": -93.34809875488281, "logps/rejected": -209.19256591796875, "loss": 0.98, "nll_loss": 0.9623515605926514, "rewards/accuracies": 1.0, "rewards/chosen": 2.8599319458007812, "rewards/margins": 6.3826584815979, "rewards/rejected": -3.522726535797119, "step": 1492 }, { "epoch": 0.24883333333333332, "grad_norm": 53.0078010559082, "learning_rate": 1.7591776239822945e-07, "logits/chosen": 3.3767452239990234, "logits/rejected": 3.616367816925049, "logps/chosen": -51.47757339477539, "logps/rejected": -150.8936767578125, "loss": 1.0582, "nll_loss": 0.935955822467804, "rewards/accuracies": 1.0, "rewards/chosen": 1.5922223329544067, "rewards/margins": 3.0826430320739746, "rewards/rejected": -1.4904205799102783, "step": 1493 }, { "epoch": 0.249, "grad_norm": 24.026039123535156, "learning_rate": 1.7588261713287266e-07, "logits/chosen": 4.2194695472717285, "logits/rejected": 4.356523036956787, "logps/chosen": -76.75909423828125, "logps/rejected": -128.60092163085938, "loss": 0.8221, "nll_loss": 0.8079903721809387, "rewards/accuracies": 1.0, "rewards/chosen": 2.1578736305236816, "rewards/margins": 8.818410873413086, "rewards/rejected": -6.6605377197265625, "step": 1494 }, { "epoch": 0.24916666666666668, "grad_norm": 47.03980255126953, "learning_rate": 1.75847449757143e-07, "logits/chosen": 1.9834725856781006, "logits/rejected": 2.6106631755828857, "logps/chosen": -52.98130798339844, "logps/rejected": -105.46336364746094, "loss": 1.1219, "nll_loss": 0.9996472597122192, "rewards/accuracies": 1.0, "rewards/chosen": 0.9165619611740112, "rewards/margins": 2.965005397796631, "rewards/rejected": -2.048443555831909, "step": 1495 }, { "epoch": 0.24933333333333332, "grad_norm": 280.2947998046875, "learning_rate": 1.758122602812874e-07, "logits/chosen": 3.0278801918029785, "logits/rejected": 2.85280704498291, "logps/chosen": -273.1944885253906, "logps/rejected": -95.76915740966797, "loss": 2.4427, "nll_loss": 1.264789342880249, "rewards/accuracies": 0.0, "rewards/chosen": -2.9701125621795654, "rewards/margins": -0.22053742408752441, "rewards/rejected": -2.749575138092041, "step": 1496 }, { "epoch": 0.2495, "grad_norm": 34.22079086303711, "learning_rate": 1.757770487155592e-07, "logits/chosen": 2.872919797897339, "logits/rejected": 2.92104434967041, "logps/chosen": -107.52043151855469, "logps/rejected": -23.669748306274414, "loss": 1.032, "nll_loss": 0.9035329818725586, "rewards/accuracies": 1.0, "rewards/chosen": 2.3684494495391846, "rewards/margins": 3.4123992919921875, "rewards/rejected": -1.0439499616622925, "step": 1497 }, { "epoch": 0.24966666666666668, "grad_norm": 42.270477294921875, "learning_rate": 1.757418150702183e-07, "logits/chosen": 2.9770984649658203, "logits/rejected": 2.90753173828125, "logps/chosen": -80.59590148925781, "logps/rejected": -44.62471389770508, "loss": 1.0643, "nll_loss": 0.8760423064231873, "rewards/accuracies": 1.0, "rewards/chosen": 1.5293769836425781, "rewards/margins": 2.4748120307922363, "rewards/rejected": -0.9454350471496582, "step": 1498 }, { "epoch": 0.24983333333333332, "grad_norm": 35.539894104003906, "learning_rate": 1.7570655935553082e-07, "logits/chosen": 3.0076470375061035, "logits/rejected": 2.8854033946990967, "logps/chosen": -101.58673095703125, "logps/rejected": -156.7366180419922, "loss": 1.178, "nll_loss": 1.1543947458267212, "rewards/accuracies": 1.0, "rewards/chosen": 1.7318634986877441, "rewards/margins": 6.713663578033447, "rewards/rejected": -4.981800079345703, "step": 1499 }, { "epoch": 0.25, "grad_norm": 42.851070404052734, "learning_rate": 1.7567128158176952e-07, "logits/chosen": 2.012883186340332, "logits/rejected": 2.642793655395508, "logps/chosen": -33.652740478515625, "logps/rejected": -172.65670776367188, "loss": 1.1056, "nll_loss": 1.0855721235275269, "rewards/accuracies": 1.0, "rewards/chosen": 2.047344923019409, "rewards/margins": 6.5160932540893555, "rewards/rejected": -4.468748569488525, "step": 1500 }, { "epoch": 0.25016666666666665, "grad_norm": 47.2381591796875, "learning_rate": 1.7563598175921345e-07, "logits/chosen": 2.2138452529907227, "logits/rejected": 2.2650861740112305, "logps/chosen": -46.648887634277344, "logps/rejected": -82.3729019165039, "loss": 0.8716, "nll_loss": 0.7524013519287109, "rewards/accuracies": 1.0, "rewards/chosen": 0.9012832641601562, "rewards/margins": 3.006929397583008, "rewards/rejected": -2.1056461334228516, "step": 1501 }, { "epoch": 0.25033333333333335, "grad_norm": 29.518482208251953, "learning_rate": 1.7560065989814815e-07, "logits/chosen": 2.444978713989258, "logits/rejected": 2.599971055984497, "logps/chosen": -95.88052368164062, "logps/rejected": -347.8468322753906, "loss": 1.138, "nll_loss": 1.1148896217346191, "rewards/accuracies": 1.0, "rewards/chosen": 1.9937310218811035, "rewards/margins": 6.089504241943359, "rewards/rejected": -4.095773220062256, "step": 1502 }, { "epoch": 0.2505, "grad_norm": 26.936697006225586, "learning_rate": 1.7556531600886553e-07, "logits/chosen": 3.562107801437378, "logits/rejected": 3.5517525672912598, "logps/chosen": -57.480308532714844, "logps/rejected": -191.54856872558594, "loss": 0.7272, "nll_loss": 0.6925337910652161, "rewards/accuracies": 1.0, "rewards/chosen": 1.2279258966445923, "rewards/margins": 6.859407901763916, "rewards/rejected": -5.631482124328613, "step": 1503 }, { "epoch": 0.25066666666666665, "grad_norm": 137.00042724609375, "learning_rate": 1.75529950101664e-07, "logits/chosen": 2.9315381050109863, "logits/rejected": 2.790212631225586, "logps/chosen": -55.583404541015625, "logps/rejected": -72.87235260009766, "loss": 1.4901, "nll_loss": 0.8822761178016663, "rewards/accuracies": 1.0, "rewards/chosen": 0.6551556587219238, "rewards/margins": 0.5970585942268372, "rewards/rejected": 0.058097075670957565, "step": 1504 }, { "epoch": 0.25083333333333335, "grad_norm": 35.54199981689453, "learning_rate": 1.7549456218684832e-07, "logits/chosen": 2.0519654750823975, "logits/rejected": 2.253068447113037, "logps/chosen": -22.81201171875, "logps/rejected": -83.1305160522461, "loss": 0.5877, "nll_loss": 0.5184547901153564, "rewards/accuracies": 1.0, "rewards/chosen": 1.4430561065673828, "rewards/margins": 3.899815320968628, "rewards/rejected": -2.456759214401245, "step": 1505 }, { "epoch": 0.251, "grad_norm": 123.1107177734375, "learning_rate": 1.7545915227472964e-07, "logits/chosen": 2.73537015914917, "logits/rejected": 2.8176426887512207, "logps/chosen": -16.732152938842773, "logps/rejected": -142.65240478515625, "loss": 1.4556, "nll_loss": 1.3943458795547485, "rewards/accuracies": 1.0, "rewards/chosen": 1.0137022733688354, "rewards/margins": 4.354818820953369, "rewards/rejected": -3.3411166667938232, "step": 1506 }, { "epoch": 0.25116666666666665, "grad_norm": 34.05120086669922, "learning_rate": 1.754237203756256e-07, "logits/chosen": 1.083570957183838, "logits/rejected": 2.414484977722168, "logps/chosen": -27.397756576538086, "logps/rejected": -310.7855224609375, "loss": 0.8637, "nll_loss": 0.8561797738075256, "rewards/accuracies": 1.0, "rewards/chosen": 2.8595521450042725, "rewards/margins": 9.103515625, "rewards/rejected": -6.243963718414307, "step": 1507 }, { "epoch": 0.25133333333333335, "grad_norm": 36.15415573120117, "learning_rate": 1.7538826649986018e-07, "logits/chosen": 0.693132221698761, "logits/rejected": 3.266913890838623, "logps/chosen": -65.15921783447266, "logps/rejected": -61.789512634277344, "loss": 0.9422, "nll_loss": 0.8687896728515625, "rewards/accuracies": 1.0, "rewards/chosen": 1.2427955865859985, "rewards/margins": 3.814889430999756, "rewards/rejected": -2.572093963623047, "step": 1508 }, { "epoch": 0.2515, "grad_norm": 44.165184020996094, "learning_rate": 1.7535279065776378e-07, "logits/chosen": 2.176023483276367, "logits/rejected": 0.9044565558433533, "logps/chosen": -204.99053955078125, "logps/rejected": -57.783180236816406, "loss": 1.0141, "nll_loss": 0.915136456489563, "rewards/accuracies": 1.0, "rewards/chosen": 0.2116699367761612, "rewards/margins": 4.124994277954102, "rewards/rejected": -3.9133241176605225, "step": 1509 }, { "epoch": 0.25166666666666665, "grad_norm": 25.875776290893555, "learning_rate": 1.7531729285967328e-07, "logits/chosen": 2.3230435848236084, "logits/rejected": 2.1947309970855713, "logps/chosen": -81.48452758789062, "logps/rejected": -75.13986206054688, "loss": 0.8469, "nll_loss": 0.79886794090271, "rewards/accuracies": 1.0, "rewards/chosen": 2.7184901237487793, "rewards/margins": 4.897585868835449, "rewards/rejected": -2.179095506668091, "step": 1510 }, { "epoch": 0.25183333333333335, "grad_norm": 31.492761611938477, "learning_rate": 1.7528177311593183e-07, "logits/chosen": 2.95184326171875, "logits/rejected": 3.1731741428375244, "logps/chosen": -32.897056579589844, "logps/rejected": -261.1075744628906, "loss": 0.8106, "nll_loss": 0.7832634449005127, "rewards/accuracies": 1.0, "rewards/chosen": 1.5195931196212769, "rewards/margins": 6.821627140045166, "rewards/rejected": -5.3020339012146, "step": 1511 }, { "epoch": 0.252, "grad_norm": 30.09421157836914, "learning_rate": 1.75246231436889e-07, "logits/chosen": 2.415321111679077, "logits/rejected": 2.498281717300415, "logps/chosen": -20.836589813232422, "logps/rejected": -252.21774291992188, "loss": 0.5472, "nll_loss": 0.5342715978622437, "rewards/accuracies": 1.0, "rewards/chosen": 2.1982173919677734, "rewards/margins": 12.45684814453125, "rewards/rejected": -10.258630752563477, "step": 1512 }, { "epoch": 0.25216666666666665, "grad_norm": 98.29554748535156, "learning_rate": 1.7521066783290087e-07, "logits/chosen": 2.180853843688965, "logits/rejected": 2.4153409004211426, "logps/chosen": -83.14220428466797, "logps/rejected": -176.20059204101562, "loss": 1.3252, "nll_loss": 0.978143572807312, "rewards/accuracies": 1.0, "rewards/chosen": -0.42454075813293457, "rewards/margins": 1.4372161626815796, "rewards/rejected": -1.8617569208145142, "step": 1513 }, { "epoch": 0.25233333333333335, "grad_norm": 53.627464294433594, "learning_rate": 1.7517508231432973e-07, "logits/chosen": 3.4522149562835693, "logits/rejected": 3.298922300338745, "logps/chosen": -79.41231536865234, "logps/rejected": -30.468225479125977, "loss": 1.4057, "nll_loss": 1.28084397315979, "rewards/accuracies": 1.0, "rewards/chosen": 1.2224982976913452, "rewards/margins": 2.954007625579834, "rewards/rejected": -1.7315094470977783, "step": 1514 }, { "epoch": 0.2525, "grad_norm": 56.40516662597656, "learning_rate": 1.751394748915444e-07, "logits/chosen": 3.0542140007019043, "logits/rejected": 2.935272216796875, "logps/chosen": -42.25307846069336, "logps/rejected": -32.582157135009766, "loss": 1.2531, "nll_loss": 1.1119228601455688, "rewards/accuracies": 1.0, "rewards/chosen": 0.8464604020118713, "rewards/margins": 2.7342138290405273, "rewards/rejected": -1.8877534866333008, "step": 1515 }, { "epoch": 0.25266666666666665, "grad_norm": 57.734432220458984, "learning_rate": 1.7510384557491998e-07, "logits/chosen": 1.6171149015426636, "logits/rejected": 2.24577260017395, "logps/chosen": -25.52292251586914, "logps/rejected": -164.0013885498047, "loss": 1.0895, "nll_loss": 1.0209169387817383, "rewards/accuracies": 1.0, "rewards/chosen": 1.6340367794036865, "rewards/margins": 3.9317402839660645, "rewards/rejected": -2.297703504562378, "step": 1516 }, { "epoch": 0.25283333333333335, "grad_norm": 234.3533172607422, "learning_rate": 1.7506819437483808e-07, "logits/chosen": 2.902242660522461, "logits/rejected": 3.0774073600769043, "logps/chosen": -68.75198364257812, "logps/rejected": -65.1654281616211, "loss": 3.3959, "nll_loss": 0.7237051725387573, "rewards/accuracies": 0.0, "rewards/chosen": 1.697342872619629, "rewards/margins": -2.1174237728118896, "rewards/rejected": 3.8147666454315186, "step": 1517 }, { "epoch": 0.253, "grad_norm": 26.904924392700195, "learning_rate": 1.7503252130168655e-07, "logits/chosen": 1.8967832326889038, "logits/rejected": 3.0506701469421387, "logps/chosen": -23.919273376464844, "logps/rejected": -229.85264587402344, "loss": 0.4711, "nll_loss": 0.4271298944950104, "rewards/accuracies": 1.0, "rewards/chosen": 1.9386566877365112, "rewards/margins": 4.693855285644531, "rewards/rejected": -2.7551987171173096, "step": 1518 }, { "epoch": 0.25316666666666665, "grad_norm": 29.966533660888672, "learning_rate": 1.7499682636585967e-07, "logits/chosen": 2.577342987060547, "logits/rejected": 2.5544753074645996, "logps/chosen": -83.23431396484375, "logps/rejected": -218.8419952392578, "loss": 1.0444, "nll_loss": 1.0275843143463135, "rewards/accuracies": 1.0, "rewards/chosen": 1.9690109491348267, "rewards/margins": 8.46949577331543, "rewards/rejected": -6.500484466552734, "step": 1519 }, { "epoch": 0.25333333333333335, "grad_norm": 30.409332275390625, "learning_rate": 1.7496110957775808e-07, "logits/chosen": 2.4108049869537354, "logits/rejected": 2.198371648788452, "logps/chosen": -55.63555908203125, "logps/rejected": -46.84379577636719, "loss": 0.9038, "nll_loss": 0.8693056106567383, "rewards/accuracies": 1.0, "rewards/chosen": 1.8391830921173096, "rewards/margins": 5.158871650695801, "rewards/rejected": -3.3196887969970703, "step": 1520 }, { "epoch": 0.2535, "grad_norm": 31.022432327270508, "learning_rate": 1.749253709477888e-07, "logits/chosen": 2.4547793865203857, "logits/rejected": 2.390305280685425, "logps/chosen": -29.444652557373047, "logps/rejected": -39.61146926879883, "loss": 0.7239, "nll_loss": 0.6691966652870178, "rewards/accuracies": 1.0, "rewards/chosen": 1.9421314001083374, "rewards/margins": 4.348945140838623, "rewards/rejected": -2.406813621520996, "step": 1521 }, { "epoch": 0.25366666666666665, "grad_norm": 206.75827026367188, "learning_rate": 1.7488961048636517e-07, "logits/chosen": 3.1111514568328857, "logits/rejected": 3.15458607673645, "logps/chosen": -81.26394653320312, "logps/rejected": -71.88743591308594, "loss": 2.4991, "nll_loss": 0.8377727270126343, "rewards/accuracies": 0.0, "rewards/chosen": 1.0428192615509033, "rewards/margins": -1.0615754127502441, "rewards/rejected": 2.1043946743011475, "step": 1522 }, { "epoch": 0.25383333333333336, "grad_norm": 35.6155891418457, "learning_rate": 1.74853828203907e-07, "logits/chosen": 1.2324275970458984, "logits/rejected": 2.3826959133148193, "logps/chosen": -17.322738647460938, "logps/rejected": -260.602294921875, "loss": 0.654, "nll_loss": 0.6186692118644714, "rewards/accuracies": 1.0, "rewards/chosen": 1.5900812149047852, "rewards/margins": 5.285398483276367, "rewards/rejected": -3.695317268371582, "step": 1523 }, { "epoch": 0.254, "grad_norm": 39.0693244934082, "learning_rate": 1.7481802411084038e-07, "logits/chosen": 2.6162469387054443, "logits/rejected": 2.666038751602173, "logps/chosen": -60.45167541503906, "logps/rejected": -79.99596405029297, "loss": 0.8558, "nll_loss": 0.7652111053466797, "rewards/accuracies": 1.0, "rewards/chosen": 1.526009440422058, "rewards/margins": 3.4884445667266846, "rewards/rejected": -1.9624351263046265, "step": 1524 }, { "epoch": 0.25416666666666665, "grad_norm": 42.24835968017578, "learning_rate": 1.7478219821759775e-07, "logits/chosen": 3.523211717605591, "logits/rejected": 3.583800792694092, "logps/chosen": -44.492393493652344, "logps/rejected": -67.41123962402344, "loss": 0.9466, "nll_loss": 0.8556228876113892, "rewards/accuracies": 1.0, "rewards/chosen": 1.7053825855255127, "rewards/margins": 3.5337562561035156, "rewards/rejected": -1.8283737897872925, "step": 1525 }, { "epoch": 0.25433333333333336, "grad_norm": 29.781078338623047, "learning_rate": 1.747463505346179e-07, "logits/chosen": 2.467454433441162, "logits/rejected": 2.50311541557312, "logps/chosen": -69.60699462890625, "logps/rejected": -157.6766815185547, "loss": 0.9081, "nll_loss": 0.8923973441123962, "rewards/accuracies": 1.0, "rewards/chosen": 2.285754442214966, "rewards/margins": 6.965846061706543, "rewards/rejected": -4.680091381072998, "step": 1526 }, { "epoch": 0.2545, "grad_norm": 32.91801452636719, "learning_rate": 1.7471048107234596e-07, "logits/chosen": 2.0029642581939697, "logits/rejected": 2.124898910522461, "logps/chosen": -48.921390533447266, "logps/rejected": -135.78720092773438, "loss": 0.8278, "nll_loss": 0.7643967866897583, "rewards/accuracies": 1.0, "rewards/chosen": 0.6878670454025269, "rewards/margins": 4.940366744995117, "rewards/rejected": -4.252499580383301, "step": 1527 }, { "epoch": 0.25466666666666665, "grad_norm": 39.501853942871094, "learning_rate": 1.7467458984123348e-07, "logits/chosen": 1.6149024963378906, "logits/rejected": 2.178830623626709, "logps/chosen": -14.409950256347656, "logps/rejected": -114.47401428222656, "loss": 0.5806, "nll_loss": 0.5542287826538086, "rewards/accuracies": 1.0, "rewards/chosen": 1.771215796470642, "rewards/margins": 5.9819793701171875, "rewards/rejected": -4.210763454437256, "step": 1528 }, { "epoch": 0.25483333333333336, "grad_norm": 28.98074722290039, "learning_rate": 1.7463867685173832e-07, "logits/chosen": 1.643312931060791, "logits/rejected": 1.95622718334198, "logps/chosen": -15.370038986206055, "logps/rejected": -119.03325653076172, "loss": 0.4918, "nll_loss": 0.4520599842071533, "rewards/accuracies": 1.0, "rewards/chosen": 1.2669109106063843, "rewards/margins": 5.459198474884033, "rewards/rejected": -4.192287445068359, "step": 1529 }, { "epoch": 0.255, "grad_norm": 26.153034210205078, "learning_rate": 1.746027421143246e-07, "logits/chosen": 2.16351318359375, "logits/rejected": 2.4895541667938232, "logps/chosen": -62.6319580078125, "logps/rejected": -277.16436767578125, "loss": 0.9256, "nll_loss": 0.907709538936615, "rewards/accuracies": 1.0, "rewards/chosen": 1.9008049964904785, "rewards/margins": 8.467063903808594, "rewards/rejected": -6.566258430480957, "step": 1530 }, { "epoch": 0.25516666666666665, "grad_norm": 86.00808715820312, "learning_rate": 1.7456678563946286e-07, "logits/chosen": 2.3236095905303955, "logits/rejected": 2.7557101249694824, "logps/chosen": -41.45200729370117, "logps/rejected": -343.91009521484375, "loss": 2.5768, "nll_loss": 2.4383535385131836, "rewards/accuracies": 1.0, "rewards/chosen": -0.06398315727710724, "rewards/margins": 3.3200318813323975, "rewards/rejected": -3.3840150833129883, "step": 1531 }, { "epoch": 0.25533333333333336, "grad_norm": 99.22260284423828, "learning_rate": 1.7453080743762998e-07, "logits/chosen": 2.8409297466278076, "logits/rejected": 2.911970376968384, "logps/chosen": -34.1169319152832, "logps/rejected": -34.13722229003906, "loss": 2.1833, "nll_loss": 2.00687837600708, "rewards/accuracies": 1.0, "rewards/chosen": 0.1995723694562912, "rewards/margins": 2.4732840061187744, "rewards/rejected": -2.273711681365967, "step": 1532 }, { "epoch": 0.2555, "grad_norm": 88.26714324951172, "learning_rate": 1.7449480751930911e-07, "logits/chosen": 3.2869606018066406, "logits/rejected": 3.060455083847046, "logps/chosen": -147.50485229492188, "logps/rejected": -23.677515029907227, "loss": 1.6998, "nll_loss": 1.3532553911209106, "rewards/accuracies": 1.0, "rewards/chosen": 0.4231460690498352, "rewards/margins": 1.3722695112228394, "rewards/rejected": -0.9491234421730042, "step": 1533 }, { "epoch": 0.25566666666666665, "grad_norm": 58.293418884277344, "learning_rate": 1.7445878589498978e-07, "logits/chosen": 2.249566078186035, "logits/rejected": 2.571049928665161, "logps/chosen": -54.20318603515625, "logps/rejected": -353.1258850097656, "loss": 1.8559, "nll_loss": 1.8067728281021118, "rewards/accuracies": 1.0, "rewards/chosen": 0.8626556396484375, "rewards/margins": 5.992359161376953, "rewards/rejected": -5.129703521728516, "step": 1534 }, { "epoch": 0.25583333333333336, "grad_norm": 376.3248291015625, "learning_rate": 1.7442274257516781e-07, "logits/chosen": 1.53050696849823, "logits/rejected": 3.2091968059539795, "logps/chosen": -51.760650634765625, "logps/rejected": -81.7778091430664, "loss": 1.1697, "nll_loss": 0.9953970313072205, "rewards/accuracies": 1.0, "rewards/chosen": -0.18533097207546234, "rewards/margins": 2.766103982925415, "rewards/rejected": -2.951434850692749, "step": 1535 }, { "epoch": 0.256, "grad_norm": 39.63248062133789, "learning_rate": 1.7438667757034543e-07, "logits/chosen": 2.458134174346924, "logits/rejected": 2.5553975105285645, "logps/chosen": -79.97311401367188, "logps/rejected": -157.93576049804688, "loss": 1.1302, "nll_loss": 1.0807175636291504, "rewards/accuracies": 1.0, "rewards/chosen": 0.7854682803153992, "rewards/margins": 7.238651275634766, "rewards/rejected": -6.453183174133301, "step": 1536 }, { "epoch": 0.25616666666666665, "grad_norm": 38.937808990478516, "learning_rate": 1.7435059089103103e-07, "logits/chosen": 2.3322317600250244, "logits/rejected": 1.3614405393600464, "logps/chosen": -81.10509490966797, "logps/rejected": -52.03384780883789, "loss": 0.9443, "nll_loss": 0.8192433714866638, "rewards/accuracies": 1.0, "rewards/chosen": 1.5418792963027954, "rewards/margins": 3.038344621658325, "rewards/rejected": -1.4964653253555298, "step": 1537 }, { "epoch": 0.25633333333333336, "grad_norm": 30.83228302001953, "learning_rate": 1.7431448254773942e-07, "logits/chosen": 1.6635841131210327, "logits/rejected": 2.0257408618927, "logps/chosen": -83.76022338867188, "logps/rejected": -140.24951171875, "loss": 0.8173, "nll_loss": 0.7684423923492432, "rewards/accuracies": 1.0, "rewards/chosen": 1.02093505859375, "rewards/margins": 5.16433048248291, "rewards/rejected": -4.14339542388916, "step": 1538 }, { "epoch": 0.2565, "grad_norm": 57.803897857666016, "learning_rate": 1.742783525509917e-07, "logits/chosen": 2.8739287853240967, "logits/rejected": 3.0893301963806152, "logps/chosen": -37.220455169677734, "logps/rejected": -172.646484375, "loss": 1.3378, "nll_loss": 1.3293018341064453, "rewards/accuracies": 1.0, "rewards/chosen": 2.7104344367980957, "rewards/margins": 9.103240013122559, "rewards/rejected": -6.392805576324463, "step": 1539 }, { "epoch": 0.25666666666666665, "grad_norm": 23.602092742919922, "learning_rate": 1.7424220091131535e-07, "logits/chosen": 2.571392059326172, "logits/rejected": 2.9224660396575928, "logps/chosen": -33.486515045166016, "logps/rejected": -308.776611328125, "loss": 0.4855, "nll_loss": 0.47164106369018555, "rewards/accuracies": 1.0, "rewards/chosen": 2.137134552001953, "rewards/margins": 9.864503860473633, "rewards/rejected": -7.727369785308838, "step": 1540 }, { "epoch": 0.25683333333333336, "grad_norm": 42.01734161376953, "learning_rate": 1.74206027639244e-07, "logits/chosen": 1.2120590209960938, "logits/rejected": 2.9503884315490723, "logps/chosen": -16.060100555419922, "logps/rejected": -460.0010681152344, "loss": 0.5788, "nll_loss": 0.48666977882385254, "rewards/accuracies": 1.0, "rewards/chosen": 0.8040432929992676, "rewards/margins": 3.5112698078155518, "rewards/rejected": -2.707226514816284, "step": 1541 }, { "epoch": 0.257, "grad_norm": 30.960466384887695, "learning_rate": 1.7416983274531774e-07, "logits/chosen": 2.705429792404175, "logits/rejected": 2.9494576454162598, "logps/chosen": -79.70384216308594, "logps/rejected": -256.30181884765625, "loss": 0.7669, "nll_loss": 0.6991564035415649, "rewards/accuracies": 1.0, "rewards/chosen": 0.40040743350982666, "rewards/margins": 7.963340759277344, "rewards/rejected": -7.562933444976807, "step": 1542 }, { "epoch": 0.25716666666666665, "grad_norm": 219.5398406982422, "learning_rate": 1.7413361624008283e-07, "logits/chosen": 2.9147887229919434, "logits/rejected": 2.750241756439209, "logps/chosen": -72.6602783203125, "logps/rejected": -42.90288543701172, "loss": 3.1755, "nll_loss": 0.8754250407218933, "rewards/accuracies": 0.0, "rewards/chosen": 1.6658029556274414, "rewards/margins": -1.704958438873291, "rewards/rejected": 3.3707613945007324, "step": 1543 }, { "epoch": 0.25733333333333336, "grad_norm": 367.21234130859375, "learning_rate": 1.7409737813409193e-07, "logits/chosen": 2.5463361740112305, "logits/rejected": 2.5392837524414062, "logps/chosen": -216.00965881347656, "logps/rejected": -120.5575180053711, "loss": 3.8826, "nll_loss": 2.4828696250915527, "rewards/accuracies": 0.0, "rewards/chosen": -3.339766025543213, "rewards/margins": -0.5490293502807617, "rewards/rejected": -2.790736675262451, "step": 1544 }, { "epoch": 0.2575, "grad_norm": 96.61102294921875, "learning_rate": 1.7406111843790398e-07, "logits/chosen": 2.599475622177124, "logits/rejected": 2.8252601623535156, "logps/chosen": -25.804222106933594, "logps/rejected": -170.14657592773438, "loss": 0.5812, "nll_loss": 0.5609613656997681, "rewards/accuracies": 1.0, "rewards/chosen": 2.146916151046753, "rewards/margins": 6.290918350219727, "rewards/rejected": -4.1440019607543945, "step": 1545 }, { "epoch": 0.25766666666666665, "grad_norm": 34.3204460144043, "learning_rate": 1.7402483716208414e-07, "logits/chosen": 2.300778388977051, "logits/rejected": 2.5466392040252686, "logps/chosen": -63.89398956298828, "logps/rejected": -474.05914306640625, "loss": 0.9116, "nll_loss": 0.8634322285652161, "rewards/accuracies": 1.0, "rewards/chosen": 0.8034111261367798, "rewards/margins": 7.478942394256592, "rewards/rejected": -6.675531387329102, "step": 1546 }, { "epoch": 0.25783333333333336, "grad_norm": 33.78910827636719, "learning_rate": 1.739885343172039e-07, "logits/chosen": 2.6976816654205322, "logits/rejected": 2.6976239681243896, "logps/chosen": -27.314313888549805, "logps/rejected": -35.513389587402344, "loss": 0.5946, "nll_loss": 0.4966239929199219, "rewards/accuracies": 1.0, "rewards/chosen": 1.4158976078033447, "rewards/margins": 3.350925922393799, "rewards/rejected": -1.935028314590454, "step": 1547 }, { "epoch": 0.258, "grad_norm": 26.664098739624023, "learning_rate": 1.7395220991384107e-07, "logits/chosen": 2.089773178100586, "logits/rejected": 2.380946636199951, "logps/chosen": -115.87757873535156, "logps/rejected": -222.1837921142578, "loss": 1.1032, "nll_loss": 1.0829681158065796, "rewards/accuracies": 1.0, "rewards/chosen": 1.9055893421173096, "rewards/margins": 6.842096328735352, "rewards/rejected": -4.936506748199463, "step": 1548 }, { "epoch": 0.25816666666666666, "grad_norm": 102.1990966796875, "learning_rate": 1.739158639625797e-07, "logits/chosen": 2.778752088546753, "logits/rejected": 2.6838302612304688, "logps/chosen": -30.60706901550293, "logps/rejected": -26.90641212463379, "loss": 0.803, "nll_loss": 0.6956151723861694, "rewards/accuracies": 1.0, "rewards/chosen": 0.7621168494224548, "rewards/margins": 3.2176196575164795, "rewards/rejected": -2.45550274848938, "step": 1549 }, { "epoch": 0.25833333333333336, "grad_norm": 212.6658477783203, "learning_rate": 1.738794964740101e-07, "logits/chosen": 2.6183855533599854, "logits/rejected": 2.6779656410217285, "logps/chosen": -143.0611572265625, "logps/rejected": -89.89456176757812, "loss": 2.7236, "nll_loss": 1.8341176509857178, "rewards/accuracies": 1.0, "rewards/chosen": -2.712001323699951, "rewards/margins": 0.38502001762390137, "rewards/rejected": -3.0970213413238525, "step": 1550 }, { "epoch": 0.2585, "grad_norm": 66.05730438232422, "learning_rate": 1.7384310745872895e-07, "logits/chosen": 2.56795072555542, "logits/rejected": 2.576262950897217, "logps/chosen": -40.185237884521484, "logps/rejected": -154.33035278320312, "loss": 0.7742, "nll_loss": 0.7175936698913574, "rewards/accuracies": 1.0, "rewards/chosen": 1.9931416511535645, "rewards/margins": 4.318787097930908, "rewards/rejected": -2.3256454467773438, "step": 1551 }, { "epoch": 0.25866666666666666, "grad_norm": 36.01359558105469, "learning_rate": 1.7380669692733903e-07, "logits/chosen": 2.5110230445861816, "logits/rejected": 2.742286443710327, "logps/chosen": -45.59969711303711, "logps/rejected": -111.50062561035156, "loss": 0.9769, "nll_loss": 0.9499935507774353, "rewards/accuracies": 1.0, "rewards/chosen": 1.6109699010849, "rewards/margins": 6.3081769943237305, "rewards/rejected": -4.697206974029541, "step": 1552 }, { "epoch": 0.25883333333333336, "grad_norm": 246.09573364257812, "learning_rate": 1.7377026489044958e-07, "logits/chosen": 2.8244988918304443, "logits/rejected": 2.704115390777588, "logps/chosen": -224.9703369140625, "logps/rejected": -93.72544860839844, "loss": 1.8937, "nll_loss": 0.9614114165306091, "rewards/accuracies": 0.0, "rewards/chosen": 0.5218841433525085, "rewards/margins": -0.08697283267974854, "rewards/rejected": 0.6088569760322571, "step": 1553 }, { "epoch": 0.259, "grad_norm": 87.5410385131836, "learning_rate": 1.73733811358676e-07, "logits/chosen": 2.6668589115142822, "logits/rejected": 2.7228009700775146, "logps/chosen": -73.94091033935547, "logps/rejected": -46.518768310546875, "loss": 1.2947, "nll_loss": 0.7544991374015808, "rewards/accuracies": 1.0, "rewards/chosen": 1.7374695539474487, "rewards/margins": 1.0873513221740723, "rewards/rejected": 0.6501182913780212, "step": 1554 }, { "epoch": 0.25916666666666666, "grad_norm": 21.45336151123047, "learning_rate": 1.7369733634264e-07, "logits/chosen": 2.970181703567505, "logits/rejected": 3.16027569770813, "logps/chosen": -70.8519287109375, "logps/rejected": -244.609619140625, "loss": 0.7201, "nll_loss": 0.7085192203521729, "rewards/accuracies": 1.0, "rewards/chosen": 2.3628158569335938, "rewards/margins": 8.82033920288086, "rewards/rejected": -6.457523345947266, "step": 1555 }, { "epoch": 0.25933333333333336, "grad_norm": 38.68830490112305, "learning_rate": 1.7366083985296944e-07, "logits/chosen": 2.8538119792938232, "logits/rejected": 2.914013385772705, "logps/chosen": -25.004121780395508, "logps/rejected": -157.3468780517578, "loss": 0.7417, "nll_loss": 0.6945587992668152, "rewards/accuracies": 1.0, "rewards/chosen": 1.05946946144104, "rewards/margins": 5.203669548034668, "rewards/rejected": -4.144200325012207, "step": 1556 }, { "epoch": 0.2595, "grad_norm": 108.87435150146484, "learning_rate": 1.7362432190029858e-07, "logits/chosen": 2.97796630859375, "logits/rejected": 2.9261014461517334, "logps/chosen": -2.933687686920166, "logps/rejected": -67.63892364501953, "loss": 0.2757, "nll_loss": 0.0553525947034359, "rewards/accuracies": 1.0, "rewards/chosen": 0.540874719619751, "rewards/margins": 2.0446550846099854, "rewards/rejected": -1.5037803649902344, "step": 1557 }, { "epoch": 0.25966666666666666, "grad_norm": 47.60789489746094, "learning_rate": 1.7358778249526788e-07, "logits/chosen": 2.541969060897827, "logits/rejected": 2.4463281631469727, "logps/chosen": -43.677635192871094, "logps/rejected": -97.0908203125, "loss": 1.1952, "nll_loss": 1.1494115591049194, "rewards/accuracies": 1.0, "rewards/chosen": 1.0407050848007202, "rewards/margins": 5.423757553100586, "rewards/rejected": -4.383052349090576, "step": 1558 }, { "epoch": 0.25983333333333336, "grad_norm": 61.29338836669922, "learning_rate": 1.7355122164852408e-07, "logits/chosen": 1.8036726713180542, "logits/rejected": 1.694716453552246, "logps/chosen": -66.57778930664062, "logps/rejected": -41.94171905517578, "loss": 1.0822, "nll_loss": 0.9511112570762634, "rewards/accuracies": 1.0, "rewards/chosen": 0.35285264253616333, "rewards/margins": 3.001687526702881, "rewards/rejected": -2.6488349437713623, "step": 1559 }, { "epoch": 0.26, "grad_norm": 40.19255828857422, "learning_rate": 1.7351463937072005e-07, "logits/chosen": 2.478639602661133, "logits/rejected": 2.8593125343322754, "logps/chosen": -72.31967163085938, "logps/rejected": -136.03363037109375, "loss": 1.1992, "nll_loss": 1.1299949884414673, "rewards/accuracies": 1.0, "rewards/chosen": 1.6040420532226562, "rewards/margins": 3.9151253700256348, "rewards/rejected": -2.3110833168029785, "step": 1560 }, { "epoch": 0.26016666666666666, "grad_norm": 73.04706573486328, "learning_rate": 1.73478035672515e-07, "logits/chosen": 1.5028647184371948, "logits/rejected": 2.0682032108306885, "logps/chosen": -29.91227149963379, "logps/rejected": -78.26571655273438, "loss": 1.2797, "nll_loss": 1.0682954788208008, "rewards/accuracies": 1.0, "rewards/chosen": 1.3464080095291138, "rewards/margins": 2.2597227096557617, "rewards/rejected": -0.9133148193359375, "step": 1561 }, { "epoch": 0.26033333333333336, "grad_norm": 34.272308349609375, "learning_rate": 1.734414105645744e-07, "logits/chosen": 2.749263286590576, "logits/rejected": 3.0981838703155518, "logps/chosen": -20.056644439697266, "logps/rejected": -569.223388671875, "loss": 0.5801, "nll_loss": 0.55712890625, "rewards/accuracies": 1.0, "rewards/chosen": 1.6725705862045288, "rewards/margins": 7.2288336753845215, "rewards/rejected": -5.556262969970703, "step": 1562 }, { "epoch": 0.2605, "grad_norm": 54.12916564941406, "learning_rate": 1.7340476405756997e-07, "logits/chosen": 2.6513023376464844, "logits/rejected": 2.6728804111480713, "logps/chosen": -43.028934478759766, "logps/rejected": -102.75682067871094, "loss": 1.0112, "nll_loss": 0.8964362144470215, "rewards/accuracies": 1.0, "rewards/chosen": 0.016267014667391777, "rewards/margins": 3.900467872619629, "rewards/rejected": -3.8842008113861084, "step": 1563 }, { "epoch": 0.26066666666666666, "grad_norm": 42.77461624145508, "learning_rate": 1.7336809616217954e-07, "logits/chosen": 2.536433219909668, "logits/rejected": 2.8822247982025146, "logps/chosen": -78.16751098632812, "logps/rejected": -153.1272430419922, "loss": 1.2062, "nll_loss": 1.1166785955429077, "rewards/accuracies": 1.0, "rewards/chosen": 0.505139946937561, "rewards/margins": 3.8511223793029785, "rewards/rejected": -3.345982551574707, "step": 1564 }, { "epoch": 0.2608333333333333, "grad_norm": 45.375946044921875, "learning_rate": 1.7333140688908728e-07, "logits/chosen": 1.2081931829452515, "logits/rejected": 2.876904010772705, "logps/chosen": -31.963899612426758, "logps/rejected": -452.08001708984375, "loss": 1.154, "nll_loss": 1.1415678262710571, "rewards/accuracies": 1.0, "rewards/chosen": 2.2688615322113037, "rewards/margins": 8.97221565246582, "rewards/rejected": -6.7033538818359375, "step": 1565 }, { "epoch": 0.261, "grad_norm": 101.8311538696289, "learning_rate": 1.732946962489836e-07, "logits/chosen": 3.00382399559021, "logits/rejected": 2.8707966804504395, "logps/chosen": -52.630191802978516, "logps/rejected": -40.94386291503906, "loss": 1.9116, "nll_loss": 1.4619497060775757, "rewards/accuracies": 1.0, "rewards/chosen": -0.7581948041915894, "rewards/margins": 1.0604697465896606, "rewards/rejected": -1.81866455078125, "step": 1566 }, { "epoch": 0.26116666666666666, "grad_norm": 105.46578979492188, "learning_rate": 1.7325796425256503e-07, "logits/chosen": 2.6892435550689697, "logits/rejected": 2.772578477859497, "logps/chosen": -53.689842224121094, "logps/rejected": -59.21283721923828, "loss": 2.5495, "nll_loss": 1.988512635231018, "rewards/accuracies": 1.0, "rewards/chosen": -1.889814019203186, "rewards/margins": 1.1164747476577759, "rewards/rejected": -3.006288766860962, "step": 1567 }, { "epoch": 0.2613333333333333, "grad_norm": 95.34471893310547, "learning_rate": 1.7322121091053446e-07, "logits/chosen": 3.153548240661621, "logits/rejected": 3.415361166000366, "logps/chosen": -45.048561096191406, "logps/rejected": -180.38064575195312, "loss": 1.8323, "nll_loss": 1.7326369285583496, "rewards/accuracies": 1.0, "rewards/chosen": 0.01616363599896431, "rewards/margins": 4.951379776000977, "rewards/rejected": -4.935215950012207, "step": 1568 }, { "epoch": 0.2615, "grad_norm": 24.709718704223633, "learning_rate": 1.7318443623360088e-07, "logits/chosen": 3.8979241847991943, "logits/rejected": 3.8902783393859863, "logps/chosen": -63.23050308227539, "logps/rejected": -158.54275512695312, "loss": 0.6426, "nll_loss": 0.5909392833709717, "rewards/accuracies": 1.0, "rewards/chosen": 1.049618124961853, "rewards/margins": 4.815147399902344, "rewards/rejected": -3.765529155731201, "step": 1569 }, { "epoch": 0.26166666666666666, "grad_norm": 57.19196701049805, "learning_rate": 1.731476402324796e-07, "logits/chosen": 2.017469882965088, "logits/rejected": 2.0850377082824707, "logps/chosen": -38.28696823120117, "logps/rejected": -32.24940872192383, "loss": 1.1626, "nll_loss": 0.9817172288894653, "rewards/accuracies": 1.0, "rewards/chosen": 0.5968868136405945, "rewards/margins": 2.3473095893859863, "rewards/rejected": -1.750422716140747, "step": 1570 }, { "epoch": 0.2618333333333333, "grad_norm": 40.1201057434082, "learning_rate": 1.7311082291789205e-07, "logits/chosen": 2.833850860595703, "logits/rejected": 2.6887400150299072, "logps/chosen": -146.5735321044922, "logps/rejected": -31.006587982177734, "loss": 1.4035, "nll_loss": 1.3204823732376099, "rewards/accuracies": 1.0, "rewards/chosen": 1.4421356916427612, "rewards/margins": 3.6081156730651855, "rewards/rejected": -2.165980100631714, "step": 1571 }, { "epoch": 0.262, "grad_norm": 46.113319396972656, "learning_rate": 1.7307398430056594e-07, "logits/chosen": 2.434483766555786, "logits/rejected": 2.4051966667175293, "logps/chosen": -7.689373970031738, "logps/rejected": -23.844606399536133, "loss": 0.4797, "nll_loss": 0.36616066098213196, "rewards/accuracies": 1.0, "rewards/chosen": 1.1482058763504028, "rewards/margins": 3.088772773742676, "rewards/rejected": -1.9405670166015625, "step": 1572 }, { "epoch": 0.26216666666666666, "grad_norm": 37.4476203918457, "learning_rate": 1.7303712439123514e-07, "logits/chosen": 2.4235339164733887, "logits/rejected": 2.643496036529541, "logps/chosen": -50.59080505371094, "logps/rejected": -335.6376647949219, "loss": 1.171, "nll_loss": 1.1497910022735596, "rewards/accuracies": 1.0, "rewards/chosen": 1.6879372596740723, "rewards/margins": 8.467079162597656, "rewards/rejected": -6.779141426086426, "step": 1573 }, { "epoch": 0.2623333333333333, "grad_norm": 73.39949798583984, "learning_rate": 1.7300024320063972e-07, "logits/chosen": 2.0313689708709717, "logits/rejected": 1.67329740524292, "logps/chosen": -119.3354721069336, "logps/rejected": -101.91157531738281, "loss": 1.5248, "nll_loss": 1.3716720342636108, "rewards/accuracies": 1.0, "rewards/chosen": 0.061855316162109375, "rewards/margins": 2.848005771636963, "rewards/rejected": -2.7861504554748535, "step": 1574 }, { "epoch": 0.2625, "grad_norm": 65.98209381103516, "learning_rate": 1.7296334073952604e-07, "logits/chosen": 2.5984394550323486, "logits/rejected": 2.770256757736206, "logps/chosen": -254.77980041503906, "logps/rejected": -307.666259765625, "loss": 1.1468, "nll_loss": 1.0070347785949707, "rewards/accuracies": 1.0, "rewards/chosen": -0.5331360101699829, "rewards/margins": 6.882803440093994, "rewards/rejected": -7.4159393310546875, "step": 1575 }, { "epoch": 0.26266666666666666, "grad_norm": 50.099266052246094, "learning_rate": 1.729264170186465e-07, "logits/chosen": 3.4258763790130615, "logits/rejected": 3.3500120639801025, "logps/chosen": -28.471588134765625, "logps/rejected": -65.67913055419922, "loss": 0.7886, "nll_loss": 0.5475307106971741, "rewards/accuracies": 1.0, "rewards/chosen": 1.796961784362793, "rewards/margins": 2.2737226486206055, "rewards/rejected": -0.4767608642578125, "step": 1576 }, { "epoch": 0.2628333333333333, "grad_norm": 33.25566482543945, "learning_rate": 1.7288947204875987e-07, "logits/chosen": 2.8015244007110596, "logits/rejected": 2.894929885864258, "logps/chosen": -32.49338150024414, "logps/rejected": -152.26348876953125, "loss": 0.6637, "nll_loss": 0.6498675346374512, "rewards/accuracies": 1.0, "rewards/chosen": 2.2289021015167236, "rewards/margins": 7.807056427001953, "rewards/rejected": -5.578154563903809, "step": 1577 }, { "epoch": 0.263, "grad_norm": 22.92774200439453, "learning_rate": 1.72852505840631e-07, "logits/chosen": 2.5131664276123047, "logits/rejected": 2.6447877883911133, "logps/chosen": -33.523616790771484, "logps/rejected": -244.07569885253906, "loss": 0.5358, "nll_loss": 0.5238065123558044, "rewards/accuracies": 1.0, "rewards/chosen": 2.3033337593078613, "rewards/margins": 8.874940872192383, "rewards/rejected": -6.5716071128845215, "step": 1578 }, { "epoch": 0.26316666666666666, "grad_norm": 32.85111999511719, "learning_rate": 1.7281551840503093e-07, "logits/chosen": 2.934800863265991, "logits/rejected": 2.9271419048309326, "logps/chosen": -64.14481353759766, "logps/rejected": -44.73851013183594, "loss": 0.9509, "nll_loss": 0.8552641868591309, "rewards/accuracies": 1.0, "rewards/chosen": 1.656217336654663, "rewards/margins": 3.456601619720459, "rewards/rejected": -1.800384283065796, "step": 1579 }, { "epoch": 0.2633333333333333, "grad_norm": 37.283447265625, "learning_rate": 1.7277850975273693e-07, "logits/chosen": 1.9455420970916748, "logits/rejected": 2.2171003818511963, "logps/chosen": -104.04072570800781, "logps/rejected": -152.2311248779297, "loss": 0.9994, "nll_loss": 0.9126378297805786, "rewards/accuracies": 1.0, "rewards/chosen": 1.6273177862167358, "rewards/margins": 3.5864944458007812, "rewards/rejected": -1.9591766595840454, "step": 1580 }, { "epoch": 0.2635, "grad_norm": 42.76715087890625, "learning_rate": 1.7274147989453245e-07, "logits/chosen": 3.3982763290405273, "logits/rejected": 3.483753204345703, "logps/chosen": -112.43120574951172, "logps/rejected": -120.96580505371094, "loss": 1.1484, "nll_loss": 1.0810691118240356, "rewards/accuracies": 1.0, "rewards/chosen": 0.8993453979492188, "rewards/margins": 4.18232536315918, "rewards/rejected": -3.282979726791382, "step": 1581 }, { "epoch": 0.26366666666666666, "grad_norm": 54.8547248840332, "learning_rate": 1.7270442884120708e-07, "logits/chosen": 2.2258853912353516, "logits/rejected": 2.729595422744751, "logps/chosen": -22.134902954101562, "logps/rejected": -62.90169906616211, "loss": 0.8761, "nll_loss": 0.8513423204421997, "rewards/accuracies": 1.0, "rewards/chosen": 1.9178972244262695, "rewards/margins": 5.923717021942139, "rewards/rejected": -4.005819797515869, "step": 1582 }, { "epoch": 0.2638333333333333, "grad_norm": 33.90567398071289, "learning_rate": 1.726673566035566e-07, "logits/chosen": 3.379107713699341, "logits/rejected": 3.4693443775177, "logps/chosen": -34.07563781738281, "logps/rejected": -320.9427185058594, "loss": 0.6425, "nll_loss": 0.59781813621521, "rewards/accuracies": 1.0, "rewards/chosen": 0.9249370694160461, "rewards/margins": 6.340061664581299, "rewards/rejected": -5.415124416351318, "step": 1583 }, { "epoch": 0.264, "grad_norm": 108.73430633544922, "learning_rate": 1.7263026319238297e-07, "logits/chosen": 2.337693214416504, "logits/rejected": 2.4143102169036865, "logps/chosen": -25.842782974243164, "logps/rejected": -69.11582946777344, "loss": 1.2645, "nll_loss": 0.6153044700622559, "rewards/accuracies": 1.0, "rewards/chosen": 1.2257028818130493, "rewards/margins": 0.6439386606216431, "rewards/rejected": 0.5817642211914062, "step": 1584 }, { "epoch": 0.26416666666666666, "grad_norm": 74.19392395019531, "learning_rate": 1.7259314861849438e-07, "logits/chosen": 1.4917875528335571, "logits/rejected": 2.015803813934326, "logps/chosen": -40.355899810791016, "logps/rejected": -102.16029357910156, "loss": 1.7943, "nll_loss": 1.7546042203903198, "rewards/accuracies": 1.0, "rewards/chosen": 1.6633529663085938, "rewards/margins": 4.917546272277832, "rewards/rejected": -3.254193067550659, "step": 1585 }, { "epoch": 0.2643333333333333, "grad_norm": 214.51443481445312, "learning_rate": 1.7255601289270506e-07, "logits/chosen": 2.408121109008789, "logits/rejected": 2.743741273880005, "logps/chosen": -157.94906616210938, "logps/rejected": -281.7755126953125, "loss": 2.1306, "nll_loss": 2.0249879360198975, "rewards/accuracies": 1.0, "rewards/chosen": -0.09331970661878586, "rewards/margins": 5.174250602722168, "rewards/rejected": -5.267570495605469, "step": 1586 }, { "epoch": 0.2645, "grad_norm": 33.493408203125, "learning_rate": 1.7251885602583545e-07, "logits/chosen": 2.436774253845215, "logits/rejected": 2.529710531234741, "logps/chosen": -104.83235168457031, "logps/rejected": -41.50178146362305, "loss": 1.3994, "nll_loss": 1.3614591360092163, "rewards/accuracies": 1.0, "rewards/chosen": 1.5187835693359375, "rewards/margins": 5.121980667114258, "rewards/rejected": -3.6031970977783203, "step": 1587 }, { "epoch": 0.26466666666666666, "grad_norm": 35.046810150146484, "learning_rate": 1.7248167802871224e-07, "logits/chosen": 2.7759358882904053, "logits/rejected": 2.852849245071411, "logps/chosen": -71.39299011230469, "logps/rejected": -139.0974578857422, "loss": 1.0266, "nll_loss": 0.9271817803382874, "rewards/accuracies": 1.0, "rewards/chosen": 1.0151817798614502, "rewards/margins": 3.301302433013916, "rewards/rejected": -2.286120653152466, "step": 1588 }, { "epoch": 0.2648333333333333, "grad_norm": 75.34310150146484, "learning_rate": 1.7244447891216816e-07, "logits/chosen": 2.3706929683685303, "logits/rejected": 2.6372129917144775, "logps/chosen": -61.71209716796875, "logps/rejected": -278.770751953125, "loss": 1.2634, "nll_loss": 1.0826683044433594, "rewards/accuracies": 1.0, "rewards/chosen": 1.1486237049102783, "rewards/margins": 2.41766357421875, "rewards/rejected": -1.2690399885177612, "step": 1589 }, { "epoch": 0.265, "grad_norm": 33.730655670166016, "learning_rate": 1.7240725868704216e-07, "logits/chosen": 2.208594560623169, "logits/rejected": 2.3361692428588867, "logps/chosen": -74.4352798461914, "logps/rejected": -154.7026824951172, "loss": 1.0173, "nll_loss": 0.9794116020202637, "rewards/accuracies": 1.0, "rewards/chosen": 1.3549667596817017, "rewards/margins": 5.3769121170043945, "rewards/rejected": -4.021945476531982, "step": 1590 }, { "epoch": 0.26516666666666666, "grad_norm": 48.32780838012695, "learning_rate": 1.7237001736417928e-07, "logits/chosen": 2.5990734100341797, "logits/rejected": 2.6325502395629883, "logps/chosen": -55.688560485839844, "logps/rejected": -41.188194274902344, "loss": 0.9955, "nll_loss": 0.9129272699356079, "rewards/accuracies": 1.0, "rewards/chosen": 1.1089050769805908, "rewards/margins": 3.6161813735961914, "rewards/rejected": -2.5072762966156006, "step": 1591 }, { "epoch": 0.2653333333333333, "grad_norm": 37.28657531738281, "learning_rate": 1.7233275495443082e-07, "logits/chosen": 1.4540507793426514, "logits/rejected": 2.6958673000335693, "logps/chosen": -50.48546600341797, "logps/rejected": -292.04998779296875, "loss": 1.0993, "nll_loss": 1.0975102186203003, "rewards/accuracies": 1.0, "rewards/chosen": 4.370187282562256, "rewards/margins": 11.307696342468262, "rewards/rejected": -6.937509059906006, "step": 1592 }, { "epoch": 0.2655, "grad_norm": 24.987178802490234, "learning_rate": 1.7229547146865408e-07, "logits/chosen": 3.8409342765808105, "logits/rejected": 3.813060998916626, "logps/chosen": -46.29920196533203, "logps/rejected": -162.6769256591797, "loss": 0.6499, "nll_loss": 0.6173226833343506, "rewards/accuracies": 1.0, "rewards/chosen": 1.2711331844329834, "rewards/margins": 6.823781967163086, "rewards/rejected": -5.552648544311523, "step": 1593 }, { "epoch": 0.26566666666666666, "grad_norm": 32.4611930847168, "learning_rate": 1.722581669177126e-07, "logits/chosen": 2.050034284591675, "logits/rejected": 1.971724271774292, "logps/chosen": -20.043777465820312, "logps/rejected": -55.650203704833984, "loss": 0.5334, "nll_loss": 0.43573424220085144, "rewards/accuracies": 1.0, "rewards/chosen": 1.172715425491333, "rewards/margins": 3.327633857727051, "rewards/rejected": -2.1549184322357178, "step": 1594 }, { "epoch": 0.2658333333333333, "grad_norm": 41.931400299072266, "learning_rate": 1.7222084131247606e-07, "logits/chosen": 3.1363492012023926, "logits/rejected": 3.2235114574432373, "logps/chosen": -103.12349700927734, "logps/rejected": -30.406551361083984, "loss": 1.3451, "nll_loss": 1.2424519062042236, "rewards/accuracies": 1.0, "rewards/chosen": 2.2131805419921875, "rewards/margins": 3.6308212280273438, "rewards/rejected": -1.4176405668258667, "step": 1595 }, { "epoch": 0.266, "grad_norm": 189.03001403808594, "learning_rate": 1.7218349466382022e-07, "logits/chosen": 2.59544038772583, "logits/rejected": 2.372516393661499, "logps/chosen": -44.126075744628906, "logps/rejected": -29.19045639038086, "loss": 3.0245, "nll_loss": 0.7004139423370361, "rewards/accuracies": 0.0, "rewards/chosen": 1.6441287994384766, "rewards/margins": -1.732715368270874, "rewards/rejected": 3.3768441677093506, "step": 1596 }, { "epoch": 0.26616666666666666, "grad_norm": 47.05009460449219, "learning_rate": 1.7214612698262695e-07, "logits/chosen": 2.4404706954956055, "logits/rejected": 2.7033298015594482, "logps/chosen": -39.55343246459961, "logps/rejected": -213.2476806640625, "loss": 0.9, "nll_loss": 0.8072130084037781, "rewards/accuracies": 1.0, "rewards/chosen": 0.23132744431495667, "rewards/margins": 4.3029279708862305, "rewards/rejected": -4.071600437164307, "step": 1597 }, { "epoch": 0.2663333333333333, "grad_norm": 18.563737869262695, "learning_rate": 1.7210873827978438e-07, "logits/chosen": 0.9785614013671875, "logits/rejected": 1.815203309059143, "logps/chosen": -53.498512268066406, "logps/rejected": -289.65484619140625, "loss": 0.5778, "nll_loss": 0.5691329836845398, "rewards/accuracies": 1.0, "rewards/chosen": 2.6434457302093506, "rewards/margins": 9.3782377243042, "rewards/rejected": -6.7347917556762695, "step": 1598 }, { "epoch": 0.2665, "grad_norm": 32.743560791015625, "learning_rate": 1.7207132856618666e-07, "logits/chosen": 1.5495210886001587, "logits/rejected": 2.519556999206543, "logps/chosen": -82.41707611083984, "logps/rejected": -290.09283447265625, "loss": 0.9798, "nll_loss": 0.9260346293449402, "rewards/accuracies": 1.0, "rewards/chosen": 1.7189735174179077, "rewards/margins": 4.339115142822266, "rewards/rejected": -2.6201417446136475, "step": 1599 }, { "epoch": 0.26666666666666666, "grad_norm": 50.4151725769043, "learning_rate": 1.72033897852734e-07, "logits/chosen": 2.7779533863067627, "logits/rejected": 2.7149860858917236, "logps/chosen": -19.51016616821289, "logps/rejected": -53.361759185791016, "loss": 0.8813, "nll_loss": 0.8482680320739746, "rewards/accuracies": 1.0, "rewards/chosen": 1.5496224164962769, "rewards/margins": 5.516579627990723, "rewards/rejected": -3.9669570922851562, "step": 1600 }, { "epoch": 0.2668333333333333, "grad_norm": 25.40929412841797, "learning_rate": 1.719964461503329e-07, "logits/chosen": 3.1208717823028564, "logits/rejected": 3.455406427383423, "logps/chosen": -127.50875854492188, "logps/rejected": -479.94903564453125, "loss": 1.1812, "nll_loss": 1.1698052883148193, "rewards/accuracies": 1.0, "rewards/chosen": 2.3022079467773438, "rewards/margins": 11.291356086730957, "rewards/rejected": -8.989148139953613, "step": 1601 }, { "epoch": 0.267, "grad_norm": 40.81943893432617, "learning_rate": 1.7195897346989586e-07, "logits/chosen": 2.192711353302002, "logits/rejected": 2.0708701610565186, "logps/chosen": -120.36715698242188, "logps/rejected": -119.87096405029297, "loss": 1.291, "nll_loss": 1.2408987283706665, "rewards/accuracies": 1.0, "rewards/chosen": 0.8260208368301392, "rewards/margins": 5.840569972991943, "rewards/rejected": -5.014549255371094, "step": 1602 }, { "epoch": 0.26716666666666666, "grad_norm": 49.304500579833984, "learning_rate": 1.7192147982234153e-07, "logits/chosen": 2.4628801345825195, "logits/rejected": 2.7085185050964355, "logps/chosen": -34.473228454589844, "logps/rejected": -408.0934143066406, "loss": 1.3329, "nll_loss": 1.325893521308899, "rewards/accuracies": 1.0, "rewards/chosen": 2.8024349212646484, "rewards/margins": 11.162808418273926, "rewards/rejected": -8.360373497009277, "step": 1603 }, { "epoch": 0.2673333333333333, "grad_norm": 89.67628479003906, "learning_rate": 1.7188396521859465e-07, "logits/chosen": 1.909653663635254, "logits/rejected": 2.331211805343628, "logps/chosen": -14.057646751403809, "logps/rejected": -61.53717803955078, "loss": 1.1564, "nll_loss": 1.0041176080703735, "rewards/accuracies": 1.0, "rewards/chosen": 0.4783706068992615, "rewards/margins": 2.646137237548828, "rewards/rejected": -2.167766571044922, "step": 1604 }, { "epoch": 0.2675, "grad_norm": 36.29856872558594, "learning_rate": 1.7184642966958607e-07, "logits/chosen": 1.892799735069275, "logits/rejected": 2.2338762283325195, "logps/chosen": -169.05755615234375, "logps/rejected": -232.55276489257812, "loss": 1.1068, "nll_loss": 1.0245912075042725, "rewards/accuracies": 1.0, "rewards/chosen": 0.542163074016571, "rewards/margins": 4.048541069030762, "rewards/rejected": -3.506378173828125, "step": 1605 }, { "epoch": 0.26766666666666666, "grad_norm": 73.46124267578125, "learning_rate": 1.7180887318625277e-07, "logits/chosen": 2.205932140350342, "logits/rejected": 2.1804990768432617, "logps/chosen": -50.65977478027344, "logps/rejected": -99.85293579101562, "loss": 1.6071, "nll_loss": 1.4899930953979492, "rewards/accuracies": 1.0, "rewards/chosen": -0.12157707661390305, "rewards/margins": 4.214395999908447, "rewards/rejected": -4.335973262786865, "step": 1606 }, { "epoch": 0.2678333333333333, "grad_norm": 37.569496154785156, "learning_rate": 1.7177129577953778e-07, "logits/chosen": 2.572857141494751, "logits/rejected": 2.7134525775909424, "logps/chosen": -99.08769226074219, "logps/rejected": -298.4493713378906, "loss": 1.0511, "nll_loss": 0.9908769726753235, "rewards/accuracies": 1.0, "rewards/chosen": 0.5141067504882812, "rewards/margins": 7.684608459472656, "rewards/rejected": -7.170501708984375, "step": 1607 }, { "epoch": 0.268, "grad_norm": 38.45466613769531, "learning_rate": 1.7173369746039023e-07, "logits/chosen": 3.0417752265930176, "logits/rejected": 3.028470277786255, "logps/chosen": -140.06015014648438, "logps/rejected": -201.20858764648438, "loss": 1.578, "nll_loss": 1.5060232877731323, "rewards/accuracies": 1.0, "rewards/chosen": 0.9909286499023438, "rewards/margins": 3.9312944412231445, "rewards/rejected": -2.940365791320801, "step": 1608 }, { "epoch": 0.26816666666666666, "grad_norm": 25.83591079711914, "learning_rate": 1.7169607823976543e-07, "logits/chosen": 2.950528383255005, "logits/rejected": 3.065483570098877, "logps/chosen": -93.5174560546875, "logps/rejected": -279.88531494140625, "loss": 0.9671, "nll_loss": 0.944620668888092, "rewards/accuracies": 1.0, "rewards/chosen": 1.687597632408142, "rewards/margins": 7.152040958404541, "rewards/rejected": -5.464443206787109, "step": 1609 }, { "epoch": 0.2683333333333333, "grad_norm": 30.007707595825195, "learning_rate": 1.716584381286247e-07, "logits/chosen": 1.153990626335144, "logits/rejected": 2.1978209018707275, "logps/chosen": -42.83676528930664, "logps/rejected": -306.68060302734375, "loss": 0.6351, "nll_loss": 0.5636417269706726, "rewards/accuracies": 1.0, "rewards/chosen": 1.1821835041046143, "rewards/margins": 3.870303153991699, "rewards/rejected": -2.688119649887085, "step": 1610 }, { "epoch": 0.2685, "grad_norm": 20.872901916503906, "learning_rate": 1.7162077713793543e-07, "logits/chosen": 1.7232463359832764, "logits/rejected": 0.9790407419204712, "logps/chosen": -156.5072021484375, "logps/rejected": -47.14143371582031, "loss": 0.7354, "nll_loss": 0.698692798614502, "rewards/accuracies": 1.0, "rewards/chosen": 1.5776138305664062, "rewards/margins": 5.148321151733398, "rewards/rejected": -3.570707321166992, "step": 1611 }, { "epoch": 0.26866666666666666, "grad_norm": 48.29130554199219, "learning_rate": 1.7158309527867118e-07, "logits/chosen": 1.8105703592300415, "logits/rejected": 2.4519288539886475, "logps/chosen": -16.804275512695312, "logps/rejected": -342.1848449707031, "loss": 0.6952, "nll_loss": 0.6721709966659546, "rewards/accuracies": 1.0, "rewards/chosen": 2.178149461746216, "rewards/margins": 5.904520034790039, "rewards/rejected": -3.726370334625244, "step": 1612 }, { "epoch": 0.2688333333333333, "grad_norm": 33.147823333740234, "learning_rate": 1.7154539256181147e-07, "logits/chosen": 2.9065330028533936, "logits/rejected": 2.808032989501953, "logps/chosen": -116.861083984375, "logps/rejected": -56.66436767578125, "loss": 1.1334, "nll_loss": 1.0528024435043335, "rewards/accuracies": 1.0, "rewards/chosen": 1.6535370349884033, "rewards/margins": 3.704132080078125, "rewards/rejected": -2.0505950450897217, "step": 1613 }, { "epoch": 0.269, "grad_norm": 31.019264221191406, "learning_rate": 1.7150766899834203e-07, "logits/chosen": 2.300625801086426, "logits/rejected": 2.464385747909546, "logps/chosen": -8.038267135620117, "logps/rejected": -147.34034729003906, "loss": 0.3718, "nll_loss": 0.29771363735198975, "rewards/accuracies": 1.0, "rewards/chosen": 1.3768590688705444, "rewards/margins": 3.7855634689331055, "rewards/rejected": -2.4087045192718506, "step": 1614 }, { "epoch": 0.26916666666666667, "grad_norm": 32.73859786987305, "learning_rate": 1.714699245992546e-07, "logits/chosen": 2.1254279613494873, "logits/rejected": 1.762802243232727, "logps/chosen": -140.96734619140625, "logps/rejected": -93.01556396484375, "loss": 1.4059, "nll_loss": 1.3554553985595703, "rewards/accuracies": 1.0, "rewards/chosen": 1.3689483404159546, "rewards/margins": 4.51863431930542, "rewards/rejected": -3.149686098098755, "step": 1615 }, { "epoch": 0.2693333333333333, "grad_norm": 75.95797729492188, "learning_rate": 1.7143215937554695e-07, "logits/chosen": 2.2426648139953613, "logits/rejected": 2.7757961750030518, "logps/chosen": -67.25737762451172, "logps/rejected": -268.8250732421875, "loss": 2.4857, "nll_loss": 2.4020488262176514, "rewards/accuracies": 1.0, "rewards/chosen": 0.5752632021903992, "rewards/margins": 3.9316139221191406, "rewards/rejected": -3.3563506603240967, "step": 1616 }, { "epoch": 0.2695, "grad_norm": 125.22093963623047, "learning_rate": 1.71394373338223e-07, "logits/chosen": 2.8881843090057373, "logits/rejected": 2.8712680339813232, "logps/chosen": -195.97402954101562, "logps/rejected": -152.29904174804688, "loss": 1.4526, "nll_loss": 1.166512131690979, "rewards/accuracies": 1.0, "rewards/chosen": -0.43008729815483093, "rewards/margins": 1.783848524093628, "rewards/rejected": -2.2139358520507812, "step": 1617 }, { "epoch": 0.26966666666666667, "grad_norm": 26.94222068786621, "learning_rate": 1.7135656649829265e-07, "logits/chosen": 3.50395131111145, "logits/rejected": 3.88287091255188, "logps/chosen": -73.88335418701172, "logps/rejected": -349.6612548828125, "loss": 0.9832, "nll_loss": 0.9472224116325378, "rewards/accuracies": 1.0, "rewards/chosen": 1.1876792907714844, "rewards/margins": 6.321298122406006, "rewards/rejected": -5.1336188316345215, "step": 1618 }, { "epoch": 0.2698333333333333, "grad_norm": 36.92264938354492, "learning_rate": 1.7131873886677194e-07, "logits/chosen": 3.1909806728363037, "logits/rejected": 3.176360607147217, "logps/chosen": -15.646646499633789, "logps/rejected": -35.19709777832031, "loss": 0.4874, "nll_loss": 0.39116618037223816, "rewards/accuracies": 1.0, "rewards/chosen": 2.0106139183044434, "rewards/margins": 3.6095919609069824, "rewards/rejected": -1.598978042602539, "step": 1619 }, { "epoch": 0.27, "grad_norm": 49.61838912963867, "learning_rate": 1.7128089045468293e-07, "logits/chosen": 2.6035783290863037, "logits/rejected": 2.7972805500030518, "logps/chosen": -98.54298400878906, "logps/rejected": -156.5732421875, "loss": 1.2756, "nll_loss": 1.201743721961975, "rewards/accuracies": 1.0, "rewards/chosen": 0.5840538144111633, "rewards/margins": 4.358397960662842, "rewards/rejected": -3.774343967437744, "step": 1620 }, { "epoch": 0.27016666666666667, "grad_norm": 66.96900939941406, "learning_rate": 1.7124302127305371e-07, "logits/chosen": 2.741770029067993, "logits/rejected": 2.969318389892578, "logps/chosen": -18.94561195373535, "logps/rejected": -276.6759948730469, "loss": 0.9209, "nll_loss": 0.9021720886230469, "rewards/accuracies": 1.0, "rewards/chosen": 1.8395891189575195, "rewards/margins": 7.825826168060303, "rewards/rejected": -5.986237049102783, "step": 1621 }, { "epoch": 0.2703333333333333, "grad_norm": 472.1914978027344, "learning_rate": 1.712051313329185e-07, "logits/chosen": 2.0145068168640137, "logits/rejected": 2.3392670154571533, "logps/chosen": -168.866943359375, "logps/rejected": -224.1859893798828, "loss": 3.9679, "nll_loss": 2.4473469257354736, "rewards/accuracies": 0.0, "rewards/chosen": -3.8321473598480225, "rewards/margins": -0.6256120204925537, "rewards/rejected": -3.2065353393554688, "step": 1622 }, { "epoch": 0.2705, "grad_norm": 22.324644088745117, "learning_rate": 1.7116722064531747e-07, "logits/chosen": 2.685540199279785, "logits/rejected": 2.620535135269165, "logps/chosen": -22.123117446899414, "logps/rejected": -176.595703125, "loss": 0.4483, "nll_loss": 0.42544448375701904, "rewards/accuracies": 1.0, "rewards/chosen": 2.1633737087249756, "rewards/margins": 5.930123329162598, "rewards/rejected": -3.766749858856201, "step": 1623 }, { "epoch": 0.27066666666666667, "grad_norm": 32.12664794921875, "learning_rate": 1.711292892212969e-07, "logits/chosen": 3.1871390342712402, "logits/rejected": 3.3198788166046143, "logps/chosen": -91.66600799560547, "logps/rejected": -178.6779022216797, "loss": 0.9876, "nll_loss": 0.9548541903495789, "rewards/accuracies": 1.0, "rewards/chosen": 1.4331474304199219, "rewards/margins": 5.773401737213135, "rewards/rejected": -4.340254306793213, "step": 1624 }, { "epoch": 0.2708333333333333, "grad_norm": 42.34506607055664, "learning_rate": 1.710913370719091e-07, "logits/chosen": 1.3691692352294922, "logits/rejected": 2.6275105476379395, "logps/chosen": -51.36829376220703, "logps/rejected": -252.15594482421875, "loss": 0.7773, "nll_loss": 0.733832836151123, "rewards/accuracies": 1.0, "rewards/chosen": 0.871025562286377, "rewards/margins": 8.51497745513916, "rewards/rejected": -7.643951892852783, "step": 1625 }, { "epoch": 0.271, "grad_norm": 42.62411880493164, "learning_rate": 1.7105336420821246e-07, "logits/chosen": 1.5429474115371704, "logits/rejected": 2.320587635040283, "logps/chosen": -36.37056350708008, "logps/rejected": -147.27700805664062, "loss": 0.7268, "nll_loss": 0.661283016204834, "rewards/accuracies": 1.0, "rewards/chosen": 1.3263744115829468, "rewards/margins": 4.002864837646484, "rewards/rejected": -2.676490306854248, "step": 1626 }, { "epoch": 0.27116666666666667, "grad_norm": 25.01170539855957, "learning_rate": 1.7101537064127127e-07, "logits/chosen": 2.8791582584381104, "logits/rejected": 2.796739101409912, "logps/chosen": -71.26866149902344, "logps/rejected": -133.10183715820312, "loss": 0.7156, "nll_loss": 0.6660623550415039, "rewards/accuracies": 1.0, "rewards/chosen": 2.0702178478240967, "rewards/margins": 4.552223205566406, "rewards/rejected": -2.4820053577423096, "step": 1627 }, { "epoch": 0.2713333333333333, "grad_norm": 29.290206909179688, "learning_rate": 1.7097735638215602e-07, "logits/chosen": 2.9111788272857666, "logits/rejected": 3.031226396560669, "logps/chosen": -15.944347381591797, "logps/rejected": -161.87384033203125, "loss": 0.5053, "nll_loss": 0.46895143389701843, "rewards/accuracies": 1.0, "rewards/chosen": 1.1585842370986938, "rewards/margins": 6.400764465332031, "rewards/rejected": -5.242180347442627, "step": 1628 }, { "epoch": 0.2715, "grad_norm": 21.978118896484375, "learning_rate": 1.7093932144194309e-07, "logits/chosen": 1.7226827144622803, "logits/rejected": 2.2346627712249756, "logps/chosen": -35.78324508666992, "logps/rejected": -206.4497833251953, "loss": 0.5605, "nll_loss": 0.5505115389823914, "rewards/accuracies": 1.0, "rewards/chosen": 2.4311370849609375, "rewards/margins": 10.69073486328125, "rewards/rejected": -8.259597778320312, "step": 1629 }, { "epoch": 0.27166666666666667, "grad_norm": 53.46982192993164, "learning_rate": 1.7090126583171503e-07, "logits/chosen": 2.7663674354553223, "logits/rejected": 2.8665757179260254, "logps/chosen": -43.85773468017578, "logps/rejected": -410.4903564453125, "loss": 1.9372, "nll_loss": 1.9068580865859985, "rewards/accuracies": 1.0, "rewards/chosen": 1.5081219673156738, "rewards/margins": 5.907395839691162, "rewards/rejected": -4.399273872375488, "step": 1630 }, { "epoch": 0.2718333333333333, "grad_norm": 122.63729095458984, "learning_rate": 1.7086318956256025e-07, "logits/chosen": 2.4662046432495117, "logits/rejected": 2.635146141052246, "logps/chosen": -36.657432556152344, "logps/rejected": -68.93138885498047, "loss": 2.6204, "nll_loss": 2.443828821182251, "rewards/accuracies": 1.0, "rewards/chosen": 0.42443084716796875, "rewards/margins": 2.399509906768799, "rewards/rejected": -1.97507905960083, "step": 1631 }, { "epoch": 0.272, "grad_norm": 75.71968841552734, "learning_rate": 1.708250926455733e-07, "logits/chosen": 3.081571102142334, "logits/rejected": 3.3115994930267334, "logps/chosen": -15.486822128295898, "logps/rejected": -185.69949340820312, "loss": 1.0544, "nll_loss": 1.0324548482894897, "rewards/accuracies": 1.0, "rewards/chosen": 1.703716516494751, "rewards/margins": 7.149723052978516, "rewards/rejected": -5.446006774902344, "step": 1632 }, { "epoch": 0.27216666666666667, "grad_norm": 132.68836975097656, "learning_rate": 1.707869750918547e-07, "logits/chosen": 2.7419943809509277, "logits/rejected": 2.683617115020752, "logps/chosen": -41.679779052734375, "logps/rejected": -78.91759490966797, "loss": 2.4201, "nll_loss": 1.9847513437271118, "rewards/accuracies": 1.0, "rewards/chosen": -0.013819122686982155, "rewards/margins": 1.0170388221740723, "rewards/rejected": -1.0308579206466675, "step": 1633 }, { "epoch": 0.2723333333333333, "grad_norm": 35.33539581298828, "learning_rate": 1.70748836912511e-07, "logits/chosen": 2.7378671169281006, "logits/rejected": 2.725215196609497, "logps/chosen": -231.8245849609375, "logps/rejected": -92.68748474121094, "loss": 0.9659, "nll_loss": 0.8586096167564392, "rewards/accuracies": 1.0, "rewards/chosen": 1.5433472394943237, "rewards/margins": 3.2658989429473877, "rewards/rejected": -1.722551703453064, "step": 1634 }, { "epoch": 0.2725, "grad_norm": 35.30402755737305, "learning_rate": 1.7071067811865473e-07, "logits/chosen": 2.670405387878418, "logits/rejected": 2.879387617111206, "logps/chosen": -63.543827056884766, "logps/rejected": -277.3600769042969, "loss": 0.9779, "nll_loss": 0.9484153985977173, "rewards/accuracies": 1.0, "rewards/chosen": 1.317901372909546, "rewards/margins": 7.649204254150391, "rewards/rejected": -6.331302642822266, "step": 1635 }, { "epoch": 0.27266666666666667, "grad_norm": 34.5579719543457, "learning_rate": 1.7067249872140448e-07, "logits/chosen": 0.9510816931724548, "logits/rejected": 2.7416398525238037, "logps/chosen": -37.35395812988281, "logps/rejected": -194.8543701171875, "loss": 0.8225, "nll_loss": 0.7623255848884583, "rewards/accuracies": 1.0, "rewards/chosen": 0.4915756285190582, "rewards/margins": 8.7318115234375, "rewards/rejected": -8.240236282348633, "step": 1636 }, { "epoch": 0.2728333333333333, "grad_norm": 33.9787483215332, "learning_rate": 1.7063429873188476e-07, "logits/chosen": 2.0670578479766846, "logits/rejected": 2.241044759750366, "logps/chosen": -70.5022201538086, "logps/rejected": -164.86557006835938, "loss": 1.0683, "nll_loss": 1.0217710733413696, "rewards/accuracies": 1.0, "rewards/chosen": 0.8859321475028992, "rewards/margins": 5.9608235359191895, "rewards/rejected": -5.074891567230225, "step": 1637 }, { "epoch": 0.273, "grad_norm": 37.765933990478516, "learning_rate": 1.7059607816122618e-07, "logits/chosen": 2.355842351913452, "logits/rejected": 2.3237335681915283, "logps/chosen": -54.894203186035156, "logps/rejected": -48.51097869873047, "loss": 0.9334, "nll_loss": 0.8445262908935547, "rewards/accuracies": 1.0, "rewards/chosen": 0.37861865758895874, "rewards/margins": 4.035338401794434, "rewards/rejected": -3.656719923019409, "step": 1638 }, { "epoch": 0.27316666666666667, "grad_norm": 184.88975524902344, "learning_rate": 1.7055783702056525e-07, "logits/chosen": 2.8294944763183594, "logits/rejected": 2.7176570892333984, "logps/chosen": -66.0692138671875, "logps/rejected": -122.19699096679688, "loss": 1.4909, "nll_loss": 0.8928272128105164, "rewards/accuracies": 1.0, "rewards/chosen": -0.9113144278526306, "rewards/margins": 0.5823196768760681, "rewards/rejected": -1.4936341047286987, "step": 1639 }, { "epoch": 0.2733333333333333, "grad_norm": 70.93421936035156, "learning_rate": 1.7051957532104459e-07, "logits/chosen": 2.890540838241577, "logits/rejected": 3.021036148071289, "logps/chosen": -69.18304443359375, "logps/rejected": -72.2763442993164, "loss": 1.2704, "nll_loss": 0.9477129578590393, "rewards/accuracies": 1.0, "rewards/chosen": 1.9127427339553833, "rewards/margins": 1.9397400617599487, "rewards/rejected": -0.02699737623333931, "step": 1640 }, { "epoch": 0.2735, "grad_norm": 28.805435180664062, "learning_rate": 1.7048129307381266e-07, "logits/chosen": 3.327907085418701, "logits/rejected": 3.22312593460083, "logps/chosen": -76.32194519042969, "logps/rejected": -225.9499053955078, "loss": 0.8624, "nll_loss": 0.8387026190757751, "rewards/accuracies": 1.0, "rewards/chosen": 1.7061188220977783, "rewards/margins": 6.516810417175293, "rewards/rejected": -4.810691833496094, "step": 1641 }, { "epoch": 0.27366666666666667, "grad_norm": 54.78974151611328, "learning_rate": 1.70442990290024e-07, "logits/chosen": 2.9484286308288574, "logits/rejected": 2.9419758319854736, "logps/chosen": -72.66838073730469, "logps/rejected": -56.06306076049805, "loss": 1.1626, "nll_loss": 0.8549220561981201, "rewards/accuracies": 1.0, "rewards/chosen": 1.9153244495391846, "rewards/margins": 2.0082943439483643, "rewards/rejected": -0.09296990185976028, "step": 1642 }, { "epoch": 0.2738333333333333, "grad_norm": 37.088382720947266, "learning_rate": 1.7040466698083917e-07, "logits/chosen": 1.6262781620025635, "logits/rejected": 1.5594626665115356, "logps/chosen": -203.11190795898438, "logps/rejected": -134.37619018554688, "loss": 1.5687, "nll_loss": 1.5271574258804321, "rewards/accuracies": 1.0, "rewards/chosen": 1.0428298711776733, "rewards/margins": 5.895510196685791, "rewards/rejected": -4.852680206298828, "step": 1643 }, { "epoch": 0.274, "grad_norm": 25.923078536987305, "learning_rate": 1.703663231574246e-07, "logits/chosen": 3.1354076862335205, "logits/rejected": 3.077040195465088, "logps/chosen": -126.98182678222656, "logps/rejected": -209.70489501953125, "loss": 0.9841, "nll_loss": 0.969326913356781, "rewards/accuracies": 1.0, "rewards/chosen": 2.023794651031494, "rewards/margins": 9.59783935546875, "rewards/rejected": -7.574045181274414, "step": 1644 }, { "epoch": 0.27416666666666667, "grad_norm": 61.23235321044922, "learning_rate": 1.7032795883095284e-07, "logits/chosen": 2.84419584274292, "logits/rejected": 2.781005382537842, "logps/chosen": -48.550537109375, "logps/rejected": -101.45285034179688, "loss": 0.9019, "nll_loss": 0.7139785289764404, "rewards/accuracies": 1.0, "rewards/chosen": 0.2530250549316406, "rewards/margins": 2.3238685131073, "rewards/rejected": -2.070843458175659, "step": 1645 }, { "epoch": 0.2743333333333333, "grad_norm": 29.454486846923828, "learning_rate": 1.7028957401260228e-07, "logits/chosen": 2.621339797973633, "logits/rejected": 2.643704652786255, "logps/chosen": -52.76023864746094, "logps/rejected": -81.58538055419922, "loss": 0.5933, "nll_loss": 0.5122353434562683, "rewards/accuracies": 1.0, "rewards/chosen": 1.201757788658142, "rewards/margins": 3.634963035583496, "rewards/rejected": -2.4332053661346436, "step": 1646 }, { "epoch": 0.2745, "grad_norm": 23.505374908447266, "learning_rate": 1.7025116871355734e-07, "logits/chosen": 2.927128314971924, "logits/rejected": 3.171365737915039, "logps/chosen": -47.7684326171875, "logps/rejected": -243.26034545898438, "loss": 0.5459, "nll_loss": 0.5136390924453735, "rewards/accuracies": 1.0, "rewards/chosen": 1.2136391401290894, "rewards/margins": 7.516340255737305, "rewards/rejected": -6.302700996398926, "step": 1647 }, { "epoch": 0.27466666666666667, "grad_norm": 58.033016204833984, "learning_rate": 1.702127429450084e-07, "logits/chosen": 4.1322760581970215, "logits/rejected": 3.7818849086761475, "logps/chosen": -82.60539245605469, "logps/rejected": -19.988422393798828, "loss": 1.4522, "nll_loss": 1.0325672626495361, "rewards/accuracies": 1.0, "rewards/chosen": 2.0556418895721436, "rewards/margins": 1.624950647354126, "rewards/rejected": 0.4306911826133728, "step": 1648 }, { "epoch": 0.2748333333333333, "grad_norm": 36.58473205566406, "learning_rate": 1.7017429671815182e-07, "logits/chosen": 2.216005563735962, "logits/rejected": 1.5889809131622314, "logps/chosen": -207.90406799316406, "logps/rejected": -148.22671508789062, "loss": 1.0451, "nll_loss": 0.9536884427070618, "rewards/accuracies": 1.0, "rewards/chosen": 1.4354599714279175, "rewards/margins": 3.4672317504882812, "rewards/rejected": -2.0317718982696533, "step": 1649 }, { "epoch": 0.275, "grad_norm": 24.11160659790039, "learning_rate": 1.7013583004418992e-07, "logits/chosen": 1.6412312984466553, "logits/rejected": 2.926612377166748, "logps/chosen": -44.43364334106445, "logps/rejected": -211.1763916015625, "loss": 0.6368, "nll_loss": 0.6004545092582703, "rewards/accuracies": 1.0, "rewards/chosen": 2.255730152130127, "rewards/margins": 5.082512378692627, "rewards/rejected": -2.8267822265625, "step": 1650 }, { "epoch": 0.27516666666666667, "grad_norm": 30.265701293945312, "learning_rate": 1.700973429343309e-07, "logits/chosen": 2.216078281402588, "logits/rejected": 2.2852206230163574, "logps/chosen": -57.083839416503906, "logps/rejected": -133.97433471679688, "loss": 1.0568, "nll_loss": 1.019354224205017, "rewards/accuracies": 1.0, "rewards/chosen": 1.5371055603027344, "rewards/margins": 5.118030548095703, "rewards/rejected": -3.580925226211548, "step": 1651 }, { "epoch": 0.2753333333333333, "grad_norm": 225.45729064941406, "learning_rate": 1.7005883539978908e-07, "logits/chosen": 3.1157782077789307, "logits/rejected": 3.1232638359069824, "logps/chosen": -80.58626556396484, "logps/rejected": -36.06667709350586, "loss": 4.8163, "nll_loss": 1.1039215326309204, "rewards/accuracies": 0.0, "rewards/chosen": 0.21672365069389343, "rewards/margins": -3.453310251235962, "rewards/rejected": 3.6700339317321777, "step": 1652 }, { "epoch": 0.2755, "grad_norm": 21.095552444458008, "learning_rate": 1.7002030745178452e-07, "logits/chosen": 2.2408106327056885, "logits/rejected": 2.2094411849975586, "logps/chosen": -227.25103759765625, "logps/rejected": -326.8118591308594, "loss": 0.8109, "nll_loss": 0.7890660762786865, "rewards/accuracies": 1.0, "rewards/chosen": 1.5996735095977783, "rewards/margins": 10.378278732299805, "rewards/rejected": -8.778605461120605, "step": 1653 }, { "epoch": 0.27566666666666667, "grad_norm": 38.39143371582031, "learning_rate": 1.6998175910154341e-07, "logits/chosen": 3.301097869873047, "logits/rejected": 3.218287229537964, "logps/chosen": -19.271587371826172, "logps/rejected": -31.415102005004883, "loss": 0.6091, "nll_loss": 0.5208536982536316, "rewards/accuracies": 1.0, "rewards/chosen": 1.5826146602630615, "rewards/margins": 3.558101177215576, "rewards/rejected": -1.975486397743225, "step": 1654 }, { "epoch": 0.2758333333333333, "grad_norm": 132.48521423339844, "learning_rate": 1.6994319036029783e-07, "logits/chosen": 3.0319035053253174, "logits/rejected": 3.0721492767333984, "logps/chosen": -18.24508285522461, "logps/rejected": -102.09883117675781, "loss": 1.3211, "nll_loss": 1.216338872909546, "rewards/accuracies": 1.0, "rewards/chosen": 2.2241902351379395, "rewards/margins": 3.6260151863098145, "rewards/rejected": -1.401824951171875, "step": 1655 }, { "epoch": 0.276, "grad_norm": 31.86240005493164, "learning_rate": 1.6990460123928574e-07, "logits/chosen": 2.2110204696655273, "logits/rejected": 2.8179690837860107, "logps/chosen": -34.34678649902344, "logps/rejected": -145.5135040283203, "loss": 0.6641, "nll_loss": 0.624487042427063, "rewards/accuracies": 1.0, "rewards/chosen": 1.5832722187042236, "rewards/margins": 4.9461493492126465, "rewards/rejected": -3.362877130508423, "step": 1656 }, { "epoch": 0.27616666666666667, "grad_norm": 31.167884826660156, "learning_rate": 1.698659917497511e-07, "logits/chosen": 2.580402374267578, "logits/rejected": 2.579936981201172, "logps/chosen": -66.41202545166016, "logps/rejected": -66.48251342773438, "loss": 0.8409, "nll_loss": 0.763356626033783, "rewards/accuracies": 1.0, "rewards/chosen": 2.171800374984741, "rewards/margins": 3.9878451824188232, "rewards/rejected": -1.816044807434082, "step": 1657 }, { "epoch": 0.2763333333333333, "grad_norm": 25.922245025634766, "learning_rate": 1.6982736190294378e-07, "logits/chosen": 2.5838632583618164, "logits/rejected": 2.7979676723480225, "logps/chosen": -24.015050888061523, "logps/rejected": -201.1776123046875, "loss": 0.4891, "nll_loss": 0.46182781457901, "rewards/accuracies": 1.0, "rewards/chosen": 1.3934224843978882, "rewards/margins": 7.936818599700928, "rewards/rejected": -6.54339599609375, "step": 1658 }, { "epoch": 0.2765, "grad_norm": 33.78899002075195, "learning_rate": 1.697887117101196e-07, "logits/chosen": 2.57307767868042, "logits/rejected": 2.616549253463745, "logps/chosen": -67.20512390136719, "logps/rejected": -61.54523468017578, "loss": 0.86, "nll_loss": 0.7814549207687378, "rewards/accuracies": 1.0, "rewards/chosen": 0.661572277545929, "rewards/margins": 3.981865644454956, "rewards/rejected": -3.320293426513672, "step": 1659 }, { "epoch": 0.27666666666666667, "grad_norm": 34.23162841796875, "learning_rate": 1.6975004118254027e-07, "logits/chosen": 2.191767692565918, "logits/rejected": 2.2096402645111084, "logps/chosen": -54.38175964355469, "logps/rejected": -152.58099365234375, "loss": 0.8363, "nll_loss": 0.8116680383682251, "rewards/accuracies": 1.0, "rewards/chosen": 1.6020493507385254, "rewards/margins": 6.754785537719727, "rewards/rejected": -5.152736186981201, "step": 1660 }, { "epoch": 0.2768333333333333, "grad_norm": 25.420427322387695, "learning_rate": 1.697113503314735e-07, "logits/chosen": 3.119126319885254, "logits/rejected": 3.1771395206451416, "logps/chosen": -96.80796813964844, "logps/rejected": -221.33209228515625, "loss": 1.07, "nll_loss": 1.052260398864746, "rewards/accuracies": 1.0, "rewards/chosen": 1.9882774353027344, "rewards/margins": 7.111546993255615, "rewards/rejected": -5.123269557952881, "step": 1661 }, { "epoch": 0.277, "grad_norm": 21.158912658691406, "learning_rate": 1.6967263916819284e-07, "logits/chosen": 2.398489236831665, "logits/rejected": 2.39503812789917, "logps/chosen": -201.32086181640625, "logps/rejected": -125.55854034423828, "loss": 0.7393, "nll_loss": 0.7088763117790222, "rewards/accuracies": 1.0, "rewards/chosen": 1.9796600341796875, "rewards/margins": 5.362562656402588, "rewards/rejected": -3.3829026222229004, "step": 1662 }, { "epoch": 0.2771666666666667, "grad_norm": 28.73335075378418, "learning_rate": 1.6963390770397785e-07, "logits/chosen": 3.979983329772949, "logits/rejected": 4.0715436935424805, "logps/chosen": -72.31039428710938, "logps/rejected": -252.6865692138672, "loss": 0.9421, "nll_loss": 0.9153214693069458, "rewards/accuracies": 1.0, "rewards/chosen": 1.3886215686798096, "rewards/margins": 8.973891258239746, "rewards/rejected": -7.585269927978516, "step": 1663 }, { "epoch": 0.2773333333333333, "grad_norm": 41.93064880371094, "learning_rate": 1.6959515595011388e-07, "logits/chosen": 3.219317674636841, "logits/rejected": 3.7492904663085938, "logps/chosen": -99.94902801513672, "logps/rejected": -832.64111328125, "loss": 1.2808, "nll_loss": 1.2188905477523804, "rewards/accuracies": 1.0, "rewards/chosen": 0.6498939394950867, "rewards/margins": 4.983023643493652, "rewards/rejected": -4.3331298828125, "step": 1664 }, { "epoch": 0.2775, "grad_norm": 97.60518646240234, "learning_rate": 1.6955638391789226e-07, "logits/chosen": 2.2568352222442627, "logits/rejected": 2.40297532081604, "logps/chosen": -35.933807373046875, "logps/rejected": -44.271244049072266, "loss": 1.1991, "nll_loss": 0.5363254547119141, "rewards/accuracies": 1.0, "rewards/chosen": 1.0537761449813843, "rewards/margins": 0.5746421813964844, "rewards/rejected": 0.4791339933872223, "step": 1665 }, { "epoch": 0.2776666666666667, "grad_norm": 43.591796875, "learning_rate": 1.6951759161861029e-07, "logits/chosen": 1.055019736289978, "logits/rejected": 1.8024259805679321, "logps/chosen": -67.55452728271484, "logps/rejected": -144.44932556152344, "loss": 1.1902, "nll_loss": 1.107451319694519, "rewards/accuracies": 1.0, "rewards/chosen": 0.9519295692443848, "rewards/margins": 3.64548921585083, "rewards/rejected": -2.6935596466064453, "step": 1666 }, { "epoch": 0.2778333333333333, "grad_norm": 24.942232131958008, "learning_rate": 1.6947877906357104e-07, "logits/chosen": 1.198045015335083, "logits/rejected": 1.1912450790405273, "logps/chosen": -76.76097106933594, "logps/rejected": -111.4784927368164, "loss": 0.8433, "nll_loss": 0.8080100417137146, "rewards/accuracies": 1.0, "rewards/chosen": 2.463548421859741, "rewards/margins": 5.207625865936279, "rewards/rejected": -2.744077444076538, "step": 1667 }, { "epoch": 0.278, "grad_norm": 30.482471466064453, "learning_rate": 1.6943994626408362e-07, "logits/chosen": 1.2789398431777954, "logits/rejected": 1.9597381353378296, "logps/chosen": -85.48855590820312, "logps/rejected": -164.11117553710938, "loss": 1.0233, "nll_loss": 0.9826269745826721, "rewards/accuracies": 1.0, "rewards/chosen": 1.1218703985214233, "rewards/margins": 5.6209917068481445, "rewards/rejected": -4.499121189117432, "step": 1668 }, { "epoch": 0.2781666666666667, "grad_norm": 22.548437118530273, "learning_rate": 1.6940109323146294e-07, "logits/chosen": 0.7707271575927734, "logits/rejected": 2.6574466228485107, "logps/chosen": -25.871597290039062, "logps/rejected": -285.6544189453125, "loss": 0.4492, "nll_loss": 0.4311933219432831, "rewards/accuracies": 1.0, "rewards/chosen": 2.116412401199341, "rewards/margins": 6.657645225524902, "rewards/rejected": -4.541232585906982, "step": 1669 }, { "epoch": 0.2783333333333333, "grad_norm": 47.479103088378906, "learning_rate": 1.6936221997702988e-07, "logits/chosen": 2.395585775375366, "logits/rejected": 2.5709869861602783, "logps/chosen": -23.118860244750977, "logps/rejected": -44.50616455078125, "loss": 0.9703, "nll_loss": 0.9247545003890991, "rewards/accuracies": 1.0, "rewards/chosen": 2.067303419113159, "rewards/margins": 4.681340217590332, "rewards/rejected": -2.614036798477173, "step": 1670 }, { "epoch": 0.2785, "grad_norm": 28.991262435913086, "learning_rate": 1.6932332651211115e-07, "logits/chosen": 2.125486373901367, "logits/rejected": 2.3814589977264404, "logps/chosen": -72.81066131591797, "logps/rejected": -374.47491455078125, "loss": 0.7657, "nll_loss": 0.7208976149559021, "rewards/accuracies": 1.0, "rewards/chosen": 0.8592628836631775, "rewards/margins": 6.786262512207031, "rewards/rejected": -5.926999568939209, "step": 1671 }, { "epoch": 0.2786666666666667, "grad_norm": 43.33965301513672, "learning_rate": 1.6928441284803934e-07, "logits/chosen": 1.256369709968567, "logits/rejected": 2.866717576980591, "logps/chosen": -29.36385726928711, "logps/rejected": -459.10137939453125, "loss": 1.0578, "nll_loss": 1.0487091541290283, "rewards/accuracies": 1.0, "rewards/chosen": 2.5288658142089844, "rewards/margins": 9.934356689453125, "rewards/rejected": -7.405490398406982, "step": 1672 }, { "epoch": 0.2788333333333333, "grad_norm": 42.47261047363281, "learning_rate": 1.69245478996153e-07, "logits/chosen": 2.710679054260254, "logits/rejected": 2.559274911880493, "logps/chosen": -15.54370403289795, "logps/rejected": -43.0926399230957, "loss": 0.6232, "nll_loss": 0.5978347063064575, "rewards/accuracies": 1.0, "rewards/chosen": 2.2342591285705566, "rewards/margins": 5.689151763916016, "rewards/rejected": -3.45489239692688, "step": 1673 }, { "epoch": 0.279, "grad_norm": 27.26413345336914, "learning_rate": 1.6920652496779648e-07, "logits/chosen": 2.4430899620056152, "logits/rejected": 2.5536773204803467, "logps/chosen": -31.89923858642578, "logps/rejected": -85.96569061279297, "loss": 0.64, "nll_loss": 0.6018725037574768, "rewards/accuracies": 1.0, "rewards/chosen": 1.8083359003067017, "rewards/margins": 4.950438976287842, "rewards/rejected": -3.1421031951904297, "step": 1674 }, { "epoch": 0.2791666666666667, "grad_norm": 35.470863342285156, "learning_rate": 1.6916755077432012e-07, "logits/chosen": 1.9436396360397339, "logits/rejected": 2.370030403137207, "logps/chosen": -73.67799377441406, "logps/rejected": -243.6097869873047, "loss": 1.5385, "nll_loss": 1.534958004951477, "rewards/accuracies": 1.0, "rewards/chosen": 3.4661624431610107, "rewards/margins": 13.035131454467773, "rewards/rejected": -9.568968772888184, "step": 1675 }, { "epoch": 0.2793333333333333, "grad_norm": 325.6036682128906, "learning_rate": 1.6912855642708e-07, "logits/chosen": 1.60767662525177, "logits/rejected": 2.453106641769409, "logps/chosen": -58.54850387573242, "logps/rejected": -197.85064697265625, "loss": 2.6374, "nll_loss": 1.0645183324813843, "rewards/accuracies": 0.0, "rewards/chosen": 0.33900681138038635, "rewards/margins": -1.058890700340271, "rewards/rejected": 1.397897481918335, "step": 1676 }, { "epoch": 0.2795, "grad_norm": 254.4211883544922, "learning_rate": 1.6908954193743814e-07, "logits/chosen": 1.8205150365829468, "logits/rejected": 1.9667679071426392, "logps/chosen": -61.38970184326172, "logps/rejected": -40.05670928955078, "loss": 4.2584, "nll_loss": 1.1582963466644287, "rewards/accuracies": 0.0, "rewards/chosen": 0.9372535943984985, "rewards/margins": -2.704075813293457, "rewards/rejected": 3.641329288482666, "step": 1677 }, { "epoch": 0.2796666666666667, "grad_norm": 53.651344299316406, "learning_rate": 1.6905050731676246e-07, "logits/chosen": 2.742366313934326, "logits/rejected": 2.6481223106384277, "logps/chosen": -81.12371063232422, "logps/rejected": -42.36437225341797, "loss": 1.3793, "nll_loss": 1.175705909729004, "rewards/accuracies": 1.0, "rewards/chosen": 0.06439590454101562, "rewards/margins": 2.2314236164093018, "rewards/rejected": -2.167027711868286, "step": 1678 }, { "epoch": 0.2798333333333333, "grad_norm": 25.94630241394043, "learning_rate": 1.6901145257642665e-07, "logits/chosen": 2.894458055496216, "logits/rejected": 2.791820764541626, "logps/chosen": -70.05831146240234, "logps/rejected": -90.97744750976562, "loss": 0.9207, "nll_loss": 0.8868139982223511, "rewards/accuracies": 1.0, "rewards/chosen": 2.1630699634552, "rewards/margins": 5.172703742980957, "rewards/rejected": -3.009633779525757, "step": 1679 }, { "epoch": 0.28, "grad_norm": 40.982322692871094, "learning_rate": 1.6897237772781044e-07, "logits/chosen": 2.5861220359802246, "logits/rejected": 2.253878593444824, "logps/chosen": -35.855777740478516, "logps/rejected": -24.26607894897461, "loss": 0.8868, "nll_loss": 0.7469953894615173, "rewards/accuracies": 1.0, "rewards/chosen": 1.705480933189392, "rewards/margins": 2.976579189300537, "rewards/rejected": -1.2710981369018555, "step": 1680 }, { "epoch": 0.2801666666666667, "grad_norm": 22.88591766357422, "learning_rate": 1.6893328278229918e-07, "logits/chosen": 3.227243423461914, "logits/rejected": 3.19771671295166, "logps/chosen": -114.37238311767578, "logps/rejected": -192.03536987304688, "loss": 0.9291, "nll_loss": 0.9149789810180664, "rewards/accuracies": 1.0, "rewards/chosen": 2.4211831092834473, "rewards/margins": 7.011104583740234, "rewards/rejected": -4.589921474456787, "step": 1681 }, { "epoch": 0.2803333333333333, "grad_norm": 25.3424129486084, "learning_rate": 1.6889416775128424e-07, "logits/chosen": 1.7766671180725098, "logits/rejected": 1.9900766611099243, "logps/chosen": -45.27003479003906, "logps/rejected": -295.7911071777344, "loss": 0.6128, "nll_loss": 0.5803850889205933, "rewards/accuracies": 1.0, "rewards/chosen": 1.31157386302948, "rewards/margins": 6.246534824371338, "rewards/rejected": -4.934960842132568, "step": 1682 }, { "epoch": 0.2805, "grad_norm": 104.35287475585938, "learning_rate": 1.688550326461628e-07, "logits/chosen": 3.0358119010925293, "logits/rejected": 3.0511248111724854, "logps/chosen": -65.02692413330078, "logps/rejected": -29.82743263244629, "loss": 2.3888, "nll_loss": 2.1675641536712646, "rewards/accuracies": 1.0, "rewards/chosen": 0.2960571348667145, "rewards/margins": 2.044905662536621, "rewards/rejected": -1.748848557472229, "step": 1683 }, { "epoch": 0.2806666666666667, "grad_norm": 63.835548400878906, "learning_rate": 1.6881587747833793e-07, "logits/chosen": 1.8144288063049316, "logits/rejected": 2.1453638076782227, "logps/chosen": -28.192251205444336, "logps/rejected": -263.9778747558594, "loss": 0.856, "nll_loss": 0.7831181883811951, "rewards/accuracies": 1.0, "rewards/chosen": 0.6115274429321289, "rewards/margins": 4.303138732910156, "rewards/rejected": -3.6916110515594482, "step": 1684 }, { "epoch": 0.2808333333333333, "grad_norm": 111.22695922851562, "learning_rate": 1.6877670225921845e-07, "logits/chosen": 2.630095958709717, "logits/rejected": 2.6319797039031982, "logps/chosen": -29.363683700561523, "logps/rejected": -22.948883056640625, "loss": 2.3463, "nll_loss": 1.9575790166854858, "rewards/accuracies": 1.0, "rewards/chosen": -0.31794071197509766, "rewards/margins": 1.2098156213760376, "rewards/rejected": -1.5277563333511353, "step": 1685 }, { "epoch": 0.281, "grad_norm": 276.6610107421875, "learning_rate": 1.6873750700021914e-07, "logits/chosen": 2.2707653045654297, "logits/rejected": 1.9931471347808838, "logps/chosen": -174.38424682617188, "logps/rejected": -62.77828598022461, "loss": 1.9407, "nll_loss": 0.8807283639907837, "rewards/accuracies": 0.0, "rewards/chosen": 1.0354492664337158, "rewards/margins": -0.18910634517669678, "rewards/rejected": 1.2245556116104126, "step": 1686 }, { "epoch": 0.2811666666666667, "grad_norm": 28.01576805114746, "learning_rate": 1.6869829171276048e-07, "logits/chosen": 1.5362205505371094, "logits/rejected": 1.963818907737732, "logps/chosen": -36.95775604248047, "logps/rejected": -183.95382690429688, "loss": 0.6675, "nll_loss": 0.6483815908432007, "rewards/accuracies": 1.0, "rewards/chosen": 2.023045063018799, "rewards/margins": 6.60702657699585, "rewards/rejected": -4.583981513977051, "step": 1687 }, { "epoch": 0.2813333333333333, "grad_norm": 24.18242645263672, "learning_rate": 1.6865905640826892e-07, "logits/chosen": 2.454516649246216, "logits/rejected": 2.9765355587005615, "logps/chosen": -32.04822540283203, "logps/rejected": -307.1372375488281, "loss": 0.5579, "nll_loss": 0.5169068574905396, "rewards/accuracies": 1.0, "rewards/chosen": 1.347328543663025, "rewards/margins": 5.0561699867248535, "rewards/rejected": -3.708841323852539, "step": 1688 }, { "epoch": 0.2815, "grad_norm": 22.996252059936523, "learning_rate": 1.6861980109817668e-07, "logits/chosen": 2.6723313331604004, "logits/rejected": 2.7727651596069336, "logps/chosen": -121.5401611328125, "logps/rejected": -52.75804138183594, "loss": 1.0051, "nll_loss": 0.9881313443183899, "rewards/accuracies": 1.0, "rewards/chosen": 2.3773820400238037, "rewards/margins": 6.510307312011719, "rewards/rejected": -4.132925510406494, "step": 1689 }, { "epoch": 0.2816666666666667, "grad_norm": 59.28502655029297, "learning_rate": 1.685805257939218e-07, "logits/chosen": 3.015660524368286, "logits/rejected": 2.884908437728882, "logps/chosen": -74.7975082397461, "logps/rejected": -83.07978057861328, "loss": 1.1248, "nll_loss": 1.010777235031128, "rewards/accuracies": 1.0, "rewards/chosen": 0.22004471719264984, "rewards/margins": 3.431020975112915, "rewards/rejected": -3.2109763622283936, "step": 1690 }, { "epoch": 0.2818333333333333, "grad_norm": 38.374290466308594, "learning_rate": 1.6854123050694815e-07, "logits/chosen": 2.5956318378448486, "logits/rejected": 2.6641528606414795, "logps/chosen": -25.4527645111084, "logps/rejected": -195.55392456054688, "loss": 0.6662, "nll_loss": 0.62079918384552, "rewards/accuracies": 1.0, "rewards/chosen": 0.9634687900543213, "rewards/margins": 5.5972089767456055, "rewards/rejected": -4.633740425109863, "step": 1691 }, { "epoch": 0.282, "grad_norm": 68.72625732421875, "learning_rate": 1.6850191524870546e-07, "logits/chosen": 1.3328709602355957, "logits/rejected": 1.8849420547485352, "logps/chosen": -27.99083709716797, "logps/rejected": -201.94688415527344, "loss": 0.9183, "nll_loss": 0.9029302000999451, "rewards/accuracies": 1.0, "rewards/chosen": 1.9602017402648926, "rewards/margins": 10.655986785888672, "rewards/rejected": -8.695784568786621, "step": 1692 }, { "epoch": 0.2821666666666667, "grad_norm": 71.65819549560547, "learning_rate": 1.6846258003064923e-07, "logits/chosen": 3.051393747329712, "logits/rejected": 3.12272310256958, "logps/chosen": -59.496665954589844, "logps/rejected": -152.18878173828125, "loss": 1.0897, "nll_loss": 0.9014647006988525, "rewards/accuracies": 1.0, "rewards/chosen": 0.64199298620224, "rewards/margins": 2.2873756885528564, "rewards/rejected": -1.6453827619552612, "step": 1693 }, { "epoch": 0.2823333333333333, "grad_norm": 34.08808517456055, "learning_rate": 1.684232248642408e-07, "logits/chosen": 0.5440531969070435, "logits/rejected": 1.3578954935073853, "logps/chosen": -92.35093688964844, "logps/rejected": -371.1695251464844, "loss": 1.2347, "nll_loss": 1.2313460111618042, "rewards/accuracies": 1.0, "rewards/chosen": 3.5088775157928467, "rewards/margins": 14.930505752563477, "rewards/rejected": -11.42162799835205, "step": 1694 }, { "epoch": 0.2825, "grad_norm": 40.627418518066406, "learning_rate": 1.6838384976094736e-07, "logits/chosen": 0.6786738634109497, "logits/rejected": 1.6824595928192139, "logps/chosen": -39.95072937011719, "logps/rejected": -299.2511291503906, "loss": 0.8683, "nll_loss": 0.8500153422355652, "rewards/accuracies": 1.0, "rewards/chosen": 1.875618815422058, "rewards/margins": 7.4962029457092285, "rewards/rejected": -5.620584011077881, "step": 1695 }, { "epoch": 0.2826666666666667, "grad_norm": 45.48847579956055, "learning_rate": 1.6834445473224182e-07, "logits/chosen": 1.4002480506896973, "logits/rejected": 1.3233485221862793, "logps/chosen": -125.18923950195312, "logps/rejected": -73.5609130859375, "loss": 1.1447, "nll_loss": 1.0261414051055908, "rewards/accuracies": 1.0, "rewards/chosen": 1.5036927461624146, "rewards/margins": 3.120499610900879, "rewards/rejected": -1.6168068647384644, "step": 1696 }, { "epoch": 0.2828333333333333, "grad_norm": 37.362464904785156, "learning_rate": 1.6830503978960296e-07, "logits/chosen": 3.015691041946411, "logits/rejected": 3.1335151195526123, "logps/chosen": -9.937764167785645, "logps/rejected": -145.43661499023438, "loss": 0.4558, "nll_loss": 0.39751049876213074, "rewards/accuracies": 1.0, "rewards/chosen": 1.0331538915634155, "rewards/margins": 4.391411781311035, "rewards/rejected": -3.358258008956909, "step": 1697 }, { "epoch": 0.283, "grad_norm": 24.577335357666016, "learning_rate": 1.6826560494451535e-07, "logits/chosen": 1.5511119365692139, "logits/rejected": 2.052116632461548, "logps/chosen": -57.29486846923828, "logps/rejected": -186.29347229003906, "loss": 0.6557, "nll_loss": 0.6227703094482422, "rewards/accuracies": 1.0, "rewards/chosen": 1.1682442426681519, "rewards/margins": 7.867859840393066, "rewards/rejected": -6.699615478515625, "step": 1698 }, { "epoch": 0.2831666666666667, "grad_norm": 22.275428771972656, "learning_rate": 1.6822615020846937e-07, "logits/chosen": 2.584080219268799, "logits/rejected": 2.7087109088897705, "logps/chosen": -55.65985870361328, "logps/rejected": -124.55399322509766, "loss": 0.6248, "nll_loss": 0.6116467714309692, "rewards/accuracies": 1.0, "rewards/chosen": 2.4785964488983154, "rewards/margins": 7.134401321411133, "rewards/rejected": -4.6558051109313965, "step": 1699 }, { "epoch": 0.2833333333333333, "grad_norm": 29.687458038330078, "learning_rate": 1.681866755929612e-07, "logits/chosen": 1.4336413145065308, "logits/rejected": 3.198228597640991, "logps/chosen": -57.9234733581543, "logps/rejected": -514.4413452148438, "loss": 0.8082, "nll_loss": 0.7934720516204834, "rewards/accuracies": 1.0, "rewards/chosen": 2.0154857635498047, "rewards/margins": 9.173505783081055, "rewards/rejected": -7.15802001953125, "step": 1700 }, { "epoch": 0.2835, "grad_norm": 23.05150032043457, "learning_rate": 1.6814718110949272e-07, "logits/chosen": 1.2703403234481812, "logits/rejected": 1.5508619546890259, "logps/chosen": -84.94634246826172, "logps/rejected": -275.45880126953125, "loss": 0.9024, "nll_loss": 0.8941721320152283, "rewards/accuracies": 1.0, "rewards/chosen": 2.5927348136901855, "rewards/margins": 12.787019729614258, "rewards/rejected": -10.194284439086914, "step": 1701 }, { "epoch": 0.2836666666666667, "grad_norm": 31.86588478088379, "learning_rate": 1.6810766676957172e-07, "logits/chosen": 1.9771018028259277, "logits/rejected": 2.021456241607666, "logps/chosen": -115.50027465820312, "logps/rejected": -147.13134765625, "loss": 1.1395, "nll_loss": 1.1213618516921997, "rewards/accuracies": 1.0, "rewards/chosen": 1.8651535511016846, "rewards/margins": 7.631936073303223, "rewards/rejected": -5.766782283782959, "step": 1702 }, { "epoch": 0.2838333333333333, "grad_norm": 28.5582218170166, "learning_rate": 1.6806813258471176e-07, "logits/chosen": 1.6133993864059448, "logits/rejected": 2.447244167327881, "logps/chosen": -66.9531478881836, "logps/rejected": -418.20037841796875, "loss": 0.9954, "nll_loss": 0.9846050143241882, "rewards/accuracies": 1.0, "rewards/chosen": 2.3050835132598877, "rewards/margins": 12.904309272766113, "rewards/rejected": -10.599225997924805, "step": 1703 }, { "epoch": 0.284, "grad_norm": 21.158798217773438, "learning_rate": 1.6802857856643214e-07, "logits/chosen": 2.8128581047058105, "logits/rejected": 2.8721165657043457, "logps/chosen": -59.34111785888672, "logps/rejected": -161.7551727294922, "loss": 0.6536, "nll_loss": 0.6450120806694031, "rewards/accuracies": 1.0, "rewards/chosen": 2.863144636154175, "rewards/margins": 8.000750541687012, "rewards/rejected": -5.137606143951416, "step": 1704 }, { "epoch": 0.2841666666666667, "grad_norm": 43.21793746948242, "learning_rate": 1.679890047262579e-07, "logits/chosen": 2.124483346939087, "logits/rejected": 2.5909810066223145, "logps/chosen": -45.34760284423828, "logps/rejected": -591.3421020507812, "loss": 1.2226, "nll_loss": 1.193358063697815, "rewards/accuracies": 1.0, "rewards/chosen": 1.2629567384719849, "rewards/margins": 13.051787376403809, "rewards/rejected": -11.788830757141113, "step": 1705 }, { "epoch": 0.2843333333333333, "grad_norm": 132.0463104248047, "learning_rate": 1.6794941107571995e-07, "logits/chosen": 2.9733200073242188, "logits/rejected": 2.8718724250793457, "logps/chosen": -53.512821197509766, "logps/rejected": -34.24074935913086, "loss": 1.8826, "nll_loss": 0.9729604125022888, "rewards/accuracies": 1.0, "rewards/chosen": 1.2705364227294922, "rewards/margins": 0.13305580615997314, "rewards/rejected": 1.137480616569519, "step": 1706 }, { "epoch": 0.2845, "grad_norm": 50.45050811767578, "learning_rate": 1.6790979762635495e-07, "logits/chosen": 3.3052845001220703, "logits/rejected": 3.1558923721313477, "logps/chosen": -74.07738494873047, "logps/rejected": -88.9502182006836, "loss": 0.9854, "nll_loss": 0.9497100710868835, "rewards/accuracies": 1.0, "rewards/chosen": 1.3391671180725098, "rewards/margins": 5.536226272583008, "rewards/rejected": -4.197059154510498, "step": 1707 }, { "epoch": 0.2846666666666667, "grad_norm": 38.65908432006836, "learning_rate": 1.6787016438970524e-07, "logits/chosen": 2.937049150466919, "logits/rejected": 2.9383323192596436, "logps/chosen": -65.41831970214844, "logps/rejected": -82.89012908935547, "loss": 0.9638, "nll_loss": 0.884031355381012, "rewards/accuracies": 1.0, "rewards/chosen": 1.1016868352890015, "rewards/margins": 3.6723995208740234, "rewards/rejected": -2.5707128047943115, "step": 1708 }, { "epoch": 0.2848333333333333, "grad_norm": 27.6468505859375, "learning_rate": 1.6783051137731907e-07, "logits/chosen": 1.862699270248413, "logits/rejected": 1.9527699947357178, "logps/chosen": -65.38648223876953, "logps/rejected": -166.92501831054688, "loss": 0.7402, "nll_loss": 0.7265164852142334, "rewards/accuracies": 1.0, "rewards/chosen": 2.258071184158325, "rewards/margins": 7.465139389038086, "rewards/rejected": -5.20706844329834, "step": 1709 }, { "epoch": 0.285, "grad_norm": 157.6632537841797, "learning_rate": 1.6779083860075033e-07, "logits/chosen": 2.195552349090576, "logits/rejected": 2.208183765411377, "logps/chosen": -34.0070915222168, "logps/rejected": -84.58750915527344, "loss": 1.5886, "nll_loss": 0.8501772880554199, "rewards/accuracies": 1.0, "rewards/chosen": 0.4748958647251129, "rewards/margins": 0.28630030155181885, "rewards/rejected": 0.18859557807445526, "step": 1710 }, { "epoch": 0.2851666666666667, "grad_norm": 178.49713134765625, "learning_rate": 1.677511460715587e-07, "logits/chosen": 2.421905994415283, "logits/rejected": 2.4815123081207275, "logps/chosen": -47.13709259033203, "logps/rejected": -77.80351257324219, "loss": 2.4185, "nll_loss": 0.961981475353241, "rewards/accuracies": 0.0, "rewards/chosen": 2.770881175994873, "rewards/margins": -0.34554338455200195, "rewards/rejected": 3.116424560546875, "step": 1711 }, { "epoch": 0.2853333333333333, "grad_norm": 96.38177490234375, "learning_rate": 1.6771143380130965e-07, "logits/chosen": 2.6007912158966064, "logits/rejected": 2.7198851108551025, "logps/chosen": -14.434614181518555, "logps/rejected": -197.9451904296875, "loss": 1.0861, "nll_loss": 1.0310437679290771, "rewards/accuracies": 1.0, "rewards/chosen": 1.1771944761276245, "rewards/margins": 4.40818977355957, "rewards/rejected": -3.2309951782226562, "step": 1712 }, { "epoch": 0.2855, "grad_norm": 33.41116714477539, "learning_rate": 1.676717018015744e-07, "logits/chosen": 2.5415384769439697, "logits/rejected": 2.699519157409668, "logps/chosen": -81.73429870605469, "logps/rejected": -164.61065673828125, "loss": 1.0385, "nll_loss": 0.9967597126960754, "rewards/accuracies": 1.0, "rewards/chosen": 1.1158599853515625, "rewards/margins": 5.429002285003662, "rewards/rejected": -4.3131422996521, "step": 1713 }, { "epoch": 0.2856666666666667, "grad_norm": 654.0830688476562, "learning_rate": 1.676319500839299e-07, "logits/chosen": 2.8381693363189697, "logits/rejected": 2.944363832473755, "logps/chosen": -279.06402587890625, "logps/rejected": -284.88958740234375, "loss": 4.5513, "nll_loss": 1.120739221572876, "rewards/accuracies": 0.0, "rewards/chosen": -2.3674073219299316, "rewards/margins": -3.3329286575317383, "rewards/rejected": 0.9655212759971619, "step": 1714 }, { "epoch": 0.28583333333333333, "grad_norm": 22.50624656677246, "learning_rate": 1.6759217865995883e-07, "logits/chosen": 3.3531298637390137, "logits/rejected": 3.659943103790283, "logps/chosen": -56.338253021240234, "logps/rejected": -445.3042907714844, "loss": 0.6896, "nll_loss": 0.6550959944725037, "rewards/accuracies": 1.0, "rewards/chosen": 1.0817791223526, "rewards/margins": 12.46662425994873, "rewards/rejected": -11.384844779968262, "step": 1715 }, { "epoch": 0.286, "grad_norm": 113.14012908935547, "learning_rate": 1.6755238754124962e-07, "logits/chosen": 3.1136903762817383, "logits/rejected": 3.28495192527771, "logps/chosen": -84.10572814941406, "logps/rejected": -85.07179260253906, "loss": 2.5056, "nll_loss": 2.2133090496063232, "rewards/accuracies": 1.0, "rewards/chosen": -0.7342033982276917, "rewards/margins": 1.8789920806884766, "rewards/rejected": -2.6131954193115234, "step": 1716 }, { "epoch": 0.2861666666666667, "grad_norm": 91.84949493408203, "learning_rate": 1.6751257673939647e-07, "logits/chosen": 1.2277761697769165, "logits/rejected": 2.024092435836792, "logps/chosen": -25.448400497436523, "logps/rejected": -197.03359985351562, "loss": 0.854, "nll_loss": 0.6206926703453064, "rewards/accuracies": 1.0, "rewards/chosen": 0.7194158434867859, "rewards/margins": 1.9829137325286865, "rewards/rejected": -1.2634979486465454, "step": 1717 }, { "epoch": 0.28633333333333333, "grad_norm": 29.93385124206543, "learning_rate": 1.6747274626599927e-07, "logits/chosen": 2.840407371520996, "logits/rejected": 2.7895209789276123, "logps/chosen": -63.248661041259766, "logps/rejected": -47.362342834472656, "loss": 0.8453, "nll_loss": 0.7808476090431213, "rewards/accuracies": 1.0, "rewards/chosen": 1.2699558734893799, "rewards/margins": 4.036820411682129, "rewards/rejected": -2.766864538192749, "step": 1718 }, { "epoch": 0.2865, "grad_norm": 20.50391387939453, "learning_rate": 1.6743289613266368e-07, "logits/chosen": 2.6403536796569824, "logits/rejected": 2.799448013305664, "logps/chosen": -69.20284271240234, "logps/rejected": -192.25257873535156, "loss": 0.6676, "nll_loss": 0.6654120087623596, "rewards/accuracies": 1.0, "rewards/chosen": 4.065959453582764, "rewards/margins": 11.167962074279785, "rewards/rejected": -7.1020026206970215, "step": 1719 }, { "epoch": 0.2866666666666667, "grad_norm": 45.92304611206055, "learning_rate": 1.6739302635100107e-07, "logits/chosen": 2.3649356365203857, "logits/rejected": 2.5401241779327393, "logps/chosen": -39.30332565307617, "logps/rejected": -154.4636688232422, "loss": 1.0047, "nll_loss": 0.9586178064346313, "rewards/accuracies": 1.0, "rewards/chosen": 1.0490955114364624, "rewards/margins": 5.147061824798584, "rewards/rejected": -4.097966194152832, "step": 1720 }, { "epoch": 0.28683333333333333, "grad_norm": 15.235758781433105, "learning_rate": 1.6735313693262853e-07, "logits/chosen": 2.257228136062622, "logits/rejected": 2.178417921066284, "logps/chosen": -183.25418090820312, "logps/rejected": -204.00534057617188, "loss": 0.7054, "nll_loss": 0.6967840194702148, "rewards/accuracies": 1.0, "rewards/chosen": 2.7801272869110107, "rewards/margins": 8.17895793914795, "rewards/rejected": -5.398830413818359, "step": 1721 }, { "epoch": 0.287, "grad_norm": 59.779544830322266, "learning_rate": 1.673132278891689e-07, "logits/chosen": 0.9108428359031677, "logits/rejected": 2.56815505027771, "logps/chosen": -30.466880798339844, "logps/rejected": -344.32025146484375, "loss": 1.0536, "nll_loss": 1.015562653541565, "rewards/accuracies": 1.0, "rewards/chosen": 1.3561128377914429, "rewards/margins": 5.257352352142334, "rewards/rejected": -3.9012393951416016, "step": 1722 }, { "epoch": 0.2871666666666667, "grad_norm": 46.087074279785156, "learning_rate": 1.6727329923225065e-07, "logits/chosen": 2.2915635108947754, "logits/rejected": 2.2354648113250732, "logps/chosen": -69.68260192871094, "logps/rejected": -30.257164001464844, "loss": 1.1337, "nll_loss": 0.9814450740814209, "rewards/accuracies": 1.0, "rewards/chosen": 1.2146896123886108, "rewards/margins": 2.68891978263855, "rewards/rejected": -1.474230170249939, "step": 1723 }, { "epoch": 0.28733333333333333, "grad_norm": 25.514652252197266, "learning_rate": 1.6723335097350812e-07, "logits/chosen": 1.7441203594207764, "logits/rejected": 2.6703901290893555, "logps/chosen": -64.1063461303711, "logps/rejected": -232.73890686035156, "loss": 0.7927, "nll_loss": 0.7817847728729248, "rewards/accuracies": 1.0, "rewards/chosen": 2.468625783920288, "rewards/margins": 7.976214408874512, "rewards/rejected": -5.5075883865356445, "step": 1724 }, { "epoch": 0.2875, "grad_norm": 53.001953125, "learning_rate": 1.6719338312458123e-07, "logits/chosen": 2.7592339515686035, "logits/rejected": 2.7128851413726807, "logps/chosen": -64.09024047851562, "logps/rejected": -58.581111907958984, "loss": 0.8707, "nll_loss": 0.8112689852714539, "rewards/accuracies": 1.0, "rewards/chosen": 0.7111802101135254, "rewards/margins": 4.900552749633789, "rewards/rejected": -4.189372539520264, "step": 1725 }, { "epoch": 0.2876666666666667, "grad_norm": 25.20195770263672, "learning_rate": 1.6715339569711565e-07, "logits/chosen": 3.0711441040039062, "logits/rejected": 3.0842974185943604, "logps/chosen": -93.79618835449219, "logps/rejected": -111.49894714355469, "loss": 0.871, "nll_loss": 0.8450105786323547, "rewards/accuracies": 1.0, "rewards/chosen": 2.263430118560791, "rewards/margins": 5.641751289367676, "rewards/rejected": -3.3783211708068848, "step": 1726 }, { "epoch": 0.28783333333333333, "grad_norm": 34.17949676513672, "learning_rate": 1.671133887027628e-07, "logits/chosen": 2.6559107303619385, "logits/rejected": 2.6264939308166504, "logps/chosen": -193.28390502929688, "logps/rejected": -55.688270568847656, "loss": 1.4229, "nll_loss": 1.3708078861236572, "rewards/accuracies": 1.0, "rewards/chosen": 0.7368988394737244, "rewards/margins": 5.712728500366211, "rewards/rejected": -4.975829601287842, "step": 1727 }, { "epoch": 0.288, "grad_norm": 155.18048095703125, "learning_rate": 1.6707336215317965e-07, "logits/chosen": 2.1393935680389404, "logits/rejected": 2.264033555984497, "logps/chosen": -17.030323028564453, "logps/rejected": -58.08529281616211, "loss": 1.7564, "nll_loss": 1.7030322551727295, "rewards/accuracies": 1.0, "rewards/chosen": 1.0249477624893188, "rewards/margins": 4.643104553222656, "rewards/rejected": -3.618156671524048, "step": 1728 }, { "epoch": 0.2881666666666667, "grad_norm": 32.05028533935547, "learning_rate": 1.670333160600291e-07, "logits/chosen": 1.1058765649795532, "logits/rejected": 1.7357550859451294, "logps/chosen": -107.78007507324219, "logps/rejected": -270.27813720703125, "loss": 1.3665, "nll_loss": 1.3472506999969482, "rewards/accuracies": 1.0, "rewards/chosen": 1.7251427173614502, "rewards/margins": 8.750237464904785, "rewards/rejected": -7.025094509124756, "step": 1729 }, { "epoch": 0.28833333333333333, "grad_norm": 111.5307846069336, "learning_rate": 1.6699325043497953e-07, "logits/chosen": 3.4239747524261475, "logits/rejected": 3.900503158569336, "logps/chosen": -38.192054748535156, "logps/rejected": -36.649085998535156, "loss": 1.8725, "nll_loss": 1.591335654258728, "rewards/accuracies": 1.0, "rewards/chosen": 0.18598595261573792, "rewards/margins": 1.6757628917694092, "rewards/rejected": -1.4897769689559937, "step": 1730 }, { "epoch": 0.2885, "grad_norm": 40.37852478027344, "learning_rate": 1.6695316528970516e-07, "logits/chosen": 1.368366003036499, "logits/rejected": 2.792941093444824, "logps/chosen": -31.474496841430664, "logps/rejected": -346.9790954589844, "loss": 1.001, "nll_loss": 0.983578085899353, "rewards/accuracies": 1.0, "rewards/chosen": 2.418886423110962, "rewards/margins": 6.425670623779297, "rewards/rejected": -4.006783962249756, "step": 1731 }, { "epoch": 0.2886666666666667, "grad_norm": 114.5565185546875, "learning_rate": 1.669130606358858e-07, "logits/chosen": 1.736876368522644, "logits/rejected": 1.6959036588668823, "logps/chosen": -111.63966369628906, "logps/rejected": -123.18853759765625, "loss": 1.6519, "nll_loss": 1.162913203239441, "rewards/accuracies": 1.0, "rewards/chosen": 1.7326462268829346, "rewards/margins": 1.265241265296936, "rewards/rejected": 0.46740496158599854, "step": 1732 }, { "epoch": 0.28883333333333333, "grad_norm": 35.210205078125, "learning_rate": 1.6687293648520702e-07, "logits/chosen": 2.7278223037719727, "logits/rejected": 3.067946434020996, "logps/chosen": -21.614595413208008, "logps/rejected": -484.4951477050781, "loss": 0.6055, "nll_loss": 0.5688051581382751, "rewards/accuracies": 1.0, "rewards/chosen": 1.4445966482162476, "rewards/margins": 5.240131855010986, "rewards/rejected": -3.7955353260040283, "step": 1733 }, { "epoch": 0.289, "grad_norm": 70.17967224121094, "learning_rate": 1.6683279284936002e-07, "logits/chosen": 2.7456464767456055, "logits/rejected": 2.6012210845947266, "logps/chosen": -32.25336837768555, "logps/rejected": -67.5820541381836, "loss": 1.1626, "nll_loss": 0.977374792098999, "rewards/accuracies": 1.0, "rewards/chosen": -0.5160161852836609, "rewards/margins": 2.9024064540863037, "rewards/rejected": -3.4184226989746094, "step": 1734 }, { "epoch": 0.2891666666666667, "grad_norm": 42.09278106689453, "learning_rate": 1.6679262974004166e-07, "logits/chosen": 2.6865382194519043, "logits/rejected": 2.0093367099761963, "logps/chosen": -47.345733642578125, "logps/rejected": -29.031932830810547, "loss": 1.1293, "nll_loss": 1.0521273612976074, "rewards/accuracies": 1.0, "rewards/chosen": 2.77107572555542, "rewards/margins": 4.4026408195495605, "rewards/rejected": -1.631564974784851, "step": 1735 }, { "epoch": 0.28933333333333333, "grad_norm": 20.865970611572266, "learning_rate": 1.6675244716895453e-07, "logits/chosen": 2.206308126449585, "logits/rejected": 2.5742299556732178, "logps/chosen": -131.73373413085938, "logps/rejected": -362.9288330078125, "loss": 0.8279, "nll_loss": 0.8182218670845032, "rewards/accuracies": 1.0, "rewards/chosen": 2.701796054840088, "rewards/margins": 7.854621887207031, "rewards/rejected": -5.152825832366943, "step": 1736 }, { "epoch": 0.2895, "grad_norm": 71.87481689453125, "learning_rate": 1.667122451478069e-07, "logits/chosen": 2.2559008598327637, "logits/rejected": 2.1250505447387695, "logps/chosen": -27.936826705932617, "logps/rejected": -58.11686706542969, "loss": 0.9341, "nll_loss": 0.735179603099823, "rewards/accuracies": 1.0, "rewards/chosen": 1.2787585258483887, "rewards/margins": 2.3438844680786133, "rewards/rejected": -1.0651260614395142, "step": 1737 }, { "epoch": 0.2896666666666667, "grad_norm": 268.6536560058594, "learning_rate": 1.666720236883126e-07, "logits/chosen": 2.767446756362915, "logits/rejected": 2.6824772357940674, "logps/chosen": -171.06341552734375, "logps/rejected": -239.10601806640625, "loss": 2.1892, "nll_loss": 1.3260730504989624, "rewards/accuracies": 1.0, "rewards/chosen": -1.9250701665878296, "rewards/margins": 0.12003791332244873, "rewards/rejected": -2.0451080799102783, "step": 1738 }, { "epoch": 0.28983333333333333, "grad_norm": 26.413536071777344, "learning_rate": 1.6663178280219127e-07, "logits/chosen": 1.801650047302246, "logits/rejected": 1.632477879524231, "logps/chosen": -153.98379516601562, "logps/rejected": -93.04981994628906, "loss": 1.2068, "nll_loss": 1.16654372215271, "rewards/accuracies": 1.0, "rewards/chosen": 1.8886597156524658, "rewards/margins": 4.848423480987549, "rewards/rejected": -2.959763765335083, "step": 1739 }, { "epoch": 0.29, "grad_norm": 27.84084129333496, "learning_rate": 1.6659152250116811e-07, "logits/chosen": 1.092509388923645, "logits/rejected": 2.246257781982422, "logps/chosen": -80.68267822265625, "logps/rejected": -237.3951873779297, "loss": 0.9073, "nll_loss": 0.876985490322113, "rewards/accuracies": 1.0, "rewards/chosen": 1.2671570777893066, "rewards/margins": 7.2669453620910645, "rewards/rejected": -5.999788284301758, "step": 1740 }, { "epoch": 0.2901666666666667, "grad_norm": 21.775238037109375, "learning_rate": 1.6655124279697399e-07, "logits/chosen": 2.587170362472534, "logits/rejected": 2.6854615211486816, "logps/chosen": -140.6553192138672, "logps/rejected": -159.4339599609375, "loss": 0.9176, "nll_loss": 0.9074536561965942, "rewards/accuracies": 1.0, "rewards/chosen": 2.585797071456909, "rewards/margins": 7.916180610656738, "rewards/rejected": -5.33038330078125, "step": 1741 }, { "epoch": 0.29033333333333333, "grad_norm": 36.92793273925781, "learning_rate": 1.6651094370134546e-07, "logits/chosen": 2.5428686141967773, "logits/rejected": 2.3883094787597656, "logps/chosen": -29.521902084350586, "logps/rejected": -156.39639282226562, "loss": 0.5555, "nll_loss": 0.4406254291534424, "rewards/accuracies": 1.0, "rewards/chosen": 1.9328211545944214, "rewards/margins": 3.3603525161743164, "rewards/rejected": -1.4275314807891846, "step": 1742 }, { "epoch": 0.2905, "grad_norm": 30.929582595825195, "learning_rate": 1.664706252260247e-07, "logits/chosen": 2.749427318572998, "logits/rejected": 2.535458564758301, "logps/chosen": -190.25338745117188, "logps/rejected": -57.54006576538086, "loss": 1.3886, "nll_loss": 1.34931480884552, "rewards/accuracies": 1.0, "rewards/chosen": 1.0399506092071533, "rewards/margins": 6.200960159301758, "rewards/rejected": -5.161009311676025, "step": 1743 }, { "epoch": 0.2906666666666667, "grad_norm": 161.7628936767578, "learning_rate": 1.6643028738275958e-07, "logits/chosen": 2.8403828144073486, "logits/rejected": 2.9816248416900635, "logps/chosen": -30.516115188598633, "logps/rejected": -27.0706729888916, "loss": 3.2645, "nll_loss": 0.4694787263870239, "rewards/accuracies": 0.0, "rewards/chosen": 1.3265209197998047, "rewards/margins": -2.3045103549957275, "rewards/rejected": 3.6310312747955322, "step": 1744 }, { "epoch": 0.29083333333333333, "grad_norm": 36.44843673706055, "learning_rate": 1.6638993018330357e-07, "logits/chosen": 2.4947447776794434, "logits/rejected": 2.4561214447021484, "logps/chosen": -97.57005310058594, "logps/rejected": -108.5997085571289, "loss": 1.1092, "nll_loss": 1.0721983909606934, "rewards/accuracies": 1.0, "rewards/chosen": 1.1994705200195312, "rewards/margins": 5.767578601837158, "rewards/rejected": -4.568108081817627, "step": 1745 }, { "epoch": 0.291, "grad_norm": 77.2651596069336, "learning_rate": 1.6634955363941572e-07, "logits/chosen": 2.4572291374206543, "logits/rejected": 2.7077534198760986, "logps/chosen": -58.422420501708984, "logps/rejected": -281.3023681640625, "loss": 2.1285, "nll_loss": 2.086514949798584, "rewards/accuracies": 1.0, "rewards/chosen": 1.676127314567566, "rewards/margins": 4.781256198883057, "rewards/rejected": -3.105128765106201, "step": 1746 }, { "epoch": 0.2911666666666667, "grad_norm": 31.66712760925293, "learning_rate": 1.6630915776286087e-07, "logits/chosen": 1.3779269456863403, "logits/rejected": 2.378965139389038, "logps/chosen": -44.68281173706055, "logps/rejected": -157.94927978515625, "loss": 0.7627, "nll_loss": 0.732505202293396, "rewards/accuracies": 1.0, "rewards/chosen": 1.5325055122375488, "rewards/margins": 5.760058403015137, "rewards/rejected": -4.227552890777588, "step": 1747 }, { "epoch": 0.29133333333333333, "grad_norm": 30.700437545776367, "learning_rate": 1.6626874256540938e-07, "logits/chosen": 1.5164097547531128, "logits/rejected": 2.502161741256714, "logps/chosen": -12.140894889831543, "logps/rejected": -365.717041015625, "loss": 0.3698, "nll_loss": 0.30352237820625305, "rewards/accuracies": 1.0, "rewards/chosen": 0.34440430998802185, "rewards/margins": 8.047541618347168, "rewards/rejected": -7.703137397766113, "step": 1748 }, { "epoch": 0.2915, "grad_norm": 32.50913619995117, "learning_rate": 1.6622830805883727e-07, "logits/chosen": 2.9696171283721924, "logits/rejected": 2.988175868988037, "logps/chosen": -74.56304931640625, "logps/rejected": -255.14759826660156, "loss": 1.0211, "nll_loss": 0.9941740036010742, "rewards/accuracies": 1.0, "rewards/chosen": 1.4078491926193237, "rewards/margins": 7.2225422859191895, "rewards/rejected": -5.814692974090576, "step": 1749 }, { "epoch": 0.2916666666666667, "grad_norm": 50.9244384765625, "learning_rate": 1.6618785425492616e-07, "logits/chosen": 1.7027559280395508, "logits/rejected": 2.8906798362731934, "logps/chosen": -21.046674728393555, "logps/rejected": -271.2542724609375, "loss": 0.861, "nll_loss": 0.8418669700622559, "rewards/accuracies": 1.0, "rewards/chosen": 2.1075599193573, "rewards/margins": 6.410856246948242, "rewards/rejected": -4.303296089172363, "step": 1750 }, { "epoch": 0.29183333333333333, "grad_norm": 653.8436279296875, "learning_rate": 1.6614738116546335e-07, "logits/chosen": 0.061832837760448456, "logits/rejected": 1.5533151626586914, "logps/chosen": -169.4820556640625, "logps/rejected": -643.0768432617188, "loss": 6.1262, "nll_loss": 2.017643451690674, "rewards/accuracies": 0.0, "rewards/chosen": -3.6198806762695312, "rewards/margins": -4.022279262542725, "rewards/rejected": 0.4023986756801605, "step": 1751 }, { "epoch": 0.292, "grad_norm": 32.80065155029297, "learning_rate": 1.6610688880224175e-07, "logits/chosen": 1.2943415641784668, "logits/rejected": 1.6126705408096313, "logps/chosen": -124.3744888305664, "logps/rejected": -114.55579376220703, "loss": 0.933, "nll_loss": 0.8637115955352783, "rewards/accuracies": 1.0, "rewards/chosen": 1.9117134809494019, "rewards/margins": 4.030110836029053, "rewards/rejected": -2.1183974742889404, "step": 1752 }, { "epoch": 0.2921666666666667, "grad_norm": 35.14418029785156, "learning_rate": 1.6606637717705981e-07, "logits/chosen": 2.444761276245117, "logits/rejected": 3.0417630672454834, "logps/chosen": -118.20203399658203, "logps/rejected": -357.1854248046875, "loss": 1.3486, "nll_loss": 1.3432049751281738, "rewards/accuracies": 1.0, "rewards/chosen": 3.1943931579589844, "rewards/margins": 9.278707504272461, "rewards/rejected": -6.084313869476318, "step": 1753 }, { "epoch": 0.29233333333333333, "grad_norm": 20.391279220581055, "learning_rate": 1.6602584630172164e-07, "logits/chosen": 2.153517961502075, "logits/rejected": 2.492321729660034, "logps/chosen": -69.11599731445312, "logps/rejected": -242.05126953125, "loss": 0.5607, "nll_loss": 0.5357829332351685, "rewards/accuracies": 1.0, "rewards/chosen": 1.5230858325958252, "rewards/margins": 6.955727577209473, "rewards/rejected": -5.432641506195068, "step": 1754 }, { "epoch": 0.2925, "grad_norm": 38.25238037109375, "learning_rate": 1.6598529618803698e-07, "logits/chosen": 2.5778050422668457, "logits/rejected": 2.6275370121002197, "logps/chosen": -106.19902038574219, "logps/rejected": -164.46636962890625, "loss": 1.2104, "nll_loss": 1.1419249773025513, "rewards/accuracies": 1.0, "rewards/chosen": 0.9577797651290894, "rewards/margins": 4.032296180725098, "rewards/rejected": -3.0745162963867188, "step": 1755 }, { "epoch": 0.2926666666666667, "grad_norm": 35.823360443115234, "learning_rate": 1.659447268478212e-07, "logits/chosen": 3.8910956382751465, "logits/rejected": 4.0207624435424805, "logps/chosen": -107.73359680175781, "logps/rejected": -309.367919921875, "loss": 1.2874, "nll_loss": 1.2527161836624146, "rewards/accuracies": 1.0, "rewards/chosen": 1.3863191604614258, "rewards/margins": 5.513095378875732, "rewards/rejected": -4.126776218414307, "step": 1756 }, { "epoch": 0.29283333333333333, "grad_norm": 27.708131790161133, "learning_rate": 1.6590413829289518e-07, "logits/chosen": 2.2550244331359863, "logits/rejected": 2.128485679626465, "logps/chosen": -19.46822166442871, "logps/rejected": -162.96475219726562, "loss": 0.4882, "nll_loss": 0.4527492821216583, "rewards/accuracies": 1.0, "rewards/chosen": 1.2519292831420898, "rewards/margins": 5.78122615814209, "rewards/rejected": -4.529296875, "step": 1757 }, { "epoch": 0.293, "grad_norm": 165.3036651611328, "learning_rate": 1.6586353053508545e-07, "logits/chosen": 3.1396710872650146, "logits/rejected": 3.158491373062134, "logps/chosen": -69.43006134033203, "logps/rejected": -43.872840881347656, "loss": 2.7467, "nll_loss": 0.8265482783317566, "rewards/accuracies": 0.0, "rewards/chosen": 2.9423351287841797, "rewards/margins": -0.9401745796203613, "rewards/rejected": 3.882509708404541, "step": 1758 }, { "epoch": 0.2931666666666667, "grad_norm": 224.6240692138672, "learning_rate": 1.6582290358622418e-07, "logits/chosen": 3.132526159286499, "logits/rejected": 3.1310408115386963, "logps/chosen": -80.27835083007812, "logps/rejected": -33.99189376831055, "loss": 4.9933, "nll_loss": 1.0997034311294556, "rewards/accuracies": 0.0, "rewards/chosen": 0.24751512706279755, "rewards/margins": -3.6299967765808105, "rewards/rejected": 3.877511978149414, "step": 1759 }, { "epoch": 0.29333333333333333, "grad_norm": 85.07373809814453, "learning_rate": 1.6578225745814906e-07, "logits/chosen": 2.517338514328003, "logits/rejected": 2.1963632106781006, "logps/chosen": -246.975830078125, "logps/rejected": -85.28045654296875, "loss": 1.4423, "nll_loss": 1.2410846948623657, "rewards/accuracies": 1.0, "rewards/chosen": -0.2954559326171875, "rewards/margins": 2.424206018447876, "rewards/rejected": -2.7196619510650635, "step": 1760 }, { "epoch": 0.2935, "grad_norm": 24.037059783935547, "learning_rate": 1.657415921627034e-07, "logits/chosen": 2.785775661468506, "logits/rejected": 2.7674636840820312, "logps/chosen": -155.765869140625, "logps/rejected": -150.15768432617188, "loss": 1.0285, "nll_loss": 1.0049411058425903, "rewards/accuracies": 1.0, "rewards/chosen": 1.6688523292541504, "rewards/margins": 6.571133613586426, "rewards/rejected": -4.902281284332275, "step": 1761 }, { "epoch": 0.2936666666666667, "grad_norm": 71.44384765625, "learning_rate": 1.6570090771173607e-07, "logits/chosen": 3.196127414703369, "logits/rejected": 3.3614466190338135, "logps/chosen": -75.68675231933594, "logps/rejected": -51.49125289916992, "loss": 1.3921, "nll_loss": 0.8504128456115723, "rewards/accuracies": 1.0, "rewards/chosen": 2.7658233642578125, "rewards/margins": 1.5884231328964233, "rewards/rejected": 1.1774002313613892, "step": 1762 }, { "epoch": 0.29383333333333334, "grad_norm": 97.10344696044922, "learning_rate": 1.6566020411710158e-07, "logits/chosen": 3.5922482013702393, "logits/rejected": 3.6878788471221924, "logps/chosen": -34.98452377319336, "logps/rejected": -36.95283126831055, "loss": 1.6809, "nll_loss": 1.4576884508132935, "rewards/accuracies": 1.0, "rewards/chosen": 0.50673907995224, "rewards/margins": 2.026890516281128, "rewards/rejected": -1.5201514959335327, "step": 1763 }, { "epoch": 0.294, "grad_norm": 207.49386596679688, "learning_rate": 1.6561948139065995e-07, "logits/chosen": 2.657538652420044, "logits/rejected": 2.912590503692627, "logps/chosen": -68.7017822265625, "logps/rejected": -104.39151000976562, "loss": 2.07, "nll_loss": 0.7082657814025879, "rewards/accuracies": 0.0, "rewards/chosen": 0.7356811761856079, "rewards/margins": -0.7019668817520142, "rewards/rejected": 1.437648057937622, "step": 1764 }, { "epoch": 0.2941666666666667, "grad_norm": 383.25775146484375, "learning_rate": 1.655787395442768e-07, "logits/chosen": 3.55655574798584, "logits/rejected": 3.6372222900390625, "logps/chosen": -106.96087646484375, "logps/rejected": -93.96337890625, "loss": 4.0128, "nll_loss": 2.3252363204956055, "rewards/accuracies": 0.0, "rewards/chosen": -2.7423653602600098, "rewards/margins": -1.2000747919082642, "rewards/rejected": -1.5422905683517456, "step": 1765 }, { "epoch": 0.29433333333333334, "grad_norm": 22.479217529296875, "learning_rate": 1.6553797858982338e-07, "logits/chosen": 2.811307668685913, "logits/rejected": 2.8627564907073975, "logps/chosen": -94.99383544921875, "logps/rejected": -262.1453857421875, "loss": 0.7883, "nll_loss": 0.7723075151443481, "rewards/accuracies": 1.0, "rewards/chosen": 2.0660042762756348, "rewards/margins": 7.271572589874268, "rewards/rejected": -5.205568313598633, "step": 1766 }, { "epoch": 0.2945, "grad_norm": 25.299524307250977, "learning_rate": 1.6549719853917638e-07, "logits/chosen": 2.0626614093780518, "logits/rejected": 2.605774402618408, "logps/chosen": -94.92597198486328, "logps/rejected": -520.883544921875, "loss": 0.958, "nll_loss": 0.9492598176002502, "rewards/accuracies": 1.0, "rewards/chosen": 2.521841526031494, "rewards/margins": 10.979623794555664, "rewards/rejected": -8.457781791687012, "step": 1767 }, { "epoch": 0.2946666666666667, "grad_norm": 77.10700225830078, "learning_rate": 1.654563994042182e-07, "logits/chosen": 1.853859543800354, "logits/rejected": 2.139033555984497, "logps/chosen": -92.74850463867188, "logps/rejected": -148.14822387695312, "loss": 1.2309, "nll_loss": 1.1740316152572632, "rewards/accuracies": 1.0, "rewards/chosen": 0.6217986941337585, "rewards/margins": 5.6403961181640625, "rewards/rejected": -5.018597602844238, "step": 1768 }, { "epoch": 0.29483333333333334, "grad_norm": 49.56184387207031, "learning_rate": 1.6541558119683666e-07, "logits/chosen": 2.8509607315063477, "logits/rejected": 3.084482431411743, "logps/chosen": -173.96572875976562, "logps/rejected": -548.4425048828125, "loss": 1.5095, "nll_loss": 1.4497145414352417, "rewards/accuracies": 1.0, "rewards/chosen": 1.4533876180648804, "rewards/margins": 4.150372505187988, "rewards/rejected": -2.6969850063323975, "step": 1769 }, { "epoch": 0.295, "grad_norm": 28.15606689453125, "learning_rate": 1.6537474392892526e-07, "logits/chosen": 1.652221918106079, "logits/rejected": 1.8706486225128174, "logps/chosen": -68.99285888671875, "logps/rejected": -591.69677734375, "loss": 0.9642, "nll_loss": 0.945107638835907, "rewards/accuracies": 1.0, "rewards/chosen": 1.699426293373108, "rewards/margins": 12.270163536071777, "rewards/rejected": -10.5707368850708, "step": 1770 }, { "epoch": 0.2951666666666667, "grad_norm": 90.55575561523438, "learning_rate": 1.6533388761238296e-07, "logits/chosen": 3.0617151260375977, "logits/rejected": 3.006573438644409, "logps/chosen": -17.837514877319336, "logps/rejected": -199.8939666748047, "loss": 0.6721, "nll_loss": 0.6370541453361511, "rewards/accuracies": 1.0, "rewards/chosen": 1.5329378843307495, "rewards/margins": 5.268129348754883, "rewards/rejected": -3.735191583633423, "step": 1771 }, { "epoch": 0.29533333333333334, "grad_norm": 26.055801391601562, "learning_rate": 1.652930122591143e-07, "logits/chosen": 2.7666258811950684, "logits/rejected": 3.067218542098999, "logps/chosen": -61.991729736328125, "logps/rejected": -234.53428649902344, "loss": 0.648, "nll_loss": 0.6137794852256775, "rewards/accuracies": 1.0, "rewards/chosen": 1.2562179565429688, "rewards/margins": 5.975709438323975, "rewards/rejected": -4.719491481781006, "step": 1772 }, { "epoch": 0.2955, "grad_norm": 23.86830711364746, "learning_rate": 1.6525211788102944e-07, "logits/chosen": 2.8914270401000977, "logits/rejected": 3.0787312984466553, "logps/chosen": -52.804664611816406, "logps/rejected": -257.3271484375, "loss": 0.6704, "nll_loss": 0.6600582003593445, "rewards/accuracies": 1.0, "rewards/chosen": 2.347557783126831, "rewards/margins": 10.448171615600586, "rewards/rejected": -8.100613594055176, "step": 1773 }, { "epoch": 0.2956666666666667, "grad_norm": 29.404144287109375, "learning_rate": 1.6521120449004398e-07, "logits/chosen": 2.5310890674591064, "logits/rejected": 2.4487783908843994, "logps/chosen": -80.7087173461914, "logps/rejected": -60.226905822753906, "loss": 1.0021, "nll_loss": 0.9723939299583435, "rewards/accuracies": 1.0, "rewards/chosen": 2.263502597808838, "rewards/margins": 5.415972709655762, "rewards/rejected": -3.152470350265503, "step": 1774 }, { "epoch": 0.29583333333333334, "grad_norm": 42.70379638671875, "learning_rate": 1.651702720980791e-07, "logits/chosen": 3.047914743423462, "logits/rejected": 3.023730516433716, "logps/chosen": -201.58956909179688, "logps/rejected": -177.84864807128906, "loss": 1.5711, "nll_loss": 1.5271939039230347, "rewards/accuracies": 1.0, "rewards/chosen": 0.953460693359375, "rewards/margins": 5.7145586013793945, "rewards/rejected": -4.7610979080200195, "step": 1775 }, { "epoch": 0.296, "grad_norm": 41.759552001953125, "learning_rate": 1.651293207170615e-07, "logits/chosen": 2.0149455070495605, "logits/rejected": 2.591933488845825, "logps/chosen": -33.44459915161133, "logps/rejected": -162.45184326171875, "loss": 0.6523, "nll_loss": 0.5766310095787048, "rewards/accuracies": 1.0, "rewards/chosen": 0.1846698820590973, "rewards/margins": 7.224881649017334, "rewards/rejected": -7.0402116775512695, "step": 1776 }, { "epoch": 0.2961666666666667, "grad_norm": 48.93367004394531, "learning_rate": 1.650883503589235e-07, "logits/chosen": 3.4346578121185303, "logits/rejected": 3.5297179222106934, "logps/chosen": -69.5982894897461, "logps/rejected": -282.7225036621094, "loss": 1.0535, "nll_loss": 0.9534013271331787, "rewards/accuracies": 1.0, "rewards/chosen": -0.16249237954616547, "rewards/margins": 6.946942329406738, "rewards/rejected": -7.109434604644775, "step": 1777 }, { "epoch": 0.29633333333333334, "grad_norm": 27.375003814697266, "learning_rate": 1.6504736103560277e-07, "logits/chosen": 2.4467849731445312, "logits/rejected": 1.6250345706939697, "logps/chosen": -66.30538940429688, "logps/rejected": -35.71839141845703, "loss": 0.8856, "nll_loss": 0.7988601326942444, "rewards/accuracies": 1.0, "rewards/chosen": 1.7006020545959473, "rewards/margins": 3.6368117332458496, "rewards/rejected": -1.9362096786499023, "step": 1778 }, { "epoch": 0.2965, "grad_norm": 29.45733070373535, "learning_rate": 1.650063527590427e-07, "logits/chosen": 2.3963117599487305, "logits/rejected": 2.4305362701416016, "logps/chosen": -48.36455535888672, "logps/rejected": -188.44287109375, "loss": 0.6921, "nll_loss": 0.6535751223564148, "rewards/accuracies": 1.0, "rewards/chosen": 1.1784828901290894, "rewards/margins": 5.562656879425049, "rewards/rejected": -4.38417387008667, "step": 1779 }, { "epoch": 0.2966666666666667, "grad_norm": 25.01803970336914, "learning_rate": 1.6496532554119212e-07, "logits/chosen": 0.9715522527694702, "logits/rejected": 2.6038572788238525, "logps/chosen": -26.286808013916016, "logps/rejected": -408.28082275390625, "loss": 0.543, "nll_loss": 0.5257360935211182, "rewards/accuracies": 1.0, "rewards/chosen": 1.8130435943603516, "rewards/margins": 9.201627731323242, "rewards/rejected": -7.388584136962891, "step": 1780 }, { "epoch": 0.29683333333333334, "grad_norm": 12.186022758483887, "learning_rate": 1.6492427939400527e-07, "logits/chosen": 1.7705535888671875, "logits/rejected": 1.7545232772827148, "logps/chosen": -150.21353149414062, "logps/rejected": -213.4172821044922, "loss": 0.5513, "nll_loss": 0.5403363704681396, "rewards/accuracies": 1.0, "rewards/chosen": 2.4127776622772217, "rewards/margins": 8.135416030883789, "rewards/rejected": -5.722638130187988, "step": 1781 }, { "epoch": 0.297, "grad_norm": 28.718961715698242, "learning_rate": 1.6488321432944218e-07, "logits/chosen": 2.1069188117980957, "logits/rejected": 2.30572772026062, "logps/chosen": -97.42721557617188, "logps/rejected": -237.73191833496094, "loss": 0.996, "nll_loss": 0.9551687836647034, "rewards/accuracies": 1.0, "rewards/chosen": 0.8791465759277344, "rewards/margins": 9.028850555419922, "rewards/rejected": -8.149703979492188, "step": 1782 }, { "epoch": 0.2971666666666667, "grad_norm": 115.51355743408203, "learning_rate": 1.6484213035946806e-07, "logits/chosen": 3.271718740463257, "logits/rejected": 3.3157434463500977, "logps/chosen": -45.234169006347656, "logps/rejected": -57.06972885131836, "loss": 1.5919, "nll_loss": 0.6031222939491272, "rewards/accuracies": 1.0, "rewards/chosen": 2.024333953857422, "rewards/margins": 0.21345853805541992, "rewards/rejected": 1.810875415802002, "step": 1783 }, { "epoch": 0.29733333333333334, "grad_norm": 30.882719039916992, "learning_rate": 1.648010274960539e-07, "logits/chosen": 1.6841628551483154, "logits/rejected": 2.240598440170288, "logps/chosen": -76.21969604492188, "logps/rejected": -153.76063537597656, "loss": 0.9562, "nll_loss": 0.9409838914871216, "rewards/accuracies": 1.0, "rewards/chosen": 2.160845994949341, "rewards/margins": 7.160534858703613, "rewards/rejected": -4.999688625335693, "step": 1784 }, { "epoch": 0.2975, "grad_norm": 20.548316955566406, "learning_rate": 1.6475990575117603e-07, "logits/chosen": 2.3552441596984863, "logits/rejected": 2.2716803550720215, "logps/chosen": -131.58950805664062, "logps/rejected": -113.8141860961914, "loss": 0.5666, "nll_loss": 0.524260938167572, "rewards/accuracies": 1.0, "rewards/chosen": 1.392816185951233, "rewards/margins": 4.879403114318848, "rewards/rejected": -3.486586809158325, "step": 1785 }, { "epoch": 0.2976666666666667, "grad_norm": 28.67367172241211, "learning_rate": 1.6471876513681636e-07, "logits/chosen": 2.3636345863342285, "logits/rejected": 2.6241211891174316, "logps/chosen": -116.1573715209961, "logps/rejected": -528.265869140625, "loss": 1.1011, "nll_loss": 1.0855828523635864, "rewards/accuracies": 1.0, "rewards/chosen": 1.904244303703308, "rewards/margins": 14.061326026916504, "rewards/rejected": -12.157081604003906, "step": 1786 }, { "epoch": 0.29783333333333334, "grad_norm": 39.82246780395508, "learning_rate": 1.6467760566496227e-07, "logits/chosen": 2.8590211868286133, "logits/rejected": 2.8285610675811768, "logps/chosen": -79.27761840820312, "logps/rejected": -43.9992561340332, "loss": 1.0552, "nll_loss": 0.8617134094238281, "rewards/accuracies": 1.0, "rewards/chosen": 1.6612052917480469, "rewards/margins": 2.5440945625305176, "rewards/rejected": -0.8828892707824707, "step": 1787 }, { "epoch": 0.298, "grad_norm": 26.880203247070312, "learning_rate": 1.6463642734760668e-07, "logits/chosen": 1.7991578578948975, "logits/rejected": 2.162371873855591, "logps/chosen": -81.76260375976562, "logps/rejected": -147.44564819335938, "loss": 0.7449, "nll_loss": 0.6870806813240051, "rewards/accuracies": 1.0, "rewards/chosen": 0.9681045413017273, "rewards/margins": 4.448028564453125, "rewards/rejected": -3.479923963546753, "step": 1788 }, { "epoch": 0.2981666666666667, "grad_norm": 50.68985366821289, "learning_rate": 1.645952301967479e-07, "logits/chosen": 2.6165831089019775, "logits/rejected": 2.639209508895874, "logps/chosen": -64.51143646240234, "logps/rejected": -96.00189971923828, "loss": 0.9402, "nll_loss": 0.7964375615119934, "rewards/accuracies": 1.0, "rewards/chosen": 1.2139908075332642, "rewards/margins": 2.7743759155273438, "rewards/rejected": -1.5603851079940796, "step": 1789 }, { "epoch": 0.29833333333333334, "grad_norm": 85.26800537109375, "learning_rate": 1.6455401422438983e-07, "logits/chosen": 0.8460131883621216, "logits/rejected": 1.869430422782898, "logps/chosen": -115.84773254394531, "logps/rejected": -298.8453369140625, "loss": 1.5017, "nll_loss": 1.3016598224639893, "rewards/accuracies": 1.0, "rewards/chosen": -1.110913872718811, "rewards/margins": 5.076074123382568, "rewards/rejected": -6.18698787689209, "step": 1790 }, { "epoch": 0.2985, "grad_norm": 30.66898536682129, "learning_rate": 1.6451277944254182e-07, "logits/chosen": 2.7669858932495117, "logits/rejected": 2.7861223220825195, "logps/chosen": -51.7647819519043, "logps/rejected": -98.03683471679688, "loss": 0.6908, "nll_loss": 0.6019161343574524, "rewards/accuracies": 1.0, "rewards/chosen": 2.0454976558685303, "rewards/margins": 3.767697811126709, "rewards/rejected": -1.7222000360488892, "step": 1791 }, { "epoch": 0.2986666666666667, "grad_norm": 33.379024505615234, "learning_rate": 1.6447152586321868e-07, "logits/chosen": 2.9743568897247314, "logits/rejected": 3.1274287700653076, "logps/chosen": -78.70227813720703, "logps/rejected": -255.96974182128906, "loss": 1.143, "nll_loss": 1.1243181228637695, "rewards/accuracies": 1.0, "rewards/chosen": 1.7520508766174316, "rewards/margins": 8.22175407409668, "rewards/rejected": -6.469703674316406, "step": 1792 }, { "epoch": 0.29883333333333334, "grad_norm": 44.642250061035156, "learning_rate": 1.6443025349844076e-07, "logits/chosen": 2.24670672416687, "logits/rejected": 2.609996795654297, "logps/chosen": -30.916427612304688, "logps/rejected": -161.1198272705078, "loss": 1.1496, "nll_loss": 1.1450529098510742, "rewards/accuracies": 1.0, "rewards/chosen": 3.394953966140747, "rewards/margins": 9.379267692565918, "rewards/rejected": -5.98431396484375, "step": 1793 }, { "epoch": 0.299, "grad_norm": 54.632938385009766, "learning_rate": 1.6438896236023374e-07, "logits/chosen": 2.7137320041656494, "logits/rejected": 2.878314256668091, "logps/chosen": -96.69480895996094, "logps/rejected": -321.7127990722656, "loss": 1.2644, "nll_loss": 1.0625803470611572, "rewards/accuracies": 1.0, "rewards/chosen": -1.1658210754394531, "rewards/margins": 6.877246856689453, "rewards/rejected": -8.043067932128906, "step": 1794 }, { "epoch": 0.2991666666666667, "grad_norm": 54.00825881958008, "learning_rate": 1.6434765246062892e-07, "logits/chosen": 2.171962022781372, "logits/rejected": 2.297161817550659, "logps/chosen": -22.23577880859375, "logps/rejected": -40.14369583129883, "loss": 0.8612, "nll_loss": 0.7667509913444519, "rewards/accuracies": 1.0, "rewards/chosen": 0.9392982721328735, "rewards/margins": 3.3894729614257812, "rewards/rejected": -2.450174570083618, "step": 1795 }, { "epoch": 0.29933333333333334, "grad_norm": 38.60243606567383, "learning_rate": 1.6430632381166304e-07, "logits/chosen": 3.0435280799865723, "logits/rejected": 3.0061726570129395, "logps/chosen": -84.5399398803711, "logps/rejected": -120.43883514404297, "loss": 1.1767, "nll_loss": 1.1271991729736328, "rewards/accuracies": 1.0, "rewards/chosen": 1.6533546447753906, "rewards/margins": 4.478854179382324, "rewards/rejected": -2.8254997730255127, "step": 1796 }, { "epoch": 0.2995, "grad_norm": 20.80921173095703, "learning_rate": 1.6426497642537824e-07, "logits/chosen": 1.6961619853973389, "logits/rejected": 1.4667383432388306, "logps/chosen": -213.62208557128906, "logps/rejected": -205.06423950195312, "loss": 1.1904, "nll_loss": 1.1802324056625366, "rewards/accuracies": 1.0, "rewards/chosen": 2.4497604370117188, "rewards/margins": 8.422910690307617, "rewards/rejected": -5.973150730133057, "step": 1797 }, { "epoch": 0.2996666666666667, "grad_norm": 43.22372817993164, "learning_rate": 1.6422361031382215e-07, "logits/chosen": 1.9032421112060547, "logits/rejected": 1.7762118577957153, "logps/chosen": -70.94071197509766, "logps/rejected": -28.485576629638672, "loss": 1.0535, "nll_loss": 0.9334303140640259, "rewards/accuracies": 1.0, "rewards/chosen": 1.485595703125, "rewards/margins": 3.1126325130462646, "rewards/rejected": -1.6270368099212646, "step": 1798 }, { "epoch": 0.29983333333333334, "grad_norm": 140.01434326171875, "learning_rate": 1.6418222548904787e-07, "logits/chosen": 2.099560022354126, "logits/rejected": 1.693006992340088, "logps/chosen": -80.593994140625, "logps/rejected": -27.697874069213867, "loss": 1.5695, "nll_loss": 0.9828534722328186, "rewards/accuracies": 1.0, "rewards/chosen": -0.6158668398857117, "rewards/margins": 0.5702325701713562, "rewards/rejected": -1.1860994100570679, "step": 1799 }, { "epoch": 0.3, "grad_norm": 46.09706115722656, "learning_rate": 1.64140821963114e-07, "logits/chosen": 2.5825307369232178, "logits/rejected": 2.6289961338043213, "logps/chosen": -63.500179290771484, "logps/rejected": -97.487060546875, "loss": 0.9069, "nll_loss": 0.7839527726173401, "rewards/accuracies": 1.0, "rewards/chosen": 1.3151166439056396, "rewards/margins": 3.0240180492401123, "rewards/rejected": -1.7089014053344727, "step": 1800 }, { "epoch": 0.3001666666666667, "grad_norm": 31.842178344726562, "learning_rate": 1.6409939974808445e-07, "logits/chosen": 2.1523780822753906, "logits/rejected": 3.11841082572937, "logps/chosen": -83.48236083984375, "logps/rejected": -42.98979568481445, "loss": 0.9633, "nll_loss": 0.9173887372016907, "rewards/accuracies": 1.0, "rewards/chosen": 1.6691659688949585, "rewards/margins": 4.6129608154296875, "rewards/rejected": -2.9437947273254395, "step": 1801 }, { "epoch": 0.30033333333333334, "grad_norm": 28.140539169311523, "learning_rate": 1.6405795885602868e-07, "logits/chosen": 1.0452932119369507, "logits/rejected": 2.724851131439209, "logps/chosen": -63.43250274658203, "logps/rejected": -236.43905639648438, "loss": 0.8476, "nll_loss": 0.8237988352775574, "rewards/accuracies": 1.0, "rewards/chosen": 1.7604074478149414, "rewards/margins": 6.18703556060791, "rewards/rejected": -4.426628112792969, "step": 1802 }, { "epoch": 0.3005, "grad_norm": 38.754669189453125, "learning_rate": 1.640164992990216e-07, "logits/chosen": 3.2609994411468506, "logits/rejected": 3.2501888275146484, "logps/chosen": -112.45829772949219, "logps/rejected": -159.30413818359375, "loss": 1.1462, "nll_loss": 1.0918282270431519, "rewards/accuracies": 1.0, "rewards/chosen": 0.6266143918037415, "rewards/margins": 5.959097862243652, "rewards/rejected": -5.332483291625977, "step": 1803 }, { "epoch": 0.3006666666666667, "grad_norm": 34.7126579284668, "learning_rate": 1.6397502108914353e-07, "logits/chosen": 1.2130427360534668, "logits/rejected": 1.6553421020507812, "logps/chosen": -6.836577892303467, "logps/rejected": -153.7568359375, "loss": 0.3966, "nll_loss": 0.31075355410575867, "rewards/accuracies": 1.0, "rewards/chosen": 0.9067937731742859, "rewards/margins": 3.5720884799957275, "rewards/rejected": -2.665294647216797, "step": 1804 }, { "epoch": 0.30083333333333334, "grad_norm": 28.99087142944336, "learning_rate": 1.6393352423848015e-07, "logits/chosen": 1.7431702613830566, "logits/rejected": 1.7234139442443848, "logps/chosen": -53.34920120239258, "logps/rejected": -79.3929443359375, "loss": 0.8129, "nll_loss": 0.7845471501350403, "rewards/accuracies": 1.0, "rewards/chosen": 2.0499050617218018, "rewards/margins": 5.486427307128906, "rewards/rejected": -3.4365222454071045, "step": 1805 }, { "epoch": 0.301, "grad_norm": 27.319774627685547, "learning_rate": 1.6389200875912276e-07, "logits/chosen": 1.1826043128967285, "logits/rejected": 1.608401894569397, "logps/chosen": -85.32637786865234, "logps/rejected": -276.0802001953125, "loss": 0.8446, "nll_loss": 0.7974428534507751, "rewards/accuracies": 1.0, "rewards/chosen": 0.80105060338974, "rewards/margins": 6.030132293701172, "rewards/rejected": -5.229081630706787, "step": 1806 }, { "epoch": 0.3011666666666667, "grad_norm": 30.342918395996094, "learning_rate": 1.6385047466316788e-07, "logits/chosen": 2.314218044281006, "logits/rejected": 2.3836076259613037, "logps/chosen": -45.49370193481445, "logps/rejected": -45.61819076538086, "loss": 0.649, "nll_loss": 0.5832526087760925, "rewards/accuracies": 1.0, "rewards/chosen": 1.2689881324768066, "rewards/margins": 3.9941651821136475, "rewards/rejected": -2.725177049636841, "step": 1807 }, { "epoch": 0.30133333333333334, "grad_norm": 60.18303298950195, "learning_rate": 1.6380892196271762e-07, "logits/chosen": 3.6276516914367676, "logits/rejected": 3.5122456550598145, "logps/chosen": -175.377685546875, "logps/rejected": -178.22769165039062, "loss": 1.1481, "nll_loss": 1.0316334962844849, "rewards/accuracies": 1.0, "rewards/chosen": -0.37840884923934937, "rewards/margins": 7.277867794036865, "rewards/rejected": -7.656276702880859, "step": 1808 }, { "epoch": 0.3015, "grad_norm": 97.41088104248047, "learning_rate": 1.6376735066987938e-07, "logits/chosen": 2.4297797679901123, "logits/rejected": 2.6064765453338623, "logps/chosen": -30.78173828125, "logps/rejected": -453.5662841796875, "loss": 2.1841, "nll_loss": 2.0521159172058105, "rewards/accuracies": 1.0, "rewards/chosen": -0.5477170944213867, "rewards/margins": 7.102774620056152, "rewards/rejected": -7.650491714477539, "step": 1809 }, { "epoch": 0.3016666666666667, "grad_norm": 21.7927188873291, "learning_rate": 1.6372576079676606e-07, "logits/chosen": 1.2575597763061523, "logits/rejected": 1.9278267621994019, "logps/chosen": -62.902976989746094, "logps/rejected": -289.16796875, "loss": 0.6614, "nll_loss": 0.635383665561676, "rewards/accuracies": 1.0, "rewards/chosen": 1.55028235912323, "rewards/margins": 6.348299026489258, "rewards/rejected": -4.798016548156738, "step": 1810 }, { "epoch": 0.30183333333333334, "grad_norm": 28.838993072509766, "learning_rate": 1.6368415235549596e-07, "logits/chosen": 0.8237008452415466, "logits/rejected": 2.4498913288116455, "logps/chosen": -85.61148834228516, "logps/rejected": -317.14923095703125, "loss": 1.0412, "nll_loss": 1.0191842317581177, "rewards/accuracies": 1.0, "rewards/chosen": 1.5652817487716675, "rewards/margins": 8.100993156433105, "rewards/rejected": -6.535711765289307, "step": 1811 }, { "epoch": 0.302, "grad_norm": 266.8643798828125, "learning_rate": 1.6364252535819282e-07, "logits/chosen": 1.9667701721191406, "logits/rejected": 2.2322745323181152, "logps/chosen": -61.10062026977539, "logps/rejected": -51.147666931152344, "loss": 4.6392, "nll_loss": 0.803955614566803, "rewards/accuracies": 0.0, "rewards/chosen": 1.379581093788147, "rewards/margins": -3.381606101989746, "rewards/rejected": 4.7611870765686035, "step": 1812 }, { "epoch": 0.30216666666666664, "grad_norm": 60.22600555419922, "learning_rate": 1.6360087981698564e-07, "logits/chosen": 2.9841578006744385, "logits/rejected": 2.9085476398468018, "logps/chosen": -39.29414367675781, "logps/rejected": -31.431459426879883, "loss": 1.6016, "nll_loss": 1.5717657804489136, "rewards/accuracies": 1.0, "rewards/chosen": 3.542903423309326, "rewards/margins": 6.178380012512207, "rewards/rejected": -2.6354763507843018, "step": 1813 }, { "epoch": 0.30233333333333334, "grad_norm": 25.684640884399414, "learning_rate": 1.6355921574400902e-07, "logits/chosen": 2.8547139167785645, "logits/rejected": 2.9719011783599854, "logps/chosen": -50.44551086425781, "logps/rejected": -232.2571563720703, "loss": 0.6505, "nll_loss": 0.6305688619613647, "rewards/accuracies": 1.0, "rewards/chosen": 2.106393575668335, "rewards/margins": 6.26480770111084, "rewards/rejected": -4.158413887023926, "step": 1814 }, { "epoch": 0.3025, "grad_norm": 44.84699249267578, "learning_rate": 1.6351753315140285e-07, "logits/chosen": 1.6899317502975464, "logits/rejected": 2.1261038780212402, "logps/chosen": -63.52169418334961, "logps/rejected": -165.2853240966797, "loss": 1.4918, "nll_loss": 1.443674921989441, "rewards/accuracies": 1.0, "rewards/chosen": 0.7839291095733643, "rewards/margins": 5.9175615310668945, "rewards/rejected": -5.133632659912109, "step": 1815 }, { "epoch": 0.30266666666666664, "grad_norm": 25.073429107666016, "learning_rate": 1.634758320513124e-07, "logits/chosen": 2.309075355529785, "logits/rejected": 2.237125873565674, "logps/chosen": -39.40113067626953, "logps/rejected": -49.11016082763672, "loss": 0.6009, "nll_loss": 0.5397416353225708, "rewards/accuracies": 1.0, "rewards/chosen": 1.0453919172286987, "rewards/margins": 4.220282077789307, "rewards/rejected": -3.1748902797698975, "step": 1816 }, { "epoch": 0.30283333333333334, "grad_norm": 40.59684371948242, "learning_rate": 1.6343411245588842e-07, "logits/chosen": 1.6191672086715698, "logits/rejected": 2.4311747550964355, "logps/chosen": -33.095977783203125, "logps/rejected": -220.5992431640625, "loss": 0.7986, "nll_loss": 0.7521812319755554, "rewards/accuracies": 1.0, "rewards/chosen": 0.7162651419639587, "rewards/margins": 10.20997142791748, "rewards/rejected": -9.493706703186035, "step": 1817 }, { "epoch": 0.303, "grad_norm": 30.358530044555664, "learning_rate": 1.63392374377287e-07, "logits/chosen": 3.1004927158355713, "logits/rejected": 3.016742467880249, "logps/chosen": -89.26457214355469, "logps/rejected": -121.73701477050781, "loss": 0.9658, "nll_loss": 0.939626932144165, "rewards/accuracies": 1.0, "rewards/chosen": 1.598840355873108, "rewards/margins": 6.141108512878418, "rewards/rejected": -4.5422682762146, "step": 1818 }, { "epoch": 0.30316666666666664, "grad_norm": 70.03262329101562, "learning_rate": 1.6335061782766957e-07, "logits/chosen": 2.170724868774414, "logits/rejected": 2.2577826976776123, "logps/chosen": -42.88118362426758, "logps/rejected": -82.77726745605469, "loss": 1.1599, "nll_loss": 0.9321998357772827, "rewards/accuracies": 1.0, "rewards/chosen": 2.1864986419677734, "rewards/margins": 2.613921642303467, "rewards/rejected": -0.4274230897426605, "step": 1819 }, { "epoch": 0.30333333333333334, "grad_norm": 31.548187255859375, "learning_rate": 1.63308842819203e-07, "logits/chosen": 3.3848729133605957, "logits/rejected": 3.2646007537841797, "logps/chosen": -66.27877807617188, "logps/rejected": -59.38695526123047, "loss": 0.8541, "nll_loss": 0.7618251442909241, "rewards/accuracies": 1.0, "rewards/chosen": 1.9029580354690552, "rewards/margins": 3.6497721672058105, "rewards/rejected": -1.7468141317367554, "step": 1820 }, { "epoch": 0.3035, "grad_norm": 22.317626953125, "learning_rate": 1.6326704936405952e-07, "logits/chosen": 2.922579050064087, "logits/rejected": 3.0806148052215576, "logps/chosen": -92.93863677978516, "logps/rejected": -408.796875, "loss": 0.7439, "nll_loss": 0.7204544544219971, "rewards/accuracies": 1.0, "rewards/chosen": 1.6054468154907227, "rewards/margins": 6.775802135467529, "rewards/rejected": -5.170355319976807, "step": 1821 }, { "epoch": 0.30366666666666664, "grad_norm": 30.154537200927734, "learning_rate": 1.632252374744168e-07, "logits/chosen": 0.5513426661491394, "logits/rejected": 2.2138051986694336, "logps/chosen": -28.70901107788086, "logps/rejected": -407.4551086425781, "loss": 0.5609, "nll_loss": 0.5316482782363892, "rewards/accuracies": 1.0, "rewards/chosen": 1.2245677709579468, "rewards/margins": 9.963913917541504, "rewards/rejected": -8.739346504211426, "step": 1822 }, { "epoch": 0.30383333333333334, "grad_norm": 29.52449607849121, "learning_rate": 1.6318340716245772e-07, "logits/chosen": 1.5552608966827393, "logits/rejected": 2.042647123336792, "logps/chosen": -116.16976165771484, "logps/rejected": -357.94818115234375, "loss": 1.2261, "nll_loss": 1.2101017236709595, "rewards/accuracies": 1.0, "rewards/chosen": 1.855023980140686, "rewards/margins": 11.520139694213867, "rewards/rejected": -9.665115356445312, "step": 1823 }, { "epoch": 0.304, "grad_norm": 29.588550567626953, "learning_rate": 1.631415584403707e-07, "logits/chosen": 1.0249736309051514, "logits/rejected": 2.260040044784546, "logps/chosen": -55.54147720336914, "logps/rejected": -405.8204650878906, "loss": 0.9012, "nll_loss": 0.8816106915473938, "rewards/accuracies": 1.0, "rewards/chosen": 1.7812687158584595, "rewards/margins": 7.140693187713623, "rewards/rejected": -5.359424591064453, "step": 1824 }, { "epoch": 0.30416666666666664, "grad_norm": 20.81627655029297, "learning_rate": 1.6309969132034944e-07, "logits/chosen": 2.159899950027466, "logits/rejected": 2.188413619995117, "logps/chosen": -162.5116729736328, "logps/rejected": -258.90740966796875, "loss": 0.8898, "nll_loss": 0.8690463900566101, "rewards/accuracies": 1.0, "rewards/chosen": 1.6775803565979004, "rewards/margins": 7.3642706871032715, "rewards/rejected": -5.686690330505371, "step": 1825 }, { "epoch": 0.30433333333333334, "grad_norm": 29.756061553955078, "learning_rate": 1.63057805814593e-07, "logits/chosen": 1.4577158689498901, "logits/rejected": 2.14814829826355, "logps/chosen": -38.635650634765625, "logps/rejected": -310.3377685546875, "loss": 0.6403, "nll_loss": 0.6036819815635681, "rewards/accuracies": 1.0, "rewards/chosen": 1.0444172620773315, "rewards/margins": 6.667941093444824, "rewards/rejected": -5.623523712158203, "step": 1826 }, { "epoch": 0.3045, "grad_norm": 37.23548126220703, "learning_rate": 1.6301590193530584e-07, "logits/chosen": 2.8253839015960693, "logits/rejected": 2.891446352005005, "logps/chosen": -24.294755935668945, "logps/rejected": -106.37224578857422, "loss": 0.7208, "nll_loss": 0.6941359043121338, "rewards/accuracies": 1.0, "rewards/chosen": 2.699942111968994, "rewards/margins": 5.749543190002441, "rewards/rejected": -3.0496010780334473, "step": 1827 }, { "epoch": 0.30466666666666664, "grad_norm": 21.642353057861328, "learning_rate": 1.6297397969469772e-07, "logits/chosen": 2.5830371379852295, "logits/rejected": 2.4034738540649414, "logps/chosen": -173.35739135742188, "logps/rejected": -158.4988250732422, "loss": 0.7915, "nll_loss": 0.7570192813873291, "rewards/accuracies": 1.0, "rewards/chosen": 1.3339935541152954, "rewards/margins": 5.585143089294434, "rewards/rejected": -4.251149654388428, "step": 1828 }, { "epoch": 0.30483333333333335, "grad_norm": 30.251787185668945, "learning_rate": 1.6293203910498375e-07, "logits/chosen": 2.0606560707092285, "logits/rejected": 2.3053061962127686, "logps/chosen": -102.10086822509766, "logps/rejected": -170.31878662109375, "loss": 1.1528, "nll_loss": 1.1344541311264038, "rewards/accuracies": 1.0, "rewards/chosen": 2.0538978576660156, "rewards/margins": 6.571895122528076, "rewards/rejected": -4.5179972648620605, "step": 1829 }, { "epoch": 0.305, "grad_norm": 41.565765380859375, "learning_rate": 1.6289008017838445e-07, "logits/chosen": 3.026695728302002, "logits/rejected": 3.4252448081970215, "logps/chosen": -26.414936065673828, "logps/rejected": -242.949951171875, "loss": 0.8586, "nll_loss": 0.8520947098731995, "rewards/accuracies": 1.0, "rewards/chosen": 3.0111160278320312, "rewards/margins": 8.749371528625488, "rewards/rejected": -5.738255500793457, "step": 1830 }, { "epoch": 0.30516666666666664, "grad_norm": 115.05071258544922, "learning_rate": 1.6284810292712563e-07, "logits/chosen": 2.8907289505004883, "logits/rejected": 2.8509206771850586, "logps/chosen": -76.21581268310547, "logps/rejected": -46.15034484863281, "loss": 1.923, "nll_loss": 1.1908721923828125, "rewards/accuracies": 1.0, "rewards/chosen": 2.9491028785705566, "rewards/margins": 1.140546441078186, "rewards/rejected": 1.8085564374923706, "step": 1831 }, { "epoch": 0.30533333333333335, "grad_norm": 29.807355880737305, "learning_rate": 1.6280610736343844e-07, "logits/chosen": 2.399888038635254, "logits/rejected": 2.8723063468933105, "logps/chosen": -58.3387336730957, "logps/rejected": -311.6325988769531, "loss": 0.8503, "nll_loss": 0.8216723203659058, "rewards/accuracies": 1.0, "rewards/chosen": 1.2420543432235718, "rewards/margins": 9.704285621643066, "rewards/rejected": -8.462231636047363, "step": 1832 }, { "epoch": 0.3055, "grad_norm": 116.0958251953125, "learning_rate": 1.6276409349955945e-07, "logits/chosen": 2.2385902404785156, "logits/rejected": 2.014435291290283, "logps/chosen": -51.54228210449219, "logps/rejected": -19.64218521118164, "loss": 1.5897, "nll_loss": 0.6872304677963257, "rewards/accuracies": 1.0, "rewards/chosen": 1.1540580987930298, "rewards/margins": 0.12806224822998047, "rewards/rejected": 1.0259958505630493, "step": 1833 }, { "epoch": 0.30566666666666664, "grad_norm": 37.976680755615234, "learning_rate": 1.627220613477304e-07, "logits/chosen": 1.8781462907791138, "logits/rejected": 2.490910053253174, "logps/chosen": -71.80265045166016, "logps/rejected": -267.313720703125, "loss": 1.0027, "nll_loss": 0.9447718262672424, "rewards/accuracies": 1.0, "rewards/chosen": 0.47515642642974854, "rewards/margins": 7.352189540863037, "rewards/rejected": -6.877033233642578, "step": 1834 }, { "epoch": 0.30583333333333335, "grad_norm": 61.508148193359375, "learning_rate": 1.626800109201985e-07, "logits/chosen": 2.765535593032837, "logits/rejected": 2.794019937515259, "logps/chosen": -98.5662841796875, "logps/rejected": -221.821533203125, "loss": 1.0947, "nll_loss": 1.0713728666305542, "rewards/accuracies": 1.0, "rewards/chosen": 2.007176160812378, "rewards/margins": 5.919751167297363, "rewards/rejected": -3.9125750064849854, "step": 1835 }, { "epoch": 0.306, "grad_norm": 23.604358673095703, "learning_rate": 1.6263794222921617e-07, "logits/chosen": 1.4232906103134155, "logits/rejected": 1.9964841604232788, "logps/chosen": -91.30281066894531, "logps/rejected": -256.95147705078125, "loss": 0.7109, "nll_loss": 0.6864872574806213, "rewards/accuracies": 1.0, "rewards/chosen": 1.4820832014083862, "rewards/margins": 7.305208683013916, "rewards/rejected": -5.82312536239624, "step": 1836 }, { "epoch": 0.30616666666666664, "grad_norm": 29.730867385864258, "learning_rate": 1.6259585528704132e-07, "logits/chosen": 0.9411117434501648, "logits/rejected": 1.5974256992340088, "logps/chosen": -28.411834716796875, "logps/rejected": -238.50140380859375, "loss": 0.6665, "nll_loss": 0.660740315914154, "rewards/accuracies": 1.0, "rewards/chosen": 4.051510810852051, "rewards/margins": 8.640942573547363, "rewards/rejected": -4.5894317626953125, "step": 1837 }, { "epoch": 0.30633333333333335, "grad_norm": 21.783357620239258, "learning_rate": 1.6255375010593702e-07, "logits/chosen": 1.706963062286377, "logits/rejected": 1.4954462051391602, "logps/chosen": -152.11158752441406, "logps/rejected": -98.72405242919922, "loss": 0.9774, "nll_loss": 0.9389604330062866, "rewards/accuracies": 1.0, "rewards/chosen": 3.467604160308838, "rewards/margins": 5.825776100158691, "rewards/rejected": -2.3581719398498535, "step": 1838 }, { "epoch": 0.3065, "grad_norm": 26.683691024780273, "learning_rate": 1.6251162669817168e-07, "logits/chosen": 2.7402184009552, "logits/rejected": 2.82841157913208, "logps/chosen": -83.10518646240234, "logps/rejected": -106.51441955566406, "loss": 0.8735, "nll_loss": 0.839446485042572, "rewards/accuracies": 1.0, "rewards/chosen": 1.2188247442245483, "rewards/margins": 6.0417704582214355, "rewards/rejected": -4.822945594787598, "step": 1839 }, { "epoch": 0.30666666666666664, "grad_norm": 24.807418823242188, "learning_rate": 1.6246948507601913e-07, "logits/chosen": 2.649120569229126, "logits/rejected": 2.7705695629119873, "logps/chosen": -65.2795181274414, "logps/rejected": -241.56565856933594, "loss": 0.7114, "nll_loss": 0.6871526837348938, "rewards/accuracies": 1.0, "rewards/chosen": 1.4347374439239502, "rewards/margins": 8.432740211486816, "rewards/rejected": -6.998002529144287, "step": 1840 }, { "epoch": 0.30683333333333335, "grad_norm": 23.82042121887207, "learning_rate": 1.6242732525175833e-07, "logits/chosen": 3.105316400527954, "logits/rejected": 3.107706308364868, "logps/chosen": -108.68376922607422, "logps/rejected": -130.35287475585938, "loss": 0.8544, "nll_loss": 0.823361873626709, "rewards/accuracies": 1.0, "rewards/chosen": 2.0190086364746094, "rewards/margins": 5.314794540405273, "rewards/rejected": -3.295785903930664, "step": 1841 }, { "epoch": 0.307, "grad_norm": 66.66433715820312, "learning_rate": 1.6238514723767372e-07, "logits/chosen": 2.541487693786621, "logits/rejected": 2.7352817058563232, "logps/chosen": -48.91650390625, "logps/rejected": -87.77047729492188, "loss": 0.977, "nll_loss": 0.764320433139801, "rewards/accuracies": 1.0, "rewards/chosen": 0.8659466505050659, "rewards/margins": 2.1485066413879395, "rewards/rejected": -1.282560110092163, "step": 1842 }, { "epoch": 0.30716666666666664, "grad_norm": 31.409421920776367, "learning_rate": 1.6234295104605492e-07, "logits/chosen": 1.7098232507705688, "logits/rejected": 1.7628530263900757, "logps/chosen": -33.81578826904297, "logps/rejected": -52.91038131713867, "loss": 0.5609, "nll_loss": 0.46966373920440674, "rewards/accuracies": 1.0, "rewards/chosen": 1.0280784368515015, "rewards/margins": 3.439633846282959, "rewards/rejected": -2.411555528640747, "step": 1843 }, { "epoch": 0.30733333333333335, "grad_norm": 25.110212326049805, "learning_rate": 1.623007366891969e-07, "logits/chosen": 2.271975517272949, "logits/rejected": 2.6845901012420654, "logps/chosen": -159.8543243408203, "logps/rejected": -551.43212890625, "loss": 0.9374, "nll_loss": 0.9082630276679993, "rewards/accuracies": 1.0, "rewards/chosen": 1.2153854370117188, "rewards/margins": 15.014586448669434, "rewards/rejected": -13.799201011657715, "step": 1844 }, { "epoch": 0.3075, "grad_norm": 34.802452087402344, "learning_rate": 1.6225850417939988e-07, "logits/chosen": 1.6659016609191895, "logits/rejected": 2.5926742553710938, "logps/chosen": -73.62904357910156, "logps/rejected": -235.66009521484375, "loss": 1.1082, "nll_loss": 1.0989410877227783, "rewards/accuracies": 1.0, "rewards/chosen": 2.879606246948242, "rewards/margins": 7.6847944259643555, "rewards/rejected": -4.805188179016113, "step": 1845 }, { "epoch": 0.30766666666666664, "grad_norm": 31.683698654174805, "learning_rate": 1.6221625352896945e-07, "logits/chosen": 1.4588912725448608, "logits/rejected": 1.298618197441101, "logps/chosen": -70.57855987548828, "logps/rejected": -72.98567199707031, "loss": 0.8951, "nll_loss": 0.8402208685874939, "rewards/accuracies": 1.0, "rewards/chosen": 0.960004448890686, "rewards/margins": 4.579007148742676, "rewards/rejected": -3.6190028190612793, "step": 1846 }, { "epoch": 0.30783333333333335, "grad_norm": 18.59774398803711, "learning_rate": 1.6217398475021642e-07, "logits/chosen": 2.86720871925354, "logits/rejected": 2.7724547386169434, "logps/chosen": -93.6883544921875, "logps/rejected": -99.65762329101562, "loss": 0.661, "nll_loss": 0.6417009830474854, "rewards/accuracies": 1.0, "rewards/chosen": 2.6289138793945312, "rewards/margins": 6.214322566986084, "rewards/rejected": -3.5854086875915527, "step": 1847 }, { "epoch": 0.308, "grad_norm": 45.248313903808594, "learning_rate": 1.6213169785545687e-07, "logits/chosen": 2.4421191215515137, "logits/rejected": 2.6692333221435547, "logps/chosen": -29.185657501220703, "logps/rejected": -158.07696533203125, "loss": 1.0853, "nll_loss": 1.0809502601623535, "rewards/accuracies": 1.0, "rewards/chosen": 3.5680313110351562, "rewards/margins": 9.248059272766113, "rewards/rejected": -5.680027961730957, "step": 1848 }, { "epoch": 0.30816666666666664, "grad_norm": 38.30426788330078, "learning_rate": 1.6208939285701222e-07, "logits/chosen": 2.2981088161468506, "logits/rejected": 2.3709301948547363, "logps/chosen": -64.88436126708984, "logps/rejected": -124.22943115234375, "loss": 1.2158, "nll_loss": 1.2015622854232788, "rewards/accuracies": 1.0, "rewards/chosen": 2.248487949371338, "rewards/margins": 7.143658638000488, "rewards/rejected": -4.89517068862915, "step": 1849 }, { "epoch": 0.30833333333333335, "grad_norm": 31.16347312927246, "learning_rate": 1.6204706976720907e-07, "logits/chosen": 1.4326013326644897, "logits/rejected": 2.6061997413635254, "logps/chosen": -82.13839721679688, "logps/rejected": -264.8094177246094, "loss": 0.936, "nll_loss": 0.9026197791099548, "rewards/accuracies": 1.0, "rewards/chosen": 1.135401964187622, "rewards/margins": 6.867387771606445, "rewards/rejected": -5.731985569000244, "step": 1850 }, { "epoch": 0.3085, "grad_norm": 116.30767059326172, "learning_rate": 1.6200472859837943e-07, "logits/chosen": 3.1975553035736084, "logits/rejected": 3.1547343730926514, "logps/chosen": -25.445476531982422, "logps/rejected": -102.00830078125, "loss": 1.8589, "nll_loss": 1.4967925548553467, "rewards/accuracies": 1.0, "rewards/chosen": 0.6299780607223511, "rewards/margins": 1.3599653244018555, "rewards/rejected": -0.7299873232841492, "step": 1851 }, { "epoch": 0.30866666666666664, "grad_norm": 29.758167266845703, "learning_rate": 1.619623693628605e-07, "logits/chosen": 2.0499398708343506, "logits/rejected": 2.515623092651367, "logps/chosen": -26.048688888549805, "logps/rejected": -203.1134033203125, "loss": 0.5159, "nll_loss": 0.45699450373649597, "rewards/accuracies": 1.0, "rewards/chosen": 0.49259495735168457, "rewards/margins": 6.136944770812988, "rewards/rejected": -5.644349575042725, "step": 1852 }, { "epoch": 0.30883333333333335, "grad_norm": 23.431697845458984, "learning_rate": 1.6191999207299466e-07, "logits/chosen": 2.862170696258545, "logits/rejected": 2.7685158252716064, "logps/chosen": -179.82562255859375, "logps/rejected": -170.90220642089844, "loss": 0.9324, "nll_loss": 0.8991281390190125, "rewards/accuracies": 1.0, "rewards/chosen": 1.382757544517517, "rewards/margins": 5.60066556930542, "rewards/rejected": -4.217907905578613, "step": 1853 }, { "epoch": 0.309, "grad_norm": 64.17150115966797, "learning_rate": 1.618775967411297e-07, "logits/chosen": 3.2973573207855225, "logits/rejected": 3.3530819416046143, "logps/chosen": -46.50980758666992, "logps/rejected": -114.54901123046875, "loss": 1.4527, "nll_loss": 1.0570411682128906, "rewards/accuracies": 1.0, "rewards/chosen": 4.173629283905029, "rewards/margins": 3.184734582901001, "rewards/rejected": 0.9888946413993835, "step": 1854 }, { "epoch": 0.30916666666666665, "grad_norm": 72.91348266601562, "learning_rate": 1.6183518337961862e-07, "logits/chosen": 2.3942928314208984, "logits/rejected": 2.2610926628112793, "logps/chosen": -108.41551208496094, "logps/rejected": -39.23851013183594, "loss": 1.2282, "nll_loss": 0.8213295936584473, "rewards/accuracies": 1.0, "rewards/chosen": 1.677680253982544, "rewards/margins": 1.5339974164962769, "rewards/rejected": 0.14368286728858948, "step": 1855 }, { "epoch": 0.30933333333333335, "grad_norm": 37.24090576171875, "learning_rate": 1.6179275200081958e-07, "logits/chosen": 1.790435552597046, "logits/rejected": 2.180809259414673, "logps/chosen": -56.02663040161133, "logps/rejected": -176.06411743164062, "loss": 1.0217, "nll_loss": 1.0004754066467285, "rewards/accuracies": 1.0, "rewards/chosen": 2.0761075019836426, "rewards/margins": 6.103695392608643, "rewards/rejected": -4.027587890625, "step": 1856 }, { "epoch": 0.3095, "grad_norm": 25.052431106567383, "learning_rate": 1.6175030261709615e-07, "logits/chosen": 2.3761227130889893, "logits/rejected": 2.3395912647247314, "logps/chosen": -56.899925231933594, "logps/rejected": -60.93471145629883, "loss": 0.6019, "nll_loss": 0.5689993500709534, "rewards/accuracies": 1.0, "rewards/chosen": 1.6617584228515625, "rewards/margins": 5.292285442352295, "rewards/rejected": -3.6305270195007324, "step": 1857 }, { "epoch": 0.30966666666666665, "grad_norm": 39.66257095336914, "learning_rate": 1.61707835240817e-07, "logits/chosen": 2.8992905616760254, "logits/rejected": 3.1698474884033203, "logps/chosen": -24.40001678466797, "logps/rejected": -263.2369384765625, "loss": 0.8194, "nll_loss": 0.8133338093757629, "rewards/accuracies": 1.0, "rewards/chosen": 2.9105985164642334, "rewards/margins": 9.8134126663208, "rewards/rejected": -6.902813911437988, "step": 1858 }, { "epoch": 0.30983333333333335, "grad_norm": 22.00333023071289, "learning_rate": 1.616653498843561e-07, "logits/chosen": 1.9315072298049927, "logits/rejected": 2.47375226020813, "logps/chosen": -70.98542785644531, "logps/rejected": -313.82958984375, "loss": 0.6868, "nll_loss": 0.6696738600730896, "rewards/accuracies": 1.0, "rewards/chosen": 1.836410641670227, "rewards/margins": 8.094505310058594, "rewards/rejected": -6.258094787597656, "step": 1859 }, { "epoch": 0.31, "grad_norm": 62.68486785888672, "learning_rate": 1.6162284656009274e-07, "logits/chosen": 1.2612725496292114, "logits/rejected": 2.0037410259246826, "logps/chosen": -25.47067642211914, "logps/rejected": -170.64974975585938, "loss": 1.2421, "nll_loss": 1.1577582359313965, "rewards/accuracies": 1.0, "rewards/chosen": 2.9455056190490723, "rewards/margins": 4.460681915283203, "rewards/rejected": -1.5151764154434204, "step": 1860 }, { "epoch": 0.31016666666666665, "grad_norm": 29.977657318115234, "learning_rate": 1.6158032528041127e-07, "logits/chosen": 2.0421533584594727, "logits/rejected": 1.9967763423919678, "logps/chosen": -62.920860290527344, "logps/rejected": -88.84668731689453, "loss": 0.7721, "nll_loss": 0.683922290802002, "rewards/accuracies": 1.0, "rewards/chosen": 1.5689270496368408, "rewards/margins": 3.5791192054748535, "rewards/rejected": -2.0101921558380127, "step": 1861 }, { "epoch": 0.31033333333333335, "grad_norm": 42.1673583984375, "learning_rate": 1.6153778605770146e-07, "logits/chosen": 2.631185293197632, "logits/rejected": 2.721619129180908, "logps/chosen": -46.85787582397461, "logps/rejected": -372.96734619140625, "loss": 0.9205, "nll_loss": 0.8841108679771423, "rewards/accuracies": 1.0, "rewards/chosen": 1.0110934972763062, "rewards/margins": 7.262082576751709, "rewards/rejected": -6.250988960266113, "step": 1862 }, { "epoch": 0.3105, "grad_norm": 73.490234375, "learning_rate": 1.614952289043581e-07, "logits/chosen": 2.079416513442993, "logits/rejected": 1.8134887218475342, "logps/chosen": -65.27265930175781, "logps/rejected": -30.856882095336914, "loss": 1.235, "nll_loss": 0.8058351874351501, "rewards/accuracies": 1.0, "rewards/chosen": 1.7741470336914062, "rewards/margins": 1.4978078603744507, "rewards/rejected": 0.2763391435146332, "step": 1863 }, { "epoch": 0.31066666666666665, "grad_norm": 34.63467025756836, "learning_rate": 1.6145265383278143e-07, "logits/chosen": 3.7685554027557373, "logits/rejected": 3.755791187286377, "logps/chosen": -52.55231857299805, "logps/rejected": -144.67462158203125, "loss": 0.9142, "nll_loss": 0.8758718967437744, "rewards/accuracies": 1.0, "rewards/chosen": 1.1171997785568237, "rewards/margins": 5.705170631408691, "rewards/rejected": -4.587970733642578, "step": 1864 }, { "epoch": 0.31083333333333335, "grad_norm": 63.21937561035156, "learning_rate": 1.614100608553768e-07, "logits/chosen": 2.1164681911468506, "logits/rejected": 2.955925226211548, "logps/chosen": -61.77920913696289, "logps/rejected": -347.52056884765625, "loss": 1.5175, "nll_loss": 1.5068098306655884, "rewards/accuracies": 1.0, "rewards/chosen": 2.269449234008789, "rewards/margins": 10.665826797485352, "rewards/rejected": -8.396377563476562, "step": 1865 }, { "epoch": 0.311, "grad_norm": 172.97891235351562, "learning_rate": 1.6136744998455473e-07, "logits/chosen": 3.287510871887207, "logits/rejected": 3.3325083255767822, "logps/chosen": -65.2138442993164, "logps/rejected": -61.22344970703125, "loss": 1.759, "nll_loss": 1.0518361330032349, "rewards/accuracies": 1.0, "rewards/chosen": 0.8691078424453735, "rewards/margins": 0.4475959837436676, "rewards/rejected": 0.42151185870170593, "step": 1866 }, { "epoch": 0.31116666666666665, "grad_norm": 25.179332733154297, "learning_rate": 1.6132482123273105e-07, "logits/chosen": 2.454911231994629, "logits/rejected": 2.4459877014160156, "logps/chosen": -98.44291687011719, "logps/rejected": -393.71087646484375, "loss": 1.1211, "nll_loss": 1.1061002016067505, "rewards/accuracies": 1.0, "rewards/chosen": 2.284602403640747, "rewards/margins": 6.896919250488281, "rewards/rejected": -4.612317085266113, "step": 1867 }, { "epoch": 0.31133333333333335, "grad_norm": 30.329572677612305, "learning_rate": 1.612821746123267e-07, "logits/chosen": 1.250476360321045, "logits/rejected": 1.8216652870178223, "logps/chosen": -80.16329193115234, "logps/rejected": -154.83004760742188, "loss": 0.8623, "nll_loss": 0.8179927468299866, "rewards/accuracies": 1.0, "rewards/chosen": 0.8749680519104004, "rewards/margins": 5.910764694213867, "rewards/rejected": -5.035796642303467, "step": 1868 }, { "epoch": 0.3115, "grad_norm": 30.752836227416992, "learning_rate": 1.6123951013576792e-07, "logits/chosen": 3.0445408821105957, "logits/rejected": 3.081512212753296, "logps/chosen": -25.265193939208984, "logps/rejected": -128.9570770263672, "loss": 0.6695, "nll_loss": 0.6316298842430115, "rewards/accuracies": 1.0, "rewards/chosen": 1.490335464477539, "rewards/margins": 5.059828758239746, "rewards/rejected": -3.569493055343628, "step": 1869 }, { "epoch": 0.31166666666666665, "grad_norm": 38.69471740722656, "learning_rate": 1.6119682781548612e-07, "logits/chosen": 3.0997984409332275, "logits/rejected": 3.0799121856689453, "logps/chosen": -69.11552429199219, "logps/rejected": -139.80239868164062, "loss": 0.9946, "nll_loss": 0.9467880129814148, "rewards/accuracies": 1.0, "rewards/chosen": 1.073089599609375, "rewards/margins": 4.852083206176758, "rewards/rejected": -3.778993606567383, "step": 1870 }, { "epoch": 0.31183333333333335, "grad_norm": 32.155113220214844, "learning_rate": 1.6115412766391787e-07, "logits/chosen": 1.9764790534973145, "logits/rejected": 1.7226940393447876, "logps/chosen": -73.28903198242188, "logps/rejected": -56.47688293457031, "loss": 0.8309, "nll_loss": 0.755557119846344, "rewards/accuracies": 1.0, "rewards/chosen": 1.41215980052948, "rewards/margins": 3.771700382232666, "rewards/rejected": -2.3595407009124756, "step": 1871 }, { "epoch": 0.312, "grad_norm": 25.19619369506836, "learning_rate": 1.6111140969350502e-07, "logits/chosen": 1.8458847999572754, "logits/rejected": 2.4848995208740234, "logps/chosen": -64.85206604003906, "logps/rejected": -269.1087646484375, "loss": 0.7573, "nll_loss": 0.7369552254676819, "rewards/accuracies": 1.0, "rewards/chosen": 1.6561119556427002, "rewards/margins": 7.695522308349609, "rewards/rejected": -6.039410591125488, "step": 1872 }, { "epoch": 0.31216666666666665, "grad_norm": 33.37895965576172, "learning_rate": 1.610686739166945e-07, "logits/chosen": 1.8719561100006104, "logits/rejected": 1.9049222469329834, "logps/chosen": -49.4609260559082, "logps/rejected": -120.70970916748047, "loss": 0.9977, "nll_loss": 0.9698219895362854, "rewards/accuracies": 1.0, "rewards/chosen": 1.50949227809906, "rewards/margins": 6.053409576416016, "rewards/rejected": -4.543917179107666, "step": 1873 }, { "epoch": 0.31233333333333335, "grad_norm": 111.09326934814453, "learning_rate": 1.6102592034593853e-07, "logits/chosen": 2.344646453857422, "logits/rejected": 2.2551536560058594, "logps/chosen": -51.391807556152344, "logps/rejected": -71.75833892822266, "loss": 1.3956, "nll_loss": 0.6944838762283325, "rewards/accuracies": 1.0, "rewards/chosen": 1.539344072341919, "rewards/margins": 0.6544685363769531, "rewards/rejected": 0.8848755359649658, "step": 1874 }, { "epoch": 0.3125, "grad_norm": 24.12509536743164, "learning_rate": 1.6098314899369444e-07, "logits/chosen": 1.8640998601913452, "logits/rejected": 1.6243125200271606, "logps/chosen": -50.12477493286133, "logps/rejected": -85.8486328125, "loss": 0.6354, "nll_loss": 0.6039130091667175, "rewards/accuracies": 1.0, "rewards/chosen": 1.4487392902374268, "rewards/margins": 5.676785469055176, "rewards/rejected": -4.228046417236328, "step": 1875 }, { "epoch": 0.31266666666666665, "grad_norm": 138.74346923828125, "learning_rate": 1.6094035987242481e-07, "logits/chosen": 2.999305009841919, "logits/rejected": 2.9058499336242676, "logps/chosen": -71.98127746582031, "logps/rejected": -19.412038803100586, "loss": 1.932, "nll_loss": 1.0743473768234253, "rewards/accuracies": 1.0, "rewards/chosen": 1.7323777675628662, "rewards/margins": 0.38299357891082764, "rewards/rejected": 1.3493841886520386, "step": 1876 }, { "epoch": 0.31283333333333335, "grad_norm": 21.585657119750977, "learning_rate": 1.6089755299459737e-07, "logits/chosen": 2.986924648284912, "logits/rejected": 3.012521982192993, "logps/chosen": -144.68748474121094, "logps/rejected": -130.11329650878906, "loss": 0.8334, "nll_loss": 0.7949862480163574, "rewards/accuracies": 1.0, "rewards/chosen": 2.214617967605591, "rewards/margins": 5.011600494384766, "rewards/rejected": -2.796982526779175, "step": 1877 }, { "epoch": 0.313, "grad_norm": 48.00149154663086, "learning_rate": 1.60854728372685e-07, "logits/chosen": 2.619128465652466, "logits/rejected": 2.5302345752716064, "logps/chosen": -56.17314147949219, "logps/rejected": -54.10095977783203, "loss": 0.9355, "nll_loss": 0.7911710739135742, "rewards/accuracies": 1.0, "rewards/chosen": 0.9472999572753906, "rewards/margins": 2.7179722785949707, "rewards/rejected": -1.7706722021102905, "step": 1878 }, { "epoch": 0.31316666666666665, "grad_norm": 50.330772399902344, "learning_rate": 1.6081188601916574e-07, "logits/chosen": 2.360666275024414, "logits/rejected": 2.4103643894195557, "logps/chosen": -19.63507843017578, "logps/rejected": -149.30474853515625, "loss": 0.7611, "nll_loss": 0.7551952600479126, "rewards/accuracies": 1.0, "rewards/chosen": 3.4137070178985596, "rewards/margins": 8.517412185668945, "rewards/rejected": -5.103704929351807, "step": 1879 }, { "epoch": 0.31333333333333335, "grad_norm": 74.71092224121094, "learning_rate": 1.6076902594652288e-07, "logits/chosen": 2.533586263656616, "logits/rejected": 2.5670247077941895, "logps/chosen": -12.633038520812988, "logps/rejected": -51.49058532714844, "loss": 1.1007, "nll_loss": 0.9023599624633789, "rewards/accuracies": 1.0, "rewards/chosen": 1.1988781690597534, "rewards/margins": 2.3373963832855225, "rewards/rejected": -1.138518214225769, "step": 1880 }, { "epoch": 0.3135, "grad_norm": 41.579891204833984, "learning_rate": 1.6072614816724476e-07, "logits/chosen": 2.562955617904663, "logits/rejected": 2.191870927810669, "logps/chosen": -49.63238525390625, "logps/rejected": -46.99396514892578, "loss": 0.7834, "nll_loss": 0.5979805588722229, "rewards/accuracies": 1.0, "rewards/chosen": 1.4083967208862305, "rewards/margins": 2.5038952827453613, "rewards/rejected": -1.0954986810684204, "step": 1881 }, { "epoch": 0.31366666666666665, "grad_norm": 238.94163513183594, "learning_rate": 1.6068325269382496e-07, "logits/chosen": 2.6902413368225098, "logits/rejected": 2.6795754432678223, "logps/chosen": -76.98206329345703, "logps/rejected": -47.59154510498047, "loss": 3.9455, "nll_loss": 1.0264275074005127, "rewards/accuracies": 0.0, "rewards/chosen": 2.1147544384002686, "rewards/margins": -2.2763869762420654, "rewards/rejected": 4.391141414642334, "step": 1882 }, { "epoch": 0.31383333333333335, "grad_norm": 146.63548278808594, "learning_rate": 1.606403395387622e-07, "logits/chosen": 2.234344720840454, "logits/rejected": 2.3495538234710693, "logps/chosen": -188.057861328125, "logps/rejected": -154.89419555664062, "loss": 1.6074, "nll_loss": 1.168061375617981, "rewards/accuracies": 1.0, "rewards/chosen": -1.187017798423767, "rewards/margins": 1.2191346883773804, "rewards/rejected": -2.4061524868011475, "step": 1883 }, { "epoch": 0.314, "grad_norm": 27.4392147064209, "learning_rate": 1.6059740871456035e-07, "logits/chosen": 1.6101739406585693, "logits/rejected": 2.2470343112945557, "logps/chosen": -23.222900390625, "logps/rejected": -197.71063232421875, "loss": 0.528, "nll_loss": 0.516064465045929, "rewards/accuracies": 1.0, "rewards/chosen": 2.215811252593994, "rewards/margins": 8.666692733764648, "rewards/rejected": -6.4508819580078125, "step": 1884 }, { "epoch": 0.31416666666666665, "grad_norm": 56.49260330200195, "learning_rate": 1.6055446023372836e-07, "logits/chosen": 2.2000091075897217, "logits/rejected": 2.1323585510253906, "logps/chosen": -31.13862419128418, "logps/rejected": -39.89268112182617, "loss": 0.8642, "nll_loss": 0.7594786286354065, "rewards/accuracies": 1.0, "rewards/chosen": 0.4609350562095642, "rewards/margins": 3.3574604988098145, "rewards/rejected": -2.8965253829956055, "step": 1885 }, { "epoch": 0.31433333333333335, "grad_norm": 24.01310157775879, "learning_rate": 1.6051149410878046e-07, "logits/chosen": 1.7472901344299316, "logits/rejected": 2.6120243072509766, "logps/chosen": -77.16442108154297, "logps/rejected": -246.5426025390625, "loss": 0.858, "nll_loss": 0.8387436270713806, "rewards/accuracies": 1.0, "rewards/chosen": 1.661963701248169, "rewards/margins": 9.214219093322754, "rewards/rejected": -7.552255630493164, "step": 1886 }, { "epoch": 0.3145, "grad_norm": 23.65066146850586, "learning_rate": 1.6046851035223593e-07, "logits/chosen": 2.7575559616088867, "logits/rejected": 2.6966257095336914, "logps/chosen": -92.38449096679688, "logps/rejected": -123.60099029541016, "loss": 0.8902, "nll_loss": 0.879852294921875, "rewards/accuracies": 1.0, "rewards/chosen": 2.6601197719573975, "rewards/margins": 7.596478462219238, "rewards/rejected": -4.93635892868042, "step": 1887 }, { "epoch": 0.31466666666666665, "grad_norm": 32.543373107910156, "learning_rate": 1.6042550897661917e-07, "logits/chosen": 3.293870687484741, "logits/rejected": 3.209623098373413, "logps/chosen": -18.28643798828125, "logps/rejected": -152.3529815673828, "loss": 0.5383, "nll_loss": 0.4942280054092407, "rewards/accuracies": 1.0, "rewards/chosen": 1.9511359930038452, "rewards/margins": 4.718044281005859, "rewards/rejected": -2.7669084072113037, "step": 1888 }, { "epoch": 0.31483333333333335, "grad_norm": 77.66154479980469, "learning_rate": 1.6038248999445979e-07, "logits/chosen": 2.3409616947174072, "logits/rejected": 2.375347852706909, "logps/chosen": -66.3242416381836, "logps/rejected": -135.083251953125, "loss": 2.392, "nll_loss": 2.368722915649414, "rewards/accuracies": 1.0, "rewards/chosen": 1.6584550142288208, "rewards/margins": 6.493443489074707, "rewards/rejected": -4.834988594055176, "step": 1889 }, { "epoch": 0.315, "grad_norm": 23.195886611938477, "learning_rate": 1.6033945341829247e-07, "logits/chosen": 1.1077996492385864, "logits/rejected": 1.6192439794540405, "logps/chosen": -44.72195053100586, "logps/rejected": -131.93902587890625, "loss": 0.547, "nll_loss": 0.5261406302452087, "rewards/accuracies": 1.0, "rewards/chosen": 1.6179695129394531, "rewards/margins": 7.827117919921875, "rewards/rejected": -6.209148406982422, "step": 1890 }, { "epoch": 0.31516666666666665, "grad_norm": 27.66276741027832, "learning_rate": 1.60296399260657e-07, "logits/chosen": 2.147252082824707, "logits/rejected": 1.9128397703170776, "logps/chosen": -231.59947204589844, "logps/rejected": -107.75613403320312, "loss": 1.2453, "nll_loss": 1.1999971866607666, "rewards/accuracies": 1.0, "rewards/chosen": 1.4113311767578125, "rewards/margins": 4.68922758102417, "rewards/rejected": -3.2778964042663574, "step": 1891 }, { "epoch": 0.31533333333333335, "grad_norm": 34.66115188598633, "learning_rate": 1.602533275340984e-07, "logits/chosen": 4.157430648803711, "logits/rejected": 4.059243202209473, "logps/chosen": -96.40852355957031, "logps/rejected": -141.35989379882812, "loss": 0.9801, "nll_loss": 0.9545397758483887, "rewards/accuracies": 1.0, "rewards/chosen": 1.453354835510254, "rewards/margins": 6.92268705368042, "rewards/rejected": -5.469332218170166, "step": 1892 }, { "epoch": 0.3155, "grad_norm": 24.501325607299805, "learning_rate": 1.602102382511667e-07, "logits/chosen": 2.1646687984466553, "logits/rejected": 1.9063136577606201, "logps/chosen": -95.9789810180664, "logps/rejected": -103.94990539550781, "loss": 1.0355, "nll_loss": 1.0103049278259277, "rewards/accuracies": 1.0, "rewards/chosen": 2.543956995010376, "rewards/margins": 5.77044677734375, "rewards/rejected": -3.226489543914795, "step": 1893 }, { "epoch": 0.31566666666666665, "grad_norm": 23.234769821166992, "learning_rate": 1.6016713142441706e-07, "logits/chosen": 2.8682804107666016, "logits/rejected": 3.0153377056121826, "logps/chosen": -59.684776306152344, "logps/rejected": -206.9835968017578, "loss": 0.7361, "nll_loss": 0.7190936803817749, "rewards/accuracies": 1.0, "rewards/chosen": 2.0449395179748535, "rewards/margins": 6.86205530166626, "rewards/rejected": -4.817115783691406, "step": 1894 }, { "epoch": 0.31583333333333335, "grad_norm": 209.1532440185547, "learning_rate": 1.6012400706640983e-07, "logits/chosen": 2.645066022872925, "logits/rejected": 2.596830368041992, "logps/chosen": -118.65635681152344, "logps/rejected": -94.72713470458984, "loss": 1.9789, "nll_loss": 1.3959569931030273, "rewards/accuracies": 1.0, "rewards/chosen": -0.6228668093681335, "rewards/margins": 0.5790390372276306, "rewards/rejected": -1.2019058465957642, "step": 1895 }, { "epoch": 0.316, "grad_norm": 42.54344177246094, "learning_rate": 1.6008086518971037e-07, "logits/chosen": 2.6162731647491455, "logits/rejected": 2.8646724224090576, "logps/chosen": -11.154455184936523, "logps/rejected": -229.36004638671875, "loss": 0.5044, "nll_loss": 0.4849763512611389, "rewards/accuracies": 1.0, "rewards/chosen": 1.7447608709335327, "rewards/margins": 7.350856781005859, "rewards/rejected": -5.606095790863037, "step": 1896 }, { "epoch": 0.31616666666666665, "grad_norm": 29.61757469177246, "learning_rate": 1.6003770580688918e-07, "logits/chosen": 3.4381659030914307, "logits/rejected": 3.5268337726593018, "logps/chosen": -27.044557571411133, "logps/rejected": -100.66146087646484, "loss": 0.5703, "nll_loss": 0.5008251667022705, "rewards/accuracies": 1.0, "rewards/chosen": 1.8765567541122437, "rewards/margins": 4.030604839324951, "rewards/rejected": -2.154048204421997, "step": 1897 }, { "epoch": 0.31633333333333336, "grad_norm": 28.02381706237793, "learning_rate": 1.5999452893052187e-07, "logits/chosen": 2.1713695526123047, "logits/rejected": 2.4736645221710205, "logps/chosen": -24.792865753173828, "logps/rejected": -203.49200439453125, "loss": 0.4875, "nll_loss": 0.43496251106262207, "rewards/accuracies": 1.0, "rewards/chosen": 0.6181772351264954, "rewards/margins": 6.300387859344482, "rewards/rejected": -5.682210445404053, "step": 1898 }, { "epoch": 0.3165, "grad_norm": 125.16837310791016, "learning_rate": 1.599513345731892e-07, "logits/chosen": 2.5616259574890137, "logits/rejected": 2.7275261878967285, "logps/chosen": -33.592288970947266, "logps/rejected": -132.21621704101562, "loss": 2.3322, "nll_loss": 2.099518060684204, "rewards/accuracies": 1.0, "rewards/chosen": -0.20816802978515625, "rewards/margins": 2.0511155128479004, "rewards/rejected": -2.2592835426330566, "step": 1899 }, { "epoch": 0.31666666666666665, "grad_norm": 37.82748031616211, "learning_rate": 1.5990812274747693e-07, "logits/chosen": 2.230320930480957, "logits/rejected": 2.2013511657714844, "logps/chosen": -67.69283294677734, "logps/rejected": -35.47039031982422, "loss": 1.0496, "nll_loss": 0.9534201622009277, "rewards/accuracies": 1.0, "rewards/chosen": 1.4136666059494019, "rewards/margins": 3.409219264984131, "rewards/rejected": -1.995552659034729, "step": 1900 }, { "epoch": 0.31683333333333336, "grad_norm": 24.705982208251953, "learning_rate": 1.5986489346597592e-07, "logits/chosen": 0.7736250162124634, "logits/rejected": 2.0059831142425537, "logps/chosen": -8.391215324401855, "logps/rejected": -252.96702575683594, "loss": 0.2648, "nll_loss": 0.22678960859775543, "rewards/accuracies": 1.0, "rewards/chosen": 1.0103791952133179, "rewards/margins": 6.385585308074951, "rewards/rejected": -5.375205993652344, "step": 1901 }, { "epoch": 0.317, "grad_norm": 29.987598419189453, "learning_rate": 1.5982164674128218e-07, "logits/chosen": 1.538798451423645, "logits/rejected": 2.069190263748169, "logps/chosen": -58.20103454589844, "logps/rejected": -150.27801513671875, "loss": 0.7875, "nll_loss": 0.7367221713066101, "rewards/accuracies": 1.0, "rewards/chosen": 0.7921150326728821, "rewards/margins": 5.280994892120361, "rewards/rejected": -4.488879680633545, "step": 1902 }, { "epoch": 0.31716666666666665, "grad_norm": 30.770679473876953, "learning_rate": 1.5977838258599677e-07, "logits/chosen": 2.6675078868865967, "logits/rejected": 2.765523910522461, "logps/chosen": -41.32572555541992, "logps/rejected": -95.19491577148438, "loss": 0.5788, "nll_loss": 0.48618510365486145, "rewards/accuracies": 1.0, "rewards/chosen": 1.1524364948272705, "rewards/margins": 3.41684627532959, "rewards/rejected": -2.2644097805023193, "step": 1903 }, { "epoch": 0.31733333333333336, "grad_norm": 22.876020431518555, "learning_rate": 1.597351010127258e-07, "logits/chosen": 1.7356659173965454, "logits/rejected": 1.504587173461914, "logps/chosen": -48.21126174926758, "logps/rejected": -86.59043884277344, "loss": 0.6074, "nll_loss": 0.5808586478233337, "rewards/accuracies": 1.0, "rewards/chosen": 1.6400907039642334, "rewards/margins": 5.942317962646484, "rewards/rejected": -4.302227020263672, "step": 1904 }, { "epoch": 0.3175, "grad_norm": 51.6471061706543, "learning_rate": 1.596918020340805e-07, "logits/chosen": 2.8727452754974365, "logits/rejected": 3.1576359272003174, "logps/chosen": -61.896968841552734, "logps/rejected": -107.7219009399414, "loss": 0.9739, "nll_loss": 0.9378328919410706, "rewards/accuracies": 1.0, "rewards/chosen": 1.1975430250167847, "rewards/margins": 5.712717533111572, "rewards/rejected": -4.515174388885498, "step": 1905 }, { "epoch": 0.31766666666666665, "grad_norm": 46.25027084350586, "learning_rate": 1.5964848566267717e-07, "logits/chosen": 0.8200044631958008, "logits/rejected": 2.8958992958068848, "logps/chosen": -25.405784606933594, "logps/rejected": -498.875, "loss": 0.8836, "nll_loss": 0.8760614991188049, "rewards/accuracies": 1.0, "rewards/chosen": 2.8190455436706543, "rewards/margins": 8.57647705078125, "rewards/rejected": -5.757431507110596, "step": 1906 }, { "epoch": 0.31783333333333336, "grad_norm": 42.12919998168945, "learning_rate": 1.596051519111371e-07, "logits/chosen": 2.763822555541992, "logits/rejected": 2.708982467651367, "logps/chosen": -21.539352416992188, "logps/rejected": -52.093528747558594, "loss": 0.7126, "nll_loss": 0.6948177814483643, "rewards/accuracies": 1.0, "rewards/chosen": 2.1083195209503174, "rewards/margins": 6.568138122558594, "rewards/rejected": -4.4598188400268555, "step": 1907 }, { "epoch": 0.318, "grad_norm": 34.17176055908203, "learning_rate": 1.5956180079208682e-07, "logits/chosen": 1.8493694067001343, "logits/rejected": 1.7368484735488892, "logps/chosen": -31.559707641601562, "logps/rejected": -80.69627380371094, "loss": 0.6947, "nll_loss": 0.6574939489364624, "rewards/accuracies": 1.0, "rewards/chosen": 1.1696487665176392, "rewards/margins": 5.647077560424805, "rewards/rejected": -4.477428913116455, "step": 1908 }, { "epoch": 0.31816666666666665, "grad_norm": 25.527055740356445, "learning_rate": 1.5951843231815766e-07, "logits/chosen": 3.442997932434082, "logits/rejected": 3.7843151092529297, "logps/chosen": -103.21311950683594, "logps/rejected": -390.34619140625, "loss": 0.972, "nll_loss": 0.9556769132614136, "rewards/accuracies": 1.0, "rewards/chosen": 1.815991997718811, "rewards/margins": 10.571037292480469, "rewards/rejected": -8.755044937133789, "step": 1909 }, { "epoch": 0.31833333333333336, "grad_norm": 29.129825592041016, "learning_rate": 1.5947504650198627e-07, "logits/chosen": 2.5653207302093506, "logits/rejected": 2.7845420837402344, "logps/chosen": -71.5970230102539, "logps/rejected": -86.796630859375, "loss": 0.87, "nll_loss": 0.8423179388046265, "rewards/accuracies": 1.0, "rewards/chosen": 1.9211678504943848, "rewards/margins": 5.557513236999512, "rewards/rejected": -3.636345624923706, "step": 1910 }, { "epoch": 0.3185, "grad_norm": 29.251386642456055, "learning_rate": 1.5943164335621417e-07, "logits/chosen": 1.020487666130066, "logits/rejected": 1.714353322982788, "logps/chosen": -92.85897064208984, "logps/rejected": -214.81832885742188, "loss": 1.0408, "nll_loss": 0.9984836578369141, "rewards/accuracies": 1.0, "rewards/chosen": 0.912000298500061, "rewards/margins": 6.006595611572266, "rewards/rejected": -5.094595432281494, "step": 1911 }, { "epoch": 0.31866666666666665, "grad_norm": 23.310667037963867, "learning_rate": 1.59388222893488e-07, "logits/chosen": 1.8329511880874634, "logits/rejected": 2.761996030807495, "logps/chosen": -67.07681274414062, "logps/rejected": -233.1009063720703, "loss": 0.7141, "nll_loss": 0.6915135979652405, "rewards/accuracies": 1.0, "rewards/chosen": 1.644976019859314, "rewards/margins": 6.699540138244629, "rewards/rejected": -5.054563999176025, "step": 1912 }, { "epoch": 0.31883333333333336, "grad_norm": 25.246692657470703, "learning_rate": 1.5934478512645947e-07, "logits/chosen": 0.5520047545433044, "logits/rejected": 2.3382771015167236, "logps/chosen": -11.308305740356445, "logps/rejected": -319.3946838378906, "loss": 0.3928, "nll_loss": 0.35338446497917175, "rewards/accuracies": 1.0, "rewards/chosen": 0.9422027468681335, "rewards/margins": 6.55143404006958, "rewards/rejected": -5.609231472015381, "step": 1913 }, { "epoch": 0.319, "grad_norm": 195.2477569580078, "learning_rate": 1.5930133006778528e-07, "logits/chosen": 2.2025773525238037, "logits/rejected": 2.047480344772339, "logps/chosen": -44.842018127441406, "logps/rejected": -69.59776306152344, "loss": 2.0836, "nll_loss": 0.5676204562187195, "rewards/accuracies": 0.0, "rewards/chosen": 0.41319888830184937, "rewards/margins": -0.9591575264930725, "rewards/rejected": 1.3723564147949219, "step": 1914 }, { "epoch": 0.31916666666666665, "grad_norm": 39.06485366821289, "learning_rate": 1.592578577301272e-07, "logits/chosen": 1.4576119184494019, "logits/rejected": 2.0087597370147705, "logps/chosen": -19.92464256286621, "logps/rejected": -112.40079498291016, "loss": 0.5663, "nll_loss": 0.4859667420387268, "rewards/accuracies": 1.0, "rewards/chosen": 0.6261720657348633, "rewards/margins": 3.8609418869018555, "rewards/rejected": -3.234769821166992, "step": 1915 }, { "epoch": 0.31933333333333336, "grad_norm": 30.556447982788086, "learning_rate": 1.5921436812615203e-07, "logits/chosen": 2.346665143966675, "logits/rejected": 2.5652058124542236, "logps/chosen": -20.20926284790039, "logps/rejected": -327.93560791015625, "loss": 0.5981, "nll_loss": 0.5774074196815491, "rewards/accuracies": 1.0, "rewards/chosen": 1.5588047504425049, "rewards/margins": 14.173802375793457, "rewards/rejected": -12.614997863769531, "step": 1916 }, { "epoch": 0.3195, "grad_norm": 29.718168258666992, "learning_rate": 1.591708612685316e-07, "logits/chosen": 3.0071475505828857, "logits/rejected": 2.943052291870117, "logps/chosen": -16.837766647338867, "logps/rejected": -39.15082931518555, "loss": 0.4274, "nll_loss": 0.3741725981235504, "rewards/accuracies": 1.0, "rewards/chosen": 1.5954363346099854, "rewards/margins": 4.355971336364746, "rewards/rejected": -2.76053524017334, "step": 1917 }, { "epoch": 0.31966666666666665, "grad_norm": 39.139827728271484, "learning_rate": 1.5912733716994273e-07, "logits/chosen": 2.5004289150238037, "logits/rejected": 2.7491703033447266, "logps/chosen": -30.960102081298828, "logps/rejected": -311.57763671875, "loss": 0.6691, "nll_loss": 0.6587255597114563, "rewards/accuracies": 1.0, "rewards/chosen": 2.3788089752197266, "rewards/margins": 8.486576080322266, "rewards/rejected": -6.107766628265381, "step": 1918 }, { "epoch": 0.31983333333333336, "grad_norm": 60.75238037109375, "learning_rate": 1.590837958430673e-07, "logits/chosen": 1.6804966926574707, "logits/rejected": 2.2341525554656982, "logps/chosen": -21.266273498535156, "logps/rejected": -286.97509765625, "loss": 0.9761, "nll_loss": 0.9666486978530884, "rewards/accuracies": 1.0, "rewards/chosen": 2.571803092956543, "rewards/margins": 8.126627922058105, "rewards/rejected": -5.5548248291015625, "step": 1919 }, { "epoch": 0.32, "grad_norm": 29.022632598876953, "learning_rate": 1.5904023730059226e-07, "logits/chosen": 1.8739495277404785, "logits/rejected": 2.049379348754883, "logps/chosen": -23.837568283081055, "logps/rejected": -55.58647918701172, "loss": 0.5122, "nll_loss": 0.39729276299476624, "rewards/accuracies": 1.0, "rewards/chosen": 1.517874002456665, "rewards/margins": 3.198373794555664, "rewards/rejected": -1.680499792098999, "step": 1920 }, { "epoch": 0.32016666666666665, "grad_norm": 221.6335906982422, "learning_rate": 1.589966615552095e-07, "logits/chosen": 2.187295913696289, "logits/rejected": 2.13480806350708, "logps/chosen": -47.475318908691406, "logps/rejected": -50.7965202331543, "loss": 3.824, "nll_loss": 0.8185399770736694, "rewards/accuracies": 0.0, "rewards/chosen": 1.838930606842041, "rewards/margins": -2.421261787414551, "rewards/rejected": 4.260192394256592, "step": 1921 }, { "epoch": 0.32033333333333336, "grad_norm": 31.748544692993164, "learning_rate": 1.5895306861961592e-07, "logits/chosen": 2.5000553131103516, "logits/rejected": 2.3341596126556396, "logps/chosen": -66.35814666748047, "logps/rejected": -28.098201751708984, "loss": 0.845, "nll_loss": 0.7373127341270447, "rewards/accuracies": 1.0, "rewards/chosen": 2.51430606842041, "rewards/margins": 3.841744899749756, "rewards/rejected": -1.3274388313293457, "step": 1922 }, { "epoch": 0.3205, "grad_norm": 32.389686584472656, "learning_rate": 1.5890945850651345e-07, "logits/chosen": 2.852630615234375, "logits/rejected": 3.0383756160736084, "logps/chosen": -57.145774841308594, "logps/rejected": -243.4866485595703, "loss": 0.7893, "nll_loss": 0.742152988910675, "rewards/accuracies": 1.0, "rewards/chosen": 0.8390785455703735, "rewards/margins": 5.527840614318848, "rewards/rejected": -4.688762187957764, "step": 1923 }, { "epoch": 0.32066666666666666, "grad_norm": 38.875030517578125, "learning_rate": 1.588658312286091e-07, "logits/chosen": 2.112506151199341, "logits/rejected": 2.6741411685943604, "logps/chosen": -14.723938941955566, "logps/rejected": -248.88931274414062, "loss": 0.5299, "nll_loss": 0.46012309193611145, "rewards/accuracies": 1.0, "rewards/chosen": 0.23715773224830627, "rewards/margins": 7.010528564453125, "rewards/rejected": -6.773370742797852, "step": 1924 }, { "epoch": 0.32083333333333336, "grad_norm": 33.45489501953125, "learning_rate": 1.5882218679861472e-07, "logits/chosen": 2.698530435562134, "logits/rejected": 2.882520914077759, "logps/chosen": -56.86501693725586, "logps/rejected": -234.53805541992188, "loss": 0.9375, "nll_loss": 0.9026192426681519, "rewards/accuracies": 1.0, "rewards/chosen": 1.2346134185791016, "rewards/margins": 5.734067440032959, "rewards/rejected": -4.499454021453857, "step": 1925 }, { "epoch": 0.321, "grad_norm": 38.18328094482422, "learning_rate": 1.5877852522924732e-07, "logits/chosen": 2.191927671432495, "logits/rejected": 2.3190581798553467, "logps/chosen": -63.034324645996094, "logps/rejected": -129.42828369140625, "loss": 1.1782, "nll_loss": 1.1673022508621216, "rewards/accuracies": 1.0, "rewards/chosen": 2.4334914684295654, "rewards/margins": 7.848546981811523, "rewards/rejected": -5.415055751800537, "step": 1926 }, { "epoch": 0.32116666666666666, "grad_norm": 233.4822235107422, "learning_rate": 1.5873484653322878e-07, "logits/chosen": 1.5092257261276245, "logits/rejected": 1.4857102632522583, "logps/chosen": -49.24977493286133, "logps/rejected": -23.11553192138672, "loss": 3.5363, "nll_loss": 0.7035682201385498, "rewards/accuracies": 0.0, "rewards/chosen": 1.131608247756958, "rewards/margins": -2.3697237968444824, "rewards/rejected": 3.5013320446014404, "step": 1927 }, { "epoch": 0.32133333333333336, "grad_norm": 69.37641906738281, "learning_rate": 1.5869115072328608e-07, "logits/chosen": 2.327634334564209, "logits/rejected": 2.353105068206787, "logps/chosen": -9.498937606811523, "logps/rejected": -44.908851623535156, "loss": 0.593, "nll_loss": 0.5277187824249268, "rewards/accuracies": 1.0, "rewards/chosen": 1.71407151222229, "rewards/margins": 4.063577651977539, "rewards/rejected": -2.349506139755249, "step": 1928 }, { "epoch": 0.3215, "grad_norm": 20.912776947021484, "learning_rate": 1.5864743781215107e-07, "logits/chosen": 2.3275704383850098, "logits/rejected": 2.7361762523651123, "logps/chosen": -105.8605728149414, "logps/rejected": -500.36669921875, "loss": 0.893, "nll_loss": 0.8821713924407959, "rewards/accuracies": 1.0, "rewards/chosen": 2.247119903564453, "rewards/margins": 10.286906242370605, "rewards/rejected": -8.039786338806152, "step": 1929 }, { "epoch": 0.32166666666666666, "grad_norm": 27.44679069519043, "learning_rate": 1.586037078125607e-07, "logits/chosen": 2.5149309635162354, "logits/rejected": 2.4920780658721924, "logps/chosen": -120.22931671142578, "logps/rejected": -57.50494384765625, "loss": 0.9664, "nll_loss": 0.9466874599456787, "rewards/accuracies": 1.0, "rewards/chosen": 2.1654412746429443, "rewards/margins": 6.2197065353393555, "rewards/rejected": -4.054265022277832, "step": 1930 }, { "epoch": 0.32183333333333336, "grad_norm": 102.13038635253906, "learning_rate": 1.585599607372568e-07, "logits/chosen": 2.191894054412842, "logits/rejected": 2.525421380996704, "logps/chosen": -91.11026000976562, "logps/rejected": -158.78720092773438, "loss": 1.1194, "nll_loss": 1.0594216585159302, "rewards/accuracies": 1.0, "rewards/chosen": 0.8543792963027954, "rewards/margins": 4.425477504730225, "rewards/rejected": -3.5710983276367188, "step": 1931 }, { "epoch": 0.322, "grad_norm": 48.69500732421875, "learning_rate": 1.5851619659898624e-07, "logits/chosen": 3.046652317047119, "logits/rejected": 3.0902411937713623, "logps/chosen": -48.43949508666992, "logps/rejected": -60.87549591064453, "loss": 1.0015, "nll_loss": 0.9497942328453064, "rewards/accuracies": 1.0, "rewards/chosen": 2.4444034099578857, "rewards/margins": 4.728890419006348, "rewards/rejected": -2.284487247467041, "step": 1932 }, { "epoch": 0.32216666666666666, "grad_norm": 23.45412254333496, "learning_rate": 1.584724154105008e-07, "logits/chosen": 1.3373092412948608, "logits/rejected": 1.8431131839752197, "logps/chosen": -213.14096069335938, "logps/rejected": -295.2760009765625, "loss": 0.99, "nll_loss": 0.9600943922996521, "rewards/accuracies": 1.0, "rewards/chosen": 1.4002563953399658, "rewards/margins": 6.0004987716674805, "rewards/rejected": -4.600242614746094, "step": 1933 }, { "epoch": 0.32233333333333336, "grad_norm": 18.64017105102539, "learning_rate": 1.5842861718455734e-07, "logits/chosen": 2.406491994857788, "logits/rejected": 2.435518741607666, "logps/chosen": -69.31861877441406, "logps/rejected": -119.56114959716797, "loss": 0.5624, "nll_loss": 0.5415517091751099, "rewards/accuracies": 1.0, "rewards/chosen": 2.43818736076355, "rewards/margins": 6.056884765625, "rewards/rejected": -3.6186976432800293, "step": 1934 }, { "epoch": 0.3225, "grad_norm": 22.235687255859375, "learning_rate": 1.5838480193391753e-07, "logits/chosen": 1.986484408378601, "logits/rejected": 2.3222861289978027, "logps/chosen": -129.15045166015625, "logps/rejected": -267.26348876953125, "loss": 0.9806, "nll_loss": 0.9710560441017151, "rewards/accuracies": 1.0, "rewards/chosen": 2.6668243408203125, "rewards/margins": 7.866174697875977, "rewards/rejected": -5.199350357055664, "step": 1935 }, { "epoch": 0.32266666666666666, "grad_norm": 23.990562438964844, "learning_rate": 1.5834096967134814e-07, "logits/chosen": 2.647587537765503, "logits/rejected": 2.599154472351074, "logps/chosen": -82.82540893554688, "logps/rejected": -172.9840087890625, "loss": 0.848, "nll_loss": 0.828253984451294, "rewards/accuracies": 1.0, "rewards/chosen": 1.6792892217636108, "rewards/margins": 7.695457458496094, "rewards/rejected": -6.016168117523193, "step": 1936 }, { "epoch": 0.32283333333333336, "grad_norm": 50.06648635864258, "learning_rate": 1.5829712040962084e-07, "logits/chosen": 2.6733641624450684, "logits/rejected": 2.8124818801879883, "logps/chosen": -57.72456741333008, "logps/rejected": -200.53538513183594, "loss": 1.5267, "nll_loss": 1.519067645072937, "rewards/accuracies": 1.0, "rewards/chosen": 2.5940558910369873, "rewards/margins": 11.164509773254395, "rewards/rejected": -8.570453643798828, "step": 1937 }, { "epoch": 0.323, "grad_norm": 211.01629638671875, "learning_rate": 1.5825325416151222e-07, "logits/chosen": 2.655428886413574, "logits/rejected": 2.681596279144287, "logps/chosen": -48.558353424072266, "logps/rejected": -38.85686492919922, "loss": 3.3183, "nll_loss": 0.5850405097007751, "rewards/accuracies": 0.0, "rewards/chosen": 0.9246296286582947, "rewards/margins": -2.2986700534820557, "rewards/rejected": 3.223299741744995, "step": 1938 }, { "epoch": 0.32316666666666666, "grad_norm": 145.795654296875, "learning_rate": 1.5820937093980384e-07, "logits/chosen": 2.7567174434661865, "logits/rejected": 2.5425729751586914, "logps/chosen": -191.62843322753906, "logps/rejected": -177.64224243164062, "loss": 2.6857, "nll_loss": 1.975550889968872, "rewards/accuracies": 1.0, "rewards/chosen": -3.4383699893951416, "rewards/margins": 1.6537764072418213, "rewards/rejected": -5.092146396636963, "step": 1939 }, { "epoch": 0.3233333333333333, "grad_norm": 27.183124542236328, "learning_rate": 1.5816547075728226e-07, "logits/chosen": 2.8064897060394287, "logits/rejected": 3.129659652709961, "logps/chosen": -42.399131774902344, "logps/rejected": -128.15924072265625, "loss": 0.6183, "nll_loss": 0.5808099508285522, "rewards/accuracies": 1.0, "rewards/chosen": 1.2343124151229858, "rewards/margins": 5.404689311981201, "rewards/rejected": -4.170376777648926, "step": 1940 }, { "epoch": 0.3235, "grad_norm": 25.52777862548828, "learning_rate": 1.5812155362673893e-07, "logits/chosen": 1.9737786054611206, "logits/rejected": 2.313871383666992, "logps/chosen": -77.93922424316406, "logps/rejected": -309.35015869140625, "loss": 0.8508, "nll_loss": 0.8291406035423279, "rewards/accuracies": 1.0, "rewards/chosen": 1.7124168872833252, "rewards/margins": 6.637957572937012, "rewards/rejected": -4.925540447235107, "step": 1941 }, { "epoch": 0.32366666666666666, "grad_norm": 43.40453338623047, "learning_rate": 1.5807761956097024e-07, "logits/chosen": 2.5838160514831543, "logits/rejected": 2.3530776500701904, "logps/chosen": -7.733247756958008, "logps/rejected": -33.36204528808594, "loss": 0.4331, "nll_loss": 0.36824992299079895, "rewards/accuracies": 1.0, "rewards/chosen": 1.1438184976577759, "rewards/margins": 4.036129474639893, "rewards/rejected": -2.8923110961914062, "step": 1942 }, { "epoch": 0.3238333333333333, "grad_norm": 47.047698974609375, "learning_rate": 1.5803366857277748e-07, "logits/chosen": 3.2810730934143066, "logits/rejected": 3.3045694828033447, "logps/chosen": -20.897079467773438, "logps/rejected": -54.08373260498047, "loss": 0.5608, "nll_loss": 0.3869829475879669, "rewards/accuracies": 1.0, "rewards/chosen": 0.6809150576591492, "rewards/margins": 2.414677858352661, "rewards/rejected": -1.7337627410888672, "step": 1943 }, { "epoch": 0.324, "grad_norm": 23.078632354736328, "learning_rate": 1.57989700674967e-07, "logits/chosen": 2.8910248279571533, "logits/rejected": 3.0601704120635986, "logps/chosen": -85.58663940429688, "logps/rejected": -267.8878173828125, "loss": 0.8334, "nll_loss": 0.8074212074279785, "rewards/accuracies": 1.0, "rewards/chosen": 1.3160820007324219, "rewards/margins": 10.640694618225098, "rewards/rejected": -9.324612617492676, "step": 1944 }, { "epoch": 0.32416666666666666, "grad_norm": 29.910249710083008, "learning_rate": 1.579457158803499e-07, "logits/chosen": 2.9467780590057373, "logits/rejected": 2.8705384731292725, "logps/chosen": -21.472028732299805, "logps/rejected": -89.081787109375, "loss": 0.5181, "nll_loss": 0.48800066113471985, "rewards/accuracies": 1.0, "rewards/chosen": 1.5468050241470337, "rewards/margins": 5.631772041320801, "rewards/rejected": -4.084967136383057, "step": 1945 }, { "epoch": 0.3243333333333333, "grad_norm": 29.708696365356445, "learning_rate": 1.579017142017424e-07, "logits/chosen": 1.4427016973495483, "logits/rejected": 2.1038057804107666, "logps/chosen": -57.13746643066406, "logps/rejected": -151.24569702148438, "loss": 0.7686, "nll_loss": 0.7232590913772583, "rewards/accuracies": 1.0, "rewards/chosen": 0.8984718322753906, "rewards/margins": 5.484119415283203, "rewards/rejected": -4.5856475830078125, "step": 1946 }, { "epoch": 0.3245, "grad_norm": 65.18689727783203, "learning_rate": 1.578576956519654e-07, "logits/chosen": 3.0756564140319824, "logits/rejected": 3.032330274581909, "logps/chosen": -6.311270713806152, "logps/rejected": -48.69601821899414, "loss": 0.4093, "nll_loss": 0.30053675174713135, "rewards/accuracies": 1.0, "rewards/chosen": 1.7445392608642578, "rewards/margins": 3.372652053833008, "rewards/rejected": -1.62811279296875, "step": 1947 }, { "epoch": 0.32466666666666666, "grad_norm": 48.45275115966797, "learning_rate": 1.5781366024384495e-07, "logits/chosen": 2.802086114883423, "logits/rejected": 2.835750102996826, "logps/chosen": -37.501041412353516, "logps/rejected": -142.5272979736328, "loss": 1.0163, "nll_loss": 0.9868696331977844, "rewards/accuracies": 1.0, "rewards/chosen": 1.602063775062561, "rewards/margins": 5.623150825500488, "rewards/rejected": -4.021087169647217, "step": 1948 }, { "epoch": 0.3248333333333333, "grad_norm": 297.8153076171875, "learning_rate": 1.5776960799021187e-07, "logits/chosen": 2.5476677417755127, "logits/rejected": 2.4419021606445312, "logps/chosen": -155.38905334472656, "logps/rejected": -111.90162658691406, "loss": 2.4116, "nll_loss": 1.2842072248458862, "rewards/accuracies": 0.0, "rewards/chosen": -3.1186647415161133, "rewards/margins": -0.11466693878173828, "rewards/rejected": -3.003997802734375, "step": 1949 }, { "epoch": 0.325, "grad_norm": 56.8993034362793, "learning_rate": 1.5772553890390196e-07, "logits/chosen": 2.5357449054718018, "logits/rejected": 2.5638763904571533, "logps/chosen": -14.597404479980469, "logps/rejected": -119.26185607910156, "loss": 0.6498, "nll_loss": 0.5838961601257324, "rewards/accuracies": 1.0, "rewards/chosen": 2.540597438812256, "rewards/margins": 4.486447334289551, "rewards/rejected": -1.9458496570587158, "step": 1950 }, { "epoch": 0.32516666666666666, "grad_norm": 62.22491455078125, "learning_rate": 1.5768145299775584e-07, "logits/chosen": 1.2765060663223267, "logits/rejected": 2.1996123790740967, "logps/chosen": -12.951019287109375, "logps/rejected": -130.416015625, "loss": 0.8496, "nll_loss": 0.8094387054443359, "rewards/accuracies": 1.0, "rewards/chosen": 1.6857526302337646, "rewards/margins": 4.851835250854492, "rewards/rejected": -3.1660828590393066, "step": 1951 }, { "epoch": 0.3253333333333333, "grad_norm": 23.128854751586914, "learning_rate": 1.5763735028461913e-07, "logits/chosen": 0.7792144417762756, "logits/rejected": 1.4014148712158203, "logps/chosen": -23.59821128845215, "logps/rejected": -173.29193115234375, "loss": 0.4205, "nll_loss": 0.39330342411994934, "rewards/accuracies": 1.0, "rewards/chosen": 1.3071924448013306, "rewards/margins": 7.590311050415039, "rewards/rejected": -6.283118724822998, "step": 1952 }, { "epoch": 0.3255, "grad_norm": 41.93413162231445, "learning_rate": 1.575932307773423e-07, "logits/chosen": 2.0086984634399414, "logits/rejected": 2.542909622192383, "logps/chosen": -28.867660522460938, "logps/rejected": -207.77378845214844, "loss": 1.0999, "nll_loss": 1.0691726207733154, "rewards/accuracies": 1.0, "rewards/chosen": 3.2166976928710938, "rewards/margins": 5.917424201965332, "rewards/rejected": -2.7007265090942383, "step": 1953 }, { "epoch": 0.32566666666666666, "grad_norm": 33.192474365234375, "learning_rate": 1.5754909448878065e-07, "logits/chosen": 3.0959854125976562, "logits/rejected": 3.1639106273651123, "logps/chosen": -119.57181549072266, "logps/rejected": -145.875732421875, "loss": 1.0571, "nll_loss": 1.0307915210723877, "rewards/accuracies": 1.0, "rewards/chosen": 1.6946296691894531, "rewards/margins": 5.85152530670166, "rewards/rejected": -4.156895637512207, "step": 1954 }, { "epoch": 0.3258333333333333, "grad_norm": 19.29073143005371, "learning_rate": 1.5750494143179453e-07, "logits/chosen": 2.397819757461548, "logits/rejected": 2.563209295272827, "logps/chosen": -180.20286560058594, "logps/rejected": -218.93887329101562, "loss": 0.9932, "nll_loss": 0.9901256561279297, "rewards/accuracies": 1.0, "rewards/chosen": 3.722133159637451, "rewards/margins": 10.224013328552246, "rewards/rejected": -6.501880168914795, "step": 1955 }, { "epoch": 0.326, "grad_norm": 24.604597091674805, "learning_rate": 1.5746077161924903e-07, "logits/chosen": 3.119086742401123, "logits/rejected": 3.2099881172180176, "logps/chosen": -74.75018310546875, "logps/rejected": -280.11761474609375, "loss": 0.8013, "nll_loss": 0.7786478400230408, "rewards/accuracies": 1.0, "rewards/chosen": 1.8885536193847656, "rewards/margins": 6.068848609924316, "rewards/rejected": -4.180294990539551, "step": 1956 }, { "epoch": 0.32616666666666666, "grad_norm": 28.731300354003906, "learning_rate": 1.5741658506401421e-07, "logits/chosen": 0.07173822820186615, "logits/rejected": 2.208347797393799, "logps/chosen": -86.1394271850586, "logps/rejected": -460.69537353515625, "loss": 0.865, "nll_loss": 0.8282636404037476, "rewards/accuracies": 1.0, "rewards/chosen": 0.9346199035644531, "rewards/margins": 11.179206848144531, "rewards/rejected": -10.244586944580078, "step": 1957 }, { "epoch": 0.3263333333333333, "grad_norm": 29.65165901184082, "learning_rate": 1.573723817789649e-07, "logits/chosen": 1.3113094568252563, "logits/rejected": 1.1935575008392334, "logps/chosen": -120.17440795898438, "logps/rejected": -77.89292907714844, "loss": 1.2521, "nll_loss": 1.2138828039169312, "rewards/accuracies": 1.0, "rewards/chosen": 1.3667116165161133, "rewards/margins": 5.12355899810791, "rewards/rejected": -3.7568471431732178, "step": 1958 }, { "epoch": 0.3265, "grad_norm": 62.45112991333008, "learning_rate": 1.5732816177698096e-07, "logits/chosen": 2.279245615005493, "logits/rejected": 2.78849720954895, "logps/chosen": -22.453914642333984, "logps/rejected": -295.56884765625, "loss": 0.946, "nll_loss": 0.9355795979499817, "rewards/accuracies": 1.0, "rewards/chosen": 2.458991289138794, "rewards/margins": 7.952732086181641, "rewards/rejected": -5.493741035461426, "step": 1959 }, { "epoch": 0.32666666666666666, "grad_norm": 26.007644653320312, "learning_rate": 1.5728392507094696e-07, "logits/chosen": 2.366978406906128, "logits/rejected": 2.401752471923828, "logps/chosen": -100.48112487792969, "logps/rejected": -126.40194702148438, "loss": 0.9001, "nll_loss": 0.8814132809638977, "rewards/accuracies": 1.0, "rewards/chosen": 1.7194808721542358, "rewards/margins": 7.860449314117432, "rewards/rejected": -6.140968322753906, "step": 1960 }, { "epoch": 0.3268333333333333, "grad_norm": 24.0595760345459, "learning_rate": 1.5723967167375247e-07, "logits/chosen": 2.5689096450805664, "logits/rejected": 2.553351879119873, "logps/chosen": -103.99459075927734, "logps/rejected": -203.56517028808594, "loss": 0.9848, "nll_loss": 0.9810811877250671, "rewards/accuracies": 1.0, "rewards/chosen": 3.390923500061035, "rewards/margins": 10.608182907104492, "rewards/rejected": -7.217259407043457, "step": 1961 }, { "epoch": 0.327, "grad_norm": 37.602455139160156, "learning_rate": 1.5719540159829182e-07, "logits/chosen": 2.994357109069824, "logits/rejected": 3.0671489238739014, "logps/chosen": -72.06008911132812, "logps/rejected": -206.25857543945312, "loss": 0.9692, "nll_loss": 0.9358451962471008, "rewards/accuracies": 1.0, "rewards/chosen": 1.0833114385604858, "rewards/margins": 7.2082343101501465, "rewards/rejected": -6.124922752380371, "step": 1962 }, { "epoch": 0.32716666666666666, "grad_norm": 27.625938415527344, "learning_rate": 1.5715111485746434e-07, "logits/chosen": 3.2553770542144775, "logits/rejected": 3.467773675918579, "logps/chosen": -50.99234390258789, "logps/rejected": -128.20751953125, "loss": 0.8044, "nll_loss": 0.7844976186752319, "rewards/accuracies": 1.0, "rewards/chosen": 1.7381633520126343, "rewards/margins": 7.0213799476623535, "rewards/rejected": -5.28321647644043, "step": 1963 }, { "epoch": 0.3273333333333333, "grad_norm": 35.272762298583984, "learning_rate": 1.5710681146417398e-07, "logits/chosen": 1.092036247253418, "logits/rejected": 1.6656635999679565, "logps/chosen": -88.05601501464844, "logps/rejected": -256.2049560546875, "loss": 1.0642, "nll_loss": 1.0359530448913574, "rewards/accuracies": 1.0, "rewards/chosen": 1.2809007167816162, "rewards/margins": 7.134233474731445, "rewards/rejected": -5.853332996368408, "step": 1964 }, { "epoch": 0.3275, "grad_norm": 69.68830871582031, "learning_rate": 1.570624914313298e-07, "logits/chosen": 1.4904292821884155, "logits/rejected": 1.597607970237732, "logps/chosen": -15.327831268310547, "logps/rejected": -26.762916564941406, "loss": 0.9493, "nll_loss": 0.2947659194469452, "rewards/accuracies": 1.0, "rewards/chosen": 2.479166030883789, "rewards/margins": 1.148060917854309, "rewards/rejected": 1.33110511302948, "step": 1965 }, { "epoch": 0.32766666666666666, "grad_norm": 109.76797485351562, "learning_rate": 1.5701815477184558e-07, "logits/chosen": 1.5831080675125122, "logits/rejected": 1.8676434755325317, "logps/chosen": -36.824546813964844, "logps/rejected": -92.81459045410156, "loss": 2.562, "nll_loss": 2.3015341758728027, "rewards/accuracies": 1.0, "rewards/chosen": -0.23198890686035156, "rewards/margins": 1.8466136455535889, "rewards/rejected": -2.0786025524139404, "step": 1966 }, { "epoch": 0.3278333333333333, "grad_norm": 62.90400314331055, "learning_rate": 1.5697380149863988e-07, "logits/chosen": 2.529290199279785, "logits/rejected": 2.6814372539520264, "logps/chosen": -110.80196380615234, "logps/rejected": -162.59109497070312, "loss": 1.0309, "nll_loss": 0.9082127809524536, "rewards/accuracies": 1.0, "rewards/chosen": -0.49301835894584656, "rewards/margins": 6.711113452911377, "rewards/rejected": -7.204131603240967, "step": 1967 }, { "epoch": 0.328, "grad_norm": 113.7312240600586, "learning_rate": 1.5692943162463626e-07, "logits/chosen": 2.656914472579956, "logits/rejected": 2.473504066467285, "logps/chosen": -133.2545623779297, "logps/rejected": -13.029422760009766, "loss": 1.9978, "nll_loss": 0.9870707392692566, "rewards/accuracies": 1.0, "rewards/chosen": 2.7846620082855225, "rewards/margins": 0.4518585205078125, "rewards/rejected": 2.33280348777771, "step": 1968 }, { "epoch": 0.32816666666666666, "grad_norm": 152.16709899902344, "learning_rate": 1.5688504516276301e-07, "logits/chosen": 2.880647659301758, "logits/rejected": 2.6142587661743164, "logps/chosen": -113.26473236083984, "logps/rejected": -36.21576690673828, "loss": 1.3769, "nll_loss": 0.8580661416053772, "rewards/accuracies": 1.0, "rewards/chosen": -1.3420495986938477, "rewards/margins": 0.9326205253601074, "rewards/rejected": -2.274670124053955, "step": 1969 }, { "epoch": 0.3283333333333333, "grad_norm": 24.32561683654785, "learning_rate": 1.568406421259533e-07, "logits/chosen": 2.5078821182250977, "logits/rejected": 2.605588912963867, "logps/chosen": -143.7252655029297, "logps/rejected": -270.81915283203125, "loss": 1.0483, "nll_loss": 1.026608943939209, "rewards/accuracies": 1.0, "rewards/chosen": 1.6648637056350708, "rewards/margins": 6.78446102142334, "rewards/rejected": -5.119597434997559, "step": 1970 }, { "epoch": 0.3285, "grad_norm": 32.678497314453125, "learning_rate": 1.5679622252714506e-07, "logits/chosen": 1.4921660423278809, "logits/rejected": 2.1010830402374268, "logps/chosen": -60.34511184692383, "logps/rejected": -503.13250732421875, "loss": 0.9752, "nll_loss": 0.9428923726081848, "rewards/accuracies": 1.0, "rewards/chosen": 1.1171826124191284, "rewards/margins": 7.213377952575684, "rewards/rejected": -6.096195220947266, "step": 1971 }, { "epoch": 0.32866666666666666, "grad_norm": 36.622467041015625, "learning_rate": 1.5675178637928112e-07, "logits/chosen": 2.3910465240478516, "logits/rejected": 2.715940237045288, "logps/chosen": -70.0333480834961, "logps/rejected": -139.14564514160156, "loss": 1.1458, "nll_loss": 1.094271183013916, "rewards/accuracies": 1.0, "rewards/chosen": 1.832674503326416, "rewards/margins": 4.454958915710449, "rewards/rejected": -2.6222846508026123, "step": 1972 }, { "epoch": 0.3288333333333333, "grad_norm": 41.892887115478516, "learning_rate": 1.5670733369530914e-07, "logits/chosen": 2.6508448123931885, "logits/rejected": 2.52756667137146, "logps/chosen": -74.52617645263672, "logps/rejected": -30.497303009033203, "loss": 0.8153, "nll_loss": 0.6775106191635132, "rewards/accuracies": 1.0, "rewards/chosen": 1.429421305656433, "rewards/margins": 2.923647403717041, "rewards/rejected": -1.4942259788513184, "step": 1973 }, { "epoch": 0.329, "grad_norm": 47.49744415283203, "learning_rate": 1.566628644881815e-07, "logits/chosen": 3.4775238037109375, "logits/rejected": 3.5040087699890137, "logps/chosen": -107.2374038696289, "logps/rejected": -122.4320068359375, "loss": 1.4043, "nll_loss": 1.3404675722122192, "rewards/accuracies": 1.0, "rewards/chosen": 1.0352859497070312, "rewards/margins": 4.104781627655029, "rewards/rejected": -3.069495677947998, "step": 1974 }, { "epoch": 0.32916666666666666, "grad_norm": 26.057538986206055, "learning_rate": 1.566183787708555e-07, "logits/chosen": 3.015766143798828, "logits/rejected": 3.167268991470337, "logps/chosen": -29.510299682617188, "logps/rejected": -361.42352294921875, "loss": 0.5638, "nll_loss": 0.5365509390830994, "rewards/accuracies": 1.0, "rewards/chosen": 1.2952613830566406, "rewards/margins": 7.5659613609313965, "rewards/rejected": -6.270699977874756, "step": 1975 }, { "epoch": 0.3293333333333333, "grad_norm": 25.19394302368164, "learning_rate": 1.565738765562932e-07, "logits/chosen": 2.888869285583496, "logits/rejected": 3.1271722316741943, "logps/chosen": -31.232315063476562, "logps/rejected": -140.1446533203125, "loss": 0.536, "nll_loss": 0.5120052099227905, "rewards/accuracies": 1.0, "rewards/chosen": 1.886331558227539, "rewards/margins": 5.901449203491211, "rewards/rejected": -4.015117645263672, "step": 1976 }, { "epoch": 0.3295, "grad_norm": 263.1483154296875, "learning_rate": 1.565293578574615e-07, "logits/chosen": 2.6680502891540527, "logits/rejected": 2.9159440994262695, "logps/chosen": -40.88063049316406, "logps/rejected": -107.40016174316406, "loss": 3.6207, "nll_loss": 0.619403600692749, "rewards/accuracies": 0.0, "rewards/chosen": 1.7009509801864624, "rewards/margins": -2.4403834342956543, "rewards/rejected": 4.141334533691406, "step": 1977 }, { "epoch": 0.32966666666666666, "grad_norm": 47.32052230834961, "learning_rate": 1.5648482268733206e-07, "logits/chosen": 3.1915831565856934, "logits/rejected": 3.306962490081787, "logps/chosen": -77.25720977783203, "logps/rejected": -153.83270263671875, "loss": 1.5698, "nll_loss": 1.5451440811157227, "rewards/accuracies": 1.0, "rewards/chosen": 3.596731185913086, "rewards/margins": 6.4865546226501465, "rewards/rejected": -2.8898234367370605, "step": 1978 }, { "epoch": 0.3298333333333333, "grad_norm": 111.61925506591797, "learning_rate": 1.5644027105888133e-07, "logits/chosen": 2.760999917984009, "logits/rejected": 3.049692392349243, "logps/chosen": -32.23871612548828, "logps/rejected": -99.56588745117188, "loss": 1.725, "nll_loss": 1.4016833305358887, "rewards/accuracies": 1.0, "rewards/chosen": -1.4485639333724976, "rewards/margins": 2.21870756149292, "rewards/rejected": -3.667271375656128, "step": 1979 }, { "epoch": 0.33, "grad_norm": 31.331533432006836, "learning_rate": 1.5639570298509063e-07, "logits/chosen": 1.5094075202941895, "logits/rejected": 2.5763697624206543, "logps/chosen": -94.74491882324219, "logps/rejected": -325.4991760253906, "loss": 1.0959, "nll_loss": 1.0766469240188599, "rewards/accuracies": 1.0, "rewards/chosen": 1.6174354553222656, "rewards/margins": 12.882539749145508, "rewards/rejected": -11.265104293823242, "step": 1980 }, { "epoch": 0.33016666666666666, "grad_norm": 42.27424621582031, "learning_rate": 1.5635111847894602e-07, "logits/chosen": 3.0905699729919434, "logits/rejected": 3.231006145477295, "logps/chosen": -18.966230392456055, "logps/rejected": -33.953765869140625, "loss": 0.867, "nll_loss": 0.8246188163757324, "rewards/accuracies": 1.0, "rewards/chosen": 2.8939738273620605, "rewards/margins": 5.292283058166504, "rewards/rejected": -2.3983089923858643, "step": 1981 }, { "epoch": 0.3303333333333333, "grad_norm": 32.43072509765625, "learning_rate": 1.5630651755343835e-07, "logits/chosen": 1.3341069221496582, "logits/rejected": 2.2018723487854004, "logps/chosen": -86.92109680175781, "logps/rejected": -261.66510009765625, "loss": 1.1667, "nll_loss": 1.128845453262329, "rewards/accuracies": 1.0, "rewards/chosen": 0.8979431390762329, "rewards/margins": 9.044995307922363, "rewards/rejected": -8.147051811218262, "step": 1982 }, { "epoch": 0.3305, "grad_norm": 33.41130447387695, "learning_rate": 1.5626190022156326e-07, "logits/chosen": 2.786313533782959, "logits/rejected": 2.9300198554992676, "logps/chosen": -64.44491577148438, "logps/rejected": -274.2445068359375, "loss": 0.9287, "nll_loss": 0.9076750874519348, "rewards/accuracies": 1.0, "rewards/chosen": 1.5392448902130127, "rewards/margins": 9.128185272216797, "rewards/rejected": -7.588940620422363, "step": 1983 }, { "epoch": 0.33066666666666666, "grad_norm": 57.80878829956055, "learning_rate": 1.5621726649632114e-07, "logits/chosen": 2.151319980621338, "logits/rejected": 2.3395638465881348, "logps/chosen": -241.63168334960938, "logps/rejected": -347.5538330078125, "loss": 1.2959, "nll_loss": 1.1673028469085693, "rewards/accuracies": 1.0, "rewards/chosen": -0.5202392935752869, "rewards/margins": 5.297900676727295, "rewards/rejected": -5.818140029907227, "step": 1984 }, { "epoch": 0.3308333333333333, "grad_norm": 29.64607048034668, "learning_rate": 1.5617261639071723e-07, "logits/chosen": 3.422426700592041, "logits/rejected": 3.5315983295440674, "logps/chosen": -76.06303405761719, "logps/rejected": -202.36203002929688, "loss": 0.9822, "nll_loss": 0.9628233313560486, "rewards/accuracies": 1.0, "rewards/chosen": 1.7172043323516846, "rewards/margins": 7.326547622680664, "rewards/rejected": -5.6093430519104, "step": 1985 }, { "epoch": 0.331, "grad_norm": 70.77349853515625, "learning_rate": 1.5612794991776145e-07, "logits/chosen": 2.992788076400757, "logits/rejected": 3.0766384601593018, "logps/chosen": -83.2315444946289, "logps/rejected": -201.91619873046875, "loss": 2.6572, "nll_loss": 2.6009857654571533, "rewards/accuracies": 1.0, "rewards/chosen": 0.4968811273574829, "rewards/margins": 6.3857102394104, "rewards/rejected": -5.888829231262207, "step": 1986 }, { "epoch": 0.33116666666666666, "grad_norm": 74.32589721679688, "learning_rate": 1.560832670904686e-07, "logits/chosen": 1.6716734170913696, "logits/rejected": 2.403566360473633, "logps/chosen": -32.71429443359375, "logps/rejected": -316.35369873046875, "loss": 1.3556, "nll_loss": 1.3085715770721436, "rewards/accuracies": 1.0, "rewards/chosen": 0.6791465878486633, "rewards/margins": 7.107807636260986, "rewards/rejected": -6.428660869598389, "step": 1987 }, { "epoch": 0.3313333333333333, "grad_norm": 295.8221435546875, "learning_rate": 1.5603856792185815e-07, "logits/chosen": 1.8569445610046387, "logits/rejected": 2.27496075630188, "logps/chosen": -86.79539489746094, "logps/rejected": -89.48023223876953, "loss": 3.1137, "nll_loss": 0.8947978615760803, "rewards/accuracies": 0.0, "rewards/chosen": 1.5070511102676392, "rewards/margins": -1.6101497411727905, "rewards/rejected": 3.1172008514404297, "step": 1988 }, { "epoch": 0.3315, "grad_norm": 30.917945861816406, "learning_rate": 1.5599385242495434e-07, "logits/chosen": 2.945600748062134, "logits/rejected": 2.7236781120300293, "logps/chosen": -64.66353607177734, "logps/rejected": -75.50143432617188, "loss": 0.7944, "nll_loss": 0.7432589530944824, "rewards/accuracies": 1.0, "rewards/chosen": 1.5644371509552002, "rewards/margins": 4.424108505249023, "rewards/rejected": -2.8596713542938232, "step": 1989 }, { "epoch": 0.33166666666666667, "grad_norm": 31.52985191345215, "learning_rate": 1.5594912061278625e-07, "logits/chosen": 2.6739134788513184, "logits/rejected": 2.8660852909088135, "logps/chosen": -49.94376754760742, "logps/rejected": -373.28302001953125, "loss": 0.829, "nll_loss": 0.7803714275360107, "rewards/accuracies": 1.0, "rewards/chosen": 1.0042861700057983, "rewards/margins": 4.844304084777832, "rewards/rejected": -3.840017795562744, "step": 1990 }, { "epoch": 0.3318333333333333, "grad_norm": 208.19517517089844, "learning_rate": 1.5590437249838759e-07, "logits/chosen": 2.8224172592163086, "logits/rejected": 2.180753707885742, "logps/chosen": -53.886756896972656, "logps/rejected": -31.084062576293945, "loss": 1.9423, "nll_loss": 0.22641490399837494, "rewards/accuracies": 0.0, "rewards/chosen": -0.982271671295166, "rewards/margins": -1.3635573387145996, "rewards/rejected": 0.3812856674194336, "step": 1991 }, { "epoch": 0.332, "grad_norm": 26.527442932128906, "learning_rate": 1.5585960809479694e-07, "logits/chosen": 2.7218759059906006, "logits/rejected": 2.7874486446380615, "logps/chosen": -137.57835388183594, "logps/rejected": -277.66912841796875, "loss": 0.9205, "nll_loss": 0.876295268535614, "rewards/accuracies": 1.0, "rewards/chosen": 2.107832431793213, "rewards/margins": 4.780710220336914, "rewards/rejected": -2.672877550125122, "step": 1992 }, { "epoch": 0.33216666666666667, "grad_norm": 204.55247497558594, "learning_rate": 1.5581482741505756e-07, "logits/chosen": 2.966987133026123, "logits/rejected": 2.195699691772461, "logps/chosen": -53.94216537475586, "logps/rejected": -31.88646697998047, "loss": 1.8841, "nll_loss": 0.22664771974086761, "rewards/accuracies": 0.0, "rewards/chosen": -0.9878125190734863, "rewards/margins": -1.2888576984405518, "rewards/rejected": 0.3010452389717102, "step": 1993 }, { "epoch": 0.3323333333333333, "grad_norm": 29.501211166381836, "learning_rate": 1.5577003047221743e-07, "logits/chosen": 2.8624014854431152, "logits/rejected": 2.9910941123962402, "logps/chosen": -51.83827209472656, "logps/rejected": -128.4317626953125, "loss": 0.6822, "nll_loss": 0.6399786472320557, "rewards/accuracies": 1.0, "rewards/chosen": 0.94927978515625, "rewards/margins": 5.6361985206604, "rewards/rejected": -4.68691873550415, "step": 1994 }, { "epoch": 0.3325, "grad_norm": 42.89493942260742, "learning_rate": 1.5572521727932936e-07, "logits/chosen": 1.9909316301345825, "logits/rejected": 2.6504478454589844, "logps/chosen": -97.42355346679688, "logps/rejected": -376.7601318359375, "loss": 0.8958, "nll_loss": 0.8856688737869263, "rewards/accuracies": 1.0, "rewards/chosen": 2.334648847579956, "rewards/margins": 9.080511093139648, "rewards/rejected": -6.7458624839782715, "step": 1995 }, { "epoch": 0.33266666666666667, "grad_norm": 42.22117233276367, "learning_rate": 1.5568038784945078e-07, "logits/chosen": 3.5685157775878906, "logits/rejected": 3.6777682304382324, "logps/chosen": -65.57781982421875, "logps/rejected": -66.86427307128906, "loss": 1.0321, "nll_loss": 0.9236313700675964, "rewards/accuracies": 1.0, "rewards/chosen": 0.3010147213935852, "rewards/margins": 3.365081787109375, "rewards/rejected": -3.0640671253204346, "step": 1996 }, { "epoch": 0.3328333333333333, "grad_norm": 29.044353485107422, "learning_rate": 1.5563554219564395e-07, "logits/chosen": 1.2107700109481812, "logits/rejected": 1.7413618564605713, "logps/chosen": -134.0599365234375, "logps/rejected": -248.91029357910156, "loss": 1.2115, "nll_loss": 1.196963906288147, "rewards/accuracies": 1.0, "rewards/chosen": 1.909895420074463, "rewards/margins": 10.873189926147461, "rewards/rejected": -8.963294982910156, "step": 1997 }, { "epoch": 0.333, "grad_norm": 24.65973472595215, "learning_rate": 1.555906803309758e-07, "logits/chosen": 1.9493440389633179, "logits/rejected": 1.315794587135315, "logps/chosen": -230.771240234375, "logps/rejected": -147.89637756347656, "loss": 0.9888, "nll_loss": 0.9615468978881836, "rewards/accuracies": 1.0, "rewards/chosen": 1.6620635986328125, "rewards/margins": 5.7665228843688965, "rewards/rejected": -4.104459285736084, "step": 1998 }, { "epoch": 0.33316666666666667, "grad_norm": 27.869190216064453, "learning_rate": 1.55545802268518e-07, "logits/chosen": 2.273763656616211, "logits/rejected": 2.7459115982055664, "logps/chosen": -14.06382942199707, "logps/rejected": -132.2957305908203, "loss": 0.4542, "nll_loss": 0.39066195487976074, "rewards/accuracies": 1.0, "rewards/chosen": 0.9633064270019531, "rewards/margins": 4.150028228759766, "rewards/rejected": -3.1867218017578125, "step": 1999 }, { "epoch": 0.3333333333333333, "grad_norm": 23.723243713378906, "learning_rate": 1.55500908021347e-07, "logits/chosen": 1.5366880893707275, "logits/rejected": 2.6003308296203613, "logps/chosen": -28.938785552978516, "logps/rejected": -271.8175048828125, "loss": 0.4901, "nll_loss": 0.47440630197525024, "rewards/accuracies": 1.0, "rewards/chosen": 1.8252766132354736, "rewards/margins": 10.416274070739746, "rewards/rejected": -8.590997695922852, "step": 2000 }, { "epoch": 0.3335, "grad_norm": 27.40096092224121, "learning_rate": 1.554559976025438e-07, "logits/chosen": 1.5337066650390625, "logits/rejected": 2.3190717697143555, "logps/chosen": -51.67434310913086, "logps/rejected": -176.2503662109375, "loss": 0.7524, "nll_loss": 0.7278075218200684, "rewards/accuracies": 1.0, "rewards/chosen": 2.0029513835906982, "rewards/margins": 5.776910781860352, "rewards/rejected": -3.7739596366882324, "step": 2001 }, { "epoch": 0.33366666666666667, "grad_norm": 229.908447265625, "learning_rate": 1.554110710251943e-07, "logits/chosen": 2.77999210357666, "logits/rejected": 2.6965737342834473, "logps/chosen": -95.2432861328125, "logps/rejected": -49.808982849121094, "loss": 3.0356, "nll_loss": 1.2210679054260254, "rewards/accuracies": 0.0, "rewards/chosen": 1.17363440990448, "rewards/margins": -1.1930290460586548, "rewards/rejected": 2.3666634559631348, "step": 2002 }, { "epoch": 0.3338333333333333, "grad_norm": 31.840652465820312, "learning_rate": 1.5536612830238897e-07, "logits/chosen": 2.7480878829956055, "logits/rejected": 2.822369337081909, "logps/chosen": -89.17247772216797, "logps/rejected": -170.2711944580078, "loss": 1.0071, "nll_loss": 0.9799174070358276, "rewards/accuracies": 1.0, "rewards/chosen": 1.2770271301269531, "rewards/margins": 7.884435176849365, "rewards/rejected": -6.607408046722412, "step": 2003 }, { "epoch": 0.334, "grad_norm": 29.339323043823242, "learning_rate": 1.5532116944722307e-07, "logits/chosen": 2.2859017848968506, "logits/rejected": 2.2995471954345703, "logps/chosen": -79.94429016113281, "logps/rejected": -62.13564682006836, "loss": 0.8868, "nll_loss": 0.8241679668426514, "rewards/accuracies": 1.0, "rewards/chosen": 1.3470207452774048, "rewards/margins": 4.071645259857178, "rewards/rejected": -2.7246246337890625, "step": 2004 }, { "epoch": 0.33416666666666667, "grad_norm": 71.8138427734375, "learning_rate": 1.5527619447279654e-07, "logits/chosen": 1.97356379032135, "logits/rejected": 2.0220353603363037, "logps/chosen": -10.255894660949707, "logps/rejected": -35.870704650878906, "loss": 0.629, "nll_loss": 0.5397838950157166, "rewards/accuracies": 1.0, "rewards/chosen": 2.0636730194091797, "rewards/margins": 3.811206579208374, "rewards/rejected": -1.7475335597991943, "step": 2005 }, { "epoch": 0.3343333333333333, "grad_norm": 72.5429458618164, "learning_rate": 1.55231203392214e-07, "logits/chosen": 3.07486891746521, "logits/rejected": 3.1019492149353027, "logps/chosen": -45.77452850341797, "logps/rejected": -135.01638793945312, "loss": 1.2749, "nll_loss": 1.2371493577957153, "rewards/accuracies": 1.0, "rewards/chosen": 0.8952827453613281, "rewards/margins": 8.798290252685547, "rewards/rejected": -7.903007984161377, "step": 2006 }, { "epoch": 0.3345, "grad_norm": 29.962501525878906, "learning_rate": 1.5518619621858473e-07, "logits/chosen": 3.225659132003784, "logits/rejected": 3.1468613147735596, "logps/chosen": -86.1405029296875, "logps/rejected": -121.67301940917969, "loss": 0.9247, "nll_loss": 0.8789847493171692, "rewards/accuracies": 1.0, "rewards/chosen": 1.719915747642517, "rewards/margins": 4.625315189361572, "rewards/rejected": -2.9053995609283447, "step": 2007 }, { "epoch": 0.33466666666666667, "grad_norm": 89.15556335449219, "learning_rate": 1.551411729650228e-07, "logits/chosen": 2.593252420425415, "logits/rejected": 3.1836323738098145, "logps/chosen": -91.10589599609375, "logps/rejected": -470.10101318359375, "loss": 1.7233, "nll_loss": 1.469449758529663, "rewards/accuracies": 1.0, "rewards/chosen": -1.5715187788009644, "rewards/margins": 4.72473669052124, "rewards/rejected": -6.296255588531494, "step": 2008 }, { "epoch": 0.3348333333333333, "grad_norm": 31.85531997680664, "learning_rate": 1.550961336446469e-07, "logits/chosen": 2.8738043308258057, "logits/rejected": 3.235126495361328, "logps/chosen": -52.29513931274414, "logps/rejected": -168.4122314453125, "loss": 0.7323, "nll_loss": 0.6972684264183044, "rewards/accuracies": 1.0, "rewards/chosen": 1.0541820526123047, "rewards/margins": 6.544074535369873, "rewards/rejected": -5.489892482757568, "step": 2009 }, { "epoch": 0.335, "grad_norm": 112.51761627197266, "learning_rate": 1.5505107827058036e-07, "logits/chosen": 2.9070773124694824, "logits/rejected": 2.9890828132629395, "logps/chosen": -36.80371856689453, "logps/rejected": -90.30508422851562, "loss": 2.5716, "nll_loss": 2.4535815715789795, "rewards/accuracies": 1.0, "rewards/chosen": 0.148590087890625, "rewards/margins": 3.2846839427948, "rewards/rejected": -3.136093854904175, "step": 2010 }, { "epoch": 0.33516666666666667, "grad_norm": 57.47226333618164, "learning_rate": 1.5500600685595128e-07, "logits/chosen": 2.385998249053955, "logits/rejected": 2.599025011062622, "logps/chosen": -84.55614471435547, "logps/rejected": -140.30010986328125, "loss": 1.0859, "nll_loss": 1.031172752380371, "rewards/accuracies": 1.0, "rewards/chosen": 1.2352584600448608, "rewards/margins": 4.3404364585876465, "rewards/rejected": -3.105178117752075, "step": 2011 }, { "epoch": 0.3353333333333333, "grad_norm": 21.7257022857666, "learning_rate": 1.5496091941389233e-07, "logits/chosen": 2.77634859085083, "logits/rejected": 2.915987730026245, "logps/chosen": -69.1087417602539, "logps/rejected": -137.402099609375, "loss": 0.6808, "nll_loss": 0.6458759903907776, "rewards/accuracies": 1.0, "rewards/chosen": 2.28620982170105, "rewards/margins": 5.198241233825684, "rewards/rejected": -2.912031650543213, "step": 2012 }, { "epoch": 0.3355, "grad_norm": 25.92667579650879, "learning_rate": 1.54915815957541e-07, "logits/chosen": 2.542168378829956, "logits/rejected": 2.4423696994781494, "logps/chosen": -120.09081268310547, "logps/rejected": -114.87750244140625, "loss": 1.0304, "nll_loss": 0.9924860596656799, "rewards/accuracies": 1.0, "rewards/chosen": 2.424579620361328, "rewards/margins": 5.145507335662842, "rewards/rejected": -2.7209277153015137, "step": 2013 }, { "epoch": 0.33566666666666667, "grad_norm": 33.71769714355469, "learning_rate": 1.5487069650003925e-07, "logits/chosen": 2.7047839164733887, "logits/rejected": 2.7785184383392334, "logps/chosen": -31.541553497314453, "logps/rejected": -50.58675765991211, "loss": 0.7268, "nll_loss": 0.6710968613624573, "rewards/accuracies": 1.0, "rewards/chosen": 1.7782478332519531, "rewards/margins": 4.326285362243652, "rewards/rejected": -2.548037528991699, "step": 2014 }, { "epoch": 0.3358333333333333, "grad_norm": 22.82570457458496, "learning_rate": 1.548255610545339e-07, "logits/chosen": 2.3993825912475586, "logits/rejected": 2.337494373321533, "logps/chosen": -77.00212097167969, "logps/rejected": -245.49916076660156, "loss": 0.7993, "nll_loss": 0.7857357859611511, "rewards/accuracies": 1.0, "rewards/chosen": 2.005115509033203, "rewards/margins": 9.018428802490234, "rewards/rejected": -7.013313293457031, "step": 2015 }, { "epoch": 0.336, "grad_norm": 26.684722900390625, "learning_rate": 1.5478040963417628e-07, "logits/chosen": 1.167069673538208, "logits/rejected": 1.2587907314300537, "logps/chosen": -87.61688995361328, "logps/rejected": -186.7146759033203, "loss": 1.0636, "nll_loss": 1.0556249618530273, "rewards/accuracies": 1.0, "rewards/chosen": 2.6482553482055664, "rewards/margins": 8.787918090820312, "rewards/rejected": -6.139662265777588, "step": 2016 }, { "epoch": 0.33616666666666667, "grad_norm": 24.684377670288086, "learning_rate": 1.5473524225212247e-07, "logits/chosen": 1.7623587846755981, "logits/rejected": 2.5634758472442627, "logps/chosen": -24.137907028198242, "logps/rejected": -281.8976135253906, "loss": 0.4829, "nll_loss": 0.4641905426979065, "rewards/accuracies": 1.0, "rewards/chosen": 1.7556540966033936, "rewards/margins": 7.32912540435791, "rewards/rejected": -5.5734710693359375, "step": 2017 }, { "epoch": 0.3363333333333333, "grad_norm": 52.4196891784668, "learning_rate": 1.546900589215331e-07, "logits/chosen": 1.262901782989502, "logits/rejected": 2.293503999710083, "logps/chosen": -30.531843185424805, "logps/rejected": -229.66323852539062, "loss": 0.8551, "nll_loss": 0.8251849412918091, "rewards/accuracies": 1.0, "rewards/chosen": 1.5433686971664429, "rewards/margins": 5.623061180114746, "rewards/rejected": -4.079692363739014, "step": 2018 }, { "epoch": 0.3365, "grad_norm": 36.38166809082031, "learning_rate": 1.5464485965557357e-07, "logits/chosen": 3.274961471557617, "logits/rejected": 3.1225781440734863, "logps/chosen": -23.79458999633789, "logps/rejected": -85.9231948852539, "loss": 0.6105, "nll_loss": 0.5948647260665894, "rewards/accuracies": 1.0, "rewards/chosen": 2.429983139038086, "rewards/margins": 6.624701976776123, "rewards/rejected": -4.194718837738037, "step": 2019 }, { "epoch": 0.33666666666666667, "grad_norm": 51.65391159057617, "learning_rate": 1.545996444674138e-07, "logits/chosen": 2.10221266746521, "logits/rejected": 1.9565274715423584, "logps/chosen": -34.94244384765625, "logps/rejected": -53.8346061706543, "loss": 1.1224, "nll_loss": 1.0919513702392578, "rewards/accuracies": 1.0, "rewards/chosen": 1.788839340209961, "rewards/margins": 5.379611968994141, "rewards/rejected": -3.5907723903656006, "step": 2020 }, { "epoch": 0.3368333333333333, "grad_norm": 38.52401351928711, "learning_rate": 1.5455441337022848e-07, "logits/chosen": 3.0583996772766113, "logits/rejected": 3.1113944053649902, "logps/chosen": -15.643318176269531, "logps/rejected": -99.24269104003906, "loss": 0.5009, "nll_loss": 0.4011107385158539, "rewards/accuracies": 1.0, "rewards/chosen": 0.6489285230636597, "rewards/margins": 3.3407278060913086, "rewards/rejected": -2.6917991638183594, "step": 2021 }, { "epoch": 0.337, "grad_norm": 26.268964767456055, "learning_rate": 1.545091663771968e-07, "logits/chosen": 1.1706784963607788, "logits/rejected": 1.7317146062850952, "logps/chosen": -43.28499984741211, "logps/rejected": -152.77294921875, "loss": 0.6537, "nll_loss": 0.6096479296684265, "rewards/accuracies": 1.0, "rewards/chosen": 1.0295677185058594, "rewards/margins": 5.136001110076904, "rewards/rejected": -4.106433391571045, "step": 2022 }, { "epoch": 0.33716666666666667, "grad_norm": 24.254655838012695, "learning_rate": 1.544639035015027e-07, "logits/chosen": 2.33807110786438, "logits/rejected": 2.4512522220611572, "logps/chosen": -73.73440551757812, "logps/rejected": -287.4214172363281, "loss": 0.7776, "nll_loss": 0.7523917555809021, "rewards/accuracies": 1.0, "rewards/chosen": 1.5509010553359985, "rewards/margins": 6.28200626373291, "rewards/rejected": -4.731105327606201, "step": 2023 }, { "epoch": 0.3373333333333333, "grad_norm": 47.136173248291016, "learning_rate": 1.5441862475633466e-07, "logits/chosen": 1.9601943492889404, "logits/rejected": 1.917211890220642, "logps/chosen": -68.48530578613281, "logps/rejected": -65.38346099853516, "loss": 1.2746, "nll_loss": 1.222951889038086, "rewards/accuracies": 1.0, "rewards/chosen": 0.8948608636856079, "rewards/margins": 4.804894924163818, "rewards/rejected": -3.910033941268921, "step": 2024 }, { "epoch": 0.3375, "grad_norm": 29.507076263427734, "learning_rate": 1.5437333015488586e-07, "logits/chosen": 2.506917715072632, "logits/rejected": 2.570863962173462, "logps/chosen": -73.84800720214844, "logps/rejected": -154.06280517578125, "loss": 0.9272, "nll_loss": 0.8897350430488586, "rewards/accuracies": 1.0, "rewards/chosen": 1.113167643547058, "rewards/margins": 5.656518936157227, "rewards/rejected": -4.543351173400879, "step": 2025 }, { "epoch": 0.33766666666666667, "grad_norm": 47.613372802734375, "learning_rate": 1.54328019710354e-07, "logits/chosen": 3.101282835006714, "logits/rejected": 3.120920181274414, "logps/chosen": -84.73271942138672, "logps/rejected": -222.33798217773438, "loss": 1.3079, "nll_loss": 1.2104673385620117, "rewards/accuracies": 1.0, "rewards/chosen": 1.103253960609436, "rewards/margins": 3.3380255699157715, "rewards/rejected": -2.234771728515625, "step": 2026 }, { "epoch": 0.3378333333333333, "grad_norm": 26.17234992980957, "learning_rate": 1.542826934359415e-07, "logits/chosen": 1.9087673425674438, "logits/rejected": 2.1427407264709473, "logps/chosen": -21.709903717041016, "logps/rejected": -143.80511474609375, "loss": 0.4721, "nll_loss": 0.4522896707057953, "rewards/accuracies": 1.0, "rewards/chosen": 2.8709278106689453, "rewards/margins": 6.2824177742004395, "rewards/rejected": -3.411489963531494, "step": 2027 }, { "epoch": 0.338, "grad_norm": 48.1601676940918, "learning_rate": 1.5423735134485534e-07, "logits/chosen": 1.2051478624343872, "logits/rejected": 2.217421531677246, "logps/chosen": -128.53155517578125, "logps/rejected": -273.1358947753906, "loss": 1.4748, "nll_loss": 1.3529636859893799, "rewards/accuracies": 1.0, "rewards/chosen": -0.4458824098110199, "rewards/margins": 5.1132378578186035, "rewards/rejected": -5.559120178222656, "step": 2028 }, { "epoch": 0.33816666666666667, "grad_norm": 24.197195053100586, "learning_rate": 1.5419199345030708e-07, "logits/chosen": 2.782374858856201, "logits/rejected": 2.994752883911133, "logps/chosen": -76.03544616699219, "logps/rejected": -331.1561279296875, "loss": 0.8741, "nll_loss": 0.8640391826629639, "rewards/accuracies": 1.0, "rewards/chosen": 2.3823113441467285, "rewards/margins": 8.467357635498047, "rewards/rejected": -6.085046768188477, "step": 2029 }, { "epoch": 0.3383333333333333, "grad_norm": 33.58424758911133, "learning_rate": 1.54146619765513e-07, "logits/chosen": 2.1499085426330566, "logits/rejected": 1.817142367362976, "logps/chosen": -95.68241882324219, "logps/rejected": -133.90231323242188, "loss": 1.012, "nll_loss": 0.9568241238594055, "rewards/accuracies": 1.0, "rewards/chosen": 1.4006942510604858, "rewards/margins": 4.290960311889648, "rewards/rejected": -2.890265941619873, "step": 2030 }, { "epoch": 0.3385, "grad_norm": 27.087604522705078, "learning_rate": 1.5410123030369384e-07, "logits/chosen": 2.1359336376190186, "logits/rejected": 2.464451551437378, "logps/chosen": -105.08208465576172, "logps/rejected": -189.4487762451172, "loss": 0.8785, "nll_loss": 0.8406566977500916, "rewards/accuracies": 1.0, "rewards/chosen": 0.8763527274131775, "rewards/margins": 9.902524948120117, "rewards/rejected": -9.026172637939453, "step": 2031 }, { "epoch": 0.33866666666666667, "grad_norm": 31.412002563476562, "learning_rate": 1.54055825078075e-07, "logits/chosen": 2.602792501449585, "logits/rejected": 2.6350276470184326, "logps/chosen": -106.29330444335938, "logps/rejected": -177.82948303222656, "loss": 0.9911, "nll_loss": 0.9575971961021423, "rewards/accuracies": 1.0, "rewards/chosen": 1.027117133140564, "rewards/margins": 7.935153484344482, "rewards/rejected": -6.908036231994629, "step": 2032 }, { "epoch": 0.3388333333333333, "grad_norm": 39.26091766357422, "learning_rate": 1.540104041018865e-07, "logits/chosen": 1.9910352230072021, "logits/rejected": 2.219942808151245, "logps/chosen": -36.89461135864258, "logps/rejected": -361.9505615234375, "loss": 0.8916, "nll_loss": 0.8784432411193848, "rewards/accuracies": 1.0, "rewards/chosen": 1.9928357601165771, "rewards/margins": 14.306464195251465, "rewards/rejected": -12.313628196716309, "step": 2033 }, { "epoch": 0.339, "grad_norm": 24.887741088867188, "learning_rate": 1.5396496738836291e-07, "logits/chosen": 1.3233778476715088, "logits/rejected": 2.36023211479187, "logps/chosen": -76.69002532958984, "logps/rejected": -554.9871215820312, "loss": 0.9736, "nll_loss": 0.958625316619873, "rewards/accuracies": 1.0, "rewards/chosen": 2.5593254566192627, "rewards/margins": 6.687335014343262, "rewards/rejected": -4.12800931930542, "step": 2034 }, { "epoch": 0.33916666666666667, "grad_norm": 153.8590545654297, "learning_rate": 1.539195149507434e-07, "logits/chosen": 2.1127066612243652, "logits/rejected": 2.4793102741241455, "logps/chosen": -66.7770004272461, "logps/rejected": -125.54641723632812, "loss": 1.0735, "nll_loss": 0.9539572596549988, "rewards/accuracies": 1.0, "rewards/chosen": 0.5034454464912415, "rewards/margins": 3.0388152599334717, "rewards/rejected": -2.535369873046875, "step": 2035 }, { "epoch": 0.3393333333333333, "grad_norm": 32.3512077331543, "learning_rate": 1.5387404680227173e-07, "logits/chosen": 2.8391172885894775, "logits/rejected": 2.812485456466675, "logps/chosen": -56.58118438720703, "logps/rejected": -154.13714599609375, "loss": 0.8449, "nll_loss": 0.7858498096466064, "rewards/accuracies": 1.0, "rewards/chosen": 0.481362909078598, "rewards/margins": 5.566645622253418, "rewards/rejected": -5.085282802581787, "step": 2036 }, { "epoch": 0.3395, "grad_norm": 52.43410873413086, "learning_rate": 1.538285629561962e-07, "logits/chosen": 2.824706792831421, "logits/rejected": 2.802945852279663, "logps/chosen": -103.50929260253906, "logps/rejected": -81.87200164794922, "loss": 1.596, "nll_loss": 1.5001345872879028, "rewards/accuracies": 1.0, "rewards/chosen": 0.9734787940979004, "rewards/margins": 3.3575539588928223, "rewards/rejected": -2.384075164794922, "step": 2037 }, { "epoch": 0.3396666666666667, "grad_norm": 43.50046157836914, "learning_rate": 1.5378306342576973e-07, "logits/chosen": 2.5474467277526855, "logits/rejected": 2.538480281829834, "logps/chosen": -119.14716339111328, "logps/rejected": -119.83329010009766, "loss": 1.2925, "nll_loss": 1.2283213138580322, "rewards/accuracies": 1.0, "rewards/chosen": 1.6646629571914673, "rewards/margins": 4.086406230926514, "rewards/rejected": -2.421743154525757, "step": 2038 }, { "epoch": 0.3398333333333333, "grad_norm": 31.13317108154297, "learning_rate": 1.5373754822424977e-07, "logits/chosen": 2.776620388031006, "logits/rejected": 2.87341046333313, "logps/chosen": -55.36328887939453, "logps/rejected": -141.110107421875, "loss": 0.8645, "nll_loss": 0.838837742805481, "rewards/accuracies": 1.0, "rewards/chosen": 1.4647263288497925, "rewards/margins": 6.462953567504883, "rewards/rejected": -4.998227119445801, "step": 2039 }, { "epoch": 0.34, "grad_norm": 22.958311080932617, "learning_rate": 1.5369201736489839e-07, "logits/chosen": 3.1162843704223633, "logits/rejected": 3.221813201904297, "logps/chosen": -70.9289321899414, "logps/rejected": -107.35983276367188, "loss": 0.6082, "nll_loss": 0.5720075964927673, "rewards/accuracies": 1.0, "rewards/chosen": 1.4092880487442017, "rewards/margins": 5.19796085357666, "rewards/rejected": -3.788672685623169, "step": 2040 }, { "epoch": 0.3401666666666667, "grad_norm": 37.08390808105469, "learning_rate": 1.5364647086098216e-07, "logits/chosen": 1.1611688137054443, "logits/rejected": 3.3443167209625244, "logps/chosen": -16.07102394104004, "logps/rejected": -418.3307800292969, "loss": 0.5004, "nll_loss": 0.45917215943336487, "rewards/accuracies": 1.0, "rewards/chosen": 0.8154016733169556, "rewards/margins": 6.997277736663818, "rewards/rejected": -6.181876182556152, "step": 2041 }, { "epoch": 0.3403333333333333, "grad_norm": 122.49512481689453, "learning_rate": 1.5360090872577228e-07, "logits/chosen": 2.064177989959717, "logits/rejected": 2.2652697563171387, "logps/chosen": -15.485864639282227, "logps/rejected": -70.2126693725586, "loss": 1.3255, "nll_loss": 0.7039029002189636, "rewards/accuracies": 1.0, "rewards/chosen": 1.9449156522750854, "rewards/margins": 1.018386721611023, "rewards/rejected": 0.9265289306640625, "step": 2042 }, { "epoch": 0.3405, "grad_norm": 26.19841194152832, "learning_rate": 1.5355533097254436e-07, "logits/chosen": 2.9529647827148438, "logits/rejected": 2.961235761642456, "logps/chosen": -62.28190612792969, "logps/rejected": -183.81578063964844, "loss": 0.7831, "nll_loss": 0.7503845691680908, "rewards/accuracies": 1.0, "rewards/chosen": 1.1158684492111206, "rewards/margins": 6.657094955444336, "rewards/rejected": -5.541226387023926, "step": 2043 }, { "epoch": 0.3406666666666667, "grad_norm": 107.20011901855469, "learning_rate": 1.535097376145788e-07, "logits/chosen": 2.249643325805664, "logits/rejected": 2.7249412536621094, "logps/chosen": -83.08922576904297, "logps/rejected": -231.25242614746094, "loss": 2.7704, "nll_loss": 2.443800926208496, "rewards/accuracies": 1.0, "rewards/chosen": -2.056614637374878, "rewards/margins": 4.463271141052246, "rewards/rejected": -6.519885540008545, "step": 2044 }, { "epoch": 0.3408333333333333, "grad_norm": 53.597572326660156, "learning_rate": 1.534641286651603e-07, "logits/chosen": 2.883255958557129, "logits/rejected": 2.5485808849334717, "logps/chosen": -28.168880462646484, "logps/rejected": -89.4970703125, "loss": 0.9458, "nll_loss": 0.9389626979827881, "rewards/accuracies": 1.0, "rewards/chosen": 2.8404605388641357, "rewards/margins": 8.841866493225098, "rewards/rejected": -6.001406192779541, "step": 2045 }, { "epoch": 0.341, "grad_norm": 35.29999542236328, "learning_rate": 1.534185041375783e-07, "logits/chosen": 3.106304168701172, "logits/rejected": 3.1980464458465576, "logps/chosen": -13.494857788085938, "logps/rejected": -137.019775390625, "loss": 0.5237, "nll_loss": 0.49980947375297546, "rewards/accuracies": 1.0, "rewards/chosen": 1.768168330192566, "rewards/margins": 6.015392303466797, "rewards/rejected": -4.247223854064941, "step": 2046 }, { "epoch": 0.3411666666666667, "grad_norm": 45.79985809326172, "learning_rate": 1.5337286404512662e-07, "logits/chosen": 3.234926223754883, "logits/rejected": 3.2207634449005127, "logps/chosen": -32.33470916748047, "logps/rejected": -95.7601089477539, "loss": 0.7838, "nll_loss": 0.6598920226097107, "rewards/accuracies": 1.0, "rewards/chosen": 1.4726818799972534, "rewards/margins": 3.0960209369659424, "rewards/rejected": -1.623339056968689, "step": 2047 }, { "epoch": 0.3413333333333333, "grad_norm": 19.168954849243164, "learning_rate": 1.5332720840110374e-07, "logits/chosen": 2.049065589904785, "logits/rejected": 2.1201817989349365, "logps/chosen": -117.25497436523438, "logps/rejected": -341.3133239746094, "loss": 0.9371, "nll_loss": 0.9232675433158875, "rewards/accuracies": 1.0, "rewards/chosen": 1.9298492670059204, "rewards/margins": 14.162749290466309, "rewards/rejected": -12.23289966583252, "step": 2048 }, { "epoch": 0.3415, "grad_norm": 19.187963485717773, "learning_rate": 1.532815372188126e-07, "logits/chosen": 3.075077533721924, "logits/rejected": 3.131352186203003, "logps/chosen": -51.07008361816406, "logps/rejected": -220.7611083984375, "loss": 0.5219, "nll_loss": 0.5006871819496155, "rewards/accuracies": 1.0, "rewards/chosen": 1.5018333196640015, "rewards/margins": 8.941783905029297, "rewards/rejected": -7.439950942993164, "step": 2049 }, { "epoch": 0.3416666666666667, "grad_norm": 78.12928771972656, "learning_rate": 1.5323585051156067e-07, "logits/chosen": 2.665530204772949, "logits/rejected": 2.5488953590393066, "logps/chosen": -43.54802703857422, "logps/rejected": -33.48533248901367, "loss": 1.6655, "nll_loss": 1.5016562938690186, "rewards/accuracies": 1.0, "rewards/chosen": 0.7518280148506165, "rewards/margins": 2.5132579803466797, "rewards/rejected": -1.761430025100708, "step": 2050 }, { "epoch": 0.3418333333333333, "grad_norm": 20.98651123046875, "learning_rate": 1.5319014829265996e-07, "logits/chosen": 2.796854257583618, "logits/rejected": 2.8884973526000977, "logps/chosen": -33.85432815551758, "logps/rejected": -242.9714813232422, "loss": 0.5286, "nll_loss": 0.5129443407058716, "rewards/accuracies": 1.0, "rewards/chosen": 1.8285900354385376, "rewards/margins": 9.087994575500488, "rewards/rejected": -7.259404182434082, "step": 2051 }, { "epoch": 0.342, "grad_norm": 187.9527587890625, "learning_rate": 1.5314443057542702e-07, "logits/chosen": 2.9544131755828857, "logits/rejected": 3.0573301315307617, "logps/chosen": -80.77630615234375, "logps/rejected": -19.98688316345215, "loss": 3.2056, "nll_loss": 1.0770174264907837, "rewards/accuracies": 0.0, "rewards/chosen": 2.1750571727752686, "rewards/margins": -1.3552842140197754, "rewards/rejected": 3.530341386795044, "step": 2052 }, { "epoch": 0.3421666666666667, "grad_norm": 34.20323181152344, "learning_rate": 1.530986973731829e-07, "logits/chosen": 2.4270026683807373, "logits/rejected": 1.8490962982177734, "logps/chosen": -62.99201583862305, "logps/rejected": -33.370758056640625, "loss": 1.1775, "nll_loss": 1.0160001516342163, "rewards/accuracies": 1.0, "rewards/chosen": 4.494428634643555, "rewards/margins": 5.038104057312012, "rewards/rejected": -0.5436752438545227, "step": 2053 }, { "epoch": 0.3423333333333333, "grad_norm": 21.905223846435547, "learning_rate": 1.5305294869925313e-07, "logits/chosen": 2.5166783332824707, "logits/rejected": 2.5627288818359375, "logps/chosen": -80.1373062133789, "logps/rejected": -239.54307556152344, "loss": 0.8769, "nll_loss": 0.8616915345191956, "rewards/accuracies": 1.0, "rewards/chosen": 4.332417964935303, "rewards/margins": 7.691791534423828, "rewards/rejected": -3.3593735694885254, "step": 2054 }, { "epoch": 0.3425, "grad_norm": 37.35413360595703, "learning_rate": 1.5300718456696777e-07, "logits/chosen": 1.9387013912200928, "logits/rejected": 2.031099557876587, "logps/chosen": -60.65639114379883, "logps/rejected": -129.15228271484375, "loss": 0.9764, "nll_loss": 0.9331752061843872, "rewards/accuracies": 1.0, "rewards/chosen": 1.452576994895935, "rewards/margins": 4.749549865722656, "rewards/rejected": -3.2969727516174316, "step": 2055 }, { "epoch": 0.3426666666666667, "grad_norm": 43.061519622802734, "learning_rate": 1.5296140498966144e-07, "logits/chosen": 2.553605079650879, "logits/rejected": 2.0360794067382812, "logps/chosen": -45.4073486328125, "logps/rejected": -32.48107147216797, "loss": 1.0691, "nll_loss": 1.009052038192749, "rewards/accuracies": 1.0, "rewards/chosen": 2.964914083480835, "rewards/margins": 4.94139289855957, "rewards/rejected": -1.9764788150787354, "step": 2056 }, { "epoch": 0.3428333333333333, "grad_norm": 45.13665771484375, "learning_rate": 1.5291560998067318e-07, "logits/chosen": 2.386704444885254, "logits/rejected": 2.721010208129883, "logps/chosen": -26.239261627197266, "logps/rejected": -475.2614440917969, "loss": 0.9707, "nll_loss": 0.9371165633201599, "rewards/accuracies": 1.0, "rewards/chosen": 1.7318623065948486, "rewards/margins": 5.19005012512207, "rewards/rejected": -3.4581878185272217, "step": 2057 }, { "epoch": 0.343, "grad_norm": 21.73410415649414, "learning_rate": 1.528697995533465e-07, "logits/chosen": 3.1404764652252197, "logits/rejected": 3.221125841140747, "logps/chosen": -78.21200561523438, "logps/rejected": -194.84860229492188, "loss": 0.8396, "nll_loss": 0.8320425152778625, "rewards/accuracies": 1.0, "rewards/chosen": 2.8118622303009033, "rewards/margins": 8.37804126739502, "rewards/rejected": -5.566179275512695, "step": 2058 }, { "epoch": 0.3431666666666667, "grad_norm": 33.677486419677734, "learning_rate": 1.5282397372102957e-07, "logits/chosen": 1.7727184295654297, "logits/rejected": 1.5428160429000854, "logps/chosen": -64.65569305419922, "logps/rejected": -55.1422004699707, "loss": 0.7901, "nll_loss": 0.7431687712669373, "rewards/accuracies": 1.0, "rewards/chosen": 1.1194671392440796, "rewards/margins": 4.770180702209473, "rewards/rejected": -3.6507134437561035, "step": 2059 }, { "epoch": 0.3433333333333333, "grad_norm": 150.6050567626953, "learning_rate": 1.5277813249707485e-07, "logits/chosen": 2.7943637371063232, "logits/rejected": 2.718571662902832, "logps/chosen": -112.2982406616211, "logps/rejected": -94.2041244506836, "loss": 1.7173, "nll_loss": 1.3211559057235718, "rewards/accuracies": 1.0, "rewards/chosen": 0.012944795191287994, "rewards/margins": 1.162549614906311, "rewards/rejected": -1.1496047973632812, "step": 2060 }, { "epoch": 0.3435, "grad_norm": 44.85755157470703, "learning_rate": 1.5273227589483945e-07, "logits/chosen": 2.8062198162078857, "logits/rejected": 2.7023041248321533, "logps/chosen": -29.44121551513672, "logps/rejected": -33.60367965698242, "loss": 0.738, "nll_loss": 0.6691185235977173, "rewards/accuracies": 1.0, "rewards/chosen": 0.8787021636962891, "rewards/margins": 4.003931999206543, "rewards/rejected": -3.125229835510254, "step": 2061 }, { "epoch": 0.3436666666666667, "grad_norm": 34.914695739746094, "learning_rate": 1.5268640392768478e-07, "logits/chosen": 2.839125633239746, "logits/rejected": 2.992511510848999, "logps/chosen": -17.318115234375, "logps/rejected": -223.68246459960938, "loss": 0.5327, "nll_loss": 0.49480319023132324, "rewards/accuracies": 1.0, "rewards/chosen": 1.008494257926941, "rewards/margins": 6.002094745635986, "rewards/rejected": -4.993600368499756, "step": 2062 }, { "epoch": 0.3438333333333333, "grad_norm": 21.562726974487305, "learning_rate": 1.526405166089769e-07, "logits/chosen": 2.473933696746826, "logits/rejected": 2.3174543380737305, "logps/chosen": -144.3341064453125, "logps/rejected": -151.58590698242188, "loss": 0.994, "nll_loss": 0.9686851501464844, "rewards/accuracies": 1.0, "rewards/chosen": 2.284135580062866, "rewards/margins": 5.706506729125977, "rewards/rejected": -3.4223709106445312, "step": 2063 }, { "epoch": 0.344, "grad_norm": 27.500423431396484, "learning_rate": 1.5259461395208626e-07, "logits/chosen": 1.9348522424697876, "logits/rejected": 2.0267763137817383, "logps/chosen": -58.38035583496094, "logps/rejected": -103.46726989746094, "loss": 0.7776, "nll_loss": 0.7484661340713501, "rewards/accuracies": 1.0, "rewards/chosen": 2.103566884994507, "rewards/margins": 5.434855937957764, "rewards/rejected": -3.331289052963257, "step": 2064 }, { "epoch": 0.3441666666666667, "grad_norm": 185.0840606689453, "learning_rate": 1.525486959703878e-07, "logits/chosen": 3.2409098148345947, "logits/rejected": 3.28118634223938, "logps/chosen": -30.620807647705078, "logps/rejected": -22.266963958740234, "loss": 2.4123, "nll_loss": 0.49388402700424194, "rewards/accuracies": 0.0, "rewards/chosen": 0.9659767150878906, "rewards/margins": -1.3574581146240234, "rewards/rejected": 2.323434829711914, "step": 2065 }, { "epoch": 0.3443333333333333, "grad_norm": 23.451709747314453, "learning_rate": 1.525027626772609e-07, "logits/chosen": 2.777198076248169, "logits/rejected": 2.994929790496826, "logps/chosen": -43.41705322265625, "logps/rejected": -202.941162109375, "loss": 0.6481, "nll_loss": 0.6384861469268799, "rewards/accuracies": 1.0, "rewards/chosen": 2.55792236328125, "rewards/margins": 7.946923732757568, "rewards/rejected": -5.389001369476318, "step": 2066 }, { "epoch": 0.3445, "grad_norm": 30.784866333007812, "learning_rate": 1.5245681408608942e-07, "logits/chosen": 1.9514555931091309, "logits/rejected": 2.019127130508423, "logps/chosen": -90.83793640136719, "logps/rejected": -107.18923950195312, "loss": 0.8434, "nll_loss": 0.7898951768875122, "rewards/accuracies": 1.0, "rewards/chosen": 1.3844964504241943, "rewards/margins": 4.348844528198242, "rewards/rejected": -2.9643478393554688, "step": 2067 }, { "epoch": 0.3446666666666667, "grad_norm": 20.82941436767578, "learning_rate": 1.524108502102617e-07, "logits/chosen": 1.3741153478622437, "logits/rejected": 2.2844247817993164, "logps/chosen": -87.13862609863281, "logps/rejected": -256.9096374511719, "loss": 0.671, "nll_loss": 0.6551775336265564, "rewards/accuracies": 1.0, "rewards/chosen": 1.8985016345977783, "rewards/margins": 7.717442512512207, "rewards/rejected": -5.818941116333008, "step": 2068 }, { "epoch": 0.3448333333333333, "grad_norm": 29.1798095703125, "learning_rate": 1.5236487106317048e-07, "logits/chosen": 2.15100359916687, "logits/rejected": 2.217359781265259, "logps/chosen": -59.427040100097656, "logps/rejected": -163.4776611328125, "loss": 0.8028, "nll_loss": 0.7717798352241516, "rewards/accuracies": 1.0, "rewards/chosen": 1.100701928138733, "rewards/margins": 8.147539138793945, "rewards/rejected": -7.046836853027344, "step": 2069 }, { "epoch": 0.345, "grad_norm": 45.880184173583984, "learning_rate": 1.52318876658213e-07, "logits/chosen": 0.9719594120979309, "logits/rejected": 2.505943536758423, "logps/chosen": -24.780244827270508, "logps/rejected": -323.7920837402344, "loss": 0.969, "nll_loss": 0.9530864953994751, "rewards/accuracies": 1.0, "rewards/chosen": 2.3998308181762695, "rewards/margins": 6.58427619934082, "rewards/rejected": -4.184445381164551, "step": 2070 }, { "epoch": 0.3451666666666667, "grad_norm": 63.515174865722656, "learning_rate": 1.5227286700879093e-07, "logits/chosen": 3.080404281616211, "logits/rejected": 3.0276169776916504, "logps/chosen": -40.839012145996094, "logps/rejected": -214.18508911132812, "loss": 1.5816, "nll_loss": 1.5707314014434814, "rewards/accuracies": 1.0, "rewards/chosen": 2.2071609497070312, "rewards/margins": 9.557201385498047, "rewards/rejected": -7.350039958953857, "step": 2071 }, { "epoch": 0.3453333333333333, "grad_norm": 39.70988845825195, "learning_rate": 1.5222684212831034e-07, "logits/chosen": 2.896202802658081, "logits/rejected": 2.8192052841186523, "logps/chosen": -92.36353302001953, "logps/rejected": -116.61164855957031, "loss": 1.1398, "nll_loss": 1.0866297483444214, "rewards/accuracies": 1.0, "rewards/chosen": 1.6461442708969116, "rewards/margins": 4.370299339294434, "rewards/rejected": -2.7241549491882324, "step": 2072 }, { "epoch": 0.3455, "grad_norm": 49.19292449951172, "learning_rate": 1.521808020301818e-07, "logits/chosen": 2.7399179935455322, "logits/rejected": 2.5660173892974854, "logps/chosen": -130.73683166503906, "logps/rejected": -81.65959930419922, "loss": 0.8995, "nll_loss": 0.782855212688446, "rewards/accuracies": 1.0, "rewards/chosen": 1.4830673933029175, "rewards/margins": 3.1834840774536133, "rewards/rejected": -1.7004166841506958, "step": 2073 }, { "epoch": 0.3456666666666667, "grad_norm": 27.897310256958008, "learning_rate": 1.521347467278203e-07, "logits/chosen": 2.485016107559204, "logits/rejected": 2.5982563495635986, "logps/chosen": -111.80632019042969, "logps/rejected": -343.07965087890625, "loss": 1.1208, "nll_loss": 1.106993317604065, "rewards/accuracies": 1.0, "rewards/chosen": 1.9303512573242188, "rewards/margins": 13.136619567871094, "rewards/rejected": -11.206268310546875, "step": 2074 }, { "epoch": 0.3458333333333333, "grad_norm": 40.37145233154297, "learning_rate": 1.5208867623464525e-07, "logits/chosen": 1.7847599983215332, "logits/rejected": 2.486166477203369, "logps/chosen": -34.36970138549805, "logps/rejected": -184.77059936523438, "loss": 0.9362, "nll_loss": 0.9289109706878662, "rewards/accuracies": 1.0, "rewards/chosen": 2.639953851699829, "rewards/margins": 9.656965255737305, "rewards/rejected": -7.0170111656188965, "step": 2075 }, { "epoch": 0.346, "grad_norm": 48.721412658691406, "learning_rate": 1.5204259056408045e-07, "logits/chosen": 2.513812303543091, "logits/rejected": 2.5158519744873047, "logps/chosen": -89.5790786743164, "logps/rejected": -147.70632934570312, "loss": 1.2664, "nll_loss": 1.1943875551223755, "rewards/accuracies": 1.0, "rewards/chosen": 0.646334171295166, "rewards/margins": 4.080994606018066, "rewards/rejected": -3.4346604347229004, "step": 2076 }, { "epoch": 0.3461666666666667, "grad_norm": 55.18900680541992, "learning_rate": 1.519964897295542e-07, "logits/chosen": 2.258429527282715, "logits/rejected": 2.374783754348755, "logps/chosen": -59.57508087158203, "logps/rejected": -124.5712890625, "loss": 1.3903, "nll_loss": 1.3238908052444458, "rewards/accuracies": 1.0, "rewards/chosen": 1.0989738702774048, "rewards/margins": 3.991060256958008, "rewards/rejected": -2.8920862674713135, "step": 2077 }, { "epoch": 0.3463333333333333, "grad_norm": 22.68537139892578, "learning_rate": 1.5195037374449914e-07, "logits/chosen": 3.9714934825897217, "logits/rejected": 4.137472629547119, "logps/chosen": -69.37171173095703, "logps/rejected": -152.58465576171875, "loss": 0.8171, "nll_loss": 0.8066478967666626, "rewards/accuracies": 1.0, "rewards/chosen": 2.288414716720581, "rewards/margins": 8.751333236694336, "rewards/rejected": -6.462918281555176, "step": 2078 }, { "epoch": 0.3465, "grad_norm": 62.96661376953125, "learning_rate": 1.519042426223524e-07, "logits/chosen": 2.429912805557251, "logits/rejected": 2.5677411556243896, "logps/chosen": -30.865751266479492, "logps/rejected": -161.87078857421875, "loss": 1.2827, "nll_loss": 1.2346301078796387, "rewards/accuracies": 1.0, "rewards/chosen": 0.6234323382377625, "rewards/margins": 7.119897365570068, "rewards/rejected": -6.49646520614624, "step": 2079 }, { "epoch": 0.3466666666666667, "grad_norm": 52.13422775268555, "learning_rate": 1.5185809637655547e-07, "logits/chosen": 2.178178310394287, "logits/rejected": 2.335197925567627, "logps/chosen": -12.82676887512207, "logps/rejected": -147.2464599609375, "loss": 0.6046, "nll_loss": 0.5830350518226624, "rewards/accuracies": 1.0, "rewards/chosen": 1.5614385604858398, "rewards/margins": 7.23652458190918, "rewards/rejected": -5.67508602142334, "step": 2080 }, { "epoch": 0.3468333333333333, "grad_norm": 28.007970809936523, "learning_rate": 1.5181193502055422e-07, "logits/chosen": 0.9475769400596619, "logits/rejected": 2.904874086380005, "logps/chosen": -52.509124755859375, "logps/rejected": -538.2291870117188, "loss": 0.72, "nll_loss": 0.6909095644950867, "rewards/accuracies": 1.0, "rewards/chosen": 1.1473846435546875, "rewards/margins": 9.784588813781738, "rewards/rejected": -8.63720417022705, "step": 2081 }, { "epoch": 0.347, "grad_norm": 31.026424407958984, "learning_rate": 1.5176575856779904e-07, "logits/chosen": 2.2401504516601562, "logits/rejected": 2.103450059890747, "logps/chosen": -76.12680053710938, "logps/rejected": -73.82099151611328, "loss": 0.8425, "nll_loss": 0.7848122119903564, "rewards/accuracies": 1.0, "rewards/chosen": 1.6509530544281006, "rewards/margins": 4.246183395385742, "rewards/rejected": -2.5952301025390625, "step": 2082 }, { "epoch": 0.3471666666666667, "grad_norm": 27.12986946105957, "learning_rate": 1.5171956703174452e-07, "logits/chosen": 2.583414077758789, "logits/rejected": 2.5969626903533936, "logps/chosen": -77.69255828857422, "logps/rejected": -147.1934814453125, "loss": 0.8063, "nll_loss": 0.7769256830215454, "rewards/accuracies": 1.0, "rewards/chosen": 1.3563019037246704, "rewards/margins": 6.047591209411621, "rewards/rejected": -4.69128942489624, "step": 2083 }, { "epoch": 0.3473333333333333, "grad_norm": 23.728429794311523, "learning_rate": 1.5167336042584988e-07, "logits/chosen": 2.952826499938965, "logits/rejected": 3.112199068069458, "logps/chosen": -21.60700225830078, "logps/rejected": -327.5435791015625, "loss": 0.4049, "nll_loss": 0.3858392834663391, "rewards/accuracies": 1.0, "rewards/chosen": 1.9286777973175049, "rewards/margins": 6.516705513000488, "rewards/rejected": -4.5880279541015625, "step": 2084 }, { "epoch": 0.3475, "grad_norm": 52.046592712402344, "learning_rate": 1.5162713876357859e-07, "logits/chosen": 2.722660779953003, "logits/rejected": 2.733732223510742, "logps/chosen": -15.895596504211426, "logps/rejected": -38.894039154052734, "loss": 0.7197, "nll_loss": 0.3973899185657501, "rewards/accuracies": 1.0, "rewards/chosen": 1.3063340187072754, "rewards/margins": 1.7417218685150146, "rewards/rejected": -0.43538782000541687, "step": 2085 }, { "epoch": 0.3476666666666667, "grad_norm": 117.47238159179688, "learning_rate": 1.5158090205839848e-07, "logits/chosen": 2.8245792388916016, "logits/rejected": 2.7896206378936768, "logps/chosen": -66.86422729492188, "logps/rejected": -9.01302719116211, "loss": 2.4468, "nll_loss": 1.7144674062728882, "rewards/accuracies": 1.0, "rewards/chosen": 1.1377838850021362, "rewards/margins": 0.48302900791168213, "rewards/rejected": 0.6547548770904541, "step": 2086 }, { "epoch": 0.3478333333333333, "grad_norm": 43.79777145385742, "learning_rate": 1.5153465032378183e-07, "logits/chosen": 3.2304251194000244, "logits/rejected": 3.187544822692871, "logps/chosen": -77.07243347167969, "logps/rejected": -40.89445114135742, "loss": 1.1368, "nll_loss": 1.0704503059387207, "rewards/accuracies": 1.0, "rewards/chosen": 0.8885917663574219, "rewards/margins": 4.077875137329102, "rewards/rejected": -3.1892833709716797, "step": 2087 }, { "epoch": 0.348, "grad_norm": 27.701478958129883, "learning_rate": 1.5148838357320535e-07, "logits/chosen": 2.673760175704956, "logits/rejected": 2.7422704696655273, "logps/chosen": -36.80453872680664, "logps/rejected": -246.20729064941406, "loss": 0.5946, "nll_loss": 0.5936216711997986, "rewards/accuracies": 1.0, "rewards/chosen": 4.88670015335083, "rewards/margins": 12.45463752746582, "rewards/rejected": -7.567936897277832, "step": 2088 }, { "epoch": 0.3481666666666667, "grad_norm": 26.992883682250977, "learning_rate": 1.5144210182014998e-07, "logits/chosen": 2.9338114261627197, "logits/rejected": 2.919976234436035, "logps/chosen": -48.18402862548828, "logps/rejected": -110.96430969238281, "loss": 0.6419, "nll_loss": 0.5876100659370422, "rewards/accuracies": 1.0, "rewards/chosen": 1.5633903741836548, "rewards/margins": 4.324372291564941, "rewards/rejected": -2.760982036590576, "step": 2089 }, { "epoch": 0.34833333333333333, "grad_norm": 30.548789978027344, "learning_rate": 1.5139580507810117e-07, "logits/chosen": 3.0440757274627686, "logits/rejected": 3.0644989013671875, "logps/chosen": -27.65410804748535, "logps/rejected": -67.90052795410156, "loss": 0.5584, "nll_loss": 0.476794958114624, "rewards/accuracies": 1.0, "rewards/chosen": 2.0407025814056396, "rewards/margins": 3.9297943115234375, "rewards/rejected": -1.8890916109085083, "step": 2090 }, { "epoch": 0.3485, "grad_norm": 31.80453872680664, "learning_rate": 1.5134949336054863e-07, "logits/chosen": 2.284658432006836, "logits/rejected": 2.4747819900512695, "logps/chosen": -36.36326599121094, "logps/rejected": -89.18841552734375, "loss": 0.6248, "nll_loss": 0.5865043997764587, "rewards/accuracies": 1.0, "rewards/chosen": 1.130022406578064, "rewards/margins": 5.444267272949219, "rewards/rejected": -4.314244747161865, "step": 2091 }, { "epoch": 0.3486666666666667, "grad_norm": 33.25831985473633, "learning_rate": 1.5130316668098652e-07, "logits/chosen": 3.24218487739563, "logits/rejected": 3.11078143119812, "logps/chosen": -51.669158935546875, "logps/rejected": -19.13104820251465, "loss": 0.9439, "nll_loss": 0.8201452493667603, "rewards/accuracies": 1.0, "rewards/chosen": 2.8633391857147217, "rewards/margins": 3.966648578643799, "rewards/rejected": -1.1033092737197876, "step": 2092 }, { "epoch": 0.34883333333333333, "grad_norm": 143.1815185546875, "learning_rate": 1.5125682505291332e-07, "logits/chosen": 1.6795114278793335, "logits/rejected": 1.9039928913116455, "logps/chosen": -20.15536880493164, "logps/rejected": -40.46891403198242, "loss": 3.7207, "nll_loss": 0.775206446647644, "rewards/accuracies": 0.0, "rewards/chosen": 2.3348560333251953, "rewards/margins": -2.245882511138916, "rewards/rejected": 4.580738544464111, "step": 2093 }, { "epoch": 0.349, "grad_norm": 86.97321319580078, "learning_rate": 1.5121046848983187e-07, "logits/chosen": 2.830242395401001, "logits/rejected": 2.9842936992645264, "logps/chosen": -139.8516082763672, "logps/rejected": -337.8026123046875, "loss": 1.4762, "nll_loss": 1.3070248365402222, "rewards/accuracies": 1.0, "rewards/chosen": -0.9869033694267273, "rewards/margins": 6.7554521560668945, "rewards/rejected": -7.7423553466796875, "step": 2094 }, { "epoch": 0.3491666666666667, "grad_norm": 34.007362365722656, "learning_rate": 1.5116409700524933e-07, "logits/chosen": 2.536165952682495, "logits/rejected": 2.878875970840454, "logps/chosen": -34.80310821533203, "logps/rejected": -403.69647216796875, "loss": 0.8241, "nll_loss": 0.7909797430038452, "rewards/accuracies": 1.0, "rewards/chosen": 1.016448974609375, "rewards/margins": 8.286142349243164, "rewards/rejected": -7.269692897796631, "step": 2095 }, { "epoch": 0.34933333333333333, "grad_norm": 27.890182495117188, "learning_rate": 1.5111771061267727e-07, "logits/chosen": 2.6645870208740234, "logits/rejected": 2.720529079437256, "logps/chosen": -54.13346481323242, "logps/rejected": -157.5923614501953, "loss": 0.8377, "nll_loss": 0.8202040195465088, "rewards/accuracies": 1.0, "rewards/chosen": 1.8007253408432007, "rewards/margins": 7.428112030029297, "rewards/rejected": -5.627386569976807, "step": 2096 }, { "epoch": 0.3495, "grad_norm": 29.049989700317383, "learning_rate": 1.510713093256315e-07, "logits/chosen": 1.210738182067871, "logits/rejected": 1.8656258583068848, "logps/chosen": -97.61183166503906, "logps/rejected": -270.1304626464844, "loss": 1.2509, "nll_loss": 1.2355924844741821, "rewards/accuracies": 1.0, "rewards/chosen": 3.092320203781128, "rewards/margins": 6.779026985168457, "rewards/rejected": -3.686706781387329, "step": 2097 }, { "epoch": 0.3496666666666667, "grad_norm": 25.591772079467773, "learning_rate": 1.5102489315763233e-07, "logits/chosen": 3.0873727798461914, "logits/rejected": 3.242671251296997, "logps/chosen": -69.71642303466797, "logps/rejected": -63.49414825439453, "loss": 0.8516, "nll_loss": 0.8299574851989746, "rewards/accuracies": 1.0, "rewards/chosen": 1.6149253845214844, "rewards/margins": 6.832492351531982, "rewards/rejected": -5.217566967010498, "step": 2098 }, { "epoch": 0.34983333333333333, "grad_norm": 39.627193450927734, "learning_rate": 1.5097846212220433e-07, "logits/chosen": 2.6799304485321045, "logits/rejected": 2.9217257499694824, "logps/chosen": -26.777130126953125, "logps/rejected": -216.79270935058594, "loss": 0.9961, "nll_loss": 0.9917454719543457, "rewards/accuracies": 1.0, "rewards/chosen": 4.018447399139404, "rewards/margins": 9.098429679870605, "rewards/rejected": -5.079982280731201, "step": 2099 }, { "epoch": 0.35, "grad_norm": 48.68954849243164, "learning_rate": 1.509320162328763e-07, "logits/chosen": 2.6413025856018066, "logits/rejected": 2.8060317039489746, "logps/chosen": -19.820079803466797, "logps/rejected": -362.1427001953125, "loss": 0.7965, "nll_loss": 0.7928030490875244, "rewards/accuracies": 1.0, "rewards/chosen": 3.5019123554229736, "rewards/margins": 9.865645408630371, "rewards/rejected": -6.363732814788818, "step": 2100 }, { "epoch": 0.3501666666666667, "grad_norm": 41.32299041748047, "learning_rate": 1.5088555550318153e-07, "logits/chosen": 3.5031886100769043, "logits/rejected": 3.4059886932373047, "logps/chosen": -75.82786560058594, "logps/rejected": -45.0159797668457, "loss": 0.9991, "nll_loss": 0.8920924663543701, "rewards/accuracies": 1.0, "rewards/chosen": 1.6692596673965454, "rewards/margins": 3.382622241973877, "rewards/rejected": -1.713362693786621, "step": 2101 }, { "epoch": 0.35033333333333333, "grad_norm": 129.56634521484375, "learning_rate": 1.5083907994665748e-07, "logits/chosen": 2.010006904602051, "logits/rejected": 2.2045037746429443, "logps/chosen": -35.14317321777344, "logps/rejected": -149.31707763671875, "loss": 2.6012, "nll_loss": 2.3428778648376465, "rewards/accuracies": 1.0, "rewards/chosen": -0.6352365612983704, "rewards/margins": 2.012495994567871, "rewards/rejected": -2.6477324962615967, "step": 2102 }, { "epoch": 0.3505, "grad_norm": 55.63848876953125, "learning_rate": 1.507925895768461e-07, "logits/chosen": 2.0310235023498535, "logits/rejected": 2.215726852416992, "logps/chosen": -14.051265716552734, "logps/rejected": -99.95506286621094, "loss": 0.5774, "nll_loss": 0.5620506405830383, "rewards/accuracies": 1.0, "rewards/chosen": 2.6364734172821045, "rewards/margins": 6.63400411605835, "rewards/rejected": -3.997530698776245, "step": 2103 }, { "epoch": 0.3506666666666667, "grad_norm": 81.44893646240234, "learning_rate": 1.507460844072935e-07, "logits/chosen": 2.9959588050842285, "logits/rejected": 2.939274311065674, "logps/chosen": -62.9163818359375, "logps/rejected": -137.64744567871094, "loss": 2.0098, "nll_loss": 1.6556944847106934, "rewards/accuracies": 1.0, "rewards/chosen": -2.197760820388794, "rewards/margins": 3.914604902267456, "rewards/rejected": -6.11236572265625, "step": 2104 }, { "epoch": 0.35083333333333333, "grad_norm": 169.42857360839844, "learning_rate": 1.5069956445155025e-07, "logits/chosen": 2.3967208862304688, "logits/rejected": 2.0368716716766357, "logps/chosen": -105.73704528808594, "logps/rejected": -19.56086540222168, "loss": 2.8025, "nll_loss": 1.2894762754440308, "rewards/accuracies": 0.0, "rewards/chosen": 0.8143356442451477, "rewards/margins": -0.8709258437156677, "rewards/rejected": 1.6852614879608154, "step": 2105 }, { "epoch": 0.351, "grad_norm": 38.52131652832031, "learning_rate": 1.5065302972317107e-07, "logits/chosen": 2.2109742164611816, "logits/rejected": 2.4097585678100586, "logps/chosen": -80.23641967773438, "logps/rejected": -146.03619384765625, "loss": 1.075, "nll_loss": 0.9905731081962585, "rewards/accuracies": 1.0, "rewards/chosen": 0.029639435932040215, "rewards/margins": 5.201854705810547, "rewards/rejected": -5.172215461730957, "step": 2106 }, { "epoch": 0.3511666666666667, "grad_norm": 35.379859924316406, "learning_rate": 1.506064802357151e-07, "logits/chosen": 2.474440574645996, "logits/rejected": 2.737053394317627, "logps/chosen": -23.967330932617188, "logps/rejected": -32.00160217285156, "loss": 0.7246, "nll_loss": 0.5845690965652466, "rewards/accuracies": 1.0, "rewards/chosen": 1.1591846942901611, "rewards/margins": 2.824528932571411, "rewards/rejected": -1.66534423828125, "step": 2107 }, { "epoch": 0.35133333333333333, "grad_norm": 61.19590377807617, "learning_rate": 1.5055991600274572e-07, "logits/chosen": 2.4686999320983887, "logits/rejected": 2.459479331970215, "logps/chosen": -19.730010986328125, "logps/rejected": -43.42339324951172, "loss": 0.7382, "nll_loss": 0.6364519596099854, "rewards/accuracies": 1.0, "rewards/chosen": 0.7023372650146484, "rewards/margins": 3.282043933868408, "rewards/rejected": -2.5797066688537598, "step": 2108 }, { "epoch": 0.3515, "grad_norm": 155.70431518554688, "learning_rate": 1.5051333703783066e-07, "logits/chosen": 2.721369504928589, "logits/rejected": 2.8122682571411133, "logps/chosen": -119.08134460449219, "logps/rejected": -305.79864501953125, "loss": 1.3015, "nll_loss": 1.1234090328216553, "rewards/accuracies": 1.0, "rewards/chosen": -0.9256684184074402, "rewards/margins": 4.00557279586792, "rewards/rejected": -4.931241035461426, "step": 2109 }, { "epoch": 0.3516666666666667, "grad_norm": 32.161434173583984, "learning_rate": 1.5046674335454188e-07, "logits/chosen": 2.842535972595215, "logits/rejected": 3.0019829273223877, "logps/chosen": -15.532295227050781, "logps/rejected": -210.34576416015625, "loss": 0.4231, "nll_loss": 0.3530067205429077, "rewards/accuracies": 1.0, "rewards/chosen": 1.3854389190673828, "rewards/margins": 3.891432523727417, "rewards/rejected": -2.505993604660034, "step": 2110 }, { "epoch": 0.35183333333333333, "grad_norm": 36.58574295043945, "learning_rate": 1.504201349664557e-07, "logits/chosen": 2.3628034591674805, "logits/rejected": 2.532430410385132, "logps/chosen": -14.449024200439453, "logps/rejected": -137.4219970703125, "loss": 0.4452, "nll_loss": 0.3612256348133087, "rewards/accuracies": 1.0, "rewards/chosen": 0.7030189633369446, "rewards/margins": 3.6576836109161377, "rewards/rejected": -2.954664707183838, "step": 2111 }, { "epoch": 0.352, "grad_norm": 120.51708221435547, "learning_rate": 1.5037351188715263e-07, "logits/chosen": 1.7458775043487549, "logits/rejected": 2.414280652999878, "logps/chosen": -41.70924377441406, "logps/rejected": -73.19837951660156, "loss": 1.2297, "nll_loss": 0.8021007180213928, "rewards/accuracies": 1.0, "rewards/chosen": -0.12135925889015198, "rewards/margins": 1.0450725555419922, "rewards/rejected": -1.1664317846298218, "step": 2112 }, { "epoch": 0.3521666666666667, "grad_norm": 65.82917785644531, "learning_rate": 1.503268741302176e-07, "logits/chosen": 2.356149911880493, "logits/rejected": 2.4407529830932617, "logps/chosen": -8.025195121765137, "logps/rejected": -179.38076782226562, "loss": 0.6319, "nll_loss": 0.6173226237297058, "rewards/accuracies": 1.0, "rewards/chosen": 1.9384526014328003, "rewards/margins": 8.174306869506836, "rewards/rejected": -6.235854148864746, "step": 2113 }, { "epoch": 0.35233333333333333, "grad_norm": 132.56382751464844, "learning_rate": 1.5028022170923964e-07, "logits/chosen": 2.1375796794891357, "logits/rejected": 2.1675989627838135, "logps/chosen": -67.59176635742188, "logps/rejected": -31.640256881713867, "loss": 1.9456, "nll_loss": 0.9519968032836914, "rewards/accuracies": 1.0, "rewards/chosen": 1.4301751852035522, "rewards/margins": 0.06001472473144531, "rewards/rejected": 1.370160460472107, "step": 2114 }, { "epoch": 0.3525, "grad_norm": 29.283809661865234, "learning_rate": 1.502335546378122e-07, "logits/chosen": 2.4398090839385986, "logits/rejected": 2.425950527191162, "logps/chosen": -34.226863861083984, "logps/rejected": -72.68807983398438, "loss": 0.7194, "nll_loss": 0.6985074281692505, "rewards/accuracies": 1.0, "rewards/chosen": 1.576186180114746, "rewards/margins": 7.429750919342041, "rewards/rejected": -5.853564739227295, "step": 2115 }, { "epoch": 0.3526666666666667, "grad_norm": 114.46229553222656, "learning_rate": 1.501868729295329e-07, "logits/chosen": 2.445382833480835, "logits/rejected": 2.5327391624450684, "logps/chosen": -117.99640655517578, "logps/rejected": -149.90594482421875, "loss": 1.5872, "nll_loss": 1.25528085231781, "rewards/accuracies": 1.0, "rewards/chosen": -1.5915511846542358, "rewards/margins": 2.2663536071777344, "rewards/rejected": -3.8579049110412598, "step": 2116 }, { "epoch": 0.35283333333333333, "grad_norm": 30.714385986328125, "learning_rate": 1.5014017659800375e-07, "logits/chosen": 1.8728559017181396, "logits/rejected": 2.246929407119751, "logps/chosen": -69.03398132324219, "logps/rejected": -196.29461669921875, "loss": 0.8959, "nll_loss": 0.8629246950149536, "rewards/accuracies": 1.0, "rewards/chosen": 1.0122673511505127, "rewards/margins": 8.714644432067871, "rewards/rejected": -7.7023773193359375, "step": 2117 }, { "epoch": 0.353, "grad_norm": 33.92218017578125, "learning_rate": 1.5009346565683086e-07, "logits/chosen": 2.7208094596862793, "logits/rejected": 2.777606248855591, "logps/chosen": -87.3554458618164, "logps/rejected": -152.12820434570312, "loss": 1.1883, "nll_loss": 1.1647391319274902, "rewards/accuracies": 1.0, "rewards/chosen": 1.3981735706329346, "rewards/margins": 8.002924919128418, "rewards/rejected": -6.6047515869140625, "step": 2118 }, { "epoch": 0.3531666666666667, "grad_norm": 35.43739700317383, "learning_rate": 1.500467401196247e-07, "logits/chosen": 2.1297590732574463, "logits/rejected": 2.4695448875427246, "logps/chosen": -31.28982162475586, "logps/rejected": -159.92645263671875, "loss": 0.6758, "nll_loss": 0.6017273664474487, "rewards/accuracies": 1.0, "rewards/chosen": 1.019500732421875, "rewards/margins": 3.798274278640747, "rewards/rejected": -2.778773546218872, "step": 2119 }, { "epoch": 0.35333333333333333, "grad_norm": 37.153778076171875, "learning_rate": 1.5e-07, "logits/chosen": 2.2991013526916504, "logits/rejected": 2.6164138317108154, "logps/chosen": -14.245555877685547, "logps/rejected": -110.63290405273438, "loss": 0.5436, "nll_loss": 0.5087698698043823, "rewards/accuracies": 1.0, "rewards/chosen": 1.3500763177871704, "rewards/margins": 5.362696170806885, "rewards/rejected": -4.012619972229004, "step": 2120 }, { "epoch": 0.3535, "grad_norm": 41.0416374206543, "learning_rate": 1.4995324531157568e-07, "logits/chosen": 2.5072383880615234, "logits/rejected": 2.516120433807373, "logps/chosen": -16.606056213378906, "logps/rejected": -165.5319366455078, "loss": 0.6568, "nll_loss": 0.6386944055557251, "rewards/accuracies": 1.0, "rewards/chosen": 2.689835786819458, "rewards/margins": 6.358101844787598, "rewards/rejected": -3.6682662963867188, "step": 2121 }, { "epoch": 0.3536666666666667, "grad_norm": 22.178651809692383, "learning_rate": 1.4990647606797492e-07, "logits/chosen": 1.9647880792617798, "logits/rejected": 2.3856253623962402, "logps/chosen": -17.176841735839844, "logps/rejected": -194.7012176513672, "loss": 0.3712, "nll_loss": 0.3578508794307709, "rewards/accuracies": 1.0, "rewards/chosen": 2.630497932434082, "rewards/margins": 6.907214641571045, "rewards/rejected": -4.276716709136963, "step": 2122 }, { "epoch": 0.35383333333333333, "grad_norm": 23.45485496520996, "learning_rate": 1.498596922828252e-07, "logits/chosen": 1.2860230207443237, "logits/rejected": 1.5579935312271118, "logps/chosen": -64.4491958618164, "logps/rejected": -108.77955627441406, "loss": 0.6092, "nll_loss": 0.5754392743110657, "rewards/accuracies": 1.0, "rewards/chosen": 1.4478005170822144, "rewards/margins": 5.3356194496154785, "rewards/rejected": -3.8878190517425537, "step": 2123 }, { "epoch": 0.354, "grad_norm": 39.401405334472656, "learning_rate": 1.4981289396975815e-07, "logits/chosen": 2.2992615699768066, "logits/rejected": 1.899722933769226, "logps/chosen": -88.84375, "logps/rejected": -79.76022338867188, "loss": 1.0919, "nll_loss": 1.0452207326889038, "rewards/accuracies": 1.0, "rewards/chosen": 1.8787033557891846, "rewards/margins": 4.631999969482422, "rewards/rejected": -2.753296375274658, "step": 2124 }, { "epoch": 0.3541666666666667, "grad_norm": 70.79818725585938, "learning_rate": 1.4976608114240969e-07, "logits/chosen": 2.3639450073242188, "logits/rejected": 2.414689779281616, "logps/chosen": -38.254188537597656, "logps/rejected": -113.62828826904297, "loss": 1.0076, "nll_loss": 0.9330288171768188, "rewards/accuracies": 1.0, "rewards/chosen": 0.9991150498390198, "rewards/margins": 3.79158878326416, "rewards/rejected": -2.792473793029785, "step": 2125 }, { "epoch": 0.35433333333333333, "grad_norm": 68.17687225341797, "learning_rate": 1.4971925381441997e-07, "logits/chosen": 3.055448293685913, "logits/rejected": 2.7997782230377197, "logps/chosen": -10.742103576660156, "logps/rejected": -12.587301254272461, "loss": 0.8948, "nll_loss": 0.2685525715351105, "rewards/accuracies": 1.0, "rewards/chosen": 0.9561603665351868, "rewards/margins": 0.6751013994216919, "rewards/rejected": 0.28105899691581726, "step": 2126 }, { "epoch": 0.3545, "grad_norm": 41.25550842285156, "learning_rate": 1.496724119994333e-07, "logits/chosen": 2.3397679328918457, "logits/rejected": 2.5079524517059326, "logps/chosen": -2.361362934112549, "logps/rejected": -65.03791809082031, "loss": 0.3142, "nll_loss": 0.1389036774635315, "rewards/accuracies": 1.0, "rewards/chosen": 0.17075243592262268, "rewards/margins": 2.4283347129821777, "rewards/rejected": -2.257582187652588, "step": 2127 }, { "epoch": 0.3546666666666667, "grad_norm": 26.154430389404297, "learning_rate": 1.4962555571109834e-07, "logits/chosen": 2.770113945007324, "logits/rejected": 3.201244354248047, "logps/chosen": -101.4062728881836, "logps/rejected": -415.1878356933594, "loss": 1.1267, "nll_loss": 1.0903900861740112, "rewards/accuracies": 1.0, "rewards/chosen": 1.3886444568634033, "rewards/margins": 5.193768501281738, "rewards/rejected": -3.805124282836914, "step": 2128 }, { "epoch": 0.35483333333333333, "grad_norm": 40.38204574584961, "learning_rate": 1.495786849630678e-07, "logits/chosen": 1.2298502922058105, "logits/rejected": 3.0060112476348877, "logps/chosen": -66.4819564819336, "logps/rejected": -464.4710693359375, "loss": 0.8933, "nll_loss": 0.8523328900337219, "rewards/accuracies": 1.0, "rewards/chosen": 1.3523048162460327, "rewards/margins": 4.907490253448486, "rewards/rejected": -3.555185317993164, "step": 2129 }, { "epoch": 0.355, "grad_norm": 89.011962890625, "learning_rate": 1.4953179976899877e-07, "logits/chosen": 2.7791988849639893, "logits/rejected": 2.815070390701294, "logps/chosen": -14.63555908203125, "logps/rejected": -139.5303955078125, "loss": 0.9422, "nll_loss": 0.9147224426269531, "rewards/accuracies": 1.0, "rewards/chosen": 1.9485543966293335, "rewards/margins": 5.545135021209717, "rewards/rejected": -3.5965805053710938, "step": 2130 }, { "epoch": 0.3551666666666667, "grad_norm": 37.41210174560547, "learning_rate": 1.494849001425524e-07, "logits/chosen": 2.609236717224121, "logits/rejected": 2.5890371799468994, "logps/chosen": -53.6396484375, "logps/rejected": -46.62845230102539, "loss": 0.9326, "nll_loss": 0.8793383836746216, "rewards/accuracies": 1.0, "rewards/chosen": 1.3137962818145752, "rewards/margins": 4.365097999572754, "rewards/rejected": -3.051301956176758, "step": 2131 }, { "epoch": 0.35533333333333333, "grad_norm": 135.90554809570312, "learning_rate": 1.4943798609739417e-07, "logits/chosen": 2.468092441558838, "logits/rejected": 2.428302049636841, "logps/chosen": -44.05976867675781, "logps/rejected": -8.873056411743164, "loss": 2.0126, "nll_loss": 0.5123228430747986, "rewards/accuracies": 0.0, "rewards/chosen": 1.0740631818771362, "rewards/margins": -0.7994074821472168, "rewards/rejected": 1.873470664024353, "step": 2132 }, { "epoch": 0.3555, "grad_norm": 22.652721405029297, "learning_rate": 1.4939105764719367e-07, "logits/chosen": 0.8981720209121704, "logits/rejected": 2.4671483039855957, "logps/chosen": -37.46159362792969, "logps/rejected": -316.23291015625, "loss": 0.4496, "nll_loss": 0.440724641084671, "rewards/accuracies": 1.0, "rewards/chosen": 2.375737428665161, "rewards/margins": 11.728267669677734, "rewards/rejected": -9.352530479431152, "step": 2133 }, { "epoch": 0.3556666666666667, "grad_norm": 32.71010971069336, "learning_rate": 1.4934411480562474e-07, "logits/chosen": 2.8454368114471436, "logits/rejected": 2.8693954944610596, "logps/chosen": -56.101139068603516, "logps/rejected": -165.6510009765625, "loss": 0.8334, "nll_loss": 0.7685087323188782, "rewards/accuracies": 1.0, "rewards/chosen": 0.6480922698974609, "rewards/margins": 4.378517150878906, "rewards/rejected": -3.730424642562866, "step": 2134 }, { "epoch": 0.35583333333333333, "grad_norm": 39.662872314453125, "learning_rate": 1.492971575863654e-07, "logits/chosen": 2.6226963996887207, "logits/rejected": 2.7569515705108643, "logps/chosen": -25.639352798461914, "logps/rejected": -87.04047393798828, "loss": 0.7146, "nll_loss": 0.6929555535316467, "rewards/accuracies": 1.0, "rewards/chosen": 1.6750802993774414, "rewards/margins": 6.551628112792969, "rewards/rejected": -4.876547813415527, "step": 2135 }, { "epoch": 0.356, "grad_norm": 60.33328628540039, "learning_rate": 1.4925018600309781e-07, "logits/chosen": 1.8828195333480835, "logits/rejected": 1.3138728141784668, "logps/chosen": -60.4963493347168, "logps/rejected": -18.58591651916504, "loss": 1.218, "nll_loss": 0.5550124049186707, "rewards/accuracies": 1.0, "rewards/chosen": 2.87406063079834, "rewards/margins": 1.3377467393875122, "rewards/rejected": 1.5363138914108276, "step": 2136 }, { "epoch": 0.3561666666666667, "grad_norm": 118.86309814453125, "learning_rate": 1.4920320006950845e-07, "logits/chosen": 3.335430145263672, "logits/rejected": 3.1455507278442383, "logps/chosen": -98.51545715332031, "logps/rejected": -42.64940643310547, "loss": 2.1266, "nll_loss": 1.5637375116348267, "rewards/accuracies": 1.0, "rewards/chosen": 1.7854232788085938, "rewards/margins": 1.1208994388580322, "rewards/rejected": 0.6645237803459167, "step": 2137 }, { "epoch": 0.35633333333333334, "grad_norm": 22.394149780273438, "learning_rate": 1.4915619979928784e-07, "logits/chosen": 1.969616413116455, "logits/rejected": 1.9442027807235718, "logps/chosen": -52.863548278808594, "logps/rejected": -120.86436462402344, "loss": 0.601, "nll_loss": 0.5746037364006042, "rewards/accuracies": 1.0, "rewards/chosen": 1.3682472705841064, "rewards/margins": 6.690778732299805, "rewards/rejected": -5.322531700134277, "step": 2138 }, { "epoch": 0.3565, "grad_norm": 41.59381866455078, "learning_rate": 1.491091852061307e-07, "logits/chosen": 2.2844133377075195, "logits/rejected": 1.793014407157898, "logps/chosen": -109.33125305175781, "logps/rejected": -56.76336669921875, "loss": 1.2119, "nll_loss": 1.1508551836013794, "rewards/accuracies": 1.0, "rewards/chosen": 1.043962836265564, "rewards/margins": 4.175824165344238, "rewards/rejected": -3.131861448287964, "step": 2139 }, { "epoch": 0.3566666666666667, "grad_norm": 25.75110626220703, "learning_rate": 1.4906215630373603e-07, "logits/chosen": 1.528623104095459, "logits/rejected": 2.269388198852539, "logps/chosen": -64.94320678710938, "logps/rejected": -144.43272399902344, "loss": 0.8088, "nll_loss": 0.7824482917785645, "rewards/accuracies": 1.0, "rewards/chosen": 1.4333817958831787, "rewards/margins": 6.337423324584961, "rewards/rejected": -4.904041290283203, "step": 2140 }, { "epoch": 0.35683333333333334, "grad_norm": 40.10688018798828, "learning_rate": 1.4901511310580688e-07, "logits/chosen": 1.6240592002868652, "logits/rejected": 2.067220449447632, "logps/chosen": -56.29207229614258, "logps/rejected": -179.5313262939453, "loss": 0.9891, "nll_loss": 0.9079365730285645, "rewards/accuracies": 1.0, "rewards/chosen": 0.17361414432525635, "rewards/margins": 4.620161056518555, "rewards/rejected": -4.446547031402588, "step": 2141 }, { "epoch": 0.357, "grad_norm": 24.092174530029297, "learning_rate": 1.489680556260505e-07, "logits/chosen": 2.6575496196746826, "logits/rejected": 2.5919594764709473, "logps/chosen": -127.51542663574219, "logps/rejected": -75.17900085449219, "loss": 0.6846, "nll_loss": 0.6014878749847412, "rewards/accuracies": 1.0, "rewards/chosen": 2.2772903442382812, "rewards/margins": 4.055698871612549, "rewards/rejected": -1.778408408164978, "step": 2142 }, { "epoch": 0.3571666666666667, "grad_norm": 52.01390075683594, "learning_rate": 1.4892098387817833e-07, "logits/chosen": 0.5843035578727722, "logits/rejected": 2.2968404293060303, "logps/chosen": -40.03093338012695, "logps/rejected": -342.005126953125, "loss": 1.0956, "nll_loss": 1.0264344215393066, "rewards/accuracies": 1.0, "rewards/chosen": 0.24227601289749146, "rewards/margins": 5.676071643829346, "rewards/rejected": -5.43379545211792, "step": 2143 }, { "epoch": 0.35733333333333334, "grad_norm": 55.470951080322266, "learning_rate": 1.4887389787590593e-07, "logits/chosen": 2.6623470783233643, "logits/rejected": 2.809156894683838, "logps/chosen": -117.77875518798828, "logps/rejected": -309.4931945800781, "loss": 1.268, "nll_loss": 1.111120343208313, "rewards/accuracies": 1.0, "rewards/chosen": -0.7954093813896179, "rewards/margins": 4.505286693572998, "rewards/rejected": -5.300695896148682, "step": 2144 }, { "epoch": 0.3575, "grad_norm": 23.063405990600586, "learning_rate": 1.4882679763295304e-07, "logits/chosen": 2.2941975593566895, "logits/rejected": 2.5738348960876465, "logps/chosen": -51.123329162597656, "logps/rejected": -363.73541259765625, "loss": 0.5753, "nll_loss": 0.5680369734764099, "rewards/accuracies": 1.0, "rewards/chosen": 2.580465078353882, "rewards/margins": 11.482714653015137, "rewards/rejected": -8.902249336242676, "step": 2145 }, { "epoch": 0.3576666666666667, "grad_norm": 33.238792419433594, "learning_rate": 1.4877968316304355e-07, "logits/chosen": 1.74765944480896, "logits/rejected": 1.353230357170105, "logps/chosen": -90.49761962890625, "logps/rejected": -43.5081787109375, "loss": 0.9837, "nll_loss": 0.9426836371421814, "rewards/accuracies": 1.0, "rewards/chosen": 2.160936117172241, "rewards/margins": 4.926543712615967, "rewards/rejected": -2.7656075954437256, "step": 2146 }, { "epoch": 0.35783333333333334, "grad_norm": 23.90665626525879, "learning_rate": 1.4873255447990545e-07, "logits/chosen": 2.945202112197876, "logits/rejected": 3.098599672317505, "logps/chosen": -29.200740814208984, "logps/rejected": -204.79690551757812, "loss": 0.5018, "nll_loss": 0.4866790473461151, "rewards/accuracies": 1.0, "rewards/chosen": 1.9263302087783813, "rewards/margins": 7.776688575744629, "rewards/rejected": -5.850358486175537, "step": 2147 }, { "epoch": 0.358, "grad_norm": 52.37460708618164, "learning_rate": 1.4868541159727096e-07, "logits/chosen": 2.332712173461914, "logits/rejected": 2.590284824371338, "logps/chosen": -54.84687423706055, "logps/rejected": -217.1329345703125, "loss": 1.6686, "nll_loss": 1.6620265245437622, "rewards/accuracies": 1.0, "rewards/chosen": 2.778489351272583, "rewards/margins": 9.35722827911377, "rewards/rejected": -6.578738689422607, "step": 2148 }, { "epoch": 0.3581666666666667, "grad_norm": 51.779571533203125, "learning_rate": 1.4863825452887634e-07, "logits/chosen": 2.4029672145843506, "logits/rejected": 2.382218837738037, "logps/chosen": -77.40959167480469, "logps/rejected": -101.49690246582031, "loss": 1.3295, "nll_loss": 1.2095248699188232, "rewards/accuracies": 1.0, "rewards/chosen": 1.6789827346801758, "rewards/margins": 3.2400431632995605, "rewards/rejected": -1.5610604286193848, "step": 2149 }, { "epoch": 0.35833333333333334, "grad_norm": 16.765865325927734, "learning_rate": 1.4859108328846204e-07, "logits/chosen": 1.232790470123291, "logits/rejected": 2.2824697494506836, "logps/chosen": -35.385520935058594, "logps/rejected": -269.40283203125, "loss": 0.4074, "nll_loss": 0.3931724429130554, "rewards/accuracies": 1.0, "rewards/chosen": 2.1074604988098145, "rewards/margins": 7.318150997161865, "rewards/rejected": -5.210690498352051, "step": 2150 }, { "epoch": 0.3585, "grad_norm": 31.11652946472168, "learning_rate": 1.4854389788977265e-07, "logits/chosen": 2.9065632820129395, "logits/rejected": 2.9163100719451904, "logps/chosen": -27.192880630493164, "logps/rejected": -51.4798469543457, "loss": 0.6643, "nll_loss": 0.6323925256729126, "rewards/accuracies": 1.0, "rewards/chosen": 2.331568479537964, "rewards/margins": 5.364760398864746, "rewards/rejected": -3.0331921577453613, "step": 2151 }, { "epoch": 0.3586666666666667, "grad_norm": 23.801774978637695, "learning_rate": 1.484966983465568e-07, "logits/chosen": 1.766994833946228, "logits/rejected": 2.3313636779785156, "logps/chosen": -48.672523498535156, "logps/rejected": -203.98580932617188, "loss": 0.6582, "nll_loss": 0.6321108341217041, "rewards/accuracies": 1.0, "rewards/chosen": 1.3977855443954468, "rewards/margins": 6.574739456176758, "rewards/rejected": -5.1769537925720215, "step": 2152 }, { "epoch": 0.35883333333333334, "grad_norm": 70.3598403930664, "learning_rate": 1.484494846725674e-07, "logits/chosen": 1.5525462627410889, "logits/rejected": 1.6390990018844604, "logps/chosen": -18.55902862548828, "logps/rejected": -8.981237411499023, "loss": 1.1566, "nll_loss": 0.32559701800346375, "rewards/accuracies": 1.0, "rewards/chosen": 0.7883041501045227, "rewards/margins": 0.1952468752861023, "rewards/rejected": 0.5930572748184204, "step": 2153 }, { "epoch": 0.359, "grad_norm": 38.09634780883789, "learning_rate": 1.484022568815613e-07, "logits/chosen": 2.1113569736480713, "logits/rejected": 2.335815191268921, "logps/chosen": -51.650821685791016, "logps/rejected": -91.36151885986328, "loss": 0.6661, "nll_loss": 0.5436928868293762, "rewards/accuracies": 1.0, "rewards/chosen": 0.9092190265655518, "rewards/margins": 2.9688146114349365, "rewards/rejected": -2.0595955848693848, "step": 2154 }, { "epoch": 0.3591666666666667, "grad_norm": 26.565431594848633, "learning_rate": 1.4835501498729957e-07, "logits/chosen": 2.5444414615631104, "logits/rejected": 2.7357470989227295, "logps/chosen": -44.02781677246094, "logps/rejected": -201.58892822265625, "loss": 0.6759, "nll_loss": 0.657131552696228, "rewards/accuracies": 1.0, "rewards/chosen": 2.1424057483673096, "rewards/margins": 6.3199872970581055, "rewards/rejected": -4.177581787109375, "step": 2155 }, { "epoch": 0.35933333333333334, "grad_norm": 26.565170288085938, "learning_rate": 1.4830775900354735e-07, "logits/chosen": 2.4602677822113037, "logits/rejected": 2.668576240539551, "logps/chosen": -112.05419158935547, "logps/rejected": -535.483642578125, "loss": 1.0566, "nll_loss": 1.0472354888916016, "rewards/accuracies": 1.0, "rewards/chosen": 2.3145623207092285, "rewards/margins": 15.19342041015625, "rewards/rejected": -12.878857612609863, "step": 2156 }, { "epoch": 0.3595, "grad_norm": 43.11827850341797, "learning_rate": 1.4826048894407394e-07, "logits/chosen": 2.1273956298828125, "logits/rejected": 2.1914634704589844, "logps/chosen": -44.28758239746094, "logps/rejected": -226.43759155273438, "loss": 1.0326, "nll_loss": 1.0299437046051025, "rewards/accuracies": 1.0, "rewards/chosen": 4.568733215332031, "rewards/margins": 10.110811233520508, "rewards/rejected": -5.542077541351318, "step": 2157 }, { "epoch": 0.3596666666666667, "grad_norm": 49.84091567993164, "learning_rate": 1.4821320482265267e-07, "logits/chosen": 2.901390790939331, "logits/rejected": 2.8889782428741455, "logps/chosen": -109.03739929199219, "logps/rejected": -69.69206237792969, "loss": 1.2462, "nll_loss": 1.1851892471313477, "rewards/accuracies": 1.0, "rewards/chosen": 1.4884048700332642, "rewards/margins": 4.12821102142334, "rewards/rejected": -2.639806032180786, "step": 2158 }, { "epoch": 0.35983333333333334, "grad_norm": 29.055498123168945, "learning_rate": 1.48165906653061e-07, "logits/chosen": 2.238501787185669, "logits/rejected": 1.925646185874939, "logps/chosen": -94.57765197753906, "logps/rejected": -91.46057891845703, "loss": 0.971, "nll_loss": 0.9364123940467834, "rewards/accuracies": 1.0, "rewards/chosen": 1.9757111072540283, "rewards/margins": 5.12928581237793, "rewards/rejected": -3.1535744667053223, "step": 2159 }, { "epoch": 0.36, "grad_norm": 36.095829010009766, "learning_rate": 1.481185944490805e-07, "logits/chosen": 3.03905987739563, "logits/rejected": 3.065030813217163, "logps/chosen": -52.422176361083984, "logps/rejected": -189.312255859375, "loss": 0.8474, "nll_loss": 0.7942754626274109, "rewards/accuracies": 1.0, "rewards/chosen": 0.539459228515625, "rewards/margins": 6.086129665374756, "rewards/rejected": -5.546670436859131, "step": 2160 }, { "epoch": 0.3601666666666667, "grad_norm": 34.258819580078125, "learning_rate": 1.480712682244968e-07, "logits/chosen": 2.136139154434204, "logits/rejected": 2.48654842376709, "logps/chosen": -88.32818603515625, "logps/rejected": -142.59751892089844, "loss": 0.9285, "nll_loss": 0.8575551509857178, "rewards/accuracies": 1.0, "rewards/chosen": 1.821232795715332, "rewards/margins": 4.0127716064453125, "rewards/rejected": -2.1915390491485596, "step": 2161 }, { "epoch": 0.36033333333333334, "grad_norm": 124.49011993408203, "learning_rate": 1.4802392799309957e-07, "logits/chosen": 2.156689167022705, "logits/rejected": 2.0403757095336914, "logps/chosen": -121.21333312988281, "logps/rejected": -59.63800048828125, "loss": 1.9087, "nll_loss": 1.3619475364685059, "rewards/accuracies": 1.0, "rewards/chosen": -2.4145493507385254, "rewards/margins": 1.4603805541992188, "rewards/rejected": -3.874929904937744, "step": 2162 }, { "epoch": 0.3605, "grad_norm": 26.819761276245117, "learning_rate": 1.479765737686827e-07, "logits/chosen": 1.9233399629592896, "logits/rejected": 2.5925865173339844, "logps/chosen": -77.41438293457031, "logps/rejected": -45.872169494628906, "loss": 0.7432, "nll_loss": 0.7102237939834595, "rewards/accuracies": 1.0, "rewards/chosen": 1.49021315574646, "rewards/margins": 5.3530988693237305, "rewards/rejected": -3.8628859519958496, "step": 2163 }, { "epoch": 0.3606666666666667, "grad_norm": 30.3223934173584, "learning_rate": 1.4792920556504404e-07, "logits/chosen": 2.946551561355591, "logits/rejected": 3.0093021392822266, "logps/chosen": -79.68988037109375, "logps/rejected": -251.60142517089844, "loss": 1.1191, "nll_loss": 1.091642141342163, "rewards/accuracies": 1.0, "rewards/chosen": 1.5286530256271362, "rewards/margins": 5.8752827644348145, "rewards/rejected": -4.346629619598389, "step": 2164 }, { "epoch": 0.36083333333333334, "grad_norm": 27.409847259521484, "learning_rate": 1.4788182339598557e-07, "logits/chosen": 3.290856122970581, "logits/rejected": 3.3913979530334473, "logps/chosen": -24.810546875, "logps/rejected": -102.35539245605469, "loss": 0.5174, "nll_loss": 0.4594545364379883, "rewards/accuracies": 1.0, "rewards/chosen": 2.0999577045440674, "rewards/margins": 4.423398971557617, "rewards/rejected": -2.32344126701355, "step": 2165 }, { "epoch": 0.361, "grad_norm": 28.276290893554688, "learning_rate": 1.4783442727531325e-07, "logits/chosen": 2.8802924156188965, "logits/rejected": 3.0045878887176514, "logps/chosen": -92.69271850585938, "logps/rejected": -242.6569366455078, "loss": 1.064, "nll_loss": 1.0533264875411987, "rewards/accuracies": 1.0, "rewards/chosen": 2.5981948375701904, "rewards/margins": 7.478106498718262, "rewards/rejected": -4.87991189956665, "step": 2166 }, { "epoch": 0.3611666666666667, "grad_norm": 12.483288764953613, "learning_rate": 1.4778701721683727e-07, "logits/chosen": 2.5782389640808105, "logits/rejected": 2.6826694011688232, "logps/chosen": -128.39810180664062, "logps/rejected": -232.0280303955078, "loss": 0.5321, "nll_loss": 0.5262218117713928, "rewards/accuracies": 1.0, "rewards/chosen": 2.9749085903167725, "rewards/margins": 9.096717834472656, "rewards/rejected": -6.121809482574463, "step": 2167 }, { "epoch": 0.36133333333333334, "grad_norm": 29.290821075439453, "learning_rate": 1.477395932343717e-07, "logits/chosen": 2.763550281524658, "logits/rejected": 2.74904465675354, "logps/chosen": -66.19328308105469, "logps/rejected": -82.94407653808594, "loss": 0.9315, "nll_loss": 0.9193510413169861, "rewards/accuracies": 1.0, "rewards/chosen": 2.0977065563201904, "rewards/margins": 8.826054573059082, "rewards/rejected": -6.7283477783203125, "step": 2168 }, { "epoch": 0.3615, "grad_norm": 27.658000946044922, "learning_rate": 1.4769215534173473e-07, "logits/chosen": 2.0692026615142822, "logits/rejected": 2.180257558822632, "logps/chosen": -62.179969787597656, "logps/rejected": -264.7811279296875, "loss": 0.8417, "nll_loss": 0.829066276550293, "rewards/accuracies": 1.0, "rewards/chosen": 1.996099829673767, "rewards/margins": 13.107902526855469, "rewards/rejected": -11.11180305480957, "step": 2169 }, { "epoch": 0.3616666666666667, "grad_norm": 345.6463317871094, "learning_rate": 1.4764470355274874e-07, "logits/chosen": 4.262783527374268, "logits/rejected": 4.063057899475098, "logps/chosen": -212.7130584716797, "logps/rejected": -95.022705078125, "loss": 3.439, "nll_loss": 1.2085968255996704, "rewards/accuracies": 0.0, "rewards/chosen": 0.2199600338935852, "rewards/margins": -1.8458147048950195, "rewards/rejected": 2.06577467918396, "step": 2170 }, { "epoch": 0.36183333333333334, "grad_norm": 42.52238464355469, "learning_rate": 1.4759723788123996e-07, "logits/chosen": 2.968590259552002, "logits/rejected": 2.821005344390869, "logps/chosen": -24.025615692138672, "logps/rejected": -57.754478454589844, "loss": 0.9359, "nll_loss": 0.8898376226425171, "rewards/accuracies": 1.0, "rewards/chosen": 3.7060890197753906, "rewards/margins": 5.901087760925293, "rewards/rejected": -2.1949985027313232, "step": 2171 }, { "epoch": 0.362, "grad_norm": 60.35470199584961, "learning_rate": 1.4754975834103875e-07, "logits/chosen": 3.4705049991607666, "logits/rejected": 3.6539387702941895, "logps/chosen": -190.83935546875, "logps/rejected": -324.53558349609375, "loss": 1.1881, "nll_loss": 1.1160197257995605, "rewards/accuracies": 1.0, "rewards/chosen": 0.10601501911878586, "rewards/margins": 9.330751419067383, "rewards/rejected": -9.224736213684082, "step": 2172 }, { "epoch": 0.3621666666666667, "grad_norm": 33.500972747802734, "learning_rate": 1.4750226494597952e-07, "logits/chosen": 1.8964394330978394, "logits/rejected": 2.365324020385742, "logps/chosen": -19.042118072509766, "logps/rejected": -90.81804656982422, "loss": 0.5349, "nll_loss": 0.46444180607795715, "rewards/accuracies": 1.0, "rewards/chosen": 2.099904775619507, "rewards/margins": 4.166381359100342, "rewards/rejected": -2.066476583480835, "step": 2173 }, { "epoch": 0.36233333333333334, "grad_norm": 174.8185272216797, "learning_rate": 1.4745475770990073e-07, "logits/chosen": 1.976157307624817, "logits/rejected": 1.675297737121582, "logps/chosen": -35.84132766723633, "logps/rejected": -13.83777141571045, "loss": 2.8368, "nll_loss": 0.9955923557281494, "rewards/accuracies": 0.0, "rewards/chosen": 0.43075066804885864, "rewards/margins": -1.3555457592010498, "rewards/rejected": 1.7862964868545532, "step": 2174 }, { "epoch": 0.3625, "grad_norm": 30.49842643737793, "learning_rate": 1.474072366466448e-07, "logits/chosen": 1.9800965785980225, "logits/rejected": 2.079777956008911, "logps/chosen": -20.725759506225586, "logps/rejected": -70.2760009765625, "loss": 0.5668, "nll_loss": 0.5314297676086426, "rewards/accuracies": 1.0, "rewards/chosen": 1.6336320638656616, "rewards/margins": 5.100099563598633, "rewards/rejected": -3.4664676189422607, "step": 2175 }, { "epoch": 0.3626666666666667, "grad_norm": 256.3053283691406, "learning_rate": 1.4735970177005826e-07, "logits/chosen": 1.6135119199752808, "logits/rejected": 2.2861666679382324, "logps/chosen": -13.627232551574707, "logps/rejected": -115.99857330322266, "loss": 2.8325, "nll_loss": 0.5678014159202576, "rewards/accuracies": 0.0, "rewards/chosen": 0.6222350001335144, "rewards/margins": -1.8200128078460693, "rewards/rejected": 2.4422478675842285, "step": 2176 }, { "epoch": 0.36283333333333334, "grad_norm": 25.729717254638672, "learning_rate": 1.473121530939916e-07, "logits/chosen": 2.454794406890869, "logits/rejected": 2.445340633392334, "logps/chosen": -23.563514709472656, "logps/rejected": -125.86512756347656, "loss": 0.4728, "nll_loss": 0.45314452052116394, "rewards/accuracies": 1.0, "rewards/chosen": 2.222715377807617, "rewards/margins": 6.1717729568481445, "rewards/rejected": -3.9490578174591064, "step": 2177 }, { "epoch": 0.363, "grad_norm": 34.50965118408203, "learning_rate": 1.4726459063229944e-07, "logits/chosen": 0.5578843355178833, "logits/rejected": 1.6292649507522583, "logps/chosen": -51.75988006591797, "logps/rejected": -356.36871337890625, "loss": 0.8719, "nll_loss": 0.8348366618156433, "rewards/accuracies": 1.0, "rewards/chosen": 0.8619117736816406, "rewards/margins": 11.922220230102539, "rewards/rejected": -11.060308456420898, "step": 2178 }, { "epoch": 0.3631666666666667, "grad_norm": 37.62104034423828, "learning_rate": 1.4721701439884022e-07, "logits/chosen": 2.049408435821533, "logits/rejected": 2.0651423931121826, "logps/chosen": -101.92660522460938, "logps/rejected": -104.69367980957031, "loss": 1.1116, "nll_loss": 1.0617355108261108, "rewards/accuracies": 1.0, "rewards/chosen": 1.4853638410568237, "rewards/margins": 4.4678263664245605, "rewards/rejected": -2.9824624061584473, "step": 2179 }, { "epoch": 0.36333333333333334, "grad_norm": 40.203548431396484, "learning_rate": 1.4716942440747662e-07, "logits/chosen": 1.5684173107147217, "logits/rejected": 2.338798761367798, "logps/chosen": -16.543262481689453, "logps/rejected": -151.58233642578125, "loss": 0.6453, "nll_loss": 0.6127133965492249, "rewards/accuracies": 1.0, "rewards/chosen": 1.141824722290039, "rewards/margins": 6.263245582580566, "rewards/rejected": -5.121420860290527, "step": 2180 }, { "epoch": 0.3635, "grad_norm": 30.88935661315918, "learning_rate": 1.4712182067207514e-07, "logits/chosen": 2.5377888679504395, "logits/rejected": 2.517726182937622, "logps/chosen": -105.05021667480469, "logps/rejected": -73.48145294189453, "loss": 1.1212, "nll_loss": 1.0719408988952637, "rewards/accuracies": 1.0, "rewards/chosen": 1.393441915512085, "rewards/margins": 4.496579170227051, "rewards/rejected": -3.103137254714966, "step": 2181 }, { "epoch": 0.3636666666666667, "grad_norm": 128.62168884277344, "learning_rate": 1.4707420320650643e-07, "logits/chosen": 2.432082414627075, "logits/rejected": 2.6411452293395996, "logps/chosen": -21.833328247070312, "logps/rejected": -198.60403442382812, "loss": 0.8279, "nll_loss": 0.519841194152832, "rewards/accuracies": 1.0, "rewards/chosen": 1.1958694458007812, "rewards/margins": 1.769758701324463, "rewards/rejected": -0.5738891959190369, "step": 2182 }, { "epoch": 0.36383333333333334, "grad_norm": 25.20392608642578, "learning_rate": 1.4702657202464503e-07, "logits/chosen": 1.4126092195510864, "logits/rejected": 2.257333993911743, "logps/chosen": -50.69597625732422, "logps/rejected": -182.22735595703125, "loss": 0.7319, "nll_loss": 0.7140278220176697, "rewards/accuracies": 1.0, "rewards/chosen": 2.100788116455078, "rewards/margins": 6.472446441650391, "rewards/rejected": -4.3716583251953125, "step": 2183 }, { "epoch": 0.364, "grad_norm": 23.073522567749023, "learning_rate": 1.4697892714036957e-07, "logits/chosen": 1.0074596405029297, "logits/rejected": 2.0832934379577637, "logps/chosen": -32.91557312011719, "logps/rejected": -216.02635192871094, "loss": 0.5316, "nll_loss": 0.5224694013595581, "rewards/accuracies": 1.0, "rewards/chosen": 2.3725152015686035, "rewards/margins": 9.6607666015625, "rewards/rejected": -7.288250923156738, "step": 2184 }, { "epoch": 0.3641666666666667, "grad_norm": 26.23186492919922, "learning_rate": 1.4693126856756258e-07, "logits/chosen": 2.7317745685577393, "logits/rejected": 2.663970708847046, "logps/chosen": -69.3524169921875, "logps/rejected": -39.55632400512695, "loss": 0.844, "nll_loss": 0.8064233660697937, "rewards/accuracies": 1.0, "rewards/chosen": 2.3788528442382812, "rewards/margins": 5.160189628601074, "rewards/rejected": -2.781336545944214, "step": 2185 }, { "epoch": 0.36433333333333334, "grad_norm": 37.286949157714844, "learning_rate": 1.4688359632011064e-07, "logits/chosen": 2.73134183883667, "logits/rejected": 2.9563028812408447, "logps/chosen": -104.57588195800781, "logps/rejected": -195.2185516357422, "loss": 1.2123, "nll_loss": 1.1883624792099, "rewards/accuracies": 1.0, "rewards/chosen": 1.363057017326355, "rewards/margins": 8.082867622375488, "rewards/rejected": -6.719810485839844, "step": 2186 }, { "epoch": 0.3645, "grad_norm": 60.059547424316406, "learning_rate": 1.468359104119043e-07, "logits/chosen": 2.9431114196777344, "logits/rejected": 2.9000468254089355, "logps/chosen": -16.435335159301758, "logps/rejected": -193.89605712890625, "loss": 0.7522, "nll_loss": 0.7470607757568359, "rewards/accuracies": 1.0, "rewards/chosen": 3.164095878601074, "rewards/margins": 9.124565124511719, "rewards/rejected": -5.9604692459106445, "step": 2187 }, { "epoch": 0.36466666666666664, "grad_norm": 36.92555236816406, "learning_rate": 1.467882108568381e-07, "logits/chosen": 2.065001964569092, "logits/rejected": 2.159071683883667, "logps/chosen": -26.08876609802246, "logps/rejected": -86.25679779052734, "loss": 0.5302, "nll_loss": 0.45769768953323364, "rewards/accuracies": 1.0, "rewards/chosen": 0.6783083081245422, "rewards/margins": 4.004073619842529, "rewards/rejected": -3.325765371322632, "step": 2188 }, { "epoch": 0.36483333333333334, "grad_norm": 62.55386734008789, "learning_rate": 1.4674049766881052e-07, "logits/chosen": 3.0619606971740723, "logits/rejected": 2.9914491176605225, "logps/chosen": -54.906158447265625, "logps/rejected": -61.95521926879883, "loss": 1.8814, "nll_loss": 1.8302052021026611, "rewards/accuracies": 1.0, "rewards/chosen": 1.4585602283477783, "rewards/margins": 4.421906471252441, "rewards/rejected": -2.963346004486084, "step": 2189 }, { "epoch": 0.365, "grad_norm": 51.615020751953125, "learning_rate": 1.4669277086172403e-07, "logits/chosen": 1.1835999488830566, "logits/rejected": 2.1022017002105713, "logps/chosen": -52.83831787109375, "logps/rejected": -225.9739990234375, "loss": 1.0739, "nll_loss": 1.0161213874816895, "rewards/accuracies": 1.0, "rewards/chosen": 0.3596252501010895, "rewards/margins": 9.518598556518555, "rewards/rejected": -9.158973693847656, "step": 2190 }, { "epoch": 0.36516666666666664, "grad_norm": 13.065043449401855, "learning_rate": 1.4664503044948516e-07, "logits/chosen": 2.2507009506225586, "logits/rejected": 2.2586562633514404, "logps/chosen": -173.27096557617188, "logps/rejected": -203.24600219726562, "loss": 0.6631, "nll_loss": 0.6588250994682312, "rewards/accuracies": 1.0, "rewards/chosen": 3.7784485816955566, "rewards/margins": 9.10134506225586, "rewards/rejected": -5.3228960037231445, "step": 2191 }, { "epoch": 0.36533333333333334, "grad_norm": 27.704179763793945, "learning_rate": 1.4659727644600421e-07, "logits/chosen": 2.197002649307251, "logits/rejected": 2.8448379039764404, "logps/chosen": -86.90975952148438, "logps/rejected": -66.67646026611328, "loss": 0.9056, "nll_loss": 0.877876341342926, "rewards/accuracies": 1.0, "rewards/chosen": 2.1579270362854004, "rewards/margins": 5.535869598388672, "rewards/rejected": -3.3779428005218506, "step": 2192 }, { "epoch": 0.3655, "grad_norm": 12.645252227783203, "learning_rate": 1.4654950886519562e-07, "logits/chosen": 2.240778684616089, "logits/rejected": 2.1684765815734863, "logps/chosen": -195.35877990722656, "logps/rejected": -235.25247192382812, "loss": 0.7077, "nll_loss": 0.7027295231819153, "rewards/accuracies": 1.0, "rewards/chosen": 3.1362321376800537, "rewards/margins": 9.38040828704834, "rewards/rejected": -6.244175910949707, "step": 2193 }, { "epoch": 0.36566666666666664, "grad_norm": 27.921449661254883, "learning_rate": 1.4650172772097768e-07, "logits/chosen": 0.5010798573493958, "logits/rejected": 1.7411524057388306, "logps/chosen": -33.87528610229492, "logps/rejected": -361.99163818359375, "loss": 0.6251, "nll_loss": 0.5943031907081604, "rewards/accuracies": 1.0, "rewards/chosen": 1.0551131963729858, "rewards/margins": 13.134366989135742, "rewards/rejected": -12.079254150390625, "step": 2194 }, { "epoch": 0.36583333333333334, "grad_norm": 87.38367462158203, "learning_rate": 1.4645393302727268e-07, "logits/chosen": 0.7841561436653137, "logits/rejected": 3.310121774673462, "logps/chosen": -43.09347915649414, "logps/rejected": -69.39096069335938, "loss": 1.2222, "nll_loss": 0.9368146657943726, "rewards/accuracies": 1.0, "rewards/chosen": 0.7270138263702393, "rewards/margins": 1.733117699623108, "rewards/rejected": -1.0061038732528687, "step": 2195 }, { "epoch": 0.366, "grad_norm": 317.33441162109375, "learning_rate": 1.4640612479800686e-07, "logits/chosen": 2.0700416564941406, "logits/rejected": 1.9967360496520996, "logps/chosen": -108.84782409667969, "logps/rejected": -11.725107192993164, "loss": 4.1231, "nll_loss": 1.7556098699569702, "rewards/accuracies": 0.0, "rewards/chosen": 0.15842211246490479, "rewards/margins": -2.0085244178771973, "rewards/rejected": 2.1669466495513916, "step": 2196 }, { "epoch": 0.36616666666666664, "grad_norm": 133.87042236328125, "learning_rate": 1.4635830304711036e-07, "logits/chosen": 2.195687770843506, "logits/rejected": 1.7194340229034424, "logps/chosen": -84.94894409179688, "logps/rejected": -35.005393981933594, "loss": 1.8213, "nll_loss": 1.2871053218841553, "rewards/accuracies": 1.0, "rewards/chosen": -0.605010986328125, "rewards/margins": 0.711821436882019, "rewards/rejected": -1.316832423210144, "step": 2197 }, { "epoch": 0.36633333333333334, "grad_norm": 29.34528923034668, "learning_rate": 1.4631046778851733e-07, "logits/chosen": 1.6266074180603027, "logits/rejected": 1.5112148523330688, "logps/chosen": -61.960662841796875, "logps/rejected": -78.42985534667969, "loss": 0.739, "nll_loss": 0.6662436723709106, "rewards/accuracies": 1.0, "rewards/chosen": 1.1129379272460938, "rewards/margins": 3.818037271499634, "rewards/rejected": -2.70509934425354, "step": 2198 }, { "epoch": 0.3665, "grad_norm": 36.9654426574707, "learning_rate": 1.4626261903616578e-07, "logits/chosen": 2.81179141998291, "logits/rejected": 2.735574722290039, "logps/chosen": -65.1064224243164, "logps/rejected": -52.75908660888672, "loss": 1.0606, "nll_loss": 0.9717377424240112, "rewards/accuracies": 1.0, "rewards/chosen": 1.5747207403182983, "rewards/margins": 3.604980945587158, "rewards/rejected": -2.0302600860595703, "step": 2199 }, { "epoch": 0.36666666666666664, "grad_norm": 27.487092971801758, "learning_rate": 1.462147568039977e-07, "logits/chosen": 2.608238458633423, "logits/rejected": 2.5518109798431396, "logps/chosen": -44.98123550415039, "logps/rejected": -56.30107879638672, "loss": 0.6978, "nll_loss": 0.6161813139915466, "rewards/accuracies": 1.0, "rewards/chosen": 1.5813122987747192, "rewards/margins": 3.7252745628356934, "rewards/rejected": -2.1439621448516846, "step": 2200 }, { "epoch": 0.36683333333333334, "grad_norm": 30.98788833618164, "learning_rate": 1.46166881105959e-07, "logits/chosen": 1.162211298942566, "logits/rejected": 1.9967721700668335, "logps/chosen": -67.96907043457031, "logps/rejected": -236.02780151367188, "loss": 0.8739, "nll_loss": 0.8391244411468506, "rewards/accuracies": 1.0, "rewards/chosen": 0.925061047077179, "rewards/margins": 11.016465187072754, "rewards/rejected": -10.09140396118164, "step": 2201 }, { "epoch": 0.367, "grad_norm": 49.3350944519043, "learning_rate": 1.461189919559995e-07, "logits/chosen": 3.0835063457489014, "logits/rejected": 2.9751017093658447, "logps/chosen": -17.62855339050293, "logps/rejected": -92.99303436279297, "loss": 0.6947, "nll_loss": 0.5876184105873108, "rewards/accuracies": 1.0, "rewards/chosen": 0.8334665894508362, "rewards/margins": 3.176788568496704, "rewards/rejected": -2.3433220386505127, "step": 2202 }, { "epoch": 0.36716666666666664, "grad_norm": 39.65412139892578, "learning_rate": 1.4607108936807297e-07, "logits/chosen": 2.439401149749756, "logits/rejected": 2.6939685344696045, "logps/chosen": -26.4941349029541, "logps/rejected": -103.19224548339844, "loss": 0.7663, "nll_loss": 0.6972140669822693, "rewards/accuracies": 1.0, "rewards/chosen": 0.5006133913993835, "rewards/margins": 4.394049167633057, "rewards/rejected": -3.8934359550476074, "step": 2203 }, { "epoch": 0.36733333333333335, "grad_norm": 30.71034812927246, "learning_rate": 1.4602317335613704e-07, "logits/chosen": 1.6757864952087402, "logits/rejected": 1.8749715089797974, "logps/chosen": -58.21443176269531, "logps/rejected": -117.493408203125, "loss": 0.7749, "nll_loss": 0.7276803255081177, "rewards/accuracies": 1.0, "rewards/chosen": 0.7770805954933167, "rewards/margins": 5.424685001373291, "rewards/rejected": -4.647604465484619, "step": 2204 }, { "epoch": 0.3675, "grad_norm": 33.264041900634766, "learning_rate": 1.4597524393415335e-07, "logits/chosen": 2.925468921661377, "logits/rejected": 2.8426971435546875, "logps/chosen": -62.03483963012695, "logps/rejected": -98.85452270507812, "loss": 0.9814, "nll_loss": 0.9399217963218689, "rewards/accuracies": 1.0, "rewards/chosen": 1.6288989782333374, "rewards/margins": 4.794038772583008, "rewards/rejected": -3.16513991355896, "step": 2205 }, { "epoch": 0.36766666666666664, "grad_norm": 26.808149337768555, "learning_rate": 1.4592730111608728e-07, "logits/chosen": 2.489372730255127, "logits/rejected": 2.714906930923462, "logps/chosen": -79.22459411621094, "logps/rejected": -177.5437469482422, "loss": 0.8321, "nll_loss": 0.8167483806610107, "rewards/accuracies": 1.0, "rewards/chosen": 1.9967727661132812, "rewards/margins": 7.241702079772949, "rewards/rejected": -5.244929313659668, "step": 2206 }, { "epoch": 0.36783333333333335, "grad_norm": 188.9410858154297, "learning_rate": 1.4587934491590833e-07, "logits/chosen": 1.958634853363037, "logits/rejected": 2.0990846157073975, "logps/chosen": -31.86339569091797, "logps/rejected": -16.153329849243164, "loss": 3.8251, "nll_loss": 0.677944540977478, "rewards/accuracies": 0.0, "rewards/chosen": 0.8498283624649048, "rewards/margins": -2.73970890045166, "rewards/rejected": 3.5895371437072754, "step": 2207 }, { "epoch": 0.368, "grad_norm": 84.47394561767578, "learning_rate": 1.4583137534758967e-07, "logits/chosen": 2.961946964263916, "logits/rejected": 2.8472061157226562, "logps/chosen": -68.30314636230469, "logps/rejected": -67.18866729736328, "loss": 1.0579, "nll_loss": 0.6443691849708557, "rewards/accuracies": 1.0, "rewards/chosen": 1.6273247003555298, "rewards/margins": 1.5309898853302002, "rewards/rejected": 0.09633484482765198, "step": 2208 }, { "epoch": 0.36816666666666664, "grad_norm": 46.789039611816406, "learning_rate": 1.4578339242510857e-07, "logits/chosen": 3.958869218826294, "logits/rejected": 4.187252044677734, "logps/chosen": -89.68708038330078, "logps/rejected": -233.67471313476562, "loss": 1.3759, "nll_loss": 1.281244158744812, "rewards/accuracies": 1.0, "rewards/chosen": 0.9855904579162598, "rewards/margins": 3.3777430057525635, "rewards/rejected": -2.3921525478363037, "step": 2209 }, { "epoch": 0.36833333333333335, "grad_norm": 33.13407897949219, "learning_rate": 1.4573539616244608e-07, "logits/chosen": 1.8335211277008057, "logits/rejected": 2.8314969539642334, "logps/chosen": -33.050315856933594, "logps/rejected": -254.9324493408203, "loss": 0.7721, "nll_loss": 0.7511435151100159, "rewards/accuracies": 1.0, "rewards/chosen": 1.5890305042266846, "rewards/margins": 7.109633445739746, "rewards/rejected": -5.520603179931641, "step": 2210 }, { "epoch": 0.3685, "grad_norm": 29.933656692504883, "learning_rate": 1.4568738657358712e-07, "logits/chosen": 3.0495083332061768, "logits/rejected": 3.2047033309936523, "logps/chosen": -57.90972900390625, "logps/rejected": -258.5473937988281, "loss": 0.8178, "nll_loss": 0.7932838797569275, "rewards/accuracies": 1.0, "rewards/chosen": 1.332849144935608, "rewards/margins": 8.06326675415039, "rewards/rejected": -6.730417251586914, "step": 2211 }, { "epoch": 0.36866666666666664, "grad_norm": 62.759910583496094, "learning_rate": 1.4563936367252062e-07, "logits/chosen": 1.6420925855636597, "logits/rejected": 2.146515130996704, "logps/chosen": -10.909955978393555, "logps/rejected": -155.567138671875, "loss": 0.4847, "nll_loss": 0.474345862865448, "rewards/accuracies": 1.0, "rewards/chosen": 2.3420534133911133, "rewards/margins": 8.211000442504883, "rewards/rejected": -5.868947505950928, "step": 2212 }, { "epoch": 0.36883333333333335, "grad_norm": 41.21712112426758, "learning_rate": 1.4559132747323918e-07, "logits/chosen": 2.409365653991699, "logits/rejected": 2.577195405960083, "logps/chosen": -51.272705078125, "logps/rejected": -224.04173278808594, "loss": 1.019, "nll_loss": 1.0053472518920898, "rewards/accuracies": 1.0, "rewards/chosen": 1.934895396232605, "rewards/margins": 9.3326997756958, "rewards/rejected": -7.397804260253906, "step": 2213 }, { "epoch": 0.369, "grad_norm": 26.092090606689453, "learning_rate": 1.455432779897395e-07, "logits/chosen": 2.876887083053589, "logits/rejected": 2.981590509414673, "logps/chosen": -49.269691467285156, "logps/rejected": -117.4575424194336, "loss": 0.7673, "nll_loss": 0.7465105056762695, "rewards/accuracies": 1.0, "rewards/chosen": 1.8364404439926147, "rewards/margins": 6.324274063110352, "rewards/rejected": -4.487833499908447, "step": 2214 }, { "epoch": 0.36916666666666664, "grad_norm": 27.700668334960938, "learning_rate": 1.4549521523602195e-07, "logits/chosen": 1.6731886863708496, "logits/rejected": 2.172959804534912, "logps/chosen": -71.05156707763672, "logps/rejected": -257.25927734375, "loss": 0.8639, "nll_loss": 0.8458520770072937, "rewards/accuracies": 1.0, "rewards/chosen": 1.6307258605957031, "rewards/margins": 9.547117233276367, "rewards/rejected": -7.916390895843506, "step": 2215 }, { "epoch": 0.36933333333333335, "grad_norm": 26.172475814819336, "learning_rate": 1.4544713922609098e-07, "logits/chosen": 1.8079804182052612, "logits/rejected": 2.5064048767089844, "logps/chosen": -59.784671783447266, "logps/rejected": -182.259521484375, "loss": 0.7566, "nll_loss": 0.7380822896957397, "rewards/accuracies": 1.0, "rewards/chosen": 1.6540204286575317, "rewards/margins": 7.948284149169922, "rewards/rejected": -6.29426383972168, "step": 2216 }, { "epoch": 0.3695, "grad_norm": 26.713071823120117, "learning_rate": 1.4539904997395468e-07, "logits/chosen": 2.617540121078491, "logits/rejected": 2.6609373092651367, "logps/chosen": -68.63015747070312, "logps/rejected": -149.62303161621094, "loss": 0.8277, "nll_loss": 0.807413637638092, "rewards/accuracies": 1.0, "rewards/chosen": 1.6480379104614258, "rewards/margins": 6.978559970855713, "rewards/rejected": -5.330522060394287, "step": 2217 }, { "epoch": 0.36966666666666664, "grad_norm": 47.317413330078125, "learning_rate": 1.4535094749362516e-07, "logits/chosen": 3.579852819442749, "logits/rejected": 3.558729887008667, "logps/chosen": -83.90475463867188, "logps/rejected": -102.4590072631836, "loss": 1.6536, "nll_loss": 1.645190954208374, "rewards/accuracies": 1.0, "rewards/chosen": 3.9420440196990967, "rewards/margins": 8.116604804992676, "rewards/rejected": -4.174560546875, "step": 2218 }, { "epoch": 0.36983333333333335, "grad_norm": 47.29480743408203, "learning_rate": 1.4530283179911831e-07, "logits/chosen": 2.200849771499634, "logits/rejected": 3.0010616779327393, "logps/chosen": -22.756134033203125, "logps/rejected": -44.319976806640625, "loss": 0.8282, "nll_loss": 0.7340688109397888, "rewards/accuracies": 1.0, "rewards/chosen": 1.1634025573730469, "rewards/margins": 3.4055097103118896, "rewards/rejected": -2.2421071529388428, "step": 2219 }, { "epoch": 0.37, "grad_norm": 51.47677230834961, "learning_rate": 1.4525470290445388e-07, "logits/chosen": 1.5711238384246826, "logits/rejected": 1.793894648551941, "logps/chosen": -31.380395889282227, "logps/rejected": -103.28707885742188, "loss": 0.6819, "nll_loss": 0.4903186559677124, "rewards/accuracies": 1.0, "rewards/chosen": 1.2680624723434448, "rewards/margins": 2.4444541931152344, "rewards/rejected": -1.1763916015625, "step": 2220 }, { "epoch": 0.37016666666666664, "grad_norm": 73.00150299072266, "learning_rate": 1.452065608236555e-07, "logits/chosen": 2.823498487472534, "logits/rejected": 3.058239459991455, "logps/chosen": -5.635462760925293, "logps/rejected": -146.307373046875, "loss": 0.5421, "nll_loss": 0.4334971010684967, "rewards/accuracies": 1.0, "rewards/chosen": 0.31884339451789856, "rewards/margins": 3.3005917072296143, "rewards/rejected": -2.981748342514038, "step": 2221 }, { "epoch": 0.37033333333333335, "grad_norm": 310.4862976074219, "learning_rate": 1.4515840557075062e-07, "logits/chosen": 2.901622772216797, "logits/rejected": 2.705376148223877, "logps/chosen": -410.8261413574219, "logps/rejected": -126.45640563964844, "loss": 2.4175, "nll_loss": 1.2083121538162231, "rewards/accuracies": 1.0, "rewards/chosen": -6.0077056884765625, "rewards/margins": 1.5402541160583496, "rewards/rejected": -7.547959804534912, "step": 2222 }, { "epoch": 0.3705, "grad_norm": 23.491186141967773, "learning_rate": 1.4511023715977047e-07, "logits/chosen": 1.2975677251815796, "logits/rejected": 2.205214738845825, "logps/chosen": -57.757347106933594, "logps/rejected": -254.15631103515625, "loss": 0.8013, "nll_loss": 0.7911965847015381, "rewards/accuracies": 1.0, "rewards/chosen": 2.239988088607788, "rewards/margins": 9.756245613098145, "rewards/rejected": -7.516257286071777, "step": 2223 }, { "epoch": 0.37066666666666664, "grad_norm": 28.529264450073242, "learning_rate": 1.4506205560475023e-07, "logits/chosen": 2.8495893478393555, "logits/rejected": 2.976552963256836, "logps/chosen": -50.54594039916992, "logps/rejected": -337.07373046875, "loss": 0.8173, "nll_loss": 0.8152572512626648, "rewards/accuracies": 1.0, "rewards/chosen": 3.912339448928833, "rewards/margins": 11.707786560058594, "rewards/rejected": -7.79544734954834, "step": 2224 }, { "epoch": 0.37083333333333335, "grad_norm": 28.880130767822266, "learning_rate": 1.4501386091972878e-07, "logits/chosen": 2.6665077209472656, "logits/rejected": 2.7040083408355713, "logps/chosen": -74.54074096679688, "logps/rejected": -153.7357177734375, "loss": 0.7652, "nll_loss": 0.7454074621200562, "rewards/accuracies": 1.0, "rewards/chosen": 1.6714836359024048, "rewards/margins": 7.01699686050415, "rewards/rejected": -5.345513343811035, "step": 2225 }, { "epoch": 0.371, "grad_norm": 25.942285537719727, "learning_rate": 1.44965653118749e-07, "logits/chosen": 1.901924729347229, "logits/rejected": 2.323415756225586, "logps/chosen": -45.85362243652344, "logps/rejected": -162.20753479003906, "loss": 0.599, "nll_loss": 0.5731703042984009, "rewards/accuracies": 1.0, "rewards/chosen": 1.285459280014038, "rewards/margins": 7.526220321655273, "rewards/rejected": -6.240760803222656, "step": 2226 }, { "epoch": 0.37116666666666664, "grad_norm": 141.5979766845703, "learning_rate": 1.4491743221585735e-07, "logits/chosen": 1.8872449398040771, "logits/rejected": 1.796561598777771, "logps/chosen": -27.31566047668457, "logps/rejected": -10.573424339294434, "loss": 2.1164, "nll_loss": 0.6070147156715393, "rewards/accuracies": 0.0, "rewards/chosen": 1.4754419326782227, "rewards/margins": -0.715625524520874, "rewards/rejected": 2.1910674571990967, "step": 2227 }, { "epoch": 0.37133333333333335, "grad_norm": 60.63994216918945, "learning_rate": 1.4486919822510438e-07, "logits/chosen": 3.0345263481140137, "logits/rejected": 2.9507274627685547, "logps/chosen": -179.0568084716797, "logps/rejected": -86.16708374023438, "loss": 1.2983, "nll_loss": 1.1478002071380615, "rewards/accuracies": 1.0, "rewards/chosen": 0.8502700924873352, "rewards/margins": 2.657850742340088, "rewards/rejected": -1.807580590248108, "step": 2228 }, { "epoch": 0.3715, "grad_norm": 22.828359603881836, "learning_rate": 1.448209511605442e-07, "logits/chosen": 1.6840705871582031, "logits/rejected": 2.268012285232544, "logps/chosen": -107.33451080322266, "logps/rejected": -193.4657440185547, "loss": 0.7449, "nll_loss": 0.7252331972122192, "rewards/accuracies": 1.0, "rewards/chosen": 1.550153374671936, "rewards/margins": 8.689170837402344, "rewards/rejected": -7.139017105102539, "step": 2229 }, { "epoch": 0.37166666666666665, "grad_norm": 113.90187072753906, "learning_rate": 1.4477269103623494e-07, "logits/chosen": 2.4969310760498047, "logits/rejected": 2.441624164581299, "logps/chosen": -25.259445190429688, "logps/rejected": -68.86122131347656, "loss": 2.002, "nll_loss": 1.6839630603790283, "rewards/accuracies": 1.0, "rewards/chosen": -0.09966202080249786, "rewards/margins": 1.4919075965881348, "rewards/rejected": -1.5915696620941162, "step": 2230 }, { "epoch": 0.37183333333333335, "grad_norm": 221.22323608398438, "learning_rate": 1.4472441786623838e-07, "logits/chosen": 2.1349809169769287, "logits/rejected": 2.2371506690979004, "logps/chosen": -41.62147521972656, "logps/rejected": -40.717689514160156, "loss": 3.9828, "nll_loss": 0.7567540407180786, "rewards/accuracies": 0.0, "rewards/chosen": 2.092797040939331, "rewards/margins": -2.5881259441375732, "rewards/rejected": 4.680922985076904, "step": 2231 }, { "epoch": 0.372, "grad_norm": 64.18487548828125, "learning_rate": 1.4467613166462024e-07, "logits/chosen": 3.0876564979553223, "logits/rejected": 3.146744966506958, "logps/chosen": -12.49599552154541, "logps/rejected": -69.70667266845703, "loss": 0.5809, "nll_loss": 0.5679998397827148, "rewards/accuracies": 1.0, "rewards/chosen": 2.0892505645751953, "rewards/margins": 7.96954870223999, "rewards/rejected": -5.880298137664795, "step": 2232 }, { "epoch": 0.37216666666666665, "grad_norm": 39.13412857055664, "learning_rate": 1.4462783244544983e-07, "logits/chosen": 2.9190409183502197, "logits/rejected": 2.815671443939209, "logps/chosen": -105.65005493164062, "logps/rejected": -111.44292449951172, "loss": 1.4368, "nll_loss": 1.4086672067642212, "rewards/accuracies": 1.0, "rewards/chosen": 1.564965844154358, "rewards/margins": 5.7073259353637695, "rewards/rejected": -4.142360210418701, "step": 2233 }, { "epoch": 0.37233333333333335, "grad_norm": 30.266027450561523, "learning_rate": 1.4457952022280052e-07, "logits/chosen": 2.154491901397705, "logits/rejected": 2.3853983879089355, "logps/chosen": -21.065513610839844, "logps/rejected": -227.16094970703125, "loss": 0.511, "nll_loss": 0.4579460024833679, "rewards/accuracies": 1.0, "rewards/chosen": 0.46691837906837463, "rewards/margins": 7.245965003967285, "rewards/rejected": -6.779046535491943, "step": 2234 }, { "epoch": 0.3725, "grad_norm": 31.59273338317871, "learning_rate": 1.4453119501074922e-07, "logits/chosen": 1.1912978887557983, "logits/rejected": 2.0826966762542725, "logps/chosen": -67.43821716308594, "logps/rejected": -217.14334106445312, "loss": 0.9351, "nll_loss": 0.9113272428512573, "rewards/accuracies": 1.0, "rewards/chosen": 1.829830288887024, "rewards/margins": 5.918247222900391, "rewards/rejected": -4.088417053222656, "step": 2235 }, { "epoch": 0.37266666666666665, "grad_norm": 37.48198699951172, "learning_rate": 1.4448285682337682e-07, "logits/chosen": 2.2004196643829346, "logits/rejected": 2.598442792892456, "logps/chosen": -19.963855743408203, "logps/rejected": -215.55506896972656, "loss": 0.5045, "nll_loss": 0.4869232773780823, "rewards/accuracies": 1.0, "rewards/chosen": 2.4666519165039062, "rewards/margins": 6.374485969543457, "rewards/rejected": -3.907834053039551, "step": 2236 }, { "epoch": 0.37283333333333335, "grad_norm": 28.21770668029785, "learning_rate": 1.4443450567476782e-07, "logits/chosen": 2.542468309402466, "logits/rejected": 2.3386096954345703, "logps/chosen": -150.0328369140625, "logps/rejected": -134.84158325195312, "loss": 0.7189, "nll_loss": 0.6697893738746643, "rewards/accuracies": 1.0, "rewards/chosen": 0.5600342154502869, "rewards/margins": 7.120030403137207, "rewards/rejected": -6.559996128082275, "step": 2237 }, { "epoch": 0.373, "grad_norm": 32.85219955444336, "learning_rate": 1.443861415790107e-07, "logits/chosen": 2.907891035079956, "logits/rejected": 2.8211252689361572, "logps/chosen": -25.23243522644043, "logps/rejected": -136.6900177001953, "loss": 0.653, "nll_loss": 0.6469855904579163, "rewards/accuracies": 1.0, "rewards/chosen": 2.776719093322754, "rewards/margins": 10.541568756103516, "rewards/rejected": -7.76485013961792, "step": 2238 }, { "epoch": 0.37316666666666665, "grad_norm": 38.24871063232422, "learning_rate": 1.443377645501975e-07, "logits/chosen": 4.174563884735107, "logits/rejected": 4.2971062660217285, "logps/chosen": -85.33930206298828, "logps/rejected": -235.60989379882812, "loss": 1.2848, "nll_loss": 1.2191332578659058, "rewards/accuracies": 1.0, "rewards/chosen": 1.4203681945800781, "rewards/margins": 4.006038665771484, "rewards/rejected": -2.5856707096099854, "step": 2239 }, { "epoch": 0.37333333333333335, "grad_norm": 32.78388977050781, "learning_rate": 1.4428937460242416e-07, "logits/chosen": 0.7284614443778992, "logits/rejected": 1.472344994544983, "logps/chosen": -44.10317611694336, "logps/rejected": -252.97373962402344, "loss": 0.7659, "nll_loss": 0.7475113868713379, "rewards/accuracies": 1.0, "rewards/chosen": 1.6395150423049927, "rewards/margins": 8.173371315002441, "rewards/rejected": -6.533856391906738, "step": 2240 }, { "epoch": 0.3735, "grad_norm": 22.965517044067383, "learning_rate": 1.4424097174979036e-07, "logits/chosen": 2.21368408203125, "logits/rejected": 2.2995357513427734, "logps/chosen": -55.22926330566406, "logps/rejected": -101.25646209716797, "loss": 0.675, "nll_loss": 0.6574912071228027, "rewards/accuracies": 1.0, "rewards/chosen": 2.894763946533203, "rewards/margins": 6.501574516296387, "rewards/rejected": -3.6068103313446045, "step": 2241 }, { "epoch": 0.37366666666666665, "grad_norm": 40.072818756103516, "learning_rate": 1.441925560063995e-07, "logits/chosen": 1.5807764530181885, "logits/rejected": 1.7030081748962402, "logps/chosen": -73.500244140625, "logps/rejected": -93.90116119384766, "loss": 1.0144, "nll_loss": 0.9671084880828857, "rewards/accuracies": 1.0, "rewards/chosen": 0.7003440856933594, "rewards/margins": 5.7578558921813965, "rewards/rejected": -5.057511806488037, "step": 2242 }, { "epoch": 0.37383333333333335, "grad_norm": 25.766189575195312, "learning_rate": 1.441441273863588e-07, "logits/chosen": 2.7977609634399414, "logits/rejected": 2.7481729984283447, "logps/chosen": -39.0128173828125, "logps/rejected": -99.70123291015625, "loss": 0.5208, "nll_loss": 0.45897433161735535, "rewards/accuracies": 1.0, "rewards/chosen": 1.3837273120880127, "rewards/margins": 4.098768711090088, "rewards/rejected": -2.715041399002075, "step": 2243 }, { "epoch": 0.374, "grad_norm": 45.351219177246094, "learning_rate": 1.4409568590377917e-07, "logits/chosen": 3.2312021255493164, "logits/rejected": 3.4812498092651367, "logps/chosen": -48.564117431640625, "logps/rejected": -279.9445495605469, "loss": 1.1181, "nll_loss": 1.1037300825119019, "rewards/accuracies": 1.0, "rewards/chosen": 1.8588457107543945, "rewards/margins": 9.993895530700684, "rewards/rejected": -8.135049819946289, "step": 2244 }, { "epoch": 0.37416666666666665, "grad_norm": 21.42039680480957, "learning_rate": 1.440472315727753e-07, "logits/chosen": 1.684618353843689, "logits/rejected": 1.2161779403686523, "logps/chosen": -48.78153991699219, "logps/rejected": -71.66023254394531, "loss": 0.6154, "nll_loss": 0.5877295136451721, "rewards/accuracies": 1.0, "rewards/chosen": 2.2921502590179443, "rewards/margins": 5.575592994689941, "rewards/rejected": -3.283442974090576, "step": 2245 }, { "epoch": 0.37433333333333335, "grad_norm": 27.936092376708984, "learning_rate": 1.439987644074656e-07, "logits/chosen": 0.931555449962616, "logits/rejected": 2.9012012481689453, "logps/chosen": -71.70612335205078, "logps/rejected": -378.07977294921875, "loss": 0.8269, "nll_loss": 0.8056867718696594, "rewards/accuracies": 1.0, "rewards/chosen": 1.5470367670059204, "rewards/margins": 7.181887626647949, "rewards/rejected": -5.634850978851318, "step": 2246 }, { "epoch": 0.3745, "grad_norm": 22.137325286865234, "learning_rate": 1.439502844219723e-07, "logits/chosen": 2.301480531692505, "logits/rejected": 2.1402626037597656, "logps/chosen": -80.16215515136719, "logps/rejected": -93.7251205444336, "loss": 0.763, "nll_loss": 0.7354325652122498, "rewards/accuracies": 1.0, "rewards/chosen": 2.219369649887085, "rewards/margins": 5.5570855140686035, "rewards/rejected": -3.3377158641815186, "step": 2247 }, { "epoch": 0.37466666666666665, "grad_norm": 45.64075469970703, "learning_rate": 1.4390179163042126e-07, "logits/chosen": 2.5425937175750732, "logits/rejected": 2.4992010593414307, "logps/chosen": -41.209835052490234, "logps/rejected": -114.77815246582031, "loss": 0.8649, "nll_loss": 0.7775440812110901, "rewards/accuracies": 1.0, "rewards/chosen": 0.16589699685573578, "rewards/margins": 4.153140544891357, "rewards/rejected": -3.98724365234375, "step": 2248 }, { "epoch": 0.37483333333333335, "grad_norm": 205.18991088867188, "learning_rate": 1.438532860469421e-07, "logits/chosen": 2.323455333709717, "logits/rejected": 2.187558889389038, "logps/chosen": -45.457035064697266, "logps/rejected": -28.547988891601562, "loss": 3.2699, "nll_loss": 2.2728514671325684, "rewards/accuracies": 0.0, "rewards/chosen": -1.8156194686889648, "rewards/margins": -0.23217451572418213, "rewards/rejected": -1.5834449529647827, "step": 2249 }, { "epoch": 0.375, "grad_norm": 39.36994171142578, "learning_rate": 1.4380476768566823e-07, "logits/chosen": 1.4390822649002075, "logits/rejected": 1.7070471048355103, "logps/chosen": -22.346981048583984, "logps/rejected": -255.90635681152344, "loss": 0.615, "nll_loss": 0.6039724349975586, "rewards/accuracies": 1.0, "rewards/chosen": 2.2242443561553955, "rewards/margins": 8.329130172729492, "rewards/rejected": -6.104886054992676, "step": 2250 }, { "epoch": 0.37516666666666665, "grad_norm": 24.312089920043945, "learning_rate": 1.437562365607367e-07, "logits/chosen": 2.2970030307769775, "logits/rejected": 2.1161611080169678, "logps/chosen": -75.83161926269531, "logps/rejected": -81.75665283203125, "loss": 0.7721, "nll_loss": 0.7434472441673279, "rewards/accuracies": 1.0, "rewards/chosen": 3.283780813217163, "rewards/margins": 6.124555587768555, "rewards/rejected": -2.8407745361328125, "step": 2251 }, { "epoch": 0.37533333333333335, "grad_norm": 26.65292739868164, "learning_rate": 1.437076926862883e-07, "logits/chosen": 2.164548635482788, "logits/rejected": 2.377744674682617, "logps/chosen": -131.85781860351562, "logps/rejected": -148.564208984375, "loss": 1.0131, "nll_loss": 0.9989227652549744, "rewards/accuracies": 1.0, "rewards/chosen": 1.9291199445724487, "rewards/margins": 8.229764938354492, "rewards/rejected": -6.300644874572754, "step": 2252 }, { "epoch": 0.3755, "grad_norm": 23.633991241455078, "learning_rate": 1.436591360764676e-07, "logits/chosen": 2.5820980072021484, "logits/rejected": 2.7601354122161865, "logps/chosen": -209.26641845703125, "logps/rejected": -323.302734375, "loss": 1.0068, "nll_loss": 1.001274824142456, "rewards/accuracies": 1.0, "rewards/chosen": 2.819581985473633, "rewards/margins": 13.834868431091309, "rewards/rejected": -11.015286445617676, "step": 2253 }, { "epoch": 0.37566666666666665, "grad_norm": 26.965145111083984, "learning_rate": 1.4361056674542278e-07, "logits/chosen": 1.8322442770004272, "logits/rejected": 2.6009695529937744, "logps/chosen": -134.5380859375, "logps/rejected": -344.24835205078125, "loss": 1.3129, "nll_loss": 1.2936354875564575, "rewards/accuracies": 1.0, "rewards/chosen": 1.586212158203125, "rewards/margins": 8.089850425720215, "rewards/rejected": -6.50363826751709, "step": 2254 }, { "epoch": 0.37583333333333335, "grad_norm": 27.554384231567383, "learning_rate": 1.4356198470730583e-07, "logits/chosen": 1.0878392457962036, "logits/rejected": 2.4843149185180664, "logps/chosen": -62.89501953125, "logps/rejected": -447.9910888671875, "loss": 1.0318, "nll_loss": 1.0310660600662231, "rewards/accuracies": 1.0, "rewards/chosen": 5.0999755859375, "rewards/margins": 12.803906440734863, "rewards/rejected": -7.703930854797363, "step": 2255 }, { "epoch": 0.376, "grad_norm": 28.249595642089844, "learning_rate": 1.435133899762723e-07, "logits/chosen": 2.0759100914001465, "logits/rejected": 2.456514358520508, "logps/chosen": -103.6552734375, "logps/rejected": -260.3709716796875, "loss": 1.0983, "nll_loss": 1.0686111450195312, "rewards/accuracies": 1.0, "rewards/chosen": 1.0971466302871704, "rewards/margins": 8.140655517578125, "rewards/rejected": -7.043509006500244, "step": 2256 }, { "epoch": 0.37616666666666665, "grad_norm": 23.161684036254883, "learning_rate": 1.4346478256648166e-07, "logits/chosen": 2.358952045440674, "logits/rejected": 2.579162836074829, "logps/chosen": -27.09757423400879, "logps/rejected": -441.56671142578125, "loss": 0.4728, "nll_loss": 0.4592810273170471, "rewards/accuracies": 1.0, "rewards/chosen": 1.9272880554199219, "rewards/margins": 9.36312198638916, "rewards/rejected": -7.435833930969238, "step": 2257 }, { "epoch": 0.37633333333333335, "grad_norm": 36.065887451171875, "learning_rate": 1.4341616249209684e-07, "logits/chosen": 2.572263717651367, "logits/rejected": 2.1581013202667236, "logps/chosen": -19.535751342773438, "logps/rejected": -33.617435455322266, "loss": 0.5629, "nll_loss": 0.46513694524765015, "rewards/accuracies": 1.0, "rewards/chosen": 1.1782325506210327, "rewards/margins": 3.3544392585754395, "rewards/rejected": -2.1762068271636963, "step": 2258 }, { "epoch": 0.3765, "grad_norm": 29.866331100463867, "learning_rate": 1.4336752976728459e-07, "logits/chosen": 4.244290828704834, "logits/rejected": 4.356698513031006, "logps/chosen": -61.432376861572266, "logps/rejected": -127.60139465332031, "loss": 0.7734, "nll_loss": 0.7143300175666809, "rewards/accuracies": 1.0, "rewards/chosen": 1.0097408294677734, "rewards/margins": 4.2427978515625, "rewards/rejected": -3.2330570220947266, "step": 2259 }, { "epoch": 0.37666666666666665, "grad_norm": 88.5563735961914, "learning_rate": 1.4331888440621531e-07, "logits/chosen": 2.177640676498413, "logits/rejected": 2.063359022140503, "logps/chosen": -39.13078689575195, "logps/rejected": -60.16185760498047, "loss": 1.1406, "nll_loss": 1.0575886964797974, "rewards/accuracies": 1.0, "rewards/chosen": 1.9812737703323364, "rewards/margins": 3.9005074501037598, "rewards/rejected": -1.9192336797714233, "step": 2260 }, { "epoch": 0.37683333333333335, "grad_norm": 30.45411491394043, "learning_rate": 1.432702264230631e-07, "logits/chosen": 2.8014562129974365, "logits/rejected": 2.778498649597168, "logps/chosen": -82.53193664550781, "logps/rejected": -172.62908935546875, "loss": 0.9333, "nll_loss": 0.9170216917991638, "rewards/accuracies": 1.0, "rewards/chosen": 1.8932526111602783, "rewards/margins": 7.218593597412109, "rewards/rejected": -5.325340747833252, "step": 2261 }, { "epoch": 0.377, "grad_norm": 44.593971252441406, "learning_rate": 1.4322155583200575e-07, "logits/chosen": 1.6723822355270386, "logits/rejected": 2.267862319946289, "logps/chosen": -23.656661987304688, "logps/rejected": -277.43206787109375, "loss": 0.9946, "nll_loss": 0.9856943488121033, "rewards/accuracies": 1.0, "rewards/chosen": 2.389693021774292, "rewards/margins": 9.282608032226562, "rewards/rejected": -6.89291524887085, "step": 2262 }, { "epoch": 0.37716666666666665, "grad_norm": 22.764684677124023, "learning_rate": 1.4317287264722467e-07, "logits/chosen": 2.3630473613739014, "logits/rejected": 2.3799667358398438, "logps/chosen": -92.168701171875, "logps/rejected": -174.40162658691406, "loss": 0.7976, "nll_loss": 0.787766695022583, "rewards/accuracies": 1.0, "rewards/chosen": 2.2725441455841064, "rewards/margins": 9.509955406188965, "rewards/rejected": -7.2374114990234375, "step": 2263 }, { "epoch": 0.37733333333333335, "grad_norm": 151.41339111328125, "learning_rate": 1.4312417688290499e-07, "logits/chosen": 3.0428216457366943, "logits/rejected": 3.0405192375183105, "logps/chosen": -60.45668411254883, "logps/rejected": -68.9777603149414, "loss": 1.8621, "nll_loss": 1.1195682287216187, "rewards/accuracies": 1.0, "rewards/chosen": 2.4382169246673584, "rewards/margins": 0.9413212537765503, "rewards/rejected": 1.496895670890808, "step": 2264 }, { "epoch": 0.3775, "grad_norm": 35.654632568359375, "learning_rate": 1.4307546855323547e-07, "logits/chosen": 2.879255533218384, "logits/rejected": 2.776066541671753, "logps/chosen": -96.01167297363281, "logps/rejected": -159.92730712890625, "loss": 1.1112, "nll_loss": 1.0787829160690308, "rewards/accuracies": 1.0, "rewards/chosen": 1.6364412307739258, "rewards/margins": 5.27498722076416, "rewards/rejected": -3.6385462284088135, "step": 2265 }, { "epoch": 0.37766666666666665, "grad_norm": 24.156957626342773, "learning_rate": 1.4302674767240856e-07, "logits/chosen": 1.6186063289642334, "logits/rejected": 2.133087635040283, "logps/chosen": -80.46239471435547, "logps/rejected": -157.5455322265625, "loss": 0.7129, "nll_loss": 0.6761544346809387, "rewards/accuracies": 1.0, "rewards/chosen": 1.0981254577636719, "rewards/margins": 5.588037967681885, "rewards/rejected": -4.489912509918213, "step": 2266 }, { "epoch": 0.37783333333333335, "grad_norm": 22.241424560546875, "learning_rate": 1.4297801425462032e-07, "logits/chosen": 2.4314308166503906, "logits/rejected": 2.589128017425537, "logps/chosen": -140.25379943847656, "logps/rejected": -273.46600341796875, "loss": 1.0164, "nll_loss": 1.0018125772476196, "rewards/accuracies": 1.0, "rewards/chosen": 2.012010335922241, "rewards/margins": 7.396292686462402, "rewards/rejected": -5.38428258895874, "step": 2267 }, { "epoch": 0.378, "grad_norm": 36.1255989074707, "learning_rate": 1.429292683140706e-07, "logits/chosen": 2.4089977741241455, "logits/rejected": 2.4813640117645264, "logps/chosen": -47.8277587890625, "logps/rejected": -157.63821411132812, "loss": 0.8414, "nll_loss": 0.7971293926239014, "rewards/accuracies": 1.0, "rewards/chosen": 1.119700312614441, "rewards/margins": 4.871995449066162, "rewards/rejected": -3.7522950172424316, "step": 2268 }, { "epoch": 0.37816666666666665, "grad_norm": 205.4901123046875, "learning_rate": 1.4288050986496266e-07, "logits/chosen": 3.1166224479675293, "logits/rejected": 3.2462401390075684, "logps/chosen": -37.01491165161133, "logps/rejected": -166.23046875, "loss": 1.4958, "nll_loss": 0.7554062604904175, "rewards/accuracies": 1.0, "rewards/chosen": 0.5807430148124695, "rewards/margins": 0.3373286724090576, "rewards/rejected": 0.24341432750225067, "step": 2269 }, { "epoch": 0.37833333333333335, "grad_norm": 155.3717498779297, "learning_rate": 1.4283173892150365e-07, "logits/chosen": 1.9528788328170776, "logits/rejected": 1.9627188444137573, "logps/chosen": -17.89931297302246, "logps/rejected": -10.190583229064941, "loss": 2.3775, "nll_loss": 0.37290239334106445, "rewards/accuracies": 0.0, "rewards/chosen": 0.5976602435112, "rewards/margins": -1.518108606338501, "rewards/rejected": 2.1157689094543457, "step": 2270 }, { "epoch": 0.3785, "grad_norm": 511.50628662109375, "learning_rate": 1.427829554979042e-07, "logits/chosen": 1.9662220478057861, "logits/rejected": 1.6141375303268433, "logps/chosen": -193.61056518554688, "logps/rejected": -18.838741302490234, "loss": 5.6278, "nll_loss": 1.132225513458252, "rewards/accuracies": 0.0, "rewards/chosen": -1.9266846179962158, "rewards/margins": -4.424339771270752, "rewards/rejected": 2.497655153274536, "step": 2271 }, { "epoch": 0.37866666666666665, "grad_norm": 19.5986328125, "learning_rate": 1.4273415960837862e-07, "logits/chosen": 2.2382144927978516, "logits/rejected": 2.6111600399017334, "logps/chosen": -42.636375427246094, "logps/rejected": -310.04449462890625, "loss": 0.487, "nll_loss": 0.47906041145324707, "rewards/accuracies": 1.0, "rewards/chosen": 3.381877899169922, "rewards/margins": 7.920534610748291, "rewards/rejected": -4.538656711578369, "step": 2272 }, { "epoch": 0.37883333333333336, "grad_norm": 23.413312911987305, "learning_rate": 1.426853512671449e-07, "logits/chosen": 1.293149709701538, "logits/rejected": 2.39821720123291, "logps/chosen": -57.847389221191406, "logps/rejected": -294.54510498046875, "loss": 0.6721, "nll_loss": 0.6573566794395447, "rewards/accuracies": 1.0, "rewards/chosen": 1.8837608098983765, "rewards/margins": 8.197816848754883, "rewards/rejected": -6.314056396484375, "step": 2273 }, { "epoch": 0.379, "grad_norm": 70.80902099609375, "learning_rate": 1.4263653048842458e-07, "logits/chosen": 2.1345794200897217, "logits/rejected": 2.0072152614593506, "logps/chosen": -19.505657196044922, "logps/rejected": -77.17875671386719, "loss": 0.7162, "nll_loss": 0.5001450777053833, "rewards/accuracies": 1.0, "rewards/chosen": 0.6041879653930664, "rewards/margins": 2.1005473136901855, "rewards/rejected": -1.4963592290878296, "step": 2274 }, { "epoch": 0.37916666666666665, "grad_norm": 72.77462768554688, "learning_rate": 1.425876972864429e-07, "logits/chosen": 2.9985158443450928, "logits/rejected": 3.027100086212158, "logps/chosen": -14.602012634277344, "logps/rejected": -27.811355590820312, "loss": 1.1716, "nll_loss": 0.7685268521308899, "rewards/accuracies": 1.0, "rewards/chosen": 0.5304798483848572, "rewards/margins": 1.2133846282958984, "rewards/rejected": -0.682904839515686, "step": 2275 }, { "epoch": 0.37933333333333336, "grad_norm": 84.24488830566406, "learning_rate": 1.4253885167542864e-07, "logits/chosen": 2.4156594276428223, "logits/rejected": 2.4989328384399414, "logps/chosen": -9.852797508239746, "logps/rejected": -76.94215393066406, "loss": 0.6966, "nll_loss": 0.6568531394004822, "rewards/accuracies": 1.0, "rewards/chosen": 1.7767822742462158, "rewards/margins": 4.874523162841797, "rewards/rejected": -3.09774112701416, "step": 2276 }, { "epoch": 0.3795, "grad_norm": 28.079647064208984, "learning_rate": 1.4248999366961427e-07, "logits/chosen": 2.478085517883301, "logits/rejected": 2.4153313636779785, "logps/chosen": -113.90657043457031, "logps/rejected": -112.48693084716797, "loss": 0.9446, "nll_loss": 0.9186014533042908, "rewards/accuracies": 1.0, "rewards/chosen": 1.3884453773498535, "rewards/margins": 6.45811128616333, "rewards/rejected": -5.069665908813477, "step": 2277 }, { "epoch": 0.37966666666666665, "grad_norm": 41.3472785949707, "learning_rate": 1.4244112328323587e-07, "logits/chosen": 2.023731231689453, "logits/rejected": 2.7172632217407227, "logps/chosen": -47.571739196777344, "logps/rejected": -731.0768432617188, "loss": 0.9732, "nll_loss": 0.9327791333198547, "rewards/accuracies": 1.0, "rewards/chosen": 0.7563068866729736, "rewards/margins": 7.792659759521484, "rewards/rejected": -7.036352634429932, "step": 2278 }, { "epoch": 0.37983333333333336, "grad_norm": 28.23869514465332, "learning_rate": 1.42392240530533e-07, "logits/chosen": 2.9224259853363037, "logits/rejected": 2.976008415222168, "logps/chosen": -122.54026794433594, "logps/rejected": -221.55934143066406, "loss": 1.0734, "nll_loss": 1.04735267162323, "rewards/accuracies": 1.0, "rewards/chosen": 1.4570236206054688, "rewards/margins": 6.184060573577881, "rewards/rejected": -4.727036952972412, "step": 2279 }, { "epoch": 0.38, "grad_norm": 24.9991455078125, "learning_rate": 1.4234334542574904e-07, "logits/chosen": 1.4866459369659424, "logits/rejected": 1.3479021787643433, "logps/chosen": -58.43358612060547, "logps/rejected": -82.86294555664062, "loss": 0.6804, "nll_loss": 0.6565572023391724, "rewards/accuracies": 1.0, "rewards/chosen": 1.664048194885254, "rewards/margins": 6.09624719619751, "rewards/rejected": -4.432199001312256, "step": 2280 }, { "epoch": 0.38016666666666665, "grad_norm": 67.93365478515625, "learning_rate": 1.4229443798313075e-07, "logits/chosen": 2.6668078899383545, "logits/rejected": 2.2927396297454834, "logps/chosen": -10.432951927185059, "logps/rejected": -33.457550048828125, "loss": 0.7944, "nll_loss": 0.7452108263969421, "rewards/accuracies": 1.0, "rewards/chosen": 1.8132057189941406, "rewards/margins": 4.544614791870117, "rewards/rejected": -2.7314090728759766, "step": 2281 }, { "epoch": 0.38033333333333336, "grad_norm": 21.63179588317871, "learning_rate": 1.4224551821692862e-07, "logits/chosen": 2.289128303527832, "logits/rejected": 2.0602550506591797, "logps/chosen": -99.94535827636719, "logps/rejected": -74.71380615234375, "loss": 0.7188, "nll_loss": 0.6799003481864929, "rewards/accuracies": 1.0, "rewards/chosen": 1.5038574934005737, "rewards/margins": 4.932342052459717, "rewards/rejected": -3.4284844398498535, "step": 2282 }, { "epoch": 0.3805, "grad_norm": 39.81053161621094, "learning_rate": 1.421965861413967e-07, "logits/chosen": 1.404369831085205, "logits/rejected": 1.9821417331695557, "logps/chosen": -77.35758972167969, "logps/rejected": -198.84442138671875, "loss": 0.9229, "nll_loss": 0.8891677260398865, "rewards/accuracies": 1.0, "rewards/chosen": 0.9541481137275696, "rewards/margins": 7.970156192779541, "rewards/rejected": -7.016007900238037, "step": 2283 }, { "epoch": 0.38066666666666665, "grad_norm": 30.5823917388916, "learning_rate": 1.4214764177079263e-07, "logits/chosen": 1.4974524974822998, "logits/rejected": 2.7387757301330566, "logps/chosen": -31.736289978027344, "logps/rejected": -275.4375, "loss": 0.5498, "nll_loss": 0.5118756294250488, "rewards/accuracies": 1.0, "rewards/chosen": 0.8769813776016235, "rewards/margins": 6.62645959854126, "rewards/rejected": -5.749478340148926, "step": 2284 }, { "epoch": 0.38083333333333336, "grad_norm": 32.726646423339844, "learning_rate": 1.4209868511937765e-07, "logits/chosen": 3.32358717918396, "logits/rejected": 3.0917067527770996, "logps/chosen": -83.18244934082031, "logps/rejected": -286.0933532714844, "loss": 1.0937, "nll_loss": 1.0664416551589966, "rewards/accuracies": 1.0, "rewards/chosen": 1.2163925170898438, "rewards/margins": 7.413649082183838, "rewards/rejected": -6.197256565093994, "step": 2285 }, { "epoch": 0.381, "grad_norm": 32.974491119384766, "learning_rate": 1.4204971620141645e-07, "logits/chosen": 2.498507261276245, "logits/rejected": 2.5549943447113037, "logps/chosen": -30.30088996887207, "logps/rejected": -112.46217346191406, "loss": 0.665, "nll_loss": 0.6312685608863831, "rewards/accuracies": 1.0, "rewards/chosen": 1.5287456512451172, "rewards/margins": 5.237843036651611, "rewards/rejected": -3.709097385406494, "step": 2286 }, { "epoch": 0.38116666666666665, "grad_norm": 26.6278018951416, "learning_rate": 1.420007350311775e-07, "logits/chosen": 2.1108062267303467, "logits/rejected": 2.2138659954071045, "logps/chosen": -140.08642578125, "logps/rejected": -122.62055969238281, "loss": 1.037, "nll_loss": 1.0151190757751465, "rewards/accuracies": 1.0, "rewards/chosen": 1.5185974836349487, "rewards/margins": 7.038586139678955, "rewards/rejected": -5.519988536834717, "step": 2287 }, { "epoch": 0.38133333333333336, "grad_norm": 36.61474609375, "learning_rate": 1.419517416229327e-07, "logits/chosen": 2.5135231018066406, "logits/rejected": 2.5402262210845947, "logps/chosen": -23.28520393371582, "logps/rejected": -131.32839965820312, "loss": 0.6148, "nll_loss": 0.5292091965675354, "rewards/accuracies": 1.0, "rewards/chosen": 0.5069293975830078, "rewards/margins": 3.710523843765259, "rewards/rejected": -3.203594446182251, "step": 2288 }, { "epoch": 0.3815, "grad_norm": 62.019287109375, "learning_rate": 1.419027359909576e-07, "logits/chosen": 1.4552806615829468, "logits/rejected": 1.8353309631347656, "logps/chosen": -49.07403564453125, "logps/rejected": -144.89454650878906, "loss": 1.4411, "nll_loss": 1.4021154642105103, "rewards/accuracies": 1.0, "rewards/chosen": 0.9451717734336853, "rewards/margins": 5.815110683441162, "rewards/rejected": -4.869938850402832, "step": 2289 }, { "epoch": 0.38166666666666665, "grad_norm": 30.064783096313477, "learning_rate": 1.4185371814953115e-07, "logits/chosen": 2.7377712726593018, "logits/rejected": 2.831660032272339, "logps/chosen": -21.285953521728516, "logps/rejected": -121.58574676513672, "loss": 0.4846, "nll_loss": 0.45289260149002075, "rewards/accuracies": 1.0, "rewards/chosen": 1.4966892004013062, "rewards/margins": 5.416656970977783, "rewards/rejected": -3.9199676513671875, "step": 2290 }, { "epoch": 0.38183333333333336, "grad_norm": 31.810258865356445, "learning_rate": 1.418046881129361e-07, "logits/chosen": 2.0553908348083496, "logits/rejected": 2.556471109390259, "logps/chosen": -56.23522186279297, "logps/rejected": -171.01551818847656, "loss": 0.9267, "nll_loss": 0.9070197939872742, "rewards/accuracies": 1.0, "rewards/chosen": 1.5537124872207642, "rewards/margins": 8.099023818969727, "rewards/rejected": -6.545310974121094, "step": 2291 }, { "epoch": 0.382, "grad_norm": 47.55940246582031, "learning_rate": 1.417556458954585e-07, "logits/chosen": 2.2915396690368652, "logits/rejected": 2.732998847961426, "logps/chosen": -24.172582626342773, "logps/rejected": -132.77557373046875, "loss": 0.7487, "nll_loss": 0.7109582424163818, "rewards/accuracies": 1.0, "rewards/chosen": 0.9554373621940613, "rewards/margins": 5.981147766113281, "rewards/rejected": -5.025710582733154, "step": 2292 }, { "epoch": 0.38216666666666665, "grad_norm": 30.243938446044922, "learning_rate": 1.4170659151138824e-07, "logits/chosen": 2.298255205154419, "logits/rejected": 2.4412968158721924, "logps/chosen": -155.681884765625, "logps/rejected": -232.37149047851562, "loss": 1.3188, "nll_loss": 1.297349214553833, "rewards/accuracies": 1.0, "rewards/chosen": 1.4689819812774658, "rewards/margins": 7.814459800720215, "rewards/rejected": -6.34547758102417, "step": 2293 }, { "epoch": 0.38233333333333336, "grad_norm": 33.61355209350586, "learning_rate": 1.416575249750184e-07, "logits/chosen": 1.4718984365463257, "logits/rejected": 2.447624444961548, "logps/chosen": -14.778732299804688, "logps/rejected": -158.78134155273438, "loss": 0.4785, "nll_loss": 0.4618353843688965, "rewards/accuracies": 1.0, "rewards/chosen": 3.4465482234954834, "rewards/margins": 6.916821002960205, "rewards/rejected": -3.4702727794647217, "step": 2294 }, { "epoch": 0.3825, "grad_norm": 30.708932876586914, "learning_rate": 1.4160844630064594e-07, "logits/chosen": 2.220834493637085, "logits/rejected": 2.50376033782959, "logps/chosen": -19.366662979125977, "logps/rejected": -475.6244201660156, "loss": 0.4816, "nll_loss": 0.44015151262283325, "rewards/accuracies": 1.0, "rewards/chosen": 0.7008745670318604, "rewards/margins": 14.059661865234375, "rewards/rejected": -13.358787536621094, "step": 2295 }, { "epoch": 0.38266666666666665, "grad_norm": 187.3562774658203, "learning_rate": 1.4155935550257114e-07, "logits/chosen": 2.842759847640991, "logits/rejected": 2.7661006450653076, "logps/chosen": -42.63380432128906, "logps/rejected": -22.636817932128906, "loss": 3.3909, "nll_loss": 0.5329225659370422, "rewards/accuracies": 0.0, "rewards/chosen": 1.061838150024414, "rewards/margins": -2.389402151107788, "rewards/rejected": 3.451240301132202, "step": 2296 }, { "epoch": 0.38283333333333336, "grad_norm": 75.43521881103516, "learning_rate": 1.415102525950979e-07, "logits/chosen": 2.43845534324646, "logits/rejected": 2.6395809650421143, "logps/chosen": -21.344932556152344, "logps/rejected": -206.99334716796875, "loss": 0.6718, "nll_loss": 0.508212685585022, "rewards/accuracies": 1.0, "rewards/chosen": 1.2447090148925781, "rewards/margins": 2.657529592514038, "rewards/rejected": -1.41282057762146, "step": 2297 }, { "epoch": 0.383, "grad_norm": 30.351646423339844, "learning_rate": 1.414611375925336e-07, "logits/chosen": 2.858745574951172, "logits/rejected": 2.873730182647705, "logps/chosen": -65.3330307006836, "logps/rejected": -59.441192626953125, "loss": 0.9936, "nll_loss": 0.9607798457145691, "rewards/accuracies": 1.0, "rewards/chosen": 1.2675689458847046, "rewards/margins": 5.6311540603637695, "rewards/rejected": -4.363584995269775, "step": 2298 }, { "epoch": 0.38316666666666666, "grad_norm": 18.082843780517578, "learning_rate": 1.414120105091892e-07, "logits/chosen": 1.708759069442749, "logits/rejected": 1.9169015884399414, "logps/chosen": -160.6439208984375, "logps/rejected": -249.35076904296875, "loss": 0.8939, "nll_loss": 0.8875354528427124, "rewards/accuracies": 1.0, "rewards/chosen": 2.6655168533325195, "rewards/margins": 11.662944793701172, "rewards/rejected": -8.997427940368652, "step": 2299 }, { "epoch": 0.38333333333333336, "grad_norm": 17.925159454345703, "learning_rate": 1.4136287135937914e-07, "logits/chosen": 2.2355575561523438, "logits/rejected": 2.3788726329803467, "logps/chosen": -107.33560180664062, "logps/rejected": -197.06344604492188, "loss": 0.7497, "nll_loss": 0.7453860640525818, "rewards/accuracies": 1.0, "rewards/chosen": 3.090808153152466, "rewards/margins": 11.189834594726562, "rewards/rejected": -8.099026679992676, "step": 2300 }, { "epoch": 0.3835, "grad_norm": 60.04051208496094, "learning_rate": 1.413137201574214e-07, "logits/chosen": 2.1179449558258057, "logits/rejected": 2.5281591415405273, "logps/chosen": -46.02606964111328, "logps/rejected": -362.26165771484375, "loss": 1.5521, "nll_loss": 1.5342023372650146, "rewards/accuracies": 1.0, "rewards/chosen": 1.6803674697875977, "rewards/margins": 7.7236480712890625, "rewards/rejected": -6.043280601501465, "step": 2301 }, { "epoch": 0.38366666666666666, "grad_norm": 41.70201110839844, "learning_rate": 1.4126455691763745e-07, "logits/chosen": 2.6372158527374268, "logits/rejected": 2.565833330154419, "logps/chosen": -54.38521194458008, "logps/rejected": -117.7091064453125, "loss": 0.8414, "nll_loss": 0.7450029253959656, "rewards/accuracies": 1.0, "rewards/chosen": 1.3314968347549438, "rewards/margins": 3.417651653289795, "rewards/rejected": -2.0861549377441406, "step": 2302 }, { "epoch": 0.38383333333333336, "grad_norm": 24.477476119995117, "learning_rate": 1.4121538165435226e-07, "logits/chosen": 2.950516700744629, "logits/rejected": 3.019216299057007, "logps/chosen": -36.61623764038086, "logps/rejected": -140.24862670898438, "loss": 0.5426, "nll_loss": 0.5230890512466431, "rewards/accuracies": 1.0, "rewards/chosen": 1.8137249946594238, "rewards/margins": 6.544729232788086, "rewards/rejected": -4.731004238128662, "step": 2303 }, { "epoch": 0.384, "grad_norm": 26.928627014160156, "learning_rate": 1.4116619438189438e-07, "logits/chosen": 2.4503700733184814, "logits/rejected": 2.535234212875366, "logps/chosen": -73.89143371582031, "logps/rejected": -143.71783447265625, "loss": 0.7923, "nll_loss": 0.7617672681808472, "rewards/accuracies": 1.0, "rewards/chosen": 1.0729316473007202, "rewards/margins": 7.492641925811768, "rewards/rejected": -6.419710159301758, "step": 2304 }, { "epoch": 0.38416666666666666, "grad_norm": 96.18873596191406, "learning_rate": 1.4111699511459576e-07, "logits/chosen": 3.1570658683776855, "logits/rejected": 2.856116533279419, "logps/chosen": -214.800537109375, "logps/rejected": -54.36317825317383, "loss": 1.2226, "nll_loss": 0.7839435338973999, "rewards/accuracies": 1.0, "rewards/chosen": 1.5503143072128296, "rewards/margins": 1.4248321056365967, "rewards/rejected": 0.12548218667507172, "step": 2305 }, { "epoch": 0.38433333333333336, "grad_norm": 36.06521224975586, "learning_rate": 1.4106778386679188e-07, "logits/chosen": 2.388944625854492, "logits/rejected": 2.8120319843292236, "logps/chosen": -103.32694244384766, "logps/rejected": -359.58538818359375, "loss": 1.0909, "nll_loss": 1.0543566942214966, "rewards/accuracies": 1.0, "rewards/chosen": 0.8481025695800781, "rewards/margins": 8.455842018127441, "rewards/rejected": -7.607739448547363, "step": 2306 }, { "epoch": 0.3845, "grad_norm": 35.99397659301758, "learning_rate": 1.410185606528217e-07, "logits/chosen": 1.4832738637924194, "logits/rejected": 2.4001779556274414, "logps/chosen": -21.861520767211914, "logps/rejected": -183.906982421875, "loss": 0.6051, "nll_loss": 0.5908518433570862, "rewards/accuracies": 1.0, "rewards/chosen": 2.0222368240356445, "rewards/margins": 7.462889194488525, "rewards/rejected": -5.440652370452881, "step": 2307 }, { "epoch": 0.38466666666666666, "grad_norm": 39.43782043457031, "learning_rate": 1.4096932548702776e-07, "logits/chosen": 1.7607645988464355, "logits/rejected": 2.041904926300049, "logps/chosen": -41.029869079589844, "logps/rejected": -125.56845092773438, "loss": 0.6274, "nll_loss": 0.5861409902572632, "rewards/accuracies": 1.0, "rewards/chosen": 0.8833366632461548, "rewards/margins": 5.672724723815918, "rewards/rejected": -4.789388179779053, "step": 2308 }, { "epoch": 0.38483333333333336, "grad_norm": 34.315242767333984, "learning_rate": 1.409200783837559e-07, "logits/chosen": 2.912161111831665, "logits/rejected": 2.8588685989379883, "logps/chosen": -5.982897758483887, "logps/rejected": -44.947052001953125, "loss": 0.3175, "nll_loss": 0.24928738176822662, "rewards/accuracies": 1.0, "rewards/chosen": 0.9775339961051941, "rewards/margins": 3.953625202178955, "rewards/rejected": -2.976091146469116, "step": 2309 }, { "epoch": 0.385, "grad_norm": 33.54550552368164, "learning_rate": 1.4087081935735563e-07, "logits/chosen": 2.4774081707000732, "logits/rejected": 2.6924123764038086, "logps/chosen": -54.789913177490234, "logps/rejected": -147.13133239746094, "loss": 0.7467, "nll_loss": 0.6935432553291321, "rewards/accuracies": 1.0, "rewards/chosen": 0.4461650848388672, "rewards/margins": 6.971747875213623, "rewards/rejected": -6.525582790374756, "step": 2310 }, { "epoch": 0.38516666666666666, "grad_norm": 51.321590423583984, "learning_rate": 1.408215484221798e-07, "logits/chosen": 2.684072494506836, "logits/rejected": 2.8691020011901855, "logps/chosen": -36.434913635253906, "logps/rejected": -48.27798843383789, "loss": 0.5747, "nll_loss": 0.5280422568321228, "rewards/accuracies": 1.0, "rewards/chosen": 1.1439812183380127, "rewards/margins": 4.700687885284424, "rewards/rejected": -3.556706666946411, "step": 2311 }, { "epoch": 0.38533333333333336, "grad_norm": 23.601329803466797, "learning_rate": 1.407722655925848e-07, "logits/chosen": 0.9105783700942993, "logits/rejected": 2.1973164081573486, "logps/chosen": -75.76751708984375, "logps/rejected": -394.34222412109375, "loss": 0.8386, "nll_loss": 0.8235600590705872, "rewards/accuracies": 1.0, "rewards/chosen": 1.9433960914611816, "rewards/margins": 7.425745010375977, "rewards/rejected": -5.482348918914795, "step": 2312 }, { "epoch": 0.3855, "grad_norm": 17.489139556884766, "learning_rate": 1.4072297088293042e-07, "logits/chosen": 3.011880397796631, "logits/rejected": 2.8930821418762207, "logps/chosen": -146.9460906982422, "logps/rejected": -50.819976806640625, "loss": 0.7224, "nll_loss": 0.6803058981895447, "rewards/accuracies": 1.0, "rewards/chosen": 3.534306287765503, "rewards/margins": 5.894540309906006, "rewards/rejected": -2.360234022140503, "step": 2313 }, { "epoch": 0.38566666666666666, "grad_norm": 28.8328914642334, "learning_rate": 1.4067366430758004e-07, "logits/chosen": 3.0631895065307617, "logits/rejected": 3.1343142986297607, "logps/chosen": -57.694183349609375, "logps/rejected": -228.2822265625, "loss": 0.805, "nll_loss": 0.7796510457992554, "rewards/accuracies": 1.0, "rewards/chosen": 1.3589050769805908, "rewards/margins": 6.734893798828125, "rewards/rejected": -5.375988960266113, "step": 2314 }, { "epoch": 0.3858333333333333, "grad_norm": 19.206676483154297, "learning_rate": 1.406243458809003e-07, "logits/chosen": 2.0525009632110596, "logits/rejected": 1.8289835453033447, "logps/chosen": -150.4629669189453, "logps/rejected": -203.00637817382812, "loss": 0.7892, "nll_loss": 0.7716050148010254, "rewards/accuracies": 1.0, "rewards/chosen": 2.033482313156128, "rewards/margins": 6.537350654602051, "rewards/rejected": -4.503868103027344, "step": 2315 }, { "epoch": 0.386, "grad_norm": 36.127845764160156, "learning_rate": 1.4057501561726155e-07, "logits/chosen": 2.888343095779419, "logits/rejected": 2.9035568237304688, "logps/chosen": -102.32952880859375, "logps/rejected": -198.3540496826172, "loss": 1.1814, "nll_loss": 1.1628355979919434, "rewards/accuracies": 1.0, "rewards/chosen": 1.587692379951477, "rewards/margins": 8.621052742004395, "rewards/rejected": -7.033360481262207, "step": 2316 }, { "epoch": 0.38616666666666666, "grad_norm": 33.85161209106445, "learning_rate": 1.405256735310373e-07, "logits/chosen": 3.1493873596191406, "logits/rejected": 3.034527540206909, "logps/chosen": -73.96554565429688, "logps/rejected": -159.92068481445312, "loss": 0.8666, "nll_loss": 0.8039733171463013, "rewards/accuracies": 1.0, "rewards/chosen": 0.9683815240859985, "rewards/margins": 4.130718231201172, "rewards/rejected": -3.162336826324463, "step": 2317 }, { "epoch": 0.3863333333333333, "grad_norm": 32.108577728271484, "learning_rate": 1.4047631963660472e-07, "logits/chosen": 2.6798524856567383, "logits/rejected": 2.5476553440093994, "logps/chosen": -72.08541870117188, "logps/rejected": -37.92131805419922, "loss": 0.7295, "nll_loss": 0.6553220152854919, "rewards/accuracies": 1.0, "rewards/chosen": 1.6734970808029175, "rewards/margins": 3.9101247787475586, "rewards/rejected": -2.2366275787353516, "step": 2318 }, { "epoch": 0.3865, "grad_norm": 32.37987518310547, "learning_rate": 1.4042695394834434e-07, "logits/chosen": 2.2247138023376465, "logits/rejected": 2.547393798828125, "logps/chosen": -50.08467102050781, "logps/rejected": -322.2043762207031, "loss": 0.8874, "nll_loss": 0.8635287880897522, "rewards/accuracies": 1.0, "rewards/chosen": 1.319963812828064, "rewards/margins": 8.193633079528809, "rewards/rejected": -6.873669624328613, "step": 2319 }, { "epoch": 0.38666666666666666, "grad_norm": 81.19430541992188, "learning_rate": 1.4037757648064018e-07, "logits/chosen": 0.9324986934661865, "logits/rejected": 1.7950557470321655, "logps/chosen": -14.440601348876953, "logps/rejected": -219.90103149414062, "loss": 0.981, "nll_loss": 0.9627067446708679, "rewards/accuracies": 1.0, "rewards/chosen": 1.5720577239990234, "rewards/margins": 10.623281478881836, "rewards/rejected": -9.051223754882812, "step": 2320 }, { "epoch": 0.3868333333333333, "grad_norm": 27.10074806213379, "learning_rate": 1.4032818724787953e-07, "logits/chosen": 0.9818899631500244, "logits/rejected": 1.7902787923812866, "logps/chosen": -63.47785186767578, "logps/rejected": -279.1513977050781, "loss": 0.7774, "nll_loss": 0.7556886672973633, "rewards/accuracies": 1.0, "rewards/chosen": 1.3868964910507202, "rewards/margins": 11.445294380187988, "rewards/rejected": -10.058398246765137, "step": 2321 }, { "epoch": 0.387, "grad_norm": 42.7553825378418, "learning_rate": 1.4027878626445337e-07, "logits/chosen": 2.7885513305664062, "logits/rejected": 2.974630355834961, "logps/chosen": -20.873416900634766, "logps/rejected": -94.32551574707031, "loss": 0.7925, "nll_loss": 0.7730895280838013, "rewards/accuracies": 1.0, "rewards/chosen": 3.2669453620910645, "rewards/margins": 6.6069769859313965, "rewards/rejected": -3.340031623840332, "step": 2322 }, { "epoch": 0.38716666666666666, "grad_norm": 31.042922973632812, "learning_rate": 1.4022937354475585e-07, "logits/chosen": 2.17901349067688, "logits/rejected": 2.8251688480377197, "logps/chosen": -52.72754669189453, "logps/rejected": -213.33316040039062, "loss": 0.7802, "nll_loss": 0.7426414489746094, "rewards/accuracies": 1.0, "rewards/chosen": 0.8128654360771179, "rewards/margins": 8.216782569885254, "rewards/rejected": -7.403916835784912, "step": 2323 }, { "epoch": 0.3873333333333333, "grad_norm": 33.24462127685547, "learning_rate": 1.4017994910318474e-07, "logits/chosen": 0.5866440534591675, "logits/rejected": 2.4942619800567627, "logps/chosen": -65.89900970458984, "logps/rejected": -412.357177734375, "loss": 0.9531, "nll_loss": 0.9152640104293823, "rewards/accuracies": 1.0, "rewards/chosen": 0.9378105401992798, "rewards/margins": 5.989923000335693, "rewards/rejected": -5.052112579345703, "step": 2324 }, { "epoch": 0.3875, "grad_norm": 38.18219757080078, "learning_rate": 1.4013051295414106e-07, "logits/chosen": 2.256943941116333, "logits/rejected": 2.319084882736206, "logps/chosen": -11.89185905456543, "logps/rejected": -145.75372314453125, "loss": 0.4673, "nll_loss": 0.45737919211387634, "rewards/accuracies": 1.0, "rewards/chosen": 2.2440526485443115, "rewards/margins": 9.242390632629395, "rewards/rejected": -6.998338222503662, "step": 2325 }, { "epoch": 0.38766666666666666, "grad_norm": 39.4891357421875, "learning_rate": 1.4008106511202934e-07, "logits/chosen": 2.4241952896118164, "logits/rejected": 2.8317906856536865, "logps/chosen": -27.06580924987793, "logps/rejected": -168.08004760742188, "loss": 0.8181, "nll_loss": 0.7733086943626404, "rewards/accuracies": 1.0, "rewards/chosen": 0.7662191390991211, "rewards/margins": 5.64178991317749, "rewards/rejected": -4.875570774078369, "step": 2326 }, { "epoch": 0.3878333333333333, "grad_norm": 34.11384963989258, "learning_rate": 1.4003160559125752e-07, "logits/chosen": 2.0159480571746826, "logits/rejected": 2.3487086296081543, "logps/chosen": -31.285371780395508, "logps/rejected": -102.46854400634766, "loss": 0.6751, "nll_loss": 0.6257074475288391, "rewards/accuracies": 1.0, "rewards/chosen": 0.8237596750259399, "rewards/margins": 4.933558940887451, "rewards/rejected": -4.109799385070801, "step": 2327 }, { "epoch": 0.388, "grad_norm": 22.99995231628418, "learning_rate": 1.399821344062369e-07, "logits/chosen": 2.8900911808013916, "logits/rejected": 3.216836452484131, "logps/chosen": -61.76622009277344, "logps/rejected": -262.79693603515625, "loss": 0.6956, "nll_loss": 0.678749680519104, "rewards/accuracies": 1.0, "rewards/chosen": 1.6603264808654785, "rewards/margins": 9.878751754760742, "rewards/rejected": -8.218424797058105, "step": 2328 }, { "epoch": 0.38816666666666666, "grad_norm": 48.47675323486328, "learning_rate": 1.3993265157138219e-07, "logits/chosen": 2.6218769550323486, "logits/rejected": 2.767646074295044, "logps/chosen": -54.23082733154297, "logps/rejected": -206.63104248046875, "loss": 1.4319, "nll_loss": 1.4271268844604492, "rewards/accuracies": 1.0, "rewards/chosen": 2.943429708480835, "rewards/margins": 12.123449325561523, "rewards/rejected": -9.18001937866211, "step": 2329 }, { "epoch": 0.3883333333333333, "grad_norm": 64.89881896972656, "learning_rate": 1.3988315710111148e-07, "logits/chosen": 2.485257863998413, "logits/rejected": 2.528768539428711, "logps/chosen": -60.48234939575195, "logps/rejected": -68.57789611816406, "loss": 0.985, "nll_loss": 0.7375895977020264, "rewards/accuracies": 1.0, "rewards/chosen": 0.9607830047607422, "rewards/margins": 2.0052661895751953, "rewards/rejected": -1.0444831848144531, "step": 2330 }, { "epoch": 0.3885, "grad_norm": 120.8839111328125, "learning_rate": 1.3983365100984632e-07, "logits/chosen": 2.494206428527832, "logits/rejected": 2.5479581356048584, "logps/chosen": -23.110233306884766, "logps/rejected": -69.62551879882812, "loss": 1.8024, "nll_loss": 1.5406821966171265, "rewards/accuracies": 1.0, "rewards/chosen": 0.11525917053222656, "rewards/margins": 1.7832584381103516, "rewards/rejected": -1.667999267578125, "step": 2331 }, { "epoch": 0.38866666666666666, "grad_norm": 23.84625816345215, "learning_rate": 1.3978413331201156e-07, "logits/chosen": 1.9152015447616577, "logits/rejected": 1.720725178718567, "logps/chosen": -44.41631317138672, "logps/rejected": -151.07110595703125, "loss": 0.5226, "nll_loss": 0.4880913496017456, "rewards/accuracies": 1.0, "rewards/chosen": 0.985759437084198, "rewards/margins": 6.491261005401611, "rewards/rejected": -5.505501747131348, "step": 2332 }, { "epoch": 0.3888333333333333, "grad_norm": 33.96881103515625, "learning_rate": 1.3973460402203548e-07, "logits/chosen": 3.391608953475952, "logits/rejected": 3.461961030960083, "logps/chosen": -30.62413787841797, "logps/rejected": -159.09986877441406, "loss": 0.7601, "nll_loss": 0.7469302415847778, "rewards/accuracies": 1.0, "rewards/chosen": 2.287320375442505, "rewards/margins": 7.131745338439941, "rewards/rejected": -4.844425201416016, "step": 2333 }, { "epoch": 0.389, "grad_norm": 72.27549743652344, "learning_rate": 1.3968506315434972e-07, "logits/chosen": 2.51387619972229, "logits/rejected": 3.0123291015625, "logps/chosen": -67.30467224121094, "logps/rejected": -270.3762512207031, "loss": 2.0375, "nll_loss": 1.9795494079589844, "rewards/accuracies": 1.0, "rewards/chosen": 0.31002044677734375, "rewards/margins": 8.669211387634277, "rewards/rejected": -8.359190940856934, "step": 2334 }, { "epoch": 0.38916666666666666, "grad_norm": 53.226131439208984, "learning_rate": 1.3963551072338931e-07, "logits/chosen": 0.8568979501724243, "logits/rejected": 2.5845212936401367, "logps/chosen": -46.73772430419922, "logps/rejected": -460.82427978515625, "loss": 1.3844, "nll_loss": 1.3746387958526611, "rewards/accuracies": 1.0, "rewards/chosen": 2.223522901535034, "rewards/margins": 10.264871597290039, "rewards/rejected": -8.041348457336426, "step": 2335 }, { "epoch": 0.3893333333333333, "grad_norm": 26.154296875, "learning_rate": 1.3958594674359263e-07, "logits/chosen": 1.2204700708389282, "logits/rejected": 1.7822625637054443, "logps/chosen": -39.949493408203125, "logps/rejected": -173.05577087402344, "loss": 0.5607, "nll_loss": 0.5398579835891724, "rewards/accuracies": 1.0, "rewards/chosen": 1.4464695453643799, "rewards/margins": 8.64885425567627, "rewards/rejected": -7.202384948730469, "step": 2336 }, { "epoch": 0.3895, "grad_norm": 23.872085571289062, "learning_rate": 1.3953637122940144e-07, "logits/chosen": 1.921005129814148, "logits/rejected": 1.6648404598236084, "logps/chosen": -69.57149505615234, "logps/rejected": -97.8259506225586, "loss": 0.8015, "nll_loss": 0.7905852794647217, "rewards/accuracies": 1.0, "rewards/chosen": 2.1006417274475098, "rewards/margins": 10.451057434082031, "rewards/rejected": -8.35041618347168, "step": 2337 }, { "epoch": 0.38966666666666666, "grad_norm": 27.35713768005371, "learning_rate": 1.3948678419526086e-07, "logits/chosen": 2.511951208114624, "logits/rejected": 2.622044563293457, "logps/chosen": -75.16941833496094, "logps/rejected": -258.03375244140625, "loss": 0.8476, "nll_loss": 0.8260374665260315, "rewards/accuracies": 1.0, "rewards/chosen": 1.3842308521270752, "rewards/margins": 13.090564727783203, "rewards/rejected": -11.706334114074707, "step": 2338 }, { "epoch": 0.3898333333333333, "grad_norm": 32.412147521972656, "learning_rate": 1.3943718565561935e-07, "logits/chosen": 1.1961467266082764, "logits/rejected": 1.2419549226760864, "logps/chosen": -58.960533142089844, "logps/rejected": -118.19425201416016, "loss": 0.7197, "nll_loss": 0.6272397041320801, "rewards/accuracies": 1.0, "rewards/chosen": 0.7824456095695496, "rewards/margins": 3.4261844158172607, "rewards/rejected": -2.6437387466430664, "step": 2339 }, { "epoch": 0.39, "grad_norm": 31.394411087036133, "learning_rate": 1.393875756249287e-07, "logits/chosen": 2.4566586017608643, "logits/rejected": 2.4561972618103027, "logps/chosen": -81.7374267578125, "logps/rejected": -129.8654022216797, "loss": 0.99, "nll_loss": 0.9616168141365051, "rewards/accuracies": 1.0, "rewards/chosen": 1.1706024408340454, "rewards/margins": 6.987870693206787, "rewards/rejected": -5.817268371582031, "step": 2340 }, { "epoch": 0.39016666666666666, "grad_norm": 29.794925689697266, "learning_rate": 1.3933795411764419e-07, "logits/chosen": 2.779759645462036, "logits/rejected": 2.834440231323242, "logps/chosen": -62.38066864013672, "logps/rejected": -138.83181762695312, "loss": 0.9246, "nll_loss": 0.9040676355361938, "rewards/accuracies": 1.0, "rewards/chosen": 1.6697769165039062, "rewards/margins": 6.638099193572998, "rewards/rejected": -4.968322277069092, "step": 2341 }, { "epoch": 0.3903333333333333, "grad_norm": 107.6756820678711, "learning_rate": 1.3928832114822417e-07, "logits/chosen": 0.6875755786895752, "logits/rejected": 2.272942543029785, "logps/chosen": -23.476396560668945, "logps/rejected": -313.5097961425781, "loss": 1.4908, "nll_loss": 1.4672749042510986, "rewards/accuracies": 1.0, "rewards/chosen": 1.3581714630126953, "rewards/margins": 7.454865455627441, "rewards/rejected": -6.096693992614746, "step": 2342 }, { "epoch": 0.3905, "grad_norm": 21.261760711669922, "learning_rate": 1.3923867673113066e-07, "logits/chosen": 1.5099176168441772, "logits/rejected": 2.112731456756592, "logps/chosen": -157.6007080078125, "logps/rejected": -272.1527099609375, "loss": 0.9019, "nll_loss": 0.8954586386680603, "rewards/accuracies": 1.0, "rewards/chosen": 2.7470855712890625, "rewards/margins": 9.2616548538208, "rewards/rejected": -6.514569282531738, "step": 2343 }, { "epoch": 0.39066666666666666, "grad_norm": 198.87953186035156, "learning_rate": 1.3918902088082875e-07, "logits/chosen": 2.907839298248291, "logits/rejected": 2.879096031188965, "logps/chosen": -59.29688262939453, "logps/rejected": -44.577110290527344, "loss": 3.7386, "nll_loss": 3.1208884716033936, "rewards/accuracies": 1.0, "rewards/chosen": -1.9733966588974, "rewards/margins": 0.7810484170913696, "rewards/rejected": -2.7544450759887695, "step": 2344 }, { "epoch": 0.3908333333333333, "grad_norm": 22.62847900390625, "learning_rate": 1.3913935361178704e-07, "logits/chosen": 1.3768442869186401, "logits/rejected": 2.1106486320495605, "logps/chosen": -94.50135040283203, "logps/rejected": -276.279296875, "loss": 0.7836, "nll_loss": 0.7683037519454956, "rewards/accuracies": 1.0, "rewards/chosen": 1.7426445484161377, "rewards/margins": 11.625649452209473, "rewards/rejected": -9.883005142211914, "step": 2345 }, { "epoch": 0.391, "grad_norm": 24.891897201538086, "learning_rate": 1.3908967493847727e-07, "logits/chosen": 3.0187108516693115, "logits/rejected": 3.0948596000671387, "logps/chosen": -77.3667984008789, "logps/rejected": -223.05108642578125, "loss": 0.7997, "nll_loss": 0.7814827561378479, "rewards/accuracies": 1.0, "rewards/chosen": 1.7388123273849487, "rewards/margins": 7.011551380157471, "rewards/rejected": -5.272738933563232, "step": 2346 }, { "epoch": 0.39116666666666666, "grad_norm": 66.64125061035156, "learning_rate": 1.3903998487537472e-07, "logits/chosen": 2.2585856914520264, "logits/rejected": 2.6507351398468018, "logps/chosen": -10.997686386108398, "logps/rejected": -311.3910827636719, "loss": 0.7803, "nll_loss": 0.7331792116165161, "rewards/accuracies": 1.0, "rewards/chosen": 0.64191073179245, "rewards/margins": 5.9056830406188965, "rewards/rejected": -5.263772487640381, "step": 2347 }, { "epoch": 0.3913333333333333, "grad_norm": 45.837623596191406, "learning_rate": 1.389902834369578e-07, "logits/chosen": 1.542733073234558, "logits/rejected": 2.3375542163848877, "logps/chosen": -41.96807098388672, "logps/rejected": -267.08880615234375, "loss": 1.2112, "nll_loss": 1.1990877389907837, "rewards/accuracies": 1.0, "rewards/chosen": 2.0447781085968018, "rewards/margins": 8.575325012207031, "rewards/rejected": -6.53054666519165, "step": 2348 }, { "epoch": 0.3915, "grad_norm": 45.90674591064453, "learning_rate": 1.389405706377084e-07, "logits/chosen": 2.7180471420288086, "logits/rejected": 2.700871467590332, "logps/chosen": -130.50067138671875, "logps/rejected": -106.72417449951172, "loss": 1.1537, "nll_loss": 1.0966442823410034, "rewards/accuracies": 1.0, "rewards/chosen": 0.9464325904846191, "rewards/margins": 4.336915969848633, "rewards/rejected": -3.3904833793640137, "step": 2349 }, { "epoch": 0.39166666666666666, "grad_norm": 29.658239364624023, "learning_rate": 1.3889084649211155e-07, "logits/chosen": 2.935727119445801, "logits/rejected": 2.9163222312927246, "logps/chosen": -25.099367141723633, "logps/rejected": -131.9754638671875, "loss": 0.5396, "nll_loss": 0.5122318863868713, "rewards/accuracies": 1.0, "rewards/chosen": 1.3203309774398804, "rewards/margins": 6.275334358215332, "rewards/rejected": -4.955003261566162, "step": 2350 }, { "epoch": 0.3918333333333333, "grad_norm": 30.50078773498535, "learning_rate": 1.3884111101465573e-07, "logits/chosen": 3.2217071056365967, "logits/rejected": 3.2982828617095947, "logps/chosen": -34.8008918762207, "logps/rejected": -135.74603271484375, "loss": 0.6742, "nll_loss": 0.6444609761238098, "rewards/accuracies": 1.0, "rewards/chosen": 1.2573566436767578, "rewards/margins": 6.039739608764648, "rewards/rejected": -4.782382965087891, "step": 2351 }, { "epoch": 0.392, "grad_norm": 25.42485809326172, "learning_rate": 1.3879136421983264e-07, "logits/chosen": 1.9791593551635742, "logits/rejected": 2.303927421569824, "logps/chosen": -136.06121826171875, "logps/rejected": -228.55487060546875, "loss": 1.1333, "nll_loss": 1.1244728565216064, "rewards/accuracies": 1.0, "rewards/chosen": 2.3531951904296875, "rewards/margins": 9.504196166992188, "rewards/rejected": -7.1510009765625, "step": 2352 }, { "epoch": 0.39216666666666666, "grad_norm": 21.543733596801758, "learning_rate": 1.3874160612213732e-07, "logits/chosen": 2.511091709136963, "logits/rejected": 2.5509109497070312, "logps/chosen": -10.84021282196045, "logps/rejected": -72.82606506347656, "loss": 0.2558, "nll_loss": 0.24089360237121582, "rewards/accuracies": 1.0, "rewards/chosen": 2.6372182369232178, "rewards/margins": 6.695590019226074, "rewards/rejected": -4.0583720207214355, "step": 2353 }, { "epoch": 0.3923333333333333, "grad_norm": 22.888935089111328, "learning_rate": 1.3869183673606805e-07, "logits/chosen": 3.0314834117889404, "logits/rejected": 3.2482283115386963, "logps/chosen": -162.936279296875, "logps/rejected": -483.29888916015625, "loss": 0.995, "nll_loss": 0.9815438389778137, "rewards/accuracies": 1.0, "rewards/chosen": 1.924453854560852, "rewards/margins": 8.543967247009277, "rewards/rejected": -6.619513034820557, "step": 2354 }, { "epoch": 0.3925, "grad_norm": 34.19163131713867, "learning_rate": 1.3864205607612647e-07, "logits/chosen": 2.208629608154297, "logits/rejected": 2.157909393310547, "logps/chosen": -104.58160400390625, "logps/rejected": -72.7125244140625, "loss": 1.0781, "nll_loss": 1.0354613065719604, "rewards/accuracies": 1.0, "rewards/chosen": 1.2782082557678223, "rewards/margins": 4.812424659729004, "rewards/rejected": -3.5342166423797607, "step": 2355 }, { "epoch": 0.39266666666666666, "grad_norm": 39.52253341674805, "learning_rate": 1.385922641568175e-07, "logits/chosen": 3.111238718032837, "logits/rejected": 3.0493361949920654, "logps/chosen": -24.575117111206055, "logps/rejected": -160.44972229003906, "loss": 0.7384, "nll_loss": 0.7227976322174072, "rewards/accuracies": 1.0, "rewards/chosen": 1.8835406303405762, "rewards/margins": 7.372256278991699, "rewards/rejected": -5.488715648651123, "step": 2356 }, { "epoch": 0.3928333333333333, "grad_norm": 39.32695007324219, "learning_rate": 1.3854246099264919e-07, "logits/chosen": 2.160090208053589, "logits/rejected": 2.443635940551758, "logps/chosen": -54.64766311645508, "logps/rejected": -269.4560241699219, "loss": 0.7851, "nll_loss": 0.7485980987548828, "rewards/accuracies": 1.0, "rewards/chosen": 0.8079906702041626, "rewards/margins": 11.18256950378418, "rewards/rejected": -10.374578475952148, "step": 2357 }, { "epoch": 0.393, "grad_norm": 23.65723991394043, "learning_rate": 1.3849264659813312e-07, "logits/chosen": 2.635864019393921, "logits/rejected": 2.751739501953125, "logps/chosen": -111.86741638183594, "logps/rejected": -296.82244873046875, "loss": 0.9491, "nll_loss": 0.9400622844696045, "rewards/accuracies": 1.0, "rewards/chosen": 2.275499105453491, "rewards/margins": 12.039974212646484, "rewards/rejected": -9.764474868774414, "step": 2358 }, { "epoch": 0.39316666666666666, "grad_norm": 24.527143478393555, "learning_rate": 1.3844282098778394e-07, "logits/chosen": 2.5381875038146973, "logits/rejected": 2.635326862335205, "logps/chosen": -86.1968002319336, "logps/rejected": -183.89915466308594, "loss": 0.9022, "nll_loss": 0.8886268138885498, "rewards/accuracies": 1.0, "rewards/chosen": 2.2725563049316406, "rewards/margins": 7.014634132385254, "rewards/rejected": -4.742077827453613, "step": 2359 }, { "epoch": 0.3933333333333333, "grad_norm": 27.120018005371094, "learning_rate": 1.3839298417611963e-07, "logits/chosen": 2.2221477031707764, "logits/rejected": 2.5847883224487305, "logps/chosen": -29.998065948486328, "logps/rejected": -101.02088165283203, "loss": 0.5266, "nll_loss": 0.4999677538871765, "rewards/accuracies": 1.0, "rewards/chosen": 2.1191234588623047, "rewards/margins": 5.606497764587402, "rewards/rejected": -3.4873743057250977, "step": 2360 }, { "epoch": 0.3935, "grad_norm": 22.765180587768555, "learning_rate": 1.3834313617766146e-07, "logits/chosen": 2.7122530937194824, "logits/rejected": 2.9272520542144775, "logps/chosen": -93.57765197753906, "logps/rejected": -284.79364013671875, "loss": 0.9218, "nll_loss": 0.9174279570579529, "rewards/accuracies": 1.0, "rewards/chosen": 3.2441158294677734, "rewards/margins": 9.52392578125, "rewards/rejected": -6.279809474945068, "step": 2361 }, { "epoch": 0.39366666666666666, "grad_norm": 20.425769805908203, "learning_rate": 1.3829327700693395e-07, "logits/chosen": 2.5186214447021484, "logits/rejected": 2.5770699977874756, "logps/chosen": -26.431137084960938, "logps/rejected": -209.24005126953125, "loss": 0.4137, "nll_loss": 0.40663281083106995, "rewards/accuracies": 1.0, "rewards/chosen": 2.5301382541656494, "rewards/margins": 11.852983474731445, "rewards/rejected": -9.322845458984375, "step": 2362 }, { "epoch": 0.3938333333333333, "grad_norm": 20.138031005859375, "learning_rate": 1.3824340667846483e-07, "logits/chosen": 2.715582847595215, "logits/rejected": 2.913539171218872, "logps/chosen": -13.601491928100586, "logps/rejected": -238.30792236328125, "loss": 0.3224, "nll_loss": 0.30225542187690735, "rewards/accuracies": 1.0, "rewards/chosen": 3.1022467613220215, "rewards/margins": 6.463769912719727, "rewards/rejected": -3.361523389816284, "step": 2363 }, { "epoch": 0.394, "grad_norm": 22.375307083129883, "learning_rate": 1.3819352520678517e-07, "logits/chosen": 2.6805830001831055, "logits/rejected": 3.0017337799072266, "logps/chosen": -19.115596771240234, "logps/rejected": -95.217529296875, "loss": 0.3799, "nll_loss": 0.3606715798377991, "rewards/accuracies": 1.0, "rewards/chosen": 3.7693848609924316, "rewards/margins": 7.0252180099487305, "rewards/rejected": -3.255833387374878, "step": 2364 }, { "epoch": 0.39416666666666667, "grad_norm": 24.421052932739258, "learning_rate": 1.3814363260642917e-07, "logits/chosen": 2.7001473903656006, "logits/rejected": 2.6287124156951904, "logps/chosen": -19.936664581298828, "logps/rejected": -129.34439086914062, "loss": 0.449, "nll_loss": 0.4153471887111664, "rewards/accuracies": 1.0, "rewards/chosen": 1.3038837909698486, "rewards/margins": 5.4167680740356445, "rewards/rejected": -4.112884044647217, "step": 2365 }, { "epoch": 0.3943333333333333, "grad_norm": 41.61924362182617, "learning_rate": 1.3809372889193441e-07, "logits/chosen": 2.847259759902954, "logits/rejected": 3.182565450668335, "logps/chosen": -58.185211181640625, "logps/rejected": -117.96488952636719, "loss": 0.9018, "nll_loss": 0.8815940618515015, "rewards/accuracies": 1.0, "rewards/chosen": 1.5687189102172852, "rewards/margins": 7.108192443847656, "rewards/rejected": -5.539473533630371, "step": 2366 }, { "epoch": 0.3945, "grad_norm": 25.118913650512695, "learning_rate": 1.380438140778416e-07, "logits/chosen": 2.4869630336761475, "logits/rejected": 2.593688488006592, "logps/chosen": -15.680384635925293, "logps/rejected": -263.8089599609375, "loss": 0.3537, "nll_loss": 0.34845298528671265, "rewards/accuracies": 1.0, "rewards/chosen": 2.830894947052002, "rewards/margins": 11.497610092163086, "rewards/rejected": -8.666714668273926, "step": 2367 }, { "epoch": 0.39466666666666667, "grad_norm": 48.743473052978516, "learning_rate": 1.3799388817869465e-07, "logits/chosen": 2.539438009262085, "logits/rejected": 2.6376376152038574, "logps/chosen": -45.896766662597656, "logps/rejected": -91.24884033203125, "loss": 0.8631, "nll_loss": 0.7171370983123779, "rewards/accuracies": 1.0, "rewards/chosen": 1.1679203510284424, "rewards/margins": 2.798316478729248, "rewards/rejected": -1.6303962469100952, "step": 2368 }, { "epoch": 0.3948333333333333, "grad_norm": 27.640853881835938, "learning_rate": 1.3794395120904086e-07, "logits/chosen": 3.94333553314209, "logits/rejected": 3.959148406982422, "logps/chosen": -70.13790893554688, "logps/rejected": -265.8945007324219, "loss": 0.9051, "nll_loss": 0.887821614742279, "rewards/accuracies": 1.0, "rewards/chosen": 1.6058701276779175, "rewards/margins": 10.511932373046875, "rewards/rejected": -8.906062126159668, "step": 2369 }, { "epoch": 0.395, "grad_norm": 110.42317962646484, "learning_rate": 1.3789400318343066e-07, "logits/chosen": 2.215512990951538, "logits/rejected": 2.4258878231048584, "logps/chosen": -187.20565795898438, "logps/rejected": -160.64857482910156, "loss": 1.4684, "nll_loss": 1.1627681255340576, "rewards/accuracies": 1.0, "rewards/chosen": -1.1017974615097046, "rewards/margins": 1.8797928094863892, "rewards/rejected": -2.9815902709960938, "step": 2370 }, { "epoch": 0.39516666666666667, "grad_norm": 21.749229431152344, "learning_rate": 1.378440441164177e-07, "logits/chosen": 1.2433828115463257, "logits/rejected": 1.9412102699279785, "logps/chosen": -29.55929946899414, "logps/rejected": -233.220458984375, "loss": 0.4249, "nll_loss": 0.42227569222450256, "rewards/accuracies": 1.0, "rewards/chosen": 3.759084701538086, "rewards/margins": 10.416424751281738, "rewards/rejected": -6.657340049743652, "step": 2371 }, { "epoch": 0.3953333333333333, "grad_norm": 40.32481002807617, "learning_rate": 1.377940740225588e-07, "logits/chosen": 2.728437662124634, "logits/rejected": 2.8187928199768066, "logps/chosen": -20.203876495361328, "logps/rejected": -272.7826232910156, "loss": 0.7116, "nll_loss": 0.6966853737831116, "rewards/accuracies": 1.0, "rewards/chosen": 2.014479398727417, "rewards/margins": 7.114546775817871, "rewards/rejected": -5.100067138671875, "step": 2372 }, { "epoch": 0.3955, "grad_norm": 17.409597396850586, "learning_rate": 1.3774409291641403e-07, "logits/chosen": 1.793528437614441, "logits/rejected": 1.9833307266235352, "logps/chosen": -19.4542179107666, "logps/rejected": -445.59417724609375, "loss": 0.3213, "nll_loss": 0.308797150850296, "rewards/accuracies": 1.0, "rewards/chosen": 1.931916356086731, "rewards/margins": 18.31981658935547, "rewards/rejected": -16.38789939880371, "step": 2373 }, { "epoch": 0.39566666666666667, "grad_norm": 33.023292541503906, "learning_rate": 1.3769410081254678e-07, "logits/chosen": 2.0709526538848877, "logits/rejected": 2.0716018676757812, "logps/chosen": -35.41067886352539, "logps/rejected": -36.86864471435547, "loss": 0.6865, "nll_loss": 0.621239960193634, "rewards/accuracies": 1.0, "rewards/chosen": 1.9101524353027344, "rewards/margins": 4.2083330154418945, "rewards/rejected": -2.2981808185577393, "step": 2374 }, { "epoch": 0.3958333333333333, "grad_norm": 99.38212585449219, "learning_rate": 1.3764409772552352e-07, "logits/chosen": 1.8494065999984741, "logits/rejected": 2.045910358428955, "logps/chosen": -70.08187866210938, "logps/rejected": -87.4144287109375, "loss": 1.3226, "nll_loss": 0.9221301078796387, "rewards/accuracies": 1.0, "rewards/chosen": 1.7012299299240112, "rewards/margins": 1.6430084705352783, "rewards/rejected": 0.05822144076228142, "step": 2375 }, { "epoch": 0.396, "grad_norm": 56.019161224365234, "learning_rate": 1.375940836699139e-07, "logits/chosen": 2.4570930004119873, "logits/rejected": 2.3822927474975586, "logps/chosen": -72.03289794921875, "logps/rejected": -96.57599639892578, "loss": 1.2265, "nll_loss": 1.1081985235214233, "rewards/accuracies": 1.0, "rewards/chosen": 0.5454277396202087, "rewards/margins": 3.0263497829437256, "rewards/rejected": -2.480921983718872, "step": 2376 }, { "epoch": 0.39616666666666667, "grad_norm": 172.33631896972656, "learning_rate": 1.3754405866029085e-07, "logits/chosen": 2.6070375442504883, "logits/rejected": 2.8015148639678955, "logps/chosen": -37.30038070678711, "logps/rejected": -131.69247436523438, "loss": 1.5886, "nll_loss": 0.761232316493988, "rewards/accuracies": 1.0, "rewards/chosen": 1.938092589378357, "rewards/margins": 0.5702961683273315, "rewards/rejected": 1.3677964210510254, "step": 2377 }, { "epoch": 0.3963333333333333, "grad_norm": 32.26909637451172, "learning_rate": 1.3749402271123048e-07, "logits/chosen": 3.108598470687866, "logits/rejected": 3.2513821125030518, "logps/chosen": -16.084516525268555, "logps/rejected": -182.34927368164062, "loss": 0.544, "nll_loss": 0.5188553929328918, "rewards/accuracies": 1.0, "rewards/chosen": 1.3233579397201538, "rewards/margins": 6.823556423187256, "rewards/rejected": -5.5001983642578125, "step": 2378 }, { "epoch": 0.3965, "grad_norm": 28.343502044677734, "learning_rate": 1.37443975837312e-07, "logits/chosen": 1.7591303586959839, "logits/rejected": 1.6781107187271118, "logps/chosen": -53.99916076660156, "logps/rejected": -65.4853744506836, "loss": 0.7257, "nll_loss": 0.6666563153266907, "rewards/accuracies": 1.0, "rewards/chosen": 2.1088624000549316, "rewards/margins": 4.446612358093262, "rewards/rejected": -2.33774995803833, "step": 2379 }, { "epoch": 0.39666666666666667, "grad_norm": 192.86907958984375, "learning_rate": 1.3739391805311793e-07, "logits/chosen": 2.6665518283843994, "logits/rejected": 2.7531750202178955, "logps/chosen": -42.637367248535156, "logps/rejected": -41.42667770385742, "loss": 2.9473, "nll_loss": 0.4737486243247986, "rewards/accuracies": 0.0, "rewards/chosen": 2.2427735328674316, "rewards/margins": -1.7213749885559082, "rewards/rejected": 3.96414852142334, "step": 2380 }, { "epoch": 0.3968333333333333, "grad_norm": 37.41273880004883, "learning_rate": 1.373438493732339e-07, "logits/chosen": 1.0781692266464233, "logits/rejected": 1.79653000831604, "logps/chosen": -54.00090789794922, "logps/rejected": -214.00518798828125, "loss": 0.737, "nll_loss": 0.6923192143440247, "rewards/accuracies": 1.0, "rewards/chosen": 0.7245746850967407, "rewards/margins": 5.754680156707764, "rewards/rejected": -5.0301055908203125, "step": 2381 }, { "epoch": 0.397, "grad_norm": 62.701011657714844, "learning_rate": 1.3729376981224866e-07, "logits/chosen": 2.40474271774292, "logits/rejected": 1.87879478931427, "logps/chosen": -52.33263397216797, "logps/rejected": -27.74089813232422, "loss": 1.0407, "nll_loss": 0.8051174283027649, "rewards/accuracies": 1.0, "rewards/chosen": 2.0647408962249756, "rewards/margins": 2.600844383239746, "rewards/rejected": -0.5361034274101257, "step": 2382 }, { "epoch": 0.39716666666666667, "grad_norm": 32.00558090209961, "learning_rate": 1.3724367938475427e-07, "logits/chosen": 3.5506186485290527, "logits/rejected": 3.5118958950042725, "logps/chosen": -121.00016021728516, "logps/rejected": -140.6911163330078, "loss": 0.7929, "nll_loss": 0.7707016468048096, "rewards/accuracies": 1.0, "rewards/chosen": 1.3927100896835327, "rewards/margins": 7.721293926239014, "rewards/rejected": -6.328583717346191, "step": 2383 }, { "epoch": 0.3973333333333333, "grad_norm": 51.98756790161133, "learning_rate": 1.371935781053458e-07, "logits/chosen": 2.5735228061676025, "logits/rejected": 2.475419282913208, "logps/chosen": -93.50682830810547, "logps/rejected": -49.669124603271484, "loss": 1.5845, "nll_loss": 1.5081747770309448, "rewards/accuracies": 1.0, "rewards/chosen": 1.25553297996521, "rewards/margins": 3.7518789768218994, "rewards/rejected": -2.4963459968566895, "step": 2384 }, { "epoch": 0.3975, "grad_norm": 32.435245513916016, "learning_rate": 1.3714346598862164e-07, "logits/chosen": 2.745009422302246, "logits/rejected": 2.885298252105713, "logps/chosen": -26.24588394165039, "logps/rejected": -271.07208251953125, "loss": 0.6126, "nll_loss": 0.5705627202987671, "rewards/accuracies": 1.0, "rewards/chosen": 1.0178158283233643, "rewards/margins": 5.115796089172363, "rewards/rejected": -4.09798002243042, "step": 2385 }, { "epoch": 0.39766666666666667, "grad_norm": 24.343671798706055, "learning_rate": 1.370933430491832e-07, "logits/chosen": 2.3910059928894043, "logits/rejected": 2.478717088699341, "logps/chosen": -152.6063232421875, "logps/rejected": -196.91403198242188, "loss": 1.0634, "nll_loss": 1.0452488660812378, "rewards/accuracies": 1.0, "rewards/chosen": 1.5903472900390625, "rewards/margins": 8.376974105834961, "rewards/rejected": -6.786627292633057, "step": 2386 }, { "epoch": 0.3978333333333333, "grad_norm": 31.054977416992188, "learning_rate": 1.3704320930163516e-07, "logits/chosen": 2.775580883026123, "logits/rejected": 2.7568752765655518, "logps/chosen": -71.00094604492188, "logps/rejected": -73.28972625732422, "loss": 0.7917, "nll_loss": 0.7634509801864624, "rewards/accuracies": 1.0, "rewards/chosen": 1.143060326576233, "rewards/margins": 7.185622215270996, "rewards/rejected": -6.042562007904053, "step": 2387 }, { "epoch": 0.398, "grad_norm": 25.061473846435547, "learning_rate": 1.369930647605852e-07, "logits/chosen": 1.0758390426635742, "logits/rejected": 1.4930914640426636, "logps/chosen": -78.56047058105469, "logps/rejected": -210.00668334960938, "loss": 0.837, "nll_loss": 0.8269522190093994, "rewards/accuracies": 1.0, "rewards/chosen": 2.3227920532226562, "rewards/margins": 8.153850555419922, "rewards/rejected": -5.831058025360107, "step": 2388 }, { "epoch": 0.39816666666666667, "grad_norm": 19.565372467041016, "learning_rate": 1.3694290944064433e-07, "logits/chosen": 2.6898386478424072, "logits/rejected": 2.8875205516815186, "logps/chosen": -57.34822463989258, "logps/rejected": -258.74456787109375, "loss": 0.6458, "nll_loss": 0.6372025012969971, "rewards/accuracies": 1.0, "rewards/chosen": 2.893587112426758, "rewards/margins": 7.759306907653809, "rewards/rejected": -4.865719795227051, "step": 2389 }, { "epoch": 0.3983333333333333, "grad_norm": 88.42778778076172, "learning_rate": 1.3689274335642652e-07, "logits/chosen": 2.264296293258667, "logits/rejected": 1.8280245065689087, "logps/chosen": -122.15929412841797, "logps/rejected": -158.01251220703125, "loss": 1.2767, "nll_loss": 1.0810558795928955, "rewards/accuracies": 1.0, "rewards/chosen": -0.04704361408948898, "rewards/margins": 2.266848087310791, "rewards/rejected": -2.313891649246216, "step": 2390 }, { "epoch": 0.3985, "grad_norm": 44.953800201416016, "learning_rate": 1.3684256652254904e-07, "logits/chosen": 2.2043159008026123, "logits/rejected": 2.2199058532714844, "logps/chosen": -37.326622009277344, "logps/rejected": -89.87089538574219, "loss": 0.7582, "nll_loss": 0.6119118332862854, "rewards/accuracies": 1.0, "rewards/chosen": 1.316199541091919, "rewards/margins": 2.852301836013794, "rewards/rejected": -1.536102294921875, "step": 2391 }, { "epoch": 0.39866666666666667, "grad_norm": 47.18528366088867, "learning_rate": 1.3679237895363217e-07, "logits/chosen": 0.05758886784315109, "logits/rejected": 1.0913764238357544, "logps/chosen": -27.233291625976562, "logps/rejected": -301.41424560546875, "loss": 0.9409, "nll_loss": 0.9390790462493896, "rewards/accuracies": 1.0, "rewards/chosen": 3.879544258117676, "rewards/margins": 14.960968017578125, "rewards/rejected": -11.08142375946045, "step": 2392 }, { "epoch": 0.3988333333333333, "grad_norm": 35.500431060791016, "learning_rate": 1.3674218066429937e-07, "logits/chosen": 1.8334187269210815, "logits/rejected": 2.5929763317108154, "logps/chosen": -17.64666748046875, "logps/rejected": -573.5661010742188, "loss": 0.5481, "nll_loss": 0.519019603729248, "rewards/accuracies": 1.0, "rewards/chosen": 1.0442107915878296, "rewards/margins": 10.927483558654785, "rewards/rejected": -9.883273124694824, "step": 2393 }, { "epoch": 0.399, "grad_norm": 47.265869140625, "learning_rate": 1.366919716691772e-07, "logits/chosen": 3.141083240509033, "logits/rejected": 3.1609995365142822, "logps/chosen": -114.64706420898438, "logps/rejected": -137.66754150390625, "loss": 1.5691, "nll_loss": 1.5492846965789795, "rewards/accuracies": 1.0, "rewards/chosen": 1.8044862747192383, "rewards/margins": 6.414022922515869, "rewards/rejected": -4.609536647796631, "step": 2394 }, { "epoch": 0.39916666666666667, "grad_norm": 44.93606185913086, "learning_rate": 1.366417519828954e-07, "logits/chosen": 1.9424501657485962, "logits/rejected": 1.935142993927002, "logps/chosen": -60.807228088378906, "logps/rejected": -107.18827056884766, "loss": 1.2199, "nll_loss": 1.1693700551986694, "rewards/accuracies": 1.0, "rewards/chosen": 0.8036866188049316, "rewards/margins": 4.831707000732422, "rewards/rejected": -4.02802038192749, "step": 2395 }, { "epoch": 0.3993333333333333, "grad_norm": 41.12606430053711, "learning_rate": 1.3659152162008677e-07, "logits/chosen": 2.860403537750244, "logits/rejected": 2.7904698848724365, "logps/chosen": -19.87786293029785, "logps/rejected": -273.04254150390625, "loss": 0.6999, "nll_loss": 0.6854435205459595, "rewards/accuracies": 1.0, "rewards/chosen": 2.0470807552337646, "rewards/margins": 7.173139572143555, "rewards/rejected": -5.126059055328369, "step": 2396 }, { "epoch": 0.3995, "grad_norm": 44.9614372253418, "learning_rate": 1.3654128059538718e-07, "logits/chosen": 2.154667854309082, "logits/rejected": 2.543426513671875, "logps/chosen": -10.057626724243164, "logps/rejected": -290.17523193359375, "loss": 0.4301, "nll_loss": 0.4023050367832184, "rewards/accuracies": 1.0, "rewards/chosen": 1.105514407157898, "rewards/margins": 8.605504989624023, "rewards/rejected": -7.499990940093994, "step": 2397 }, { "epoch": 0.39966666666666667, "grad_norm": 33.08951187133789, "learning_rate": 1.3649102892343574e-07, "logits/chosen": 1.0773630142211914, "logits/rejected": 1.8522779941558838, "logps/chosen": -47.30791473388672, "logps/rejected": -190.01663208007812, "loss": 0.733, "nll_loss": 0.7060883045196533, "rewards/accuracies": 1.0, "rewards/chosen": 1.1374202966690063, "rewards/margins": 8.88781452178955, "rewards/rejected": -7.750393867492676, "step": 2398 }, { "epoch": 0.3998333333333333, "grad_norm": 169.0769805908203, "learning_rate": 1.3644076661887448e-07, "logits/chosen": 2.5540366172790527, "logits/rejected": 2.363661527633667, "logps/chosen": -58.63544464111328, "logps/rejected": -17.70477294921875, "loss": 2.8907, "nll_loss": 1.6287624835968018, "rewards/accuracies": 0.0, "rewards/chosen": 1.1432517766952515, "rewards/margins": -0.4238152503967285, "rewards/rejected": 1.56706702709198, "step": 2399 }, { "epoch": 0.4, "grad_norm": 28.507091522216797, "learning_rate": 1.3639049369634876e-07, "logits/chosen": 2.656015634536743, "logits/rejected": 2.7446491718292236, "logps/chosen": -68.0504379272461, "logps/rejected": -74.3296890258789, "loss": 0.7531, "nll_loss": 0.7317252159118652, "rewards/accuracies": 1.0, "rewards/chosen": 1.4381111860275269, "rewards/margins": 7.584670066833496, "rewards/rejected": -6.14655876159668, "step": 2400 }, { "epoch": 0.40016666666666667, "grad_norm": 36.25128173828125, "learning_rate": 1.363402101705068e-07, "logits/chosen": 3.058931589126587, "logits/rejected": 3.02118182182312, "logps/chosen": -88.77964782714844, "logps/rejected": -98.62744140625, "loss": 1.0402, "nll_loss": 0.9864404201507568, "rewards/accuracies": 1.0, "rewards/chosen": 1.200905680656433, "rewards/margins": 4.3512163162231445, "rewards/rejected": -3.150310516357422, "step": 2401 }, { "epoch": 0.4003333333333333, "grad_norm": 58.081233978271484, "learning_rate": 1.3628991605600002e-07, "logits/chosen": 2.411716938018799, "logits/rejected": 2.021688461303711, "logps/chosen": -41.03409194946289, "logps/rejected": -37.598514556884766, "loss": 1.136, "nll_loss": 1.0008314847946167, "rewards/accuracies": 1.0, "rewards/chosen": 0.6812512278556824, "rewards/margins": 2.806124687194824, "rewards/rejected": -2.124873399734497, "step": 2402 }, { "epoch": 0.4005, "grad_norm": 31.273618698120117, "learning_rate": 1.3623961136748294e-07, "logits/chosen": 2.3299505710601807, "logits/rejected": 1.7089930772781372, "logps/chosen": -67.98316955566406, "logps/rejected": -65.44102478027344, "loss": 0.9783, "nll_loss": 0.9575093388557434, "rewards/accuracies": 1.0, "rewards/chosen": 2.1152069568634033, "rewards/margins": 6.06566047668457, "rewards/rejected": -3.950453281402588, "step": 2403 }, { "epoch": 0.40066666666666667, "grad_norm": 32.45335388183594, "learning_rate": 1.3618929611961317e-07, "logits/chosen": 1.785300612449646, "logits/rejected": 2.6115787029266357, "logps/chosen": -15.067363739013672, "logps/rejected": -191.91375732421875, "loss": 0.4887, "nll_loss": 0.4565867781639099, "rewards/accuracies": 1.0, "rewards/chosen": 0.9969038367271423, "rewards/margins": 6.994032382965088, "rewards/rejected": -5.997128486633301, "step": 2404 }, { "epoch": 0.4008333333333333, "grad_norm": 47.117759704589844, "learning_rate": 1.361389703270513e-07, "logits/chosen": 3.4642040729522705, "logits/rejected": 3.4675042629241943, "logps/chosen": -37.89306640625, "logps/rejected": -120.91172790527344, "loss": 1.0046, "nll_loss": 0.971616804599762, "rewards/accuracies": 1.0, "rewards/chosen": 1.3071529865264893, "rewards/margins": 5.462444305419922, "rewards/rejected": -4.1552910804748535, "step": 2405 }, { "epoch": 0.401, "grad_norm": 48.27787780761719, "learning_rate": 1.360886340044611e-07, "logits/chosen": 1.9393372535705566, "logits/rejected": 2.079991102218628, "logps/chosen": -21.33819580078125, "logps/rejected": -252.19085693359375, "loss": 0.8309, "nll_loss": 0.8206998109817505, "rewards/accuracies": 1.0, "rewards/chosen": 2.9177699089050293, "rewards/margins": 7.406685829162598, "rewards/rejected": -4.488915920257568, "step": 2406 }, { "epoch": 0.40116666666666667, "grad_norm": 31.82624626159668, "learning_rate": 1.3603828716650937e-07, "logits/chosen": 2.0050530433654785, "logits/rejected": 2.855849027633667, "logps/chosen": -57.817840576171875, "logps/rejected": -330.307373046875, "loss": 0.7706, "nll_loss": 0.7508811354637146, "rewards/accuracies": 1.0, "rewards/chosen": 1.4566268920898438, "rewards/margins": 10.619826316833496, "rewards/rejected": -9.163199424743652, "step": 2407 }, { "epoch": 0.4013333333333333, "grad_norm": 35.28471755981445, "learning_rate": 1.3598792982786594e-07, "logits/chosen": 1.412529706954956, "logits/rejected": 1.7752374410629272, "logps/chosen": -20.811786651611328, "logps/rejected": -295.285400390625, "loss": 0.5969, "nll_loss": 0.5781051516532898, "rewards/accuracies": 1.0, "rewards/chosen": 1.511273980140686, "rewards/margins": 9.916680335998535, "rewards/rejected": -8.40540599822998, "step": 2408 }, { "epoch": 0.4015, "grad_norm": 26.342031478881836, "learning_rate": 1.3593756200320373e-07, "logits/chosen": 2.034921884536743, "logits/rejected": 2.7462596893310547, "logps/chosen": -85.0521240234375, "logps/rejected": -69.71774291992188, "loss": 0.8804, "nll_loss": 0.8591124415397644, "rewards/accuracies": 1.0, "rewards/chosen": 2.3436906337738037, "rewards/margins": 6.025761604309082, "rewards/rejected": -3.6820709705352783, "step": 2409 }, { "epoch": 0.40166666666666667, "grad_norm": 42.086605072021484, "learning_rate": 1.3588718370719877e-07, "logits/chosen": 2.7515835762023926, "logits/rejected": 2.685568332672119, "logps/chosen": -47.58522033691406, "logps/rejected": -67.09310913085938, "loss": 0.839, "nll_loss": 0.8204347491264343, "rewards/accuracies": 1.0, "rewards/chosen": 1.6072155237197876, "rewards/margins": 7.549483299255371, "rewards/rejected": -5.942267894744873, "step": 2410 }, { "epoch": 0.4018333333333333, "grad_norm": 48.90869903564453, "learning_rate": 1.3583679495453e-07, "logits/chosen": 2.559803009033203, "logits/rejected": 2.333009719848633, "logps/chosen": -20.510700225830078, "logps/rejected": -89.36054992675781, "loss": 0.7897, "nll_loss": 0.7596554756164551, "rewards/accuracies": 1.0, "rewards/chosen": 1.274332046508789, "rewards/margins": 5.8772196769714355, "rewards/rejected": -4.6028876304626465, "step": 2411 }, { "epoch": 0.402, "grad_norm": 27.351362228393555, "learning_rate": 1.3578639575987958e-07, "logits/chosen": 1.8822331428527832, "logits/rejected": 2.5999202728271484, "logps/chosen": -79.94776153564453, "logps/rejected": -248.8792724609375, "loss": 0.89, "nll_loss": 0.8596534132957458, "rewards/accuracies": 1.0, "rewards/chosen": 0.9958351254463196, "rewards/margins": 9.5844144821167, "rewards/rejected": -8.588579177856445, "step": 2412 }, { "epoch": 0.4021666666666667, "grad_norm": 42.51251983642578, "learning_rate": 1.3573598613793259e-07, "logits/chosen": 2.5906600952148438, "logits/rejected": 2.7518270015716553, "logps/chosen": -9.749465942382812, "logps/rejected": -245.4043731689453, "loss": 0.4373, "nll_loss": 0.42388972640037537, "rewards/accuracies": 1.0, "rewards/chosen": 1.8852598667144775, "rewards/margins": 9.095788955688477, "rewards/rejected": -7.21052885055542, "step": 2413 }, { "epoch": 0.4023333333333333, "grad_norm": 91.96414184570312, "learning_rate": 1.3568556610337717e-07, "logits/chosen": 3.181739568710327, "logits/rejected": 3.2712669372558594, "logps/chosen": -12.044390678405762, "logps/rejected": -224.18179321289062, "loss": 1.0732, "nll_loss": 1.0036993026733398, "rewards/accuracies": 1.0, "rewards/chosen": 0.0821242406964302, "rewards/margins": 6.855397701263428, "rewards/rejected": -6.773273468017578, "step": 2414 }, { "epoch": 0.4025, "grad_norm": 30.192174911499023, "learning_rate": 1.3563513567090449e-07, "logits/chosen": 1.6904165744781494, "logits/rejected": 2.2311060428619385, "logps/chosen": -74.00990295410156, "logps/rejected": -178.84243774414062, "loss": 1.0886, "nll_loss": 1.0726072788238525, "rewards/accuracies": 1.0, "rewards/chosen": 1.7789604663848877, "rewards/margins": 7.652007102966309, "rewards/rejected": -5.873046398162842, "step": 2415 }, { "epoch": 0.4026666666666667, "grad_norm": 46.8499870300293, "learning_rate": 1.355846948552088e-07, "logits/chosen": 2.7511932849884033, "logits/rejected": 2.812898874282837, "logps/chosen": -45.60515594482422, "logps/rejected": -207.4936065673828, "loss": 1.3493, "nll_loss": 1.3413281440734863, "rewards/accuracies": 1.0, "rewards/chosen": 2.4333038330078125, "rewards/margins": 9.71855354309082, "rewards/rejected": -7.28524923324585, "step": 2416 }, { "epoch": 0.4028333333333333, "grad_norm": 32.46113586425781, "learning_rate": 1.3553424367098735e-07, "logits/chosen": 2.2385361194610596, "logits/rejected": 2.420650005340576, "logps/chosen": -68.74152374267578, "logps/rejected": -352.3778991699219, "loss": 0.9389, "nll_loss": 0.9165534377098083, "rewards/accuracies": 1.0, "rewards/chosen": 1.4863519668579102, "rewards/margins": 6.703793525695801, "rewards/rejected": -5.217441558837891, "step": 2417 }, { "epoch": 0.403, "grad_norm": 41.79499816894531, "learning_rate": 1.354837821329404e-07, "logits/chosen": 2.623692512512207, "logits/rejected": 2.6714401245117188, "logps/chosen": -114.69200134277344, "logps/rejected": -62.6488037109375, "loss": 1.5671, "nll_loss": 1.4159506559371948, "rewards/accuracies": 1.0, "rewards/chosen": 1.8423629999160767, "rewards/margins": 3.076676845550537, "rewards/rejected": -1.23431396484375, "step": 2418 }, { "epoch": 0.4031666666666667, "grad_norm": 32.8486442565918, "learning_rate": 1.354333102557712e-07, "logits/chosen": 2.343681573867798, "logits/rejected": 2.6083734035491943, "logps/chosen": -24.9630069732666, "logps/rejected": -232.38616943359375, "loss": 0.6943, "nll_loss": 0.6934168934822083, "rewards/accuracies": 1.0, "rewards/chosen": 4.721248149871826, "rewards/margins": 13.201910018920898, "rewards/rejected": -8.48066234588623, "step": 2419 }, { "epoch": 0.4033333333333333, "grad_norm": 45.23573303222656, "learning_rate": 1.3538282805418607e-07, "logits/chosen": 2.6418070793151855, "logits/rejected": 2.841114044189453, "logps/chosen": -15.256765365600586, "logps/rejected": -180.9191436767578, "loss": 0.5953, "nll_loss": 0.5867986083030701, "rewards/accuracies": 1.0, "rewards/chosen": 2.3478808403015137, "rewards/margins": 9.998294830322266, "rewards/rejected": -7.650413513183594, "step": 2420 }, { "epoch": 0.4035, "grad_norm": 25.28841209411621, "learning_rate": 1.3533233554289431e-07, "logits/chosen": 2.6744940280914307, "logits/rejected": 2.8107287883758545, "logps/chosen": -13.333845138549805, "logps/rejected": -154.98362731933594, "loss": 0.3825, "nll_loss": 0.3603741526603699, "rewards/accuracies": 1.0, "rewards/chosen": 1.9603381156921387, "rewards/margins": 5.966919898986816, "rewards/rejected": -4.006581783294678, "step": 2421 }, { "epoch": 0.4036666666666667, "grad_norm": 154.75271606445312, "learning_rate": 1.3528183273660823e-07, "logits/chosen": 2.6818766593933105, "logits/rejected": 2.6124565601348877, "logps/chosen": -38.65407943725586, "logps/rejected": -38.64801788330078, "loss": 2.0337, "nll_loss": 0.8989320993423462, "rewards/accuracies": 0.0, "rewards/chosen": 0.4361438751220703, "rewards/margins": -0.38111352920532227, "rewards/rejected": 0.8172574043273926, "step": 2422 }, { "epoch": 0.4038333333333333, "grad_norm": 38.23613357543945, "learning_rate": 1.352313196500431e-07, "logits/chosen": 0.2166985273361206, "logits/rejected": 2.0707526206970215, "logps/chosen": -121.13955688476562, "logps/rejected": -422.9339599609375, "loss": 1.7693, "nll_loss": 1.7305651903152466, "rewards/accuracies": 1.0, "rewards/chosen": 0.7200760245323181, "rewards/margins": 13.697632789611816, "rewards/rejected": -12.977557182312012, "step": 2423 }, { "epoch": 0.404, "grad_norm": 28.22946548461914, "learning_rate": 1.3518079629791724e-07, "logits/chosen": 2.1819515228271484, "logits/rejected": 1.9309372901916504, "logps/chosen": -26.27097511291504, "logps/rejected": -54.08766174316406, "loss": 0.6263, "nll_loss": 0.6109529137611389, "rewards/accuracies": 1.0, "rewards/chosen": 2.58528733253479, "rewards/margins": 6.639308929443359, "rewards/rejected": -4.05402135848999, "step": 2424 }, { "epoch": 0.4041666666666667, "grad_norm": 28.065458297729492, "learning_rate": 1.3513026269495198e-07, "logits/chosen": 2.4992141723632812, "logits/rejected": 2.538074254989624, "logps/chosen": -54.75702667236328, "logps/rejected": -215.95880126953125, "loss": 0.7832, "nll_loss": 0.7712257504463196, "rewards/accuracies": 1.0, "rewards/chosen": 1.996163249015808, "rewards/margins": 9.24501895904541, "rewards/rejected": -7.2488555908203125, "step": 2425 }, { "epoch": 0.4043333333333333, "grad_norm": 34.42196273803711, "learning_rate": 1.3507971885587155e-07, "logits/chosen": 0.7280040383338928, "logits/rejected": 1.4389514923095703, "logps/chosen": -80.673828125, "logps/rejected": -337.53314208984375, "loss": 0.9462, "nll_loss": 0.9064475297927856, "rewards/accuracies": 1.0, "rewards/chosen": 0.936579167842865, "rewards/margins": 5.484394073486328, "rewards/rejected": -4.547814846038818, "step": 2426 }, { "epoch": 0.4045, "grad_norm": 22.992177963256836, "learning_rate": 1.3502916479540324e-07, "logits/chosen": 1.8447684049606323, "logits/rejected": 2.503264904022217, "logps/chosen": -7.120110511779785, "logps/rejected": -58.38721466064453, "loss": 0.2682, "nll_loss": 0.209415003657341, "rewards/accuracies": 1.0, "rewards/chosen": 1.1123368740081787, "rewards/margins": 4.194001197814941, "rewards/rejected": -3.081664562225342, "step": 2427 }, { "epoch": 0.4046666666666667, "grad_norm": 24.55394744873047, "learning_rate": 1.3497860052827728e-07, "logits/chosen": 0.7250329256057739, "logits/rejected": 2.065843343734741, "logps/chosen": -63.77437210083008, "logps/rejected": -380.91839599609375, "loss": 0.8076, "nll_loss": 0.7971795797348022, "rewards/accuracies": 1.0, "rewards/chosen": 2.146118640899658, "rewards/margins": 9.359659194946289, "rewards/rejected": -7.213540554046631, "step": 2428 }, { "epoch": 0.4048333333333333, "grad_norm": 33.39115905761719, "learning_rate": 1.3492802606922688e-07, "logits/chosen": 0.39913731813430786, "logits/rejected": 1.1566025018692017, "logps/chosen": -47.37876510620117, "logps/rejected": -248.9102325439453, "loss": 0.7923, "nll_loss": 0.7767010927200317, "rewards/accuracies": 1.0, "rewards/chosen": 1.7787647247314453, "rewards/margins": 7.877692699432373, "rewards/rejected": -6.098927974700928, "step": 2429 }, { "epoch": 0.405, "grad_norm": 47.599708557128906, "learning_rate": 1.3487744143298822e-07, "logits/chosen": 2.5384702682495117, "logits/rejected": 2.5038623809814453, "logps/chosen": -50.51930618286133, "logps/rejected": -32.10114288330078, "loss": 0.9295, "nll_loss": 0.7115395665168762, "rewards/accuracies": 1.0, "rewards/chosen": 2.3680431842803955, "rewards/margins": 2.9202826023101807, "rewards/rejected": -0.5522394180297852, "step": 2430 }, { "epoch": 0.4051666666666667, "grad_norm": 77.19392395019531, "learning_rate": 1.3482684663430045e-07, "logits/chosen": 2.587839126586914, "logits/rejected": 2.6006009578704834, "logps/chosen": -30.07376480102539, "logps/rejected": -50.83409118652344, "loss": 1.1031, "nll_loss": 0.47736138105392456, "rewards/accuracies": 1.0, "rewards/chosen": 2.2653560638427734, "rewards/margins": 1.2000510692596436, "rewards/rejected": 1.0653049945831299, "step": 2431 }, { "epoch": 0.4053333333333333, "grad_norm": 21.63926124572754, "learning_rate": 1.347762416879057e-07, "logits/chosen": 2.5561370849609375, "logits/rejected": 2.7635562419891357, "logps/chosen": -52.069725036621094, "logps/rejected": -341.23321533203125, "loss": 0.5879, "nll_loss": 0.5721947550773621, "rewards/accuracies": 1.0, "rewards/chosen": 2.053157091140747, "rewards/margins": 6.805302619934082, "rewards/rejected": -4.752145290374756, "step": 2432 }, { "epoch": 0.4055, "grad_norm": 36.10125732421875, "learning_rate": 1.34725626608549e-07, "logits/chosen": 3.171943187713623, "logits/rejected": 3.121943473815918, "logps/chosen": -70.33808898925781, "logps/rejected": -91.51956176757812, "loss": 1.1469, "nll_loss": 1.1344850063323975, "rewards/accuracies": 1.0, "rewards/chosen": 2.289813995361328, "rewards/margins": 7.24213981628418, "rewards/rejected": -4.952325820922852, "step": 2433 }, { "epoch": 0.4056666666666667, "grad_norm": 54.5245475769043, "learning_rate": 1.346750014109784e-07, "logits/chosen": 2.4894917011260986, "logits/rejected": 2.736393690109253, "logps/chosen": -87.6258544921875, "logps/rejected": -291.722412109375, "loss": 1.6386, "nll_loss": 1.5931971073150635, "rewards/accuracies": 1.0, "rewards/chosen": 0.5400543212890625, "rewards/margins": 12.617966651916504, "rewards/rejected": -12.077912330627441, "step": 2434 }, { "epoch": 0.4058333333333333, "grad_norm": 45.83332061767578, "learning_rate": 1.3462436610994486e-07, "logits/chosen": 2.2496497631073, "logits/rejected": 2.3925459384918213, "logps/chosen": -37.75758361816406, "logps/rejected": -324.8762512207031, "loss": 0.7853, "nll_loss": 0.7705629467964172, "rewards/accuracies": 1.0, "rewards/chosen": 1.7432670593261719, "rewards/margins": 11.236492156982422, "rewards/rejected": -9.49322509765625, "step": 2435 }, { "epoch": 0.406, "grad_norm": 57.872047424316406, "learning_rate": 1.3457372072020227e-07, "logits/chosen": 1.6198034286499023, "logits/rejected": 2.545009136199951, "logps/chosen": -6.644334316253662, "logps/rejected": -195.72491455078125, "loss": 0.5505, "nll_loss": 0.474595308303833, "rewards/accuracies": 1.0, "rewards/chosen": 0.5051036477088928, "rewards/margins": 3.9574167728424072, "rewards/rejected": -3.452313184738159, "step": 2436 }, { "epoch": 0.4061666666666667, "grad_norm": 60.934165954589844, "learning_rate": 1.345230652565075e-07, "logits/chosen": 1.8899469375610352, "logits/rejected": 2.256596088409424, "logps/chosen": -52.555381774902344, "logps/rejected": -268.46966552734375, "loss": 1.027, "nll_loss": 0.784408688545227, "rewards/accuracies": 1.0, "rewards/chosen": 5.511726379394531, "rewards/margins": 5.5044264793396, "rewards/rejected": 0.0072998045943677425, "step": 2437 }, { "epoch": 0.4063333333333333, "grad_norm": 145.76393127441406, "learning_rate": 1.3447239973362035e-07, "logits/chosen": 2.3600666522979736, "logits/rejected": 2.4711577892303467, "logps/chosen": -68.04539489746094, "logps/rejected": -12.9244966506958, "loss": 2.3064, "nll_loss": 0.986164927482605, "rewards/accuracies": 0.0, "rewards/chosen": 1.5146729946136475, "rewards/margins": -0.41608381271362305, "rewards/rejected": 1.9307568073272705, "step": 2438 }, { "epoch": 0.4065, "grad_norm": 80.7271499633789, "learning_rate": 1.3442172416630353e-07, "logits/chosen": 1.3223685026168823, "logits/rejected": 1.6759188175201416, "logps/chosen": -58.25987243652344, "logps/rejected": -65.5983657836914, "loss": 1.6547, "nll_loss": 1.5331546068191528, "rewards/accuracies": 1.0, "rewards/chosen": -0.5105747580528259, "rewards/margins": 4.484516143798828, "rewards/rejected": -4.995090961456299, "step": 2439 }, { "epoch": 0.4066666666666667, "grad_norm": 48.7454719543457, "learning_rate": 1.3437103856932265e-07, "logits/chosen": 2.262077569961548, "logits/rejected": 2.6047089099884033, "logps/chosen": -25.71056365966797, "logps/rejected": -284.685302734375, "loss": 0.9584, "nll_loss": 0.9522431492805481, "rewards/accuracies": 1.0, "rewards/chosen": 2.750725507736206, "rewards/margins": 9.467352867126465, "rewards/rejected": -6.716627597808838, "step": 2440 }, { "epoch": 0.4068333333333333, "grad_norm": 238.98060607910156, "learning_rate": 1.343203429574463e-07, "logits/chosen": 4.159737586975098, "logits/rejected": 4.20823335647583, "logps/chosen": -18.615657806396484, "logps/rejected": -157.0503692626953, "loss": 2.7215, "nll_loss": 2.659379720687866, "rewards/accuracies": 1.0, "rewards/chosen": 0.6056482195854187, "rewards/margins": 4.424532413482666, "rewards/rejected": -3.8188843727111816, "step": 2441 }, { "epoch": 0.407, "grad_norm": 43.84589385986328, "learning_rate": 1.3426963734544598e-07, "logits/chosen": 2.6755709648132324, "logits/rejected": 2.7539331912994385, "logps/chosen": -129.39971923828125, "logps/rejected": -284.49896240234375, "loss": 1.3989, "nll_loss": 1.3479137420654297, "rewards/accuracies": 1.0, "rewards/chosen": 1.0665359497070312, "rewards/margins": 4.504709243774414, "rewards/rejected": -3.438173294067383, "step": 2442 }, { "epoch": 0.4071666666666667, "grad_norm": 29.73126220703125, "learning_rate": 1.3421892174809608e-07, "logits/chosen": 1.2775400876998901, "logits/rejected": 2.0697381496429443, "logps/chosen": -69.06831359863281, "logps/rejected": -175.909912109375, "loss": 0.8059, "nll_loss": 0.7507426142692566, "rewards/accuracies": 1.0, "rewards/chosen": 0.6954147815704346, "rewards/margins": 4.678442478179932, "rewards/rejected": -3.983027696609497, "step": 2443 }, { "epoch": 0.4073333333333333, "grad_norm": 49.622467041015625, "learning_rate": 1.3416819618017387e-07, "logits/chosen": 2.572829484939575, "logits/rejected": 2.3591115474700928, "logps/chosen": -111.84855651855469, "logps/rejected": -89.38023376464844, "loss": 1.2698, "nll_loss": 1.1650891304016113, "rewards/accuracies": 1.0, "rewards/chosen": 0.9192520380020142, "rewards/margins": 3.220994472503662, "rewards/rejected": -2.3017425537109375, "step": 2444 }, { "epoch": 0.4075, "grad_norm": 188.56285095214844, "learning_rate": 1.341174606564596e-07, "logits/chosen": 1.842545747756958, "logits/rejected": 2.02101469039917, "logps/chosen": -33.56317901611328, "logps/rejected": -28.077373504638672, "loss": 4.5363, "nll_loss": 0.5888277292251587, "rewards/accuracies": 0.0, "rewards/chosen": 0.9029170870780945, "rewards/margins": -3.545102834701538, "rewards/rejected": 4.448019981384277, "step": 2445 }, { "epoch": 0.4076666666666667, "grad_norm": 31.492746353149414, "learning_rate": 1.340667151917364e-07, "logits/chosen": 2.1571156978607178, "logits/rejected": 2.252974033355713, "logps/chosen": -139.33407592773438, "logps/rejected": -411.8226318359375, "loss": 1.0399, "nll_loss": 1.0170369148254395, "rewards/accuracies": 1.0, "rewards/chosen": 1.2922500371932983, "rewards/margins": 9.581162452697754, "rewards/rejected": -8.288912773132324, "step": 2446 }, { "epoch": 0.4078333333333333, "grad_norm": 22.535118103027344, "learning_rate": 1.3401595980079023e-07, "logits/chosen": 3.1164071559906006, "logits/rejected": 3.115633249282837, "logps/chosen": -59.8758544921875, "logps/rejected": -195.641357421875, "loss": 0.6471, "nll_loss": 0.6172769069671631, "rewards/accuracies": 1.0, "rewards/chosen": 1.8235712051391602, "rewards/margins": 5.3888397216796875, "rewards/rejected": -3.5652682781219482, "step": 2447 }, { "epoch": 0.408, "grad_norm": 30.778099060058594, "learning_rate": 1.3396519449841003e-07, "logits/chosen": 2.303541898727417, "logits/rejected": 1.627082109451294, "logps/chosen": -65.8590087890625, "logps/rejected": -66.08773803710938, "loss": 0.9454, "nll_loss": 0.9275915622711182, "rewards/accuracies": 1.0, "rewards/chosen": 2.327623128890991, "rewards/margins": 6.342747688293457, "rewards/rejected": -4.015124797821045, "step": 2448 }, { "epoch": 0.4081666666666667, "grad_norm": 32.85145950317383, "learning_rate": 1.3391441929938758e-07, "logits/chosen": 2.5161826610565186, "logits/rejected": 2.8855180740356445, "logps/chosen": -45.344032287597656, "logps/rejected": -45.452640533447266, "loss": 0.8734, "nll_loss": 0.8397042751312256, "rewards/accuracies": 1.0, "rewards/chosen": 1.6975631713867188, "rewards/margins": 5.166304588317871, "rewards/rejected": -3.4687411785125732, "step": 2449 }, { "epoch": 0.4083333333333333, "grad_norm": 30.169776916503906, "learning_rate": 1.3386363421851757e-07, "logits/chosen": 2.776379346847534, "logits/rejected": 2.6639153957366943, "logps/chosen": -63.065711975097656, "logps/rejected": -169.4922637939453, "loss": 0.8034, "nll_loss": 0.7419496774673462, "rewards/accuracies": 1.0, "rewards/chosen": 0.8476188778877258, "rewards/margins": 4.205910682678223, "rewards/rejected": -3.3582916259765625, "step": 2450 }, { "epoch": 0.4085, "grad_norm": 30.597490310668945, "learning_rate": 1.338128392705975e-07, "logits/chosen": 1.7657448053359985, "logits/rejected": 2.1157498359680176, "logps/chosen": -97.12696838378906, "logps/rejected": -156.52908325195312, "loss": 0.9455, "nll_loss": 0.9162920713424683, "rewards/accuracies": 1.0, "rewards/chosen": 1.0731658935546875, "rewards/margins": 7.325279235839844, "rewards/rejected": -6.252113342285156, "step": 2451 }, { "epoch": 0.4086666666666667, "grad_norm": 19.1430721282959, "learning_rate": 1.3376203447042785e-07, "logits/chosen": 1.3907910585403442, "logits/rejected": 1.4214754104614258, "logps/chosen": -60.44355392456055, "logps/rejected": -118.89341735839844, "loss": 0.5573, "nll_loss": 0.5396746397018433, "rewards/accuracies": 1.0, "rewards/chosen": 1.8483648300170898, "rewards/margins": 6.747570037841797, "rewards/rejected": -4.899205207824707, "step": 2452 }, { "epoch": 0.4088333333333333, "grad_norm": 22.445903778076172, "learning_rate": 1.3371121983281192e-07, "logits/chosen": 2.2144906520843506, "logits/rejected": 2.736452341079712, "logps/chosen": -51.018272399902344, "logps/rejected": -660.9642944335938, "loss": 0.5351, "nll_loss": 0.5101827383041382, "rewards/accuracies": 1.0, "rewards/chosen": 1.2695587873458862, "rewards/margins": 7.215671062469482, "rewards/rejected": -5.946112155914307, "step": 2453 }, { "epoch": 0.409, "grad_norm": 44.221012115478516, "learning_rate": 1.3366039537255587e-07, "logits/chosen": 2.1515896320343018, "logits/rejected": 2.145508050918579, "logps/chosen": -40.39458465576172, "logps/rejected": -83.61507415771484, "loss": 0.8426, "nll_loss": 0.7920507788658142, "rewards/accuracies": 1.0, "rewards/chosen": 1.0614150762557983, "rewards/margins": 4.525006294250488, "rewards/rejected": -3.4635913372039795, "step": 2454 }, { "epoch": 0.4091666666666667, "grad_norm": 34.401790618896484, "learning_rate": 1.336095611044687e-07, "logits/chosen": 2.563478708267212, "logits/rejected": 2.381399631500244, "logps/chosen": -69.04135131835938, "logps/rejected": -57.8546257019043, "loss": 0.9568, "nll_loss": 0.9205514788627625, "rewards/accuracies": 1.0, "rewards/chosen": 0.94537353515625, "rewards/margins": 5.993133068084717, "rewards/rejected": -5.047759532928467, "step": 2455 }, { "epoch": 0.4093333333333333, "grad_norm": 136.22317504882812, "learning_rate": 1.335587170433623e-07, "logits/chosen": 3.275418996810913, "logits/rejected": 3.638861894607544, "logps/chosen": -50.794734954833984, "logps/rejected": -94.88984680175781, "loss": 1.051, "nll_loss": 0.976821780204773, "rewards/accuracies": 1.0, "rewards/chosen": 0.555270791053772, "rewards/margins": 3.9703822135925293, "rewards/rejected": -3.4151113033294678, "step": 2456 }, { "epoch": 0.4095, "grad_norm": 169.52362060546875, "learning_rate": 1.3350786320405144e-07, "logits/chosen": 2.565143585205078, "logits/rejected": 2.6016392707824707, "logps/chosen": -71.3781509399414, "logps/rejected": -26.47845458984375, "loss": 2.3718, "nll_loss": 0.9913632273674011, "rewards/accuracies": 0.0, "rewards/chosen": 0.28900986909866333, "rewards/margins": -0.7665820717811584, "rewards/rejected": 1.0555919408798218, "step": 2457 }, { "epoch": 0.4096666666666667, "grad_norm": 30.149023056030273, "learning_rate": 1.3345699960135369e-07, "logits/chosen": 2.8089537620544434, "logits/rejected": 2.875917911529541, "logps/chosen": -85.67418670654297, "logps/rejected": -127.47439575195312, "loss": 1.0971, "nll_loss": 1.070927381515503, "rewards/accuracies": 1.0, "rewards/chosen": 2.358267307281494, "rewards/margins": 5.710052967071533, "rewards/rejected": -3.351785659790039, "step": 2458 }, { "epoch": 0.4098333333333333, "grad_norm": 36.750396728515625, "learning_rate": 1.3340612625008945e-07, "logits/chosen": 1.8373709917068481, "logits/rejected": 1.3094547986984253, "logps/chosen": -106.72660827636719, "logps/rejected": -68.53173828125, "loss": 1.0305, "nll_loss": 0.9974449276924133, "rewards/accuracies": 1.0, "rewards/chosen": 1.6372452974319458, "rewards/margins": 5.215607643127441, "rewards/rejected": -3.578362226486206, "step": 2459 }, { "epoch": 0.41, "grad_norm": 18.021738052368164, "learning_rate": 1.3335524316508206e-07, "logits/chosen": 3.1547398567199707, "logits/rejected": 3.311248779296875, "logps/chosen": -58.78124237060547, "logps/rejected": -222.8593292236328, "loss": 0.5141, "nll_loss": 0.5024037957191467, "rewards/accuracies": 1.0, "rewards/chosen": 2.109344482421875, "rewards/margins": 8.072776794433594, "rewards/rejected": -5.963432312011719, "step": 2460 }, { "epoch": 0.4101666666666667, "grad_norm": 28.706750869750977, "learning_rate": 1.333043503611576e-07, "logits/chosen": 1.860862135887146, "logits/rejected": 2.2624635696411133, "logps/chosen": -57.43669891357422, "logps/rejected": -307.5223693847656, "loss": 0.8763, "nll_loss": 0.8572640419006348, "rewards/accuracies": 1.0, "rewards/chosen": 1.4770653247833252, "rewards/margins": 10.200394630432129, "rewards/rejected": -8.723329544067383, "step": 2461 }, { "epoch": 0.4103333333333333, "grad_norm": 155.9991912841797, "learning_rate": 1.33253447853145e-07, "logits/chosen": 2.101365089416504, "logits/rejected": 1.7899080514907837, "logps/chosen": -51.46018600463867, "logps/rejected": -33.26029586791992, "loss": 2.3111, "nll_loss": 0.830003023147583, "rewards/accuracies": 0.0, "rewards/chosen": 1.4938817024230957, "rewards/margins": -0.6521239280700684, "rewards/rejected": 2.146005630493164, "step": 2462 }, { "epoch": 0.4105, "grad_norm": 790.4500732421875, "learning_rate": 1.33202535655876e-07, "logits/chosen": 4.55043363571167, "logits/rejected": 4.444836139678955, "logps/chosen": -351.50079345703125, "logps/rejected": -169.41293334960938, "loss": 7.4919, "nll_loss": 1.123005747795105, "rewards/accuracies": 0.0, "rewards/chosen": -7.401806831359863, "rewards/margins": -6.186352729797363, "rewards/rejected": -1.2154541015625, "step": 2463 }, { "epoch": 0.4106666666666667, "grad_norm": 177.3726043701172, "learning_rate": 1.3315161378418525e-07, "logits/chosen": 2.746687412261963, "logits/rejected": 2.817166328430176, "logps/chosen": -73.81834411621094, "logps/rejected": -66.47998809814453, "loss": 1.9639, "nll_loss": 0.7237091660499573, "rewards/accuracies": 0.0, "rewards/chosen": 2.4219415187835693, "rewards/margins": -0.0335845947265625, "rewards/rejected": 2.455526113510132, "step": 2464 }, { "epoch": 0.41083333333333333, "grad_norm": 40.174808502197266, "learning_rate": 1.3310068225291015e-07, "logits/chosen": 2.375183343887329, "logits/rejected": 2.550394296646118, "logps/chosen": -47.75035858154297, "logps/rejected": -145.34005737304688, "loss": 1.018, "nll_loss": 1.0159651041030884, "rewards/accuracies": 1.0, "rewards/chosen": 4.02337121963501, "rewards/margins": 10.887016296386719, "rewards/rejected": -6.863644599914551, "step": 2465 }, { "epoch": 0.411, "grad_norm": 22.318408966064453, "learning_rate": 1.3304974107689088e-07, "logits/chosen": 1.6429184675216675, "logits/rejected": 3.621155261993408, "logps/chosen": -58.68397903442383, "logps/rejected": -384.8877868652344, "loss": 0.6185, "nll_loss": 0.6049894690513611, "rewards/accuracies": 1.0, "rewards/chosen": 2.028358221054077, "rewards/margins": 7.480504035949707, "rewards/rejected": -5.452146053314209, "step": 2466 }, { "epoch": 0.4111666666666667, "grad_norm": 36.587833404541016, "learning_rate": 1.3299879027097052e-07, "logits/chosen": 2.273531436920166, "logits/rejected": 2.5506889820098877, "logps/chosen": -50.49546813964844, "logps/rejected": -228.4867401123047, "loss": 1.0016, "nll_loss": 0.9901072382926941, "rewards/accuracies": 1.0, "rewards/chosen": 2.0126190185546875, "rewards/margins": 9.854924201965332, "rewards/rejected": -7.8423051834106445, "step": 2467 }, { "epoch": 0.41133333333333333, "grad_norm": 80.83421325683594, "learning_rate": 1.329478298499949e-07, "logits/chosen": 2.4906396865844727, "logits/rejected": 2.384904623031616, "logps/chosen": -51.67873001098633, "logps/rejected": -40.331939697265625, "loss": 1.4837, "nll_loss": 1.3599666357040405, "rewards/accuracies": 1.0, "rewards/chosen": 0.09886856377124786, "rewards/margins": 3.093648910522461, "rewards/rejected": -2.9947803020477295, "step": 2468 }, { "epoch": 0.4115, "grad_norm": 39.92485046386719, "learning_rate": 1.3289685982881268e-07, "logits/chosen": 2.933734178543091, "logits/rejected": 2.858250141143799, "logps/chosen": -66.07640838623047, "logps/rejected": -63.974853515625, "loss": 1.0214, "nll_loss": 0.9862150549888611, "rewards/accuracies": 1.0, "rewards/chosen": 2.26409912109375, "rewards/margins": 5.241363525390625, "rewards/rejected": -2.977264404296875, "step": 2469 }, { "epoch": 0.4116666666666667, "grad_norm": 26.479816436767578, "learning_rate": 1.3284588022227527e-07, "logits/chosen": 3.063216209411621, "logits/rejected": 3.2250332832336426, "logps/chosen": -71.2508316040039, "logps/rejected": -227.65635681152344, "loss": 0.8047, "nll_loss": 0.7829760909080505, "rewards/accuracies": 1.0, "rewards/chosen": 1.9008492231369019, "rewards/margins": 6.038973331451416, "rewards/rejected": -4.138123989105225, "step": 2470 }, { "epoch": 0.41183333333333333, "grad_norm": 175.61151123046875, "learning_rate": 1.3279489104523693e-07, "logits/chosen": 2.579967975616455, "logits/rejected": 2.1037895679473877, "logps/chosen": -111.36903381347656, "logps/rejected": -44.49281692504883, "loss": 2.4836, "nll_loss": 1.148134469985962, "rewards/accuracies": 0.0, "rewards/chosen": 1.261621117591858, "rewards/margins": -0.499883770942688, "rewards/rejected": 1.761504888534546, "step": 2471 }, { "epoch": 0.412, "grad_norm": 24.736228942871094, "learning_rate": 1.3274389231255465e-07, "logits/chosen": 1.9529715776443481, "logits/rejected": 2.3467886447906494, "logps/chosen": -66.72657775878906, "logps/rejected": -301.7171936035156, "loss": 0.6796, "nll_loss": 0.6541820764541626, "rewards/accuracies": 1.0, "rewards/chosen": 1.209346055984497, "rewards/margins": 7.802641868591309, "rewards/rejected": -6.593296051025391, "step": 2472 }, { "epoch": 0.4121666666666667, "grad_norm": 28.066709518432617, "learning_rate": 1.326928840390883e-07, "logits/chosen": 1.884498119354248, "logits/rejected": 2.26193904876709, "logps/chosen": -66.26370239257812, "logps/rejected": -204.81259155273438, "loss": 0.851, "nll_loss": 0.8282963037490845, "rewards/accuracies": 1.0, "rewards/chosen": 1.2892951965332031, "rewards/margins": 9.843470573425293, "rewards/rejected": -8.55417537689209, "step": 2473 }, { "epoch": 0.41233333333333333, "grad_norm": 41.78960037231445, "learning_rate": 1.3264186623970037e-07, "logits/chosen": 2.520190954208374, "logits/rejected": 2.3115789890289307, "logps/chosen": -41.94879913330078, "logps/rejected": -56.01013946533203, "loss": 0.7731, "nll_loss": 0.6453661322593689, "rewards/accuracies": 1.0, "rewards/chosen": 1.1543006896972656, "rewards/margins": 2.9868664741516113, "rewards/rejected": -1.8325656652450562, "step": 2474 }, { "epoch": 0.4125, "grad_norm": 66.43059539794922, "learning_rate": 1.325908389292563e-07, "logits/chosen": 2.8051931858062744, "logits/rejected": 2.7966387271881104, "logps/chosen": -97.08563232421875, "logps/rejected": -72.99043273925781, "loss": 1.5679, "nll_loss": 1.427729845046997, "rewards/accuracies": 1.0, "rewards/chosen": -0.15897750854492188, "rewards/margins": 2.993969678878784, "rewards/rejected": -3.152947187423706, "step": 2475 }, { "epoch": 0.4126666666666667, "grad_norm": 29.162477493286133, "learning_rate": 1.3253980212262418e-07, "logits/chosen": 0.9058613181114197, "logits/rejected": 2.021554946899414, "logps/chosen": -84.7603988647461, "logps/rejected": -355.9410400390625, "loss": 1.1988, "nll_loss": 1.1772277355194092, "rewards/accuracies": 1.0, "rewards/chosen": 1.3371528387069702, "rewards/margins": 15.411846160888672, "rewards/rejected": -14.07469367980957, "step": 2476 }, { "epoch": 0.41283333333333333, "grad_norm": 137.81793212890625, "learning_rate": 1.3248875583467495e-07, "logits/chosen": 2.159386157989502, "logits/rejected": 2.4101674556732178, "logps/chosen": -73.6403579711914, "logps/rejected": -184.27206420898438, "loss": 2.1814, "nll_loss": 1.712566614151001, "rewards/accuracies": 1.0, "rewards/chosen": -2.7006325721740723, "rewards/margins": 2.4753775596618652, "rewards/rejected": -5.1760101318359375, "step": 2477 }, { "epoch": 0.413, "grad_norm": 50.382259368896484, "learning_rate": 1.3243770008028224e-07, "logits/chosen": 2.3741462230682373, "logits/rejected": 2.062415599822998, "logps/chosen": -38.491004943847656, "logps/rejected": -39.451210021972656, "loss": 1.042, "nll_loss": 0.9388049840927124, "rewards/accuracies": 1.0, "rewards/chosen": 0.935559868812561, "rewards/margins": 3.2457027435302734, "rewards/rejected": -2.310142993927002, "step": 2478 }, { "epoch": 0.4131666666666667, "grad_norm": 34.96710205078125, "learning_rate": 1.323866348743225e-07, "logits/chosen": 1.8928700685501099, "logits/rejected": 1.8368804454803467, "logps/chosen": -68.7718734741211, "logps/rejected": -36.07929611206055, "loss": 0.9719, "nll_loss": 0.9048929810523987, "rewards/accuracies": 1.0, "rewards/chosen": 1.702479600906372, "rewards/margins": 4.088888168334961, "rewards/rejected": -2.386408567428589, "step": 2479 }, { "epoch": 0.41333333333333333, "grad_norm": 221.60433959960938, "learning_rate": 1.3233556023167486e-07, "logits/chosen": 2.696734666824341, "logits/rejected": 2.6097593307495117, "logps/chosen": -100.58059692382812, "logps/rejected": -22.732791900634766, "loss": 2.1118, "nll_loss": 1.0932673215866089, "rewards/accuracies": 0.0, "rewards/chosen": -0.8240005373954773, "rewards/margins": -0.34373968839645386, "rewards/rejected": -0.48026084899902344, "step": 2480 }, { "epoch": 0.4135, "grad_norm": 75.59270477294922, "learning_rate": 1.3228447616722127e-07, "logits/chosen": 2.1304051876068115, "logits/rejected": 2.369288206100464, "logps/chosen": -59.661319732666016, "logps/rejected": -207.50994873046875, "loss": 2.0756, "nll_loss": 2.0572867393493652, "rewards/accuracies": 1.0, "rewards/chosen": 1.511637568473816, "rewards/margins": 10.63563060760498, "rewards/rejected": -9.123992919921875, "step": 2481 }, { "epoch": 0.4136666666666667, "grad_norm": 140.9442901611328, "learning_rate": 1.322333826958464e-07, "logits/chosen": 1.9627948999404907, "logits/rejected": 1.5800552368164062, "logps/chosen": -79.55172729492188, "logps/rejected": -12.410823822021484, "loss": 2.1483, "nll_loss": 0.8553948402404785, "rewards/accuracies": 0.0, "rewards/chosen": 2.2560386657714844, "rewards/margins": -0.16773653030395508, "rewards/rejected": 2.4237751960754395, "step": 2482 }, { "epoch": 0.41383333333333333, "grad_norm": 25.18315315246582, "learning_rate": 1.321822798324377e-07, "logits/chosen": 3.005481481552124, "logits/rejected": 3.127812147140503, "logps/chosen": -14.123515129089355, "logps/rejected": -163.40725708007812, "loss": 0.3644, "nll_loss": 0.3530879318714142, "rewards/accuracies": 1.0, "rewards/chosen": 2.314973831176758, "rewards/margins": 7.51032018661499, "rewards/rejected": -5.195346355438232, "step": 2483 }, { "epoch": 0.414, "grad_norm": 44.26122283935547, "learning_rate": 1.3213116759188524e-07, "logits/chosen": 1.723327875137329, "logits/rejected": 1.3272678852081299, "logps/chosen": -38.49433517456055, "logps/rejected": -33.462257385253906, "loss": 0.7353, "nll_loss": 0.6110212206840515, "rewards/accuracies": 1.0, "rewards/chosen": 1.0077221393585205, "rewards/margins": 2.984955072402954, "rewards/rejected": -1.9772329330444336, "step": 2484 }, { "epoch": 0.4141666666666667, "grad_norm": 81.21700286865234, "learning_rate": 1.3208004598908197e-07, "logits/chosen": 0.7669782042503357, "logits/rejected": 2.485741376876831, "logps/chosen": -34.70273208618164, "logps/rejected": -264.30712890625, "loss": 1.8414, "nll_loss": 1.8264597654342651, "rewards/accuracies": 1.0, "rewards/chosen": 1.944819450378418, "rewards/margins": 7.221226692199707, "rewards/rejected": -5.276407241821289, "step": 2485 }, { "epoch": 0.41433333333333333, "grad_norm": 40.556373596191406, "learning_rate": 1.3202891503892342e-07, "logits/chosen": 2.341451406478882, "logits/rejected": 2.385474920272827, "logps/chosen": -47.675445556640625, "logps/rejected": -44.757850646972656, "loss": 0.9792, "nll_loss": 0.9168356657028198, "rewards/accuracies": 1.0, "rewards/chosen": 1.4074318408966064, "rewards/margins": 4.099618434906006, "rewards/rejected": -2.6921865940093994, "step": 2486 }, { "epoch": 0.4145, "grad_norm": 88.68386840820312, "learning_rate": 1.3197777475630798e-07, "logits/chosen": 2.0782485008239746, "logits/rejected": 2.156862258911133, "logps/chosen": -11.874954223632812, "logps/rejected": -19.088481903076172, "loss": 1.437, "nll_loss": 0.456728994846344, "rewards/accuracies": 1.0, "rewards/chosen": 2.1996967792510986, "rewards/margins": 0.3617439270019531, "rewards/rejected": 1.8379528522491455, "step": 2487 }, { "epoch": 0.4146666666666667, "grad_norm": 38.54570388793945, "learning_rate": 1.3192662515613674e-07, "logits/chosen": 2.1909992694854736, "logits/rejected": 2.5474722385406494, "logps/chosen": -31.285198211669922, "logps/rejected": -206.72666931152344, "loss": 0.6127, "nll_loss": 0.5586642622947693, "rewards/accuracies": 1.0, "rewards/chosen": 0.33568230271339417, "rewards/margins": 9.865177154541016, "rewards/rejected": -9.529495239257812, "step": 2488 }, { "epoch": 0.41483333333333333, "grad_norm": 25.13865089416504, "learning_rate": 1.3187546625331333e-07, "logits/chosen": 2.2358815670013428, "logits/rejected": 2.303488254547119, "logps/chosen": -71.24169158935547, "logps/rejected": -156.37359619140625, "loss": 0.7904, "nll_loss": 0.7743663191795349, "rewards/accuracies": 1.0, "rewards/chosen": 2.1841208934783936, "rewards/margins": 6.617745399475098, "rewards/rejected": -4.433624267578125, "step": 2489 }, { "epoch": 0.415, "grad_norm": 19.41975212097168, "learning_rate": 1.318242980627444e-07, "logits/chosen": 1.9877263307571411, "logits/rejected": 2.231926441192627, "logps/chosen": -98.87489318847656, "logps/rejected": -325.19195556640625, "loss": 0.845, "nll_loss": 0.8379228115081787, "rewards/accuracies": 1.0, "rewards/chosen": 2.4883744716644287, "rewards/margins": 12.554977416992188, "rewards/rejected": -10.06660270690918, "step": 2490 }, { "epoch": 0.4151666666666667, "grad_norm": 25.977153778076172, "learning_rate": 1.3177312059933898e-07, "logits/chosen": 2.3431646823883057, "logits/rejected": 2.4593188762664795, "logps/chosen": -74.5731201171875, "logps/rejected": -138.12646484375, "loss": 0.7735, "nll_loss": 0.7383477091789246, "rewards/accuracies": 1.0, "rewards/chosen": 1.386042833328247, "rewards/margins": 5.178118705749512, "rewards/rejected": -3.7920761108398438, "step": 2491 }, { "epoch": 0.41533333333333333, "grad_norm": 49.406253814697266, "learning_rate": 1.3172193387800906e-07, "logits/chosen": 3.0090277194976807, "logits/rejected": 3.1074540615081787, "logps/chosen": -26.64554214477539, "logps/rejected": -295.54876708984375, "loss": 0.9203, "nll_loss": 0.9188117384910583, "rewards/accuracies": 1.0, "rewards/chosen": 4.164429187774658, "rewards/margins": 12.134479522705078, "rewards/rejected": -7.970050811767578, "step": 2492 }, { "epoch": 0.4155, "grad_norm": 47.76030731201172, "learning_rate": 1.3167073791366915e-07, "logits/chosen": 2.8189890384674072, "logits/rejected": 2.7853810787200928, "logps/chosen": -37.39767074584961, "logps/rejected": -128.03089904785156, "loss": 0.7603, "nll_loss": 0.7332876324653625, "rewards/accuracies": 1.0, "rewards/chosen": 1.2234257459640503, "rewards/margins": 6.627625465393066, "rewards/rejected": -5.404199600219727, "step": 2493 }, { "epoch": 0.4156666666666667, "grad_norm": 68.16609191894531, "learning_rate": 1.3161953272123657e-07, "logits/chosen": 3.1982223987579346, "logits/rejected": 3.1903302669525146, "logps/chosen": -108.8846206665039, "logps/rejected": -139.00953674316406, "loss": 1.4839, "nll_loss": 1.4714137315750122, "rewards/accuracies": 1.0, "rewards/chosen": 2.380730390548706, "rewards/margins": 7.124466896057129, "rewards/rejected": -4.743736267089844, "step": 2494 }, { "epoch": 0.41583333333333333, "grad_norm": 23.74209976196289, "learning_rate": 1.3156831831563126e-07, "logits/chosen": 2.8467183113098145, "logits/rejected": 2.8831729888916016, "logps/chosen": -44.72553253173828, "logps/rejected": -208.1309051513672, "loss": 0.6945, "nll_loss": 0.6776596903800964, "rewards/accuracies": 1.0, "rewards/chosen": 2.687842607498169, "rewards/margins": 6.510973930358887, "rewards/rejected": -3.8231310844421387, "step": 2495 }, { "epoch": 0.416, "grad_norm": 150.4718780517578, "learning_rate": 1.315170947117759e-07, "logits/chosen": 1.9109218120574951, "logits/rejected": 1.8821548223495483, "logps/chosen": -81.933837890625, "logps/rejected": -76.35077667236328, "loss": 1.8123, "nll_loss": 1.0780768394470215, "rewards/accuracies": 1.0, "rewards/chosen": -0.3886261284351349, "rewards/margins": 0.2172599732875824, "rewards/rejected": -0.6058861017227173, "step": 2496 }, { "epoch": 0.4161666666666667, "grad_norm": 27.836132049560547, "learning_rate": 1.3146586192459572e-07, "logits/chosen": 2.155775308609009, "logits/rejected": 3.3335559368133545, "logps/chosen": -78.76001739501953, "logps/rejected": -69.94622802734375, "loss": 0.7825, "nll_loss": 0.7500953078269958, "rewards/accuracies": 1.0, "rewards/chosen": 1.448682427406311, "rewards/margins": 5.341884136199951, "rewards/rejected": -3.8932018280029297, "step": 2497 }, { "epoch": 0.41633333333333333, "grad_norm": 18.769805908203125, "learning_rate": 1.3141461996901888e-07, "logits/chosen": 2.5810747146606445, "logits/rejected": 2.5470266342163086, "logps/chosen": -115.87168884277344, "logps/rejected": -202.27151489257812, "loss": 0.821, "nll_loss": 0.8159977793693542, "rewards/accuracies": 1.0, "rewards/chosen": 3.298518419265747, "rewards/margins": 8.867758750915527, "rewards/rejected": -5.569240570068359, "step": 2498 }, { "epoch": 0.4165, "grad_norm": 26.042177200317383, "learning_rate": 1.313633688599759e-07, "logits/chosen": 2.79591703414917, "logits/rejected": 2.913145065307617, "logps/chosen": -42.38127517700195, "logps/rejected": -172.0416717529297, "loss": 0.652, "nll_loss": 0.6232540011405945, "rewards/accuracies": 1.0, "rewards/chosen": 1.6713093519210815, "rewards/margins": 5.492398738861084, "rewards/rejected": -3.821089267730713, "step": 2499 }, { "epoch": 0.4166666666666667, "grad_norm": 19.376497268676758, "learning_rate": 1.3131210861240024e-07, "logits/chosen": 2.40254807472229, "logits/rejected": 2.5579710006713867, "logps/chosen": -198.60939025878906, "logps/rejected": -288.7725830078125, "loss": 0.991, "nll_loss": 0.9783715605735779, "rewards/accuracies": 1.0, "rewards/chosen": 1.9270492792129517, "rewards/margins": 9.017951965332031, "rewards/rejected": -7.090902805328369, "step": 2500 }, { "epoch": 0.41683333333333333, "grad_norm": 22.892864227294922, "learning_rate": 1.3126083924122786e-07, "logits/chosen": 1.826942801475525, "logits/rejected": 2.4212708473205566, "logps/chosen": -50.642181396484375, "logps/rejected": -162.905517578125, "loss": 0.5932, "nll_loss": 0.5690132975578308, "rewards/accuracies": 1.0, "rewards/chosen": 1.6366897821426392, "rewards/margins": 5.963479042053223, "rewards/rejected": -4.326789379119873, "step": 2501 }, { "epoch": 0.417, "grad_norm": 39.10463333129883, "learning_rate": 1.3120956076139745e-07, "logits/chosen": 2.8309624195098877, "logits/rejected": 2.82899808883667, "logps/chosen": -78.59478759765625, "logps/rejected": -110.45535278320312, "loss": 1.0163, "nll_loss": 0.9703059196472168, "rewards/accuracies": 1.0, "rewards/chosen": 0.8070686459541321, "rewards/margins": 5.126003742218018, "rewards/rejected": -4.318934917449951, "step": 2502 }, { "epoch": 0.4171666666666667, "grad_norm": 139.2108917236328, "learning_rate": 1.3115827318785026e-07, "logits/chosen": 2.950570821762085, "logits/rejected": 2.9049811363220215, "logps/chosen": -113.46522521972656, "logps/rejected": -34.660606384277344, "loss": 2.0772, "nll_loss": 1.0604227781295776, "rewards/accuracies": 1.0, "rewards/chosen": 2.493481397628784, "rewards/margins": 0.3953580856323242, "rewards/rejected": 2.09812331199646, "step": 2503 }, { "epoch": 0.41733333333333333, "grad_norm": 50.73121643066406, "learning_rate": 1.3110697653553034e-07, "logits/chosen": 3.0110273361206055, "logits/rejected": 2.9231479167938232, "logps/chosen": -89.81939697265625, "logps/rejected": -101.707275390625, "loss": 1.5387, "nll_loss": 1.4724493026733398, "rewards/accuracies": 1.0, "rewards/chosen": 0.530957043170929, "rewards/margins": 4.31020975112915, "rewards/rejected": -3.779252529144287, "step": 2504 }, { "epoch": 0.4175, "grad_norm": 157.0304718017578, "learning_rate": 1.3105567081938423e-07, "logits/chosen": 1.943904161453247, "logits/rejected": 2.3602004051208496, "logps/chosen": -62.93885803222656, "logps/rejected": -91.95476531982422, "loss": 1.5301, "nll_loss": 0.7404570579528809, "rewards/accuracies": 1.0, "rewards/chosen": 0.9285003542900085, "rewards/margins": 0.3410552740097046, "rewards/rejected": 0.587445080280304, "step": 2505 }, { "epoch": 0.4176666666666667, "grad_norm": 31.678760528564453, "learning_rate": 1.3100435605436125e-07, "logits/chosen": 0.81809401512146, "logits/rejected": 1.8592697381973267, "logps/chosen": -61.48207092285156, "logps/rejected": -262.9160461425781, "loss": 0.8945, "nll_loss": 0.8422200679779053, "rewards/accuracies": 1.0, "rewards/chosen": 0.4462699890136719, "rewards/margins": 6.078936576843262, "rewards/rejected": -5.63266658782959, "step": 2506 }, { "epoch": 0.41783333333333333, "grad_norm": 46.770042419433594, "learning_rate": 1.3095303225541323e-07, "logits/chosen": 2.070521354675293, "logits/rejected": 2.09735107421875, "logps/chosen": -51.74908447265625, "logps/rejected": -214.8771209716797, "loss": 1.0939, "nll_loss": 1.0561037063598633, "rewards/accuracies": 1.0, "rewards/chosen": 1.6469459533691406, "rewards/margins": 4.956905364990234, "rewards/rejected": -3.3099594116210938, "step": 2507 }, { "epoch": 0.418, "grad_norm": 25.997455596923828, "learning_rate": 1.3090169943749475e-07, "logits/chosen": 2.2683799266815186, "logits/rejected": 2.49796724319458, "logps/chosen": -57.159912109375, "logps/rejected": -303.3233947753906, "loss": 0.7034, "nll_loss": 0.6804751753807068, "rewards/accuracies": 1.0, "rewards/chosen": 1.3019592761993408, "rewards/margins": 8.14566421508789, "rewards/rejected": -6.843704700469971, "step": 2508 }, { "epoch": 0.4181666666666667, "grad_norm": 37.62495040893555, "learning_rate": 1.3085035761556289e-07, "logits/chosen": 2.3205835819244385, "logits/rejected": 2.8008410930633545, "logps/chosen": -49.3477783203125, "logps/rejected": -379.38702392578125, "loss": 1.0329, "nll_loss": 1.0280786752700806, "rewards/accuracies": 1.0, "rewards/chosen": 3.733872175216675, "rewards/margins": 8.888840675354004, "rewards/rejected": -5.15496826171875, "step": 2509 }, { "epoch": 0.41833333333333333, "grad_norm": 162.7836456298828, "learning_rate": 1.307990068045775e-07, "logits/chosen": 3.049961805343628, "logits/rejected": 3.0249550342559814, "logps/chosen": -62.046302795410156, "logps/rejected": -51.49974060058594, "loss": 2.2224, "nll_loss": 0.7853962182998657, "rewards/accuracies": 0.0, "rewards/chosen": 2.320425510406494, "rewards/margins": -0.37115478515625, "rewards/rejected": 2.691580295562744, "step": 2510 }, { "epoch": 0.4185, "grad_norm": 35.70144271850586, "learning_rate": 1.3074764701950094e-07, "logits/chosen": 2.7716193199157715, "logits/rejected": 2.9472899436950684, "logps/chosen": -9.439302444458008, "logps/rejected": -85.4949951171875, "loss": 0.3722, "nll_loss": 0.36305004358291626, "rewards/accuracies": 1.0, "rewards/chosen": 2.36958646774292, "rewards/margins": 8.447196960449219, "rewards/rejected": -6.077610015869141, "step": 2511 }, { "epoch": 0.4186666666666667, "grad_norm": 27.08454704284668, "learning_rate": 1.3069627827529823e-07, "logits/chosen": 0.9677869081497192, "logits/rejected": 2.537360191345215, "logps/chosen": -39.545902252197266, "logps/rejected": -397.24993896484375, "loss": 0.6879, "nll_loss": 0.6702696681022644, "rewards/accuracies": 1.0, "rewards/chosen": 1.6579304933547974, "rewards/margins": 7.453487396240234, "rewards/rejected": -5.795557022094727, "step": 2512 }, { "epoch": 0.41883333333333334, "grad_norm": 160.16421508789062, "learning_rate": 1.3064490058693695e-07, "logits/chosen": 4.391879081726074, "logits/rejected": 4.466272354125977, "logps/chosen": -118.86143493652344, "logps/rejected": -45.53254699707031, "loss": 1.8361, "nll_loss": 1.2644832134246826, "rewards/accuracies": 1.0, "rewards/chosen": -2.089815616607666, "rewards/margins": 0.9967372417449951, "rewards/rejected": -3.086552858352661, "step": 2513 }, { "epoch": 0.419, "grad_norm": 44.1799201965332, "learning_rate": 1.3059351396938738e-07, "logits/chosen": 2.401190996170044, "logits/rejected": 2.53167462348938, "logps/chosen": -21.989748001098633, "logps/rejected": -238.48919677734375, "loss": 0.7265, "nll_loss": 0.6871795058250427, "rewards/accuracies": 1.0, "rewards/chosen": 0.9016185998916626, "rewards/margins": 5.613038063049316, "rewards/rejected": -4.711419582366943, "step": 2514 }, { "epoch": 0.4191666666666667, "grad_norm": 76.25450134277344, "learning_rate": 1.305421184376223e-07, "logits/chosen": 1.259801983833313, "logits/rejected": 2.094010829925537, "logps/chosen": -8.950791358947754, "logps/rejected": -351.87762451171875, "loss": 0.654, "nll_loss": 0.639342188835144, "rewards/accuracies": 1.0, "rewards/chosen": 1.8530206680297852, "rewards/margins": 7.7783989906311035, "rewards/rejected": -5.925378322601318, "step": 2515 }, { "epoch": 0.41933333333333334, "grad_norm": 19.7747745513916, "learning_rate": 1.3049071400661715e-07, "logits/chosen": 2.8084423542022705, "logits/rejected": 2.688192129135132, "logps/chosen": -171.69586181640625, "logps/rejected": -65.43041229248047, "loss": 0.98, "nll_loss": 0.9700330495834351, "rewards/accuracies": 1.0, "rewards/chosen": 2.4722230434417725, "rewards/margins": 7.703035354614258, "rewards/rejected": -5.230812072753906, "step": 2516 }, { "epoch": 0.4195, "grad_norm": 19.952428817749023, "learning_rate": 1.3043930069134996e-07, "logits/chosen": 2.2845444679260254, "logits/rejected": 2.164877414703369, "logps/chosen": -194.80177307128906, "logps/rejected": -204.07472229003906, "loss": 0.9585, "nll_loss": 0.9502525329589844, "rewards/accuracies": 1.0, "rewards/chosen": 2.340541124343872, "rewards/margins": 11.08604621887207, "rewards/rejected": -8.745505332946777, "step": 2517 }, { "epoch": 0.4196666666666667, "grad_norm": 44.548065185546875, "learning_rate": 1.3038787850680133e-07, "logits/chosen": 2.703054428100586, "logits/rejected": 2.9992263317108154, "logps/chosen": -87.85317993164062, "logps/rejected": -284.819091796875, "loss": 1.444, "nll_loss": 1.4169869422912598, "rewards/accuracies": 1.0, "rewards/chosen": 1.1049377918243408, "rewards/margins": 8.619954109191895, "rewards/rejected": -7.515016078948975, "step": 2518 }, { "epoch": 0.41983333333333334, "grad_norm": 183.14404296875, "learning_rate": 1.3033644746795446e-07, "logits/chosen": 2.8020150661468506, "logits/rejected": 2.738295555114746, "logps/chosen": -48.80263137817383, "logps/rejected": -17.609201431274414, "loss": 3.6936, "nll_loss": 0.7072843909263611, "rewards/accuracies": 0.0, "rewards/chosen": 1.1605972051620483, "rewards/margins": -2.494678020477295, "rewards/rejected": 3.6552751064300537, "step": 2519 }, { "epoch": 0.42, "grad_norm": 26.579017639160156, "learning_rate": 1.3028500758979505e-07, "logits/chosen": 2.639528512954712, "logits/rejected": 2.87211012840271, "logps/chosen": -62.179203033447266, "logps/rejected": -394.834228515625, "loss": 0.8592, "nll_loss": 0.8402596116065979, "rewards/accuracies": 1.0, "rewards/chosen": 1.4657528400421143, "rewards/margins": 11.159408569335938, "rewards/rejected": -9.693655967712402, "step": 2520 }, { "epoch": 0.4201666666666667, "grad_norm": 220.43577575683594, "learning_rate": 1.3023355888731155e-07, "logits/chosen": 3.099231004714966, "logits/rejected": 3.0951154232025146, "logps/chosen": -17.843978881835938, "logps/rejected": -83.1141128540039, "loss": 1.6342, "nll_loss": 0.6153095960617065, "rewards/accuracies": 0.0, "rewards/chosen": 0.9467892646789551, "rewards/margins": -0.07686567306518555, "rewards/rejected": 1.0236549377441406, "step": 2521 }, { "epoch": 0.42033333333333334, "grad_norm": 113.0089340209961, "learning_rate": 1.301821013754948e-07, "logits/chosen": 2.501863956451416, "logits/rejected": 2.6400179862976074, "logps/chosen": -16.333232879638672, "logps/rejected": -46.051475524902344, "loss": 0.8165, "nll_loss": 0.6282012462615967, "rewards/accuracies": 1.0, "rewards/chosen": 1.4466361999511719, "rewards/margins": 2.5879642963409424, "rewards/rejected": -1.1413280963897705, "step": 2522 }, { "epoch": 0.4205, "grad_norm": 24.75213050842285, "learning_rate": 1.3013063506933837e-07, "logits/chosen": 1.7065300941467285, "logits/rejected": 1.3551807403564453, "logps/chosen": -63.895896911621094, "logps/rejected": -37.27557373046875, "loss": 0.6699, "nll_loss": 0.5862009525299072, "rewards/accuracies": 1.0, "rewards/chosen": 1.7564873695373535, "rewards/margins": 3.8153138160705566, "rewards/rejected": -2.058826446533203, "step": 2523 }, { "epoch": 0.4206666666666667, "grad_norm": 22.237056732177734, "learning_rate": 1.300791599838382e-07, "logits/chosen": 2.53051495552063, "logits/rejected": 2.689824104309082, "logps/chosen": -93.78565979003906, "logps/rejected": -170.44581604003906, "loss": 0.8158, "nll_loss": 0.8015868067741394, "rewards/accuracies": 1.0, "rewards/chosen": 2.258801221847534, "rewards/margins": 6.859418869018555, "rewards/rejected": -4.6006178855896, "step": 2524 }, { "epoch": 0.42083333333333334, "grad_norm": 36.00155258178711, "learning_rate": 1.3002767613399296e-07, "logits/chosen": 3.368649959564209, "logits/rejected": 3.2699451446533203, "logps/chosen": -52.71688461303711, "logps/rejected": -92.13724517822266, "loss": 0.9973, "nll_loss": 0.9946581125259399, "rewards/accuracies": 1.0, "rewards/chosen": 4.145051956176758, "rewards/margins": 10.046666145324707, "rewards/rejected": -5.901614189147949, "step": 2525 }, { "epoch": 0.421, "grad_norm": 27.532758712768555, "learning_rate": 1.2997618353480377e-07, "logits/chosen": 2.583364248275757, "logits/rejected": 2.568514585494995, "logps/chosen": -76.95669555664062, "logps/rejected": -156.68649291992188, "loss": 0.8821, "nll_loss": 0.864682137966156, "rewards/accuracies": 1.0, "rewards/chosen": 1.841069221496582, "rewards/margins": 6.788417339324951, "rewards/rejected": -4.947348117828369, "step": 2526 }, { "epoch": 0.4211666666666667, "grad_norm": 28.43510627746582, "learning_rate": 1.2992468220127438e-07, "logits/chosen": 2.5395545959472656, "logits/rejected": 2.473093271255493, "logps/chosen": -57.34740447998047, "logps/rejected": -121.40728759765625, "loss": 0.7341, "nll_loss": 0.7168425917625427, "rewards/accuracies": 1.0, "rewards/chosen": 1.6432113647460938, "rewards/margins": 7.845135688781738, "rewards/rejected": -6.2019243240356445, "step": 2527 }, { "epoch": 0.42133333333333334, "grad_norm": 41.3637580871582, "learning_rate": 1.2987317214841097e-07, "logits/chosen": 2.359921932220459, "logits/rejected": 2.355320453643799, "logps/chosen": -21.9223575592041, "logps/rejected": -92.9022445678711, "loss": 0.7755, "nll_loss": 0.7559433579444885, "rewards/accuracies": 1.0, "rewards/chosen": 3.150141477584839, "rewards/margins": 6.564708709716797, "rewards/rejected": -3.414567470550537, "step": 2528 }, { "epoch": 0.4215, "grad_norm": 225.81903076171875, "learning_rate": 1.2982165339122246e-07, "logits/chosen": 2.9338059425354004, "logits/rejected": 3.0382394790649414, "logps/chosen": -98.84992980957031, "logps/rejected": -282.6046142578125, "loss": 2.0617, "nll_loss": 1.3358097076416016, "rewards/accuracies": 1.0, "rewards/chosen": 1.2925316095352173, "rewards/margins": 0.583422064781189, "rewards/rejected": 0.7091095447540283, "step": 2529 }, { "epoch": 0.4216666666666667, "grad_norm": 28.68623161315918, "learning_rate": 1.2977012594472005e-07, "logits/chosen": 2.2790510654449463, "logits/rejected": 2.5340449810028076, "logps/chosen": -40.38744354248047, "logps/rejected": -617.579833984375, "loss": 0.6487, "nll_loss": 0.6410704851150513, "rewards/accuracies": 1.0, "rewards/chosen": 2.4586923122406006, "rewards/margins": 9.718494415283203, "rewards/rejected": -7.259802341461182, "step": 2530 }, { "epoch": 0.42183333333333334, "grad_norm": 37.44463348388672, "learning_rate": 1.2971858982391767e-07, "logits/chosen": 2.8618032932281494, "logits/rejected": 2.74682354927063, "logps/chosen": -61.019798278808594, "logps/rejected": -57.687652587890625, "loss": 1.077, "nll_loss": 0.9841902852058411, "rewards/accuracies": 1.0, "rewards/chosen": 1.3563560247421265, "rewards/margins": 3.503633975982666, "rewards/rejected": -2.14727783203125, "step": 2531 }, { "epoch": 0.422, "grad_norm": 30.064956665039062, "learning_rate": 1.2966704504383168e-07, "logits/chosen": 1.7545132637023926, "logits/rejected": 1.9383403062820435, "logps/chosen": -85.10563659667969, "logps/rejected": -162.79281616210938, "loss": 0.9668, "nll_loss": 0.9456180334091187, "rewards/accuracies": 1.0, "rewards/chosen": 1.510833740234375, "rewards/margins": 6.849646091461182, "rewards/rejected": -5.338812351226807, "step": 2532 }, { "epoch": 0.4221666666666667, "grad_norm": 27.15550994873047, "learning_rate": 1.2961549161948103e-07, "logits/chosen": 2.580379009246826, "logits/rejected": 2.45957350730896, "logps/chosen": -49.65904235839844, "logps/rejected": -73.15152740478516, "loss": 0.7958, "nll_loss": 0.763985276222229, "rewards/accuracies": 1.0, "rewards/chosen": 1.931431770324707, "rewards/margins": 5.284174919128418, "rewards/rejected": -3.352743148803711, "step": 2533 }, { "epoch": 0.42233333333333334, "grad_norm": 35.341983795166016, "learning_rate": 1.2956392956588706e-07, "logits/chosen": 1.6594464778900146, "logits/rejected": 1.2853695154190063, "logps/chosen": -97.03985595703125, "logps/rejected": -48.900089263916016, "loss": 1.2295, "nll_loss": 1.1691547632217407, "rewards/accuracies": 1.0, "rewards/chosen": 1.524057149887085, "rewards/margins": 4.1790618896484375, "rewards/rejected": -2.6550045013427734, "step": 2534 }, { "epoch": 0.4225, "grad_norm": 28.012880325317383, "learning_rate": 1.2951235889807383e-07, "logits/chosen": 2.2526259422302246, "logits/rejected": 2.193974018096924, "logps/chosen": -53.984989166259766, "logps/rejected": -145.48167419433594, "loss": 0.9823, "nll_loss": 0.9640176892280579, "rewards/accuracies": 1.0, "rewards/chosen": 1.8469905853271484, "rewards/margins": 6.578649520874023, "rewards/rejected": -4.731658935546875, "step": 2535 }, { "epoch": 0.4226666666666667, "grad_norm": 23.177448272705078, "learning_rate": 1.2946077963106773e-07, "logits/chosen": 0.858363926410675, "logits/rejected": 2.876628875732422, "logps/chosen": -55.42368698120117, "logps/rejected": -120.361572265625, "loss": 0.6419, "nll_loss": 0.6158187389373779, "rewards/accuracies": 1.0, "rewards/chosen": 1.3297580480575562, "rewards/margins": 6.331960678100586, "rewards/rejected": -5.00220251083374, "step": 2536 }, { "epoch": 0.42283333333333334, "grad_norm": 39.21195602416992, "learning_rate": 1.2940919177989772e-07, "logits/chosen": 1.761474847793579, "logits/rejected": 2.663539409637451, "logps/chosen": -45.41287612915039, "logps/rejected": -155.1907958984375, "loss": 0.9302, "nll_loss": 0.9082573652267456, "rewards/accuracies": 1.0, "rewards/chosen": 1.3586530685424805, "rewards/margins": 7.883345127105713, "rewards/rejected": -6.524692058563232, "step": 2537 }, { "epoch": 0.423, "grad_norm": 26.523412704467773, "learning_rate": 1.2935759535959526e-07, "logits/chosen": 1.9647369384765625, "logits/rejected": 0.6875874996185303, "logps/chosen": -198.751708984375, "logps/rejected": -74.63549041748047, "loss": 0.9244, "nll_loss": 0.8872843980789185, "rewards/accuracies": 1.0, "rewards/chosen": 0.8355530500411987, "rewards/margins": 6.434108257293701, "rewards/rejected": -5.598555088043213, "step": 2538 }, { "epoch": 0.4231666666666667, "grad_norm": 24.805864334106445, "learning_rate": 1.2930599038519434e-07, "logits/chosen": 3.0336883068084717, "logits/rejected": 3.0979082584381104, "logps/chosen": -81.5595703125, "logps/rejected": -133.7696533203125, "loss": 0.9139, "nll_loss": 0.8865170478820801, "rewards/accuracies": 1.0, "rewards/chosen": 1.8889825344085693, "rewards/margins": 5.547045707702637, "rewards/rejected": -3.6580629348754883, "step": 2539 }, { "epoch": 0.42333333333333334, "grad_norm": 87.53717803955078, "learning_rate": 1.292543768717314e-07, "logits/chosen": 2.291699171066284, "logits/rejected": 3.004077911376953, "logps/chosen": -28.96672821044922, "logps/rejected": -332.4014587402344, "loss": 1.2144, "nll_loss": 1.2069469690322876, "rewards/accuracies": 1.0, "rewards/chosen": 2.4761550426483154, "rewards/margins": 9.736485481262207, "rewards/rejected": -7.2603302001953125, "step": 2540 }, { "epoch": 0.4235, "grad_norm": 267.7635498046875, "learning_rate": 1.2920275483424537e-07, "logits/chosen": 3.1695594787597656, "logits/rejected": 3.165619373321533, "logps/chosen": -123.63037872314453, "logps/rejected": -55.524658203125, "loss": 3.4551, "nll_loss": 1.236303687095642, "rewards/accuracies": 0.0, "rewards/chosen": 1.0156303644180298, "rewards/margins": -1.6746026277542114, "rewards/rejected": 2.690232992172241, "step": 2541 }, { "epoch": 0.4236666666666667, "grad_norm": 29.3151798248291, "learning_rate": 1.2915112428777769e-07, "logits/chosen": 2.488626718521118, "logits/rejected": 2.541290044784546, "logps/chosen": -23.643417358398438, "logps/rejected": -87.54399108886719, "loss": 0.5816, "nll_loss": 0.5498469471931458, "rewards/accuracies": 1.0, "rewards/chosen": 1.460680365562439, "rewards/margins": 5.377456188201904, "rewards/rejected": -3.916775703430176, "step": 2542 }, { "epoch": 0.42383333333333334, "grad_norm": 56.480892181396484, "learning_rate": 1.2909948524737218e-07, "logits/chosen": 2.9808859825134277, "logits/rejected": 2.8246703147888184, "logps/chosen": -93.03102111816406, "logps/rejected": -74.52735137939453, "loss": 1.4649, "nll_loss": 1.368103265762329, "rewards/accuracies": 1.0, "rewards/chosen": 0.24648360908031464, "rewards/margins": 3.5531227588653564, "rewards/rejected": -3.3066391944885254, "step": 2543 }, { "epoch": 0.424, "grad_norm": 26.305362701416016, "learning_rate": 1.2904783772807532e-07, "logits/chosen": 2.467630386352539, "logits/rejected": 2.6895649433135986, "logps/chosen": -58.4163932800293, "logps/rejected": -187.66842651367188, "loss": 0.6682, "nll_loss": 0.6563639044761658, "rewards/accuracies": 1.0, "rewards/chosen": 2.034381628036499, "rewards/margins": 8.505593299865723, "rewards/rejected": -6.4712114334106445, "step": 2544 }, { "epoch": 0.4241666666666667, "grad_norm": 32.009002685546875, "learning_rate": 1.289961817449359e-07, "logits/chosen": 2.7712292671203613, "logits/rejected": 2.7852351665496826, "logps/chosen": -36.623619079589844, "logps/rejected": -114.84414672851562, "loss": 0.7458, "nll_loss": 0.718110203742981, "rewards/accuracies": 1.0, "rewards/chosen": 1.1194041967391968, "rewards/margins": 7.333763599395752, "rewards/rejected": -6.214359283447266, "step": 2545 }, { "epoch": 0.42433333333333334, "grad_norm": 92.32955932617188, "learning_rate": 1.2894451731300532e-07, "logits/chosen": 2.990626335144043, "logits/rejected": 3.007783889770508, "logps/chosen": -57.76247024536133, "logps/rejected": -49.978851318359375, "loss": 2.2719, "nll_loss": 1.9254156351089478, "rewards/accuracies": 1.0, "rewards/chosen": 1.0725475549697876, "rewards/margins": 1.604149341583252, "rewards/rejected": -0.5316017270088196, "step": 2546 }, { "epoch": 0.4245, "grad_norm": 31.724292755126953, "learning_rate": 1.2889284444733722e-07, "logits/chosen": 1.8489857912063599, "logits/rejected": 2.230103015899658, "logps/chosen": -53.805908203125, "logps/rejected": -186.31004333496094, "loss": 0.8067, "nll_loss": 0.77979576587677, "rewards/accuracies": 1.0, "rewards/chosen": 1.2595070600509644, "rewards/margins": 6.431197643280029, "rewards/rejected": -5.171690464019775, "step": 2547 }, { "epoch": 0.4246666666666667, "grad_norm": 35.873626708984375, "learning_rate": 1.288411631629879e-07, "logits/chosen": 0.9721578359603882, "logits/rejected": 1.891520380973816, "logps/chosen": -15.781978607177734, "logps/rejected": -216.41378784179688, "loss": 0.4926, "nll_loss": 0.45091360807418823, "rewards/accuracies": 1.0, "rewards/chosen": 0.8994192481040955, "rewards/margins": 5.313426971435547, "rewards/rejected": -4.414007663726807, "step": 2548 }, { "epoch": 0.42483333333333334, "grad_norm": 26.68351173400879, "learning_rate": 1.2878947347501605e-07, "logits/chosen": 1.5716902017593384, "logits/rejected": 1.6281687021255493, "logps/chosen": -74.32861328125, "logps/rejected": -117.16741943359375, "loss": 0.7734, "nll_loss": 0.7507942318916321, "rewards/accuracies": 1.0, "rewards/chosen": 1.7742767333984375, "rewards/margins": 6.008714199066162, "rewards/rejected": -4.234437465667725, "step": 2549 }, { "epoch": 0.425, "grad_norm": 37.54792022705078, "learning_rate": 1.2873777539848284e-07, "logits/chosen": 2.207629442214966, "logits/rejected": 2.1506545543670654, "logps/chosen": -77.66358184814453, "logps/rejected": -126.60973358154297, "loss": 1.0818, "nll_loss": 1.0355143547058105, "rewards/accuracies": 1.0, "rewards/chosen": 0.6680030822753906, "rewards/margins": 5.55474853515625, "rewards/rejected": -4.886745452880859, "step": 2550 }, { "epoch": 0.4251666666666667, "grad_norm": 36.80956268310547, "learning_rate": 1.2868606894845172e-07, "logits/chosen": 2.9144554138183594, "logits/rejected": 3.078866958618164, "logps/chosen": -59.045204162597656, "logps/rejected": -322.5299377441406, "loss": 0.8123, "nll_loss": 0.7872693538665771, "rewards/accuracies": 1.0, "rewards/chosen": 1.4511475563049316, "rewards/margins": 6.159088134765625, "rewards/rejected": -4.707940578460693, "step": 2551 }, { "epoch": 0.42533333333333334, "grad_norm": 51.841827392578125, "learning_rate": 1.286343541399889e-07, "logits/chosen": 2.5282225608825684, "logits/rejected": 2.537287473678589, "logps/chosen": -13.358621597290039, "logps/rejected": -37.316429138183594, "loss": 0.8509, "nll_loss": 0.7421455979347229, "rewards/accuracies": 1.0, "rewards/chosen": 0.23182201385498047, "rewards/margins": 3.286813735961914, "rewards/rejected": -3.0549917221069336, "step": 2552 }, { "epoch": 0.4255, "grad_norm": 36.82525634765625, "learning_rate": 1.2858263098816265e-07, "logits/chosen": 1.3134751319885254, "logits/rejected": 2.202321767807007, "logps/chosen": -94.34607696533203, "logps/rejected": -306.36822509765625, "loss": 1.1234, "nll_loss": 1.0970474481582642, "rewards/accuracies": 1.0, "rewards/chosen": 1.1089378595352173, "rewards/margins": 13.674695014953613, "rewards/rejected": -12.565756797790527, "step": 2553 }, { "epoch": 0.4256666666666667, "grad_norm": 57.2149658203125, "learning_rate": 1.2853089950804396e-07, "logits/chosen": 2.153998613357544, "logits/rejected": 2.452725887298584, "logps/chosen": -19.137489318847656, "logps/rejected": -237.0845489501953, "loss": 0.7391, "nll_loss": 0.7360572814941406, "rewards/accuracies": 1.0, "rewards/chosen": 3.6081979274749756, "rewards/margins": 10.084650039672852, "rewards/rejected": -6.476451873779297, "step": 2554 }, { "epoch": 0.42583333333333334, "grad_norm": 25.67987823486328, "learning_rate": 1.2847915971470611e-07, "logits/chosen": 2.84460711479187, "logits/rejected": 2.8907670974731445, "logps/chosen": -41.82997131347656, "logps/rejected": -173.05960083007812, "loss": 0.6417, "nll_loss": 0.6151466965675354, "rewards/accuracies": 1.0, "rewards/chosen": 1.7264397144317627, "rewards/margins": 5.649321556091309, "rewards/rejected": -3.922882080078125, "step": 2555 }, { "epoch": 0.426, "grad_norm": 20.637941360473633, "learning_rate": 1.2842741162322484e-07, "logits/chosen": 1.8150097131729126, "logits/rejected": 2.3988685607910156, "logps/chosen": -93.67546081542969, "logps/rejected": -389.6033630371094, "loss": 0.8494, "nll_loss": 0.8439230918884277, "rewards/accuracies": 1.0, "rewards/chosen": 2.733163595199585, "rewards/margins": 19.19556999206543, "rewards/rejected": -16.462406158447266, "step": 2556 }, { "epoch": 0.4261666666666667, "grad_norm": 121.00846862792969, "learning_rate": 1.2837565524867832e-07, "logits/chosen": 1.4650059938430786, "logits/rejected": 2.3516845703125, "logps/chosen": -129.1298065185547, "logps/rejected": -162.8888702392578, "loss": 2.1078, "nll_loss": 1.8447115421295166, "rewards/accuracies": 1.0, "rewards/chosen": -1.3118011951446533, "rewards/margins": 2.497737169265747, "rewards/rejected": -3.8095383644104004, "step": 2557 }, { "epoch": 0.42633333333333334, "grad_norm": 134.04042053222656, "learning_rate": 1.2832389060614705e-07, "logits/chosen": 2.3717002868652344, "logits/rejected": 2.3657195568084717, "logps/chosen": -34.61155700683594, "logps/rejected": -11.891983032226562, "loss": 2.3618, "nll_loss": 0.5866365432739258, "rewards/accuracies": 0.0, "rewards/chosen": 1.2313880920410156, "rewards/margins": -1.096221685409546, "rewards/rejected": 2.3276097774505615, "step": 2558 }, { "epoch": 0.4265, "grad_norm": 47.090911865234375, "learning_rate": 1.2827211771071408e-07, "logits/chosen": 1.8114007711410522, "logits/rejected": 1.9067264795303345, "logps/chosen": -34.50474548339844, "logps/rejected": -54.40459060668945, "loss": 0.9479, "nll_loss": 0.9080196022987366, "rewards/accuracies": 1.0, "rewards/chosen": 1.3711498975753784, "rewards/margins": 4.895237445831299, "rewards/rejected": -3.524087429046631, "step": 2559 }, { "epoch": 0.4266666666666667, "grad_norm": 23.36213493347168, "learning_rate": 1.2822033657746476e-07, "logits/chosen": 2.232665538787842, "logits/rejected": 2.2765839099884033, "logps/chosen": -135.44203186035156, "logps/rejected": -410.7046203613281, "loss": 1.0041, "nll_loss": 0.9886278510093689, "rewards/accuracies": 1.0, "rewards/chosen": 1.6814546585083008, "rewards/margins": 9.858566284179688, "rewards/rejected": -8.177111625671387, "step": 2560 }, { "epoch": 0.42683333333333334, "grad_norm": 34.37957000732422, "learning_rate": 1.281685472214869e-07, "logits/chosen": 3.3504812717437744, "logits/rejected": 3.422661781311035, "logps/chosen": -74.11690521240234, "logps/rejected": -164.6121826171875, "loss": 0.9704, "nll_loss": 0.9038645029067993, "rewards/accuracies": 1.0, "rewards/chosen": 0.1786651611328125, "rewards/margins": 5.4939284324646, "rewards/rejected": -5.315263271331787, "step": 2561 }, { "epoch": 0.427, "grad_norm": 24.702674865722656, "learning_rate": 1.2811674965787056e-07, "logits/chosen": 2.9469118118286133, "logits/rejected": 2.9958600997924805, "logps/chosen": -25.099365234375, "logps/rejected": -270.150146484375, "loss": 0.5637, "nll_loss": 0.545638382434845, "rewards/accuracies": 1.0, "rewards/chosen": 2.095752716064453, "rewards/margins": 6.364398956298828, "rewards/rejected": -4.268646240234375, "step": 2562 }, { "epoch": 0.42716666666666664, "grad_norm": 24.871795654296875, "learning_rate": 1.2806494390170845e-07, "logits/chosen": 1.777923822402954, "logits/rejected": 1.919067621231079, "logps/chosen": -78.99360656738281, "logps/rejected": -128.7161865234375, "loss": 0.7058, "nll_loss": 0.6869009137153625, "rewards/accuracies": 1.0, "rewards/chosen": 1.7466583251953125, "rewards/margins": 6.63856840133667, "rewards/rejected": -4.891910076141357, "step": 2563 }, { "epoch": 0.42733333333333334, "grad_norm": 118.43026733398438, "learning_rate": 1.2801312996809544e-07, "logits/chosen": 3.0480709075927734, "logits/rejected": 3.2432072162628174, "logps/chosen": -37.48756408691406, "logps/rejected": -177.06094360351562, "loss": 1.1303, "nll_loss": 0.7650523781776428, "rewards/accuracies": 1.0, "rewards/chosen": 0.533477783203125, "rewards/margins": 1.3731110095977783, "rewards/rejected": -0.8396332263946533, "step": 2564 }, { "epoch": 0.4275, "grad_norm": 32.77342987060547, "learning_rate": 1.2796130787212888e-07, "logits/chosen": 2.053060531616211, "logits/rejected": 2.5445308685302734, "logps/chosen": -58.50604248046875, "logps/rejected": -219.2520751953125, "loss": 0.8813, "nll_loss": 0.8732243776321411, "rewards/accuracies": 1.0, "rewards/chosen": 2.3377320766448975, "rewards/margins": 11.277284622192383, "rewards/rejected": -8.939552307128906, "step": 2565 }, { "epoch": 0.42766666666666664, "grad_norm": 17.201656341552734, "learning_rate": 1.279094776289085e-07, "logits/chosen": 2.6283962726593018, "logits/rejected": 2.594393014907837, "logps/chosen": -129.12124633789062, "logps/rejected": -186.30874633789062, "loss": 0.6589, "nll_loss": 0.6329472661018372, "rewards/accuracies": 1.0, "rewards/chosen": 2.232501268386841, "rewards/margins": 5.685894966125488, "rewards/rejected": -3.4533939361572266, "step": 2566 }, { "epoch": 0.42783333333333334, "grad_norm": 274.7993469238281, "learning_rate": 1.278576392535364e-07, "logits/chosen": 2.7540063858032227, "logits/rejected": 2.779353380203247, "logps/chosen": -30.045516967773438, "logps/rejected": -85.85202026367188, "loss": 3.3014, "nll_loss": 1.5022757053375244, "rewards/accuracies": 0.0, "rewards/chosen": 0.5863037109375, "rewards/margins": -1.2547357082366943, "rewards/rejected": 1.8410394191741943, "step": 2567 }, { "epoch": 0.428, "grad_norm": 37.71147537231445, "learning_rate": 1.27805792761117e-07, "logits/chosen": 1.9621895551681519, "logits/rejected": 2.844623565673828, "logps/chosen": -44.973487854003906, "logps/rejected": -448.02056884765625, "loss": 1.1951, "nll_loss": 1.183512806892395, "rewards/accuracies": 1.0, "rewards/chosen": 2.4021737575531006, "rewards/margins": 7.305402755737305, "rewards/rejected": -4.903228759765625, "step": 2568 }, { "epoch": 0.42816666666666664, "grad_norm": 37.18513488769531, "learning_rate": 1.2775393816675716e-07, "logits/chosen": 1.6210585832595825, "logits/rejected": 1.994240164756775, "logps/chosen": -36.171546936035156, "logps/rejected": -133.68252563476562, "loss": 0.7187, "nll_loss": 0.65766441822052, "rewards/accuracies": 1.0, "rewards/chosen": 0.9380291700363159, "rewards/margins": 4.1614766120910645, "rewards/rejected": -3.223447561264038, "step": 2569 }, { "epoch": 0.42833333333333334, "grad_norm": 25.282825469970703, "learning_rate": 1.2770207548556605e-07, "logits/chosen": 2.957343339920044, "logits/rejected": 3.124300003051758, "logps/chosen": -64.43546295166016, "logps/rejected": -314.24273681640625, "loss": 0.7283, "nll_loss": 0.7080820202827454, "rewards/accuracies": 1.0, "rewards/chosen": 1.4066460132598877, "rewards/margins": 8.88217544555664, "rewards/rejected": -7.475529670715332, "step": 2570 }, { "epoch": 0.4285, "grad_norm": 80.77908325195312, "learning_rate": 1.2765020473265518e-07, "logits/chosen": 2.996697425842285, "logits/rejected": 2.995997428894043, "logps/chosen": -26.40699577331543, "logps/rejected": -55.02167510986328, "loss": 0.9807, "nll_loss": 0.6601748466491699, "rewards/accuracies": 1.0, "rewards/chosen": 0.9194445013999939, "rewards/margins": 1.6617445945739746, "rewards/rejected": -0.7423000931739807, "step": 2571 }, { "epoch": 0.42866666666666664, "grad_norm": 138.56814575195312, "learning_rate": 1.2759832592313848e-07, "logits/chosen": 2.953655481338501, "logits/rejected": 2.973818302154541, "logps/chosen": -124.96154022216797, "logps/rejected": -72.05706787109375, "loss": 1.6571, "nll_loss": 1.2372430562973022, "rewards/accuracies": 1.0, "rewards/chosen": -0.9411293268203735, "rewards/margins": 1.1441200971603394, "rewards/rejected": -2.085249423980713, "step": 2572 }, { "epoch": 0.42883333333333334, "grad_norm": 62.90468215942383, "learning_rate": 1.275464390721322e-07, "logits/chosen": 2.8356597423553467, "logits/rejected": 2.911958932876587, "logps/chosen": -89.32682037353516, "logps/rejected": -42.77046585083008, "loss": 1.2448, "nll_loss": 1.1307193040847778, "rewards/accuracies": 1.0, "rewards/chosen": 0.1514640897512436, "rewards/margins": 3.2254230976104736, "rewards/rejected": -3.0739591121673584, "step": 2573 }, { "epoch": 0.429, "grad_norm": 88.8562240600586, "learning_rate": 1.2749454419475485e-07, "logits/chosen": 2.107635736465454, "logits/rejected": 2.6644694805145264, "logps/chosen": -7.828036785125732, "logps/rejected": -316.0313720703125, "loss": 0.6205, "nll_loss": 0.6021566987037659, "rewards/accuracies": 1.0, "rewards/chosen": 1.4986599683761597, "rewards/margins": 9.767807006835938, "rewards/rejected": -8.269146919250488, "step": 2574 }, { "epoch": 0.42916666666666664, "grad_norm": 23.78079605102539, "learning_rate": 1.2744264130612745e-07, "logits/chosen": 1.9789204597473145, "logits/rejected": 1.7121057510375977, "logps/chosen": -41.76530075073242, "logps/rejected": -44.51512145996094, "loss": 0.6583, "nll_loss": 0.6233627796173096, "rewards/accuracies": 1.0, "rewards/chosen": 1.96450936794281, "rewards/margins": 5.141531944274902, "rewards/rejected": -3.1770224571228027, "step": 2575 }, { "epoch": 0.42933333333333334, "grad_norm": 38.3743782043457, "learning_rate": 1.273907304213732e-07, "logits/chosen": 2.9033539295196533, "logits/rejected": 3.075205087661743, "logps/chosen": -49.90868377685547, "logps/rejected": -403.6734313964844, "loss": 1.0239, "nll_loss": 1.0185444355010986, "rewards/accuracies": 1.0, "rewards/chosen": 2.74299955368042, "rewards/margins": 16.408828735351562, "rewards/rejected": -13.6658296585083, "step": 2576 }, { "epoch": 0.4295, "grad_norm": 25.24241065979004, "learning_rate": 1.2733881155561767e-07, "logits/chosen": 1.6649924516677856, "logits/rejected": 2.2353031635284424, "logps/chosen": -38.38576126098633, "logps/rejected": -438.2518310546875, "loss": 0.5693, "nll_loss": 0.5563154220581055, "rewards/accuracies": 1.0, "rewards/chosen": 1.8748611211776733, "rewards/margins": 9.130531311035156, "rewards/rejected": -7.255670547485352, "step": 2577 }, { "epoch": 0.42966666666666664, "grad_norm": 26.693689346313477, "learning_rate": 1.272868847239888e-07, "logits/chosen": 4.089961528778076, "logits/rejected": 4.033804416656494, "logps/chosen": -79.62371063232422, "logps/rejected": -187.20208740234375, "loss": 0.8615, "nll_loss": 0.8470608592033386, "rewards/accuracies": 1.0, "rewards/chosen": 1.7638481855392456, "rewards/margins": 9.076658248901367, "rewards/rejected": -7.312809944152832, "step": 2578 }, { "epoch": 0.42983333333333335, "grad_norm": 111.9602279663086, "learning_rate": 1.2723494994161682e-07, "logits/chosen": 2.15346097946167, "logits/rejected": 1.9554777145385742, "logps/chosen": -59.85374069213867, "logps/rejected": -11.07816219329834, "loss": 2.1365, "nll_loss": 0.9208269119262695, "rewards/accuracies": 1.0, "rewards/chosen": 2.40679669380188, "rewards/margins": 0.011631011962890625, "rewards/rejected": 2.3951656818389893, "step": 2579 }, { "epoch": 0.43, "grad_norm": 28.906736373901367, "learning_rate": 1.2718300722363428e-07, "logits/chosen": 2.6740517616271973, "logits/rejected": 2.7878191471099854, "logps/chosen": -63.13360595703125, "logps/rejected": -226.4282989501953, "loss": 0.9498, "nll_loss": 0.9284354448318481, "rewards/accuracies": 1.0, "rewards/chosen": 1.3720475435256958, "rewards/margins": 7.921274185180664, "rewards/rejected": -6.549226760864258, "step": 2580 }, { "epoch": 0.43016666666666664, "grad_norm": 25.36790657043457, "learning_rate": 1.2713105658517604e-07, "logits/chosen": 1.6567294597625732, "logits/rejected": 1.3975512981414795, "logps/chosen": -91.78592681884766, "logps/rejected": -73.6573486328125, "loss": 0.8409, "nll_loss": 0.8195170760154724, "rewards/accuracies": 1.0, "rewards/chosen": 1.993515968322754, "rewards/margins": 6.027523517608643, "rewards/rejected": -4.034007549285889, "step": 2581 }, { "epoch": 0.43033333333333335, "grad_norm": 28.919485092163086, "learning_rate": 1.2707909804137926e-07, "logits/chosen": 2.279385566711426, "logits/rejected": 2.359078884124756, "logps/chosen": -69.78379821777344, "logps/rejected": -305.4078369140625, "loss": 1.0238, "nll_loss": 1.011359453201294, "rewards/accuracies": 1.0, "rewards/chosen": 1.9260941743850708, "rewards/margins": 8.97563362121582, "rewards/rejected": -7.049539089202881, "step": 2582 }, { "epoch": 0.4305, "grad_norm": 25.808399200439453, "learning_rate": 1.2702713160738344e-07, "logits/chosen": 1.8659963607788086, "logits/rejected": 2.8126883506774902, "logps/chosen": -92.56616973876953, "logps/rejected": -391.58294677734375, "loss": 0.8755, "nll_loss": 0.8492310047149658, "rewards/accuracies": 1.0, "rewards/chosen": 1.2468208074569702, "rewards/margins": 6.602875709533691, "rewards/rejected": -5.356054782867432, "step": 2583 }, { "epoch": 0.43066666666666664, "grad_norm": 21.773984909057617, "learning_rate": 1.269751572983303e-07, "logits/chosen": 1.7678351402282715, "logits/rejected": 1.50355863571167, "logps/chosen": -149.11697387695312, "logps/rejected": -104.62609100341797, "loss": 1.1462, "nll_loss": 1.129673957824707, "rewards/accuracies": 1.0, "rewards/chosen": 2.3753418922424316, "rewards/margins": 6.492733001708984, "rewards/rejected": -4.117391109466553, "step": 2584 }, { "epoch": 0.43083333333333335, "grad_norm": 57.34320831298828, "learning_rate": 1.2692317512936396e-07, "logits/chosen": 2.5033774375915527, "logits/rejected": 2.4955332279205322, "logps/chosen": -55.60759353637695, "logps/rejected": -51.61751937866211, "loss": 1.4951, "nll_loss": 1.4258358478546143, "rewards/accuracies": 1.0, "rewards/chosen": 0.8440166711807251, "rewards/margins": 3.936511993408203, "rewards/rejected": -3.0924954414367676, "step": 2585 }, { "epoch": 0.431, "grad_norm": 30.734582901000977, "learning_rate": 1.2687118511563073e-07, "logits/chosen": 0.7958407998085022, "logits/rejected": 2.2474682331085205, "logps/chosen": -62.32293701171875, "logps/rejected": -317.7620849609375, "loss": 0.8712, "nll_loss": 0.8422020673751831, "rewards/accuracies": 1.0, "rewards/chosen": 1.0384178161621094, "rewards/margins": 7.608742713928223, "rewards/rejected": -6.570324897766113, "step": 2586 }, { "epoch": 0.43116666666666664, "grad_norm": 16.812801361083984, "learning_rate": 1.2681918727227928e-07, "logits/chosen": 2.0074429512023926, "logits/rejected": 2.346100330352783, "logps/chosen": -102.86225891113281, "logps/rejected": -303.32012939453125, "loss": 0.6838, "nll_loss": 0.6723024249076843, "rewards/accuracies": 1.0, "rewards/chosen": 2.0166985988616943, "rewards/margins": 8.936652183532715, "rewards/rejected": -6.9199538230896, "step": 2587 }, { "epoch": 0.43133333333333335, "grad_norm": 119.29541015625, "learning_rate": 1.2676718161446052e-07, "logits/chosen": 2.298551559448242, "logits/rejected": 2.079667568206787, "logps/chosen": -66.96675109863281, "logps/rejected": -28.09716796875, "loss": 1.2824, "nll_loss": 0.7786831259727478, "rewards/accuracies": 1.0, "rewards/chosen": 1.5020875930786133, "rewards/margins": 1.235658049583435, "rewards/rejected": 0.26642951369285583, "step": 2588 }, { "epoch": 0.4315, "grad_norm": 26.51197052001953, "learning_rate": 1.2671516815732767e-07, "logits/chosen": 1.4178447723388672, "logits/rejected": 2.035031318664551, "logps/chosen": -87.5447998046875, "logps/rejected": -204.13729858398438, "loss": 0.9256, "nll_loss": 0.9025235772132874, "rewards/accuracies": 1.0, "rewards/chosen": 1.2426551580429077, "rewards/margins": 10.694085121154785, "rewards/rejected": -9.451430320739746, "step": 2589 }, { "epoch": 0.43166666666666664, "grad_norm": 23.493852615356445, "learning_rate": 1.2666314691603614e-07, "logits/chosen": 2.2347543239593506, "logits/rejected": 2.4982364177703857, "logps/chosen": -69.02030944824219, "logps/rejected": -135.44679260253906, "loss": 0.704, "nll_loss": 0.6902032494544983, "rewards/accuracies": 1.0, "rewards/chosen": 2.630411148071289, "rewards/margins": 6.838399410247803, "rewards/rejected": -4.207988262176514, "step": 2590 }, { "epoch": 0.43183333333333335, "grad_norm": 195.14866638183594, "learning_rate": 1.2661111790574376e-07, "logits/chosen": 2.5401406288146973, "logits/rejected": 2.7560646533966064, "logps/chosen": -62.508426666259766, "logps/rejected": -127.94268798828125, "loss": 2.5001, "nll_loss": 1.6027802228927612, "rewards/accuracies": 1.0, "rewards/chosen": 1.9701054096221924, "rewards/margins": 0.45533931255340576, "rewards/rejected": 1.5147660970687866, "step": 2591 }, { "epoch": 0.432, "grad_norm": 28.44391632080078, "learning_rate": 1.265590811416105e-07, "logits/chosen": 2.2168190479278564, "logits/rejected": 2.3236472606658936, "logps/chosen": -30.515426635742188, "logps/rejected": -96.58795928955078, "loss": 0.5447, "nll_loss": 0.5085904598236084, "rewards/accuracies": 1.0, "rewards/chosen": 1.41802978515625, "rewards/margins": 5.09004020690918, "rewards/rejected": -3.672010660171509, "step": 2592 }, { "epoch": 0.43216666666666664, "grad_norm": 91.0144271850586, "learning_rate": 1.265070366387986e-07, "logits/chosen": 1.9423617124557495, "logits/rejected": 2.6811068058013916, "logps/chosen": -24.155887603759766, "logps/rejected": -139.0888671875, "loss": 0.9775, "nll_loss": 0.5891679525375366, "rewards/accuracies": 1.0, "rewards/chosen": 0.7273754477500916, "rewards/margins": 1.3396739959716797, "rewards/rejected": -0.6122986078262329, "step": 2593 }, { "epoch": 0.43233333333333335, "grad_norm": 27.579435348510742, "learning_rate": 1.2645498441247262e-07, "logits/chosen": 2.5301756858825684, "logits/rejected": 2.6626477241516113, "logps/chosen": -157.08187866210938, "logps/rejected": -256.857421875, "loss": 1.3884, "nll_loss": 1.3779113292694092, "rewards/accuracies": 1.0, "rewards/chosen": 2.077775716781616, "rewards/margins": 10.284062385559082, "rewards/rejected": -8.206286430358887, "step": 2594 }, { "epoch": 0.4325, "grad_norm": 32.119972229003906, "learning_rate": 1.264029244777993e-07, "logits/chosen": 2.5366508960723877, "logits/rejected": 2.452728748321533, "logps/chosen": -56.92051315307617, "logps/rejected": -103.61746215820312, "loss": 0.6895, "nll_loss": 0.654258668422699, "rewards/accuracies": 1.0, "rewards/chosen": 1.2739858627319336, "rewards/margins": 5.250433921813965, "rewards/rejected": -3.976447820663452, "step": 2595 }, { "epoch": 0.43266666666666664, "grad_norm": 34.103580474853516, "learning_rate": 1.2635085684994767e-07, "logits/chosen": 2.545916795730591, "logits/rejected": 2.538322687149048, "logps/chosen": -57.299442291259766, "logps/rejected": -198.42855834960938, "loss": 0.8554, "nll_loss": 0.8185634016990662, "rewards/accuracies": 1.0, "rewards/chosen": 0.8681057691574097, "rewards/margins": 6.112414360046387, "rewards/rejected": -5.2443084716796875, "step": 2596 }, { "epoch": 0.43283333333333335, "grad_norm": 77.83666229248047, "learning_rate": 1.2629878154408892e-07, "logits/chosen": 2.5224857330322266, "logits/rejected": 2.394289016723633, "logps/chosen": -101.13121795654297, "logps/rejected": -139.13783264160156, "loss": 1.5251, "nll_loss": 1.385359287261963, "rewards/accuracies": 1.0, "rewards/chosen": -0.8553078174591064, "rewards/margins": 6.243232727050781, "rewards/rejected": -7.098540306091309, "step": 2597 }, { "epoch": 0.433, "grad_norm": 20.182344436645508, "learning_rate": 1.2624669857539668e-07, "logits/chosen": 2.1162307262420654, "logits/rejected": 2.522252082824707, "logps/chosen": -67.643798828125, "logps/rejected": -242.48294067382812, "loss": 0.7183, "nll_loss": 0.7120400071144104, "rewards/accuracies": 1.0, "rewards/chosen": 2.6059815883636475, "rewards/margins": 11.163739204406738, "rewards/rejected": -8.557757377624512, "step": 2598 }, { "epoch": 0.43316666666666664, "grad_norm": 88.22509765625, "learning_rate": 1.2619460795904655e-07, "logits/chosen": 2.800551176071167, "logits/rejected": 2.787574291229248, "logps/chosen": -24.066930770874023, "logps/rejected": -97.39813232421875, "loss": 0.7999, "nll_loss": 0.48133864998817444, "rewards/accuracies": 1.0, "rewards/chosen": 0.384694904088974, "rewards/margins": 1.5349475145339966, "rewards/rejected": -1.1502525806427002, "step": 2599 }, { "epoch": 0.43333333333333335, "grad_norm": 38.17351531982422, "learning_rate": 1.2614250971021657e-07, "logits/chosen": 2.2754392623901367, "logits/rejected": 2.3375661373138428, "logps/chosen": -27.678308486938477, "logps/rejected": -137.52734375, "loss": 0.7407, "nll_loss": 0.728376567363739, "rewards/accuracies": 1.0, "rewards/chosen": 2.9428634643554688, "rewards/margins": 7.100262641906738, "rewards/rejected": -4.1573991775512695, "step": 2600 }, { "epoch": 0.4335, "grad_norm": 25.688676834106445, "learning_rate": 1.2609040384408683e-07, "logits/chosen": 0.9045411348342896, "logits/rejected": 1.6801177263259888, "logps/chosen": -42.07844924926758, "logps/rejected": -177.66903686523438, "loss": 0.5598, "nll_loss": 0.5131518840789795, "rewards/accuracies": 1.0, "rewards/chosen": 0.5041484832763672, "rewards/margins": 7.104489326477051, "rewards/rejected": -6.600340843200684, "step": 2601 }, { "epoch": 0.43366666666666664, "grad_norm": 239.95433044433594, "learning_rate": 1.2603829037583982e-07, "logits/chosen": 3.132533311843872, "logits/rejected": 3.1637814044952393, "logps/chosen": -45.26533126831055, "logps/rejected": -58.78437423706055, "loss": 3.4003, "nll_loss": 0.5803247690200806, "rewards/accuracies": 0.0, "rewards/chosen": 1.3520878553390503, "rewards/margins": -2.275662899017334, "rewards/rejected": 3.627750873565674, "step": 2602 }, { "epoch": 0.43383333333333335, "grad_norm": 26.187124252319336, "learning_rate": 1.2598616932066008e-07, "logits/chosen": 2.4638726711273193, "logits/rejected": 2.553790330886841, "logps/chosen": -84.39691162109375, "logps/rejected": -263.39276123046875, "loss": 0.9558, "nll_loss": 0.9482797980308533, "rewards/accuracies": 1.0, "rewards/chosen": 2.591169834136963, "rewards/margins": 8.611303329467773, "rewards/rejected": -6.020133018493652, "step": 2603 }, { "epoch": 0.434, "grad_norm": 64.335205078125, "learning_rate": 1.259340406937345e-07, "logits/chosen": 2.4129226207733154, "logits/rejected": 2.5017082691192627, "logps/chosen": -30.437223434448242, "logps/rejected": -143.94845581054688, "loss": 1.1304, "nll_loss": 1.1273046731948853, "rewards/accuracies": 1.0, "rewards/chosen": 3.544355869293213, "rewards/margins": 10.118362426757812, "rewards/rejected": -6.5740065574646, "step": 2604 }, { "epoch": 0.43416666666666665, "grad_norm": 43.49078369140625, "learning_rate": 1.2588190451025208e-07, "logits/chosen": 2.8113510608673096, "logits/rejected": 2.8295280933380127, "logps/chosen": -43.74015808105469, "logps/rejected": -45.196475982666016, "loss": 0.8413, "nll_loss": 0.8100029826164246, "rewards/accuracies": 1.0, "rewards/chosen": 1.8579505681991577, "rewards/margins": 5.301075458526611, "rewards/rejected": -3.443124771118164, "step": 2605 }, { "epoch": 0.43433333333333335, "grad_norm": 32.465110778808594, "learning_rate": 1.2582976078540403e-07, "logits/chosen": 2.0003440380096436, "logits/rejected": 2.227977752685547, "logps/chosen": -51.519691467285156, "logps/rejected": -434.1905822753906, "loss": 0.6958, "nll_loss": 0.6778905987739563, "rewards/accuracies": 1.0, "rewards/chosen": 1.5137786865234375, "rewards/margins": 9.797595024108887, "rewards/rejected": -8.28381633758545, "step": 2606 }, { "epoch": 0.4345, "grad_norm": 89.2852783203125, "learning_rate": 1.257776095343838e-07, "logits/chosen": 1.6963517665863037, "logits/rejected": 3.201964855194092, "logps/chosen": -132.8537139892578, "logps/rejected": -485.56964111328125, "loss": 1.7892, "nll_loss": 1.413337230682373, "rewards/accuracies": 1.0, "rewards/chosen": -2.4284043312072754, "rewards/margins": 3.4866719245910645, "rewards/rejected": -5.91507625579834, "step": 2607 }, { "epoch": 0.43466666666666665, "grad_norm": 40.511112213134766, "learning_rate": 1.2572545077238702e-07, "logits/chosen": 3.0307183265686035, "logits/rejected": 3.0752580165863037, "logps/chosen": -30.891382217407227, "logps/rejected": -264.5652160644531, "loss": 0.6747, "nll_loss": 0.657263457775116, "rewards/accuracies": 1.0, "rewards/chosen": 2.327815532684326, "rewards/margins": 6.388551712036133, "rewards/rejected": -4.060736179351807, "step": 2608 }, { "epoch": 0.43483333333333335, "grad_norm": 123.14566040039062, "learning_rate": 1.256732845146115e-07, "logits/chosen": 3.1673507690429688, "logits/rejected": 3.1995317935943604, "logps/chosen": -45.67136764526367, "logps/rejected": -30.65158462524414, "loss": 1.5478, "nll_loss": 0.5373101830482483, "rewards/accuracies": 1.0, "rewards/chosen": 1.3832578659057617, "rewards/margins": 0.05911612510681152, "rewards/rejected": 1.3241417407989502, "step": 2609 }, { "epoch": 0.435, "grad_norm": 27.427162170410156, "learning_rate": 1.2562111077625722e-07, "logits/chosen": 1.9362868070602417, "logits/rejected": 1.8771388530731201, "logps/chosen": -48.61321258544922, "logps/rejected": -109.13976287841797, "loss": 0.8111, "nll_loss": 0.7840840816497803, "rewards/accuracies": 1.0, "rewards/chosen": 1.447428584098816, "rewards/margins": 5.836203098297119, "rewards/rejected": -4.388774394989014, "step": 2610 }, { "epoch": 0.43516666666666665, "grad_norm": 24.607458114624023, "learning_rate": 1.255689295725264e-07, "logits/chosen": 1.002579689025879, "logits/rejected": 2.2708325386047363, "logps/chosen": -59.30094909667969, "logps/rejected": -365.7573547363281, "loss": 0.7169, "nll_loss": 0.6976581811904907, "rewards/accuracies": 1.0, "rewards/chosen": 1.4381484985351562, "rewards/margins": 9.404399871826172, "rewards/rejected": -7.966250896453857, "step": 2611 }, { "epoch": 0.43533333333333335, "grad_norm": 52.780914306640625, "learning_rate": 1.255167409186233e-07, "logits/chosen": 2.1374382972717285, "logits/rejected": 1.9402999877929688, "logps/chosen": -74.44886016845703, "logps/rejected": -78.73535919189453, "loss": 1.2311, "nll_loss": 1.0060657262802124, "rewards/accuracies": 1.0, "rewards/chosen": 1.9426841735839844, "rewards/margins": 2.6280486583709717, "rewards/rejected": -0.6853645443916321, "step": 2612 }, { "epoch": 0.4355, "grad_norm": 21.159963607788086, "learning_rate": 1.254645448297545e-07, "logits/chosen": 1.3745336532592773, "logits/rejected": 1.861374855041504, "logps/chosen": -62.227203369140625, "logps/rejected": -296.6981201171875, "loss": 0.647, "nll_loss": 0.6285576224327087, "rewards/accuracies": 1.0, "rewards/chosen": 1.6178597211837769, "rewards/margins": 7.168891906738281, "rewards/rejected": -5.551032066345215, "step": 2613 }, { "epoch": 0.43566666666666665, "grad_norm": 28.96836280822754, "learning_rate": 1.2541234132112872e-07, "logits/chosen": 1.9724985361099243, "logits/rejected": 1.3438146114349365, "logps/chosen": -80.3740463256836, "logps/rejected": -88.10006713867188, "loss": 0.9539, "nll_loss": 0.9345819354057312, "rewards/accuracies": 1.0, "rewards/chosen": 2.155794620513916, "rewards/margins": 6.192805767059326, "rewards/rejected": -4.03701114654541, "step": 2614 }, { "epoch": 0.43583333333333335, "grad_norm": 27.527009963989258, "learning_rate": 1.2536013040795674e-07, "logits/chosen": 2.906451463699341, "logits/rejected": 3.0033557415008545, "logps/chosen": -132.12771606445312, "logps/rejected": -221.72467041015625, "loss": 1.4385, "nll_loss": 1.420728087425232, "rewards/accuracies": 1.0, "rewards/chosen": 1.7841720581054688, "rewards/margins": 6.776145935058594, "rewards/rejected": -4.991973876953125, "step": 2615 }, { "epoch": 0.436, "grad_norm": 10.854241371154785, "learning_rate": 1.2530791210545163e-07, "logits/chosen": 1.44202721118927, "logits/rejected": 1.49004065990448, "logps/chosen": -117.51178741455078, "logps/rejected": -210.38522338867188, "loss": 0.4948, "nll_loss": 0.48760080337524414, "rewards/accuracies": 1.0, "rewards/chosen": 3.4598259925842285, "rewards/margins": 8.150249481201172, "rewards/rejected": -4.690423488616943, "step": 2616 }, { "epoch": 0.43616666666666665, "grad_norm": 20.888399124145508, "learning_rate": 1.252556864288285e-07, "logits/chosen": 3.065899133682251, "logits/rejected": 3.175602912902832, "logps/chosen": -18.930028915405273, "logps/rejected": -334.1777038574219, "loss": 0.3501, "nll_loss": 0.3380362391471863, "rewards/accuracies": 1.0, "rewards/chosen": 2.1963751316070557, "rewards/margins": 7.447815895080566, "rewards/rejected": -5.251440525054932, "step": 2617 }, { "epoch": 0.43633333333333335, "grad_norm": 222.72113037109375, "learning_rate": 1.2520345339330465e-07, "logits/chosen": 1.7725574970245361, "logits/rejected": 1.4520403146743774, "logps/chosen": -303.71600341796875, "logps/rejected": -282.0962219238281, "loss": 1.7866, "nll_loss": 1.1460981369018555, "rewards/accuracies": 1.0, "rewards/chosen": -2.864798069000244, "rewards/margins": 1.215184211730957, "rewards/rejected": -4.079982280731201, "step": 2618 }, { "epoch": 0.4365, "grad_norm": 46.93648910522461, "learning_rate": 1.251512130140996e-07, "logits/chosen": 2.6917037963867188, "logits/rejected": 2.853537082672119, "logps/chosen": -29.30521011352539, "logps/rejected": -209.80072021484375, "loss": 1.0213, "nll_loss": 1.0105243921279907, "rewards/accuracies": 1.0, "rewards/chosen": 2.1103405952453613, "rewards/margins": 8.59829330444336, "rewards/rejected": -6.487953186035156, "step": 2619 }, { "epoch": 0.43666666666666665, "grad_norm": 27.920089721679688, "learning_rate": 1.2509896530643487e-07, "logits/chosen": 2.910181999206543, "logits/rejected": 2.8653318881988525, "logps/chosen": -80.38343811035156, "logps/rejected": -64.19011688232422, "loss": 0.8866, "nll_loss": 0.8737330436706543, "rewards/accuracies": 1.0, "rewards/chosen": 2.2452316284179688, "rewards/margins": 7.1466498374938965, "rewards/rejected": -4.901418209075928, "step": 2620 }, { "epoch": 0.43683333333333335, "grad_norm": 21.056888580322266, "learning_rate": 1.2504671028553425e-07, "logits/chosen": 2.487015962600708, "logits/rejected": 2.6470837593078613, "logps/chosen": -137.84454345703125, "logps/rejected": -281.4149169921875, "loss": 0.9944, "nll_loss": 0.9846038818359375, "rewards/accuracies": 1.0, "rewards/chosen": 2.2529358863830566, "rewards/margins": 8.432109832763672, "rewards/rejected": -6.179173469543457, "step": 2621 }, { "epoch": 0.437, "grad_norm": 61.52276611328125, "learning_rate": 1.2499444796662354e-07, "logits/chosen": 3.038274049758911, "logits/rejected": 2.8880772590637207, "logps/chosen": -51.65576934814453, "logps/rejected": -65.9671401977539, "loss": 1.756, "nll_loss": 1.7218589782714844, "rewards/accuracies": 1.0, "rewards/chosen": 1.7835991382598877, "rewards/margins": 5.148137092590332, "rewards/rejected": -3.3645379543304443, "step": 2622 }, { "epoch": 0.43716666666666665, "grad_norm": 71.93506622314453, "learning_rate": 1.2494217836493074e-07, "logits/chosen": 2.639223098754883, "logits/rejected": 3.1137959957122803, "logps/chosen": -139.09056091308594, "logps/rejected": -218.25033569335938, "loss": 1.6383, "nll_loss": 1.4339234828948975, "rewards/accuracies": 1.0, "rewards/chosen": -1.4179909229278564, "rewards/margins": 5.533805847167969, "rewards/rejected": -6.951796531677246, "step": 2623 }, { "epoch": 0.43733333333333335, "grad_norm": 19.528766632080078, "learning_rate": 1.24889901495686e-07, "logits/chosen": 2.357041597366333, "logits/rejected": 2.1630797386169434, "logps/chosen": -30.354106903076172, "logps/rejected": -99.64508056640625, "loss": 0.4104, "nll_loss": 0.38915523886680603, "rewards/accuracies": 1.0, "rewards/chosen": 1.9756172895431519, "rewards/margins": 6.039377689361572, "rewards/rejected": -4.063760280609131, "step": 2624 }, { "epoch": 0.4375, "grad_norm": 84.11211395263672, "learning_rate": 1.248376173741215e-07, "logits/chosen": 4.159972190856934, "logits/rejected": 4.1396002769470215, "logps/chosen": -33.46316909790039, "logps/rejected": -12.032700538635254, "loss": 1.6501, "nll_loss": 0.7967420220375061, "rewards/accuracies": 1.0, "rewards/chosen": 2.005100727081299, "rewards/margins": 0.5617964267730713, "rewards/rejected": 1.4433043003082275, "step": 2625 }, { "epoch": 0.43766666666666665, "grad_norm": 34.10402297973633, "learning_rate": 1.2478532601547157e-07, "logits/chosen": 1.3218636512756348, "logits/rejected": 2.520296812057495, "logps/chosen": -67.54429626464844, "logps/rejected": -472.08831787109375, "loss": 0.9017, "nll_loss": 0.8887408375740051, "rewards/accuracies": 1.0, "rewards/chosen": 1.8600311279296875, "rewards/margins": 9.481189727783203, "rewards/rejected": -7.621158599853516, "step": 2626 }, { "epoch": 0.43783333333333335, "grad_norm": 16.46677017211914, "learning_rate": 1.2473302743497273e-07, "logits/chosen": 2.365817070007324, "logits/rejected": 2.4357314109802246, "logps/chosen": -64.927490234375, "logps/rejected": -128.2357177734375, "loss": 0.5177, "nll_loss": 0.5072460770606995, "rewards/accuracies": 1.0, "rewards/chosen": 2.877300262451172, "rewards/margins": 7.363454341888428, "rewards/rejected": -4.486154079437256, "step": 2627 }, { "epoch": 0.438, "grad_norm": 23.920562744140625, "learning_rate": 1.2468072164786342e-07, "logits/chosen": 1.2595407962799072, "logits/rejected": 2.045470714569092, "logps/chosen": -56.69047546386719, "logps/rejected": -270.12548828125, "loss": 0.672, "nll_loss": 0.6516146659851074, "rewards/accuracies": 1.0, "rewards/chosen": 1.656319499015808, "rewards/margins": 6.474808692932129, "rewards/rejected": -4.818489074707031, "step": 2628 }, { "epoch": 0.43816666666666665, "grad_norm": 60.975013732910156, "learning_rate": 1.2462840866938435e-07, "logits/chosen": 1.1983014345169067, "logits/rejected": 1.8445450067520142, "logps/chosen": -24.853759765625, "logps/rejected": -226.5142822265625, "loss": 0.8102, "nll_loss": 0.801734209060669, "rewards/accuracies": 1.0, "rewards/chosen": 2.273909330368042, "rewards/margins": 13.426433563232422, "rewards/rejected": -11.1525239944458, "step": 2629 }, { "epoch": 0.43833333333333335, "grad_norm": 64.74788665771484, "learning_rate": 1.245760885147783e-07, "logits/chosen": 2.9246511459350586, "logits/rejected": 2.8881640434265137, "logps/chosen": -25.398950576782227, "logps/rejected": -40.472373962402344, "loss": 1.5561, "nll_loss": 1.494055986404419, "rewards/accuracies": 1.0, "rewards/chosen": 1.2558637857437134, "rewards/margins": 4.0898118019104, "rewards/rejected": -2.8339481353759766, "step": 2630 }, { "epoch": 0.4385, "grad_norm": 91.17825317382812, "learning_rate": 1.2452376119929006e-07, "logits/chosen": 2.1214234828948975, "logits/rejected": 2.3837924003601074, "logps/chosen": -81.38096618652344, "logps/rejected": -313.26031494140625, "loss": 2.0509, "nll_loss": 1.595705270767212, "rewards/accuracies": 1.0, "rewards/chosen": -3.0282089710235596, "rewards/margins": 5.874839782714844, "rewards/rejected": -8.903048515319824, "step": 2631 }, { "epoch": 0.43866666666666665, "grad_norm": 28.705474853515625, "learning_rate": 1.244714267381666e-07, "logits/chosen": 1.4867500066757202, "logits/rejected": 2.184795379638672, "logps/chosen": -63.04840850830078, "logps/rejected": -137.07455444335938, "loss": 0.8275, "nll_loss": 0.7980810403823853, "rewards/accuracies": 1.0, "rewards/chosen": 1.540461778640747, "rewards/margins": 5.491527557373047, "rewards/rejected": -3.9510657787323, "step": 2632 }, { "epoch": 0.43883333333333335, "grad_norm": 30.357606887817383, "learning_rate": 1.2441908514665683e-07, "logits/chosen": 1.5442544221878052, "logits/rejected": 1.9884843826293945, "logps/chosen": -60.96161651611328, "logps/rejected": -199.19442749023438, "loss": 1.0961, "nll_loss": 1.0886002779006958, "rewards/accuracies": 1.0, "rewards/chosen": 2.4091453552246094, "rewards/margins": 10.718415260314941, "rewards/rejected": -8.309269905090332, "step": 2633 }, { "epoch": 0.439, "grad_norm": 129.32896423339844, "learning_rate": 1.2436673644001195e-07, "logits/chosen": 3.015934705734253, "logits/rejected": 2.926462411880493, "logps/chosen": -182.5825958251953, "logps/rejected": -131.09524536132812, "loss": 1.8275, "nll_loss": 1.4965788125991821, "rewards/accuracies": 1.0, "rewards/chosen": -1.1668075323104858, "rewards/margins": 1.6923125982284546, "rewards/rejected": -2.8591201305389404, "step": 2634 }, { "epoch": 0.43916666666666665, "grad_norm": 2639.8310546875, "learning_rate": 1.2431438063348502e-07, "logits/chosen": 2.26411771774292, "logits/rejected": 2.261030912399292, "logps/chosen": -48.75896072387695, "logps/rejected": -39.98781204223633, "loss": 3.0871, "nll_loss": 0.8865265846252441, "rewards/accuracies": 0.0, "rewards/chosen": 1.305877447128296, "rewards/margins": -1.5901694297790527, "rewards/rejected": 2.8960468769073486, "step": 2635 }, { "epoch": 0.43933333333333335, "grad_norm": 33.99591064453125, "learning_rate": 1.2426201774233135e-07, "logits/chosen": 2.431671380996704, "logits/rejected": 3.364663600921631, "logps/chosen": -71.58216094970703, "logps/rejected": -496.974609375, "loss": 1.0469, "nll_loss": 1.037422776222229, "rewards/accuracies": 1.0, "rewards/chosen": 2.384547472000122, "rewards/margins": 8.044642448425293, "rewards/rejected": -5.66009521484375, "step": 2636 }, { "epoch": 0.4395, "grad_norm": 32.175086975097656, "learning_rate": 1.2420964778180814e-07, "logits/chosen": 2.7924296855926514, "logits/rejected": 2.7595643997192383, "logps/chosen": -45.890960693359375, "logps/rejected": -308.75189208984375, "loss": 0.9138, "nll_loss": 0.8825184106826782, "rewards/accuracies": 1.0, "rewards/chosen": 0.9014671444892883, "rewards/margins": 12.316596984863281, "rewards/rejected": -11.415129661560059, "step": 2637 }, { "epoch": 0.43966666666666665, "grad_norm": 27.883922576904297, "learning_rate": 1.241572707671748e-07, "logits/chosen": 2.726539373397827, "logits/rejected": 3.1748757362365723, "logps/chosen": -60.79643249511719, "logps/rejected": -325.02044677734375, "loss": 0.7439, "nll_loss": 0.7237670421600342, "rewards/accuracies": 1.0, "rewards/chosen": 1.3747711181640625, "rewards/margins": 10.1095552444458, "rewards/rejected": -8.734784126281738, "step": 2638 }, { "epoch": 0.43983333333333335, "grad_norm": 42.594947814941406, "learning_rate": 1.2410488671369272e-07, "logits/chosen": 2.679262161254883, "logits/rejected": 2.602348804473877, "logps/chosen": -53.167240142822266, "logps/rejected": -90.05998992919922, "loss": 0.9198, "nll_loss": 0.8307380676269531, "rewards/accuracies": 1.0, "rewards/chosen": 0.5866169333457947, "rewards/margins": 3.5189058780670166, "rewards/rejected": -2.932288885116577, "step": 2639 }, { "epoch": 0.44, "grad_norm": 37.7574348449707, "learning_rate": 1.2405249563662536e-07, "logits/chosen": 2.303629159927368, "logits/rejected": 2.596602439880371, "logps/chosen": -37.422821044921875, "logps/rejected": -230.84231567382812, "loss": 0.8732, "nll_loss": 0.8505188226699829, "rewards/accuracies": 1.0, "rewards/chosen": 1.2495625019073486, "rewards/margins": 9.353694915771484, "rewards/rejected": -8.104132652282715, "step": 2640 }, { "epoch": 0.44016666666666665, "grad_norm": 26.6296443939209, "learning_rate": 1.2400009755123822e-07, "logits/chosen": 2.8584609031677246, "logits/rejected": 3.045346736907959, "logps/chosen": -46.31255340576172, "logps/rejected": -298.003173828125, "loss": 0.808, "nll_loss": 0.798492431640625, "rewards/accuracies": 1.0, "rewards/chosen": 4.726611614227295, "rewards/margins": 8.728903770446777, "rewards/rejected": -4.002292156219482, "step": 2641 }, { "epoch": 0.44033333333333335, "grad_norm": 92.07032775878906, "learning_rate": 1.2394769247279885e-07, "logits/chosen": 2.477288246154785, "logits/rejected": 2.554144859313965, "logps/chosen": -115.74371337890625, "logps/rejected": -157.8721923828125, "loss": 1.4585, "nll_loss": 1.231316089630127, "rewards/accuracies": 1.0, "rewards/chosen": -1.3662819862365723, "rewards/margins": 3.288247585296631, "rewards/rejected": -4.654529571533203, "step": 2642 }, { "epoch": 0.4405, "grad_norm": 29.721628189086914, "learning_rate": 1.2389528041657678e-07, "logits/chosen": 2.6574313640594482, "logits/rejected": 2.6808011531829834, "logps/chosen": -88.18402099609375, "logps/rejected": -320.0948181152344, "loss": 0.9398, "nll_loss": 0.9185835719108582, "rewards/accuracies": 1.0, "rewards/chosen": 1.3120530843734741, "rewards/margins": 10.579954147338867, "rewards/rejected": -9.267901420593262, "step": 2643 }, { "epoch": 0.44066666666666665, "grad_norm": 37.21088790893555, "learning_rate": 1.238428613978437e-07, "logits/chosen": 3.5719754695892334, "logits/rejected": 3.6815381050109863, "logps/chosen": -37.527183532714844, "logps/rejected": -156.78082275390625, "loss": 0.8741, "nll_loss": 0.8727251291275024, "rewards/accuracies": 1.0, "rewards/chosen": 4.389835357666016, "rewards/margins": 11.734025955200195, "rewards/rejected": -7.344191074371338, "step": 2644 }, { "epoch": 0.44083333333333335, "grad_norm": 34.19178009033203, "learning_rate": 1.237904354318732e-07, "logits/chosen": 2.024749994277954, "logits/rejected": 2.108884334564209, "logps/chosen": -88.32566833496094, "logps/rejected": -205.53504943847656, "loss": 0.8445, "nll_loss": 0.8178302645683289, "rewards/accuracies": 1.0, "rewards/chosen": 1.0684326887130737, "rewards/margins": 11.11102294921875, "rewards/rejected": -10.042590141296387, "step": 2645 }, { "epoch": 0.441, "grad_norm": 38.04246139526367, "learning_rate": 1.23738002533941e-07, "logits/chosen": 2.542202949523926, "logits/rejected": 2.711151599884033, "logps/chosen": -13.916818618774414, "logps/rejected": -155.35369873046875, "loss": 0.5593, "nll_loss": 0.5566726922988892, "rewards/accuracies": 1.0, "rewards/chosen": 3.676909923553467, "rewards/margins": 10.528985977172852, "rewards/rejected": -6.852076053619385, "step": 2646 }, { "epoch": 0.44116666666666665, "grad_norm": 33.93844985961914, "learning_rate": 1.2368556271932474e-07, "logits/chosen": 2.4005379676818848, "logits/rejected": 2.4161174297332764, "logps/chosen": -52.90060806274414, "logps/rejected": -150.4244842529297, "loss": 0.9867, "nll_loss": 0.9796409010887146, "rewards/accuracies": 1.0, "rewards/chosen": 2.4676601886749268, "rewards/margins": 10.572491645812988, "rewards/rejected": -8.10483169555664, "step": 2647 }, { "epoch": 0.44133333333333336, "grad_norm": 31.47388458251953, "learning_rate": 1.2363311600330412e-07, "logits/chosen": 1.748706340789795, "logits/rejected": 1.0599979162216187, "logps/chosen": -165.81527709960938, "logps/rejected": -88.71292114257812, "loss": 1.0008, "nll_loss": 0.9110730290412903, "rewards/accuracies": 1.0, "rewards/chosen": 2.409271240234375, "rewards/margins": 4.160593509674072, "rewards/rejected": -1.7513221502304077, "step": 2648 }, { "epoch": 0.4415, "grad_norm": 32.87346649169922, "learning_rate": 1.2358066240116088e-07, "logits/chosen": 2.92378306388855, "logits/rejected": 2.967674732208252, "logps/chosen": -5.626683712005615, "logps/rejected": -45.95751953125, "loss": 0.2968, "nll_loss": 0.23444515466690063, "rewards/accuracies": 1.0, "rewards/chosen": 1.0131553411483765, "rewards/margins": 4.0902934074401855, "rewards/rejected": -3.0771379470825195, "step": 2649 }, { "epoch": 0.44166666666666665, "grad_norm": 43.460208892822266, "learning_rate": 1.2352820192817876e-07, "logits/chosen": 2.8723537921905518, "logits/rejected": 2.8852596282958984, "logps/chosen": -29.477750778198242, "logps/rejected": -42.630897521972656, "loss": 0.8378, "nll_loss": 0.6699488162994385, "rewards/accuracies": 1.0, "rewards/chosen": 5.996579647064209, "rewards/margins": 6.614490509033203, "rewards/rejected": -0.6179108023643494, "step": 2650 }, { "epoch": 0.44183333333333336, "grad_norm": 42.584312438964844, "learning_rate": 1.2347573459964345e-07, "logits/chosen": 2.953601837158203, "logits/rejected": 3.2930307388305664, "logps/chosen": -88.94949340820312, "logps/rejected": -260.7919616699219, "loss": 1.042, "nll_loss": 1.0342963933944702, "rewards/accuracies": 1.0, "rewards/chosen": 2.651407241821289, "rewards/margins": 8.250297546386719, "rewards/rejected": -5.598890781402588, "step": 2651 }, { "epoch": 0.442, "grad_norm": 31.051939010620117, "learning_rate": 1.2342326043084267e-07, "logits/chosen": 3.192432403564453, "logits/rejected": 3.115889072418213, "logps/chosen": -48.878780364990234, "logps/rejected": -37.400672912597656, "loss": 0.8297, "nll_loss": 0.7519813179969788, "rewards/accuracies": 1.0, "rewards/chosen": 1.4027637243270874, "rewards/margins": 3.7824225425720215, "rewards/rejected": -2.3796586990356445, "step": 2652 }, { "epoch": 0.44216666666666665, "grad_norm": 21.690919876098633, "learning_rate": 1.2337077943706613e-07, "logits/chosen": 2.411726474761963, "logits/rejected": 2.4121131896972656, "logps/chosen": -137.35316467285156, "logps/rejected": -158.18048095703125, "loss": 0.991, "nll_loss": 0.9810941219329834, "rewards/accuracies": 1.0, "rewards/chosen": 2.303143262863159, "rewards/margins": 8.078742980957031, "rewards/rejected": -5.775599956512451, "step": 2653 }, { "epoch": 0.44233333333333336, "grad_norm": 57.79195022583008, "learning_rate": 1.2331829163360554e-07, "logits/chosen": 2.2317252159118652, "logits/rejected": 2.3818702697753906, "logps/chosen": -66.29584503173828, "logps/rejected": -253.69677734375, "loss": 1.072, "nll_loss": 0.9470834732055664, "rewards/accuracies": 1.0, "rewards/chosen": -0.1132148802280426, "rewards/margins": 3.203244686126709, "rewards/rejected": -3.3164596557617188, "step": 2654 }, { "epoch": 0.4425, "grad_norm": 25.465740203857422, "learning_rate": 1.232657970357546e-07, "logits/chosen": 2.488534688949585, "logits/rejected": 2.373141050338745, "logps/chosen": -66.52926635742188, "logps/rejected": -65.16473388671875, "loss": 0.7117, "nll_loss": 0.6522476077079773, "rewards/accuracies": 1.0, "rewards/chosen": 2.2017266750335693, "rewards/margins": 4.536766052246094, "rewards/rejected": -2.3350396156311035, "step": 2655 }, { "epoch": 0.44266666666666665, "grad_norm": 29.91022491455078, "learning_rate": 1.2321329565880896e-07, "logits/chosen": 1.3787986040115356, "logits/rejected": 2.356678009033203, "logps/chosen": -88.31674194335938, "logps/rejected": -189.28524780273438, "loss": 0.9608, "nll_loss": 0.9395397901535034, "rewards/accuracies": 1.0, "rewards/chosen": 1.4129005670547485, "rewards/margins": 7.223280906677246, "rewards/rejected": -5.810380458831787, "step": 2656 }, { "epoch": 0.44283333333333336, "grad_norm": 37.341739654541016, "learning_rate": 1.2316078751806626e-07, "logits/chosen": 1.2468498945236206, "logits/rejected": 2.663973569869995, "logps/chosen": -23.537662506103516, "logps/rejected": -357.08563232421875, "loss": 0.7422, "nll_loss": 0.7355519533157349, "rewards/accuracies": 1.0, "rewards/chosen": 3.2125699520111084, "rewards/margins": 8.230008125305176, "rewards/rejected": -5.017437934875488, "step": 2657 }, { "epoch": 0.443, "grad_norm": 25.34688377380371, "learning_rate": 1.2310827262882612e-07, "logits/chosen": 2.2739248275756836, "logits/rejected": 2.2579023838043213, "logps/chosen": -38.43225860595703, "logps/rejected": -108.69940185546875, "loss": 0.5298, "nll_loss": 0.4927212595939636, "rewards/accuracies": 1.0, "rewards/chosen": 2.345928192138672, "rewards/margins": 5.239649295806885, "rewards/rejected": -2.893721103668213, "step": 2658 }, { "epoch": 0.44316666666666665, "grad_norm": 26.322084426879883, "learning_rate": 1.230557510063901e-07, "logits/chosen": 1.2156621217727661, "logits/rejected": 1.793212652206421, "logps/chosen": -95.96913146972656, "logps/rejected": -202.8382110595703, "loss": 0.989, "nll_loss": 0.9792768359184265, "rewards/accuracies": 1.0, "rewards/chosen": 2.570883274078369, "rewards/margins": 7.614274978637695, "rewards/rejected": -5.043391704559326, "step": 2659 }, { "epoch": 0.44333333333333336, "grad_norm": 30.437965393066406, "learning_rate": 1.2300322266606176e-07, "logits/chosen": 2.6725711822509766, "logits/rejected": 3.3527138233184814, "logps/chosen": -142.9673614501953, "logps/rejected": -149.83152770996094, "loss": 1.1805, "nll_loss": 1.1529626846313477, "rewards/accuracies": 1.0, "rewards/chosen": 1.0848067998886108, "rewards/margins": 7.405789375305176, "rewards/rejected": -6.320982456207275, "step": 2660 }, { "epoch": 0.4435, "grad_norm": 33.66489028930664, "learning_rate": 1.229506876231466e-07, "logits/chosen": 1.1828278303146362, "logits/rejected": 1.6694717407226562, "logps/chosen": -70.447509765625, "logps/rejected": -462.87579345703125, "loss": 1.2004, "nll_loss": 1.1741251945495605, "rewards/accuracies": 1.0, "rewards/chosen": 1.0897743701934814, "rewards/margins": 9.113956451416016, "rewards/rejected": -8.024182319641113, "step": 2661 }, { "epoch": 0.44366666666666665, "grad_norm": 34.912330627441406, "learning_rate": 1.2289814589295205e-07, "logits/chosen": 2.8928029537200928, "logits/rejected": 2.9291749000549316, "logps/chosen": -69.33242797851562, "logps/rejected": -189.9339599609375, "loss": 0.8171, "nll_loss": 0.7878685593605042, "rewards/accuracies": 1.0, "rewards/chosen": 1.0004043579101562, "rewards/margins": 7.7289347648620605, "rewards/rejected": -6.728530406951904, "step": 2662 }, { "epoch": 0.44383333333333336, "grad_norm": 298.51324462890625, "learning_rate": 1.2284559749078755e-07, "logits/chosen": 2.7971808910369873, "logits/rejected": 2.6627416610717773, "logps/chosen": -135.89718627929688, "logps/rejected": -183.17202758789062, "loss": 3.6642, "nll_loss": 2.1570980548858643, "rewards/accuracies": 1.0, "rewards/chosen": -5.657419681549072, "rewards/margins": 0.012923240661621094, "rewards/rejected": -5.670342922210693, "step": 2663 }, { "epoch": 0.444, "grad_norm": 21.53105354309082, "learning_rate": 1.2279304243196436e-07, "logits/chosen": 2.655592441558838, "logits/rejected": 2.8463830947875977, "logps/chosen": -38.64384078979492, "logps/rejected": -286.12420654296875, "loss": 0.5092, "nll_loss": 0.48916250467300415, "rewards/accuracies": 1.0, "rewards/chosen": 1.6367063522338867, "rewards/margins": 6.571174621582031, "rewards/rejected": -4.9344682693481445, "step": 2664 }, { "epoch": 0.44416666666666665, "grad_norm": 225.48675537109375, "learning_rate": 1.2274048073179583e-07, "logits/chosen": 2.6506741046905518, "logits/rejected": 2.625030994415283, "logps/chosen": -73.7739028930664, "logps/rejected": -42.843807220458984, "loss": 4.1531, "nll_loss": 0.9836521148681641, "rewards/accuracies": 0.0, "rewards/chosen": 2.435570478439331, "rewards/margins": -2.430344820022583, "rewards/rejected": 4.865915298461914, "step": 2665 }, { "epoch": 0.44433333333333336, "grad_norm": 51.22864532470703, "learning_rate": 1.2268791240559714e-07, "logits/chosen": 3.2671945095062256, "logits/rejected": 3.510626792907715, "logps/chosen": -41.4022216796875, "logps/rejected": -351.605224609375, "loss": 1.2428, "nll_loss": 1.217712163925171, "rewards/accuracies": 1.0, "rewards/chosen": 1.1268532276153564, "rewards/margins": 12.40966796875, "rewards/rejected": -11.282814979553223, "step": 2666 }, { "epoch": 0.4445, "grad_norm": 34.23150634765625, "learning_rate": 1.226353374686855e-07, "logits/chosen": 2.8181891441345215, "logits/rejected": 3.1374282836914062, "logps/chosen": -30.160917282104492, "logps/rejected": -98.85606384277344, "loss": 0.6672, "nll_loss": 0.6283524632453918, "rewards/accuracies": 1.0, "rewards/chosen": 1.4827944040298462, "rewards/margins": 4.9137797355651855, "rewards/rejected": -3.43098521232605, "step": 2667 }, { "epoch": 0.44466666666666665, "grad_norm": 17.263256072998047, "learning_rate": 1.2258275593637993e-07, "logits/chosen": 1.875788927078247, "logits/rejected": 2.5388054847717285, "logps/chosen": -9.822731018066406, "logps/rejected": -137.47280883789062, "loss": 0.2242, "nll_loss": 0.21828290820121765, "rewards/accuracies": 1.0, "rewards/chosen": 3.0297646522521973, "rewards/margins": 8.614250183105469, "rewards/rejected": -5.584485054016113, "step": 2668 }, { "epoch": 0.44483333333333336, "grad_norm": 104.3100814819336, "learning_rate": 1.2253016782400146e-07, "logits/chosen": 0.7174630165100098, "logits/rejected": 1.6708619594573975, "logps/chosen": -44.07944869995117, "logps/rejected": -153.62539672851562, "loss": 1.7296, "nll_loss": 1.4693148136138916, "rewards/accuracies": 1.0, "rewards/chosen": -1.1968258619308472, "rewards/margins": 2.3288354873657227, "rewards/rejected": -3.5256614685058594, "step": 2669 }, { "epoch": 0.445, "grad_norm": 32.27140808105469, "learning_rate": 1.2247757314687295e-07, "logits/chosen": 2.7048258781433105, "logits/rejected": 2.89056396484375, "logps/chosen": -32.5162353515625, "logps/rejected": -333.3321838378906, "loss": 0.7845, "nll_loss": 0.7741961479187012, "rewards/accuracies": 1.0, "rewards/chosen": 2.09159517288208, "rewards/margins": 9.387691497802734, "rewards/rejected": -7.2960968017578125, "step": 2670 }, { "epoch": 0.44516666666666665, "grad_norm": 45.89254379272461, "learning_rate": 1.2242497192031926e-07, "logits/chosen": 2.6209933757781982, "logits/rejected": 3.028095245361328, "logps/chosen": -21.110240936279297, "logps/rejected": -101.26016235351562, "loss": 0.7962, "nll_loss": 0.7818605899810791, "rewards/accuracies": 1.0, "rewards/chosen": 1.7537510395050049, "rewards/margins": 8.793931007385254, "rewards/rejected": -7.040180206298828, "step": 2671 }, { "epoch": 0.44533333333333336, "grad_norm": 57.675209045410156, "learning_rate": 1.2237236415966714e-07, "logits/chosen": 2.5422956943511963, "logits/rejected": 2.741016149520874, "logps/chosen": -40.804996490478516, "logps/rejected": -366.5841369628906, "loss": 1.4153, "nll_loss": 1.4070690870285034, "rewards/accuracies": 1.0, "rewards/chosen": 2.2856686115264893, "rewards/margins": 11.921420097351074, "rewards/rejected": -9.635751724243164, "step": 2672 }, { "epoch": 0.4455, "grad_norm": 34.486595153808594, "learning_rate": 1.223197498802452e-07, "logits/chosen": 1.8971645832061768, "logits/rejected": 2.1902968883514404, "logps/chosen": -14.874624252319336, "logps/rejected": -193.5069580078125, "loss": 0.5374, "nll_loss": 0.5312365889549255, "rewards/accuracies": 1.0, "rewards/chosen": 3.6372909545898438, "rewards/margins": 8.463705062866211, "rewards/rejected": -4.826414585113525, "step": 2673 }, { "epoch": 0.44566666666666666, "grad_norm": 24.99322509765625, "learning_rate": 1.2226712909738402e-07, "logits/chosen": 2.6465351581573486, "logits/rejected": 2.9511935710906982, "logps/chosen": -30.93966293334961, "logps/rejected": -181.5049591064453, "loss": 0.5648, "nll_loss": 0.5428010821342468, "rewards/accuracies": 1.0, "rewards/chosen": 1.460932970046997, "rewards/margins": 6.6373796463012695, "rewards/rejected": -5.176446437835693, "step": 2674 }, { "epoch": 0.44583333333333336, "grad_norm": 295.3063659667969, "learning_rate": 1.2221450182641598e-07, "logits/chosen": 2.9896271228790283, "logits/rejected": 2.949591636657715, "logps/chosen": -93.34386444091797, "logps/rejected": -68.51581573486328, "loss": 4.4546, "nll_loss": 1.1383397579193115, "rewards/accuracies": 0.0, "rewards/chosen": -2.2459328174591064, "rewards/margins": -3.2096147537231445, "rewards/rejected": 0.9636818766593933, "step": 2675 }, { "epoch": 0.446, "grad_norm": 149.62142944335938, "learning_rate": 1.2216186808267542e-07, "logits/chosen": 2.0876996517181396, "logits/rejected": 1.8238921165466309, "logps/chosen": -97.20247650146484, "logps/rejected": -74.286865234375, "loss": 2.0523, "nll_loss": 1.4727648496627808, "rewards/accuracies": 1.0, "rewards/chosen": -3.291738986968994, "rewards/margins": 2.2529306411743164, "rewards/rejected": -5.5446696281433105, "step": 2676 }, { "epoch": 0.44616666666666666, "grad_norm": 71.19385528564453, "learning_rate": 1.221092278814986e-07, "logits/chosen": 3.1790361404418945, "logits/rejected": 3.080421209335327, "logps/chosen": -58.335243225097656, "logps/rejected": -87.9339599609375, "loss": 1.1003, "nll_loss": 0.6705199480056763, "rewards/accuracies": 1.0, "rewards/chosen": 2.2815093994140625, "rewards/margins": 1.8753738403320312, "rewards/rejected": 0.40613558888435364, "step": 2677 }, { "epoch": 0.44633333333333336, "grad_norm": 39.46961975097656, "learning_rate": 1.220565812382235e-07, "logits/chosen": 2.5820536613464355, "logits/rejected": 2.8141281604766846, "logps/chosen": -23.16331672668457, "logps/rejected": -284.3370666503906, "loss": 0.7256, "nll_loss": 0.7238537073135376, "rewards/accuracies": 1.0, "rewards/chosen": 4.098571300506592, "rewards/margins": 11.244762420654297, "rewards/rejected": -7.146190643310547, "step": 2678 }, { "epoch": 0.4465, "grad_norm": 22.89594078063965, "learning_rate": 1.220039281681902e-07, "logits/chosen": 2.248648166656494, "logits/rejected": 2.0059263706207275, "logps/chosen": -98.27517700195312, "logps/rejected": -145.7205047607422, "loss": 0.8184, "nll_loss": 0.7989853024482727, "rewards/accuracies": 1.0, "rewards/chosen": 1.495208740234375, "rewards/margins": 7.43426513671875, "rewards/rejected": -5.939056396484375, "step": 2679 }, { "epoch": 0.44666666666666666, "grad_norm": 24.944164276123047, "learning_rate": 1.219512686867405e-07, "logits/chosen": 3.0927658081054688, "logits/rejected": 3.2110085487365723, "logps/chosen": -46.95697784423828, "logps/rejected": -93.62904357910156, "loss": 0.5752, "nll_loss": 0.5160107612609863, "rewards/accuracies": 1.0, "rewards/chosen": 1.4571282863616943, "rewards/margins": 4.203357696533203, "rewards/rejected": -2.746229648590088, "step": 2680 }, { "epoch": 0.44683333333333336, "grad_norm": 16.207971572875977, "learning_rate": 1.2189860280921807e-07, "logits/chosen": 2.003490447998047, "logits/rejected": 2.3582773208618164, "logps/chosen": -100.58580017089844, "logps/rejected": -299.8100891113281, "loss": 0.6667, "nll_loss": 0.6574234366416931, "rewards/accuracies": 1.0, "rewards/chosen": 2.244344472885132, "rewards/margins": 8.813295364379883, "rewards/rejected": -6.568950653076172, "step": 2681 }, { "epoch": 0.447, "grad_norm": 34.81601333618164, "learning_rate": 1.2184593055096852e-07, "logits/chosen": 2.7770869731903076, "logits/rejected": 2.888716220855713, "logps/chosen": -89.85515594482422, "logps/rejected": -174.61866760253906, "loss": 0.9044, "nll_loss": 0.8723801374435425, "rewards/accuracies": 1.0, "rewards/chosen": 0.9406288862228394, "rewards/margins": 6.795905113220215, "rewards/rejected": -5.855276107788086, "step": 2682 }, { "epoch": 0.44716666666666666, "grad_norm": 22.47220802307129, "learning_rate": 1.217932519273393e-07, "logits/chosen": 2.7174620628356934, "logits/rejected": 3.103368043899536, "logps/chosen": -49.609893798828125, "logps/rejected": -262.314208984375, "loss": 0.6258, "nll_loss": 0.6201237440109253, "rewards/accuracies": 1.0, "rewards/chosen": 2.667034864425659, "rewards/margins": 11.26635456085205, "rewards/rejected": -8.599319458007812, "step": 2683 }, { "epoch": 0.44733333333333336, "grad_norm": 23.279508590698242, "learning_rate": 1.2174056695367968e-07, "logits/chosen": 2.3822085857391357, "logits/rejected": 2.567640781402588, "logps/chosen": -21.689477920532227, "logps/rejected": -92.12554931640625, "loss": 0.4058, "nll_loss": 0.36761823296546936, "rewards/accuracies": 1.0, "rewards/chosen": 1.9226882457733154, "rewards/margins": 5.007077217102051, "rewards/rejected": -3.0843887329101562, "step": 2684 }, { "epoch": 0.4475, "grad_norm": 197.13734436035156, "learning_rate": 1.2168787564534076e-07, "logits/chosen": 3.9319794178009033, "logits/rejected": 4.091157913208008, "logps/chosen": -61.68903350830078, "logps/rejected": -209.41436767578125, "loss": 1.0216, "nll_loss": 0.9949843287467957, "rewards/accuracies": 1.0, "rewards/chosen": 1.805977702140808, "rewards/margins": 5.606212615966797, "rewards/rejected": -3.8002350330352783, "step": 2685 }, { "epoch": 0.44766666666666666, "grad_norm": 30.672149658203125, "learning_rate": 1.216351780176756e-07, "logits/chosen": 2.266362190246582, "logits/rejected": 2.200460910797119, "logps/chosen": -49.876434326171875, "logps/rejected": -67.01611328125, "loss": 0.7364, "nll_loss": 0.6832388639450073, "rewards/accuracies": 1.0, "rewards/chosen": 0.9374973177909851, "rewards/margins": 4.447022438049316, "rewards/rejected": -3.5095252990722656, "step": 2686 }, { "epoch": 0.44783333333333336, "grad_norm": 42.165687561035156, "learning_rate": 1.2158247408603892e-07, "logits/chosen": 2.5006651878356934, "logits/rejected": 2.666553020477295, "logps/chosen": -25.867496490478516, "logps/rejected": -139.5177001953125, "loss": 0.7092, "nll_loss": 0.6991214752197266, "rewards/accuracies": 1.0, "rewards/chosen": 2.314547300338745, "rewards/margins": 7.872289657592773, "rewards/rejected": -5.557742595672607, "step": 2687 }, { "epoch": 0.448, "grad_norm": 26.30988311767578, "learning_rate": 1.215297638657875e-07, "logits/chosen": 1.1510303020477295, "logits/rejected": 1.6765952110290527, "logps/chosen": -129.46600341796875, "logps/rejected": -261.66839599609375, "loss": 1.1635, "nll_loss": 1.1559464931488037, "rewards/accuracies": 1.0, "rewards/chosen": 2.369288682937622, "rewards/margins": 12.608393669128418, "rewards/rejected": -10.239105224609375, "step": 2688 }, { "epoch": 0.44816666666666666, "grad_norm": 33.56622314453125, "learning_rate": 1.214770473722797e-07, "logits/chosen": 2.8453032970428467, "logits/rejected": 2.9830703735351562, "logps/chosen": -21.30910301208496, "logps/rejected": -172.43511962890625, "loss": 0.6142, "nll_loss": 0.5759217739105225, "rewards/accuracies": 1.0, "rewards/chosen": 0.7715229988098145, "rewards/margins": 6.2461676597595215, "rewards/rejected": -5.474644660949707, "step": 2689 }, { "epoch": 0.4483333333333333, "grad_norm": 29.081377029418945, "learning_rate": 1.2142432462087597e-07, "logits/chosen": 2.7364468574523926, "logits/rejected": 2.5006778240203857, "logps/chosen": -87.42277526855469, "logps/rejected": -56.676185607910156, "loss": 0.9541, "nll_loss": 0.8830583095550537, "rewards/accuracies": 1.0, "rewards/chosen": 2.8220436573028564, "rewards/margins": 4.774270057678223, "rewards/rejected": -1.9522266387939453, "step": 2690 }, { "epoch": 0.4485, "grad_norm": 25.278963088989258, "learning_rate": 1.2137159562693837e-07, "logits/chosen": 2.8850672245025635, "logits/rejected": 2.9892067909240723, "logps/chosen": -81.21672058105469, "logps/rejected": -118.02836608886719, "loss": 0.8635, "nll_loss": 0.8372856974601746, "rewards/accuracies": 1.0, "rewards/chosen": 1.5984742641448975, "rewards/margins": 5.723610877990723, "rewards/rejected": -4.125136852264404, "step": 2691 }, { "epoch": 0.44866666666666666, "grad_norm": 88.04177856445312, "learning_rate": 1.2131886040583087e-07, "logits/chosen": 3.017894744873047, "logits/rejected": 3.3360140323638916, "logps/chosen": -11.84015941619873, "logps/rejected": -227.2779998779297, "loss": 1.0505, "nll_loss": 0.9866799712181091, "rewards/accuracies": 1.0, "rewards/chosen": 0.10254736244678497, "rewards/margins": 7.185440540313721, "rewards/rejected": -7.082893371582031, "step": 2692 }, { "epoch": 0.4488333333333333, "grad_norm": 29.572200775146484, "learning_rate": 1.212661189729193e-07, "logits/chosen": 2.263943672180176, "logits/rejected": 2.4357919692993164, "logps/chosen": -35.96602249145508, "logps/rejected": -171.77525329589844, "loss": 0.6443, "nll_loss": 0.5994336605072021, "rewards/accuracies": 1.0, "rewards/chosen": 0.4952682554721832, "rewards/margins": 8.2085599899292, "rewards/rejected": -7.713292121887207, "step": 2693 }, { "epoch": 0.449, "grad_norm": 32.66150665283203, "learning_rate": 1.2121337134357119e-07, "logits/chosen": 2.1151747703552246, "logits/rejected": 2.135495185852051, "logps/chosen": -86.32006072998047, "logps/rejected": -183.52700805664062, "loss": 0.9018, "nll_loss": 0.8808168172836304, "rewards/accuracies": 1.0, "rewards/chosen": 1.5846261978149414, "rewards/margins": 6.460972309112549, "rewards/rejected": -4.876346111297607, "step": 2694 }, { "epoch": 0.44916666666666666, "grad_norm": 29.743680953979492, "learning_rate": 1.2116061753315596e-07, "logits/chosen": 1.6170499324798584, "logits/rejected": 1.7200218439102173, "logps/chosen": -96.19192504882812, "logps/rejected": -130.3782958984375, "loss": 0.977, "nll_loss": 0.9619191884994507, "rewards/accuracies": 1.0, "rewards/chosen": 2.151859998703003, "rewards/margins": 6.747198104858398, "rewards/rejected": -4.595337867736816, "step": 2695 }, { "epoch": 0.4493333333333333, "grad_norm": 34.07683181762695, "learning_rate": 1.211078575570448e-07, "logits/chosen": 0.9203994870185852, "logits/rejected": 2.1272051334381104, "logps/chosen": -71.63043212890625, "logps/rejected": -353.22149658203125, "loss": 1.0336, "nll_loss": 1.0232919454574585, "rewards/accuracies": 1.0, "rewards/chosen": 2.1149284839630127, "rewards/margins": 8.9226655960083, "rewards/rejected": -6.807736873626709, "step": 2696 }, { "epoch": 0.4495, "grad_norm": 33.99272537231445, "learning_rate": 1.210550914306107e-07, "logits/chosen": 2.561285972595215, "logits/rejected": 2.356774091720581, "logps/chosen": -40.7583122253418, "logps/rejected": -77.0492172241211, "loss": 0.9246, "nll_loss": 0.8860503435134888, "rewards/accuracies": 1.0, "rewards/chosen": 3.9761099815368652, "rewards/margins": 6.5004987716674805, "rewards/rejected": -2.5243890285491943, "step": 2697 }, { "epoch": 0.44966666666666666, "grad_norm": 299.7198791503906, "learning_rate": 1.2100231916922845e-07, "logits/chosen": 1.5345803499221802, "logits/rejected": 1.8071457147598267, "logps/chosen": -69.06848907470703, "logps/rejected": -142.92478942871094, "loss": 2.7886, "nll_loss": 1.0308730602264404, "rewards/accuracies": 0.0, "rewards/chosen": 0.4125351011753082, "rewards/margins": -1.2274597883224487, "rewards/rejected": 1.6399948596954346, "step": 2698 }, { "epoch": 0.4498333333333333, "grad_norm": 40.1331787109375, "learning_rate": 1.209495407882746e-07, "logits/chosen": 2.486675262451172, "logits/rejected": 2.3599038124084473, "logps/chosen": -15.581493377685547, "logps/rejected": -48.414825439453125, "loss": 0.5943, "nll_loss": 0.5564819574356079, "rewards/accuracies": 1.0, "rewards/chosen": 1.742251992225647, "rewards/margins": 4.971980094909668, "rewards/rejected": -3.2297279834747314, "step": 2699 }, { "epoch": 0.45, "grad_norm": 36.845787048339844, "learning_rate": 1.208967563031275e-07, "logits/chosen": 1.3869167566299438, "logits/rejected": 1.9838024377822876, "logps/chosen": -22.394878387451172, "logps/rejected": -221.23245239257812, "loss": 0.6005, "nll_loss": 0.5598719716072083, "rewards/accuracies": 1.0, "rewards/chosen": 0.5978330969810486, "rewards/margins": 9.962687492370605, "rewards/rejected": -9.36485481262207, "step": 2700 }, { "epoch": 0.45016666666666666, "grad_norm": 38.258296966552734, "learning_rate": 1.208439657291673e-07, "logits/chosen": 3.1869759559631348, "logits/rejected": 3.356593370437622, "logps/chosen": -19.448347091674805, "logps/rejected": -146.61849975585938, "loss": 0.7039, "nll_loss": 0.6482782959938049, "rewards/accuracies": 1.0, "rewards/chosen": 0.5576555132865906, "rewards/margins": 4.790395259857178, "rewards/rejected": -4.2327399253845215, "step": 2701 }, { "epoch": 0.4503333333333333, "grad_norm": 42.457000732421875, "learning_rate": 1.207911690817759e-07, "logits/chosen": 2.2755417823791504, "logits/rejected": 2.3179616928100586, "logps/chosen": -16.8441162109375, "logps/rejected": -158.92596435546875, "loss": 0.6409, "nll_loss": 0.5808315873146057, "rewards/accuracies": 1.0, "rewards/chosen": 2.330218553543091, "rewards/margins": 4.617505073547363, "rewards/rejected": -2.2872865200042725, "step": 2702 }, { "epoch": 0.4505, "grad_norm": 172.68270874023438, "learning_rate": 1.2073836637633704e-07, "logits/chosen": 2.071030616760254, "logits/rejected": 2.0850231647491455, "logps/chosen": -38.21003723144531, "logps/rejected": -25.755855560302734, "loss": 2.634, "nll_loss": 0.6065086126327515, "rewards/accuracies": 0.0, "rewards/chosen": 0.8497955799102783, "rewards/margins": -1.473365306854248, "rewards/rejected": 2.3231608867645264, "step": 2703 }, { "epoch": 0.45066666666666666, "grad_norm": 27.953290939331055, "learning_rate": 1.2068555762823603e-07, "logits/chosen": 2.381213426589966, "logits/rejected": 2.545222043991089, "logps/chosen": -20.501996994018555, "logps/rejected": -260.3446350097656, "loss": 0.5683, "nll_loss": 0.5541080832481384, "rewards/accuracies": 1.0, "rewards/chosen": 1.7375783920288086, "rewards/margins": 9.302373886108398, "rewards/rejected": -7.564795017242432, "step": 2704 }, { "epoch": 0.4508333333333333, "grad_norm": 34.89653015136719, "learning_rate": 1.2063274285286015e-07, "logits/chosen": 2.9303107261657715, "logits/rejected": 2.9298505783081055, "logps/chosen": -99.07569122314453, "logps/rejected": -244.7764892578125, "loss": 1.135, "nll_loss": 1.1008409261703491, "rewards/accuracies": 1.0, "rewards/chosen": 0.7921119928359985, "rewards/margins": 8.867510795593262, "rewards/rejected": -8.075398445129395, "step": 2705 }, { "epoch": 0.451, "grad_norm": 36.22417068481445, "learning_rate": 1.2057992206559835e-07, "logits/chosen": 1.4277328252792358, "logits/rejected": 2.3242619037628174, "logps/chosen": -36.7000617980957, "logps/rejected": -74.61245727539062, "loss": 0.6307, "nll_loss": 0.5919365286827087, "rewards/accuracies": 1.0, "rewards/chosen": 0.9421696662902832, "rewards/margins": 5.393960475921631, "rewards/rejected": -4.451790809631348, "step": 2706 }, { "epoch": 0.45116666666666666, "grad_norm": 42.187808990478516, "learning_rate": 1.2052709528184138e-07, "logits/chosen": 1.8219943046569824, "logits/rejected": 2.4424211978912354, "logps/chosen": -63.050743103027344, "logps/rejected": -207.97140502929688, "loss": 1.163, "nll_loss": 1.1061533689498901, "rewards/accuracies": 1.0, "rewards/chosen": 0.45152512192726135, "rewards/margins": 4.978692054748535, "rewards/rejected": -4.527166843414307, "step": 2707 }, { "epoch": 0.4513333333333333, "grad_norm": 24.548921585083008, "learning_rate": 1.2047426251698158e-07, "logits/chosen": 2.2794065475463867, "logits/rejected": 2.2509546279907227, "logps/chosen": -86.3290786743164, "logps/rejected": -115.15107727050781, "loss": 0.8349, "nll_loss": 0.8144251704216003, "rewards/accuracies": 1.0, "rewards/chosen": 1.762750267982483, "rewards/margins": 6.260667324066162, "rewards/rejected": -4.497917175292969, "step": 2708 }, { "epoch": 0.4515, "grad_norm": 99.03570556640625, "learning_rate": 1.2042142378641328e-07, "logits/chosen": 3.600107431411743, "logits/rejected": 3.744575262069702, "logps/chosen": -17.935462951660156, "logps/rejected": -56.314029693603516, "loss": 0.9682, "nll_loss": 0.9439717531204224, "rewards/accuracies": 1.0, "rewards/chosen": 1.8530510663986206, "rewards/margins": 5.785544395446777, "rewards/rejected": -3.932493209838867, "step": 2709 }, { "epoch": 0.45166666666666666, "grad_norm": 18.88953971862793, "learning_rate": 1.2036857910553233e-07, "logits/chosen": 1.9300754070281982, "logits/rejected": 1.7337536811828613, "logps/chosen": -129.19187927246094, "logps/rejected": -143.42648315429688, "loss": 0.8242, "nll_loss": 0.807449221611023, "rewards/accuracies": 1.0, "rewards/chosen": 2.110795736312866, "rewards/margins": 6.509572982788086, "rewards/rejected": -4.398777008056641, "step": 2710 }, { "epoch": 0.4518333333333333, "grad_norm": 21.732776641845703, "learning_rate": 1.2031572848973645e-07, "logits/chosen": 0.9257459044456482, "logits/rejected": 2.247591495513916, "logps/chosen": -73.12128448486328, "logps/rejected": -393.3912353515625, "loss": 0.8062, "nll_loss": 0.7947965860366821, "rewards/accuracies": 1.0, "rewards/chosen": 2.208019256591797, "rewards/margins": 7.595269203186035, "rewards/rejected": -5.387249946594238, "step": 2711 }, { "epoch": 0.452, "grad_norm": 29.23257064819336, "learning_rate": 1.20262871954425e-07, "logits/chosen": 2.671635389328003, "logits/rejected": 2.896131992340088, "logps/chosen": -11.980720520019531, "logps/rejected": -326.31365966796875, "loss": 0.3431, "nll_loss": 0.30719801783561707, "rewards/accuracies": 1.0, "rewards/chosen": 0.7499539256095886, "rewards/margins": 7.754605293273926, "rewards/rejected": -7.0046515464782715, "step": 2712 }, { "epoch": 0.45216666666666666, "grad_norm": 32.603477478027344, "learning_rate": 1.2021000951499915e-07, "logits/chosen": 2.2481467723846436, "logits/rejected": 2.932091474533081, "logps/chosen": -43.41826629638672, "logps/rejected": -324.59967041015625, "loss": 0.9265, "nll_loss": 0.9045471549034119, "rewards/accuracies": 1.0, "rewards/chosen": 1.5286157131195068, "rewards/margins": 6.404628753662109, "rewards/rejected": -4.876013278961182, "step": 2713 }, { "epoch": 0.4523333333333333, "grad_norm": 33.402095794677734, "learning_rate": 1.201571411868617e-07, "logits/chosen": 2.4167845249176025, "logits/rejected": 2.331531286239624, "logps/chosen": -18.36666488647461, "logps/rejected": -126.4014892578125, "loss": 0.585, "nll_loss": 0.5739582777023315, "rewards/accuracies": 1.0, "rewards/chosen": 2.0333621501922607, "rewards/margins": 8.851337432861328, "rewards/rejected": -6.817975044250488, "step": 2714 }, { "epoch": 0.4525, "grad_norm": 45.081356048583984, "learning_rate": 1.2010426698541726e-07, "logits/chosen": 3.8755815029144287, "logits/rejected": 3.921823501586914, "logps/chosen": -78.15689086914062, "logps/rejected": -144.36416625976562, "loss": 1.0157, "nll_loss": 0.9088010787963867, "rewards/accuracies": 1.0, "rewards/chosen": 0.8052521347999573, "rewards/margins": 3.184980869293213, "rewards/rejected": -2.3797287940979004, "step": 2715 }, { "epoch": 0.45266666666666666, "grad_norm": 91.80126953125, "learning_rate": 1.200513869260721e-07, "logits/chosen": 2.073895215988159, "logits/rejected": 2.0897562503814697, "logps/chosen": -34.44402313232422, "logps/rejected": -39.73258972167969, "loss": 1.4185, "nll_loss": 0.564656138420105, "rewards/accuracies": 1.0, "rewards/chosen": 2.419992208480835, "rewards/margins": 0.7304509878158569, "rewards/rejected": 1.689541220664978, "step": 2716 }, { "epoch": 0.4528333333333333, "grad_norm": 176.259033203125, "learning_rate": 1.1999850102423412e-07, "logits/chosen": 2.106539487838745, "logits/rejected": 1.7768833637237549, "logps/chosen": -50.701690673828125, "logps/rejected": -32.442081451416016, "loss": 2.55, "nll_loss": 0.5696819424629211, "rewards/accuracies": 0.0, "rewards/chosen": 1.563951849937439, "rewards/margins": -1.268609642982483, "rewards/rejected": 2.832561492919922, "step": 2717 }, { "epoch": 0.453, "grad_norm": 115.86772155761719, "learning_rate": 1.1994560929531308e-07, "logits/chosen": 1.975231409072876, "logits/rejected": 1.521166443824768, "logps/chosen": -75.87081909179688, "logps/rejected": -13.215435028076172, "loss": 1.9273, "nll_loss": 0.815815269947052, "rewards/accuracies": 1.0, "rewards/chosen": 2.6241297721862793, "rewards/margins": 0.28081583976745605, "rewards/rejected": 2.3433139324188232, "step": 2718 }, { "epoch": 0.45316666666666666, "grad_norm": 23.153234481811523, "learning_rate": 1.1989271175472034e-07, "logits/chosen": 2.1858603954315186, "logits/rejected": 2.922680377960205, "logps/chosen": -91.85139465332031, "logps/rejected": -220.89163208007812, "loss": 0.9095, "nll_loss": 0.9005038738250732, "rewards/accuracies": 1.0, "rewards/chosen": 2.777571439743042, "rewards/margins": 7.678680419921875, "rewards/rejected": -4.901109218597412, "step": 2719 }, { "epoch": 0.4533333333333333, "grad_norm": 37.99546813964844, "learning_rate": 1.1983980841786897e-07, "logits/chosen": 2.145122528076172, "logits/rejected": 2.3431155681610107, "logps/chosen": -72.02733612060547, "logps/rejected": -101.97514343261719, "loss": 1.09, "nll_loss": 1.0438743829727173, "rewards/accuracies": 1.0, "rewards/chosen": 0.5838310122489929, "rewards/margins": 5.7926225662231445, "rewards/rejected": -5.208791732788086, "step": 2720 }, { "epoch": 0.4535, "grad_norm": 24.00527572631836, "learning_rate": 1.1978689930017378e-07, "logits/chosen": 2.6208534240722656, "logits/rejected": 2.594851016998291, "logps/chosen": -57.545982360839844, "logps/rejected": -129.1675262451172, "loss": 0.76, "nll_loss": 0.7571839094161987, "rewards/accuracies": 1.0, "rewards/chosen": 3.4973411560058594, "rewards/margins": 10.878210067749023, "rewards/rejected": -7.380869388580322, "step": 2721 }, { "epoch": 0.45366666666666666, "grad_norm": 23.54596710205078, "learning_rate": 1.1973398441705112e-07, "logits/chosen": 2.4264774322509766, "logits/rejected": 2.828059196472168, "logps/chosen": -14.500144004821777, "logps/rejected": -469.54937744140625, "loss": 0.3314, "nll_loss": 0.3222254514694214, "rewards/accuracies": 1.0, "rewards/chosen": 2.1619420051574707, "rewards/margins": 16.50092315673828, "rewards/rejected": -14.338980674743652, "step": 2722 }, { "epoch": 0.4538333333333333, "grad_norm": 25.439878463745117, "learning_rate": 1.196810637839192e-07, "logits/chosen": 2.5358173847198486, "logits/rejected": 2.480682849884033, "logps/chosen": -87.3317642211914, "logps/rejected": -118.16816711425781, "loss": 0.8458, "nll_loss": 0.8012088537216187, "rewards/accuracies": 1.0, "rewards/chosen": 1.3164727687835693, "rewards/margins": 4.672898292541504, "rewards/rejected": -3.3564255237579346, "step": 2723 }, { "epoch": 0.454, "grad_norm": 19.63418960571289, "learning_rate": 1.1962813741619775e-07, "logits/chosen": 2.5336718559265137, "logits/rejected": 2.595709800720215, "logps/chosen": -65.3692626953125, "logps/rejected": -233.23512268066406, "loss": 0.5402, "nll_loss": 0.5314573645591736, "rewards/accuracies": 1.0, "rewards/chosen": 2.8935532569885254, "rewards/margins": 7.712581157684326, "rewards/rejected": -4.819027900695801, "step": 2724 }, { "epoch": 0.45416666666666666, "grad_norm": 40.12833023071289, "learning_rate": 1.195752053293083e-07, "logits/chosen": 1.8899022340774536, "logits/rejected": 1.9161651134490967, "logps/chosen": -40.04972839355469, "logps/rejected": -136.70709228515625, "loss": 0.7192, "nll_loss": 0.6905125379562378, "rewards/accuracies": 1.0, "rewards/chosen": 1.03087317943573, "rewards/margins": 7.236209869384766, "rewards/rejected": -6.205336570739746, "step": 2725 }, { "epoch": 0.4543333333333333, "grad_norm": 33.6784553527832, "learning_rate": 1.1952226753867398e-07, "logits/chosen": 2.8215227127075195, "logits/rejected": 2.847486734390259, "logps/chosen": -55.02806854248047, "logps/rejected": -41.29122543334961, "loss": 0.7567, "nll_loss": 0.6629887819290161, "rewards/accuracies": 1.0, "rewards/chosen": 2.444844961166382, "rewards/margins": 4.144363880157471, "rewards/rejected": -1.6995190382003784, "step": 2726 }, { "epoch": 0.4545, "grad_norm": 37.98081588745117, "learning_rate": 1.194693240597196e-07, "logits/chosen": 2.314742088317871, "logits/rejected": 2.2169923782348633, "logps/chosen": -31.62725067138672, "logps/rejected": -47.27933883666992, "loss": 0.8865, "nll_loss": 0.8109551072120667, "rewards/accuracies": 1.0, "rewards/chosen": 1.6022405624389648, "rewards/margins": 3.907119035720825, "rewards/rejected": -2.3048784732818604, "step": 2727 }, { "epoch": 0.45466666666666666, "grad_norm": 30.543540954589844, "learning_rate": 1.194163749078716e-07, "logits/chosen": 2.3992507457733154, "logits/rejected": 2.923222303390503, "logps/chosen": -70.02335357666016, "logps/rejected": -298.72808837890625, "loss": 0.8269, "nll_loss": 0.8048661947250366, "rewards/accuracies": 1.0, "rewards/chosen": 1.8275116682052612, "rewards/margins": 6.005823135375977, "rewards/rejected": -4.178311347961426, "step": 2728 }, { "epoch": 0.4548333333333333, "grad_norm": 36.85272979736328, "learning_rate": 1.1936342009855805e-07, "logits/chosen": 2.6686511039733887, "logits/rejected": 2.798595666885376, "logps/chosen": -104.21235656738281, "logps/rejected": -320.4723815917969, "loss": 1.0186, "nll_loss": 0.9739474058151245, "rewards/accuracies": 1.0, "rewards/chosen": 0.5217193961143494, "rewards/margins": 7.0203938484191895, "rewards/rejected": -6.498674392700195, "step": 2729 }, { "epoch": 0.455, "grad_norm": 31.68899154663086, "learning_rate": 1.193104596472088e-07, "logits/chosen": 2.6500444412231445, "logits/rejected": 2.8019044399261475, "logps/chosen": -102.52569580078125, "logps/rejected": -310.54156494140625, "loss": 1.2472, "nll_loss": 1.235249400138855, "rewards/accuracies": 1.0, "rewards/chosen": 2.130035400390625, "rewards/margins": 7.561074733734131, "rewards/rejected": -5.431039333343506, "step": 2730 }, { "epoch": 0.45516666666666666, "grad_norm": 44.02415466308594, "learning_rate": 1.192574935692552e-07, "logits/chosen": 2.8109130859375, "logits/rejected": 2.7232115268707275, "logps/chosen": -21.945465087890625, "logps/rejected": -132.771484375, "loss": 0.8501, "nll_loss": 0.844056248664856, "rewards/accuracies": 1.0, "rewards/chosen": 3.0643434524536133, "rewards/margins": 8.500910758972168, "rewards/rejected": -5.436567306518555, "step": 2731 }, { "epoch": 0.4553333333333333, "grad_norm": 32.28204345703125, "learning_rate": 1.1920452188013028e-07, "logits/chosen": 2.6117992401123047, "logits/rejected": 2.3902957439422607, "logps/chosen": -85.68167114257812, "logps/rejected": -57.102020263671875, "loss": 0.9547, "nll_loss": 0.9019123315811157, "rewards/accuracies": 1.0, "rewards/chosen": 0.6093368530273438, "rewards/margins": 4.873449325561523, "rewards/rejected": -4.26411247253418, "step": 2732 }, { "epoch": 0.4555, "grad_norm": 18.828657150268555, "learning_rate": 1.1915154459526874e-07, "logits/chosen": 2.587691307067871, "logits/rejected": 2.6213600635528564, "logps/chosen": -73.1205825805664, "logps/rejected": -82.55256652832031, "loss": 0.626, "nll_loss": 0.6144587993621826, "rewards/accuracies": 1.0, "rewards/chosen": 2.6638970375061035, "rewards/margins": 7.17689323425293, "rewards/rejected": -4.512996196746826, "step": 2733 }, { "epoch": 0.45566666666666666, "grad_norm": 24.845848083496094, "learning_rate": 1.1909856173010693e-07, "logits/chosen": 0.9947044849395752, "logits/rejected": 2.2515180110931396, "logps/chosen": -54.81045150756836, "logps/rejected": -365.28094482421875, "loss": 0.7385, "nll_loss": 0.7211902141571045, "rewards/accuracies": 1.0, "rewards/chosen": 1.5062954425811768, "rewards/margins": 10.466503143310547, "rewards/rejected": -8.96020793914795, "step": 2734 }, { "epoch": 0.4558333333333333, "grad_norm": 36.41762161254883, "learning_rate": 1.1904557330008271e-07, "logits/chosen": 2.4077017307281494, "logits/rejected": 2.468644618988037, "logps/chosen": -35.09194564819336, "logps/rejected": -86.87651062011719, "loss": 0.663, "nll_loss": 0.5848658084869385, "rewards/accuracies": 1.0, "rewards/chosen": 1.3921982049942017, "rewards/margins": 3.7781310081481934, "rewards/rejected": -2.3859329223632812, "step": 2735 }, { "epoch": 0.456, "grad_norm": 4.271626949310303, "learning_rate": 1.1899257932063569e-07, "logits/chosen": 2.875075101852417, "logits/rejected": 1.4958792924880981, "logps/chosen": -52.459693908691406, "logps/rejected": -51.315467834472656, "loss": 0.0652, "nll_loss": 0.053204555064439774, "rewards/accuracies": 1.0, "rewards/chosen": 4.791121006011963, "rewards/margins": 8.550418853759766, "rewards/rejected": -3.759298324584961, "step": 2736 }, { "epoch": 0.45616666666666666, "grad_norm": 33.10847091674805, "learning_rate": 1.1893957980720702e-07, "logits/chosen": 2.9668169021606445, "logits/rejected": 3.125251293182373, "logps/chosen": -19.586414337158203, "logps/rejected": -169.2040557861328, "loss": 0.4921, "nll_loss": 0.4777173399925232, "rewards/accuracies": 1.0, "rewards/chosen": 1.912290334701538, "rewards/margins": 7.330073356628418, "rewards/rejected": -5.417782783508301, "step": 2737 }, { "epoch": 0.4563333333333333, "grad_norm": 40.33528137207031, "learning_rate": 1.188865747752395e-07, "logits/chosen": 1.7020947933197021, "logits/rejected": 1.4615367650985718, "logps/chosen": -39.5936393737793, "logps/rejected": -73.0849609375, "loss": 0.7147, "nll_loss": 0.5909498929977417, "rewards/accuracies": 1.0, "rewards/chosen": 0.4677814543247223, "rewards/margins": 2.9469411373138428, "rewards/rejected": -2.4791595935821533, "step": 2738 }, { "epoch": 0.4565, "grad_norm": 28.544313430786133, "learning_rate": 1.1883356424017747e-07, "logits/chosen": 1.920780062675476, "logits/rejected": 2.0783679485321045, "logps/chosen": -26.020294189453125, "logps/rejected": -260.70867919921875, "loss": 0.4776, "nll_loss": 0.47309619188308716, "rewards/accuracies": 1.0, "rewards/chosen": 4.459792613983154, "rewards/margins": 9.40539836883545, "rewards/rejected": -4.945605754852295, "step": 2739 }, { "epoch": 0.45666666666666667, "grad_norm": 56.95542526245117, "learning_rate": 1.1878054821746701e-07, "logits/chosen": 2.167124032974243, "logits/rejected": 1.9932563304901123, "logps/chosen": -61.95050048828125, "logps/rejected": -65.7287368774414, "loss": 1.2695, "nll_loss": 1.1688772439956665, "rewards/accuracies": 1.0, "rewards/chosen": 0.9720413684844971, "rewards/margins": 3.299306631088257, "rewards/rejected": -2.3272652626037598, "step": 2740 }, { "epoch": 0.4568333333333333, "grad_norm": 23.712627410888672, "learning_rate": 1.1872752672255566e-07, "logits/chosen": 2.8534255027770996, "logits/rejected": 2.5633020401000977, "logps/chosen": -337.38336181640625, "logps/rejected": -135.61795043945312, "loss": 1.1851, "nll_loss": 1.1674163341522217, "rewards/accuracies": 1.0, "rewards/chosen": 1.5467225313186646, "rewards/margins": 7.925663948059082, "rewards/rejected": -6.378941535949707, "step": 2741 }, { "epoch": 0.457, "grad_norm": 57.30289840698242, "learning_rate": 1.1867449977089263e-07, "logits/chosen": 2.3862357139587402, "logits/rejected": 2.5889341831207275, "logps/chosen": -42.64434051513672, "logps/rejected": -53.052528381347656, "loss": 1.5652, "nll_loss": 1.523012399673462, "rewards/accuracies": 1.0, "rewards/chosen": 1.5031330585479736, "rewards/margins": 4.760276794433594, "rewards/rejected": -3.257143497467041, "step": 2742 }, { "epoch": 0.45716666666666667, "grad_norm": 30.312198638916016, "learning_rate": 1.1862146737792871e-07, "logits/chosen": 1.6014752388000488, "logits/rejected": 1.5123473405838013, "logps/chosen": -11.088127136230469, "logps/rejected": -54.14141845703125, "loss": 0.3324, "nll_loss": 0.27720320224761963, "rewards/accuracies": 1.0, "rewards/chosen": 1.0002355575561523, "rewards/margins": 4.33168888092041, "rewards/rejected": -3.331453323364258, "step": 2743 }, { "epoch": 0.4573333333333333, "grad_norm": 25.83025550842285, "learning_rate": 1.1856842955911623e-07, "logits/chosen": 1.1321673393249512, "logits/rejected": 2.353411912918091, "logps/chosen": -87.97085571289062, "logps/rejected": -291.5913391113281, "loss": 0.9512, "nll_loss": 0.9358601570129395, "rewards/accuracies": 1.0, "rewards/chosen": 1.6290985345840454, "rewards/margins": 10.694846153259277, "rewards/rejected": -9.065747261047363, "step": 2744 }, { "epoch": 0.4575, "grad_norm": 20.920894622802734, "learning_rate": 1.185153863299092e-07, "logits/chosen": 3.2256481647491455, "logits/rejected": 3.254242181777954, "logps/chosen": -93.8270492553711, "logps/rejected": -159.50881958007812, "loss": 0.8746, "nll_loss": 0.868769109249115, "rewards/accuracies": 1.0, "rewards/chosen": 3.27128529548645, "rewards/margins": 8.522603988647461, "rewards/rejected": -5.25131893157959, "step": 2745 }, { "epoch": 0.45766666666666667, "grad_norm": 34.65296173095703, "learning_rate": 1.1846233770576308e-07, "logits/chosen": 2.5061001777648926, "logits/rejected": 2.6967995166778564, "logps/chosen": -23.064861297607422, "logps/rejected": -66.83602142333984, "loss": 0.5804, "nll_loss": 0.5491633415222168, "rewards/accuracies": 1.0, "rewards/chosen": 0.9738903641700745, "rewards/margins": 6.633166313171387, "rewards/rejected": -5.659276008605957, "step": 2746 }, { "epoch": 0.4578333333333333, "grad_norm": 109.9751205444336, "learning_rate": 1.1840928370213501e-07, "logits/chosen": 2.7163476943969727, "logits/rejected": 2.728926420211792, "logps/chosen": -8.984987258911133, "logps/rejected": -159.0750732421875, "loss": 0.7661, "nll_loss": 0.7487488389015198, "rewards/accuracies": 1.0, "rewards/chosen": 1.7884187698364258, "rewards/margins": 6.771801948547363, "rewards/rejected": -4.9833831787109375, "step": 2747 }, { "epoch": 0.458, "grad_norm": 202.38645935058594, "learning_rate": 1.183562243344836e-07, "logits/chosen": 2.8684706687927246, "logits/rejected": 2.720916986465454, "logps/chosen": -57.377197265625, "logps/rejected": -17.21868324279785, "loss": 3.1841, "nll_loss": 0.925438642501831, "rewards/accuracies": 0.0, "rewards/chosen": 0.31674158573150635, "rewards/margins": -1.8352547883987427, "rewards/rejected": 2.151996374130249, "step": 2748 }, { "epoch": 0.45816666666666667, "grad_norm": 19.961057662963867, "learning_rate": 1.1830315961826915e-07, "logits/chosen": 2.8258144855499268, "logits/rejected": 2.946816921234131, "logps/chosen": -72.95729064941406, "logps/rejected": -533.9026489257812, "loss": 0.7284, "nll_loss": 0.7223493456840515, "rewards/accuracies": 1.0, "rewards/chosen": 2.639223098754883, "rewards/margins": 10.091484069824219, "rewards/rejected": -7.452261447906494, "step": 2749 }, { "epoch": 0.4583333333333333, "grad_norm": 226.35440063476562, "learning_rate": 1.1825008956895338e-07, "logits/chosen": 2.3184964656829834, "logits/rejected": 2.255322217941284, "logps/chosen": -99.83386993408203, "logps/rejected": -23.66942596435547, "loss": 3.56, "nll_loss": 0.9692607522010803, "rewards/accuracies": 0.0, "rewards/chosen": 1.7578651905059814, "rewards/margins": -1.9352505207061768, "rewards/rejected": 3.693115711212158, "step": 2750 }, { "epoch": 0.4585, "grad_norm": 20.96988296508789, "learning_rate": 1.1819701420199968e-07, "logits/chosen": 2.389677047729492, "logits/rejected": 2.6627163887023926, "logps/chosen": -56.22869873046875, "logps/rejected": -284.7382507324219, "loss": 0.5915, "nll_loss": 0.5796773433685303, "rewards/accuracies": 1.0, "rewards/chosen": 1.912290334701538, "rewards/margins": 10.218165397644043, "rewards/rejected": -8.305874824523926, "step": 2751 }, { "epoch": 0.45866666666666667, "grad_norm": 21.994428634643555, "learning_rate": 1.1814393353287286e-07, "logits/chosen": 2.4863834381103516, "logits/rejected": 2.684173107147217, "logps/chosen": -25.725954055786133, "logps/rejected": -446.431396484375, "loss": 0.4463, "nll_loss": 0.4360331594944, "rewards/accuracies": 1.0, "rewards/chosen": 2.0644500255584717, "rewards/margins": 9.9867525100708, "rewards/rejected": -7.92230224609375, "step": 2752 }, { "epoch": 0.4588333333333333, "grad_norm": 24.784725189208984, "learning_rate": 1.1809084757703944e-07, "logits/chosen": 2.552525520324707, "logits/rejected": 2.802564859390259, "logps/chosen": -95.57159423828125, "logps/rejected": -197.6004638671875, "loss": 1.0255, "nll_loss": 1.006016731262207, "rewards/accuracies": 1.0, "rewards/chosen": 2.0485565662384033, "rewards/margins": 6.1931915283203125, "rewards/rejected": -4.144635200500488, "step": 2753 }, { "epoch": 0.459, "grad_norm": 24.941570281982422, "learning_rate": 1.1803775634996734e-07, "logits/chosen": 2.258244752883911, "logits/rejected": 2.2774927616119385, "logps/chosen": -37.269615173339844, "logps/rejected": -110.99383544921875, "loss": 0.5082, "nll_loss": 0.47781553864479065, "rewards/accuracies": 1.0, "rewards/chosen": 2.4621925354003906, "rewards/margins": 5.585356712341309, "rewards/rejected": -3.123164415359497, "step": 2754 }, { "epoch": 0.45916666666666667, "grad_norm": 30.34119415283203, "learning_rate": 1.1798465986712612e-07, "logits/chosen": 2.5649867057800293, "logits/rejected": 2.5890774726867676, "logps/chosen": -66.0602798461914, "logps/rejected": -61.92692184448242, "loss": 0.906, "nll_loss": 0.880803644657135, "rewards/accuracies": 1.0, "rewards/chosen": 1.2434806823730469, "rewards/margins": 6.698469161987305, "rewards/rejected": -5.454988479614258, "step": 2755 }, { "epoch": 0.4593333333333333, "grad_norm": 67.25879669189453, "learning_rate": 1.1793155814398672e-07, "logits/chosen": 3.135544538497925, "logits/rejected": 3.02428936958313, "logps/chosen": -115.22886657714844, "logps/rejected": -36.11595153808594, "loss": 1.2679, "nll_loss": 0.9002254009246826, "rewards/accuracies": 1.0, "rewards/chosen": 2.5035877227783203, "rewards/margins": 2.272432327270508, "rewards/rejected": 0.2311553955078125, "step": 2756 }, { "epoch": 0.4595, "grad_norm": 103.03313446044922, "learning_rate": 1.1787845119602182e-07, "logits/chosen": 1.25270676612854, "logits/rejected": 2.1191818714141846, "logps/chosen": -34.91477966308594, "logps/rejected": -98.6748046875, "loss": 2.1525, "nll_loss": 2.0538108348846436, "rewards/accuracies": 1.0, "rewards/chosen": 0.8143722414970398, "rewards/margins": 3.311262607574463, "rewards/rejected": -2.4968903064727783, "step": 2757 }, { "epoch": 0.45966666666666667, "grad_norm": 27.133115768432617, "learning_rate": 1.1782533903870545e-07, "logits/chosen": 1.757475733757019, "logits/rejected": 2.254307746887207, "logps/chosen": -62.94628143310547, "logps/rejected": -433.7530517578125, "loss": 0.9108, "nll_loss": 0.8992325663566589, "rewards/accuracies": 1.0, "rewards/chosen": 1.913398027420044, "rewards/margins": 16.626808166503906, "rewards/rejected": -14.713410377502441, "step": 2758 }, { "epoch": 0.4598333333333333, "grad_norm": 24.21143341064453, "learning_rate": 1.1777222168751324e-07, "logits/chosen": 2.3032917976379395, "logits/rejected": 2.293459892272949, "logps/chosen": -84.38763427734375, "logps/rejected": -161.251708984375, "loss": 0.8908, "nll_loss": 0.8790377974510193, "rewards/accuracies": 1.0, "rewards/chosen": 2.3189218044281006, "rewards/margins": 7.275005340576172, "rewards/rejected": -4.95608377456665, "step": 2759 }, { "epoch": 0.46, "grad_norm": 34.796024322509766, "learning_rate": 1.1771909915792229e-07, "logits/chosen": 2.0697696208953857, "logits/rejected": 2.4849231243133545, "logps/chosen": -46.38418197631836, "logps/rejected": -377.02392578125, "loss": 0.9715, "nll_loss": 0.9663372039794922, "rewards/accuracies": 1.0, "rewards/chosen": 4.030231952667236, "rewards/margins": 8.948890686035156, "rewards/rejected": -4.91865873336792, "step": 2760 }, { "epoch": 0.46016666666666667, "grad_norm": 35.28239440917969, "learning_rate": 1.1766597146541124e-07, "logits/chosen": 2.3761932849884033, "logits/rejected": 2.689762830734253, "logps/chosen": -55.08087158203125, "logps/rejected": -370.9295654296875, "loss": 0.9192, "nll_loss": 0.8884011507034302, "rewards/accuracies": 1.0, "rewards/chosen": 0.9170472025871277, "rewards/margins": 7.788842678070068, "rewards/rejected": -6.871795654296875, "step": 2761 }, { "epoch": 0.4603333333333333, "grad_norm": 22.549217224121094, "learning_rate": 1.1761283862546021e-07, "logits/chosen": 1.5592917203903198, "logits/rejected": 2.3625338077545166, "logps/chosen": -38.57706832885742, "logps/rejected": -192.23345947265625, "loss": 0.5126, "nll_loss": 0.5010009407997131, "rewards/accuracies": 1.0, "rewards/chosen": 1.9955974817276, "rewards/margins": 8.483856201171875, "rewards/rejected": -6.488258361816406, "step": 2762 }, { "epoch": 0.4605, "grad_norm": 38.49258804321289, "learning_rate": 1.1755970065355085e-07, "logits/chosen": 2.645501136779785, "logits/rejected": 2.8705646991729736, "logps/chosen": -22.266361236572266, "logps/rejected": -222.73883056640625, "loss": 0.8273, "nll_loss": 0.824679970741272, "rewards/accuracies": 1.0, "rewards/chosen": 4.469523906707764, "rewards/margins": 10.144119262695312, "rewards/rejected": -5.674594879150391, "step": 2763 }, { "epoch": 0.46066666666666667, "grad_norm": 105.54021453857422, "learning_rate": 1.1750655756516633e-07, "logits/chosen": 3.0357584953308105, "logits/rejected": 2.96321177482605, "logps/chosen": -61.085391998291016, "logps/rejected": -33.859825134277344, "loss": 2.169, "nll_loss": 2.036179542541504, "rewards/accuracies": 1.0, "rewards/chosen": 0.6902104020118713, "rewards/margins": 2.8422982692718506, "rewards/rejected": -2.152087926864624, "step": 2764 }, { "epoch": 0.4608333333333333, "grad_norm": 75.90201568603516, "learning_rate": 1.1745340937579118e-07, "logits/chosen": 1.708249807357788, "logits/rejected": 1.9523968696594238, "logps/chosen": -44.07616424560547, "logps/rejected": -156.32266235351562, "loss": 1.1944, "nll_loss": 1.1598989963531494, "rewards/accuracies": 1.0, "rewards/chosen": 0.9648365378379822, "rewards/margins": 5.8653645515441895, "rewards/rejected": -4.9005279541015625, "step": 2765 }, { "epoch": 0.461, "grad_norm": 27.8690185546875, "learning_rate": 1.1740025610091158e-07, "logits/chosen": 1.6254839897155762, "logits/rejected": 2.1175377368927, "logps/chosen": -94.96298217773438, "logps/rejected": -251.21287536621094, "loss": 1.0647, "nll_loss": 1.0435492992401123, "rewards/accuracies": 1.0, "rewards/chosen": 1.2953393459320068, "rewards/margins": 9.371842384338379, "rewards/rejected": -8.076502799987793, "step": 2766 }, { "epoch": 0.46116666666666667, "grad_norm": 39.06489181518555, "learning_rate": 1.1734709775601508e-07, "logits/chosen": 2.2376952171325684, "logits/rejected": 2.0791380405426025, "logps/chosen": -18.994640350341797, "logps/rejected": -123.48522186279297, "loss": 0.6086, "nll_loss": 0.5935825705528259, "rewards/accuracies": 1.0, "rewards/chosen": 1.7374776601791382, "rewards/margins": 7.916655540466309, "rewards/rejected": -6.179177761077881, "step": 2767 }, { "epoch": 0.4613333333333333, "grad_norm": 45.81684875488281, "learning_rate": 1.1729393435659076e-07, "logits/chosen": 2.2238729000091553, "logits/rejected": 2.2675352096557617, "logps/chosen": -41.45629119873047, "logps/rejected": -65.19699096679688, "loss": 1.0426, "nll_loss": 1.036407470703125, "rewards/accuracies": 1.0, "rewards/chosen": 3.5953853130340576, "rewards/margins": 8.441742897033691, "rewards/rejected": -4.846357822418213, "step": 2768 }, { "epoch": 0.4615, "grad_norm": 17.946535110473633, "learning_rate": 1.1724076591812917e-07, "logits/chosen": 2.513406276702881, "logits/rejected": 2.744305372238159, "logps/chosen": -91.05966186523438, "logps/rejected": -272.7140197753906, "loss": 0.6376, "nll_loss": 0.6279976963996887, "rewards/accuracies": 1.0, "rewards/chosen": 2.2241806983947754, "rewards/margins": 8.518187522888184, "rewards/rejected": -6.294006824493408, "step": 2769 }, { "epoch": 0.46166666666666667, "grad_norm": 39.46083068847656, "learning_rate": 1.1718759245612228e-07, "logits/chosen": 2.6417906284332275, "logits/rejected": 2.6749651432037354, "logps/chosen": -124.31565856933594, "logps/rejected": -116.1793441772461, "loss": 1.2231, "nll_loss": 1.183958649635315, "rewards/accuracies": 1.0, "rewards/chosen": 0.8361434936523438, "rewards/margins": 5.626435279846191, "rewards/rejected": -4.790291786193848, "step": 2770 }, { "epoch": 0.4618333333333333, "grad_norm": 18.321977615356445, "learning_rate": 1.1713441398606361e-07, "logits/chosen": 1.6262997388839722, "logits/rejected": 2.4146578311920166, "logps/chosen": -176.9017333984375, "logps/rejected": -346.3250732421875, "loss": 1.0255, "nll_loss": 1.0166765451431274, "rewards/accuracies": 1.0, "rewards/chosen": 2.2333710193634033, "rewards/margins": 9.501456260681152, "rewards/rejected": -7.26808500289917, "step": 2771 }, { "epoch": 0.462, "grad_norm": 27.65104103088379, "learning_rate": 1.1708123052344803e-07, "logits/chosen": 2.3331029415130615, "logits/rejected": 2.6845128536224365, "logps/chosen": -11.457099914550781, "logps/rejected": -201.08349609375, "loss": 0.3918, "nll_loss": 0.3695838451385498, "rewards/accuracies": 1.0, "rewards/chosen": 1.3202648162841797, "rewards/margins": 7.28394889831543, "rewards/rejected": -5.96368408203125, "step": 2772 }, { "epoch": 0.46216666666666667, "grad_norm": 320.33624267578125, "learning_rate": 1.1702804208377194e-07, "logits/chosen": 1.3832186460494995, "logits/rejected": 2.2597692012786865, "logps/chosen": -53.66175842285156, "logps/rejected": -147.95480346679688, "loss": 2.5379, "nll_loss": 0.8130568861961365, "rewards/accuracies": 0.0, "rewards/chosen": 1.2056525945663452, "rewards/margins": -1.0255242586135864, "rewards/rejected": 2.2311768531799316, "step": 2773 }, { "epoch": 0.4623333333333333, "grad_norm": 82.50496673583984, "learning_rate": 1.1697484868253326e-07, "logits/chosen": 0.8562636971473694, "logits/rejected": 2.0989573001861572, "logps/chosen": -59.417152404785156, "logps/rejected": -320.0768127441406, "loss": 1.7595, "nll_loss": 1.747563362121582, "rewards/accuracies": 1.0, "rewards/chosen": 1.8813972473144531, "rewards/margins": 13.818419456481934, "rewards/rejected": -11.93702220916748, "step": 2774 }, { "epoch": 0.4625, "grad_norm": 46.90553665161133, "learning_rate": 1.1692165033523116e-07, "logits/chosen": 2.328716993331909, "logits/rejected": 2.6021156311035156, "logps/chosen": -11.479870796203613, "logps/rejected": -119.48461151123047, "loss": 0.4642, "nll_loss": 0.44153350591659546, "rewards/accuracies": 1.0, "rewards/chosen": 2.0773913860321045, "rewards/margins": 5.894203186035156, "rewards/rejected": -3.8168115615844727, "step": 2775 }, { "epoch": 0.46266666666666667, "grad_norm": 17.561315536499023, "learning_rate": 1.1686844705736641e-07, "logits/chosen": 2.9510533809661865, "logits/rejected": 2.8308064937591553, "logps/chosen": -139.88818359375, "logps/rejected": -140.0294189453125, "loss": 0.7861, "nll_loss": 0.7686165571212769, "rewards/accuracies": 1.0, "rewards/chosen": 2.6945481300354004, "rewards/margins": 6.483142852783203, "rewards/rejected": -3.788594961166382, "step": 2776 }, { "epoch": 0.4628333333333333, "grad_norm": 34.97629928588867, "learning_rate": 1.1681523886444116e-07, "logits/chosen": 2.1951911449432373, "logits/rejected": 2.4512839317321777, "logps/chosen": -32.209999084472656, "logps/rejected": -119.41073608398438, "loss": 0.7611, "nll_loss": 0.685319185256958, "rewards/accuracies": 1.0, "rewards/chosen": 0.2385173887014389, "rewards/margins": 4.222221851348877, "rewards/rejected": -3.9837045669555664, "step": 2777 }, { "epoch": 0.463, "grad_norm": 144.6463623046875, "learning_rate": 1.16762025771959e-07, "logits/chosen": 3.096259593963623, "logits/rejected": 3.11908221244812, "logps/chosen": -40.81959915161133, "logps/rejected": -16.60114097595215, "loss": 3.4153, "nll_loss": 0.559172511100769, "rewards/accuracies": 0.0, "rewards/chosen": 1.2848598957061768, "rewards/margins": -2.3188459873199463, "rewards/rejected": 3.603705883026123, "step": 2778 }, { "epoch": 0.46316666666666667, "grad_norm": 28.91254234313965, "learning_rate": 1.1670880779542494e-07, "logits/chosen": 2.547173500061035, "logits/rejected": 2.4819819927215576, "logps/chosen": -87.3488540649414, "logps/rejected": -100.9536361694336, "loss": 0.9333, "nll_loss": 0.8823117613792419, "rewards/accuracies": 1.0, "rewards/chosen": 1.514692783355713, "rewards/margins": 4.452488422393799, "rewards/rejected": -2.937795639038086, "step": 2779 }, { "epoch": 0.4633333333333333, "grad_norm": 34.731651306152344, "learning_rate": 1.1665558495034545e-07, "logits/chosen": 2.301870346069336, "logits/rejected": 2.2984304428100586, "logps/chosen": -38.19142150878906, "logps/rejected": -124.00572204589844, "loss": 0.9669, "nll_loss": 0.9547854661941528, "rewards/accuracies": 1.0, "rewards/chosen": 2.2763078212738037, "rewards/margins": 7.228536605834961, "rewards/rejected": -4.952228546142578, "step": 2780 }, { "epoch": 0.4635, "grad_norm": 29.138290405273438, "learning_rate": 1.1660235725222833e-07, "logits/chosen": 2.793872117996216, "logits/rejected": 2.6875500679016113, "logps/chosen": -127.39215087890625, "logps/rejected": -96.82372283935547, "loss": 1.1969, "nll_loss": 1.1581103801727295, "rewards/accuracies": 1.0, "rewards/chosen": 1.7889313697814941, "rewards/margins": 4.945969581604004, "rewards/rejected": -3.157038450241089, "step": 2781 }, { "epoch": 0.46366666666666667, "grad_norm": 29.180831909179688, "learning_rate": 1.1654912471658293e-07, "logits/chosen": 2.0116233825683594, "logits/rejected": 2.3064091205596924, "logps/chosen": -85.68385314941406, "logps/rejected": -399.8466491699219, "loss": 0.9639, "nll_loss": 0.9415807127952576, "rewards/accuracies": 1.0, "rewards/chosen": 1.231116533279419, "rewards/margins": 10.658539772033691, "rewards/rejected": -9.427423477172852, "step": 2782 }, { "epoch": 0.4638333333333333, "grad_norm": 35.55800247192383, "learning_rate": 1.1649588735891986e-07, "logits/chosen": 2.8934566974639893, "logits/rejected": 2.794431209564209, "logps/chosen": -113.83755493164062, "logps/rejected": -65.67713165283203, "loss": 1.4483, "nll_loss": 1.4229693412780762, "rewards/accuracies": 1.0, "rewards/chosen": 1.2588615417480469, "rewards/margins": 6.5212721824646, "rewards/rejected": -5.262410640716553, "step": 2783 }, { "epoch": 0.464, "grad_norm": 38.25171661376953, "learning_rate": 1.1644264519475129e-07, "logits/chosen": 1.6591559648513794, "logits/rejected": 2.1328399181365967, "logps/chosen": -31.12212371826172, "logps/rejected": -128.87962341308594, "loss": 0.8211, "nll_loss": 0.7980031371116638, "rewards/accuracies": 1.0, "rewards/chosen": 1.2122634649276733, "rewards/margins": 8.411447525024414, "rewards/rejected": -7.199184417724609, "step": 2784 }, { "epoch": 0.46416666666666667, "grad_norm": 27.28510284423828, "learning_rate": 1.163893982395906e-07, "logits/chosen": 1.9498831033706665, "logits/rejected": 1.8958157300949097, "logps/chosen": -21.050737380981445, "logps/rejected": -96.98939514160156, "loss": 0.4665, "nll_loss": 0.45762476325035095, "rewards/accuracies": 1.0, "rewards/chosen": 2.6865484714508057, "rewards/margins": 7.747591018676758, "rewards/rejected": -5.061042785644531, "step": 2785 }, { "epoch": 0.4643333333333333, "grad_norm": 198.23141479492188, "learning_rate": 1.1633614650895279e-07, "logits/chosen": 2.4331448078155518, "logits/rejected": 1.6719520092010498, "logps/chosen": -122.85108947753906, "logps/rejected": -116.40402221679688, "loss": 1.5925, "nll_loss": 0.8082308769226074, "rewards/accuracies": 1.0, "rewards/chosen": 1.405720591545105, "rewards/margins": 0.5154152512550354, "rewards/rejected": 0.8903053402900696, "step": 2786 }, { "epoch": 0.4645, "grad_norm": 33.765438079833984, "learning_rate": 1.1628289001835403e-07, "logits/chosen": 2.255129814147949, "logits/rejected": 1.8998057842254639, "logps/chosen": -94.9151840209961, "logps/rejected": -67.6662826538086, "loss": 1.2439, "nll_loss": 1.2168614864349365, "rewards/accuracies": 1.0, "rewards/chosen": 1.6649116277694702, "rewards/margins": 5.609044075012207, "rewards/rejected": -3.9441323280334473, "step": 2787 }, { "epoch": 0.4646666666666667, "grad_norm": 26.88033676147461, "learning_rate": 1.1622962878331208e-07, "logits/chosen": 2.59417986869812, "logits/rejected": 2.6666994094848633, "logps/chosen": -57.05061340332031, "logps/rejected": -244.57791137695312, "loss": 0.7871, "nll_loss": 0.7709542512893677, "rewards/accuracies": 1.0, "rewards/chosen": 1.863433837890625, "rewards/margins": 6.898872375488281, "rewards/rejected": -5.035438537597656, "step": 2788 }, { "epoch": 0.4648333333333333, "grad_norm": 28.799701690673828, "learning_rate": 1.1617636281934591e-07, "logits/chosen": 0.2136571705341339, "logits/rejected": 1.9898210763931274, "logps/chosen": -80.53617095947266, "logps/rejected": -602.6710205078125, "loss": 1.0676, "nll_loss": 1.045924186706543, "rewards/accuracies": 1.0, "rewards/chosen": 1.2551796436309814, "rewards/margins": 10.965933799743652, "rewards/rejected": -9.71075439453125, "step": 2789 }, { "epoch": 0.465, "grad_norm": 31.9005126953125, "learning_rate": 1.1612309214197597e-07, "logits/chosen": 2.6012282371520996, "logits/rejected": 2.3117082118988037, "logps/chosen": -107.26066589355469, "logps/rejected": -16.58921241760254, "loss": 1.2284, "nll_loss": 0.8379738926887512, "rewards/accuracies": 1.0, "rewards/chosen": 5.211400508880615, "rewards/margins": 4.384452819824219, "rewards/rejected": 0.8269476294517517, "step": 2790 }, { "epoch": 0.4651666666666667, "grad_norm": 33.17339324951172, "learning_rate": 1.1606981676672407e-07, "logits/chosen": 2.1240181922912598, "logits/rejected": 2.187635660171509, "logps/chosen": -96.82852172851562, "logps/rejected": -237.40350341796875, "loss": 1.1204, "nll_loss": 1.100324273109436, "rewards/accuracies": 1.0, "rewards/chosen": 1.3400604724884033, "rewards/margins": 10.114184379577637, "rewards/rejected": -8.774124145507812, "step": 2791 }, { "epoch": 0.4653333333333333, "grad_norm": 15.677998542785645, "learning_rate": 1.1601653670911338e-07, "logits/chosen": 2.669025182723999, "logits/rejected": 2.7180137634277344, "logps/chosen": -162.3150634765625, "logps/rejected": -245.81961059570312, "loss": 0.7466, "nll_loss": 0.7377957701683044, "rewards/accuracies": 1.0, "rewards/chosen": 2.3234193325042725, "rewards/margins": 8.558016777038574, "rewards/rejected": -6.234597682952881, "step": 2792 }, { "epoch": 0.4655, "grad_norm": 168.48446655273438, "learning_rate": 1.1596325198466839e-07, "logits/chosen": 1.7221647500991821, "logits/rejected": 1.94028639793396, "logps/chosen": -37.69245910644531, "logps/rejected": -65.42350769042969, "loss": 1.7999, "nll_loss": 0.6853175163269043, "rewards/accuracies": 0.0, "rewards/chosen": 0.21499024331569672, "rewards/margins": -0.37400132417678833, "rewards/rejected": 0.5889915823936462, "step": 2793 }, { "epoch": 0.4656666666666667, "grad_norm": 34.176025390625, "learning_rate": 1.15909962608915e-07, "logits/chosen": 1.4523309469223022, "logits/rejected": 2.0909478664398193, "logps/chosen": -36.166812896728516, "logps/rejected": -211.52603149414062, "loss": 0.8836, "nll_loss": 0.8821173906326294, "rewards/accuracies": 1.0, "rewards/chosen": 4.3468122482299805, "rewards/margins": 11.418296813964844, "rewards/rejected": -7.0714850425720215, "step": 2794 }, { "epoch": 0.4658333333333333, "grad_norm": 21.469844818115234, "learning_rate": 1.158566685973805e-07, "logits/chosen": 2.5295794010162354, "logits/rejected": 2.84946346282959, "logps/chosen": -30.523967742919922, "logps/rejected": -333.00750732421875, "loss": 0.4369, "nll_loss": 0.42991501092910767, "rewards/accuracies": 1.0, "rewards/chosen": 2.43338942527771, "rewards/margins": 12.583849906921387, "rewards/rejected": -10.150460243225098, "step": 2795 }, { "epoch": 0.466, "grad_norm": 27.191669464111328, "learning_rate": 1.1580336996559342e-07, "logits/chosen": 2.5568249225616455, "logits/rejected": 2.7247262001037598, "logps/chosen": -114.86172485351562, "logps/rejected": -219.75137329101562, "loss": 1.1846, "nll_loss": 1.1486172676086426, "rewards/accuracies": 1.0, "rewards/chosen": 1.4264588356018066, "rewards/margins": 5.074265003204346, "rewards/rejected": -3.647806167602539, "step": 2796 }, { "epoch": 0.4661666666666667, "grad_norm": 31.77311134338379, "learning_rate": 1.1575006672908376e-07, "logits/chosen": 2.727069139480591, "logits/rejected": 2.82738995552063, "logps/chosen": -98.70437622070312, "logps/rejected": -309.12481689453125, "loss": 1.1984, "nll_loss": 1.1892093420028687, "rewards/accuracies": 1.0, "rewards/chosen": 2.512167453765869, "rewards/margins": 7.80153226852417, "rewards/rejected": -5.289364814758301, "step": 2797 }, { "epoch": 0.4663333333333333, "grad_norm": 37.075408935546875, "learning_rate": 1.1569675890338277e-07, "logits/chosen": 3.113435745239258, "logits/rejected": 3.2086007595062256, "logps/chosen": -89.10877227783203, "logps/rejected": -197.04415893554688, "loss": 0.9438, "nll_loss": 0.9282164573669434, "rewards/accuracies": 1.0, "rewards/chosen": 1.688871145248413, "rewards/margins": 7.865751266479492, "rewards/rejected": -6.1768798828125, "step": 2798 }, { "epoch": 0.4665, "grad_norm": 32.19673538208008, "learning_rate": 1.1564344650402309e-07, "logits/chosen": 2.0699427127838135, "logits/rejected": 1.997921109199524, "logps/chosen": -92.85377502441406, "logps/rejected": -69.59524536132812, "loss": 1.2122, "nll_loss": 1.1904329061508179, "rewards/accuracies": 1.0, "rewards/chosen": 1.8710527420043945, "rewards/margins": 6.008081436157227, "rewards/rejected": -4.137028694152832, "step": 2799 }, { "epoch": 0.4666666666666667, "grad_norm": 226.94720458984375, "learning_rate": 1.1559012954653864e-07, "logits/chosen": 2.7552740573883057, "logits/rejected": 2.746500253677368, "logps/chosen": -68.43428802490234, "logps/rejected": -92.3265151977539, "loss": 2.5295, "nll_loss": 0.7128570675849915, "rewards/accuracies": 0.0, "rewards/chosen": 1.456800103187561, "rewards/margins": -1.0858460664749146, "rewards/rejected": 2.5426461696624756, "step": 2800 }, { "epoch": 0.4668333333333333, "grad_norm": 51.9109992980957, "learning_rate": 1.1553680804646475e-07, "logits/chosen": 3.4159350395202637, "logits/rejected": 3.2188971042633057, "logps/chosen": -79.55797576904297, "logps/rejected": -33.030643463134766, "loss": 1.1067, "nll_loss": 0.9821972250938416, "rewards/accuracies": 1.0, "rewards/chosen": 1.1667442321777344, "rewards/margins": 3.053532123565674, "rewards/rejected": -1.8867878913879395, "step": 2801 }, { "epoch": 0.467, "grad_norm": 28.426746368408203, "learning_rate": 1.1548348201933798e-07, "logits/chosen": 0.9025616645812988, "logits/rejected": 2.0507121086120605, "logps/chosen": -82.65560913085938, "logps/rejected": -362.6705627441406, "loss": 1.1644, "nll_loss": 1.1479946374893188, "rewards/accuracies": 1.0, "rewards/chosen": 1.5476319789886475, "rewards/margins": 16.295278549194336, "rewards/rejected": -14.74764633178711, "step": 2802 }, { "epoch": 0.4671666666666667, "grad_norm": 33.74467086791992, "learning_rate": 1.1543015148069628e-07, "logits/chosen": 2.7875988483428955, "logits/rejected": 3.1314234733581543, "logps/chosen": -67.6286849975586, "logps/rejected": -180.1764678955078, "loss": 1.0266, "nll_loss": 1.0093835592269897, "rewards/accuracies": 1.0, "rewards/chosen": 1.7873550653457642, "rewards/margins": 6.787829399108887, "rewards/rejected": -5.000474452972412, "step": 2803 }, { "epoch": 0.4673333333333333, "grad_norm": 30.260562896728516, "learning_rate": 1.1537681644607886e-07, "logits/chosen": 2.0600979328155518, "logits/rejected": 2.271832227706909, "logps/chosen": -17.28385353088379, "logps/rejected": -165.5610809326172, "loss": 0.5533, "nll_loss": 0.5237531661987305, "rewards/accuracies": 1.0, "rewards/chosen": 1.5884040594100952, "rewards/margins": 5.435231685638428, "rewards/rejected": -3.846827745437622, "step": 2804 }, { "epoch": 0.4675, "grad_norm": 40.28974914550781, "learning_rate": 1.1532347693102631e-07, "logits/chosen": 1.9054155349731445, "logits/rejected": 1.6695654392242432, "logps/chosen": -35.62346649169922, "logps/rejected": -41.69987487792969, "loss": 0.793, "nll_loss": 0.7421555519104004, "rewards/accuracies": 1.0, "rewards/chosen": 1.5843571424484253, "rewards/margins": 4.473820686340332, "rewards/rejected": -2.8894636631011963, "step": 2805 }, { "epoch": 0.4676666666666667, "grad_norm": 30.951873779296875, "learning_rate": 1.1527013295108045e-07, "logits/chosen": 0.6393945217132568, "logits/rejected": 2.0675172805786133, "logps/chosen": -11.590747833251953, "logps/rejected": -316.79949951171875, "loss": 0.3279, "nll_loss": 0.28976869583129883, "rewards/accuracies": 1.0, "rewards/chosen": 0.6461673974990845, "rewards/margins": 9.798707008361816, "rewards/rejected": -9.152539253234863, "step": 2806 }, { "epoch": 0.4678333333333333, "grad_norm": 69.39287567138672, "learning_rate": 1.1521678452178447e-07, "logits/chosen": 2.136679172515869, "logits/rejected": 2.0472006797790527, "logps/chosen": -79.50814819335938, "logps/rejected": -31.768291473388672, "loss": 1.321, "nll_loss": 1.0064321756362915, "rewards/accuracies": 1.0, "rewards/chosen": 1.308435082435608, "rewards/margins": 1.860316514968872, "rewards/rejected": -0.5518814325332642, "step": 2807 }, { "epoch": 0.468, "grad_norm": 26.435665130615234, "learning_rate": 1.1516343165868278e-07, "logits/chosen": 1.5635877847671509, "logits/rejected": 2.6093220710754395, "logps/chosen": -41.11092758178711, "logps/rejected": -226.99342346191406, "loss": 0.6752, "nll_loss": 0.6525543928146362, "rewards/accuracies": 1.0, "rewards/chosen": 1.2056121826171875, "rewards/margins": 10.617939949035645, "rewards/rejected": -9.412327766418457, "step": 2808 }, { "epoch": 0.4681666666666667, "grad_norm": 24.463773727416992, "learning_rate": 1.1511007437732117e-07, "logits/chosen": 1.5685169696807861, "logits/rejected": 2.411259889602661, "logps/chosen": -18.453048706054688, "logps/rejected": -395.70513916015625, "loss": 0.4128, "nll_loss": 0.40115323662757874, "rewards/accuracies": 1.0, "rewards/chosen": 1.898934245109558, "rewards/margins": 18.946434020996094, "rewards/rejected": -17.047500610351562, "step": 2809 }, { "epoch": 0.4683333333333333, "grad_norm": 15.941925048828125, "learning_rate": 1.1505671269324662e-07, "logits/chosen": 2.213172674179077, "logits/rejected": 2.3410279750823975, "logps/chosen": -153.96910095214844, "logps/rejected": -428.6925048828125, "loss": 0.808, "nll_loss": 0.8061210513114929, "rewards/accuracies": 1.0, "rewards/chosen": 3.747903823852539, "rewards/margins": 13.287775039672852, "rewards/rejected": -9.539871215820312, "step": 2810 }, { "epoch": 0.4685, "grad_norm": 56.06699752807617, "learning_rate": 1.1500334662200749e-07, "logits/chosen": 1.629050612449646, "logits/rejected": 2.7442615032196045, "logps/chosen": -5.838408946990967, "logps/rejected": -200.50558471679688, "loss": 0.4754, "nll_loss": 0.4170292317867279, "rewards/accuracies": 1.0, "rewards/chosen": 0.5856961607933044, "rewards/margins": 4.516076564788818, "rewards/rejected": -3.930380344390869, "step": 2811 }, { "epoch": 0.4686666666666667, "grad_norm": 56.223419189453125, "learning_rate": 1.1494997617915332e-07, "logits/chosen": 2.988541841506958, "logits/rejected": 2.9183669090270996, "logps/chosen": -13.438166618347168, "logps/rejected": -33.11946105957031, "loss": 0.7065, "nll_loss": 0.47993454337120056, "rewards/accuracies": 1.0, "rewards/chosen": 0.7552618980407715, "rewards/margins": 2.104771614074707, "rewards/rejected": -1.349509835243225, "step": 2812 }, { "epoch": 0.4688333333333333, "grad_norm": 63.72279739379883, "learning_rate": 1.1489660138023503e-07, "logits/chosen": 2.2951877117156982, "logits/rejected": 2.302361488342285, "logps/chosen": -39.909812927246094, "logps/rejected": -71.59138488769531, "loss": 0.9335, "nll_loss": 0.928135097026825, "rewards/accuracies": 1.0, "rewards/chosen": 3.07281494140625, "rewards/margins": 8.80233383178711, "rewards/rejected": -5.729518413543701, "step": 2813 }, { "epoch": 0.469, "grad_norm": 50.32878112792969, "learning_rate": 1.1484322224080472e-07, "logits/chosen": 2.743040084838867, "logits/rejected": 2.7262558937072754, "logps/chosen": -33.029720306396484, "logps/rejected": -132.15036010742188, "loss": 1.0073, "nll_loss": 0.9437062740325928, "rewards/accuracies": 1.0, "rewards/chosen": 1.297323226928711, "rewards/margins": 4.063333511352539, "rewards/rejected": -2.7660105228424072, "step": 2814 }, { "epoch": 0.4691666666666667, "grad_norm": 35.899383544921875, "learning_rate": 1.1478983877641579e-07, "logits/chosen": 2.7369391918182373, "logits/rejected": 2.731792449951172, "logps/chosen": -30.937484741210938, "logps/rejected": -199.08969116210938, "loss": 0.7622, "nll_loss": 0.7194763422012329, "rewards/accuracies": 1.0, "rewards/chosen": 0.5475765466690063, "rewards/margins": 7.161741256713867, "rewards/rejected": -6.61416482925415, "step": 2815 }, { "epoch": 0.4693333333333333, "grad_norm": 78.36261749267578, "learning_rate": 1.1473645100262293e-07, "logits/chosen": 2.2645273208618164, "logits/rejected": 2.157205104827881, "logps/chosen": -95.22676849365234, "logps/rejected": -48.811824798583984, "loss": 1.6304, "nll_loss": 1.3412224054336548, "rewards/accuracies": 1.0, "rewards/chosen": 3.3860909938812256, "rewards/margins": 3.3347742557525635, "rewards/rejected": 0.05131683871150017, "step": 2816 }, { "epoch": 0.4695, "grad_norm": 26.223474502563477, "learning_rate": 1.1468305893498203e-07, "logits/chosen": 2.3966596126556396, "logits/rejected": 2.7313382625579834, "logps/chosen": -13.663077354431152, "logps/rejected": -251.020263671875, "loss": 0.4082, "nll_loss": 0.3795298635959625, "rewards/accuracies": 1.0, "rewards/chosen": 0.9498074650764465, "rewards/margins": 11.587037086486816, "rewards/rejected": -10.637229919433594, "step": 2817 }, { "epoch": 0.4696666666666667, "grad_norm": 22.841033935546875, "learning_rate": 1.1462966258905026e-07, "logits/chosen": 1.3093029260635376, "logits/rejected": 1.6583445072174072, "logps/chosen": -72.03759765625, "logps/rejected": -164.63768005371094, "loss": 0.719, "nll_loss": 0.692669153213501, "rewards/accuracies": 1.0, "rewards/chosen": 1.3235543966293335, "rewards/margins": 6.061226844787598, "rewards/rejected": -4.737672328948975, "step": 2818 }, { "epoch": 0.4698333333333333, "grad_norm": 35.739013671875, "learning_rate": 1.1457626198038603e-07, "logits/chosen": 1.8468375205993652, "logits/rejected": 2.67625093460083, "logps/chosen": -31.903013229370117, "logps/rejected": -438.0097351074219, "loss": 0.8201, "nll_loss": 0.7975753545761108, "rewards/accuracies": 1.0, "rewards/chosen": 1.2073614597320557, "rewards/margins": 10.58178424835205, "rewards/rejected": -9.374423027038574, "step": 2819 }, { "epoch": 0.47, "grad_norm": 35.14491653442383, "learning_rate": 1.1452285712454903e-07, "logits/chosen": 2.500656843185425, "logits/rejected": 2.429023027420044, "logps/chosen": -39.864723205566406, "logps/rejected": -79.75898742675781, "loss": 0.8974, "nll_loss": 0.8666243553161621, "rewards/accuracies": 1.0, "rewards/chosen": 4.065468788146973, "rewards/margins": 6.860835075378418, "rewards/rejected": -2.795366048812866, "step": 2820 }, { "epoch": 0.4701666666666667, "grad_norm": 100.4468765258789, "learning_rate": 1.144694480371001e-07, "logits/chosen": 2.6214375495910645, "logits/rejected": 2.4854001998901367, "logps/chosen": -51.60136413574219, "logps/rejected": -4.359287261962891, "loss": 1.749, "nll_loss": 0.6370538473129272, "rewards/accuracies": 0.0, "rewards/chosen": 1.5984926223754883, "rewards/margins": -0.03821396827697754, "rewards/rejected": 1.6367065906524658, "step": 2821 }, { "epoch": 0.4703333333333333, "grad_norm": 32.397823333740234, "learning_rate": 1.1441603473360139e-07, "logits/chosen": 2.386535406112671, "logits/rejected": 2.593045473098755, "logps/chosen": -55.556304931640625, "logps/rejected": -198.3291473388672, "loss": 0.8241, "nll_loss": 0.7936614155769348, "rewards/accuracies": 1.0, "rewards/chosen": 1.04241943359375, "rewards/margins": 6.276786804199219, "rewards/rejected": -5.234367370605469, "step": 2822 }, { "epoch": 0.4705, "grad_norm": 15.927347183227539, "learning_rate": 1.1436261722961626e-07, "logits/chosen": 2.666877269744873, "logits/rejected": 3.4835360050201416, "logps/chosen": -8.698970794677734, "logps/rejected": -212.27908325195312, "loss": 0.1993, "nll_loss": 0.19331048429012299, "rewards/accuracies": 1.0, "rewards/chosen": 2.730818748474121, "rewards/margins": 9.233213424682617, "rewards/rejected": -6.502394199371338, "step": 2823 }, { "epoch": 0.4706666666666667, "grad_norm": 31.295454025268555, "learning_rate": 1.143091955407093e-07, "logits/chosen": 1.9249762296676636, "logits/rejected": 2.108161687850952, "logps/chosen": -90.39960479736328, "logps/rejected": -135.59510803222656, "loss": 1.0612, "nll_loss": 1.0390759706497192, "rewards/accuracies": 1.0, "rewards/chosen": 1.6413367986679077, "rewards/margins": 6.129054546356201, "rewards/rejected": -4.487717628479004, "step": 2824 }, { "epoch": 0.4708333333333333, "grad_norm": 28.94449806213379, "learning_rate": 1.1425576968244626e-07, "logits/chosen": 2.574371814727783, "logits/rejected": 2.4549055099487305, "logps/chosen": -60.64244842529297, "logps/rejected": -105.18707275390625, "loss": 0.7448, "nll_loss": 0.7051446437835693, "rewards/accuracies": 1.0, "rewards/chosen": 1.3322381973266602, "rewards/margins": 4.894973278045654, "rewards/rejected": -3.562735080718994, "step": 2825 }, { "epoch": 0.471, "grad_norm": 47.1841926574707, "learning_rate": 1.1420233967039421e-07, "logits/chosen": 1.2492544651031494, "logits/rejected": 2.118583917617798, "logps/chosen": -126.39295959472656, "logps/rejected": -281.0684814453125, "loss": 1.4143, "nll_loss": 1.3304520845413208, "rewards/accuracies": 1.0, "rewards/chosen": -0.23202286660671234, "rewards/margins": 6.12035608291626, "rewards/rejected": -6.352378845214844, "step": 2826 }, { "epoch": 0.4711666666666667, "grad_norm": 38.54541015625, "learning_rate": 1.1414890552012133e-07, "logits/chosen": 2.0127670764923096, "logits/rejected": 2.810305118560791, "logps/chosen": -20.156539916992188, "logps/rejected": -225.51708984375, "loss": 0.6697, "nll_loss": 0.6298917531967163, "rewards/accuracies": 1.0, "rewards/chosen": 0.7123306393623352, "rewards/margins": 6.074888706207275, "rewards/rejected": -5.362557888031006, "step": 2827 }, { "epoch": 0.4713333333333333, "grad_norm": 49.814308166503906, "learning_rate": 1.1409546724719708e-07, "logits/chosen": 2.5102055072784424, "logits/rejected": 2.6499838829040527, "logps/chosen": -19.337055206298828, "logps/rejected": -130.797119140625, "loss": 0.8162, "nll_loss": 0.7734822034835815, "rewards/accuracies": 1.0, "rewards/chosen": 0.7149170637130737, "rewards/margins": 5.53416109085083, "rewards/rejected": -4.819243907928467, "step": 2828 }, { "epoch": 0.4715, "grad_norm": 34.42094421386719, "learning_rate": 1.1404202486719203e-07, "logits/chosen": 2.0348896980285645, "logits/rejected": 2.4710192680358887, "logps/chosen": -20.059925079345703, "logps/rejected": -510.4067077636719, "loss": 0.7463, "nll_loss": 0.7429600954055786, "rewards/accuracies": 1.0, "rewards/chosen": 3.219487190246582, "rewards/margins": 10.92686939239502, "rewards/rejected": -7.7073822021484375, "step": 2829 }, { "epoch": 0.4716666666666667, "grad_norm": 27.426958084106445, "learning_rate": 1.1398857839567811e-07, "logits/chosen": 1.311713695526123, "logits/rejected": 2.680793046951294, "logps/chosen": -31.690021514892578, "logps/rejected": -281.78076171875, "loss": 0.5429, "nll_loss": 0.5111293196678162, "rewards/accuracies": 1.0, "rewards/chosen": 0.8816081881523132, "rewards/margins": 7.2654128074646, "rewards/rejected": -6.383804798126221, "step": 2830 }, { "epoch": 0.4718333333333333, "grad_norm": 56.70554733276367, "learning_rate": 1.1393512784822823e-07, "logits/chosen": 3.244081974029541, "logits/rejected": 3.269808053970337, "logps/chosen": -30.317977905273438, "logps/rejected": -52.446903228759766, "loss": 0.9258, "nll_loss": 0.8194047808647156, "rewards/accuracies": 1.0, "rewards/chosen": 0.271505743265152, "rewards/margins": 3.2689170837402344, "rewards/rejected": -2.9974112510681152, "step": 2831 }, { "epoch": 0.472, "grad_norm": 40.29832458496094, "learning_rate": 1.1388167324041669e-07, "logits/chosen": 2.139615297317505, "logits/rejected": 2.086524486541748, "logps/chosen": -35.791290283203125, "logps/rejected": -50.71963119506836, "loss": 1.0074, "nll_loss": 0.9942023158073425, "rewards/accuracies": 1.0, "rewards/chosen": 3.4879395961761475, "rewards/margins": 7.3514933586120605, "rewards/rejected": -3.863553762435913, "step": 2832 }, { "epoch": 0.4721666666666667, "grad_norm": 32.216670989990234, "learning_rate": 1.1382821458781879e-07, "logits/chosen": 1.545786738395691, "logits/rejected": 1.4004218578338623, "logps/chosen": -29.950206756591797, "logps/rejected": -60.90754318237305, "loss": 0.5705, "nll_loss": 0.5076307058334351, "rewards/accuracies": 1.0, "rewards/chosen": 1.4977185726165771, "rewards/margins": 4.134014129638672, "rewards/rejected": -2.6362953186035156, "step": 2833 }, { "epoch": 0.4723333333333333, "grad_norm": 34.65019607543945, "learning_rate": 1.1377475190601118e-07, "logits/chosen": 2.569697380065918, "logits/rejected": 2.4209206104278564, "logps/chosen": -74.2171630859375, "logps/rejected": -190.30868530273438, "loss": 1.1115, "nll_loss": 1.107718825340271, "rewards/accuracies": 1.0, "rewards/chosen": 3.047945499420166, "rewards/margins": 12.102132797241211, "rewards/rejected": -9.054186820983887, "step": 2834 }, { "epoch": 0.4725, "grad_norm": 23.361276626586914, "learning_rate": 1.1372128521057153e-07, "logits/chosen": 1.5041124820709229, "logits/rejected": 2.2571613788604736, "logps/chosen": -28.420757293701172, "logps/rejected": -219.88467407226562, "loss": 0.4842, "nll_loss": 0.4736793339252472, "rewards/accuracies": 1.0, "rewards/chosen": 2.2006280422210693, "rewards/margins": 7.901183128356934, "rewards/rejected": -5.700555324554443, "step": 2835 }, { "epoch": 0.4726666666666667, "grad_norm": 23.510814666748047, "learning_rate": 1.1366781451707879e-07, "logits/chosen": 1.3265607357025146, "logits/rejected": 1.6239112615585327, "logps/chosen": -62.131317138671875, "logps/rejected": -92.5671615600586, "loss": 0.6578, "nll_loss": 0.615161657333374, "rewards/accuracies": 1.0, "rewards/chosen": 2.1735382080078125, "rewards/margins": 4.982667922973633, "rewards/rejected": -2.8091297149658203, "step": 2836 }, { "epoch": 0.4728333333333333, "grad_norm": 37.54029083251953, "learning_rate": 1.1361433984111306e-07, "logits/chosen": 3.20721697807312, "logits/rejected": 3.302622079849243, "logps/chosen": -18.781410217285156, "logps/rejected": -145.16213989257812, "loss": 0.68, "nll_loss": 0.6260471343994141, "rewards/accuracies": 1.0, "rewards/chosen": 0.6243492364883423, "rewards/margins": 4.711453437805176, "rewards/rejected": -4.087104320526123, "step": 2837 }, { "epoch": 0.473, "grad_norm": 37.73828125, "learning_rate": 1.1356086119825552e-07, "logits/chosen": 2.3009731769561768, "logits/rejected": 2.5012404918670654, "logps/chosen": -38.02265930175781, "logps/rejected": -223.14906311035156, "loss": 0.7699, "nll_loss": 0.7455424070358276, "rewards/accuracies": 1.0, "rewards/chosen": 1.1841068267822266, "rewards/margins": 7.401517391204834, "rewards/rejected": -6.217410564422607, "step": 2838 }, { "epoch": 0.4731666666666667, "grad_norm": 113.94324493408203, "learning_rate": 1.1350737860408863e-07, "logits/chosen": 2.8385393619537354, "logits/rejected": 3.018678903579712, "logps/chosen": -38.31094741821289, "logps/rejected": -173.22952270507812, "loss": 2.3107, "nll_loss": 2.253584861755371, "rewards/accuracies": 1.0, "rewards/chosen": 0.30577048659324646, "rewards/margins": 5.518063545227051, "rewards/rejected": -5.2122931480407715, "step": 2839 }, { "epoch": 0.47333333333333333, "grad_norm": 77.3326416015625, "learning_rate": 1.1345389207419586e-07, "logits/chosen": 1.4339730739593506, "logits/rejected": 2.3918285369873047, "logps/chosen": -12.019342422485352, "logps/rejected": -166.68743896484375, "loss": 0.8164, "nll_loss": 0.8012894988059998, "rewards/accuracies": 1.0, "rewards/chosen": 1.8488258123397827, "rewards/margins": 7.1859636306762695, "rewards/rejected": -5.337137699127197, "step": 2840 }, { "epoch": 0.4735, "grad_norm": 28.719078063964844, "learning_rate": 1.1340040162416195e-07, "logits/chosen": 3.314755439758301, "logits/rejected": 3.2536203861236572, "logps/chosen": -81.47117614746094, "logps/rejected": -279.080078125, "loss": 0.7851, "nll_loss": 0.7614129185676575, "rewards/accuracies": 1.0, "rewards/chosen": 1.4225631952285767, "rewards/margins": 6.268867492675781, "rewards/rejected": -4.846304416656494, "step": 2841 }, { "epoch": 0.4736666666666667, "grad_norm": 19.59304428100586, "learning_rate": 1.1334690726957272e-07, "logits/chosen": 1.5527008771896362, "logits/rejected": 1.5294615030288696, "logps/chosen": -207.020263671875, "logps/rejected": -252.54750061035156, "loss": 0.8525, "nll_loss": 0.8347590565681458, "rewards/accuracies": 1.0, "rewards/chosen": 1.47735595703125, "rewards/margins": 8.783164978027344, "rewards/rejected": -7.305809020996094, "step": 2842 }, { "epoch": 0.47383333333333333, "grad_norm": 47.35548400878906, "learning_rate": 1.1329340902601514e-07, "logits/chosen": 2.979680061340332, "logits/rejected": 3.0084621906280518, "logps/chosen": -55.541412353515625, "logps/rejected": -101.51095581054688, "loss": 1.1441, "nll_loss": 1.1108282804489136, "rewards/accuracies": 1.0, "rewards/chosen": 1.2897957563400269, "rewards/margins": 5.32347297668457, "rewards/rejected": -4.033677101135254, "step": 2843 }, { "epoch": 0.474, "grad_norm": 20.163022994995117, "learning_rate": 1.1323990690907732e-07, "logits/chosen": 2.5417070388793945, "logits/rejected": 2.4783213138580322, "logps/chosen": -78.60061645507812, "logps/rejected": -40.37576675415039, "loss": 0.6882, "nll_loss": 0.6442672610282898, "rewards/accuracies": 1.0, "rewards/chosen": 3.1692216396331787, "rewards/margins": 5.665034770965576, "rewards/rejected": -2.4958131313323975, "step": 2844 }, { "epoch": 0.4741666666666667, "grad_norm": 46.776100158691406, "learning_rate": 1.1318640093434848e-07, "logits/chosen": 2.6971583366394043, "logits/rejected": 2.5946130752563477, "logps/chosen": -104.65727233886719, "logps/rejected": -51.62735366821289, "loss": 1.9976, "nll_loss": 1.9746654033660889, "rewards/accuracies": 1.0, "rewards/chosen": 2.244391679763794, "rewards/margins": 5.909479141235352, "rewards/rejected": -3.6650872230529785, "step": 2845 }, { "epoch": 0.47433333333333333, "grad_norm": 42.30179977416992, "learning_rate": 1.1313289111741898e-07, "logits/chosen": 2.728065252304077, "logits/rejected": 2.2875993251800537, "logps/chosen": -50.01329040527344, "logps/rejected": -68.75352478027344, "loss": 1.0219, "nll_loss": 1.0002658367156982, "rewards/accuracies": 1.0, "rewards/chosen": 1.4638084173202515, "rewards/margins": 6.57425594329834, "rewards/rejected": -5.110447406768799, "step": 2846 }, { "epoch": 0.4745, "grad_norm": 21.623361587524414, "learning_rate": 1.1307937747388032e-07, "logits/chosen": 2.6039791107177734, "logits/rejected": 2.8691318035125732, "logps/chosen": -89.07688903808594, "logps/rejected": -411.41107177734375, "loss": 0.8744, "nll_loss": 0.8648242950439453, "rewards/accuracies": 1.0, "rewards/chosen": 2.4418563842773438, "rewards/margins": 7.7556657791137695, "rewards/rejected": -5.313809394836426, "step": 2847 }, { "epoch": 0.4746666666666667, "grad_norm": 34.90437698364258, "learning_rate": 1.1302586001932503e-07, "logits/chosen": 2.3377153873443604, "logits/rejected": 2.590394973754883, "logps/chosen": -42.7260627746582, "logps/rejected": -316.2659912109375, "loss": 0.8226, "nll_loss": 0.8061520457267761, "rewards/accuracies": 1.0, "rewards/chosen": 1.5286335945129395, "rewards/margins": 11.039987564086914, "rewards/rejected": -9.511354446411133, "step": 2848 }, { "epoch": 0.47483333333333333, "grad_norm": 221.9369659423828, "learning_rate": 1.1297233876934689e-07, "logits/chosen": 3.0203239917755127, "logits/rejected": 2.932138442993164, "logps/chosen": -47.14547348022461, "logps/rejected": -16.269058227539062, "loss": 4.9109, "nll_loss": 0.8895373940467834, "rewards/accuracies": 0.0, "rewards/chosen": 0.47620508074760437, "rewards/margins": -3.6752312183380127, "rewards/rejected": 4.1514363288879395, "step": 2849 }, { "epoch": 0.475, "grad_norm": 37.87919616699219, "learning_rate": 1.1291881373954064e-07, "logits/chosen": 2.7364954948425293, "logits/rejected": 2.7752504348754883, "logps/chosen": -19.18853187561035, "logps/rejected": -61.344757080078125, "loss": 0.7487, "nll_loss": 0.710686445236206, "rewards/accuracies": 1.0, "rewards/chosen": 4.189797401428223, "rewards/margins": 6.743824005126953, "rewards/rejected": -2.5540266036987305, "step": 2850 }, { "epoch": 0.4751666666666667, "grad_norm": 73.06995391845703, "learning_rate": 1.1286528494550224e-07, "logits/chosen": 2.497873306274414, "logits/rejected": 2.5087313652038574, "logps/chosen": -88.19047546386719, "logps/rejected": -182.9324951171875, "loss": 1.3818, "nll_loss": 1.1163352727890015, "rewards/accuracies": 1.0, "rewards/chosen": -1.872545599937439, "rewards/margins": 4.372373104095459, "rewards/rejected": -6.2449188232421875, "step": 2851 }, { "epoch": 0.47533333333333333, "grad_norm": 38.42603302001953, "learning_rate": 1.1281175240282865e-07, "logits/chosen": 2.086780071258545, "logits/rejected": 2.0050716400146484, "logps/chosen": -126.74781799316406, "logps/rejected": -144.39610290527344, "loss": 1.3339, "nll_loss": 1.2802809476852417, "rewards/accuracies": 1.0, "rewards/chosen": 0.32762300968170166, "rewards/margins": 6.040585517883301, "rewards/rejected": -5.712962627410889, "step": 2852 }, { "epoch": 0.4755, "grad_norm": 30.950374603271484, "learning_rate": 1.1275821612711802e-07, "logits/chosen": 1.8321107625961304, "logits/rejected": 1.913733720779419, "logps/chosen": -71.99552917480469, "logps/rejected": -106.58937072753906, "loss": 0.9425, "nll_loss": 0.9113358855247498, "rewards/accuracies": 1.0, "rewards/chosen": 1.0280853509902954, "rewards/margins": 6.1450371742248535, "rewards/rejected": -5.116951942443848, "step": 2853 }, { "epoch": 0.4756666666666667, "grad_norm": 50.96138000488281, "learning_rate": 1.1270467613396948e-07, "logits/chosen": 2.9205493927001953, "logits/rejected": 3.1784024238586426, "logps/chosen": -19.291034698486328, "logps/rejected": -209.13919067382812, "loss": 0.7144, "nll_loss": 0.6889654397964478, "rewards/accuracies": 1.0, "rewards/chosen": 1.3875858783721924, "rewards/margins": 6.047299385070801, "rewards/rejected": -4.6597137451171875, "step": 2854 }, { "epoch": 0.47583333333333333, "grad_norm": 26.48321533203125, "learning_rate": 1.1265113243898332e-07, "logits/chosen": 3.6554348468780518, "logits/rejected": 3.4218695163726807, "logps/chosen": -117.16930389404297, "logps/rejected": -148.80960083007812, "loss": 0.7597, "nll_loss": 0.7463013529777527, "rewards/accuracies": 1.0, "rewards/chosen": 1.775795817375183, "rewards/margins": 8.916227340698242, "rewards/rejected": -7.140431880950928, "step": 2855 }, { "epoch": 0.476, "grad_norm": 33.72891616821289, "learning_rate": 1.1259758505776091e-07, "logits/chosen": 2.9462926387786865, "logits/rejected": 3.3397252559661865, "logps/chosen": -19.825681686401367, "logps/rejected": -256.51361083984375, "loss": 0.642, "nll_loss": 0.6395381093025208, "rewards/accuracies": 1.0, "rewards/chosen": 3.670041799545288, "rewards/margins": 10.764662742614746, "rewards/rejected": -7.094621181488037, "step": 2856 }, { "epoch": 0.4761666666666667, "grad_norm": 29.769529342651367, "learning_rate": 1.1254403400590464e-07, "logits/chosen": 1.9695090055465698, "logits/rejected": 2.666893243789673, "logps/chosen": -70.14476013183594, "logps/rejected": -868.3621826171875, "loss": 0.9193, "nll_loss": 0.8992918133735657, "rewards/accuracies": 1.0, "rewards/chosen": 1.333723545074463, "rewards/margins": 9.348371505737305, "rewards/rejected": -8.0146484375, "step": 2857 }, { "epoch": 0.47633333333333333, "grad_norm": 81.15088653564453, "learning_rate": 1.1249047929901805e-07, "logits/chosen": 2.976287603378296, "logits/rejected": 2.8561971187591553, "logps/chosen": -80.240966796875, "logps/rejected": -117.05912780761719, "loss": 2.0336, "nll_loss": 1.866068959236145, "rewards/accuracies": 1.0, "rewards/chosen": -1.02362060546875, "rewards/margins": 3.9058737754821777, "rewards/rejected": -4.929494380950928, "step": 2858 }, { "epoch": 0.4765, "grad_norm": 27.017215728759766, "learning_rate": 1.1243692095270563e-07, "logits/chosen": 1.813664197921753, "logits/rejected": 2.0214970111846924, "logps/chosen": -7.426846504211426, "logps/rejected": -64.38796997070312, "loss": 0.3015, "nll_loss": 0.2856479585170746, "rewards/accuracies": 1.0, "rewards/chosen": 2.6857898235321045, "rewards/margins": 6.643370628356934, "rewards/rejected": -3.95758056640625, "step": 2859 }, { "epoch": 0.4766666666666667, "grad_norm": 26.44658851623535, "learning_rate": 1.1238335898257302e-07, "logits/chosen": 1.671875, "logits/rejected": 2.3516087532043457, "logps/chosen": -72.19566345214844, "logps/rejected": -206.67401123046875, "loss": 0.8121, "nll_loss": 0.7933589220046997, "rewards/accuracies": 1.0, "rewards/chosen": 1.4037415981292725, "rewards/margins": 9.16446590423584, "rewards/rejected": -7.7607245445251465, "step": 2860 }, { "epoch": 0.47683333333333333, "grad_norm": 35.966087341308594, "learning_rate": 1.123297934042269e-07, "logits/chosen": 1.0466907024383545, "logits/rejected": 1.6046364307403564, "logps/chosen": -54.131656646728516, "logps/rejected": -251.4949188232422, "loss": 0.7836, "nll_loss": 0.7518285512924194, "rewards/accuracies": 1.0, "rewards/chosen": 1.021677851676941, "rewards/margins": 6.025430202484131, "rewards/rejected": -5.0037522315979, "step": 2861 }, { "epoch": 0.477, "grad_norm": 36.257015228271484, "learning_rate": 1.12276224233275e-07, "logits/chosen": 2.427079439163208, "logits/rejected": 2.572693347930908, "logps/chosen": -83.42675018310547, "logps/rejected": -156.06179809570312, "loss": 1.0434, "nll_loss": 1.0173994302749634, "rewards/accuracies": 1.0, "rewards/chosen": 1.3481979370117188, "rewards/margins": 6.029544830322266, "rewards/rejected": -4.681346893310547, "step": 2862 }, { "epoch": 0.4771666666666667, "grad_norm": 29.701818466186523, "learning_rate": 1.1222265148532609e-07, "logits/chosen": 1.2128140926361084, "logits/rejected": 1.261335015296936, "logps/chosen": -120.63670349121094, "logps/rejected": -89.16438293457031, "loss": 1.0245, "nll_loss": 0.9888254404067993, "rewards/accuracies": 1.0, "rewards/chosen": 1.9589462280273438, "rewards/margins": 5.136099815368652, "rewards/rejected": -3.1771538257598877, "step": 2863 }, { "epoch": 0.47733333333333333, "grad_norm": 33.208213806152344, "learning_rate": 1.1216907517598995e-07, "logits/chosen": 2.13908314704895, "logits/rejected": 2.3069217205047607, "logps/chosen": -117.34622192382812, "logps/rejected": -224.32017517089844, "loss": 1.0443, "nll_loss": 1.0204020738601685, "rewards/accuracies": 1.0, "rewards/chosen": 1.1326980590820312, "rewards/margins": 9.936960220336914, "rewards/rejected": -8.804262161254883, "step": 2864 }, { "epoch": 0.4775, "grad_norm": 66.00040435791016, "learning_rate": 1.1211549532087747e-07, "logits/chosen": 1.3256211280822754, "logits/rejected": 2.2113072872161865, "logps/chosen": -43.69635009765625, "logps/rejected": -195.195068359375, "loss": 1.6362, "nll_loss": 1.618383526802063, "rewards/accuracies": 1.0, "rewards/chosen": 1.639485239982605, "rewards/margins": 6.994714736938477, "rewards/rejected": -5.355229377746582, "step": 2865 }, { "epoch": 0.4776666666666667, "grad_norm": 34.0257682800293, "learning_rate": 1.1206191193560053e-07, "logits/chosen": 2.5763728618621826, "logits/rejected": 2.5843732357025146, "logps/chosen": -32.23445510864258, "logps/rejected": -157.95855712890625, "loss": 0.6974, "nll_loss": 0.6858396530151367, "rewards/accuracies": 1.0, "rewards/chosen": 2.178626775741577, "rewards/margins": 7.518852233886719, "rewards/rejected": -5.3402252197265625, "step": 2866 }, { "epoch": 0.47783333333333333, "grad_norm": 24.118621826171875, "learning_rate": 1.12008325035772e-07, "logits/chosen": 1.8762742280960083, "logits/rejected": 1.7645025253295898, "logps/chosen": -62.063289642333984, "logps/rejected": -95.60418701171875, "loss": 0.7357, "nll_loss": 0.7216660976409912, "rewards/accuracies": 1.0, "rewards/chosen": 2.0439794063568115, "rewards/margins": 7.038544654846191, "rewards/rejected": -4.994565010070801, "step": 2867 }, { "epoch": 0.478, "grad_norm": 35.1197395324707, "learning_rate": 1.1195473463700589e-07, "logits/chosen": 2.1131536960601807, "logits/rejected": 2.127176523208618, "logps/chosen": -29.908618927001953, "logps/rejected": -62.97895812988281, "loss": 0.7757, "nll_loss": 0.7477154731750488, "rewards/accuracies": 1.0, "rewards/chosen": 1.2384175062179565, "rewards/margins": 5.98114538192749, "rewards/rejected": -4.742727756500244, "step": 2868 }, { "epoch": 0.4781666666666667, "grad_norm": 22.875324249267578, "learning_rate": 1.1190114075491708e-07, "logits/chosen": 1.9404298067092896, "logits/rejected": 1.6400997638702393, "logps/chosen": -41.1411247253418, "logps/rejected": -48.595680236816406, "loss": 0.6407, "nll_loss": 0.614046573638916, "rewards/accuracies": 1.0, "rewards/chosen": 2.0269267559051514, "rewards/margins": 5.612005233764648, "rewards/rejected": -3.585078239440918, "step": 2869 }, { "epoch": 0.47833333333333333, "grad_norm": 210.51902770996094, "learning_rate": 1.118475434051216e-07, "logits/chosen": 1.555876612663269, "logits/rejected": 1.2552685737609863, "logps/chosen": -127.39614868164062, "logps/rejected": -13.140933990478516, "loss": 3.0168, "nll_loss": 1.0705558061599731, "rewards/accuracies": 0.0, "rewards/chosen": 1.2349884510040283, "rewards/margins": -1.2899441719055176, "rewards/rejected": 2.524932622909546, "step": 2870 }, { "epoch": 0.4785, "grad_norm": 29.034894943237305, "learning_rate": 1.1179394260323638e-07, "logits/chosen": 1.6034562587738037, "logits/rejected": 2.3537862300872803, "logps/chosen": -42.68330764770508, "logps/rejected": -173.52809143066406, "loss": 0.7076, "nll_loss": 0.6884405016899109, "rewards/accuracies": 1.0, "rewards/chosen": 1.5732907056808472, "rewards/margins": 6.8342390060424805, "rewards/rejected": -5.260948181152344, "step": 2871 }, { "epoch": 0.4786666666666667, "grad_norm": 32.67533493041992, "learning_rate": 1.1174033836487944e-07, "logits/chosen": 1.4928765296936035, "logits/rejected": 1.1745328903198242, "logps/chosen": -107.18240356445312, "logps/rejected": -45.5919189453125, "loss": 0.9725, "nll_loss": 0.8643741011619568, "rewards/accuracies": 1.0, "rewards/chosen": 1.6655586957931519, "rewards/margins": 3.4782662391662598, "rewards/rejected": -1.8127076625823975, "step": 2872 }, { "epoch": 0.47883333333333333, "grad_norm": 30.67386245727539, "learning_rate": 1.1168673070566977e-07, "logits/chosen": 1.1620696783065796, "logits/rejected": 2.06658935546875, "logps/chosen": -52.31553268432617, "logps/rejected": -284.809814453125, "loss": 0.7663, "nll_loss": 0.7473646998405457, "rewards/accuracies": 1.0, "rewards/chosen": 1.3780994415283203, "rewards/margins": 10.815013885498047, "rewards/rejected": -9.436914443969727, "step": 2873 }, { "epoch": 0.479, "grad_norm": 20.868942260742188, "learning_rate": 1.1163311964122733e-07, "logits/chosen": 2.105255365371704, "logits/rejected": 2.78421688079834, "logps/chosen": -49.164241790771484, "logps/rejected": -668.4544067382812, "loss": 0.51, "nll_loss": 0.49164241552352905, "rewards/accuracies": 1.0, "rewards/chosen": 1.4549617767333984, "rewards/margins": 8.15008544921875, "rewards/rejected": -6.695123195648193, "step": 2874 }, { "epoch": 0.4791666666666667, "grad_norm": 24.92414093017578, "learning_rate": 1.115795051871731e-07, "logits/chosen": 2.0337462425231934, "logits/rejected": 1.9501221179962158, "logps/chosen": -82.0462646484375, "logps/rejected": -85.72388458251953, "loss": 0.7695, "nll_loss": 0.7260730862617493, "rewards/accuracies": 1.0, "rewards/chosen": 1.7296028137207031, "rewards/margins": 4.763036727905273, "rewards/rejected": -3.0334339141845703, "step": 2875 }, { "epoch": 0.47933333333333333, "grad_norm": 37.12194061279297, "learning_rate": 1.115258873591291e-07, "logits/chosen": 3.3304195404052734, "logits/rejected": 3.484035015106201, "logps/chosen": -33.2971305847168, "logps/rejected": -335.8144226074219, "loss": 0.6118, "nll_loss": 0.5841602087020874, "rewards/accuracies": 1.0, "rewards/chosen": 1.0027878284454346, "rewards/margins": 7.905082702636719, "rewards/rejected": -6.902295112609863, "step": 2876 }, { "epoch": 0.4795, "grad_norm": 31.466859817504883, "learning_rate": 1.1147226617271819e-07, "logits/chosen": 3.7539477348327637, "logits/rejected": 3.817866802215576, "logps/chosen": -53.033721923828125, "logps/rejected": -132.3941192626953, "loss": 0.8038, "nll_loss": 0.7469537854194641, "rewards/accuracies": 1.0, "rewards/chosen": 1.1516326665878296, "rewards/margins": 4.238032341003418, "rewards/rejected": -3.086399793624878, "step": 2877 }, { "epoch": 0.4796666666666667, "grad_norm": 17.918481826782227, "learning_rate": 1.1141864164356437e-07, "logits/chosen": 2.5664126873016357, "logits/rejected": 2.698249340057373, "logps/chosen": -106.4451675415039, "logps/rejected": -65.25137329101562, "loss": 0.6518, "nll_loss": 0.6336022615432739, "rewards/accuracies": 1.0, "rewards/chosen": 2.078218936920166, "rewards/margins": 6.320109844207764, "rewards/rejected": -4.241890907287598, "step": 2878 }, { "epoch": 0.47983333333333333, "grad_norm": 52.16718292236328, "learning_rate": 1.1136501378729247e-07, "logits/chosen": 3.6270692348480225, "logits/rejected": 3.5404348373413086, "logps/chosen": -37.26785659790039, "logps/rejected": -46.350128173828125, "loss": 1.076, "nll_loss": 0.9555860757827759, "rewards/accuracies": 1.0, "rewards/chosen": 0.8330173492431641, "rewards/margins": 3.0142180919647217, "rewards/rejected": -2.1812007427215576, "step": 2879 }, { "epoch": 0.48, "grad_norm": 56.05906295776367, "learning_rate": 1.1131138261952844e-07, "logits/chosen": 2.1702980995178223, "logits/rejected": 2.0426151752471924, "logps/chosen": -27.687931060791016, "logps/rejected": -12.717756271362305, "loss": 1.8622, "nll_loss": 0.6921983361244202, "rewards/accuracies": 1.0, "rewards/chosen": 4.972671985626221, "rewards/margins": 1.1599040031433105, "rewards/rejected": 3.81276798248291, "step": 2880 }, { "epoch": 0.4801666666666667, "grad_norm": 84.88748931884766, "learning_rate": 1.1125774815589904e-07, "logits/chosen": 1.926596999168396, "logits/rejected": 2.3068532943725586, "logps/chosen": -7.247638702392578, "logps/rejected": -348.51861572265625, "loss": 0.6174, "nll_loss": 0.6039699912071228, "rewards/accuracies": 1.0, "rewards/chosen": 1.7863489389419556, "rewards/margins": 8.55212688446045, "rewards/rejected": -6.765777587890625, "step": 2881 }, { "epoch": 0.48033333333333333, "grad_norm": 33.91972351074219, "learning_rate": 1.1120411041203213e-07, "logits/chosen": 3.0462024211883545, "logits/rejected": 3.043360710144043, "logps/chosen": -31.401714324951172, "logps/rejected": -84.23949432373047, "loss": 0.7261, "nll_loss": 0.6978158950805664, "rewards/accuracies": 1.0, "rewards/chosen": 2.2312943935394287, "rewards/margins": 5.58745813369751, "rewards/rejected": -3.356163740158081, "step": 2882 }, { "epoch": 0.4805, "grad_norm": 64.3201675415039, "learning_rate": 1.1115046940355641e-07, "logits/chosen": 2.6412432193756104, "logits/rejected": 2.747612953186035, "logps/chosen": -72.5937271118164, "logps/rejected": -208.83143615722656, "loss": 2.0797, "nll_loss": 2.074106454849243, "rewards/accuracies": 1.0, "rewards/chosen": 2.7952468395233154, "rewards/margins": 9.332883834838867, "rewards/rejected": -6.537637233734131, "step": 2883 }, { "epoch": 0.4806666666666667, "grad_norm": 93.18611145019531, "learning_rate": 1.1109682514610161e-07, "logits/chosen": 1.8296693563461304, "logits/rejected": 1.7269611358642578, "logps/chosen": -11.288804054260254, "logps/rejected": -27.650754928588867, "loss": 1.0639, "nll_loss": 0.37629351019859314, "rewards/accuracies": 1.0, "rewards/chosen": 1.1960946321487427, "rewards/margins": 0.6685473322868347, "rewards/rejected": 0.527547299861908, "step": 2884 }, { "epoch": 0.48083333333333333, "grad_norm": 23.99264907836914, "learning_rate": 1.1104317765529837e-07, "logits/chosen": 1.4399175643920898, "logits/rejected": 1.8913015127182007, "logps/chosen": -87.56233978271484, "logps/rejected": -280.40887451171875, "loss": 0.8574, "nll_loss": 0.8501198291778564, "rewards/accuracies": 1.0, "rewards/chosen": 2.444373369216919, "rewards/margins": 9.356037139892578, "rewards/rejected": -6.911664009094238, "step": 2885 }, { "epoch": 0.481, "grad_norm": 27.928112030029297, "learning_rate": 1.1098952694677828e-07, "logits/chosen": 2.3555827140808105, "logits/rejected": 2.330789089202881, "logps/chosen": -31.685684204101562, "logps/rejected": -268.4332580566406, "loss": 0.6508, "nll_loss": 0.6466466188430786, "rewards/accuracies": 1.0, "rewards/chosen": 5.377903938293457, "rewards/margins": 10.274372100830078, "rewards/rejected": -4.896467685699463, "step": 2886 }, { "epoch": 0.4811666666666667, "grad_norm": 57.629661560058594, "learning_rate": 1.1093587303617389e-07, "logits/chosen": 2.2755489349365234, "logits/rejected": 2.3585309982299805, "logps/chosen": -33.05178451538086, "logps/rejected": -69.36557006835938, "loss": 1.1338, "nll_loss": 1.0661864280700684, "rewards/accuracies": 1.0, "rewards/chosen": 1.5385288000106812, "rewards/margins": 4.049576282501221, "rewards/rejected": -2.51104736328125, "step": 2887 }, { "epoch": 0.48133333333333334, "grad_norm": 27.014787673950195, "learning_rate": 1.1088221593911862e-07, "logits/chosen": 1.1426931619644165, "logits/rejected": 2.1378047466278076, "logps/chosen": -58.21247863769531, "logps/rejected": -260.0132751464844, "loss": 0.7205, "nll_loss": 0.7186725735664368, "rewards/accuracies": 1.0, "rewards/chosen": 3.81946063041687, "rewards/margins": 12.222776412963867, "rewards/rejected": -8.403315544128418, "step": 2888 }, { "epoch": 0.4815, "grad_norm": 108.53685760498047, "learning_rate": 1.1082855567124691e-07, "logits/chosen": 2.769808053970337, "logits/rejected": 2.9664580821990967, "logps/chosen": -34.236595153808594, "logps/rejected": -215.52438354492188, "loss": 2.0472, "nll_loss": 2.0139174461364746, "rewards/accuracies": 1.0, "rewards/chosen": 0.9391822814941406, "rewards/margins": 6.095957279205322, "rewards/rejected": -5.156774997711182, "step": 2889 }, { "epoch": 0.4816666666666667, "grad_norm": 88.33419799804688, "learning_rate": 1.1077489224819401e-07, "logits/chosen": 1.4437949657440186, "logits/rejected": 1.2778401374816895, "logps/chosen": -85.60850524902344, "logps/rejected": -8.449080467224121, "loss": 1.5779, "nll_loss": 0.7926713228225708, "rewards/accuracies": 1.0, "rewards/chosen": 2.073155164718628, "rewards/margins": 0.7625397443771362, "rewards/rejected": 1.3106154203414917, "step": 2890 }, { "epoch": 0.48183333333333334, "grad_norm": 25.707443237304688, "learning_rate": 1.1072122568559623e-07, "logits/chosen": 2.1734352111816406, "logits/rejected": 2.1654670238494873, "logps/chosen": -130.01568603515625, "logps/rejected": -156.33767700195312, "loss": 1.0904, "nll_loss": 1.0745099782943726, "rewards/accuracies": 1.0, "rewards/chosen": 1.930189609527588, "rewards/margins": 6.794960021972656, "rewards/rejected": -4.864770412445068, "step": 2891 }, { "epoch": 0.482, "grad_norm": 33.16899871826172, "learning_rate": 1.1066755599909063e-07, "logits/chosen": 3.09871506690979, "logits/rejected": 3.1786930561065674, "logps/chosen": -10.989755630493164, "logps/rejected": -142.06857299804688, "loss": 0.4225, "nll_loss": 0.4070279598236084, "rewards/accuracies": 1.0, "rewards/chosen": 2.0186784267425537, "rewards/margins": 6.770781517028809, "rewards/rejected": -4.752103328704834, "step": 2892 }, { "epoch": 0.4821666666666667, "grad_norm": 20.90483856201172, "learning_rate": 1.1061388320431537e-07, "logits/chosen": 1.6879355907440186, "logits/rejected": 1.7544591426849365, "logps/chosen": -157.53561401367188, "logps/rejected": -143.3924560546875, "loss": 0.9289, "nll_loss": 0.9159047603607178, "rewards/accuracies": 1.0, "rewards/chosen": 2.180502414703369, "rewards/margins": 7.115761756896973, "rewards/rejected": -4.9352593421936035, "step": 2893 }, { "epoch": 0.48233333333333334, "grad_norm": 31.621156692504883, "learning_rate": 1.1056020731690932e-07, "logits/chosen": 3.1356923580169678, "logits/rejected": 3.078138589859009, "logps/chosen": -66.66383361816406, "logps/rejected": -66.28996276855469, "loss": 0.8035, "nll_loss": 0.7936172485351562, "rewards/accuracies": 1.0, "rewards/chosen": 2.235321044921875, "rewards/margins": 8.124540328979492, "rewards/rejected": -5.889219760894775, "step": 2894 }, { "epoch": 0.4825, "grad_norm": 36.27315902709961, "learning_rate": 1.105065283525124e-07, "logits/chosen": 2.7438712120056152, "logits/rejected": 2.713263750076294, "logps/chosen": -36.31605529785156, "logps/rejected": -237.12811279296875, "loss": 0.7225, "nll_loss": 0.6983855962753296, "rewards/accuracies": 1.0, "rewards/chosen": 1.3418331146240234, "rewards/margins": 6.387765884399414, "rewards/rejected": -5.045932769775391, "step": 2895 }, { "epoch": 0.4826666666666667, "grad_norm": 33.74943923950195, "learning_rate": 1.1045284632676535e-07, "logits/chosen": 3.7105584144592285, "logits/rejected": 3.727947950363159, "logps/chosen": -50.123233795166016, "logps/rejected": -160.51791381835938, "loss": 0.856, "nll_loss": 0.8353872895240784, "rewards/accuracies": 1.0, "rewards/chosen": 1.3601082563400269, "rewards/margins": 7.532408237457275, "rewards/rejected": -6.172299861907959, "step": 2896 }, { "epoch": 0.48283333333333334, "grad_norm": 25.885929107666016, "learning_rate": 1.1039916125530984e-07, "logits/chosen": 1.7167178392410278, "logits/rejected": 2.3222434520721436, "logps/chosen": -44.21977996826172, "logps/rejected": -371.2220458984375, "loss": 0.601, "nll_loss": 0.5818393230438232, "rewards/accuracies": 1.0, "rewards/chosen": 1.3869072198867798, "rewards/margins": 8.726962089538574, "rewards/rejected": -7.340054512023926, "step": 2897 }, { "epoch": 0.483, "grad_norm": 43.09560012817383, "learning_rate": 1.1034547315378837e-07, "logits/chosen": 1.3282924890518188, "logits/rejected": 1.2524521350860596, "logps/chosen": -74.14061737060547, "logps/rejected": -147.10159301757812, "loss": 1.1649, "nll_loss": 1.140625, "rewards/accuracies": 1.0, "rewards/chosen": 1.313684105873108, "rewards/margins": 6.44986629486084, "rewards/rejected": -5.1361823081970215, "step": 2898 }, { "epoch": 0.4831666666666667, "grad_norm": 29.183589935302734, "learning_rate": 1.1029178203784442e-07, "logits/chosen": 1.9534434080123901, "logits/rejected": 2.504382371902466, "logps/chosen": -139.81024169921875, "logps/rejected": -504.43841552734375, "loss": 1.4669, "nll_loss": 1.4563565254211426, "rewards/accuracies": 1.0, "rewards/chosen": 1.9773178100585938, "rewards/margins": 12.385498046875, "rewards/rejected": -10.408180236816406, "step": 2899 }, { "epoch": 0.48333333333333334, "grad_norm": 222.39344787597656, "learning_rate": 1.1023808792312226e-07, "logits/chosen": 1.2762887477874756, "logits/rejected": 1.4598612785339355, "logps/chosen": -59.6800422668457, "logps/rejected": -69.68732452392578, "loss": 2.3677, "nll_loss": 0.9042431116104126, "rewards/accuracies": 0.0, "rewards/chosen": 1.385178804397583, "rewards/margins": -0.6276593208312988, "rewards/rejected": 2.012838125228882, "step": 2900 }, { "epoch": 0.4835, "grad_norm": 27.203054428100586, "learning_rate": 1.1018439082526707e-07, "logits/chosen": 1.5767794847488403, "logits/rejected": 2.17276930809021, "logps/chosen": -102.13240051269531, "logps/rejected": -174.44290161132812, "loss": 0.9499, "nll_loss": 0.9201117157936096, "rewards/accuracies": 1.0, "rewards/chosen": 1.123693823814392, "rewards/margins": 6.0022149085998535, "rewards/rejected": -4.878520965576172, "step": 2901 }, { "epoch": 0.4836666666666667, "grad_norm": 77.20049285888672, "learning_rate": 1.101306907599249e-07, "logits/chosen": 2.1280910968780518, "logits/rejected": 2.0025506019592285, "logps/chosen": -37.684322357177734, "logps/rejected": -46.80833435058594, "loss": 0.9778, "nll_loss": 0.6729344725608826, "rewards/accuracies": 1.0, "rewards/chosen": 1.4554554224014282, "rewards/margins": 1.9813563823699951, "rewards/rejected": -0.5259010195732117, "step": 2902 }, { "epoch": 0.48383333333333334, "grad_norm": 22.562286376953125, "learning_rate": 1.1007698774274266e-07, "logits/chosen": 3.1074154376983643, "logits/rejected": 2.722607135772705, "logps/chosen": -191.34988403320312, "logps/rejected": -76.8389892578125, "loss": 1.1058, "nll_loss": 1.0934277772903442, "rewards/accuracies": 1.0, "rewards/chosen": 2.2205779552459717, "rewards/margins": 7.208453178405762, "rewards/rejected": -4.987875461578369, "step": 2903 }, { "epoch": 0.484, "grad_norm": 113.98564910888672, "learning_rate": 1.100232817893681e-07, "logits/chosen": 3.070866823196411, "logits/rejected": 3.1481845378875732, "logps/chosen": -95.08219146728516, "logps/rejected": -117.99993896484375, "loss": 1.3874, "nll_loss": 0.9414077401161194, "rewards/accuracies": 1.0, "rewards/chosen": 0.8955055475234985, "rewards/margins": 1.2195923328399658, "rewards/rejected": -0.3240867853164673, "step": 2904 }, { "epoch": 0.4841666666666667, "grad_norm": 42.02065658569336, "learning_rate": 1.0996957291544991e-07, "logits/chosen": 2.2551660537719727, "logits/rejected": 2.390383005142212, "logps/chosen": -16.837982177734375, "logps/rejected": -89.21192169189453, "loss": 0.5994, "nll_loss": 0.5806199908256531, "rewards/accuracies": 1.0, "rewards/chosen": 1.6626865863800049, "rewards/margins": 6.649543762207031, "rewards/rejected": -4.9868574142456055, "step": 2905 }, { "epoch": 0.48433333333333334, "grad_norm": 60.55422592163086, "learning_rate": 1.0991586113663747e-07, "logits/chosen": 2.342571973800659, "logits/rejected": 2.0015816688537598, "logps/chosen": -53.29175567626953, "logps/rejected": -53.0625, "loss": 1.2639, "nll_loss": 1.087586760520935, "rewards/accuracies": 1.0, "rewards/chosen": 0.41361886262893677, "rewards/margins": 2.3935678005218506, "rewards/rejected": -1.979948878288269, "step": 2906 }, { "epoch": 0.4845, "grad_norm": 37.19847869873047, "learning_rate": 1.0986214646858114e-07, "logits/chosen": 2.1984148025512695, "logits/rejected": 2.518519401550293, "logps/chosen": -19.87980079650879, "logps/rejected": -215.15902709960938, "loss": 0.5032, "nll_loss": 0.4848731756210327, "rewards/accuracies": 1.0, "rewards/chosen": 2.475057363510132, "rewards/margins": 6.343286991119385, "rewards/rejected": -3.868229627609253, "step": 2907 }, { "epoch": 0.4846666666666667, "grad_norm": 30.894540786743164, "learning_rate": 1.0980842892693214e-07, "logits/chosen": 2.4011569023132324, "logits/rejected": 2.584178924560547, "logps/chosen": -25.681591033935547, "logps/rejected": -170.65110778808594, "loss": 0.6354, "nll_loss": 0.6114664673805237, "rewards/accuracies": 1.0, "rewards/chosen": 1.406323790550232, "rewards/margins": 6.22855281829834, "rewards/rejected": -4.822228908538818, "step": 2908 }, { "epoch": 0.48483333333333334, "grad_norm": 21.67348289489746, "learning_rate": 1.0975470852734242e-07, "logits/chosen": 2.5803844928741455, "logits/rejected": 2.8919732570648193, "logps/chosen": -83.38761901855469, "logps/rejected": -358.626220703125, "loss": 0.7627, "nll_loss": 0.7512399554252625, "rewards/accuracies": 1.0, "rewards/chosen": 1.9240401983261108, "rewards/margins": 9.367399215698242, "rewards/rejected": -7.443359375, "step": 2909 }, { "epoch": 0.485, "grad_norm": 46.267616271972656, "learning_rate": 1.097009852854648e-07, "logits/chosen": 2.543757677078247, "logits/rejected": 2.5863399505615234, "logps/chosen": -15.412199020385742, "logps/rejected": -79.25051879882812, "loss": 0.5767, "nll_loss": 0.5708222389221191, "rewards/accuracies": 1.0, "rewards/chosen": 4.257596492767334, "rewards/margins": 8.939410209655762, "rewards/rejected": -4.681813716888428, "step": 2910 }, { "epoch": 0.4851666666666667, "grad_norm": 24.762954711914062, "learning_rate": 1.0964725921695299e-07, "logits/chosen": 1.8977320194244385, "logits/rejected": 2.807497501373291, "logps/chosen": -90.71249389648438, "logps/rejected": -391.4705810546875, "loss": 0.8534, "nll_loss": 0.8322244882583618, "rewards/accuracies": 1.0, "rewards/chosen": 1.4321885108947754, "rewards/margins": 6.77700662612915, "rewards/rejected": -5.344818115234375, "step": 2911 }, { "epoch": 0.48533333333333334, "grad_norm": 61.47359085083008, "learning_rate": 1.0959353033746146e-07, "logits/chosen": 0.7834421396255493, "logits/rejected": 2.5425491333007812, "logps/chosen": -15.182580947875977, "logps/rejected": -322.9701843261719, "loss": 0.8019, "nll_loss": 0.7990832328796387, "rewards/accuracies": 1.0, "rewards/chosen": 3.42236328125, "rewards/margins": 11.007452011108398, "rewards/rejected": -7.585089206695557, "step": 2912 }, { "epoch": 0.4855, "grad_norm": 27.067981719970703, "learning_rate": 1.0953979866264546e-07, "logits/chosen": 0.19272571802139282, "logits/rejected": 1.0190551280975342, "logps/chosen": -51.296287536621094, "logps/rejected": -371.4695129394531, "loss": 0.7295, "nll_loss": 0.7224829196929932, "rewards/accuracies": 1.0, "rewards/chosen": 2.4064743518829346, "rewards/margins": 11.625971794128418, "rewards/rejected": -9.219497680664062, "step": 2913 }, { "epoch": 0.4856666666666667, "grad_norm": 41.31174087524414, "learning_rate": 1.0948606420816122e-07, "logits/chosen": 2.618372917175293, "logits/rejected": 2.6253790855407715, "logps/chosen": -47.770145416259766, "logps/rejected": -68.8963623046875, "loss": 0.9732, "nll_loss": 0.9554028511047363, "rewards/accuracies": 1.0, "rewards/chosen": 1.6881229877471924, "rewards/margins": 6.812853813171387, "rewards/rejected": -5.124731063842773, "step": 2914 }, { "epoch": 0.48583333333333334, "grad_norm": 179.34825134277344, "learning_rate": 1.0943232698966555e-07, "logits/chosen": 3.1275265216827393, "logits/rejected": 2.9602999687194824, "logps/chosen": -67.76289367675781, "logps/rejected": -36.571075439453125, "loss": 3.2847, "nll_loss": 0.8067010045051575, "rewards/accuracies": 0.0, "rewards/chosen": 3.1090517044067383, "rewards/margins": -1.5036344528198242, "rewards/rejected": 4.6126861572265625, "step": 2915 }, { "epoch": 0.486, "grad_norm": 32.07888412475586, "learning_rate": 1.093785870228163e-07, "logits/chosen": 3.2250490188598633, "logits/rejected": 3.4372522830963135, "logps/chosen": -128.7946014404297, "logps/rejected": -333.9638366699219, "loss": 1.3058, "nll_loss": 1.2751941680908203, "rewards/accuracies": 1.0, "rewards/chosen": 0.8775039911270142, "rewards/margins": 8.350709915161133, "rewards/rejected": -7.473206043243408, "step": 2916 }, { "epoch": 0.4861666666666667, "grad_norm": 21.51180648803711, "learning_rate": 1.0932484432327191e-07, "logits/chosen": 2.7193827629089355, "logits/rejected": 2.768944263458252, "logps/chosen": -60.515159606933594, "logps/rejected": -127.4715805053711, "loss": 0.6222, "nll_loss": 0.6051515340805054, "rewards/accuracies": 1.0, "rewards/chosen": 1.8956917524337769, "rewards/margins": 6.598578929901123, "rewards/rejected": -4.702887058258057, "step": 2917 }, { "epoch": 0.48633333333333334, "grad_norm": 57.847862243652344, "learning_rate": 1.0927109890669178e-07, "logits/chosen": 2.5147194862365723, "logits/rejected": 2.539734125137329, "logps/chosen": -54.53971481323242, "logps/rejected": -54.40386199951172, "loss": 1.454, "nll_loss": 1.398454189300537, "rewards/accuracies": 1.0, "rewards/chosen": 0.950804591178894, "rewards/margins": 4.321934700012207, "rewards/rejected": -3.3711299896240234, "step": 2918 }, { "epoch": 0.4865, "grad_norm": 37.220481872558594, "learning_rate": 1.0921735078873597e-07, "logits/chosen": 2.5647144317626953, "logits/rejected": 2.5544519424438477, "logps/chosen": -40.504093170166016, "logps/rejected": -74.1174545288086, "loss": 0.7446, "nll_loss": 0.7105982303619385, "rewards/accuracies": 1.0, "rewards/chosen": 1.3892383575439453, "rewards/margins": 5.202454566955566, "rewards/rejected": -3.8132164478302, "step": 2919 }, { "epoch": 0.4866666666666667, "grad_norm": 24.26643943786621, "learning_rate": 1.0916359998506547e-07, "logits/chosen": 0.6595402359962463, "logits/rejected": 1.8266327381134033, "logps/chosen": -70.67048645019531, "logps/rejected": -264.09832763671875, "loss": 0.7467, "nll_loss": 0.7285616993904114, "rewards/accuracies": 1.0, "rewards/chosen": 1.411004662513733, "rewards/margins": 14.279122352600098, "rewards/rejected": -12.868117332458496, "step": 2920 }, { "epoch": 0.48683333333333334, "grad_norm": 25.968704223632812, "learning_rate": 1.0910984651134189e-07, "logits/chosen": 2.000436544418335, "logits/rejected": 2.435884714126587, "logps/chosen": -35.55437088012695, "logps/rejected": -195.037109375, "loss": 0.5312, "nll_loss": 0.5152807235717773, "rewards/accuracies": 1.0, "rewards/chosen": 1.673848032951355, "rewards/margins": 7.563244819641113, "rewards/rejected": -5.889396667480469, "step": 2921 }, { "epoch": 0.487, "grad_norm": 38.32783508300781, "learning_rate": 1.0905609038322778e-07, "logits/chosen": 2.085170269012451, "logits/rejected": 2.0892367362976074, "logps/chosen": -29.102699279785156, "logps/rejected": -135.58505249023438, "loss": 0.7435, "nll_loss": 0.6929214596748352, "rewards/accuracies": 1.0, "rewards/chosen": 0.8836578726768494, "rewards/margins": 4.57232141494751, "rewards/rejected": -3.6886634826660156, "step": 2922 }, { "epoch": 0.4871666666666667, "grad_norm": 28.95521354675293, "learning_rate": 1.0900233161638632e-07, "logits/chosen": 2.2514634132385254, "logits/rejected": 2.7638237476348877, "logps/chosen": -49.23211669921875, "logps/rejected": -227.13931274414062, "loss": 0.7165, "nll_loss": 0.6934100389480591, "rewards/accuracies": 1.0, "rewards/chosen": 1.162408471107483, "rewards/margins": 9.946941375732422, "rewards/rejected": -8.78453254699707, "step": 2923 }, { "epoch": 0.48733333333333334, "grad_norm": 22.967683792114258, "learning_rate": 1.0894857022648158e-07, "logits/chosen": 2.4369046688079834, "logits/rejected": 2.6660945415496826, "logps/chosen": -63.6087760925293, "logps/rejected": -419.1897888183594, "loss": 0.7478, "nll_loss": 0.7396368384361267, "rewards/accuracies": 1.0, "rewards/chosen": 2.2784688472747803, "rewards/margins": 9.821849822998047, "rewards/rejected": -7.5433807373046875, "step": 2924 }, { "epoch": 0.4875, "grad_norm": 25.997474670410156, "learning_rate": 1.0889480622917828e-07, "logits/chosen": 2.4364964962005615, "logits/rejected": 2.298257350921631, "logps/chosen": -83.86492156982422, "logps/rejected": -152.53688049316406, "loss": 0.9025, "nll_loss": 0.8735930323600769, "rewards/accuracies": 1.0, "rewards/chosen": 0.923469603061676, "rewards/margins": 9.10746955871582, "rewards/rejected": -8.184000015258789, "step": 2925 }, { "epoch": 0.4876666666666667, "grad_norm": 72.34471893310547, "learning_rate": 1.0884103964014202e-07, "logits/chosen": 1.1499589681625366, "logits/rejected": 2.0774269104003906, "logps/chosen": -57.57433319091797, "logps/rejected": -370.4277648925781, "loss": 1.5621, "nll_loss": 1.4762650728225708, "rewards/accuracies": 1.0, "rewards/chosen": -0.2667221128940582, "rewards/margins": 5.758558750152588, "rewards/rejected": -6.025280952453613, "step": 2926 }, { "epoch": 0.48783333333333334, "grad_norm": 18.6466007232666, "learning_rate": 1.0878727047503903e-07, "logits/chosen": 0.8530301451683044, "logits/rejected": 1.2018368244171143, "logps/chosen": -119.77522277832031, "logps/rejected": -290.03729248046875, "loss": 0.8526, "nll_loss": 0.8434875011444092, "rewards/accuracies": 1.0, "rewards/chosen": 2.646772861480713, "rewards/margins": 7.692166805267334, "rewards/rejected": -5.045393943786621, "step": 2927 }, { "epoch": 0.488, "grad_norm": 233.90538024902344, "learning_rate": 1.0873349874953639e-07, "logits/chosen": 2.3917880058288574, "logits/rejected": 2.6007237434387207, "logps/chosen": -42.007408142089844, "logps/rejected": -41.0214958190918, "loss": 4.9414, "nll_loss": 0.6775386929512024, "rewards/accuracies": 0.0, "rewards/chosen": 1.1856483221054077, "rewards/margins": -3.7971630096435547, "rewards/rejected": 4.982811450958252, "step": 2928 }, { "epoch": 0.4881666666666667, "grad_norm": 328.87176513671875, "learning_rate": 1.0867972447930191e-07, "logits/chosen": 1.5474601984024048, "logits/rejected": 1.9203919172286987, "logps/chosen": -80.26847076416016, "logps/rejected": -64.11631774902344, "loss": 4.9618, "nll_loss": 1.042447566986084, "rewards/accuracies": 0.0, "rewards/chosen": 0.31303179264068604, "rewards/margins": -3.5948071479797363, "rewards/rejected": 3.907838821411133, "step": 2929 }, { "epoch": 0.48833333333333334, "grad_norm": 22.828676223754883, "learning_rate": 1.0862594768000409e-07, "logits/chosen": 2.9502878189086914, "logits/rejected": 2.9915409088134766, "logps/chosen": -137.17849731445312, "logps/rejected": -182.78787231445312, "loss": 0.8352, "nll_loss": 0.811707079410553, "rewards/accuracies": 1.0, "rewards/chosen": 2.1783249378204346, "rewards/margins": 5.856391906738281, "rewards/rejected": -3.6780669689178467, "step": 2930 }, { "epoch": 0.4885, "grad_norm": 91.76615905761719, "learning_rate": 1.085721683673122e-07, "logits/chosen": 1.1496294736862183, "logits/rejected": 1.9909605979919434, "logps/chosen": -9.619742393493652, "logps/rejected": -203.80503845214844, "loss": 0.8126, "nll_loss": 0.8016451001167297, "rewards/accuracies": 1.0, "rewards/chosen": 2.6248559951782227, "rewards/margins": 7.2716240882873535, "rewards/rejected": -4.646768093109131, "step": 2931 }, { "epoch": 0.4886666666666667, "grad_norm": 29.702756881713867, "learning_rate": 1.0851838655689624e-07, "logits/chosen": 2.6921255588531494, "logits/rejected": 2.8009583950042725, "logps/chosen": -58.95354461669922, "logps/rejected": -98.19789123535156, "loss": 0.7263, "nll_loss": 0.7102836966514587, "rewards/accuracies": 1.0, "rewards/chosen": 1.5499581098556519, "rewards/margins": 10.436437606811523, "rewards/rejected": -8.886479377746582, "step": 2932 }, { "epoch": 0.48883333333333334, "grad_norm": 32.66815948486328, "learning_rate": 1.0846460226442695e-07, "logits/chosen": 2.0841009616851807, "logits/rejected": 2.3570189476013184, "logps/chosen": -44.45505142211914, "logps/rejected": -285.98846435546875, "loss": 0.9282, "nll_loss": 0.926146924495697, "rewards/accuracies": 1.0, "rewards/chosen": 4.956910610198975, "rewards/margins": 10.817861557006836, "rewards/rejected": -5.860950469970703, "step": 2933 }, { "epoch": 0.489, "grad_norm": 32.51848602294922, "learning_rate": 1.0841081550557576e-07, "logits/chosen": 3.047492742538452, "logits/rejected": 2.897108793258667, "logps/chosen": -38.162811279296875, "logps/rejected": -51.23291015625, "loss": 0.8791, "nll_loss": 0.8296263217926025, "rewards/accuracies": 1.0, "rewards/chosen": 1.6428265571594238, "rewards/margins": 4.5417160987854, "rewards/rejected": -2.8988895416259766, "step": 2934 }, { "epoch": 0.4891666666666667, "grad_norm": 37.81586456298828, "learning_rate": 1.0835702629601489e-07, "logits/chosen": 2.7913334369659424, "logits/rejected": 3.0980679988861084, "logps/chosen": -162.25872802734375, "logps/rejected": -557.0311279296875, "loss": 1.3735, "nll_loss": 1.3521560430526733, "rewards/accuracies": 1.0, "rewards/chosen": 2.6240875720977783, "rewards/margins": 6.179934501647949, "rewards/rejected": -3.55584716796875, "step": 2935 }, { "epoch": 0.48933333333333334, "grad_norm": 29.986867904663086, "learning_rate": 1.0830323465141718e-07, "logits/chosen": 2.4386942386627197, "logits/rejected": 2.5977697372436523, "logps/chosen": -76.23698425292969, "logps/rejected": -164.15478515625, "loss": 1.0264, "nll_loss": 1.0164930820465088, "rewards/accuracies": 1.0, "rewards/chosen": 2.301015615463257, "rewards/margins": 7.8703413009643555, "rewards/rejected": -5.5693254470825195, "step": 2936 }, { "epoch": 0.4895, "grad_norm": 26.186735153198242, "learning_rate": 1.0824944058745622e-07, "logits/chosen": 2.8895459175109863, "logits/rejected": 2.9457526206970215, "logps/chosen": -42.904266357421875, "logps/rejected": -122.63706970214844, "loss": 0.6836, "nll_loss": 0.670379102230072, "rewards/accuracies": 1.0, "rewards/chosen": 2.377934217453003, "rewards/margins": 6.925973892211914, "rewards/rejected": -4.548039436340332, "step": 2937 }, { "epoch": 0.48966666666666664, "grad_norm": 173.15708923339844, "learning_rate": 1.0819564411980633e-07, "logits/chosen": 2.8710365295410156, "logits/rejected": 2.9060332775115967, "logps/chosen": -58.63325881958008, "logps/rejected": -21.47667121887207, "loss": 2.2465, "nll_loss": 0.8143508434295654, "rewards/accuracies": 0.0, "rewards/chosen": 1.5403515100479126, "rewards/margins": -0.5437906980514526, "rewards/rejected": 2.0841422080993652, "step": 2938 }, { "epoch": 0.48983333333333334, "grad_norm": 66.80155944824219, "learning_rate": 1.081418452641425e-07, "logits/chosen": 3.1324384212493896, "logits/rejected": 2.884082317352295, "logps/chosen": -97.20651245117188, "logps/rejected": -82.11346435546875, "loss": 1.5456, "nll_loss": 1.4087903499603271, "rewards/accuracies": 1.0, "rewards/chosen": 0.006027984898537397, "rewards/margins": 2.871227741241455, "rewards/rejected": -2.8651998043060303, "step": 2939 }, { "epoch": 0.49, "grad_norm": 30.371702194213867, "learning_rate": 1.0808804403614043e-07, "logits/chosen": 1.5110735893249512, "logits/rejected": 2.265874147415161, "logps/chosen": -30.929603576660156, "logps/rejected": -302.1243896484375, "loss": 0.5183, "nll_loss": 0.4758400619029999, "rewards/accuracies": 1.0, "rewards/chosen": 0.9639374017715454, "rewards/margins": 4.965951442718506, "rewards/rejected": -4.00201416015625, "step": 2940 }, { "epoch": 0.49016666666666664, "grad_norm": 24.556461334228516, "learning_rate": 1.0803424045147651e-07, "logits/chosen": 2.53348708152771, "logits/rejected": 2.8221569061279297, "logps/chosen": -79.38412475585938, "logps/rejected": -520.033935546875, "loss": 0.9809, "nll_loss": 0.968099057674408, "rewards/accuracies": 1.0, "rewards/chosen": 1.8652901649475098, "rewards/margins": 8.225238800048828, "rewards/rejected": -6.359948635101318, "step": 2941 }, { "epoch": 0.49033333333333334, "grad_norm": 33.71756362915039, "learning_rate": 1.0798043452582777e-07, "logits/chosen": 2.9574856758117676, "logits/rejected": 2.9092156887054443, "logps/chosen": -96.91358947753906, "logps/rejected": -249.18026733398438, "loss": 1.1035, "nll_loss": 1.0768178701400757, "rewards/accuracies": 1.0, "rewards/chosen": 1.0083221197128296, "rewards/margins": 9.52409839630127, "rewards/rejected": -8.515776634216309, "step": 2942 }, { "epoch": 0.4905, "grad_norm": 28.307283401489258, "learning_rate": 1.0792662627487206e-07, "logits/chosen": 2.1975066661834717, "logits/rejected": 2.1601529121398926, "logps/chosen": -52.65839385986328, "logps/rejected": -150.7252197265625, "loss": 0.954, "nll_loss": 0.9403284788131714, "rewards/accuracies": 1.0, "rewards/chosen": 1.9796501398086548, "rewards/margins": 7.235663414001465, "rewards/rejected": -5.2560133934021, "step": 2943 }, { "epoch": 0.49066666666666664, "grad_norm": 49.27812576293945, "learning_rate": 1.078728157142877e-07, "logits/chosen": 2.4096810817718506, "logits/rejected": 2.4637248516082764, "logps/chosen": -12.739888191223145, "logps/rejected": -97.50797271728516, "loss": 0.5778, "nll_loss": 0.5539081692695618, "rewards/accuracies": 1.0, "rewards/chosen": 1.9694805145263672, "rewards/margins": 5.798617362976074, "rewards/rejected": -3.829136848449707, "step": 2944 }, { "epoch": 0.49083333333333334, "grad_norm": 55.70598220825195, "learning_rate": 1.0781900285975387e-07, "logits/chosen": 2.836574077606201, "logits/rejected": 2.7935280799865723, "logps/chosen": -21.862977981567383, "logps/rejected": -60.22052764892578, "loss": 1.0078, "nll_loss": 0.9937717914581299, "rewards/accuracies": 1.0, "rewards/chosen": 2.7815606594085693, "rewards/margins": 6.873671531677246, "rewards/rejected": -4.092110633850098, "step": 2945 }, { "epoch": 0.491, "grad_norm": 62.20378494262695, "learning_rate": 1.0776518772695033e-07, "logits/chosen": 2.7147345542907715, "logits/rejected": 2.972136974334717, "logps/chosen": -5.145050048828125, "logps/rejected": -152.84844970703125, "loss": 0.471, "nll_loss": 0.3957730829715729, "rewards/accuracies": 1.0, "rewards/chosen": 0.36788463592529297, "rewards/margins": 4.003740310668945, "rewards/rejected": -3.6358559131622314, "step": 2946 }, { "epoch": 0.49116666666666664, "grad_norm": 61.277950286865234, "learning_rate": 1.077113703315575e-07, "logits/chosen": 1.9747391939163208, "logits/rejected": 2.383692502975464, "logps/chosen": -50.8408203125, "logps/rejected": -151.13645935058594, "loss": 1.1698, "nll_loss": 1.1052350997924805, "rewards/accuracies": 1.0, "rewards/chosen": 0.06512832641601562, "rewards/margins": 6.2914886474609375, "rewards/rejected": -6.226360321044922, "step": 2947 }, { "epoch": 0.49133333333333334, "grad_norm": 24.092243194580078, "learning_rate": 1.0765755068925648e-07, "logits/chosen": 1.6469454765319824, "logits/rejected": 2.372450828552246, "logps/chosen": -41.81105041503906, "logps/rejected": -389.54071044921875, "loss": 0.5867, "nll_loss": 0.5807090401649475, "rewards/accuracies": 1.0, "rewards/chosen": 2.5902726650238037, "rewards/margins": 10.301222801208496, "rewards/rejected": -7.710949897766113, "step": 2948 }, { "epoch": 0.4915, "grad_norm": 31.024877548217773, "learning_rate": 1.0760372881572904e-07, "logits/chosen": 1.0328459739685059, "logits/rejected": 1.9212349653244019, "logps/chosen": -27.74379539489746, "logps/rejected": -445.7585144042969, "loss": 0.5839, "nll_loss": 0.5548758506774902, "rewards/accuracies": 1.0, "rewards/chosen": 0.9109911322593689, "rewards/margins": 12.660829544067383, "rewards/rejected": -11.749838829040527, "step": 2949 }, { "epoch": 0.49166666666666664, "grad_norm": 25.639455795288086, "learning_rate": 1.0754990472665759e-07, "logits/chosen": 0.8253422975540161, "logits/rejected": 2.89619779586792, "logps/chosen": -49.465248107910156, "logps/rejected": -551.7364501953125, "loss": 0.6683, "nll_loss": 0.6508586406707764, "rewards/accuracies": 1.0, "rewards/chosen": 1.4517723321914673, "rewards/margins": 11.439702987670898, "rewards/rejected": -9.987930297851562, "step": 2950 }, { "epoch": 0.49183333333333334, "grad_norm": 47.288272857666016, "learning_rate": 1.0749607843772512e-07, "logits/chosen": 2.837519884109497, "logits/rejected": 2.899982213973999, "logps/chosen": -12.456415176391602, "logps/rejected": -285.263916015625, "loss": 0.5454, "nll_loss": 0.541583240032196, "rewards/accuracies": 1.0, "rewards/chosen": 3.264817953109741, "rewards/margins": 9.691204071044922, "rewards/rejected": -6.426385879516602, "step": 2951 }, { "epoch": 0.492, "grad_norm": 34.74062728881836, "learning_rate": 1.074422499646154e-07, "logits/chosen": 2.1304848194122314, "logits/rejected": 2.0509848594665527, "logps/chosen": -50.10231018066406, "logps/rejected": -88.94004821777344, "loss": 0.7296, "nll_loss": 0.6592409014701843, "rewards/accuracies": 1.0, "rewards/chosen": 1.8285073041915894, "rewards/margins": 4.137980937957764, "rewards/rejected": -2.3094735145568848, "step": 2952 }, { "epoch": 0.49216666666666664, "grad_norm": 34.14167785644531, "learning_rate": 1.0738841932301269e-07, "logits/chosen": 2.501214027404785, "logits/rejected": 2.811815023422241, "logps/chosen": -14.664562225341797, "logps/rejected": -426.5510559082031, "loss": 0.4522, "nll_loss": 0.4189874529838562, "rewards/accuracies": 1.0, "rewards/chosen": 1.00434148311615, "rewards/margins": 5.791500091552734, "rewards/rejected": -4.787158489227295, "step": 2953 }, { "epoch": 0.49233333333333335, "grad_norm": 25.569076538085938, "learning_rate": 1.0733458652860194e-07, "logits/chosen": 2.70385479927063, "logits/rejected": 2.6449146270751953, "logps/chosen": -65.9712905883789, "logps/rejected": -98.72137451171875, "loss": 0.7871, "nll_loss": 0.7761330008506775, "rewards/accuracies": 1.0, "rewards/chosen": 2.483741044998169, "rewards/margins": 7.31256103515625, "rewards/rejected": -4.82882022857666, "step": 2954 }, { "epoch": 0.4925, "grad_norm": 46.48905563354492, "learning_rate": 1.072807515970688e-07, "logits/chosen": 2.3229925632476807, "logits/rejected": 2.6565778255462646, "logps/chosen": -36.74580383300781, "logps/rejected": -51.518577575683594, "loss": 0.6531, "nll_loss": 0.4899440407752991, "rewards/accuracies": 1.0, "rewards/chosen": 0.3842727839946747, "rewards/margins": 2.5087246894836426, "rewards/rejected": -2.1244518756866455, "step": 2955 }, { "epoch": 0.49266666666666664, "grad_norm": 21.165081024169922, "learning_rate": 1.0722691454409943e-07, "logits/chosen": 1.6855928897857666, "logits/rejected": 1.3604460954666138, "logps/chosen": -45.900169372558594, "logps/rejected": -95.55029296875, "loss": 0.5681, "nll_loss": 0.5530141592025757, "rewards/accuracies": 1.0, "rewards/chosen": 1.8711998462677002, "rewards/margins": 7.0694122314453125, "rewards/rejected": -5.198212623596191, "step": 2956 }, { "epoch": 0.49283333333333335, "grad_norm": 101.15115356445312, "learning_rate": 1.0717307538538065e-07, "logits/chosen": 2.616258382797241, "logits/rejected": 2.543059825897217, "logps/chosen": -65.97250366210938, "logps/rejected": -111.47516632080078, "loss": 1.4432, "nll_loss": 1.0815165042877197, "rewards/accuracies": 1.0, "rewards/chosen": -0.727786660194397, "rewards/margins": 1.3335026502609253, "rewards/rejected": -2.0612893104553223, "step": 2957 }, { "epoch": 0.493, "grad_norm": 20.598102569580078, "learning_rate": 1.0711923413659993e-07, "logits/chosen": 2.3284144401550293, "logits/rejected": 2.642906904220581, "logps/chosen": -59.70176315307617, "logps/rejected": -228.493896484375, "loss": 0.582, "nll_loss": 0.5796287655830383, "rewards/accuracies": 1.0, "rewards/chosen": 3.5150387287139893, "rewards/margins": 12.040645599365234, "rewards/rejected": -8.525607109069824, "step": 2958 }, { "epoch": 0.49316666666666664, "grad_norm": 39.33842086791992, "learning_rate": 1.0706539081344528e-07, "logits/chosen": 1.9547147750854492, "logits/rejected": 2.337035894393921, "logps/chosen": -58.71419143676758, "logps/rejected": -134.14320373535156, "loss": 0.8825, "nll_loss": 0.8387740254402161, "rewards/accuracies": 1.0, "rewards/chosen": 1.309726357460022, "rewards/margins": 4.704774856567383, "rewards/rejected": -3.3950486183166504, "step": 2959 }, { "epoch": 0.49333333333333335, "grad_norm": 25.643892288208008, "learning_rate": 1.070115454316054e-07, "logits/chosen": 1.804197072982788, "logits/rejected": 1.952241063117981, "logps/chosen": -94.68524169921875, "logps/rejected": -87.98391723632812, "loss": 0.9307, "nll_loss": 0.9192743301391602, "rewards/accuracies": 1.0, "rewards/chosen": 1.923535943031311, "rewards/margins": 9.24061107635498, "rewards/rejected": -7.317075252532959, "step": 2960 }, { "epoch": 0.4935, "grad_norm": 27.687416076660156, "learning_rate": 1.0695769800676949e-07, "logits/chosen": 1.903867483139038, "logits/rejected": 2.1704440116882324, "logps/chosen": -34.838375091552734, "logps/rejected": -183.79757690429688, "loss": 0.5336, "nll_loss": 0.5123289823532104, "rewards/accuracies": 1.0, "rewards/chosen": 2.1425976753234863, "rewards/margins": 6.01838493347168, "rewards/rejected": -3.8757874965667725, "step": 2961 }, { "epoch": 0.49366666666666664, "grad_norm": 40.358253479003906, "learning_rate": 1.0690384855462744e-07, "logits/chosen": 3.831693649291992, "logits/rejected": 3.915639877319336, "logps/chosen": -17.943374633789062, "logps/rejected": -250.3051300048828, "loss": 0.7211, "nll_loss": 0.7177349328994751, "rewards/accuracies": 1.0, "rewards/chosen": 3.5514039993286133, "rewards/margins": 9.720104217529297, "rewards/rejected": -6.168699741363525, "step": 2962 }, { "epoch": 0.49383333333333335, "grad_norm": 36.22542190551758, "learning_rate": 1.0684999709086964e-07, "logits/chosen": 2.650308847427368, "logits/rejected": 2.7830913066864014, "logps/chosen": -45.49826431274414, "logps/rejected": -142.76194763183594, "loss": 0.8649, "nll_loss": 0.8584578633308411, "rewards/accuracies": 1.0, "rewards/chosen": 5.251914978027344, "rewards/margins": 9.692596435546875, "rewards/rejected": -4.440680980682373, "step": 2963 }, { "epoch": 0.494, "grad_norm": 20.108762741088867, "learning_rate": 1.0679614363118717e-07, "logits/chosen": 2.751091718673706, "logits/rejected": 2.8307406902313232, "logps/chosen": -90.24481201171875, "logps/rejected": -205.89395141601562, "loss": 0.8141, "nll_loss": 0.8057572245597839, "rewards/accuracies": 1.0, "rewards/chosen": 2.2412445545196533, "rewards/margins": 9.820152282714844, "rewards/rejected": -7.5789079666137695, "step": 2964 }, { "epoch": 0.49416666666666664, "grad_norm": 38.69986343383789, "learning_rate": 1.0674228819127158e-07, "logits/chosen": 2.925950527191162, "logits/rejected": 2.8677587509155273, "logps/chosen": -91.30517578125, "logps/rejected": -241.44601440429688, "loss": 1.2321, "nll_loss": 1.2013839483261108, "rewards/accuracies": 1.0, "rewards/chosen": 0.9613686203956604, "rewards/margins": 6.522181034088135, "rewards/rejected": -5.560812473297119, "step": 2965 }, { "epoch": 0.49433333333333335, "grad_norm": 1267.5926513671875, "learning_rate": 1.0668843078681509e-07, "logits/chosen": 2.3093464374542236, "logits/rejected": 2.041172742843628, "logps/chosen": -200.1024169921875, "logps/rejected": -31.15665054321289, "loss": 1.9527, "nll_loss": 1.1770728826522827, "rewards/accuracies": 1.0, "rewards/chosen": 0.8800202012062073, "rewards/margins": 0.3866768479347229, "rewards/rejected": 0.4933433532714844, "step": 2966 }, { "epoch": 0.4945, "grad_norm": 34.64166259765625, "learning_rate": 1.0663457143351042e-07, "logits/chosen": 0.29123762249946594, "logits/rejected": 2.8954718112945557, "logps/chosen": -20.22034454345703, "logps/rejected": -745.8426513671875, "loss": 0.5542, "nll_loss": 0.546495795249939, "rewards/accuracies": 1.0, "rewards/chosen": 2.2825660705566406, "rewards/margins": 13.693607330322266, "rewards/rejected": -11.411041259765625, "step": 2967 }, { "epoch": 0.49466666666666664, "grad_norm": 95.30474090576172, "learning_rate": 1.0658071014705094e-07, "logits/chosen": 1.6131279468536377, "logits/rejected": 1.7693129777908325, "logps/chosen": -72.36943817138672, "logps/rejected": -110.70951843261719, "loss": 2.6224, "nll_loss": 2.495497703552246, "rewards/accuracies": 1.0, "rewards/chosen": 1.1980453729629517, "rewards/margins": 3.0539016723632812, "rewards/rejected": -1.8558562994003296, "step": 2968 }, { "epoch": 0.49483333333333335, "grad_norm": 69.08158874511719, "learning_rate": 1.065268469431305e-07, "logits/chosen": 2.7238540649414062, "logits/rejected": 2.9905741214752197, "logps/chosen": -135.85150146484375, "logps/rejected": -223.15872192382812, "loss": 1.5555, "nll_loss": 1.4005310535430908, "rewards/accuracies": 1.0, "rewards/chosen": -1.0940849781036377, "rewards/margins": 6.348549842834473, "rewards/rejected": -7.442634582519531, "step": 2969 }, { "epoch": 0.495, "grad_norm": 196.8551483154297, "learning_rate": 1.0647298183744358e-07, "logits/chosen": 1.4887099266052246, "logits/rejected": 1.5380147695541382, "logps/chosen": -60.21062469482422, "logps/rejected": -18.393123626708984, "loss": 4.267, "nll_loss": 0.5017551779747009, "rewards/accuracies": 0.0, "rewards/chosen": 1.5551155805587769, "rewards/margins": -3.2161645889282227, "rewards/rejected": 4.771280288696289, "step": 2970 }, { "epoch": 0.49516666666666664, "grad_norm": 58.18788528442383, "learning_rate": 1.0641911484568513e-07, "logits/chosen": 1.4398096799850464, "logits/rejected": 2.1437149047851562, "logps/chosen": -46.219566345214844, "logps/rejected": -147.83984375, "loss": 1.3462, "nll_loss": 1.320558786392212, "rewards/accuracies": 1.0, "rewards/chosen": 1.2306187152862549, "rewards/margins": 6.395087242126465, "rewards/rejected": -5.164468288421631, "step": 2971 }, { "epoch": 0.49533333333333335, "grad_norm": 36.40802001953125, "learning_rate": 1.0636524598355072e-07, "logits/chosen": 2.4655580520629883, "logits/rejected": 3.9826951026916504, "logps/chosen": -50.00938034057617, "logps/rejected": -80.01667022705078, "loss": 0.8231, "nll_loss": 0.757718026638031, "rewards/accuracies": 1.0, "rewards/chosen": 0.7316207885742188, "rewards/margins": 4.062891960144043, "rewards/rejected": -3.331271171569824, "step": 2972 }, { "epoch": 0.4955, "grad_norm": 71.43307495117188, "learning_rate": 1.0631137526673646e-07, "logits/chosen": 2.3927383422851562, "logits/rejected": 2.312408208847046, "logps/chosen": -48.851951599121094, "logps/rejected": -45.513397216796875, "loss": 1.3634, "nll_loss": 1.2855777740478516, "rewards/accuracies": 1.0, "rewards/chosen": 0.3815464377403259, "rewards/margins": 3.894472360610962, "rewards/rejected": -3.512925863265991, "step": 2973 }, { "epoch": 0.49566666666666664, "grad_norm": 74.68555450439453, "learning_rate": 1.0625750271093896e-07, "logits/chosen": 1.8390077352523804, "logits/rejected": 1.5154131650924683, "logps/chosen": -72.24421691894531, "logps/rejected": -18.666051864624023, "loss": 1.3688, "nll_loss": 0.9030526876449585, "rewards/accuracies": 1.0, "rewards/chosen": 1.96826171875, "rewards/margins": 1.610787034034729, "rewards/rejected": 0.3574747145175934, "step": 2974 }, { "epoch": 0.49583333333333335, "grad_norm": 39.153072357177734, "learning_rate": 1.0620362833185539e-07, "logits/chosen": 2.6764745712280273, "logits/rejected": 2.87705659866333, "logps/chosen": -36.93315887451172, "logps/rejected": -284.85406494140625, "loss": 0.9537, "nll_loss": 0.9233290553092957, "rewards/accuracies": 1.0, "rewards/chosen": 1.1463050842285156, "rewards/margins": 5.805553436279297, "rewards/rejected": -4.659248352050781, "step": 2975 }, { "epoch": 0.496, "grad_norm": 38.78164291381836, "learning_rate": 1.0614975214518349e-07, "logits/chosen": 2.8657562732696533, "logits/rejected": 2.6597535610198975, "logps/chosen": -156.91505432128906, "logps/rejected": -151.40072631835938, "loss": 1.0925, "nll_loss": 1.0531213283538818, "rewards/accuracies": 1.0, "rewards/chosen": 0.5966079831123352, "rewards/margins": 7.582424163818359, "rewards/rejected": -6.98581600189209, "step": 2976 }, { "epoch": 0.49616666666666664, "grad_norm": 39.58734893798828, "learning_rate": 1.0609587416662142e-07, "logits/chosen": 2.2366464138031006, "logits/rejected": 2.3808727264404297, "logps/chosen": -39.00814437866211, "logps/rejected": -83.91600799560547, "loss": 0.9434, "nll_loss": 0.9071659445762634, "rewards/accuracies": 1.0, "rewards/chosen": 1.8982212543487549, "rewards/margins": 5.09788703918457, "rewards/rejected": -3.1996655464172363, "step": 2977 }, { "epoch": 0.49633333333333335, "grad_norm": 217.37770080566406, "learning_rate": 1.0604199441186797e-07, "logits/chosen": 3.004235029220581, "logits/rejected": 3.0222012996673584, "logps/chosen": -41.926795959472656, "logps/rejected": -43.984222412109375, "loss": 2.9101, "nll_loss": 0.6257730722427368, "rewards/accuracies": 0.0, "rewards/chosen": 0.8526794910430908, "rewards/margins": -1.7605102062225342, "rewards/rejected": 2.613189697265625, "step": 2978 }, { "epoch": 0.4965, "grad_norm": 152.8380584716797, "learning_rate": 1.0598811289662242e-07, "logits/chosen": 0.9247437119483948, "logits/rejected": 0.7237629890441895, "logps/chosen": -42.80144500732422, "logps/rejected": -19.497217178344727, "loss": 2.1519, "nll_loss": 0.611449122428894, "rewards/accuracies": 0.0, "rewards/chosen": 0.783887505531311, "rewards/margins": -0.8657442331314087, "rewards/rejected": 1.6496317386627197, "step": 2979 }, { "epoch": 0.49666666666666665, "grad_norm": 29.382802963256836, "learning_rate": 1.0593422963658451e-07, "logits/chosen": 2.6235711574554443, "logits/rejected": 2.621626138687134, "logps/chosen": -84.42958068847656, "logps/rejected": -182.78712463378906, "loss": 0.7977, "nll_loss": 0.7606269121170044, "rewards/accuracies": 1.0, "rewards/chosen": 0.8841094970703125, "rewards/margins": 5.605710029602051, "rewards/rejected": -4.721600532531738, "step": 2980 }, { "epoch": 0.49683333333333335, "grad_norm": 46.009647369384766, "learning_rate": 1.0588034464745459e-07, "logits/chosen": 2.426835060119629, "logits/rejected": 2.5956459045410156, "logps/chosen": -22.93309211730957, "logps/rejected": -132.00086975097656, "loss": 0.7943, "nll_loss": 0.7907963395118713, "rewards/accuracies": 1.0, "rewards/chosen": 4.173287868499756, "rewards/margins": 9.591395378112793, "rewards/rejected": -5.418107509613037, "step": 2981 }, { "epoch": 0.497, "grad_norm": 34.428897857666016, "learning_rate": 1.0582645794493337e-07, "logits/chosen": 2.682987928390503, "logits/rejected": 2.6380462646484375, "logps/chosen": -265.06231689453125, "logps/rejected": -47.097450256347656, "loss": 1.8057, "nll_loss": 1.7670823335647583, "rewards/accuracies": 1.0, "rewards/chosen": 1.4956483840942383, "rewards/margins": 4.920488357543945, "rewards/rejected": -3.424840211868286, "step": 2982 }, { "epoch": 0.49716666666666665, "grad_norm": 172.30397033691406, "learning_rate": 1.0577256954472222e-07, "logits/chosen": 2.6240310668945312, "logits/rejected": 2.486473798751831, "logps/chosen": -118.10324096679688, "logps/rejected": -9.11967658996582, "loss": 2.5622, "nll_loss": 1.3420823812484741, "rewards/accuracies": 0.0, "rewards/chosen": 0.945843517780304, "rewards/margins": -0.37466758489608765, "rewards/rejected": 1.3205111026763916, "step": 2983 }, { "epoch": 0.49733333333333335, "grad_norm": 36.298423767089844, "learning_rate": 1.0571867946252287e-07, "logits/chosen": 2.635266065597534, "logits/rejected": 2.6050500869750977, "logps/chosen": -47.87013626098633, "logps/rejected": -101.92561340332031, "loss": 0.8647, "nll_loss": 0.8398268222808838, "rewards/accuracies": 1.0, "rewards/chosen": 1.2642735242843628, "rewards/margins": 6.442526817321777, "rewards/rejected": -5.178253173828125, "step": 2984 }, { "epoch": 0.4975, "grad_norm": 51.98236083984375, "learning_rate": 1.0566478771403761e-07, "logits/chosen": 2.756439685821533, "logits/rejected": 2.7818753719329834, "logps/chosen": -24.42156982421875, "logps/rejected": -89.39070892333984, "loss": 0.7195, "nll_loss": 0.660042405128479, "rewards/accuracies": 1.0, "rewards/chosen": 0.6626898050308228, "rewards/margins": 4.336071491241455, "rewards/rejected": -3.6733815670013428, "step": 2985 }, { "epoch": 0.49766666666666665, "grad_norm": 26.78835678100586, "learning_rate": 1.0561089431496923e-07, "logits/chosen": 2.772691011428833, "logits/rejected": 2.7950315475463867, "logps/chosen": -83.13673400878906, "logps/rejected": -230.01828002929688, "loss": 0.7782, "nll_loss": 0.7627223134040833, "rewards/accuracies": 1.0, "rewards/chosen": 1.6814316511154175, "rewards/margins": 7.736758708953857, "rewards/rejected": -6.05532693862915, "step": 2986 }, { "epoch": 0.49783333333333335, "grad_norm": 33.35950469970703, "learning_rate": 1.0555699928102095e-07, "logits/chosen": 2.4449620246887207, "logits/rejected": 2.5909810066223145, "logps/chosen": -90.56180572509766, "logps/rejected": -406.0506896972656, "loss": 1.1083, "nll_loss": 1.0911062955856323, "rewards/accuracies": 1.0, "rewards/chosen": 1.4619897603988647, "rewards/margins": 11.82015323638916, "rewards/rejected": -10.358163833618164, "step": 2987 }, { "epoch": 0.498, "grad_norm": 155.69085693359375, "learning_rate": 1.0550310262789647e-07, "logits/chosen": 2.1034114360809326, "logits/rejected": 1.9112553596496582, "logps/chosen": -41.07829666137695, "logps/rejected": -31.30905532836914, "loss": 2.0303, "nll_loss": 0.5266448259353638, "rewards/accuracies": 0.0, "rewards/chosen": 0.42345887422561646, "rewards/margins": -0.8880756497383118, "rewards/rejected": 1.3115345239639282, "step": 2988 }, { "epoch": 0.49816666666666665, "grad_norm": 34.548465728759766, "learning_rate": 1.0544920437130002e-07, "logits/chosen": 1.5620700120925903, "logits/rejected": 2.283777952194214, "logps/chosen": -54.96681213378906, "logps/rejected": -124.49462890625, "loss": 0.6819, "nll_loss": 0.5974653363227844, "rewards/accuracies": 1.0, "rewards/chosen": 1.7567611932754517, "rewards/margins": 3.8615288734436035, "rewards/rejected": -2.1047675609588623, "step": 2989 }, { "epoch": 0.49833333333333335, "grad_norm": 22.15540885925293, "learning_rate": 1.0539530452693623e-07, "logits/chosen": 2.4016518592834473, "logits/rejected": 2.6036717891693115, "logps/chosen": -109.59332275390625, "logps/rejected": -318.5853576660156, "loss": 0.9674, "nll_loss": 0.9613450765609741, "rewards/accuracies": 1.0, "rewards/chosen": 2.5992705821990967, "rewards/margins": 9.84476089477539, "rewards/rejected": -7.245490074157715, "step": 2990 }, { "epoch": 0.4985, "grad_norm": 21.580392837524414, "learning_rate": 1.0534140311051025e-07, "logits/chosen": 1.1043956279754639, "logits/rejected": 1.8504767417907715, "logps/chosen": -30.884796142578125, "logps/rejected": -223.79348754882812, "loss": 0.4962, "nll_loss": 0.4902348518371582, "rewards/accuracies": 1.0, "rewards/chosen": 2.5755927562713623, "rewards/margins": 10.640557289123535, "rewards/rejected": -8.064964294433594, "step": 2991 }, { "epoch": 0.49866666666666665, "grad_norm": 24.374723434448242, "learning_rate": 1.052875001377276e-07, "logits/chosen": 2.2721822261810303, "logits/rejected": 2.3245625495910645, "logps/chosen": -87.2959213256836, "logps/rejected": -161.69517517089844, "loss": 0.9644, "nll_loss": 0.9488686323165894, "rewards/accuracies": 1.0, "rewards/chosen": 1.5846359729766846, "rewards/margins": 9.178833961486816, "rewards/rejected": -7.594198226928711, "step": 2992 }, { "epoch": 0.49883333333333335, "grad_norm": 85.63728332519531, "learning_rate": 1.0523359562429439e-07, "logits/chosen": 1.919086217880249, "logits/rejected": 2.5184898376464844, "logps/chosen": -28.206315994262695, "logps/rejected": -504.4436340332031, "loss": 1.3334, "nll_loss": 1.2821050882339478, "rewards/accuracies": 1.0, "rewards/chosen": 0.36980462074279785, "rewards/margins": 5.939577102661133, "rewards/rejected": -5.569772243499756, "step": 2993 }, { "epoch": 0.499, "grad_norm": 84.60674285888672, "learning_rate": 1.0517968958591703e-07, "logits/chosen": 2.6012747287750244, "logits/rejected": 2.531066417694092, "logps/chosen": -129.48828125, "logps/rejected": -28.1256046295166, "loss": 1.8511, "nll_loss": 1.17716646194458, "rewards/accuracies": 1.0, "rewards/chosen": 2.695744276046753, "rewards/margins": 1.3340858221054077, "rewards/rejected": 1.3616584539413452, "step": 2994 }, { "epoch": 0.49916666666666665, "grad_norm": 29.667282104492188, "learning_rate": 1.0512578203830251e-07, "logits/chosen": 2.616773843765259, "logits/rejected": 2.827183246612549, "logps/chosen": -76.34783172607422, "logps/rejected": -379.06695556640625, "loss": 1.1113, "nll_loss": 1.0906833410263062, "rewards/accuracies": 1.0, "rewards/chosen": 1.5634064674377441, "rewards/margins": 6.468747138977051, "rewards/rejected": -4.905340671539307, "step": 2995 }, { "epoch": 0.49933333333333335, "grad_norm": 134.92422485351562, "learning_rate": 1.0507187299715814e-07, "logits/chosen": 1.4741984605789185, "logits/rejected": 1.8449939489364624, "logps/chosen": -42.82959747314453, "logps/rejected": -174.000732421875, "loss": 1.6189, "nll_loss": 1.5296283960342407, "rewards/accuracies": 1.0, "rewards/chosen": -0.05654144659638405, "rewards/margins": 4.113499641418457, "rewards/rejected": -4.170041084289551, "step": 2996 }, { "epoch": 0.4995, "grad_norm": 30.13693618774414, "learning_rate": 1.0501796247819175e-07, "logits/chosen": 1.9143954515457153, "logits/rejected": 2.332557439804077, "logps/chosen": -22.222095489501953, "logps/rejected": -195.28515625, "loss": 0.561, "nll_loss": 0.5555524230003357, "rewards/accuracies": 1.0, "rewards/chosen": 2.7119743824005127, "rewards/margins": 9.882490158081055, "rewards/rejected": -7.170515537261963, "step": 2997 }, { "epoch": 0.49966666666666665, "grad_norm": 122.78990936279297, "learning_rate": 1.0496405049711155e-07, "logits/chosen": 1.5998125076293945, "logits/rejected": 2.8427374362945557, "logps/chosen": -33.456398010253906, "logps/rejected": -223.3477325439453, "loss": 2.3166, "nll_loss": 2.230426788330078, "rewards/accuracies": 1.0, "rewards/chosen": -0.316497802734375, "rewards/margins": 6.777122497558594, "rewards/rejected": -7.093620300292969, "step": 2998 }, { "epoch": 0.49983333333333335, "grad_norm": 34.25053024291992, "learning_rate": 1.0491013706962621e-07, "logits/chosen": 3.1723685264587402, "logits/rejected": 3.1947054862976074, "logps/chosen": -50.69879150390625, "logps/rejected": -246.9383087158203, "loss": 0.8843, "nll_loss": 0.8593015670776367, "rewards/accuracies": 1.0, "rewards/chosen": 1.3586030006408691, "rewards/margins": 6.1184892654418945, "rewards/rejected": -4.759886264801025, "step": 2999 }, { "epoch": 0.5, "grad_norm": 34.626625061035156, "learning_rate": 1.0485622221144483e-07, "logits/chosen": 2.6623101234436035, "logits/rejected": 2.6874825954437256, "logps/chosen": -36.336761474609375, "logps/rejected": -191.90316772460938, "loss": 0.7708, "nll_loss": 0.7570158839225769, "rewards/accuracies": 1.0, "rewards/chosen": 2.100275754928589, "rewards/margins": 6.999672889709473, "rewards/rejected": -4.899397373199463, "step": 3000 }, { "epoch": 0.5001666666666666, "grad_norm": 37.411964416503906, "learning_rate": 1.0480230593827685e-07, "logits/chosen": 2.4237542152404785, "logits/rejected": 2.690807342529297, "logps/chosen": -35.696014404296875, "logps/rejected": -172.76885986328125, "loss": 0.7531, "nll_loss": 0.7139202952384949, "rewards/accuracies": 1.0, "rewards/chosen": 0.8781685829162598, "rewards/margins": 5.365799903869629, "rewards/rejected": -4.487631320953369, "step": 3001 }, { "epoch": 0.5003333333333333, "grad_norm": 145.0896759033203, "learning_rate": 1.0474838826583225e-07, "logits/chosen": 2.5176303386688232, "logits/rejected": 2.637367010116577, "logps/chosen": -16.021320343017578, "logps/rejected": -28.626079559326172, "loss": 1.2609, "nll_loss": 0.6162046194076538, "rewards/accuracies": 1.0, "rewards/chosen": 1.4778274297714233, "rewards/margins": 0.8766158819198608, "rewards/rejected": 0.6012115478515625, "step": 3002 }, { "epoch": 0.5005, "grad_norm": 37.33292770385742, "learning_rate": 1.0469446920982128e-07, "logits/chosen": 3.075643301010132, "logits/rejected": 3.2092134952545166, "logps/chosen": -44.511375427246094, "logps/rejected": -155.62991333007812, "loss": 0.9325, "nll_loss": 0.9083954691886902, "rewards/accuracies": 1.0, "rewards/chosen": 1.1616344451904297, "rewards/margins": 7.513922691345215, "rewards/rejected": -6.352288246154785, "step": 3003 }, { "epoch": 0.5006666666666667, "grad_norm": 43.874916076660156, "learning_rate": 1.0464054878595472e-07, "logits/chosen": 2.425182580947876, "logits/rejected": 2.577017307281494, "logps/chosen": -56.944244384765625, "logps/rejected": -280.5968933105469, "loss": 0.9606, "nll_loss": 0.8760652542114258, "rewards/accuracies": 1.0, "rewards/chosen": 1.3363922834396362, "rewards/margins": 3.666614055633545, "rewards/rejected": -2.330221652984619, "step": 3004 }, { "epoch": 0.5008333333333334, "grad_norm": 13.96264934539795, "learning_rate": 1.045866270099436e-07, "logits/chosen": 2.472669839859009, "logits/rejected": 2.5685126781463623, "logps/chosen": -109.7872543334961, "logps/rejected": -273.18096923828125, "loss": 0.5292, "nll_loss": 0.5227963924407959, "rewards/accuracies": 1.0, "rewards/chosen": 2.5154213905334473, "rewards/margins": 10.34693717956543, "rewards/rejected": -7.831515789031982, "step": 3005 }, { "epoch": 0.501, "grad_norm": 53.93803787231445, "learning_rate": 1.0453270389749955e-07, "logits/chosen": 1.9954012632369995, "logits/rejected": 2.121784210205078, "logps/chosen": -12.69818115234375, "logps/rejected": -59.65080642700195, "loss": 0.5981, "nll_loss": 0.5771899819374084, "rewards/accuracies": 1.0, "rewards/chosen": 2.0523178577423096, "rewards/margins": 6.045920372009277, "rewards/rejected": -3.993602752685547, "step": 3006 }, { "epoch": 0.5011666666666666, "grad_norm": 100.04544067382812, "learning_rate": 1.0447877946433434e-07, "logits/chosen": 2.5521116256713867, "logits/rejected": 2.416970729827881, "logps/chosen": -109.24958801269531, "logps/rejected": -154.5088653564453, "loss": 1.8472, "nll_loss": 1.3005902767181396, "rewards/accuracies": 1.0, "rewards/chosen": -3.476738691329956, "rewards/margins": 3.386113405227661, "rewards/rejected": -6.862852096557617, "step": 3007 }, { "epoch": 0.5013333333333333, "grad_norm": 27.622787475585938, "learning_rate": 1.0442485372616037e-07, "logits/chosen": 2.8029558658599854, "logits/rejected": 3.073471784591675, "logps/chosen": -72.16312408447266, "logps/rejected": -298.23870849609375, "loss": 0.9067, "nll_loss": 0.8800382018089294, "rewards/accuracies": 1.0, "rewards/chosen": 1.0057579278945923, "rewards/margins": 8.607757568359375, "rewards/rejected": -7.601999282836914, "step": 3008 }, { "epoch": 0.5015, "grad_norm": 39.21226501464844, "learning_rate": 1.0437092669869023e-07, "logits/chosen": 3.421713352203369, "logits/rejected": 3.4183144569396973, "logps/chosen": -49.54717254638672, "logps/rejected": -165.28750610351562, "loss": 0.9465, "nll_loss": 0.9008577466011047, "rewards/accuracies": 1.0, "rewards/chosen": 1.7852623462677002, "rewards/margins": 4.715065956115723, "rewards/rejected": -2.9298038482666016, "step": 3009 }, { "epoch": 0.5016666666666667, "grad_norm": 23.15936851501465, "learning_rate": 1.04316998397637e-07, "logits/chosen": 2.4564828872680664, "logits/rejected": 2.470324754714966, "logps/chosen": -99.86382293701172, "logps/rejected": -205.14279174804688, "loss": 0.9442, "nll_loss": 0.9421115517616272, "rewards/accuracies": 1.0, "rewards/chosen": 3.8040003776550293, "rewards/margins": 11.179021835327148, "rewards/rejected": -7.375021457672119, "step": 3010 }, { "epoch": 0.5018333333333334, "grad_norm": 24.354143142700195, "learning_rate": 1.0426306883871406e-07, "logits/chosen": 2.7793257236480713, "logits/rejected": 3.0611960887908936, "logps/chosen": -68.751220703125, "logps/rejected": -338.04345703125, "loss": 0.6369, "nll_loss": 0.6307451725006104, "rewards/accuracies": 1.0, "rewards/chosen": 2.499586582183838, "rewards/margins": 14.728334426879883, "rewards/rejected": -12.228747367858887, "step": 3011 }, { "epoch": 0.502, "grad_norm": 40.319976806640625, "learning_rate": 1.042091380376352e-07, "logits/chosen": 1.9848793745040894, "logits/rejected": 2.105264186859131, "logps/chosen": -29.577810287475586, "logps/rejected": -85.52320861816406, "loss": 0.5938, "nll_loss": 0.537778377532959, "rewards/accuracies": 1.0, "rewards/chosen": 0.6636472940444946, "rewards/margins": 4.487492084503174, "rewards/rejected": -3.8238449096679688, "step": 3012 }, { "epoch": 0.5021666666666667, "grad_norm": 22.368608474731445, "learning_rate": 1.0415520601011458e-07, "logits/chosen": 1.8569912910461426, "logits/rejected": 2.0660829544067383, "logps/chosen": -22.441701889038086, "logps/rejected": -65.13810729980469, "loss": 0.4333, "nll_loss": 0.37402838468551636, "rewards/accuracies": 1.0, "rewards/chosen": 1.6574606895446777, "rewards/margins": 4.293123245239258, "rewards/rejected": -2.63566255569458, "step": 3013 }, { "epoch": 0.5023333333333333, "grad_norm": 24.395219802856445, "learning_rate": 1.0410127277186671e-07, "logits/chosen": 1.9490885734558105, "logits/rejected": 2.3953418731689453, "logps/chosen": -22.53188705444336, "logps/rejected": -216.0275421142578, "loss": 0.4266, "nll_loss": 0.3952963352203369, "rewards/accuracies": 1.0, "rewards/chosen": 0.8442751169204712, "rewards/margins": 7.780038833618164, "rewards/rejected": -6.935763835906982, "step": 3014 }, { "epoch": 0.5025, "grad_norm": 21.128801345825195, "learning_rate": 1.0404733833860637e-07, "logits/chosen": 1.1566569805145264, "logits/rejected": 2.6735503673553467, "logps/chosen": -41.81328582763672, "logps/rejected": -431.50823974609375, "loss": 0.5583, "nll_loss": 0.5501748919487, "rewards/accuracies": 1.0, "rewards/chosen": 2.3081092834472656, "rewards/margins": 9.122773170471191, "rewards/rejected": -6.814663887023926, "step": 3015 }, { "epoch": 0.5026666666666667, "grad_norm": 25.632112503051758, "learning_rate": 1.0399340272604884e-07, "logits/chosen": 2.396225929260254, "logits/rejected": 2.7152364253997803, "logps/chosen": -49.18879318237305, "logps/rejected": -289.5398864746094, "loss": 0.6591, "nll_loss": 0.6388154625892639, "rewards/accuracies": 1.0, "rewards/chosen": 1.335010290145874, "rewards/margins": 7.792099952697754, "rewards/rejected": -6.457089900970459, "step": 3016 }, { "epoch": 0.5028333333333334, "grad_norm": 28.814998626708984, "learning_rate": 1.039394659499096e-07, "logits/chosen": 2.190356492996216, "logits/rejected": 1.3518040180206299, "logps/chosen": -129.12747192382812, "logps/rejected": -66.48249816894531, "loss": 1.3925, "nll_loss": 1.345077633857727, "rewards/accuracies": 1.0, "rewards/chosen": 2.809622287750244, "rewards/margins": 5.307419300079346, "rewards/rejected": -2.4977970123291016, "step": 3017 }, { "epoch": 0.503, "grad_norm": 45.78438949584961, "learning_rate": 1.038855280259046e-07, "logits/chosen": 2.0394716262817383, "logits/rejected": 2.294891119003296, "logps/chosen": -106.84585571289062, "logps/rejected": -164.16757202148438, "loss": 1.2968, "nll_loss": 1.2423937320709229, "rewards/accuracies": 1.0, "rewards/chosen": 0.20720291137695312, "rewards/margins": 8.38355541229248, "rewards/rejected": -8.176352500915527, "step": 3018 }, { "epoch": 0.5031666666666667, "grad_norm": 131.1303253173828, "learning_rate": 1.0383158896974997e-07, "logits/chosen": 1.7504220008850098, "logits/rejected": 1.7145816087722778, "logps/chosen": -105.95403289794922, "logps/rejected": -63.55658721923828, "loss": 1.8806, "nll_loss": 1.0702427625656128, "rewards/accuracies": 1.0, "rewards/chosen": 1.8676095008850098, "rewards/margins": 0.6375817060470581, "rewards/rejected": 1.2300277948379517, "step": 3019 }, { "epoch": 0.5033333333333333, "grad_norm": 98.1507797241211, "learning_rate": 1.0377764879716233e-07, "logits/chosen": 1.6924690008163452, "logits/rejected": 2.4616949558258057, "logps/chosen": -12.928396224975586, "logps/rejected": -130.74476623535156, "loss": 0.8762, "nll_loss": 0.8618931174278259, "rewards/accuracies": 1.0, "rewards/chosen": 1.8702540397644043, "rewards/margins": 7.280872344970703, "rewards/rejected": -5.410618305206299, "step": 3020 }, { "epoch": 0.5035, "grad_norm": 28.93049430847168, "learning_rate": 1.0372370752385853e-07, "logits/chosen": 2.1936159133911133, "logits/rejected": 1.8533350229263306, "logps/chosen": -120.46360778808594, "logps/rejected": -107.52393341064453, "loss": 1.1764, "nll_loss": 1.1364490985870361, "rewards/accuracies": 1.0, "rewards/chosen": 1.8240845203399658, "rewards/margins": 4.926972389221191, "rewards/rejected": -3.1028878688812256, "step": 3021 }, { "epoch": 0.5036666666666667, "grad_norm": 35.05315399169922, "learning_rate": 1.0366976516555572e-07, "logits/chosen": 0.9303731322288513, "logits/rejected": 2.6627426147460938, "logps/chosen": -52.29596710205078, "logps/rejected": -515.3603515625, "loss": 0.9899, "nll_loss": 0.9867165088653564, "rewards/accuracies": 1.0, "rewards/chosen": 3.289785146713257, "rewards/margins": 10.767693519592285, "rewards/rejected": -7.477908611297607, "step": 3022 }, { "epoch": 0.5038333333333334, "grad_norm": 176.70977783203125, "learning_rate": 1.0361582173797148e-07, "logits/chosen": 3.4078028202056885, "logits/rejected": 3.363298177719116, "logps/chosen": -44.934417724609375, "logps/rejected": -45.3590087890625, "loss": 2.1873, "nll_loss": 0.6328790783882141, "rewards/accuracies": 0.0, "rewards/chosen": 1.7526054382324219, "rewards/margins": -0.6584947109222412, "rewards/rejected": 2.411100149154663, "step": 3023 }, { "epoch": 0.504, "grad_norm": 23.040136337280273, "learning_rate": 1.0356187725682357e-07, "logits/chosen": 2.0706682205200195, "logits/rejected": 2.348615884780884, "logps/chosen": -133.7789764404297, "logps/rejected": -407.9877624511719, "loss": 0.9884, "nll_loss": 0.9764890074729919, "rewards/accuracies": 1.0, "rewards/chosen": 1.8477600812911987, "rewards/margins": 9.753186225891113, "rewards/rejected": -7.905426025390625, "step": 3024 }, { "epoch": 0.5041666666666667, "grad_norm": 45.95808792114258, "learning_rate": 1.0350793173783017e-07, "logits/chosen": 1.5603822469711304, "logits/rejected": 1.393485426902771, "logps/chosen": -18.37518310546875, "logps/rejected": -82.69309997558594, "loss": 0.5526, "nll_loss": 0.5104217529296875, "rewards/accuracies": 1.0, "rewards/chosen": 1.3481884002685547, "rewards/margins": 4.76602840423584, "rewards/rejected": -3.417839765548706, "step": 3025 }, { "epoch": 0.5043333333333333, "grad_norm": 31.2642822265625, "learning_rate": 1.0345398519670964e-07, "logits/chosen": 1.8776692152023315, "logits/rejected": 2.2846319675445557, "logps/chosen": -60.35387420654297, "logps/rejected": -177.83216857910156, "loss": 0.868, "nll_loss": 0.8500546216964722, "rewards/accuracies": 1.0, "rewards/chosen": 1.7054359912872314, "rewards/margins": 6.694684982299805, "rewards/rejected": -4.989248752593994, "step": 3026 }, { "epoch": 0.5045, "grad_norm": 165.64024353027344, "learning_rate": 1.0340003764918077e-07, "logits/chosen": 2.9600162506103516, "logits/rejected": 2.8099610805511475, "logps/chosen": -57.50129699707031, "logps/rejected": -20.86627960205078, "loss": 2.2177, "nll_loss": 0.7986290454864502, "rewards/accuracies": 0.0, "rewards/chosen": 1.653547763824463, "rewards/margins": -0.49163365364074707, "rewards/rejected": 2.14518141746521, "step": 3027 }, { "epoch": 0.5046666666666667, "grad_norm": 137.38247680664062, "learning_rate": 1.0334608911096254e-07, "logits/chosen": 1.6365368366241455, "logits/rejected": 1.379575490951538, "logps/chosen": -70.89605712890625, "logps/rejected": -48.69578552246094, "loss": 1.7632, "nll_loss": 0.9452806711196899, "rewards/accuracies": 1.0, "rewards/chosen": 2.28568434715271, "rewards/margins": 0.7821884155273438, "rewards/rejected": 1.5034959316253662, "step": 3028 }, { "epoch": 0.5048333333333334, "grad_norm": 20.461713790893555, "learning_rate": 1.032921395977743e-07, "logits/chosen": 1.979562520980835, "logits/rejected": 2.5944347381591797, "logps/chosen": -58.35945129394531, "logps/rejected": -145.6623992919922, "loss": 0.59, "nll_loss": 0.5778163075447083, "rewards/accuracies": 1.0, "rewards/chosen": 1.9304389953613281, "rewards/margins": 8.081573486328125, "rewards/rejected": -6.151134490966797, "step": 3029 }, { "epoch": 0.505, "grad_norm": 92.36473846435547, "learning_rate": 1.032381891253356e-07, "logits/chosen": 3.231041669845581, "logits/rejected": 3.065286159515381, "logps/chosen": -317.3583984375, "logps/rejected": -193.7311553955078, "loss": 1.1981, "nll_loss": 1.0204451084136963, "rewards/accuracies": 1.0, "rewards/chosen": -0.9579529762268066, "rewards/margins": 3.217066764831543, "rewards/rejected": -4.17501974105835, "step": 3030 }, { "epoch": 0.5051666666666667, "grad_norm": 62.04771423339844, "learning_rate": 1.0318423770936633e-07, "logits/chosen": 2.344416618347168, "logits/rejected": 2.537328004837036, "logps/chosen": -29.679622650146484, "logps/rejected": -1.7614260911941528, "loss": 1.057, "nll_loss": 0.4065701961517334, "rewards/accuracies": 1.0, "rewards/chosen": 1.2285137176513672, "rewards/margins": 0.7754311561584473, "rewards/rejected": 0.4530825614929199, "step": 3031 }, { "epoch": 0.5053333333333333, "grad_norm": 39.37288284301758, "learning_rate": 1.0313028536558663e-07, "logits/chosen": 2.9228689670562744, "logits/rejected": 3.0014536380767822, "logps/chosen": -71.68441772460938, "logps/rejected": -197.63540649414062, "loss": 1.2016, "nll_loss": 1.1947402954101562, "rewards/accuracies": 1.0, "rewards/chosen": 2.9281556606292725, "rewards/margins": 8.22248363494873, "rewards/rejected": -5.294328212738037, "step": 3032 }, { "epoch": 0.5055, "grad_norm": 248.99374389648438, "learning_rate": 1.0307633210971696e-07, "logits/chosen": 2.2486813068389893, "logits/rejected": 1.9972009658813477, "logps/chosen": -263.5628662109375, "logps/rejected": -78.19437408447266, "loss": 2.4301, "nll_loss": 1.0936217308044434, "rewards/accuracies": 0.0, "rewards/chosen": 1.6031402349472046, "rewards/margins": -0.38303911685943604, "rewards/rejected": 1.9861793518066406, "step": 3033 }, { "epoch": 0.5056666666666667, "grad_norm": 365.15032958984375, "learning_rate": 1.0302237795747793e-07, "logits/chosen": 1.5552058219909668, "logits/rejected": 2.839082956314087, "logps/chosen": -37.358917236328125, "logps/rejected": -395.1865234375, "loss": 0.8837, "nll_loss": 0.8688122034072876, "rewards/accuracies": 1.0, "rewards/chosen": 1.6282356977462769, "rewards/margins": 9.027899742126465, "rewards/rejected": -7.399664402008057, "step": 3034 }, { "epoch": 0.5058333333333334, "grad_norm": 35.19990539550781, "learning_rate": 1.0296842292459056e-07, "logits/chosen": 2.4424920082092285, "logits/rejected": 2.4361917972564697, "logps/chosen": -57.229393005371094, "logps/rejected": -61.6356086730957, "loss": 0.9345, "nll_loss": 0.8942093849182129, "rewards/accuracies": 1.0, "rewards/chosen": 0.7772148251533508, "rewards/margins": 5.493398189544678, "rewards/rejected": -4.716183185577393, "step": 3035 }, { "epoch": 0.506, "grad_norm": 47.155521392822266, "learning_rate": 1.0291446702677597e-07, "logits/chosen": 2.3291304111480713, "logits/rejected": 2.582571506500244, "logps/chosen": -39.60148620605469, "logps/rejected": -324.3330078125, "loss": 1.1733, "nll_loss": 1.1647495031356812, "rewards/accuracies": 1.0, "rewards/chosen": 4.323887825012207, "rewards/margins": 8.56303882598877, "rewards/rejected": -4.2391510009765625, "step": 3036 }, { "epoch": 0.5061666666666667, "grad_norm": 31.95548439025879, "learning_rate": 1.028605102797557e-07, "logits/chosen": 2.4829940795898438, "logits/rejected": 2.4234161376953125, "logps/chosen": -143.18548583984375, "logps/rejected": -131.79617309570312, "loss": 1.146, "nll_loss": 1.1014267206192017, "rewards/accuracies": 1.0, "rewards/chosen": 0.62347412109375, "rewards/margins": 5.491196632385254, "rewards/rejected": -4.867722511291504, "step": 3037 }, { "epoch": 0.5063333333333333, "grad_norm": 38.258182525634766, "learning_rate": 1.0280655269925138e-07, "logits/chosen": 3.005133867263794, "logits/rejected": 2.8552632331848145, "logps/chosen": -101.22783660888672, "logps/rejected": -103.71503448486328, "loss": 1.2774, "nll_loss": 1.2344858646392822, "rewards/accuracies": 1.0, "rewards/chosen": 0.990620493888855, "rewards/margins": 4.891587257385254, "rewards/rejected": -3.9009666442871094, "step": 3038 }, { "epoch": 0.5065, "grad_norm": 23.620588302612305, "learning_rate": 1.02752594300985e-07, "logits/chosen": 2.1049375534057617, "logits/rejected": 2.4512085914611816, "logps/chosen": -56.66908645629883, "logps/rejected": -285.4669189453125, "loss": 0.8279, "nll_loss": 0.8212910294532776, "rewards/accuracies": 1.0, "rewards/chosen": 2.4970920085906982, "rewards/margins": 9.893606185913086, "rewards/rejected": -7.396513938903809, "step": 3039 }, { "epoch": 0.5066666666666667, "grad_norm": 24.089523315429688, "learning_rate": 1.026986351006787e-07, "logits/chosen": 3.9510793685913086, "logits/rejected": 4.205102920532227, "logps/chosen": -36.75496292114258, "logps/rejected": -268.7562255859375, "loss": 0.5674, "nll_loss": 0.5568933486938477, "rewards/accuracies": 1.0, "rewards/chosen": 1.9948543310165405, "rewards/margins": 9.374948501586914, "rewards/rejected": -7.380094051361084, "step": 3040 }, { "epoch": 0.5068333333333334, "grad_norm": 26.077913284301758, "learning_rate": 1.0264467511405492e-07, "logits/chosen": 0.7821944952011108, "logits/rejected": 1.900909662246704, "logps/chosen": -81.02743530273438, "logps/rejected": -234.75587463378906, "loss": 0.7245, "nll_loss": 0.6866731643676758, "rewards/accuracies": 1.0, "rewards/chosen": 0.7070274353027344, "rewards/margins": 6.338962554931641, "rewards/rejected": -5.631935119628906, "step": 3041 }, { "epoch": 0.507, "grad_norm": 110.42333221435547, "learning_rate": 1.0259071435683635e-07, "logits/chosen": 3.0711071491241455, "logits/rejected": 2.8919506072998047, "logps/chosen": -44.68936538696289, "logps/rejected": -113.42815399169922, "loss": 1.1936, "nll_loss": 0.7980243563652039, "rewards/accuracies": 1.0, "rewards/chosen": 0.8324723243713379, "rewards/margins": 1.376284122467041, "rewards/rejected": -0.5438117980957031, "step": 3042 }, { "epoch": 0.5071666666666667, "grad_norm": 88.03179168701172, "learning_rate": 1.0253675284474576e-07, "logits/chosen": 2.3882381916046143, "logits/rejected": 2.8588192462921143, "logps/chosen": -12.322040557861328, "logps/rejected": -216.677734375, "loss": 0.8333, "nll_loss": 0.821469247341156, "rewards/accuracies": 1.0, "rewards/chosen": 1.9234298467636108, "rewards/margins": 8.469283103942871, "rewards/rejected": -6.545853137969971, "step": 3043 }, { "epoch": 0.5073333333333333, "grad_norm": 36.75920867919922, "learning_rate": 1.0248279059350634e-07, "logits/chosen": 2.6630630493164062, "logits/rejected": 2.7526614665985107, "logps/chosen": -51.44199752807617, "logps/rejected": -267.83319091796875, "loss": 0.9903, "nll_loss": 0.9892691373825073, "rewards/accuracies": 1.0, "rewards/chosen": 4.701922416687012, "rewards/margins": 12.110486030578613, "rewards/rejected": -7.408563613891602, "step": 3044 }, { "epoch": 0.5075, "grad_norm": 62.136375427246094, "learning_rate": 1.0242882761884131e-07, "logits/chosen": 2.415677547454834, "logits/rejected": 2.4602105617523193, "logps/chosen": -4.407254695892334, "logps/rejected": -126.9412841796875, "loss": 0.3335, "nll_loss": 0.31480392813682556, "rewards/accuracies": 1.0, "rewards/chosen": 1.5644409656524658, "rewards/margins": 6.886752128601074, "rewards/rejected": -5.3223114013671875, "step": 3045 }, { "epoch": 0.5076666666666667, "grad_norm": 35.07713317871094, "learning_rate": 1.0237486393647423e-07, "logits/chosen": 2.0354177951812744, "logits/rejected": 2.142932653427124, "logps/chosen": -104.78326416015625, "logps/rejected": -112.13384246826172, "loss": 1.288, "nll_loss": 1.2624489068984985, "rewards/accuracies": 1.0, "rewards/chosen": 1.0966377258300781, "rewards/margins": 7.328572750091553, "rewards/rejected": -6.231935024261475, "step": 3046 }, { "epoch": 0.5078333333333334, "grad_norm": 22.061962127685547, "learning_rate": 1.0232089956212883e-07, "logits/chosen": 1.9910377264022827, "logits/rejected": 2.809699296951294, "logps/chosen": -60.80564880371094, "logps/rejected": -77.7411117553711, "loss": 0.7429, "nll_loss": 0.7325981855392456, "rewards/accuracies": 1.0, "rewards/chosen": 2.5536296367645264, "rewards/margins": 7.4300994873046875, "rewards/rejected": -4.87647008895874, "step": 3047 }, { "epoch": 0.508, "grad_norm": 112.05289459228516, "learning_rate": 1.0226693451152899e-07, "logits/chosen": 2.6526570320129395, "logits/rejected": 2.8430466651916504, "logps/chosen": -22.29531478881836, "logps/rejected": -40.47169494628906, "loss": 1.8333, "nll_loss": 1.7150242328643799, "rewards/accuracies": 1.0, "rewards/chosen": 0.2705528438091278, "rewards/margins": 3.0500259399414062, "rewards/rejected": -2.779473066329956, "step": 3048 }, { "epoch": 0.5081666666666667, "grad_norm": 54.89875411987305, "learning_rate": 1.0221296880039887e-07, "logits/chosen": 2.070220470428467, "logits/rejected": 3.1205599308013916, "logps/chosen": -33.79220199584961, "logps/rejected": -237.7859649658203, "loss": 0.9687, "nll_loss": 0.9386723637580872, "rewards/accuracies": 1.0, "rewards/chosen": 0.8742290735244751, "rewards/margins": 8.281839370727539, "rewards/rejected": -7.407609939575195, "step": 3049 }, { "epoch": 0.5083333333333333, "grad_norm": 74.3154067993164, "learning_rate": 1.0215900244446278e-07, "logits/chosen": 2.2651729583740234, "logits/rejected": 1.8255577087402344, "logps/chosen": -56.88831329345703, "logps/rejected": -46.72734069824219, "loss": 2.0864, "nll_loss": 2.0317256450653076, "rewards/accuracies": 1.0, "rewards/chosen": 1.5621696710586548, "rewards/margins": 4.374120712280273, "rewards/rejected": -2.811951160430908, "step": 3050 }, { "epoch": 0.5085, "grad_norm": 21.208837509155273, "learning_rate": 1.021050354594452e-07, "logits/chosen": 1.6944845914840698, "logits/rejected": 2.6586196422576904, "logps/chosen": -71.84880828857422, "logps/rejected": -206.05557250976562, "loss": 0.7811, "nll_loss": 0.7725678086280823, "rewards/accuracies": 1.0, "rewards/chosen": 2.244415283203125, "rewards/margins": 9.172082901000977, "rewards/rejected": -6.927667140960693, "step": 3051 }, { "epoch": 0.5086666666666667, "grad_norm": 174.42489624023438, "learning_rate": 1.0205106786107085e-07, "logits/chosen": 2.5658655166625977, "logits/rejected": 2.3978192806243896, "logps/chosen": -59.21656036376953, "logps/rejected": -59.63360595703125, "loss": 2.1536, "nll_loss": 0.8002236485481262, "rewards/accuracies": 0.0, "rewards/chosen": 2.142843723297119, "rewards/margins": -0.25907444953918457, "rewards/rejected": 2.4019181728363037, "step": 3052 }, { "epoch": 0.5088333333333334, "grad_norm": 31.551694869995117, "learning_rate": 1.0199709966506454e-07, "logits/chosen": 1.5909090042114258, "logits/rejected": 2.256577491760254, "logps/chosen": -54.12891387939453, "logps/rejected": -126.44578552246094, "loss": 0.6594, "nll_loss": 0.5883577466011047, "rewards/accuracies": 1.0, "rewards/chosen": 1.8405510187149048, "rewards/margins": 4.140434265136719, "rewards/rejected": -2.2998833656311035, "step": 3053 }, { "epoch": 0.509, "grad_norm": 57.50941848754883, "learning_rate": 1.0194313088715134e-07, "logits/chosen": 2.5709636211395264, "logits/rejected": 2.584583282470703, "logps/chosen": -20.923208236694336, "logps/rejected": -68.16217803955078, "loss": 0.8505, "nll_loss": 0.8369282484054565, "rewards/accuracies": 1.0, "rewards/chosen": 2.02181339263916, "rewards/margins": 7.151035308837891, "rewards/rejected": -5.1292219161987305, "step": 3054 }, { "epoch": 0.5091666666666667, "grad_norm": 22.64412498474121, "learning_rate": 1.0188916154305644e-07, "logits/chosen": 2.9850265979766846, "logits/rejected": 3.031923770904541, "logps/chosen": -113.44605255126953, "logps/rejected": -234.8316192626953, "loss": 0.9995, "nll_loss": 0.9864875078201294, "rewards/accuracies": 1.0, "rewards/chosen": 1.9105767011642456, "rewards/margins": 7.6864333152771, "rewards/rejected": -5.7758564949035645, "step": 3055 }, { "epoch": 0.5093333333333333, "grad_norm": 21.26644515991211, "learning_rate": 1.0183519164850525e-07, "logits/chosen": 1.775201678276062, "logits/rejected": 2.373582601547241, "logps/chosen": -62.15713882446289, "logps/rejected": -281.4970703125, "loss": 0.7176, "nll_loss": 0.7063311338424683, "rewards/accuracies": 1.0, "rewards/chosen": 1.9256048202514648, "rewards/margins": 9.203845977783203, "rewards/rejected": -7.2782416343688965, "step": 3056 }, { "epoch": 0.5095, "grad_norm": 36.69697952270508, "learning_rate": 1.0178122121922323e-07, "logits/chosen": 0.47885748744010925, "logits/rejected": 2.507899761199951, "logps/chosen": -12.092636108398438, "logps/rejected": -398.73480224609375, "loss": 0.4219, "nll_loss": 0.39008501172065735, "rewards/accuracies": 1.0, "rewards/chosen": 0.8115814924240112, "rewards/margins": 8.299759864807129, "rewards/rejected": -7.488178253173828, "step": 3057 }, { "epoch": 0.5096666666666667, "grad_norm": 35.36046600341797, "learning_rate": 1.0172725027093614e-07, "logits/chosen": 2.1715595722198486, "logits/rejected": 2.196038007736206, "logps/chosen": -36.19267272949219, "logps/rejected": -95.64936828613281, "loss": 0.6886, "nll_loss": 0.5933224558830261, "rewards/accuracies": 1.0, "rewards/chosen": 1.4295945167541504, "rewards/margins": 3.543544054031372, "rewards/rejected": -2.1139495372772217, "step": 3058 }, { "epoch": 0.5098333333333334, "grad_norm": 22.752626419067383, "learning_rate": 1.0167327881936978e-07, "logits/chosen": 1.4168044328689575, "logits/rejected": 1.2677849531173706, "logps/chosen": -55.05804443359375, "logps/rejected": -90.95045471191406, "loss": 0.632, "nll_loss": 0.6186297535896301, "rewards/accuracies": 1.0, "rewards/chosen": 2.0016021728515625, "rewards/margins": 7.242552280426025, "rewards/rejected": -5.240950107574463, "step": 3059 }, { "epoch": 0.51, "grad_norm": 22.728225708007812, "learning_rate": 1.0161930688025015e-07, "logits/chosen": 1.8676543235778809, "logits/rejected": 2.3677282333374023, "logps/chosen": -133.596435546875, "logps/rejected": -275.5760498046875, "loss": 1.1027, "nll_loss": 1.0950528383255005, "rewards/accuracies": 1.0, "rewards/chosen": 2.4793457984924316, "rewards/margins": 8.53714370727539, "rewards/rejected": -6.057797431945801, "step": 3060 }, { "epoch": 0.5101666666666667, "grad_norm": 31.156827926635742, "learning_rate": 1.0156533446930336e-07, "logits/chosen": 1.3875997066497803, "logits/rejected": 1.7418607473373413, "logps/chosen": -58.553550720214844, "logps/rejected": -121.6466064453125, "loss": 0.8558, "nll_loss": 0.8246979117393494, "rewards/accuracies": 1.0, "rewards/chosen": 1.7793731689453125, "rewards/margins": 5.315720558166504, "rewards/rejected": -3.5363471508026123, "step": 3061 }, { "epoch": 0.5103333333333333, "grad_norm": 60.11142349243164, "learning_rate": 1.0151136160225571e-07, "logits/chosen": 1.4954488277435303, "logits/rejected": 2.22625732421875, "logps/chosen": -39.283775329589844, "logps/rejected": -193.55738830566406, "loss": 1.4677, "nll_loss": 1.4549545049667358, "rewards/accuracies": 1.0, "rewards/chosen": 2.080742597579956, "rewards/margins": 7.272204399108887, "rewards/rejected": -5.191461563110352, "step": 3062 }, { "epoch": 0.5105, "grad_norm": 30.297876358032227, "learning_rate": 1.0145738829483352e-07, "logits/chosen": 3.4460675716400146, "logits/rejected": 3.537407398223877, "logps/chosen": -51.36117172241211, "logps/rejected": -167.33920288085938, "loss": 0.7366, "nll_loss": 0.7337309718132019, "rewards/accuracies": 1.0, "rewards/chosen": 6.627850532531738, "rewards/margins": 11.878470420837402, "rewards/rejected": -5.250619888305664, "step": 3063 }, { "epoch": 0.5106666666666667, "grad_norm": 39.05511474609375, "learning_rate": 1.0140341456276343e-07, "logits/chosen": 3.1089489459991455, "logits/rejected": 3.14353609085083, "logps/chosen": -121.86245727539062, "logps/rejected": -29.584623336791992, "loss": 1.0739, "nll_loss": 0.8346742987632751, "rewards/accuracies": 1.0, "rewards/chosen": 2.3983399868011475, "rewards/margins": 2.903444766998291, "rewards/rejected": -0.5051048398017883, "step": 3064 }, { "epoch": 0.5108333333333334, "grad_norm": 29.951168060302734, "learning_rate": 1.0134944042177198e-07, "logits/chosen": 2.753718376159668, "logits/rejected": 2.670344591140747, "logps/chosen": -6.341093063354492, "logps/rejected": -65.58861541748047, "loss": 0.254, "nll_loss": 0.24388816952705383, "rewards/accuracies": 1.0, "rewards/chosen": 2.4745779037475586, "rewards/margins": 7.522406578063965, "rewards/rejected": -5.047828674316406, "step": 3065 }, { "epoch": 0.511, "grad_norm": 38.2259407043457, "learning_rate": 1.0129546588758604e-07, "logits/chosen": 2.953500509262085, "logits/rejected": 3.061307907104492, "logps/chosen": -20.99221420288086, "logps/rejected": -39.63088607788086, "loss": 0.5042, "nll_loss": 0.428412526845932, "rewards/accuracies": 1.0, "rewards/chosen": 1.376495361328125, "rewards/margins": 3.838672161102295, "rewards/rejected": -2.46217679977417, "step": 3066 }, { "epoch": 0.5111666666666667, "grad_norm": 61.54550552368164, "learning_rate": 1.012414909759324e-07, "logits/chosen": 2.400510549545288, "logits/rejected": 2.530229330062866, "logps/chosen": -18.865272521972656, "logps/rejected": -189.12762451171875, "loss": 0.5693, "nll_loss": 0.49645453691482544, "rewards/accuracies": 1.0, "rewards/chosen": 1.2845289707183838, "rewards/margins": 3.8664426803588867, "rewards/rejected": -2.581913709640503, "step": 3067 }, { "epoch": 0.5113333333333333, "grad_norm": 830.4867553710938, "learning_rate": 1.0118751570253812e-07, "logits/chosen": 1.7369621992111206, "logits/rejected": 1.0380767583847046, "logps/chosen": -448.8038635253906, "logps/rejected": -305.582763671875, "loss": 5.276, "nll_loss": 1.3122920989990234, "rewards/accuracies": 0.0, "rewards/chosen": -8.9371919631958, "rewards/margins": -2.9069061279296875, "rewards/rejected": -6.030285835266113, "step": 3068 }, { "epoch": 0.5115, "grad_norm": 111.92634582519531, "learning_rate": 1.0113354008313025e-07, "logits/chosen": 2.581132173538208, "logits/rejected": 2.4251222610473633, "logps/chosen": -27.431716918945312, "logps/rejected": -26.88040542602539, "loss": 1.96, "nll_loss": 1.7144824266433716, "rewards/accuracies": 1.0, "rewards/chosen": 0.2286583036184311, "rewards/margins": 1.8949103355407715, "rewards/rejected": -1.6662520170211792, "step": 3069 }, { "epoch": 0.5116666666666667, "grad_norm": 37.64796447753906, "learning_rate": 1.0107956413343601e-07, "logits/chosen": 2.129328489303589, "logits/rejected": 2.3014285564422607, "logps/chosen": -40.97797393798828, "logps/rejected": -75.90064239501953, "loss": 0.9206, "nll_loss": 0.8537078499794006, "rewards/accuracies": 1.0, "rewards/chosen": 2.188650608062744, "rewards/margins": 4.433657169342041, "rewards/rejected": -2.245006561279297, "step": 3070 }, { "epoch": 0.5118333333333334, "grad_norm": 30.135520935058594, "learning_rate": 1.0102558786918272e-07, "logits/chosen": 3.0558724403381348, "logits/rejected": 3.064540386199951, "logps/chosen": -90.6117172241211, "logps/rejected": -289.562744140625, "loss": 1.0096, "nll_loss": 0.9849100708961487, "rewards/accuracies": 1.0, "rewards/chosen": 1.0981193780899048, "rewards/margins": 8.029170989990234, "rewards/rejected": -6.931051731109619, "step": 3071 }, { "epoch": 0.512, "grad_norm": 26.650358200073242, "learning_rate": 1.0097161130609773e-07, "logits/chosen": 2.814605474472046, "logits/rejected": 2.915036201477051, "logps/chosen": -18.38884735107422, "logps/rejected": -309.28564453125, "loss": 0.4856, "nll_loss": 0.4839169979095459, "rewards/accuracies": 1.0, "rewards/chosen": 4.3742547035217285, "rewards/margins": 11.028545379638672, "rewards/rejected": -6.654290676116943, "step": 3072 }, { "epoch": 0.5121666666666667, "grad_norm": 67.97296905517578, "learning_rate": 1.0091763445990856e-07, "logits/chosen": 1.6579233407974243, "logits/rejected": 1.3361316919326782, "logps/chosen": -78.43486022949219, "logps/rejected": -69.45830535888672, "loss": 0.968, "nll_loss": 0.9227630496025085, "rewards/accuracies": 1.0, "rewards/chosen": 0.9533523917198181, "rewards/margins": 4.779543876647949, "rewards/rejected": -3.8261916637420654, "step": 3073 }, { "epoch": 0.5123333333333333, "grad_norm": 40.297847747802734, "learning_rate": 1.008636573463427e-07, "logits/chosen": 2.185290575027466, "logits/rejected": 2.374570846557617, "logps/chosen": -46.754783630371094, "logps/rejected": -82.48846435546875, "loss": 0.8004, "nll_loss": 0.7664719820022583, "rewards/accuracies": 1.0, "rewards/chosen": 1.6172151565551758, "rewards/margins": 5.154265403747559, "rewards/rejected": -3.537050247192383, "step": 3074 }, { "epoch": 0.5125, "grad_norm": 197.5764923095703, "learning_rate": 1.0080967998112786e-07, "logits/chosen": 2.6977856159210205, "logits/rejected": 2.669564962387085, "logps/chosen": -38.63648986816406, "logps/rejected": -46.23036575317383, "loss": 2.7211, "nll_loss": 0.9199162721633911, "rewards/accuracies": 0.0, "rewards/chosen": 0.6330097317695618, "rewards/margins": -1.2275598049163818, "rewards/rejected": 1.8605694770812988, "step": 3075 }, { "epoch": 0.5126666666666667, "grad_norm": 25.73467445373535, "learning_rate": 1.007557023799917e-07, "logits/chosen": 1.1615889072418213, "logits/rejected": 1.8677928447723389, "logps/chosen": -174.72239685058594, "logps/rejected": -347.8027038574219, "loss": 0.9497, "nll_loss": 0.9393677115440369, "rewards/accuracies": 1.0, "rewards/chosen": 1.968632698059082, "rewards/margins": 14.56902027130127, "rewards/rejected": -12.600387573242188, "step": 3076 }, { "epoch": 0.5128333333333334, "grad_norm": 56.18719482421875, "learning_rate": 1.0070172455866201e-07, "logits/chosen": 2.874307155609131, "logits/rejected": 2.865394115447998, "logps/chosen": -31.98657989501953, "logps/rejected": -75.25546264648438, "loss": 1.1, "nll_loss": 1.0662193298339844, "rewards/accuracies": 1.0, "rewards/chosen": 0.7771309614181519, "rewards/margins": 7.148400783538818, "rewards/rejected": -6.371269702911377, "step": 3077 }, { "epoch": 0.513, "grad_norm": 40.76004409790039, "learning_rate": 1.0064774653286661e-07, "logits/chosen": 1.5434482097625732, "logits/rejected": 1.5264692306518555, "logps/chosen": -79.34102630615234, "logps/rejected": -79.74948120117188, "loss": 1.2192, "nll_loss": 1.1498699188232422, "rewards/accuracies": 1.0, "rewards/chosen": 1.7420082092285156, "rewards/margins": 4.121914863586426, "rewards/rejected": -2.379906415939331, "step": 3078 }, { "epoch": 0.5131666666666667, "grad_norm": 46.62522888183594, "learning_rate": 1.0059376831833346e-07, "logits/chosen": 2.1517271995544434, "logits/rejected": 2.464599132537842, "logps/chosen": -23.030996322631836, "logps/rejected": -285.2620849609375, "loss": 0.8573, "nll_loss": 0.8529999256134033, "rewards/accuracies": 1.0, "rewards/chosen": 3.0186822414398193, "rewards/margins": 9.792987823486328, "rewards/rejected": -6.774305820465088, "step": 3079 }, { "epoch": 0.5133333333333333, "grad_norm": 26.971477508544922, "learning_rate": 1.0053978993079044e-07, "logits/chosen": 1.497304916381836, "logits/rejected": 1.4021196365356445, "logps/chosen": -52.46886444091797, "logps/rejected": -95.94908142089844, "loss": 0.6466, "nll_loss": 0.6321550011634827, "rewards/accuracies": 1.0, "rewards/chosen": 1.8559281826019287, "rewards/margins": 7.262481689453125, "rewards/rejected": -5.406553745269775, "step": 3080 }, { "epoch": 0.5135, "grad_norm": 31.239654541015625, "learning_rate": 1.0048581138596562e-07, "logits/chosen": 2.3795998096466064, "logits/rejected": 2.7243964672088623, "logps/chosen": -76.9302978515625, "logps/rejected": -149.4534149169922, "loss": 0.88, "nll_loss": 0.809792697429657, "rewards/accuracies": 1.0, "rewards/chosen": 0.8928284049034119, "rewards/margins": 3.878617286682129, "rewards/rejected": -2.9857888221740723, "step": 3081 }, { "epoch": 0.5136666666666667, "grad_norm": 19.51936912536621, "learning_rate": 1.0043183269958699e-07, "logits/chosen": 1.1835227012634277, "logits/rejected": 2.139127016067505, "logps/chosen": -76.33724975585938, "logps/rejected": -317.7770690917969, "loss": 0.7674, "nll_loss": 0.7633726596832275, "rewards/accuracies": 1.0, "rewards/chosen": 3.0816287994384766, "rewards/margins": 10.019830703735352, "rewards/rejected": -6.938201904296875, "step": 3082 }, { "epoch": 0.5138333333333334, "grad_norm": 22.51072120666504, "learning_rate": 1.0037785388738271e-07, "logits/chosen": 2.2666754722595215, "logits/rejected": 2.8505685329437256, "logps/chosen": -91.05889892578125, "logps/rejected": -229.46856689453125, "loss": 0.8989, "nll_loss": 0.8927342295646667, "rewards/accuracies": 1.0, "rewards/chosen": 2.856820821762085, "rewards/margins": 8.615623474121094, "rewards/rejected": -5.758802890777588, "step": 3083 }, { "epoch": 0.514, "grad_norm": 309.85723876953125, "learning_rate": 1.0032387496508088e-07, "logits/chosen": 2.7898271083831787, "logits/rejected": 3.0198092460632324, "logps/chosen": -81.17025756835938, "logps/rejected": -80.38024139404297, "loss": 5.2185, "nll_loss": 0.8727985620498657, "rewards/accuracies": 0.0, "rewards/chosen": 0.9425934553146362, "rewards/margins": -3.9197912216186523, "rewards/rejected": 4.862384796142578, "step": 3084 }, { "epoch": 0.5141666666666667, "grad_norm": 42.425968170166016, "learning_rate": 1.0026989594840965e-07, "logits/chosen": 2.397446393966675, "logits/rejected": 2.6276097297668457, "logps/chosen": -30.85555648803711, "logps/rejected": -102.95325469970703, "loss": 0.8842, "nll_loss": 0.8570988774299622, "rewards/accuracies": 1.0, "rewards/chosen": 1.7675111293792725, "rewards/margins": 5.562100410461426, "rewards/rejected": -3.7945892810821533, "step": 3085 }, { "epoch": 0.5143333333333333, "grad_norm": 24.851774215698242, "learning_rate": 1.002159168530972e-07, "logits/chosen": 1.4193698167800903, "logits/rejected": 1.3220187425613403, "logps/chosen": -59.480735778808594, "logps/rejected": -66.80562591552734, "loss": 0.7152, "nll_loss": 0.699773371219635, "rewards/accuracies": 1.0, "rewards/chosen": 1.6951959133148193, "rewards/margins": 7.553454399108887, "rewards/rejected": -5.8582587242126465, "step": 3086 }, { "epoch": 0.5145, "grad_norm": 270.4300231933594, "learning_rate": 1.001619376948718e-07, "logits/chosen": 2.816316843032837, "logits/rejected": 2.8314177989959717, "logps/chosen": -55.63030242919922, "logps/rejected": -32.90146255493164, "loss": 6.0618, "nll_loss": 0.7726431488990784, "rewards/accuracies": 0.0, "rewards/chosen": 0.6463661193847656, "rewards/margins": -4.924203872680664, "rewards/rejected": 5.57056999206543, "step": 3087 }, { "epoch": 0.5146666666666667, "grad_norm": 171.56724548339844, "learning_rate": 1.0010795848946163e-07, "logits/chosen": 2.309577226638794, "logits/rejected": 2.266024589538574, "logps/chosen": -75.28812408447266, "logps/rejected": -13.591833114624023, "loss": 2.6615, "nll_loss": 1.2548019886016846, "rewards/accuracies": 0.0, "rewards/chosen": 1.2273186445236206, "rewards/margins": -0.5787961483001709, "rewards/rejected": 1.8061147928237915, "step": 3088 }, { "epoch": 0.5148333333333334, "grad_norm": 54.91315841674805, "learning_rate": 1.0005397925259496e-07, "logits/chosen": 2.786738872528076, "logits/rejected": 2.783780097961426, "logps/chosen": -37.48133087158203, "logps/rejected": -49.57719421386719, "loss": 1.0976, "nll_loss": 0.9863508343696594, "rewards/accuracies": 1.0, "rewards/chosen": 1.02197265625, "rewards/margins": 3.1806952953338623, "rewards/rejected": -2.1587226390838623, "step": 3089 }, { "epoch": 0.515, "grad_norm": 27.885345458984375, "learning_rate": 1e-07, "logits/chosen": 2.589545249938965, "logits/rejected": 3.0071661472320557, "logps/chosen": -63.68815612792969, "logps/rejected": -70.65876007080078, "loss": 0.7433, "nll_loss": 0.71559739112854, "rewards/accuracies": 1.0, "rewards/chosen": 1.3937591314315796, "rewards/margins": 5.700704574584961, "rewards/rejected": -4.306945323944092, "step": 3090 }, { "epoch": 0.5151666666666667, "grad_norm": 51.37535095214844, "learning_rate": 9.994602074740504e-08, "logits/chosen": 2.8310554027557373, "logits/rejected": 2.7501730918884277, "logps/chosen": -22.570636749267578, "logps/rejected": -97.9306640625, "loss": 0.7555, "nll_loss": 0.752354621887207, "rewards/accuracies": 1.0, "rewards/chosen": 3.400285005569458, "rewards/margins": 10.245050430297852, "rewards/rejected": -6.844765663146973, "step": 3091 }, { "epoch": 0.5153333333333333, "grad_norm": 32.74984359741211, "learning_rate": 9.989204151053837e-08, "logits/chosen": 3.1626479625701904, "logits/rejected": 3.2137608528137207, "logps/chosen": -17.11288833618164, "logps/rejected": -238.2264404296875, "loss": 0.4689, "nll_loss": 0.46251049637794495, "rewards/accuracies": 1.0, "rewards/chosen": 2.5210800170898438, "rewards/margins": 9.976411819458008, "rewards/rejected": -7.455331802368164, "step": 3092 }, { "epoch": 0.5155, "grad_norm": 33.96976089477539, "learning_rate": 9.983806230512818e-08, "logits/chosen": 2.3907580375671387, "logits/rejected": 2.2352492809295654, "logps/chosen": -122.10283660888672, "logps/rejected": -22.179269790649414, "loss": 0.7162, "nll_loss": 0.49235016107559204, "rewards/accuracies": 1.0, "rewards/chosen": 2.723935842514038, "rewards/margins": 3.2423901557922363, "rewards/rejected": -0.5184541940689087, "step": 3093 }, { "epoch": 0.5156666666666667, "grad_norm": 25.348228454589844, "learning_rate": 9.978408314690279e-08, "logits/chosen": 2.933920383453369, "logits/rejected": 2.960451364517212, "logps/chosen": -83.05192565917969, "logps/rejected": -215.61859130859375, "loss": 0.9072, "nll_loss": 0.9027385115623474, "rewards/accuracies": 1.0, "rewards/chosen": 2.9302377700805664, "rewards/margins": 10.121938705444336, "rewards/rejected": -7.1917009353637695, "step": 3094 }, { "epoch": 0.5158333333333334, "grad_norm": 51.51656723022461, "learning_rate": 9.973010405159034e-08, "logits/chosen": 2.19480299949646, "logits/rejected": 2.2456114292144775, "logps/chosen": -11.539020538330078, "logps/rejected": -162.7054443359375, "loss": 0.5386, "nll_loss": 0.5245009064674377, "rewards/accuracies": 1.0, "rewards/chosen": 1.6902135610580444, "rewards/margins": 8.911197662353516, "rewards/rejected": -7.220983982086182, "step": 3095 }, { "epoch": 0.516, "grad_norm": 34.00577926635742, "learning_rate": 9.967612503491913e-08, "logits/chosen": 2.088197708129883, "logits/rejected": 2.154862880706787, "logps/chosen": -5.380980491638184, "logps/rejected": -70.76937103271484, "loss": 0.3193, "nll_loss": 0.19929558038711548, "rewards/accuracies": 1.0, "rewards/chosen": 0.7071530222892761, "rewards/margins": 3.0044875144958496, "rewards/rejected": -2.2973344326019287, "step": 3096 }, { "epoch": 0.5161666666666667, "grad_norm": 19.780624389648438, "learning_rate": 9.962214611261728e-08, "logits/chosen": 1.823746681213379, "logits/rejected": 2.9831864833831787, "logps/chosen": -164.6721649169922, "logps/rejected": -145.69349670410156, "loss": 0.9612, "nll_loss": 0.9518621563911438, "rewards/accuracies": 1.0, "rewards/chosen": 2.1960830688476562, "rewards/margins": 8.537983894348145, "rewards/rejected": -6.341900825500488, "step": 3097 }, { "epoch": 0.5163333333333333, "grad_norm": 41.20446014404297, "learning_rate": 9.956816730041301e-08, "logits/chosen": 3.147277355194092, "logits/rejected": 3.17973256111145, "logps/chosen": -55.62315368652344, "logps/rejected": -162.52020263671875, "loss": 0.9208, "nll_loss": 0.8427750468254089, "rewards/accuracies": 1.0, "rewards/chosen": 1.0293442010879517, "rewards/margins": 3.707869052886963, "rewards/rejected": -2.678524971008301, "step": 3098 }, { "epoch": 0.5165, "grad_norm": 33.52931594848633, "learning_rate": 9.951418861403439e-08, "logits/chosen": 2.856931209564209, "logits/rejected": 3.0522632598876953, "logps/chosen": -28.764667510986328, "logps/rejected": -186.21023559570312, "loss": 0.6887, "nll_loss": 0.6689457297325134, "rewards/accuracies": 1.0, "rewards/chosen": 1.8589709997177124, "rewards/margins": 6.211938858032227, "rewards/rejected": -4.352967739105225, "step": 3099 }, { "epoch": 0.5166666666666667, "grad_norm": 72.81642150878906, "learning_rate": 9.946021006920957e-08, "logits/chosen": 2.600963830947876, "logits/rejected": 2.480337142944336, "logps/chosen": -64.04359436035156, "logps/rejected": -117.1318588256836, "loss": 1.2779, "nll_loss": 1.049895167350769, "rewards/accuracies": 1.0, "rewards/chosen": -0.5348957180976868, "rewards/margins": 2.0920631885528564, "rewards/rejected": -2.6269588470458984, "step": 3100 }, { "epoch": 0.5168333333333334, "grad_norm": 26.973060607910156, "learning_rate": 9.940623168166655e-08, "logits/chosen": 2.5509257316589355, "logits/rejected": 2.5297954082489014, "logps/chosen": -29.787961959838867, "logps/rejected": -322.3359375, "loss": 0.5534, "nll_loss": 0.5319278836250305, "rewards/accuracies": 1.0, "rewards/chosen": 1.2108392715454102, "rewards/margins": 10.450640678405762, "rewards/rejected": -9.239801406860352, "step": 3101 }, { "epoch": 0.517, "grad_norm": 52.59492111206055, "learning_rate": 9.93522534671334e-08, "logits/chosen": 2.638557195663452, "logits/rejected": 2.6579689979553223, "logps/chosen": -21.947853088378906, "logps/rejected": -11.600053787231445, "loss": 1.4468, "nll_loss": 0.7838519215583801, "rewards/accuracies": 1.0, "rewards/chosen": 3.3635642528533936, "rewards/margins": 1.7452092170715332, "rewards/rejected": 1.6183550357818604, "step": 3102 }, { "epoch": 0.5171666666666667, "grad_norm": 27.094507217407227, "learning_rate": 9.929827544133799e-08, "logits/chosen": 3.1797308921813965, "logits/rejected": 3.3103058338165283, "logps/chosen": -40.38467025756836, "logps/rejected": -266.46343994140625, "loss": 0.5838, "nll_loss": 0.5532146096229553, "rewards/accuracies": 1.0, "rewards/chosen": 0.8473541736602783, "rewards/margins": 8.430010795593262, "rewards/rejected": -7.5826568603515625, "step": 3103 }, { "epoch": 0.5173333333333333, "grad_norm": 19.02665901184082, "learning_rate": 9.924429762000831e-08, "logits/chosen": 1.5363954305648804, "logits/rejected": 1.3447909355163574, "logps/chosen": -148.45034790039062, "logps/rejected": -150.14453125, "loss": 0.9207, "nll_loss": 0.9163602590560913, "rewards/accuracies": 1.0, "rewards/chosen": 2.9024460315704346, "rewards/margins": 10.820045471191406, "rewards/rejected": -7.917599201202393, "step": 3104 }, { "epoch": 0.5175, "grad_norm": 37.63683319091797, "learning_rate": 9.919032001887215e-08, "logits/chosen": 2.0477700233459473, "logits/rejected": 2.3356852531433105, "logps/chosen": -83.09898376464844, "logps/rejected": -162.7563018798828, "loss": 0.9929, "nll_loss": 0.9662672281265259, "rewards/accuracies": 1.0, "rewards/chosen": 1.65550696849823, "rewards/margins": 5.623515605926514, "rewards/rejected": -3.968008518218994, "step": 3105 }, { "epoch": 0.5176666666666667, "grad_norm": 23.457122802734375, "learning_rate": 9.91363426536573e-08, "logits/chosen": 2.13555645942688, "logits/rejected": 2.583073139190674, "logps/chosen": -102.34380340576172, "logps/rejected": -434.378662109375, "loss": 0.9482, "nll_loss": 0.9389339685440063, "rewards/accuracies": 1.0, "rewards/chosen": 2.104318380355835, "rewards/margins": 10.256659507751465, "rewards/rejected": -8.15234088897705, "step": 3106 }, { "epoch": 0.5178333333333334, "grad_norm": 29.665939331054688, "learning_rate": 9.908236554009145e-08, "logits/chosen": 2.8833200931549072, "logits/rejected": 3.0500845909118652, "logps/chosen": -49.19806671142578, "logps/rejected": -346.28497314453125, "loss": 0.7318, "nll_loss": 0.7130154967308044, "rewards/accuracies": 1.0, "rewards/chosen": 1.3506485223770142, "rewards/margins": 11.219761848449707, "rewards/rejected": -9.869112968444824, "step": 3107 }, { "epoch": 0.518, "grad_norm": 108.59429931640625, "learning_rate": 9.902838869390228e-08, "logits/chosen": 3.0078535079956055, "logits/rejected": 2.974486827850342, "logps/chosen": -39.69275665283203, "logps/rejected": -17.119159698486328, "loss": 1.1428, "nll_loss": 0.696364164352417, "rewards/accuracies": 1.0, "rewards/chosen": 2.11051344871521, "rewards/margins": 1.763084888458252, "rewards/rejected": 0.347428560256958, "step": 3108 }, { "epoch": 0.5181666666666667, "grad_norm": 31.59189796447754, "learning_rate": 9.897441213081729e-08, "logits/chosen": 2.1423401832580566, "logits/rejected": 2.5306949615478516, "logps/chosen": -11.545516967773438, "logps/rejected": -119.91377258300781, "loss": 0.4317, "nll_loss": 0.41233986616134644, "rewards/accuracies": 1.0, "rewards/chosen": 1.6200803518295288, "rewards/margins": 6.560787200927734, "rewards/rejected": -4.940706729888916, "step": 3109 }, { "epoch": 0.5183333333333333, "grad_norm": 30.89430809020996, "learning_rate": 9.8920435866564e-08, "logits/chosen": 2.2720470428466797, "logits/rejected": 2.5746803283691406, "logps/chosen": -81.02732849121094, "logps/rejected": -154.548095703125, "loss": 0.9744, "nll_loss": 0.9421781301498413, "rewards/accuracies": 1.0, "rewards/chosen": 0.9737548828125, "rewards/margins": 5.985403060913086, "rewards/rejected": -5.011648178100586, "step": 3110 }, { "epoch": 0.5185, "grad_norm": 23.3303165435791, "learning_rate": 9.886645991686976e-08, "logits/chosen": 2.0657269954681396, "logits/rejected": 1.8581569194793701, "logps/chosen": -40.56584167480469, "logps/rejected": -81.95914459228516, "loss": 0.5519, "nll_loss": 0.507072925567627, "rewards/accuracies": 1.0, "rewards/chosen": 1.4782612323760986, "rewards/margins": 4.661467552185059, "rewards/rejected": -3.183206558227539, "step": 3111 }, { "epoch": 0.5186666666666667, "grad_norm": 37.06785583496094, "learning_rate": 9.88124842974619e-08, "logits/chosen": 2.528319835662842, "logits/rejected": 2.545745849609375, "logps/chosen": -24.303251266479492, "logps/rejected": -48.665306091308594, "loss": 0.545, "nll_loss": 0.5063177943229675, "rewards/accuracies": 1.0, "rewards/chosen": 1.3860224485397339, "rewards/margins": 4.925653457641602, "rewards/rejected": -3.539630889892578, "step": 3112 }, { "epoch": 0.5188333333333334, "grad_norm": 49.813575744628906, "learning_rate": 9.87585090240676e-08, "logits/chosen": 2.040048599243164, "logits/rejected": 2.290679931640625, "logps/chosen": -15.485986709594727, "logps/rejected": -290.9420166015625, "loss": 0.7063, "nll_loss": 0.7039085030555725, "rewards/accuracies": 1.0, "rewards/chosen": 3.977031707763672, "rewards/margins": 10.272781372070312, "rewards/rejected": -6.295749664306641, "step": 3113 }, { "epoch": 0.519, "grad_norm": 866.597412109375, "learning_rate": 9.870453411241398e-08, "logits/chosen": 1.7580357789993286, "logits/rejected": 1.0541882514953613, "logps/chosen": -445.2255859375, "logps/rejected": -302.569580078125, "loss": 5.1504, "nll_loss": 1.3018290996551514, "rewards/accuracies": 0.0, "rewards/chosen": -8.579363822937012, "rewards/margins": -2.8503966331481934, "rewards/rejected": -5.728967189788818, "step": 3114 }, { "epoch": 0.5191666666666667, "grad_norm": 28.75353240966797, "learning_rate": 9.865055957822801e-08, "logits/chosen": 2.9176411628723145, "logits/rejected": 3.2017674446105957, "logps/chosen": -73.75462341308594, "logps/rejected": -248.22901916503906, "loss": 0.7834, "nll_loss": 0.7682772278785706, "rewards/accuracies": 1.0, "rewards/chosen": 1.589937686920166, "rewards/margins": 9.419136047363281, "rewards/rejected": -7.829198360443115, "step": 3115 }, { "epoch": 0.5193333333333333, "grad_norm": 32.84161376953125, "learning_rate": 9.859658543723658e-08, "logits/chosen": 0.8606443405151367, "logits/rejected": 2.4291837215423584, "logps/chosen": -30.810901641845703, "logps/rejected": -290.57171630859375, "loss": 0.6495, "nll_loss": 0.6041353344917297, "rewards/accuracies": 1.0, "rewards/chosen": 0.4013347625732422, "rewards/margins": 8.853137016296387, "rewards/rejected": -8.451802253723145, "step": 3116 }, { "epoch": 0.5195, "grad_norm": 39.44517517089844, "learning_rate": 9.854261170516647e-08, "logits/chosen": 2.588385820388794, "logits/rejected": 2.5773191452026367, "logps/chosen": -87.24299621582031, "logps/rejected": -56.181278228759766, "loss": 1.4451, "nll_loss": 1.407145380973816, "rewards/accuracies": 1.0, "rewards/chosen": 1.8819160461425781, "rewards/margins": 5.029477119445801, "rewards/rejected": -3.1475613117218018, "step": 3117 }, { "epoch": 0.5196666666666667, "grad_norm": 27.59984016418457, "learning_rate": 9.84886383977443e-08, "logits/chosen": 2.5868959426879883, "logits/rejected": 2.832505464553833, "logps/chosen": -66.732666015625, "logps/rejected": -301.6588134765625, "loss": 1.0484, "nll_loss": 1.0426979064941406, "rewards/accuracies": 1.0, "rewards/chosen": 5.029058933258057, "rewards/margins": 9.649276733398438, "rewards/rejected": -4.620217800140381, "step": 3118 }, { "epoch": 0.5198333333333334, "grad_norm": 21.703380584716797, "learning_rate": 9.843466553069664e-08, "logits/chosen": 1.344268798828125, "logits/rejected": 1.9205965995788574, "logps/chosen": -73.53514862060547, "logps/rejected": -162.1480255126953, "loss": 0.7626, "nll_loss": 0.7503586411476135, "rewards/accuracies": 1.0, "rewards/chosen": 1.816805362701416, "rewards/margins": 9.489389419555664, "rewards/rejected": -7.672584056854248, "step": 3119 }, { "epoch": 0.52, "grad_norm": 23.242816925048828, "learning_rate": 9.838069311974985e-08, "logits/chosen": 1.860138177871704, "logits/rejected": 2.3307886123657227, "logps/chosen": -51.016578674316406, "logps/rejected": -163.7606201171875, "loss": 0.5962, "nll_loss": 0.5732201337814331, "rewards/accuracies": 1.0, "rewards/chosen": 1.599250078201294, "rewards/margins": 6.011549949645996, "rewards/rejected": -4.412299633026123, "step": 3120 }, { "epoch": 0.5201666666666667, "grad_norm": 41.296695709228516, "learning_rate": 9.832672118063022e-08, "logits/chosen": 2.5811290740966797, "logits/rejected": 2.6417834758758545, "logps/chosen": -48.778541564941406, "logps/rejected": -94.57119750976562, "loss": 1.0213, "nll_loss": 0.9755707383155823, "rewards/accuracies": 1.0, "rewards/chosen": 1.0985348224639893, "rewards/margins": 4.661972522735596, "rewards/rejected": -3.5634377002716064, "step": 3121 }, { "epoch": 0.5203333333333333, "grad_norm": 76.32684326171875, "learning_rate": 9.827274972906385e-08, "logits/chosen": 2.843234062194824, "logits/rejected": 2.7763986587524414, "logps/chosen": -99.1376953125, "logps/rejected": -40.49317932128906, "loss": 1.4768, "nll_loss": 1.2549077272415161, "rewards/accuracies": 1.0, "rewards/chosen": -0.12078094482421875, "rewards/margins": 2.04124116897583, "rewards/rejected": -2.162022113800049, "step": 3122 }, { "epoch": 0.5205, "grad_norm": 122.05828094482422, "learning_rate": 9.821877878077676e-08, "logits/chosen": 3.195718288421631, "logits/rejected": 3.01471209526062, "logps/chosen": -144.89651489257812, "logps/rejected": -30.28977394104004, "loss": 2.1484, "nll_loss": 1.4636011123657227, "rewards/accuracies": 1.0, "rewards/chosen": 2.009385824203491, "rewards/margins": 0.9949344396591187, "rewards/rejected": 1.0144513845443726, "step": 3123 }, { "epoch": 0.5206666666666667, "grad_norm": 22.554351806640625, "learning_rate": 9.816480835149477e-08, "logits/chosen": 1.0645787715911865, "logits/rejected": 1.9112571477890015, "logps/chosen": -40.66524124145508, "logps/rejected": -322.97186279296875, "loss": 0.5625, "nll_loss": 0.5350689888000488, "rewards/accuracies": 1.0, "rewards/chosen": 1.3993358612060547, "rewards/margins": 5.716581344604492, "rewards/rejected": -4.3172454833984375, "step": 3124 }, { "epoch": 0.5208333333333334, "grad_norm": 395.7938232421875, "learning_rate": 9.811083845694356e-08, "logits/chosen": 2.5226545333862305, "logits/rejected": 2.277419328689575, "logps/chosen": -156.84800720214844, "logps/rejected": -115.8399887084961, "loss": 2.2936, "nll_loss": 1.2962645292282104, "rewards/accuracies": 1.0, "rewards/chosen": -3.2645602226257324, "rewards/margins": 0.13327383995056152, "rewards/rejected": -3.397834062576294, "step": 3125 }, { "epoch": 0.521, "grad_norm": 27.679357528686523, "learning_rate": 9.805686911284865e-08, "logits/chosen": 1.2246837615966797, "logits/rejected": 1.9769952297210693, "logps/chosen": -22.77539825439453, "logps/rejected": -179.5778045654297, "loss": 0.4832, "nll_loss": 0.47448742389678955, "rewards/accuracies": 1.0, "rewards/chosen": 2.9927804470062256, "rewards/margins": 7.721996307373047, "rewards/rejected": -4.7292160987854, "step": 3126 }, { "epoch": 0.5211666666666667, "grad_norm": 36.75009536743164, "learning_rate": 9.800290033493546e-08, "logits/chosen": 2.5112388134002686, "logits/rejected": 2.7085659503936768, "logps/chosen": -41.91661071777344, "logps/rejected": -239.07656860351562, "loss": 0.8636, "nll_loss": 0.8218944072723389, "rewards/accuracies": 1.0, "rewards/chosen": 0.5329437255859375, "rewards/margins": 6.9111480712890625, "rewards/rejected": -6.378204345703125, "step": 3127 }, { "epoch": 0.5213333333333333, "grad_norm": 35.59749221801758, "learning_rate": 9.794893213892916e-08, "logits/chosen": 2.778144359588623, "logits/rejected": 2.8995308876037598, "logps/chosen": -31.713293075561523, "logps/rejected": -131.26617431640625, "loss": 0.8627, "nll_loss": 0.8571160435676575, "rewards/accuracies": 1.0, "rewards/chosen": 2.9491305351257324, "rewards/margins": 8.776593208312988, "rewards/rejected": -5.827462673187256, "step": 3128 }, { "epoch": 0.5215, "grad_norm": 19.73208236694336, "learning_rate": 9.789496454055481e-08, "logits/chosen": 0.8345015645027161, "logits/rejected": 2.14656925201416, "logps/chosen": -23.184595108032227, "logps/rejected": -354.22845458984375, "loss": 0.3501, "nll_loss": 0.32200828194618225, "rewards/accuracies": 1.0, "rewards/chosen": 0.991933286190033, "rewards/margins": 7.074110984802246, "rewards/rejected": -6.082177639007568, "step": 3129 }, { "epoch": 0.5216666666666666, "grad_norm": 16.049488067626953, "learning_rate": 9.784099755553722e-08, "logits/chosen": 1.4598294496536255, "logits/rejected": 1.8454986810684204, "logps/chosen": -158.1644287109375, "logps/rejected": -283.5997314453125, "loss": 0.6549, "nll_loss": 0.6455691456794739, "rewards/accuracies": 1.0, "rewards/chosen": 2.06144118309021, "rewards/margins": 12.898614883422852, "rewards/rejected": -10.837173461914062, "step": 3130 }, { "epoch": 0.5218333333333334, "grad_norm": 24.40064811706543, "learning_rate": 9.778703119960114e-08, "logits/chosen": 3.247483253479004, "logits/rejected": 3.51163649559021, "logps/chosen": -43.41325378417969, "logps/rejected": -316.21209716796875, "loss": 0.601, "nll_loss": 0.5947021245956421, "rewards/accuracies": 1.0, "rewards/chosen": 2.626340627670288, "rewards/margins": 9.052088737487793, "rewards/rejected": -6.425747871398926, "step": 3131 }, { "epoch": 0.522, "grad_norm": 93.15252685546875, "learning_rate": 9.7733065488471e-08, "logits/chosen": 2.522615909576416, "logits/rejected": 2.5970842838287354, "logps/chosen": -23.610553741455078, "logps/rejected": -133.39401245117188, "loss": 1.1935, "nll_loss": 1.1805275678634644, "rewards/accuracies": 1.0, "rewards/chosen": 1.9198064804077148, "rewards/margins": 7.600185871124268, "rewards/rejected": -5.680379390716553, "step": 3132 }, { "epoch": 0.5221666666666667, "grad_norm": 23.468955993652344, "learning_rate": 9.767910043787117e-08, "logits/chosen": 2.6050405502319336, "logits/rejected": 2.9248998165130615, "logps/chosen": -42.34473419189453, "logps/rejected": -313.6321105957031, "loss": 0.6309, "nll_loss": 0.6227166056632996, "rewards/accuracies": 1.0, "rewards/chosen": 2.2527847290039062, "rewards/margins": 9.476051330566406, "rewards/rejected": -7.223267078399658, "step": 3133 }, { "epoch": 0.5223333333333333, "grad_norm": 33.38245391845703, "learning_rate": 9.762513606352577e-08, "logits/chosen": 1.6260100603103638, "logits/rejected": 1.8220926523208618, "logps/chosen": -60.240020751953125, "logps/rejected": -94.08784484863281, "loss": 0.7422, "nll_loss": 0.6693335771560669, "rewards/accuracies": 1.0, "rewards/chosen": 1.8191642761230469, "rewards/margins": 4.1029462814331055, "rewards/rejected": -2.2837822437286377, "step": 3134 }, { "epoch": 0.5225, "grad_norm": 26.995594024658203, "learning_rate": 9.75711723811587e-08, "logits/chosen": 2.9541399478912354, "logits/rejected": 3.173218011856079, "logps/chosen": -55.950382232666016, "logps/rejected": -231.21482849121094, "loss": 0.7741, "nll_loss": 0.7560862898826599, "rewards/accuracies": 1.0, "rewards/chosen": 1.5332852602005005, "rewards/margins": 7.2025346755981445, "rewards/rejected": -5.669249534606934, "step": 3135 }, { "epoch": 0.5226666666666666, "grad_norm": 203.21600341796875, "learning_rate": 9.751720940649368e-08, "logits/chosen": 3.3388683795928955, "logits/rejected": 3.436072826385498, "logps/chosen": -94.84414672851562, "logps/rejected": -94.87419128417969, "loss": 2.2034, "nll_loss": 1.2992349863052368, "rewards/accuracies": 1.0, "rewards/chosen": -2.7852866649627686, "rewards/margins": 0.1776726245880127, "rewards/rejected": -2.9629592895507812, "step": 3136 }, { "epoch": 0.5228333333333334, "grad_norm": 198.52828979492188, "learning_rate": 9.746324715525423e-08, "logits/chosen": 3.01965594291687, "logits/rejected": 2.78257417678833, "logps/chosen": -56.310394287109375, "logps/rejected": -18.718788146972656, "loss": 2.9709, "nll_loss": 0.9082322120666504, "rewards/accuracies": 0.0, "rewards/chosen": 0.42342185974121094, "rewards/margins": -1.578564167022705, "rewards/rejected": 2.001986026763916, "step": 3137 }, { "epoch": 0.523, "grad_norm": 104.1854248046875, "learning_rate": 9.740928564316367e-08, "logits/chosen": 1.0499802827835083, "logits/rejected": 2.020170211791992, "logps/chosen": -63.55120086669922, "logps/rejected": -220.29443359375, "loss": 1.9754, "nll_loss": 1.7175999879837036, "rewards/accuracies": 1.0, "rewards/chosen": -1.9391118288040161, "rewards/margins": 6.864355087280273, "rewards/rejected": -8.803466796875, "step": 3138 }, { "epoch": 0.5231666666666667, "grad_norm": 30.197734832763672, "learning_rate": 9.735532488594507e-08, "logits/chosen": 2.0821518898010254, "logits/rejected": 2.3039989471435547, "logps/chosen": -80.55599975585938, "logps/rejected": -148.85169982910156, "loss": 1.1115, "nll_loss": 1.1035066843032837, "rewards/accuracies": 1.0, "rewards/chosen": 2.5883820056915283, "rewards/margins": 8.079671859741211, "rewards/rejected": -5.4912896156311035, "step": 3139 }, { "epoch": 0.5233333333333333, "grad_norm": 144.1348114013672, "learning_rate": 9.730136489932131e-08, "logits/chosen": 2.5632474422454834, "logits/rejected": 2.540940999984741, "logps/chosen": -67.2942123413086, "logps/rejected": -26.614286422729492, "loss": 2.1021, "nll_loss": 0.9346418380737305, "rewards/accuracies": 0.0, "rewards/chosen": 0.6974037885665894, "rewards/margins": -0.3446049690246582, "rewards/rejected": 1.0420087575912476, "step": 3140 }, { "epoch": 0.5235, "grad_norm": 39.59721374511719, "learning_rate": 9.724740569901502e-08, "logits/chosen": 0.12465488910675049, "logits/rejected": 1.3607953786849976, "logps/chosen": -21.306787490844727, "logps/rejected": -439.204833984375, "loss": 0.5839, "nll_loss": 0.5758591294288635, "rewards/accuracies": 1.0, "rewards/chosen": 2.2528295516967773, "rewards/margins": 10.044290542602539, "rewards/rejected": -7.79146146774292, "step": 3141 }, { "epoch": 0.5236666666666666, "grad_norm": 110.82203674316406, "learning_rate": 9.719344730074864e-08, "logits/chosen": 2.491515874862671, "logits/rejected": 2.7458395957946777, "logps/chosen": -41.273643493652344, "logps/rejected": -80.84593963623047, "loss": 1.3765, "nll_loss": 0.5653923749923706, "rewards/accuracies": 1.0, "rewards/chosen": 2.004415988922119, "rewards/margins": 0.693932294845581, "rewards/rejected": 1.310483694076538, "step": 3142 }, { "epoch": 0.5238333333333334, "grad_norm": 42.12324905395508, "learning_rate": 9.713948972024433e-08, "logits/chosen": 1.1794594526290894, "logits/rejected": 1.8962335586547852, "logps/chosen": -48.20848846435547, "logps/rejected": -165.19577026367188, "loss": 0.8677, "nll_loss": 0.8170931339263916, "rewards/accuracies": 1.0, "rewards/chosen": 0.5022537112236023, "rewards/margins": 5.154479026794434, "rewards/rejected": -4.652225494384766, "step": 3143 }, { "epoch": 0.524, "grad_norm": 31.699392318725586, "learning_rate": 9.708553297322404e-08, "logits/chosen": 3.1479969024658203, "logits/rejected": 3.1275036334991455, "logps/chosen": -45.171363830566406, "logps/rejected": -142.78317260742188, "loss": 0.6379, "nll_loss": 0.6104238629341125, "rewards/accuracies": 1.0, "rewards/chosen": 2.2882354259490967, "rewards/margins": 5.674429893493652, "rewards/rejected": -3.3861942291259766, "step": 3144 }, { "epoch": 0.5241666666666667, "grad_norm": 19.605796813964844, "learning_rate": 9.703157707540948e-08, "logits/chosen": 2.084275245666504, "logits/rejected": 2.082937002182007, "logps/chosen": -88.85668182373047, "logps/rejected": -182.92855834960938, "loss": 0.7678, "nll_loss": 0.7594587802886963, "rewards/accuracies": 1.0, "rewards/chosen": 2.2622909545898438, "rewards/margins": 9.143851280212402, "rewards/rejected": -6.881560325622559, "step": 3145 }, { "epoch": 0.5243333333333333, "grad_norm": 29.528549194335938, "learning_rate": 9.69776220425221e-08, "logits/chosen": 2.752497911453247, "logits/rejected": 3.0026862621307373, "logps/chosen": -14.799823760986328, "logps/rejected": -300.0361022949219, "loss": 0.3956, "nll_loss": 0.3699955940246582, "rewards/accuracies": 1.0, "rewards/chosen": 1.2254265546798706, "rewards/margins": 6.325515270233154, "rewards/rejected": -5.100088596343994, "step": 3146 }, { "epoch": 0.5245, "grad_norm": 85.84642028808594, "learning_rate": 9.692366789028307e-08, "logits/chosen": 2.874271869659424, "logits/rejected": 2.943122625350952, "logps/chosen": -69.83605194091797, "logps/rejected": -71.25212097167969, "loss": 1.6301, "nll_loss": 1.1639341115951538, "rewards/accuracies": 1.0, "rewards/chosen": 2.102665662765503, "rewards/margins": 1.6932127475738525, "rewards/rejected": 0.4094528555870056, "step": 3147 }, { "epoch": 0.5246666666666666, "grad_norm": 204.88604736328125, "learning_rate": 9.686971463441339e-08, "logits/chosen": 2.926349639892578, "logits/rejected": 2.718235969543457, "logps/chosen": -317.1851806640625, "logps/rejected": -85.8513412475586, "loss": 2.435, "nll_loss": 1.8021880388259888, "rewards/accuracies": 1.0, "rewards/chosen": -3.404904365539551, "rewards/margins": 1.7540159225463867, "rewards/rejected": -5.1589202880859375, "step": 3148 }, { "epoch": 0.5248333333333334, "grad_norm": 46.2031135559082, "learning_rate": 9.681576229063369e-08, "logits/chosen": 2.1279456615448, "logits/rejected": 2.167649984359741, "logps/chosen": -57.33053970336914, "logps/rejected": -186.09408569335938, "loss": 1.1762, "nll_loss": 1.1466107368469238, "rewards/accuracies": 1.0, "rewards/chosen": 1.010470986366272, "rewards/margins": 6.34001350402832, "rewards/rejected": -5.329542636871338, "step": 3149 }, { "epoch": 0.525, "grad_norm": 28.478897094726562, "learning_rate": 9.676181087466443e-08, "logits/chosen": 3.4246602058410645, "logits/rejected": 3.3490378856658936, "logps/chosen": -57.0711784362793, "logps/rejected": -132.36463928222656, "loss": 0.7732, "nll_loss": 0.7609490752220154, "rewards/accuracies": 1.0, "rewards/chosen": 2.825165271759033, "rewards/margins": 7.10981559753418, "rewards/rejected": -4.2846503257751465, "step": 3150 }, { "epoch": 0.5251666666666667, "grad_norm": 25.195589065551758, "learning_rate": 9.670786040222572e-08, "logits/chosen": 2.389101505279541, "logits/rejected": 2.8080954551696777, "logps/chosen": -54.12519454956055, "logps/rejected": -246.02615356445312, "loss": 0.7434, "nll_loss": 0.7314215898513794, "rewards/accuracies": 1.0, "rewards/chosen": 2.155975818634033, "rewards/margins": 7.336238861083984, "rewards/rejected": -5.180263042449951, "step": 3151 }, { "epoch": 0.5253333333333333, "grad_norm": 25.677753448486328, "learning_rate": 9.665391088903749e-08, "logits/chosen": 1.4791371822357178, "logits/rejected": 1.7507516145706177, "logps/chosen": -72.63233184814453, "logps/rejected": -119.73087310791016, "loss": 0.7513, "nll_loss": 0.7336598634719849, "rewards/accuracies": 1.0, "rewards/chosen": 1.9439048767089844, "rewards/margins": 6.434688091278076, "rewards/rejected": -4.490783214569092, "step": 3152 }, { "epoch": 0.5255, "grad_norm": 28.969511032104492, "learning_rate": 9.659996235081925e-08, "logits/chosen": 2.8022680282592773, "logits/rejected": 2.9525272846221924, "logps/chosen": -37.668601989746094, "logps/rejected": -386.376953125, "loss": 0.6928, "nll_loss": 0.6726536750793457, "rewards/accuracies": 1.0, "rewards/chosen": 1.261941909790039, "rewards/margins": 14.234173774719238, "rewards/rejected": -12.9722318649292, "step": 3153 }, { "epoch": 0.5256666666666666, "grad_norm": 39.69307327270508, "learning_rate": 9.654601480329036e-08, "logits/chosen": 2.552772283554077, "logits/rejected": 2.616638422012329, "logps/chosen": -17.323617935180664, "logps/rejected": -40.58829879760742, "loss": 0.5525, "nll_loss": 0.4225272536277771, "rewards/accuracies": 1.0, "rewards/chosen": 1.2873624563217163, "rewards/margins": 3.0716490745544434, "rewards/rejected": -1.7842867374420166, "step": 3154 }, { "epoch": 0.5258333333333334, "grad_norm": 45.53044891357422, "learning_rate": 9.649206826216986e-08, "logits/chosen": 2.0937509536743164, "logits/rejected": 2.8268604278564453, "logps/chosen": -40.621612548828125, "logps/rejected": -226.28765869140625, "loss": 0.9747, "nll_loss": 0.9232184886932373, "rewards/accuracies": 1.0, "rewards/chosen": 0.3236698508262634, "rewards/margins": 6.093128204345703, "rewards/rejected": -5.769458293914795, "step": 3155 }, { "epoch": 0.526, "grad_norm": 25.067304611206055, "learning_rate": 9.643812274317644e-08, "logits/chosen": 3.92262864112854, "logits/rejected": 3.873772144317627, "logps/chosen": -27.645490646362305, "logps/rejected": -84.8193359375, "loss": 0.4708, "nll_loss": 0.4458950459957123, "rewards/accuracies": 1.0, "rewards/chosen": 2.2505452632904053, "rewards/margins": 5.800380706787109, "rewards/rejected": -3.549835443496704, "step": 3156 }, { "epoch": 0.5261666666666667, "grad_norm": 25.612749099731445, "learning_rate": 9.638417826202854e-08, "logits/chosen": 0.9755786657333374, "logits/rejected": 1.9991724491119385, "logps/chosen": -40.073238372802734, "logps/rejected": -195.80136108398438, "loss": 0.6024, "nll_loss": 0.5893121957778931, "rewards/accuracies": 1.0, "rewards/chosen": 1.7415608167648315, "rewards/margins": 9.23120403289795, "rewards/rejected": -7.489643096923828, "step": 3157 }, { "epoch": 0.5263333333333333, "grad_norm": 42.57163619995117, "learning_rate": 9.633023483444428e-08, "logits/chosen": 1.9988340139389038, "logits/rejected": 2.7973575592041016, "logps/chosen": -52.12455749511719, "logps/rejected": -177.2340545654297, "loss": 1.1119, "nll_loss": 1.063766598701477, "rewards/accuracies": 1.0, "rewards/chosen": 0.3470272123813629, "rewards/margins": 7.144375324249268, "rewards/rejected": -6.7973480224609375, "step": 3158 }, { "epoch": 0.5265, "grad_norm": 26.866884231567383, "learning_rate": 9.62762924761415e-08, "logits/chosen": 2.019984483718872, "logits/rejected": 1.337994933128357, "logps/chosen": -82.41783905029297, "logps/rejected": -35.119483947753906, "loss": 0.8296, "nll_loss": 0.7561269402503967, "rewards/accuracies": 1.0, "rewards/chosen": 1.328881859779358, "rewards/margins": 3.873056411743164, "rewards/rejected": -2.5441746711730957, "step": 3159 }, { "epoch": 0.5266666666666666, "grad_norm": 208.5548553466797, "learning_rate": 9.622235120283769e-08, "logits/chosen": 2.6021463871002197, "logits/rejected": 2.7753241062164307, "logps/chosen": -42.98151779174805, "logps/rejected": -26.371089935302734, "loss": 2.4821, "nll_loss": 1.023369550704956, "rewards/accuracies": 0.0, "rewards/chosen": 0.42226147651672363, "rewards/margins": -0.8219634294509888, "rewards/rejected": 1.2442249059677124, "step": 3160 }, { "epoch": 0.5268333333333334, "grad_norm": 41.05872344970703, "learning_rate": 9.616841103025003e-08, "logits/chosen": 2.8639485836029053, "logits/rejected": 2.9663476943969727, "logps/chosen": -109.74551391601562, "logps/rejected": -219.68539428710938, "loss": 1.3886, "nll_loss": 1.338360071182251, "rewards/accuracies": 1.0, "rewards/chosen": 0.3235069215297699, "rewards/margins": 6.505502223968506, "rewards/rejected": -6.181995391845703, "step": 3161 }, { "epoch": 0.527, "grad_norm": 27.857898712158203, "learning_rate": 9.611447197409542e-08, "logits/chosen": 2.8720431327819824, "logits/rejected": 3.0115842819213867, "logps/chosen": -22.65353012084961, "logps/rejected": -138.77255249023438, "loss": 0.4799, "nll_loss": 0.46231698989868164, "rewards/accuracies": 1.0, "rewards/chosen": 1.5649147033691406, "rewards/margins": 7.19962739944458, "rewards/rejected": -5.6347126960754395, "step": 3162 }, { "epoch": 0.5271666666666667, "grad_norm": 57.03451919555664, "learning_rate": 9.606053405009041e-08, "logits/chosen": 2.3485662937164307, "logits/rejected": 1.798711895942688, "logps/chosen": -49.42110061645508, "logps/rejected": -31.379913330078125, "loss": 0.9446, "nll_loss": 0.7603247165679932, "rewards/accuracies": 1.0, "rewards/chosen": 2.355894088745117, "rewards/margins": 3.25589919090271, "rewards/rejected": -0.900005042552948, "step": 3163 }, { "epoch": 0.5273333333333333, "grad_norm": 51.1025505065918, "learning_rate": 9.600659727395118e-08, "logits/chosen": 1.798222541809082, "logits/rejected": 2.7982444763183594, "logps/chosen": -53.34099197387695, "logps/rejected": -308.80706787109375, "loss": 1.2837, "nll_loss": 1.2700237035751343, "rewards/accuracies": 1.0, "rewards/chosen": 1.814378023147583, "rewards/margins": 7.722773551940918, "rewards/rejected": -5.908395767211914, "step": 3164 }, { "epoch": 0.5275, "grad_norm": 42.25105285644531, "learning_rate": 9.595266166139365e-08, "logits/chosen": 2.0574564933776855, "logits/rejected": 1.5950931310653687, "logps/chosen": -76.28662872314453, "logps/rejected": -65.39244842529297, "loss": 0.8661, "nll_loss": 0.6464967131614685, "rewards/accuracies": 1.0, "rewards/chosen": 2.5259180068969727, "rewards/margins": 3.1335535049438477, "rewards/rejected": -0.607635498046875, "step": 3165 }, { "epoch": 0.5276666666666666, "grad_norm": 37.538143157958984, "learning_rate": 9.589872722813332e-08, "logits/chosen": 2.8417274951934814, "logits/rejected": 2.697683334350586, "logps/chosen": -63.18279266357422, "logps/rejected": -77.8978042602539, "loss": 1.0593, "nll_loss": 1.019077181816101, "rewards/accuracies": 1.0, "rewards/chosen": 1.9208389520645142, "rewards/margins": 4.963040351867676, "rewards/rejected": -3.042201519012451, "step": 3166 }, { "epoch": 0.5278333333333334, "grad_norm": 25.576948165893555, "learning_rate": 9.584479398988544e-08, "logits/chosen": 1.8958494663238525, "logits/rejected": 2.2118570804595947, "logps/chosen": -43.94288635253906, "logps/rejected": -162.2838134765625, "loss": 0.5671, "nll_loss": 0.5492861270904541, "rewards/accuracies": 1.0, "rewards/chosen": 1.4765328168869019, "rewards/margins": 7.724920749664307, "rewards/rejected": -6.248387813568115, "step": 3167 }, { "epoch": 0.528, "grad_norm": 23.607309341430664, "learning_rate": 9.579086196236481e-08, "logits/chosen": 2.832531213760376, "logits/rejected": 2.7769951820373535, "logps/chosen": -29.550922393798828, "logps/rejected": -106.66236877441406, "loss": 0.4874, "nll_loss": 0.4546296000480652, "rewards/accuracies": 1.0, "rewards/chosen": 1.1265414953231812, "rewards/margins": 5.499306678771973, "rewards/rejected": -4.372765064239502, "step": 3168 }, { "epoch": 0.5281666666666667, "grad_norm": 29.31216049194336, "learning_rate": 9.573693116128599e-08, "logits/chosen": 2.612170696258545, "logits/rejected": 2.7675323486328125, "logps/chosen": -5.860146522521973, "logps/rejected": -66.88204956054688, "loss": 0.2347, "nll_loss": 0.2253902107477188, "rewards/accuracies": 1.0, "rewards/chosen": 2.522672653198242, "rewards/margins": 7.699843883514404, "rewards/rejected": -5.177171230316162, "step": 3169 }, { "epoch": 0.5283333333333333, "grad_norm": 26.310083389282227, "learning_rate": 9.568300160236303e-08, "logits/chosen": 1.55624520778656, "logits/rejected": 2.002261161804199, "logps/chosen": -18.611787796020508, "logps/rejected": -133.9369659423828, "loss": 0.4191, "nll_loss": 0.38774558901786804, "rewards/accuracies": 1.0, "rewards/chosen": 3.2172372341156006, "rewards/margins": 6.148257732391357, "rewards/rejected": -2.931020498275757, "step": 3170 }, { "epoch": 0.5285, "grad_norm": 23.414691925048828, "learning_rate": 9.56290733013098e-08, "logits/chosen": 0.4055666923522949, "logits/rejected": 1.950668215751648, "logps/chosen": -17.212175369262695, "logps/rejected": -334.38702392578125, "loss": 0.337, "nll_loss": 0.3187439441680908, "rewards/accuracies": 1.0, "rewards/chosen": 1.366883397102356, "rewards/margins": 10.649401664733887, "rewards/rejected": -9.28251838684082, "step": 3171 }, { "epoch": 0.5286666666666666, "grad_norm": 21.635391235351562, "learning_rate": 9.557514627383966e-08, "logits/chosen": 1.6940730810165405, "logits/rejected": 2.4722890853881836, "logps/chosen": -72.5979232788086, "logps/rejected": -258.2078857421875, "loss": 0.798, "nll_loss": 0.7891076803207397, "rewards/accuracies": 1.0, "rewards/chosen": 2.1186134815216064, "rewards/margins": 10.837396621704102, "rewards/rejected": -8.718783378601074, "step": 3172 }, { "epoch": 0.5288333333333334, "grad_norm": 106.79106903076172, "learning_rate": 9.552122053566566e-08, "logits/chosen": 2.6192445755004883, "logits/rejected": 2.3848726749420166, "logps/chosen": -129.40220642089844, "logps/rejected": -10.24294662475586, "loss": 2.0479, "nll_loss": 0.958534836769104, "rewards/accuracies": 1.0, "rewards/chosen": 3.1698975563049316, "rewards/margins": 0.5584466457366943, "rewards/rejected": 2.6114509105682373, "step": 3173 }, { "epoch": 0.529, "grad_norm": 23.663143157958984, "learning_rate": 9.546729610250048e-08, "logits/chosen": 1.76694655418396, "logits/rejected": 2.3225972652435303, "logps/chosen": -54.697975158691406, "logps/rejected": -220.70761108398438, "loss": 0.6854, "nll_loss": 0.675283670425415, "rewards/accuracies": 1.0, "rewards/chosen": 2.0104920864105225, "rewards/margins": 9.44092082977295, "rewards/rejected": -7.430428981781006, "step": 3174 }, { "epoch": 0.5291666666666667, "grad_norm": 33.08985137939453, "learning_rate": 9.541337299005639e-08, "logits/chosen": 1.4936292171478271, "logits/rejected": 2.390239953994751, "logps/chosen": -20.934999465942383, "logps/rejected": -190.37625122070312, "loss": 0.5761, "nll_loss": 0.5658107995986938, "rewards/accuracies": 1.0, "rewards/chosen": 2.114888906478882, "rewards/margins": 8.202467918395996, "rewards/rejected": -6.087579250335693, "step": 3175 }, { "epoch": 0.5293333333333333, "grad_norm": 29.657730102539062, "learning_rate": 9.53594512140453e-08, "logits/chosen": 3.0179529190063477, "logits/rejected": 3.0870752334594727, "logps/chosen": -62.13317108154297, "logps/rejected": -171.04925537109375, "loss": 0.7875, "nll_loss": 0.7670761346817017, "rewards/accuracies": 1.0, "rewards/chosen": 1.5706748962402344, "rewards/margins": 6.421086311340332, "rewards/rejected": -4.850411415100098, "step": 3176 }, { "epoch": 0.5295, "grad_norm": 36.25614547729492, "learning_rate": 9.530553079017872e-08, "logits/chosen": 2.743093729019165, "logits/rejected": 2.9988420009613037, "logps/chosen": -30.12071990966797, "logps/rejected": -326.4308166503906, "loss": 0.7658, "nll_loss": 0.7346517443656921, "rewards/accuracies": 1.0, "rewards/chosen": 0.81732177734375, "rewards/margins": 8.053174018859863, "rewards/rejected": -7.235852241516113, "step": 3177 }, { "epoch": 0.5296666666666666, "grad_norm": 26.01654052734375, "learning_rate": 9.525161173416777e-08, "logits/chosen": 2.642390012741089, "logits/rejected": 2.7190091609954834, "logps/chosen": -27.338714599609375, "logps/rejected": -115.48634338378906, "loss": 0.4623, "nll_loss": 0.44817566871643066, "rewards/accuracies": 1.0, "rewards/chosen": 1.7525086402893066, "rewards/margins": 7.79109525680542, "rewards/rejected": -6.038586616516113, "step": 3178 }, { "epoch": 0.5298333333333334, "grad_norm": 19.45937728881836, "learning_rate": 9.519769406172315e-08, "logits/chosen": 2.141868829727173, "logits/rejected": 2.29914927482605, "logps/chosen": -68.73294067382812, "logps/rejected": -128.53294372558594, "loss": 0.6795, "nll_loss": 0.6673101186752319, "rewards/accuracies": 1.0, "rewards/chosen": 2.249350070953369, "rewards/margins": 7.185577392578125, "rewards/rejected": -4.936227321624756, "step": 3179 }, { "epoch": 0.53, "grad_norm": 29.15556526184082, "learning_rate": 9.51437777885552e-08, "logits/chosen": 1.476717472076416, "logits/rejected": 2.114452362060547, "logps/chosen": -106.13043212890625, "logps/rejected": -270.2191162109375, "loss": 1.0968, "nll_loss": 1.082963466644287, "rewards/accuracies": 1.0, "rewards/chosen": 1.6793098449707031, "rewards/margins": 9.280145645141602, "rewards/rejected": -7.600836277008057, "step": 3180 }, { "epoch": 0.5301666666666667, "grad_norm": 223.6388397216797, "learning_rate": 9.508986293037378e-08, "logits/chosen": 2.6991865634918213, "logits/rejected": 2.6642420291900635, "logps/chosen": -49.873329162597656, "logps/rejected": -72.14341735839844, "loss": 2.6693, "nll_loss": 0.7024413347244263, "rewards/accuracies": 0.0, "rewards/chosen": 1.6228455305099487, "rewards/margins": -1.2198227643966675, "rewards/rejected": 2.842668294906616, "step": 3181 }, { "epoch": 0.5303333333333333, "grad_norm": 108.28517150878906, "learning_rate": 9.503594950288843e-08, "logits/chosen": 2.348046064376831, "logits/rejected": 2.170879364013672, "logps/chosen": -21.745845794677734, "logps/rejected": -68.2125244140625, "loss": 1.1566, "nll_loss": 0.5177581310272217, "rewards/accuracies": 1.0, "rewards/chosen": 1.6353965997695923, "rewards/margins": 0.9633018374443054, "rewards/rejected": 0.6720947623252869, "step": 3182 }, { "epoch": 0.5305, "grad_norm": 418.6872863769531, "learning_rate": 9.498203752180825e-08, "logits/chosen": 3.5524630546569824, "logits/rejected": 3.541347026824951, "logps/chosen": -118.20745086669922, "logps/rejected": -107.16825866699219, "loss": 4.2386, "nll_loss": 2.5697271823883057, "rewards/accuracies": 0.0, "rewards/chosen": -3.867022752761841, "rewards/margins": -1.004244089126587, "rewards/rejected": -2.862778663635254, "step": 3183 }, { "epoch": 0.5306666666666666, "grad_norm": 49.02680969238281, "learning_rate": 9.492812700284185e-08, "logits/chosen": 2.108818292617798, "logits/rejected": 2.185460090637207, "logps/chosen": -46.32525634765625, "logps/rejected": -142.76304626464844, "loss": 0.9646, "nll_loss": 0.7594302892684937, "rewards/accuracies": 1.0, "rewards/chosen": 4.925416469573975, "rewards/margins": 5.3608598709106445, "rewards/rejected": -0.4354431629180908, "step": 3184 }, { "epoch": 0.5308333333333334, "grad_norm": 141.08938598632812, "learning_rate": 9.48742179616975e-08, "logits/chosen": 3.006624937057495, "logits/rejected": 2.922355890274048, "logps/chosen": -36.80059814453125, "logps/rejected": -97.1725845336914, "loss": 2.5355, "nll_loss": 2.4533729553222656, "rewards/accuracies": 1.0, "rewards/chosen": 0.14890213310718536, "rewards/margins": 3.9717462062835693, "rewards/rejected": -3.8228440284729004, "step": 3185 }, { "epoch": 0.531, "grad_norm": 21.598323822021484, "learning_rate": 9.482031041408295e-08, "logits/chosen": 2.3314597606658936, "logits/rejected": 2.6824491024017334, "logps/chosen": -112.67242431640625, "logps/rejected": -268.6400146484375, "loss": 0.9384, "nll_loss": 0.9235443472862244, "rewards/accuracies": 1.0, "rewards/chosen": 1.5918169021606445, "rewards/margins": 9.821257591247559, "rewards/rejected": -8.229440689086914, "step": 3186 }, { "epoch": 0.5311666666666667, "grad_norm": 57.0616455078125, "learning_rate": 9.47664043757056e-08, "logits/chosen": 2.617962598800659, "logits/rejected": 2.698699712753296, "logps/chosen": -37.37451171875, "logps/rejected": -173.40072631835938, "loss": 1.5167, "nll_loss": 1.4949805736541748, "rewards/accuracies": 1.0, "rewards/chosen": 1.2884019613265991, "rewards/margins": 7.158012866973877, "rewards/rejected": -5.869610786437988, "step": 3187 }, { "epoch": 0.5313333333333333, "grad_norm": 19.864078521728516, "learning_rate": 9.471249986227237e-08, "logits/chosen": 0.9297587275505066, "logits/rejected": 2.0790932178497314, "logps/chosen": -60.48773956298828, "logps/rejected": -347.9666442871094, "loss": 0.6484, "nll_loss": 0.6434864401817322, "rewards/accuracies": 1.0, "rewards/chosen": 2.824958324432373, "rewards/margins": 9.825254440307617, "rewards/rejected": -7.000296115875244, "step": 3188 }, { "epoch": 0.5315, "grad_norm": 23.357975006103516, "learning_rate": 9.465859688948976e-08, "logits/chosen": 0.8217869997024536, "logits/rejected": 2.3852012157440186, "logps/chosen": -32.50845718383789, "logps/rejected": -316.10797119140625, "loss": 0.4855, "nll_loss": 0.4644065201282501, "rewards/accuracies": 1.0, "rewards/chosen": 1.3001583814620972, "rewards/margins": 7.381839275360107, "rewards/rejected": -6.081680774688721, "step": 3189 }, { "epoch": 0.5316666666666666, "grad_norm": 19.06769561767578, "learning_rate": 9.460469547306374e-08, "logits/chosen": 1.7861912250518799, "logits/rejected": 1.9520764350891113, "logps/chosen": -72.52208709716797, "logps/rejected": -232.6383056640625, "loss": 0.6763, "nll_loss": 0.6715008020401001, "rewards/accuracies": 1.0, "rewards/chosen": 2.862236738204956, "rewards/margins": 9.729549407958984, "rewards/rejected": -6.867312908172607, "step": 3190 }, { "epoch": 0.5318333333333334, "grad_norm": 46.63093566894531, "learning_rate": 9.455079562869997e-08, "logits/chosen": 1.8783670663833618, "logits/rejected": 2.1861109733581543, "logps/chosen": -36.904380798339844, "logps/rejected": -92.5701904296875, "loss": 0.9912, "nll_loss": 0.9001068472862244, "rewards/accuracies": 1.0, "rewards/chosen": 1.1736668348312378, "rewards/margins": 3.5161032676696777, "rewards/rejected": -2.3424363136291504, "step": 3191 }, { "epoch": 0.532, "grad_norm": 53.39065170288086, "learning_rate": 9.44968973721035e-08, "logits/chosen": 2.8617262840270996, "logits/rejected": 2.758399724960327, "logps/chosen": -44.470149993896484, "logps/rejected": -73.31454467773438, "loss": 0.9037, "nll_loss": 0.6444950103759766, "rewards/accuracies": 1.0, "rewards/chosen": 1.817209243774414, "rewards/margins": 2.4268887042999268, "rewards/rejected": -0.6096794009208679, "step": 3192 }, { "epoch": 0.5321666666666667, "grad_norm": 37.149658203125, "learning_rate": 9.444300071897903e-08, "logits/chosen": 1.6944059133529663, "logits/rejected": 1.9349623918533325, "logps/chosen": -49.69021987915039, "logps/rejected": -113.25035095214844, "loss": 0.8731, "nll_loss": 0.8422070741653442, "rewards/accuracies": 1.0, "rewards/chosen": 0.8918506503105164, "rewards/margins": 6.753294467926025, "rewards/rejected": -5.861443996429443, "step": 3193 }, { "epoch": 0.5323333333333333, "grad_norm": 127.11463928222656, "learning_rate": 9.438910568503076e-08, "logits/chosen": 2.829442024230957, "logits/rejected": 0.7866494655609131, "logps/chosen": -51.40510559082031, "logps/rejected": -90.59102630615234, "loss": 0.8761, "nll_loss": 0.23909349739551544, "rewards/accuracies": 1.0, "rewards/chosen": -2.2719647884368896, "rewards/margins": 0.7603747844696045, "rewards/rejected": -3.032339572906494, "step": 3194 }, { "epoch": 0.5325, "grad_norm": 196.88790893554688, "learning_rate": 9.433521228596235e-08, "logits/chosen": 2.816215753555298, "logits/rejected": 3.0219063758850098, "logps/chosen": -112.77386474609375, "logps/rejected": -321.0715026855469, "loss": 1.5333, "nll_loss": 1.1747277975082397, "rewards/accuracies": 1.0, "rewards/chosen": -0.3234237730503082, "rewards/margins": 1.3093918561935425, "rewards/rejected": -1.6328155994415283, "step": 3195 }, { "epoch": 0.5326666666666666, "grad_norm": 25.97826385498047, "learning_rate": 9.428132053747711e-08, "logits/chosen": 3.0065579414367676, "logits/rejected": 2.9569473266601562, "logps/chosen": -143.04678344726562, "logps/rejected": -99.28303527832031, "loss": 1.0383, "nll_loss": 1.0145161151885986, "rewards/accuracies": 1.0, "rewards/chosen": 1.2750030755996704, "rewards/margins": 6.528222560882568, "rewards/rejected": -5.2532196044921875, "step": 3196 }, { "epoch": 0.5328333333333334, "grad_norm": 65.14276885986328, "learning_rate": 9.422743045527777e-08, "logits/chosen": 2.703171491622925, "logits/rejected": 3.0759291648864746, "logps/chosen": -66.00934600830078, "logps/rejected": -446.31549072265625, "loss": 2.1437, "nll_loss": 2.129333734512329, "rewards/accuracies": 1.0, "rewards/chosen": 1.64244544506073, "rewards/margins": 9.069639205932617, "rewards/rejected": -7.427194118499756, "step": 3197 }, { "epoch": 0.533, "grad_norm": 26.11380386352539, "learning_rate": 9.417354205506662e-08, "logits/chosen": 1.525439977645874, "logits/rejected": 1.6512188911437988, "logps/chosen": -88.54498291015625, "logps/rejected": -82.34925079345703, "loss": 0.8563, "nll_loss": 0.8275232315063477, "rewards/accuracies": 1.0, "rewards/chosen": 1.5973961353302002, "rewards/margins": 5.467344284057617, "rewards/rejected": -3.869948148727417, "step": 3198 }, { "epoch": 0.5331666666666667, "grad_norm": 159.7918701171875, "learning_rate": 9.41196553525454e-08, "logits/chosen": 3.070913314819336, "logits/rejected": 2.999311923980713, "logps/chosen": -75.81454467773438, "logps/rejected": -11.115671157836914, "loss": 2.4473, "nll_loss": 1.052979826927185, "rewards/accuracies": 0.0, "rewards/chosen": 1.568732500076294, "rewards/margins": -0.47049999237060547, "rewards/rejected": 2.0392324924468994, "step": 3199 }, { "epoch": 0.5333333333333333, "grad_norm": 29.6291561126709, "learning_rate": 9.406577036341547e-08, "logits/chosen": 2.4409053325653076, "logits/rejected": 2.3590199947357178, "logps/chosen": -93.47370910644531, "logps/rejected": -119.11441040039062, "loss": 1.0441, "nll_loss": 1.0271836519241333, "rewards/accuracies": 1.0, "rewards/chosen": 1.6091049909591675, "rewards/margins": 7.2286834716796875, "rewards/rejected": -5.6195783615112305, "step": 3200 }, { "epoch": 0.5335, "grad_norm": 28.906627655029297, "learning_rate": 9.401188710337756e-08, "logits/chosen": 1.2431623935699463, "logits/rejected": 2.330683469772339, "logps/chosen": -34.05359649658203, "logps/rejected": -311.4421081542969, "loss": 0.4501, "nll_loss": 0.42566999793052673, "rewards/accuracies": 1.0, "rewards/chosen": 4.9210944175720215, "rewards/margins": 7.984446048736572, "rewards/rejected": -3.063351631164551, "step": 3201 }, { "epoch": 0.5336666666666666, "grad_norm": 36.904598236083984, "learning_rate": 9.395800558813201e-08, "logits/chosen": 1.5628961324691772, "logits/rejected": 2.0148210525512695, "logps/chosen": -113.51610565185547, "logps/rejected": -269.7180480957031, "loss": 1.4864, "nll_loss": 1.4742351770401, "rewards/accuracies": 1.0, "rewards/chosen": 1.7894493341445923, "rewards/margins": 10.487231254577637, "rewards/rejected": -8.697781562805176, "step": 3202 }, { "epoch": 0.5338333333333334, "grad_norm": 25.67097282409668, "learning_rate": 9.390412583337857e-08, "logits/chosen": 1.868377923965454, "logits/rejected": 2.2114880084991455, "logps/chosen": -63.836181640625, "logps/rejected": -463.521240234375, "loss": 0.9293, "nll_loss": 0.9119454622268677, "rewards/accuracies": 1.0, "rewards/chosen": 1.441693902015686, "rewards/margins": 8.696479797363281, "rewards/rejected": -7.254785537719727, "step": 3203 }, { "epoch": 0.534, "grad_norm": 22.87867546081543, "learning_rate": 9.385024785481652e-08, "logits/chosen": 1.6485621929168701, "logits/rejected": 2.618914842605591, "logps/chosen": -27.328388214111328, "logps/rejected": -283.79541015625, "loss": 0.458, "nll_loss": 0.44800642132759094, "rewards/accuracies": 1.0, "rewards/chosen": 1.9863163232803345, "rewards/margins": 11.775104522705078, "rewards/rejected": -9.788787841796875, "step": 3204 }, { "epoch": 0.5341666666666667, "grad_norm": 51.24028015136719, "learning_rate": 9.379637166814459e-08, "logits/chosen": 3.3563661575317383, "logits/rejected": 3.5571916103363037, "logps/chosen": -27.199398040771484, "logps/rejected": -137.44378662109375, "loss": 0.8426, "nll_loss": 0.8242241740226746, "rewards/accuracies": 1.0, "rewards/chosen": 4.459322929382324, "rewards/margins": 7.855846405029297, "rewards/rejected": -3.3965232372283936, "step": 3205 }, { "epoch": 0.5343333333333333, "grad_norm": 37.52412033081055, "learning_rate": 9.374249728906104e-08, "logits/chosen": 2.140695095062256, "logits/rejected": 2.360023021697998, "logps/chosen": -32.72658920288086, "logps/rejected": -212.18109130859375, "loss": 0.7664, "nll_loss": 0.7272575497627258, "rewards/accuracies": 1.0, "rewards/chosen": 0.6398587226867676, "rewards/margins": 6.296104907989502, "rewards/rejected": -5.656246185302734, "step": 3206 }, { "epoch": 0.5345, "grad_norm": 30.003496170043945, "learning_rate": 9.368862473326353e-08, "logits/chosen": 2.609243154525757, "logits/rejected": 2.516857862472534, "logps/chosen": -62.57527542114258, "logps/rejected": -96.43685913085938, "loss": 1.047, "nll_loss": 1.0258240699768066, "rewards/accuracies": 1.0, "rewards/chosen": 2.2186131477355957, "rewards/margins": 6.047595977783203, "rewards/rejected": -3.8289828300476074, "step": 3207 }, { "epoch": 0.5346666666666666, "grad_norm": 43.78373718261719, "learning_rate": 9.363475401644927e-08, "logits/chosen": 1.8634166717529297, "logits/rejected": 1.5361266136169434, "logps/chosen": -80.15396118164062, "logps/rejected": -76.74039459228516, "loss": 1.2039, "nll_loss": 1.1289291381835938, "rewards/accuracies": 1.0, "rewards/chosen": 0.5114906430244446, "rewards/margins": 3.860990047454834, "rewards/rejected": -3.349499464035034, "step": 3208 }, { "epoch": 0.5348333333333334, "grad_norm": 29.90025520324707, "learning_rate": 9.358088515431486e-08, "logits/chosen": 2.727088212966919, "logits/rejected": 2.674736738204956, "logps/chosen": -9.013517379760742, "logps/rejected": -100.5791015625, "loss": 0.3561, "nll_loss": 0.3466736972332001, "rewards/accuracies": 1.0, "rewards/chosen": 2.453052282333374, "rewards/margins": 7.7058610916137695, "rewards/rejected": -5.252809047698975, "step": 3209 }, { "epoch": 0.535, "grad_norm": 23.76708221435547, "learning_rate": 9.352701816255642e-08, "logits/chosen": 1.7901650667190552, "logits/rejected": 1.51834237575531, "logps/chosen": -97.635498046875, "logps/rejected": -68.0107421875, "loss": 0.9494, "nll_loss": 0.9298617839813232, "rewards/accuracies": 1.0, "rewards/chosen": 2.0766570568084717, "rewards/margins": 6.176119804382324, "rewards/rejected": -4.099462985992432, "step": 3210 }, { "epoch": 0.5351666666666667, "grad_norm": 32.27488708496094, "learning_rate": 9.347315305686948e-08, "logits/chosen": 0.8986825346946716, "logits/rejected": 2.5705678462982178, "logps/chosen": -71.05555725097656, "logps/rejected": -378.93475341796875, "loss": 1.1322, "nll_loss": 1.1102432012557983, "rewards/accuracies": 1.0, "rewards/chosen": 1.42582106590271, "rewards/margins": 6.426114082336426, "rewards/rejected": -5.000293254852295, "step": 3211 }, { "epoch": 0.5353333333333333, "grad_norm": 23.345230102539062, "learning_rate": 9.341928985294906e-08, "logits/chosen": 2.5912208557128906, "logits/rejected": 2.740041494369507, "logps/chosen": -63.27248764038086, "logps/rejected": -312.08709716796875, "loss": 0.6915, "nll_loss": 0.6803492903709412, "rewards/accuracies": 1.0, "rewards/chosen": 1.9733326435089111, "rewards/margins": 8.380888938903809, "rewards/rejected": -6.407556056976318, "step": 3212 }, { "epoch": 0.5355, "grad_norm": 44.65564727783203, "learning_rate": 9.336542856648955e-08, "logits/chosen": 2.137478828430176, "logits/rejected": 2.435757637023926, "logps/chosen": -21.33306312561035, "logps/rejected": -175.71340942382812, "loss": 0.7914, "nll_loss": 0.7901134490966797, "rewards/accuracies": 1.0, "rewards/chosen": 4.353290557861328, "rewards/margins": 11.79696273803711, "rewards/rejected": -7.443672180175781, "step": 3213 }, { "epoch": 0.5356666666666666, "grad_norm": 199.14593505859375, "learning_rate": 9.33115692131849e-08, "logits/chosen": 1.1401002407073975, "logits/rejected": 2.9088852405548096, "logps/chosen": -79.51934814453125, "logps/rejected": -488.3363342285156, "loss": 3.8268, "nll_loss": 3.180773973464966, "rewards/accuracies": 1.0, "rewards/chosen": -2.924042224884033, "rewards/margins": 1.133687973022461, "rewards/rejected": -4.057730197906494, "step": 3214 }, { "epoch": 0.5358333333333334, "grad_norm": 24.209407806396484, "learning_rate": 9.325771180872841e-08, "logits/chosen": 2.1309897899627686, "logits/rejected": 2.2306478023529053, "logps/chosen": -27.789209365844727, "logps/rejected": -115.36941528320312, "loss": 0.5061, "nll_loss": 0.487529993057251, "rewards/accuracies": 1.0, "rewards/chosen": 1.4598455429077148, "rewards/margins": 7.355395317077637, "rewards/rejected": -5.895549774169922, "step": 3215 }, { "epoch": 0.536, "grad_norm": 24.07874298095703, "learning_rate": 9.320385636881282e-08, "logits/chosen": 2.200528621673584, "logits/rejected": 1.6470049619674683, "logps/chosen": -123.45015716552734, "logps/rejected": -120.44268798828125, "loss": 0.9955, "nll_loss": 0.9797630906105042, "rewards/accuracies": 1.0, "rewards/chosen": 2.0255258083343506, "rewards/margins": 6.663307189941406, "rewards/rejected": -4.637781620025635, "step": 3216 }, { "epoch": 0.5361666666666667, "grad_norm": 18.103775024414062, "learning_rate": 9.315000290913034e-08, "logits/chosen": 2.51132869720459, "logits/rejected": 2.4639508724212646, "logps/chosen": -9.367741584777832, "logps/rejected": -110.60733795166016, "loss": 0.2119, "nll_loss": 0.2081720381975174, "rewards/accuracies": 1.0, "rewards/chosen": 3.1409554481506348, "rewards/margins": 10.103050231933594, "rewards/rejected": -6.962094306945801, "step": 3217 }, { "epoch": 0.5363333333333333, "grad_norm": 197.0936737060547, "learning_rate": 9.309615144537255e-08, "logits/chosen": 1.7060799598693848, "logits/rejected": 1.1755975484848022, "logps/chosen": -158.30850219726562, "logps/rejected": -51.014991760253906, "loss": 1.9717, "nll_loss": 0.6159864068031311, "rewards/accuracies": 0.0, "rewards/chosen": 1.9127564430236816, "rewards/margins": -0.3197157382965088, "rewards/rejected": 2.2324721813201904, "step": 3218 }, { "epoch": 0.5365, "grad_norm": 43.340789794921875, "learning_rate": 9.304230199323049e-08, "logits/chosen": 2.7594962120056152, "logits/rejected": 2.8728790283203125, "logps/chosen": -27.969932556152344, "logps/rejected": -73.25499725341797, "loss": 0.7272, "nll_loss": 0.6821934580802917, "rewards/accuracies": 1.0, "rewards/chosen": 0.4795166254043579, "rewards/margins": 6.124327182769775, "rewards/rejected": -5.644810676574707, "step": 3219 }, { "epoch": 0.5366666666666666, "grad_norm": 57.75129318237305, "learning_rate": 9.298845456839458e-08, "logits/chosen": 3.295070171356201, "logits/rejected": 3.71089243888855, "logps/chosen": -49.684085845947266, "logps/rejected": -100.92637634277344, "loss": 1.0072, "nll_loss": 0.955463171005249, "rewards/accuracies": 1.0, "rewards/chosen": 0.6663357615470886, "rewards/margins": 4.6850996017456055, "rewards/rejected": -4.018764019012451, "step": 3220 }, { "epoch": 0.5368333333333334, "grad_norm": 72.29126739501953, "learning_rate": 9.29346091865547e-08, "logits/chosen": 1.961814045906067, "logits/rejected": 2.372244119644165, "logps/chosen": -66.27046966552734, "logps/rejected": -369.5928955078125, "loss": 2.3127, "nll_loss": 2.2851884365081787, "rewards/accuracies": 1.0, "rewards/chosen": 0.9612579345703125, "rewards/margins": 7.675689697265625, "rewards/rejected": -6.7144317626953125, "step": 3221 }, { "epoch": 0.537, "grad_norm": 42.27145767211914, "learning_rate": 9.288076586340005e-08, "logits/chosen": 1.810998558998108, "logits/rejected": 1.5130650997161865, "logps/chosen": -33.7309684753418, "logps/rejected": -56.571815490722656, "loss": 0.5938, "nll_loss": 0.4269742965698242, "rewards/accuracies": 1.0, "rewards/chosen": 0.9424694776535034, "rewards/margins": 2.603407382965088, "rewards/rejected": -1.660938024520874, "step": 3222 }, { "epoch": 0.5371666666666667, "grad_norm": 29.777742385864258, "learning_rate": 9.282692461461933e-08, "logits/chosen": 2.5966715812683105, "logits/rejected": 2.6750309467315674, "logps/chosen": -13.63679027557373, "logps/rejected": -157.45347595214844, "loss": 0.4115, "nll_loss": 0.37879979610443115, "rewards/accuracies": 1.0, "rewards/chosen": 0.7515453100204468, "rewards/margins": 8.299613952636719, "rewards/rejected": -7.548069000244141, "step": 3223 }, { "epoch": 0.5373333333333333, "grad_norm": 27.220386505126953, "learning_rate": 9.277308545590055e-08, "logits/chosen": 0.5899093151092529, "logits/rejected": 1.7405543327331543, "logps/chosen": -17.54928970336914, "logps/rejected": -326.3367919921875, "loss": 0.4266, "nll_loss": 0.40812310576438904, "rewards/accuracies": 1.0, "rewards/chosen": 1.342685341835022, "rewards/margins": 11.30953311920166, "rewards/rejected": -9.96684741973877, "step": 3224 }, { "epoch": 0.5375, "grad_norm": 70.02610778808594, "learning_rate": 9.271924840293119e-08, "logits/chosen": 3.0242674350738525, "logits/rejected": 3.231839179992676, "logps/chosen": -10.034249305725098, "logps/rejected": -203.987060546875, "loss": 0.677, "nll_loss": 0.6689499616622925, "rewards/accuracies": 1.0, "rewards/chosen": 2.248973846435547, "rewards/margins": 9.523736953735352, "rewards/rejected": -7.274763584136963, "step": 3225 }, { "epoch": 0.5376666666666666, "grad_norm": 27.67844581604004, "learning_rate": 9.266541347139803e-08, "logits/chosen": 2.4004597663879395, "logits/rejected": 2.471701145172119, "logps/chosen": -54.97547149658203, "logps/rejected": -152.99949645996094, "loss": 0.8929, "nll_loss": 0.872626543045044, "rewards/accuracies": 1.0, "rewards/chosen": 1.2641853094100952, "rewards/margins": 8.921292304992676, "rewards/rejected": -7.657107353210449, "step": 3226 }, { "epoch": 0.5378333333333334, "grad_norm": 127.28263854980469, "learning_rate": 9.261158067698732e-08, "logits/chosen": 1.9501452445983887, "logits/rejected": 2.4243338108062744, "logps/chosen": -60.872886657714844, "logps/rejected": -94.54710388183594, "loss": 1.3455, "nll_loss": 0.7161515951156616, "rewards/accuracies": 1.0, "rewards/chosen": 1.1350975036621094, "rewards/margins": 0.8068863153457642, "rewards/rejected": 0.3282112181186676, "step": 3227 }, { "epoch": 0.538, "grad_norm": 18.939598083496094, "learning_rate": 9.255775003538461e-08, "logits/chosen": 1.2698942422866821, "logits/rejected": 1.3974295854568481, "logps/chosen": -293.8945617675781, "logps/rejected": -311.3561706542969, "loss": 0.8903, "nll_loss": 0.8720905780792236, "rewards/accuracies": 1.0, "rewards/chosen": 1.3524140119552612, "rewards/margins": 15.889273643493652, "rewards/rejected": -14.536859512329102, "step": 3228 }, { "epoch": 0.5381666666666667, "grad_norm": 151.2072296142578, "learning_rate": 9.250392156227488e-08, "logits/chosen": 2.6677472591400146, "logits/rejected": 2.6652159690856934, "logps/chosen": -134.47018432617188, "logps/rejected": -177.5294189453125, "loss": 2.3772, "nll_loss": 1.5820024013519287, "rewards/accuracies": 1.0, "rewards/chosen": -4.364410400390625, "rewards/margins": 1.8768820762634277, "rewards/rejected": -6.241292476654053, "step": 3229 }, { "epoch": 0.5383333333333333, "grad_norm": 21.44485092163086, "learning_rate": 9.245009527334243e-08, "logits/chosen": 1.9008244276046753, "logits/rejected": 0.7561548352241516, "logps/chosen": -193.9698028564453, "logps/rejected": -80.56073760986328, "loss": 0.8864, "nll_loss": 0.8659367561340332, "rewards/accuracies": 1.0, "rewards/chosen": 1.3137435913085938, "rewards/margins": 7.504823207855225, "rewards/rejected": -6.191079616546631, "step": 3230 }, { "epoch": 0.5385, "grad_norm": 37.22792053222656, "learning_rate": 9.239627118427095e-08, "logits/chosen": 2.797173261642456, "logits/rejected": 2.8378617763519287, "logps/chosen": -30.620758056640625, "logps/rejected": -89.6729965209961, "loss": 0.6764, "nll_loss": 0.6379325985908508, "rewards/accuracies": 1.0, "rewards/chosen": 0.600262463092804, "rewards/margins": 6.976812839508057, "rewards/rejected": -6.376550197601318, "step": 3231 }, { "epoch": 0.5386666666666666, "grad_norm": 149.00660705566406, "learning_rate": 9.234244931074351e-08, "logits/chosen": 2.558480739593506, "logits/rejected": 2.527977228164673, "logps/chosen": -184.00381469726562, "logps/rejected": -88.66329956054688, "loss": 0.997, "nll_loss": 0.6433699727058411, "rewards/accuracies": 1.0, "rewards/chosen": -1.3596223592758179, "rewards/margins": 1.5863534212112427, "rewards/rejected": -2.9459757804870605, "step": 3232 }, { "epoch": 0.5388333333333334, "grad_norm": 89.14482879638672, "learning_rate": 9.228862966844249e-08, "logits/chosen": 1.7984275817871094, "logits/rejected": 2.351527690887451, "logps/chosen": -84.52806091308594, "logps/rejected": -158.523193359375, "loss": 1.0148, "nll_loss": 0.9392005205154419, "rewards/accuracies": 1.0, "rewards/chosen": 1.4136956930160522, "rewards/margins": 3.8686165809631348, "rewards/rejected": -2.454921007156372, "step": 3233 }, { "epoch": 0.539, "grad_norm": 68.49778747558594, "learning_rate": 9.223481227304968e-08, "logits/chosen": 2.6390609741210938, "logits/rejected": 2.7655773162841797, "logps/chosen": -8.471370697021484, "logps/rejected": -207.04779052734375, "loss": 0.6651, "nll_loss": 0.6516438722610474, "rewards/accuracies": 1.0, "rewards/chosen": 1.779551386833191, "rewards/margins": 7.926042079925537, "rewards/rejected": -6.146490573883057, "step": 3234 }, { "epoch": 0.5391666666666667, "grad_norm": 18.313634872436523, "learning_rate": 9.218099714024612e-08, "logits/chosen": 2.1385657787323, "logits/rejected": 2.1713991165161133, "logps/chosen": -189.3625946044922, "logps/rejected": -215.56935119628906, "loss": 0.9278, "nll_loss": 0.9237197637557983, "rewards/accuracies": 1.0, "rewards/chosen": 2.8844592571258545, "rewards/margins": 12.779428482055664, "rewards/rejected": -9.89496898651123, "step": 3235 }, { "epoch": 0.5393333333333333, "grad_norm": 33.21815490722656, "learning_rate": 9.212718428571231e-08, "logits/chosen": 2.9233555793762207, "logits/rejected": 2.7167015075683594, "logps/chosen": -71.91674041748047, "logps/rejected": -107.11975860595703, "loss": 0.8877, "nll_loss": 0.8561517596244812, "rewards/accuracies": 1.0, "rewards/chosen": 1.5886093378067017, "rewards/margins": 5.287895202636719, "rewards/rejected": -3.6992859840393066, "step": 3236 }, { "epoch": 0.5395, "grad_norm": 24.105436325073242, "learning_rate": 9.207337372512796e-08, "logits/chosen": 2.4696624279022217, "logits/rejected": 2.616358995437622, "logps/chosen": -76.02161407470703, "logps/rejected": -229.53684997558594, "loss": 0.7627, "nll_loss": 0.7453098893165588, "rewards/accuracies": 1.0, "rewards/chosen": 1.399782657623291, "rewards/margins": 10.78529167175293, "rewards/rejected": -9.38550853729248, "step": 3237 }, { "epoch": 0.5396666666666666, "grad_norm": 23.845569610595703, "learning_rate": 9.201956547417222e-08, "logits/chosen": 1.4985952377319336, "logits/rejected": 1.8221518993377686, "logps/chosen": -41.58921432495117, "logps/rejected": -300.8126220703125, "loss": 0.5738, "nll_loss": 0.569715142250061, "rewards/accuracies": 1.0, "rewards/chosen": 2.897043228149414, "rewards/margins": 12.225403785705566, "rewards/rejected": -9.328360557556152, "step": 3238 }, { "epoch": 0.5398333333333334, "grad_norm": 699.1716918945312, "learning_rate": 9.19657595485235e-08, "logits/chosen": 1.7417056560516357, "logits/rejected": 1.266243577003479, "logps/chosen": -302.0664978027344, "logps/rejected": -167.88934326171875, "loss": 4.4285, "nll_loss": 0.8730246424674988, "rewards/accuracies": 0.0, "rewards/chosen": -8.521804809570312, "rewards/margins": -2.4826440811157227, "rewards/rejected": -6.03916072845459, "step": 3239 }, { "epoch": 0.54, "grad_norm": 33.54795837402344, "learning_rate": 9.191195596385959e-08, "logits/chosen": 1.1179437637329102, "logits/rejected": 1.5679796934127808, "logps/chosen": -63.85547637939453, "logps/rejected": -161.68447875976562, "loss": 1.0745, "nll_loss": 1.0468109846115112, "rewards/accuracies": 1.0, "rewards/chosen": 1.3218345642089844, "rewards/margins": 5.7389092445373535, "rewards/rejected": -4.417074680328369, "step": 3240 }, { "epoch": 0.5401666666666667, "grad_norm": 71.80615997314453, "learning_rate": 9.18581547358575e-08, "logits/chosen": 2.8083457946777344, "logits/rejected": 2.9144842624664307, "logps/chosen": -47.54541778564453, "logps/rejected": -269.54888916015625, "loss": 1.2274, "nll_loss": 1.0565649271011353, "rewards/accuracies": 1.0, "rewards/chosen": 5.376429080963135, "rewards/margins": 6.097949981689453, "rewards/rejected": -0.7215210199356079, "step": 3241 }, { "epoch": 0.5403333333333333, "grad_norm": 25.61945152282715, "learning_rate": 9.180435588019369e-08, "logits/chosen": 1.8614394664764404, "logits/rejected": 2.513950824737549, "logps/chosen": -97.80979919433594, "logps/rejected": -170.5703582763672, "loss": 1.014, "nll_loss": 0.9879777431488037, "rewards/accuracies": 1.0, "rewards/chosen": 1.5857468843460083, "rewards/margins": 5.690881252288818, "rewards/rejected": -4.1051344871521, "step": 3242 }, { "epoch": 0.5405, "grad_norm": 17.302509307861328, "learning_rate": 9.175055941254378e-08, "logits/chosen": 2.0381650924682617, "logits/rejected": 1.8525042533874512, "logps/chosen": -135.17288208007812, "logps/rejected": -235.52056884765625, "loss": 0.6784, "nll_loss": 0.6691727042198181, "rewards/accuracies": 1.0, "rewards/chosen": 2.0477356910705566, "rewards/margins": 12.394292831420898, "rewards/rejected": -10.3465576171875, "step": 3243 }, { "epoch": 0.5406666666666666, "grad_norm": 14.447929382324219, "learning_rate": 9.169676534858284e-08, "logits/chosen": 2.512824535369873, "logits/rejected": 2.5059337615966797, "logps/chosen": -168.708740234375, "logps/rejected": -250.35003662109375, "loss": 0.7739, "nll_loss": 0.7633880376815796, "rewards/accuracies": 1.0, "rewards/chosen": 2.5238022804260254, "rewards/margins": 7.367750644683838, "rewards/rejected": -4.8439483642578125, "step": 3244 }, { "epoch": 0.5408333333333334, "grad_norm": 73.75167846679688, "learning_rate": 9.16429737039851e-08, "logits/chosen": 2.3625993728637695, "logits/rejected": 2.4600415229797363, "logps/chosen": -54.821529388427734, "logps/rejected": -160.14553833007812, "loss": 1.9719, "nll_loss": 1.957911729812622, "rewards/accuracies": 1.0, "rewards/chosen": 2.0351719856262207, "rewards/margins": 6.976646900177002, "rewards/rejected": -4.941474914550781, "step": 3245 }, { "epoch": 0.541, "grad_norm": 35.873291015625, "learning_rate": 9.158918449442423e-08, "logits/chosen": 3.2559096813201904, "logits/rejected": 3.513150453567505, "logps/chosen": -61.68680953979492, "logps/rejected": -369.3588562011719, "loss": 1.155, "nll_loss": 1.1423484086990356, "rewards/accuracies": 1.0, "rewards/chosen": 2.2305691242218018, "rewards/margins": 7.069500923156738, "rewards/rejected": -4.838931560516357, "step": 3246 }, { "epoch": 0.5411666666666667, "grad_norm": 22.530712127685547, "learning_rate": 9.153539773557305e-08, "logits/chosen": 2.3478333950042725, "logits/rejected": 2.595555305480957, "logps/chosen": -74.98098754882812, "logps/rejected": -185.4757843017578, "loss": 0.7837, "nll_loss": 0.7729998826980591, "rewards/accuracies": 1.0, "rewards/chosen": 2.0849151611328125, "rewards/margins": 7.982150077819824, "rewards/rejected": -5.897234916687012, "step": 3247 }, { "epoch": 0.5413333333333333, "grad_norm": 49.57032012939453, "learning_rate": 9.148161344310376e-08, "logits/chosen": 2.2489662170410156, "logits/rejected": 2.449460744857788, "logps/chosen": -18.567066192626953, "logps/rejected": -112.87073516845703, "loss": 0.8099, "nll_loss": 0.8072637319564819, "rewards/accuracies": 1.0, "rewards/chosen": 3.7630200386047363, "rewards/margins": 10.209972381591797, "rewards/rejected": -6.446952819824219, "step": 3248 }, { "epoch": 0.5415, "grad_norm": 20.63463020324707, "learning_rate": 9.14278316326878e-08, "logits/chosen": 1.5212258100509644, "logits/rejected": 1.5751862525939941, "logps/chosen": -65.57658386230469, "logps/rejected": -99.24722290039062, "loss": 0.7043, "nll_loss": 0.6976232528686523, "rewards/accuracies": 1.0, "rewards/chosen": 2.727525472640991, "rewards/margins": 8.463109016418457, "rewards/rejected": -5.735583305358887, "step": 3249 }, { "epoch": 0.5416666666666666, "grad_norm": 26.223590850830078, "learning_rate": 9.137405231999592e-08, "logits/chosen": 1.1771572828292847, "logits/rejected": 2.13550066947937, "logps/chosen": -57.10048294067383, "logps/rejected": -190.04415893554688, "loss": 0.8116, "nll_loss": 0.7930622696876526, "rewards/accuracies": 1.0, "rewards/chosen": 1.3682377338409424, "rewards/margins": 8.394196510314941, "rewards/rejected": -7.025959014892578, "step": 3250 }, { "epoch": 0.5418333333333333, "grad_norm": 81.98936462402344, "learning_rate": 9.132027552069809e-08, "logits/chosen": 2.051461696624756, "logits/rejected": 1.9460558891296387, "logps/chosen": -40.63658905029297, "logps/rejected": -68.33180236816406, "loss": 1.2535, "nll_loss": 0.6661735773086548, "rewards/accuracies": 1.0, "rewards/chosen": 2.2276806831359863, "rewards/margins": 1.3781979084014893, "rewards/rejected": 0.8494827151298523, "step": 3251 }, { "epoch": 0.542, "grad_norm": 25.923051834106445, "learning_rate": 9.12665012504636e-08, "logits/chosen": 1.9744422435760498, "logits/rejected": 1.9734755754470825, "logps/chosen": -87.4215316772461, "logps/rejected": -113.87254333496094, "loss": 0.7905, "nll_loss": 0.7601873278617859, "rewards/accuracies": 1.0, "rewards/chosen": 1.7261368036270142, "rewards/margins": 5.3588151931762695, "rewards/rejected": -3.632678508758545, "step": 3252 }, { "epoch": 0.5421666666666667, "grad_norm": 37.68293380737305, "learning_rate": 9.121272952496096e-08, "logits/chosen": 2.471451759338379, "logits/rejected": 2.4177489280700684, "logps/chosen": -109.35250854492188, "logps/rejected": -92.79537963867188, "loss": 1.2136, "nll_loss": 1.139088749885559, "rewards/accuracies": 1.0, "rewards/chosen": 1.1688568592071533, "rewards/margins": 3.8121140003204346, "rewards/rejected": -2.6432571411132812, "step": 3253 }, { "epoch": 0.5423333333333333, "grad_norm": 31.894052505493164, "learning_rate": 9.115896035985798e-08, "logits/chosen": 2.5073957443237305, "logits/rejected": 2.16581392288208, "logps/chosen": -53.75725173950195, "logps/rejected": -78.9991455078125, "loss": 0.7887, "nll_loss": 0.7571443319320679, "rewards/accuracies": 1.0, "rewards/chosen": 0.7926959991455078, "rewards/margins": 7.790796279907227, "rewards/rejected": -6.998100280761719, "step": 3254 }, { "epoch": 0.5425, "grad_norm": 58.0030632019043, "learning_rate": 9.110519377082172e-08, "logits/chosen": 2.552719831466675, "logits/rejected": 3.0758190155029297, "logps/chosen": -26.108379364013672, "logps/rejected": -174.3558807373047, "loss": 0.942, "nll_loss": 0.9324421882629395, "rewards/accuracies": 1.0, "rewards/chosen": 2.528385877609253, "rewards/margins": 7.599470138549805, "rewards/rejected": -5.071084499359131, "step": 3255 }, { "epoch": 0.5426666666666666, "grad_norm": 34.75249481201172, "learning_rate": 9.105142977351842e-08, "logits/chosen": 3.3900792598724365, "logits/rejected": 3.5120725631713867, "logps/chosen": -17.00606918334961, "logps/rejected": -102.91413879394531, "loss": 0.5188, "nll_loss": 0.43605300784111023, "rewards/accuracies": 1.0, "rewards/chosen": 1.2744436264038086, "rewards/margins": 3.6915738582611084, "rewards/rejected": -2.4171302318573, "step": 3256 }, { "epoch": 0.5428333333333333, "grad_norm": 50.24502944946289, "learning_rate": 9.099766838361368e-08, "logits/chosen": 0.9761646389961243, "logits/rejected": 1.9493622779846191, "logps/chosen": -45.89292526245117, "logps/rejected": -257.08123779296875, "loss": 1.0943, "nll_loss": 1.0926885604858398, "rewards/accuracies": 1.0, "rewards/chosen": 4.26021146774292, "rewards/margins": 11.090985298156738, "rewards/rejected": -6.830773830413818, "step": 3257 }, { "epoch": 0.543, "grad_norm": 28.1967830657959, "learning_rate": 9.094390961677221e-08, "logits/chosen": 3.3436882495880127, "logits/rejected": 3.831005811691284, "logps/chosen": -29.918563842773438, "logps/rejected": -93.72354888916016, "loss": 0.5691, "nll_loss": 0.5248870253562927, "rewards/accuracies": 1.0, "rewards/chosen": 1.0393307209014893, "rewards/margins": 4.746023178100586, "rewards/rejected": -3.7066922187805176, "step": 3258 }, { "epoch": 0.5431666666666667, "grad_norm": 20.965410232543945, "learning_rate": 9.08901534886581e-08, "logits/chosen": 2.4816298484802246, "logits/rejected": 2.638617992401123, "logps/chosen": -45.8330078125, "logps/rejected": -227.51828002929688, "loss": 0.5866, "nll_loss": 0.5729124546051025, "rewards/accuracies": 1.0, "rewards/chosen": 1.6814515590667725, "rewards/margins": 8.883797645568848, "rewards/rejected": -7.202345848083496, "step": 3259 }, { "epoch": 0.5433333333333333, "grad_norm": 28.396451950073242, "learning_rate": 9.083640001493453e-08, "logits/chosen": 2.371464967727661, "logits/rejected": 2.492987871170044, "logps/chosen": -63.24945068359375, "logps/rejected": -222.66839599609375, "loss": 0.7805, "nll_loss": 0.7529696226119995, "rewards/accuracies": 1.0, "rewards/chosen": 0.9174209833145142, "rewards/margins": 8.990323066711426, "rewards/rejected": -8.072901725769043, "step": 3260 }, { "epoch": 0.5435, "grad_norm": 41.88639450073242, "learning_rate": 9.078264921126403e-08, "logits/chosen": 2.7630693912506104, "logits/rejected": 2.5338668823242188, "logps/chosen": -56.51739501953125, "logps/rejected": -115.6845474243164, "loss": 1.1726, "nll_loss": 1.1303478479385376, "rewards/accuracies": 1.0, "rewards/chosen": 1.5014420747756958, "rewards/margins": 4.769327640533447, "rewards/rejected": -3.267885684967041, "step": 3261 }, { "epoch": 0.5436666666666666, "grad_norm": 43.80830383300781, "learning_rate": 9.072890109330823e-08, "logits/chosen": 1.7762089967727661, "logits/rejected": 1.6215230226516724, "logps/chosen": -118.05524444580078, "logps/rejected": -114.58488464355469, "loss": 1.3013, "nll_loss": 1.1924773454666138, "rewards/accuracies": 1.0, "rewards/chosen": 0.38713765144348145, "rewards/margins": 3.1646816730499268, "rewards/rejected": -2.7775440216064453, "step": 3262 }, { "epoch": 0.5438333333333333, "grad_norm": 26.45213508605957, "learning_rate": 9.06751556767281e-08, "logits/chosen": 2.126558303833008, "logits/rejected": 1.851658821105957, "logps/chosen": -35.435279846191406, "logps/rejected": -146.55252075195312, "loss": 0.6119, "nll_loss": 0.6005980372428894, "rewards/accuracies": 1.0, "rewards/chosen": 1.985243320465088, "rewards/margins": 8.072357177734375, "rewards/rejected": -6.087114334106445, "step": 3263 }, { "epoch": 0.544, "grad_norm": 90.3404312133789, "learning_rate": 9.06214129771837e-08, "logits/chosen": 3.5437824726104736, "logits/rejected": 3.570857048034668, "logps/chosen": -91.6600341796875, "logps/rejected": -74.33776092529297, "loss": 2.21, "nll_loss": 2.0368900299072266, "rewards/accuracies": 1.0, "rewards/chosen": -0.6102951169013977, "rewards/margins": 2.7128746509552, "rewards/rejected": -3.323169708251953, "step": 3264 }, { "epoch": 0.5441666666666667, "grad_norm": 34.77303695678711, "learning_rate": 9.056767301033445e-08, "logits/chosen": 0.886809229850769, "logits/rejected": 2.8848683834075928, "logps/chosen": -54.55443572998047, "logps/rejected": -123.84109497070312, "loss": 0.9792, "nll_loss": 0.957095205783844, "rewards/accuracies": 1.0, "rewards/chosen": 1.1496223211288452, "rewards/margins": 9.541611671447754, "rewards/rejected": -8.391989707946777, "step": 3265 }, { "epoch": 0.5443333333333333, "grad_norm": 31.51568031311035, "learning_rate": 9.051393579183879e-08, "logits/chosen": 1.042512059211731, "logits/rejected": 2.53596830368042, "logps/chosen": -35.68437957763672, "logps/rejected": -222.04010009765625, "loss": 0.7629, "nll_loss": 0.7282528281211853, "rewards/accuracies": 1.0, "rewards/chosen": 0.65853351354599, "rewards/margins": 11.617341995239258, "rewards/rejected": -10.958808898925781, "step": 3266 }, { "epoch": 0.5445, "grad_norm": 48.58934783935547, "learning_rate": 9.046020133735454e-08, "logits/chosen": 2.2933664321899414, "logits/rejected": 2.6867728233337402, "logps/chosen": -85.70799255371094, "logps/rejected": -400.4042053222656, "loss": 1.0917, "nll_loss": 0.9966045022010803, "rewards/accuracies": 1.0, "rewards/chosen": 1.8390207290649414, "rewards/margins": 3.788376808166504, "rewards/rejected": -1.9493560791015625, "step": 3267 }, { "epoch": 0.5446666666666666, "grad_norm": 64.84821319580078, "learning_rate": 9.040646966253856e-08, "logits/chosen": 1.884739637374878, "logits/rejected": 1.5923444032669067, "logps/chosen": -79.40508270263672, "logps/rejected": -46.69334411621094, "loss": 1.1439, "nll_loss": 0.9683545827865601, "rewards/accuracies": 1.0, "rewards/chosen": -0.49697574973106384, "rewards/margins": 2.5886707305908203, "rewards/rejected": -3.085646390914917, "step": 3268 }, { "epoch": 0.5448333333333333, "grad_norm": 41.979827880859375, "learning_rate": 9.035274078304702e-08, "logits/chosen": 2.6036972999572754, "logits/rejected": 3.0143885612487793, "logps/chosen": -18.603466033935547, "logps/rejected": -106.94151306152344, "loss": 0.6989, "nll_loss": 0.6890171766281128, "rewards/accuracies": 1.0, "rewards/chosen": 2.0044286251068115, "rewards/margins": 9.612743377685547, "rewards/rejected": -7.608314514160156, "step": 3269 }, { "epoch": 0.545, "grad_norm": 34.36023712158203, "learning_rate": 9.029901471453519e-08, "logits/chosen": 3.414673328399658, "logits/rejected": 3.474543333053589, "logps/chosen": -18.830801010131836, "logps/rejected": -110.499267578125, "loss": 0.5491, "nll_loss": 0.5230777263641357, "rewards/accuracies": 1.0, "rewards/chosen": 1.5519862174987793, "rewards/margins": 5.6995768547058105, "rewards/rejected": -4.147590637207031, "step": 3270 }, { "epoch": 0.5451666666666667, "grad_norm": 37.997615814208984, "learning_rate": 9.02452914726576e-08, "logits/chosen": 2.3887693881988525, "logits/rejected": 2.384216070175171, "logps/chosen": -32.521339416503906, "logps/rejected": -61.72157287597656, "loss": 0.7593, "nll_loss": 0.7069855332374573, "rewards/accuracies": 1.0, "rewards/chosen": 0.9513851404190063, "rewards/margins": 4.420401096343994, "rewards/rejected": -3.4690160751342773, "step": 3271 }, { "epoch": 0.5453333333333333, "grad_norm": 33.62550354003906, "learning_rate": 9.019157107306786e-08, "logits/chosen": 1.5961517095565796, "logits/rejected": 2.241833448410034, "logps/chosen": -9.608973503112793, "logps/rejected": -206.68365478515625, "loss": 0.3635, "nll_loss": 0.35588791966438293, "rewards/accuracies": 1.0, "rewards/chosen": 2.2912099361419678, "rewards/margins": 9.605746269226074, "rewards/rejected": -7.314536094665527, "step": 3272 }, { "epoch": 0.5455, "grad_norm": 33.25421905517578, "learning_rate": 9.013785353141885e-08, "logits/chosen": 1.9306484460830688, "logits/rejected": 2.353123903274536, "logps/chosen": -64.18301391601562, "logps/rejected": -462.2496643066406, "loss": 0.9845, "nll_loss": 0.9579554200172424, "rewards/accuracies": 1.0, "rewards/chosen": 0.9468918442726135, "rewards/margins": 9.744624137878418, "rewards/rejected": -8.79773235321045, "step": 3273 }, { "epoch": 0.5456666666666666, "grad_norm": 788.3383178710938, "learning_rate": 9.008413886336255e-08, "logits/chosen": 2.5194826126098633, "logits/rejected": 0.9119521975517273, "logps/chosen": -229.912353515625, "logps/rejected": -49.232093811035156, "loss": 6.6418, "nll_loss": 0.7928013801574707, "rewards/accuracies": 0.0, "rewards/chosen": -8.547845840454102, "rewards/margins": -5.39487361907959, "rewards/rejected": -3.1529722213745117, "step": 3274 }, { "epoch": 0.5458333333333333, "grad_norm": 50.65000534057617, "learning_rate": 9.003042708455011e-08, "logits/chosen": 2.5368199348449707, "logits/rejected": 1.8670575618743896, "logps/chosen": -59.532981872558594, "logps/rejected": -54.896217346191406, "loss": 1.3874, "nll_loss": 1.3530222177505493, "rewards/accuracies": 1.0, "rewards/chosen": 1.1169884204864502, "rewards/margins": 5.327305793762207, "rewards/rejected": -4.210317134857178, "step": 3275 }, { "epoch": 0.546, "grad_norm": 38.049198150634766, "learning_rate": 8.99767182106319e-08, "logits/chosen": 1.9859753847122192, "logits/rejected": 2.5382776260375977, "logps/chosen": -12.806070327758789, "logps/rejected": -115.32284545898438, "loss": 0.5011, "nll_loss": 0.474298894405365, "rewards/accuracies": 1.0, "rewards/chosen": 1.0906667709350586, "rewards/margins": 6.480378150939941, "rewards/rejected": -5.389711380004883, "step": 3276 }, { "epoch": 0.5461666666666667, "grad_norm": 22.1749267578125, "learning_rate": 8.992301225725733e-08, "logits/chosen": 3.6597819328308105, "logits/rejected": 3.787829875946045, "logps/chosen": -42.03230667114258, "logps/rejected": -190.98472595214844, "loss": 0.5735, "nll_loss": 0.5604308843612671, "rewards/accuracies": 1.0, "rewards/chosen": 1.6978226900100708, "rewards/margins": 10.08125114440918, "rewards/rejected": -8.383428573608398, "step": 3277 }, { "epoch": 0.5463333333333333, "grad_norm": 53.29898452758789, "learning_rate": 8.986930924007511e-08, "logits/chosen": 1.343894362449646, "logits/rejected": 2.799997568130493, "logps/chosen": -39.838111877441406, "logps/rejected": -83.54927062988281, "loss": 1.3782, "nll_loss": 1.3737281560897827, "rewards/accuracies": 1.0, "rewards/chosen": 3.731574535369873, "rewards/margins": 9.070328712463379, "rewards/rejected": -5.338754177093506, "step": 3278 }, { "epoch": 0.5465, "grad_norm": 24.081893920898438, "learning_rate": 8.981560917473292e-08, "logits/chosen": 0.5536016225814819, "logits/rejected": 2.0983529090881348, "logps/chosen": -43.851112365722656, "logps/rejected": -418.33587646484375, "loss": 0.6207, "nll_loss": 0.5925825238227844, "rewards/accuracies": 1.0, "rewards/chosen": 0.885114312171936, "rewards/margins": 9.325592994689941, "rewards/rejected": -8.440478324890137, "step": 3279 }, { "epoch": 0.5466666666666666, "grad_norm": 23.49154281616211, "learning_rate": 8.976191207687775e-08, "logits/chosen": 2.879347085952759, "logits/rejected": 2.7871718406677246, "logps/chosen": -47.606021881103516, "logps/rejected": -98.90383911132812, "loss": 0.6429, "nll_loss": 0.6263949275016785, "rewards/accuracies": 1.0, "rewards/chosen": 2.2284328937530518, "rewards/margins": 6.485659599304199, "rewards/rejected": -4.257226467132568, "step": 3280 }, { "epoch": 0.5468333333333333, "grad_norm": 28.7922306060791, "learning_rate": 8.970821796215557e-08, "logits/chosen": 2.9327943325042725, "logits/rejected": 2.792661666870117, "logps/chosen": -135.23480224609375, "logps/rejected": -82.38996124267578, "loss": 1.0139, "nll_loss": 0.9659627676010132, "rewards/accuracies": 1.0, "rewards/chosen": 1.531330943107605, "rewards/margins": 4.576292991638184, "rewards/rejected": -3.044961929321289, "step": 3281 }, { "epoch": 0.547, "grad_norm": 32.79261779785156, "learning_rate": 8.965452684621164e-08, "logits/chosen": 1.8914170265197754, "logits/rejected": 1.644655466079712, "logps/chosen": -70.14604187011719, "logps/rejected": -77.06291198730469, "loss": 0.8443, "nll_loss": 0.7794004082679749, "rewards/accuracies": 1.0, "rewards/chosen": 1.4134316444396973, "rewards/margins": 4.0911545753479, "rewards/rejected": -2.677722930908203, "step": 3282 }, { "epoch": 0.5471666666666667, "grad_norm": 14.120565414428711, "learning_rate": 8.960083874469018e-08, "logits/chosen": 1.7019695043563843, "logits/rejected": 1.6599358320236206, "logps/chosen": -152.81179809570312, "logps/rejected": -153.79241943359375, "loss": 0.6361, "nll_loss": 0.6262779235839844, "rewards/accuracies": 1.0, "rewards/chosen": 2.1721208095550537, "rewards/margins": 8.098090171813965, "rewards/rejected": -5.92596960067749, "step": 3283 }, { "epoch": 0.5473333333333333, "grad_norm": 60.014034271240234, "learning_rate": 8.954715367323466e-08, "logits/chosen": 2.406093120574951, "logits/rejected": 2.36135196685791, "logps/chosen": -93.75370788574219, "logps/rejected": -28.023881912231445, "loss": 1.512, "nll_loss": 0.9868811368942261, "rewards/accuracies": 1.0, "rewards/chosen": 3.173887252807617, "rewards/margins": 2.1422810554504395, "rewards/rejected": 1.0316060781478882, "step": 3284 }, { "epoch": 0.5475, "grad_norm": 28.503610610961914, "learning_rate": 8.949347164748761e-08, "logits/chosen": 1.2902414798736572, "logits/rejected": 1.846441388130188, "logps/chosen": -76.42942810058594, "logps/rejected": -143.47958374023438, "loss": 0.9843, "nll_loss": 0.9553678631782532, "rewards/accuracies": 1.0, "rewards/chosen": 1.492889404296875, "rewards/margins": 5.485226631164551, "rewards/rejected": -3.9923369884490967, "step": 3285 }, { "epoch": 0.5476666666666666, "grad_norm": 37.862693786621094, "learning_rate": 8.943979268309069e-08, "logits/chosen": 2.9073829650878906, "logits/rejected": 2.8125128746032715, "logps/chosen": -32.26777267456055, "logps/rejected": -39.254783630371094, "loss": 0.6556, "nll_loss": 0.5975514054298401, "rewards/accuracies": 1.0, "rewards/chosen": 1.4508812427520752, "rewards/margins": 4.265873908996582, "rewards/rejected": -2.814992904663086, "step": 3286 }, { "epoch": 0.5478333333333333, "grad_norm": 35.83599090576172, "learning_rate": 8.938611679568465e-08, "logits/chosen": 1.764106035232544, "logits/rejected": 2.876542329788208, "logps/chosen": -29.21599578857422, "logps/rejected": -386.8475036621094, "loss": 0.6912, "nll_loss": 0.6794418096542358, "rewards/accuracies": 1.0, "rewards/chosen": 1.8716548681259155, "rewards/margins": 8.508621215820312, "rewards/rejected": -6.636966705322266, "step": 3287 }, { "epoch": 0.548, "grad_norm": 61.32816696166992, "learning_rate": 8.933244400090936e-08, "logits/chosen": 1.83305025100708, "logits/rejected": 2.134019136428833, "logps/chosen": -59.918643951416016, "logps/rejected": -465.5205078125, "loss": 2.0756, "nll_loss": 2.066160202026367, "rewards/accuracies": 1.0, "rewards/chosen": 2.011070728302002, "rewards/margins": 13.08220100402832, "rewards/rejected": -11.071130752563477, "step": 3288 }, { "epoch": 0.5481666666666667, "grad_norm": 192.04318237304688, "learning_rate": 8.927877431440378e-08, "logits/chosen": 2.3885679244995117, "logits/rejected": 2.5201871395111084, "logps/chosen": -49.493377685546875, "logps/rejected": -11.887979507446289, "loss": 3.0049, "nll_loss": 0.7733339667320251, "rewards/accuracies": 0.0, "rewards/chosen": 0.9568565487861633, "rewards/margins": -1.6679186820983887, "rewards/rejected": 2.6247751712799072, "step": 3289 }, { "epoch": 0.5483333333333333, "grad_norm": 16.403728485107422, "learning_rate": 8.922510775180598e-08, "logits/chosen": 3.1109580993652344, "logits/rejected": 3.0510618686676025, "logps/chosen": -9.971738815307617, "logps/rejected": -130.1225128173828, "loss": 0.2119, "nll_loss": 0.2077445536851883, "rewards/accuracies": 1.0, "rewards/chosen": 2.878782272338867, "rewards/margins": 11.657724380493164, "rewards/rejected": -8.778942108154297, "step": 3290 }, { "epoch": 0.5485, "grad_norm": 35.22256851196289, "learning_rate": 8.91714443287531e-08, "logits/chosen": 2.3901801109313965, "logits/rejected": 2.5738840103149414, "logps/chosen": -126.8822021484375, "logps/rejected": -328.773681640625, "loss": 1.2922, "nll_loss": 1.2318657636642456, "rewards/accuracies": 1.0, "rewards/chosen": 0.3330734372138977, "rewards/margins": 4.698179721832275, "rewards/rejected": -4.365106105804443, "step": 3291 }, { "epoch": 0.5486666666666666, "grad_norm": 33.51787185668945, "learning_rate": 8.911778406088137e-08, "logits/chosen": 3.13847017288208, "logits/rejected": 3.1976654529571533, "logps/chosen": -32.04323196411133, "logps/rejected": -317.97515869140625, "loss": 0.7359, "nll_loss": 0.7120718955993652, "rewards/accuracies": 1.0, "rewards/chosen": 1.1423553228378296, "rewards/margins": 7.188571453094482, "rewards/rejected": -6.046216011047363, "step": 3292 }, { "epoch": 0.5488333333333333, "grad_norm": 23.587915420532227, "learning_rate": 8.90641269638261e-08, "logits/chosen": 1.4676854610443115, "logits/rejected": 1.4351810216903687, "logps/chosen": -104.4012451171875, "logps/rejected": -74.48454284667969, "loss": 1.1371, "nll_loss": 1.1225939989089966, "rewards/accuracies": 1.0, "rewards/chosen": 2.0561394691467285, "rewards/margins": 6.843025207519531, "rewards/rejected": -4.786885738372803, "step": 3293 }, { "epoch": 0.549, "grad_norm": 25.39549446105957, "learning_rate": 8.901047305322171e-08, "logits/chosen": 2.5373804569244385, "logits/rejected": 2.5353965759277344, "logps/chosen": -85.24435424804688, "logps/rejected": -197.21995544433594, "loss": 0.8869, "nll_loss": 0.8788078427314758, "rewards/accuracies": 1.0, "rewards/chosen": 2.3678009510040283, "rewards/margins": 8.4419584274292, "rewards/rejected": -6.07415771484375, "step": 3294 }, { "epoch": 0.5491666666666667, "grad_norm": 25.27012825012207, "learning_rate": 8.895682234470162e-08, "logits/chosen": 1.6891580820083618, "logits/rejected": 2.0714128017425537, "logps/chosen": -166.01353454589844, "logps/rejected": -249.0306396484375, "loss": 1.0402, "nll_loss": 1.0061426162719727, "rewards/accuracies": 1.0, "rewards/chosen": 0.846565306186676, "rewards/margins": 6.000730991363525, "rewards/rejected": -5.154165744781494, "step": 3295 }, { "epoch": 0.5493333333333333, "grad_norm": 21.34957504272461, "learning_rate": 8.890317485389838e-08, "logits/chosen": 3.0764482021331787, "logits/rejected": 3.156001567840576, "logps/chosen": -72.21223449707031, "logps/rejected": -333.67156982421875, "loss": 0.692, "nll_loss": 0.6812474131584167, "rewards/accuracies": 1.0, "rewards/chosen": 1.9352692365646362, "rewards/margins": 8.976068496704102, "rewards/rejected": -7.040799140930176, "step": 3296 }, { "epoch": 0.5495, "grad_norm": 66.19416809082031, "learning_rate": 8.884953059644359e-08, "logits/chosen": 0.953924298286438, "logits/rejected": 1.762398362159729, "logps/chosen": -32.188053131103516, "logps/rejected": -250.13829040527344, "loss": 1.0071, "nll_loss": 1.0058765411376953, "rewards/accuracies": 1.0, "rewards/chosen": 5.756620407104492, "rewards/margins": 12.113861083984375, "rewards/rejected": -6.357240200042725, "step": 3297 }, { "epoch": 0.5496666666666666, "grad_norm": 27.651395797729492, "learning_rate": 8.879588958796787e-08, "logits/chosen": 2.3323094844818115, "logits/rejected": 2.584172487258911, "logps/chosen": -95.27252197265625, "logps/rejected": -270.894287109375, "loss": 1.1317, "nll_loss": 1.1078200340270996, "rewards/accuracies": 1.0, "rewards/chosen": 1.1439666748046875, "rewards/margins": 7.125120639801025, "rewards/rejected": -5.981153964996338, "step": 3298 }, { "epoch": 0.5498333333333333, "grad_norm": 27.981399536132812, "learning_rate": 8.874225184410097e-08, "logits/chosen": 1.560922622680664, "logits/rejected": 2.365190029144287, "logps/chosen": -87.25115966796875, "logps/rejected": -331.73876953125, "loss": 0.9182, "nll_loss": 0.8994964957237244, "rewards/accuracies": 1.0, "rewards/chosen": 1.3232421875, "rewards/margins": 9.132455825805664, "rewards/rejected": -7.809213638305664, "step": 3299 }, { "epoch": 0.55, "grad_norm": 24.980131149291992, "learning_rate": 8.868861738047158e-08, "logits/chosen": 0.9923567175865173, "logits/rejected": 2.4867565631866455, "logps/chosen": -8.947586059570312, "logps/rejected": -363.8792724609375, "loss": 0.3007, "nll_loss": 0.27961209416389465, "rewards/accuracies": 1.0, "rewards/chosen": 1.2820838689804077, "rewards/margins": 7.26303243637085, "rewards/rejected": -5.980948448181152, "step": 3300 }, { "epoch": 0.5501666666666667, "grad_norm": 22.193368911743164, "learning_rate": 8.863498621270754e-08, "logits/chosen": 2.4989748001098633, "logits/rejected": 2.520397186279297, "logps/chosen": -149.5230255126953, "logps/rejected": -201.62689208984375, "loss": 1.0351, "nll_loss": 1.0241303443908691, "rewards/accuracies": 1.0, "rewards/chosen": 1.8986772298812866, "rewards/margins": 9.156590461730957, "rewards/rejected": -7.257913589477539, "step": 3301 }, { "epoch": 0.5503333333333333, "grad_norm": 22.505245208740234, "learning_rate": 8.858135835643565e-08, "logits/chosen": 3.9256532192230225, "logits/rejected": 4.1067705154418945, "logps/chosen": -66.21902465820312, "logps/rejected": -166.9647216796875, "loss": 0.7755, "nll_loss": 0.769988477230072, "rewards/accuracies": 1.0, "rewards/chosen": 2.6036834716796875, "rewards/margins": 10.504608154296875, "rewards/rejected": -7.9009246826171875, "step": 3302 }, { "epoch": 0.5505, "grad_norm": 19.383277893066406, "learning_rate": 8.852773382728183e-08, "logits/chosen": 2.511209726333618, "logits/rejected": 2.8252971172332764, "logps/chosen": -56.67360305786133, "logps/rejected": -315.3894348144531, "loss": 0.6037, "nll_loss": 0.5903500914573669, "rewards/accuracies": 1.0, "rewards/chosen": 1.6644413471221924, "rewards/margins": 10.534893035888672, "rewards/rejected": -8.870451927185059, "step": 3303 }, { "epoch": 0.5506666666666666, "grad_norm": 32.304386138916016, "learning_rate": 8.847411264087092e-08, "logits/chosen": 2.203495502471924, "logits/rejected": 1.5474300384521484, "logps/chosen": -257.3394470214844, "logps/rejected": -271.90155029296875, "loss": 1.3167, "nll_loss": 1.2996941804885864, "rewards/accuracies": 1.0, "rewards/chosen": 1.5164703130722046, "rewards/margins": 7.546348571777344, "rewards/rejected": -6.02987813949585, "step": 3304 }, { "epoch": 0.5508333333333333, "grad_norm": 31.155696868896484, "learning_rate": 8.84204948128269e-08, "logits/chosen": 1.1164472103118896, "logits/rejected": 2.1429355144500732, "logps/chosen": -88.12775421142578, "logps/rejected": -354.03460693359375, "loss": 1.1952, "nll_loss": 1.1750366687774658, "rewards/accuracies": 1.0, "rewards/chosen": 1.3308753967285156, "rewards/margins": 7.3115644454956055, "rewards/rejected": -5.98068904876709, "step": 3305 }, { "epoch": 0.551, "grad_norm": 60.89202117919922, "learning_rate": 8.836688035877266e-08, "logits/chosen": 2.6698813438415527, "logits/rejected": 2.5092592239379883, "logps/chosen": -17.611244201660156, "logps/rejected": -48.429237365722656, "loss": 0.8385, "nll_loss": 0.8005111813545227, "rewards/accuracies": 1.0, "rewards/chosen": 1.613229751586914, "rewards/margins": 4.9658684730529785, "rewards/rejected": -3.3526387214660645, "step": 3306 }, { "epoch": 0.5511666666666667, "grad_norm": 44.95425033569336, "learning_rate": 8.831326929433025e-08, "logits/chosen": 2.318957805633545, "logits/rejected": 2.49661922454834, "logps/chosen": -9.404950141906738, "logps/rejected": -179.81768798828125, "loss": 0.4163, "nll_loss": 0.408910870552063, "rewards/accuracies": 1.0, "rewards/chosen": 2.426220417022705, "rewards/margins": 8.768077850341797, "rewards/rejected": -6.34185791015625, "step": 3307 }, { "epoch": 0.5513333333333333, "grad_norm": 30.846059799194336, "learning_rate": 8.825966163512055e-08, "logits/chosen": 3.0005733966827393, "logits/rejected": 3.2343056201934814, "logps/chosen": -37.390289306640625, "logps/rejected": -156.5017852783203, "loss": 0.6559, "nll_loss": 0.6337338089942932, "rewards/accuracies": 1.0, "rewards/chosen": 1.2092437744140625, "rewards/margins": 7.325650215148926, "rewards/rejected": -6.116406440734863, "step": 3308 }, { "epoch": 0.5515, "grad_norm": 75.68883514404297, "learning_rate": 8.820605739676362e-08, "logits/chosen": 1.877332091331482, "logits/rejected": 2.6220555305480957, "logps/chosen": -53.74995803833008, "logps/rejected": -248.41802978515625, "loss": 1.935, "nll_loss": 1.9196414947509766, "rewards/accuracies": 1.0, "rewards/chosen": 1.7534314393997192, "rewards/margins": 7.12476921081543, "rewards/rejected": -5.371337890625, "step": 3309 }, { "epoch": 0.5516666666666666, "grad_norm": 174.74679565429688, "learning_rate": 8.815245659487839e-08, "logits/chosen": 1.61161470413208, "logits/rejected": 1.7530677318572998, "logps/chosen": -25.80514907836914, "logps/rejected": -12.949254035949707, "loss": 3.6916, "nll_loss": 0.549045741558075, "rewards/accuracies": 0.0, "rewards/chosen": 0.8388828635215759, "rewards/margins": -2.6869559288024902, "rewards/rejected": 3.525838851928711, "step": 3310 }, { "epoch": 0.5518333333333333, "grad_norm": 37.33296585083008, "learning_rate": 8.809885924508293e-08, "logits/chosen": 2.74570894241333, "logits/rejected": 2.4141013622283936, "logps/chosen": -175.59896850585938, "logps/rejected": -27.03858184814453, "loss": 0.872, "nll_loss": 0.7256155014038086, "rewards/accuracies": 1.0, "rewards/chosen": 1.6844438314437866, "rewards/margins": 3.1417055130004883, "rewards/rejected": -1.457261562347412, "step": 3311 }, { "epoch": 0.552, "grad_norm": 30.807783126831055, "learning_rate": 8.804526536299412e-08, "logits/chosen": 2.432793378829956, "logits/rejected": 2.487183094024658, "logps/chosen": -66.5806884765625, "logps/rejected": -318.21112060546875, "loss": 0.871, "nll_loss": 0.8427936434745789, "rewards/accuracies": 1.0, "rewards/chosen": 0.9139907956123352, "rewards/margins": 7.533511638641357, "rewards/rejected": -6.619520664215088, "step": 3312 }, { "epoch": 0.5521666666666667, "grad_norm": 218.77503967285156, "learning_rate": 8.7991674964228e-08, "logits/chosen": 2.0524275302886963, "logits/rejected": 2.224536418914795, "logps/chosen": -25.98755645751953, "logps/rejected": -69.5029067993164, "loss": 2.4888, "nll_loss": 0.4480613172054291, "rewards/accuracies": 0.0, "rewards/chosen": 0.4990299344062805, "rewards/margins": -1.5315182209014893, "rewards/rejected": 2.030548095703125, "step": 3313 }, { "epoch": 0.5523333333333333, "grad_norm": 21.063955307006836, "learning_rate": 8.793808806439947e-08, "logits/chosen": 2.068650484085083, "logits/rejected": 2.6838901042938232, "logps/chosen": -49.76397705078125, "logps/rejected": -287.80023193359375, "loss": 0.5995, "nll_loss": 0.585458517074585, "rewards/accuracies": 1.0, "rewards/chosen": 1.6008331775665283, "rewards/margins": 12.583593368530273, "rewards/rejected": -10.982760429382324, "step": 3314 }, { "epoch": 0.5525, "grad_norm": 26.623567581176758, "learning_rate": 8.788450467912255e-08, "logits/chosen": 2.617753744125366, "logits/rejected": 2.909369468688965, "logps/chosen": -75.6725082397461, "logps/rejected": -303.1973571777344, "loss": 0.7954, "nll_loss": 0.7721685171127319, "rewards/accuracies": 1.0, "rewards/chosen": 2.6459357738494873, "rewards/margins": 6.123114585876465, "rewards/rejected": -3.4771790504455566, "step": 3315 }, { "epoch": 0.5526666666666666, "grad_norm": 27.184907913208008, "learning_rate": 8.783092482401004e-08, "logits/chosen": 1.7328712940216064, "logits/rejected": 2.0088071823120117, "logps/chosen": -82.57679748535156, "logps/rejected": -194.88571166992188, "loss": 0.9175, "nll_loss": 0.9074374437332153, "rewards/accuracies": 1.0, "rewards/chosen": 1.9613968133926392, "rewards/margins": 10.201102256774902, "rewards/rejected": -8.239705085754395, "step": 3316 }, { "epoch": 0.5528333333333333, "grad_norm": 27.292285919189453, "learning_rate": 8.77773485146739e-08, "logits/chosen": 0.5118628740310669, "logits/rejected": 1.7455394268035889, "logps/chosen": -33.65584945678711, "logps/rejected": -374.28204345703125, "loss": 0.6136, "nll_loss": 0.5904535055160522, "rewards/accuracies": 1.0, "rewards/chosen": 1.077056884765625, "rewards/margins": 14.38535213470459, "rewards/rejected": -13.308295249938965, "step": 3317 }, { "epoch": 0.553, "grad_norm": 32.466331481933594, "learning_rate": 8.7723775766725e-08, "logits/chosen": 0.35393041372299194, "logits/rejected": 1.0525764226913452, "logps/chosen": -45.71584701538086, "logps/rejected": -257.31951904296875, "loss": 0.7601, "nll_loss": 0.7494402527809143, "rewards/accuracies": 1.0, "rewards/chosen": 1.945056676864624, "rewards/margins": 8.884913444519043, "rewards/rejected": -6.93985652923584, "step": 3318 }, { "epoch": 0.5531666666666667, "grad_norm": 277.3243103027344, "learning_rate": 8.76702065957731e-08, "logits/chosen": 2.6397311687469482, "logits/rejected": 2.4975192546844482, "logps/chosen": -116.80152893066406, "logps/rejected": -38.22465896606445, "loss": 3.0711, "nll_loss": 1.4072471857070923, "rewards/accuracies": 0.0, "rewards/chosen": -1.8264970779418945, "rewards/margins": -1.3025496006011963, "rewards/rejected": -0.523947536945343, "step": 3319 }, { "epoch": 0.5533333333333333, "grad_norm": 28.000486373901367, "learning_rate": 8.761664101742699e-08, "logits/chosen": 1.090618371963501, "logits/rejected": 1.5558115243911743, "logps/chosen": -126.03749084472656, "logps/rejected": -270.735595703125, "loss": 1.1301, "nll_loss": 1.1253347396850586, "rewards/accuracies": 1.0, "rewards/chosen": 2.712139844894409, "rewards/margins": 13.857965469360352, "rewards/rejected": -11.145825386047363, "step": 3320 }, { "epoch": 0.5535, "grad_norm": 33.01564407348633, "learning_rate": 8.756307904729439e-08, "logits/chosen": 0.9520569443702698, "logits/rejected": 1.6118346452713013, "logps/chosen": -85.85355377197266, "logps/rejected": -309.05462646484375, "loss": 1.1559, "nll_loss": 1.1296519041061401, "rewards/accuracies": 1.0, "rewards/chosen": 0.9623283743858337, "rewards/margins": 8.340331077575684, "rewards/rejected": -7.378003120422363, "step": 3321 }, { "epoch": 0.5536666666666666, "grad_norm": 22.89649772644043, "learning_rate": 8.750952070098198e-08, "logits/chosen": 3.0729808807373047, "logits/rejected": 3.119140863418579, "logps/chosen": -53.385986328125, "logps/rejected": -197.15283203125, "loss": 0.6675, "nll_loss": 0.6510486006736755, "rewards/accuracies": 1.0, "rewards/chosen": 1.9246453046798706, "rewards/margins": 6.596419811248779, "rewards/rejected": -4.671774387359619, "step": 3322 }, { "epoch": 0.5538333333333333, "grad_norm": 23.219520568847656, "learning_rate": 8.745596599409535e-08, "logits/chosen": 2.252394199371338, "logits/rejected": 2.62766695022583, "logps/chosen": -12.787097930908203, "logps/rejected": -148.9256134033203, "loss": 0.3844, "nll_loss": 0.35519713163375854, "rewards/accuracies": 1.0, "rewards/chosen": 1.0909795761108398, "rewards/margins": 5.940689563751221, "rewards/rejected": -4.849709987640381, "step": 3323 }, { "epoch": 0.554, "grad_norm": 19.65143585205078, "learning_rate": 8.74024149422391e-08, "logits/chosen": 2.78975248336792, "logits/rejected": 3.100945234298706, "logps/chosen": -64.29457092285156, "logps/rejected": -316.5599670410156, "loss": 0.5817, "nll_loss": 0.5689785480499268, "rewards/accuracies": 1.0, "rewards/chosen": 1.7458152770996094, "rewards/margins": 8.866277694702148, "rewards/rejected": -7.120461940765381, "step": 3324 }, { "epoch": 0.5541666666666667, "grad_norm": 18.607486724853516, "learning_rate": 8.734886756101669e-08, "logits/chosen": 2.3279359340667725, "logits/rejected": 2.6237738132476807, "logps/chosen": -47.3909912109375, "logps/rejected": -237.19155883789062, "loss": 0.5342, "nll_loss": 0.520780086517334, "rewards/accuracies": 1.0, "rewards/chosen": 1.6445504426956177, "rewards/margins": 12.703886032104492, "rewards/rejected": -11.059335708618164, "step": 3325 }, { "epoch": 0.5543333333333333, "grad_norm": 543.5948486328125, "learning_rate": 8.729532386603054e-08, "logits/chosen": 2.5486700534820557, "logits/rejected": 2.4829115867614746, "logps/chosen": -276.37127685546875, "logps/rejected": -199.56076049804688, "loss": 4.3006, "nll_loss": 1.988282561302185, "rewards/accuracies": 0.0, "rewards/chosen": -3.614074945449829, "rewards/margins": -1.987680196762085, "rewards/rejected": -1.6263947486877441, "step": 3326 }, { "epoch": 0.5545, "grad_norm": 54.13216018676758, "learning_rate": 8.7241783872882e-08, "logits/chosen": 2.5832207202911377, "logits/rejected": 2.708062171936035, "logps/chosen": -68.89278411865234, "logps/rejected": -25.11960220336914, "loss": 1.1086, "nll_loss": 0.7918710112571716, "rewards/accuracies": 1.0, "rewards/chosen": 2.175016164779663, "rewards/margins": 2.376560926437378, "rewards/rejected": -0.20154476165771484, "step": 3327 }, { "epoch": 0.5546666666666666, "grad_norm": 27.87923812866211, "learning_rate": 8.718824759717137e-08, "logits/chosen": 2.8309073448181152, "logits/rejected": 2.816964626312256, "logps/chosen": -65.07403564453125, "logps/rejected": -201.88284301757812, "loss": 0.7562, "nll_loss": 0.7394776940345764, "rewards/accuracies": 1.0, "rewards/chosen": 1.4262436628341675, "rewards/margins": 9.349662780761719, "rewards/rejected": -7.923418998718262, "step": 3328 }, { "epoch": 0.5548333333333333, "grad_norm": 29.298307418823242, "learning_rate": 8.713471505449779e-08, "logits/chosen": 2.7662241458892822, "logits/rejected": 2.7421317100524902, "logps/chosen": -80.96243286132812, "logps/rejected": -138.1414794921875, "loss": 0.9616, "nll_loss": 0.9414234757423401, "rewards/accuracies": 1.0, "rewards/chosen": 1.290715217590332, "rewards/margins": 7.588033676147461, "rewards/rejected": -6.297318458557129, "step": 3329 }, { "epoch": 0.555, "grad_norm": 26.7702693939209, "learning_rate": 8.708118626045939e-08, "logits/chosen": 3.0253210067749023, "logits/rejected": 3.0483593940734863, "logps/chosen": -73.65013122558594, "logps/rejected": -139.3936004638672, "loss": 0.9348, "nll_loss": 0.9092608690261841, "rewards/accuracies": 1.0, "rewards/chosen": 1.9572221040725708, "rewards/margins": 5.68803596496582, "rewards/rejected": -3.730813980102539, "step": 3330 }, { "epoch": 0.5551666666666667, "grad_norm": 71.9868392944336, "learning_rate": 8.702766123065314e-08, "logits/chosen": 3.313647508621216, "logits/rejected": 3.3766191005706787, "logps/chosen": -12.983441352844238, "logps/rejected": -306.4344482421875, "loss": 0.8756, "nll_loss": 0.8655627965927124, "rewards/accuracies": 1.0, "rewards/chosen": 1.9367427825927734, "rewards/margins": 12.126594543457031, "rewards/rejected": -10.189851760864258, "step": 3331 }, { "epoch": 0.5553333333333333, "grad_norm": 24.066944122314453, "learning_rate": 8.697413998067498e-08, "logits/chosen": 1.9231494665145874, "logits/rejected": 2.275423288345337, "logps/chosen": -55.859336853027344, "logps/rejected": -358.35467529296875, "loss": 0.7409, "nll_loss": 0.7349912524223328, "rewards/accuracies": 1.0, "rewards/chosen": 2.495206594467163, "rewards/margins": 11.518959045410156, "rewards/rejected": -9.023752212524414, "step": 3332 }, { "epoch": 0.5555, "grad_norm": 96.02383422851562, "learning_rate": 8.692062252611971e-08, "logits/chosen": 1.126871943473816, "logits/rejected": 2.201552391052246, "logps/chosen": -8.448797225952148, "logps/rejected": -218.1661834716797, "loss": 0.717, "nll_loss": 0.7040664553642273, "rewards/accuracies": 1.0, "rewards/chosen": 2.0262675285339355, "rewards/margins": 7.211730480194092, "rewards/rejected": -5.185462951660156, "step": 3333 }, { "epoch": 0.5556666666666666, "grad_norm": 33.33598709106445, "learning_rate": 8.686710888258104e-08, "logits/chosen": 1.9380319118499756, "logits/rejected": 2.120335578918457, "logps/chosen": -25.060415267944336, "logps/rejected": -91.32862854003906, "loss": 0.5545, "nll_loss": 0.5220919251441956, "rewards/accuracies": 1.0, "rewards/chosen": 1.365248441696167, "rewards/margins": 5.2763142585754395, "rewards/rejected": -3.9110658168792725, "step": 3334 }, { "epoch": 0.5558333333333333, "grad_norm": 36.37420654296875, "learning_rate": 8.681359906565154e-08, "logits/chosen": 1.6130666732788086, "logits/rejected": 2.9854326248168945, "logps/chosen": -30.91812515258789, "logps/rejected": -314.2396240234375, "loss": 0.8857, "nll_loss": 0.8833749890327454, "rewards/accuracies": 1.0, "rewards/chosen": 3.746894598007202, "rewards/margins": 10.584104537963867, "rewards/rejected": -6.837210178375244, "step": 3335 }, { "epoch": 0.556, "grad_norm": 102.33338165283203, "learning_rate": 8.676009309092271e-08, "logits/chosen": 2.4305837154388428, "logits/rejected": 2.4989492893218994, "logps/chosen": -71.71504211425781, "logps/rejected": -18.696611404418945, "loss": 2.3879, "nll_loss": 1.4635721445083618, "rewards/accuracies": 1.0, "rewards/chosen": 3.317554473876953, "rewards/margins": 1.0123140811920166, "rewards/rejected": 2.3052403926849365, "step": 3336 }, { "epoch": 0.5561666666666667, "grad_norm": 42.33601760864258, "learning_rate": 8.670659097398487e-08, "logits/chosen": 2.279325485229492, "logits/rejected": 2.431492805480957, "logps/chosen": -30.19788360595703, "logps/rejected": -267.64056396484375, "loss": 0.8411, "nll_loss": 0.8388301134109497, "rewards/accuracies": 1.0, "rewards/chosen": 4.293393611907959, "rewards/margins": 10.395807266235352, "rewards/rejected": -6.102414131164551, "step": 3337 }, { "epoch": 0.5563333333333333, "grad_norm": 34.96859359741211, "learning_rate": 8.665309273042729e-08, "logits/chosen": 3.260012149810791, "logits/rejected": 3.498263359069824, "logps/chosen": -17.631765365600586, "logps/rejected": -148.548583984375, "loss": 0.6304, "nll_loss": 0.5877254605293274, "rewards/accuracies": 1.0, "rewards/chosen": 0.7393137216567993, "rewards/margins": 5.165061950683594, "rewards/rejected": -4.425748348236084, "step": 3338 }, { "epoch": 0.5565, "grad_norm": 19.707687377929688, "learning_rate": 8.659959837583807e-08, "logits/chosen": 3.3116774559020996, "logits/rejected": 3.364349126815796, "logps/chosen": -142.7676239013672, "logps/rejected": -299.8190002441406, "loss": 0.9155, "nll_loss": 0.9093477129936218, "rewards/accuracies": 1.0, "rewards/chosen": 2.467878818511963, "rewards/margins": 10.767717361450195, "rewards/rejected": -8.299838066101074, "step": 3339 }, { "epoch": 0.5566666666666666, "grad_norm": 163.39669799804688, "learning_rate": 8.654610792580414e-08, "logits/chosen": 3.8228988647460938, "logits/rejected": 3.862861156463623, "logps/chosen": -35.31819152832031, "logps/rejected": -95.62493133544922, "loss": 1.454, "nll_loss": 0.5886365175247192, "rewards/accuracies": 1.0, "rewards/chosen": 2.166285753250122, "rewards/margins": 0.6560394763946533, "rewards/rejected": 1.5102462768554688, "step": 3340 }, { "epoch": 0.5568333333333333, "grad_norm": 23.755319595336914, "learning_rate": 8.64926213959114e-08, "logits/chosen": 1.5770026445388794, "logits/rejected": 1.6445866823196411, "logps/chosen": -122.83590698242188, "logps/rejected": -152.41522216796875, "loss": 0.9295, "nll_loss": 0.9098955988883972, "rewards/accuracies": 1.0, "rewards/chosen": 2.0213470458984375, "rewards/margins": 6.168154239654541, "rewards/rejected": -4.1468071937561035, "step": 3341 }, { "epoch": 0.557, "grad_norm": 24.269126892089844, "learning_rate": 8.643913880174447e-08, "logits/chosen": 2.3104891777038574, "logits/rejected": 2.3192238807678223, "logps/chosen": -49.968353271484375, "logps/rejected": -81.25871276855469, "loss": 0.67, "nll_loss": 0.6489395499229431, "rewards/accuracies": 1.0, "rewards/chosen": 1.6851792335510254, "rewards/margins": 6.1260986328125, "rewards/rejected": -4.440919399261475, "step": 3342 }, { "epoch": 0.5571666666666667, "grad_norm": 26.851905822753906, "learning_rate": 8.638566015888696e-08, "logits/chosen": 1.2887929677963257, "logits/rejected": 1.8751500844955444, "logps/chosen": -59.591224670410156, "logps/rejected": -345.77154541015625, "loss": 0.9068, "nll_loss": 0.9028973579406738, "rewards/accuracies": 1.0, "rewards/chosen": 2.8952577114105225, "rewards/margins": 13.828862190246582, "rewards/rejected": -10.93360424041748, "step": 3343 }, { "epoch": 0.5573333333333333, "grad_norm": 16.911296844482422, "learning_rate": 8.63321854829212e-08, "logits/chosen": 2.9552550315856934, "logits/rejected": 3.1109604835510254, "logps/chosen": -222.30908203125, "logps/rejected": -382.5299377441406, "loss": 0.9908, "nll_loss": 0.9836684465408325, "rewards/accuracies": 1.0, "rewards/chosen": 2.440869092941284, "rewards/margins": 8.843387603759766, "rewards/rejected": -6.402518272399902, "step": 3344 }, { "epoch": 0.5575, "grad_norm": 33.34502410888672, "learning_rate": 8.62787147894285e-08, "logits/chosen": 2.0971460342407227, "logits/rejected": 1.995113492012024, "logps/chosen": -48.60970687866211, "logps/rejected": -105.7051773071289, "loss": 0.7485, "nll_loss": 0.725517988204956, "rewards/accuracies": 1.0, "rewards/chosen": 1.1103851795196533, "rewards/margins": 8.177078247070312, "rewards/rejected": -7.066693305969238, "step": 3345 }, { "epoch": 0.5576666666666666, "grad_norm": 35.384765625, "learning_rate": 8.622524809398886e-08, "logits/chosen": 1.8431907892227173, "logits/rejected": 2.5333240032196045, "logps/chosen": -41.73289489746094, "logps/rejected": -161.05252075195312, "loss": 0.8475, "nll_loss": 0.8346578478813171, "rewards/accuracies": 1.0, "rewards/chosen": 1.7266510725021362, "rewards/margins": 8.837515830993652, "rewards/rejected": -7.110864639282227, "step": 3346 }, { "epoch": 0.5578333333333333, "grad_norm": 25.204212188720703, "learning_rate": 8.617178541218124e-08, "logits/chosen": 1.1389079093933105, "logits/rejected": 3.0681369304656982, "logps/chosen": -57.2176513671875, "logps/rejected": -546.9151611328125, "loss": 0.7925, "nll_loss": 0.7838034629821777, "rewards/accuracies": 1.0, "rewards/chosen": 2.0860679149627686, "rewards/margins": 12.491470336914062, "rewards/rejected": -10.405402183532715, "step": 3347 }, { "epoch": 0.558, "grad_norm": 34.002994537353516, "learning_rate": 8.611832675958334e-08, "logits/chosen": 2.3839213848114014, "logits/rejected": 2.5749027729034424, "logps/chosen": -42.117122650146484, "logps/rejected": -313.51629638671875, "loss": 0.9194, "nll_loss": 0.9155898094177246, "rewards/accuracies": 1.0, "rewards/chosen": 4.759471416473389, "rewards/margins": 9.943577766418457, "rewards/rejected": -5.184106349945068, "step": 3348 }, { "epoch": 0.5581666666666667, "grad_norm": 39.524070739746094, "learning_rate": 8.60648721517718e-08, "logits/chosen": 1.0695853233337402, "logits/rejected": 2.0166187286376953, "logps/chosen": -64.00303649902344, "logps/rejected": -302.39019775390625, "loss": 1.1976, "nll_loss": 1.1852412223815918, "rewards/accuracies": 1.0, "rewards/chosen": 1.727325439453125, "rewards/margins": 10.898630142211914, "rewards/rejected": -9.171304702758789, "step": 3349 }, { "epoch": 0.5583333333333333, "grad_norm": 31.827199935913086, "learning_rate": 8.601142160432192e-08, "logits/chosen": 1.9162145853042603, "logits/rejected": 2.6793463230133057, "logps/chosen": -27.695144653320312, "logps/rejected": -202.12091064453125, "loss": 0.5903, "nll_loss": 0.5769822001457214, "rewards/accuracies": 1.0, "rewards/chosen": 1.6422044038772583, "rewards/margins": 11.986087799072266, "rewards/rejected": -10.343883514404297, "step": 3350 }, { "epoch": 0.5585, "grad_norm": 21.363994598388672, "learning_rate": 8.595797513280799e-08, "logits/chosen": 1.511707067489624, "logits/rejected": 2.2842111587524414, "logps/chosen": -70.88822937011719, "logps/rejected": -262.9447021484375, "loss": 0.7777, "nll_loss": 0.7705241441726685, "rewards/accuracies": 1.0, "rewards/chosen": 2.2895829677581787, "rewards/margins": 11.482048988342285, "rewards/rejected": -9.192465782165527, "step": 3351 }, { "epoch": 0.5586666666666666, "grad_norm": 27.591922760009766, "learning_rate": 8.590453275280296e-08, "logits/chosen": 1.8104782104492188, "logits/rejected": 2.380345106124878, "logps/chosen": -79.60299682617188, "logps/rejected": -156.82887268066406, "loss": 0.9545, "nll_loss": 0.9476546049118042, "rewards/accuracies": 1.0, "rewards/chosen": 2.5549087524414062, "rewards/margins": 8.652400970458984, "rewards/rejected": -6.097492694854736, "step": 3352 }, { "epoch": 0.5588333333333333, "grad_norm": 66.49998474121094, "learning_rate": 8.585109447987871e-08, "logits/chosen": 2.0194201469421387, "logits/rejected": 2.1560769081115723, "logps/chosen": -15.611985206604004, "logps/rejected": -33.92853927612305, "loss": 0.9428, "nll_loss": 0.7434279322624207, "rewards/accuracies": 1.0, "rewards/chosen": 1.733428955078125, "rewards/margins": 2.7685117721557617, "rewards/rejected": -1.0350826978683472, "step": 3353 }, { "epoch": 0.559, "grad_norm": 26.59214973449707, "learning_rate": 8.579766032960582e-08, "logits/chosen": 1.4490877389907837, "logits/rejected": 1.684624195098877, "logps/chosen": -35.583648681640625, "logps/rejected": -59.924076080322266, "loss": 0.5706, "nll_loss": 0.5157049894332886, "rewards/accuracies": 1.0, "rewards/chosen": 1.8202823400497437, "rewards/margins": 4.510122776031494, "rewards/rejected": -2.68984055519104, "step": 3354 }, { "epoch": 0.5591666666666667, "grad_norm": 31.152929306030273, "learning_rate": 8.574423031755375e-08, "logits/chosen": 2.9641170501708984, "logits/rejected": 3.0761513710021973, "logps/chosen": -74.78966522216797, "logps/rejected": -269.3736572265625, "loss": 1.0769, "nll_loss": 1.0684236288070679, "rewards/accuracies": 1.0, "rewards/chosen": 2.1433122158050537, "rewards/margins": 9.953408241271973, "rewards/rejected": -7.81009578704834, "step": 3355 }, { "epoch": 0.5593333333333333, "grad_norm": 39.22406005859375, "learning_rate": 8.569080445929072e-08, "logits/chosen": 2.263115644454956, "logits/rejected": 2.1726908683776855, "logps/chosen": -96.80484008789062, "logps/rejected": -44.05707550048828, "loss": 0.9902, "nll_loss": 0.7333701252937317, "rewards/accuracies": 1.0, "rewards/chosen": 2.838747501373291, "rewards/margins": 3.1769211292266846, "rewards/rejected": -0.33817368745803833, "step": 3356 }, { "epoch": 0.5595, "grad_norm": 101.25457000732422, "learning_rate": 8.563738277038376e-08, "logits/chosen": 3.1875569820404053, "logits/rejected": 3.2609212398529053, "logps/chosen": -13.750980377197266, "logps/rejected": -4.628179550170898, "loss": 1.5048, "nll_loss": 0.3437744975090027, "rewards/accuracies": 0.0, "rewards/chosen": 1.2531005144119263, "rewards/margins": -0.18388867378234863, "rewards/rejected": 1.436989188194275, "step": 3357 }, { "epoch": 0.5596666666666666, "grad_norm": 37.76780319213867, "learning_rate": 8.558396526639861e-08, "logits/chosen": 3.0466232299804688, "logits/rejected": 3.3260934352874756, "logps/chosen": -25.42443084716797, "logps/rejected": -276.47314453125, "loss": 0.6907, "nll_loss": 0.6871469020843506, "rewards/accuracies": 1.0, "rewards/chosen": 3.6878130435943604, "rewards/margins": 9.46080493927002, "rewards/rejected": -5.772992134094238, "step": 3358 }, { "epoch": 0.5598333333333333, "grad_norm": 52.56032943725586, "learning_rate": 8.553055196289991e-08, "logits/chosen": 2.9421987533569336, "logits/rejected": 2.9653353691101074, "logps/chosen": -13.610382080078125, "logps/rejected": -46.144287109375, "loss": 0.6772, "nll_loss": 0.6186537146568298, "rewards/accuracies": 1.0, "rewards/chosen": 1.1504634618759155, "rewards/margins": 4.190418243408203, "rewards/rejected": -3.039954662322998, "step": 3359 }, { "epoch": 0.56, "grad_norm": 35.02961349487305, "learning_rate": 8.547714287545099e-08, "logits/chosen": 2.0604476928710938, "logits/rejected": 2.062394618988037, "logps/chosen": -88.825927734375, "logps/rejected": -131.30294799804688, "loss": 1.112, "nll_loss": 1.096616268157959, "rewards/accuracies": 1.0, "rewards/chosen": 1.8388336896896362, "rewards/margins": 6.880873680114746, "rewards/rejected": -5.04203987121582, "step": 3360 }, { "epoch": 0.5601666666666667, "grad_norm": 20.82727813720703, "learning_rate": 8.542373801961396e-08, "logits/chosen": 2.344064712524414, "logits/rejected": 2.068635940551758, "logps/chosen": -78.23031616210938, "logps/rejected": -51.881629943847656, "loss": 0.701, "nll_loss": 0.6802636981010437, "rewards/accuracies": 1.0, "rewards/chosen": 2.384972333908081, "rewards/margins": 6.152640342712402, "rewards/rejected": -3.767667770385742, "step": 3361 }, { "epoch": 0.5603333333333333, "grad_norm": 20.77860450744629, "learning_rate": 8.537033741094975e-08, "logits/chosen": 2.6507344245910645, "logits/rejected": 2.516089677810669, "logps/chosen": -88.91692352294922, "logps/rejected": -52.1101188659668, "loss": 0.7246, "nll_loss": 0.6787551641464233, "rewards/accuracies": 1.0, "rewards/chosen": 2.5958945751190186, "rewards/margins": 5.24074649810791, "rewards/rejected": -2.6448521614074707, "step": 3362 }, { "epoch": 0.5605, "grad_norm": 35.19735336303711, "learning_rate": 8.531694106501795e-08, "logits/chosen": 1.9665040969848633, "logits/rejected": 2.292515993118286, "logps/chosen": -18.954635620117188, "logps/rejected": -489.59234619140625, "loss": 0.6233, "nll_loss": 0.6114398837089539, "rewards/accuracies": 1.0, "rewards/chosen": 4.138987064361572, "rewards/margins": 8.091050148010254, "rewards/rejected": -3.9520630836486816, "step": 3363 }, { "epoch": 0.5606666666666666, "grad_norm": 357.54119873046875, "learning_rate": 8.526354899737705e-08, "logits/chosen": 2.2172415256500244, "logits/rejected": 2.449544906616211, "logps/chosen": -83.6898193359375, "logps/rejected": -32.147064208984375, "loss": 5.5619, "nll_loss": 1.7806344032287598, "rewards/accuracies": 0.0, "rewards/chosen": -1.5979859828948975, "rewards/margins": -3.66225528717041, "rewards/rejected": 2.0642693042755127, "step": 3364 }, { "epoch": 0.5608333333333333, "grad_norm": 30.00697898864746, "learning_rate": 8.52101612235842e-08, "logits/chosen": 1.5371670722961426, "logits/rejected": 2.146655321121216, "logps/chosen": -106.14081573486328, "logps/rejected": -267.93365478515625, "loss": 1.0964, "nll_loss": 1.0830694437026978, "rewards/accuracies": 1.0, "rewards/chosen": 1.6782715320587158, "rewards/margins": 9.050561904907227, "rewards/rejected": -7.37229061126709, "step": 3365 }, { "epoch": 0.561, "grad_norm": 55.36207962036133, "learning_rate": 8.515677775919526e-08, "logits/chosen": 1.076481819152832, "logits/rejected": 1.8912910223007202, "logps/chosen": -69.2198715209961, "logps/rejected": -334.04302978515625, "loss": 1.7886, "nll_loss": 1.7304966449737549, "rewards/accuracies": 1.0, "rewards/chosen": 0.9390472769737244, "rewards/margins": 4.209539890289307, "rewards/rejected": -3.2704925537109375, "step": 3366 }, { "epoch": 0.5611666666666667, "grad_norm": 22.59783363342285, "learning_rate": 8.510339861976496e-08, "logits/chosen": 1.6661990880966187, "logits/rejected": 1.7561752796173096, "logps/chosen": -120.30352783203125, "logps/rejected": -152.85565185546875, "loss": 0.9079, "nll_loss": 0.8911372423171997, "rewards/accuracies": 1.0, "rewards/chosen": 2.274585008621216, "rewards/margins": 6.465435028076172, "rewards/rejected": -4.190850257873535, "step": 3367 }, { "epoch": 0.5613333333333334, "grad_norm": 34.717586517333984, "learning_rate": 8.505002382084666e-08, "logits/chosen": 1.3486149311065674, "logits/rejected": 1.7977466583251953, "logps/chosen": -19.494152069091797, "logps/rejected": -302.6070251464844, "loss": 0.5549, "nll_loss": 0.5415043234825134, "rewards/accuracies": 1.0, "rewards/chosen": 1.6430375576019287, "rewards/margins": 10.780606269836426, "rewards/rejected": -9.137568473815918, "step": 3368 }, { "epoch": 0.5615, "grad_norm": 30.505918502807617, "learning_rate": 8.499665337799252e-08, "logits/chosen": 1.2792850732803345, "logits/rejected": 1.9998118877410889, "logps/chosen": -18.712032318115234, "logps/rejected": -229.017578125, "loss": 0.4932, "nll_loss": 0.4678007960319519, "rewards/accuracies": 1.0, "rewards/chosen": 0.9661176800727844, "rewards/margins": 11.109484672546387, "rewards/rejected": -10.143366813659668, "step": 3369 }, { "epoch": 0.5616666666666666, "grad_norm": 16.334285736083984, "learning_rate": 8.494328730675337e-08, "logits/chosen": 1.1536483764648438, "logits/rejected": 1.296898603439331, "logps/chosen": -229.7735595703125, "logps/rejected": -265.620849609375, "loss": 0.8542, "nll_loss": 0.8478729724884033, "rewards/accuracies": 1.0, "rewards/chosen": 2.464586019515991, "rewards/margins": 10.021141052246094, "rewards/rejected": -7.556555271148682, "step": 3370 }, { "epoch": 0.5618333333333333, "grad_norm": 26.00081443786621, "learning_rate": 8.488992562267883e-08, "logits/chosen": 3.1860814094543457, "logits/rejected": 3.3942201137542725, "logps/chosen": -73.34199523925781, "logps/rejected": -255.41482543945312, "loss": 0.8156, "nll_loss": 0.8059560656547546, "rewards/accuracies": 1.0, "rewards/chosen": 2.0041139125823975, "rewards/margins": 9.761298179626465, "rewards/rejected": -7.757184028625488, "step": 3371 }, { "epoch": 0.562, "grad_norm": 26.674617767333984, "learning_rate": 8.48365683413172e-08, "logits/chosen": 2.475101947784424, "logits/rejected": 2.5930514335632324, "logps/chosen": -61.213069915771484, "logps/rejected": -223.7445526123047, "loss": 0.7508, "nll_loss": 0.7287270426750183, "rewards/accuracies": 1.0, "rewards/chosen": 1.1210590600967407, "rewards/margins": 9.3015775680542, "rewards/rejected": -8.18051815032959, "step": 3372 }, { "epoch": 0.5621666666666667, "grad_norm": 166.3050079345703, "learning_rate": 8.478321547821553e-08, "logits/chosen": 2.977041721343994, "logits/rejected": 3.126533031463623, "logps/chosen": -17.765235900878906, "logps/rejected": -89.09880065917969, "loss": 1.3425, "nll_loss": 0.6125944256782532, "rewards/accuracies": 1.0, "rewards/chosen": 0.9546634554862976, "rewards/margins": 0.5294772982597351, "rewards/rejected": 0.4251861572265625, "step": 3373 }, { "epoch": 0.5623333333333334, "grad_norm": 44.52350997924805, "learning_rate": 8.472986704891953e-08, "logits/chosen": 2.7891452312469482, "logits/rejected": 2.5791854858398438, "logps/chosen": -23.00737953186035, "logps/rejected": -88.84048461914062, "loss": 0.6754, "nll_loss": 0.6218210458755493, "rewards/accuracies": 1.0, "rewards/chosen": 0.8041088581085205, "rewards/margins": 4.422468185424805, "rewards/rejected": -3.618359327316284, "step": 3374 }, { "epoch": 0.5625, "grad_norm": 27.596263885498047, "learning_rate": 8.467652306897368e-08, "logits/chosen": 2.4933366775512695, "logits/rejected": 2.779829740524292, "logps/chosen": -68.07862854003906, "logps/rejected": -148.66217041015625, "loss": 0.9308, "nll_loss": 0.8841379284858704, "rewards/accuracies": 1.0, "rewards/chosen": 1.3466179370880127, "rewards/margins": 4.58920955657959, "rewards/rejected": -3.2425918579101562, "step": 3375 }, { "epoch": 0.5626666666666666, "grad_norm": 52.30241775512695, "learning_rate": 8.462318355392112e-08, "logits/chosen": 2.5380136966705322, "logits/rejected": 2.3382375240325928, "logps/chosen": -52.790557861328125, "logps/rejected": -74.62007904052734, "loss": 1.6226, "nll_loss": 1.5997138023376465, "rewards/accuracies": 1.0, "rewards/chosen": 1.1819061040878296, "rewards/margins": 7.058656692504883, "rewards/rejected": -5.876750469207764, "step": 3376 }, { "epoch": 0.5628333333333333, "grad_norm": 28.64083480834961, "learning_rate": 8.456984851930371e-08, "logits/chosen": 2.3007466793060303, "logits/rejected": 2.1142046451568604, "logps/chosen": -55.82419967651367, "logps/rejected": -106.34030151367188, "loss": 0.8722, "nll_loss": 0.8458212018013, "rewards/accuracies": 1.0, "rewards/chosen": 1.7463916540145874, "rewards/margins": 5.612030506134033, "rewards/rejected": -3.8656387329101562, "step": 3377 }, { "epoch": 0.563, "grad_norm": 76.3864517211914, "learning_rate": 8.451651798066201e-08, "logits/chosen": 3.075101852416992, "logits/rejected": 3.1420114040374756, "logps/chosen": -43.7797737121582, "logps/rejected": -53.079627990722656, "loss": 1.2329, "nll_loss": 0.8934648633003235, "rewards/accuracies": 1.0, "rewards/chosen": 1.3363803625106812, "rewards/margins": 1.8193775415420532, "rewards/rejected": -0.4829971492290497, "step": 3378 }, { "epoch": 0.5631666666666667, "grad_norm": 27.38187026977539, "learning_rate": 8.446319195353524e-08, "logits/chosen": 2.808201789855957, "logits/rejected": 2.8884384632110596, "logps/chosen": -82.03266143798828, "logps/rejected": -258.18682861328125, "loss": 0.8286, "nll_loss": 0.796433687210083, "rewards/accuracies": 1.0, "rewards/chosen": 0.7245125770568848, "rewards/margins": 8.437990188598633, "rewards/rejected": -7.71347713470459, "step": 3379 }, { "epoch": 0.5633333333333334, "grad_norm": 22.37790298461914, "learning_rate": 8.440987045346134e-08, "logits/chosen": 2.333810806274414, "logits/rejected": 2.7246315479278564, "logps/chosen": -84.2327880859375, "logps/rejected": -389.3082275390625, "loss": 0.734, "nll_loss": 0.7138371467590332, "rewards/accuracies": 1.0, "rewards/chosen": 1.4319305419921875, "rewards/margins": 6.696017742156982, "rewards/rejected": -5.264087200164795, "step": 3380 }, { "epoch": 0.5635, "grad_norm": 54.50912094116211, "learning_rate": 8.435655349597689e-08, "logits/chosen": 3.0711400508880615, "logits/rejected": 3.1065618991851807, "logps/chosen": -3.986109733581543, "logps/rejected": -201.49697875976562, "loss": 0.2982, "nll_loss": 0.28472211956977844, "rewards/accuracies": 1.0, "rewards/chosen": 1.638684868812561, "rewards/margins": 10.006878852844238, "rewards/rejected": -8.368193626403809, "step": 3381 }, { "epoch": 0.5636666666666666, "grad_norm": 39.04377365112305, "learning_rate": 8.430324109661722e-08, "logits/chosen": 2.9154701232910156, "logits/rejected": 3.221203088760376, "logps/chosen": -16.9134464263916, "logps/rejected": -157.14791870117188, "loss": 0.6445, "nll_loss": 0.6264238953590393, "rewards/accuracies": 1.0, "rewards/chosen": 1.457358717918396, "rewards/margins": 7.2696757316589355, "rewards/rejected": -5.81231689453125, "step": 3382 }, { "epoch": 0.5638333333333333, "grad_norm": 113.25624084472656, "learning_rate": 8.424993327091622e-08, "logits/chosen": 2.5003745555877686, "logits/rejected": 2.4551122188568115, "logps/chosen": -26.721805572509766, "logps/rejected": -27.98027801513672, "loss": 1.8894, "nll_loss": 1.6701128482818604, "rewards/accuracies": 1.0, "rewards/chosen": 0.2996494472026825, "rewards/margins": 2.0758886337280273, "rewards/rejected": -1.776239275932312, "step": 3383 }, { "epoch": 0.564, "grad_norm": 229.74871826171875, "learning_rate": 8.419663003440656e-08, "logits/chosen": 3.1348013877868652, "logits/rejected": 3.178361654281616, "logps/chosen": -16.826698303222656, "logps/rejected": -50.34717559814453, "loss": 6.9164, "nll_loss": 0.4547756314277649, "rewards/accuracies": 0.0, "rewards/chosen": 0.5910105109214783, "rewards/margins": -6.098465442657471, "rewards/rejected": 6.689476013183594, "step": 3384 }, { "epoch": 0.5641666666666667, "grad_norm": 19.037796020507812, "learning_rate": 8.414333140261949e-08, "logits/chosen": 2.5583596229553223, "logits/rejected": 2.693094253540039, "logps/chosen": -85.98812866210938, "logps/rejected": -209.96646118164062, "loss": 0.808, "nll_loss": 0.8036271333694458, "rewards/accuracies": 1.0, "rewards/chosen": 2.780024766921997, "rewards/margins": 12.775870323181152, "rewards/rejected": -9.995845794677734, "step": 3385 }, { "epoch": 0.5643333333333334, "grad_norm": 20.495590209960938, "learning_rate": 8.409003739108498e-08, "logits/chosen": 2.482607364654541, "logits/rejected": 2.6510422229766846, "logps/chosen": -54.16179275512695, "logps/rejected": -201.50950622558594, "loss": 0.5532, "nll_loss": 0.5309979319572449, "rewards/accuracies": 1.0, "rewards/chosen": 1.5342741012573242, "rewards/margins": 6.109483242034912, "rewards/rejected": -4.575209140777588, "step": 3386 }, { "epoch": 0.5645, "grad_norm": 33.4322395324707, "learning_rate": 8.40367480153316e-08, "logits/chosen": 1.4253638982772827, "logits/rejected": 1.9152418375015259, "logps/chosen": -106.45100402832031, "logps/rejected": -138.65135192871094, "loss": 1.2224, "nll_loss": 1.1960786581039429, "rewards/accuracies": 1.0, "rewards/chosen": 1.4550186395645142, "rewards/margins": 5.713891506195068, "rewards/rejected": -4.258872985839844, "step": 3387 }, { "epoch": 0.5646666666666667, "grad_norm": 39.57246017456055, "learning_rate": 8.398346329088663e-08, "logits/chosen": 0.6236076951026917, "logits/rejected": 1.7467997074127197, "logps/chosen": -63.44142150878906, "logps/rejected": -269.7118835449219, "loss": 1.2584, "nll_loss": 1.2439496517181396, "rewards/accuracies": 1.0, "rewards/chosen": 1.7369775772094727, "rewards/margins": 7.446846961975098, "rewards/rejected": -5.709869384765625, "step": 3388 }, { "epoch": 0.5648333333333333, "grad_norm": 42.92397689819336, "learning_rate": 8.39301832332759e-08, "logits/chosen": 2.2187724113464355, "logits/rejected": 2.301582098007202, "logps/chosen": -54.935142517089844, "logps/rejected": -77.89940643310547, "loss": 1.2419, "nll_loss": 1.2207810878753662, "rewards/accuracies": 1.0, "rewards/chosen": 1.376962423324585, "rewards/margins": 6.640285491943359, "rewards/rejected": -5.263322830200195, "step": 3389 }, { "epoch": 0.565, "grad_norm": 30.682628631591797, "learning_rate": 8.387690785802402e-08, "logits/chosen": 2.4571003913879395, "logits/rejected": 2.662741184234619, "logps/chosen": -11.45577621459961, "logps/rejected": -401.4388732910156, "loss": 0.5027, "nll_loss": 0.4980772137641907, "rewards/accuracies": 1.0, "rewards/chosen": 2.749081611633301, "rewards/margins": 10.922037124633789, "rewards/rejected": -8.172955513000488, "step": 3390 }, { "epoch": 0.5651666666666667, "grad_norm": 169.0624542236328, "learning_rate": 8.382363718065407e-08, "logits/chosen": 2.0147287845611572, "logits/rejected": 2.1714091300964355, "logps/chosen": -32.09418869018555, "logps/rejected": -14.995563507080078, "loss": 2.9244, "nll_loss": 0.5630559325218201, "rewards/accuracies": 0.0, "rewards/chosen": 0.6801014542579651, "rewards/margins": -1.8640167713165283, "rewards/rejected": 2.5441181659698486, "step": 3391 }, { "epoch": 0.5653333333333334, "grad_norm": 22.83107566833496, "learning_rate": 8.377037121668793e-08, "logits/chosen": 2.5059974193573, "logits/rejected": 2.5657999515533447, "logps/chosen": -100.16458129882812, "logps/rejected": -260.512939453125, "loss": 0.8918, "nll_loss": 0.8864122033119202, "rewards/accuracies": 1.0, "rewards/chosen": 2.58613920211792, "rewards/margins": 11.086872100830078, "rewards/rejected": -8.500732421875, "step": 3392 }, { "epoch": 0.5655, "grad_norm": 38.15096664428711, "learning_rate": 8.371710998164594e-08, "logits/chosen": 1.1482534408569336, "logits/rejected": 1.8667898178100586, "logps/chosen": -40.41306686401367, "logps/rejected": -371.03436279296875, "loss": 0.8092, "nll_loss": 0.8082612752914429, "rewards/accuracies": 1.0, "rewards/chosen": 4.4840474128723145, "rewards/margins": 13.050270080566406, "rewards/rejected": -8.56622314453125, "step": 3393 }, { "epoch": 0.5656666666666667, "grad_norm": 29.121618270874023, "learning_rate": 8.366385349104722e-08, "logits/chosen": 2.923506498336792, "logits/rejected": 2.7237648963928223, "logps/chosen": -19.156414031982422, "logps/rejected": -153.47018432617188, "loss": 0.477, "nll_loss": 0.4672295153141022, "rewards/accuracies": 1.0, "rewards/chosen": 2.33890962600708, "rewards/margins": 7.691800594329834, "rewards/rejected": -5.352890968322754, "step": 3394 }, { "epoch": 0.5658333333333333, "grad_norm": 241.34274291992188, "learning_rate": 8.361060176040938e-08, "logits/chosen": 1.5322682857513428, "logits/rejected": 1.7114287614822388, "logps/chosen": -30.269481658935547, "logps/rejected": -55.389991760253906, "loss": 3.8029, "nll_loss": 0.43868809938430786, "rewards/accuracies": 0.0, "rewards/chosen": 2.053452253341675, "rewards/margins": -2.6836793422698975, "rewards/rejected": 4.737131595611572, "step": 3395 }, { "epoch": 0.566, "grad_norm": 27.994165420532227, "learning_rate": 8.355735480524873e-08, "logits/chosen": 2.433612108230591, "logits/rejected": 2.4162352085113525, "logps/chosen": -97.69584655761719, "logps/rejected": -170.49627685546875, "loss": 1.1022, "nll_loss": 1.0855095386505127, "rewards/accuracies": 1.0, "rewards/chosen": 1.4314788579940796, "rewards/margins": 8.975900650024414, "rewards/rejected": -7.544422149658203, "step": 3396 }, { "epoch": 0.5661666666666667, "grad_norm": 27.049076080322266, "learning_rate": 8.350411264108012e-08, "logits/chosen": 2.4327220916748047, "logits/rejected": 2.407984972000122, "logps/chosen": -29.440462112426758, "logps/rejected": -86.1407241821289, "loss": 0.556, "nll_loss": 0.545193612575531, "rewards/accuracies": 1.0, "rewards/chosen": 2.391364812850952, "rewards/margins": 7.364092826843262, "rewards/rejected": -4.9727277755737305, "step": 3397 }, { "epoch": 0.5663333333333334, "grad_norm": 19.28308868408203, "learning_rate": 8.345087528341706e-08, "logits/chosen": 2.1647682189941406, "logits/rejected": 2.226036787033081, "logps/chosen": -114.91566467285156, "logps/rejected": -287.84954833984375, "loss": 0.7378, "nll_loss": 0.727314293384552, "rewards/accuracies": 1.0, "rewards/chosen": 2.2101821899414062, "rewards/margins": 7.629634380340576, "rewards/rejected": -5.41945219039917, "step": 3398 }, { "epoch": 0.5665, "grad_norm": 23.110553741455078, "learning_rate": 8.339764274777165e-08, "logits/chosen": 1.715468406677246, "logits/rejected": 1.556572675704956, "logps/chosen": -137.37425231933594, "logps/rejected": -92.23974609375, "loss": 0.9784, "nll_loss": 0.967424213886261, "rewards/accuracies": 1.0, "rewards/chosen": 1.9128830432891846, "rewards/margins": 8.735851287841797, "rewards/rejected": -6.822968482971191, "step": 3399 }, { "epoch": 0.5666666666666667, "grad_norm": 20.413671493530273, "learning_rate": 8.334441504965455e-08, "logits/chosen": 1.4348344802856445, "logits/rejected": 1.6202406883239746, "logps/chosen": -108.38505554199219, "logps/rejected": -131.6411895751953, "loss": 0.8247, "nll_loss": 0.8149252533912659, "rewards/accuracies": 1.0, "rewards/chosen": 2.168466329574585, "rewards/margins": 8.030776023864746, "rewards/rejected": -5.86230993270874, "step": 3400 }, { "epoch": 0.5668333333333333, "grad_norm": 86.60216522216797, "learning_rate": 8.329119220457505e-08, "logits/chosen": 2.5781261920928955, "logits/rejected": 2.6729671955108643, "logps/chosen": -52.655364990234375, "logps/rejected": -148.84805297851562, "loss": 2.0502, "nll_loss": 2.0252060890197754, "rewards/accuracies": 1.0, "rewards/chosen": 1.0811264514923096, "rewards/margins": 6.9722394943237305, "rewards/rejected": -5.89111328125, "step": 3401 }, { "epoch": 0.567, "grad_norm": 24.23502540588379, "learning_rate": 8.323797422804098e-08, "logits/chosen": 3.215258836746216, "logits/rejected": 3.532959222793579, "logps/chosen": -64.08888244628906, "logps/rejected": -188.9893798828125, "loss": 0.7971, "nll_loss": 0.7912208437919617, "rewards/accuracies": 1.0, "rewards/chosen": 2.512390375137329, "rewards/margins": 10.590723991394043, "rewards/rejected": -8.078333854675293, "step": 3402 }, { "epoch": 0.5671666666666667, "grad_norm": 31.269990921020508, "learning_rate": 8.318476113555883e-08, "logits/chosen": 1.8217737674713135, "logits/rejected": 2.365955352783203, "logps/chosen": -50.736244201660156, "logps/rejected": -157.10177612304688, "loss": 0.7436, "nll_loss": 0.6950170993804932, "rewards/accuracies": 1.0, "rewards/chosen": 0.3854149281978607, "rewards/margins": 5.691977500915527, "rewards/rejected": -5.306562423706055, "step": 3403 }, { "epoch": 0.5673333333333334, "grad_norm": 22.312849044799805, "learning_rate": 8.313155294263357e-08, "logits/chosen": 1.5251556634902954, "logits/rejected": 2.247734308242798, "logps/chosen": -55.86796188354492, "logps/rejected": -391.5321350097656, "loss": 0.6914, "nll_loss": 0.6813165545463562, "rewards/accuracies": 1.0, "rewards/chosen": 1.9278507232666016, "rewards/margins": 12.616843223571777, "rewards/rejected": -10.688992500305176, "step": 3404 }, { "epoch": 0.5675, "grad_norm": 37.75663757324219, "learning_rate": 8.307834966476883e-08, "logits/chosen": 1.4328523874282837, "logits/rejected": 2.3035666942596436, "logps/chosen": -29.910476684570312, "logps/rejected": -138.24273681640625, "loss": 0.7541, "nll_loss": 0.6797835230827332, "rewards/accuracies": 1.0, "rewards/chosen": 1.1587413549423218, "rewards/margins": 3.8224000930786133, "rewards/rejected": -2.663658857345581, "step": 3405 }, { "epoch": 0.5676666666666667, "grad_norm": 32.31235122680664, "learning_rate": 8.302515131746673e-08, "logits/chosen": 2.890625238418579, "logits/rejected": 2.9951822757720947, "logps/chosen": -27.00137710571289, "logps/rejected": -186.75299072265625, "loss": 0.645, "nll_loss": 0.6279388666152954, "rewards/accuracies": 1.0, "rewards/chosen": 2.0353000164031982, "rewards/margins": 6.442543029785156, "rewards/rejected": -4.407243251800537, "step": 3406 }, { "epoch": 0.5678333333333333, "grad_norm": 14.77462387084961, "learning_rate": 8.297195791622802e-08, "logits/chosen": 2.7277896404266357, "logits/rejected": 2.8142449855804443, "logps/chosen": -159.22357177734375, "logps/rejected": -253.2891387939453, "loss": 0.7293, "nll_loss": 0.7237435579299927, "rewards/accuracies": 1.0, "rewards/chosen": 2.632568359375, "rewards/margins": 9.614119529724121, "rewards/rejected": -6.981551170349121, "step": 3407 }, { "epoch": 0.568, "grad_norm": 124.695556640625, "learning_rate": 8.291876947655196e-08, "logits/chosen": 2.5227210521698, "logits/rejected": 2.3588545322418213, "logps/chosen": -65.17724609375, "logps/rejected": -18.881122589111328, "loss": 2.4027, "nll_loss": 1.7615472078323364, "rewards/accuracies": 1.0, "rewards/chosen": 1.7437362670898438, "rewards/margins": 1.0241925716400146, "rewards/rejected": 0.7195436358451843, "step": 3408 }, { "epoch": 0.5681666666666667, "grad_norm": 64.53257751464844, "learning_rate": 8.286558601393641e-08, "logits/chosen": 2.8066282272338867, "logits/rejected": 3.020596504211426, "logps/chosen": -17.818405151367188, "logps/rejected": -174.32070922851562, "loss": 0.8174, "nll_loss": 0.8099275231361389, "rewards/accuracies": 1.0, "rewards/chosen": 2.380317449569702, "rewards/margins": 8.827035903930664, "rewards/rejected": -6.446718215942383, "step": 3409 }, { "epoch": 0.5683333333333334, "grad_norm": 24.34798240661621, "learning_rate": 8.28124075438777e-08, "logits/chosen": 3.2593765258789062, "logits/rejected": 3.375185012817383, "logps/chosen": -64.91488647460938, "logps/rejected": -286.62457275390625, "loss": 0.7411, "nll_loss": 0.7212764024734497, "rewards/accuracies": 1.0, "rewards/chosen": 1.2753921747207642, "rewards/margins": 7.923407554626465, "rewards/rejected": -6.64801549911499, "step": 3410 }, { "epoch": 0.5685, "grad_norm": 40.854827880859375, "learning_rate": 8.275923408187084e-08, "logits/chosen": 2.868330478668213, "logits/rejected": 3.000385284423828, "logps/chosen": -20.351469039916992, "logps/rejected": -96.82633972167969, "loss": 0.7545, "nll_loss": 0.7537581324577332, "rewards/accuracies": 1.0, "rewards/chosen": 4.756495952606201, "rewards/margins": 13.339345932006836, "rewards/rejected": -8.582850456237793, "step": 3411 }, { "epoch": 0.5686666666666667, "grad_norm": 30.773435592651367, "learning_rate": 8.270606564340923e-08, "logits/chosen": 1.572165608406067, "logits/rejected": 1.6802208423614502, "logps/chosen": -110.5498275756836, "logps/rejected": -74.22640991210938, "loss": 1.097, "nll_loss": 1.0629791021347046, "rewards/accuracies": 1.0, "rewards/chosen": 0.8588356375694275, "rewards/margins": 5.836236000061035, "rewards/rejected": -4.977400302886963, "step": 3412 }, { "epoch": 0.5688333333333333, "grad_norm": 29.094167709350586, "learning_rate": 8.265290224398492e-08, "logits/chosen": 2.7050037384033203, "logits/rejected": 3.0985870361328125, "logps/chosen": -17.23984718322754, "logps/rejected": -600.34765625, "loss": 0.4887, "nll_loss": 0.47888463735580444, "rewards/accuracies": 1.0, "rewards/chosen": 1.9542502164840698, "rewards/margins": 10.622939109802246, "rewards/rejected": -8.668688774108887, "step": 3413 }, { "epoch": 0.569, "grad_norm": 27.83020782470703, "learning_rate": 8.259974389908841e-08, "logits/chosen": 2.6753439903259277, "logits/rejected": 2.379875421524048, "logps/chosen": -49.639896392822266, "logps/rejected": -52.66191864013672, "loss": 0.7882, "nll_loss": 0.7521197199821472, "rewards/accuracies": 1.0, "rewards/chosen": 1.6780239343643188, "rewards/margins": 5.065270900726318, "rewards/rejected": -3.387247085571289, "step": 3414 }, { "epoch": 0.5691666666666667, "grad_norm": 54.81333923339844, "learning_rate": 8.254659062420882e-08, "logits/chosen": 2.4868547916412354, "logits/rejected": 1.9778966903686523, "logps/chosen": -128.9468231201172, "logps/rejected": -40.70027160644531, "loss": 2.625, "nll_loss": 2.5789361000061035, "rewards/accuracies": 1.0, "rewards/chosen": 2.2386443614959717, "rewards/margins": 4.990653038024902, "rewards/rejected": -2.7520089149475098, "step": 3415 }, { "epoch": 0.5693333333333334, "grad_norm": 107.08976745605469, "learning_rate": 8.249344243483368e-08, "logits/chosen": 2.0855841636657715, "logits/rejected": 1.8983521461486816, "logps/chosen": -77.14929962158203, "logps/rejected": -78.9188003540039, "loss": 1.4894, "nll_loss": 1.0151225328445435, "rewards/accuracies": 1.0, "rewards/chosen": 0.08982773125171661, "rewards/margins": 0.9525161981582642, "rewards/rejected": -0.8626884818077087, "step": 3416 }, { "epoch": 0.5695, "grad_norm": 37.59584426879883, "learning_rate": 8.244029934644914e-08, "logits/chosen": 2.1563401222229004, "logits/rejected": 2.411318063735962, "logps/chosen": -105.92327880859375, "logps/rejected": -174.42657470703125, "loss": 0.8179, "nll_loss": 0.72056645154953, "rewards/accuracies": 1.0, "rewards/chosen": 2.5637969970703125, "rewards/margins": 4.297522068023682, "rewards/rejected": -1.7337249517440796, "step": 3417 }, { "epoch": 0.5696666666666667, "grad_norm": 36.4764289855957, "learning_rate": 8.238716137453978e-08, "logits/chosen": 2.1663460731506348, "logits/rejected": 2.4386775493621826, "logps/chosen": -28.847856521606445, "logps/rejected": -214.66468811035156, "loss": 0.7839, "nll_loss": 0.7796719670295715, "rewards/accuracies": 1.0, "rewards/chosen": 2.8296990394592285, "rewards/margins": 11.900468826293945, "rewards/rejected": -9.070769309997559, "step": 3418 }, { "epoch": 0.5698333333333333, "grad_norm": 27.251676559448242, "learning_rate": 8.233402853458876e-08, "logits/chosen": 2.1465654373168945, "logits/rejected": 2.244297504425049, "logps/chosen": -31.902210235595703, "logps/rejected": -388.74798583984375, "loss": 0.5482, "nll_loss": 0.5317035913467407, "rewards/accuracies": 1.0, "rewards/chosen": 1.4368467330932617, "rewards/margins": 9.094775199890137, "rewards/rejected": -7.657928466796875, "step": 3419 }, { "epoch": 0.57, "grad_norm": 33.184364318847656, "learning_rate": 8.228090084207773e-08, "logits/chosen": 2.761568307876587, "logits/rejected": 2.8166885375976562, "logps/chosen": -34.02875518798828, "logps/rejected": -239.35142517089844, "loss": 0.6723, "nll_loss": 0.654399037361145, "rewards/accuracies": 1.0, "rewards/chosen": 1.5705631971359253, "rewards/margins": 6.838826656341553, "rewards/rejected": -5.268263339996338, "step": 3420 }, { "epoch": 0.5701666666666667, "grad_norm": 41.21188735961914, "learning_rate": 8.222777831248676e-08, "logits/chosen": 2.678912401199341, "logits/rejected": 2.928307294845581, "logps/chosen": -36.126708984375, "logps/rejected": -257.5241394042969, "loss": 0.9015, "nll_loss": 0.8601597547531128, "rewards/accuracies": 1.0, "rewards/chosen": 3.3727762699127197, "rewards/margins": 6.009140968322754, "rewards/rejected": -2.636364698410034, "step": 3421 }, { "epoch": 0.5703333333333334, "grad_norm": 27.10000228881836, "learning_rate": 8.217466096129455e-08, "logits/chosen": 2.907998561859131, "logits/rejected": 3.175145387649536, "logps/chosen": -89.56206512451172, "logps/rejected": -299.0694580078125, "loss": 1.0849, "nll_loss": 1.0790610313415527, "rewards/accuracies": 1.0, "rewards/chosen": 3.069234848022461, "rewards/margins": 8.518585205078125, "rewards/rejected": -5.449349880218506, "step": 3422 }, { "epoch": 0.5705, "grad_norm": 61.75691604614258, "learning_rate": 8.212154880397817e-08, "logits/chosen": 2.459524393081665, "logits/rejected": 2.6054952144622803, "logps/chosen": -9.453073501586914, "logps/rejected": -248.6558837890625, "loss": 0.6825, "nll_loss": 0.6752195358276367, "rewards/accuracies": 1.0, "rewards/chosen": 2.3835184574127197, "rewards/margins": 8.968437194824219, "rewards/rejected": -6.58491849899292, "step": 3423 }, { "epoch": 0.5706666666666667, "grad_norm": 20.51625633239746, "learning_rate": 8.206844185601327e-08, "logits/chosen": 1.9524189233779907, "logits/rejected": 1.8907618522644043, "logps/chosen": -50.51649475097656, "logps/rejected": -128.52322387695312, "loss": 0.5644, "nll_loss": 0.5490923523902893, "rewards/accuracies": 1.0, "rewards/chosen": 1.6029525995254517, "rewards/margins": 7.691369533538818, "rewards/rejected": -6.088417053222656, "step": 3424 }, { "epoch": 0.5708333333333333, "grad_norm": 28.4134578704834, "learning_rate": 8.20153401328739e-08, "logits/chosen": 2.47123646736145, "logits/rejected": 2.7052106857299805, "logps/chosen": -64.62762451171875, "logps/rejected": -331.19769287109375, "loss": 0.902, "nll_loss": 0.8853098154067993, "rewards/accuracies": 1.0, "rewards/chosen": 1.447607398033142, "rewards/margins": 8.310220718383789, "rewards/rejected": -6.862613677978516, "step": 3425 }, { "epoch": 0.571, "grad_norm": 44.47957992553711, "learning_rate": 8.196224365003266e-08, "logits/chosen": 2.7941231727600098, "logits/rejected": 2.8271145820617676, "logps/chosen": -16.789077758789062, "logps/rejected": -60.60258102416992, "loss": 0.557, "nll_loss": 0.4663632810115814, "rewards/accuracies": 1.0, "rewards/chosen": 1.2338117361068726, "rewards/margins": 3.564716339111328, "rewards/rejected": -2.330904722213745, "step": 3426 }, { "epoch": 0.5711666666666667, "grad_norm": 26.69683265686035, "learning_rate": 8.190915242296056e-08, "logits/chosen": 1.3019919395446777, "logits/rejected": 1.3387020826339722, "logps/chosen": -43.741539001464844, "logps/rejected": -147.2286834716797, "loss": 0.7046, "nll_loss": 0.6834614276885986, "rewards/accuracies": 1.0, "rewards/chosen": 1.1593811511993408, "rewards/margins": 9.558794021606445, "rewards/rejected": -8.399413108825684, "step": 3427 }, { "epoch": 0.5713333333333334, "grad_norm": 33.766990661621094, "learning_rate": 8.185606646712713e-08, "logits/chosen": 2.165687322616577, "logits/rejected": 2.078986883163452, "logps/chosen": -75.40387725830078, "logps/rejected": -133.52682495117188, "loss": 1.0356, "nll_loss": 1.0053848028182983, "rewards/accuracies": 1.0, "rewards/chosen": 0.8939735889434814, "rewards/margins": 6.472428321838379, "rewards/rejected": -5.578454971313477, "step": 3428 }, { "epoch": 0.5715, "grad_norm": 25.318668365478516, "learning_rate": 8.180298579800033e-08, "logits/chosen": 1.5788085460662842, "logits/rejected": 2.0810091495513916, "logps/chosen": -49.393211364746094, "logps/rejected": -356.73541259765625, "loss": 0.7228, "nll_loss": 0.7056172490119934, "rewards/accuracies": 1.0, "rewards/chosen": 1.3686336278915405, "rewards/margins": 11.924328804016113, "rewards/rejected": -10.555695533752441, "step": 3429 }, { "epoch": 0.5716666666666667, "grad_norm": 30.04450798034668, "learning_rate": 8.174991043104661e-08, "logits/chosen": 1.4965205192565918, "logits/rejected": 1.9581929445266724, "logps/chosen": -51.389156341552734, "logps/rejected": -144.1056365966797, "loss": 0.7457, "nll_loss": 0.7341307401657104, "rewards/accuracies": 1.0, "rewards/chosen": 1.8290150165557861, "rewards/margins": 8.85783576965332, "rewards/rejected": -7.028820991516113, "step": 3430 }, { "epoch": 0.5718333333333333, "grad_norm": 40.609642028808594, "learning_rate": 8.169684038173085e-08, "logits/chosen": 2.6971523761749268, "logits/rejected": 2.698838949203491, "logps/chosen": -36.837867736816406, "logps/rejected": -217.45138549804688, "loss": 1.0494, "nll_loss": 1.0232740640640259, "rewards/accuracies": 1.0, "rewards/chosen": 0.9254989624023438, "rewards/margins": 10.000445365905762, "rewards/rejected": -9.074946403503418, "step": 3431 }, { "epoch": 0.572, "grad_norm": 39.15812683105469, "learning_rate": 8.164377566551638e-08, "logits/chosen": 2.6366934776306152, "logits/rejected": 2.7289223670959473, "logps/chosen": -59.597412109375, "logps/rejected": -127.97320556640625, "loss": 0.858, "nll_loss": 0.8164028525352478, "rewards/accuracies": 1.0, "rewards/chosen": 0.8615204095840454, "rewards/margins": 5.0244035720825195, "rewards/rejected": -4.162883281707764, "step": 3432 }, { "epoch": 0.5721666666666667, "grad_norm": 19.53349494934082, "learning_rate": 8.1590716297865e-08, "logits/chosen": 0.2047407478094101, "logits/rejected": 1.38309645652771, "logps/chosen": -67.88414001464844, "logps/rejected": -445.76678466796875, "loss": 0.6019, "nll_loss": 0.5902969241142273, "rewards/accuracies": 1.0, "rewards/chosen": 1.772519826889038, "rewards/margins": 11.734272956848145, "rewards/rejected": -9.961752891540527, "step": 3433 }, { "epoch": 0.5723333333333334, "grad_norm": 211.1924591064453, "learning_rate": 8.153766229423692e-08, "logits/chosen": 1.5558615922927856, "logits/rejected": 1.287981629371643, "logps/chosen": -77.87255859375, "logps/rejected": -10.413678169250488, "loss": 2.842, "nll_loss": 0.9496652483940125, "rewards/accuracies": 0.0, "rewards/chosen": 1.1569565534591675, "rewards/margins": -1.2164679765701294, "rewards/rejected": 2.373424530029297, "step": 3434 }, { "epoch": 0.5725, "grad_norm": 20.55493927001953, "learning_rate": 8.14846136700908e-08, "logits/chosen": 2.1359591484069824, "logits/rejected": 2.269303321838379, "logps/chosen": -112.03977966308594, "logps/rejected": -306.0169677734375, "loss": 0.8994, "nll_loss": 0.8892046809196472, "rewards/accuracies": 1.0, "rewards/chosen": 1.9090806245803833, "rewards/margins": 11.837504386901855, "rewards/rejected": -9.928423881530762, "step": 3435 }, { "epoch": 0.5726666666666667, "grad_norm": 71.46527099609375, "learning_rate": 8.143157044088376e-08, "logits/chosen": 2.9647164344787598, "logits/rejected": 3.081810235977173, "logps/chosen": -48.327415466308594, "logps/rejected": -51.100135803222656, "loss": 1.1268, "nll_loss": 0.4784891605377197, "rewards/accuracies": 1.0, "rewards/chosen": 2.0636589527130127, "rewards/margins": 1.1474311351776123, "rewards/rejected": 0.9162277579307556, "step": 3436 }, { "epoch": 0.5728333333333333, "grad_norm": 48.46231460571289, "learning_rate": 8.137853262207128e-08, "logits/chosen": 2.909348964691162, "logits/rejected": 2.8778555393218994, "logps/chosen": -90.529052734375, "logps/rejected": -79.31350708007812, "loss": 1.3935, "nll_loss": 1.33130943775177, "rewards/accuracies": 1.0, "rewards/chosen": 0.49668046832084656, "rewards/margins": 4.281935214996338, "rewards/rejected": -3.785254716873169, "step": 3437 }, { "epoch": 0.573, "grad_norm": 64.92607116699219, "learning_rate": 8.132550022910737e-08, "logits/chosen": 3.0020041465759277, "logits/rejected": 3.0496973991394043, "logps/chosen": -7.412704944610596, "logps/rejected": -177.95364379882812, "loss": 0.5838, "nll_loss": 0.5702081322669983, "rewards/accuracies": 1.0, "rewards/chosen": 1.6290395259857178, "rewards/margins": 9.41468620300293, "rewards/rejected": -7.785646915435791, "step": 3438 }, { "epoch": 0.5731666666666667, "grad_norm": 64.69374084472656, "learning_rate": 8.127247327744433e-08, "logits/chosen": 2.4553041458129883, "logits/rejected": 2.347179889678955, "logps/chosen": -9.193885803222656, "logps/rejected": -56.51629638671875, "loss": 0.7916, "nll_loss": 0.6567062139511108, "rewards/accuracies": 1.0, "rewards/chosen": 1.5427935123443604, "rewards/margins": 3.183882713317871, "rewards/rejected": -1.6410893201828003, "step": 3439 }, { "epoch": 0.5733333333333334, "grad_norm": 27.646520614624023, "learning_rate": 8.121945178253299e-08, "logits/chosen": 2.5612266063690186, "logits/rejected": 2.6570253372192383, "logps/chosen": -143.41873168945312, "logps/rejected": -286.9317626953125, "loss": 1.1578, "nll_loss": 1.1292812824249268, "rewards/accuracies": 1.0, "rewards/chosen": 0.8593963980674744, "rewards/margins": 7.860636234283447, "rewards/rejected": -7.001239776611328, "step": 3440 }, { "epoch": 0.5735, "grad_norm": 21.341156005859375, "learning_rate": 8.116643575982253e-08, "logits/chosen": 2.410472869873047, "logits/rejected": 2.4509475231170654, "logps/chosen": -48.53303527832031, "logps/rejected": -175.16802978515625, "loss": 0.5567, "nll_loss": 0.551511824131012, "rewards/accuracies": 1.0, "rewards/chosen": 2.758958578109741, "rewards/margins": 9.35814094543457, "rewards/rejected": -6.599182605743408, "step": 3441 }, { "epoch": 0.5736666666666667, "grad_norm": 20.11952018737793, "learning_rate": 8.111342522476051e-08, "logits/chosen": 2.573169469833374, "logits/rejected": 2.585188150405884, "logps/chosen": -186.31471252441406, "logps/rejected": -238.37210083007812, "loss": 0.8732, "nll_loss": 0.8546546697616577, "rewards/accuracies": 1.0, "rewards/chosen": 1.3090866804122925, "rewards/margins": 8.806792259216309, "rewards/rejected": -7.497705459594727, "step": 3442 }, { "epoch": 0.5738333333333333, "grad_norm": 84.09046936035156, "learning_rate": 8.1060420192793e-08, "logits/chosen": 2.7473721504211426, "logits/rejected": 3.0643672943115234, "logps/chosen": -7.576878547668457, "logps/rejected": -304.9775695800781, "loss": 0.6388, "nll_loss": 0.6314065456390381, "rewards/accuracies": 1.0, "rewards/chosen": 2.697700262069702, "rewards/margins": 8.128379821777344, "rewards/rejected": -5.4306793212890625, "step": 3443 }, { "epoch": 0.574, "grad_norm": 24.465801239013672, "learning_rate": 8.10074206793643e-08, "logits/chosen": 1.636002779006958, "logits/rejected": 2.4654200077056885, "logps/chosen": -30.72353744506836, "logps/rejected": -306.4881591796875, "loss": 0.5032, "nll_loss": 0.4726698398590088, "rewards/accuracies": 1.0, "rewards/chosen": 0.8336361646652222, "rewards/margins": 6.8402910232543945, "rewards/rejected": -6.006654739379883, "step": 3444 }, { "epoch": 0.5741666666666667, "grad_norm": 38.779083251953125, "learning_rate": 8.095442669991728e-08, "logits/chosen": 2.7518160343170166, "logits/rejected": 2.946472644805908, "logps/chosen": -30.389406204223633, "logps/rejected": -194.07615661621094, "loss": 0.72, "nll_loss": 0.6331127285957336, "rewards/accuracies": 1.0, "rewards/chosen": 2.3513219356536865, "rewards/margins": 4.281511306762695, "rewards/rejected": -1.930189609527588, "step": 3445 }, { "epoch": 0.5743333333333334, "grad_norm": 36.96552658081055, "learning_rate": 8.090143826989308e-08, "logits/chosen": 2.647735357284546, "logits/rejected": 2.8421061038970947, "logps/chosen": -36.1942253112793, "logps/rejected": -295.75592041015625, "loss": 0.9271, "nll_loss": 0.9048558473587036, "rewards/accuracies": 1.0, "rewards/chosen": 1.2201985120773315, "rewards/margins": 6.969633102416992, "rewards/rejected": -5.749434471130371, "step": 3446 }, { "epoch": 0.5745, "grad_norm": 22.66446304321289, "learning_rate": 8.084845540473125e-08, "logits/chosen": 1.9154053926467896, "logits/rejected": 2.6482090950012207, "logps/chosen": -96.10005187988281, "logps/rejected": -329.7363586425781, "loss": 0.8891, "nll_loss": 0.8736367225646973, "rewards/accuracies": 1.0, "rewards/chosen": 1.4918678998947144, "rewards/margins": 9.427842140197754, "rewards/rejected": -7.93597412109375, "step": 3447 }, { "epoch": 0.5746666666666667, "grad_norm": 29.855777740478516, "learning_rate": 8.079547811986971e-08, "logits/chosen": 1.710038185119629, "logits/rejected": 2.205000638961792, "logps/chosen": -49.76468276977539, "logps/rejected": -169.03836059570312, "loss": 0.7932, "nll_loss": 0.7775732278823853, "rewards/accuracies": 1.0, "rewards/chosen": 1.5319995880126953, "rewards/margins": 8.054014205932617, "rewards/rejected": -6.522014617919922, "step": 3448 }, { "epoch": 0.5748333333333333, "grad_norm": 232.4296875, "learning_rate": 8.074250643074483e-08, "logits/chosen": 2.066572427749634, "logits/rejected": 1.9825847148895264, "logps/chosen": -61.361236572265625, "logps/rejected": -27.389455795288086, "loss": 4.3163, "nll_loss": 0.889293372631073, "rewards/accuracies": 0.0, "rewards/chosen": 0.5701370239257812, "rewards/margins": -3.0270943641662598, "rewards/rejected": 3.597231388092041, "step": 3449 }, { "epoch": 0.575, "grad_norm": 36.117191314697266, "learning_rate": 8.068954035279121e-08, "logits/chosen": 2.4042630195617676, "logits/rejected": 2.311685085296631, "logps/chosen": -28.01249122619629, "logps/rejected": -58.307655334472656, "loss": 0.7816, "nll_loss": 0.7570944428443909, "rewards/accuracies": 1.0, "rewards/chosen": 4.16705846786499, "rewards/margins": 7.3241472244262695, "rewards/rejected": -3.1570885181427, "step": 3450 }, { "epoch": 0.5751666666666667, "grad_norm": 23.967897415161133, "learning_rate": 8.063657990144195e-08, "logits/chosen": 2.5993359088897705, "logits/rejected": 2.4715144634246826, "logps/chosen": -47.28523635864258, "logps/rejected": -110.22358703613281, "loss": 0.7221, "nll_loss": 0.7057498693466187, "rewards/accuracies": 1.0, "rewards/chosen": 1.6733274459838867, "rewards/margins": 6.970076560974121, "rewards/rejected": -5.296749114990234, "step": 3451 }, { "epoch": 0.5753333333333334, "grad_norm": 37.193668365478516, "learning_rate": 8.058362509212843e-08, "logits/chosen": 3.026594638824463, "logits/rejected": 3.0048604011535645, "logps/chosen": -43.27517318725586, "logps/rejected": -153.12356567382812, "loss": 0.97, "nll_loss": 0.9407646059989929, "rewards/accuracies": 1.0, "rewards/chosen": 0.9127434492111206, "rewards/margins": 6.633208751678467, "rewards/rejected": -5.720465183258057, "step": 3452 }, { "epoch": 0.5755, "grad_norm": 27.558870315551758, "learning_rate": 8.053067594028043e-08, "logits/chosen": 2.2874951362609863, "logits/rejected": 2.2165772914886475, "logps/chosen": -110.31085205078125, "logps/rejected": -123.98040771484375, "loss": 1.1458, "nll_loss": 1.1372251510620117, "rewards/accuracies": 1.0, "rewards/chosen": 2.2544524669647217, "rewards/margins": 8.42922592163086, "rewards/rejected": -6.174773693084717, "step": 3453 }, { "epoch": 0.5756666666666667, "grad_norm": 21.462390899658203, "learning_rate": 8.047773246132601e-08, "logits/chosen": 2.7359042167663574, "logits/rejected": 2.643784284591675, "logps/chosen": -288.4383544921875, "logps/rejected": -148.84930419921875, "loss": 1.0969, "nll_loss": 1.0843549966812134, "rewards/accuracies": 1.0, "rewards/chosen": 1.7517732381820679, "rewards/margins": 8.590209007263184, "rewards/rejected": -6.838435649871826, "step": 3454 }, { "epoch": 0.5758333333333333, "grad_norm": 44.22007369995117, "learning_rate": 8.04247946706917e-08, "logits/chosen": 2.07582950592041, "logits/rejected": 2.3544557094573975, "logps/chosen": -105.46114349365234, "logps/rejected": -167.93435668945312, "loss": 1.2707, "nll_loss": 1.226292371749878, "rewards/accuracies": 1.0, "rewards/chosen": 0.3456741273403168, "rewards/margins": 8.89870548248291, "rewards/rejected": -8.553030967712402, "step": 3455 }, { "epoch": 0.576, "grad_norm": 81.39636993408203, "learning_rate": 8.037186258380225e-08, "logits/chosen": 2.5690200328826904, "logits/rejected": 2.622040033340454, "logps/chosen": -9.885363578796387, "logps/rejected": -217.851318359375, "loss": 0.7232, "nll_loss": 0.706097424030304, "rewards/accuracies": 1.0, "rewards/chosen": 1.6321194171905518, "rewards/margins": 6.853727340698242, "rewards/rejected": -5.2216081619262695, "step": 3456 }, { "epoch": 0.5761666666666667, "grad_norm": 237.99081420898438, "learning_rate": 8.031893621608083e-08, "logits/chosen": 2.789395332336426, "logits/rejected": 2.7605128288269043, "logps/chosen": -72.29517364501953, "logps/rejected": -67.388427734375, "loss": 2.9344, "nll_loss": 0.6513076424598694, "rewards/accuracies": 0.0, "rewards/chosen": 1.397552490234375, "rewards/margins": -1.630345344543457, "rewards/rejected": 3.027897834777832, "step": 3457 }, { "epoch": 0.5763333333333334, "grad_norm": 52.06433868408203, "learning_rate": 8.026601558294889e-08, "logits/chosen": 1.4123526811599731, "logits/rejected": 1.7632943391799927, "logps/chosen": -20.1866512298584, "logps/rejected": -88.99905395507812, "loss": 0.8046, "nll_loss": 0.7209517359733582, "rewards/accuracies": 1.0, "rewards/chosen": 2.318969964981079, "rewards/margins": 4.3056182861328125, "rewards/rejected": -1.9866485595703125, "step": 3458 }, { "epoch": 0.5765, "grad_norm": 42.87034225463867, "learning_rate": 8.021310069982623e-08, "logits/chosen": 2.709895372390747, "logits/rejected": 2.686739206314087, "logps/chosen": -24.120712280273438, "logps/rejected": -64.51005554199219, "loss": 0.8537, "nll_loss": 0.8317487239837646, "rewards/accuracies": 1.0, "rewards/chosen": 1.6438438892364502, "rewards/margins": 6.031497955322266, "rewards/rejected": -4.387653827667236, "step": 3459 }, { "epoch": 0.5766666666666667, "grad_norm": 52.74860382080078, "learning_rate": 8.0160191582131e-08, "logits/chosen": 2.764408826828003, "logits/rejected": 3.1252684593200684, "logps/chosen": -57.18403244018555, "logps/rejected": -122.80231475830078, "loss": 0.8809, "nll_loss": 0.8664246201515198, "rewards/accuracies": 1.0, "rewards/chosen": 1.6688365936279297, "rewards/margins": 7.692052364349365, "rewards/rejected": -6.0232157707214355, "step": 3460 }, { "epoch": 0.5768333333333333, "grad_norm": 50.11970138549805, "learning_rate": 8.010728824527963e-08, "logits/chosen": 1.902411937713623, "logits/rejected": 1.4932153224945068, "logps/chosen": -36.29888153076172, "logps/rejected": -46.09079360961914, "loss": 1.0075, "nll_loss": 0.9074720144271851, "rewards/accuracies": 1.0, "rewards/chosen": 0.9370468854904175, "rewards/margins": 3.333446979522705, "rewards/rejected": -2.396400213241577, "step": 3461 }, { "epoch": 0.577, "grad_norm": 37.02466583251953, "learning_rate": 8.005439070468691e-08, "logits/chosen": 0.7475921511650085, "logits/rejected": 2.8580050468444824, "logps/chosen": -55.20941925048828, "logps/rejected": -103.12500762939453, "loss": 0.9165, "nll_loss": 0.8904744982719421, "rewards/accuracies": 1.0, "rewards/chosen": 0.9548935294151306, "rewards/margins": 7.943256378173828, "rewards/rejected": -6.988362789154053, "step": 3462 }, { "epoch": 0.5771666666666667, "grad_norm": 34.894283294677734, "learning_rate": 8.000149897576587e-08, "logits/chosen": 0.4536297023296356, "logits/rejected": 1.9298123121261597, "logps/chosen": -64.16295623779297, "logps/rejected": -350.8792724609375, "loss": 1.0741, "nll_loss": 1.0518518686294556, "rewards/accuracies": 1.0, "rewards/chosen": 1.0907478332519531, "rewards/margins": 10.721952438354492, "rewards/rejected": -9.631204605102539, "step": 3463 }, { "epoch": 0.5773333333333334, "grad_norm": 70.13838958740234, "learning_rate": 7.994861307392793e-08, "logits/chosen": 2.9042160511016846, "logits/rejected": 2.888047218322754, "logps/chosen": -11.849188804626465, "logps/rejected": -61.53382873535156, "loss": 0.9384, "nll_loss": 0.9114760160446167, "rewards/accuracies": 1.0, "rewards/chosen": 1.3294949531555176, "rewards/margins": 5.754300594329834, "rewards/rejected": -4.424805641174316, "step": 3464 }, { "epoch": 0.5775, "grad_norm": 27.29573631286621, "learning_rate": 7.989573301458272e-08, "logits/chosen": 1.4438214302062988, "logits/rejected": 1.9910616874694824, "logps/chosen": -62.221031188964844, "logps/rejected": -326.6994934082031, "loss": 0.7511, "nll_loss": 0.7235003709793091, "rewards/accuracies": 1.0, "rewards/chosen": 0.8586158752441406, "rewards/margins": 10.83266830444336, "rewards/rejected": -9.974052429199219, "step": 3465 }, { "epoch": 0.5776666666666667, "grad_norm": 19.794837951660156, "learning_rate": 7.984285881313829e-08, "logits/chosen": 2.4100348949432373, "logits/rejected": 2.5967676639556885, "logps/chosen": -84.09942626953125, "logps/rejected": -387.0321960449219, "loss": 0.7651, "nll_loss": 0.7508877515792847, "rewards/accuracies": 1.0, "rewards/chosen": 1.5563164949417114, "rewards/margins": 11.165255546569824, "rewards/rejected": -9.608939170837402, "step": 3466 }, { "epoch": 0.5778333333333333, "grad_norm": 43.45339584350586, "learning_rate": 7.978999048500084e-08, "logits/chosen": 3.3272318840026855, "logits/rejected": 3.3600220680236816, "logps/chosen": -79.83190155029297, "logps/rejected": -154.59860229492188, "loss": 1.4454, "nll_loss": 1.4255696535110474, "rewards/accuracies": 1.0, "rewards/chosen": 1.4639763832092285, "rewards/margins": 6.625283241271973, "rewards/rejected": -5.161306858062744, "step": 3467 }, { "epoch": 0.578, "grad_norm": 27.91092872619629, "learning_rate": 7.9737128045575e-08, "logits/chosen": 2.39758038520813, "logits/rejected": 2.2809805870056152, "logps/chosen": -67.94119262695312, "logps/rejected": -103.34611511230469, "loss": 0.8482, "nll_loss": 0.8285512328147888, "rewards/accuracies": 1.0, "rewards/chosen": 2.2941651344299316, "rewards/margins": 6.205918312072754, "rewards/rejected": -3.9117531776428223, "step": 3468 }, { "epoch": 0.5781666666666667, "grad_norm": 55.166107177734375, "learning_rate": 7.968427151026354e-08, "logits/chosen": 2.2182514667510986, "logits/rejected": 2.564750909805298, "logps/chosen": -50.82084655761719, "logps/rejected": -590.3712158203125, "loss": 1.2681, "nll_loss": 1.2395328283309937, "rewards/accuracies": 1.0, "rewards/chosen": 1.1765854358673096, "rewards/margins": 5.7766218185424805, "rewards/rejected": -4.60003662109375, "step": 3469 }, { "epoch": 0.5783333333333334, "grad_norm": 22.918407440185547, "learning_rate": 7.963142089446767e-08, "logits/chosen": 2.2748055458068848, "logits/rejected": 2.451873540878296, "logps/chosen": -44.38890838623047, "logps/rejected": -259.8905029296875, "loss": 0.5676, "nll_loss": 0.5480112433433533, "rewards/accuracies": 1.0, "rewards/chosen": 1.219004511833191, "rewards/margins": 11.831059455871582, "rewards/rejected": -10.612054824829102, "step": 3470 }, { "epoch": 0.5785, "grad_norm": 93.55738067626953, "learning_rate": 7.957857621358673e-08, "logits/chosen": 2.901874303817749, "logits/rejected": 2.8034250736236572, "logps/chosen": -7.883364677429199, "logps/rejected": -27.318561553955078, "loss": 0.7814, "nll_loss": 0.3284735381603241, "rewards/accuracies": 1.0, "rewards/chosen": 0.9839221239089966, "rewards/margins": 1.2686095237731934, "rewards/rejected": -0.28468742966651917, "step": 3471 }, { "epoch": 0.5786666666666667, "grad_norm": 114.7913818359375, "learning_rate": 7.952573748301841e-08, "logits/chosen": 2.9860174655914307, "logits/rejected": 3.168945074081421, "logps/chosen": -35.67545700073242, "logps/rejected": -176.03439331054688, "loss": 1.1008, "nll_loss": 0.7280705571174622, "rewards/accuracies": 1.0, "rewards/chosen": 0.7146885395050049, "rewards/margins": 1.4516667127609253, "rewards/rejected": -0.7369781732559204, "step": 3472 }, { "epoch": 0.5788333333333333, "grad_norm": 36.78938674926758, "learning_rate": 7.947290471815864e-08, "logits/chosen": 1.5493438243865967, "logits/rejected": 1.7787925004959106, "logps/chosen": -10.899361610412598, "logps/rejected": -86.4771499633789, "loss": 0.3675, "nll_loss": 0.33028364181518555, "rewards/accuracies": 1.0, "rewards/chosen": 1.784949541091919, "rewards/margins": 5.054237365722656, "rewards/rejected": -3.269287586212158, "step": 3473 }, { "epoch": 0.579, "grad_norm": 26.087257385253906, "learning_rate": 7.942007793440164e-08, "logits/chosen": 1.537367343902588, "logits/rejected": 1.7251017093658447, "logps/chosen": -29.080612182617188, "logps/rejected": -125.25630187988281, "loss": 0.4463, "nll_loss": 0.4340391159057617, "rewards/accuracies": 1.0, "rewards/chosen": 1.764778971672058, "rewards/margins": 8.7471284866333, "rewards/rejected": -6.982349395751953, "step": 3474 }, { "epoch": 0.5791666666666667, "grad_norm": 31.119115829467773, "learning_rate": 7.936725714713984e-08, "logits/chosen": 1.956364631652832, "logits/rejected": 1.618060827255249, "logps/chosen": -256.982177734375, "logps/rejected": -269.3703308105469, "loss": 1.3144, "nll_loss": 1.2978895902633667, "rewards/accuracies": 1.0, "rewards/chosen": 1.5521973371505737, "rewards/margins": 7.328953742980957, "rewards/rejected": -5.776756286621094, "step": 3475 }, { "epoch": 0.5793333333333334, "grad_norm": 27.226036071777344, "learning_rate": 7.931444237176398e-08, "logits/chosen": 1.7728527784347534, "logits/rejected": 1.3893877267837524, "logps/chosen": -96.73339080810547, "logps/rejected": -120.27149963378906, "loss": 0.8601, "nll_loss": 0.8339085578918457, "rewards/accuracies": 1.0, "rewards/chosen": 1.0207833051681519, "rewards/margins": 6.8385186195373535, "rewards/rejected": -5.817735195159912, "step": 3476 }, { "epoch": 0.5795, "grad_norm": 35.3375244140625, "learning_rate": 7.926163362366298e-08, "logits/chosen": 2.430155038833618, "logits/rejected": 2.666013240814209, "logps/chosen": -53.011505126953125, "logps/rejected": -326.62457275390625, "loss": 0.7629, "nll_loss": 0.7573072910308838, "rewards/accuracies": 1.0, "rewards/chosen": 2.5012099742889404, "rewards/margins": 17.22429847717285, "rewards/rejected": -14.723089218139648, "step": 3477 }, { "epoch": 0.5796666666666667, "grad_norm": 36.66256332397461, "learning_rate": 7.920883091822408e-08, "logits/chosen": 3.2722573280334473, "logits/rejected": 3.276822566986084, "logps/chosen": -30.197450637817383, "logps/rejected": -106.66244506835938, "loss": 0.6734, "nll_loss": 0.6162744164466858, "rewards/accuracies": 1.0, "rewards/chosen": 1.6864078044891357, "rewards/margins": 4.399980545043945, "rewards/rejected": -2.7135727405548096, "step": 3478 }, { "epoch": 0.5798333333333333, "grad_norm": 194.36643981933594, "learning_rate": 7.915603427083269e-08, "logits/chosen": 1.0188833475112915, "logits/rejected": 2.913285970687866, "logps/chosen": -79.282470703125, "logps/rejected": -490.65142822265625, "loss": 3.755, "nll_loss": 3.1712987422943115, "rewards/accuracies": 1.0, "rewards/chosen": -2.9003546237945557, "rewards/margins": 1.3888847827911377, "rewards/rejected": -4.289239406585693, "step": 3479 }, { "epoch": 0.58, "grad_norm": 30.280887603759766, "learning_rate": 7.910324369687249e-08, "logits/chosen": 2.0520834922790527, "logits/rejected": 2.242859125137329, "logps/chosen": -41.8265495300293, "logps/rejected": -290.7059631347656, "loss": 0.8728, "nll_loss": 0.8713864684104919, "rewards/accuracies": 1.0, "rewards/chosen": 5.219760417938232, "rewards/margins": 11.552459716796875, "rewards/rejected": -6.332699775695801, "step": 3480 }, { "epoch": 0.5801666666666667, "grad_norm": 273.9982604980469, "learning_rate": 7.905045921172542e-08, "logits/chosen": 2.967968225479126, "logits/rejected": 2.7806875705718994, "logps/chosen": -180.468017578125, "logps/rejected": -59.14800262451172, "loss": 2.5465, "nll_loss": 0.7457355856895447, "rewards/accuracies": 0.0, "rewards/chosen": 1.6144685745239258, "rewards/margins": -0.9966254234313965, "rewards/rejected": 2.6110939979553223, "step": 3481 }, { "epoch": 0.5803333333333334, "grad_norm": 30.228214263916016, "learning_rate": 7.899768083077155e-08, "logits/chosen": 2.0453763008117676, "logits/rejected": 2.0880086421966553, "logps/chosen": -2.028440237045288, "logps/rejected": -123.09950256347656, "loss": 0.1233, "nll_loss": 0.0922018364071846, "rewards/accuracies": 1.0, "rewards/chosen": 0.968269944190979, "rewards/margins": 5.901180267333984, "rewards/rejected": -4.932910442352295, "step": 3482 }, { "epoch": 0.5805, "grad_norm": 138.75115966796875, "learning_rate": 7.89449085693893e-08, "logits/chosen": 2.7520971298217773, "logits/rejected": 2.6627557277679443, "logps/chosen": -25.404682159423828, "logps/rejected": -47.34284591674805, "loss": 1.5652, "nll_loss": 1.4943931102752686, "rewards/accuracies": 1.0, "rewards/chosen": 0.20646458864212036, "rewards/margins": 4.257940292358398, "rewards/rejected": -4.051475524902344, "step": 3483 }, { "epoch": 0.5806666666666667, "grad_norm": 106.88539123535156, "learning_rate": 7.889214244295519e-08, "logits/chosen": 2.576280355453491, "logits/rejected": 2.9057729244232178, "logps/chosen": -319.0937805175781, "logps/rejected": -363.5104675292969, "loss": 1.8854, "nll_loss": 1.43735933303833, "rewards/accuracies": 1.0, "rewards/chosen": -3.182852268218994, "rewards/margins": 6.953808307647705, "rewards/rejected": -10.1366605758667, "step": 3484 }, { "epoch": 0.5808333333333333, "grad_norm": 25.49188804626465, "learning_rate": 7.883938246684404e-08, "logits/chosen": 0.9885165095329285, "logits/rejected": 2.0226314067840576, "logps/chosen": -109.27806091308594, "logps/rejected": -352.1912841796875, "loss": 0.9332, "nll_loss": 0.9183029532432556, "rewards/accuracies": 1.0, "rewards/chosen": 1.6272705793380737, "rewards/margins": 7.693695545196533, "rewards/rejected": -6.06642484664917, "step": 3485 }, { "epoch": 0.581, "grad_norm": 34.432044982910156, "learning_rate": 7.87866286564288e-08, "logits/chosen": 2.306323289871216, "logits/rejected": 2.237229824066162, "logps/chosen": -55.40262222290039, "logps/rejected": -75.8577651977539, "loss": 0.92, "nll_loss": 0.826904833316803, "rewards/accuracies": 1.0, "rewards/chosen": 2.4711246490478516, "rewards/margins": 4.294028282165527, "rewards/rejected": -1.8229036331176758, "step": 3486 }, { "epoch": 0.5811666666666667, "grad_norm": 30.759891510009766, "learning_rate": 7.87338810270807e-08, "logits/chosen": 2.7335364818573, "logits/rejected": 2.9020473957061768, "logps/chosen": -23.08574676513672, "logps/rejected": -323.5591735839844, "loss": 0.6418, "nll_loss": 0.641270637512207, "rewards/accuracies": 1.0, "rewards/chosen": 5.003371715545654, "rewards/margins": 15.504268646240234, "rewards/rejected": -10.500896453857422, "step": 3487 }, { "epoch": 0.5813333333333334, "grad_norm": 54.72574234008789, "learning_rate": 7.86811395941691e-08, "logits/chosen": 2.5601091384887695, "logits/rejected": 2.5283384323120117, "logps/chosen": -12.658707618713379, "logps/rejected": -64.94012451171875, "loss": 0.5816, "nll_loss": 0.5753958225250244, "rewards/accuracies": 1.0, "rewards/chosen": 3.6299986839294434, "rewards/margins": 8.540060043334961, "rewards/rejected": -4.910061836242676, "step": 3488 }, { "epoch": 0.5815, "grad_norm": 28.528894424438477, "learning_rate": 7.862840437306164e-08, "logits/chosen": 2.1203761100769043, "logits/rejected": 2.105297327041626, "logps/chosen": -27.652366638183594, "logps/rejected": -100.20144653320312, "loss": 0.5991, "nll_loss": 0.5883481502532959, "rewards/accuracies": 1.0, "rewards/chosen": 2.301487445831299, "rewards/margins": 7.405633449554443, "rewards/rejected": -5.1041460037231445, "step": 3489 }, { "epoch": 0.5816666666666667, "grad_norm": 28.61347007751465, "learning_rate": 7.857567537912403e-08, "logits/chosen": 2.288555383682251, "logits/rejected": 2.4073486328125, "logps/chosen": -31.688554763793945, "logps/rejected": -88.31742095947266, "loss": 0.6579, "nll_loss": 0.646705150604248, "rewards/accuracies": 1.0, "rewards/chosen": 1.8300169706344604, "rewards/margins": 9.246515274047852, "rewards/rejected": -7.41649866104126, "step": 3490 }, { "epoch": 0.5818333333333333, "grad_norm": 73.95133209228516, "learning_rate": 7.852295262772028e-08, "logits/chosen": 2.0729832649230957, "logits/rejected": 2.092390298843384, "logps/chosen": -39.88347244262695, "logps/rejected": -70.05050659179688, "loss": 1.1843, "nll_loss": 0.6538275480270386, "rewards/accuracies": 1.0, "rewards/chosen": 2.302992343902588, "rewards/margins": 1.625380039215088, "rewards/rejected": 0.6776123642921448, "step": 3491 }, { "epoch": 0.582, "grad_norm": 121.42243194580078, "learning_rate": 7.847023613421251e-08, "logits/chosen": 2.2952029705047607, "logits/rejected": 2.4862711429595947, "logps/chosen": -33.20497131347656, "logps/rejected": -126.51290130615234, "loss": 2.1314, "nll_loss": 2.0753109455108643, "rewards/accuracies": 1.0, "rewards/chosen": 0.5754623413085938, "rewards/margins": 4.471346855163574, "rewards/rejected": -3.8958847522735596, "step": 3492 }, { "epoch": 0.5821666666666667, "grad_norm": 115.80804443359375, "learning_rate": 7.841752591396108e-08, "logits/chosen": 3.1463236808776855, "logits/rejected": 3.055819034576416, "logps/chosen": -44.83789825439453, "logps/rejected": -46.47937774658203, "loss": 1.4797, "nll_loss": 1.179944634437561, "rewards/accuracies": 1.0, "rewards/chosen": -0.891356348991394, "rewards/margins": 1.6755915880203247, "rewards/rejected": -2.5669479370117188, "step": 3493 }, { "epoch": 0.5823333333333334, "grad_norm": 61.69679260253906, "learning_rate": 7.836482198232441e-08, "logits/chosen": 1.0740214586257935, "logits/rejected": 1.969094157218933, "logps/chosen": -11.569971084594727, "logps/rejected": -256.2380676269531, "loss": 0.8355, "nll_loss": 0.8264265656471252, "rewards/accuracies": 1.0, "rewards/chosen": 2.5328385829925537, "rewards/margins": 7.689485549926758, "rewards/rejected": -5.156646728515625, "step": 3494 }, { "epoch": 0.5825, "grad_norm": 26.755615234375, "learning_rate": 7.831212435465924e-08, "logits/chosen": 1.1546918153762817, "logits/rejected": 1.4158300161361694, "logps/chosen": -30.804218292236328, "logps/rejected": -196.81430053710938, "loss": 0.5459, "nll_loss": 0.513403594493866, "rewards/accuracies": 1.0, "rewards/chosen": 1.8963088989257812, "rewards/margins": 5.298971652984619, "rewards/rejected": -3.402662754058838, "step": 3495 }, { "epoch": 0.5826666666666667, "grad_norm": 49.71653366088867, "learning_rate": 7.825943304632032e-08, "logits/chosen": 2.181814670562744, "logits/rejected": 2.438965082168579, "logps/chosen": -5.022439956665039, "logps/rejected": -188.38816833496094, "loss": 0.3941, "nll_loss": 0.38634154200553894, "rewards/accuracies": 1.0, "rewards/chosen": 2.2387282848358154, "rewards/margins": 9.375322341918945, "rewards/rejected": -7.136593818664551, "step": 3496 }, { "epoch": 0.5828333333333333, "grad_norm": 20.79303550720215, "learning_rate": 7.82067480726607e-08, "logits/chosen": 3.2162675857543945, "logits/rejected": 3.209932804107666, "logps/chosen": -93.06256103515625, "logps/rejected": -164.64874267578125, "loss": 0.866, "nll_loss": 0.8616905212402344, "rewards/accuracies": 1.0, "rewards/chosen": 3.347734212875366, "rewards/margins": 9.113045692443848, "rewards/rejected": -5.765311241149902, "step": 3497 }, { "epoch": 0.583, "grad_norm": 55.76864242553711, "learning_rate": 7.815406944903147e-08, "logits/chosen": 2.5150270462036133, "logits/rejected": 2.9957501888275146, "logps/chosen": -15.436633110046387, "logps/rejected": -317.45550537109375, "loss": 0.6725, "nll_loss": 0.6431930065155029, "rewards/accuracies": 1.0, "rewards/chosen": 0.7884412407875061, "rewards/margins": 9.701146125793457, "rewards/rejected": -8.912704467773438, "step": 3498 }, { "epoch": 0.5831666666666667, "grad_norm": 26.370420455932617, "learning_rate": 7.810139719078195e-08, "logits/chosen": 2.007323980331421, "logits/rejected": 2.0765860080718994, "logps/chosen": -92.81916809082031, "logps/rejected": -282.7254638671875, "loss": 1.0168, "nll_loss": 1.0089040994644165, "rewards/accuracies": 1.0, "rewards/chosen": 2.1461944580078125, "rewards/margins": 13.98077392578125, "rewards/rejected": -11.834579467773438, "step": 3499 }, { "epoch": 0.5833333333333334, "grad_norm": 34.48750686645508, "learning_rate": 7.804873131325953e-08, "logits/chosen": 2.755596399307251, "logits/rejected": 2.833538293838501, "logps/chosen": -6.4994306564331055, "logps/rejected": -79.72956085205078, "loss": 0.3265, "nll_loss": 0.3094967007637024, "rewards/accuracies": 1.0, "rewards/chosen": 1.4864290952682495, "rewards/margins": 7.457394123077393, "rewards/rejected": -5.9709649085998535, "step": 3500 }, { "epoch": 0.5835, "grad_norm": 33.454193115234375, "learning_rate": 7.79960718318098e-08, "logits/chosen": 2.100827217102051, "logits/rejected": 2.1833739280700684, "logps/chosen": -76.04405975341797, "logps/rejected": -130.792236328125, "loss": 1.1338, "nll_loss": 1.0863438844680786, "rewards/accuracies": 1.0, "rewards/chosen": 1.4581489562988281, "rewards/margins": 4.585956573486328, "rewards/rejected": -3.1278076171875, "step": 3501 }, { "epoch": 0.5836666666666667, "grad_norm": 24.021577835083008, "learning_rate": 7.79434187617765e-08, "logits/chosen": 2.2636756896972656, "logits/rejected": 2.5673062801361084, "logps/chosen": -27.16179656982422, "logps/rejected": -110.71277618408203, "loss": 0.4663, "nll_loss": 0.45269668102264404, "rewards/accuracies": 1.0, "rewards/chosen": 2.4027504920959473, "rewards/margins": 6.85931396484375, "rewards/rejected": -4.456563472747803, "step": 3502 }, { "epoch": 0.5838333333333333, "grad_norm": 106.05935668945312, "learning_rate": 7.789077211850143e-08, "logits/chosen": 2.6305184364318848, "logits/rejected": 2.601229190826416, "logps/chosen": -52.574615478515625, "logps/rejected": -4.052043437957764, "loss": 1.8462, "nll_loss": 0.6490693688392639, "rewards/accuracies": 0.0, "rewards/chosen": 1.5011672973632812, "rewards/margins": -0.16626358032226562, "rewards/rejected": 1.6674308776855469, "step": 3503 }, { "epoch": 0.584, "grad_norm": 24.503437042236328, "learning_rate": 7.78381319173246e-08, "logits/chosen": 1.3620959520339966, "logits/rejected": 2.117849111557007, "logps/chosen": -186.11404418945312, "logps/rejected": -327.4519958496094, "loss": 0.9869, "nll_loss": 0.9744190573692322, "rewards/accuracies": 1.0, "rewards/chosen": 1.682835578918457, "rewards/margins": 11.643791198730469, "rewards/rejected": -9.960955619812012, "step": 3504 }, { "epoch": 0.5841666666666666, "grad_norm": 43.18165969848633, "learning_rate": 7.778549817358403e-08, "logits/chosen": 1.7781814336776733, "logits/rejected": 1.8842947483062744, "logps/chosen": -63.674747467041016, "logps/rejected": -84.34730529785156, "loss": 1.1561, "nll_loss": 1.1370490789413452, "rewards/accuracies": 1.0, "rewards/chosen": 1.3759167194366455, "rewards/margins": 7.182334899902344, "rewards/rejected": -5.806418418884277, "step": 3505 }, { "epoch": 0.5843333333333334, "grad_norm": 34.06118392944336, "learning_rate": 7.773287090261601e-08, "logits/chosen": 0.8928617238998413, "logits/rejected": 1.6563162803649902, "logps/chosen": -22.31877326965332, "logps/rejected": -281.4034423828125, "loss": 0.6989, "nll_loss": 0.697461724281311, "rewards/accuracies": 1.0, "rewards/chosen": 3.9794726371765137, "rewards/margins": 12.19700813293457, "rewards/rejected": -8.217535972595215, "step": 3506 }, { "epoch": 0.5845, "grad_norm": 26.149629592895508, "learning_rate": 7.76802501197548e-08, "logits/chosen": 2.5057735443115234, "logits/rejected": 2.2405741214752197, "logps/chosen": -95.72274017333984, "logps/rejected": -74.49052429199219, "loss": 0.935, "nll_loss": 0.9116452932357788, "rewards/accuracies": 1.0, "rewards/chosen": 2.2323670387268066, "rewards/margins": 5.9229936599731445, "rewards/rejected": -3.690626382827759, "step": 3507 }, { "epoch": 0.5846666666666667, "grad_norm": 114.42324829101562, "learning_rate": 7.762763584033289e-08, "logits/chosen": 1.8546591997146606, "logits/rejected": 1.8558133840560913, "logps/chosen": -13.931940078735352, "logps/rejected": -10.632559776306152, "loss": 1.8302, "nll_loss": 0.3239985704421997, "rewards/accuracies": 0.0, "rewards/chosen": 2.9838294982910156, "rewards/margins": -0.21913838386535645, "rewards/rejected": 3.202967882156372, "step": 3508 }, { "epoch": 0.5848333333333333, "grad_norm": 52.01879119873047, "learning_rate": 7.757502807968075e-08, "logits/chosen": 1.4110761880874634, "logits/rejected": 2.828495740890503, "logps/chosen": -37.27561569213867, "logps/rejected": -85.90546417236328, "loss": 1.2889, "nll_loss": 1.2853660583496094, "rewards/accuracies": 1.0, "rewards/chosen": 3.987823963165283, "rewards/margins": 9.562196731567383, "rewards/rejected": -5.574373245239258, "step": 3509 }, { "epoch": 0.585, "grad_norm": 25.13865089416504, "learning_rate": 7.752242685312709e-08, "logits/chosen": 2.600738525390625, "logits/rejected": 2.950148582458496, "logps/chosen": -92.52980041503906, "logps/rejected": -236.5784912109375, "loss": 0.9587, "nll_loss": 0.9539154767990112, "rewards/accuracies": 1.0, "rewards/chosen": 2.9417619705200195, "rewards/margins": 9.203081130981445, "rewards/rejected": -6.261319637298584, "step": 3510 }, { "epoch": 0.5851666666666666, "grad_norm": 40.67952346801758, "learning_rate": 7.746983217599857e-08, "logits/chosen": 0.5245911478996277, "logits/rejected": 1.7097245454788208, "logps/chosen": -35.69144821166992, "logps/rejected": -302.47698974609375, "loss": 0.994, "nll_loss": 0.9914290308952332, "rewards/accuracies": 1.0, "rewards/chosen": 3.6793949604034424, "rewards/margins": 10.254090309143066, "rewards/rejected": -6.574695110321045, "step": 3511 }, { "epoch": 0.5853333333333334, "grad_norm": 64.19261169433594, "learning_rate": 7.741724406362008e-08, "logits/chosen": 2.123527765274048, "logits/rejected": 1.9887479543685913, "logps/chosen": -35.17728042602539, "logps/rejected": -49.86826705932617, "loss": 0.8641, "nll_loss": 0.6281657814979553, "rewards/accuracies": 1.0, "rewards/chosen": 1.7061595916748047, "rewards/margins": 2.5380539894104004, "rewards/rejected": -0.8318943381309509, "step": 3512 }, { "epoch": 0.5855, "grad_norm": 143.78009033203125, "learning_rate": 7.73646625313145e-08, "logits/chosen": 2.21034574508667, "logits/rejected": 1.890047550201416, "logps/chosen": -16.365949630737305, "logps/rejected": -36.20773696899414, "loss": 3.7593, "nll_loss": 0.6294595003128052, "rewards/accuracies": 0.0, "rewards/chosen": 2.7137980461120605, "rewards/margins": -2.293058395385742, "rewards/rejected": 5.006856441497803, "step": 3513 }, { "epoch": 0.5856666666666667, "grad_norm": 27.466983795166016, "learning_rate": 7.731208759440287e-08, "logits/chosen": 2.8359410762786865, "logits/rejected": 2.9044349193573, "logps/chosen": -29.970355987548828, "logps/rejected": -209.4024200439453, "loss": 0.5496, "nll_loss": 0.5257956981658936, "rewards/accuracies": 1.0, "rewards/chosen": 4.419374465942383, "rewards/margins": 7.603391170501709, "rewards/rejected": -3.184016704559326, "step": 3514 }, { "epoch": 0.5858333333333333, "grad_norm": 33.919769287109375, "learning_rate": 7.72595192682042e-08, "logits/chosen": 2.5366029739379883, "logits/rejected": 2.6402788162231445, "logps/chosen": -75.5247802734375, "logps/rejected": -168.35552978515625, "loss": 1.1159, "nll_loss": 1.0789254903793335, "rewards/accuracies": 1.0, "rewards/chosen": 0.7694130539894104, "rewards/margins": 5.638224124908447, "rewards/rejected": -4.868811130523682, "step": 3515 }, { "epoch": 0.586, "grad_norm": 119.20795440673828, "learning_rate": 7.720695756803567e-08, "logits/chosen": 1.9581578969955444, "logits/rejected": 2.339331865310669, "logps/chosen": -60.61642837524414, "logps/rejected": -96.11853790283203, "loss": 1.284, "nll_loss": 0.7131344676017761, "rewards/accuracies": 1.0, "rewards/chosen": 1.1607433557510376, "rewards/margins": 0.9896755218505859, "rewards/rejected": 0.17106783390045166, "step": 3516 }, { "epoch": 0.5861666666666666, "grad_norm": 18.560211181640625, "learning_rate": 7.71544025092125e-08, "logits/chosen": 2.921311616897583, "logits/rejected": 2.630082607269287, "logps/chosen": -140.99200439453125, "logps/rejected": -53.39106750488281, "loss": 0.7841, "nll_loss": 0.7343334555625916, "rewards/accuracies": 1.0, "rewards/chosen": 2.870976448059082, "rewards/margins": 5.378801345825195, "rewards/rejected": -2.507824659347534, "step": 3517 }, { "epoch": 0.5863333333333334, "grad_norm": 90.4047622680664, "learning_rate": 7.710185410704798e-08, "logits/chosen": 1.743227243423462, "logits/rejected": 1.7918894290924072, "logps/chosen": -10.716991424560547, "logps/rejected": -29.717708587646484, "loss": 0.9625, "nll_loss": 0.3572330176830292, "rewards/accuracies": 1.0, "rewards/chosen": 1.2532758712768555, "rewards/margins": 0.9324239492416382, "rewards/rejected": 0.3208519220352173, "step": 3518 }, { "epoch": 0.5865, "grad_norm": 436.7142333984375, "learning_rate": 7.704931237685341e-08, "logits/chosen": 2.338312864303589, "logits/rejected": 2.5428783893585205, "logps/chosen": -75.44502258300781, "logps/rejected": -133.31997680664062, "loss": 5.6605, "nll_loss": 2.0956947803497314, "rewards/accuracies": 0.0, "rewards/chosen": -1.9615375995635986, "rewards/margins": -3.4568989276885986, "rewards/rejected": 1.495361328125, "step": 3519 }, { "epoch": 0.5866666666666667, "grad_norm": 30.626108169555664, "learning_rate": 7.699677733393826e-08, "logits/chosen": 1.8965778350830078, "logits/rejected": 2.5063369274139404, "logps/chosen": -75.34611511230469, "logps/rejected": -302.9351806640625, "loss": 1.0405, "nll_loss": 1.032138466835022, "rewards/accuracies": 1.0, "rewards/chosen": 2.1077728271484375, "rewards/margins": 11.075491905212402, "rewards/rejected": -8.967719078063965, "step": 3520 }, { "epoch": 0.5868333333333333, "grad_norm": 21.12318992614746, "learning_rate": 7.694424899360991e-08, "logits/chosen": 1.7069791555404663, "logits/rejected": 0.7733943462371826, "logps/chosen": -155.825927734375, "logps/rejected": -75.91815948486328, "loss": 1.0829, "nll_loss": 1.0746614933013916, "rewards/accuracies": 1.0, "rewards/chosen": 2.647007703781128, "rewards/margins": 7.866030693054199, "rewards/rejected": -5.219022750854492, "step": 3521 }, { "epoch": 0.587, "grad_norm": 32.072635650634766, "learning_rate": 7.689172737117389e-08, "logits/chosen": 1.4149576425552368, "logits/rejected": 2.073805332183838, "logps/chosen": -31.33646011352539, "logps/rejected": -268.9729919433594, "loss": 0.6816, "nll_loss": 0.6812273263931274, "rewards/accuracies": 1.0, "rewards/chosen": 5.3810224533081055, "rewards/margins": 15.484728813171387, "rewards/rejected": -10.103706359863281, "step": 3522 }, { "epoch": 0.5871666666666666, "grad_norm": 28.389806747436523, "learning_rate": 7.683921248193375e-08, "logits/chosen": 1.4937589168548584, "logits/rejected": 2.3570566177368164, "logps/chosen": -11.303507804870605, "logps/rejected": -165.13674926757812, "loss": 0.3644, "nll_loss": 0.35323458909988403, "rewards/accuracies": 1.0, "rewards/chosen": 3.7940704822540283, "rewards/margins": 7.899884223937988, "rewards/rejected": -4.105813503265381, "step": 3523 }, { "epoch": 0.5873333333333334, "grad_norm": 25.942659378051758, "learning_rate": 7.678670434119105e-08, "logits/chosen": 1.4614336490631104, "logits/rejected": 1.5857388973236084, "logps/chosen": -71.74681091308594, "logps/rejected": -167.73477172851562, "loss": 0.7339, "nll_loss": 0.7103644609451294, "rewards/accuracies": 1.0, "rewards/chosen": 1.0487350225448608, "rewards/margins": 8.303733825683594, "rewards/rejected": -7.254998683929443, "step": 3524 }, { "epoch": 0.5875, "grad_norm": 30.0970401763916, "learning_rate": 7.673420296424542e-08, "logits/chosen": 2.562330722808838, "logits/rejected": 2.772771120071411, "logps/chosen": -25.84917640686035, "logps/rejected": -399.2286682128906, "loss": 0.54, "nll_loss": 0.5068466663360596, "rewards/accuracies": 1.0, "rewards/chosen": 0.6642034649848938, "rewards/margins": 8.644895553588867, "rewards/rejected": -7.980691909790039, "step": 3525 }, { "epoch": 0.5876666666666667, "grad_norm": 85.89446258544922, "learning_rate": 7.668170836639445e-08, "logits/chosen": 2.390791654586792, "logits/rejected": 2.147312641143799, "logps/chosen": -61.3833122253418, "logps/rejected": -69.54351806640625, "loss": 1.3597, "nll_loss": 0.7869654297828674, "rewards/accuracies": 1.0, "rewards/chosen": 1.5384190082550049, "rewards/margins": 1.1319019794464111, "rewards/rejected": 0.40651705861091614, "step": 3526 }, { "epoch": 0.5878333333333333, "grad_norm": 25.582744598388672, "learning_rate": 7.662922056293388e-08, "logits/chosen": 0.9906366467475891, "logits/rejected": 2.381432056427002, "logps/chosen": -8.987210273742676, "logps/rejected": -367.4371643066406, "loss": 0.3007, "nll_loss": 0.280850350856781, "rewards/accuracies": 1.0, "rewards/chosen": 1.2781213521957397, "rewards/margins": 7.614858627319336, "rewards/rejected": -6.336737155914307, "step": 3527 }, { "epoch": 0.588, "grad_norm": 177.38504028320312, "learning_rate": 7.657673956915735e-08, "logits/chosen": 3.7051925659179688, "logits/rejected": 3.9552948474884033, "logps/chosen": -56.239295959472656, "logps/rejected": -118.56102752685547, "loss": 1.6251, "nll_loss": 0.6464287042617798, "rewards/accuracies": 1.0, "rewards/chosen": 1.3767814636230469, "rewards/margins": 0.17112576961517334, "rewards/rejected": 1.2056556940078735, "step": 3528 }, { "epoch": 0.5881666666666666, "grad_norm": 122.50385284423828, "learning_rate": 7.652426540035657e-08, "logits/chosen": 2.4747724533081055, "logits/rejected": 2.2073729038238525, "logps/chosen": -100.08944702148438, "logps/rejected": -58.58110809326172, "loss": 1.8443, "nll_loss": 1.33452570438385, "rewards/accuracies": 1.0, "rewards/chosen": -0.4356750547885895, "rewards/margins": 0.7845649719238281, "rewards/rejected": -1.2202399969100952, "step": 3529 }, { "epoch": 0.5883333333333334, "grad_norm": 36.80706787109375, "learning_rate": 7.647179807182125e-08, "logits/chosen": 2.426530122756958, "logits/rejected": 2.472745656967163, "logps/chosen": -36.424922943115234, "logps/rejected": -61.436729431152344, "loss": 0.8345, "nll_loss": 0.7749983668327332, "rewards/accuracies": 1.0, "rewards/chosen": 0.9921818375587463, "rewards/margins": 4.161229610443115, "rewards/rejected": -3.1690475940704346, "step": 3530 }, { "epoch": 0.5885, "grad_norm": 141.19317626953125, "learning_rate": 7.641933759883912e-08, "logits/chosen": 2.6384918689727783, "logits/rejected": 2.373272180557251, "logps/chosen": -76.02666473388672, "logps/rejected": -91.18611145019531, "loss": 2.0954, "nll_loss": 1.5205333232879639, "rewards/accuracies": 1.0, "rewards/chosen": -2.2763454914093018, "rewards/margins": 0.9643409252166748, "rewards/rejected": -3.2406864166259766, "step": 3531 }, { "epoch": 0.5886666666666667, "grad_norm": 22.709280014038086, "learning_rate": 7.636688399669588e-08, "logits/chosen": 2.680130958557129, "logits/rejected": 2.7460124492645264, "logps/chosen": -85.35234069824219, "logps/rejected": -212.39739990234375, "loss": 0.8343, "nll_loss": 0.8286635875701904, "rewards/accuracies": 1.0, "rewards/chosen": 2.7983620166778564, "rewards/margins": 8.845553398132324, "rewards/rejected": -6.047191143035889, "step": 3532 }, { "epoch": 0.5888333333333333, "grad_norm": 40.72364807128906, "learning_rate": 7.631443728067529e-08, "logits/chosen": 2.0150198936462402, "logits/rejected": 1.8433459997177124, "logps/chosen": -52.99547576904297, "logps/rejected": -50.53790283203125, "loss": 0.8973, "nll_loss": 0.7360482215881348, "rewards/accuracies": 1.0, "rewards/chosen": 1.5655510425567627, "rewards/margins": 2.9692940711975098, "rewards/rejected": -1.403743028640747, "step": 3533 }, { "epoch": 0.589, "grad_norm": 22.97353744506836, "learning_rate": 7.626199746605902e-08, "logits/chosen": 2.1737213134765625, "logits/rejected": 2.3516201972961426, "logps/chosen": -26.66474723815918, "logps/rejected": -118.00839233398438, "loss": 0.4833, "nll_loss": 0.46780261397361755, "rewards/accuracies": 1.0, "rewards/chosen": 1.5722917318344116, "rewards/margins": 7.731739521026611, "rewards/rejected": -6.15944766998291, "step": 3534 }, { "epoch": 0.5891666666666666, "grad_norm": 27.809728622436523, "learning_rate": 7.620956456812682e-08, "logits/chosen": 2.5396881103515625, "logits/rejected": 2.609127998352051, "logps/chosen": -43.27462387084961, "logps/rejected": -251.74658203125, "loss": 0.648, "nll_loss": 0.6271684765815735, "rewards/accuracies": 1.0, "rewards/chosen": 1.184841513633728, "rewards/margins": 8.174911499023438, "rewards/rejected": -6.99006986618042, "step": 3535 }, { "epoch": 0.5893333333333334, "grad_norm": 29.937698364257812, "learning_rate": 7.615713860215633e-08, "logits/chosen": 2.383942127227783, "logits/rejected": 2.654216766357422, "logps/chosen": -47.74606704711914, "logps/rejected": -327.701904296875, "loss": 0.8378, "nll_loss": 0.8232079148292542, "rewards/accuracies": 1.0, "rewards/chosen": 1.5538244247436523, "rewards/margins": 8.97724723815918, "rewards/rejected": -7.423422336578369, "step": 3536 }, { "epoch": 0.5895, "grad_norm": 87.09233093261719, "learning_rate": 7.610471958342325e-08, "logits/chosen": 2.9604134559631348, "logits/rejected": 3.091961145401001, "logps/chosen": -18.16169548034668, "logps/rejected": -256.3507995605469, "loss": 1.2237, "nll_loss": 1.2107795476913452, "rewards/accuracies": 1.0, "rewards/chosen": 1.6630237102508545, "rewards/margins": 9.729183197021484, "rewards/rejected": -8.06615924835205, "step": 3537 }, { "epoch": 0.5896666666666667, "grad_norm": 20.899608612060547, "learning_rate": 7.60523075272012e-08, "logits/chosen": 2.054872989654541, "logits/rejected": 2.3038554191589355, "logps/chosen": -40.25218200683594, "logps/rejected": -227.17776489257812, "loss": 0.5086, "nll_loss": 0.4969405233860016, "rewards/accuracies": 1.0, "rewards/chosen": 1.7995704412460327, "rewards/margins": 9.000336647033691, "rewards/rejected": -7.200766086578369, "step": 3538 }, { "epoch": 0.5898333333333333, "grad_norm": 47.98706817626953, "learning_rate": 7.599990244876182e-08, "logits/chosen": 2.367720603942871, "logits/rejected": 2.473151922225952, "logps/chosen": -79.2712631225586, "logps/rejected": -184.57342529296875, "loss": 1.019, "nll_loss": 0.9908908009529114, "rewards/accuracies": 1.0, "rewards/chosen": 1.0891541242599487, "rewards/margins": 6.01310920715332, "rewards/rejected": -4.923954963684082, "step": 3539 }, { "epoch": 0.59, "grad_norm": 26.20612335205078, "learning_rate": 7.594750436337466e-08, "logits/chosen": 1.6428014039993286, "logits/rejected": 1.785056710243225, "logps/chosen": -54.46932601928711, "logps/rejected": -128.40240478515625, "loss": 0.6682, "nll_loss": 0.6260843276977539, "rewards/accuracies": 1.0, "rewards/chosen": 1.2698001861572266, "rewards/margins": 4.765871047973633, "rewards/rejected": -3.4960708618164062, "step": 3540 }, { "epoch": 0.5901666666666666, "grad_norm": 33.880062103271484, "learning_rate": 7.58951132863073e-08, "logits/chosen": 2.419503688812256, "logits/rejected": 2.775730609893799, "logps/chosen": -38.197513580322266, "logps/rejected": -664.8028564453125, "loss": 0.8333, "nll_loss": 0.8127132058143616, "rewards/accuracies": 1.0, "rewards/chosen": 1.2793049812316895, "rewards/margins": 7.144490718841553, "rewards/rejected": -5.865185737609863, "step": 3541 }, { "epoch": 0.5903333333333334, "grad_norm": 99.86167907714844, "learning_rate": 7.584272923282523e-08, "logits/chosen": 3.1708970069885254, "logits/rejected": 3.408282995223999, "logps/chosen": -42.77883529663086, "logps/rejected": -200.30397033691406, "loss": 1.6944, "nll_loss": 1.6453399658203125, "rewards/accuracies": 1.0, "rewards/chosen": 0.2431362122297287, "rewards/margins": 7.170684814453125, "rewards/rejected": -6.927548408508301, "step": 3542 }, { "epoch": 0.5905, "grad_norm": 22.788686752319336, "learning_rate": 7.579035221819186e-08, "logits/chosen": 3.1604087352752686, "logits/rejected": 3.1038296222686768, "logps/chosen": -107.82683563232422, "logps/rejected": -191.67697143554688, "loss": 0.8086, "nll_loss": 0.7870572209358215, "rewards/accuracies": 1.0, "rewards/chosen": 1.1156425476074219, "rewards/margins": 9.756636619567871, "rewards/rejected": -8.64099407196045, "step": 3543 }, { "epoch": 0.5906666666666667, "grad_norm": 23.48126792907715, "learning_rate": 7.573798225766868e-08, "logits/chosen": 2.4852700233459473, "logits/rejected": 2.8035287857055664, "logps/chosen": -47.287662506103516, "logps/rejected": -314.3375244140625, "loss": 0.5585, "nll_loss": 0.5498565435409546, "rewards/accuracies": 1.0, "rewards/chosen": 2.0524914264678955, "rewards/margins": 14.424878120422363, "rewards/rejected": -12.372386932373047, "step": 3544 }, { "epoch": 0.5908333333333333, "grad_norm": 15.874468803405762, "learning_rate": 7.568561936651496e-08, "logits/chosen": 1.334916353225708, "logits/rejected": 1.6276233196258545, "logps/chosen": -159.29171752929688, "logps/rejected": -386.0281066894531, "loss": 0.7327, "nll_loss": 0.7273595929145813, "rewards/accuracies": 1.0, "rewards/chosen": 2.5525972843170166, "rewards/margins": 13.250210762023926, "rewards/rejected": -10.697613716125488, "step": 3545 }, { "epoch": 0.591, "grad_norm": 36.788394927978516, "learning_rate": 7.563326355998803e-08, "logits/chosen": 1.2209315299987793, "logits/rejected": 1.915724754333496, "logps/chosen": -18.339439392089844, "logps/rejected": -230.28489685058594, "loss": 0.5744, "nll_loss": 0.5731074213981628, "rewards/accuracies": 1.0, "rewards/chosen": 5.055909633636475, "rewards/margins": 11.541440963745117, "rewards/rejected": -6.485531806945801, "step": 3546 }, { "epoch": 0.5911666666666666, "grad_norm": 49.66835403442383, "learning_rate": 7.558091485334313e-08, "logits/chosen": 2.2862484455108643, "logits/rejected": 2.591430902481079, "logps/chosen": -100.73999786376953, "logps/rejected": -310.8682861328125, "loss": 1.7579, "nll_loss": 1.707457184791565, "rewards/accuracies": 1.0, "rewards/chosen": 0.19967499375343323, "rewards/margins": 7.524898052215576, "rewards/rejected": -7.325222969055176, "step": 3547 }, { "epoch": 0.5913333333333334, "grad_norm": 36.380950927734375, "learning_rate": 7.55285732618334e-08, "logits/chosen": 2.3092074394226074, "logits/rejected": 2.0833933353424072, "logps/chosen": -27.06433868408203, "logps/rejected": -100.574462890625, "loss": 0.6765, "nll_loss": 0.6601058840751648, "rewards/accuracies": 1.0, "rewards/chosen": 1.6528396606445312, "rewards/margins": 6.966844081878662, "rewards/rejected": -5.314004421234131, "step": 3548 }, { "epoch": 0.5915, "grad_norm": 26.686206817626953, "learning_rate": 7.547623880070992e-08, "logits/chosen": 1.6851577758789062, "logits/rejected": 2.1550004482269287, "logps/chosen": -63.65424346923828, "logps/rejected": -181.9295654296875, "loss": 0.7685, "nll_loss": 0.7488732933998108, "rewards/accuracies": 1.0, "rewards/chosen": 1.2239418029785156, "rewards/margins": 8.867533683776855, "rewards/rejected": -7.64359188079834, "step": 3549 }, { "epoch": 0.5916666666666667, "grad_norm": 20.987018585205078, "learning_rate": 7.542391148522167e-08, "logits/chosen": 1.458801507949829, "logits/rejected": 1.493553638458252, "logps/chosen": -35.019432067871094, "logps/rejected": -114.43509674072266, "loss": 0.4686, "nll_loss": 0.4489670693874359, "rewards/accuracies": 1.0, "rewards/chosen": 1.4163841009140015, "rewards/margins": 6.790131092071533, "rewards/rejected": -5.373746871948242, "step": 3550 }, { "epoch": 0.5918333333333333, "grad_norm": 39.57831573486328, "learning_rate": 7.537159133061561e-08, "logits/chosen": 0.929622232913971, "logits/rejected": 2.6962294578552246, "logps/chosen": -17.905895233154297, "logps/rejected": -518.9931640625, "loss": 0.6196, "nll_loss": 0.6174445748329163, "rewards/accuracies": 1.0, "rewards/chosen": 3.5690343379974365, "rewards/margins": 11.338282585144043, "rewards/rejected": -7.769248008728027, "step": 3551 }, { "epoch": 0.592, "grad_norm": 17.36972427368164, "learning_rate": 7.531927835213656e-08, "logits/chosen": 0.7671974301338196, "logits/rejected": 1.458059549331665, "logps/chosen": -145.3140869140625, "logps/rejected": -298.906494140625, "loss": 0.7569, "nll_loss": 0.7452004551887512, "rewards/accuracies": 1.0, "rewards/chosen": 1.7447541952133179, "rewards/margins": 11.177477836608887, "rewards/rejected": -9.432723999023438, "step": 3552 }, { "epoch": 0.5921666666666666, "grad_norm": 20.881614685058594, "learning_rate": 7.52669725650273e-08, "logits/chosen": 1.1462295055389404, "logits/rejected": 1.0207793712615967, "logps/chosen": -81.87652587890625, "logps/rejected": -122.2779541015625, "loss": 0.6071, "nll_loss": 0.5976388454437256, "rewards/accuracies": 1.0, "rewards/chosen": 2.0746140480041504, "rewards/margins": 8.543062210083008, "rewards/rejected": -6.468447685241699, "step": 3553 }, { "epoch": 0.5923333333333334, "grad_norm": 20.801271438598633, "learning_rate": 7.521467398452841e-08, "logits/chosen": 2.9964892864227295, "logits/rejected": 3.161555051803589, "logps/chosen": -17.036766052246094, "logps/rejected": -347.78448486328125, "loss": 0.3113, "nll_loss": 0.3042279779911041, "rewards/accuracies": 1.0, "rewards/chosen": 2.3857014179229736, "rewards/margins": 8.997820854187012, "rewards/rejected": -6.612119197845459, "step": 3554 }, { "epoch": 0.5925, "grad_norm": 30.747447967529297, "learning_rate": 7.516238262587851e-08, "logits/chosen": 2.1315977573394775, "logits/rejected": 2.493420124053955, "logps/chosen": -55.64509582519531, "logps/rejected": -384.114013671875, "loss": 0.8711, "nll_loss": 0.869454562664032, "rewards/accuracies": 1.0, "rewards/chosen": 3.9446840286254883, "rewards/margins": 11.364856719970703, "rewards/rejected": -7.420172214508057, "step": 3555 }, { "epoch": 0.5926666666666667, "grad_norm": 23.57273292541504, "learning_rate": 7.511009850431401e-08, "logits/chosen": 0.9432154893875122, "logits/rejected": 2.017637014389038, "logps/chosen": -44.02676773071289, "logps/rejected": -314.5348815917969, "loss": 0.6118, "nll_loss": 0.5870235562324524, "rewards/accuracies": 1.0, "rewards/chosen": 0.9646084308624268, "rewards/margins": 9.042196273803711, "rewards/rejected": -8.077588081359863, "step": 3556 }, { "epoch": 0.5928333333333333, "grad_norm": 34.22331237792969, "learning_rate": 7.505782163506926e-08, "logits/chosen": 2.8953309059143066, "logits/rejected": 2.6722168922424316, "logps/chosen": -88.17044830322266, "logps/rejected": -72.91722869873047, "loss": 0.8582, "nll_loss": 0.7802694439888, "rewards/accuracies": 1.0, "rewards/chosen": 1.6051628589630127, "rewards/margins": 3.9489994049072266, "rewards/rejected": -2.343836545944214, "step": 3557 }, { "epoch": 0.593, "grad_norm": 29.389015197753906, "learning_rate": 7.500555203337647e-08, "logits/chosen": 2.780566453933716, "logits/rejected": 2.6382884979248047, "logps/chosen": -71.5441665649414, "logps/rejected": -64.55486297607422, "loss": 0.8698, "nll_loss": 0.851716160774231, "rewards/accuracies": 1.0, "rewards/chosen": 1.866315484046936, "rewards/margins": 6.368751525878906, "rewards/rejected": -4.50243616104126, "step": 3558 }, { "epoch": 0.5931666666666666, "grad_norm": 36.203636169433594, "learning_rate": 7.495328971446578e-08, "logits/chosen": 2.498868703842163, "logits/rejected": 2.7215824127197266, "logps/chosen": -55.742347717285156, "logps/rejected": -287.1507263183594, "loss": 0.9098, "nll_loss": 0.8575747013092041, "rewards/accuracies": 1.0, "rewards/chosen": 1.456581950187683, "rewards/margins": 4.4421868324279785, "rewards/rejected": -2.985605001449585, "step": 3559 }, { "epoch": 0.5933333333333334, "grad_norm": 18.633024215698242, "learning_rate": 7.490103469356513e-08, "logits/chosen": 1.8848025798797607, "logits/rejected": 1.7507219314575195, "logps/chosen": -10.69876766204834, "logps/rejected": -52.00577163696289, "loss": 0.2541, "nll_loss": 0.22289098799228668, "rewards/accuracies": 1.0, "rewards/chosen": 1.9263010025024414, "rewards/margins": 5.375534534454346, "rewards/rejected": -3.4492335319519043, "step": 3560 }, { "epoch": 0.5935, "grad_norm": 30.50336456298828, "learning_rate": 7.484878698590039e-08, "logits/chosen": 2.203507661819458, "logits/rejected": 2.2460312843322754, "logps/chosen": -9.307536125183105, "logps/rejected": -155.73777770996094, "loss": 0.3637, "nll_loss": 0.3579821288585663, "rewards/accuracies": 1.0, "rewards/chosen": 2.5024852752685547, "rewards/margins": 10.499228477478027, "rewards/rejected": -7.996743202209473, "step": 3561 }, { "epoch": 0.5936666666666667, "grad_norm": 60.37765884399414, "learning_rate": 7.479654660669533e-08, "logits/chosen": 3.1611392498016357, "logits/rejected": 3.345737934112549, "logps/chosen": -8.851907730102539, "logps/rejected": -204.0791015625, "loss": 0.5611, "nll_loss": 0.5532442331314087, "rewards/accuracies": 1.0, "rewards/chosen": 2.7536940574645996, "rewards/margins": 7.9215779304504395, "rewards/rejected": -5.16788387298584, "step": 3562 }, { "epoch": 0.5938333333333333, "grad_norm": 45.71942901611328, "learning_rate": 7.474431357117149e-08, "logits/chosen": 1.5283608436584473, "logits/rejected": 1.9077955484390259, "logps/chosen": -8.015742301940918, "logps/rejected": -122.66138458251953, "loss": 0.4219, "nll_loss": 0.3817020058631897, "rewards/accuracies": 1.0, "rewards/chosen": 1.094987392425537, "rewards/margins": 4.903450965881348, "rewards/rejected": -3.8084635734558105, "step": 3563 }, { "epoch": 0.594, "grad_norm": 26.012483596801758, "learning_rate": 7.469208789454837e-08, "logits/chosen": 2.5650112628936768, "logits/rejected": 2.729349136352539, "logps/chosen": -75.90139770507812, "logps/rejected": -305.33917236328125, "loss": 0.7949, "nll_loss": 0.7745040655136108, "rewards/accuracies": 1.0, "rewards/chosen": 2.623046875, "rewards/margins": 6.3144073486328125, "rewards/rejected": -3.6913607120513916, "step": 3564 }, { "epoch": 0.5941666666666666, "grad_norm": 22.106590270996094, "learning_rate": 7.463986959204322e-08, "logits/chosen": 2.841554880142212, "logits/rejected": 2.8126416206359863, "logps/chosen": -165.17922973632812, "logps/rejected": -100.65612030029297, "loss": 1.0542, "nll_loss": 1.0259579420089722, "rewards/accuracies": 1.0, "rewards/chosen": 2.615657091140747, "rewards/margins": 5.884531021118164, "rewards/rejected": -3.268873691558838, "step": 3565 }, { "epoch": 0.5943333333333334, "grad_norm": 59.24945068359375, "learning_rate": 7.458765867887127e-08, "logits/chosen": 2.005261182785034, "logits/rejected": 2.0854580402374268, "logps/chosen": -19.941864013671875, "logps/rejected": -59.379608154296875, "loss": 0.9217, "nll_loss": 0.7385876774787903, "rewards/accuracies": 1.0, "rewards/chosen": 1.9373276233673096, "rewards/margins": 3.0348312854766846, "rewards/rejected": -1.097503662109375, "step": 3566 }, { "epoch": 0.5945, "grad_norm": 38.18098831176758, "learning_rate": 7.453545517024546e-08, "logits/chosen": 2.8054232597351074, "logits/rejected": 2.768669366836548, "logps/chosen": -16.074825286865234, "logps/rejected": -281.1922607421875, "loss": 0.562, "nll_loss": 0.554304301738739, "rewards/accuracies": 1.0, "rewards/chosen": 2.427384614944458, "rewards/margins": 8.368415832519531, "rewards/rejected": -5.941030979156494, "step": 3567 }, { "epoch": 0.5946666666666667, "grad_norm": 25.70992088317871, "learning_rate": 7.44832590813767e-08, "logits/chosen": 2.532205581665039, "logits/rejected": 2.710365056991577, "logps/chosen": -100.36943054199219, "logps/rejected": -358.22265625, "loss": 1.0913, "nll_loss": 1.0677597522735596, "rewards/accuracies": 1.0, "rewards/chosen": 1.0086029767990112, "rewards/margins": 11.894596099853516, "rewards/rejected": -10.885993003845215, "step": 3568 }, { "epoch": 0.5948333333333333, "grad_norm": 25.638216018676758, "learning_rate": 7.44310704274736e-08, "logits/chosen": 2.4229509830474854, "logits/rejected": 2.4383151531219482, "logps/chosen": -68.08627319335938, "logps/rejected": -82.8807373046875, "loss": 0.8308, "nll_loss": 0.7826009392738342, "rewards/accuracies": 1.0, "rewards/chosen": 3.547328472137451, "rewards/margins": 6.014137268066406, "rewards/rejected": -2.466808795928955, "step": 3569 }, { "epoch": 0.595, "grad_norm": 102.42214965820312, "learning_rate": 7.437888922374276e-08, "logits/chosen": 2.4431424140930176, "logits/rejected": 2.4914798736572266, "logps/chosen": -72.64762878417969, "logps/rejected": -19.333847045898438, "loss": 2.4092, "nll_loss": 1.4826045036315918, "rewards/accuracies": 1.0, "rewards/chosen": 3.2242960929870605, "rewards/margins": 0.9827792644500732, "rewards/rejected": 2.2415168285369873, "step": 3570 }, { "epoch": 0.5951666666666666, "grad_norm": 28.127958297729492, "learning_rate": 7.432671548538847e-08, "logits/chosen": 3.889615535736084, "logits/rejected": 4.016290187835693, "logps/chosen": -25.609617233276367, "logps/rejected": -268.28802490234375, "loss": 0.5546, "nll_loss": 0.5335336923599243, "rewards/accuracies": 1.0, "rewards/chosen": 1.3703527450561523, "rewards/margins": 6.553305625915527, "rewards/rejected": -5.182952880859375, "step": 3571 }, { "epoch": 0.5953333333333334, "grad_norm": 27.91183090209961, "learning_rate": 7.427454922761296e-08, "logits/chosen": 3.164351224899292, "logits/rejected": 3.204536199569702, "logps/chosen": -47.545223236083984, "logps/rejected": -42.860904693603516, "loss": 0.7839, "nll_loss": 0.7314649820327759, "rewards/accuracies": 1.0, "rewards/chosen": 1.5361194610595703, "rewards/margins": 4.461801528930664, "rewards/rejected": -2.9256818294525146, "step": 3572 }, { "epoch": 0.5955, "grad_norm": 20.149932861328125, "learning_rate": 7.422239046561618e-08, "logits/chosen": 2.6674771308898926, "logits/rejected": 2.8426177501678467, "logps/chosen": -42.06089401245117, "logps/rejected": -267.2381896972656, "loss": 0.5325, "nll_loss": 0.5257611870765686, "rewards/accuracies": 1.0, "rewards/chosen": 2.3042638301849365, "rewards/margins": 11.603397369384766, "rewards/rejected": -9.29913330078125, "step": 3573 }, { "epoch": 0.5956666666666667, "grad_norm": 48.76880645751953, "learning_rate": 7.417023921459596e-08, "logits/chosen": 2.6240861415863037, "logits/rejected": 2.644280195236206, "logps/chosen": -18.939491271972656, "logps/rejected": -134.81297302246094, "loss": 0.7926, "nll_loss": 0.7575797438621521, "rewards/accuracies": 1.0, "rewards/chosen": 0.754673421382904, "rewards/margins": 5.9755024909973145, "rewards/rejected": -5.220829010009766, "step": 3574 }, { "epoch": 0.5958333333333333, "grad_norm": 51.359375, "learning_rate": 7.411809548974791e-08, "logits/chosen": 1.8319697380065918, "logits/rejected": 2.1116349697113037, "logps/chosen": -18.25349998474121, "logps/rejected": -72.30369567871094, "loss": 0.724, "nll_loss": 0.5531362891197205, "rewards/accuracies": 1.0, "rewards/chosen": 1.354799509048462, "rewards/margins": 2.7847557067871094, "rewards/rejected": -1.429956078529358, "step": 3575 }, { "epoch": 0.596, "grad_norm": 22.73074722290039, "learning_rate": 7.40659593062655e-08, "logits/chosen": 2.3494083881378174, "logits/rejected": 2.5838119983673096, "logps/chosen": -13.00130844116211, "logps/rejected": -225.28225708007812, "loss": 0.2951, "nll_loss": 0.26533281803131104, "rewards/accuracies": 1.0, "rewards/chosen": 0.7668367624282837, "rewards/margins": 8.941933631896973, "rewards/rejected": -8.17509651184082, "step": 3576 }, { "epoch": 0.5961666666666666, "grad_norm": 23.29698944091797, "learning_rate": 7.401383067933989e-08, "logits/chosen": 1.8647785186767578, "logits/rejected": 2.061671733856201, "logps/chosen": -106.46183776855469, "logps/rejected": -140.8004913330078, "loss": 1.0474, "nll_loss": 1.0336101055145264, "rewards/accuracies": 1.0, "rewards/chosen": 1.7730073928833008, "rewards/margins": 7.442627906799316, "rewards/rejected": -5.669620513916016, "step": 3577 }, { "epoch": 0.5963333333333334, "grad_norm": 52.13308334350586, "learning_rate": 7.396170962416018e-08, "logits/chosen": 2.4227633476257324, "logits/rejected": 2.457552909851074, "logps/chosen": -33.383819580078125, "logps/rejected": -191.76918029785156, "loss": 0.9455, "nll_loss": 0.9022653698921204, "rewards/accuracies": 1.0, "rewards/chosen": 0.48491325974464417, "rewards/margins": 5.863390922546387, "rewards/rejected": -5.378477573394775, "step": 3578 }, { "epoch": 0.5965, "grad_norm": 42.15876388549805, "learning_rate": 7.390959615591315e-08, "logits/chosen": 1.3011488914489746, "logits/rejected": 1.9565963745117188, "logps/chosen": -49.753517150878906, "logps/rejected": -474.4962158203125, "loss": 1.038, "nll_loss": 1.0365315675735474, "rewards/accuracies": 1.0, "rewards/chosen": 3.8395328521728516, "rewards/margins": 18.393136978149414, "rewards/rejected": -14.553604125976562, "step": 3579 }, { "epoch": 0.5966666666666667, "grad_norm": 125.5688247680664, "learning_rate": 7.385749028978346e-08, "logits/chosen": 2.1045279502868652, "logits/rejected": 1.9197916984558105, "logps/chosen": -71.04240417480469, "logps/rejected": -43.34670639038086, "loss": 1.9405, "nll_loss": 1.5787200927734375, "rewards/accuracies": 1.0, "rewards/chosen": -0.2711448669433594, "rewards/margins": 1.3005889654159546, "rewards/rejected": -1.571733832359314, "step": 3580 }, { "epoch": 0.5968333333333333, "grad_norm": 254.156005859375, "learning_rate": 7.380539204095344e-08, "logits/chosen": 1.7017167806625366, "logits/rejected": 1.8190287351608276, "logps/chosen": -57.8580322265625, "logps/rejected": -33.84803771972656, "loss": 4.5949, "nll_loss": 1.091660737991333, "rewards/accuracies": 0.0, "rewards/chosen": 1.2904205322265625, "rewards/margins": -2.971776008605957, "rewards/rejected": 4.2621965408325195, "step": 3581 }, { "epoch": 0.597, "grad_norm": 37.41218185424805, "learning_rate": 7.37533014246033e-08, "logits/chosen": 1.3542782068252563, "logits/rejected": 2.1863725185394287, "logps/chosen": -23.959592819213867, "logps/rejected": -245.36061096191406, "loss": 0.6527, "nll_loss": 0.6305155754089355, "rewards/accuracies": 1.0, "rewards/chosen": 1.0750772953033447, "rewards/margins": 9.950425148010254, "rewards/rejected": -8.875348091125488, "step": 3582 }, { "epoch": 0.5971666666666666, "grad_norm": 28.510578155517578, "learning_rate": 7.370121845591104e-08, "logits/chosen": 1.0841518640518188, "logits/rejected": 1.3574094772338867, "logps/chosen": -105.62277221679688, "logps/rejected": -135.18719482421875, "loss": 0.9217, "nll_loss": 0.9027586579322815, "rewards/accuracies": 1.0, "rewards/chosen": 1.2799057960510254, "rewards/margins": 8.182416915893555, "rewards/rejected": -6.902511119842529, "step": 3583 }, { "epoch": 0.5973333333333334, "grad_norm": 162.38099670410156, "learning_rate": 7.364914315005233e-08, "logits/chosen": 2.38399338722229, "logits/rejected": 2.2863540649414062, "logps/chosen": -186.41310119628906, "logps/rejected": -274.63031005859375, "loss": 2.4084, "nll_loss": 1.4230010509490967, "rewards/accuracies": 1.0, "rewards/chosen": -5.876540660858154, "rewards/margins": 3.5112080574035645, "rewards/rejected": -9.387748718261719, "step": 3584 }, { "epoch": 0.5975, "grad_norm": 29.77931022644043, "learning_rate": 7.35970755222007e-08, "logits/chosen": 0.6371913552284241, "logits/rejected": 2.347505569458008, "logps/chosen": -63.056697845458984, "logps/rejected": -418.4762878417969, "loss": 0.8978, "nll_loss": 0.8757874965667725, "rewards/accuracies": 1.0, "rewards/chosen": 1.2220417261123657, "rewards/margins": 6.8860650062561035, "rewards/rejected": -5.664023399353027, "step": 3585 }, { "epoch": 0.5976666666666667, "grad_norm": 39.21742248535156, "learning_rate": 7.354501558752735e-08, "logits/chosen": 2.480855703353882, "logits/rejected": 2.547410249710083, "logps/chosen": -14.237541198730469, "logps/rejected": -59.482093811035156, "loss": 0.5205, "nll_loss": 0.4745846390724182, "rewards/accuracies": 1.0, "rewards/chosen": 0.6858402490615845, "rewards/margins": 4.916082382202148, "rewards/rejected": -4.2302422523498535, "step": 3586 }, { "epoch": 0.5978333333333333, "grad_norm": 106.76408386230469, "learning_rate": 7.349296336120136e-08, "logits/chosen": 3.282529830932617, "logits/rejected": 3.141052484512329, "logps/chosen": -78.81765747070312, "logps/rejected": -117.7625732421875, "loss": 1.6654, "nll_loss": 1.2712523937225342, "rewards/accuracies": 1.0, "rewards/chosen": -2.5617032051086426, "rewards/margins": 2.6762404441833496, "rewards/rejected": -5.237943649291992, "step": 3587 }, { "epoch": 0.598, "grad_norm": 38.29526138305664, "learning_rate": 7.344091885838947e-08, "logits/chosen": 0.532575786113739, "logits/rejected": 1.8970928192138672, "logps/chosen": -16.89154624938965, "logps/rejected": -319.9472351074219, "loss": 0.565, "nll_loss": 0.5630515217781067, "rewards/accuracies": 1.0, "rewards/chosen": 3.557427167892456, "rewards/margins": 16.52886962890625, "rewards/rejected": -12.971443176269531, "step": 3588 }, { "epoch": 0.5981666666666666, "grad_norm": 162.8354034423828, "learning_rate": 7.338888209425622e-08, "logits/chosen": 2.5229039192199707, "logits/rejected": 2.558650016784668, "logps/chosen": -123.98541259765625, "logps/rejected": -150.24713134765625, "loss": 1.752, "nll_loss": 1.6984305381774902, "rewards/accuracies": 1.0, "rewards/chosen": 0.24846269190311432, "rewards/margins": 5.464889049530029, "rewards/rejected": -5.216426372528076, "step": 3589 }, { "epoch": 0.5983333333333334, "grad_norm": 34.659828186035156, "learning_rate": 7.333685308396382e-08, "logits/chosen": 2.942457675933838, "logits/rejected": 2.9702517986297607, "logps/chosen": -57.79277801513672, "logps/rejected": -29.282148361206055, "loss": 0.8321, "nll_loss": 0.7315541505813599, "rewards/accuracies": 1.0, "rewards/chosen": 1.6843299865722656, "rewards/margins": 3.6651391983032227, "rewards/rejected": -1.980809211730957, "step": 3590 }, { "epoch": 0.5985, "grad_norm": 28.85276222229004, "learning_rate": 7.328483184267235e-08, "logits/chosen": 1.5899595022201538, "logits/rejected": 0.7741201519966125, "logps/chosen": -168.20608520507812, "logps/rejected": -63.4241828918457, "loss": 0.9112, "nll_loss": 0.849525511264801, "rewards/accuracies": 1.0, "rewards/chosen": 1.6948137283325195, "rewards/margins": 4.312300205230713, "rewards/rejected": -2.6174864768981934, "step": 3591 }, { "epoch": 0.5986666666666667, "grad_norm": 26.406776428222656, "learning_rate": 7.323281838553948e-08, "logits/chosen": 2.4332306385040283, "logits/rejected": 2.593151807785034, "logps/chosen": -80.05425262451172, "logps/rejected": -168.62208557128906, "loss": 0.8687, "nll_loss": 0.851641058921814, "rewards/accuracies": 1.0, "rewards/chosen": 1.567023515701294, "rewards/margins": 6.993914604187012, "rewards/rejected": -5.426891326904297, "step": 3592 }, { "epoch": 0.5988333333333333, "grad_norm": 32.18839645385742, "learning_rate": 7.318081272772074e-08, "logits/chosen": 2.6954751014709473, "logits/rejected": 2.668989419937134, "logps/chosen": -36.075653076171875, "logps/rejected": -40.338714599609375, "loss": 0.793, "nll_loss": 0.7515759468078613, "rewards/accuracies": 1.0, "rewards/chosen": 5.258703708648682, "rewards/margins": 7.824106216430664, "rewards/rejected": -2.5654027462005615, "step": 3593 }, { "epoch": 0.599, "grad_norm": 36.832523345947266, "learning_rate": 7.312881488436927e-08, "logits/chosen": 0.3643185496330261, "logits/rejected": 2.44338321685791, "logps/chosen": -11.760465621948242, "logps/rejected": -409.259765625, "loss": 0.4068, "nll_loss": 0.379369854927063, "rewards/accuracies": 1.0, "rewards/chosen": 0.8447985649108887, "rewards/margins": 9.385473251342773, "rewards/rejected": -8.540674209594727, "step": 3594 }, { "epoch": 0.5991666666666666, "grad_norm": 22.998214721679688, "learning_rate": 7.307682487063607e-08, "logits/chosen": 1.7957111597061157, "logits/rejected": 2.3126189708709717, "logps/chosen": -33.90204620361328, "logps/rejected": -160.83279418945312, "loss": 0.5119, "nll_loss": 0.4913339614868164, "rewards/accuracies": 1.0, "rewards/chosen": 1.2226104736328125, "rewards/margins": 7.5667805671691895, "rewards/rejected": -6.344170093536377, "step": 3595 }, { "epoch": 0.5993333333333334, "grad_norm": 31.39037322998047, "learning_rate": 7.30248427016697e-08, "logits/chosen": 2.714733123779297, "logits/rejected": 2.6450393199920654, "logps/chosen": -107.068603515625, "logps/rejected": -127.05259704589844, "loss": 1.0624, "nll_loss": 1.0496922731399536, "rewards/accuracies": 1.0, "rewards/chosen": 1.9377793073654175, "rewards/margins": 7.330798625946045, "rewards/rejected": -5.393019199371338, "step": 3596 }, { "epoch": 0.5995, "grad_norm": 34.708160400390625, "learning_rate": 7.297286839261658e-08, "logits/chosen": 1.7370057106018066, "logits/rejected": 2.075869560241699, "logps/chosen": -48.722686767578125, "logps/rejected": -151.3603057861328, "loss": 0.8811, "nll_loss": 0.870047926902771, "rewards/accuracies": 1.0, "rewards/chosen": 2.0221450328826904, "rewards/margins": 7.754648208618164, "rewards/rejected": -5.732503414154053, "step": 3597 }, { "epoch": 0.5996666666666667, "grad_norm": 26.76146125793457, "learning_rate": 7.292090195862073e-08, "logits/chosen": 1.8132942914962769, "logits/rejected": 2.2154603004455566, "logps/chosen": -43.06687927246094, "logps/rejected": -377.36199951171875, "loss": 0.6648, "nll_loss": 0.6625674366950989, "rewards/accuracies": 1.0, "rewards/chosen": 3.4223361015319824, "rewards/margins": 20.87748908996582, "rewards/rejected": -17.45515251159668, "step": 3598 }, { "epoch": 0.5998333333333333, "grad_norm": 61.472259521484375, "learning_rate": 7.286894341482396e-08, "logits/chosen": 2.7755250930786133, "logits/rejected": 2.811896800994873, "logps/chosen": -68.0604476928711, "logps/rejected": -73.65232849121094, "loss": 1.2003, "nll_loss": 1.173456072807312, "rewards/accuracies": 1.0, "rewards/chosen": 1.1579033136367798, "rewards/margins": 5.9966301918029785, "rewards/rejected": -4.838726997375488, "step": 3599 }, { "epoch": 0.6, "grad_norm": 40.518192291259766, "learning_rate": 7.281699277636571e-08, "logits/chosen": 2.013427257537842, "logits/rejected": 2.7991390228271484, "logps/chosen": -50.57913589477539, "logps/rejected": -181.30755615234375, "loss": 1.0704, "nll_loss": 1.0322273969650269, "rewards/accuracies": 1.0, "rewards/chosen": 0.501569390296936, "rewards/margins": 7.706267833709717, "rewards/rejected": -7.20469856262207, "step": 3600 }, { "epoch": 0.6001666666666666, "grad_norm": 213.35018920898438, "learning_rate": 7.276505005838319e-08, "logits/chosen": 2.125000238418579, "logits/rejected": 2.059920072555542, "logps/chosen": -45.538291931152344, "logps/rejected": -40.02439498901367, "loss": 4.7504, "nll_loss": 0.7851429581642151, "rewards/accuracies": 0.0, "rewards/chosen": 2.0326333045959473, "rewards/margins": -3.304771900177002, "rewards/rejected": 5.337405204772949, "step": 3601 }, { "epoch": 0.6003333333333334, "grad_norm": 554.5098266601562, "learning_rate": 7.271311527601121e-08, "logits/chosen": 2.124105930328369, "logits/rejected": 2.0348873138427734, "logps/chosen": -181.77557373046875, "logps/rejected": -59.27341079711914, "loss": 4.9032, "nll_loss": 1.3268288373947144, "rewards/accuracies": 0.0, "rewards/chosen": -2.087322950363159, "rewards/margins": -3.4733853340148926, "rewards/rejected": 1.3860623836517334, "step": 3602 }, { "epoch": 0.6005, "grad_norm": 40.106285095214844, "learning_rate": 7.266118844438233e-08, "logits/chosen": 2.6346709728240967, "logits/rejected": 2.610969066619873, "logps/chosen": -21.059524536132812, "logps/rejected": -94.81143188476562, "loss": 0.6035, "nll_loss": 0.5264880657196045, "rewards/accuracies": 1.0, "rewards/chosen": 1.1290687322616577, "rewards/margins": 3.7739081382751465, "rewards/rejected": -2.6448395252227783, "step": 3603 }, { "epoch": 0.6006666666666667, "grad_norm": 18.31675910949707, "learning_rate": 7.260926957862684e-08, "logits/chosen": 1.919634461402893, "logits/rejected": 2.0648038387298584, "logps/chosen": -155.72830200195312, "logps/rejected": -125.92581176757812, "loss": 0.8161, "nll_loss": 0.7986065745353699, "rewards/accuracies": 1.0, "rewards/chosen": 1.7709625959396362, "rewards/margins": 6.5159382820129395, "rewards/rejected": -4.744975566864014, "step": 3604 }, { "epoch": 0.6008333333333333, "grad_norm": 165.06405639648438, "learning_rate": 7.255735869387256e-08, "logits/chosen": 1.6227750778198242, "logits/rejected": 1.7374253273010254, "logps/chosen": -25.209579467773438, "logps/rejected": -16.42898941040039, "loss": 3.854, "nll_loss": 0.45835599303245544, "rewards/accuracies": 0.0, "rewards/chosen": 0.4482044279575348, "rewards/margins": -3.010256052017212, "rewards/rejected": 3.458460569381714, "step": 3605 }, { "epoch": 0.601, "grad_norm": 29.607629776000977, "learning_rate": 7.250545580524514e-08, "logits/chosen": 3.1981754302978516, "logits/rejected": 3.311565637588501, "logps/chosen": -45.49543380737305, "logps/rejected": -230.3419647216797, "loss": 0.6138, "nll_loss": 0.598624050617218, "rewards/accuracies": 1.0, "rewards/chosen": 1.4613900184631348, "rewards/margins": 10.934614181518555, "rewards/rejected": -9.473223686218262, "step": 3606 }, { "epoch": 0.6011666666666666, "grad_norm": 26.928691864013672, "learning_rate": 7.245356092786782e-08, "logits/chosen": 2.8189053535461426, "logits/rejected": 2.7840964794158936, "logps/chosen": -24.478744506835938, "logps/rejected": -118.39659118652344, "loss": 0.5183, "nll_loss": 0.4995662569999695, "rewards/accuracies": 1.0, "rewards/chosen": 1.3555939197540283, "rewards/margins": 7.378472328186035, "rewards/rejected": -6.022878170013428, "step": 3607 }, { "epoch": 0.6013333333333334, "grad_norm": 44.77596664428711, "learning_rate": 7.240167407686153e-08, "logits/chosen": 2.2938196659088135, "logits/rejected": 2.5930888652801514, "logps/chosen": -41.49333953857422, "logps/rejected": -163.90438842773438, "loss": 1.1006, "nll_loss": 1.0919300317764282, "rewards/accuracies": 1.0, "rewards/chosen": 2.1667726039886475, "rewards/margins": 8.658896446228027, "rewards/rejected": -6.492123603820801, "step": 3608 }, { "epoch": 0.6015, "grad_norm": 25.941064834594727, "learning_rate": 7.234979526734482e-08, "logits/chosen": 1.9091846942901611, "logits/rejected": 2.0328166484832764, "logps/chosen": -78.84599304199219, "logps/rejected": -98.29644775390625, "loss": 0.8662, "nll_loss": 0.8387870788574219, "rewards/accuracies": 1.0, "rewards/chosen": 2.209646701812744, "rewards/margins": 5.686075210571289, "rewards/rejected": -3.476428747177124, "step": 3609 }, { "epoch": 0.6016666666666667, "grad_norm": 44.0056266784668, "learning_rate": 7.229792451443398e-08, "logits/chosen": 0.6405710577964783, "logits/rejected": 2.5665712356567383, "logps/chosen": -15.214737892150879, "logps/rejected": -417.51171875, "loss": 0.6121, "nll_loss": 0.6085894703865051, "rewards/accuracies": 1.0, "rewards/chosen": 3.322035312652588, "rewards/margins": 9.71592903137207, "rewards/rejected": -6.393894195556641, "step": 3610 }, { "epoch": 0.6018333333333333, "grad_norm": 52.43075180053711, "learning_rate": 7.224606183324285e-08, "logits/chosen": 2.4629368782043457, "logits/rejected": 2.044691324234009, "logps/chosen": -37.59850311279297, "logps/rejected": -47.811622619628906, "loss": 0.7985, "nll_loss": 0.7094057202339172, "rewards/accuracies": 1.0, "rewards/chosen": 0.6305050253868103, "rewards/margins": 3.4774107933044434, "rewards/rejected": -2.8469057083129883, "step": 3611 }, { "epoch": 0.602, "grad_norm": 16.647974014282227, "learning_rate": 7.2194207238883e-08, "logits/chosen": 2.964488983154297, "logits/rejected": 2.8705739974975586, "logps/chosen": -136.5159912109375, "logps/rejected": -142.85369873046875, "loss": 0.7638, "nll_loss": 0.7500879764556885, "rewards/accuracies": 1.0, "rewards/chosen": 3.0317673683166504, "rewards/margins": 7.102790355682373, "rewards/rejected": -4.071022987365723, "step": 3612 }, { "epoch": 0.6021666666666666, "grad_norm": 132.092529296875, "learning_rate": 7.21423607464636e-08, "logits/chosen": 3.120265007019043, "logits/rejected": 3.073624610900879, "logps/chosen": -7.122074127197266, "logps/rejected": -12.82297420501709, "loss": 1.9348, "nll_loss": 0.5087195634841919, "rewards/accuracies": 0.0, "rewards/chosen": 2.090479612350464, "rewards/margins": -0.3519597053527832, "rewards/rejected": 2.442439317703247, "step": 3613 }, { "epoch": 0.6023333333333334, "grad_norm": 41.32734298706055, "learning_rate": 7.20905223710915e-08, "logits/chosen": 1.1813788414001465, "logits/rejected": 1.8854926824569702, "logps/chosen": -47.412296295166016, "logps/rejected": -169.8940887451172, "loss": 0.8445, "nll_loss": 0.8035982847213745, "rewards/accuracies": 1.0, "rewards/chosen": 0.5818729400634766, "rewards/margins": 5.703930854797363, "rewards/rejected": -5.122057914733887, "step": 3614 }, { "epoch": 0.6025, "grad_norm": 18.382915496826172, "learning_rate": 7.20386921278711e-08, "logits/chosen": 3.0531017780303955, "logits/rejected": 3.364624261856079, "logps/chosen": -8.158767700195312, "logps/rejected": -155.498291015625, "loss": 0.2238, "nll_loss": 0.20396916568279266, "rewards/accuracies": 1.0, "rewards/chosen": 1.216614007949829, "rewards/margins": 8.40903091430664, "rewards/rejected": -7.192416667938232, "step": 3615 }, { "epoch": 0.6026666666666667, "grad_norm": 30.11467933654785, "learning_rate": 7.198687003190457e-08, "logits/chosen": 1.6427654027938843, "logits/rejected": 1.9360789060592651, "logps/chosen": -49.25025177001953, "logps/rejected": -141.04127502441406, "loss": 0.713, "nll_loss": 0.7035750150680542, "rewards/accuracies": 1.0, "rewards/chosen": 2.042905569076538, "rewards/margins": 8.765291213989258, "rewards/rejected": -6.722385406494141, "step": 3616 }, { "epoch": 0.6028333333333333, "grad_norm": 24.604969024658203, "learning_rate": 7.193505609829155e-08, "logits/chosen": 2.8648908138275146, "logits/rejected": 3.1162147521972656, "logps/chosen": -57.14120101928711, "logps/rejected": -397.975341796875, "loss": 0.6201, "nll_loss": 0.6014862656593323, "rewards/accuracies": 1.0, "rewards/chosen": 1.3388257026672363, "rewards/margins": 7.581843852996826, "rewards/rejected": -6.24301815032959, "step": 3617 }, { "epoch": 0.603, "grad_norm": 42.6110954284668, "learning_rate": 7.188325034212943e-08, "logits/chosen": 2.66306471824646, "logits/rejected": 3.2519125938415527, "logps/chosen": -41.53601837158203, "logps/rejected": -268.48077392578125, "loss": 0.9303, "nll_loss": 0.9230226874351501, "rewards/accuracies": 1.0, "rewards/chosen": 2.499350070953369, "rewards/margins": 8.394397735595703, "rewards/rejected": -5.895047187805176, "step": 3618 }, { "epoch": 0.6031666666666666, "grad_norm": 31.051305770874023, "learning_rate": 7.183145277851312e-08, "logits/chosen": 2.2718093395233154, "logits/rejected": 2.2843332290649414, "logps/chosen": -12.31477165222168, "logps/rejected": -82.03970336914062, "loss": 0.3909, "nll_loss": 0.34207701683044434, "rewards/accuracies": 1.0, "rewards/chosen": 0.6153383255004883, "rewards/margins": 4.808586120605469, "rewards/rejected": -4.1932477951049805, "step": 3619 }, { "epoch": 0.6033333333333334, "grad_norm": 22.896381378173828, "learning_rate": 7.177966342253523e-08, "logits/chosen": 1.3911230564117432, "logits/rejected": 1.218434453010559, "logps/chosen": -137.17910766601562, "logps/rejected": -59.2315673828125, "loss": 0.8483, "nll_loss": 0.7929428815841675, "rewards/accuracies": 1.0, "rewards/chosen": 1.5087158679962158, "rewards/margins": 4.378592491149902, "rewards/rejected": -2.8698763847351074, "step": 3620 }, { "epoch": 0.6035, "grad_norm": 26.18180274963379, "learning_rate": 7.17278822892859e-08, "logits/chosen": 1.7284389734268188, "logits/rejected": 0.731345534324646, "logps/chosen": -303.975341796875, "logps/rejected": -144.79296875, "loss": 0.8887, "nll_loss": 0.8538632392883301, "rewards/accuracies": 1.0, "rewards/chosen": 0.787280261516571, "rewards/margins": 5.821710109710693, "rewards/rejected": -5.034430027008057, "step": 3621 }, { "epoch": 0.6036666666666667, "grad_norm": 201.04742431640625, "learning_rate": 7.167610939385294e-08, "logits/chosen": 2.9507203102111816, "logits/rejected": 2.998957633972168, "logps/chosen": -33.458717346191406, "logps/rejected": -35.32908630371094, "loss": 2.8849, "nll_loss": 0.6691743731498718, "rewards/accuracies": 0.0, "rewards/chosen": 1.1295826435089111, "rewards/margins": -1.6015117168426514, "rewards/rejected": 2.7310943603515625, "step": 3622 }, { "epoch": 0.6038333333333333, "grad_norm": 23.57963752746582, "learning_rate": 7.162434475132169e-08, "logits/chosen": 2.6712255477905273, "logits/rejected": 2.695455312728882, "logps/chosen": -61.09794616699219, "logps/rejected": -268.02935791015625, "loss": 0.6535, "nll_loss": 0.6431362628936768, "rewards/accuracies": 1.0, "rewards/chosen": 1.8528947830200195, "rewards/margins": 11.497267723083496, "rewards/rejected": -9.644372940063477, "step": 3623 }, { "epoch": 0.604, "grad_norm": 76.56926727294922, "learning_rate": 7.157258837677513e-08, "logits/chosen": 2.8961408138275146, "logits/rejected": 2.873589038848877, "logps/chosen": -63.77362060546875, "logps/rejected": -65.51968383789062, "loss": 1.4958, "nll_loss": 1.3015025854110718, "rewards/accuracies": 1.0, "rewards/chosen": -0.6955803036689758, "rewards/margins": 2.4444477558135986, "rewards/rejected": -3.1400279998779297, "step": 3624 }, { "epoch": 0.6041666666666666, "grad_norm": 24.923601150512695, "learning_rate": 7.152084028529388e-08, "logits/chosen": 1.0163919925689697, "logits/rejected": 1.6556941270828247, "logps/chosen": -78.08074951171875, "logps/rejected": -302.5110778808594, "loss": 0.915, "nll_loss": 0.9079155325889587, "rewards/accuracies": 1.0, "rewards/chosen": 2.299456834793091, "rewards/margins": 9.824433326721191, "rewards/rejected": -7.5249762535095215, "step": 3625 }, { "epoch": 0.6043333333333333, "grad_norm": 24.413002014160156, "learning_rate": 7.146910049195603e-08, "logits/chosen": 2.829202651977539, "logits/rejected": 2.7318265438079834, "logps/chosen": -49.53534698486328, "logps/rejected": -109.37836456298828, "loss": 0.5886, "nll_loss": 0.5759923458099365, "rewards/accuracies": 1.0, "rewards/chosen": 2.3244895935058594, "rewards/margins": 7.018543720245361, "rewards/rejected": -4.694054126739502, "step": 3626 }, { "epoch": 0.6045, "grad_norm": 32.97405242919922, "learning_rate": 7.141736901183735e-08, "logits/chosen": 3.024613380432129, "logits/rejected": 2.9859464168548584, "logps/chosen": -50.148841857910156, "logps/rejected": -201.14697265625, "loss": 0.7902, "nll_loss": 0.7598310708999634, "rewards/accuracies": 1.0, "rewards/chosen": 0.7667927742004395, "rewards/margins": 7.49693489074707, "rewards/rejected": -6.730142116546631, "step": 3627 }, { "epoch": 0.6046666666666667, "grad_norm": 19.43243980407715, "learning_rate": 7.136564586001112e-08, "logits/chosen": 2.5807905197143555, "logits/rejected": 2.6099584102630615, "logps/chosen": -8.382046699523926, "logps/rejected": -114.14005279541016, "loss": 0.2021, "nll_loss": 0.194931298494339, "rewards/accuracies": 1.0, "rewards/chosen": 2.4337406158447266, "rewards/margins": 8.658075332641602, "rewards/rejected": -6.224334239959717, "step": 3628 }, { "epoch": 0.6048333333333333, "grad_norm": 44.362972259521484, "learning_rate": 7.131393105154826e-08, "logits/chosen": 1.385154366493225, "logits/rejected": 1.1181601285934448, "logps/chosen": -72.1125717163086, "logps/rejected": -64.7306137084961, "loss": 1.0713, "nll_loss": 1.030179500579834, "rewards/accuracies": 1.0, "rewards/chosen": 1.9648491144180298, "rewards/margins": 5.004058837890625, "rewards/rejected": -3.0392098426818848, "step": 3629 }, { "epoch": 0.605, "grad_norm": 40.08494567871094, "learning_rate": 7.126222460151719e-08, "logits/chosen": 2.915755033493042, "logits/rejected": 2.759742498397827, "logps/chosen": -13.339128494262695, "logps/rejected": -180.56715393066406, "loss": 0.5111, "nll_loss": 0.4940418303012848, "rewards/accuracies": 1.0, "rewards/chosen": 1.4442955255508423, "rewards/margins": 7.5595622062683105, "rewards/rejected": -6.115266799926758, "step": 3630 }, { "epoch": 0.6051666666666666, "grad_norm": 17.488943099975586, "learning_rate": 7.121052652498396e-08, "logits/chosen": 2.1514861583709717, "logits/rejected": 2.052515983581543, "logps/chosen": -135.38540649414062, "logps/rejected": -78.43885040283203, "loss": 0.8776, "nll_loss": 0.862327516078949, "rewards/accuracies": 1.0, "rewards/chosen": 2.6968109607696533, "rewards/margins": 6.768260955810547, "rewards/rejected": -4.071450233459473, "step": 3631 }, { "epoch": 0.6053333333333333, "grad_norm": 29.492502212524414, "learning_rate": 7.11588368370121e-08, "logits/chosen": 2.3720831871032715, "logits/rejected": 2.374563455581665, "logps/chosen": -1.7256025075912476, "logps/rejected": -75.19993591308594, "loss": 0.1938, "nll_loss": 0.10150604695081711, "rewards/accuracies": 1.0, "rewards/chosen": 0.2343284785747528, "rewards/margins": 3.5081124305725098, "rewards/rejected": -3.2737839221954346, "step": 3632 }, { "epoch": 0.6055, "grad_norm": 24.69500160217285, "learning_rate": 7.11071555526628e-08, "logits/chosen": 2.395517349243164, "logits/rejected": 2.177917957305908, "logps/chosen": -90.37214660644531, "logps/rejected": -89.07959747314453, "loss": 0.8406, "nll_loss": 0.8291022777557373, "rewards/accuracies": 1.0, "rewards/chosen": 1.7808914184570312, "rewards/margins": 9.170495986938477, "rewards/rejected": -7.389604568481445, "step": 3633 }, { "epoch": 0.6056666666666667, "grad_norm": 17.948760986328125, "learning_rate": 7.10554826869947e-08, "logits/chosen": 2.609377145767212, "logits/rejected": 2.6217563152313232, "logps/chosen": -121.2987060546875, "logps/rejected": -82.64254760742188, "loss": 0.6219, "nll_loss": 0.5721637010574341, "rewards/accuracies": 1.0, "rewards/chosen": 2.8989624977111816, "rewards/margins": 5.423725605010986, "rewards/rejected": -2.5247631072998047, "step": 3634 }, { "epoch": 0.6058333333333333, "grad_norm": 204.41366577148438, "learning_rate": 7.100381825506407e-08, "logits/chosen": 1.5710208415985107, "logits/rejected": 2.7373461723327637, "logps/chosen": -46.41310501098633, "logps/rejected": -405.54705810546875, "loss": 1.11, "nll_loss": 1.0793743133544922, "rewards/accuracies": 1.0, "rewards/chosen": 0.7228168845176697, "rewards/margins": 9.158534049987793, "rewards/rejected": -8.435717582702637, "step": 3635 }, { "epoch": 0.606, "grad_norm": 58.46973419189453, "learning_rate": 7.095216227192466e-08, "logits/chosen": 2.7099618911743164, "logits/rejected": 2.5912420749664307, "logps/chosen": -19.383235931396484, "logps/rejected": -35.25331115722656, "loss": 0.9048, "nll_loss": 0.8427494168281555, "rewards/accuracies": 1.0, "rewards/chosen": 1.8020492792129517, "rewards/margins": 4.363816738128662, "rewards/rejected": -2.561767578125, "step": 3636 }, { "epoch": 0.6061666666666666, "grad_norm": 75.87141418457031, "learning_rate": 7.090051475262782e-08, "logits/chosen": 3.056825637817383, "logits/rejected": 2.961982488632202, "logps/chosen": -104.32413482666016, "logps/rejected": -107.22933197021484, "loss": 1.2602, "nll_loss": 1.1217650175094604, "rewards/accuracies": 1.0, "rewards/chosen": -0.015474701300263405, "rewards/margins": 2.806004285812378, "rewards/rejected": -2.821479082107544, "step": 3637 }, { "epoch": 0.6063333333333333, "grad_norm": 46.85861587524414, "learning_rate": 7.084887571222233e-08, "logits/chosen": 1.9601205587387085, "logits/rejected": 2.3382818698883057, "logps/chosen": -30.07651138305664, "logps/rejected": -173.91561889648438, "loss": 0.8206, "nll_loss": 0.8128786683082581, "rewards/accuracies": 1.0, "rewards/chosen": 2.513716220855713, "rewards/margins": 8.169933319091797, "rewards/rejected": -5.656217575073242, "step": 3638 }, { "epoch": 0.6065, "grad_norm": 25.975446701049805, "learning_rate": 7.079724516575464e-08, "logits/chosen": 2.488003730773926, "logits/rejected": 3.0525450706481934, "logps/chosen": -98.09661102294922, "logps/rejected": -228.9568328857422, "loss": 1.0489, "nll_loss": 1.0325958728790283, "rewards/accuracies": 1.0, "rewards/chosen": 2.4913978576660156, "rewards/margins": 6.583750247955322, "rewards/rejected": -4.092352390289307, "step": 3639 }, { "epoch": 0.6066666666666667, "grad_norm": 56.210514068603516, "learning_rate": 7.07456231282686e-08, "logits/chosen": 1.937324047088623, "logits/rejected": 2.710073471069336, "logps/chosen": -30.47852325439453, "logps/rejected": -169.72396850585938, "loss": 0.6834, "nll_loss": 0.6484792828559875, "rewards/accuracies": 1.0, "rewards/chosen": 0.6922375559806824, "rewards/margins": 6.303104877471924, "rewards/rejected": -5.610867500305176, "step": 3640 }, { "epoch": 0.6068333333333333, "grad_norm": 24.530092239379883, "learning_rate": 7.069400961480565e-08, "logits/chosen": 2.346210479736328, "logits/rejected": 2.353773593902588, "logps/chosen": -130.2281494140625, "logps/rejected": -216.41165161132812, "loss": 0.7217, "nll_loss": 0.6818225979804993, "rewards/accuracies": 1.0, "rewards/chosen": 0.434988409280777, "rewards/margins": 8.694268226623535, "rewards/rejected": -8.25928020477295, "step": 3641 }, { "epoch": 0.607, "grad_norm": 21.09463882446289, "learning_rate": 7.064240464040471e-08, "logits/chosen": 2.8979923725128174, "logits/rejected": 2.7816600799560547, "logps/chosen": -24.808815002441406, "logps/rejected": -258.1352844238281, "loss": 0.4004, "nll_loss": 0.4001421630382538, "rewards/accuracies": 1.0, "rewards/chosen": 6.086272716522217, "rewards/margins": 14.847009658813477, "rewards/rejected": -8.760736465454102, "step": 3642 }, { "epoch": 0.6071666666666666, "grad_norm": 23.929065704345703, "learning_rate": 7.05908082201023e-08, "logits/chosen": 1.3452357053756714, "logits/rejected": 1.890674114227295, "logps/chosen": -94.2401351928711, "logps/rejected": -220.83987426757812, "loss": 0.8388, "nll_loss": 0.8194793462753296, "rewards/accuracies": 1.0, "rewards/chosen": 1.205889105796814, "rewards/margins": 10.756897926330566, "rewards/rejected": -9.551009178161621, "step": 3643 }, { "epoch": 0.6073333333333333, "grad_norm": 17.227214813232422, "learning_rate": 7.053922036893228e-08, "logits/chosen": 1.8865128755569458, "logits/rejected": 2.0891306400299072, "logps/chosen": -65.5276870727539, "logps/rejected": -483.9277648925781, "loss": 0.5923, "nll_loss": 0.5903396010398865, "rewards/accuracies": 1.0, "rewards/chosen": 3.524116039276123, "rewards/margins": 14.197513580322266, "rewards/rejected": -10.6733980178833, "step": 3644 }, { "epoch": 0.6075, "grad_norm": 158.52552795410156, "learning_rate": 7.048764110192617e-08, "logits/chosen": 2.8956847190856934, "logits/rejected": 2.890279531478882, "logps/chosen": -37.529502868652344, "logps/rejected": -15.395574569702148, "loss": 2.2542, "nll_loss": 0.42647162079811096, "rewards/accuracies": 0.0, "rewards/chosen": 1.388243556022644, "rewards/margins": -1.0764464139938354, "rewards/rejected": 2.4646899700164795, "step": 3645 }, { "epoch": 0.6076666666666667, "grad_norm": 167.91993713378906, "learning_rate": 7.043607043411292e-08, "logits/chosen": 2.7309935092926025, "logits/rejected": 2.847529172897339, "logps/chosen": -137.64129638671875, "logps/rejected": -195.0811309814453, "loss": 2.896, "nll_loss": 2.1506452560424805, "rewards/accuracies": 1.0, "rewards/chosen": -4.771069526672363, "rewards/margins": 5.3805341720581055, "rewards/rejected": -10.151603698730469, "step": 3646 }, { "epoch": 0.6078333333333333, "grad_norm": 18.456697463989258, "learning_rate": 7.0384508380519e-08, "logits/chosen": 1.5957539081573486, "logits/rejected": 2.216740608215332, "logps/chosen": -100.24269104003906, "logps/rejected": -202.60650634765625, "loss": 0.6844, "nll_loss": 0.6773154139518738, "rewards/accuracies": 1.0, "rewards/chosen": 2.2593352794647217, "rewards/margins": 10.31242847442627, "rewards/rejected": -8.053092956542969, "step": 3647 }, { "epoch": 0.608, "grad_norm": 28.68613052368164, "learning_rate": 7.033295495616833e-08, "logits/chosen": 2.3823156356811523, "logits/rejected": 2.4028782844543457, "logps/chosen": -70.19139099121094, "logps/rejected": -120.3646240234375, "loss": 0.9266, "nll_loss": 0.9115763902664185, "rewards/accuracies": 1.0, "rewards/chosen": 1.5201866626739502, "rewards/margins": 8.309588432312012, "rewards/rejected": -6.789402008056641, "step": 3648 }, { "epoch": 0.6081666666666666, "grad_norm": 34.0610466003418, "learning_rate": 7.028141017608233e-08, "logits/chosen": 2.697512626647949, "logits/rejected": 2.6761841773986816, "logps/chosen": -129.94671630859375, "logps/rejected": -49.45473098754883, "loss": 1.2067, "nll_loss": 1.1706911325454712, "rewards/accuracies": 1.0, "rewards/chosen": 1.2213150262832642, "rewards/margins": 5.082150936126709, "rewards/rejected": -3.8608357906341553, "step": 3649 }, { "epoch": 0.6083333333333333, "grad_norm": 25.77027130126953, "learning_rate": 7.022987405527996e-08, "logits/chosen": 1.6444506645202637, "logits/rejected": 2.3419718742370605, "logps/chosen": -50.77569580078125, "logps/rejected": -165.5487518310547, "loss": 0.7107, "nll_loss": 0.6955574154853821, "rewards/accuracies": 1.0, "rewards/chosen": 1.4779655933380127, "rewards/margins": 9.080340385437012, "rewards/rejected": -7.60237455368042, "step": 3650 }, { "epoch": 0.6085, "grad_norm": 16.80785369873047, "learning_rate": 7.017834660877755e-08, "logits/chosen": 1.679743766784668, "logits/rejected": 1.675132393836975, "logps/chosen": -161.21913146972656, "logps/rejected": -172.08978271484375, "loss": 0.6374, "nll_loss": 0.6224676966667175, "rewards/accuracies": 1.0, "rewards/chosen": 1.6723037958145142, "rewards/margins": 7.303398132324219, "rewards/rejected": -5.631094455718994, "step": 3651 }, { "epoch": 0.6086666666666667, "grad_norm": 31.920948028564453, "learning_rate": 7.012682785158902e-08, "logits/chosen": 2.9572484493255615, "logits/rejected": 3.20162034034729, "logps/chosen": -105.06398010253906, "logps/rejected": -281.3039855957031, "loss": 1.1688, "nll_loss": 1.1419998407363892, "rewards/accuracies": 1.0, "rewards/chosen": 0.853334903717041, "rewards/margins": 10.435850143432617, "rewards/rejected": -9.582514762878418, "step": 3652 }, { "epoch": 0.6088333333333333, "grad_norm": 54.29756164550781, "learning_rate": 7.007531779872562e-08, "logits/chosen": 2.3458659648895264, "logits/rejected": 2.4868853092193604, "logps/chosen": -55.23651885986328, "logps/rejected": -136.26080322265625, "loss": 1.2545, "nll_loss": 1.2274781465530396, "rewards/accuracies": 1.0, "rewards/chosen": 1.5328301191329956, "rewards/margins": 5.593867778778076, "rewards/rejected": -4.061037540435791, "step": 3653 }, { "epoch": 0.609, "grad_norm": 42.28890609741211, "learning_rate": 7.002381646519624e-08, "logits/chosen": 3.4242894649505615, "logits/rejected": 3.3934075832366943, "logps/chosen": -75.96045684814453, "logps/rejected": -224.12477111816406, "loss": 1.3429, "nll_loss": 1.30966317653656, "rewards/accuracies": 1.0, "rewards/chosen": 0.628858208656311, "rewards/margins": 8.679987907409668, "rewards/rejected": -8.051129341125488, "step": 3654 }, { "epoch": 0.6091666666666666, "grad_norm": 29.17352867126465, "learning_rate": 6.997232386600705e-08, "logits/chosen": 1.6982756853103638, "logits/rejected": 2.2269411087036133, "logps/chosen": -42.64487838745117, "logps/rejected": -283.2499084472656, "loss": 0.6469, "nll_loss": 0.6271305084228516, "rewards/accuracies": 1.0, "rewards/chosen": 1.2354649305343628, "rewards/margins": 7.836528778076172, "rewards/rejected": -6.6010637283325195, "step": 3655 }, { "epoch": 0.6093333333333333, "grad_norm": 23.52242660522461, "learning_rate": 6.992084001616181e-08, "logits/chosen": 2.6548912525177, "logits/rejected": 2.6495656967163086, "logps/chosen": -78.47167205810547, "logps/rejected": -194.0419464111328, "loss": 0.9656, "nll_loss": 0.9569717049598694, "rewards/accuracies": 1.0, "rewards/chosen": 3.3532166481018066, "rewards/margins": 7.932734966278076, "rewards/rejected": -4.5795183181762695, "step": 3656 }, { "epoch": 0.6095, "grad_norm": 48.22144317626953, "learning_rate": 6.986936493066165e-08, "logits/chosen": 2.2688558101654053, "logits/rejected": 2.5031869411468506, "logps/chosen": -34.40135192871094, "logps/rejected": -132.61285400390625, "loss": 0.8908, "nll_loss": 0.8390573859214783, "rewards/accuracies": 1.0, "rewards/chosen": 0.5958915948867798, "rewards/margins": 4.642679691314697, "rewards/rejected": -4.046788215637207, "step": 3657 }, { "epoch": 0.6096666666666667, "grad_norm": 37.468326568603516, "learning_rate": 6.981789862450517e-08, "logits/chosen": 2.759079933166504, "logits/rejected": 2.7090837955474854, "logps/chosen": -27.658329010009766, "logps/rejected": -166.54940795898438, "loss": 0.6289, "nll_loss": 0.6012679934501648, "rewards/accuracies": 1.0, "rewards/chosen": 0.8588787317276001, "rewards/margins": 7.6332011222839355, "rewards/rejected": -6.774322509765625, "step": 3658 }, { "epoch": 0.6098333333333333, "grad_norm": 29.604589462280273, "learning_rate": 6.976644111268843e-08, "logits/chosen": 2.9158122539520264, "logits/rejected": 3.0275347232818604, "logps/chosen": -12.644824981689453, "logps/rejected": -244.7305908203125, "loss": 0.4301, "nll_loss": 0.4078976511955261, "rewards/accuracies": 1.0, "rewards/chosen": 4.357761383056641, "rewards/margins": 7.652204513549805, "rewards/rejected": -3.294442892074585, "step": 3659 }, { "epoch": 0.61, "grad_norm": 16.530174255371094, "learning_rate": 6.971499241020494e-08, "logits/chosen": 1.913015604019165, "logits/rejected": 2.526136636734009, "logps/chosen": -53.41672897338867, "logps/rejected": -130.5279998779297, "loss": 0.4777, "nll_loss": 0.46856778860092163, "rewards/accuracies": 1.0, "rewards/chosen": 2.2026264667510986, "rewards/margins": 8.086763381958008, "rewards/rejected": -5.884137153625488, "step": 3660 }, { "epoch": 0.6101666666666666, "grad_norm": 27.97361946105957, "learning_rate": 6.966355253204557e-08, "logits/chosen": 1.3606970310211182, "logits/rejected": 2.262775421142578, "logps/chosen": -52.758026123046875, "logps/rejected": -331.4099426269531, "loss": 0.7539, "nll_loss": 0.7227126359939575, "rewards/accuracies": 1.0, "rewards/chosen": 1.8256120681762695, "rewards/margins": 5.348259449005127, "rewards/rejected": -3.5226473808288574, "step": 3661 }, { "epoch": 0.6103333333333333, "grad_norm": 36.31145477294922, "learning_rate": 6.961212149319869e-08, "logits/chosen": 2.7171690464019775, "logits/rejected": 2.6445748805999756, "logps/chosen": -109.10874938964844, "logps/rejected": -65.96485900878906, "loss": 1.4648, "nll_loss": 1.347021460533142, "rewards/accuracies": 1.0, "rewards/chosen": 2.4006881713867188, "rewards/margins": 3.9666075706481934, "rewards/rejected": -1.5659195184707642, "step": 3662 }, { "epoch": 0.6105, "grad_norm": 105.6172103881836, "learning_rate": 6.956069930865003e-08, "logits/chosen": 2.0521082878112793, "logits/rejected": 2.9778153896331787, "logps/chosen": -22.803308486938477, "logps/rejected": -242.46484375, "loss": 1.5496, "nll_loss": 1.5202206373214722, "rewards/accuracies": 1.0, "rewards/chosen": 0.7619932293891907, "rewards/margins": 8.986566543579102, "rewards/rejected": -8.224573135375977, "step": 3663 }, { "epoch": 0.6106666666666667, "grad_norm": 157.49945068359375, "learning_rate": 6.950928599338286e-08, "logits/chosen": 2.590123176574707, "logits/rejected": 2.5428426265716553, "logps/chosen": -273.78741455078125, "logps/rejected": -285.50518798828125, "loss": 2.5537, "nll_loss": 1.6394455432891846, "rewards/accuracies": 1.0, "rewards/chosen": -5.520240783691406, "rewards/margins": 3.3888397216796875, "rewards/rejected": -8.909080505371094, "step": 3664 }, { "epoch": 0.6108333333333333, "grad_norm": 33.9085578918457, "learning_rate": 6.945788156237771e-08, "logits/chosen": 2.353184461593628, "logits/rejected": 2.544186592102051, "logps/chosen": -46.17906951904297, "logps/rejected": -121.11131286621094, "loss": 0.9248, "nll_loss": 0.8880590796470642, "rewards/accuracies": 1.0, "rewards/chosen": 0.5565814971923828, "rewards/margins": 7.048285961151123, "rewards/rejected": -6.49170446395874, "step": 3665 }, { "epoch": 0.611, "grad_norm": 366.58978271484375, "learning_rate": 6.940648603061263e-08, "logits/chosen": 1.5558463335037231, "logits/rejected": 2.250131130218506, "logps/chosen": -276.982666015625, "logps/rejected": -418.80084228515625, "loss": 3.0863, "nll_loss": 1.648706316947937, "rewards/accuracies": 1.0, "rewards/chosen": -6.13446044921875, "rewards/margins": 0.34706735610961914, "rewards/rejected": -6.481527805328369, "step": 3666 }, { "epoch": 0.6111666666666666, "grad_norm": 23.260643005371094, "learning_rate": 6.935509941306306e-08, "logits/chosen": 1.648485779762268, "logits/rejected": 2.057781219482422, "logps/chosen": -63.85439682006836, "logps/rejected": -182.93930053710938, "loss": 0.6927, "nll_loss": 0.672151505947113, "rewards/accuracies": 1.0, "rewards/chosen": 1.1558650732040405, "rewards/margins": 8.508193016052246, "rewards/rejected": -7.352327823638916, "step": 3667 }, { "epoch": 0.6113333333333333, "grad_norm": 240.0530242919922, "learning_rate": 6.930372172470178e-08, "logits/chosen": 2.1651294231414795, "logits/rejected": 2.3240127563476562, "logps/chosen": -91.13597106933594, "logps/rejected": -165.2379150390625, "loss": 2.8997, "nll_loss": 2.0712718963623047, "rewards/accuracies": 1.0, "rewards/chosen": -3.0603485107421875, "rewards/margins": 0.4567902088165283, "rewards/rejected": -3.517138719558716, "step": 3668 }, { "epoch": 0.6115, "grad_norm": 29.96108055114746, "learning_rate": 6.925235298049905e-08, "logits/chosen": 2.7372305393218994, "logits/rejected": 2.588409423828125, "logps/chosen": -10.91219711303711, "logps/rejected": -87.15837097167969, "loss": 0.4123, "nll_loss": 0.4041553735733032, "rewards/accuracies": 1.0, "rewards/chosen": 2.4153530597686768, "rewards/margins": 8.110455513000488, "rewards/rejected": -5.695102214813232, "step": 3669 }, { "epoch": 0.6116666666666667, "grad_norm": 27.216814041137695, "learning_rate": 6.920099319542249e-08, "logits/chosen": 3.2201218605041504, "logits/rejected": 3.0773987770080566, "logps/chosen": -333.8049011230469, "logps/rejected": -153.33233642578125, "loss": 1.0506, "nll_loss": 1.0024172067642212, "rewards/accuracies": 1.0, "rewards/chosen": 1.8377197980880737, "rewards/margins": 4.729157447814941, "rewards/rejected": -2.8914377689361572, "step": 3670 }, { "epoch": 0.6118333333333333, "grad_norm": 24.562986373901367, "learning_rate": 6.91496423844371e-08, "logits/chosen": 2.6834518909454346, "logits/rejected": 2.63567852973938, "logps/chosen": -44.235965728759766, "logps/rejected": -129.527587890625, "loss": 0.5806, "nll_loss": 0.5671277642250061, "rewards/accuracies": 1.0, "rewards/chosen": 1.6501710414886475, "rewards/margins": 8.182395935058594, "rewards/rejected": -6.532225131988525, "step": 3671 }, { "epoch": 0.612, "grad_norm": 35.53962326049805, "learning_rate": 6.909830056250527e-08, "logits/chosen": 1.8222355842590332, "logits/rejected": 2.327230453491211, "logps/chosen": -49.08098602294922, "logps/rejected": -124.40274047851562, "loss": 0.9589, "nll_loss": 0.9260563850402832, "rewards/accuracies": 1.0, "rewards/chosen": 1.3065941333770752, "rewards/margins": 5.24897575378418, "rewards/rejected": -3.9423813819885254, "step": 3672 }, { "epoch": 0.6121666666666666, "grad_norm": 42.90275955200195, "learning_rate": 6.904696774458678e-08, "logits/chosen": 2.669217109680176, "logits/rejected": 2.583223819732666, "logps/chosen": -55.36179733276367, "logps/rejected": -131.72669982910156, "loss": 1.0987, "nll_loss": 1.085525631904602, "rewards/accuracies": 1.0, "rewards/chosen": 2.3946826457977295, "rewards/margins": 6.924355506896973, "rewards/rejected": -4.529673099517822, "step": 3673 }, { "epoch": 0.6123333333333333, "grad_norm": 54.2037239074707, "learning_rate": 6.899564394563877e-08, "logits/chosen": 2.6108930110931396, "logits/rejected": 3.0485758781433105, "logps/chosen": -15.036094665527344, "logps/rejected": -352.1548767089844, "loss": 0.6316, "nll_loss": 0.6265038847923279, "rewards/accuracies": 1.0, "rewards/chosen": 2.9729340076446533, "rewards/margins": 8.910140991210938, "rewards/rejected": -5.937207221984863, "step": 3674 }, { "epoch": 0.6125, "grad_norm": 29.886913299560547, "learning_rate": 6.894432918061577e-08, "logits/chosen": 1.1736280918121338, "logits/rejected": 1.8891912698745728, "logps/chosen": -18.12579345703125, "logps/rejected": -229.2382049560547, "loss": 0.4758, "nll_loss": 0.4531448781490326, "rewards/accuracies": 1.0, "rewards/chosen": 1.024741530418396, "rewards/margins": 11.190171241760254, "rewards/rejected": -10.165430068969727, "step": 3675 }, { "epoch": 0.6126666666666667, "grad_norm": 31.48384666442871, "learning_rate": 6.889302346446968e-08, "logits/chosen": 2.7109310626983643, "logits/rejected": 2.841075897216797, "logps/chosen": -19.73653793334961, "logps/rejected": -141.99163818359375, "loss": 0.5419, "nll_loss": 0.5334200263023376, "rewards/accuracies": 1.0, "rewards/chosen": 2.2449169158935547, "rewards/margins": 8.35711669921875, "rewards/rejected": -6.112199306488037, "step": 3676 }, { "epoch": 0.6128333333333333, "grad_norm": 19.137575149536133, "learning_rate": 6.884172681214975e-08, "logits/chosen": 2.6792287826538086, "logits/rejected": 2.9975335597991943, "logps/chosen": -71.66201782226562, "logps/rejected": -549.74072265625, "loss": 0.7137, "nll_loss": 0.7095248699188232, "rewards/accuracies": 1.0, "rewards/chosen": 2.768749952316284, "rewards/margins": 11.804819107055664, "rewards/rejected": -9.0360689163208, "step": 3677 }, { "epoch": 0.613, "grad_norm": 29.11016845703125, "learning_rate": 6.879043923860258e-08, "logits/chosen": 2.093320608139038, "logits/rejected": 2.3025636672973633, "logps/chosen": -64.9154281616211, "logps/rejected": -175.5482635498047, "loss": 0.798, "nll_loss": 0.7821136713027954, "rewards/accuracies": 1.0, "rewards/chosen": 1.487952470779419, "rewards/margins": 7.833714485168457, "rewards/rejected": -6.345762252807617, "step": 3678 }, { "epoch": 0.6131666666666666, "grad_norm": 29.141664505004883, "learning_rate": 6.873916075877216e-08, "logits/chosen": 2.2001230716705322, "logits/rejected": 2.6877260208129883, "logps/chosen": -20.85767936706543, "logps/rejected": -69.58525085449219, "loss": 0.518, "nll_loss": 0.4966113567352295, "rewards/accuracies": 1.0, "rewards/chosen": 1.1946085691452026, "rewards/margins": 7.12880802154541, "rewards/rejected": -5.934199333190918, "step": 3679 }, { "epoch": 0.6133333333333333, "grad_norm": 32.321624755859375, "learning_rate": 6.868789138759976e-08, "logits/chosen": 2.4476470947265625, "logits/rejected": 2.3814003467559814, "logps/chosen": -18.275604248046875, "logps/rejected": -48.90188217163086, "loss": 0.5256, "nll_loss": 0.4939351975917816, "rewards/accuracies": 1.0, "rewards/chosen": 1.5976876020431519, "rewards/margins": 5.281885147094727, "rewards/rejected": -3.6841976642608643, "step": 3680 }, { "epoch": 0.6135, "grad_norm": 43.340850830078125, "learning_rate": 6.863663114002411e-08, "logits/chosen": 0.6258922219276428, "logits/rejected": 2.1832590103149414, "logps/chosen": -42.24757766723633, "logps/rejected": -330.28228759765625, "loss": 0.8335, "nll_loss": 0.7823626399040222, "rewards/accuracies": 1.0, "rewards/chosen": 0.14606666564941406, "rewards/margins": 7.828197002410889, "rewards/rejected": -7.682130336761475, "step": 3681 }, { "epoch": 0.6136666666666667, "grad_norm": 31.018693923950195, "learning_rate": 6.858538003098116e-08, "logits/chosen": 3.150805950164795, "logits/rejected": 3.091768980026245, "logps/chosen": -146.65451049804688, "logps/rejected": -177.08375549316406, "loss": 1.2627, "nll_loss": 1.232391119003296, "rewards/accuracies": 1.0, "rewards/chosen": 1.482997179031372, "rewards/margins": 5.363182067871094, "rewards/rejected": -3.8801848888397217, "step": 3682 }, { "epoch": 0.6138333333333333, "grad_norm": 25.393592834472656, "learning_rate": 6.85341380754043e-08, "logits/chosen": 2.772637128829956, "logits/rejected": 2.8995418548583984, "logps/chosen": -65.38636779785156, "logps/rejected": -196.30279541015625, "loss": 0.832, "nll_loss": 0.8276756405830383, "rewards/accuracies": 1.0, "rewards/chosen": 2.829735040664673, "rewards/margins": 10.247499465942383, "rewards/rejected": -7.417764186859131, "step": 3683 }, { "epoch": 0.614, "grad_norm": 32.826690673828125, "learning_rate": 6.848290528822415e-08, "logits/chosen": 2.298955202102661, "logits/rejected": 2.650351047515869, "logps/chosen": -67.27523803710938, "logps/rejected": -394.5826416015625, "loss": 1.2401, "nll_loss": 1.2231860160827637, "rewards/accuracies": 1.0, "rewards/chosen": 1.3986037969589233, "rewards/margins": 7.9628496170043945, "rewards/rejected": -6.564245700836182, "step": 3684 }, { "epoch": 0.6141666666666666, "grad_norm": 32.990074157714844, "learning_rate": 6.843168168436877e-08, "logits/chosen": 1.9929921627044678, "logits/rejected": 1.9232778549194336, "logps/chosen": -70.96842956542969, "logps/rejected": -137.37939453125, "loss": 0.889, "nll_loss": 0.8761533498764038, "rewards/accuracies": 1.0, "rewards/chosen": 1.6220229864120483, "rewards/margins": 10.106666564941406, "rewards/rejected": -8.484643936157227, "step": 3685 }, { "epoch": 0.6143333333333333, "grad_norm": 123.28437042236328, "learning_rate": 6.838046727876346e-08, "logits/chosen": 2.079385280609131, "logits/rejected": 2.1689155101776123, "logps/chosen": -31.63149070739746, "logps/rejected": -62.50745391845703, "loss": 2.294, "nll_loss": 2.259392261505127, "rewards/accuracies": 1.0, "rewards/chosen": 0.8612085580825806, "rewards/margins": 5.580338001251221, "rewards/rejected": -4.71912956237793, "step": 3686 }, { "epoch": 0.6145, "grad_norm": 35.605018615722656, "learning_rate": 6.832926208633086e-08, "logits/chosen": 1.5751601457595825, "logits/rejected": 1.927137851715088, "logps/chosen": -18.101776123046875, "logps/rejected": -189.61248779296875, "loss": 0.582, "nll_loss": 0.5656803846359253, "rewards/accuracies": 1.0, "rewards/chosen": 1.6713752746582031, "rewards/margins": 6.840046405792236, "rewards/rejected": -5.168671131134033, "step": 3687 }, { "epoch": 0.6146666666666667, "grad_norm": 31.856653213500977, "learning_rate": 6.827806612199097e-08, "logits/chosen": 1.0233478546142578, "logits/rejected": 2.4396166801452637, "logps/chosen": -51.51582336425781, "logps/rejected": -455.72344970703125, "loss": 0.8781, "nll_loss": 0.8585968613624573, "rewards/accuracies": 1.0, "rewards/chosen": 1.1818684339523315, "rewards/margins": 10.018309593200684, "rewards/rejected": -8.836441040039062, "step": 3688 }, { "epoch": 0.6148333333333333, "grad_norm": 32.96754837036133, "learning_rate": 6.822687940066101e-08, "logits/chosen": 1.6038405895233154, "logits/rejected": 2.6709046363830566, "logps/chosen": -21.895185470581055, "logps/rejected": -277.3313903808594, "loss": 0.465, "nll_loss": 0.4468405544757843, "rewards/accuracies": 1.0, "rewards/chosen": 1.2669275999069214, "rewards/margins": 9.22392749786377, "rewards/rejected": -7.956999778747559, "step": 3689 }, { "epoch": 0.615, "grad_norm": 61.84483337402344, "learning_rate": 6.817570193725564e-08, "logits/chosen": 2.1250994205474854, "logits/rejected": 2.7059168815612793, "logps/chosen": -15.241002082824707, "logps/rejected": -354.783203125, "loss": 0.67, "nll_loss": 0.662652313709259, "rewards/accuracies": 1.0, "rewards/chosen": 2.68280291557312, "rewards/margins": 8.101329803466797, "rewards/rejected": -5.418527126312256, "step": 3690 }, { "epoch": 0.6151666666666666, "grad_norm": 43.86542892456055, "learning_rate": 6.812453374668666e-08, "logits/chosen": 3.0171799659729004, "logits/rejected": 3.1663856506347656, "logps/chosen": -86.09718322753906, "logps/rejected": -104.11316680908203, "loss": 1.453, "nll_loss": 1.4114292860031128, "rewards/accuracies": 1.0, "rewards/chosen": 0.9031783938407898, "rewards/margins": 4.923019886016846, "rewards/rejected": -4.01984167098999, "step": 3691 }, { "epoch": 0.6153333333333333, "grad_norm": 28.987049102783203, "learning_rate": 6.807337484386331e-08, "logits/chosen": 2.479663133621216, "logits/rejected": 2.260936975479126, "logps/chosen": -32.6226921081543, "logps/rejected": -64.15522003173828, "loss": 0.5552, "nll_loss": 0.5347982048988342, "rewards/accuracies": 1.0, "rewards/chosen": 1.48344087600708, "rewards/margins": 6.3603010177612305, "rewards/rejected": -4.87686014175415, "step": 3692 }, { "epoch": 0.6155, "grad_norm": 36.09634017944336, "learning_rate": 6.802222524369201e-08, "logits/chosen": 2.212315320968628, "logits/rejected": 2.5163259506225586, "logps/chosen": -26.600547790527344, "logps/rejected": -164.13543701171875, "loss": 0.6359, "nll_loss": 0.6333462595939636, "rewards/accuracies": 1.0, "rewards/chosen": 3.780902862548828, "rewards/margins": 10.208852767944336, "rewards/rejected": -6.427949905395508, "step": 3693 }, { "epoch": 0.6156666666666667, "grad_norm": 223.1991729736328, "learning_rate": 6.79710849610766e-08, "logits/chosen": 1.93429696559906, "logits/rejected": 2.1654300689697266, "logps/chosen": -21.18326759338379, "logps/rejected": -122.3119125366211, "loss": 2.6997, "nll_loss": 2.6479082107543945, "rewards/accuracies": 1.0, "rewards/chosen": 0.13213025033473969, "rewards/margins": 7.471678733825684, "rewards/rejected": -7.339548587799072, "step": 3694 }, { "epoch": 0.6158333333333333, "grad_norm": 277.401123046875, "learning_rate": 6.791995401091806e-08, "logits/chosen": 3.1581742763519287, "logits/rejected": 3.075838565826416, "logps/chosen": -69.6600341796875, "logps/rejected": -88.04004669189453, "loss": 2.8281, "nll_loss": 0.7181447744369507, "rewards/accuracies": 0.0, "rewards/chosen": 2.4152984619140625, "rewards/margins": -1.1851158142089844, "rewards/rejected": 3.600414276123047, "step": 3695 }, { "epoch": 0.616, "grad_norm": 138.24513244628906, "learning_rate": 6.786883240811479e-08, "logits/chosen": 0.7997271418571472, "logits/rejected": 0.49588796496391296, "logps/chosen": -148.04782104492188, "logps/rejected": -55.8066520690918, "loss": 1.6237, "nll_loss": 0.6760175228118896, "rewards/accuracies": 1.0, "rewards/chosen": 2.665454149246216, "rewards/margins": 0.704797625541687, "rewards/rejected": 1.9606565237045288, "step": 3696 }, { "epoch": 0.6161666666666666, "grad_norm": 24.26725196838379, "learning_rate": 6.781772016756233e-08, "logits/chosen": 0.6695880889892578, "logits/rejected": 1.586970567703247, "logps/chosen": -95.48268127441406, "logps/rejected": -398.40997314453125, "loss": 0.768, "nll_loss": 0.7518322467803955, "rewards/accuracies": 1.0, "rewards/chosen": 1.3806488513946533, "rewards/margins": 9.813748359680176, "rewards/rejected": -8.433099746704102, "step": 3697 }, { "epoch": 0.6163333333333333, "grad_norm": 30.234018325805664, "learning_rate": 6.776661730415361e-08, "logits/chosen": 2.5523295402526855, "logits/rejected": 2.1727616786956787, "logps/chosen": -142.28744506835938, "logps/rejected": -133.42349243164062, "loss": 1.1311, "nll_loss": 1.0945186614990234, "rewards/accuracies": 1.0, "rewards/chosen": 0.7132782936096191, "rewards/margins": 5.743732929229736, "rewards/rejected": -5.030454635620117, "step": 3698 }, { "epoch": 0.6165, "grad_norm": 83.26891326904297, "learning_rate": 6.771552383277875e-08, "logits/chosen": 2.208127737045288, "logits/rejected": 2.267662763595581, "logps/chosen": -29.3571834564209, "logps/rejected": -273.7978515625, "loss": 1.3553, "nll_loss": 1.3344173431396484, "rewards/accuracies": 1.0, "rewards/chosen": 1.227958083152771, "rewards/margins": 7.0862345695495605, "rewards/rejected": -5.8582763671875, "step": 3699 }, { "epoch": 0.6166666666666667, "grad_norm": 79.62190246582031, "learning_rate": 6.766443976832517e-08, "logits/chosen": 4.195343494415283, "logits/rejected": 4.164381504058838, "logps/chosen": -31.045053482055664, "logps/rejected": -11.3763427734375, "loss": 1.5953, "nll_loss": 0.7391678690910339, "rewards/accuracies": 1.0, "rewards/chosen": 2.2469122409820557, "rewards/margins": 0.7379722595214844, "rewards/rejected": 1.5089399814605713, "step": 3700 }, { "epoch": 0.6168333333333333, "grad_norm": 31.483718872070312, "learning_rate": 6.761336512567752e-08, "logits/chosen": 1.8436607122421265, "logits/rejected": 2.533320665359497, "logps/chosen": -81.1405258178711, "logps/rejected": -194.71932983398438, "loss": 1.0887, "nll_loss": 1.0818737745285034, "rewards/accuracies": 1.0, "rewards/chosen": 2.506751537322998, "rewards/margins": 8.590621948242188, "rewards/rejected": -6.083869934082031, "step": 3701 }, { "epoch": 0.617, "grad_norm": 35.0994873046875, "learning_rate": 6.756229991971778e-08, "logits/chosen": 3.0228185653686523, "logits/rejected": 2.977287769317627, "logps/chosen": -66.35958862304688, "logps/rejected": -145.54307556152344, "loss": 0.9358, "nll_loss": 0.9090356826782227, "rewards/accuracies": 1.0, "rewards/chosen": 1.34868323802948, "rewards/margins": 5.701744556427002, "rewards/rejected": -4.353061199188232, "step": 3702 }, { "epoch": 0.6171666666666666, "grad_norm": 97.93543243408203, "learning_rate": 6.751124416532506e-08, "logits/chosen": 3.2321372032165527, "logits/rejected": 3.121774196624756, "logps/chosen": -13.587583541870117, "logps/rejected": -4.517332553863525, "loss": 1.5119, "nll_loss": 0.33968961238861084, "rewards/accuracies": 0.0, "rewards/chosen": 1.2694400548934937, "rewards/margins": -0.17863380908966064, "rewards/rejected": 1.4480738639831543, "step": 3703 }, { "epoch": 0.6173333333333333, "grad_norm": 22.955724716186523, "learning_rate": 6.746019787737582e-08, "logits/chosen": 2.568824529647827, "logits/rejected": 1.888128399848938, "logps/chosen": -90.88927459716797, "logps/rejected": -41.442649841308594, "loss": 0.9274, "nll_loss": 0.8910712599754333, "rewards/accuracies": 1.0, "rewards/chosen": 2.209763288497925, "rewards/margins": 5.31907320022583, "rewards/rejected": -3.1093099117279053, "step": 3704 }, { "epoch": 0.6175, "grad_norm": 44.07722091674805, "learning_rate": 6.740916107074371e-08, "logits/chosen": 2.5263712406158447, "logits/rejected": 2.3833112716674805, "logps/chosen": -88.58528137207031, "logps/rejected": -116.12464141845703, "loss": 1.1078, "nll_loss": 1.0803080797195435, "rewards/accuracies": 1.0, "rewards/chosen": 0.9138870239257812, "rewards/margins": 6.823706150054932, "rewards/rejected": -5.90981912612915, "step": 3705 }, { "epoch": 0.6176666666666667, "grad_norm": 165.3173828125, "learning_rate": 6.735813376029962e-08, "logits/chosen": 2.6071863174438477, "logits/rejected": 2.089266300201416, "logps/chosen": -109.77518463134766, "logps/rejected": -45.01798629760742, "loss": 2.3991, "nll_loss": 1.1317028999328613, "rewards/accuracies": 0.0, "rewards/chosen": 1.421006202697754, "rewards/margins": -0.28798162937164307, "rewards/rejected": 1.708987832069397, "step": 3706 }, { "epoch": 0.6178333333333333, "grad_norm": 19.117631912231445, "learning_rate": 6.730711596091172e-08, "logits/chosen": 2.239778518676758, "logits/rejected": 2.485262155532837, "logps/chosen": -108.48822021484375, "logps/rejected": -239.8251190185547, "loss": 0.7614, "nll_loss": 0.7533904314041138, "rewards/accuracies": 1.0, "rewards/chosen": 2.099199056625366, "rewards/margins": 11.32018756866455, "rewards/rejected": -9.220988273620605, "step": 3707 }, { "epoch": 0.618, "grad_norm": 98.36711120605469, "learning_rate": 6.725610768744534e-08, "logits/chosen": 3.040186882019043, "logits/rejected": 2.9036800861358643, "logps/chosen": -219.0662841796875, "logps/rejected": -115.55261993408203, "loss": 1.7523, "nll_loss": 1.4902467727661133, "rewards/accuracies": 1.0, "rewards/chosen": -1.9071792364120483, "rewards/margins": 3.465280055999756, "rewards/rejected": -5.372459411621094, "step": 3708 }, { "epoch": 0.6181666666666666, "grad_norm": 44.248016357421875, "learning_rate": 6.720510895476308e-08, "logits/chosen": 2.419644594192505, "logits/rejected": 2.550963878631592, "logps/chosen": -40.78200912475586, "logps/rejected": -168.9598388671875, "loss": 1.0808, "nll_loss": 1.0732107162475586, "rewards/accuracies": 1.0, "rewards/chosen": 2.237905502319336, "rewards/margins": 9.235574722290039, "rewards/rejected": -6.997668743133545, "step": 3709 }, { "epoch": 0.6183333333333333, "grad_norm": 83.085205078125, "learning_rate": 6.715411977772473e-08, "logits/chosen": 1.9284491539001465, "logits/rejected": 2.446244716644287, "logps/chosen": -14.360379219055176, "logps/rejected": -494.1839599609375, "loss": 0.8529, "nll_loss": 0.8447280526161194, "rewards/accuracies": 1.0, "rewards/chosen": 2.585988998413086, "rewards/margins": 7.895089626312256, "rewards/rejected": -5.30910062789917, "step": 3710 }, { "epoch": 0.6185, "grad_norm": 31.607383728027344, "learning_rate": 6.710314017118734e-08, "logits/chosen": 2.2593092918395996, "logits/rejected": 2.08841872215271, "logps/chosen": -111.57699584960938, "logps/rejected": -96.47206115722656, "loss": 0.9716, "nll_loss": 0.9455679059028625, "rewards/accuracies": 1.0, "rewards/chosen": 1.933465600013733, "rewards/margins": 5.6666951179504395, "rewards/rejected": -3.733229398727417, "step": 3711 }, { "epoch": 0.6186666666666667, "grad_norm": 23.838844299316406, "learning_rate": 6.705217015000509e-08, "logits/chosen": 3.094371795654297, "logits/rejected": 3.3098630905151367, "logps/chosen": -70.2728271484375, "logps/rejected": -426.5343017578125, "loss": 0.7792, "nll_loss": 0.7722287774085999, "rewards/accuracies": 1.0, "rewards/chosen": 2.2394044399261475, "rewards/margins": 13.087045669555664, "rewards/rejected": -10.847640991210938, "step": 3712 }, { "epoch": 0.6188333333333333, "grad_norm": 204.4388885498047, "learning_rate": 6.70012097290295e-08, "logits/chosen": 2.7690482139587402, "logits/rejected": 2.6793317794799805, "logps/chosen": -29.45835304260254, "logps/rejected": -55.38520431518555, "loss": 2.6479, "nll_loss": 0.49929413199424744, "rewards/accuracies": 0.0, "rewards/chosen": 1.4121545553207397, "rewards/margins": -1.4589060544967651, "rewards/rejected": 2.871060609817505, "step": 3713 }, { "epoch": 0.619, "grad_norm": 11.793595314025879, "learning_rate": 6.695025892310913e-08, "logits/chosen": 2.1096279621124268, "logits/rejected": 2.12424373626709, "logps/chosen": -160.58602905273438, "logps/rejected": -208.53067016601562, "loss": 0.6128, "nll_loss": 0.610593318939209, "rewards/accuracies": 1.0, "rewards/chosen": 5.046942234039307, "rewards/margins": 10.898305892944336, "rewards/rejected": -5.851363182067871, "step": 3714 }, { "epoch": 0.6191666666666666, "grad_norm": 27.289409637451172, "learning_rate": 6.68993177470899e-08, "logits/chosen": 2.235201120376587, "logits/rejected": 2.1409339904785156, "logps/chosen": -36.191341400146484, "logps/rejected": -84.00363159179688, "loss": 0.6172, "nll_loss": 0.5933005809783936, "rewards/accuracies": 1.0, "rewards/chosen": 1.5142704248428345, "rewards/margins": 5.877119541168213, "rewards/rejected": -4.362849235534668, "step": 3715 }, { "epoch": 0.6193333333333333, "grad_norm": 19.56183624267578, "learning_rate": 6.684838621581477e-08, "logits/chosen": 2.025735378265381, "logits/rejected": 2.0962531566619873, "logps/chosen": -118.72268676757812, "logps/rejected": -176.9868927001953, "loss": 0.9237, "nll_loss": 0.920330822467804, "rewards/accuracies": 1.0, "rewards/chosen": 2.9921233654022217, "rewards/margins": 12.280603408813477, "rewards/rejected": -9.288479804992676, "step": 3716 }, { "epoch": 0.6195, "grad_norm": 108.3419418334961, "learning_rate": 6.679746434412404e-08, "logits/chosen": 2.443528652191162, "logits/rejected": 2.6149070262908936, "logps/chosen": -63.885597229003906, "logps/rejected": -32.921669006347656, "loss": 1.6503, "nll_loss": 0.8872999548912048, "rewards/accuracies": 1.0, "rewards/chosen": 1.6250526905059814, "rewards/margins": 0.7059872150421143, "rewards/rejected": 0.9190654754638672, "step": 3717 }, { "epoch": 0.6196666666666667, "grad_norm": 38.70983123779297, "learning_rate": 6.674655214685505e-08, "logits/chosen": 1.9488282203674316, "logits/rejected": 2.228052854537964, "logps/chosen": -37.63387680053711, "logps/rejected": -177.7222442626953, "loss": 0.8653, "nll_loss": 0.8363083004951477, "rewards/accuracies": 1.0, "rewards/chosen": 1.0189484357833862, "rewards/margins": 5.943147659301758, "rewards/rejected": -4.924199104309082, "step": 3718 }, { "epoch": 0.6198333333333333, "grad_norm": 586.0556030273438, "learning_rate": 6.669564963884244e-08, "logits/chosen": 2.160418748855591, "logits/rejected": 1.3494341373443604, "logps/chosen": -241.09812927246094, "logps/rejected": -73.60881042480469, "loss": 4.7642, "nll_loss": 1.470110535621643, "rewards/accuracies": 0.0, "rewards/chosen": -3.4459595680236816, "rewards/margins": -3.1703414916992188, "rewards/rejected": -0.2756179869174957, "step": 3719 }, { "epoch": 0.62, "grad_norm": 133.53111267089844, "learning_rate": 6.664475683491795e-08, "logits/chosen": 2.691911458969116, "logits/rejected": 2.6214828491210938, "logps/chosen": -81.6211929321289, "logps/rejected": -80.02392578125, "loss": 2.0047, "nll_loss": 1.4319509267807007, "rewards/accuracies": 1.0, "rewards/chosen": -2.1958210468292236, "rewards/margins": 0.9072766304016113, "rewards/rejected": -3.103097677230835, "step": 3720 }, { "epoch": 0.6201666666666666, "grad_norm": 56.52266311645508, "learning_rate": 6.659387374991057e-08, "logits/chosen": 2.4639971256256104, "logits/rejected": 2.7218384742736816, "logps/chosen": -37.780250549316406, "logps/rejected": -375.84246826171875, "loss": 1.3077, "nll_loss": 1.302767276763916, "rewards/accuracies": 1.0, "rewards/chosen": 2.5881431102752686, "rewards/margins": 13.149727821350098, "rewards/rejected": -10.56158447265625, "step": 3721 }, { "epoch": 0.6203333333333333, "grad_norm": 172.67140197753906, "learning_rate": 6.654300039864633e-08, "logits/chosen": 2.1906652450561523, "logits/rejected": 2.284769058227539, "logps/chosen": -40.92247009277344, "logps/rejected": -40.13261413574219, "loss": 2.1824, "nll_loss": 0.5246471166610718, "rewards/accuracies": 0.0, "rewards/chosen": 1.946351408958435, "rewards/margins": -0.7152484655380249, "rewards/rejected": 2.66159987449646, "step": 3722 }, { "epoch": 0.6205, "grad_norm": 24.079347610473633, "learning_rate": 6.649213679594859e-08, "logits/chosen": 2.452005624771118, "logits/rejected": 2.600341320037842, "logps/chosen": -87.9690170288086, "logps/rejected": -163.14244079589844, "loss": 0.8781, "nll_loss": 0.8540682196617126, "rewards/accuracies": 1.0, "rewards/chosen": 2.3330750465393066, "rewards/margins": 5.946667671203613, "rewards/rejected": -3.6135926246643066, "step": 3723 }, { "epoch": 0.6206666666666667, "grad_norm": 17.5351505279541, "learning_rate": 6.644128295663771e-08, "logits/chosen": 0.8127803206443787, "logits/rejected": 1.4675239324569702, "logps/chosen": -51.11695861816406, "logps/rejected": -306.27252197265625, "loss": 0.5476, "nll_loss": 0.5437974333763123, "rewards/accuracies": 1.0, "rewards/chosen": 2.881601095199585, "rewards/margins": 11.27816104888916, "rewards/rejected": -8.396559715270996, "step": 3724 }, { "epoch": 0.6208333333333333, "grad_norm": 181.81280517578125, "learning_rate": 6.639043889553133e-08, "logits/chosen": 2.7909138202667236, "logits/rejected": 2.8748300075531006, "logps/chosen": -154.16937255859375, "logps/rejected": -132.49839782714844, "loss": 2.3223, "nll_loss": 1.526429533958435, "rewards/accuracies": 1.0, "rewards/chosen": -4.034368991851807, "rewards/margins": 1.2769989967346191, "rewards/rejected": -5.311367988586426, "step": 3725 }, { "epoch": 0.621, "grad_norm": 159.410888671875, "learning_rate": 6.633960462744415e-08, "logits/chosen": 2.0905773639678955, "logits/rejected": 2.0909459590911865, "logps/chosen": -77.7999267578125, "logps/rejected": -45.06248092651367, "loss": 1.9842, "nll_loss": 0.9848092198371887, "rewards/accuracies": 0.0, "rewards/chosen": 0.7935836911201477, "rewards/margins": -0.016130149364471436, "rewards/rejected": 0.8097138404846191, "step": 3726 }, { "epoch": 0.6211666666666666, "grad_norm": 27.145532608032227, "learning_rate": 6.628878016718805e-08, "logits/chosen": 2.133652925491333, "logits/rejected": 2.431211471557617, "logps/chosen": -84.31038665771484, "logps/rejected": -330.62957763671875, "loss": 0.954, "nll_loss": 0.9264877438545227, "rewards/accuracies": 1.0, "rewards/chosen": 1.750058889389038, "rewards/margins": 5.542387008666992, "rewards/rejected": -3.792327880859375, "step": 3727 }, { "epoch": 0.6213333333333333, "grad_norm": 22.364824295043945, "learning_rate": 6.62379655295721e-08, "logits/chosen": 1.9331607818603516, "logits/rejected": 2.4350926876068115, "logps/chosen": -81.72265625, "logps/rejected": -379.3526916503906, "loss": 0.8178, "nll_loss": 0.8091351985931396, "rewards/accuracies": 1.0, "rewards/chosen": 2.006134033203125, "rewards/margins": 14.74283504486084, "rewards/rejected": -12.736701011657715, "step": 3728 }, { "epoch": 0.6215, "grad_norm": 245.3043975830078, "learning_rate": 6.618716072940247e-08, "logits/chosen": 2.0942258834838867, "logits/rejected": 1.4155664443969727, "logps/chosen": -128.54580688476562, "logps/rejected": -46.766319274902344, "loss": 2.3908, "nll_loss": 1.0450879335403442, "rewards/accuracies": 0.0, "rewards/chosen": -0.8318015933036804, "rewards/margins": -0.8313896059989929, "rewards/rejected": -0.0004119873046875, "step": 3729 }, { "epoch": 0.6216666666666667, "grad_norm": 37.996761322021484, "learning_rate": 6.613636578148241e-08, "logits/chosen": 2.5246691703796387, "logits/rejected": 2.7208147048950195, "logps/chosen": -127.43519592285156, "logps/rejected": -292.9034423828125, "loss": 1.3567, "nll_loss": 1.327449917793274, "rewards/accuracies": 1.0, "rewards/chosen": 1.2629883289337158, "rewards/margins": 5.541609764099121, "rewards/rejected": -4.278621196746826, "step": 3730 }, { "epoch": 0.6218333333333333, "grad_norm": 30.670183181762695, "learning_rate": 6.60855807006124e-08, "logits/chosen": 0.8933613300323486, "logits/rejected": 2.1514365673065186, "logps/chosen": -12.534212112426758, "logps/rejected": -217.47695922851562, "loss": 0.3857, "nll_loss": 0.3581203520298004, "rewards/accuracies": 1.0, "rewards/chosen": 1.2241958379745483, "rewards/margins": 5.744520664215088, "rewards/rejected": -4.52032470703125, "step": 3731 }, { "epoch": 0.622, "grad_norm": 21.34774398803711, "learning_rate": 6.603480550158994e-08, "logits/chosen": 1.2437753677368164, "logits/rejected": 1.8342461585998535, "logps/chosen": -123.83295440673828, "logps/rejected": -268.7984619140625, "loss": 0.9326, "nll_loss": 0.9241265654563904, "rewards/accuracies": 1.0, "rewards/chosen": 2.0638375282287598, "rewards/margins": 9.922124862670898, "rewards/rejected": -7.858287811279297, "step": 3732 }, { "epoch": 0.6221666666666666, "grad_norm": 35.375125885009766, "learning_rate": 6.598404019920975e-08, "logits/chosen": 1.9808440208435059, "logits/rejected": 2.40769362449646, "logps/chosen": -10.987031936645508, "logps/rejected": -157.84303283691406, "loss": 0.4312, "nll_loss": 0.4225781559944153, "rewards/accuracies": 1.0, "rewards/chosen": 2.230828285217285, "rewards/margins": 8.265236854553223, "rewards/rejected": -6.0344085693359375, "step": 3733 }, { "epoch": 0.6223333333333333, "grad_norm": 116.57825469970703, "learning_rate": 6.593328480826358e-08, "logits/chosen": 2.4794938564300537, "logits/rejected": 2.4322030544281006, "logps/chosen": -100.51292419433594, "logps/rejected": -70.30810546875, "loss": 1.5119, "nll_loss": 1.0925320386886597, "rewards/accuracies": 1.0, "rewards/chosen": -0.49641573429107666, "rewards/margins": 1.0743610858917236, "rewards/rejected": -1.5707768201828003, "step": 3734 }, { "epoch": 0.6225, "grad_norm": 29.67068862915039, "learning_rate": 6.588253934354038e-08, "logits/chosen": 2.225834369659424, "logits/rejected": 2.328587055206299, "logps/chosen": -57.71329116821289, "logps/rejected": -165.79815673828125, "loss": 0.5869, "nll_loss": 0.5549355149269104, "rewards/accuracies": 1.0, "rewards/chosen": 0.914116382598877, "rewards/margins": 5.7825422286987305, "rewards/rejected": -4.8684258460998535, "step": 3735 }, { "epoch": 0.6226666666666667, "grad_norm": 20.511062622070312, "learning_rate": 6.583180381982612e-08, "logits/chosen": 2.1528022289276123, "logits/rejected": 2.5315773487091064, "logps/chosen": -44.13835906982422, "logps/rejected": -372.03338623046875, "loss": 0.4929, "nll_loss": 0.4904262125492096, "rewards/accuracies": 1.0, "rewards/chosen": 3.2789621353149414, "rewards/margins": 13.011009216308594, "rewards/rejected": -9.732047080993652, "step": 3736 }, { "epoch": 0.6228333333333333, "grad_norm": 19.33868980407715, "learning_rate": 6.578107825190394e-08, "logits/chosen": 1.0898566246032715, "logits/rejected": 2.120285749435425, "logps/chosen": -75.20369720458984, "logps/rejected": -321.13385009765625, "loss": 0.7552, "nll_loss": 0.7520368695259094, "rewards/accuracies": 1.0, "rewards/chosen": 3.194983959197998, "rewards/margins": 10.468864440917969, "rewards/rejected": -7.2738800048828125, "step": 3737 }, { "epoch": 0.623, "grad_norm": 26.976333618164062, "learning_rate": 6.573036265455401e-08, "logits/chosen": 0.4416749179363251, "logits/rejected": 2.5360724925994873, "logps/chosen": -18.184467315673828, "logps/rejected": -305.9674987792969, "loss": 0.4967, "nll_loss": 0.4785386025905609, "rewards/accuracies": 1.0, "rewards/chosen": 1.3204224109649658, "rewards/margins": 7.7713775634765625, "rewards/rejected": -6.450955390930176, "step": 3738 }, { "epoch": 0.6231666666666666, "grad_norm": 23.89967155456543, "learning_rate": 6.56796570425537e-08, "logits/chosen": 1.4871878623962402, "logits/rejected": 1.1539932489395142, "logps/chosen": -88.83743286132812, "logps/rejected": -98.36622619628906, "loss": 0.8803, "nll_loss": 0.8624992966651917, "rewards/accuracies": 1.0, "rewards/chosen": 1.312373399734497, "rewards/margins": 8.212873458862305, "rewards/rejected": -6.900500297546387, "step": 3739 }, { "epoch": 0.6233333333333333, "grad_norm": 34.992401123046875, "learning_rate": 6.562896143067733e-08, "logits/chosen": 0.7162299156188965, "logits/rejected": 1.9274425506591797, "logps/chosen": -60.58636474609375, "logps/rejected": -360.942138671875, "loss": 0.9851, "nll_loss": 0.9616883397102356, "rewards/accuracies": 1.0, "rewards/chosen": 0.9812980890274048, "rewards/margins": 9.927899360656738, "rewards/rejected": -8.946600914001465, "step": 3740 }, { "epoch": 0.6235, "grad_norm": 62.47017288208008, "learning_rate": 6.557827583369648e-08, "logits/chosen": 2.959838390350342, "logits/rejected": 2.894829511642456, "logps/chosen": -79.6147689819336, "logps/rejected": -64.00276184082031, "loss": 1.3545, "nll_loss": 1.2062842845916748, "rewards/accuracies": 1.0, "rewards/chosen": 1.4570343494415283, "rewards/margins": 3.0470268726348877, "rewards/rejected": -1.5899925231933594, "step": 3741 }, { "epoch": 0.6236666666666667, "grad_norm": 51.70134353637695, "learning_rate": 6.552760026637963e-08, "logits/chosen": 2.4046854972839355, "logits/rejected": 2.329836130142212, "logps/chosen": -9.071504592895508, "logps/rejected": -32.37317657470703, "loss": 0.589, "nll_loss": 0.3489040732383728, "rewards/accuracies": 1.0, "rewards/chosen": 1.1678584814071655, "rewards/margins": 2.2554726600646973, "rewards/rejected": -1.0876140594482422, "step": 3742 }, { "epoch": 0.6238333333333334, "grad_norm": 26.33188247680664, "learning_rate": 6.547693474349247e-08, "logits/chosen": 1.5637179613113403, "logits/rejected": 1.5220898389816284, "logps/chosen": -77.24372863769531, "logps/rejected": -93.8571548461914, "loss": 0.9103, "nll_loss": 0.8981828689575195, "rewards/accuracies": 1.0, "rewards/chosen": 2.4688262939453125, "rewards/margins": 7.081546306610107, "rewards/rejected": -4.612720012664795, "step": 3743 }, { "epoch": 0.624, "grad_norm": 25.0706844329834, "learning_rate": 6.542627927979772e-08, "logits/chosen": 2.918954849243164, "logits/rejected": 2.8698348999023438, "logps/chosen": -120.88321685791016, "logps/rejected": -60.70104217529297, "loss": 1.051, "nll_loss": 1.0331900119781494, "rewards/accuracies": 1.0, "rewards/chosen": 2.170177459716797, "rewards/margins": 6.356030464172363, "rewards/rejected": -4.185853004455566, "step": 3744 }, { "epoch": 0.6241666666666666, "grad_norm": 25.047800064086914, "learning_rate": 6.537563389005512e-08, "logits/chosen": 2.6909103393554688, "logits/rejected": 2.969268798828125, "logps/chosen": -61.276214599609375, "logps/rejected": -193.8997039794922, "loss": 0.7209, "nll_loss": 0.7125142216682434, "rewards/accuracies": 1.0, "rewards/chosen": 2.1304337978363037, "rewards/margins": 9.075583457946777, "rewards/rejected": -6.9451494216918945, "step": 3745 }, { "epoch": 0.6243333333333333, "grad_norm": 229.9172821044922, "learning_rate": 6.532499858902158e-08, "logits/chosen": 2.6642584800720215, "logits/rejected": 2.5335328578948975, "logps/chosen": -48.0914306640625, "logps/rejected": -37.679710388183594, "loss": 5.8747, "nll_loss": 0.7756683230400085, "rewards/accuracies": 0.0, "rewards/chosen": 0.7994469404220581, "rewards/margins": -4.682101249694824, "rewards/rejected": 5.481548309326172, "step": 3746 }, { "epoch": 0.6245, "grad_norm": 23.898223876953125, "learning_rate": 6.527437339145097e-08, "logits/chosen": 2.486633777618408, "logits/rejected": 2.57559871673584, "logps/chosen": -55.30326461791992, "logps/rejected": -151.65289306640625, "loss": 0.7759, "nll_loss": 0.768100917339325, "rewards/accuracies": 1.0, "rewards/chosen": 2.1404569149017334, "rewards/margins": 10.466032981872559, "rewards/rejected": -8.325575828552246, "step": 3747 }, { "epoch": 0.6246666666666667, "grad_norm": 42.608585357666016, "learning_rate": 6.522375831209429e-08, "logits/chosen": 2.261540651321411, "logits/rejected": 2.257272243499756, "logps/chosen": -37.10883712768555, "logps/rejected": -71.52627563476562, "loss": 0.9315, "nll_loss": 0.927720844745636, "rewards/accuracies": 1.0, "rewards/chosen": 4.030130386352539, "rewards/margins": 9.509416580200195, "rewards/rejected": -5.479286193847656, "step": 3748 }, { "epoch": 0.6248333333333334, "grad_norm": 24.6527042388916, "learning_rate": 6.517315336569952e-08, "logits/chosen": 2.420311450958252, "logits/rejected": 2.601353645324707, "logps/chosen": -61.10698699951172, "logps/rejected": -139.233642578125, "loss": 0.7875, "nll_loss": 0.7638373374938965, "rewards/accuracies": 1.0, "rewards/chosen": 1.5763893127441406, "rewards/margins": 5.859958171844482, "rewards/rejected": -4.283568859100342, "step": 3749 }, { "epoch": 0.625, "grad_norm": 47.130035400390625, "learning_rate": 6.512255856701176e-08, "logits/chosen": 2.261889934539795, "logits/rejected": 2.1842851638793945, "logps/chosen": -27.356346130371094, "logps/rejected": -71.70109558105469, "loss": 0.8535, "nll_loss": 0.8289801478385925, "rewards/accuracies": 1.0, "rewards/chosen": 1.459471583366394, "rewards/margins": 5.8445611000061035, "rewards/rejected": -4.38508939743042, "step": 3750 }, { "epoch": 0.6251666666666666, "grad_norm": 38.498111724853516, "learning_rate": 6.507197393077311e-08, "logits/chosen": 1.4615979194641113, "logits/rejected": 1.5498709678649902, "logps/chosen": -15.94377613067627, "logps/rejected": -83.72769927978516, "loss": 0.4779, "nll_loss": 0.4428826570510864, "rewards/accuracies": 1.0, "rewards/chosen": 1.5913290977478027, "rewards/margins": 5.112628936767578, "rewards/rejected": -3.5212998390197754, "step": 3751 }, { "epoch": 0.6253333333333333, "grad_norm": 40.468387603759766, "learning_rate": 6.502139947172272e-08, "logits/chosen": 1.241310715675354, "logits/rejected": 2.265324354171753, "logps/chosen": -86.3714370727539, "logps/rejected": -235.23580932617188, "loss": 1.1987, "nll_loss": 1.1671817302703857, "rewards/accuracies": 1.0, "rewards/chosen": 0.7428901791572571, "rewards/margins": 6.758069038391113, "rewards/rejected": -6.015178680419922, "step": 3752 }, { "epoch": 0.6255, "grad_norm": 174.8661651611328, "learning_rate": 6.497083520459674e-08, "logits/chosen": 4.065576553344727, "logits/rejected": 4.036407470703125, "logps/chosen": -53.2318115234375, "logps/rejected": -25.930492401123047, "loss": 2.7157, "nll_loss": 0.9022339582443237, "rewards/accuracies": 0.0, "rewards/chosen": 1.4854782819747925, "rewards/margins": -1.0309406518936157, "rewards/rejected": 2.516418933868408, "step": 3753 }, { "epoch": 0.6256666666666667, "grad_norm": 37.40681457519531, "learning_rate": 6.492028114412843e-08, "logits/chosen": 2.881950616836548, "logits/rejected": 2.8870697021484375, "logps/chosen": -88.65805053710938, "logps/rejected": -245.72093200683594, "loss": 1.1871, "nll_loss": 1.166553258895874, "rewards/accuracies": 1.0, "rewards/chosen": 1.226081132888794, "rewards/margins": 7.214385032653809, "rewards/rejected": -5.988304138183594, "step": 3754 }, { "epoch": 0.6258333333333334, "grad_norm": 35.33698654174805, "learning_rate": 6.4869737305048e-08, "logits/chosen": 2.1768696308135986, "logits/rejected": 2.307185173034668, "logps/chosen": -40.26914596557617, "logps/rejected": -279.3883056640625, "loss": 0.8955, "nll_loss": 0.8754162192344666, "rewards/accuracies": 1.0, "rewards/chosen": 3.7285306453704834, "rewards/margins": 7.202577590942383, "rewards/rejected": -3.4740467071533203, "step": 3755 }, { "epoch": 0.626, "grad_norm": 66.16705322265625, "learning_rate": 6.481920370208274e-08, "logits/chosen": 2.689246892929077, "logits/rejected": 2.574506998062134, "logps/chosen": -59.82801055908203, "logps/rejected": -70.48442840576172, "loss": 1.0959, "nll_loss": 0.8195619583129883, "rewards/accuracies": 1.0, "rewards/chosen": 2.541484832763672, "rewards/margins": 2.8942534923553467, "rewards/rejected": -0.3527687191963196, "step": 3756 }, { "epoch": 0.6261666666666666, "grad_norm": 24.888992309570312, "learning_rate": 6.476868034995688e-08, "logits/chosen": 1.9158276319503784, "logits/rejected": 2.012200117111206, "logps/chosen": -64.59676361083984, "logps/rejected": -118.47503662109375, "loss": 0.7721, "nll_loss": 0.751125156879425, "rewards/accuracies": 1.0, "rewards/chosen": 1.6217546463012695, "rewards/margins": 6.121547222137451, "rewards/rejected": -4.499792575836182, "step": 3757 }, { "epoch": 0.6263333333333333, "grad_norm": 21.40277099609375, "learning_rate": 6.471816726339178e-08, "logits/chosen": 1.1887598037719727, "logits/rejected": 2.424373149871826, "logps/chosen": -53.209068298339844, "logps/rejected": -276.67388916015625, "loss": 0.6233, "nll_loss": 0.6115984916687012, "rewards/accuracies": 1.0, "rewards/chosen": 2.004460096359253, "rewards/margins": 7.477789878845215, "rewards/rejected": -5.473329544067383, "step": 3758 }, { "epoch": 0.6265, "grad_norm": 19.38301658630371, "learning_rate": 6.466766445710568e-08, "logits/chosen": 2.4712812900543213, "logits/rejected": 2.691679000854492, "logps/chosen": -195.1041717529297, "logps/rejected": -259.5393371582031, "loss": 0.817, "nll_loss": 0.8062155842781067, "rewards/accuracies": 1.0, "rewards/chosen": 1.7850006818771362, "rewards/margins": 13.990376472473145, "rewards/rejected": -12.205375671386719, "step": 3759 }, { "epoch": 0.6266666666666667, "grad_norm": 67.57026672363281, "learning_rate": 6.461717194581393e-08, "logits/chosen": 2.7585678100585938, "logits/rejected": 2.8073596954345703, "logps/chosen": -135.78480529785156, "logps/rejected": -349.6119689941406, "loss": 1.3619, "nll_loss": 1.2690167427062988, "rewards/accuracies": 1.0, "rewards/chosen": -0.5802230834960938, "rewards/margins": 8.34306812286377, "rewards/rejected": -8.923291206359863, "step": 3760 }, { "epoch": 0.6268333333333334, "grad_norm": 53.52709197998047, "learning_rate": 6.456668974422879e-08, "logits/chosen": 1.1359736919403076, "logits/rejected": 1.9104548692703247, "logps/chosen": -67.38671112060547, "logps/rejected": -336.4092712402344, "loss": 1.7302, "nll_loss": 1.6846678256988525, "rewards/accuracies": 1.0, "rewards/chosen": 1.1223633289337158, "rewards/margins": 4.629480361938477, "rewards/rejected": -3.5071167945861816, "step": 3761 }, { "epoch": 0.627, "grad_norm": 26.516864776611328, "learning_rate": 6.451621786705961e-08, "logits/chosen": 0.4909133315086365, "logits/rejected": 0.8942502737045288, "logps/chosen": -68.4465103149414, "logps/rejected": -247.1107177734375, "loss": 0.9042, "nll_loss": 0.9006119966506958, "rewards/accuracies": 1.0, "rewards/chosen": 2.8992531299591064, "rewards/margins": 15.428625106811523, "rewards/rejected": -12.529372215270996, "step": 3762 }, { "epoch": 0.6271666666666667, "grad_norm": 59.814857482910156, "learning_rate": 6.446575632901263e-08, "logits/chosen": 2.2708590030670166, "logits/rejected": 2.4039816856384277, "logps/chosen": -101.50019073486328, "logps/rejected": -111.47239685058594, "loss": 1.112, "nll_loss": 0.8252048492431641, "rewards/accuracies": 1.0, "rewards/chosen": 2.7556328773498535, "rewards/margins": 2.9981935024261475, "rewards/rejected": -0.24256058037281036, "step": 3763 }, { "epoch": 0.6273333333333333, "grad_norm": 23.8389892578125, "learning_rate": 6.441530514479118e-08, "logits/chosen": 2.58707857131958, "logits/rejected": 2.5452399253845215, "logps/chosen": -56.82707214355469, "logps/rejected": -138.38311767578125, "loss": 0.7482, "nll_loss": 0.7380140423774719, "rewards/accuracies": 1.0, "rewards/chosen": 1.9081741571426392, "rewards/margins": 8.980525016784668, "rewards/rejected": -7.07235050201416, "step": 3764 }, { "epoch": 0.6275, "grad_norm": 36.43416976928711, "learning_rate": 6.436486432909549e-08, "logits/chosen": 0.9099734425544739, "logits/rejected": 1.6303740739822388, "logps/chosen": -67.40165710449219, "logps/rejected": -240.5933074951172, "loss": 1.0964, "nll_loss": 1.05315101146698, "rewards/accuracies": 1.0, "rewards/chosen": 0.7773652076721191, "rewards/margins": 4.931704998016357, "rewards/rejected": -4.154339790344238, "step": 3765 }, { "epoch": 0.6276666666666667, "grad_norm": 22.6394100189209, "learning_rate": 6.431443389662283e-08, "logits/chosen": 2.4900689125061035, "logits/rejected": 2.7049431800842285, "logps/chosen": -60.16133117675781, "logps/rejected": -311.1396484375, "loss": 0.6547, "nll_loss": 0.6468960642814636, "rewards/accuracies": 1.0, "rewards/chosen": 2.2844483852386475, "rewards/margins": 8.597259521484375, "rewards/rejected": -6.312811374664307, "step": 3766 }, { "epoch": 0.6278333333333334, "grad_norm": 30.790542602539062, "learning_rate": 6.426401386206742e-08, "logits/chosen": 1.617539882659912, "logits/rejected": 1.3474732637405396, "logps/chosen": -239.06320190429688, "logps/rejected": -253.0205078125, "loss": 0.9359, "nll_loss": 0.8854191899299622, "rewards/accuracies": 1.0, "rewards/chosen": 0.17068175971508026, "rewards/margins": 6.900366306304932, "rewards/rejected": -6.729684352874756, "step": 3767 }, { "epoch": 0.628, "grad_norm": 32.61522674560547, "learning_rate": 6.42136042401204e-08, "logits/chosen": 1.7774525880813599, "logits/rejected": 2.328890562057495, "logps/chosen": -55.83336639404297, "logps/rejected": -151.36935424804688, "loss": 0.9662, "nll_loss": 0.9305561780929565, "rewards/accuracies": 1.0, "rewards/chosen": 1.1524604558944702, "rewards/margins": 5.125179290771484, "rewards/rejected": -3.9727189540863037, "step": 3768 }, { "epoch": 0.6281666666666667, "grad_norm": 29.213735580444336, "learning_rate": 6.416320504546997e-08, "logits/chosen": 1.5552847385406494, "logits/rejected": 2.4978513717651367, "logps/chosen": -39.14883804321289, "logps/rejected": -226.10116577148438, "loss": 0.637, "nll_loss": 0.6214101910591125, "rewards/accuracies": 1.0, "rewards/chosen": 1.401821255683899, "rewards/margins": 10.724923133850098, "rewards/rejected": -9.323101997375488, "step": 3769 }, { "epoch": 0.6283333333333333, "grad_norm": 28.2982120513916, "learning_rate": 6.411281629280122e-08, "logits/chosen": 2.935236692428589, "logits/rejected": 2.8390190601348877, "logps/chosen": -123.71939849853516, "logps/rejected": -72.56376647949219, "loss": 1.1282, "nll_loss": 1.0948621034622192, "rewards/accuracies": 1.0, "rewards/chosen": 2.624591827392578, "rewards/margins": 5.710716247558594, "rewards/rejected": -3.0861244201660156, "step": 3770 }, { "epoch": 0.6285, "grad_norm": 22.072113037109375, "learning_rate": 6.406243799679625e-08, "logits/chosen": 2.2063541412353516, "logits/rejected": 2.461845874786377, "logps/chosen": -42.835472106933594, "logps/rejected": -254.9113006591797, "loss": 0.5448, "nll_loss": 0.5288329124450684, "rewards/accuracies": 1.0, "rewards/chosen": 1.3743481636047363, "rewards/margins": 11.488483428955078, "rewards/rejected": -10.114134788513184, "step": 3771 }, { "epoch": 0.6286666666666667, "grad_norm": 175.77516174316406, "learning_rate": 6.401207017213406e-08, "logits/chosen": 2.806490182876587, "logits/rejected": 2.98455548286438, "logps/chosen": -77.44631958007812, "logps/rejected": -7.439524173736572, "loss": 2.94, "nll_loss": 0.992901623249054, "rewards/accuracies": 0.0, "rewards/chosen": 1.1101951599121094, "rewards/margins": -1.2796943187713623, "rewards/rejected": 2.3898894786834717, "step": 3772 }, { "epoch": 0.6288333333333334, "grad_norm": 29.640851974487305, "learning_rate": 6.396171283349064e-08, "logits/chosen": 2.0040931701660156, "logits/rejected": 1.6889402866363525, "logps/chosen": -54.22774124145508, "logps/rejected": -53.843143463134766, "loss": 0.8385, "nll_loss": 0.8093693852424622, "rewards/accuracies": 1.0, "rewards/chosen": 3.441654682159424, "rewards/margins": 6.535409927368164, "rewards/rejected": -3.093755006790161, "step": 3773 }, { "epoch": 0.629, "grad_norm": 27.430089950561523, "learning_rate": 6.391136599553889e-08, "logits/chosen": 2.1628458499908447, "logits/rejected": 1.6501423120498657, "logps/chosen": -76.51008605957031, "logps/rejected": -64.96928405761719, "loss": 0.98, "nll_loss": 0.9563760757446289, "rewards/accuracies": 1.0, "rewards/chosen": 1.8430482149124146, "rewards/margins": 5.820756435394287, "rewards/rejected": -3.977708339691162, "step": 3774 }, { "epoch": 0.6291666666666667, "grad_norm": 34.39703369140625, "learning_rate": 6.38610296729487e-08, "logits/chosen": 0.4925221800804138, "logits/rejected": 3.1031129360198975, "logps/chosen": -38.10167694091797, "logps/rejected": -60.69526672363281, "loss": 0.8558, "nll_loss": 0.8282972574234009, "rewards/accuracies": 1.0, "rewards/chosen": 4.799800872802734, "rewards/margins": 7.860827922821045, "rewards/rejected": -3.0610270500183105, "step": 3775 }, { "epoch": 0.6293333333333333, "grad_norm": 28.57505989074707, "learning_rate": 6.381070388038682e-08, "logits/chosen": 2.07273530960083, "logits/rejected": 2.767484188079834, "logps/chosen": -5.945425987243652, "logps/rejected": -120.99354553222656, "loss": 0.2969, "nll_loss": 0.270246684551239, "rewards/accuracies": 1.0, "rewards/chosen": 0.918160617351532, "rewards/margins": 7.073434829711914, "rewards/rejected": -6.155274391174316, "step": 3776 }, { "epoch": 0.6295, "grad_norm": 23.855981826782227, "learning_rate": 6.376038863251705e-08, "logits/chosen": 1.2207708358764648, "logits/rejected": 1.7550948858261108, "logps/chosen": -66.95332336425781, "logps/rejected": -120.73395538330078, "loss": 0.7231, "nll_loss": 0.697430431842804, "rewards/accuracies": 1.0, "rewards/chosen": 1.2013558149337769, "rewards/margins": 6.012562274932861, "rewards/rejected": -4.811206340789795, "step": 3777 }, { "epoch": 0.6296666666666667, "grad_norm": 21.532991409301758, "learning_rate": 6.371008394399997e-08, "logits/chosen": 0.8207821846008301, "logits/rejected": 1.6471580266952515, "logps/chosen": -39.509483337402344, "logps/rejected": -180.84530639648438, "loss": 0.5114, "nll_loss": 0.4818229079246521, "rewards/accuracies": 1.0, "rewards/chosen": 0.7610451579093933, "rewards/margins": 7.679012298583984, "rewards/rejected": -6.917967319488525, "step": 3778 }, { "epoch": 0.6298333333333334, "grad_norm": 27.157033920288086, "learning_rate": 6.365978982949323e-08, "logits/chosen": 1.3789288997650146, "logits/rejected": 2.236675262451172, "logps/chosen": -51.81529235839844, "logps/rejected": -330.46820068359375, "loss": 0.7418, "nll_loss": 0.7097984552383423, "rewards/accuracies": 1.0, "rewards/chosen": 1.919885277748108, "rewards/margins": 5.348358631134033, "rewards/rejected": -3.4284732341766357, "step": 3779 }, { "epoch": 0.63, "grad_norm": 22.10832405090332, "learning_rate": 6.360950630365125e-08, "logits/chosen": 2.025186061859131, "logits/rejected": 1.9654366970062256, "logps/chosen": -103.17802429199219, "logps/rejected": -137.81613159179688, "loss": 1.0129, "nll_loss": 1.0017282962799072, "rewards/accuracies": 1.0, "rewards/chosen": 2.101388692855835, "rewards/margins": 7.472573280334473, "rewards/rejected": -5.371184825897217, "step": 3780 }, { "epoch": 0.6301666666666667, "grad_norm": 23.848474502563477, "learning_rate": 6.35592333811255e-08, "logits/chosen": 2.5567357540130615, "logits/rejected": 2.7710530757904053, "logps/chosen": -69.03890991210938, "logps/rejected": -292.66558837890625, "loss": 0.7324, "nll_loss": 0.7267252802848816, "rewards/accuracies": 1.0, "rewards/chosen": 2.445356845855713, "rewards/margins": 12.959718704223633, "rewards/rejected": -10.514362335205078, "step": 3781 }, { "epoch": 0.6303333333333333, "grad_norm": 365.0234069824219, "learning_rate": 6.350897107656426e-08, "logits/chosen": 1.6972441673278809, "logits/rejected": 2.12896728515625, "logps/chosen": -274.4006652832031, "logps/rejected": -414.757568359375, "loss": 3.0668, "nll_loss": 1.633337140083313, "rewards/accuracies": 1.0, "rewards/chosen": -5.876260280609131, "rewards/margins": 0.20094060897827148, "rewards/rejected": -6.077200889587402, "step": 3782 }, { "epoch": 0.6305, "grad_norm": 27.59003257751465, "learning_rate": 6.345871940461281e-08, "logits/chosen": 2.1674561500549316, "logits/rejected": 1.7346991300582886, "logps/chosen": -36.662628173828125, "logps/rejected": -53.177120208740234, "loss": 0.5758, "nll_loss": 0.5391563177108765, "rewards/accuracies": 1.0, "rewards/chosen": 1.9593334197998047, "rewards/margins": 5.175806045532227, "rewards/rejected": -3.216472625732422, "step": 3783 }, { "epoch": 0.6306666666666667, "grad_norm": 28.9932918548584, "learning_rate": 6.340847837991324e-08, "logits/chosen": 1.537086844444275, "logits/rejected": 1.8703926801681519, "logps/chosen": -129.18328857421875, "logps/rejected": -490.6524658203125, "loss": 1.2982, "nll_loss": 1.279042363166809, "rewards/accuracies": 1.0, "rewards/chosen": 1.2015289068222046, "rewards/margins": 8.87170696258545, "rewards/rejected": -7.670178413391113, "step": 3784 }, { "epoch": 0.6308333333333334, "grad_norm": 31.467355728149414, "learning_rate": 6.33582480171046e-08, "logits/chosen": 2.1212894916534424, "logits/rejected": 3.121091365814209, "logps/chosen": -31.059185028076172, "logps/rejected": -235.56712341308594, "loss": 0.6769, "nll_loss": 0.6608336567878723, "rewards/accuracies": 1.0, "rewards/chosen": 1.8805503845214844, "rewards/margins": 6.621439456939697, "rewards/rejected": -4.740889072418213, "step": 3785 }, { "epoch": 0.631, "grad_norm": 25.33682632446289, "learning_rate": 6.33080283308228e-08, "logits/chosen": 2.6181764602661133, "logits/rejected": 2.6301660537719727, "logps/chosen": -95.55697631835938, "logps/rejected": -79.11773681640625, "loss": 0.9181, "nll_loss": 0.9014807939529419, "rewards/accuracies": 1.0, "rewards/chosen": 1.4548372030258179, "rewards/margins": 7.499668598175049, "rewards/rejected": -6.044831275939941, "step": 3786 }, { "epoch": 0.6311666666666667, "grad_norm": 25.812246322631836, "learning_rate": 6.325781933570064e-08, "logits/chosen": 2.5777509212493896, "logits/rejected": 2.7069990634918213, "logps/chosen": -87.65633392333984, "logps/rejected": -169.4466552734375, "loss": 0.9233, "nll_loss": 0.9130867123603821, "rewards/accuracies": 1.0, "rewards/chosen": 2.1724395751953125, "rewards/margins": 7.663818359375, "rewards/rejected": -5.4913787841796875, "step": 3787 }, { "epoch": 0.6313333333333333, "grad_norm": 26.905057907104492, "learning_rate": 6.320762104636785e-08, "logits/chosen": 1.6506342887878418, "logits/rejected": 2.4932894706726074, "logps/chosen": -86.66184997558594, "logps/rejected": -208.00656127929688, "loss": 1.0148, "nll_loss": 1.0076957941055298, "rewards/accuracies": 1.0, "rewards/chosen": 2.414989471435547, "rewards/margins": 8.664667129516602, "rewards/rejected": -6.249678134918213, "step": 3788 }, { "epoch": 0.6315, "grad_norm": 23.565876007080078, "learning_rate": 6.315743347745098e-08, "logits/chosen": 2.664478302001953, "logits/rejected": 2.6780903339385986, "logps/chosen": -15.529891967773438, "logps/rejected": -185.27328491210938, "loss": 0.3249, "nll_loss": 0.31059780716896057, "rewards/accuracies": 1.0, "rewards/chosen": 2.6472995281219482, "rewards/margins": 6.8542938232421875, "rewards/rejected": -4.206994533538818, "step": 3789 }, { "epoch": 0.6316666666666667, "grad_norm": 46.218299865722656, "learning_rate": 6.310725664357348e-08, "logits/chosen": 2.7699971199035645, "logits/rejected": 2.986379861831665, "logps/chosen": -38.71976089477539, "logps/rejected": -436.4309997558594, "loss": 0.9541, "nll_loss": 0.9443845748901367, "rewards/accuracies": 1.0, "rewards/chosen": 1.907325029373169, "rewards/margins": 9.906558990478516, "rewards/rejected": -7.999234199523926, "step": 3790 }, { "epoch": 0.6318333333333334, "grad_norm": 29.878782272338867, "learning_rate": 6.305709055935568e-08, "logits/chosen": 2.039065361022949, "logits/rejected": 2.008406162261963, "logps/chosen": -89.78941345214844, "logps/rejected": -118.11426544189453, "loss": 1.0587, "nll_loss": 1.0440629720687866, "rewards/accuracies": 1.0, "rewards/chosen": 1.9093681573867798, "rewards/margins": 6.85311222076416, "rewards/rejected": -4.94374418258667, "step": 3791 }, { "epoch": 0.632, "grad_norm": 675.7650756835938, "learning_rate": 6.300693523941481e-08, "logits/chosen": 2.7476484775543213, "logits/rejected": 2.788630247116089, "logps/chosen": -62.90607833862305, "logps/rejected": -217.53182983398438, "loss": 0.7761, "nll_loss": 0.757904589176178, "rewards/accuracies": 1.0, "rewards/chosen": 1.2613880634307861, "rewards/margins": 8.718213081359863, "rewards/rejected": -7.456825256347656, "step": 3792 }, { "epoch": 0.6321666666666667, "grad_norm": 67.39342498779297, "learning_rate": 6.295679069836485e-08, "logits/chosen": 2.9502360820770264, "logits/rejected": 3.0174717903137207, "logps/chosen": -12.175490379333496, "logps/rejected": -79.5933837890625, "loss": 0.5619, "nll_loss": 0.5534313917160034, "rewards/accuracies": 1.0, "rewards/chosen": 2.1213009357452393, "rewards/margins": 8.990269660949707, "rewards/rejected": -6.868968963623047, "step": 3793 }, { "epoch": 0.6323333333333333, "grad_norm": 27.011932373046875, "learning_rate": 6.29066569508168e-08, "logits/chosen": 0.7185977697372437, "logits/rejected": 2.2752506732940674, "logps/chosen": -81.48097229003906, "logps/rejected": -331.99658203125, "loss": 0.9791, "nll_loss": 0.9700114727020264, "rewards/accuracies": 1.0, "rewards/chosen": 1.9783334732055664, "rewards/margins": 9.998780250549316, "rewards/rejected": -8.02044677734375, "step": 3794 }, { "epoch": 0.6325, "grad_norm": 145.8190155029297, "learning_rate": 6.285653401137836e-08, "logits/chosen": 2.984449625015259, "logits/rejected": 2.9216010570526123, "logps/chosen": -75.17401885986328, "logps/rejected": -40.57130813598633, "loss": 2.0108, "nll_loss": 1.3188424110412598, "rewards/accuracies": 1.0, "rewards/chosen": -0.5271377563476562, "rewards/margins": 0.32097935676574707, "rewards/rejected": -0.8481171131134033, "step": 3795 }, { "epoch": 0.6326666666666667, "grad_norm": 40.51074981689453, "learning_rate": 6.280642189465419e-08, "logits/chosen": 2.3133463859558105, "logits/rejected": 2.44840931892395, "logps/chosen": -14.207412719726562, "logps/rejected": -163.23455810546875, "loss": 0.5486, "nll_loss": 0.5464389324188232, "rewards/accuracies": 1.0, "rewards/chosen": 3.9564738273620605, "rewards/margins": 10.453160285949707, "rewards/rejected": -6.4966864585876465, "step": 3796 }, { "epoch": 0.6328333333333334, "grad_norm": 39.846275329589844, "learning_rate": 6.275632061524574e-08, "logits/chosen": 2.1973118782043457, "logits/rejected": 2.674656867980957, "logps/chosen": -34.325775146484375, "logps/rejected": -128.26036071777344, "loss": 0.8609, "nll_loss": 0.8581444025039673, "rewards/accuracies": 1.0, "rewards/chosen": 3.7834010124206543, "rewards/margins": 9.960161209106445, "rewards/rejected": -6.176759719848633, "step": 3797 }, { "epoch": 0.633, "grad_norm": 37.137332916259766, "learning_rate": 6.270623018775134e-08, "logits/chosen": 2.15781307220459, "logits/rejected": 2.715662717819214, "logps/chosen": -19.260913848876953, "logps/rejected": -89.80934143066406, "loss": 0.557, "nll_loss": 0.5205652117729187, "rewards/accuracies": 1.0, "rewards/chosen": 0.909445583820343, "rewards/margins": 5.277647972106934, "rewards/rejected": -4.368202209472656, "step": 3798 }, { "epoch": 0.6331666666666667, "grad_norm": 33.65772247314453, "learning_rate": 6.265615062676611e-08, "logits/chosen": 3.6152637004852295, "logits/rejected": 3.5449843406677246, "logps/chosen": -20.241012573242188, "logps/rejected": -56.85584259033203, "loss": 0.525, "nll_loss": 0.5060253143310547, "rewards/accuracies": 1.0, "rewards/chosen": 2.1104648113250732, "rewards/margins": 6.239233016967773, "rewards/rejected": -4.128768444061279, "step": 3799 }, { "epoch": 0.6333333333333333, "grad_norm": 23.16034507751465, "learning_rate": 6.260608194688207e-08, "logits/chosen": 2.7069318294525146, "logits/rejected": 2.747594118118286, "logps/chosen": -43.21650695800781, "logps/rejected": -135.96498107910156, "loss": 0.5657, "nll_loss": 0.5540578961372375, "rewards/accuracies": 1.0, "rewards/chosen": 1.7521167993545532, "rewards/margins": 8.928081512451172, "rewards/rejected": -7.175964832305908, "step": 3800 }, { "epoch": 0.6335, "grad_norm": 53.011417388916016, "learning_rate": 6.255602416268798e-08, "logits/chosen": 2.5686516761779785, "logits/rejected": 2.7023022174835205, "logps/chosen": -74.18671417236328, "logps/rejected": -143.6031036376953, "loss": 1.1344, "nll_loss": 1.090981125831604, "rewards/accuracies": 1.0, "rewards/chosen": 0.8031341433525085, "rewards/margins": 4.882408618927002, "rewards/rejected": -4.079274654388428, "step": 3801 }, { "epoch": 0.6336666666666667, "grad_norm": 77.73717498779297, "learning_rate": 6.250597728876953e-08, "logits/chosen": 0.9801225066184998, "logits/rejected": 2.3681960105895996, "logps/chosen": -54.21643829345703, "logps/rejected": -232.0074920654297, "loss": 1.4476, "nll_loss": 1.290867567062378, "rewards/accuracies": 1.0, "rewards/chosen": 1.744532823562622, "rewards/margins": 3.1496994495391846, "rewards/rejected": -1.4051666259765625, "step": 3802 }, { "epoch": 0.6338333333333334, "grad_norm": 157.2534637451172, "learning_rate": 6.245594133970913e-08, "logits/chosen": 2.8279647827148438, "logits/rejected": 2.8417112827301025, "logps/chosen": -20.16021156311035, "logps/rejected": -89.27392578125, "loss": 1.8782, "nll_loss": 1.8327465057373047, "rewards/accuracies": 1.0, "rewards/chosen": 0.434335321187973, "rewards/margins": 5.509754180908203, "rewards/rejected": -5.075418949127197, "step": 3803 }, { "epoch": 0.634, "grad_norm": 25.094648361206055, "learning_rate": 6.240591633008609e-08, "logits/chosen": 2.059955596923828, "logits/rejected": 2.685189723968506, "logps/chosen": -70.54004669189453, "logps/rejected": -174.5015869140625, "loss": 0.9022, "nll_loss": 0.8929120898246765, "rewards/accuracies": 1.0, "rewards/chosen": 2.012669324874878, "rewards/margins": 8.832930564880371, "rewards/rejected": -6.820261001586914, "step": 3804 }, { "epoch": 0.6341666666666667, "grad_norm": 27.733539581298828, "learning_rate": 6.23559022744765e-08, "logits/chosen": 3.1025190353393555, "logits/rejected": 3.076936721801758, "logps/chosen": -68.40937805175781, "logps/rejected": -163.49945068359375, "loss": 0.7798, "nll_loss": 0.7435802221298218, "rewards/accuracies": 1.0, "rewards/chosen": 1.5239982604980469, "rewards/margins": 5.044211387634277, "rewards/rejected": -3.5202133655548096, "step": 3805 }, { "epoch": 0.6343333333333333, "grad_norm": 36.359642028808594, "learning_rate": 6.230589918745322e-08, "logits/chosen": 2.3654751777648926, "logits/rejected": 2.252558708190918, "logps/chosen": -12.245828628540039, "logps/rejected": -125.51095581054688, "loss": 0.4796, "nll_loss": 0.47099339962005615, "rewards/accuracies": 1.0, "rewards/chosen": 2.322239637374878, "rewards/margins": 8.014623641967773, "rewards/rejected": -5.692383766174316, "step": 3806 }, { "epoch": 0.6345, "grad_norm": 97.8131332397461, "learning_rate": 6.225590708358596e-08, "logits/chosen": 2.6564431190490723, "logits/rejected": 2.5748777389526367, "logps/chosen": -51.295867919921875, "logps/rejected": -5.427305698394775, "loss": 1.7049, "nll_loss": 0.6332823038101196, "rewards/accuracies": 1.0, "rewards/chosen": 1.6290420293807983, "rewards/margins": 0.0991373062133789, "rewards/rejected": 1.5299047231674194, "step": 3807 }, { "epoch": 0.6346666666666667, "grad_norm": 29.945432662963867, "learning_rate": 6.220592597744121e-08, "logits/chosen": 2.3501973152160645, "logits/rejected": 2.8502466678619385, "logps/chosen": -113.21233367919922, "logps/rejected": -377.9611511230469, "loss": 1.2883, "nll_loss": 1.2865036725997925, "rewards/accuracies": 1.0, "rewards/chosen": 3.6933634281158447, "rewards/margins": 11.855250358581543, "rewards/rejected": -8.161887168884277, "step": 3808 }, { "epoch": 0.6348333333333334, "grad_norm": 53.2451057434082, "learning_rate": 6.215595588358234e-08, "logits/chosen": 1.7259081602096558, "logits/rejected": 1.7953790426254272, "logps/chosen": -11.440150260925293, "logps/rejected": -89.78714752197266, "loss": 0.4452, "nll_loss": 0.42370930314064026, "rewards/accuracies": 1.0, "rewards/chosen": 1.1365426778793335, "rewards/margins": 7.442137718200684, "rewards/rejected": -6.3055949211120605, "step": 3809 }, { "epoch": 0.635, "grad_norm": 21.756162643432617, "learning_rate": 6.210599681656931e-08, "logits/chosen": 2.2526228427886963, "logits/rejected": 2.520962953567505, "logps/chosen": -73.39302062988281, "logps/rejected": -183.2037811279297, "loss": 0.7658, "nll_loss": 0.7566290497779846, "rewards/accuracies": 1.0, "rewards/chosen": 2.2437119483947754, "rewards/margins": 7.913746356964111, "rewards/rejected": -5.670034408569336, "step": 3810 }, { "epoch": 0.6351666666666667, "grad_norm": 29.966285705566406, "learning_rate": 6.205604879095911e-08, "logits/chosen": 1.103204607963562, "logits/rejected": 1.6963847875595093, "logps/chosen": -50.46830368041992, "logps/rejected": -221.67764282226562, "loss": 0.6708, "nll_loss": 0.6470295786857605, "rewards/accuracies": 1.0, "rewards/chosen": 1.0778350830078125, "rewards/margins": 6.875186920166016, "rewards/rejected": -5.797351837158203, "step": 3811 }, { "epoch": 0.6353333333333333, "grad_norm": 22.084514617919922, "learning_rate": 6.200611182130533e-08, "logits/chosen": 1.4875062704086304, "logits/rejected": 2.3791239261627197, "logps/chosen": -30.95794105529785, "logps/rejected": -314.05816650390625, "loss": 0.4084, "nll_loss": 0.3869742751121521, "rewards/accuracies": 1.0, "rewards/chosen": 5.230660438537598, "rewards/margins": 8.555618286132812, "rewards/rejected": -3.3249573707580566, "step": 3812 }, { "epoch": 0.6355, "grad_norm": 47.06940460205078, "learning_rate": 6.195618592215843e-08, "logits/chosen": 2.5284433364868164, "logits/rejected": 2.431138515472412, "logps/chosen": -25.572338104248047, "logps/rejected": -143.75027465820312, "loss": 0.9087, "nll_loss": 0.881804883480072, "rewards/accuracies": 1.0, "rewards/chosen": 1.1120113134384155, "rewards/margins": 6.014062404632568, "rewards/rejected": -4.902050971984863, "step": 3813 }, { "epoch": 0.6356666666666667, "grad_norm": 33.076568603515625, "learning_rate": 6.190627110806559e-08, "logits/chosen": 2.6262733936309814, "logits/rejected": 2.6423184871673584, "logps/chosen": -48.12824249267578, "logps/rejected": -271.6346740722656, "loss": 0.9262, "nll_loss": 0.9255431890487671, "rewards/accuracies": 1.0, "rewards/chosen": 5.033298015594482, "rewards/margins": 12.822010040283203, "rewards/rejected": -7.7887115478515625, "step": 3814 }, { "epoch": 0.6358333333333334, "grad_norm": 62.07084655761719, "learning_rate": 6.185636739357082e-08, "logits/chosen": 2.7176756858825684, "logits/rejected": 3.037337303161621, "logps/chosen": -24.37885284423828, "logps/rejected": -254.1903076171875, "loss": 0.9838, "nll_loss": 0.9751540422439575, "rewards/accuracies": 1.0, "rewards/chosen": 2.374699354171753, "rewards/margins": 7.907963752746582, "rewards/rejected": -5.53326416015625, "step": 3815 }, { "epoch": 0.636, "grad_norm": 27.616649627685547, "learning_rate": 6.180647479321483e-08, "logits/chosen": 2.4720358848571777, "logits/rejected": 1.8012964725494385, "logps/chosen": -65.69137573242188, "logps/rejected": -55.813106536865234, "loss": 0.8935, "nll_loss": 0.8643600940704346, "rewards/accuracies": 1.0, "rewards/chosen": 1.7395668029785156, "rewards/margins": 5.446547508239746, "rewards/rejected": -3.7069807052612305, "step": 3816 }, { "epoch": 0.6361666666666667, "grad_norm": 24.771808624267578, "learning_rate": 6.175659332153517e-08, "logits/chosen": 1.4659180641174316, "logits/rejected": 1.5696606636047363, "logps/chosen": -135.6184539794922, "logps/rejected": -126.83855438232422, "loss": 0.8987, "nll_loss": 0.8638120293617249, "rewards/accuracies": 1.0, "rewards/chosen": 2.2592408657073975, "rewards/margins": 5.412928581237793, "rewards/rejected": -3.1536874771118164, "step": 3817 }, { "epoch": 0.6363333333333333, "grad_norm": 17.37641716003418, "learning_rate": 6.170672299306605e-08, "logits/chosen": 0.9952993988990784, "logits/rejected": 2.6577038764953613, "logps/chosen": -5.821516990661621, "logps/rejected": -475.5116271972656, "loss": 0.1824, "nll_loss": 0.15733829140663147, "rewards/accuracies": 1.0, "rewards/chosen": 0.9012622833251953, "rewards/margins": 9.485109329223633, "rewards/rejected": -8.583847045898438, "step": 3818 }, { "epoch": 0.6365, "grad_norm": 131.26055908203125, "learning_rate": 6.165686382233855e-08, "logits/chosen": 2.179473638534546, "logits/rejected": 2.353132724761963, "logps/chosen": -46.10264205932617, "logps/rejected": -47.176300048828125, "loss": 1.3814, "nll_loss": 0.7317880392074585, "rewards/accuracies": 1.0, "rewards/chosen": 0.766386866569519, "rewards/margins": 0.6802761554718018, "rewards/rejected": 0.0861106887459755, "step": 3819 }, { "epoch": 0.6366666666666667, "grad_norm": 36.96601104736328, "learning_rate": 6.160701582388038e-08, "logits/chosen": 2.7211363315582275, "logits/rejected": 2.616131067276001, "logps/chosen": -5.739049911499023, "logps/rejected": -53.731868743896484, "loss": 0.2788, "nll_loss": 0.23912708461284637, "rewards/accuracies": 1.0, "rewards/chosen": 1.1271356344223022, "rewards/margins": 4.899258613586426, "rewards/rejected": -3.772123098373413, "step": 3820 }, { "epoch": 0.6368333333333334, "grad_norm": 56.28815841674805, "learning_rate": 6.155717901221608e-08, "logits/chosen": 3.0172362327575684, "logits/rejected": 3.3520915508270264, "logps/chosen": -7.504687309265137, "logps/rejected": -202.55645751953125, "loss": 0.5061, "nll_loss": 0.5003125071525574, "rewards/accuracies": 1.0, "rewards/chosen": 2.501929998397827, "rewards/margins": 9.633633613586426, "rewards/rejected": -7.1317033767700195, "step": 3821 }, { "epoch": 0.637, "grad_norm": 16.073118209838867, "learning_rate": 6.150735340186689e-08, "logits/chosen": 2.431837320327759, "logits/rejected": 2.4524598121643066, "logps/chosen": -247.30369567871094, "logps/rejected": -315.3265380859375, "loss": 0.8551, "nll_loss": 0.8498408198356628, "rewards/accuracies": 1.0, "rewards/chosen": 2.5071823596954346, "rewards/margins": 12.094579696655273, "rewards/rejected": -9.587397575378418, "step": 3822 }, { "epoch": 0.6371666666666667, "grad_norm": 42.356834411621094, "learning_rate": 6.14575390073508e-08, "logits/chosen": 2.3456904888153076, "logits/rejected": 2.5925209522247314, "logps/chosen": -18.934864044189453, "logps/rejected": -432.5533752441406, "loss": 0.7291, "nll_loss": 0.7282640933990479, "rewards/accuracies": 1.0, "rewards/chosen": 4.356271266937256, "rewards/margins": 15.162639617919922, "rewards/rejected": -10.806368827819824, "step": 3823 }, { "epoch": 0.6373333333333333, "grad_norm": 20.6835880279541, "learning_rate": 6.140773584318253e-08, "logits/chosen": 2.8238487243652344, "logits/rejected": 3.089082956314087, "logps/chosen": -88.11119842529297, "logps/rejected": -217.2867431640625, "loss": 0.7703, "nll_loss": 0.7595792412757874, "rewards/accuracies": 1.0, "rewards/chosen": 1.8005897998809814, "rewards/margins": 9.760581016540527, "rewards/rejected": -7.959991455078125, "step": 3824 }, { "epoch": 0.6375, "grad_norm": 32.99528884887695, "learning_rate": 6.135794392387352e-08, "logits/chosen": 1.9839047193527222, "logits/rejected": 2.6428682804107666, "logps/chosen": -78.41770935058594, "logps/rejected": -284.2579040527344, "loss": 1.0151, "nll_loss": 1.0053552389144897, "rewards/accuracies": 1.0, "rewards/chosen": 2.1042428016662598, "rewards/margins": 7.9895501136779785, "rewards/rejected": -5.885307312011719, "step": 3825 }, { "epoch": 0.6376666666666667, "grad_norm": 107.17988586425781, "learning_rate": 6.130816326393194e-08, "logits/chosen": 2.1017870903015137, "logits/rejected": 1.8213295936584473, "logps/chosen": -20.727148056030273, "logps/rejected": -44.71311569213867, "loss": 1.2472, "nll_loss": 0.41454288363456726, "rewards/accuracies": 1.0, "rewards/chosen": 1.8069963455200195, "rewards/margins": 0.6258660554885864, "rewards/rejected": 1.181130290031433, "step": 3826 }, { "epoch": 0.6378333333333334, "grad_norm": 22.332876205444336, "learning_rate": 6.125839387786268e-08, "logits/chosen": 2.039597272872925, "logits/rejected": 2.4956791400909424, "logps/chosen": -55.606510162353516, "logps/rejected": -180.0391082763672, "loss": 0.7182, "nll_loss": 0.7038798928260803, "rewards/accuracies": 1.0, "rewards/chosen": 1.6620151996612549, "rewards/margins": 7.464699745178223, "rewards/rejected": -5.802684783935547, "step": 3827 }, { "epoch": 0.638, "grad_norm": 135.48782348632812, "learning_rate": 6.120863578016735e-08, "logits/chosen": 2.578399419784546, "logits/rejected": 2.64463210105896, "logps/chosen": -8.975082397460938, "logps/rejected": -68.27332305908203, "loss": 1.0169, "nll_loss": 0.8975082635879517, "rewards/accuracies": 1.0, "rewards/chosen": -0.18503637611865997, "rewards/margins": 3.169966220855713, "rewards/rejected": -3.3550026416778564, "step": 3828 }, { "epoch": 0.6381666666666667, "grad_norm": 36.14495849609375, "learning_rate": 6.115888898534426e-08, "logits/chosen": 2.7758002281188965, "logits/rejected": 2.9230055809020996, "logps/chosen": -39.00278091430664, "logps/rejected": -246.93646240234375, "loss": 0.8309, "nll_loss": 0.8125579357147217, "rewards/accuracies": 1.0, "rewards/chosen": 1.2312138080596924, "rewards/margins": 9.466531753540039, "rewards/rejected": -8.235318183898926, "step": 3829 }, { "epoch": 0.6383333333333333, "grad_norm": 28.983348846435547, "learning_rate": 6.110915350788845e-08, "logits/chosen": 2.48049259185791, "logits/rejected": 2.7369425296783447, "logps/chosen": -95.2072525024414, "logps/rejected": -360.42864990234375, "loss": 0.9411, "nll_loss": 0.9243422746658325, "rewards/accuracies": 1.0, "rewards/chosen": 1.320450782775879, "rewards/margins": 10.19870376586914, "rewards/rejected": -8.878252983093262, "step": 3830 }, { "epoch": 0.6385, "grad_norm": 24.343807220458984, "learning_rate": 6.105942936229161e-08, "logits/chosen": 1.9347248077392578, "logits/rejected": 1.4977900981903076, "logps/chosen": -77.82044982910156, "logps/rejected": -149.26974487304688, "loss": 0.8534, "nll_loss": 0.8367791175842285, "rewards/accuracies": 1.0, "rewards/chosen": 1.3743057250976562, "rewards/margins": 8.343414306640625, "rewards/rejected": -6.969108581542969, "step": 3831 }, { "epoch": 0.6386666666666667, "grad_norm": 41.47721481323242, "learning_rate": 6.100971656304219e-08, "logits/chosen": 2.374307870864868, "logits/rejected": 2.540707588195801, "logps/chosen": -22.491409301757812, "logps/rejected": -198.11973571777344, "loss": 0.6409, "nll_loss": 0.6078760027885437, "rewards/accuracies": 1.0, "rewards/chosen": 0.5956894159317017, "rewards/margins": 9.98703384399414, "rewards/rejected": -9.39134407043457, "step": 3832 }, { "epoch": 0.6388333333333334, "grad_norm": 20.595422744750977, "learning_rate": 6.096001512462528e-08, "logits/chosen": 2.4990599155426025, "logits/rejected": 2.4771931171417236, "logps/chosen": -60.6249885559082, "logps/rejected": -313.74981689453125, "loss": 0.6001, "nll_loss": 0.5885921716690063, "rewards/accuracies": 1.0, "rewards/chosen": 1.9183224439620972, "rewards/margins": 7.745150566101074, "rewards/rejected": -5.8268280029296875, "step": 3833 }, { "epoch": 0.639, "grad_norm": 29.88111686706543, "learning_rate": 6.091032506152274e-08, "logits/chosen": 2.255582094192505, "logits/rejected": 2.361851930618286, "logps/chosen": -20.814992904663086, "logps/rejected": -235.01495361328125, "loss": 0.5787, "nll_loss": 0.5781943202018738, "rewards/accuracies": 1.0, "rewards/chosen": 5.136048793792725, "rewards/margins": 13.879589080810547, "rewards/rejected": -8.74354076385498, "step": 3834 }, { "epoch": 0.6391666666666667, "grad_norm": 24.586774826049805, "learning_rate": 6.086064638821298e-08, "logits/chosen": 0.46873122453689575, "logits/rejected": 2.513735771179199, "logps/chosen": -15.523117065429688, "logps/rejected": -374.16033935546875, "loss": 0.3926, "nll_loss": 0.38807782530784607, "rewards/accuracies": 1.0, "rewards/chosen": 2.9153785705566406, "rewards/margins": 9.408658981323242, "rewards/rejected": -6.493279933929443, "step": 3835 }, { "epoch": 0.6393333333333333, "grad_norm": 30.590330123901367, "learning_rate": 6.081097911917124e-08, "logits/chosen": 2.8421130180358887, "logits/rejected": 2.9821808338165283, "logps/chosen": -42.16109085083008, "logps/rejected": -153.42588806152344, "loss": 0.7991, "nll_loss": 0.7807608246803284, "rewards/accuracies": 1.0, "rewards/chosen": 1.7885701656341553, "rewards/margins": 6.340703010559082, "rewards/rejected": -4.552133083343506, "step": 3836 }, { "epoch": 0.6395, "grad_norm": 25.04990577697754, "learning_rate": 6.076132326886933e-08, "logits/chosen": 2.5155112743377686, "logits/rejected": 2.50480055809021, "logps/chosen": -44.278602600097656, "logps/rejected": -99.56808471679688, "loss": 0.6631, "nll_loss": 0.641718864440918, "rewards/accuracies": 1.0, "rewards/chosen": 2.09368896484375, "rewards/margins": 6.034706115722656, "rewards/rejected": -3.9410171508789062, "step": 3837 }, { "epoch": 0.6396666666666667, "grad_norm": 33.146690368652344, "learning_rate": 6.071167885177582e-08, "logits/chosen": 1.9979206323623657, "logits/rejected": 1.4323089122772217, "logps/chosen": -54.23414993286133, "logps/rejected": -57.778255462646484, "loss": 0.9182, "nll_loss": 0.8890843987464905, "rewards/accuracies": 1.0, "rewards/chosen": 1.5343021154403687, "rewards/margins": 5.436517715454102, "rewards/rejected": -3.9022154808044434, "step": 3838 }, { "epoch": 0.6398333333333334, "grad_norm": 19.668743133544922, "learning_rate": 6.066204588235584e-08, "logits/chosen": 2.4242522716522217, "logits/rejected": 2.5061140060424805, "logps/chosen": -133.9287567138672, "logps/rejected": -285.472900390625, "loss": 0.9621, "nll_loss": 0.956633985042572, "rewards/accuracies": 1.0, "rewards/chosen": 2.644514799118042, "rewards/margins": 9.229487419128418, "rewards/rejected": -6.584972381591797, "step": 3839 }, { "epoch": 0.64, "grad_norm": 30.830307006835938, "learning_rate": 6.06124243750713e-08, "logits/chosen": 2.1925384998321533, "logits/rejected": 2.5197231769561768, "logps/chosen": -69.54293060302734, "logps/rejected": -280.5155029296875, "loss": 0.8783, "nll_loss": 0.8585547208786011, "rewards/accuracies": 1.0, "rewards/chosen": 1.3775299787521362, "rewards/margins": 6.684582710266113, "rewards/rejected": -5.3070526123046875, "step": 3840 }, { "epoch": 0.6401666666666667, "grad_norm": 22.738866806030273, "learning_rate": 6.056281434438066e-08, "logits/chosen": 1.7586183547973633, "logits/rejected": 2.2065577507019043, "logps/chosen": -31.37110710144043, "logps/rejected": -141.67025756835938, "loss": 0.4642, "nll_loss": 0.441846638917923, "rewards/accuracies": 1.0, "rewards/chosen": 1.266847848892212, "rewards/margins": 6.4166259765625, "rewards/rejected": -5.149778366088867, "step": 3841 }, { "epoch": 0.6403333333333333, "grad_norm": 138.0361785888672, "learning_rate": 6.051321580473916e-08, "logits/chosen": 2.4226739406585693, "logits/rejected": 2.369534969329834, "logps/chosen": -31.131973266601562, "logps/rejected": -21.594390869140625, "loss": 1.7917, "nll_loss": 0.4384785294532776, "rewards/accuracies": 0.0, "rewards/chosen": 1.7741680145263672, "rewards/margins": -0.3171505928039551, "rewards/rejected": 2.0913186073303223, "step": 3842 }, { "epoch": 0.6405, "grad_norm": 22.22892189025879, "learning_rate": 6.046362877059857e-08, "logits/chosen": 2.6358792781829834, "logits/rejected": 2.5651044845581055, "logps/chosen": -124.5739974975586, "logps/rejected": -42.1651611328125, "loss": 0.8939, "nll_loss": 0.8591309785842896, "rewards/accuracies": 1.0, "rewards/chosen": 2.5098304748535156, "rewards/margins": 5.584196090698242, "rewards/rejected": -3.0743656158447266, "step": 3843 }, { "epoch": 0.6406666666666667, "grad_norm": 93.91572570800781, "learning_rate": 6.041405325640738e-08, "logits/chosen": 3.193571090698242, "logits/rejected": 3.159792184829712, "logps/chosen": -86.56706237792969, "logps/rejected": -19.20050621032715, "loss": 1.6873, "nll_loss": 0.8924439549446106, "rewards/accuracies": 1.0, "rewards/chosen": 3.0249345302581787, "rewards/margins": 1.2506545782089233, "rewards/rejected": 1.7742799520492554, "step": 3844 }, { "epoch": 0.6408333333333334, "grad_norm": 37.723148345947266, "learning_rate": 6.036448927661069e-08, "logits/chosen": 2.224818706512451, "logits/rejected": 2.3018078804016113, "logps/chosen": -44.90398406982422, "logps/rejected": -101.1052017211914, "loss": 0.8335, "nll_loss": 0.7742066979408264, "rewards/accuracies": 1.0, "rewards/chosen": 1.0522743463516235, "rewards/margins": 4.169904708862305, "rewards/rejected": -3.1176302433013916, "step": 3845 }, { "epoch": 0.641, "grad_norm": 26.518972396850586, "learning_rate": 6.031493684565029e-08, "logits/chosen": 1.8120476007461548, "logits/rejected": 2.2289879322052, "logps/chosen": -55.077857971191406, "logps/rejected": -180.05152893066406, "loss": 0.7238, "nll_loss": 0.7152969837188721, "rewards/accuracies": 1.0, "rewards/chosen": 2.094306230545044, "rewards/margins": 9.133840560913086, "rewards/rejected": -7.039534568786621, "step": 3846 }, { "epoch": 0.6411666666666667, "grad_norm": 32.520851135253906, "learning_rate": 6.026539597796452e-08, "logits/chosen": 3.0233986377716064, "logits/rejected": 3.0312514305114746, "logps/chosen": -60.41160583496094, "logps/rejected": -177.93251037597656, "loss": 0.8539, "nll_loss": 0.8163731694221497, "rewards/accuracies": 1.0, "rewards/chosen": 0.5455261468887329, "rewards/margins": 6.433970928192139, "rewards/rejected": -5.888444900512695, "step": 3847 }, { "epoch": 0.6413333333333333, "grad_norm": 28.4099063873291, "learning_rate": 6.021586668798846e-08, "logits/chosen": 1.9012213945388794, "logits/rejected": 1.710695743560791, "logps/chosen": -99.692626953125, "logps/rejected": -119.82185363769531, "loss": 0.9429, "nll_loss": 0.914611279964447, "rewards/accuracies": 1.0, "rewards/chosen": 1.547776222229004, "rewards/margins": 5.4861674308776855, "rewards/rejected": -3.9383912086486816, "step": 3848 }, { "epoch": 0.6415, "grad_norm": 22.862855911254883, "learning_rate": 6.016634899015369e-08, "logits/chosen": 1.8066484928131104, "logits/rejected": 2.010709762573242, "logps/chosen": -6.086383819580078, "logps/rejected": -66.38948059082031, "loss": 0.2481, "nll_loss": 0.23409166932106018, "rewards/accuracies": 1.0, "rewards/chosen": 2.819836139678955, "rewards/margins": 6.977567672729492, "rewards/rejected": -4.157731533050537, "step": 3849 }, { "epoch": 0.6416666666666667, "grad_norm": 28.548315048217773, "learning_rate": 6.01168428988885e-08, "logits/chosen": 2.934062957763672, "logits/rejected": 2.705625057220459, "logps/chosen": -179.23056030273438, "logps/rejected": -148.42929077148438, "loss": 1.2629, "nll_loss": 1.2276064157485962, "rewards/accuracies": 1.0, "rewards/chosen": 0.7031616568565369, "rewards/margins": 5.913717746734619, "rewards/rejected": -5.2105560302734375, "step": 3850 }, { "epoch": 0.6418333333333334, "grad_norm": 36.384613037109375, "learning_rate": 6.006734842861783e-08, "logits/chosen": 0.4372067451477051, "logits/rejected": 1.7924659252166748, "logps/chosen": -64.39960479736328, "logps/rejected": -342.4298400878906, "loss": 1.077, "nll_loss": 1.0557315349578857, "rewards/accuracies": 1.0, "rewards/chosen": 1.0670830011367798, "rewards/margins": 9.853344917297363, "rewards/rejected": -8.786261558532715, "step": 3851 }, { "epoch": 0.642, "grad_norm": 26.656841278076172, "learning_rate": 6.00178655937631e-08, "logits/chosen": 3.102069616317749, "logits/rejected": 3.227875232696533, "logps/chosen": -52.18693161010742, "logps/rejected": -101.2017593383789, "loss": 0.6783, "nll_loss": 0.6442831158638, "rewards/accuracies": 1.0, "rewards/chosen": 1.4659290313720703, "rewards/margins": 5.149779319763184, "rewards/rejected": -3.6838502883911133, "step": 3852 }, { "epoch": 0.6421666666666667, "grad_norm": 28.634794235229492, "learning_rate": 5.996839440874248e-08, "logits/chosen": 0.5675894021987915, "logits/rejected": 3.0550947189331055, "logps/chosen": -61.76441955566406, "logps/rejected": -85.7847671508789, "loss": 0.8418, "nll_loss": 0.823525607585907, "rewards/accuracies": 1.0, "rewards/chosen": 1.5822755098342896, "rewards/margins": 6.553894996643066, "rewards/rejected": -4.971619606018066, "step": 3853 }, { "epoch": 0.6423333333333333, "grad_norm": 19.608232498168945, "learning_rate": 5.991893488797064e-08, "logits/chosen": 1.7444920539855957, "logits/rejected": 1.5633760690689087, "logps/chosen": -44.240562438964844, "logps/rejected": -96.88896179199219, "loss": 0.5448, "nll_loss": 0.5330188274383545, "rewards/accuracies": 1.0, "rewards/chosen": 2.037160634994507, "rewards/margins": 7.369239807128906, "rewards/rejected": -5.3320794105529785, "step": 3854 }, { "epoch": 0.6425, "grad_norm": 26.058185577392578, "learning_rate": 5.986948704585894e-08, "logits/chosen": 2.763518810272217, "logits/rejected": 2.939159870147705, "logps/chosen": -11.017431259155273, "logps/rejected": -123.84284210205078, "loss": 0.3336, "nll_loss": 0.2899324297904968, "rewards/accuracies": 1.0, "rewards/chosen": 1.3412492275238037, "rewards/margins": 4.707300186157227, "rewards/rejected": -3.3660507202148438, "step": 3855 }, { "epoch": 0.6426666666666667, "grad_norm": 23.374406814575195, "learning_rate": 5.982005089681526e-08, "logits/chosen": 1.750244140625, "logits/rejected": 1.937112808227539, "logps/chosen": -94.49858093261719, "logps/rejected": -404.79119873046875, "loss": 0.8269, "nll_loss": 0.8217266798019409, "rewards/accuracies": 1.0, "rewards/chosen": 2.509631633758545, "rewards/margins": 13.154951095581055, "rewards/rejected": -10.645318984985352, "step": 3856 }, { "epoch": 0.6428333333333334, "grad_norm": 25.252321243286133, "learning_rate": 5.977062645524414e-08, "logits/chosen": 1.3839759826660156, "logits/rejected": 2.5679609775543213, "logps/chosen": -25.171661376953125, "logps/rejected": -232.3795166015625, "loss": 0.5201, "nll_loss": 0.4935619831085205, "rewards/accuracies": 1.0, "rewards/chosen": 0.8798893690109253, "rewards/margins": 7.430797576904297, "rewards/rejected": -6.550908088684082, "step": 3857 }, { "epoch": 0.643, "grad_norm": 30.55828857421875, "learning_rate": 5.972121373554664e-08, "logits/chosen": 2.9643123149871826, "logits/rejected": 2.978330135345459, "logps/chosen": -18.51886749267578, "logps/rejected": -170.6728515625, "loss": 0.4629, "nll_loss": 0.4516797363758087, "rewards/accuracies": 1.0, "rewards/chosen": 2.019044876098633, "rewards/margins": 7.583707809448242, "rewards/rejected": -5.564662933349609, "step": 3858 }, { "epoch": 0.6431666666666667, "grad_norm": 22.88274574279785, "learning_rate": 5.967181275212046e-08, "logits/chosen": 2.3297226428985596, "logits/rejected": 2.7757928371429443, "logps/chosen": -47.23777770996094, "logps/rejected": -202.36294555664062, "loss": 0.584, "nll_loss": 0.5760705471038818, "rewards/accuracies": 1.0, "rewards/chosen": 2.1074378490448, "rewards/margins": 10.129467010498047, "rewards/rejected": -8.022028923034668, "step": 3859 }, { "epoch": 0.6433333333333333, "grad_norm": 33.0975456237793, "learning_rate": 5.962242351935984e-08, "logits/chosen": 3.169078826904297, "logits/rejected": 3.4366819858551025, "logps/chosen": -123.75663757324219, "logps/rejected": -270.19482421875, "loss": 0.9257, "nll_loss": 0.8903355598449707, "rewards/accuracies": 1.0, "rewards/chosen": 0.5621780753135681, "rewards/margins": 7.1438188552856445, "rewards/rejected": -6.581640720367432, "step": 3860 }, { "epoch": 0.6435, "grad_norm": 17.960494995117188, "learning_rate": 5.9573046051655664e-08, "logits/chosen": 2.1351723670959473, "logits/rejected": 2.6422760486602783, "logps/chosen": -104.49942016601562, "logps/rejected": -314.63287353515625, "loss": 0.7614, "nll_loss": 0.7517943382263184, "rewards/accuracies": 1.0, "rewards/chosen": 1.9289109706878662, "rewards/margins": 9.29783821105957, "rewards/rejected": -7.368927001953125, "step": 3861 }, { "epoch": 0.6436666666666667, "grad_norm": 39.70698928833008, "learning_rate": 5.952368036339528e-08, "logits/chosen": 2.85916805267334, "logits/rejected": 3.119959592819214, "logps/chosen": -23.233816146850586, "logps/rejected": -117.83154296875, "loss": 0.6527, "nll_loss": 0.6279410719871521, "rewards/accuracies": 1.0, "rewards/chosen": 1.2614200115203857, "rewards/margins": 6.02033805847168, "rewards/rejected": -4.758918285369873, "step": 3862 }, { "epoch": 0.6438333333333334, "grad_norm": 43.655433654785156, "learning_rate": 5.9474326468962734e-08, "logits/chosen": 2.706756353378296, "logits/rejected": 2.635988235473633, "logps/chosen": -39.0379638671875, "logps/rejected": -162.5998992919922, "loss": 0.7886, "nll_loss": 0.736565351486206, "rewards/accuracies": 1.0, "rewards/chosen": 0.2994484007358551, "rewards/margins": 5.185007572174072, "rewards/rejected": -4.88555908203125, "step": 3863 }, { "epoch": 0.644, "grad_norm": 26.42266845703125, "learning_rate": 5.942498438273849e-08, "logits/chosen": 2.850210666656494, "logits/rejected": 2.9206392765045166, "logps/chosen": -40.7998046875, "logps/rejected": -205.71630859375, "loss": 0.6842, "nll_loss": 0.6580613851547241, "rewards/accuracies": 1.0, "rewards/chosen": 0.941531777381897, "rewards/margins": 6.913263320922852, "rewards/rejected": -5.971731662750244, "step": 3864 }, { "epoch": 0.6441666666666667, "grad_norm": 31.9625244140625, "learning_rate": 5.9375654119099705e-08, "logits/chosen": 0.446027010679245, "logits/rejected": 1.578269600868225, "logps/chosen": -69.15742492675781, "logps/rejected": -399.4005126953125, "loss": 0.9156, "nll_loss": 0.8981483578681946, "rewards/accuracies": 1.0, "rewards/chosen": 1.2653541564941406, "rewards/margins": 11.043997764587402, "rewards/rejected": -9.778643608093262, "step": 3865 }, { "epoch": 0.6443333333333333, "grad_norm": 92.4217529296875, "learning_rate": 5.9326335692419996e-08, "logits/chosen": 2.985091209411621, "logits/rejected": 2.987797975540161, "logps/chosen": -10.099056243896484, "logps/rejected": -47.102779388427734, "loss": 0.8731, "nll_loss": 0.8415878415107727, "rewards/accuracies": 1.0, "rewards/chosen": 1.2851858139038086, "rewards/margins": 5.331486225128174, "rewards/rejected": -4.046300411224365, "step": 3866 }, { "epoch": 0.6445, "grad_norm": 27.14118194580078, "learning_rate": 5.9277029117069596e-08, "logits/chosen": 2.132631301879883, "logits/rejected": 2.3283324241638184, "logps/chosen": -83.64786529541016, "logps/rejected": -264.70550537109375, "loss": 0.966, "nll_loss": 0.961469829082489, "rewards/accuracies": 1.0, "rewards/chosen": 2.7800402641296387, "rewards/margins": 9.92779541015625, "rewards/rejected": -7.147754669189453, "step": 3867 }, { "epoch": 0.6446666666666667, "grad_norm": 40.00465393066406, "learning_rate": 5.9227734407415245e-08, "logits/chosen": 2.8148033618927, "logits/rejected": 2.844867706298828, "logps/chosen": -19.747529983520508, "logps/rejected": -159.2833709716797, "loss": 0.4841, "nll_loss": 0.4816470146179199, "rewards/accuracies": 1.0, "rewards/chosen": 4.205480098724365, "rewards/margins": 10.234533309936523, "rewards/rejected": -6.029052734375, "step": 3868 }, { "epoch": 0.6448333333333334, "grad_norm": 73.06661224365234, "learning_rate": 5.9178451577820244e-08, "logits/chosen": 2.6850426197052, "logits/rejected": 2.5900962352752686, "logps/chosen": -76.47227478027344, "logps/rejected": -11.319826126098633, "loss": 2.1656, "nll_loss": 1.0770742893218994, "rewards/accuracies": 1.0, "rewards/chosen": 4.458285808563232, "rewards/margins": 1.2072365283966064, "rewards/rejected": 3.251049280166626, "step": 3869 }, { "epoch": 0.645, "grad_norm": 34.367431640625, "learning_rate": 5.9129180642644404e-08, "logits/chosen": 3.704347848892212, "logits/rejected": 3.793835163116455, "logps/chosen": -54.01871109008789, "logps/rejected": -247.05857849121094, "loss": 0.8934, "nll_loss": 0.8712695240974426, "rewards/accuracies": 1.0, "rewards/chosen": 1.3575496673583984, "rewards/margins": 6.254199028015137, "rewards/rejected": -4.896649360656738, "step": 3870 }, { "epoch": 0.6451666666666667, "grad_norm": 21.674955368041992, "learning_rate": 5.907992161624411e-08, "logits/chosen": 1.7718579769134521, "logits/rejected": 2.2817764282226562, "logps/chosen": -21.651639938354492, "logps/rejected": -67.2081298828125, "loss": 0.4132, "nll_loss": 0.3608607351779938, "rewards/accuracies": 1.0, "rewards/chosen": 1.7364667654037476, "rewards/margins": 4.579131603240967, "rewards/rejected": -2.8426647186279297, "step": 3871 }, { "epoch": 0.6453333333333333, "grad_norm": 28.758821487426758, "learning_rate": 5.903067451297228e-08, "logits/chosen": 2.4972472190856934, "logits/rejected": 2.5159974098205566, "logps/chosen": -102.11259460449219, "logps/rejected": -185.52288818359375, "loss": 1.0689, "nll_loss": 1.0527070760726929, "rewards/accuracies": 1.0, "rewards/chosen": 1.7009233236312866, "rewards/margins": 6.771159648895264, "rewards/rejected": -5.0702362060546875, "step": 3872 }, { "epoch": 0.6455, "grad_norm": 40.71930694580078, "learning_rate": 5.89814393471783e-08, "logits/chosen": 2.210775375366211, "logits/rejected": 2.4334511756896973, "logps/chosen": -49.24195098876953, "logps/rejected": -300.770751953125, "loss": 1.0848, "nll_loss": 1.047701120376587, "rewards/accuracies": 1.0, "rewards/chosen": 0.5712906122207642, "rewards/margins": 6.271849632263184, "rewards/rejected": -5.700559139251709, "step": 3873 }, { "epoch": 0.6456666666666667, "grad_norm": 16.042882919311523, "learning_rate": 5.8932216133208144e-08, "logits/chosen": 2.59482479095459, "logits/rejected": 2.8662047386169434, "logps/chosen": -99.84705352783203, "logps/rejected": -325.1875, "loss": 0.605, "nll_loss": 0.5978866219520569, "rewards/accuracies": 1.0, "rewards/chosen": 2.234206438064575, "rewards/margins": 9.992714881896973, "rewards/rejected": -7.758508205413818, "step": 3874 }, { "epoch": 0.6458333333333334, "grad_norm": 21.99032211303711, "learning_rate": 5.8883004885404254e-08, "logits/chosen": 2.878743886947632, "logits/rejected": 3.0801761150360107, "logps/chosen": -90.02838134765625, "logps/rejected": -229.01182556152344, "loss": 0.8539, "nll_loss": 0.8493242859840393, "rewards/accuracies": 1.0, "rewards/chosen": 2.6624557971954346, "rewards/margins": 11.05882453918457, "rewards/rejected": -8.396368980407715, "step": 3875 }, { "epoch": 0.646, "grad_norm": 29.149494171142578, "learning_rate": 5.883380561810563e-08, "logits/chosen": 2.753544330596924, "logits/rejected": 2.7180285453796387, "logps/chosen": -33.11655807495117, "logps/rejected": -121.69796752929688, "loss": 0.6645, "nll_loss": 0.6493443250656128, "rewards/accuracies": 1.0, "rewards/chosen": 1.470110297203064, "rewards/margins": 8.369852066040039, "rewards/rejected": -6.899742126464844, "step": 3876 }, { "epoch": 0.6461666666666667, "grad_norm": 22.593769073486328, "learning_rate": 5.878461834564772e-08, "logits/chosen": 1.6503493785858154, "logits/rejected": 1.8352667093276978, "logps/chosen": -59.65134048461914, "logps/rejected": -156.50267028808594, "loss": 0.7292, "nll_loss": 0.718690812587738, "rewards/accuracies": 1.0, "rewards/chosen": 1.9625682830810547, "rewards/margins": 8.073604583740234, "rewards/rejected": -6.1110358238220215, "step": 3877 }, { "epoch": 0.6463333333333333, "grad_norm": 18.38722038269043, "learning_rate": 5.873544308236257e-08, "logits/chosen": 2.377246856689453, "logits/rejected": 2.2135097980499268, "logps/chosen": -222.74014282226562, "logps/rejected": -117.33924865722656, "loss": 0.9828, "nll_loss": 0.9642431139945984, "rewards/accuracies": 1.0, "rewards/chosen": 2.5340821743011475, "rewards/margins": 6.427191734313965, "rewards/rejected": -3.8931093215942383, "step": 3878 }, { "epoch": 0.6465, "grad_norm": 50.64334487915039, "learning_rate": 5.868627984257861e-08, "logits/chosen": 3.1421568393707275, "logits/rejected": 3.1783175468444824, "logps/chosen": -13.393096923828125, "logps/rejected": -68.53119659423828, "loss": 0.4572, "nll_loss": 0.4464365839958191, "rewards/accuracies": 1.0, "rewards/chosen": 2.465101718902588, "rewards/margins": 7.301731586456299, "rewards/rejected": -4.836629867553711, "step": 3879 }, { "epoch": 0.6466666666666666, "grad_norm": 26.00762176513672, "learning_rate": 5.863712864062088e-08, "logits/chosen": 1.857787847518921, "logits/rejected": 2.342785358428955, "logps/chosen": -57.409095764160156, "logps/rejected": -278.78271484375, "loss": 0.7708, "nll_loss": 0.7654545307159424, "rewards/accuracies": 1.0, "rewards/chosen": 2.473187208175659, "rewards/margins": 14.985150337219238, "rewards/rejected": -12.511962890625, "step": 3880 }, { "epoch": 0.6468333333333334, "grad_norm": 21.901927947998047, "learning_rate": 5.858798949081081e-08, "logits/chosen": 2.2183375358581543, "logits/rejected": 2.3164520263671875, "logps/chosen": -92.15768432617188, "logps/rejected": -141.4560089111328, "loss": 0.8137, "nll_loss": 0.8084006905555725, "rewards/accuracies": 1.0, "rewards/chosen": 2.5518250465393066, "rewards/margins": 10.198199272155762, "rewards/rejected": -7.646374225616455, "step": 3881 }, { "epoch": 0.647, "grad_norm": 35.7661018371582, "learning_rate": 5.853886240746642e-08, "logits/chosen": 1.248348355293274, "logits/rejected": 2.3190085887908936, "logps/chosen": -85.74493408203125, "logps/rejected": -236.31900024414062, "loss": 1.1878, "nll_loss": 1.1587153673171997, "rewards/accuracies": 1.0, "rewards/chosen": 0.8055405616760254, "rewards/margins": 6.929037570953369, "rewards/rejected": -6.123497009277344, "step": 3882 }, { "epoch": 0.6471666666666667, "grad_norm": 64.2586669921875, "learning_rate": 5.848974740490211e-08, "logits/chosen": 1.0356026887893677, "logits/rejected": 1.9040330648422241, "logps/chosen": -11.24969482421875, "logps/rejected": -255.23828125, "loss": 0.8128, "nll_loss": 0.8035497069358826, "rewards/accuracies": 1.0, "rewards/chosen": 2.564866304397583, "rewards/margins": 7.62153434753418, "rewards/rejected": -5.056668281555176, "step": 3883 }, { "epoch": 0.6473333333333333, "grad_norm": 46.31940841674805, "learning_rate": 5.844064449742887e-08, "logits/chosen": 2.412790536880493, "logits/rejected": 1.8297317028045654, "logps/chosen": -122.39909362792969, "logps/rejected": -81.01732635498047, "loss": 1.7701, "nll_loss": 1.699987530708313, "rewards/accuracies": 1.0, "rewards/chosen": 1.0031540393829346, "rewards/margins": 3.8967201709747314, "rewards/rejected": -2.893566131591797, "step": 3884 }, { "epoch": 0.6475, "grad_norm": 26.043121337890625, "learning_rate": 5.839155369935407e-08, "logits/chosen": 1.4875433444976807, "logits/rejected": 2.303602933883667, "logps/chosen": -70.39496612548828, "logps/rejected": -251.52975463867188, "loss": 0.8244, "nll_loss": 0.8185461163520813, "rewards/accuracies": 1.0, "rewards/chosen": 2.382016897201538, "rewards/margins": 12.569341659545898, "rewards/rejected": -10.187324523925781, "step": 3885 }, { "epoch": 0.6476666666666666, "grad_norm": 35.321571350097656, "learning_rate": 5.834247502498157e-08, "logits/chosen": 2.3928067684173584, "logits/rejected": 2.2409961223602295, "logps/chosen": -49.514286041259766, "logps/rejected": -118.75288391113281, "loss": 0.6472, "nll_loss": 0.6430427432060242, "rewards/accuracies": 1.0, "rewards/chosen": 3.531702756881714, "rewards/margins": 9.15372371673584, "rewards/rejected": -5.622020721435547, "step": 3886 }, { "epoch": 0.6478333333333334, "grad_norm": 30.64507293701172, "learning_rate": 5.829340848861182e-08, "logits/chosen": 2.7030489444732666, "logits/rejected": 2.8210182189941406, "logps/chosen": -59.40657043457031, "logps/rejected": -184.62754821777344, "loss": 0.8708, "nll_loss": 0.8486652970314026, "rewards/accuracies": 1.0, "rewards/chosen": 1.1744095087051392, "rewards/margins": 6.772577285766602, "rewards/rejected": -5.598167896270752, "step": 3887 }, { "epoch": 0.648, "grad_norm": 23.87752342224121, "learning_rate": 5.824435410454149e-08, "logits/chosen": 2.6606266498565674, "logits/rejected": 2.6705427169799805, "logps/chosen": -67.74983978271484, "logps/rejected": -102.8295669555664, "loss": 0.8736, "nll_loss": 0.8575928807258606, "rewards/accuracies": 1.0, "rewards/chosen": 2.3939170837402344, "rewards/margins": 6.5887627601623535, "rewards/rejected": -4.194845676422119, "step": 3888 }, { "epoch": 0.6481666666666667, "grad_norm": 27.877744674682617, "learning_rate": 5.8195311887063925e-08, "logits/chosen": 2.234175443649292, "logits/rejected": 0.7798375487327576, "logps/chosen": -231.9793701171875, "logps/rejected": -60.7857666015625, "loss": 1.1851, "nll_loss": 1.1316066980361938, "rewards/accuracies": 1.0, "rewards/chosen": 1.5229616165161133, "rewards/margins": 4.454595565795898, "rewards/rejected": -2.9316341876983643, "step": 3889 }, { "epoch": 0.6483333333333333, "grad_norm": 34.841819763183594, "learning_rate": 5.8146281850468834e-08, "logits/chosen": 2.5066401958465576, "logits/rejected": 2.502019166946411, "logps/chosen": -125.88111877441406, "logps/rejected": -329.0636291503906, "loss": 1.274, "nll_loss": 1.2221468687057495, "rewards/accuracies": 1.0, "rewards/chosen": 0.4331817924976349, "rewards/margins": 4.827282905578613, "rewards/rejected": -4.394101142883301, "step": 3890 }, { "epoch": 0.6485, "grad_norm": 48.53623962402344, "learning_rate": 5.809726400904241e-08, "logits/chosen": 2.143094778060913, "logits/rejected": 2.3646271228790283, "logps/chosen": -17.701969146728516, "logps/rejected": -111.09864044189453, "loss": 0.7722, "nll_loss": 0.7696509957313538, "rewards/accuracies": 1.0, "rewards/chosen": 3.849529981613159, "rewards/margins": 10.119274139404297, "rewards/rejected": -6.269743919372559, "step": 3891 }, { "epoch": 0.6486666666666666, "grad_norm": 54.604034423828125, "learning_rate": 5.8048258377067305e-08, "logits/chosen": 2.3390700817108154, "logits/rejected": 2.4707117080688477, "logps/chosen": -18.032005310058594, "logps/rejected": -51.447017669677734, "loss": 0.6383, "nll_loss": 0.5816775560379028, "rewards/accuracies": 1.0, "rewards/chosen": 0.8721378445625305, "rewards/margins": 4.254207134246826, "rewards/rejected": -3.3820691108703613, "step": 3892 }, { "epoch": 0.6488333333333334, "grad_norm": 27.881120681762695, "learning_rate": 5.79992649688225e-08, "logits/chosen": 0.940895676612854, "logits/rejected": 2.0494580268859863, "logps/chosen": -60.55201721191406, "logps/rejected": -298.357421875, "loss": 0.7736, "nll_loss": 0.7475557923316956, "rewards/accuracies": 1.0, "rewards/chosen": 0.9302895069122314, "rewards/margins": 7.023270606994629, "rewards/rejected": -6.092981338500977, "step": 3893 }, { "epoch": 0.649, "grad_norm": 45.15180587768555, "learning_rate": 5.795028379858354e-08, "logits/chosen": 2.373011589050293, "logits/rejected": 1.6277282238006592, "logps/chosen": -40.236045837402344, "logps/rejected": -32.31307601928711, "loss": 0.7857, "nll_loss": 0.6190160512924194, "rewards/accuracies": 1.0, "rewards/chosen": 3.274399757385254, "rewards/margins": 4.267721176147461, "rewards/rejected": -0.9933212399482727, "step": 3894 }, { "epoch": 0.6491666666666667, "grad_norm": 29.36418914794922, "learning_rate": 5.7901314880622374e-08, "logits/chosen": 0.5272322297096252, "logits/rejected": 1.2158359289169312, "logps/chosen": -74.9428482055664, "logps/rejected": -388.94989013671875, "loss": 0.9996, "nll_loss": 0.9992379546165466, "rewards/accuracies": 1.0, "rewards/chosen": 5.2496867179870605, "rewards/margins": 18.449350357055664, "rewards/rejected": -13.199664115905762, "step": 3895 }, { "epoch": 0.6493333333333333, "grad_norm": 25.228843688964844, "learning_rate": 5.7852358229207396e-08, "logits/chosen": 2.731951951980591, "logits/rejected": 2.7078356742858887, "logps/chosen": -44.97285079956055, "logps/rejected": -206.52597045898438, "loss": 0.7443, "nll_loss": 0.7253684997558594, "rewards/accuracies": 1.0, "rewards/chosen": 1.5889980792999268, "rewards/margins": 6.412167549133301, "rewards/rejected": -4.823169231414795, "step": 3896 }, { "epoch": 0.6495, "grad_norm": 43.388648986816406, "learning_rate": 5.7803413858603325e-08, "logits/chosen": 2.55400013923645, "logits/rejected": 2.4857449531555176, "logps/chosen": -32.34257507324219, "logps/rejected": -69.45225524902344, "loss": 0.6816, "nll_loss": 0.5053526759147644, "rewards/accuracies": 1.0, "rewards/chosen": 1.730085015296936, "rewards/margins": 2.993882656097412, "rewards/rejected": -1.2637977600097656, "step": 3897 }, { "epoch": 0.6496666666666666, "grad_norm": 230.2701416015625, "learning_rate": 5.77544817830714e-08, "logits/chosen": 2.0963633060455322, "logits/rejected": 2.018949508666992, "logps/chosen": -76.41607666015625, "logps/rejected": -40.71363067626953, "loss": 2.8176, "nll_loss": 0.8586075901985168, "rewards/accuracies": 0.0, "rewards/chosen": 1.4635865688323975, "rewards/margins": -1.2126059532165527, "rewards/rejected": 2.67619252204895, "step": 3898 }, { "epoch": 0.6498333333333334, "grad_norm": 51.52128219604492, "learning_rate": 5.770556201686927e-08, "logits/chosen": 3.0775938034057617, "logits/rejected": 2.972604513168335, "logps/chosen": -99.15914916992188, "logps/rejected": -119.8884048461914, "loss": 1.3159, "nll_loss": 1.3047256469726562, "rewards/accuracies": 1.0, "rewards/chosen": 2.936575412750244, "rewards/margins": 7.35972785949707, "rewards/rejected": -4.423152446746826, "step": 3899 }, { "epoch": 0.65, "grad_norm": 27.319177627563477, "learning_rate": 5.7656654574251016e-08, "logits/chosen": 1.9807361364364624, "logits/rejected": 2.351553440093994, "logps/chosen": -36.27129364013672, "logps/rejected": -108.88844299316406, "loss": 0.6552, "nll_loss": 0.6253671050071716, "rewards/accuracies": 1.0, "rewards/chosen": 0.7917472720146179, "rewards/margins": 6.7301740646362305, "rewards/rejected": -5.938426971435547, "step": 3900 }, { "epoch": 0.6501666666666667, "grad_norm": 30.34246826171875, "learning_rate": 5.760775946946701e-08, "logits/chosen": 2.9413223266601562, "logits/rejected": 2.9861271381378174, "logps/chosen": -69.41661834716797, "logps/rejected": -324.7375183105469, "loss": 0.855, "nll_loss": 0.8465441465377808, "rewards/accuracies": 1.0, "rewards/chosen": 2.254276990890503, "rewards/margins": 8.224251747131348, "rewards/rejected": -5.969974517822266, "step": 3901 }, { "epoch": 0.6503333333333333, "grad_norm": 33.82303237915039, "learning_rate": 5.7558876716764157e-08, "logits/chosen": 1.960092544555664, "logits/rejected": 2.3249146938323975, "logps/chosen": -9.918966293334961, "logps/rejected": -158.5247802734375, "loss": 0.3892, "nll_loss": 0.38149869441986084, "rewards/accuracies": 1.0, "rewards/chosen": 2.337634801864624, "rewards/margins": 8.440217971801758, "rewards/rejected": -6.102583408355713, "step": 3902 }, { "epoch": 0.6505, "grad_norm": 18.65239906311035, "learning_rate": 5.7510006330385716e-08, "logits/chosen": 2.246487855911255, "logits/rejected": 2.6576483249664307, "logps/chosen": -41.026947021484375, "logps/rejected": -317.912109375, "loss": 0.4659, "nll_loss": 0.4609769284725189, "rewards/accuracies": 1.0, "rewards/chosen": 3.542820930480957, "rewards/margins": 8.86823844909668, "rewards/rejected": -5.325417995452881, "step": 3903 }, { "epoch": 0.6506666666666666, "grad_norm": 37.30181121826172, "learning_rate": 5.746114832457139e-08, "logits/chosen": 0.5593783855438232, "logits/rejected": 1.882571816444397, "logps/chosen": -16.735910415649414, "logps/rejected": -317.90765380859375, "loss": 0.5597, "nll_loss": 0.5578638315200806, "rewards/accuracies": 1.0, "rewards/chosen": 3.572991132736206, "rewards/margins": 16.34047508239746, "rewards/rejected": -12.767484664916992, "step": 3904 }, { "epoch": 0.6508333333333334, "grad_norm": 50.58981704711914, "learning_rate": 5.7412302713557127e-08, "logits/chosen": 2.187615394592285, "logits/rejected": 2.363112688064575, "logps/chosen": -13.361191749572754, "logps/rejected": -240.4612579345703, "loss": 0.5156, "nll_loss": 0.5138919353485107, "rewards/accuracies": 1.0, "rewards/chosen": 4.185827732086182, "rewards/margins": 10.999950408935547, "rewards/rejected": -6.814122200012207, "step": 3905 }, { "epoch": 0.651, "grad_norm": 26.38105010986328, "learning_rate": 5.736346951157544e-08, "logits/chosen": 2.6354029178619385, "logits/rejected": 2.5479915142059326, "logps/chosen": -99.45557403564453, "logps/rejected": -83.63479614257812, "loss": 1.0355, "nll_loss": 1.01485276222229, "rewards/accuracies": 1.0, "rewards/chosen": 1.9529061317443848, "rewards/margins": 6.071377754211426, "rewards/rejected": -4.118471622467041, "step": 3906 }, { "epoch": 0.6511666666666667, "grad_norm": 49.01581573486328, "learning_rate": 5.731464873285512e-08, "logits/chosen": 3.385146141052246, "logits/rejected": 3.4186313152313232, "logps/chosen": -31.59433937072754, "logps/rejected": -53.776649475097656, "loss": 0.9249, "nll_loss": 0.8101113438606262, "rewards/accuracies": 1.0, "rewards/chosen": 0.9635820388793945, "rewards/margins": 3.1714560985565186, "rewards/rejected": -2.207874059677124, "step": 3907 }, { "epoch": 0.6513333333333333, "grad_norm": 29.546323776245117, "learning_rate": 5.7265840391621415e-08, "logits/chosen": 1.28713059425354, "logits/rejected": 2.093356132507324, "logps/chosen": -38.409767150878906, "logps/rejected": -177.33737182617188, "loss": 0.6613, "nll_loss": 0.6296683549880981, "rewards/accuracies": 1.0, "rewards/chosen": 0.7686607837677002, "rewards/margins": 6.318456649780273, "rewards/rejected": -5.549795627593994, "step": 3908 }, { "epoch": 0.6515, "grad_norm": 28.83964729309082, "learning_rate": 5.72170445020958e-08, "logits/chosen": 1.6197046041488647, "logits/rejected": 2.17358660697937, "logps/chosen": -29.695140838623047, "logps/rejected": -303.9852294921875, "loss": 0.4907, "nll_loss": 0.4568483233451843, "rewards/accuracies": 1.0, "rewards/chosen": 1.0873836278915405, "rewards/margins": 5.275482177734375, "rewards/rejected": -4.188098430633545, "step": 3909 }, { "epoch": 0.6516666666666666, "grad_norm": 19.427227020263672, "learning_rate": 5.716826107849633e-08, "logits/chosen": 2.0439836978912354, "logits/rejected": 2.0891754627227783, "logps/chosen": -93.60110473632812, "logps/rejected": -156.787109375, "loss": 0.675, "nll_loss": 0.6591628193855286, "rewards/accuracies": 1.0, "rewards/chosen": 1.6163040399551392, "rewards/margins": 7.028128623962402, "rewards/rejected": -5.411824703216553, "step": 3910 }, { "epoch": 0.6518333333333334, "grad_norm": 157.28196716308594, "learning_rate": 5.7119490135037295e-08, "logits/chosen": 1.922472357749939, "logits/rejected": 1.977087140083313, "logps/chosen": -50.04457473754883, "logps/rejected": -31.917251586914062, "loss": 2.3621, "nll_loss": 0.807170569896698, "rewards/accuracies": 0.0, "rewards/chosen": 1.635442852973938, "rewards/margins": -0.6448673009872437, "rewards/rejected": 2.2803101539611816, "step": 3911 }, { "epoch": 0.652, "grad_norm": 19.367046356201172, "learning_rate": 5.707073168592942e-08, "logits/chosen": 2.605222463607788, "logits/rejected": 2.549973487854004, "logps/chosen": -137.83016967773438, "logps/rejected": -181.79974365234375, "loss": 0.7961, "nll_loss": 0.7831261157989502, "rewards/accuracies": 1.0, "rewards/chosen": 1.9180634021759033, "rewards/margins": 7.192700386047363, "rewards/rejected": -5.274636745452881, "step": 3912 }, { "epoch": 0.6521666666666667, "grad_norm": 29.478788375854492, "learning_rate": 5.702198574537964e-08, "logits/chosen": 2.267803192138672, "logits/rejected": 2.6732544898986816, "logps/chosen": -82.470703125, "logps/rejected": -76.97952270507812, "loss": 0.986, "nll_loss": 0.9702434539794922, "rewards/accuracies": 1.0, "rewards/chosen": 1.8847321271896362, "rewards/margins": 6.656792163848877, "rewards/rejected": -4.772059917449951, "step": 3913 }, { "epoch": 0.6523333333333333, "grad_norm": 42.67396926879883, "learning_rate": 5.6973252327591426e-08, "logits/chosen": 1.641068696975708, "logits/rejected": 2.1927566528320312, "logps/chosen": -93.0291748046875, "logps/rejected": -196.59107971191406, "loss": 1.2217, "nll_loss": 1.1775845289230347, "rewards/accuracies": 1.0, "rewards/chosen": 0.310455322265625, "rewards/margins": 6.831932544708252, "rewards/rejected": -6.521477222442627, "step": 3914 }, { "epoch": 0.6525, "grad_norm": 35.28463363647461, "learning_rate": 5.69245314467645e-08, "logits/chosen": 1.359647274017334, "logits/rejected": 2.244417667388916, "logps/chosen": -39.259117126464844, "logps/rejected": -229.91323852539062, "loss": 0.7886, "nll_loss": 0.7697865962982178, "rewards/accuracies": 1.0, "rewards/chosen": 1.1872237920761108, "rewards/margins": 9.89509105682373, "rewards/rejected": -8.707867622375488, "step": 3915 }, { "epoch": 0.6526666666666666, "grad_norm": 20.548551559448242, "learning_rate": 5.6875823117095025e-08, "logits/chosen": 2.2296502590179443, "logits/rejected": 2.669992446899414, "logps/chosen": -155.38380432128906, "logps/rejected": -567.9022216796875, "loss": 0.8947, "nll_loss": 0.8828625082969666, "rewards/accuracies": 1.0, "rewards/chosen": 1.6624374389648438, "rewards/margins": 17.108646392822266, "rewards/rejected": -15.446209907531738, "step": 3916 }, { "epoch": 0.6528333333333334, "grad_norm": 27.80133056640625, "learning_rate": 5.6827127352775327e-08, "logits/chosen": 2.436774492263794, "logits/rejected": 2.3181402683258057, "logps/chosen": -67.16921997070312, "logps/rejected": -104.17874145507812, "loss": 0.8376, "nll_loss": 0.8191368579864502, "rewards/accuracies": 1.0, "rewards/chosen": 2.3713624477386475, "rewards/margins": 6.366378307342529, "rewards/rejected": -3.995015859603882, "step": 3917 }, { "epoch": 0.653, "grad_norm": 23.555206298828125, "learning_rate": 5.677844416799423e-08, "logits/chosen": 2.0739166736602783, "logits/rejected": 2.6202306747436523, "logps/chosen": -60.33967208862305, "logps/rejected": -85.79007720947266, "loss": 0.734, "nll_loss": 0.7269840240478516, "rewards/accuracies": 1.0, "rewards/chosen": 2.6002273559570312, "rewards/margins": 8.281594276428223, "rewards/rejected": -5.681366920471191, "step": 3918 }, { "epoch": 0.6531666666666667, "grad_norm": 36.23146057128906, "learning_rate": 5.672977357693687e-08, "logits/chosen": 3.35128116607666, "logits/rejected": 3.3451297283172607, "logps/chosen": -76.31168365478516, "logps/rejected": -155.3719482421875, "loss": 1.0897, "nll_loss": 1.045365571975708, "rewards/accuracies": 1.0, "rewards/chosen": 1.1329933404922485, "rewards/margins": 4.67812442779541, "rewards/rejected": -3.545131206512451, "step": 3919 }, { "epoch": 0.6533333333333333, "grad_norm": 34.494781494140625, "learning_rate": 5.66811155937847e-08, "logits/chosen": 2.30713152885437, "logits/rejected": 2.1484215259552, "logps/chosen": -77.91888427734375, "logps/rejected": -142.62863159179688, "loss": 0.9817, "nll_loss": 0.9387816190719604, "rewards/accuracies": 1.0, "rewards/chosen": 0.8653938174247742, "rewards/margins": 4.843164443969727, "rewards/rejected": -3.9777705669403076, "step": 3920 }, { "epoch": 0.6535, "grad_norm": 88.40047454833984, "learning_rate": 5.6632470232715425e-08, "logits/chosen": 2.25274658203125, "logits/rejected": 1.7497414350509644, "logps/chosen": -294.86328125, "logps/rejected": -116.88911437988281, "loss": 1.144, "nll_loss": 1.100236177444458, "rewards/accuracies": 1.0, "rewards/chosen": 0.9807251691818237, "rewards/margins": 4.737559795379639, "rewards/rejected": -3.7568345069885254, "step": 3921 }, { "epoch": 0.6536666666666666, "grad_norm": 40.07252883911133, "learning_rate": 5.6583837507903167e-08, "logits/chosen": 2.3644776344299316, "logits/rejected": 2.5484418869018555, "logps/chosen": -9.011290550231934, "logps/rejected": -118.93538665771484, "loss": 0.3686, "nll_loss": 0.3465881049633026, "rewards/accuracies": 1.0, "rewards/chosen": 2.324249267578125, "rewards/margins": 6.0861382484436035, "rewards/rejected": -3.7618889808654785, "step": 3922 }, { "epoch": 0.6538333333333334, "grad_norm": 28.829011917114258, "learning_rate": 5.653521743351832e-08, "logits/chosen": 2.4495649337768555, "logits/rejected": 2.6544392108917236, "logps/chosen": -50.37268829345703, "logps/rejected": -322.7972717285156, "loss": 0.7236, "nll_loss": 0.7196097373962402, "rewards/accuracies": 1.0, "rewards/chosen": 2.76509165763855, "rewards/margins": 17.105449676513672, "rewards/rejected": -14.340357780456543, "step": 3923 }, { "epoch": 0.654, "grad_norm": 60.602481842041016, "learning_rate": 5.648661002372768e-08, "logits/chosen": 2.5211341381073, "logits/rejected": 2.7898318767547607, "logps/chosen": -51.64168930053711, "logps/rejected": -375.1836853027344, "loss": 1.8485, "nll_loss": 1.844346046447754, "rewards/accuracies": 1.0, "rewards/chosen": 2.739393949508667, "rewards/margins": 13.253640174865723, "rewards/rejected": -10.514245986938477, "step": 3924 }, { "epoch": 0.6541666666666667, "grad_norm": 30.48443603515625, "learning_rate": 5.643801529269419e-08, "logits/chosen": 2.4502227306365967, "logits/rejected": 2.496091604232788, "logps/chosen": -37.240570068359375, "logps/rejected": -216.46319580078125, "loss": 0.7629, "nll_loss": 0.760011613368988, "rewards/accuracies": 1.0, "rewards/chosen": 4.042115211486816, "rewards/margins": 9.91383171081543, "rewards/rejected": -5.871716499328613, "step": 3925 }, { "epoch": 0.6543333333333333, "grad_norm": 164.09762573242188, "learning_rate": 5.6389433254577213e-08, "logits/chosen": 2.3300440311431885, "logits/rejected": 2.4130096435546875, "logps/chosen": -48.87627410888672, "logps/rejected": -37.327003479003906, "loss": 2.9314, "nll_loss": 1.9550504684448242, "rewards/accuracies": 0.0, "rewards/chosen": -1.6834824085235596, "rewards/margins": -0.2572319507598877, "rewards/rejected": -1.4262504577636719, "step": 3926 }, { "epoch": 0.6545, "grad_norm": 24.779970169067383, "learning_rate": 5.634086392353239e-08, "logits/chosen": 2.869215488433838, "logits/rejected": 2.8179216384887695, "logps/chosen": -58.968971252441406, "logps/rejected": -190.4784698486328, "loss": 0.7288, "nll_loss": 0.7191337943077087, "rewards/accuracies": 1.0, "rewards/chosen": 1.9441423416137695, "rewards/margins": 8.826301574707031, "rewards/rejected": -6.882159233093262, "step": 3927 }, { "epoch": 0.6546666666666666, "grad_norm": 29.82697296142578, "learning_rate": 5.629230731371171e-08, "logits/chosen": 2.4214248657226562, "logits/rejected": 2.528141975402832, "logps/chosen": -56.26768112182617, "logps/rejected": -212.46917724609375, "loss": 0.7101, "nll_loss": 0.6946626901626587, "rewards/accuracies": 1.0, "rewards/chosen": 1.5544445514678955, "rewards/margins": 7.365753173828125, "rewards/rejected": -5.811308860778809, "step": 3928 }, { "epoch": 0.6548333333333334, "grad_norm": 16.75943374633789, "learning_rate": 5.624376343926332e-08, "logits/chosen": 1.2223327159881592, "logits/rejected": 1.3125125169754028, "logps/chosen": -291.34173583984375, "logps/rejected": -311.8480529785156, "loss": 0.8769, "nll_loss": 0.8645155429840088, "rewards/accuracies": 1.0, "rewards/chosen": 1.607696533203125, "rewards/margins": 16.193744659423828, "rewards/rejected": -14.586047172546387, "step": 3929 }, { "epoch": 0.655, "grad_norm": 38.876678466796875, "learning_rate": 5.6195232314331765e-08, "logits/chosen": 1.7164702415466309, "logits/rejected": 2.239981174468994, "logps/chosen": -18.37712860107422, "logps/rejected": -283.1114501953125, "loss": 0.7695, "nll_loss": 0.7657137513160706, "rewards/accuracies": 1.0, "rewards/chosen": 2.9176464080810547, "rewards/margins": 10.378499984741211, "rewards/rejected": -7.460853576660156, "step": 3930 }, { "epoch": 0.6551666666666667, "grad_norm": 32.495975494384766, "learning_rate": 5.6146713953057865e-08, "logits/chosen": 2.342482328414917, "logits/rejected": 2.6004223823547363, "logps/chosen": -24.73754119873047, "logps/rejected": -334.10198974609375, "loss": 0.5858, "nll_loss": 0.5622169971466064, "rewards/accuracies": 1.0, "rewards/chosen": 0.9715698957443237, "rewards/margins": 8.007092475891113, "rewards/rejected": -7.0355224609375, "step": 3931 }, { "epoch": 0.6553333333333333, "grad_norm": 338.1222229003906, "learning_rate": 5.60982083695787e-08, "logits/chosen": 2.8255093097686768, "logits/rejected": 2.485807180404663, "logps/chosen": -173.32925415039062, "logps/rejected": -157.48306274414062, "loss": 2.9435, "nll_loss": 1.699306607246399, "rewards/accuracies": 1.0, "rewards/chosen": -4.700122356414795, "rewards/margins": 0.08332347869873047, "rewards/rejected": -4.783445835113525, "step": 3932 }, { "epoch": 0.6555, "grad_norm": 94.130859375, "learning_rate": 5.604971557802769e-08, "logits/chosen": 2.610201597213745, "logits/rejected": 2.850094795227051, "logps/chosen": -26.980243682861328, "logps/rejected": -259.9711608886719, "loss": 1.1316, "nll_loss": 1.1241767406463623, "rewards/accuracies": 1.0, "rewards/chosen": 2.3196773529052734, "rewards/margins": 8.645060539245605, "rewards/rejected": -6.325383186340332, "step": 3933 }, { "epoch": 0.6556666666666666, "grad_norm": 25.794376373291016, "learning_rate": 5.6001235592534366e-08, "logits/chosen": 1.0252662897109985, "logits/rejected": 1.8685349225997925, "logps/chosen": -70.59115600585938, "logps/rejected": -182.6490478515625, "loss": 0.7757, "nll_loss": 0.7672950625419617, "rewards/accuracies": 1.0, "rewards/chosen": 2.034491777420044, "rewards/margins": 10.128362655639648, "rewards/rejected": -8.093871116638184, "step": 3934 }, { "epoch": 0.6558333333333334, "grad_norm": 26.1448917388916, "learning_rate": 5.595276842722468e-08, "logits/chosen": 2.9335055351257324, "logits/rejected": 3.0521059036254883, "logps/chosen": -42.64140319824219, "logps/rejected": -43.780479431152344, "loss": 0.6439, "nll_loss": 0.5922417640686035, "rewards/accuracies": 1.0, "rewards/chosen": 1.6912751197814941, "rewards/margins": 4.583479404449463, "rewards/rejected": -2.8922042846679688, "step": 3935 }, { "epoch": 0.656, "grad_norm": 28.746421813964844, "learning_rate": 5.590431409622081e-08, "logits/chosen": 1.4763915538787842, "logits/rejected": 2.270012855529785, "logps/chosen": -29.114660263061523, "logps/rejected": -304.1072082519531, "loss": 0.4802, "nll_loss": 0.4479178488254547, "rewards/accuracies": 1.0, "rewards/chosen": 1.1454317569732666, "rewards/margins": 5.345727920532227, "rewards/rejected": -4.200295925140381, "step": 3936 }, { "epoch": 0.6561666666666667, "grad_norm": 596.0192260742188, "learning_rate": 5.58558726136412e-08, "logits/chosen": 2.099325180053711, "logits/rejected": 1.0307412147521973, "logps/chosen": -386.396240234375, "logps/rejected": -272.75762939453125, "loss": 3.5155, "nll_loss": 1.2710403203964233, "rewards/accuracies": 0.0, "rewards/chosen": -7.030072212219238, "rewards/margins": -1.0251283645629883, "rewards/rejected": -6.00494384765625, "step": 3937 }, { "epoch": 0.6563333333333333, "grad_norm": 23.855426788330078, "learning_rate": 5.580744399360049e-08, "logits/chosen": 0.4160650670528412, "logits/rejected": 1.4664536714553833, "logps/chosen": -56.3505859375, "logps/rejected": -359.48577880859375, "loss": 0.7254, "nll_loss": 0.7043823003768921, "rewards/accuracies": 1.0, "rewards/chosen": 1.0523666143417358, "rewards/margins": 12.201434135437012, "rewards/rejected": -11.149067878723145, "step": 3938 }, { "epoch": 0.6565, "grad_norm": 28.626935958862305, "learning_rate": 5.5759028250209614e-08, "logits/chosen": 2.678004503250122, "logits/rejected": 2.5621511936187744, "logps/chosen": -50.90278244018555, "logps/rejected": -93.13836669921875, "loss": 0.7823, "nll_loss": 0.771254301071167, "rewards/accuracies": 1.0, "rewards/chosen": 1.790114402770996, "rewards/margins": 8.846729278564453, "rewards/rejected": -7.056614875793457, "step": 3939 }, { "epoch": 0.6566666666666666, "grad_norm": 42.55340576171875, "learning_rate": 5.57106253975758e-08, "logits/chosen": 2.728410005569458, "logits/rejected": 2.6674981117248535, "logps/chosen": -19.820192337036133, "logps/rejected": -134.3792266845703, "loss": 0.7671, "nll_loss": 0.7623150944709778, "rewards/accuracies": 1.0, "rewards/chosen": 3.2768707275390625, "rewards/margins": 8.874212265014648, "rewards/rejected": -5.597341060638428, "step": 3940 }, { "epoch": 0.6568333333333334, "grad_norm": 125.26925659179688, "learning_rate": 5.566223544980251e-08, "logits/chosen": 2.8173105716705322, "logits/rejected": 2.832333564758301, "logps/chosen": -81.81526947021484, "logps/rejected": -83.68451690673828, "loss": 1.774, "nll_loss": 1.3412339687347412, "rewards/accuracies": 1.0, "rewards/chosen": -2.239520311355591, "rewards/margins": 1.6264214515686035, "rewards/rejected": -3.8659417629241943, "step": 3941 }, { "epoch": 0.657, "grad_norm": 27.904712677001953, "learning_rate": 5.561385842098929e-08, "logits/chosen": 2.2937533855438232, "logits/rejected": 2.298276424407959, "logps/chosen": -102.96589660644531, "logps/rejected": -81.44700622558594, "loss": 1.0602, "nll_loss": 1.0296589136123657, "rewards/accuracies": 1.0, "rewards/chosen": 1.4623695611953735, "rewards/margins": 5.345325946807861, "rewards/rejected": -3.8829562664031982, "step": 3942 }, { "epoch": 0.6571666666666667, "grad_norm": 38.75846862792969, "learning_rate": 5.5565494325232144e-08, "logits/chosen": 2.6805620193481445, "logits/rejected": 2.6676621437072754, "logps/chosen": -122.57530212402344, "logps/rejected": -120.258056640625, "loss": 1.1943, "nll_loss": 1.1673836708068848, "rewards/accuracies": 1.0, "rewards/chosen": 1.0101791620254517, "rewards/margins": 6.208342552185059, "rewards/rejected": -5.1981635093688965, "step": 3943 }, { "epoch": 0.6573333333333333, "grad_norm": 23.92027473449707, "learning_rate": 5.5517143176623174e-08, "logits/chosen": 3.0000734329223633, "logits/rejected": 2.8619561195373535, "logps/chosen": -105.34927368164062, "logps/rejected": -78.52603149414062, "loss": 0.9057, "nll_loss": 0.8852880597114563, "rewards/accuracies": 1.0, "rewards/chosen": 2.210498094558716, "rewards/margins": 6.153457164764404, "rewards/rejected": -3.9429590702056885, "step": 3944 }, { "epoch": 0.6575, "grad_norm": 35.08087158203125, "learning_rate": 5.546880498925078e-08, "logits/chosen": 2.4606916904449463, "logits/rejected": 2.6790268421173096, "logps/chosen": -88.69261169433594, "logps/rejected": -191.9861297607422, "loss": 1.1153, "nll_loss": 1.0816171169281006, "rewards/accuracies": 1.0, "rewards/chosen": 0.5768699645996094, "rewards/margins": 7.743740081787109, "rewards/rejected": -7.1668701171875, "step": 3945 }, { "epoch": 0.6576666666666666, "grad_norm": 25.8604679107666, "learning_rate": 5.5420479777199494e-08, "logits/chosen": 2.448451280593872, "logits/rejected": 2.4713234901428223, "logps/chosen": -151.34384155273438, "logps/rejected": -246.55007934570312, "loss": 1.2708, "nll_loss": 1.26119863986969, "rewards/accuracies": 1.0, "rewards/chosen": 1.902786374092102, "rewards/margins": 9.666122436523438, "rewards/rejected": -7.763336181640625, "step": 3946 }, { "epoch": 0.6578333333333334, "grad_norm": 26.115432739257812, "learning_rate": 5.537216755455014e-08, "logits/chosen": 2.7851521968841553, "logits/rejected": 2.704110860824585, "logps/chosen": -18.604045867919922, "logps/rejected": -128.11814880371094, "loss": 0.4142, "nll_loss": 0.39583081007003784, "rewards/accuracies": 1.0, "rewards/chosen": 1.7648801803588867, "rewards/margins": 6.338088035583496, "rewards/rejected": -4.573207855224609, "step": 3947 }, { "epoch": 0.658, "grad_norm": 33.465171813964844, "learning_rate": 5.532386833537976e-08, "logits/chosen": 2.535316228866577, "logits/rejected": 2.458423614501953, "logps/chosen": -74.4886474609375, "logps/rejected": -92.72723388671875, "loss": 0.926, "nll_loss": 0.8867697715759277, "rewards/accuracies": 1.0, "rewards/chosen": 0.5933891534805298, "rewards/margins": 5.624111175537109, "rewards/rejected": -5.030722141265869, "step": 3948 }, { "epoch": 0.6581666666666667, "grad_norm": 13.401338577270508, "learning_rate": 5.52755821337616e-08, "logits/chosen": 2.465916156768799, "logits/rejected": 2.5452938079833984, "logps/chosen": -106.65090942382812, "logps/rejected": -275.35546875, "loss": 0.5118, "nll_loss": 0.5078614354133606, "rewards/accuracies": 1.0, "rewards/chosen": 2.8290557861328125, "rewards/margins": 10.878021240234375, "rewards/rejected": -8.048965454101562, "step": 3949 }, { "epoch": 0.6583333333333333, "grad_norm": 20.83664894104004, "learning_rate": 5.5227308963765054e-08, "logits/chosen": 1.1278624534606934, "logits/rejected": 3.192683458328247, "logps/chosen": -80.9668197631836, "logps/rejected": -475.3850402832031, "loss": 0.7895, "nll_loss": 0.7860854864120483, "rewards/accuracies": 1.0, "rewards/chosen": 2.9115519523620605, "rewards/margins": 13.11911392211914, "rewards/rejected": -10.207562446594238, "step": 3950 }, { "epoch": 0.6585, "grad_norm": 24.32452964782715, "learning_rate": 5.5179048839455764e-08, "logits/chosen": 2.5527713298797607, "logits/rejected": 2.646540641784668, "logps/chosen": -61.01824188232422, "logps/rejected": -167.82806396484375, "loss": 0.7211, "nll_loss": 0.7095145583152771, "rewards/accuracies": 1.0, "rewards/chosen": 2.2574074268341064, "rewards/margins": 7.192263603210449, "rewards/rejected": -4.934855937957764, "step": 3951 }, { "epoch": 0.6586666666666666, "grad_norm": 44.22125244140625, "learning_rate": 5.5130801774895616e-08, "logits/chosen": 1.7844105958938599, "logits/rejected": 1.6694040298461914, "logps/chosen": -107.10008239746094, "logps/rejected": -90.92137145996094, "loss": 1.1521, "nll_loss": 1.1156257390975952, "rewards/accuracies": 1.0, "rewards/chosen": 1.141992211341858, "rewards/margins": 5.061789035797119, "rewards/rejected": -3.9197967052459717, "step": 3952 }, { "epoch": 0.6588333333333334, "grad_norm": 30.00531005859375, "learning_rate": 5.508256778414262e-08, "logits/chosen": 2.492103338241577, "logits/rejected": 2.7550208568573, "logps/chosen": -40.59758758544922, "logps/rejected": -203.55567932128906, "loss": 0.5816, "nll_loss": 0.5561313629150391, "rewards/accuracies": 1.0, "rewards/chosen": 0.8510963916778564, "rewards/margins": 9.653799057006836, "rewards/rejected": -8.802702903747559, "step": 3953 }, { "epoch": 0.659, "grad_norm": 54.31302261352539, "learning_rate": 5.503434688125104e-08, "logits/chosen": 2.3631539344787598, "logits/rejected": 2.567596435546875, "logps/chosen": -41.163551330566406, "logps/rejected": -58.62720489501953, "loss": 1.4988, "nll_loss": 1.470126986503601, "rewards/accuracies": 1.0, "rewards/chosen": 1.6512119770050049, "rewards/margins": 5.465823173522949, "rewards/rejected": -3.8146111965179443, "step": 3954 }, { "epoch": 0.6591666666666667, "grad_norm": 31.79372787475586, "learning_rate": 5.498613908027121e-08, "logits/chosen": 1.8164477348327637, "logits/rejected": 1.8926151990890503, "logps/chosen": -76.54360961914062, "logps/rejected": -89.82945251464844, "loss": 0.9937, "nll_loss": 0.9449827671051025, "rewards/accuracies": 1.0, "rewards/chosen": 1.121307373046875, "rewards/margins": 4.508434295654297, "rewards/rejected": -3.387126922607422, "step": 3955 }, { "epoch": 0.6593333333333333, "grad_norm": 32.88855743408203, "learning_rate": 5.4937944395249794e-08, "logits/chosen": 2.2247259616851807, "logits/rejected": 2.5221526622772217, "logps/chosen": -68.86565399169922, "logps/rejected": -272.72113037109375, "loss": 0.9079, "nll_loss": 0.8943592309951782, "rewards/accuracies": 1.0, "rewards/chosen": 1.5520530939102173, "rewards/margins": 8.861989974975586, "rewards/rejected": -7.309937000274658, "step": 3956 }, { "epoch": 0.6595, "grad_norm": 28.29782485961914, "learning_rate": 5.4889762840229526e-08, "logits/chosen": 0.41195717453956604, "logits/rejected": 2.145754337310791, "logps/chosen": -19.46745491027832, "logps/rejected": -374.395263671875, "loss": 0.4527, "nll_loss": 0.4424421489238739, "rewards/accuracies": 1.0, "rewards/chosen": 1.796568512916565, "rewards/margins": 15.897005081176758, "rewards/rejected": -14.100436210632324, "step": 3957 }, { "epoch": 0.6596666666666666, "grad_norm": 32.80600357055664, "learning_rate": 5.4841594429249416e-08, "logits/chosen": 2.106823444366455, "logits/rejected": 2.1921226978302, "logps/chosen": -75.40316009521484, "logps/rejected": -130.96205139160156, "loss": 1.1236, "nll_loss": 1.0771878957748413, "rewards/accuracies": 1.0, "rewards/chosen": 1.5222389698028564, "rewards/margins": 4.667028427124023, "rewards/rejected": -3.144789218902588, "step": 3958 }, { "epoch": 0.6598333333333334, "grad_norm": 63.41715621948242, "learning_rate": 5.47934391763445e-08, "logits/chosen": 2.82135009765625, "logits/rejected": 2.9587812423706055, "logps/chosen": -30.726163864135742, "logps/rejected": -79.87320709228516, "loss": 1.0484, "nll_loss": 1.0242055654525757, "rewards/accuracies": 1.0, "rewards/chosen": 2.0927751064300537, "rewards/margins": 5.852384567260742, "rewards/rejected": -3.7596092224121094, "step": 3959 }, { "epoch": 0.66, "grad_norm": 65.02674102783203, "learning_rate": 5.4745297095546114e-08, "logits/chosen": 2.174142360687256, "logits/rejected": 2.1128365993499756, "logps/chosen": -65.20057678222656, "logps/rejected": -30.33499526977539, "loss": 1.2036, "nll_loss": 0.7581461668014526, "rewards/accuracies": 1.0, "rewards/chosen": 1.6787049770355225, "rewards/margins": 1.6360582113265991, "rewards/rejected": 0.04264679178595543, "step": 3960 }, { "epoch": 0.6601666666666667, "grad_norm": 159.08395385742188, "learning_rate": 5.469716820088168e-08, "logits/chosen": 2.712049961090088, "logits/rejected": 2.7326364517211914, "logps/chosen": -19.81452178955078, "logps/rejected": -17.866718292236328, "loss": 3.842, "nll_loss": 0.6391781568527222, "rewards/accuracies": 0.0, "rewards/chosen": 1.405592441558838, "rewards/margins": -2.619410991668701, "rewards/rejected": 4.025003433227539, "step": 3961 }, { "epoch": 0.6603333333333333, "grad_norm": 29.08290672302246, "learning_rate": 5.464905250637486e-08, "logits/chosen": 3.140087127685547, "logits/rejected": 3.1090457439422607, "logps/chosen": -12.431184768676758, "logps/rejected": -46.21530532836914, "loss": 0.3641, "nll_loss": 0.33597803115844727, "rewards/accuracies": 1.0, "rewards/chosen": 2.2666547298431396, "rewards/margins": 5.722161293029785, "rewards/rejected": -3.4555065631866455, "step": 3962 }, { "epoch": 0.6605, "grad_norm": 40.39992141723633, "learning_rate": 5.460095002604532e-08, "logits/chosen": 2.2122669219970703, "logits/rejected": 2.245706081390381, "logps/chosen": -10.81363296508789, "logps/rejected": -77.97875213623047, "loss": 0.4104, "nll_loss": 0.4005049169063568, "rewards/accuracies": 1.0, "rewards/chosen": 2.1709344387054443, "rewards/margins": 7.724465370178223, "rewards/rejected": -5.553531169891357, "step": 3963 }, { "epoch": 0.6606666666666666, "grad_norm": 184.7126007080078, "learning_rate": 5.4552860773909024e-08, "logits/chosen": 2.56851863861084, "logits/rejected": 2.551198720932007, "logps/chosen": -26.68979263305664, "logps/rejected": -32.20462417602539, "loss": 2.8243, "nll_loss": 0.6672447919845581, "rewards/accuracies": 0.0, "rewards/chosen": 1.5816166400909424, "rewards/margins": -1.422297716140747, "rewards/rejected": 3.0039143562316895, "step": 3964 }, { "epoch": 0.6608333333333334, "grad_norm": 25.3983097076416, "learning_rate": 5.450478476397802e-08, "logits/chosen": 1.6138136386871338, "logits/rejected": 2.394735336303711, "logps/chosen": -63.163330078125, "logps/rejected": -288.71551513671875, "loss": 0.8007, "nll_loss": 0.7995358109474182, "rewards/accuracies": 1.0, "rewards/chosen": 4.250814914703369, "rewards/margins": 12.034008979797363, "rewards/rejected": -7.783194065093994, "step": 3965 }, { "epoch": 0.661, "grad_norm": 33.6633415222168, "learning_rate": 5.4456722010260534e-08, "logits/chosen": 1.9873512983322144, "logits/rejected": 2.3831093311309814, "logps/chosen": -19.58095932006836, "logps/rejected": -234.94056701660156, "loss": 0.5454, "nll_loss": 0.5292150974273682, "rewards/accuracies": 1.0, "rewards/chosen": 1.469832420349121, "rewards/margins": 7.403750419616699, "rewards/rejected": -5.933917999267578, "step": 3966 }, { "epoch": 0.6611666666666667, "grad_norm": 22.036800384521484, "learning_rate": 5.4408672526760826e-08, "logits/chosen": 1.1389378309249878, "logits/rejected": 1.9998071193695068, "logps/chosen": -112.93724060058594, "logps/rejected": -299.03021240234375, "loss": 0.8994, "nll_loss": 0.8892695307731628, "rewards/accuracies": 1.0, "rewards/chosen": 1.8453903198242188, "rewards/margins": 9.430241584777832, "rewards/rejected": -7.584851264953613, "step": 3967 }, { "epoch": 0.6613333333333333, "grad_norm": 34.755096435546875, "learning_rate": 5.4360636327479404e-08, "logits/chosen": 1.7085785865783691, "logits/rejected": 1.7344815731048584, "logps/chosen": -56.66480255126953, "logps/rejected": -67.0085678100586, "loss": 1.0049, "nll_loss": 0.9769793152809143, "rewards/accuracies": 1.0, "rewards/chosen": 1.1258010864257812, "rewards/margins": 5.783046245574951, "rewards/rejected": -4.65724515914917, "step": 3968 }, { "epoch": 0.6615, "grad_norm": 23.591026306152344, "learning_rate": 5.4312613426412854e-08, "logits/chosen": 0.5610553026199341, "logits/rejected": 2.10068941116333, "logps/chosen": -43.297119140625, "logps/rejected": -418.92498779296875, "loss": 0.6086, "nll_loss": 0.5850962996482849, "rewards/accuracies": 1.0, "rewards/chosen": 0.9405136108398438, "rewards/margins": 9.439903259277344, "rewards/rejected": -8.4993896484375, "step": 3969 }, { "epoch": 0.6616666666666666, "grad_norm": 22.898746490478516, "learning_rate": 5.4264603837553946e-08, "logits/chosen": 2.803152084350586, "logits/rejected": 2.9073235988616943, "logps/chosen": -126.26345825195312, "logps/rejected": -199.88771057128906, "loss": 0.8853, "nll_loss": 0.8648181557655334, "rewards/accuracies": 1.0, "rewards/chosen": 1.741058349609375, "rewards/margins": 6.106611728668213, "rewards/rejected": -4.365553379058838, "step": 3970 }, { "epoch": 0.6618333333333334, "grad_norm": 27.382841110229492, "learning_rate": 5.421660757489144e-08, "logits/chosen": 2.350215435028076, "logits/rejected": 2.159635543823242, "logps/chosen": -52.7228889465332, "logps/rejected": -62.7552490234375, "loss": 0.8365, "nll_loss": 0.8237950801849365, "rewards/accuracies": 1.0, "rewards/chosen": 2.1304500102996826, "rewards/margins": 7.04128360748291, "rewards/rejected": -4.910833835601807, "step": 3971 }, { "epoch": 0.662, "grad_norm": 22.716819763183594, "learning_rate": 5.416862465241032e-08, "logits/chosen": 2.505723714828491, "logits/rejected": 2.432302713394165, "logps/chosen": -75.45672607421875, "logps/rejected": -73.67042541503906, "loss": 0.7742, "nll_loss": 0.7470963001251221, "rewards/accuracies": 1.0, "rewards/chosen": 1.8465805053710938, "rewards/margins": 5.591667175292969, "rewards/rejected": -3.745086669921875, "step": 3972 }, { "epoch": 0.6621666666666667, "grad_norm": 33.57539367675781, "learning_rate": 5.412065508409168e-08, "logits/chosen": 0.9277235269546509, "logits/rejected": 2.0669968128204346, "logps/chosen": -69.5022201538086, "logps/rejected": -358.00421142578125, "loss": 0.9994, "nll_loss": 0.9928889870643616, "rewards/accuracies": 1.0, "rewards/chosen": 2.327749729156494, "rewards/margins": 9.613758087158203, "rewards/rejected": -7.286007881164551, "step": 3973 }, { "epoch": 0.6623333333333333, "grad_norm": 119.3462142944336, "learning_rate": 5.4072698883912696e-08, "logits/chosen": 2.554072856903076, "logits/rejected": 1.6180784702301025, "logps/chosen": -140.8223114013672, "logps/rejected": -51.29476547241211, "loss": 2.3318, "nll_loss": 1.8529253005981445, "rewards/accuracies": 1.0, "rewards/chosen": 0.41309815645217896, "rewards/margins": 1.037286400794983, "rewards/rejected": -0.624188244342804, "step": 3974 }, { "epoch": 0.6625, "grad_norm": 30.465316772460938, "learning_rate": 5.402475606584669e-08, "logits/chosen": 2.7884340286254883, "logits/rejected": 2.611236333847046, "logps/chosen": -97.709716796875, "logps/rejected": -37.62596893310547, "loss": 1.1602, "nll_loss": 1.0856635570526123, "rewards/accuracies": 1.0, "rewards/chosen": 1.5252374410629272, "rewards/margins": 4.009262561798096, "rewards/rejected": -2.484025239944458, "step": 3975 }, { "epoch": 0.6626666666666666, "grad_norm": 29.748966217041016, "learning_rate": 5.397682664386295e-08, "logits/chosen": 2.354356050491333, "logits/rejected": 2.7488064765930176, "logps/chosen": -22.094493865966797, "logps/rejected": -42.70612716674805, "loss": 0.6054, "nll_loss": 0.5388902425765991, "rewards/accuracies": 1.0, "rewards/chosen": 1.3464683294296265, "rewards/margins": 4.082265377044678, "rewards/rejected": -2.7357969284057617, "step": 3976 }, { "epoch": 0.6628333333333334, "grad_norm": 29.715503692626953, "learning_rate": 5.3928910631927015e-08, "logits/chosen": 2.4780681133270264, "logits/rejected": 2.695208787918091, "logps/chosen": -24.44666290283203, "logps/rejected": -177.8690185546875, "loss": 0.5985, "nll_loss": 0.5820633769035339, "rewards/accuracies": 1.0, "rewards/chosen": 1.5298165082931519, "rewards/margins": 7.0738372802734375, "rewards/rejected": -5.544020652770996, "step": 3977 }, { "epoch": 0.663, "grad_norm": 37.22258377075195, "learning_rate": 5.3881008044000483e-08, "logits/chosen": 1.9991625547409058, "logits/rejected": 2.793236017227173, "logps/chosen": -43.494163513183594, "logps/rejected": -256.072021484375, "loss": 0.8421, "nll_loss": 0.8206444978713989, "rewards/accuracies": 1.0, "rewards/chosen": 1.05120849609375, "rewards/margins": 8.629688262939453, "rewards/rejected": -7.578479766845703, "step": 3978 }, { "epoch": 0.6631666666666667, "grad_norm": 146.7761688232422, "learning_rate": 5.383311889404102e-08, "logits/chosen": 2.198087453842163, "logits/rejected": 2.953169822692871, "logps/chosen": -30.27545166015625, "logps/rejected": -99.14077758789062, "loss": 1.047, "nll_loss": 0.5045908093452454, "rewards/accuracies": 1.0, "rewards/chosen": 1.352081298828125, "rewards/margins": 1.1784805059432983, "rewards/rejected": 0.17360077798366547, "step": 3979 }, { "epoch": 0.6633333333333333, "grad_norm": 22.973979949951172, "learning_rate": 5.378524319600231e-08, "logits/chosen": 2.815352201461792, "logits/rejected": 2.7369539737701416, "logps/chosen": -61.61160659790039, "logps/rejected": -123.97914123535156, "loss": 0.6572, "nll_loss": 0.6351711750030518, "rewards/accuracies": 1.0, "rewards/chosen": 1.1424511671066284, "rewards/margins": 6.895996570587158, "rewards/rejected": -5.75354528427124, "step": 3980 }, { "epoch": 0.6635, "grad_norm": 58.232696533203125, "learning_rate": 5.373738096383422e-08, "logits/chosen": 2.4095711708068848, "logits/rejected": 2.698648452758789, "logps/chosen": -18.56998062133789, "logps/rejected": -92.00349426269531, "loss": 0.7547, "nll_loss": 0.7427991628646851, "rewards/accuracies": 1.0, "rewards/chosen": 1.9107637405395508, "rewards/margins": 7.524118900299072, "rewards/rejected": -5.6133551597595215, "step": 3981 }, { "epoch": 0.6636666666666666, "grad_norm": 45.07590866088867, "learning_rate": 5.368953221148267e-08, "logits/chosen": 3.035104751586914, "logits/rejected": 3.0991978645324707, "logps/chosen": -25.602428436279297, "logps/rejected": -81.81507110595703, "loss": 0.6687, "nll_loss": 0.6400607824325562, "rewards/accuracies": 1.0, "rewards/chosen": 0.8322601914405823, "rewards/margins": 6.659634590148926, "rewards/rejected": -5.827374458312988, "step": 3982 }, { "epoch": 0.6638333333333334, "grad_norm": 105.13916778564453, "learning_rate": 5.364169695288966e-08, "logits/chosen": 3.1366591453552246, "logits/rejected": 3.2262346744537354, "logps/chosen": -67.83769226074219, "logps/rejected": -251.11944580078125, "loss": 2.0814, "nll_loss": 1.884380578994751, "rewards/accuracies": 1.0, "rewards/chosen": -1.660589575767517, "rewards/margins": 6.620716571807861, "rewards/rejected": -8.281306266784668, "step": 3983 }, { "epoch": 0.664, "grad_norm": 24.198625564575195, "learning_rate": 5.3593875201993164e-08, "logits/chosen": 1.6370433568954468, "logits/rejected": 1.570324182510376, "logps/chosen": -64.70643615722656, "logps/rejected": -95.62648010253906, "loss": 0.7154, "nll_loss": 0.7033307552337646, "rewards/accuracies": 1.0, "rewards/chosen": 1.8013076782226562, "rewards/margins": 7.808461666107178, "rewards/rejected": -6.0071539878845215, "step": 3984 }, { "epoch": 0.6641666666666667, "grad_norm": 219.82655334472656, "learning_rate": 5.354606697272732e-08, "logits/chosen": 2.219338893890381, "logits/rejected": 2.2387278079986572, "logps/chosen": -96.16607666015625, "logps/rejected": -20.77786636352539, "loss": 3.5712, "nll_loss": 0.9336512088775635, "rewards/accuracies": 0.0, "rewards/chosen": 2.1246445178985596, "rewards/margins": -1.8576273918151855, "rewards/rejected": 3.982271909713745, "step": 3985 }, { "epoch": 0.6643333333333333, "grad_norm": 52.30521774291992, "learning_rate": 5.3498272279022315e-08, "logits/chosen": 1.1806316375732422, "logits/rejected": 1.8459845781326294, "logps/chosen": -67.4842758178711, "logps/rejected": -341.93658447265625, "loss": 1.722, "nll_loss": 1.687106728553772, "rewards/accuracies": 1.0, "rewards/chosen": 1.1126068830490112, "rewards/margins": 5.172455310821533, "rewards/rejected": -4.059848308563232, "step": 3986 }, { "epoch": 0.6645, "grad_norm": 22.108009338378906, "learning_rate": 5.3450491134804407e-08, "logits/chosen": 3.0885465145111084, "logits/rejected": 2.919889450073242, "logps/chosen": -114.42425537109375, "logps/rejected": -73.91560363769531, "loss": 0.8832, "nll_loss": 0.8734676837921143, "rewards/accuracies": 1.0, "rewards/chosen": 1.9583863019943237, "rewards/margins": 8.59245491027832, "rewards/rejected": -6.634068965911865, "step": 3987 }, { "epoch": 0.6646666666666666, "grad_norm": 31.5615234375, "learning_rate": 5.34027235539958e-08, "logits/chosen": 2.9635426998138428, "logits/rejected": 2.6773245334625244, "logps/chosen": -1.9168906211853027, "logps/rejected": -80.6231689453125, "loss": 0.1272, "nll_loss": 0.036167748272418976, "rewards/accuracies": 1.0, "rewards/chosen": 0.6425544619560242, "rewards/margins": 3.4447593688964844, "rewards/rejected": -2.8022048473358154, "step": 3988 }, { "epoch": 0.6648333333333334, "grad_norm": 20.4694766998291, "learning_rate": 5.335496955051485e-08, "logits/chosen": 2.125819444656372, "logits/rejected": 2.3350512981414795, "logps/chosen": -116.61231994628906, "logps/rejected": -244.27618408203125, "loss": 0.9229, "nll_loss": 0.9182072877883911, "rewards/accuracies": 1.0, "rewards/chosen": 2.645411729812622, "rewards/margins": 10.39997386932373, "rewards/rejected": -7.7545623779296875, "step": 3989 }, { "epoch": 0.665, "grad_norm": 24.141071319580078, "learning_rate": 5.330722913827593e-08, "logits/chosen": 2.1872637271881104, "logits/rejected": 1.989745020866394, "logps/chosen": -152.0650634765625, "logps/rejected": -146.37045288085938, "loss": 1.0742, "nll_loss": 1.0560071468353271, "rewards/accuracies": 1.0, "rewards/chosen": 1.2413619756698608, "rewards/margins": 8.252714157104492, "rewards/rejected": -7.011352062225342, "step": 3990 }, { "epoch": 0.6651666666666667, "grad_norm": 26.970033645629883, "learning_rate": 5.32595023311895e-08, "logits/chosen": 2.6638293266296387, "logits/rejected": 2.6475744247436523, "logps/chosen": -71.48393249511719, "logps/rejected": -100.15618133544922, "loss": 0.8107, "nll_loss": 0.7769991159439087, "rewards/accuracies": 1.0, "rewards/chosen": 0.8842307925224304, "rewards/margins": 5.526162624359131, "rewards/rejected": -4.641932010650635, "step": 3991 }, { "epoch": 0.6653333333333333, "grad_norm": 35.347530364990234, "learning_rate": 5.321178914316191e-08, "logits/chosen": 2.7577242851257324, "logits/rejected": 2.6979072093963623, "logps/chosen": -68.00431060791016, "logps/rejected": -105.57386779785156, "loss": 0.9242, "nll_loss": 0.8947935700416565, "rewards/accuracies": 1.0, "rewards/chosen": 0.8411621451377869, "rewards/margins": 6.3605122566223145, "rewards/rejected": -5.519350051879883, "step": 3992 }, { "epoch": 0.6655, "grad_norm": 108.89534759521484, "learning_rate": 5.3164089588095694e-08, "logits/chosen": 2.49131178855896, "logits/rejected": 2.3592796325683594, "logps/chosen": -167.947998046875, "logps/rejected": -81.61157989501953, "loss": 1.8436, "nll_loss": 1.0240732431411743, "rewards/accuracies": 1.0, "rewards/chosen": 3.1830718517303467, "rewards/margins": 1.2790480852127075, "rewards/rejected": 1.9040237665176392, "step": 3993 }, { "epoch": 0.6656666666666666, "grad_norm": 138.31326293945312, "learning_rate": 5.311640367988934e-08, "logits/chosen": 3.2483229637145996, "logits/rejected": 3.365140914916992, "logps/chosen": -40.39801025390625, "logps/rejected": -184.10107421875, "loss": 2.4142, "nll_loss": 2.3763530254364014, "rewards/accuracies": 1.0, "rewards/chosen": 0.43009114265441895, "rewards/margins": 8.20931339263916, "rewards/rejected": -7.779222011566162, "step": 3994 }, { "epoch": 0.6658333333333334, "grad_norm": 22.167016983032227, "learning_rate": 5.30687314324374e-08, "logits/chosen": 1.5287574529647827, "logits/rejected": 1.2586660385131836, "logps/chosen": -69.94288635253906, "logps/rejected": -109.02824401855469, "loss": 0.7115, "nll_loss": 0.7064939737319946, "rewards/accuracies": 1.0, "rewards/chosen": 2.8684403896331787, "rewards/margins": 9.012801170349121, "rewards/rejected": -6.144360542297363, "step": 3995 }, { "epoch": 0.666, "grad_norm": 21.541837692260742, "learning_rate": 5.3021072859630443e-08, "logits/chosen": 0.2269018441438675, "logits/rejected": 1.3138175010681152, "logps/chosen": -78.37007141113281, "logps/rejected": -416.48944091796875, "loss": 0.8275, "nll_loss": 0.8249481320381165, "rewards/accuracies": 1.0, "rewards/chosen": 3.2243165969848633, "rewards/margins": 12.703815460205078, "rewards/rejected": -9.479498863220215, "step": 3996 }, { "epoch": 0.6661666666666667, "grad_norm": 34.389915466308594, "learning_rate": 5.297342797535496e-08, "logits/chosen": 4.450456619262695, "logits/rejected": 4.505050182342529, "logps/chosen": -45.90749740600586, "logps/rejected": -46.366050720214844, "loss": 0.8351, "nll_loss": 0.7915085554122925, "rewards/accuracies": 1.0, "rewards/chosen": 1.7941211462020874, "rewards/margins": 4.870360851287842, "rewards/rejected": -3.076239824295044, "step": 3997 }, { "epoch": 0.6663333333333333, "grad_norm": 66.44316864013672, "learning_rate": 5.292579679349356e-08, "logits/chosen": 2.675166606903076, "logits/rejected": 2.37188720703125, "logps/chosen": -60.73716735839844, "logps/rejected": -74.50950622558594, "loss": 1.0515, "nll_loss": 0.8320159912109375, "rewards/accuracies": 1.0, "rewards/chosen": 2.4505691528320312, "rewards/margins": 3.205845832824707, "rewards/rejected": -0.7552765607833862, "step": 3998 }, { "epoch": 0.6665, "grad_norm": 82.84430694580078, "learning_rate": 5.287817932792484e-08, "logits/chosen": 1.49024498462677, "logits/rejected": 1.7245501279830933, "logps/chosen": -64.4453125, "logps/rejected": -113.13827514648438, "loss": 2.3096, "nll_loss": 2.222252130508423, "rewards/accuracies": 1.0, "rewards/chosen": 1.9904578924179077, "rewards/margins": 4.0891900062561035, "rewards/rejected": -2.0987319946289062, "step": 3999 }, { "epoch": 0.6666666666666666, "grad_norm": 52.39791488647461, "learning_rate": 5.283057559252341e-08, "logits/chosen": 0.8029150366783142, "logits/rejected": 2.225497007369995, "logps/chosen": -12.709869384765625, "logps/rejected": -287.182373046875, "loss": 0.5304, "nll_loss": 0.5295779705047607, "rewards/accuracies": 1.0, "rewards/chosen": 4.6253180503845215, "rewards/margins": 12.73724365234375, "rewards/rejected": -8.11192512512207, "step": 4000 }, { "epoch": 0.6668333333333333, "grad_norm": 36.99045181274414, "learning_rate": 5.278298560115978e-08, "logits/chosen": 1.5072662830352783, "logits/rejected": 1.9907939434051514, "logps/chosen": -48.56832504272461, "logps/rejected": -367.82305908203125, "loss": 0.8554, "nll_loss": 0.837384819984436, "rewards/accuracies": 1.0, "rewards/chosen": 1.266528844833374, "rewards/margins": 8.028250694274902, "rewards/rejected": -6.761722087860107, "step": 4001 }, { "epoch": 0.667, "grad_norm": 21.603710174560547, "learning_rate": 5.273540936770058e-08, "logits/chosen": 2.772169351577759, "logits/rejected": 2.784048557281494, "logps/chosen": -63.80138397216797, "logps/rejected": -93.08282470703125, "loss": 0.6348, "nll_loss": 0.6194308996200562, "rewards/accuracies": 1.0, "rewards/chosen": 4.530638217926025, "rewards/margins": 8.2742280960083, "rewards/rejected": -3.7435898780822754, "step": 4002 }, { "epoch": 0.6671666666666667, "grad_norm": 20.85251808166504, "learning_rate": 5.268784690600837e-08, "logits/chosen": 1.3759100437164307, "logits/rejected": 1.3211811780929565, "logps/chosen": -43.227630615234375, "logps/rejected": -101.80043029785156, "loss": 0.5237, "nll_loss": 0.508560299873352, "rewards/accuracies": 1.0, "rewards/chosen": 1.6871871948242188, "rewards/margins": 7.012855052947998, "rewards/rejected": -5.325667858123779, "step": 4003 }, { "epoch": 0.6673333333333333, "grad_norm": 23.954553604125977, "learning_rate": 5.264029822994176e-08, "logits/chosen": 2.911180019378662, "logits/rejected": 2.9520676136016846, "logps/chosen": -14.129655838012695, "logps/rejected": -175.0447235107422, "loss": 0.322, "nll_loss": 0.31399235129356384, "rewards/accuracies": 1.0, "rewards/chosen": 2.2733030319213867, "rewards/margins": 8.38296890258789, "rewards/rejected": -6.109666347503662, "step": 4004 }, { "epoch": 0.6675, "grad_norm": 164.9175567626953, "learning_rate": 5.2592763353355206e-08, "logits/chosen": 1.738134503364563, "logits/rejected": 1.8632123470306396, "logps/chosen": -37.902183532714844, "logps/rejected": -66.61258697509766, "loss": 1.7685, "nll_loss": 0.6891306042671204, "rewards/accuracies": 0.0, "rewards/chosen": 0.19401779770851135, "rewards/margins": -0.2760658264160156, "rewards/rejected": 0.470083624124527, "step": 4005 }, { "epoch": 0.6676666666666666, "grad_norm": 23.937332153320312, "learning_rate": 5.2545242290099265e-08, "logits/chosen": 2.153108596801758, "logits/rejected": 2.1798453330993652, "logps/chosen": -78.67781829833984, "logps/rejected": -143.39662170410156, "loss": 0.8181, "nll_loss": 0.8028348088264465, "rewards/accuracies": 1.0, "rewards/chosen": 1.610334038734436, "rewards/margins": 7.165586471557617, "rewards/rejected": -5.555252552032471, "step": 4006 }, { "epoch": 0.6678333333333333, "grad_norm": 23.806398391723633, "learning_rate": 5.2497735054020456e-08, "logits/chosen": 1.7566320896148682, "logits/rejected": 1.9418739080429077, "logps/chosen": -62.23223114013672, "logps/rejected": -102.0467758178711, "loss": 0.7154, "nll_loss": 0.676437258720398, "rewards/accuracies": 1.0, "rewards/chosen": 1.6377900838851929, "rewards/margins": 4.967990875244141, "rewards/rejected": -3.3302009105682373, "step": 4007 }, { "epoch": 0.668, "grad_norm": 32.98666763305664, "learning_rate": 5.2450241658961256e-08, "logits/chosen": 2.600799322128296, "logits/rejected": 2.482351303100586, "logps/chosen": -81.06880187988281, "logps/rejected": -115.41648864746094, "loss": 1.0555, "nll_loss": 1.0528416633605957, "rewards/accuracies": 1.0, "rewards/chosen": 4.167752742767334, "rewards/margins": 10.137147903442383, "rewards/rejected": -5.969395637512207, "step": 4008 }, { "epoch": 0.6681666666666667, "grad_norm": 33.17910385131836, "learning_rate": 5.240276211876005e-08, "logits/chosen": 2.2166316509246826, "logits/rejected": 2.261866807937622, "logps/chosen": -9.714120864868164, "logps/rejected": -155.27554321289062, "loss": 0.3741, "nll_loss": 0.35978224873542786, "rewards/accuracies": 1.0, "rewards/chosen": 1.4756028652191162, "rewards/margins": 9.107693672180176, "rewards/rejected": -7.6320905685424805, "step": 4009 }, { "epoch": 0.6683333333333333, "grad_norm": 18.177560806274414, "learning_rate": 5.235529644725125e-08, "logits/chosen": 2.826124668121338, "logits/rejected": 2.9659218788146973, "logps/chosen": -91.79733276367188, "logps/rejected": -253.51382446289062, "loss": 0.7135, "nll_loss": 0.6954342126846313, "rewards/accuracies": 1.0, "rewards/chosen": 2.261279344558716, "rewards/margins": 6.363682746887207, "rewards/rejected": -4.102403163909912, "step": 4010 }, { "epoch": 0.6685, "grad_norm": 23.590591430664062, "learning_rate": 5.2307844658265234e-08, "logits/chosen": 1.4084980487823486, "logits/rejected": 2.5968668460845947, "logps/chosen": -38.668914794921875, "logps/rejected": -261.75146484375, "loss": 0.567, "nll_loss": 0.5446326732635498, "rewards/accuracies": 1.0, "rewards/chosen": 1.0230553150177002, "rewards/margins": 8.016264915466309, "rewards/rejected": -6.9932098388671875, "step": 4011 }, { "epoch": 0.6686666666666666, "grad_norm": 49.4011116027832, "learning_rate": 5.226040676562835e-08, "logits/chosen": 2.938433885574341, "logits/rejected": 3.2987782955169678, "logps/chosen": -9.197721481323242, "logps/rejected": -576.53515625, "loss": 0.4235, "nll_loss": 0.4180782437324524, "rewards/accuracies": 1.0, "rewards/chosen": 2.4669153690338135, "rewards/margins": 10.690443992614746, "rewards/rejected": -8.223528861999512, "step": 4012 }, { "epoch": 0.6688333333333333, "grad_norm": 35.662437438964844, "learning_rate": 5.221298278316277e-08, "logits/chosen": 1.7430413961410522, "logits/rejected": 2.118990659713745, "logps/chosen": -98.21089935302734, "logps/rejected": -94.20317840576172, "loss": 1.3554, "nll_loss": 1.3271743059158325, "rewards/accuracies": 1.0, "rewards/chosen": 0.7530677914619446, "rewards/margins": 8.243432998657227, "rewards/rejected": -7.490365505218506, "step": 4013 }, { "epoch": 0.669, "grad_norm": 21.48645782470703, "learning_rate": 5.216557272468675e-08, "logits/chosen": 2.970418930053711, "logits/rejected": 3.000605821609497, "logps/chosen": -231.1256103515625, "logps/rejected": -369.447021484375, "loss": 0.8291, "nll_loss": 0.8053154945373535, "rewards/accuracies": 1.0, "rewards/chosen": 0.9206756949424744, "rewards/margins": 9.505599975585938, "rewards/rejected": -8.584924697875977, "step": 4014 }, { "epoch": 0.6691666666666667, "grad_norm": 21.887184143066406, "learning_rate": 5.2118176604014444e-08, "logits/chosen": 1.06402587890625, "logits/rejected": 1.8694840669631958, "logps/chosen": -39.486061096191406, "logps/rejected": -325.7649841308594, "loss": 0.541, "nll_loss": 0.5195533633232117, "rewards/accuracies": 1.0, "rewards/chosen": 1.5172539949417114, "rewards/margins": 6.113811492919922, "rewards/rejected": -4.5965576171875, "step": 4015 }, { "epoch": 0.6693333333333333, "grad_norm": 24.411609649658203, "learning_rate": 5.207079443495594e-08, "logits/chosen": 1.8184245824813843, "logits/rejected": 2.01993727684021, "logps/chosen": -55.50999069213867, "logps/rejected": -239.50265502929688, "loss": 0.7304, "nll_loss": 0.7209089994430542, "rewards/accuracies": 1.0, "rewards/chosen": 1.9866596460342407, "rewards/margins": 8.574400901794434, "rewards/rejected": -6.587741374969482, "step": 4016 }, { "epoch": 0.6695, "grad_norm": 36.3374137878418, "learning_rate": 5.2023426231317305e-08, "logits/chosen": 2.392627477645874, "logits/rejected": 2.3241665363311768, "logps/chosen": -16.534452438354492, "logps/rejected": -83.0517349243164, "loss": 0.543, "nll_loss": 0.5167016386985779, "rewards/accuracies": 1.0, "rewards/chosen": 1.1283272504806519, "rewards/margins": 5.977816581726074, "rewards/rejected": -4.849489212036133, "step": 4017 }, { "epoch": 0.6696666666666666, "grad_norm": 28.374900817871094, "learning_rate": 5.197607200690042e-08, "logits/chosen": 2.269791603088379, "logits/rejected": 2.1583523750305176, "logps/chosen": -17.314172744750977, "logps/rejected": -80.66703033447266, "loss": 0.4478, "nll_loss": 0.41224217414855957, "rewards/accuracies": 1.0, "rewards/chosen": 1.6430189609527588, "rewards/margins": 5.110886096954346, "rewards/rejected": -3.467867136001587, "step": 4018 }, { "epoch": 0.6698333333333333, "grad_norm": 153.2118377685547, "learning_rate": 5.192873177550321e-08, "logits/chosen": 2.163572072982788, "logits/rejected": 1.8572735786437988, "logps/chosen": -102.09367370605469, "logps/rejected": -20.11277198791504, "loss": 2.5879, "nll_loss": 1.2450449466705322, "rewards/accuracies": 0.0, "rewards/chosen": 1.1786727905273438, "rewards/margins": -0.4513978958129883, "rewards/rejected": 1.630070686340332, "step": 4019 }, { "epoch": 0.67, "grad_norm": 45.18400573730469, "learning_rate": 5.1881405550919487e-08, "logits/chosen": 3.151611089706421, "logits/rejected": 3.122645378112793, "logps/chosen": -20.51656723022461, "logps/rejected": -64.59986114501953, "loss": 0.8288, "nll_loss": 0.8206626772880554, "rewards/accuracies": 1.0, "rewards/chosen": 3.1563282012939453, "rewards/margins": 7.95659065246582, "rewards/rejected": -4.800262451171875, "step": 4020 }, { "epoch": 0.6701666666666667, "grad_norm": 21.821043014526367, "learning_rate": 5.1834093346939004e-08, "logits/chosen": 2.351195812225342, "logits/rejected": 2.4911022186279297, "logps/chosen": -28.404495239257812, "logps/rejected": -282.02178955078125, "loss": 0.4476, "nll_loss": 0.4438202679157257, "rewards/accuracies": 1.0, "rewards/chosen": 2.815246105194092, "rewards/margins": 13.181461334228516, "rewards/rejected": -10.366215705871582, "step": 4021 }, { "epoch": 0.6703333333333333, "grad_norm": 23.621606826782227, "learning_rate": 5.178679517734732e-08, "logits/chosen": 2.0162084102630615, "logits/rejected": 2.5257279872894287, "logps/chosen": -72.8369140625, "logps/rejected": -177.42337036132812, "loss": 0.7138, "nll_loss": 0.7003549337387085, "rewards/accuracies": 1.0, "rewards/chosen": 1.5512101650238037, "rewards/margins": 8.905776023864746, "rewards/rejected": -7.354565620422363, "step": 4022 }, { "epoch": 0.6705, "grad_norm": 128.91322326660156, "learning_rate": 5.1739511055926044e-08, "logits/chosen": 2.8578755855560303, "logits/rejected": 2.9317245483398438, "logps/chosen": -185.74658203125, "logps/rejected": -136.19204711914062, "loss": 1.8302, "nll_loss": 1.5225127935409546, "rewards/accuracies": 1.0, "rewards/chosen": -1.4832061529159546, "rewards/margins": 1.8855944871902466, "rewards/rejected": -3.368800640106201, "step": 4023 }, { "epoch": 0.6706666666666666, "grad_norm": 113.33643341064453, "learning_rate": 5.169224099645262e-08, "logits/chosen": 2.8310959339141846, "logits/rejected": 2.6966793537139893, "logps/chosen": -152.59603881835938, "logps/rejected": -155.9681396484375, "loss": 1.9385, "nll_loss": 1.496039628982544, "rewards/accuracies": 1.0, "rewards/chosen": -2.626800537109375, "rewards/margins": 2.005153179168701, "rewards/rejected": -4.631953716278076, "step": 4024 }, { "epoch": 0.6708333333333333, "grad_norm": 83.76585388183594, "learning_rate": 5.164498501270045e-08, "logits/chosen": 1.2405741214752197, "logits/rejected": 1.2783540487289429, "logps/chosen": -118.68804931640625, "logps/rejected": -120.21402740478516, "loss": 1.2683, "nll_loss": 0.9973785877227783, "rewards/accuracies": 1.0, "rewards/chosen": 1.9403809309005737, "rewards/margins": 2.5436394214630127, "rewards/rejected": -0.6032585501670837, "step": 4025 }, { "epoch": 0.671, "grad_norm": 20.56447982788086, "learning_rate": 5.159774311843872e-08, "logits/chosen": 2.5733907222747803, "logits/rejected": 2.5146093368530273, "logps/chosen": -32.32266616821289, "logps/rejected": -102.17993927001953, "loss": 0.4293, "nll_loss": 0.4040333330631256, "rewards/accuracies": 1.0, "rewards/chosen": 1.0147885084152222, "rewards/margins": 6.543067932128906, "rewards/rejected": -5.5282793045043945, "step": 4026 }, { "epoch": 0.6711666666666667, "grad_norm": 28.647865295410156, "learning_rate": 5.155051532743261e-08, "logits/chosen": 2.1076321601867676, "logits/rejected": 1.8816674947738647, "logps/chosen": -80.28706359863281, "logps/rejected": -149.57020568847656, "loss": 0.8819, "nll_loss": 0.8451269865036011, "rewards/accuracies": 1.0, "rewards/chosen": 0.9179001450538635, "rewards/margins": 5.192007541656494, "rewards/rejected": -4.274107456207275, "step": 4027 }, { "epoch": 0.6713333333333333, "grad_norm": 32.04511260986328, "learning_rate": 5.1503301653443165e-08, "logits/chosen": 3.9221997261047363, "logits/rejected": 3.919445037841797, "logps/chosen": -60.943359375, "logps/rejected": -87.81471252441406, "loss": 0.7129, "nll_loss": 0.669707179069519, "rewards/accuracies": 1.0, "rewards/chosen": 1.3354812860488892, "rewards/margins": 4.731618404388428, "rewards/rejected": -3.396137237548828, "step": 4028 }, { "epoch": 0.6715, "grad_norm": 26.897497177124023, "learning_rate": 5.1456102110227376e-08, "logits/chosen": 1.2315771579742432, "logits/rejected": 2.51682186126709, "logps/chosen": -81.10028076171875, "logps/rejected": -252.1204071044922, "loss": 0.8967, "nll_loss": 0.8720460534095764, "rewards/accuracies": 1.0, "rewards/chosen": 0.8979843854904175, "rewards/margins": 8.36746597290039, "rewards/rejected": -7.469481468200684, "step": 4029 }, { "epoch": 0.6716666666666666, "grad_norm": 54.12467575073242, "learning_rate": 5.1408916711537964e-08, "logits/chosen": 2.4665918350219727, "logits/rejected": 3.2510852813720703, "logps/chosen": -92.79368591308594, "logps/rejected": -242.17832946777344, "loss": 1.1772, "nll_loss": 1.066594123840332, "rewards/accuracies": 1.0, "rewards/chosen": -0.34780049324035645, "rewards/margins": 3.497875452041626, "rewards/rejected": -3.8456759452819824, "step": 4030 }, { "epoch": 0.6718333333333333, "grad_norm": 44.585357666015625, "learning_rate": 5.1361745471123664e-08, "logits/chosen": 2.8658721446990967, "logits/rejected": 2.7059054374694824, "logps/chosen": -58.786251068115234, "logps/rejected": -41.81119155883789, "loss": 1.2765, "nll_loss": 1.175724983215332, "rewards/accuracies": 1.0, "rewards/chosen": 1.9838840961456299, "rewards/margins": 3.9076642990112305, "rewards/rejected": -1.9237803220748901, "step": 4031 }, { "epoch": 0.672, "grad_norm": 27.565046310424805, "learning_rate": 5.131458840272904e-08, "logits/chosen": 2.0985941886901855, "logits/rejected": 2.4604811668395996, "logps/chosen": -29.441686630249023, "logps/rejected": -154.23928833007812, "loss": 0.5657, "nll_loss": 0.5257444977760315, "rewards/accuracies": 1.0, "rewards/chosen": 0.3927484452724457, "rewards/margins": 7.087954044342041, "rewards/rejected": -6.6952056884765625, "step": 4032 }, { "epoch": 0.6721666666666667, "grad_norm": 36.93113708496094, "learning_rate": 5.1267445520094566e-08, "logits/chosen": 2.1864919662475586, "logits/rejected": 2.070930004119873, "logps/chosen": -43.205631256103516, "logps/rejected": -86.78598022460938, "loss": 0.7307, "nll_loss": 0.7082890272140503, "rewards/accuracies": 1.0, "rewards/chosen": 1.972130298614502, "rewards/margins": 5.938931465148926, "rewards/rejected": -3.966801404953003, "step": 4033 }, { "epoch": 0.6723333333333333, "grad_norm": 24.221811294555664, "learning_rate": 5.122031683695647e-08, "logits/chosen": 1.529032588005066, "logits/rejected": 2.0260934829711914, "logps/chosen": -61.352943420410156, "logps/rejected": -387.023193359375, "loss": 0.7019, "nll_loss": 0.6893590092658997, "rewards/accuracies": 1.0, "rewards/chosen": 1.5762138366699219, "rewards/margins": 14.141949653625488, "rewards/rejected": -12.565735816955566, "step": 4034 }, { "epoch": 0.6725, "grad_norm": 38.29515075683594, "learning_rate": 5.117320236704697e-08, "logits/chosen": 2.5057709217071533, "logits/rejected": 2.6782078742980957, "logps/chosen": -26.77108383178711, "logps/rejected": -68.82093048095703, "loss": 0.7345, "nll_loss": 0.7235428094863892, "rewards/accuracies": 1.0, "rewards/chosen": 1.8592064380645752, "rewards/margins": 8.135893821716309, "rewards/rejected": -6.2766876220703125, "step": 4035 }, { "epoch": 0.6726666666666666, "grad_norm": 31.384910583496094, "learning_rate": 5.112610212409406e-08, "logits/chosen": 1.2308903932571411, "logits/rejected": 1.904624342918396, "logps/chosen": -47.444149017333984, "logps/rejected": -165.08074951171875, "loss": 0.775, "nll_loss": 0.7652280926704407, "rewards/accuracies": 1.0, "rewards/chosen": 1.9219173192977905, "rewards/margins": 8.810317993164062, "rewards/rejected": -6.888401031494141, "step": 4036 }, { "epoch": 0.6728333333333333, "grad_norm": 27.620256423950195, "learning_rate": 5.1079016121821664e-08, "logits/chosen": 2.440073251724243, "logits/rejected": 2.369194507598877, "logps/chosen": -22.35865020751953, "logps/rejected": -166.57608032226562, "loss": 0.5281, "nll_loss": 0.519968569278717, "rewards/accuracies": 1.0, "rewards/chosen": 2.523261070251465, "rewards/margins": 7.924011707305908, "rewards/rejected": -5.400750637054443, "step": 4037 }, { "epoch": 0.673, "grad_norm": 17.091135025024414, "learning_rate": 5.103194437394951e-08, "logits/chosen": 2.562493085861206, "logits/rejected": 2.739119052886963, "logps/chosen": -88.41168975830078, "logps/rejected": -283.6689147949219, "loss": 0.6153, "nll_loss": 0.6097357273101807, "rewards/accuracies": 1.0, "rewards/chosen": 2.4889779090881348, "rewards/margins": 9.878473281860352, "rewards/rejected": -7.389495849609375, "step": 4038 }, { "epoch": 0.6731666666666667, "grad_norm": 41.64443588256836, "learning_rate": 5.098488689419313e-08, "logits/chosen": 2.9439210891723633, "logits/rejected": 2.938638687133789, "logps/chosen": -107.2017593383789, "logps/rejected": -33.602783203125, "loss": 1.0578, "nll_loss": 0.9745615720748901, "rewards/accuracies": 1.0, "rewards/chosen": 0.6807839274406433, "rewards/margins": 3.59077525138855, "rewards/rejected": -2.9099912643432617, "step": 4039 }, { "epoch": 0.6733333333333333, "grad_norm": 40.634979248046875, "learning_rate": 5.0937843696263963e-08, "logits/chosen": 1.9777864217758179, "logits/rejected": 2.384831190109253, "logps/chosen": -64.52862548828125, "logps/rejected": -151.41488647460938, "loss": 1.0888, "nll_loss": 1.0754770040512085, "rewards/accuracies": 1.0, "rewards/chosen": 1.9204232692718506, "rewards/margins": 7.082667350769043, "rewards/rejected": -5.1622443199157715, "step": 4040 }, { "epoch": 0.6735, "grad_norm": 22.25633430480957, "learning_rate": 5.089081479386928e-08, "logits/chosen": 1.799907922744751, "logits/rejected": 2.2137627601623535, "logps/chosen": -98.66352844238281, "logps/rejected": -177.0295867919922, "loss": 0.9349, "nll_loss": 0.9307879209518433, "rewards/accuracies": 1.0, "rewards/chosen": 2.9782655239105225, "rewards/margins": 9.536120414733887, "rewards/rejected": -6.557855129241943, "step": 4041 }, { "epoch": 0.6736666666666666, "grad_norm": 26.487472534179688, "learning_rate": 5.08438002007122e-08, "logits/chosen": 1.6467962265014648, "logits/rejected": 2.359307289123535, "logps/chosen": -85.4183578491211, "logps/rejected": -213.51463317871094, "loss": 0.9989, "nll_loss": 0.9932367205619812, "rewards/accuracies": 1.0, "rewards/chosen": 2.5393388271331787, "rewards/margins": 9.339823722839355, "rewards/rejected": -6.800485134124756, "step": 4042 }, { "epoch": 0.6738333333333333, "grad_norm": 48.28995895385742, "learning_rate": 5.079679993049156e-08, "logits/chosen": 2.36539888381958, "logits/rejected": 2.4656994342803955, "logps/chosen": -49.73088073730469, "logps/rejected": -176.1839599609375, "loss": 1.2464, "nll_loss": 1.243272066116333, "rewards/accuracies": 1.0, "rewards/chosen": 5.041993141174316, "rewards/margins": 10.49052906036377, "rewards/rejected": -5.448535919189453, "step": 4043 }, { "epoch": 0.674, "grad_norm": 46.821746826171875, "learning_rate": 5.074981399690218e-08, "logits/chosen": 2.7345714569091797, "logits/rejected": 2.968177556991577, "logps/chosen": -55.0243034362793, "logps/rejected": -74.96967315673828, "loss": 1.2521, "nll_loss": 1.2227623462677002, "rewards/accuracies": 1.0, "rewards/chosen": 0.9530925750732422, "rewards/margins": 5.916224479675293, "rewards/rejected": -4.963131904602051, "step": 4044 }, { "epoch": 0.6741666666666667, "grad_norm": 78.75386047363281, "learning_rate": 5.0702842413634613e-08, "logits/chosen": 1.96317458152771, "logits/rejected": 2.1498358249664307, "logps/chosen": -5.938694000244141, "logps/rejected": -152.49441528320312, "loss": 0.4772, "nll_loss": 0.45682260394096375, "rewards/accuracies": 1.0, "rewards/chosen": 1.602769136428833, "rewards/margins": 6.178017616271973, "rewards/rejected": -4.575248718261719, "step": 4045 }, { "epoch": 0.6743333333333333, "grad_norm": 23.637252807617188, "learning_rate": 5.06558851943753e-08, "logits/chosen": 2.291666030883789, "logits/rejected": 2.3849592208862305, "logps/chosen": -95.19275665283203, "logps/rejected": -403.36566162109375, "loss": 1.0768, "nll_loss": 1.0695817470550537, "rewards/accuracies": 1.0, "rewards/chosen": 2.609618663787842, "rewards/margins": 8.187414169311523, "rewards/rejected": -5.577795505523682, "step": 4046 }, { "epoch": 0.6745, "grad_norm": 36.73997116088867, "learning_rate": 5.060894235280636e-08, "logits/chosen": 3.044562816619873, "logits/rejected": 2.934230327606201, "logps/chosen": -61.86579132080078, "logps/rejected": -224.8278350830078, "loss": 1.1364, "nll_loss": 1.1248325109481812, "rewards/accuracies": 1.0, "rewards/chosen": 3.3641793727874756, "rewards/margins": 7.60585880279541, "rewards/rejected": -4.241679668426514, "step": 4047 }, { "epoch": 0.6746666666666666, "grad_norm": 109.77759552001953, "learning_rate": 5.056201390260586e-08, "logits/chosen": 2.478828191757202, "logits/rejected": 2.6135921478271484, "logps/chosen": -17.236087799072266, "logps/rejected": -43.31705093383789, "loss": 0.5328, "nll_loss": 0.42039239406585693, "rewards/accuracies": 1.0, "rewards/chosen": 1.2961153984069824, "rewards/margins": 3.3532774448394775, "rewards/rejected": -2.057162046432495, "step": 4048 }, { "epoch": 0.6748333333333333, "grad_norm": 127.63375091552734, "learning_rate": 5.051509985744761e-08, "logits/chosen": 2.6524007320404053, "logits/rejected": 2.571216583251953, "logps/chosen": -33.89813232421875, "logps/rejected": -24.225975036621094, "loss": 1.7091, "nll_loss": 0.7212368845939636, "rewards/accuracies": 1.0, "rewards/chosen": 0.7633392810821533, "rewards/margins": 0.010238707065582275, "rewards/rejected": 0.753100574016571, "step": 4049 }, { "epoch": 0.675, "grad_norm": 126.14012908935547, "learning_rate": 5.046820023100128e-08, "logits/chosen": 2.307262897491455, "logits/rejected": 2.183750867843628, "logps/chosen": -108.88583374023438, "logps/rejected": -32.66032409667969, "loss": 1.9704, "nll_loss": 1.1965477466583252, "rewards/accuracies": 1.0, "rewards/chosen": 1.6348876953125, "rewards/margins": 0.7060878872871399, "rewards/rejected": 0.9287998080253601, "step": 4050 }, { "epoch": 0.6751666666666667, "grad_norm": 40.878578186035156, "learning_rate": 5.042131503693221e-08, "logits/chosen": 3.2594618797302246, "logits/rejected": 3.464229106903076, "logps/chosen": -90.87470245361328, "logps/rejected": -286.551513671875, "loss": 1.46, "nll_loss": 1.4424556493759155, "rewards/accuracies": 1.0, "rewards/chosen": 1.3296432495117188, "rewards/margins": 7.579187393188477, "rewards/rejected": -6.249544143676758, "step": 4051 }, { "epoch": 0.6753333333333333, "grad_norm": 20.11345863342285, "learning_rate": 5.0374444288901684e-08, "logits/chosen": 2.843531370162964, "logits/rejected": 2.786635398864746, "logps/chosen": -162.5689697265625, "logps/rejected": -103.93917846679688, "loss": 1.0312, "nll_loss": 1.0097453594207764, "rewards/accuracies": 1.0, "rewards/chosen": 2.876683235168457, "rewards/margins": 6.473862648010254, "rewards/rejected": -3.597179412841797, "step": 4052 }, { "epoch": 0.6755, "grad_norm": 20.33858871459961, "learning_rate": 5.032758800056669e-08, "logits/chosen": 3.0221118927001953, "logits/rejected": 3.164016008377075, "logps/chosen": -60.084129333496094, "logps/rejected": -163.93832397460938, "loss": 0.6485, "nll_loss": 0.6258763670921326, "rewards/accuracies": 1.0, "rewards/chosen": 1.0529762506484985, "rewards/margins": 7.2579240798950195, "rewards/rejected": -6.2049479484558105, "step": 4053 }, { "epoch": 0.6756666666666666, "grad_norm": 28.926109313964844, "learning_rate": 5.028074618558007e-08, "logits/chosen": 2.769500494003296, "logits/rejected": 3.1440484523773193, "logps/chosen": -87.81317138671875, "logps/rejected": -302.68902587890625, "loss": 1.0624, "nll_loss": 1.0579899549484253, "rewards/accuracies": 1.0, "rewards/chosen": 3.244123935699463, "rewards/margins": 9.055431365966797, "rewards/rejected": -5.811306953430176, "step": 4054 }, { "epoch": 0.6758333333333333, "grad_norm": 53.23044967651367, "learning_rate": 5.023391885759033e-08, "logits/chosen": 2.209930419921875, "logits/rejected": 2.452251434326172, "logps/chosen": -23.648330688476562, "logps/rejected": -229.9908447265625, "loss": 0.9339, "nll_loss": 0.9095510244369507, "rewards/accuracies": 1.0, "rewards/chosen": 1.2247127294540405, "rewards/margins": 6.082993984222412, "rewards/rejected": -4.858281135559082, "step": 4055 }, { "epoch": 0.676, "grad_norm": 27.595151901245117, "learning_rate": 5.018710603024187e-08, "logits/chosen": 2.2438671588897705, "logits/rejected": 2.3379406929016113, "logps/chosen": -61.089256286621094, "logps/rejected": -147.7286376953125, "loss": 0.6849, "nll_loss": 0.6297861337661743, "rewards/accuracies": 1.0, "rewards/chosen": 2.8793466091156006, "rewards/margins": 5.348212718963623, "rewards/rejected": -2.4688661098480225, "step": 4056 }, { "epoch": 0.6761666666666667, "grad_norm": 29.451204299926758, "learning_rate": 5.0140307717174815e-08, "logits/chosen": 2.109121322631836, "logits/rejected": 2.2075557708740234, "logps/chosen": -69.27409362792969, "logps/rejected": -84.6849136352539, "loss": 0.9295, "nll_loss": 0.8768873810768127, "rewards/accuracies": 1.0, "rewards/chosen": 1.8855667114257812, "rewards/margins": 4.673030853271484, "rewards/rejected": -2.787464141845703, "step": 4057 }, { "epoch": 0.6763333333333333, "grad_norm": 31.721696853637695, "learning_rate": 5.009352393202506e-08, "logits/chosen": 1.7870197296142578, "logits/rejected": 2.381227493286133, "logps/chosen": -94.09375, "logps/rejected": -400.6908874511719, "loss": 1.174, "nll_loss": 1.1616512537002563, "rewards/accuracies": 1.0, "rewards/chosen": 1.6805658340454102, "rewards/margins": 8.357723236083984, "rewards/rejected": -6.677157878875732, "step": 4058 }, { "epoch": 0.6765, "grad_norm": 19.179216384887695, "learning_rate": 5.004675468842435e-08, "logits/chosen": 1.0517843961715698, "logits/rejected": 2.052191972732544, "logps/chosen": -120.89530944824219, "logps/rejected": -468.360595703125, "loss": 0.9341, "nll_loss": 0.9299639463424683, "rewards/accuracies": 1.0, "rewards/chosen": 2.6971635818481445, "rewards/margins": 15.98619556427002, "rewards/rejected": -13.289031982421875, "step": 4059 }, { "epoch": 0.6766666666666666, "grad_norm": 122.86480712890625, "learning_rate": 5.000000000000002e-08, "logits/chosen": 3.0354175567626953, "logits/rejected": 2.9968202114105225, "logps/chosen": -111.76463317871094, "logps/rejected": -237.3228759765625, "loss": 2.6554, "nll_loss": 2.1914639472961426, "rewards/accuracies": 1.0, "rewards/chosen": -3.3766887187957764, "rewards/margins": 7.8114728927612305, "rewards/rejected": -11.188161849975586, "step": 4060 }, { "epoch": 0.6768333333333333, "grad_norm": 68.90092468261719, "learning_rate": 4.99532598803753e-08, "logits/chosen": 1.526311993598938, "logits/rejected": 1.869357705116272, "logps/chosen": -5.297005653381348, "logps/rejected": -124.61981964111328, "loss": 0.3987, "nll_loss": 0.37835758924484253, "rewards/accuracies": 1.0, "rewards/chosen": 1.6492595672607422, "rewards/margins": 6.155943870544434, "rewards/rejected": -4.506684303283691, "step": 4061 }, { "epoch": 0.677, "grad_norm": 30.735637664794922, "learning_rate": 4.990653434316914e-08, "logits/chosen": 1.8381181955337524, "logits/rejected": 2.2054383754730225, "logps/chosen": -96.69728088378906, "logps/rejected": -106.50366973876953, "loss": 1.053, "nll_loss": 0.9968792796134949, "rewards/accuracies": 1.0, "rewards/chosen": 0.9360244870185852, "rewards/margins": 4.259509563446045, "rewards/rejected": -3.3234848976135254, "step": 4062 }, { "epoch": 0.6771666666666667, "grad_norm": 24.673418045043945, "learning_rate": 4.9859823401996296e-08, "logits/chosen": 1.355078101158142, "logits/rejected": 2.0521867275238037, "logps/chosen": -81.49531555175781, "logps/rejected": -163.79847717285156, "loss": 0.7662, "nll_loss": 0.7476634383201599, "rewards/accuracies": 1.0, "rewards/chosen": 1.24742591381073, "rewards/margins": 7.745718002319336, "rewards/rejected": -6.498291969299316, "step": 4063 }, { "epoch": 0.6773333333333333, "grad_norm": 28.929214477539062, "learning_rate": 4.981312707046711e-08, "logits/chosen": 1.2951918840408325, "logits/rejected": 2.0350453853607178, "logps/chosen": -64.61370086669922, "logps/rejected": -306.0545654296875, "loss": 0.9615, "nll_loss": 0.9502015113830566, "rewards/accuracies": 1.0, "rewards/chosen": 5.477439880371094, "rewards/margins": 9.519405364990234, "rewards/rejected": -4.041965007781982, "step": 4064 }, { "epoch": 0.6775, "grad_norm": 30.043466567993164, "learning_rate": 4.9766445362187824e-08, "logits/chosen": 2.681950569152832, "logits/rejected": 2.7955620288848877, "logps/chosen": -103.44677734375, "logps/rejected": -231.988037109375, "loss": 1.0637, "nll_loss": 1.0344678163528442, "rewards/accuracies": 1.0, "rewards/chosen": 0.6942596435546875, "rewards/margins": 8.966174125671387, "rewards/rejected": -8.2719144821167, "step": 4065 }, { "epoch": 0.6776666666666666, "grad_norm": 40.71542739868164, "learning_rate": 4.971977829076037e-08, "logits/chosen": 2.4460372924804688, "logits/rejected": 2.5135087966918945, "logps/chosen": -29.416688919067383, "logps/rejected": -179.69796752929688, "loss": 0.8047, "nll_loss": 0.7741233110427856, "rewards/accuracies": 1.0, "rewards/chosen": 1.9198137521743774, "rewards/margins": 5.439148426055908, "rewards/rejected": -3.519334554672241, "step": 4066 }, { "epoch": 0.6778333333333333, "grad_norm": 65.16670227050781, "learning_rate": 4.967312586978245e-08, "logits/chosen": 2.3921120166778564, "logits/rejected": 2.4131178855895996, "logps/chosen": -5.924842357635498, "logps/rejected": -38.800559997558594, "loss": 0.4939, "nll_loss": 0.3949894309043884, "rewards/accuracies": 1.0, "rewards/chosen": 1.2911598682403564, "rewards/margins": 3.5218677520751953, "rewards/rejected": -2.230707883834839, "step": 4067 }, { "epoch": 0.678, "grad_norm": 102.9825210571289, "learning_rate": 4.9626488112847374e-08, "logits/chosen": 2.5817644596099854, "logits/rejected": 2.6491146087646484, "logps/chosen": -112.212646484375, "logps/rejected": -84.32997131347656, "loss": 1.3527, "nll_loss": 1.0390058755874634, "rewards/accuracies": 1.0, "rewards/chosen": 1.398241400718689, "rewards/margins": 2.023472547531128, "rewards/rejected": -0.6252312064170837, "step": 4068 }, { "epoch": 0.6781666666666667, "grad_norm": 25.428977966308594, "learning_rate": 4.957986503354432e-08, "logits/chosen": 1.7000151872634888, "logits/rejected": 2.34355092048645, "logps/chosen": -20.666507720947266, "logps/rejected": -289.25494384765625, "loss": 0.4118, "nll_loss": 0.38271307945251465, "rewards/accuracies": 1.0, "rewards/chosen": 0.7316479086875916, "rewards/margins": 7.602413654327393, "rewards/rejected": -6.870765686035156, "step": 4069 }, { "epoch": 0.6783333333333333, "grad_norm": 274.2030029296875, "learning_rate": 4.9533256645458124e-08, "logits/chosen": 2.404648542404175, "logits/rejected": 2.5674655437469482, "logps/chosen": -104.56123352050781, "logps/rejected": -199.8555908203125, "loss": 4.0793, "nll_loss": 3.1685218811035156, "rewards/accuracies": 1.0, "rewards/chosen": -3.3407983779907227, "rewards/margins": 0.3162863254547119, "rewards/rejected": -3.6570847034454346, "step": 4070 }, { "epoch": 0.6785, "grad_norm": 40.51995849609375, "learning_rate": 4.948666296216938e-08, "logits/chosen": 2.231659412384033, "logits/rejected": 2.7005932331085205, "logps/chosen": -29.998403549194336, "logps/rejected": -400.1868896484375, "loss": 0.6979, "nll_loss": 0.6976372599601746, "rewards/accuracies": 1.0, "rewards/chosen": 5.399463653564453, "rewards/margins": 17.90205192565918, "rewards/rejected": -12.502588272094727, "step": 4071 }, { "epoch": 0.6786666666666666, "grad_norm": 50.23091125488281, "learning_rate": 4.94400839972543e-08, "logits/chosen": 2.079076051712036, "logits/rejected": 2.0489492416381836, "logps/chosen": -11.006040573120117, "logps/rejected": -66.0727767944336, "loss": 0.5139, "nll_loss": 0.5002745985984802, "rewards/accuracies": 1.0, "rewards/chosen": 2.221531867980957, "rewards/margins": 6.857331275939941, "rewards/rejected": -4.635799407958984, "step": 4072 }, { "epoch": 0.6788333333333333, "grad_norm": 56.32724380493164, "learning_rate": 4.9393519764284917e-08, "logits/chosen": 2.505244731903076, "logits/rejected": 2.4015464782714844, "logps/chosen": -54.03899383544922, "logps/rejected": -30.37115478515625, "loss": 1.0176, "nll_loss": 0.659011960029602, "rewards/accuracies": 1.0, "rewards/chosen": 2.0436012744903564, "rewards/margins": 2.18730092048645, "rewards/rejected": -0.14369964599609375, "step": 4073 }, { "epoch": 0.679, "grad_norm": 34.876461029052734, "learning_rate": 4.934697027682894e-08, "logits/chosen": 2.295499563217163, "logits/rejected": 2.594449043273926, "logps/chosen": -38.52116012573242, "logps/rejected": -192.12454223632812, "loss": 0.8731, "nll_loss": 0.856025755405426, "rewards/accuracies": 1.0, "rewards/chosen": 1.2946991920471191, "rewards/margins": 8.433370590209961, "rewards/rejected": -7.138670921325684, "step": 4074 }, { "epoch": 0.6791666666666667, "grad_norm": 29.359771728515625, "learning_rate": 4.9300435548449745e-08, "logits/chosen": 2.696476936340332, "logits/rejected": 3.0835657119750977, "logps/chosen": -56.588199615478516, "logps/rejected": -281.3385314941406, "loss": 0.7456, "nll_loss": 0.7254897952079773, "rewards/accuracies": 1.0, "rewards/chosen": 1.1338138580322266, "rewards/margins": 7.9728217124938965, "rewards/rejected": -6.83900785446167, "step": 4075 }, { "epoch": 0.6793333333333333, "grad_norm": 25.234664916992188, "learning_rate": 4.925391559270651e-08, "logits/chosen": 1.2089223861694336, "logits/rejected": 2.311452627182007, "logps/chosen": -12.00304126739502, "logps/rejected": -199.89346313476562, "loss": 0.3484, "nll_loss": 0.3244064748287201, "rewards/accuracies": 1.0, "rewards/chosen": 1.062343955039978, "rewards/margins": 6.612385272979736, "rewards/rejected": -5.550041198730469, "step": 4076 }, { "epoch": 0.6795, "grad_norm": 25.19601058959961, "learning_rate": 4.9207410423153914e-08, "logits/chosen": 1.8664299249649048, "logits/rejected": 1.895819067955017, "logps/chosen": -12.583284378051758, "logps/rejected": -140.73655700683594, "loss": 0.3844, "nll_loss": 0.3700965642929077, "rewards/accuracies": 1.0, "rewards/chosen": 1.5455864667892456, "rewards/margins": 7.908204078674316, "rewards/rejected": -6.362617492675781, "step": 4077 }, { "epoch": 0.6796666666666666, "grad_norm": 31.2701358795166, "learning_rate": 4.916092005334252e-08, "logits/chosen": 2.5425140857696533, "logits/rejected": 2.6109750270843506, "logps/chosen": -41.4651985168457, "logps/rejected": -385.2104187011719, "loss": 0.8666, "nll_loss": 0.8638584613800049, "rewards/accuracies": 1.0, "rewards/chosen": 4.522130489349365, "rewards/margins": 10.259439468383789, "rewards/rejected": -5.737308502197266, "step": 4078 }, { "epoch": 0.6798333333333333, "grad_norm": 205.9020233154297, "learning_rate": 4.9114444496818495e-08, "logits/chosen": 2.5671792030334473, "logits/rejected": 2.5796196460723877, "logps/chosen": -78.505126953125, "logps/rejected": -14.956939697265625, "loss": 4.0239, "nll_loss": 1.0329622030258179, "rewards/accuracies": 0.0, "rewards/chosen": 1.8156557083129883, "rewards/margins": -2.3082756996154785, "rewards/rejected": 4.123931407928467, "step": 4079 }, { "epoch": 0.68, "grad_norm": 33.338191986083984, "learning_rate": 4.906798376712373e-08, "logits/chosen": 2.4825077056884766, "logits/rejected": 2.5720088481903076, "logps/chosen": -53.99953842163086, "logps/rejected": -150.7802276611328, "loss": 0.8772, "nll_loss": 0.843742847442627, "rewards/accuracies": 1.0, "rewards/chosen": 1.122738242149353, "rewards/margins": 5.261036396026611, "rewards/rejected": -4.138298034667969, "step": 4080 }, { "epoch": 0.6801666666666667, "grad_norm": 43.33466720581055, "learning_rate": 4.9021537877795703e-08, "logits/chosen": 2.329049825668335, "logits/rejected": 2.5664093494415283, "logps/chosen": -83.76142120361328, "logps/rejected": -404.9598693847656, "loss": 1.0422, "nll_loss": 0.9739699363708496, "rewards/accuracies": 1.0, "rewards/chosen": 2.033677816390991, "rewards/margins": 4.438600540161133, "rewards/rejected": -2.4049224853515625, "step": 4081 }, { "epoch": 0.6803333333333333, "grad_norm": 45.61968231201172, "learning_rate": 4.8975106842367656e-08, "logits/chosen": 1.3043615818023682, "logits/rejected": 1.721073865890503, "logps/chosen": -17.912498474121094, "logps/rejected": -95.17050170898438, "loss": 0.6917, "nll_loss": 0.6397320628166199, "rewards/accuracies": 1.0, "rewards/chosen": 2.5463852882385254, "rewards/margins": 5.150178909301758, "rewards/rejected": -2.6037933826446533, "step": 4082 }, { "epoch": 0.6805, "grad_norm": 21.6257266998291, "learning_rate": 4.892869067436849e-08, "logits/chosen": 3.26287841796875, "logits/rejected": 3.2860608100891113, "logps/chosen": -79.47367858886719, "logps/rejected": -281.3910827636719, "loss": 0.7597, "nll_loss": 0.7427446842193604, "rewards/accuracies": 1.0, "rewards/chosen": 1.6223130226135254, "rewards/margins": 6.699717998504639, "rewards/rejected": -5.077404975891113, "step": 4083 }, { "epoch": 0.6806666666666666, "grad_norm": 33.66553497314453, "learning_rate": 4.8882289387322786e-08, "logits/chosen": 1.6141711473464966, "logits/rejected": 2.011683940887451, "logps/chosen": -211.1255645751953, "logps/rejected": -283.7958984375, "loss": 1.0521, "nll_loss": 0.9958752989768982, "rewards/accuracies": 1.0, "rewards/chosen": 0.04723053053021431, "rewards/margins": 5.873884677886963, "rewards/rejected": -5.826653957366943, "step": 4084 }, { "epoch": 0.6808333333333333, "grad_norm": 63.97410202026367, "learning_rate": 4.88359029947507e-08, "logits/chosen": 1.7750627994537354, "logits/rejected": 2.2046234607696533, "logps/chosen": -58.06435775756836, "logps/rejected": -183.5157928466797, "loss": 1.8184, "nll_loss": 1.8145110607147217, "rewards/accuracies": 1.0, "rewards/chosen": 2.947659730911255, "rewards/margins": 9.933343887329102, "rewards/rejected": -6.985684394836426, "step": 4085 }, { "epoch": 0.681, "grad_norm": 37.51923370361328, "learning_rate": 4.878953151016816e-08, "logits/chosen": 1.6872801780700684, "logits/rejected": 2.2181923389434814, "logps/chosen": -21.583362579345703, "logps/rejected": -111.22383117675781, "loss": 0.6076, "nll_loss": 0.5833341479301453, "rewards/accuracies": 1.0, "rewards/chosen": 1.6379989385604858, "rewards/margins": 5.772384166717529, "rewards/rejected": -4.134385108947754, "step": 4086 }, { "epoch": 0.6811666666666667, "grad_norm": 33.900691986083984, "learning_rate": 4.8743174947086674e-08, "logits/chosen": 2.676346778869629, "logits/rejected": 2.773806095123291, "logps/chosen": -57.071685791015625, "logps/rejected": -253.77093505859375, "loss": 0.8358, "nll_loss": 0.803826630115509, "rewards/accuracies": 1.0, "rewards/chosen": 0.6629341244697571, "rewards/margins": 6.8521318435668945, "rewards/rejected": -6.189197540283203, "step": 4087 }, { "epoch": 0.6813333333333333, "grad_norm": 207.45579528808594, "learning_rate": 4.86968333190135e-08, "logits/chosen": 2.122204542160034, "logits/rejected": 2.5378308296203613, "logps/chosen": -57.122894287109375, "logps/rejected": -35.178531646728516, "loss": 3.2465, "nll_loss": 1.1424578428268433, "rewards/accuracies": 0.0, "rewards/chosen": 1.2394622564315796, "rewards/margins": -1.4302128553390503, "rewards/rejected": 2.66967511177063, "step": 4088 }, { "epoch": 0.6815, "grad_norm": 32.858192443847656, "learning_rate": 4.865050663945138e-08, "logits/chosen": 2.383225679397583, "logits/rejected": 2.670750856399536, "logps/chosen": -16.9775390625, "logps/rejected": -72.8989028930664, "loss": 0.5459, "nll_loss": 0.530548095703125, "rewards/accuracies": 1.0, "rewards/chosen": 1.4861900806427002, "rewards/margins": 7.670514106750488, "rewards/rejected": -6.184324264526367, "step": 4089 }, { "epoch": 0.6816666666666666, "grad_norm": 17.19794464111328, "learning_rate": 4.860419492189885e-08, "logits/chosen": 1.8111375570297241, "logits/rejected": 1.8688157796859741, "logps/chosen": -112.14008331298828, "logps/rejected": -182.82693481445312, "loss": 0.6864, "nll_loss": 0.663550853729248, "rewards/accuracies": 1.0, "rewards/chosen": 1.7993568181991577, "rewards/margins": 5.879305362701416, "rewards/rejected": -4.079948425292969, "step": 4090 }, { "epoch": 0.6818333333333333, "grad_norm": 34.1116943359375, "learning_rate": 4.855789817985002e-08, "logits/chosen": 1.9783666133880615, "logits/rejected": 2.6970717906951904, "logps/chosen": -81.14702606201172, "logps/rejected": -287.02276611328125, "loss": 1.0517, "nll_loss": 1.0403465032577515, "rewards/accuracies": 1.0, "rewards/chosen": 1.8313111066818237, "rewards/margins": 7.993105411529541, "rewards/rejected": -6.161794185638428, "step": 4091 }, { "epoch": 0.682, "grad_norm": 82.87496948242188, "learning_rate": 4.851161642679466e-08, "logits/chosen": 2.849902391433716, "logits/rejected": 2.659606695175171, "logps/chosen": -73.61070251464844, "logps/rejected": -41.21205139160156, "loss": 1.5014, "nll_loss": 0.8976914882659912, "rewards/accuracies": 1.0, "rewards/chosen": 2.0988616943359375, "rewards/margins": 1.3448524475097656, "rewards/rejected": 0.7540092468261719, "step": 4092 }, { "epoch": 0.6821666666666667, "grad_norm": 26.54056167602539, "learning_rate": 4.8465349676218145e-08, "logits/chosen": 2.181656837463379, "logits/rejected": 2.560255527496338, "logps/chosen": -28.681673049926758, "logps/rejected": -363.39727783203125, "loss": 0.52, "nll_loss": 0.5121726989746094, "rewards/accuracies": 1.0, "rewards/chosen": 2.059969902038574, "rewards/margins": 15.382128715515137, "rewards/rejected": -13.322158813476562, "step": 4093 }, { "epoch": 0.6823333333333333, "grad_norm": 45.51106643676758, "learning_rate": 4.8419097941601515e-08, "logits/chosen": 3.4074251651763916, "logits/rejected": 3.349050998687744, "logps/chosen": -25.779380798339844, "logps/rejected": -77.9684829711914, "loss": 0.6711, "nll_loss": 0.6444844603538513, "rewards/accuracies": 1.0, "rewards/chosen": 0.913700520992279, "rewards/margins": 6.68009614944458, "rewards/rejected": -5.766395568847656, "step": 4094 }, { "epoch": 0.6825, "grad_norm": 62.95628356933594, "learning_rate": 4.8372861236421405e-08, "logits/chosen": 1.9402880668640137, "logits/rejected": 2.092287540435791, "logps/chosen": -55.93394470214844, "logps/rejected": -178.72093200683594, "loss": 1.7516, "nll_loss": 1.7479357719421387, "rewards/accuracies": 1.0, "rewards/chosen": 3.160701036453247, "rewards/margins": 9.666899681091309, "rewards/rejected": -6.506198883056641, "step": 4095 }, { "epoch": 0.6826666666666666, "grad_norm": 32.663841247558594, "learning_rate": 4.832663957415012e-08, "logits/chosen": 2.1551883220672607, "logits/rejected": 2.579946279525757, "logps/chosen": -15.962154388427734, "logps/rejected": -735.56640625, "loss": 0.436, "nll_loss": 0.4314095675945282, "rewards/accuracies": 1.0, "rewards/chosen": 3.1309638023376465, "rewards/margins": 8.994306564331055, "rewards/rejected": -5.86334228515625, "step": 4096 }, { "epoch": 0.6828333333333333, "grad_norm": 28.400876998901367, "learning_rate": 4.828043296825546e-08, "logits/chosen": 2.750447988510132, "logits/rejected": 2.954397201538086, "logps/chosen": -37.103965759277344, "logps/rejected": -401.22332763671875, "loss": 0.6786, "nll_loss": 0.6625709533691406, "rewards/accuracies": 1.0, "rewards/chosen": 1.3184056282043457, "rewards/margins": 15.775276184082031, "rewards/rejected": -14.456871032714844, "step": 4097 }, { "epoch": 0.683, "grad_norm": 36.753318786621094, "learning_rate": 4.8234241432200965e-08, "logits/chosen": 2.673679828643799, "logits/rejected": 2.710221767425537, "logps/chosen": -57.52278518676758, "logps/rejected": -254.87643432617188, "loss": 0.8432, "nll_loss": 0.810180127620697, "rewards/accuracies": 1.0, "rewards/chosen": 0.6178241968154907, "rewards/margins": 6.917571544647217, "rewards/rejected": -6.299747467041016, "step": 4098 }, { "epoch": 0.6831666666666667, "grad_norm": 23.585172653198242, "learning_rate": 4.8188064979445745e-08, "logits/chosen": 2.640923500061035, "logits/rejected": 2.89715313911438, "logps/chosen": -41.69110870361328, "logps/rejected": -218.1148223876953, "loss": 0.6178, "nll_loss": 0.613104522228241, "rewards/accuracies": 1.0, "rewards/chosen": 2.7305171489715576, "rewards/margins": 9.636885643005371, "rewards/rejected": -6.906368255615234, "step": 4099 }, { "epoch": 0.6833333333333333, "grad_norm": 32.29002380371094, "learning_rate": 4.814190362344453e-08, "logits/chosen": 1.1903775930404663, "logits/rejected": 2.275831699371338, "logps/chosen": -19.393821716308594, "logps/rejected": -216.8823699951172, "loss": 0.648, "nll_loss": 0.6464606523513794, "rewards/accuracies": 1.0, "rewards/chosen": 4.6683831214904785, "rewards/margins": 11.157560348510742, "rewards/rejected": -6.489177227020264, "step": 4100 }, { "epoch": 0.6835, "grad_norm": 78.08367156982422, "learning_rate": 4.809575737764758e-08, "logits/chosen": 2.1010313034057617, "logits/rejected": 2.5941455364227295, "logps/chosen": -5.839668273925781, "logps/rejected": -322.20526123046875, "loss": 0.4604, "nll_loss": 0.4492052495479584, "rewards/accuracies": 1.0, "rewards/chosen": 1.697496771812439, "rewards/margins": 10.58403205871582, "rewards/rejected": -8.88653564453125, "step": 4101 }, { "epoch": 0.6836666666666666, "grad_norm": 33.64107131958008, "learning_rate": 4.804962625550083e-08, "logits/chosen": 3.016441822052002, "logits/rejected": 3.0841588973999023, "logps/chosen": -43.236976623535156, "logps/rejected": -257.4150695800781, "loss": 0.7551, "nll_loss": 0.7328301668167114, "rewards/accuracies": 1.0, "rewards/chosen": 0.9968914985656738, "rewards/margins": 8.48314380645752, "rewards/rejected": -7.486252307891846, "step": 4102 }, { "epoch": 0.6838333333333333, "grad_norm": 46.907203674316406, "learning_rate": 4.800351027044578e-08, "logits/chosen": 2.0149943828582764, "logits/rejected": 1.9419852495193481, "logps/chosen": -12.581912994384766, "logps/rejected": -43.566162109375, "loss": 0.5201, "nll_loss": 0.4839196801185608, "rewards/accuracies": 1.0, "rewards/chosen": 2.2186145782470703, "rewards/margins": 5.369775772094727, "rewards/rejected": -3.1511611938476562, "step": 4103 }, { "epoch": 0.684, "grad_norm": 33.88533401489258, "learning_rate": 4.795740943591955e-08, "logits/chosen": 2.0080697536468506, "logits/rejected": 2.865355968475342, "logps/chosen": -21.988006591796875, "logps/rejected": -370.6058654785156, "loss": 0.612, "nll_loss": 0.6107779741287231, "rewards/accuracies": 1.0, "rewards/chosen": 4.141330718994141, "rewards/margins": 12.258698463439941, "rewards/rejected": -8.1173677444458, "step": 4104 }, { "epoch": 0.6841666666666667, "grad_norm": 135.9754638671875, "learning_rate": 4.791132376535475e-08, "logits/chosen": 2.591704845428467, "logits/rejected": 2.4789514541625977, "logps/chosen": -76.15318298339844, "logps/rejected": -92.72959899902344, "loss": 2.0527, "nll_loss": 1.5230637788772583, "rewards/accuracies": 1.0, "rewards/chosen": -2.2889974117279053, "rewards/margins": 1.1060376167297363, "rewards/rejected": -3.3950350284576416, "step": 4105 }, { "epoch": 0.6843333333333333, "grad_norm": 94.60110473632812, "learning_rate": 4.786525327217968e-08, "logits/chosen": 1.625663161277771, "logits/rejected": 1.932733178138733, "logps/chosen": -208.04757690429688, "logps/rejected": -320.5941162109375, "loss": 1.1091, "nll_loss": 0.9246559739112854, "rewards/accuracies": 1.0, "rewards/chosen": -0.7014527320861816, "rewards/margins": 2.5068328380584717, "rewards/rejected": -3.2082855701446533, "step": 4106 }, { "epoch": 0.6845, "grad_norm": 32.06606674194336, "learning_rate": 4.781919796981817e-08, "logits/chosen": 2.942302703857422, "logits/rejected": 3.0899178981781006, "logps/chosen": -29.118946075439453, "logps/rejected": -193.0054931640625, "loss": 0.6167, "nll_loss": 0.6066446304321289, "rewards/accuracies": 1.0, "rewards/chosen": 1.9956212043762207, "rewards/margins": 8.056859970092773, "rewards/rejected": -6.061238765716553, "step": 4107 }, { "epoch": 0.6846666666666666, "grad_norm": 200.39219665527344, "learning_rate": 4.7773157871689673e-08, "logits/chosen": 3.126365900039673, "logits/rejected": 3.1114954948425293, "logps/chosen": -70.62290954589844, "logps/rejected": -34.2850341796875, "loss": 3.8848, "nll_loss": 0.8612549901008606, "rewards/accuracies": 0.0, "rewards/chosen": 1.934596300125122, "rewards/margins": -2.318171739578247, "rewards/rejected": 4.252768039703369, "step": 4108 }, { "epoch": 0.6848333333333333, "grad_norm": 29.14151954650879, "learning_rate": 4.772713299120908e-08, "logits/chosen": 1.8708628416061401, "logits/rejected": 2.4454216957092285, "logps/chosen": -59.512786865234375, "logps/rejected": -158.704833984375, "loss": 0.9195, "nll_loss": 0.9017087817192078, "rewards/accuracies": 1.0, "rewards/chosen": 2.218351125717163, "rewards/margins": 6.383124351501465, "rewards/rejected": -4.164773464202881, "step": 4109 }, { "epoch": 0.685, "grad_norm": 91.1459732055664, "learning_rate": 4.768112334178699e-08, "logits/chosen": 3.1641838550567627, "logits/rejected": 3.201000690460205, "logps/chosen": -11.856780052185059, "logps/rejected": -230.55477905273438, "loss": 1.0384, "nll_loss": 0.9880649447441101, "rewards/accuracies": 1.0, "rewards/chosen": 0.1008852943778038, "rewards/margins": 7.5114569664001465, "rewards/rejected": -7.410571575164795, "step": 4110 }, { "epoch": 0.6851666666666667, "grad_norm": 28.516178131103516, "learning_rate": 4.7635128936829495e-08, "logits/chosen": 3.159576654434204, "logits/rejected": 3.560598611831665, "logps/chosen": -111.2239990234375, "logps/rejected": -294.99505615234375, "loss": 1.0866, "nll_loss": 1.07984459400177, "rewards/accuracies": 1.0, "rewards/chosen": 2.2334229946136475, "rewards/margins": 10.370062828063965, "rewards/rejected": -8.136639595031738, "step": 4111 }, { "epoch": 0.6853333333333333, "grad_norm": 40.02473068237305, "learning_rate": 4.7589149789738303e-08, "logits/chosen": 2.510430097579956, "logits/rejected": 2.720336437225342, "logps/chosen": -36.30467987060547, "logps/rejected": -252.8209991455078, "loss": 0.8793, "nll_loss": 0.8643971681594849, "rewards/accuracies": 1.0, "rewards/chosen": 1.4714847803115845, "rewards/margins": 8.024422645568848, "rewards/rejected": -6.5529375076293945, "step": 4112 }, { "epoch": 0.6855, "grad_norm": 43.80795669555664, "learning_rate": 4.7543185913910566e-08, "logits/chosen": 1.6317867040634155, "logits/rejected": 2.473513126373291, "logps/chosen": -10.818897247314453, "logps/rejected": -175.4072265625, "loss": 0.3786, "nll_loss": 0.37306541204452515, "rewards/accuracies": 1.0, "rewards/chosen": 2.5258471965789795, "rewards/margins": 9.599658012390137, "rewards/rejected": -7.073810577392578, "step": 4113 }, { "epoch": 0.6856666666666666, "grad_norm": 20.73347282409668, "learning_rate": 4.749723732273908e-08, "logits/chosen": 2.2296738624572754, "logits/rejected": 2.066420316696167, "logps/chosen": -29.980236053466797, "logps/rejected": -107.55813598632812, "loss": 0.3983, "nll_loss": 0.38436201214790344, "rewards/accuracies": 1.0, "rewards/chosen": 2.0130043029785156, "rewards/margins": 6.868070125579834, "rewards/rejected": -4.855065822601318, "step": 4114 }, { "epoch": 0.6858333333333333, "grad_norm": 23.013887405395508, "learning_rate": 4.745130402961217e-08, "logits/chosen": 0.41613632440567017, "logits/rejected": 1.407484769821167, "logps/chosen": -55.46947479248047, "logps/rejected": -365.98797607421875, "loss": 0.7123, "nll_loss": 0.6933683156967163, "rewards/accuracies": 1.0, "rewards/chosen": 1.1404777765274048, "rewards/margins": 12.939765930175781, "rewards/rejected": -11.799287796020508, "step": 4115 }, { "epoch": 0.686, "grad_norm": 109.18809509277344, "learning_rate": 4.74053860479137e-08, "logits/chosen": 2.9541873931884766, "logits/rejected": 3.150648355484009, "logps/chosen": -32.31481170654297, "logps/rejected": -295.91424560546875, "loss": 2.0568, "nll_loss": 2.0196757316589355, "rewards/accuracies": 1.0, "rewards/chosen": 0.4180183708667755, "rewards/margins": 12.094915390014648, "rewards/rejected": -11.676897048950195, "step": 4116 }, { "epoch": 0.6861666666666667, "grad_norm": 32.022037506103516, "learning_rate": 4.735948339102309e-08, "logits/chosen": 0.8703833222389221, "logits/rejected": 2.5490353107452393, "logps/chosen": -48.23398208618164, "logps/rejected": -524.4275512695312, "loss": 0.9118, "nll_loss": 0.9100751280784607, "rewards/accuracies": 1.0, "rewards/chosen": 3.695983648300171, "rewards/margins": 12.080612182617188, "rewards/rejected": -8.384628295898438, "step": 4117 }, { "epoch": 0.6863333333333334, "grad_norm": 261.8694152832031, "learning_rate": 4.731359607231521e-08, "logits/chosen": 2.2701547145843506, "logits/rejected": 2.200252056121826, "logps/chosen": -177.8796844482422, "logps/rejected": -59.808319091796875, "loss": 2.612, "nll_loss": 0.9773608446121216, "rewards/accuracies": 0.0, "rewards/chosen": 1.50481116771698, "rewards/margins": -0.777262806892395, "rewards/rejected": 2.282073974609375, "step": 4118 }, { "epoch": 0.6865, "grad_norm": 28.63886070251465, "learning_rate": 4.7267724105160545e-08, "logits/chosen": 2.3626067638397217, "logits/rejected": 2.5885066986083984, "logps/chosen": -50.3804931640625, "logps/rejected": -242.8837127685547, "loss": 0.7932, "nll_loss": 0.7750844359397888, "rewards/accuracies": 1.0, "rewards/chosen": 1.3387298583984375, "rewards/margins": 7.177319526672363, "rewards/rejected": -5.838589668273926, "step": 4119 }, { "epoch": 0.6866666666666666, "grad_norm": 28.59357452392578, "learning_rate": 4.72218675029251e-08, "logits/chosen": 2.3303849697113037, "logits/rejected": 2.502340078353882, "logps/chosen": -20.699588775634766, "logps/rejected": -268.9698486328125, "loss": 0.5704, "nll_loss": 0.5594483613967896, "rewards/accuracies": 1.0, "rewards/chosen": 1.7178192138671875, "rewards/margins": 10.145135879516602, "rewards/rejected": -8.427316665649414, "step": 4120 }, { "epoch": 0.6868333333333333, "grad_norm": 422.5041809082031, "learning_rate": 4.717602627897043e-08, "logits/chosen": 3.580491304397583, "logits/rejected": 3.6640169620513916, "logps/chosen": -120.44451904296875, "logps/rejected": -111.1738052368164, "loss": 4.1878, "nll_loss": 2.618359088897705, "rewards/accuracies": 0.0, "rewards/chosen": -4.090729713439941, "rewards/margins": -0.8273966312408447, "rewards/rejected": -3.2633330821990967, "step": 4121 }, { "epoch": 0.687, "grad_norm": 33.523048400878906, "learning_rate": 4.7130200446653466e-08, "logits/chosen": 1.6799978017807007, "logits/rejected": 2.506741523742676, "logps/chosen": -20.287616729736328, "logps/rejected": -289.743408203125, "loss": 0.4284, "nll_loss": 0.4140329957008362, "rewards/accuracies": 1.0, "rewards/chosen": 1.4276844263076782, "rewards/margins": 10.625885963439941, "rewards/rejected": -9.198201179504395, "step": 4122 }, { "epoch": 0.6871666666666667, "grad_norm": 21.00830841064453, "learning_rate": 4.7084390019326815e-08, "logits/chosen": 1.70902681350708, "logits/rejected": 1.4921391010284424, "logps/chosen": -134.30484008789062, "logps/rejected": -93.08931732177734, "loss": 0.9531, "nll_loss": 0.9458088278770447, "rewards/accuracies": 1.0, "rewards/chosen": 2.2198243141174316, "rewards/margins": 9.127750396728516, "rewards/rejected": -6.907925605773926, "step": 4123 }, { "epoch": 0.6873333333333334, "grad_norm": 30.144899368286133, "learning_rate": 4.7038595010338535e-08, "logits/chosen": 3.260266065597534, "logits/rejected": 3.2163655757904053, "logps/chosen": -81.49396514892578, "logps/rejected": -137.00294494628906, "loss": 1.083, "nll_loss": 1.0583633184432983, "rewards/accuracies": 1.0, "rewards/chosen": 1.485451579093933, "rewards/margins": 5.77559757232666, "rewards/rejected": -4.2901458740234375, "step": 4124 }, { "epoch": 0.6875, "grad_norm": 24.997325897216797, "learning_rate": 4.699281543303222e-08, "logits/chosen": 2.081439256668091, "logits/rejected": 2.2992162704467773, "logps/chosen": -100.22036743164062, "logps/rejected": -321.36688232421875, "loss": 1.0215, "nll_loss": 1.0123270750045776, "rewards/accuracies": 1.0, "rewards/chosen": 1.8971787691116333, "rewards/margins": 11.036067008972168, "rewards/rejected": -9.138888359069824, "step": 4125 }, { "epoch": 0.6876666666666666, "grad_norm": 28.9740047454834, "learning_rate": 4.6947051300746866e-08, "logits/chosen": 2.7460384368896484, "logits/rejected": 2.8282861709594727, "logps/chosen": -62.52800369262695, "logps/rejected": -219.4506072998047, "loss": 0.7699, "nll_loss": 0.7533493638038635, "rewards/accuracies": 1.0, "rewards/chosen": 1.2991955280303955, "rewards/margins": 8.947898864746094, "rewards/rejected": -7.648703575134277, "step": 4126 }, { "epoch": 0.6878333333333333, "grad_norm": 10.452211380004883, "learning_rate": 4.6901302626817094e-08, "logits/chosen": 2.0255775451660156, "logits/rejected": 1.951716661453247, "logps/chosen": -129.84542846679688, "logps/rejected": -176.66400146484375, "loss": 0.5519, "nll_loss": 0.5501924753189087, "rewards/accuracies": 1.0, "rewards/chosen": 3.9830856323242188, "rewards/margins": 11.076611518859863, "rewards/rejected": -7.0935258865356445, "step": 4127 }, { "epoch": 0.688, "grad_norm": 144.63665771484375, "learning_rate": 4.6855569424572946e-08, "logits/chosen": 1.4563320875167847, "logits/rejected": 1.8581764698028564, "logps/chosen": -41.792686462402344, "logps/rejected": -175.98583984375, "loss": 1.5619, "nll_loss": 1.492595911026001, "rewards/accuracies": 1.0, "rewards/chosen": 0.047149658203125, "rewards/margins": 4.415701389312744, "rewards/rejected": -4.368551731109619, "step": 4128 }, { "epoch": 0.6881666666666667, "grad_norm": 53.78742980957031, "learning_rate": 4.6809851707340044e-08, "logits/chosen": 1.3606115579605103, "logits/rejected": 2.1860525608062744, "logps/chosen": -22.261587142944336, "logps/rejected": -230.15338134765625, "loss": 0.9313, "nll_loss": 0.9275661110877991, "rewards/accuracies": 1.0, "rewards/chosen": 3.3929314613342285, "rewards/margins": 9.41494369506836, "rewards/rejected": -6.022012233734131, "step": 4129 }, { "epoch": 0.6883333333333334, "grad_norm": 25.00879669189453, "learning_rate": 4.676414948843933e-08, "logits/chosen": 1.4872664213180542, "logits/rejected": 1.345016598701477, "logps/chosen": -74.93144226074219, "logps/rejected": -63.00553894042969, "loss": 0.8541, "nll_loss": 0.8325713872909546, "rewards/accuracies": 1.0, "rewards/chosen": 1.3276809453964233, "rewards/margins": 6.312605857849121, "rewards/rejected": -4.984924793243408, "step": 4130 }, { "epoch": 0.6885, "grad_norm": 39.66503143310547, "learning_rate": 4.6718462781187395e-08, "logits/chosen": 0.9137279391288757, "logits/rejected": 3.0875020027160645, "logps/chosen": -78.73463439941406, "logps/rejected": -371.659423828125, "loss": 0.8774, "nll_loss": 0.8201525211334229, "rewards/accuracies": 1.0, "rewards/chosen": 1.9448624849319458, "rewards/margins": 4.609449863433838, "rewards/rejected": -2.6645874977111816, "step": 4131 }, { "epoch": 0.6886666666666666, "grad_norm": 25.861265182495117, "learning_rate": 4.6672791598896234e-08, "logits/chosen": 1.2287416458129883, "logits/rejected": 2.473024845123291, "logps/chosen": -57.1265869140625, "logps/rejected": -277.85675048828125, "loss": 0.7806, "nll_loss": 0.7616878151893616, "rewards/accuracies": 1.0, "rewards/chosen": 1.1689239740371704, "rewards/margins": 8.399653434753418, "rewards/rejected": -7.230729579925537, "step": 4132 }, { "epoch": 0.6888333333333333, "grad_norm": 354.5068664550781, "learning_rate": 4.6627135954873375e-08, "logits/chosen": 2.5691444873809814, "logits/rejected": 2.5046615600585938, "logps/chosen": -146.6807861328125, "logps/rejected": -115.68008422851562, "loss": 4.762, "nll_loss": 2.2224361896514893, "rewards/accuracies": 0.0, "rewards/chosen": -7.025880336761475, "rewards/margins": -1.508338451385498, "rewards/rejected": -5.517541885375977, "step": 4133 }, { "epoch": 0.689, "grad_norm": 32.89715576171875, "learning_rate": 4.658149586242169e-08, "logits/chosen": 0.6327527761459351, "logits/rejected": 1.3108546733856201, "logps/chosen": -80.76516723632812, "logps/rejected": -343.3691711425781, "loss": 0.9361, "nll_loss": 0.9074736833572388, "rewards/accuracies": 1.0, "rewards/chosen": 0.9274452328681946, "rewards/margins": 6.058863639831543, "rewards/rejected": -5.131418228149414, "step": 4134 }, { "epoch": 0.6891666666666667, "grad_norm": 47.40974807739258, "learning_rate": 4.653587133483967e-08, "logits/chosen": 0.8793490529060364, "logits/rejected": 2.128007411956787, "logps/chosen": -62.97208023071289, "logps/rejected": -304.29156494140625, "loss": 1.2081, "nll_loss": 1.1449469327926636, "rewards/accuracies": 1.0, "rewards/chosen": -0.11535225808620453, "rewards/margins": 5.902560710906982, "rewards/rejected": -6.017912864685059, "step": 4135 }, { "epoch": 0.6893333333333334, "grad_norm": 31.20123291015625, "learning_rate": 4.64902623854212e-08, "logits/chosen": 1.8621013164520264, "logits/rejected": 1.6323267221450806, "logps/chosen": -69.94883728027344, "logps/rejected": -79.92813110351562, "loss": 0.8322, "nll_loss": 0.7772092223167419, "rewards/accuracies": 1.0, "rewards/chosen": 1.4331520795822144, "rewards/margins": 4.397397041320801, "rewards/rejected": -2.964244842529297, "step": 4136 }, { "epoch": 0.6895, "grad_norm": 12.746437072753906, "learning_rate": 4.644466902745561e-08, "logits/chosen": 1.2965087890625, "logits/rejected": 1.0991815328598022, "logps/chosen": -159.86337280273438, "logps/rejected": -209.51524353027344, "loss": 0.661, "nll_loss": 0.6551777124404907, "rewards/accuracies": 1.0, "rewards/chosen": 2.4210784435272217, "rewards/margins": 9.766348838806152, "rewards/rejected": -7.345270156860352, "step": 4137 }, { "epoch": 0.6896666666666667, "grad_norm": 26.222116470336914, "learning_rate": 4.639909127422777e-08, "logits/chosen": 3.5307576656341553, "logits/rejected": 3.793950319290161, "logps/chosen": -29.065343856811523, "logps/rejected": -97.6557388305664, "loss": 0.5438, "nll_loss": 0.5099183320999146, "rewards/accuracies": 1.0, "rewards/chosen": 1.1246527433395386, "rewards/margins": 5.224564075469971, "rewards/rejected": -4.099911212921143, "step": 4138 }, { "epoch": 0.6898333333333333, "grad_norm": 208.79222106933594, "learning_rate": 4.6353529139017845e-08, "logits/chosen": 2.8171072006225586, "logits/rejected": 2.6794681549072266, "logps/chosen": -408.7762145996094, "logps/rejected": -134.4949493408203, "loss": 2.1893, "nll_loss": 1.2022830247879028, "rewards/accuracies": 1.0, "rewards/chosen": -5.802712917327881, "rewards/margins": 2.5491013526916504, "rewards/rejected": -8.351814270019531, "step": 4139 }, { "epoch": 0.69, "grad_norm": 24.38829231262207, "learning_rate": 4.630798263510162e-08, "logits/chosen": 2.366868257522583, "logits/rejected": 2.0651588439941406, "logps/chosen": -81.87318420410156, "logps/rejected": -82.61115264892578, "loss": 0.9428, "nll_loss": 0.930377185344696, "rewards/accuracies": 1.0, "rewards/chosen": 2.236464023590088, "rewards/margins": 7.036340713500977, "rewards/rejected": -4.799876689910889, "step": 4140 }, { "epoch": 0.6901666666666667, "grad_norm": 18.814035415649414, "learning_rate": 4.6262451775750224e-08, "logits/chosen": 2.5685958862304688, "logits/rejected": 2.5349273681640625, "logps/chosen": -8.669147491455078, "logps/rejected": -112.73970031738281, "loss": 0.2087, "nll_loss": 0.2016080766916275, "rewards/accuracies": 1.0, "rewards/chosen": 2.4050304889678955, "rewards/margins": 8.48932933807373, "rewards/rejected": -6.084299087524414, "step": 4141 }, { "epoch": 0.6903333333333334, "grad_norm": 156.9281463623047, "learning_rate": 4.621693657423029e-08, "logits/chosen": 2.24871563911438, "logits/rejected": 1.8313008546829224, "logps/chosen": -83.31913757324219, "logps/rejected": -63.19047164916992, "loss": 2.5867, "nll_loss": 1.7003905773162842, "rewards/accuracies": 1.0, "rewards/chosen": -4.263825416564941, "rewards/margins": 0.969573974609375, "rewards/rejected": -5.233399391174316, "step": 4142 }, { "epoch": 0.6905, "grad_norm": 27.425182342529297, "learning_rate": 4.6171437043803807e-08, "logits/chosen": 2.5424044132232666, "logits/rejected": 2.6953907012939453, "logps/chosen": -31.354167938232422, "logps/rejected": -250.254638671875, "loss": 0.5265, "nll_loss": 0.5140027403831482, "rewards/accuracies": 1.0, "rewards/chosen": 1.587850570678711, "rewards/margins": 9.542963981628418, "rewards/rejected": -7.955113410949707, "step": 4143 }, { "epoch": 0.6906666666666667, "grad_norm": 30.503719329833984, "learning_rate": 4.6125953197728275e-08, "logits/chosen": 0.8700304627418518, "logits/rejected": 0.7793940305709839, "logps/chosen": -216.55880737304688, "logps/rejected": -78.4683837890625, "loss": 0.8045, "nll_loss": 0.698576807975769, "rewards/accuracies": 1.0, "rewards/chosen": 1.9531280994415283, "rewards/margins": 3.83766770362854, "rewards/rejected": -1.8845396041870117, "step": 4144 }, { "epoch": 0.6908333333333333, "grad_norm": 135.86094665527344, "learning_rate": 4.608048504925658e-08, "logits/chosen": 1.579908013343811, "logits/rejected": 2.5628442764282227, "logps/chosen": -63.92958068847656, "logps/rejected": -180.5186309814453, "loss": 1.1848, "nll_loss": 1.1623560190200806, "rewards/accuracies": 1.0, "rewards/chosen": 0.9782662391662598, "rewards/margins": 8.30734634399414, "rewards/rejected": -7.329080104827881, "step": 4145 }, { "epoch": 0.691, "grad_norm": 124.967529296875, "learning_rate": 4.603503261163709e-08, "logits/chosen": 1.824042558670044, "logits/rejected": 1.8244028091430664, "logps/chosen": -35.696231842041016, "logps/rejected": -13.48715591430664, "loss": 2.223, "nll_loss": 0.4406942129135132, "rewards/accuracies": 0.0, "rewards/chosen": 2.4702320098876953, "rewards/margins": -0.7299561500549316, "rewards/rejected": 3.200188159942627, "step": 4146 }, { "epoch": 0.6911666666666667, "grad_norm": 79.39283752441406, "learning_rate": 4.59895958981135e-08, "logits/chosen": 2.350623369216919, "logits/rejected": 2.0104048252105713, "logps/chosen": -29.40365982055664, "logps/rejected": -13.518095016479492, "loss": 1.3486, "nll_loss": 0.9188644289970398, "rewards/accuracies": 1.0, "rewards/chosen": 1.1108452081680298, "rewards/margins": 1.4431893825531006, "rewards/rejected": -0.3323441445827484, "step": 4147 }, { "epoch": 0.6913333333333334, "grad_norm": 65.64108276367188, "learning_rate": 4.5944174921924993e-08, "logits/chosen": 2.245551824569702, "logits/rejected": 2.08957839012146, "logps/chosen": -14.076655387878418, "logps/rejected": -36.74599838256836, "loss": 0.8366, "nll_loss": 0.6703169941902161, "rewards/accuracies": 1.0, "rewards/chosen": 1.8869619369506836, "rewards/margins": 3.2037906646728516, "rewards/rejected": -1.3168286085128784, "step": 4148 }, { "epoch": 0.6915, "grad_norm": 34.77921676635742, "learning_rate": 4.589876969630615e-08, "logits/chosen": 2.0104801654815674, "logits/rejected": 2.2805819511413574, "logps/chosen": -81.64488983154297, "logps/rejected": -165.64056396484375, "loss": 0.9701, "nll_loss": 0.9493589997291565, "rewards/accuracies": 1.0, "rewards/chosen": 1.800916314125061, "rewards/margins": 6.057351112365723, "rewards/rejected": -4.256434917449951, "step": 4149 }, { "epoch": 0.6916666666666667, "grad_norm": 25.98054313659668, "learning_rate": 4.585338023448702e-08, "logits/chosen": 1.6017613410949707, "logits/rejected": 2.5402610301971436, "logps/chosen": -56.895835876464844, "logps/rejected": -255.4296875, "loss": 0.8506, "nll_loss": 0.8491916656494141, "rewards/accuracies": 1.0, "rewards/chosen": 4.552926540374756, "rewards/margins": 11.335073471069336, "rewards/rejected": -6.782147407531738, "step": 4150 }, { "epoch": 0.6918333333333333, "grad_norm": 27.426515579223633, "learning_rate": 4.5808006549692914e-08, "logits/chosen": 2.232729911804199, "logits/rejected": 2.173309087753296, "logps/chosen": -80.98268127441406, "logps/rejected": -76.56929779052734, "loss": 0.9582, "nll_loss": 0.9308354258537292, "rewards/accuracies": 1.0, "rewards/chosen": 1.2670693397521973, "rewards/margins": 5.654577732086182, "rewards/rejected": -4.387508392333984, "step": 4151 }, { "epoch": 0.692, "grad_norm": 82.77982330322266, "learning_rate": 4.5762648655144665e-08, "logits/chosen": 2.3732330799102783, "logits/rejected": 2.047846794128418, "logps/chosen": -66.0994873046875, "logps/rejected": -40.01359558105469, "loss": 1.5247, "nll_loss": 1.3219897747039795, "rewards/accuracies": 1.0, "rewards/chosen": -0.11337433010339737, "rewards/margins": 2.1717772483825684, "rewards/rejected": -2.285151481628418, "step": 4152 }, { "epoch": 0.6921666666666667, "grad_norm": 23.653615951538086, "learning_rate": 4.5717306564058486e-08, "logits/chosen": 2.7846217155456543, "logits/rejected": 3.085904121398926, "logps/chosen": -23.363252639770508, "logps/rejected": -280.0751953125, "loss": 0.5179, "nll_loss": 0.5078967809677124, "rewards/accuracies": 1.0, "rewards/chosen": 2.26936411857605, "rewards/margins": 7.530515670776367, "rewards/rejected": -5.261151313781738, "step": 4153 }, { "epoch": 0.6923333333333334, "grad_norm": 20.36902618408203, "learning_rate": 4.567198028964602e-08, "logits/chosen": 2.895573377609253, "logits/rejected": 2.719444990158081, "logps/chosen": -94.58197021484375, "logps/rejected": -96.40663146972656, "loss": 0.7896, "nll_loss": 0.7566558122634888, "rewards/accuracies": 1.0, "rewards/chosen": 1.551509141921997, "rewards/margins": 5.218406677246094, "rewards/rejected": -3.666897773742676, "step": 4154 }, { "epoch": 0.6925, "grad_norm": 207.20672607421875, "learning_rate": 4.562666984511415e-08, "logits/chosen": 2.7915802001953125, "logits/rejected": 2.6351709365844727, "logps/chosen": -110.7367935180664, "logps/rejected": -28.821962356567383, "loss": 2.9166, "nll_loss": 1.1907180547714233, "rewards/accuracies": 0.0, "rewards/chosen": 1.1931129693984985, "rewards/margins": -0.9693909883499146, "rewards/rejected": 2.162503957748413, "step": 4155 }, { "epoch": 0.6926666666666667, "grad_norm": 53.33198165893555, "learning_rate": 4.558137524366533e-08, "logits/chosen": 3.1019341945648193, "logits/rejected": 3.094882011413574, "logps/chosen": -113.31959533691406, "logps/rejected": -41.70901107788086, "loss": 1.1686, "nll_loss": 0.8853093385696411, "rewards/accuracies": 1.0, "rewards/chosen": 2.694514751434326, "rewards/margins": 3.02266526222229, "rewards/rejected": -0.32815057039260864, "step": 4156 }, { "epoch": 0.6928333333333333, "grad_norm": 31.974763870239258, "learning_rate": 4.553609649849729e-08, "logits/chosen": 1.321700096130371, "logits/rejected": 2.478069305419922, "logps/chosen": -64.11555480957031, "logps/rejected": -482.56103515625, "loss": 0.8504, "nll_loss": 0.8436257243156433, "rewards/accuracies": 1.0, "rewards/chosen": 2.2029054164886475, "rewards/margins": 10.87133502960205, "rewards/rejected": -8.668429374694824, "step": 4157 }, { "epoch": 0.693, "grad_norm": 796.5753173828125, "learning_rate": 4.549083362280317e-08, "logits/chosen": 1.8842638731002808, "logits/rejected": 1.199253797531128, "logps/chosen": -356.133056640625, "logps/rejected": -162.3775634765625, "loss": 5.0543, "nll_loss": 1.3093127012252808, "rewards/accuracies": 0.0, "rewards/chosen": -6.361810684204102, "rewards/margins": -3.3126347064971924, "rewards/rejected": -3.049175977706909, "step": 4158 }, { "epoch": 0.6931666666666667, "grad_norm": 24.308982849121094, "learning_rate": 4.544558662977154e-08, "logits/chosen": 1.1342500448226929, "logits/rejected": 1.5069936513900757, "logps/chosen": -71.18084716796875, "logps/rejected": -334.74053955078125, "loss": 0.8075, "nll_loss": 0.7997848987579346, "rewards/accuracies": 1.0, "rewards/chosen": 2.055304765701294, "rewards/margins": 11.850492477416992, "rewards/rejected": -9.795187950134277, "step": 4159 }, { "epoch": 0.6933333333333334, "grad_norm": 95.93001556396484, "learning_rate": 4.540035553258619e-08, "logits/chosen": 3.0831902027130127, "logits/rejected": 3.0402896404266357, "logps/chosen": -51.77772903442383, "logps/rejected": -34.6697998046875, "loss": 1.3158, "nll_loss": 0.7396817207336426, "rewards/accuracies": 1.0, "rewards/chosen": 1.2063896656036377, "rewards/margins": 1.0375988483428955, "rewards/rejected": 0.1687908172607422, "step": 4160 }, { "epoch": 0.6935, "grad_norm": 27.266937255859375, "learning_rate": 4.5355140344426436e-08, "logits/chosen": 3.2721621990203857, "logits/rejected": 3.304058313369751, "logps/chosen": -124.49848937988281, "logps/rejected": -67.71578979492188, "loss": 0.7039, "nll_loss": 0.6319720149040222, "rewards/accuracies": 1.0, "rewards/chosen": 1.9640061855316162, "rewards/margins": 4.334661483764648, "rewards/rejected": -2.3706555366516113, "step": 4161 }, { "epoch": 0.6936666666666667, "grad_norm": 34.064964294433594, "learning_rate": 4.530994107846689e-08, "logits/chosen": 2.313406229019165, "logits/rejected": 2.4900941848754883, "logps/chosen": -63.6961669921875, "logps/rejected": -255.793212890625, "loss": 1.038, "nll_loss": 0.9952526688575745, "rewards/accuracies": 1.0, "rewards/chosen": 0.3827865719795227, "rewards/margins": 5.913872718811035, "rewards/rejected": -5.531085968017578, "step": 4162 }, { "epoch": 0.6938333333333333, "grad_norm": 23.503040313720703, "learning_rate": 4.526475774787756e-08, "logits/chosen": 2.534973621368408, "logits/rejected": 2.7319016456604004, "logps/chosen": -68.01969146728516, "logps/rejected": -295.855224609375, "loss": 0.7208, "nll_loss": 0.7159968614578247, "rewards/accuracies": 1.0, "rewards/chosen": 2.547278642654419, "rewards/margins": 13.380603790283203, "rewards/rejected": -10.833325386047363, "step": 4163 }, { "epoch": 0.694, "grad_norm": 28.107080459594727, "learning_rate": 4.5219590365823715e-08, "logits/chosen": 1.9130691289901733, "logits/rejected": 1.6851545572280884, "logps/chosen": -122.66388702392578, "logps/rejected": -46.08550262451172, "loss": 1.1832, "nll_loss": 1.1463912725448608, "rewards/accuracies": 1.0, "rewards/chosen": 1.9704598188400269, "rewards/margins": 5.207464218139648, "rewards/rejected": -3.237004518508911, "step": 4164 }, { "epoch": 0.6941666666666667, "grad_norm": 24.934513092041016, "learning_rate": 4.517443894546609e-08, "logits/chosen": 1.9839118719100952, "logits/rejected": 2.1299424171447754, "logps/chosen": -70.85755920410156, "logps/rejected": -155.33944702148438, "loss": 0.8694, "nll_loss": 0.8641164898872375, "rewards/accuracies": 1.0, "rewards/chosen": 2.6203629970550537, "rewards/margins": 9.374719619750977, "rewards/rejected": -6.754356384277344, "step": 4165 }, { "epoch": 0.6943333333333334, "grad_norm": 84.92941284179688, "learning_rate": 4.512930349996073e-08, "logits/chosen": 3.110062837600708, "logits/rejected": 2.9560768604278564, "logps/chosen": -139.21160888671875, "logps/rejected": -31.908069610595703, "loss": 1.9731, "nll_loss": 1.4061778783798218, "rewards/accuracies": 1.0, "rewards/chosen": 2.577876567840576, "rewards/margins": 1.725254774093628, "rewards/rejected": 0.852621853351593, "step": 4166 }, { "epoch": 0.6945, "grad_norm": 27.694515228271484, "learning_rate": 4.508418404245903e-08, "logits/chosen": 1.9021995067596436, "logits/rejected": 1.6269396543502808, "logps/chosen": -137.74215698242188, "logps/rejected": -131.48117065429688, "loss": 1.3186, "nll_loss": 1.2994542121887207, "rewards/accuracies": 1.0, "rewards/chosen": 1.284083604812622, "rewards/margins": 7.009608268737793, "rewards/rejected": -5.72552490234375, "step": 4167 }, { "epoch": 0.6946666666666667, "grad_norm": 34.34218978881836, "learning_rate": 4.503908058610767e-08, "logits/chosen": 2.0811915397644043, "logits/rejected": 2.1358184814453125, "logps/chosen": -9.925163269042969, "logps/rejected": -81.5721435546875, "loss": 0.4416, "nll_loss": 0.41354846954345703, "rewards/accuracies": 1.0, "rewards/chosen": 1.4235538244247437, "rewards/margins": 5.514472961425781, "rewards/rejected": -4.090919017791748, "step": 4168 }, { "epoch": 0.6948333333333333, "grad_norm": 77.76473236083984, "learning_rate": 4.499399314404874e-08, "logits/chosen": 2.318751573562622, "logits/rejected": 2.4972927570343018, "logps/chosen": -129.44520568847656, "logps/rejected": -458.0870361328125, "loss": 1.523, "nll_loss": 1.1455328464508057, "rewards/accuracies": 1.0, "rewards/chosen": -2.9022369384765625, "rewards/margins": 7.561212539672852, "rewards/rejected": -10.463449478149414, "step": 4169 }, { "epoch": 0.695, "grad_norm": 43.628543853759766, "learning_rate": 4.494892172941964e-08, "logits/chosen": 1.2390937805175781, "logits/rejected": 2.1082446575164795, "logps/chosen": -20.804424285888672, "logps/rejected": -105.18450164794922, "loss": 0.5607, "nll_loss": 0.533446729183197, "rewards/accuracies": 1.0, "rewards/chosen": 1.2628469467163086, "rewards/margins": 5.6602349281311035, "rewards/rejected": -4.397387981414795, "step": 4170 }, { "epoch": 0.6951666666666667, "grad_norm": 196.1432647705078, "learning_rate": 4.490386635535314e-08, "logits/chosen": 2.60589337348938, "logits/rejected": 2.4464330673217773, "logps/chosen": -64.26435852050781, "logps/rejected": -13.892215728759766, "loss": 2.9277, "nll_loss": 0.765052080154419, "rewards/accuracies": 0.0, "rewards/chosen": 0.2520813047885895, "rewards/margins": -1.6890183687210083, "rewards/rejected": 1.9410996437072754, "step": 4171 }, { "epoch": 0.6953333333333334, "grad_norm": 29.660999298095703, "learning_rate": 4.485882703497721e-08, "logits/chosen": 1.3909984827041626, "logits/rejected": 2.0420987606048584, "logps/chosen": -14.427175521850586, "logps/rejected": -101.57438659667969, "loss": 0.4122, "nll_loss": 0.36067938804626465, "rewards/accuracies": 1.0, "rewards/chosen": 1.390886902809143, "rewards/margins": 4.475935459136963, "rewards/rejected": -3.0850486755371094, "step": 4172 }, { "epoch": 0.6955, "grad_norm": 74.8376693725586, "learning_rate": 4.481380378141527e-08, "logits/chosen": 1.8583954572677612, "logits/rejected": 2.5147013664245605, "logps/chosen": -51.918312072753906, "logps/rejected": -243.70469665527344, "loss": 0.7823, "nll_loss": 0.7635045647621155, "rewards/accuracies": 1.0, "rewards/chosen": 1.16412353515625, "rewards/margins": 8.700910568237305, "rewards/rejected": -7.536787509918213, "step": 4173 }, { "epoch": 0.6956666666666667, "grad_norm": 37.959197998046875, "learning_rate": 4.4768796607786006e-08, "logits/chosen": 2.027090311050415, "logits/rejected": 3.3958606719970703, "logps/chosen": -58.171531677246094, "logps/rejected": -176.07546997070312, "loss": 1.1904, "nll_loss": 1.1634308099746704, "rewards/accuracies": 1.0, "rewards/chosen": 1.0487778186798096, "rewards/margins": 5.997519493103027, "rewards/rejected": -4.948741436004639, "step": 4174 }, { "epoch": 0.6958333333333333, "grad_norm": 42.69542694091797, "learning_rate": 4.472380552720348e-08, "logits/chosen": 1.8193944692611694, "logits/rejected": 1.9013442993164062, "logps/chosen": -18.63395881652832, "logps/rejected": -81.38902282714844, "loss": 0.6679, "nll_loss": 0.6654985547065735, "rewards/accuracies": 1.0, "rewards/chosen": 4.119527339935303, "rewards/margins": 10.240745544433594, "rewards/rejected": -6.121218204498291, "step": 4175 }, { "epoch": 0.696, "grad_norm": 30.717811584472656, "learning_rate": 4.467883055277695e-08, "logits/chosen": 2.3455052375793457, "logits/rejected": 1.9409903287887573, "logps/chosen": -33.197975158691406, "logps/rejected": -65.1125259399414, "loss": 0.5704, "nll_loss": 0.5354512929916382, "rewards/accuracies": 1.0, "rewards/chosen": 0.6932060718536377, "rewards/margins": 5.790953636169434, "rewards/rejected": -5.097747325897217, "step": 4176 }, { "epoch": 0.6961666666666667, "grad_norm": 34.838382720947266, "learning_rate": 4.463387169761104e-08, "logits/chosen": 2.093010425567627, "logits/rejected": 2.507161855697632, "logps/chosen": -78.79611206054688, "logps/rejected": -217.2411346435547, "loss": 1.1202, "nll_loss": 1.0943903923034668, "rewards/accuracies": 1.0, "rewards/chosen": 1.294442892074585, "rewards/margins": 5.784631729125977, "rewards/rejected": -4.4901885986328125, "step": 4177 }, { "epoch": 0.6963333333333334, "grad_norm": 30.711454391479492, "learning_rate": 4.4588928974805703e-08, "logits/chosen": 2.6112253665924072, "logits/rejected": 2.6431634426116943, "logps/chosen": -60.64764404296875, "logps/rejected": -79.46052551269531, "loss": 0.7897, "nll_loss": 0.7676916718482971, "rewards/accuracies": 1.0, "rewards/chosen": 1.055439829826355, "rewards/margins": 7.332754135131836, "rewards/rejected": -6.277314186096191, "step": 4178 }, { "epoch": 0.6965, "grad_norm": 45.099918365478516, "learning_rate": 4.454400239745618e-08, "logits/chosen": 2.8255531787872314, "logits/rejected": 2.8075578212738037, "logps/chosen": -21.519447326660156, "logps/rejected": -80.67140197753906, "loss": 0.8432, "nll_loss": 0.7970166206359863, "rewards/accuracies": 1.0, "rewards/chosen": 1.592718243598938, "rewards/margins": 4.711771011352539, "rewards/rejected": -3.1190528869628906, "step": 4179 }, { "epoch": 0.6966666666666667, "grad_norm": 29.044424057006836, "learning_rate": 4.4499091978653026e-08, "logits/chosen": 1.7973277568817139, "logits/rejected": 1.4667582511901855, "logps/chosen": -27.606260299682617, "logps/rejected": -81.37859344482422, "loss": 0.4324, "nll_loss": 0.4247116148471832, "rewards/accuracies": 1.0, "rewards/chosen": 2.412909984588623, "rewards/margins": 8.171760559082031, "rewards/rejected": -5.75885009765625, "step": 4180 }, { "epoch": 0.6968333333333333, "grad_norm": 49.94948196411133, "learning_rate": 4.445419773148198e-08, "logits/chosen": 2.0899856090545654, "logits/rejected": 2.258470296859741, "logps/chosen": -11.512413024902344, "logps/rejected": -234.7316436767578, "loss": 0.5043, "nll_loss": 0.5005397200584412, "rewards/accuracies": 1.0, "rewards/chosen": 2.8222248554229736, "rewards/margins": 11.292825698852539, "rewards/rejected": -8.470601081848145, "step": 4181 }, { "epoch": 0.697, "grad_norm": 105.81269073486328, "learning_rate": 4.440931966902418e-08, "logits/chosen": 2.95843505859375, "logits/rejected": 2.9087226390838623, "logps/chosen": -82.36145782470703, "logps/rejected": -71.9161605834961, "loss": 2.1565, "nll_loss": 1.9609873294830322, "rewards/accuracies": 1.0, "rewards/chosen": -1.3969825506210327, "rewards/margins": 3.313556671142578, "rewards/rejected": -4.7105393409729, "step": 4182 }, { "epoch": 0.6971666666666667, "grad_norm": 42.19544219970703, "learning_rate": 4.4364457804356035e-08, "logits/chosen": 2.2752063274383545, "logits/rejected": 2.562358856201172, "logps/chosen": -95.64582061767578, "logps/rejected": -186.178955078125, "loss": 1.1916, "nll_loss": 1.166412591934204, "rewards/accuracies": 1.0, "rewards/chosen": 0.8737701773643494, "rewards/margins": 7.608685493469238, "rewards/rejected": -6.734915256500244, "step": 4183 }, { "epoch": 0.6973333333333334, "grad_norm": 183.02130126953125, "learning_rate": 4.431961215054923e-08, "logits/chosen": 3.4745659828186035, "logits/rejected": 3.4440078735351562, "logps/chosen": -98.63603210449219, "logps/rejected": -19.297109603881836, "loss": 2.9428, "nll_loss": 1.1742384433746338, "rewards/accuracies": 0.0, "rewards/chosen": 2.475297689437866, "rewards/margins": -0.7092721462249756, "rewards/rejected": 3.184569835662842, "step": 4184 }, { "epoch": 0.6975, "grad_norm": 22.82417869567871, "learning_rate": 4.4274782720670654e-08, "logits/chosen": 1.0501930713653564, "logits/rejected": 1.6763150691986084, "logps/chosen": -53.11991882324219, "logps/rejected": -279.9306945800781, "loss": 0.6162, "nll_loss": 0.5968529582023621, "rewards/accuracies": 1.0, "rewards/chosen": 1.251249074935913, "rewards/margins": 7.08804988861084, "rewards/rejected": -5.836801052093506, "step": 4185 }, { "epoch": 0.6976666666666667, "grad_norm": 26.688295364379883, "learning_rate": 4.422996952778256e-08, "logits/chosen": 2.9167962074279785, "logits/rejected": 2.818352460861206, "logps/chosen": -11.422060012817383, "logps/rejected": -113.42168426513672, "loss": 0.3374, "nll_loss": 0.30058053135871887, "rewards/accuracies": 1.0, "rewards/chosen": 1.2057729959487915, "rewards/margins": 5.016981601715088, "rewards/rejected": -3.811208724975586, "step": 4186 }, { "epoch": 0.6978333333333333, "grad_norm": 23.982656478881836, "learning_rate": 4.418517258494244e-08, "logits/chosen": 1.5306193828582764, "logits/rejected": 2.18855357170105, "logps/chosen": -17.05369758605957, "logps/rejected": -223.89776611328125, "loss": 0.3826, "nll_loss": 0.37897107005119324, "rewards/accuracies": 1.0, "rewards/chosen": 2.8327317237854004, "rewards/margins": 11.902326583862305, "rewards/rejected": -9.069595336914062, "step": 4187 }, { "epoch": 0.698, "grad_norm": 33.941837310791016, "learning_rate": 4.414039190520308e-08, "logits/chosen": 2.0079469680786133, "logits/rejected": 2.1213955879211426, "logps/chosen": -16.524879455566406, "logps/rejected": -489.48260498046875, "loss": 0.5463, "nll_loss": 0.533060610294342, "rewards/accuracies": 1.0, "rewards/chosen": 4.381962776184082, "rewards/margins": 8.323051452636719, "rewards/rejected": -3.941088914871216, "step": 4188 }, { "epoch": 0.6981666666666667, "grad_norm": 26.046266555786133, "learning_rate": 4.409562750161241e-08, "logits/chosen": 2.670243501663208, "logits/rejected": 2.7144651412963867, "logps/chosen": -45.38512420654297, "logps/rejected": -40.324405670166016, "loss": 0.6177, "nll_loss": 0.5402990579605103, "rewards/accuracies": 1.0, "rewards/chosen": 1.4361779689788818, "rewards/margins": 3.929183006286621, "rewards/rejected": -2.4930050373077393, "step": 4189 }, { "epoch": 0.6983333333333334, "grad_norm": 24.063936233520508, "learning_rate": 4.405087938721376e-08, "logits/chosen": 2.8056466579437256, "logits/rejected": 2.8149333000183105, "logps/chosen": -48.50831604003906, "logps/rejected": -184.14657592773438, "loss": 0.6374, "nll_loss": 0.6140292882919312, "rewards/accuracies": 1.0, "rewards/chosen": 0.9727573394775391, "rewards/margins": 7.454645156860352, "rewards/rejected": -6.4818878173828125, "step": 4190 }, { "epoch": 0.6985, "grad_norm": 26.57185173034668, "learning_rate": 4.400614757504564e-08, "logits/chosen": 1.9197068214416504, "logits/rejected": 2.4280154705047607, "logps/chosen": -102.91960144042969, "logps/rejected": -267.43780517578125, "loss": 1.0796, "nll_loss": 1.0610268115997314, "rewards/accuracies": 1.0, "rewards/chosen": 1.1707137823104858, "rewards/margins": 8.920906066894531, "rewards/rejected": -7.750192165374756, "step": 4191 }, { "epoch": 0.6986666666666667, "grad_norm": 20.545669555664062, "learning_rate": 4.396143207814187e-08, "logits/chosen": 1.5722928047180176, "logits/rejected": 1.9874168634414673, "logps/chosen": -61.39280700683594, "logps/rejected": -205.58511352539062, "loss": 0.6691, "nll_loss": 0.6673131585121155, "rewards/accuracies": 1.0, "rewards/chosen": 4.749685764312744, "rewards/margins": 10.982137680053711, "rewards/rejected": -6.232451438903809, "step": 4192 }, { "epoch": 0.6988333333333333, "grad_norm": 18.534069061279297, "learning_rate": 4.391673290953141e-08, "logits/chosen": 2.6308786869049072, "logits/rejected": 2.6038458347320557, "logps/chosen": -222.3872833251953, "logps/rejected": -103.220703125, "loss": 0.8699, "nll_loss": 0.8236565589904785, "rewards/accuracies": 1.0, "rewards/chosen": 2.487077474594116, "rewards/margins": 5.262951374053955, "rewards/rejected": -2.775873899459839, "step": 4193 }, { "epoch": 0.699, "grad_norm": 33.551422119140625, "learning_rate": 4.3872050082238535e-08, "logits/chosen": 2.4604368209838867, "logits/rejected": 3.347677230834961, "logps/chosen": -68.62675476074219, "logps/rejected": -507.58447265625, "loss": 0.9996, "nll_loss": 0.9945906400680542, "rewards/accuracies": 1.0, "rewards/chosen": 2.6800880432128906, "rewards/margins": 9.401169776916504, "rewards/rejected": -6.721081733703613, "step": 4194 }, { "epoch": 0.6991666666666667, "grad_norm": 150.06378173828125, "learning_rate": 4.382738360928276e-08, "logits/chosen": 3.317357301712036, "logits/rejected": 3.404949426651001, "logps/chosen": -46.2365608215332, "logps/rejected": -23.611799240112305, "loss": 1.7867, "nll_loss": 0.8562327027320862, "rewards/accuracies": 0.0, "rewards/chosen": 0.15838661789894104, "rewards/margins": -0.026208505034446716, "rewards/rejected": 0.18459512293338776, "step": 4195 }, { "epoch": 0.6993333333333334, "grad_norm": 20.853954315185547, "learning_rate": 4.3782733503678884e-08, "logits/chosen": 0.7623388171195984, "logits/rejected": 1.834507703781128, "logps/chosen": -70.48663330078125, "logps/rejected": -328.6014404296875, "loss": 0.6731, "nll_loss": 0.6587535738945007, "rewards/accuracies": 1.0, "rewards/chosen": 1.4608490467071533, "rewards/margins": 8.64261245727539, "rewards/rejected": -7.181763172149658, "step": 4196 }, { "epoch": 0.6995, "grad_norm": 23.770137786865234, "learning_rate": 4.373809977843676e-08, "logits/chosen": 1.742356538772583, "logits/rejected": 2.206529378890991, "logps/chosen": -92.72932434082031, "logps/rejected": -248.299560546875, "loss": 0.7823, "nll_loss": 0.7663581967353821, "rewards/accuracies": 1.0, "rewards/chosen": 1.3070878982543945, "rewards/margins": 11.309971809387207, "rewards/rejected": -10.002883911132812, "step": 4197 }, { "epoch": 0.6996666666666667, "grad_norm": 22.892324447631836, "learning_rate": 4.369348244656165e-08, "logits/chosen": 1.630913496017456, "logits/rejected": 1.7566579580307007, "logps/chosen": -84.48328399658203, "logps/rejected": -81.64591217041016, "loss": 0.8122, "nll_loss": 0.7970120310783386, "rewards/accuracies": 1.0, "rewards/chosen": 1.388851284980774, "rewards/margins": 8.873069763183594, "rewards/rejected": -7.484218597412109, "step": 4198 }, { "epoch": 0.6998333333333333, "grad_norm": 45.09172439575195, "learning_rate": 4.364888152105396e-08, "logits/chosen": 2.598555088043213, "logits/rejected": 2.8424482345581055, "logps/chosen": -30.51693344116211, "logps/rejected": -106.43659973144531, "loss": 0.7795, "nll_loss": 0.7629233598709106, "rewards/accuracies": 1.0, "rewards/chosen": 1.2852500677108765, "rewards/margins": 9.081205368041992, "rewards/rejected": -7.795955657958984, "step": 4199 }, { "epoch": 0.7, "grad_norm": 37.433902740478516, "learning_rate": 4.360429701490934e-08, "logits/chosen": 2.1055471897125244, "logits/rejected": 2.491607904434204, "logps/chosen": -21.476634979248047, "logps/rejected": -144.78091430664062, "loss": 0.5872, "nll_loss": 0.5804494619369507, "rewards/accuracies": 1.0, "rewards/chosen": 2.9459190368652344, "rewards/margins": 8.208427429199219, "rewards/rejected": -5.262507915496826, "step": 4200 }, { "epoch": 0.7001666666666667, "grad_norm": 41.244998931884766, "learning_rate": 4.355972894111868e-08, "logits/chosen": 2.598383903503418, "logits/rejected": 2.5767414569854736, "logps/chosen": -36.07822036743164, "logps/rejected": -80.61653137207031, "loss": 1.1098, "nll_loss": 1.0611240863800049, "rewards/accuracies": 1.0, "rewards/chosen": 1.968019962310791, "rewards/margins": 4.838903427124023, "rewards/rejected": -2.8708834648132324, "step": 4201 }, { "epoch": 0.7003333333333334, "grad_norm": 74.85003662109375, "learning_rate": 4.351517731266795e-08, "logits/chosen": 2.0323894023895264, "logits/rejected": 1.9105538129806519, "logps/chosen": -18.50010108947754, "logps/rejected": -30.26009750366211, "loss": 0.9538, "nll_loss": 0.5781282186508179, "rewards/accuracies": 1.0, "rewards/chosen": 0.8219187259674072, "rewards/margins": 1.5237507820129395, "rewards/rejected": -0.7018320560455322, "step": 4202 }, { "epoch": 0.7005, "grad_norm": 40.344058990478516, "learning_rate": 4.34706421425385e-08, "logits/chosen": 2.0640621185302734, "logits/rejected": 1.9732375144958496, "logps/chosen": -130.96694946289062, "logps/rejected": -399.82513427734375, "loss": 1.0794, "nll_loss": 1.0561851263046265, "rewards/accuracies": 1.0, "rewards/chosen": 0.9545364379882812, "rewards/margins": 7.839186668395996, "rewards/rejected": -6.884650230407715, "step": 4203 }, { "epoch": 0.7006666666666667, "grad_norm": 29.630826950073242, "learning_rate": 4.342612344370677e-08, "logits/chosen": 1.8947480916976929, "logits/rejected": 1.5427676439285278, "logps/chosen": -31.588661193847656, "logps/rejected": -46.236839294433594, "loss": 0.612, "nll_loss": 0.5743393898010254, "rewards/accuracies": 1.0, "rewards/chosen": 2.13086199760437, "rewards/margins": 5.272668361663818, "rewards/rejected": -3.1418063640594482, "step": 4204 }, { "epoch": 0.7008333333333333, "grad_norm": 28.695327758789062, "learning_rate": 4.3381621229144514e-08, "logits/chosen": 2.4249420166015625, "logits/rejected": 2.691537380218506, "logps/chosen": -25.34670639038086, "logps/rejected": -295.6619873046875, "loss": 0.5075, "nll_loss": 0.5069341063499451, "rewards/accuracies": 1.0, "rewards/chosen": 5.50911808013916, "rewards/margins": 13.168519973754883, "rewards/rejected": -7.6594014167785645, "step": 4205 }, { "epoch": 0.701, "grad_norm": 219.32069396972656, "learning_rate": 4.333713551181851e-08, "logits/chosen": 2.6148974895477295, "logits/rejected": 2.4957141876220703, "logps/chosen": -109.13778686523438, "logps/rejected": -27.881031036376953, "loss": 2.8782, "nll_loss": 1.1735243797302246, "rewards/accuracies": 0.0, "rewards/chosen": 1.3530136346817017, "rewards/margins": -0.9035836458206177, "rewards/rejected": 2.2565972805023193, "step": 4206 }, { "epoch": 0.7011666666666667, "grad_norm": 24.80544090270996, "learning_rate": 4.3292666304690876e-08, "logits/chosen": 2.657115936279297, "logits/rejected": 2.731233596801758, "logps/chosen": -84.49385070800781, "logps/rejected": -333.4021301269531, "loss": 0.8893, "nll_loss": 0.8710706830024719, "rewards/accuracies": 1.0, "rewards/chosen": 1.1985687017440796, "rewards/margins": 8.594757080078125, "rewards/rejected": -7.396188259124756, "step": 4207 }, { "epoch": 0.7013333333333334, "grad_norm": 31.002384185791016, "learning_rate": 4.324821362071886e-08, "logits/chosen": 2.5896542072296143, "logits/rejected": 2.8434665203094482, "logps/chosen": -77.08821105957031, "logps/rejected": -291.14996337890625, "loss": 1.013, "nll_loss": 1.0011454820632935, "rewards/accuracies": 1.0, "rewards/chosen": 1.622937798500061, "rewards/margins": 10.391316413879395, "rewards/rejected": -8.768378257751465, "step": 4208 }, { "epoch": 0.7015, "grad_norm": 22.060346603393555, "learning_rate": 4.3203777472854964e-08, "logits/chosen": 2.8930869102478027, "logits/rejected": 2.882441997528076, "logps/chosen": -125.34324645996094, "logps/rejected": -198.85824584960938, "loss": 0.8789, "nll_loss": 0.8585152626037598, "rewards/accuracies": 1.0, "rewards/chosen": 1.8330795764923096, "rewards/margins": 6.095686912536621, "rewards/rejected": -4.262607097625732, "step": 4209 }, { "epoch": 0.7016666666666667, "grad_norm": 31.69324493408203, "learning_rate": 4.315935787404672e-08, "logits/chosen": 2.489821434020996, "logits/rejected": 2.7698633670806885, "logps/chosen": -64.92826080322266, "logps/rejected": -219.65367126464844, "loss": 1.0382, "nll_loss": 1.014504075050354, "rewards/accuracies": 1.0, "rewards/chosen": 1.0228843688964844, "rewards/margins": 6.830135822296143, "rewards/rejected": -5.807251453399658, "step": 4210 }, { "epoch": 0.7018333333333333, "grad_norm": 22.784793853759766, "learning_rate": 4.311495483723698e-08, "logits/chosen": 2.6991400718688965, "logits/rejected": 2.7891123294830322, "logps/chosen": -69.55740356445312, "logps/rejected": -72.64608764648438, "loss": 0.7095, "nll_loss": 0.6819354295730591, "rewards/accuracies": 1.0, "rewards/chosen": 1.347852349281311, "rewards/margins": 5.577327728271484, "rewards/rejected": -4.229475498199463, "step": 4211 }, { "epoch": 0.702, "grad_norm": 30.418235778808594, "learning_rate": 4.3070568375363726e-08, "logits/chosen": 2.9128589630126953, "logits/rejected": 2.965472936630249, "logps/chosen": -90.8904800415039, "logps/rejected": -200.4649658203125, "loss": 1.0298, "nll_loss": 1.021241307258606, "rewards/accuracies": 1.0, "rewards/chosen": 1.9714195728302002, "rewards/margins": 10.231718063354492, "rewards/rejected": -8.260298728942871, "step": 4212 }, { "epoch": 0.7021666666666667, "grad_norm": 32.30977249145508, "learning_rate": 4.302619850136013e-08, "logits/chosen": 2.5507075786590576, "logits/rejected": 2.4599616527557373, "logps/chosen": -58.11704635620117, "logps/rejected": -45.7899169921875, "loss": 0.8492, "nll_loss": 0.7853654623031616, "rewards/accuracies": 1.0, "rewards/chosen": 1.8532695770263672, "rewards/margins": 4.418567180633545, "rewards/rejected": -2.5652976036071777, "step": 4213 }, { "epoch": 0.7023333333333334, "grad_norm": 46.838768005371094, "learning_rate": 4.2981845228154446e-08, "logits/chosen": 2.2559638023376465, "logits/rejected": 2.3643758296966553, "logps/chosen": -38.89502716064453, "logps/rejected": -71.87869262695312, "loss": 0.9955, "nll_loss": 0.9486590027809143, "rewards/accuracies": 1.0, "rewards/chosen": 0.6234947443008423, "rewards/margins": 4.789351463317871, "rewards/rejected": -4.165856838226318, "step": 4214 }, { "epoch": 0.7025, "grad_norm": 24.744178771972656, "learning_rate": 4.2937508568670186e-08, "logits/chosen": 2.0765326023101807, "logits/rejected": 2.260209560394287, "logps/chosen": -61.52804946899414, "logps/rejected": -135.0095672607422, "loss": 0.8622, "nll_loss": 0.8545563220977783, "rewards/accuracies": 1.0, "rewards/chosen": 2.241185426712036, "rewards/margins": 8.588088035583496, "rewards/rejected": -6.346902370452881, "step": 4215 }, { "epoch": 0.7026666666666667, "grad_norm": 48.76865768432617, "learning_rate": 4.2893188535825994e-08, "logits/chosen": 2.0810327529907227, "logits/rejected": 2.1507999897003174, "logps/chosen": -45.195281982421875, "logps/rejected": -144.53411865234375, "loss": 0.9481, "nll_loss": 0.7409060597419739, "rewards/accuracies": 1.0, "rewards/chosen": 5.038414001464844, "rewards/margins": 5.650964260101318, "rewards/rejected": -0.6125503778457642, "step": 4216 }, { "epoch": 0.7028333333333333, "grad_norm": 33.52822494506836, "learning_rate": 4.284888514253571e-08, "logits/chosen": 2.55523943901062, "logits/rejected": 2.7877492904663086, "logps/chosen": -43.936912536621094, "logps/rejected": -445.197509765625, "loss": 0.9164, "nll_loss": 0.9153524041175842, "rewards/accuracies": 1.0, "rewards/chosen": 4.203608989715576, "rewards/margins": 13.073863983154297, "rewards/rejected": -8.870254516601562, "step": 4217 }, { "epoch": 0.703, "grad_norm": 38.11103057861328, "learning_rate": 4.280459840170817e-08, "logits/chosen": 1.9502675533294678, "logits/rejected": 2.152120351791382, "logps/chosen": -16.350297927856445, "logps/rejected": -260.82220458984375, "loss": 0.6341, "nll_loss": 0.6288576722145081, "rewards/accuracies": 1.0, "rewards/chosen": 3.4165596961975098, "rewards/margins": 8.768610000610352, "rewards/rejected": -5.35205078125, "step": 4218 }, { "epoch": 0.7031666666666667, "grad_norm": 79.9964828491211, "learning_rate": 4.276032832624753e-08, "logits/chosen": 1.535131812095642, "logits/rejected": 1.7514466047286987, "logps/chosen": -38.7352294921875, "logps/rejected": -253.84451293945312, "loss": 1.6895, "nll_loss": 1.6841405630111694, "rewards/accuracies": 1.0, "rewards/chosen": 2.459420919418335, "rewards/margins": 10.802292823791504, "rewards/rejected": -8.34287166595459, "step": 4219 }, { "epoch": 0.7033333333333334, "grad_norm": 26.051467895507812, "learning_rate": 4.271607492905303e-08, "logits/chosen": 2.4412403106689453, "logits/rejected": 2.634753942489624, "logps/chosen": -150.25372314453125, "logps/rejected": -310.1611633300781, "loss": 1.3733, "nll_loss": 1.365943193435669, "rewards/accuracies": 1.0, "rewards/chosen": 2.120858907699585, "rewards/margins": 10.412238121032715, "rewards/rejected": -8.29137897491455, "step": 4220 }, { "epoch": 0.7035, "grad_norm": 29.955636978149414, "learning_rate": 4.267183822301903e-08, "logits/chosen": 2.238715171813965, "logits/rejected": 2.289078950881958, "logps/chosen": -94.70708465576172, "logps/rejected": -120.10457611083984, "loss": 1.1495, "nll_loss": 1.1274653673171997, "rewards/accuracies": 1.0, "rewards/chosen": 1.1873635053634644, "rewards/margins": 6.538644790649414, "rewards/rejected": -5.35128116607666, "step": 4221 }, { "epoch": 0.7036666666666667, "grad_norm": 26.748796463012695, "learning_rate": 4.26276182210351e-08, "logits/chosen": 1.0901620388031006, "logits/rejected": 2.0906565189361572, "logps/chosen": -65.22818756103516, "logps/rejected": -252.2348175048828, "loss": 0.8229, "nll_loss": 0.8052863478660583, "rewards/accuracies": 1.0, "rewards/chosen": 1.1991493701934814, "rewards/margins": 12.911253929138184, "rewards/rejected": -11.712104797363281, "step": 4222 }, { "epoch": 0.7038333333333333, "grad_norm": 20.16349983215332, "learning_rate": 4.25834149359858e-08, "logits/chosen": 2.658623456954956, "logits/rejected": 2.6023833751678467, "logps/chosen": -133.659423828125, "logps/rejected": -292.7396545410156, "loss": 0.8671, "nll_loss": 0.8513339161872864, "rewards/accuracies": 1.0, "rewards/chosen": 2.499725341796875, "rewards/margins": 6.6796555519104, "rewards/rejected": -4.179930210113525, "step": 4223 }, { "epoch": 0.704, "grad_norm": 244.92857360839844, "learning_rate": 4.2539228380750946e-08, "logits/chosen": 1.9018893241882324, "logits/rejected": 1.5733720064163208, "logps/chosen": -109.51714324951172, "logps/rejected": -48.478294372558594, "loss": 3.4831, "nll_loss": 1.2734551429748535, "rewards/accuracies": 0.0, "rewards/chosen": 0.9223991632461548, "rewards/margins": -1.6149286031723022, "rewards/rejected": 2.537327766418457, "step": 4224 }, { "epoch": 0.7041666666666667, "grad_norm": 26.585664749145508, "learning_rate": 4.249505856820545e-08, "logits/chosen": 3.2089171409606934, "logits/rejected": 3.342259168624878, "logps/chosen": -88.22583770751953, "logps/rejected": -413.02520751953125, "loss": 0.9597, "nll_loss": 0.9385728240013123, "rewards/accuracies": 1.0, "rewards/chosen": 1.1623848676681519, "rewards/margins": 6.9121012687683105, "rewards/rejected": -5.749716281890869, "step": 4225 }, { "epoch": 0.7043333333333334, "grad_norm": 26.166366577148438, "learning_rate": 4.245090551121935e-08, "logits/chosen": 2.276437759399414, "logits/rejected": 2.3396413326263428, "logps/chosen": -83.71402740478516, "logps/rejected": -266.33502197265625, "loss": 0.9665, "nll_loss": 0.9622302651405334, "rewards/accuracies": 1.0, "rewards/chosen": 2.7734241485595703, "rewards/margins": 10.084129333496094, "rewards/rejected": -7.310705661773682, "step": 4226 }, { "epoch": 0.7045, "grad_norm": 31.61259651184082, "learning_rate": 4.240676922265774e-08, "logits/chosen": 2.541050910949707, "logits/rejected": 2.6390624046325684, "logps/chosen": -28.99653434753418, "logps/rejected": -223.6391143798828, "loss": 0.67, "nll_loss": 0.6590122580528259, "rewards/accuracies": 1.0, "rewards/chosen": 1.819172978401184, "rewards/margins": 8.183292388916016, "rewards/rejected": -6.364119052886963, "step": 4227 }, { "epoch": 0.7046666666666667, "grad_norm": 19.152114868164062, "learning_rate": 4.2362649715380884e-08, "logits/chosen": 1.0775830745697021, "logits/rejected": 0.6594593524932861, "logps/chosen": -206.06735229492188, "logps/rejected": -144.96835327148438, "loss": 0.7199, "nll_loss": 0.6891884803771973, "rewards/accuracies": 1.0, "rewards/chosen": 1.272894263267517, "rewards/margins": 5.367213726043701, "rewards/rejected": -4.0943193435668945, "step": 4228 }, { "epoch": 0.7048333333333333, "grad_norm": 29.55710792541504, "learning_rate": 4.231854700224416e-08, "logits/chosen": 1.7675087451934814, "logits/rejected": 2.311816453933716, "logps/chosen": -39.440589904785156, "logps/rejected": -303.4801330566406, "loss": 0.806, "nll_loss": 0.8049100637435913, "rewards/accuracies": 1.0, "rewards/chosen": 4.988663673400879, "rewards/margins": 11.882047653198242, "rewards/rejected": -6.893383979797363, "step": 4229 }, { "epoch": 0.705, "grad_norm": 32.64604187011719, "learning_rate": 4.227446109609808e-08, "logits/chosen": 2.6209943294525146, "logits/rejected": 3.07853364944458, "logps/chosen": -45.90407943725586, "logps/rejected": -213.72747802734375, "loss": 0.7746, "nll_loss": 0.7172513604164124, "rewards/accuracies": 1.0, "rewards/chosen": 0.42126965522766113, "rewards/margins": 4.436430931091309, "rewards/rejected": -4.015161037445068, "step": 4230 }, { "epoch": 0.7051666666666667, "grad_norm": 29.104379653930664, "learning_rate": 4.223039200978814e-08, "logits/chosen": 2.379755735397339, "logits/rejected": 2.6584577560424805, "logps/chosen": -20.910369873046875, "logps/rejected": -297.1441650390625, "loss": 0.5509, "nll_loss": 0.5502728819847107, "rewards/accuracies": 1.0, "rewards/chosen": 5.550574779510498, "rewards/margins": 12.952136993408203, "rewards/rejected": -7.401562690734863, "step": 4231 }, { "epoch": 0.7053333333333334, "grad_norm": 40.20827865600586, "learning_rate": 4.2186339756155064e-08, "logits/chosen": 1.6920782327651978, "logits/rejected": 1.5568406581878662, "logps/chosen": -33.22319412231445, "logps/rejected": -58.41694641113281, "loss": 0.5709, "nll_loss": 0.4205467104911804, "rewards/accuracies": 1.0, "rewards/chosen": 0.9932469129562378, "rewards/margins": 2.838697910308838, "rewards/rejected": -1.8454509973526, "step": 4232 }, { "epoch": 0.7055, "grad_norm": 29.016149520874023, "learning_rate": 4.2142304348034594e-08, "logits/chosen": 2.3156681060791016, "logits/rejected": 2.3098416328430176, "logps/chosen": -54.1867561340332, "logps/rejected": -150.82028198242188, "loss": 0.7841, "nll_loss": 0.7631937265396118, "rewards/accuracies": 1.0, "rewards/chosen": 1.0272587537765503, "rewards/margins": 9.339494705200195, "rewards/rejected": -8.312235832214355, "step": 4233 }, { "epoch": 0.7056666666666667, "grad_norm": 24.057788848876953, "learning_rate": 4.209828579825766e-08, "logits/chosen": 1.6117018461227417, "logits/rejected": 2.2580599784851074, "logps/chosen": -42.255653381347656, "logps/rejected": -391.957763671875, "loss": 0.5919, "nll_loss": 0.586884081363678, "rewards/accuracies": 1.0, "rewards/chosen": 2.5458123683929443, "rewards/margins": 10.498468399047852, "rewards/rejected": -7.952655792236328, "step": 4234 }, { "epoch": 0.7058333333333333, "grad_norm": 82.85806274414062, "learning_rate": 4.2054284119650106e-08, "logits/chosen": 2.698728561401367, "logits/rejected": 2.1395485401153564, "logps/chosen": -148.9918212890625, "logps/rejected": -164.39517211914062, "loss": 1.3993, "nll_loss": 1.30694580078125, "rewards/accuracies": 1.0, "rewards/chosen": -0.5878494381904602, "rewards/margins": 5.457939624786377, "rewards/rejected": -6.0457892417907715, "step": 4235 }, { "epoch": 0.706, "grad_norm": 34.08925247192383, "learning_rate": 4.201029932503303e-08, "logits/chosen": 2.7250988483428955, "logits/rejected": 2.766770362854004, "logps/chosen": -13.78919792175293, "logps/rejected": -192.37570190429688, "loss": 0.3826, "nll_loss": 0.3363218903541565, "rewards/accuracies": 1.0, "rewards/chosen": 1.2055469751358032, "rewards/margins": 4.601888179779053, "rewards/rejected": -3.396341323852539, "step": 4236 }, { "epoch": 0.7061666666666667, "grad_norm": 37.42695999145508, "learning_rate": 4.1966331427222524e-08, "logits/chosen": 3.125364065170288, "logits/rejected": 3.119558334350586, "logps/chosen": -73.53182220458984, "logps/rejected": -54.94183349609375, "loss": 1.047, "nll_loss": 1.021275520324707, "rewards/accuracies": 1.0, "rewards/chosen": 1.2426528930664062, "rewards/margins": 5.836674690246582, "rewards/rejected": -4.594021797180176, "step": 4237 }, { "epoch": 0.7063333333333334, "grad_norm": 28.2326602935791, "learning_rate": 4.1922380439029814e-08, "logits/chosen": 2.3700222969055176, "logits/rejected": 2.465611457824707, "logps/chosen": -52.46416473388672, "logps/rejected": -77.27024841308594, "loss": 0.7978, "nll_loss": 0.7389318346977234, "rewards/accuracies": 1.0, "rewards/chosen": 1.3474243879318237, "rewards/margins": 4.271193981170654, "rewards/rejected": -2.92376971244812, "step": 4238 }, { "epoch": 0.7065, "grad_norm": 20.165437698364258, "learning_rate": 4.18784463732611e-08, "logits/chosen": 3.1908931732177734, "logits/rejected": 3.243006706237793, "logps/chosen": -129.61380004882812, "logps/rejected": -178.577880859375, "loss": 0.9549, "nll_loss": 0.9460861086845398, "rewards/accuracies": 1.0, "rewards/chosen": 1.927717685699463, "rewards/margins": 10.262004852294922, "rewards/rejected": -8.3342866897583, "step": 4239 }, { "epoch": 0.7066666666666667, "grad_norm": 23.243370056152344, "learning_rate": 4.183452924271775e-08, "logits/chosen": 1.3675353527069092, "logits/rejected": 1.9138778448104858, "logps/chosen": -49.74747848510742, "logps/rejected": -307.9575500488281, "loss": 0.6265, "nll_loss": 0.6218434572219849, "rewards/accuracies": 1.0, "rewards/chosen": 2.8042378425598145, "rewards/margins": 9.325695037841797, "rewards/rejected": -6.521457672119141, "step": 4240 }, { "epoch": 0.7068333333333333, "grad_norm": 26.175607681274414, "learning_rate": 4.179062906019617e-08, "logits/chosen": 1.1920405626296997, "logits/rejected": 1.0878961086273193, "logps/chosen": -110.15166473388672, "logps/rejected": -121.59383392333984, "loss": 0.8008, "nll_loss": 0.749330997467041, "rewards/accuracies": 1.0, "rewards/chosen": 1.1473907232284546, "rewards/margins": 4.419851779937744, "rewards/rejected": -3.2724609375, "step": 4241 }, { "epoch": 0.707, "grad_norm": 29.649051666259766, "learning_rate": 4.17467458384878e-08, "logits/chosen": 3.08843994140625, "logits/rejected": 2.9686198234558105, "logps/chosen": -79.00662231445312, "logps/rejected": -78.03285217285156, "loss": 0.8746, "nll_loss": 0.8682048320770264, "rewards/accuracies": 1.0, "rewards/chosen": 2.3544089794158936, "rewards/margins": 9.258748054504395, "rewards/rejected": -6.90433931350708, "step": 4242 }, { "epoch": 0.7071666666666667, "grad_norm": 181.445556640625, "learning_rate": 4.17028795903792e-08, "logits/chosen": 2.0033624172210693, "logits/rejected": 2.0765254497528076, "logps/chosen": -65.4339599609375, "logps/rejected": -100.67060852050781, "loss": 2.503, "nll_loss": 1.8176100254058838, "rewards/accuracies": 1.0, "rewards/chosen": -1.7104949951171875, "rewards/margins": 0.3721787929534912, "rewards/rejected": -2.0826737880706787, "step": 4243 }, { "epoch": 0.7073333333333334, "grad_norm": 24.239097595214844, "learning_rate": 4.1659030328651866e-08, "logits/chosen": 1.7534940242767334, "logits/rejected": 1.8031020164489746, "logps/chosen": -39.17974090576172, "logps/rejected": -109.64677429199219, "loss": 0.5844, "nll_loss": 0.5678223371505737, "rewards/accuracies": 1.0, "rewards/chosen": 1.3197166919708252, "rewards/margins": 8.123419761657715, "rewards/rejected": -6.803703308105469, "step": 4244 }, { "epoch": 0.7075, "grad_norm": 34.213417053222656, "learning_rate": 4.161519806608247e-08, "logits/chosen": 3.339101791381836, "logits/rejected": 3.392916679382324, "logps/chosen": -69.28376770019531, "logps/rejected": -60.85187530517578, "loss": 1.0715, "nll_loss": 1.0340862274169922, "rewards/accuracies": 1.0, "rewards/chosen": 2.4925057888031006, "rewards/margins": 5.528717041015625, "rewards/rejected": -3.0362114906311035, "step": 4245 }, { "epoch": 0.7076666666666667, "grad_norm": 38.632225036621094, "learning_rate": 4.157138281544268e-08, "logits/chosen": 2.5850043296813965, "logits/rejected": 2.565063953399658, "logps/chosen": -46.26115417480469, "logps/rejected": -70.15060424804688, "loss": 0.9388, "nll_loss": 0.9252232313156128, "rewards/accuracies": 1.0, "rewards/chosen": 1.8390220403671265, "rewards/margins": 7.08917760848999, "rewards/rejected": -5.250155448913574, "step": 4246 }, { "epoch": 0.7078333333333333, "grad_norm": 121.84833526611328, "learning_rate": 4.152758458949922e-08, "logits/chosen": 2.7128732204437256, "logits/rejected": 3.0263168811798096, "logps/chosen": -38.28700256347656, "logps/rejected": -114.16552734375, "loss": 1.9476, "nll_loss": 1.6646521091461182, "rewards/accuracies": 1.0, "rewards/chosen": -2.0533926486968994, "rewards/margins": 3.073842763900757, "rewards/rejected": -5.127235412597656, "step": 4247 }, { "epoch": 0.708, "grad_norm": 133.4755401611328, "learning_rate": 4.1483803401013795e-08, "logits/chosen": 2.7228965759277344, "logits/rejected": 2.7412710189819336, "logps/chosen": -39.914756774902344, "logps/rejected": -94.81014251708984, "loss": 2.0376, "nll_loss": 1.9007024765014648, "rewards/accuracies": 1.0, "rewards/chosen": 0.16268311440944672, "rewards/margins": 2.7827956676483154, "rewards/rejected": -2.620112657546997, "step": 4248 }, { "epoch": 0.7081666666666667, "grad_norm": 23.339099884033203, "learning_rate": 4.144003926274322e-08, "logits/chosen": 1.603965401649475, "logits/rejected": 1.6072635650634766, "logps/chosen": -18.602012634277344, "logps/rejected": -152.97674560546875, "loss": 0.3534, "nll_loss": 0.3207243084907532, "rewards/accuracies": 1.0, "rewards/chosen": 1.2896828651428223, "rewards/margins": 5.236110687255859, "rewards/rejected": -3.946428060531616, "step": 4249 }, { "epoch": 0.7083333333333334, "grad_norm": 26.75328254699707, "learning_rate": 4.139629218743931e-08, "logits/chosen": 1.5493261814117432, "logits/rejected": 2.543201446533203, "logps/chosen": -56.20878601074219, "logps/rejected": -256.34222412109375, "loss": 0.8402, "nll_loss": 0.8389371037483215, "rewards/accuracies": 1.0, "rewards/chosen": 4.621631622314453, "rewards/margins": 11.495033264160156, "rewards/rejected": -6.873401641845703, "step": 4250 }, { "epoch": 0.7085, "grad_norm": 27.75149154663086, "learning_rate": 4.135256218784895e-08, "logits/chosen": 0.8985016345977783, "logits/rejected": 2.1541526317596436, "logps/chosen": -131.272705078125, "logps/rejected": -362.68658447265625, "loss": 1.1379, "nll_loss": 1.131661295890808, "rewards/accuracies": 1.0, "rewards/chosen": 2.295156955718994, "rewards/margins": 10.261636734008789, "rewards/rejected": -7.966479778289795, "step": 4251 }, { "epoch": 0.7086666666666667, "grad_norm": 46.08702850341797, "learning_rate": 4.1308849276713955e-08, "logits/chosen": 2.844963550567627, "logits/rejected": 2.7293026447296143, "logps/chosen": -103.57902526855469, "logps/rejected": -71.49633026123047, "loss": 1.0363, "nll_loss": 0.856024980545044, "rewards/accuracies": 1.0, "rewards/chosen": 1.2755264043807983, "rewards/margins": 2.7343735694885254, "rewards/rejected": -1.4588470458984375, "step": 4252 }, { "epoch": 0.7088333333333333, "grad_norm": 28.314001083374023, "learning_rate": 4.126515346677123e-08, "logits/chosen": 2.375170946121216, "logits/rejected": 2.3562588691711426, "logps/chosen": -69.49278259277344, "logps/rejected": -116.53239440917969, "loss": 0.9159, "nll_loss": 0.9025035500526428, "rewards/accuracies": 1.0, "rewards/chosen": 1.5900474786758423, "rewards/margins": 7.996225833892822, "rewards/rejected": -6.4061784744262695, "step": 4253 }, { "epoch": 0.709, "grad_norm": 20.51759147644043, "learning_rate": 4.1221474770752695e-08, "logits/chosen": 3.5706849098205566, "logits/rejected": 3.5269522666931152, "logps/chosen": -114.6956787109375, "logps/rejected": -150.71749877929688, "loss": 0.7389, "nll_loss": 0.730545699596405, "rewards/accuracies": 1.0, "rewards/chosen": 2.023158311843872, "rewards/margins": 9.35438060760498, "rewards/rejected": -7.3312225341796875, "step": 4254 }, { "epoch": 0.7091666666666666, "grad_norm": 31.465303421020508, "learning_rate": 4.1177813201385316e-08, "logits/chosen": 2.290374994277954, "logits/rejected": 3.1107940673828125, "logps/chosen": -40.93046569824219, "logps/rejected": -391.431884765625, "loss": 0.8545, "nll_loss": 0.8527181148529053, "rewards/accuracies": 1.0, "rewards/chosen": 4.575603485107422, "rewards/margins": 10.93505859375, "rewards/rejected": -6.359455108642578, "step": 4255 }, { "epoch": 0.7093333333333334, "grad_norm": 27.850732803344727, "learning_rate": 4.113416877139094e-08, "logits/chosen": 1.5028530359268188, "logits/rejected": 1.7230342626571655, "logps/chosen": -127.45317077636719, "logps/rejected": -489.4129333496094, "loss": 1.2772, "nll_loss": 1.2619127035140991, "rewards/accuracies": 1.0, "rewards/chosen": 1.3745408058166504, "rewards/margins": 8.92076587677002, "rewards/rejected": -7.546225070953369, "step": 4256 }, { "epoch": 0.7095, "grad_norm": 34.060054779052734, "learning_rate": 4.109054149348655e-08, "logits/chosen": 1.505926489830017, "logits/rejected": 1.7446165084838867, "logps/chosen": -28.69316864013672, "logps/rejected": -123.15042114257812, "loss": 0.4389, "nll_loss": 0.4282562732696533, "rewards/accuracies": 1.0, "rewards/chosen": 1.803523302078247, "rewards/margins": 8.575284004211426, "rewards/rejected": -6.7717604637146, "step": 4257 }, { "epoch": 0.7096666666666667, "grad_norm": 20.490705490112305, "learning_rate": 4.10469313803841e-08, "logits/chosen": 2.5690181255340576, "logits/rejected": 2.867478132247925, "logps/chosen": -80.60899353027344, "logps/rejected": -358.1692810058594, "loss": 0.7333, "nll_loss": 0.7262070775032043, "rewards/accuracies": 1.0, "rewards/chosen": 2.2019028663635254, "rewards/margins": 9.599568367004395, "rewards/rejected": -7.397665500640869, "step": 4258 }, { "epoch": 0.7098333333333333, "grad_norm": 31.966279983520508, "learning_rate": 4.100333844479055e-08, "logits/chosen": 2.5893783569335938, "logits/rejected": 2.7103512287139893, "logps/chosen": -30.275745391845703, "logps/rejected": -381.7578430175781, "loss": 0.7158, "nll_loss": 0.7040870785713196, "rewards/accuracies": 1.0, "rewards/chosen": 1.6129642724990845, "rewards/margins": 16.167804718017578, "rewards/rejected": -14.554840087890625, "step": 4259 }, { "epoch": 0.71, "grad_norm": 40.110408782958984, "learning_rate": 4.0959762699407765e-08, "logits/chosen": 2.0102615356445312, "logits/rejected": 1.917933702468872, "logps/chosen": -93.54659271240234, "logps/rejected": -113.556396484375, "loss": 1.1815, "nll_loss": 1.1408121585845947, "rewards/accuracies": 1.0, "rewards/chosen": 0.47649767994880676, "rewards/margins": 5.689768314361572, "rewards/rejected": -5.213270664215088, "step": 4260 }, { "epoch": 0.7101666666666666, "grad_norm": 22.778911590576172, "learning_rate": 4.091620415693271e-08, "logits/chosen": 2.946763277053833, "logits/rejected": 2.980243682861328, "logps/chosen": -27.143295288085938, "logps/rejected": -221.73313903808594, "loss": 0.4598, "nll_loss": 0.45238828659057617, "rewards/accuracies": 1.0, "rewards/chosen": 2.1320748329162598, "rewards/margins": 9.676057815551758, "rewards/rejected": -7.54398250579834, "step": 4261 }, { "epoch": 0.7103333333333334, "grad_norm": 39.10060501098633, "learning_rate": 4.087266283005729e-08, "logits/chosen": 1.683877944946289, "logits/rejected": 2.086890459060669, "logps/chosen": -21.414392471313477, "logps/rejected": -62.82809066772461, "loss": 0.6122, "nll_loss": 0.5353597402572632, "rewards/accuracies": 1.0, "rewards/chosen": 0.6645010709762573, "rewards/margins": 3.72507905960083, "rewards/rejected": -3.060577869415283, "step": 4262 }, { "epoch": 0.7105, "grad_norm": 40.206180572509766, "learning_rate": 4.0829138731468416e-08, "logits/chosen": 2.6984598636627197, "logits/rejected": 2.716874837875366, "logps/chosen": -35.639732360839844, "logps/rejected": -218.9429168701172, "loss": 1.0103, "nll_loss": 0.9899925589561462, "rewards/accuracies": 1.0, "rewards/chosen": 1.045312523841858, "rewards/margins": 10.269412994384766, "rewards/rejected": -9.224100112915039, "step": 4263 }, { "epoch": 0.7106666666666667, "grad_norm": 275.9961853027344, "learning_rate": 4.078563187384799e-08, "logits/chosen": 2.330730438232422, "logits/rejected": 2.366504430770874, "logps/chosen": -103.69828033447266, "logps/rejected": -198.989990234375, "loss": 4.034, "nll_loss": 3.1423723697662354, "rewards/accuracies": 1.0, "rewards/chosen": -3.254503011703491, "rewards/margins": 0.3160216808319092, "rewards/rejected": -3.5705246925354004, "step": 4264 }, { "epoch": 0.7108333333333333, "grad_norm": 26.20884895324707, "learning_rate": 4.0742142269872804e-08, "logits/chosen": 2.3829338550567627, "logits/rejected": 2.9259591102600098, "logps/chosen": -67.1619644165039, "logps/rejected": -244.333740234375, "loss": 0.8571, "nll_loss": 0.8395246267318726, "rewards/accuracies": 1.0, "rewards/chosen": 1.3922852277755737, "rewards/margins": 7.0158209800720215, "rewards/rejected": -5.623535633087158, "step": 4265 }, { "epoch": 0.711, "grad_norm": 51.769622802734375, "learning_rate": 4.0698669932214726e-08, "logits/chosen": 1.605854868888855, "logits/rejected": 2.2469027042388916, "logps/chosen": -51.46205139160156, "logps/rejected": -140.18115234375, "loss": 1.2046, "nll_loss": 1.169592022895813, "rewards/accuracies": 1.0, "rewards/chosen": 0.5550239682197571, "rewards/margins": 6.483733177185059, "rewards/rejected": -5.928709030151367, "step": 4266 }, { "epoch": 0.7111666666666666, "grad_norm": 30.270856857299805, "learning_rate": 4.0655214873540524e-08, "logits/chosen": 1.9394789934158325, "logits/rejected": 2.6341733932495117, "logps/chosen": -31.948251724243164, "logps/rejected": -163.40484619140625, "loss": 0.595, "nll_loss": 0.5808773040771484, "rewards/accuracies": 1.0, "rewards/chosen": 1.8231258392333984, "rewards/margins": 6.975137233734131, "rewards/rejected": -5.152011394500732, "step": 4267 }, { "epoch": 0.7113333333333334, "grad_norm": 27.859472274780273, "learning_rate": 4.0611777106512016e-08, "logits/chosen": 2.6282668113708496, "logits/rejected": 2.7309978008270264, "logps/chosen": -67.1510009765625, "logps/rejected": -390.5014953613281, "loss": 0.8816, "nll_loss": 0.8720911145210266, "rewards/accuracies": 1.0, "rewards/chosen": 1.8393791913986206, "rewards/margins": 10.74206829071045, "rewards/rejected": -8.902688980102539, "step": 4268 }, { "epoch": 0.7115, "grad_norm": 110.958984375, "learning_rate": 4.0568356643785854e-08, "logits/chosen": 2.2321908473968506, "logits/rejected": 2.1358933448791504, "logps/chosen": -67.31700134277344, "logps/rejected": -11.682498931884766, "loss": 1.8007, "nll_loss": 0.8013929724693298, "rewards/accuracies": 1.0, "rewards/chosen": 2.3278939723968506, "rewards/margins": 0.500293493270874, "rewards/rejected": 1.8276004791259766, "step": 4269 }, { "epoch": 0.7116666666666667, "grad_norm": 20.758697509765625, "learning_rate": 4.0524953498013745e-08, "logits/chosen": 2.1931824684143066, "logits/rejected": 2.3131861686706543, "logps/chosen": -117.12036895751953, "logps/rejected": -168.15689086914062, "loss": 0.8088, "nll_loss": 0.7808024287223816, "rewards/accuracies": 1.0, "rewards/chosen": 1.3956199884414673, "rewards/margins": 5.527270317077637, "rewards/rejected": -4.131650447845459, "step": 4270 }, { "epoch": 0.7118333333333333, "grad_norm": 51.13799285888672, "learning_rate": 4.048156768184232e-08, "logits/chosen": 2.232089042663574, "logits/rejected": 2.087357521057129, "logps/chosen": -38.25900650024414, "logps/rejected": -76.71168518066406, "loss": 0.8934, "nll_loss": 0.8897444009780884, "rewards/accuracies": 1.0, "rewards/chosen": 3.2378957271575928, "rewards/margins": 9.479443550109863, "rewards/rejected": -6.24154806137085, "step": 4271 }, { "epoch": 0.712, "grad_norm": 28.9720401763916, "learning_rate": 4.043819920791321e-08, "logits/chosen": 2.222970485687256, "logits/rejected": 1.7288910150527954, "logps/chosen": -113.48713684082031, "logps/rejected": -198.484375, "loss": 0.8884, "nll_loss": 0.8797452449798584, "rewards/accuracies": 1.0, "rewards/chosen": 2.000065565109253, "rewards/margins": 9.144874572753906, "rewards/rejected": -7.144809246063232, "step": 4272 }, { "epoch": 0.7121666666666666, "grad_norm": 79.07398223876953, "learning_rate": 4.039484808886285e-08, "logits/chosen": 2.3635013103485107, "logits/rejected": 2.132403612136841, "logps/chosen": -28.553436279296875, "logps/rejected": -43.371665954589844, "loss": 1.6539, "nll_loss": 1.5863021612167358, "rewards/accuracies": 1.0, "rewards/chosen": 2.0065879821777344, "rewards/margins": 4.452603340148926, "rewards/rejected": -2.4460151195526123, "step": 4273 }, { "epoch": 0.7123333333333334, "grad_norm": 34.33214569091797, "learning_rate": 4.03515143373228e-08, "logits/chosen": 2.3760786056518555, "logits/rejected": 2.369486093521118, "logps/chosen": -60.96881866455078, "logps/rejected": -189.40423583984375, "loss": 0.9851, "nll_loss": 0.96775883436203, "rewards/accuracies": 1.0, "rewards/chosen": 1.2369941473007202, "rewards/margins": 8.578333854675293, "rewards/rejected": -7.341340065002441, "step": 4274 }, { "epoch": 0.7125, "grad_norm": 34.763790130615234, "learning_rate": 4.030819796591949e-08, "logits/chosen": 1.0835994482040405, "logits/rejected": 1.8443470001220703, "logps/chosen": -36.959075927734375, "logps/rejected": -375.30914306640625, "loss": 0.7397, "nll_loss": 0.7391815781593323, "rewards/accuracies": 1.0, "rewards/chosen": 4.829446315765381, "rewards/margins": 13.82314682006836, "rewards/rejected": -8.993700981140137, "step": 4275 }, { "epoch": 0.7126666666666667, "grad_norm": 28.076847076416016, "learning_rate": 4.0264898987274185e-08, "logits/chosen": 1.8028007745742798, "logits/rejected": 2.365971088409424, "logps/chosen": -49.07958984375, "logps/rejected": -320.0676574707031, "loss": 0.6472, "nll_loss": 0.629225492477417, "rewards/accuracies": 1.0, "rewards/chosen": 1.184098482131958, "rewards/margins": 9.234169960021973, "rewards/rejected": -8.050071716308594, "step": 4276 }, { "epoch": 0.7128333333333333, "grad_norm": 27.52627944946289, "learning_rate": 4.0221617414003216e-08, "logits/chosen": 1.74578857421875, "logits/rejected": 1.6879594326019287, "logps/chosen": -37.843421936035156, "logps/rejected": -103.50293731689453, "loss": 0.5906, "nll_loss": 0.556520938873291, "rewards/accuracies": 1.0, "rewards/chosen": 2.0300300121307373, "rewards/margins": 5.354319095611572, "rewards/rejected": -3.324289083480835, "step": 4277 }, { "epoch": 0.713, "grad_norm": 28.9830265045166, "learning_rate": 4.0178353258717804e-08, "logits/chosen": 3.427208662033081, "logits/rejected": 3.8321545124053955, "logps/chosen": -108.14103698730469, "logps/rejected": -284.3484802246094, "loss": 1.1842, "nll_loss": 1.1754461526870728, "rewards/accuracies": 1.0, "rewards/chosen": 1.9156930446624756, "rewards/margins": 11.199894905090332, "rewards/rejected": -9.284201622009277, "step": 4278 }, { "epoch": 0.7131666666666666, "grad_norm": 28.486019134521484, "learning_rate": 4.0135106534024054e-08, "logits/chosen": 2.0532941818237305, "logits/rejected": 2.0484201908111572, "logps/chosen": -75.79248809814453, "logps/rejected": -153.00201416015625, "loss": 1.0429, "nll_loss": 1.0382533073425293, "rewards/accuracies": 1.0, "rewards/chosen": 3.0647335052490234, "rewards/margins": 8.971055030822754, "rewards/rejected": -5.9063215255737305, "step": 4279 }, { "epoch": 0.7133333333333334, "grad_norm": 25.12154197692871, "learning_rate": 4.009187725252309e-08, "logits/chosen": 1.0979560613632202, "logits/rejected": 2.226438283920288, "logps/chosen": -15.957401275634766, "logps/rejected": -434.68939208984375, "loss": 0.2959, "nll_loss": 0.2901345193386078, "rewards/accuracies": 1.0, "rewards/chosen": 2.3635241985321045, "rewards/margins": 10.519964218139648, "rewards/rejected": -8.156439781188965, "step": 4280 }, { "epoch": 0.7135, "grad_norm": 17.51656150817871, "learning_rate": 4.004866542681079e-08, "logits/chosen": 2.1337366104125977, "logits/rejected": 2.3532392978668213, "logps/chosen": -51.669708251953125, "logps/rejected": -363.5160827636719, "loss": 0.5746, "nll_loss": 0.5741078853607178, "rewards/accuracies": 1.0, "rewards/chosen": 4.938928127288818, "rewards/margins": 13.802373886108398, "rewards/rejected": -8.863446235656738, "step": 4281 }, { "epoch": 0.7136666666666667, "grad_norm": 35.90814208984375, "learning_rate": 4.000547106947809e-08, "logits/chosen": 2.7311136722564697, "logits/rejected": 2.6745798587799072, "logps/chosen": -99.37545776367188, "logps/rejected": -185.33575439453125, "loss": 0.5364, "nll_loss": 0.5070176124572754, "rewards/accuracies": 1.0, "rewards/chosen": 0.735273003578186, "rewards/margins": 6.81903076171875, "rewards/rejected": -6.0837578773498535, "step": 4282 }, { "epoch": 0.7138333333333333, "grad_norm": 54.31262969970703, "learning_rate": 3.99622941931108e-08, "logits/chosen": 2.21506667137146, "logits/rejected": 2.4078726768493652, "logps/chosen": -30.276752471923828, "logps/rejected": -103.98838806152344, "loss": 1.135, "nll_loss": 1.0440258979797363, "rewards/accuracies": 1.0, "rewards/chosen": 0.7781959772109985, "rewards/margins": 3.470391273498535, "rewards/rejected": -2.692195177078247, "step": 4283 }, { "epoch": 0.714, "grad_norm": 21.893465042114258, "learning_rate": 3.991913481028964e-08, "logits/chosen": 2.406531810760498, "logits/rejected": 2.7150137424468994, "logps/chosen": -80.2769775390625, "logps/rejected": -297.2869567871094, "loss": 0.7702, "nll_loss": 0.7573298811912537, "rewards/accuracies": 1.0, "rewards/chosen": 1.6613250970840454, "rewards/margins": 7.812485218048096, "rewards/rejected": -6.15116024017334, "step": 4284 }, { "epoch": 0.7141666666666666, "grad_norm": 36.32404327392578, "learning_rate": 3.9875992933590175e-08, "logits/chosen": 2.2334742546081543, "logits/rejected": 1.7693551778793335, "logps/chosen": -69.1344223022461, "logps/rejected": -49.89818572998047, "loss": 1.1433, "nll_loss": 1.0802253484725952, "rewards/accuracies": 1.0, "rewards/chosen": 2.1989731788635254, "rewards/margins": 4.678310394287109, "rewards/rejected": -2.479337453842163, "step": 4285 }, { "epoch": 0.7143333333333334, "grad_norm": 185.46678161621094, "learning_rate": 3.983286857558291e-08, "logits/chosen": 2.31854248046875, "logits/rejected": 2.187608480453491, "logps/chosen": -93.82999420166016, "logps/rejected": -15.9104642868042, "loss": 3.3921, "nll_loss": 1.066249966621399, "rewards/accuracies": 0.0, "rewards/chosen": 3.377408742904663, "rewards/margins": -1.1969335079193115, "rewards/rejected": 4.574342250823975, "step": 4286 }, { "epoch": 0.7145, "grad_norm": 25.796186447143555, "learning_rate": 3.978976174883328e-08, "logits/chosen": 2.531351089477539, "logits/rejected": 2.6795167922973633, "logps/chosen": -81.6611557006836, "logps/rejected": -269.0224304199219, "loss": 0.9219, "nll_loss": 0.917540967464447, "rewards/accuracies": 1.0, "rewards/chosen": 2.864745616912842, "rewards/margins": 9.447845458984375, "rewards/rejected": -6.583099365234375, "step": 4287 }, { "epoch": 0.7146666666666667, "grad_norm": 22.45151710510254, "learning_rate": 3.97466724659016e-08, "logits/chosen": 0.7364829778671265, "logits/rejected": 2.048064708709717, "logps/chosen": -43.20458221435547, "logps/rejected": -317.54327392578125, "loss": 0.5964, "nll_loss": 0.5760611891746521, "rewards/accuracies": 1.0, "rewards/chosen": 1.046826958656311, "rewards/margins": 9.425254821777344, "rewards/rejected": -8.378427505493164, "step": 4288 }, { "epoch": 0.7148333333333333, "grad_norm": 22.553836822509766, "learning_rate": 3.970360073934298e-08, "logits/chosen": 1.5880341529846191, "logits/rejected": 2.3064310550689697, "logps/chosen": -115.553955078125, "logps/rejected": -329.2244873046875, "loss": 0.9308, "nll_loss": 0.9244315028190613, "rewards/accuracies": 1.0, "rewards/chosen": 2.330828905105591, "rewards/margins": 9.3945894241333, "rewards/rejected": -7.063760280609131, "step": 4289 }, { "epoch": 0.715, "grad_norm": 42.25136184692383, "learning_rate": 3.9660546581707534e-08, "logits/chosen": 2.011469841003418, "logits/rejected": 1.921012043952942, "logps/chosen": -37.125831604003906, "logps/rejected": -66.80062103271484, "loss": 0.9574, "nll_loss": 0.9055081009864807, "rewards/accuracies": 1.0, "rewards/chosen": 0.2441047728061676, "rewards/margins": 5.1515679359436035, "rewards/rejected": -4.907463073730469, "step": 4290 }, { "epoch": 0.7151666666666666, "grad_norm": 27.833328247070312, "learning_rate": 3.961751000554019e-08, "logits/chosen": 2.3717846870422363, "logits/rejected": 2.0780396461486816, "logps/chosen": -77.68778991699219, "logps/rejected": -113.45722961425781, "loss": 0.9149, "nll_loss": 0.8929629921913147, "rewards/accuracies": 1.0, "rewards/chosen": 1.1723480224609375, "rewards/margins": 6.576770305633545, "rewards/rejected": -5.404422283172607, "step": 4291 }, { "epoch": 0.7153333333333334, "grad_norm": 32.15803527832031, "learning_rate": 3.957449102338083e-08, "logits/chosen": 3.709869623184204, "logits/rejected": 3.791929244995117, "logps/chosen": -63.177215576171875, "logps/rejected": -239.50283813476562, "loss": 0.8761, "nll_loss": 0.8654412031173706, "rewards/accuracies": 1.0, "rewards/chosen": 2.121220350265503, "rewards/margins": 7.479219436645508, "rewards/rejected": -5.357999324798584, "step": 4292 }, { "epoch": 0.7155, "grad_norm": 31.487668991088867, "learning_rate": 3.953148964776407e-08, "logits/chosen": 2.963144540786743, "logits/rejected": 3.1099345684051514, "logps/chosen": -78.8857650756836, "logps/rejected": -104.26950073242188, "loss": 0.9144, "nll_loss": 0.8765085935592651, "rewards/accuracies": 1.0, "rewards/chosen": 1.2029975652694702, "rewards/margins": 4.9575300216674805, "rewards/rejected": -3.7545325756073, "step": 4293 }, { "epoch": 0.7156666666666667, "grad_norm": 29.805349349975586, "learning_rate": 3.9488505891219514e-08, "logits/chosen": 2.6783511638641357, "logits/rejected": 2.8302133083343506, "logps/chosen": -56.61094665527344, "logps/rejected": -83.55731201171875, "loss": 0.7957, "nll_loss": 0.7754924893379211, "rewards/accuracies": 1.0, "rewards/chosen": 1.5746163129806519, "rewards/margins": 6.1949968338012695, "rewards/rejected": -4.620380401611328, "step": 4294 }, { "epoch": 0.7158333333333333, "grad_norm": 32.396217346191406, "learning_rate": 3.944553976627161e-08, "logits/chosen": 1.962117075920105, "logits/rejected": 1.7534259557724, "logps/chosen": -54.74036407470703, "logps/rejected": -94.27589416503906, "loss": 0.9702, "nll_loss": 0.9437993764877319, "rewards/accuracies": 1.0, "rewards/chosen": 0.9433311820030212, "rewards/margins": 6.382147789001465, "rewards/rejected": -5.438816547393799, "step": 4295 }, { "epoch": 0.716, "grad_norm": 32.39982604980469, "learning_rate": 3.9402591285439665e-08, "logits/chosen": 2.6090445518493652, "logits/rejected": 2.7295641899108887, "logps/chosen": -24.554718017578125, "logps/rejected": -136.6217041015625, "loss": 0.4543, "nll_loss": 0.423357218503952, "rewards/accuracies": 1.0, "rewards/chosen": 0.742931604385376, "rewards/margins": 6.271171569824219, "rewards/rejected": -5.528239727020264, "step": 4296 }, { "epoch": 0.7161666666666666, "grad_norm": 24.336076736450195, "learning_rate": 3.935966046123779e-08, "logits/chosen": 1.52757728099823, "logits/rejected": 2.320396900177002, "logps/chosen": -36.23891067504883, "logps/rejected": -446.82073974609375, "loss": 0.5327, "nll_loss": 0.5252015590667725, "rewards/accuracies": 1.0, "rewards/chosen": 2.0895462036132812, "rewards/margins": 10.202107429504395, "rewards/rejected": -8.112561225891113, "step": 4297 }, { "epoch": 0.7163333333333334, "grad_norm": 31.735639572143555, "learning_rate": 3.931674730617501e-08, "logits/chosen": 1.7309074401855469, "logits/rejected": 1.1124812364578247, "logps/chosen": -100.70819091796875, "logps/rejected": -77.90962219238281, "loss": 0.9556, "nll_loss": 0.9411979913711548, "rewards/accuracies": 1.0, "rewards/chosen": 2.239086866378784, "rewards/margins": 6.755237579345703, "rewards/rejected": -4.51615047454834, "step": 4298 }, { "epoch": 0.7165, "grad_norm": 36.86384582519531, "learning_rate": 3.9273851832755215e-08, "logits/chosen": 1.1646044254302979, "logits/rejected": 1.5414139032363892, "logps/chosen": -52.13766860961914, "logps/rejected": -185.85240173339844, "loss": 1.0592, "nll_loss": 1.0427534580230713, "rewards/accuracies": 1.0, "rewards/chosen": 1.7939565181732178, "rewards/margins": 6.5575151443481445, "rewards/rejected": -4.763558864593506, "step": 4299 }, { "epoch": 0.7166666666666667, "grad_norm": 27.895044326782227, "learning_rate": 3.923097405347708e-08, "logits/chosen": 3.202065944671631, "logits/rejected": 3.1904900074005127, "logps/chosen": -32.69203186035156, "logps/rejected": -143.10382080078125, "loss": 0.6222, "nll_loss": 0.6054079532623291, "rewards/accuracies": 1.0, "rewards/chosen": 1.4682426452636719, "rewards/margins": 6.9864044189453125, "rewards/rejected": -5.518161773681641, "step": 4300 }, { "epoch": 0.7168333333333333, "grad_norm": 30.002838134765625, "learning_rate": 3.9188113980834235e-08, "logits/chosen": 1.7117565870285034, "logits/rejected": 2.2773706912994385, "logps/chosen": -48.91206359863281, "logps/rejected": -155.11785888671875, "loss": 0.7082, "nll_loss": 0.6700282692909241, "rewards/accuracies": 1.0, "rewards/chosen": 0.5678329467773438, "rewards/margins": 5.676003456115723, "rewards/rejected": -5.108170509338379, "step": 4301 }, { "epoch": 0.717, "grad_norm": 188.39312744140625, "learning_rate": 3.914527162731498e-08, "logits/chosen": 2.34800386428833, "logits/rejected": 2.316824197769165, "logps/chosen": -66.32933044433594, "logps/rejected": -68.52281188964844, "loss": 1.3824, "nll_loss": 0.9342159032821655, "rewards/accuracies": 1.0, "rewards/chosen": -0.9828365445137024, "rewards/margins": 0.9848789572715759, "rewards/rejected": -1.9677155017852783, "step": 4302 }, { "epoch": 0.7171666666666666, "grad_norm": 132.34165954589844, "learning_rate": 3.9102447005402596e-08, "logits/chosen": 1.8268113136291504, "logits/rejected": 1.917448878288269, "logps/chosen": -21.060760498046875, "logps/rejected": -64.3871078491211, "loss": 1.4336, "nll_loss": 0.5542305111885071, "rewards/accuracies": 1.0, "rewards/chosen": 1.6412529945373535, "rewards/margins": 0.49569249153137207, "rewards/rejected": 1.1455605030059814, "step": 4303 }, { "epoch": 0.7173333333333334, "grad_norm": 29.882673263549805, "learning_rate": 3.905964012757513e-08, "logits/chosen": 2.5421478748321533, "logits/rejected": 2.7612314224243164, "logps/chosen": -83.46888732910156, "logps/rejected": -357.2603759765625, "loss": 0.9395, "nll_loss": 0.9172405004501343, "rewards/accuracies": 1.0, "rewards/chosen": 0.9560050964355469, "rewards/margins": 8.774645805358887, "rewards/rejected": -7.81864070892334, "step": 4304 }, { "epoch": 0.7175, "grad_norm": 63.95846176147461, "learning_rate": 3.901685100630554e-08, "logits/chosen": 2.574730157852173, "logits/rejected": 2.5864129066467285, "logps/chosen": -13.12889289855957, "logps/rejected": -26.99950408935547, "loss": 0.8979, "nll_loss": 0.3202168941497803, "rewards/accuracies": 1.0, "rewards/chosen": 2.709928035736084, "rewards/margins": 1.778151512145996, "rewards/rejected": 0.9317764639854431, "step": 4305 }, { "epoch": 0.7176666666666667, "grad_norm": 23.066848754882812, "learning_rate": 3.897407965406146e-08, "logits/chosen": 2.5330841541290283, "logits/rejected": 2.6875720024108887, "logps/chosen": -78.52009582519531, "logps/rejected": -292.5107116699219, "loss": 0.8588, "nll_loss": 0.8534793853759766, "rewards/accuracies": 1.0, "rewards/chosen": 2.4140305519104004, "rewards/margins": 12.57381820678711, "rewards/rejected": -10.15978717803955, "step": 4306 }, { "epoch": 0.7178333333333333, "grad_norm": 41.281578063964844, "learning_rate": 3.893132608330548e-08, "logits/chosen": 2.6909749507904053, "logits/rejected": 2.6975762844085693, "logps/chosen": -36.98768615722656, "logps/rejected": -128.89288330078125, "loss": 1.0118, "nll_loss": 0.9996671676635742, "rewards/accuracies": 1.0, "rewards/chosen": 2.089892625808716, "rewards/margins": 7.1551618576049805, "rewards/rejected": -5.0652689933776855, "step": 4307 }, { "epoch": 0.718, "grad_norm": 422.9586486816406, "learning_rate": 3.8888590306494974e-08, "logits/chosen": 2.3277747631073, "logits/rejected": 2.4499576091766357, "logps/chosen": -75.56121826171875, "logps/rejected": -132.93798828125, "loss": 5.7178, "nll_loss": 2.0989224910736084, "rewards/accuracies": 0.0, "rewards/chosen": -1.9731571674346924, "rewards/margins": -3.5067172050476074, "rewards/rejected": 1.5335601568222046, "step": 4308 }, { "epoch": 0.7181666666666666, "grad_norm": 32.230525970458984, "learning_rate": 3.884587233608212e-08, "logits/chosen": 1.2662451267242432, "logits/rejected": 3.006571054458618, "logps/chosen": -84.3622817993164, "logps/rejected": -529.3990478515625, "loss": 0.9437, "nll_loss": 0.9373586773872375, "rewards/accuracies": 1.0, "rewards/chosen": 2.2504913806915283, "rewards/margins": 11.745401382446289, "rewards/rejected": -9.49491024017334, "step": 4309 }, { "epoch": 0.7183333333333334, "grad_norm": 36.391536712646484, "learning_rate": 3.8803172184513874e-08, "logits/chosen": 2.488476276397705, "logits/rejected": 2.594130277633667, "logps/chosen": -42.5005989074707, "logps/rejected": -112.58415985107422, "loss": 0.9144, "nll_loss": 0.8854290843009949, "rewards/accuracies": 1.0, "rewards/chosen": 1.7765979766845703, "rewards/margins": 5.480064868927002, "rewards/rejected": -3.7034668922424316, "step": 4310 }, { "epoch": 0.7185, "grad_norm": 143.92079162597656, "learning_rate": 3.876048986423206e-08, "logits/chosen": 2.5836942195892334, "logits/rejected": 2.663151264190674, "logps/chosen": -66.2193603515625, "logps/rejected": -25.880373001098633, "loss": 2.1153, "nll_loss": 0.9197133183479309, "rewards/accuracies": 0.0, "rewards/chosen": 0.804888904094696, "rewards/margins": -0.3105112910270691, "rewards/rejected": 1.1154001951217651, "step": 4311 }, { "epoch": 0.7186666666666667, "grad_norm": 44.0219841003418, "learning_rate": 3.871782538767327e-08, "logits/chosen": 1.997334599494934, "logits/rejected": 1.8474222421646118, "logps/chosen": -45.71317672729492, "logps/rejected": -66.03292846679688, "loss": 1.1464, "nll_loss": 1.0884088277816772, "rewards/accuracies": 1.0, "rewards/chosen": 0.5541965365409851, "rewards/margins": 4.292450904846191, "rewards/rejected": -3.7382545471191406, "step": 4312 }, { "epoch": 0.7188333333333333, "grad_norm": 26.049724578857422, "learning_rate": 3.867517876726896e-08, "logits/chosen": 2.472123384475708, "logits/rejected": 2.3204312324523926, "logps/chosen": -77.12422180175781, "logps/rejected": -69.55593872070312, "loss": 0.7883, "nll_loss": 0.7636062502861023, "rewards/accuracies": 1.0, "rewards/chosen": 1.082891821861267, "rewards/margins": 6.257933616638184, "rewards/rejected": -5.175041675567627, "step": 4313 }, { "epoch": 0.719, "grad_norm": 33.487091064453125, "learning_rate": 3.863255001544525e-08, "logits/chosen": 3.071657419204712, "logits/rejected": 3.0414786338806152, "logps/chosen": -12.77957534790039, "logps/rejected": -42.88760757446289, "loss": 0.3871, "nll_loss": 0.3194893002510071, "rewards/accuracies": 1.0, "rewards/chosen": 2.297321081161499, "rewards/margins": 4.665349960327148, "rewards/rejected": -2.3680291175842285, "step": 4314 }, { "epoch": 0.7191666666666666, "grad_norm": 29.84770393371582, "learning_rate": 3.8589939144623174e-08, "logits/chosen": 1.0205906629562378, "logits/rejected": 2.208517074584961, "logps/chosen": -15.298259735107422, "logps/rejected": -340.6350402832031, "loss": 0.4792, "nll_loss": 0.47807061672210693, "rewards/accuracies": 1.0, "rewards/chosen": 4.069501876831055, "rewards/margins": 13.298417091369629, "rewards/rejected": -9.228915214538574, "step": 4315 }, { "epoch": 0.7193333333333334, "grad_norm": 59.72667694091797, "learning_rate": 3.854734616721852e-08, "logits/chosen": 2.936229705810547, "logits/rejected": 3.0306196212768555, "logps/chosen": -28.479007720947266, "logps/rejected": -81.98616027832031, "loss": 0.9688, "nll_loss": 0.9493001699447632, "rewards/accuracies": 1.0, "rewards/chosen": 2.317490816116333, "rewards/margins": 6.288395404815674, "rewards/rejected": -3.970904588699341, "step": 4316 }, { "epoch": 0.7195, "grad_norm": 23.46990203857422, "learning_rate": 3.8504771095641894e-08, "logits/chosen": 2.3616178035736084, "logits/rejected": 2.356635808944702, "logps/chosen": -60.21662139892578, "logps/rejected": -195.52850341796875, "loss": 0.7194, "nll_loss": 0.7001932263374329, "rewards/accuracies": 1.0, "rewards/chosen": 1.111975073814392, "rewards/margins": 9.064569473266602, "rewards/rejected": -7.952594757080078, "step": 4317 }, { "epoch": 0.7196666666666667, "grad_norm": 16.036563873291016, "learning_rate": 3.846221394229857e-08, "logits/chosen": 1.387776494026184, "logits/rejected": 1.4980638027191162, "logps/chosen": -201.87342834472656, "logps/rejected": -245.49441528320312, "loss": 0.8231, "nll_loss": 0.8140057325363159, "rewards/accuracies": 1.0, "rewards/chosen": 1.9920395612716675, "rewards/margins": 8.592540740966797, "rewards/rejected": -6.60050106048584, "step": 4318 }, { "epoch": 0.7198333333333333, "grad_norm": 26.542098999023438, "learning_rate": 3.8419674719588725e-08, "logits/chosen": 1.4617072343826294, "logits/rejected": 2.2404394149780273, "logps/chosen": -47.945404052734375, "logps/rejected": -194.65032958984375, "loss": 0.6835, "nll_loss": 0.6752873063087463, "rewards/accuracies": 1.0, "rewards/chosen": 2.375845432281494, "rewards/margins": 7.989801406860352, "rewards/rejected": -5.613955974578857, "step": 4319 }, { "epoch": 0.72, "grad_norm": 56.99986267089844, "learning_rate": 3.837715343990726e-08, "logits/chosen": 2.889024496078491, "logits/rejected": 2.927827835083008, "logps/chosen": -20.690792083740234, "logps/rejected": -172.05154418945312, "loss": 0.9643, "nll_loss": 0.9404904842376709, "rewards/accuracies": 1.0, "rewards/chosen": 2.523176908493042, "rewards/margins": 6.126503944396973, "rewards/rejected": -3.6033267974853516, "step": 4320 }, { "epoch": 0.7201666666666666, "grad_norm": 29.005430221557617, "learning_rate": 3.833465011564386e-08, "logits/chosen": 1.3539063930511475, "logits/rejected": 1.912044644355774, "logps/chosen": -38.25138854980469, "logps/rejected": -304.09173583984375, "loss": 0.803, "nll_loss": 0.7969038486480713, "rewards/accuracies": 1.0, "rewards/chosen": 2.7505204677581787, "rewards/margins": 8.500439643859863, "rewards/rejected": -5.749919414520264, "step": 4321 }, { "epoch": 0.7203333333333334, "grad_norm": 39.75768280029297, "learning_rate": 3.829216475918302e-08, "logits/chosen": 1.1193543672561646, "logits/rejected": 2.1032514572143555, "logps/chosen": -34.36951446533203, "logps/rejected": -268.8125915527344, "loss": 0.6753, "nll_loss": 0.673911988735199, "rewards/accuracies": 1.0, "rewards/chosen": 3.83931827545166, "rewards/margins": 13.01870346069336, "rewards/rejected": -9.1793851852417, "step": 4322 }, { "epoch": 0.7205, "grad_norm": 22.731882095336914, "learning_rate": 3.824969738290386e-08, "logits/chosen": 2.7703356742858887, "logits/rejected": 2.8110363483428955, "logps/chosen": -64.10130310058594, "logps/rejected": -99.81016540527344, "loss": 0.7572, "nll_loss": 0.7453640699386597, "rewards/accuracies": 1.0, "rewards/chosen": 3.172197103500366, "rewards/margins": 7.4704179763793945, "rewards/rejected": -4.298221111297607, "step": 4323 }, { "epoch": 0.7206666666666667, "grad_norm": 33.674957275390625, "learning_rate": 3.82072479991804e-08, "logits/chosen": 2.967883586883545, "logits/rejected": 3.219966411590576, "logps/chosen": -21.21783447265625, "logps/rejected": -279.2228698730469, "loss": 0.5868, "nll_loss": 0.5734550356864929, "rewards/accuracies": 1.0, "rewards/chosen": 1.6637283563613892, "rewards/margins": 7.549882411956787, "rewards/rejected": -5.8861541748046875, "step": 4324 }, { "epoch": 0.7208333333333333, "grad_norm": 36.191158294677734, "learning_rate": 3.816481662038137e-08, "logits/chosen": 2.5996484756469727, "logits/rejected": 2.799644947052002, "logps/chosen": -51.67372131347656, "logps/rejected": -327.43975830078125, "loss": 0.7811, "nll_loss": 0.7599076628684998, "rewards/accuracies": 1.0, "rewards/chosen": 1.0047889947891235, "rewards/margins": 9.134211540222168, "rewards/rejected": -8.129422187805176, "step": 4325 }, { "epoch": 0.721, "grad_norm": 26.854074478149414, "learning_rate": 3.81224032588703e-08, "logits/chosen": 2.663210868835449, "logits/rejected": 2.74503231048584, "logps/chosen": -45.670143127441406, "logps/rejected": -171.07546997070312, "loss": 0.6507, "nll_loss": 0.6343075037002563, "rewards/accuracies": 1.0, "rewards/chosen": 1.5800895690917969, "rewards/margins": 6.812961578369141, "rewards/rejected": -5.232872009277344, "step": 4326 }, { "epoch": 0.7211666666666666, "grad_norm": 32.257225036621094, "learning_rate": 3.8080007927005344e-08, "logits/chosen": 3.6475563049316406, "logits/rejected": 3.7967405319213867, "logps/chosen": -18.45806884765625, "logps/rejected": -184.7133026123047, "loss": 0.514, "nll_loss": 0.5127240419387817, "rewards/accuracies": 1.0, "rewards/chosen": 4.286769390106201, "rewards/margins": 11.5833740234375, "rewards/rejected": -7.296604156494141, "step": 4327 }, { "epoch": 0.7213333333333334, "grad_norm": 34.86585235595703, "learning_rate": 3.803763063713951e-08, "logits/chosen": 2.1424343585968018, "logits/rejected": 2.1470298767089844, "logps/chosen": -51.76129913330078, "logps/rejected": -53.0533332824707, "loss": 0.7902, "nll_loss": 0.7290325164794922, "rewards/accuracies": 1.0, "rewards/chosen": 0.8764358758926392, "rewards/margins": 4.112433910369873, "rewards/rejected": -3.2359979152679443, "step": 4328 }, { "epoch": 0.7215, "grad_norm": 26.55755615234375, "learning_rate": 3.799527140162054e-08, "logits/chosen": 1.5885796546936035, "logits/rejected": 1.9576407670974731, "logps/chosen": -71.85317993164062, "logps/rejected": -144.11434936523438, "loss": 0.9197, "nll_loss": 0.8981648683547974, "rewards/accuracies": 1.0, "rewards/chosen": 1.9505141973495483, "rewards/margins": 6.006328105926514, "rewards/rejected": -4.055813789367676, "step": 4329 }, { "epoch": 0.7216666666666667, "grad_norm": 33.4299201965332, "learning_rate": 3.7952930232790925e-08, "logits/chosen": 2.6699090003967285, "logits/rejected": 2.6799840927124023, "logps/chosen": -121.69074249267578, "logps/rejected": -119.70006561279297, "loss": 1.1834, "nll_loss": 1.1589595079421997, "rewards/accuracies": 1.0, "rewards/chosen": 1.0986350774765015, "rewards/margins": 6.240999221801758, "rewards/rejected": -5.142364025115967, "step": 4330 }, { "epoch": 0.7218333333333333, "grad_norm": 26.378751754760742, "learning_rate": 3.79106071429878e-08, "logits/chosen": 1.974770426750183, "logits/rejected": 1.9704089164733887, "logps/chosen": -47.67860794067383, "logps/rejected": -44.32589340209961, "loss": 0.6867, "nll_loss": 0.6443054676055908, "rewards/accuracies": 1.0, "rewards/chosen": 1.8629704713821411, "rewards/margins": 4.971579074859619, "rewards/rejected": -3.1086087226867676, "step": 4331 }, { "epoch": 0.722, "grad_norm": 24.657848358154297, "learning_rate": 3.786830214454314e-08, "logits/chosen": 2.8687539100646973, "logits/rejected": 2.727931261062622, "logps/chosen": -12.862323760986328, "logps/rejected": -49.292842864990234, "loss": 0.3111, "nll_loss": 0.28582942485809326, "rewards/accuracies": 1.0, "rewards/chosen": 1.992980718612671, "rewards/margins": 5.767717361450195, "rewards/rejected": -3.7747364044189453, "step": 4332 }, { "epoch": 0.7221666666666666, "grad_norm": 31.69844627380371, "learning_rate": 3.782601524978358e-08, "logits/chosen": 0.46663421392440796, "logits/rejected": 2.275193214416504, "logps/chosen": -63.78065490722656, "logps/rejected": -373.31671142578125, "loss": 0.8003, "nll_loss": 0.7778128385543823, "rewards/accuracies": 1.0, "rewards/chosen": 0.9417503476142883, "rewards/margins": 8.921730995178223, "rewards/rejected": -7.97998046875, "step": 4333 }, { "epoch": 0.7223333333333334, "grad_norm": 13.181014060974121, "learning_rate": 3.778374647103056e-08, "logits/chosen": 1.8041232824325562, "logits/rejected": 2.044543743133545, "logps/chosen": -168.4979248046875, "logps/rejected": -285.4012145996094, "loss": 0.5844, "nll_loss": 0.5770477652549744, "rewards/accuracies": 1.0, "rewards/chosen": 2.188546895980835, "rewards/margins": 9.075684547424316, "rewards/rejected": -6.887137413024902, "step": 4334 }, { "epoch": 0.7225, "grad_norm": 26.076093673706055, "learning_rate": 3.774149582060012e-08, "logits/chosen": 0.662532389163971, "logits/rejected": 1.6629619598388672, "logps/chosen": -24.04840660095215, "logps/rejected": -299.3753662109375, "loss": 0.4998, "nll_loss": 0.47153738141059875, "rewards/accuracies": 1.0, "rewards/chosen": 0.6848112344741821, "rewards/margins": 9.946937561035156, "rewards/rejected": -9.262125968933105, "step": 4335 }, { "epoch": 0.7226666666666667, "grad_norm": 28.33662986755371, "learning_rate": 3.769926331080311e-08, "logits/chosen": 2.1835079193115234, "logits/rejected": 2.03210186958313, "logps/chosen": -6.914756774902344, "logps/rejected": -53.62980651855469, "loss": 0.3204, "nll_loss": 0.2765902876853943, "rewards/accuracies": 1.0, "rewards/chosen": 1.0835379362106323, "rewards/margins": 4.698747634887695, "rewards/rejected": -3.6152095794677734, "step": 4336 }, { "epoch": 0.7228333333333333, "grad_norm": 23.644651412963867, "learning_rate": 3.765704895394508e-08, "logits/chosen": 1.6670728921890259, "logits/rejected": 1.602607011795044, "logps/chosen": -134.78073120117188, "logps/rejected": -130.00863647460938, "loss": 0.8863, "nll_loss": 0.858475923538208, "rewards/accuracies": 1.0, "rewards/chosen": 2.343013048171997, "rewards/margins": 5.813708305358887, "rewards/rejected": -3.4706954956054688, "step": 4337 }, { "epoch": 0.723, "grad_norm": 24.558338165283203, "learning_rate": 3.76148527623263e-08, "logits/chosen": 3.1901133060455322, "logits/rejected": 3.1489055156707764, "logps/chosen": -81.96022033691406, "logps/rejected": -136.11717224121094, "loss": 0.8529, "nll_loss": 0.8363286852836609, "rewards/accuracies": 1.0, "rewards/chosen": 2.137943983078003, "rewards/margins": 6.487758636474609, "rewards/rejected": -4.3498148918151855, "step": 4338 }, { "epoch": 0.7231666666666666, "grad_norm": 30.384523391723633, "learning_rate": 3.757267474824167e-08, "logits/chosen": 2.7754979133605957, "logits/rejected": 2.9793922901153564, "logps/chosen": -29.16736602783203, "logps/rejected": -280.5412902832031, "loss": 0.7483, "nll_loss": 0.7478811740875244, "rewards/accuracies": 1.0, "rewards/chosen": 5.125208377838135, "rewards/margins": 14.373086929321289, "rewards/rejected": -9.247878074645996, "step": 4339 }, { "epoch": 0.7233333333333334, "grad_norm": 22.019624710083008, "learning_rate": 3.7530514923980885e-08, "logits/chosen": 2.0659661293029785, "logits/rejected": 2.1541740894317627, "logps/chosen": -81.98248291015625, "logps/rejected": -409.742919921875, "loss": 0.903, "nll_loss": 0.9009063839912415, "rewards/accuracies": 1.0, "rewards/chosen": 3.429918050765991, "rewards/margins": 11.805729866027832, "rewards/rejected": -8.375811576843262, "step": 4340 }, { "epoch": 0.7235, "grad_norm": 198.3323211669922, "learning_rate": 3.7488373301828294e-08, "logits/chosen": 2.753939151763916, "logits/rejected": 2.73250675201416, "logps/chosen": -88.85730743408203, "logps/rejected": -12.057467460632324, "loss": 3.9725, "nll_loss": 0.9452905654907227, "rewards/accuracies": 0.0, "rewards/chosen": 0.6229118704795837, "rewards/margins": -2.5705771446228027, "rewards/rejected": 3.1934890747070312, "step": 4341 }, { "epoch": 0.7236666666666667, "grad_norm": 39.782798767089844, "learning_rate": 3.744624989406296e-08, "logits/chosen": 3.0285613536834717, "logits/rejected": 3.137542486190796, "logps/chosen": -47.532676696777344, "logps/rejected": -114.722900390625, "loss": 0.8983, "nll_loss": 0.8339066505432129, "rewards/accuracies": 1.0, "rewards/chosen": 0.1483737975358963, "rewards/margins": 4.433645725250244, "rewards/rejected": -4.285272121429443, "step": 4342 }, { "epoch": 0.7238333333333333, "grad_norm": 27.589828491210938, "learning_rate": 3.7404144712958684e-08, "logits/chosen": 2.5200634002685547, "logits/rejected": 2.7263975143432617, "logps/chosen": -23.39286994934082, "logps/rejected": -161.25747680664062, "loss": 0.4556, "nll_loss": 0.4498628079891205, "rewards/accuracies": 1.0, "rewards/chosen": 2.8411364555358887, "rewards/margins": 8.60976791381836, "rewards/rejected": -5.7686309814453125, "step": 4343 }, { "epoch": 0.724, "grad_norm": 18.245458602905273, "learning_rate": 3.7362057770783805e-08, "logits/chosen": 2.5607950687408447, "logits/rejected": 2.677396535873413, "logps/chosen": -233.5752716064453, "logps/rejected": -364.9517822265625, "loss": 1.0092, "nll_loss": 1.0067898035049438, "rewards/accuracies": 1.0, "rewards/chosen": 3.2330703735351562, "rewards/margins": 12.007820129394531, "rewards/rejected": -8.774749755859375, "step": 4344 }, { "epoch": 0.7241666666666666, "grad_norm": 72.80673217773438, "learning_rate": 3.731998907980151e-08, "logits/chosen": 3.0738837718963623, "logits/rejected": 3.0528347492218018, "logps/chosen": -64.92438507080078, "logps/rejected": -61.57384490966797, "loss": 1.3332, "nll_loss": 1.1804431676864624, "rewards/accuracies": 1.0, "rewards/chosen": 1.149941325187683, "rewards/margins": 2.897270441055298, "rewards/rejected": -1.7473291158676147, "step": 4345 }, { "epoch": 0.7243333333333334, "grad_norm": 26.655071258544922, "learning_rate": 3.727793865226959e-08, "logits/chosen": 2.5927891731262207, "logits/rejected": 2.562593460083008, "logps/chosen": -76.29170227050781, "logps/rejected": -169.1020050048828, "loss": 0.8675, "nll_loss": 0.8572100400924683, "rewards/accuracies": 1.0, "rewards/chosen": 1.9075684547424316, "rewards/margins": 8.096467971801758, "rewards/rejected": -6.188899993896484, "step": 4346 }, { "epoch": 0.7245, "grad_norm": 69.85417938232422, "learning_rate": 3.723590650044057e-08, "logits/chosen": 2.4821646213531494, "logits/rejected": 2.119757652282715, "logps/chosen": -130.454345703125, "logps/rejected": -59.60237503051758, "loss": 1.2965, "nll_loss": 1.1149945259094238, "rewards/accuracies": 1.0, "rewards/chosen": -0.9307495355606079, "rewards/margins": 2.7080283164978027, "rewards/rejected": -3.6387779712677, "step": 4347 }, { "epoch": 0.7246666666666667, "grad_norm": 26.292011260986328, "learning_rate": 3.719389263656154e-08, "logits/chosen": 1.0486290454864502, "logits/rejected": 1.925325632095337, "logps/chosen": -64.9493179321289, "logps/rejected": -178.6057586669922, "loss": 0.7399, "nll_loss": 0.713728666305542, "rewards/accuracies": 1.0, "rewards/chosen": 0.8531838059425354, "rewards/margins": 7.011407375335693, "rewards/rejected": -6.158223628997803, "step": 4348 }, { "epoch": 0.7248333333333333, "grad_norm": 27.397972106933594, "learning_rate": 3.715189707287436e-08, "logits/chosen": 0.9850813746452332, "logits/rejected": 2.082143783569336, "logps/chosen": -10.002337455749512, "logps/rejected": -233.0155029296875, "loss": 0.3578, "nll_loss": 0.3334112763404846, "rewards/accuracies": 1.0, "rewards/chosen": 1.062502384185791, "rewards/margins": 6.357275485992432, "rewards/rejected": -5.294773101806641, "step": 4349 }, { "epoch": 0.725, "grad_norm": 637.9271850585938, "learning_rate": 3.7109919821615543e-08, "logits/chosen": 1.777686595916748, "logits/rejected": 1.2643251419067383, "logps/chosen": -298.46514892578125, "logps/rejected": -174.40802001953125, "loss": 3.5966, "nll_loss": 0.8626158833503723, "rewards/accuracies": 0.0, "rewards/chosen": -8.161669731140137, "rewards/margins": -1.4706411361694336, "rewards/rejected": -6.691028594970703, "step": 4350 }, { "epoch": 0.7251666666666666, "grad_norm": 226.83255004882812, "learning_rate": 3.706796089501627e-08, "logits/chosen": 2.2132554054260254, "logits/rejected": 2.2513904571533203, "logps/chosen": -82.7916259765625, "logps/rejected": -18.14679718017578, "loss": 4.5571, "nll_loss": 0.8624128699302673, "rewards/accuracies": 0.0, "rewards/chosen": 2.2508301734924316, "rewards/margins": -2.9562525749206543, "rewards/rejected": 5.207082748413086, "step": 4351 }, { "epoch": 0.7253333333333334, "grad_norm": 25.128543853759766, "learning_rate": 3.702602030530231e-08, "logits/chosen": 2.4942328929901123, "logits/rejected": 2.835364818572998, "logps/chosen": -61.738922119140625, "logps/rejected": -396.8771667480469, "loss": 0.8472, "nll_loss": 0.8343096971511841, "rewards/accuracies": 1.0, "rewards/chosen": 1.5097808837890625, "rewards/margins": 11.407730102539062, "rewards/rejected": -9.89794921875, "step": 4352 }, { "epoch": 0.7255, "grad_norm": 28.747732162475586, "learning_rate": 3.6984098064694167e-08, "logits/chosen": 2.286349058151245, "logits/rejected": 2.495250940322876, "logps/chosen": -66.03337097167969, "logps/rejected": -208.43368530273438, "loss": 0.7621, "nll_loss": 0.7256415486335754, "rewards/accuracies": 1.0, "rewards/chosen": 0.4133712947368622, "rewards/margins": 8.433916091918945, "rewards/rejected": -8.02054500579834, "step": 4353 }, { "epoch": 0.7256666666666667, "grad_norm": 22.668970108032227, "learning_rate": 3.6942194185406985e-08, "logits/chosen": 1.3291716575622559, "logits/rejected": 1.6361913681030273, "logps/chosen": -68.07735443115234, "logps/rejected": -167.91500854492188, "loss": 0.6886, "nll_loss": 0.6740331649780273, "rewards/accuracies": 1.0, "rewards/chosen": 1.4156807661056519, "rewards/margins": 8.688703536987305, "rewards/rejected": -7.273022651672363, "step": 4354 }, { "epoch": 0.7258333333333333, "grad_norm": 30.881479263305664, "learning_rate": 3.690030867965057e-08, "logits/chosen": 2.0072414875030518, "logits/rejected": 2.3465142250061035, "logps/chosen": -60.72187805175781, "logps/rejected": -244.8817138671875, "loss": 0.7918, "nll_loss": 0.7496528029441833, "rewards/accuracies": 1.0, "rewards/chosen": 0.25349923968315125, "rewards/margins": 8.283851623535156, "rewards/rejected": -8.030352592468262, "step": 4355 }, { "epoch": 0.726, "grad_norm": 85.9958267211914, "learning_rate": 3.6858441559629305e-08, "logits/chosen": 2.689777374267578, "logits/rejected": 2.646239995956421, "logps/chosen": -76.74505615234375, "logps/rejected": -5.2867817878723145, "loss": 1.6355, "nll_loss": 1.1991416215896606, "rewards/accuracies": 1.0, "rewards/chosen": 1.5955597162246704, "rewards/margins": 1.6552293300628662, "rewards/rejected": -0.05966959148645401, "step": 4356 }, { "epoch": 0.7261666666666666, "grad_norm": 23.613895416259766, "learning_rate": 3.681659283754228e-08, "logits/chosen": 2.182849168777466, "logits/rejected": 2.2208244800567627, "logps/chosen": -221.72317504882812, "logps/rejected": -178.92269897460938, "loss": 1.3744, "nll_loss": 1.3686615228652954, "rewards/accuracies": 1.0, "rewards/chosen": 2.475598096847534, "rewards/margins": 9.316807746887207, "rewards/rejected": -6.841209411621094, "step": 4357 }, { "epoch": 0.7263333333333334, "grad_norm": 27.42668342590332, "learning_rate": 3.6774762525583216e-08, "logits/chosen": 2.6651744842529297, "logits/rejected": 2.7157580852508545, "logps/chosen": -92.2082290649414, "logps/rejected": -349.98614501953125, "loss": 1.1567, "nll_loss": 1.1526029109954834, "rewards/accuracies": 1.0, "rewards/chosen": 2.7074549198150635, "rewards/margins": 11.457698822021484, "rewards/rejected": -8.750244140625, "step": 4358 }, { "epoch": 0.7265, "grad_norm": 25.173429489135742, "learning_rate": 3.6732950635940484e-08, "logits/chosen": 1.5228832960128784, "logits/rejected": 2.426780939102173, "logps/chosen": -11.590421676635742, "logps/rejected": -424.9669189453125, "loss": 0.3062, "nll_loss": 0.28269320726394653, "rewards/accuracies": 1.0, "rewards/chosen": 0.8706749081611633, "rewards/margins": 14.994585990905762, "rewards/rejected": -14.123910903930664, "step": 4359 }, { "epoch": 0.7266666666666667, "grad_norm": 45.115116119384766, "learning_rate": 3.669115718079702e-08, "logits/chosen": 1.778222918510437, "logits/rejected": 1.887658715248108, "logps/chosen": -42.92095947265625, "logps/rejected": -85.56729125976562, "loss": 1.008, "nll_loss": 0.9754762053489685, "rewards/accuracies": 1.0, "rewards/chosen": 1.0815109014511108, "rewards/margins": 5.324934005737305, "rewards/rejected": -4.243422985076904, "step": 4360 }, { "epoch": 0.7268333333333333, "grad_norm": 40.73110580444336, "learning_rate": 3.6649382172330443e-08, "logits/chosen": 1.1072908639907837, "logits/rejected": 1.9440745115280151, "logps/chosen": -47.84917449951172, "logps/rejected": -467.44171142578125, "loss": 0.9979, "nll_loss": 0.9968577027320862, "rewards/accuracies": 1.0, "rewards/chosen": 4.029966831207275, "rewards/margins": 17.87812042236328, "rewards/rejected": -13.848154067993164, "step": 4361 }, { "epoch": 0.727, "grad_norm": 31.320547103881836, "learning_rate": 3.6607625622713e-08, "logits/chosen": 2.6067769527435303, "logits/rejected": 2.7338316440582275, "logps/chosen": -92.6143798828125, "logps/rejected": -299.3883056640625, "loss": 1.2334, "nll_loss": 1.2186100482940674, "rewards/accuracies": 1.0, "rewards/chosen": 1.3589309453964233, "rewards/margins": 10.587506294250488, "rewards/rejected": -9.228575706481934, "step": 4362 }, { "epoch": 0.7271666666666666, "grad_norm": 20.809507369995117, "learning_rate": 3.656588754411155e-08, "logits/chosen": 2.055431842803955, "logits/rejected": 2.0896143913269043, "logps/chosen": -42.49046325683594, "logps/rejected": -50.38903045654297, "loss": 0.5042, "nll_loss": 0.47211626172065735, "rewards/accuracies": 1.0, "rewards/chosen": 1.891373872756958, "rewards/margins": 5.374233245849609, "rewards/rejected": -3.4828593730926514, "step": 4363 }, { "epoch": 0.7273333333333334, "grad_norm": 24.211959838867188, "learning_rate": 3.652416794868759e-08, "logits/chosen": 1.2893975973129272, "logits/rejected": 2.308776617050171, "logps/chosen": -67.10896301269531, "logps/rejected": -373.50970458984375, "loss": 0.8492, "nll_loss": 0.8388620615005493, "rewards/accuracies": 1.0, "rewards/chosen": 1.7917091846466064, "rewards/margins": 8.935139656066895, "rewards/rejected": -7.143430233001709, "step": 4364 }, { "epoch": 0.7275, "grad_norm": 27.92620086669922, "learning_rate": 3.648246684859716e-08, "logits/chosen": 2.5452983379364014, "logits/rejected": 2.654810905456543, "logps/chosen": -55.7376823425293, "logps/rejected": -154.15557861328125, "loss": 0.7217, "nll_loss": 0.6635438799858093, "rewards/accuracies": 1.0, "rewards/chosen": 2.075169086456299, "rewards/margins": 4.699016571044922, "rewards/rejected": -2.623847484588623, "step": 4365 }, { "epoch": 0.7276666666666667, "grad_norm": 46.86922836303711, "learning_rate": 3.6440784255990965e-08, "logits/chosen": 2.303272008895874, "logits/rejected": 2.63153076171875, "logps/chosen": -1.6265475749969482, "logps/rejected": -126.29417419433594, "loss": 0.1713, "nll_loss": 0.10843650996685028, "rewards/accuracies": 1.0, "rewards/chosen": 1.041955590248108, "rewards/margins": 4.088084697723389, "rewards/rejected": -3.046128988265991, "step": 4366 }, { "epoch": 0.7278333333333333, "grad_norm": 60.79815673828125, "learning_rate": 3.639912018301433e-08, "logits/chosen": 1.7042133808135986, "logits/rejected": 1.621870517730713, "logps/chosen": -11.860044479370117, "logps/rejected": -25.855451583862305, "loss": 0.8188, "nll_loss": 0.45615556836128235, "rewards/accuracies": 1.0, "rewards/chosen": 1.0182348489761353, "rewards/margins": 1.6620476245880127, "rewards/rejected": -0.6438127756118774, "step": 4367 }, { "epoch": 0.728, "grad_norm": 173.72117614746094, "learning_rate": 3.635747464180719e-08, "logits/chosen": 2.424107074737549, "logits/rejected": 2.6150026321411133, "logps/chosen": -27.4086856842041, "logps/rejected": -245.2584991455078, "loss": 2.5335, "nll_loss": 2.4916985034942627, "rewards/accuracies": 1.0, "rewards/chosen": 0.28488388657569885, "rewards/margins": 7.127198696136475, "rewards/rejected": -6.842314720153809, "step": 4368 }, { "epoch": 0.7281666666666666, "grad_norm": 69.25435638427734, "learning_rate": 3.6315847644504014e-08, "logits/chosen": 3.187119245529175, "logits/rejected": 3.4156529903411865, "logps/chosen": -11.757770538330078, "logps/rejected": -310.4829406738281, "loss": 0.7913, "nll_loss": 0.7838513851165771, "rewards/accuracies": 1.0, "rewards/chosen": 2.059309720993042, "rewards/margins": 12.654010772705078, "rewards/rejected": -10.594700813293457, "step": 4369 }, { "epoch": 0.7283333333333334, "grad_norm": 23.612873077392578, "learning_rate": 3.6274239203233913e-08, "logits/chosen": 1.3199756145477295, "logits/rejected": 2.3795998096466064, "logps/chosen": -85.10244750976562, "logps/rejected": -296.52362060546875, "loss": 0.914, "nll_loss": 0.9053452610969543, "rewards/accuracies": 1.0, "rewards/chosen": 1.9159393310546875, "rewards/margins": 11.47491455078125, "rewards/rejected": -9.558975219726562, "step": 4370 }, { "epoch": 0.7285, "grad_norm": 33.285221099853516, "learning_rate": 3.6232649330120604e-08, "logits/chosen": 2.412681818008423, "logits/rejected": 2.5929481983184814, "logps/chosen": -40.13690948486328, "logps/rejected": -314.7283935546875, "loss": 0.8763, "nll_loss": 0.8725414872169495, "rewards/accuracies": 1.0, "rewards/chosen": 4.957492828369141, "rewards/margins": 10.262809753417969, "rewards/rejected": -5.30531644821167, "step": 4371 }, { "epoch": 0.7286666666666667, "grad_norm": 16.390195846557617, "learning_rate": 3.6191078037282395e-08, "logits/chosen": 1.8262109756469727, "logits/rejected": 1.6098068952560425, "logps/chosen": -216.54513549804688, "logps/rejected": -277.55615234375, "loss": 0.8661, "nll_loss": 0.8458794355392456, "rewards/accuracies": 1.0, "rewards/chosen": 1.905114769935608, "rewards/margins": 6.111059665679932, "rewards/rejected": -4.205945014953613, "step": 4372 }, { "epoch": 0.7288333333333333, "grad_norm": 25.688573837280273, "learning_rate": 3.614952533683211e-08, "logits/chosen": 2.777674436569214, "logits/rejected": 2.748621940612793, "logps/chosen": -60.05323028564453, "logps/rejected": -74.53150939941406, "loss": 0.8156, "nll_loss": 0.7601675391197205, "rewards/accuracies": 1.0, "rewards/chosen": 2.875126600265503, "rewards/margins": 5.380723476409912, "rewards/rejected": -2.505596876144409, "step": 4373 }, { "epoch": 0.729, "grad_norm": 23.196144104003906, "learning_rate": 3.6107991240877246e-08, "logits/chosen": 1.662768840789795, "logits/rejected": 1.5852404832839966, "logps/chosen": -132.5133056640625, "logps/rejected": -93.68714904785156, "loss": 0.9392, "nll_loss": 0.9331923127174377, "rewards/accuracies": 1.0, "rewards/chosen": 2.398977756500244, "rewards/margins": 9.36668586730957, "rewards/rejected": -6.967708587646484, "step": 4374 }, { "epoch": 0.7291666666666666, "grad_norm": 30.195833206176758, "learning_rate": 3.606647576151983e-08, "logits/chosen": 2.182699680328369, "logits/rejected": 2.2916572093963623, "logps/chosen": -80.10006713867188, "logps/rejected": -105.1402587890625, "loss": 1.125, "nll_loss": 1.1125011444091797, "rewards/accuracies": 1.0, "rewards/chosen": 1.6227645874023438, "rewards/margins": 8.280600547790527, "rewards/rejected": -6.657835960388184, "step": 4375 }, { "epoch": 0.7293333333333333, "grad_norm": 29.102556228637695, "learning_rate": 3.6024978910856515e-08, "logits/chosen": 2.6987545490264893, "logits/rejected": 2.9493629932403564, "logps/chosen": -44.099178314208984, "logps/rejected": -138.48460388183594, "loss": 0.6559, "nll_loss": 0.6211152076721191, "rewards/accuracies": 1.0, "rewards/chosen": 1.2588443756103516, "rewards/margins": 5.114036560058594, "rewards/rejected": -3.855192184448242, "step": 4376 }, { "epoch": 0.7295, "grad_norm": 22.04100799560547, "learning_rate": 3.598350070097842e-08, "logits/chosen": 2.42341685295105, "logits/rejected": 2.64577054977417, "logps/chosen": -80.90406799316406, "logps/rejected": -295.8746337890625, "loss": 0.777, "nll_loss": 0.7632459998130798, "rewards/accuracies": 1.0, "rewards/chosen": 1.598616123199463, "rewards/margins": 7.608543395996094, "rewards/rejected": -6.009927272796631, "step": 4377 }, { "epoch": 0.7296666666666667, "grad_norm": 22.299922943115234, "learning_rate": 3.594204114397132e-08, "logits/chosen": 1.1647413969039917, "logits/rejected": 1.8871301412582397, "logps/chosen": -45.52302551269531, "logps/rejected": -245.6107940673828, "loss": 0.5671, "nll_loss": 0.5551588535308838, "rewards/accuracies": 1.0, "rewards/chosen": 1.6067086458206177, "rewards/margins": 9.3579740524292, "rewards/rejected": -7.751265048980713, "step": 4378 }, { "epoch": 0.7298333333333333, "grad_norm": 198.70155334472656, "learning_rate": 3.5900600251915556e-08, "logits/chosen": 2.718545436859131, "logits/rejected": 2.5446524620056152, "logps/chosen": -122.30815887451172, "logps/rejected": -19.794801712036133, "loss": 2.6045, "nll_loss": 1.1990994215011597, "rewards/accuracies": 0.0, "rewards/chosen": 1.0933418273925781, "rewards/margins": -0.5494544506072998, "rewards/rejected": 1.642796277999878, "step": 4379 }, { "epoch": 0.73, "grad_norm": 27.354341506958008, "learning_rate": 3.585917803688603e-08, "logits/chosen": 1.521376371383667, "logits/rejected": 2.4397499561309814, "logps/chosen": -72.2136001586914, "logps/rejected": -232.89109802246094, "loss": 0.958, "nll_loss": 0.9378389716148376, "rewards/accuracies": 1.0, "rewards/chosen": 1.0396050214767456, "rewards/margins": 9.520113945007324, "rewards/rejected": -8.480508804321289, "step": 4380 }, { "epoch": 0.7301666666666666, "grad_norm": 22.004117965698242, "learning_rate": 3.5817774510952116e-08, "logits/chosen": 2.244921922683716, "logits/rejected": 2.566495895385742, "logps/chosen": -47.06113052368164, "logps/rejected": -131.1705322265625, "loss": 0.5889, "nll_loss": 0.5810016393661499, "rewards/accuracies": 1.0, "rewards/chosen": 2.0374104976654053, "rewards/margins": 9.85629653930664, "rewards/rejected": -7.818885803222656, "step": 4381 }, { "epoch": 0.7303333333333333, "grad_norm": 131.40846252441406, "learning_rate": 3.577638968617784e-08, "logits/chosen": 2.5323143005371094, "logits/rejected": 2.352517604827881, "logps/chosen": -65.0696029663086, "logps/rejected": -11.632792472839355, "loss": 2.2742, "nll_loss": 0.9430378079414368, "rewards/accuracies": 0.0, "rewards/chosen": 1.8122520446777344, "rewards/margins": -0.24767494201660156, "rewards/rejected": 2.059926986694336, "step": 4382 }, { "epoch": 0.7305, "grad_norm": 43.82183837890625, "learning_rate": 3.573502357462176e-08, "logits/chosen": 2.358593463897705, "logits/rejected": 2.4559831619262695, "logps/chosen": -21.834043502807617, "logps/rejected": -145.5035858154297, "loss": 0.7008, "nll_loss": 0.6065011620521545, "rewards/accuracies": 1.0, "rewards/chosen": 2.7072417736053467, "rewards/margins": 4.594325065612793, "rewards/rejected": -1.8870835304260254, "step": 4383 }, { "epoch": 0.7306666666666667, "grad_norm": 29.508405685424805, "learning_rate": 3.569367618833694e-08, "logits/chosen": 1.5143665075302124, "logits/rejected": 2.158203125, "logps/chosen": -26.80324935913086, "logps/rejected": -271.9091796875, "loss": 0.7256, "nll_loss": 0.724412202835083, "rewards/accuracies": 1.0, "rewards/chosen": 4.734948635101318, "rewards/margins": 11.716903686523438, "rewards/rejected": -6.981955051422119, "step": 4384 }, { "epoch": 0.7308333333333333, "grad_norm": 23.18398094177246, "learning_rate": 3.5652347539371075e-08, "logits/chosen": 1.640596628189087, "logits/rejected": 1.9312808513641357, "logps/chosen": -84.49369812011719, "logps/rejected": -186.46961975097656, "loss": 0.9272, "nll_loss": 0.9184098243713379, "rewards/accuracies": 1.0, "rewards/chosen": 2.042376756668091, "rewards/margins": 8.491825103759766, "rewards/rejected": -6.449448585510254, "step": 4385 }, { "epoch": 0.731, "grad_norm": 28.643531799316406, "learning_rate": 3.5611037639766264e-08, "logits/chosen": 1.2345809936523438, "logits/rejected": 2.0216410160064697, "logps/chosen": -39.126766204833984, "logps/rejected": -168.2138671875, "loss": 0.6527, "nll_loss": 0.6414225101470947, "rewards/accuracies": 1.0, "rewards/chosen": 2.0881099700927734, "rewards/margins": 7.342121601104736, "rewards/rejected": -5.254011631011963, "step": 4386 }, { "epoch": 0.7311666666666666, "grad_norm": 29.585880279541016, "learning_rate": 3.556974650155925e-08, "logits/chosen": 1.8548222780227661, "logits/rejected": 1.7691117525100708, "logps/chosen": -59.312255859375, "logps/rejected": -68.83432006835938, "loss": 0.8115, "nll_loss": 0.7702891826629639, "rewards/accuracies": 1.0, "rewards/chosen": 1.955762505531311, "rewards/margins": 5.066366672515869, "rewards/rejected": -3.1106040477752686, "step": 4387 }, { "epoch": 0.7313333333333333, "grad_norm": 20.068431854248047, "learning_rate": 3.552847413678128e-08, "logits/chosen": 0.5965907573699951, "logits/rejected": 1.7357152700424194, "logps/chosen": -77.52703857421875, "logps/rejected": -403.212890625, "loss": 0.7569, "nll_loss": 0.7526895999908447, "rewards/accuracies": 1.0, "rewards/chosen": 3.1410202980041504, "rewards/margins": 9.209001541137695, "rewards/rejected": -6.067981243133545, "step": 4388 }, { "epoch": 0.7315, "grad_norm": 16.605363845825195, "learning_rate": 3.5487220557458173e-08, "logits/chosen": 1.7875990867614746, "logits/rejected": 1.856461524963379, "logps/chosen": -111.22764587402344, "logps/rejected": -184.4254608154297, "loss": 0.6781, "nll_loss": 0.6581516861915588, "rewards/accuracies": 1.0, "rewards/chosen": 1.8906006813049316, "rewards/margins": 6.130401611328125, "rewards/rejected": -4.239800930023193, "step": 4389 }, { "epoch": 0.7316666666666667, "grad_norm": 21.98750114440918, "learning_rate": 3.5445985775610153e-08, "logits/chosen": 2.841060161590576, "logits/rejected": 2.8302359580993652, "logps/chosen": -149.11048889160156, "logps/rejected": -177.51559448242188, "loss": 1.0114, "nll_loss": 1.0007416009902954, "rewards/accuracies": 1.0, "rewards/chosen": 1.8631576299667358, "rewards/margins": 8.064855575561523, "rewards/rejected": -6.201698303222656, "step": 4390 }, { "epoch": 0.7318333333333333, "grad_norm": 27.296689987182617, "learning_rate": 3.5404769803252085e-08, "logits/chosen": 2.461587429046631, "logits/rejected": 2.287637233734131, "logps/chosen": -112.64586639404297, "logps/rejected": -66.68757629394531, "loss": 1.1558, "nll_loss": 1.1153056621551514, "rewards/accuracies": 1.0, "rewards/chosen": 2.1797966957092285, "rewards/margins": 5.2289581298828125, "rewards/rejected": -3.049161195755005, "step": 4391 }, { "epoch": 0.732, "grad_norm": 23.540494918823242, "learning_rate": 3.5363572652393326e-08, "logits/chosen": 2.5173559188842773, "logits/rejected": 2.639086961746216, "logps/chosen": -56.1619873046875, "logps/rejected": -138.5764617919922, "loss": 0.7381, "nll_loss": 0.7293764352798462, "rewards/accuracies": 1.0, "rewards/chosen": 1.9746826887130737, "rewards/margins": 9.066367149353027, "rewards/rejected": -7.091684818267822, "step": 4392 }, { "epoch": 0.7321666666666666, "grad_norm": 126.41777038574219, "learning_rate": 3.5322394335037745e-08, "logits/chosen": 2.754221200942993, "logits/rejected": 2.757467031478882, "logps/chosen": -69.2834243774414, "logps/rejected": -63.70365905761719, "loss": 2.0477, "nll_loss": 1.8232483863830566, "rewards/accuracies": 1.0, "rewards/chosen": -1.6755188703536987, "rewards/margins": 3.2773022651672363, "rewards/rejected": -4.952821254730225, "step": 4393 }, { "epoch": 0.7323333333333333, "grad_norm": 27.037473678588867, "learning_rate": 3.528123486318366e-08, "logits/chosen": 1.8086014986038208, "logits/rejected": 1.983916163444519, "logps/chosen": -63.92056655883789, "logps/rejected": -155.83274841308594, "loss": 0.7901, "nll_loss": 0.7701272964477539, "rewards/accuracies": 1.0, "rewards/chosen": 1.1095927953720093, "rewards/margins": 7.721438407897949, "rewards/rejected": -6.61184549331665, "step": 4394 }, { "epoch": 0.7325, "grad_norm": 55.9880485534668, "learning_rate": 3.5240094248823995e-08, "logits/chosen": 2.527930736541748, "logits/rejected": 2.8527274131774902, "logps/chosen": -22.07099723815918, "logps/rejected": -279.10980224609375, "loss": 1.0757, "nll_loss": 1.0509998798370361, "rewards/accuracies": 1.0, "rewards/chosen": 0.8178099393844604, "rewards/margins": 11.191986083984375, "rewards/rejected": -10.374176025390625, "step": 4395 }, { "epoch": 0.7326666666666667, "grad_norm": 99.57428741455078, "learning_rate": 3.519897250394611e-08, "logits/chosen": 3.169907569885254, "logits/rejected": 3.351424217224121, "logps/chosen": -65.44595336914062, "logps/rejected": -246.9162139892578, "loss": 1.9798, "nll_loss": 1.8179430961608887, "rewards/accuracies": 1.0, "rewards/chosen": -1.4214156866073608, "rewards/margins": 6.439568042755127, "rewards/rejected": -7.860983848571777, "step": 4396 }, { "epoch": 0.7328333333333333, "grad_norm": 19.982481002807617, "learning_rate": 3.515786964053197e-08, "logits/chosen": 2.743206024169922, "logits/rejected": 2.892174482345581, "logps/chosen": -74.52011108398438, "logps/rejected": -311.56512451171875, "loss": 0.7475, "nll_loss": 0.7378227114677429, "rewards/accuracies": 1.0, "rewards/chosen": 1.9705413579940796, "rewards/margins": 8.162050247192383, "rewards/rejected": -6.191508769989014, "step": 4397 }, { "epoch": 0.733, "grad_norm": 25.838138580322266, "learning_rate": 3.511678567055786e-08, "logits/chosen": 2.9025650024414062, "logits/rejected": 2.872943878173828, "logps/chosen": -42.120933532714844, "logps/rejected": -124.82757568359375, "loss": 0.6694, "nll_loss": 0.658139705657959, "rewards/accuracies": 1.0, "rewards/chosen": 2.4562675952911377, "rewards/margins": 7.223357200622559, "rewards/rejected": -4.76708984375, "step": 4398 }, { "epoch": 0.7331666666666666, "grad_norm": 18.648576736450195, "learning_rate": 3.5075720605994706e-08, "logits/chosen": 1.5287076234817505, "logits/rejected": 1.0092421770095825, "logps/chosen": -93.91620635986328, "logps/rejected": -62.424888610839844, "loss": 0.6738, "nll_loss": 0.6613817811012268, "rewards/accuracies": 1.0, "rewards/chosen": 2.6404521465301514, "rewards/margins": 7.114805221557617, "rewards/rejected": -4.474353313446045, "step": 4399 }, { "epoch": 0.7333333333333333, "grad_norm": 23.7491512298584, "learning_rate": 3.503467445880789e-08, "logits/chosen": 1.9562304019927979, "logits/rejected": 2.293712854385376, "logps/chosen": -34.252201080322266, "logps/rejected": -203.54800415039062, "loss": 0.5069, "nll_loss": 0.49640870094299316, "rewards/accuracies": 1.0, "rewards/chosen": 1.8040649890899658, "rewards/margins": 8.544550895690918, "rewards/rejected": -6.740486145019531, "step": 4400 }, { "epoch": 0.7335, "grad_norm": 135.3563690185547, "learning_rate": 3.49936472409573e-08, "logits/chosen": 2.8385844230651855, "logits/rejected": 2.896512031555176, "logps/chosen": -78.35787963867188, "logps/rejected": -46.95380401611328, "loss": 1.9283, "nll_loss": 1.0176347494125366, "rewards/accuracies": 1.0, "rewards/chosen": 1.4774017333984375, "rewards/margins": 0.38140785694122314, "rewards/rejected": 1.0959938764572144, "step": 4401 }, { "epoch": 0.7336666666666667, "grad_norm": 33.54524230957031, "learning_rate": 3.495263896439723e-08, "logits/chosen": 3.1635446548461914, "logits/rejected": 3.10957407951355, "logps/chosen": -75.06829833984375, "logps/rejected": -139.33995056152344, "loss": 0.9698, "nll_loss": 0.9154670238494873, "rewards/accuracies": 1.0, "rewards/chosen": 1.0150032043457031, "rewards/margins": 4.320399761199951, "rewards/rejected": -3.305396556854248, "step": 4402 }, { "epoch": 0.7338333333333333, "grad_norm": 58.908912658691406, "learning_rate": 3.4911649641076515e-08, "logits/chosen": 2.146240234375, "logits/rejected": 2.124145269393921, "logps/chosen": -19.648170471191406, "logps/rejected": -81.59056854248047, "loss": 0.8538, "nll_loss": 0.8186737895011902, "rewards/accuracies": 1.0, "rewards/chosen": 2.2692887783050537, "rewards/margins": 5.468897342681885, "rewards/rejected": -3.199608564376831, "step": 4403 }, { "epoch": 0.734, "grad_norm": 71.86007690429688, "learning_rate": 3.4870679282938474e-08, "logits/chosen": 2.619368076324463, "logits/rejected": 2.5107412338256836, "logps/chosen": -86.93575286865234, "logps/rejected": -184.44131469726562, "loss": 1.3079, "nll_loss": 1.1004525423049927, "rewards/accuracies": 1.0, "rewards/chosen": -1.7470734119415283, "rewards/margins": 4.648728370666504, "rewards/rejected": -6.395801544189453, "step": 4404 }, { "epoch": 0.7341666666666666, "grad_norm": 129.0084991455078, "learning_rate": 3.482972790192088e-08, "logits/chosen": 1.7370110750198364, "logits/rejected": 1.8983513116836548, "logps/chosen": -17.75295639038086, "logps/rejected": -44.70830535888672, "loss": 1.2121, "nll_loss": 0.4931376874446869, "rewards/accuracies": 1.0, "rewards/chosen": 0.19984054565429688, "rewards/margins": 0.3995327055454254, "rewards/rejected": -0.19969215989112854, "step": 4405 }, { "epoch": 0.7343333333333333, "grad_norm": 49.42082214355469, "learning_rate": 3.4788795509956026e-08, "logits/chosen": 3.064016342163086, "logits/rejected": 3.0904147624969482, "logps/chosen": -10.202363967895508, "logps/rejected": -206.43405151367188, "loss": 0.4892, "nll_loss": 0.48582693934440613, "rewards/accuracies": 1.0, "rewards/chosen": 2.9494333267211914, "rewards/margins": 10.881998062133789, "rewards/rejected": -7.932565212249756, "step": 4406 }, { "epoch": 0.7345, "grad_norm": 33.10398864746094, "learning_rate": 3.474788211897056e-08, "logits/chosen": 2.0059235095977783, "logits/rejected": 1.4131847620010376, "logps/chosen": -111.4149169921875, "logps/rejected": -40.63103485107422, "loss": 1.1808, "nll_loss": 1.1141493320465088, "rewards/accuracies": 1.0, "rewards/chosen": 1.3413338661193848, "rewards/margins": 4.104175567626953, "rewards/rejected": -2.7628419399261475, "step": 4407 }, { "epoch": 0.7346666666666667, "grad_norm": 19.091890335083008, "learning_rate": 3.4706987740885684e-08, "logits/chosen": 2.5279390811920166, "logits/rejected": 2.9465696811676025, "logps/chosen": -42.31359100341797, "logps/rejected": -91.25885009765625, "loss": 0.4661, "nll_loss": 0.4501447081565857, "rewards/accuracies": 1.0, "rewards/chosen": 1.634796142578125, "rewards/margins": 6.790907859802246, "rewards/rejected": -5.156111717224121, "step": 4408 }, { "epoch": 0.7348333333333333, "grad_norm": 130.4712371826172, "learning_rate": 3.4666112387617044e-08, "logits/chosen": 2.5583415031433105, "logits/rejected": 2.337709665298462, "logps/chosen": -66.11991882324219, "logps/rejected": -68.86996459960938, "loss": 1.3632, "nll_loss": 0.9312664866447449, "rewards/accuracies": 1.0, "rewards/chosen": -0.9618954658508301, "rewards/margins": 1.0405352115631104, "rewards/rejected": -2.0024306774139404, "step": 4409 }, { "epoch": 0.735, "grad_norm": 60.53179168701172, "learning_rate": 3.462525607107477e-08, "logits/chosen": 2.668607711791992, "logits/rejected": 2.7743563652038574, "logps/chosen": -65.36248016357422, "logps/rejected": -211.57180786132812, "loss": 1.87, "nll_loss": 1.8674994707107544, "rewards/accuracies": 1.0, "rewards/chosen": 3.51837158203125, "rewards/margins": 10.330046653747559, "rewards/rejected": -6.811675071716309, "step": 4410 }, { "epoch": 0.7351666666666666, "grad_norm": 26.52422523498535, "learning_rate": 3.458441880316335e-08, "logits/chosen": 4.285658359527588, "logits/rejected": 4.449914932250977, "logps/chosen": -27.75625228881836, "logps/rejected": -158.0164031982422, "loss": 0.5145, "nll_loss": 0.504659116268158, "rewards/accuracies": 1.0, "rewards/chosen": 1.9977532625198364, "rewards/margins": 7.9619903564453125, "rewards/rejected": -5.964237213134766, "step": 4411 }, { "epoch": 0.7353333333333333, "grad_norm": 51.95535659790039, "learning_rate": 3.454360059578182e-08, "logits/chosen": 2.7838947772979736, "logits/rejected": 2.8752596378326416, "logps/chosen": -34.52742385864258, "logps/rejected": -151.63394165039062, "loss": 0.923, "nll_loss": 0.9086165428161621, "rewards/accuracies": 1.0, "rewards/chosen": 1.8994255065917969, "rewards/margins": 6.8311767578125, "rewards/rejected": -4.931751251220703, "step": 4412 }, { "epoch": 0.7355, "grad_norm": 46.1877326965332, "learning_rate": 3.4502801460823604e-08, "logits/chosen": 2.6657962799072266, "logits/rejected": 2.363619804382324, "logps/chosen": -59.541099548339844, "logps/rejected": -49.788490295410156, "loss": 1.0456, "nll_loss": 0.992351770401001, "rewards/accuracies": 1.0, "rewards/chosen": 1.1798622608184814, "rewards/margins": 4.378111839294434, "rewards/rejected": -3.198249340057373, "step": 4413 }, { "epoch": 0.7356666666666667, "grad_norm": 364.6678466796875, "learning_rate": 3.4462021410176656e-08, "logits/chosen": 2.1864075660705566, "logits/rejected": 2.314116954803467, "logps/chosen": -44.7902946472168, "logps/rejected": -37.25639343261719, "loss": 3.0435, "nll_loss": 0.8143690228462219, "rewards/accuracies": 0.0, "rewards/chosen": 1.7027438879013062, "rewards/margins": -1.4664448499679565, "rewards/rejected": 3.1691887378692627, "step": 4414 }, { "epoch": 0.7358333333333333, "grad_norm": 21.00322723388672, "learning_rate": 3.44212604557232e-08, "logits/chosen": 2.0306966304779053, "logits/rejected": 2.3317031860351562, "logps/chosen": -56.88840866088867, "logps/rejected": -257.6824951171875, "loss": 0.6724, "nll_loss": 0.6614930629730225, "rewards/accuracies": 1.0, "rewards/chosen": 2.0904500484466553, "rewards/margins": 7.423332214355469, "rewards/rejected": -5.332881927490234, "step": 4415 }, { "epoch": 0.736, "grad_norm": 63.20002746582031, "learning_rate": 3.438051860934007e-08, "logits/chosen": 2.63622784614563, "logits/rejected": 2.691051721572876, "logps/chosen": -50.834659576416016, "logps/rejected": -51.70391082763672, "loss": 1.849, "nll_loss": 1.8155237436294556, "rewards/accuracies": 1.0, "rewards/chosen": 1.9737331867218018, "rewards/margins": 5.359708309173584, "rewards/rejected": -3.3859751224517822, "step": 4416 }, { "epoch": 0.7361666666666666, "grad_norm": 29.23079490661621, "learning_rate": 3.4339795882898426e-08, "logits/chosen": 2.4009106159210205, "logits/rejected": 2.6553165912628174, "logps/chosen": -43.5157585144043, "logps/rejected": -248.822021484375, "loss": 0.6496, "nll_loss": 0.6306632161140442, "rewards/accuracies": 1.0, "rewards/chosen": 1.160728096961975, "rewards/margins": 7.858341693878174, "rewards/rejected": -6.697613716125488, "step": 4417 }, { "epoch": 0.7363333333333333, "grad_norm": 233.4296112060547, "learning_rate": 3.429909228826395e-08, "logits/chosen": 2.778329610824585, "logits/rejected": 2.7738168239593506, "logps/chosen": -71.16275024414062, "logps/rejected": -68.52554321289062, "loss": 2.7813, "nll_loss": 0.6411058306694031, "rewards/accuracies": 0.0, "rewards/chosen": 1.5107948780059814, "rewards/margins": -1.4033915996551514, "rewards/rejected": 2.914186477661133, "step": 4418 }, { "epoch": 0.7365, "grad_norm": 32.43133544921875, "learning_rate": 3.425840783729663e-08, "logits/chosen": 1.7688813209533691, "logits/rejected": 2.6155223846435547, "logps/chosen": -40.10017395019531, "logps/rejected": -160.18112182617188, "loss": 0.8115, "nll_loss": 0.8020035028457642, "rewards/accuracies": 1.0, "rewards/chosen": 1.889923095703125, "rewards/margins": 8.91364860534668, "rewards/rejected": -7.0237250328063965, "step": 4419 }, { "epoch": 0.7366666666666667, "grad_norm": 15.289630889892578, "learning_rate": 3.421774254185096e-08, "logits/chosen": 1.1467492580413818, "logits/rejected": 1.992287516593933, "logps/chosen": -123.40872192382812, "logps/rejected": -323.05169677734375, "loss": 0.6395, "nll_loss": 0.6361273527145386, "rewards/accuracies": 1.0, "rewards/chosen": 2.884472608566284, "rewards/margins": 11.59552001953125, "rewards/rejected": -8.711047172546387, "step": 4420 }, { "epoch": 0.7368333333333333, "grad_norm": 42.6740837097168, "learning_rate": 3.417709641377583e-08, "logits/chosen": 2.072702646255493, "logits/rejected": 2.0832924842834473, "logps/chosen": -16.994525909423828, "logps/rejected": -86.41203308105469, "loss": 0.5417, "nll_loss": 0.43575698137283325, "rewards/accuracies": 1.0, "rewards/chosen": 0.8553010821342468, "rewards/margins": 3.2749881744384766, "rewards/rejected": -2.419687032699585, "step": 4421 }, { "epoch": 0.737, "grad_norm": 34.05821990966797, "learning_rate": 3.4136469464914576e-08, "logits/chosen": 2.5639894008636475, "logits/rejected": 2.348661184310913, "logps/chosen": -78.9921875, "logps/rejected": -62.036922454833984, "loss": 1.0582, "nll_loss": 1.0393707752227783, "rewards/accuracies": 1.0, "rewards/chosen": 1.8564568758010864, "rewards/margins": 6.24359655380249, "rewards/rejected": -4.387139797210693, "step": 4422 }, { "epoch": 0.7371666666666666, "grad_norm": 124.71944427490234, "learning_rate": 3.409586170710485e-08, "logits/chosen": 2.2421393394470215, "logits/rejected": 2.580742359161377, "logps/chosen": -30.873687744140625, "logps/rejected": -473.0421142578125, "loss": 2.1418, "nll_loss": 2.058245897293091, "rewards/accuracies": 1.0, "rewards/chosen": -0.5569120645523071, "rewards/margins": 9.041162490844727, "rewards/rejected": -9.598074913024902, "step": 4423 }, { "epoch": 0.7373333333333333, "grad_norm": 185.05039978027344, "learning_rate": 3.405527315217883e-08, "logits/chosen": 2.663616418838501, "logits/rejected": 2.7294740676879883, "logps/chosen": -41.99945831298828, "logps/rejected": -35.180397033691406, "loss": 3.5374, "nll_loss": 0.4666605591773987, "rewards/accuracies": 0.0, "rewards/chosen": 2.3065643310546875, "rewards/margins": -2.282212257385254, "rewards/rejected": 4.588776588439941, "step": 4424 }, { "epoch": 0.7375, "grad_norm": 49.61262512207031, "learning_rate": 3.401470381196302e-08, "logits/chosen": 0.7422619462013245, "logits/rejected": 2.1224584579467773, "logps/chosen": -11.833138465881348, "logps/rejected": -285.4329833984375, "loss": 0.4938, "nll_loss": 0.4930473864078522, "rewards/accuracies": 1.0, "rewards/chosen": 4.712991237640381, "rewards/margins": 12.64997673034668, "rewards/rejected": -7.936985969543457, "step": 4425 }, { "epoch": 0.7376666666666667, "grad_norm": 24.60544204711914, "learning_rate": 3.3974153698278374e-08, "logits/chosen": 1.7482945919036865, "logits/rejected": 1.9035403728485107, "logps/chosen": -92.7166748046875, "logps/rejected": -162.56246948242188, "loss": 0.8882, "nll_loss": 0.8746857047080994, "rewards/accuracies": 1.0, "rewards/chosen": 1.5141953229904175, "rewards/margins": 8.369647026062012, "rewards/rejected": -6.855452060699463, "step": 4426 }, { "epoch": 0.7378333333333333, "grad_norm": 31.607057571411133, "learning_rate": 3.3933622822940237e-08, "logits/chosen": 2.587800979614258, "logits/rejected": 2.9161531925201416, "logps/chosen": -45.22269821166992, "logps/rejected": -397.30194091796875, "loss": 0.736, "nll_loss": 0.7178205847740173, "rewards/accuracies": 1.0, "rewards/chosen": 1.1456222534179688, "rewards/margins": 9.916783332824707, "rewards/rejected": -8.771161079406738, "step": 4427 }, { "epoch": 0.738, "grad_norm": 70.1834716796875, "learning_rate": 3.3893111197758275e-08, "logits/chosen": 2.0487186908721924, "logits/rejected": 2.6837103366851807, "logps/chosen": -53.85206604003906, "logps/rejected": -170.501953125, "loss": 1.9297, "nll_loss": 1.923288106918335, "rewards/accuracies": 1.0, "rewards/chosen": 2.3029487133026123, "rewards/margins": 9.485806465148926, "rewards/rejected": -7.182857513427734, "step": 4428 }, { "epoch": 0.7381666666666666, "grad_norm": 42.302799224853516, "learning_rate": 3.385261883453664e-08, "logits/chosen": 2.4795212745666504, "logits/rejected": 2.7218644618988037, "logps/chosen": -17.20905113220215, "logps/rejected": -49.28532409667969, "loss": 0.6946, "nll_loss": 0.6373722553253174, "rewards/accuracies": 1.0, "rewards/chosen": 1.1477851867675781, "rewards/margins": 4.257823944091797, "rewards/rejected": -3.1100387573242188, "step": 4429 }, { "epoch": 0.7383333333333333, "grad_norm": 22.972381591796875, "learning_rate": 3.3812145745073826e-08, "logits/chosen": 2.8281102180480957, "logits/rejected": 2.983719825744629, "logps/chosen": -50.20105743408203, "logps/rejected": -371.4195556640625, "loss": 0.6439, "nll_loss": 0.6275132298469543, "rewards/accuracies": 1.0, "rewards/chosen": 1.2446091175079346, "rewards/margins": 11.879607200622559, "rewards/rejected": -10.634998321533203, "step": 4430 }, { "epoch": 0.7385, "grad_norm": 321.9275207519531, "learning_rate": 3.377169194116275e-08, "logits/chosen": 0.45631495118141174, "logits/rejected": 0.5652573108673096, "logps/chosen": -164.8110809326172, "logps/rejected": -139.06829833984375, "loss": 2.4172, "nll_loss": 0.7423924207687378, "rewards/accuracies": 0.0, "rewards/chosen": -4.788674831390381, "rewards/margins": -0.8108470439910889, "rewards/rejected": -3.977827787399292, "step": 4431 }, { "epoch": 0.7386666666666667, "grad_norm": 25.929550170898438, "learning_rate": 3.3731257434590634e-08, "logits/chosen": 1.6436285972595215, "logits/rejected": 2.1067564487457275, "logps/chosen": -108.18675231933594, "logps/rejected": -541.345947265625, "loss": 1.0145, "nll_loss": 0.9925389885902405, "rewards/accuracies": 1.0, "rewards/chosen": 0.9342193603515625, "rewards/margins": 13.16125774383545, "rewards/rejected": -12.227038383483887, "step": 4432 }, { "epoch": 0.7388333333333333, "grad_norm": 34.8016242980957, "learning_rate": 3.3690842237139125e-08, "logits/chosen": 2.5378830432891846, "logits/rejected": 2.4460244178771973, "logps/chosen": -57.209110260009766, "logps/rejected": -62.67335510253906, "loss": 0.9277, "nll_loss": 0.8938924074172974, "rewards/accuracies": 1.0, "rewards/chosen": 0.7792431116104126, "rewards/margins": 5.59920072555542, "rewards/rejected": -4.819957733154297, "step": 4433 }, { "epoch": 0.739, "grad_norm": 26.992206573486328, "learning_rate": 3.365044636058427e-08, "logits/chosen": 2.754410743713379, "logits/rejected": 2.6837449073791504, "logps/chosen": -51.733558654785156, "logps/rejected": -106.54468536376953, "loss": 0.6628, "nll_loss": 0.6015529632568359, "rewards/accuracies": 1.0, "rewards/chosen": 2.0486199855804443, "rewards/margins": 4.621604919433594, "rewards/rejected": -2.5729851722717285, "step": 4434 }, { "epoch": 0.7391666666666666, "grad_norm": 227.19729614257812, "learning_rate": 3.3610069816696476e-08, "logits/chosen": 4.095959663391113, "logits/rejected": 4.232916355133057, "logps/chosen": -17.42325210571289, "logps/rejected": -163.11590576171875, "loss": 2.5284, "nll_loss": 2.4890360832214355, "rewards/accuracies": 1.0, "rewards/chosen": 0.724888801574707, "rewards/margins": 5.150326728820801, "rewards/rejected": -4.425437927246094, "step": 4435 }, { "epoch": 0.7393333333333333, "grad_norm": 28.313480377197266, "learning_rate": 3.356971261724043e-08, "logits/chosen": 3.0681076049804688, "logits/rejected": 3.197755813598633, "logps/chosen": -65.54495239257812, "logps/rejected": -292.597412109375, "loss": 0.9008, "nll_loss": 0.8857426047325134, "rewards/accuracies": 1.0, "rewards/chosen": 1.339440941810608, "rewards/margins": 10.033329010009766, "rewards/rejected": -8.693887710571289, "step": 4436 }, { "epoch": 0.7395, "grad_norm": 55.67386245727539, "learning_rate": 3.352937477397529e-08, "logits/chosen": 1.591637134552002, "logits/rejected": 1.952936053276062, "logps/chosen": -43.36653518676758, "logps/rejected": -111.17417907714844, "loss": 1.1976, "nll_loss": 1.1119623184204102, "rewards/accuracies": 1.0, "rewards/chosen": 0.6056777834892273, "rewards/margins": 3.544149875640869, "rewards/rejected": -2.938472032546997, "step": 4437 }, { "epoch": 0.7396666666666667, "grad_norm": 28.519405364990234, "learning_rate": 3.348905629865454e-08, "logits/chosen": 1.355151653289795, "logits/rejected": 2.114093542098999, "logps/chosen": -15.155498504638672, "logps/rejected": -174.30372619628906, "loss": 0.4253, "nll_loss": 0.40960800647735596, "rewards/accuracies": 1.0, "rewards/chosen": 1.354630708694458, "rewards/margins": 8.12624454498291, "rewards/rejected": -6.771614074707031, "step": 4438 }, { "epoch": 0.7398333333333333, "grad_norm": 29.096452713012695, "learning_rate": 3.344875720302604e-08, "logits/chosen": 2.7328717708587646, "logits/rejected": 2.7989020347595215, "logps/chosen": -22.686561584472656, "logps/rejected": -75.9841537475586, "loss": 0.4828, "nll_loss": 0.4726366698741913, "rewards/accuracies": 1.0, "rewards/chosen": 2.3902077674865723, "rewards/margins": 7.427120208740234, "rewards/rejected": -5.036912441253662, "step": 4439 }, { "epoch": 0.74, "grad_norm": 113.32510375976562, "learning_rate": 3.340847749883191e-08, "logits/chosen": 2.856511116027832, "logits/rejected": 2.72061824798584, "logps/chosen": -79.82642364501953, "logps/rejected": -83.36015319824219, "loss": 1.683, "nll_loss": 1.3086299896240234, "rewards/accuracies": 1.0, "rewards/chosen": -2.040635824203491, "rewards/margins": 1.7928698062896729, "rewards/rejected": -3.833505630493164, "step": 4440 }, { "epoch": 0.7401666666666666, "grad_norm": 25.446260452270508, "learning_rate": 3.3368217197808734e-08, "logits/chosen": 1.5851858854293823, "logits/rejected": 2.140037775039673, "logps/chosen": -80.93708801269531, "logps/rejected": -567.2486572265625, "loss": 1.0196, "nll_loss": 1.0117135047912598, "rewards/accuracies": 1.0, "rewards/chosen": 1.9932632446289062, "rewards/margins": 19.930513381958008, "rewards/rejected": -17.9372501373291, "step": 4441 }, { "epoch": 0.7403333333333333, "grad_norm": 41.60860061645508, "learning_rate": 3.332797631168739e-08, "logits/chosen": 1.7203097343444824, "logits/rejected": 2.2761411666870117, "logps/chosen": -17.983675003051758, "logps/rejected": -200.1513671875, "loss": 0.5867, "nll_loss": 0.5801185369491577, "rewards/accuracies": 1.0, "rewards/chosen": 2.1801862716674805, "rewards/margins": 11.592702865600586, "rewards/rejected": -9.412516593933105, "step": 4442 }, { "epoch": 0.7405, "grad_norm": 17.79561996459961, "learning_rate": 3.328775485219314e-08, "logits/chosen": 2.586644411087036, "logits/rejected": 2.6313834190368652, "logps/chosen": -75.8560791015625, "logps/rejected": -164.86322021484375, "loss": 0.6553, "nll_loss": 0.6483426094055176, "rewards/accuracies": 1.0, "rewards/chosen": 2.4154274463653564, "rewards/margins": 8.484662055969238, "rewards/rejected": -6.069234848022461, "step": 4443 }, { "epoch": 0.7406666666666667, "grad_norm": 21.022872924804688, "learning_rate": 3.3247552831045475e-08, "logits/chosen": 1.1117165088653564, "logits/rejected": 2.823686122894287, "logps/chosen": -44.95714569091797, "logps/rejected": -488.5502014160156, "loss": 0.6097, "nll_loss": 0.599428653717041, "rewards/accuracies": 1.0, "rewards/chosen": 2.0544114112854004, "rewards/margins": 7.669661045074463, "rewards/rejected": -5.6152496337890625, "step": 4444 }, { "epoch": 0.7408333333333333, "grad_norm": 21.808528900146484, "learning_rate": 3.320737025995834e-08, "logits/chosen": 1.7290759086608887, "logits/rejected": 2.559751272201538, "logps/chosen": -76.03950500488281, "logps/rejected": -102.23855590820312, "loss": 0.7107, "nll_loss": 0.6976101398468018, "rewards/accuracies": 1.0, "rewards/chosen": 3.1264896392822266, "rewards/margins": 7.315877437591553, "rewards/rejected": -4.189387798309326, "step": 4445 }, { "epoch": 0.741, "grad_norm": 33.72624969482422, "learning_rate": 3.316720715064e-08, "logits/chosen": 2.767315149307251, "logits/rejected": 2.9839391708374023, "logps/chosen": -56.67662048339844, "logps/rejected": -339.8316345214844, "loss": 1.0428, "nll_loss": 1.012082576751709, "rewards/accuracies": 1.0, "rewards/chosen": 0.6678314208984375, "rewards/margins": 6.715692520141602, "rewards/rejected": -6.047861099243164, "step": 4446 }, { "epoch": 0.7411666666666666, "grad_norm": 23.272056579589844, "learning_rate": 3.3127063514792975e-08, "logits/chosen": 2.738896369934082, "logits/rejected": 2.6416335105895996, "logps/chosen": -21.072982788085938, "logps/rejected": -54.90850067138672, "loss": 0.4278, "nll_loss": 0.3902404308319092, "rewards/accuracies": 1.0, "rewards/chosen": 2.1217305660247803, "rewards/margins": 5.292977333068848, "rewards/rejected": -3.1712470054626465, "step": 4447 }, { "epoch": 0.7413333333333333, "grad_norm": 26.664329528808594, "learning_rate": 3.3086939364114206e-08, "logits/chosen": 2.679959774017334, "logits/rejected": 2.803966760635376, "logps/chosen": -120.39311981201172, "logps/rejected": -414.6575012207031, "loss": 0.9945, "nll_loss": 0.9788058996200562, "rewards/accuracies": 1.0, "rewards/chosen": 1.2874977588653564, "rewards/margins": 10.270893096923828, "rewards/rejected": -8.98339557647705, "step": 4448 }, { "epoch": 0.7415, "grad_norm": 29.191781997680664, "learning_rate": 3.304683471029485e-08, "logits/chosen": 2.7168476581573486, "logits/rejected": 2.7461817264556885, "logps/chosen": -55.65968322753906, "logps/rejected": -81.91930389404297, "loss": 0.7827, "nll_loss": 0.7624614238739014, "rewards/accuracies": 1.0, "rewards/chosen": 1.6697425842285156, "rewards/margins": 6.126322269439697, "rewards/rejected": -4.456579685211182, "step": 4449 }, { "epoch": 0.7416666666666667, "grad_norm": 33.60956573486328, "learning_rate": 3.300674956502046e-08, "logits/chosen": 2.8364768028259277, "logits/rejected": 2.990494966506958, "logps/chosen": -46.089420318603516, "logps/rejected": -267.56927490234375, "loss": 0.9254, "nll_loss": 0.886335015296936, "rewards/accuracies": 1.0, "rewards/chosen": 1.0907936096191406, "rewards/margins": 4.910208702087402, "rewards/rejected": -3.819415330886841, "step": 4450 }, { "epoch": 0.7418333333333333, "grad_norm": 23.50255584716797, "learning_rate": 3.2966683939970906e-08, "logits/chosen": 2.788665771484375, "logits/rejected": 2.8125059604644775, "logps/chosen": -8.982057571411133, "logps/rejected": -192.85150146484375, "loss": 0.2849, "nll_loss": 0.28068920969963074, "rewards/accuracies": 1.0, "rewards/chosen": 2.7154061794281006, "rewards/margins": 10.244935989379883, "rewards/rejected": -7.529529571533203, "step": 4451 }, { "epoch": 0.742, "grad_norm": 10.099291801452637, "learning_rate": 3.292663784682036e-08, "logits/chosen": 1.2065142393112183, "logits/rejected": 0.9618873596191406, "logps/chosen": -130.1002655029297, "logps/rejected": -183.42715454101562, "loss": 0.5326, "nll_loss": 0.5310214757919312, "rewards/accuracies": 1.0, "rewards/chosen": 3.7786502838134766, "rewards/margins": 11.637617111206055, "rewards/rejected": -7.858967304229736, "step": 4452 }, { "epoch": 0.7421666666666666, "grad_norm": 19.073562622070312, "learning_rate": 3.288661129723724e-08, "logits/chosen": 1.445123314857483, "logits/rejected": 1.553080677986145, "logps/chosen": -200.61700439453125, "logps/rejected": -200.66705322265625, "loss": 1.0332, "nll_loss": 1.023556113243103, "rewards/accuracies": 1.0, "rewards/chosen": 1.7985825538635254, "rewards/margins": 10.610723495483398, "rewards/rejected": -8.812140464782715, "step": 4453 }, { "epoch": 0.7423333333333333, "grad_norm": 24.461225509643555, "learning_rate": 3.284660430288435e-08, "logits/chosen": 1.204533576965332, "logits/rejected": 2.729745626449585, "logps/chosen": -65.60909271240234, "logps/rejected": -445.28765869140625, "loss": 0.7565, "nll_loss": 0.7455577254295349, "rewards/accuracies": 1.0, "rewards/chosen": 1.7013664245605469, "rewards/margins": 9.164155960083008, "rewards/rejected": -7.462790012359619, "step": 4454 }, { "epoch": 0.7425, "grad_norm": 33.63466262817383, "learning_rate": 3.2806616875418756e-08, "logits/chosen": 2.5811874866485596, "logits/rejected": 2.4732327461242676, "logps/chosen": -22.02916717529297, "logps/rejected": -70.06352233886719, "loss": 0.5579, "nll_loss": 0.5245040059089661, "rewards/accuracies": 1.0, "rewards/chosen": 1.3640313148498535, "rewards/margins": 5.179958343505859, "rewards/rejected": -3.815927267074585, "step": 4455 }, { "epoch": 0.7426666666666667, "grad_norm": 15.456986427307129, "learning_rate": 3.276664902649186e-08, "logits/chosen": 2.5683186054229736, "logits/rejected": 2.7912967205047607, "logps/chosen": -96.95555114746094, "logps/rejected": -323.09185791015625, "loss": 0.5856, "nll_loss": 0.5805721879005432, "rewards/accuracies": 1.0, "rewards/chosen": 2.5233566761016846, "rewards/margins": 10.072300910949707, "rewards/rejected": -7.548944473266602, "step": 4456 }, { "epoch": 0.7428333333333333, "grad_norm": 44.93692398071289, "learning_rate": 3.2726700767749316e-08, "logits/chosen": 2.58542799949646, "logits/rejected": 2.5828335285186768, "logps/chosen": -141.35711669921875, "logps/rejected": -142.6256103515625, "loss": 1.6695, "nll_loss": 1.6436877250671387, "rewards/accuracies": 1.0, "rewards/chosen": 0.8723236322402954, "rewards/margins": 6.824402332305908, "rewards/rejected": -5.952078819274902, "step": 4457 }, { "epoch": 0.743, "grad_norm": 163.7613525390625, "learning_rate": 3.2686772110831085e-08, "logits/chosen": 2.3484303951263428, "logits/rejected": 2.3604156970977783, "logps/chosen": -48.68304443359375, "logps/rejected": -38.81244659423828, "loss": 2.8324, "nll_loss": 1.947322130203247, "rewards/accuracies": 0.0, "rewards/chosen": -1.6641594171524048, "rewards/margins": -0.08936464786529541, "rewards/rejected": -1.5747947692871094, "step": 4458 }, { "epoch": 0.7431666666666666, "grad_norm": 184.85743713378906, "learning_rate": 3.2646863067371466e-08, "logits/chosen": 2.365229606628418, "logits/rejected": 2.1422343254089355, "logps/chosen": -655.4044799804688, "logps/rejected": -376.8676452636719, "loss": 1.9931, "nll_loss": 1.085106611251831, "rewards/accuracies": 1.0, "rewards/chosen": -5.7255072593688965, "rewards/margins": 4.883312702178955, "rewards/rejected": -10.608819961547852, "step": 4459 }, { "epoch": 0.7433333333333333, "grad_norm": 41.359466552734375, "learning_rate": 3.260697364899892e-08, "logits/chosen": 2.9919559955596924, "logits/rejected": 3.028404712677002, "logps/chosen": -14.134299278259277, "logps/rejected": -18.868234634399414, "loss": 1.0585, "nll_loss": 0.48738956451416016, "rewards/accuracies": 1.0, "rewards/chosen": 5.158829689025879, "rewards/margins": 3.6947920322418213, "rewards/rejected": 1.4640376567840576, "step": 4460 }, { "epoch": 0.7435, "grad_norm": 38.65229797363281, "learning_rate": 3.256710386733629e-08, "logits/chosen": 2.087066411972046, "logits/rejected": 2.5884504318237305, "logps/chosen": -28.317712783813477, "logps/rejected": -311.0201721191406, "loss": 1.0495, "nll_loss": 1.0488041639328003, "rewards/accuracies": 1.0, "rewards/chosen": 4.9410080909729, "rewards/margins": 12.651592254638672, "rewards/rejected": -7.710583686828613, "step": 4461 }, { "epoch": 0.7436666666666667, "grad_norm": 28.747718811035156, "learning_rate": 3.252725373400069e-08, "logits/chosen": 2.4509518146514893, "logits/rejected": 2.739750862121582, "logps/chosen": -34.24539566040039, "logps/rejected": -420.2173156738281, "loss": 0.5534, "nll_loss": 0.5435776710510254, "rewards/accuracies": 1.0, "rewards/chosen": 1.7665843963623047, "rewards/margins": 11.052750587463379, "rewards/rejected": -9.286166191101074, "step": 4462 }, { "epoch": 0.7438333333333333, "grad_norm": 27.23161506652832, "learning_rate": 3.248742326060349e-08, "logits/chosen": 1.6370817422866821, "logits/rejected": 1.7973952293395996, "logps/chosen": -56.19334030151367, "logps/rejected": -125.74093627929688, "loss": 0.7274, "nll_loss": 0.702416718006134, "rewards/accuracies": 1.0, "rewards/chosen": 0.9791897535324097, "rewards/margins": 6.451547145843506, "rewards/rejected": -5.472357273101807, "step": 4463 }, { "epoch": 0.744, "grad_norm": 21.643795013427734, "learning_rate": 3.244761245875036e-08, "logits/chosen": 2.670559883117676, "logits/rejected": 2.5820558071136475, "logps/chosen": -49.483787536621094, "logps/rejected": -246.29632568359375, "loss": 0.6007, "nll_loss": 0.5961902141571045, "rewards/accuracies": 1.0, "rewards/chosen": 2.8439698219299316, "rewards/margins": 9.343353271484375, "rewards/rejected": -6.499383449554443, "step": 4464 }, { "epoch": 0.7441666666666666, "grad_norm": 297.13201904296875, "learning_rate": 3.240782134004115e-08, "logits/chosen": 2.7346267700195312, "logits/rejected": 2.9530491828918457, "logps/chosen": -31.71559715270996, "logps/rejected": -118.98857879638672, "loss": 2.854, "nll_loss": 0.7551332116127014, "rewards/accuracies": 0.0, "rewards/chosen": 0.8192373514175415, "rewards/margins": -1.4983781576156616, "rewards/rejected": 2.317615509033203, "step": 4465 }, { "epoch": 0.7443333333333333, "grad_norm": 25.024791717529297, "learning_rate": 3.236804991607007e-08, "logits/chosen": 2.962209463119507, "logits/rejected": 2.975327491760254, "logps/chosen": -87.5549087524414, "logps/rejected": -72.30712890625, "loss": 0.8471, "nll_loss": 0.810693621635437, "rewards/accuracies": 1.0, "rewards/chosen": 3.204387664794922, "rewards/margins": 6.169574737548828, "rewards/rejected": -2.9651870727539062, "step": 4466 }, { "epoch": 0.7445, "grad_norm": 217.4149627685547, "learning_rate": 3.2328298198425553e-08, "logits/chosen": 2.535879611968994, "logits/rejected": 2.2817933559417725, "logps/chosen": -158.1739959716797, "logps/rejected": -121.39360809326172, "loss": 2.1383, "nll_loss": 1.3072229623794556, "rewards/accuracies": 1.0, "rewards/chosen": -3.3971590995788574, "rewards/margins": 0.5560369491577148, "rewards/rejected": -3.9531960487365723, "step": 4467 }, { "epoch": 0.7446666666666667, "grad_norm": 26.02190399169922, "learning_rate": 3.228856619869034e-08, "logits/chosen": 1.3368865251541138, "logits/rejected": 1.2162235975265503, "logps/chosen": -51.5260009765625, "logps/rejected": -100.6021957397461, "loss": 0.6312, "nll_loss": 0.6207951307296753, "rewards/accuracies": 1.0, "rewards/chosen": 1.9502143859863281, "rewards/margins": 7.822079658508301, "rewards/rejected": -5.871865272521973, "step": 4468 }, { "epoch": 0.7448333333333333, "grad_norm": 29.715015411376953, "learning_rate": 3.22488539284413e-08, "logits/chosen": 2.1326045989990234, "logits/rejected": 2.36698579788208, "logps/chosen": -86.34379577636719, "logps/rejected": -212.66256713867188, "loss": 1.0213, "nll_loss": 0.9924575090408325, "rewards/accuracies": 1.0, "rewards/chosen": 1.3350242376327515, "rewards/margins": 5.469802379608154, "rewards/rejected": -4.134778022766113, "step": 4469 }, { "epoch": 0.745, "grad_norm": 24.607257843017578, "learning_rate": 3.220916139924967e-08, "logits/chosen": 2.938323974609375, "logits/rejected": 2.95613956451416, "logps/chosen": -75.52326965332031, "logps/rejected": -134.4503631591797, "loss": 0.7822, "nll_loss": 0.7706455588340759, "rewards/accuracies": 1.0, "rewards/chosen": 1.6251497268676758, "rewards/margins": 9.431718826293945, "rewards/rejected": -7.8065690994262695, "step": 4470 }, { "epoch": 0.7451666666666666, "grad_norm": 28.72939682006836, "learning_rate": 3.216948862268092e-08, "logits/chosen": 2.985217809677124, "logits/rejected": 3.1544406414031982, "logps/chosen": -37.999576568603516, "logps/rejected": -209.87466430664062, "loss": 0.6675, "nll_loss": 0.6551651954650879, "rewards/accuracies": 1.0, "rewards/chosen": 1.5571670532226562, "rewards/margins": 9.218437194824219, "rewards/rejected": -7.661270618438721, "step": 4471 }, { "epoch": 0.7453333333333333, "grad_norm": 24.853158950805664, "learning_rate": 3.212983561029475e-08, "logits/chosen": 3.2392122745513916, "logits/rejected": 3.4258499145507812, "logps/chosen": -62.238250732421875, "logps/rejected": -365.1705322265625, "loss": 0.8003, "nll_loss": 0.7779781222343445, "rewards/accuracies": 1.0, "rewards/chosen": 1.2830222845077515, "rewards/margins": 6.184975624084473, "rewards/rejected": -4.901953220367432, "step": 4472 }, { "epoch": 0.7455, "grad_norm": 18.773277282714844, "learning_rate": 3.209020237364505e-08, "logits/chosen": 2.893155336380005, "logits/rejected": 2.9674692153930664, "logps/chosen": -55.609764099121094, "logps/rejected": -215.05018615722656, "loss": 0.6078, "nll_loss": 0.6044540405273438, "rewards/accuracies": 1.0, "rewards/chosen": 3.3372573852539062, "rewards/margins": 9.688817977905273, "rewards/rejected": -6.351561069488525, "step": 4473 }, { "epoch": 0.7456666666666667, "grad_norm": 64.29395294189453, "learning_rate": 3.205058892428002e-08, "logits/chosen": 2.5702402591705322, "logits/rejected": 2.5494954586029053, "logps/chosen": -124.44654083251953, "logps/rejected": -154.47616577148438, "loss": 2.4971, "nll_loss": 2.4889304637908936, "rewards/accuracies": 1.0, "rewards/chosen": 2.700493574142456, "rewards/margins": 7.83017635345459, "rewards/rejected": -5.129682540893555, "step": 4474 }, { "epoch": 0.7458333333333333, "grad_norm": 26.058921813964844, "learning_rate": 3.201099527374207e-08, "logits/chosen": 1.1187494993209839, "logits/rejected": 1.60714852809906, "logps/chosen": -88.33901977539062, "logps/rejected": -213.58828735351562, "loss": 0.9989, "nll_loss": 0.9815448522567749, "rewards/accuracies": 1.0, "rewards/chosen": 1.2546334266662598, "rewards/margins": 7.824552536010742, "rewards/rejected": -6.569919109344482, "step": 4475 }, { "epoch": 0.746, "grad_norm": 203.51092529296875, "learning_rate": 3.197142143356787e-08, "logits/chosen": 3.073101282119751, "logits/rejected": 3.0111801624298096, "logps/chosen": -39.740478515625, "logps/rejected": -33.582252502441406, "loss": 2.1711, "nll_loss": 0.48463988304138184, "rewards/accuracies": 0.0, "rewards/chosen": 1.6649903059005737, "rewards/margins": -0.7938240766525269, "rewards/rejected": 2.4588143825531006, "step": 4476 }, { "epoch": 0.7461666666666666, "grad_norm": 33.908756256103516, "learning_rate": 3.1931867415288215e-08, "logits/chosen": 3.3541834354400635, "logits/rejected": 3.2697999477386475, "logps/chosen": -67.78692626953125, "logps/rejected": -60.403404235839844, "loss": 1.0504, "nll_loss": 1.0117452144622803, "rewards/accuracies": 1.0, "rewards/chosen": 2.6421899795532227, "rewards/margins": 5.633554458618164, "rewards/rejected": -2.9913642406463623, "step": 4477 }, { "epoch": 0.7463333333333333, "grad_norm": 13.342857360839844, "learning_rate": 3.1892333230428235e-08, "logits/chosen": 1.6470956802368164, "logits/rejected": 1.5708814859390259, "logps/chosen": -182.3797607421875, "logps/rejected": -245.05975341796875, "loss": 0.7463, "nll_loss": 0.7444072961807251, "rewards/accuracies": 1.0, "rewards/chosen": 3.773250102996826, "rewards/margins": 10.937599182128906, "rewards/rejected": -7.164348602294922, "step": 4478 }, { "epoch": 0.7465, "grad_norm": 24.807558059692383, "learning_rate": 3.1852818890507246e-08, "logits/chosen": 1.786221981048584, "logits/rejected": 2.200976610183716, "logps/chosen": -41.00816345214844, "logps/rejected": -371.26953125, "loss": 0.6324, "nll_loss": 0.6308949589729309, "rewards/accuracies": 1.0, "rewards/chosen": 3.6282074451446533, "rewards/margins": 20.4741153717041, "rewards/rejected": -16.84590721130371, "step": 4479 }, { "epoch": 0.7466666666666667, "grad_norm": 108.84809112548828, "learning_rate": 3.181332440703882e-08, "logits/chosen": 2.03109073638916, "logits/rejected": 2.5283305644989014, "logps/chosen": -11.647891998291016, "logps/rejected": -302.6277160644531, "loss": 1.0228, "nll_loss": 0.9706575870513916, "rewards/accuracies": 1.0, "rewards/chosen": -0.01654996909201145, "rewards/margins": 9.557304382324219, "rewards/rejected": -9.573854446411133, "step": 4480 }, { "epoch": 0.7468333333333333, "grad_norm": 25.92781639099121, "learning_rate": 3.1773849791530616e-08, "logits/chosen": 2.7551398277282715, "logits/rejected": 2.799283742904663, "logps/chosen": -85.11328125, "logps/rejected": -169.0806427001953, "loss": 0.7681, "nll_loss": 0.7401155829429626, "rewards/accuracies": 1.0, "rewards/chosen": 0.6864631772041321, "rewards/margins": 8.296252250671387, "rewards/rejected": -7.60978889465332, "step": 4481 }, { "epoch": 0.747, "grad_norm": 27.415428161621094, "learning_rate": 3.173439505548462e-08, "logits/chosen": 2.7780115604400635, "logits/rejected": 2.8570408821105957, "logps/chosen": -103.73603057861328, "logps/rejected": -191.06861877441406, "loss": 0.9364, "nll_loss": 0.9262147545814514, "rewards/accuracies": 1.0, "rewards/chosen": 1.742213487625122, "rewards/margins": 9.900729179382324, "rewards/rejected": -8.158515930175781, "step": 4482 }, { "epoch": 0.7471666666666666, "grad_norm": 29.088542938232422, "learning_rate": 3.1694960210397016e-08, "logits/chosen": 2.6327412128448486, "logits/rejected": 2.867748260498047, "logps/chosen": -41.51872253417969, "logps/rejected": -426.01348876953125, "loss": 0.7365, "nll_loss": 0.7283986210823059, "rewards/accuracies": 1.0, "rewards/chosen": 2.035362720489502, "rewards/margins": 9.177312850952148, "rewards/rejected": -7.141949653625488, "step": 4483 }, { "epoch": 0.7473333333333333, "grad_norm": 130.90257263183594, "learning_rate": 3.165554526775815e-08, "logits/chosen": 2.812037229537964, "logits/rejected": 2.716118335723877, "logps/chosen": -35.651466369628906, "logps/rejected": -29.828777313232422, "loss": 2.2832, "nll_loss": 1.6976884603500366, "rewards/accuracies": 1.0, "rewards/chosen": 0.7688743472099304, "rewards/margins": 0.874154269695282, "rewards/rejected": -0.10527992248535156, "step": 4484 }, { "epoch": 0.7475, "grad_norm": 28.600038528442383, "learning_rate": 3.161615023905264e-08, "logits/chosen": 1.8935205936431885, "logits/rejected": 2.257615566253662, "logps/chosen": -93.68968200683594, "logps/rejected": -250.56655883789062, "loss": 1.0241, "nll_loss": 1.0183662176132202, "rewards/accuracies": 1.0, "rewards/chosen": 2.323577880859375, "rewards/margins": 12.009465217590332, "rewards/rejected": -9.685887336730957, "step": 4485 }, { "epoch": 0.7476666666666667, "grad_norm": 269.2992248535156, "learning_rate": 3.157677513575917e-08, "logits/chosen": 2.5179810523986816, "logits/rejected": 2.7497193813323975, "logps/chosen": -68.75167846679688, "logps/rejected": -50.84088897705078, "loss": 4.882, "nll_loss": 0.8593959808349609, "rewards/accuracies": 0.0, "rewards/chosen": 1.3821258544921875, "rewards/margins": -3.4608774185180664, "rewards/rejected": 4.843003273010254, "step": 4486 }, { "epoch": 0.7478333333333333, "grad_norm": 20.87472152709961, "learning_rate": 3.1537419969350756e-08, "logits/chosen": 1.5843461751937866, "logits/rejected": 2.5870859622955322, "logps/chosen": -70.11251831054688, "logps/rejected": -210.45269775390625, "loss": 0.7595, "nll_loss": 0.7538981437683105, "rewards/accuracies": 1.0, "rewards/chosen": 2.418044328689575, "rewards/margins": 9.78542423248291, "rewards/rejected": -7.367379665374756, "step": 4487 }, { "epoch": 0.748, "grad_norm": 28.445880889892578, "learning_rate": 3.1498084751294516e-08, "logits/chosen": 1.5245662927627563, "logits/rejected": 1.3942015171051025, "logps/chosen": -91.80547332763672, "logps/rejected": -134.74349975585938, "loss": 0.9984, "nll_loss": 0.9766541123390198, "rewards/accuracies": 1.0, "rewards/chosen": 1.4074379205703735, "rewards/margins": 6.10138463973999, "rewards/rejected": -4.693946838378906, "step": 4488 }, { "epoch": 0.7481666666666666, "grad_norm": 22.120861053466797, "learning_rate": 3.145876949305185e-08, "logits/chosen": 2.4929418563842773, "logits/rejected": 2.2750916481018066, "logps/chosen": -128.79721069335938, "logps/rejected": -218.97756958007812, "loss": 0.7049, "nll_loss": 0.6743309497833252, "rewards/accuracies": 1.0, "rewards/chosen": 0.578082263469696, "rewards/margins": 9.093954086303711, "rewards/rejected": -8.51587200164795, "step": 4489 }, { "epoch": 0.7483333333333333, "grad_norm": 23.308761596679688, "learning_rate": 3.14194742060782e-08, "logits/chosen": 1.5480695962905884, "logits/rejected": 1.9027985334396362, "logps/chosen": -83.5425796508789, "logps/rejected": -160.0288543701172, "loss": 0.7941, "nll_loss": 0.7881375551223755, "rewards/accuracies": 1.0, "rewards/chosen": 2.3324532508850098, "rewards/margins": 10.005094528198242, "rewards/rejected": -7.672641754150391, "step": 4490 }, { "epoch": 0.7485, "grad_norm": 139.9385986328125, "learning_rate": 3.1380198901823304e-08, "logits/chosen": 2.094805955886841, "logits/rejected": 2.0497500896453857, "logps/chosen": -39.49811553955078, "logps/rejected": -14.637657165527344, "loss": 2.4707, "nll_loss": 0.4488421678543091, "rewards/accuracies": 0.0, "rewards/chosen": 2.226247787475586, "rewards/margins": -1.0905094146728516, "rewards/rejected": 3.3167572021484375, "step": 4491 }, { "epoch": 0.7486666666666667, "grad_norm": 29.09418296813965, "learning_rate": 3.134094359173104e-08, "logits/chosen": 2.2069737911224365, "logits/rejected": 2.3092150688171387, "logps/chosen": -94.32635498046875, "logps/rejected": -120.31576538085938, "loss": 1.1436, "nll_loss": 1.1229329109191895, "rewards/accuracies": 1.0, "rewards/chosen": 1.2254364490509033, "rewards/margins": 6.597836494445801, "rewards/rejected": -5.372399806976318, "step": 4492 }, { "epoch": 0.7488333333333334, "grad_norm": 36.59300994873047, "learning_rate": 3.1301708287239504e-08, "logits/chosen": 2.1222164630889893, "logits/rejected": 2.7272887229919434, "logps/chosen": -19.338665008544922, "logps/rejected": -233.27407836914062, "loss": 0.6314, "nll_loss": 0.6043333411216736, "rewards/accuracies": 1.0, "rewards/chosen": 0.7941181063652039, "rewards/margins": 6.932375431060791, "rewards/rejected": -6.1382575035095215, "step": 4493 }, { "epoch": 0.749, "grad_norm": 20.732566833496094, "learning_rate": 3.126249299978085e-08, "logits/chosen": 1.6490964889526367, "logits/rejected": 1.6133519411087036, "logps/chosen": -85.35614013671875, "logps/rejected": -119.79315185546875, "loss": 0.7495, "nll_loss": 0.7358289361000061, "rewards/accuracies": 1.0, "rewards/chosen": 3.7213973999023438, "rewards/margins": 7.74508810043335, "rewards/rejected": -4.023690700531006, "step": 4494 }, { "epoch": 0.7491666666666666, "grad_norm": 126.45172119140625, "learning_rate": 3.122329774078152e-08, "logits/chosen": 2.03000807762146, "logits/rejected": 2.151271343231201, "logps/chosen": -12.616819381713867, "logps/rejected": -72.5265121459961, "loss": 1.1477, "nll_loss": 0.5734917521476746, "rewards/accuracies": 1.0, "rewards/chosen": 2.2318201065063477, "rewards/margins": 1.5366754531860352, "rewards/rejected": 0.6951446533203125, "step": 4495 }, { "epoch": 0.7493333333333333, "grad_norm": 56.94768142700195, "learning_rate": 3.118412252166205e-08, "logits/chosen": 1.761813998222351, "logits/rejected": 2.4056003093719482, "logps/chosen": -48.19911193847656, "logps/rejected": -153.23057556152344, "loss": 1.088, "nll_loss": 1.047806739807129, "rewards/accuracies": 1.0, "rewards/chosen": 0.3292992115020752, "rewards/margins": 6.765070915222168, "rewards/rejected": -6.435771942138672, "step": 4496 }, { "epoch": 0.7495, "grad_norm": 66.40213012695312, "learning_rate": 3.114496735383719e-08, "logits/chosen": 2.650195360183716, "logits/rejected": 2.6624503135681152, "logps/chosen": -31.833486557006836, "logps/rejected": -228.58480834960938, "loss": 0.8703, "nll_loss": 0.8603647351264954, "rewards/accuracies": 1.0, "rewards/chosen": 2.3397862911224365, "rewards/margins": 7.4977521896362305, "rewards/rejected": -5.157965660095215, "step": 4497 }, { "epoch": 0.7496666666666667, "grad_norm": 40.87688064575195, "learning_rate": 3.110583224871577e-08, "logits/chosen": 2.520904779434204, "logits/rejected": 2.3690407276153564, "logps/chosen": -92.71640014648438, "logps/rejected": -185.08108520507812, "loss": 1.1493, "nll_loss": 1.1306878328323364, "rewards/accuracies": 1.0, "rewards/chosen": 1.1667121648788452, "rewards/margins": 7.791840553283691, "rewards/rejected": -6.625128269195557, "step": 4498 }, { "epoch": 0.7498333333333334, "grad_norm": 33.225975036621094, "learning_rate": 3.106671721770083e-08, "logits/chosen": 1.1628319025039673, "logits/rejected": 1.7180036306381226, "logps/chosen": -51.73514938354492, "logps/rejected": -167.11590576171875, "loss": 0.7521, "nll_loss": 0.7185438275337219, "rewards/accuracies": 1.0, "rewards/chosen": 0.845441460609436, "rewards/margins": 5.477147579193115, "rewards/rejected": -4.631706237792969, "step": 4499 }, { "epoch": 0.75, "grad_norm": 31.41991424560547, "learning_rate": 3.102762227218957e-08, "logits/chosen": 2.2003748416900635, "logits/rejected": 2.5988681316375732, "logps/chosen": -28.98406982421875, "logps/rejected": -307.8150939941406, "loss": 0.661, "nll_loss": 0.6440902948379517, "rewards/accuracies": 1.0, "rewards/chosen": 1.2255455255508423, "rewards/margins": 8.90251350402832, "rewards/rejected": -7.676967620849609, "step": 4500 }, { "epoch": 0.7501666666666666, "grad_norm": 33.49885559082031, "learning_rate": 3.098854742357334e-08, "logits/chosen": 3.3714990615844727, "logits/rejected": 3.507096290588379, "logps/chosen": -17.429052352905273, "logps/rejected": -114.72378540039062, "loss": 0.5031, "nll_loss": 0.4841403365135193, "rewards/accuracies": 1.0, "rewards/chosen": 1.692160964012146, "rewards/margins": 6.262203693389893, "rewards/rejected": -4.570042610168457, "step": 4501 }, { "epoch": 0.7503333333333333, "grad_norm": 35.75938415527344, "learning_rate": 3.0949492683237553e-08, "logits/chosen": 2.3980867862701416, "logits/rejected": 2.4816277027130127, "logps/chosen": -25.005313873291016, "logps/rejected": -175.84210205078125, "loss": 0.5742, "nll_loss": 0.5683026313781738, "rewards/accuracies": 1.0, "rewards/chosen": 2.353783369064331, "rewards/margins": 9.906421661376953, "rewards/rejected": -7.552638530731201, "step": 4502 }, { "epoch": 0.7505, "grad_norm": 64.58208465576172, "learning_rate": 3.0910458062561864e-08, "logits/chosen": 2.8680660724639893, "logits/rejected": 2.866081476211548, "logps/chosen": -7.118795394897461, "logps/rejected": -174.22332763671875, "loss": 0.5633, "nll_loss": 0.5475996136665344, "rewards/accuracies": 1.0, "rewards/chosen": 1.4364086389541626, "rewards/margins": 7.382360458374023, "rewards/rejected": -5.94595193862915, "step": 4503 }, { "epoch": 0.7506666666666667, "grad_norm": 298.4954528808594, "learning_rate": 3.087144357292001e-08, "logits/chosen": 1.8689485788345337, "logits/rejected": 2.134150981903076, "logps/chosen": -84.36289978027344, "logps/rejected": -75.1083755493164, "loss": 4.3275, "nll_loss": 0.8697205185890198, "rewards/accuracies": 0.0, "rewards/chosen": 1.750300645828247, "rewards/margins": -2.8040854930877686, "rewards/rejected": 4.554386138916016, "step": 4504 }, { "epoch": 0.7508333333333334, "grad_norm": 26.61410140991211, "learning_rate": 3.083244922567987e-08, "logits/chosen": 2.3983778953552246, "logits/rejected": 2.5761330127716064, "logps/chosen": -108.07572937011719, "logps/rejected": -361.0648498535156, "loss": 1.0758, "nll_loss": 1.0700565576553345, "rewards/accuracies": 1.0, "rewards/chosen": 2.303410291671753, "rewards/margins": 15.308198928833008, "rewards/rejected": -13.004788398742676, "step": 4505 }, { "epoch": 0.751, "grad_norm": 31.92315673828125, "learning_rate": 3.079347503220351e-08, "logits/chosen": 2.1861698627471924, "logits/rejected": 2.5621235370635986, "logps/chosen": -21.367977142333984, "logps/rejected": -69.80672454833984, "loss": 0.5776, "nll_loss": 0.5623151659965515, "rewards/accuracies": 1.0, "rewards/chosen": 1.5335133075714111, "rewards/margins": 7.1612653732299805, "rewards/rejected": -5.62775182723999, "step": 4506 }, { "epoch": 0.7511666666666666, "grad_norm": 26.88929557800293, "learning_rate": 3.0754521003847014e-08, "logits/chosen": 2.668243408203125, "logits/rejected": 2.697258949279785, "logps/chosen": -61.22207260131836, "logps/rejected": -136.96090698242188, "loss": 0.7846, "nll_loss": 0.7749630212783813, "rewards/accuracies": 1.0, "rewards/chosen": 2.174311399459839, "rewards/margins": 7.703547477722168, "rewards/rejected": -5.529236316680908, "step": 4507 }, { "epoch": 0.7513333333333333, "grad_norm": 32.224666595458984, "learning_rate": 3.0715587151960655e-08, "logits/chosen": 3.010720729827881, "logits/rejected": 2.9094974994659424, "logps/chosen": -59.53861999511719, "logps/rejected": -92.86095428466797, "loss": 0.7511, "nll_loss": 0.6765753030776978, "rewards/accuracies": 1.0, "rewards/chosen": 2.0135498046875, "rewards/margins": 4.360424041748047, "rewards/rejected": -2.346874237060547, "step": 4508 }, { "epoch": 0.7515, "grad_norm": 11.371455192565918, "learning_rate": 3.067667348788885e-08, "logits/chosen": 1.6311635971069336, "logits/rejected": 1.7838444709777832, "logps/chosen": -266.52801513671875, "logps/rejected": -233.0206298828125, "loss": 0.7864, "nll_loss": 0.7816071510314941, "rewards/accuracies": 1.0, "rewards/chosen": 3.329925537109375, "rewards/margins": 8.898904800415039, "rewards/rejected": -5.568978786468506, "step": 4509 }, { "epoch": 0.7516666666666667, "grad_norm": 82.41814422607422, "learning_rate": 3.0637780022970126e-08, "logits/chosen": 2.0193283557891846, "logits/rejected": 1.8974837064743042, "logps/chosen": -39.96818923950195, "logps/rejected": -51.89833068847656, "loss": 1.0522, "nll_loss": 0.6344156861305237, "rewards/accuracies": 1.0, "rewards/chosen": 1.090497612953186, "rewards/margins": 1.4973801374435425, "rewards/rejected": -0.40688249468803406, "step": 4510 }, { "epoch": 0.7518333333333334, "grad_norm": 38.438377380371094, "learning_rate": 3.0598906768537046e-08, "logits/chosen": 2.6648690700531006, "logits/rejected": 2.6258950233459473, "logps/chosen": -56.47859191894531, "logps/rejected": -160.6521453857422, "loss": 0.9366, "nll_loss": 0.8964856266975403, "rewards/accuracies": 1.0, "rewards/chosen": 0.3258434236049652, "rewards/margins": 6.828278541564941, "rewards/rejected": -6.502435207366943, "step": 4511 }, { "epoch": 0.752, "grad_norm": 264.9528503417969, "learning_rate": 3.056005373591637e-08, "logits/chosen": 2.758936643600464, "logits/rejected": 2.7515392303466797, "logps/chosen": -27.953372955322266, "logps/rejected": -51.3623161315918, "loss": 3.899, "nll_loss": 0.6817896366119385, "rewards/accuracies": 0.0, "rewards/chosen": -1.1837679147720337, "rewards/margins": -3.0268607139587402, "rewards/rejected": 1.843092918395996, "step": 4512 }, { "epoch": 0.7521666666666667, "grad_norm": 26.332714080810547, "learning_rate": 3.0521220936428947e-08, "logits/chosen": 2.461183786392212, "logits/rejected": 2.3717527389526367, "logps/chosen": -81.39253997802734, "logps/rejected": -86.87313079833984, "loss": 0.8879, "nll_loss": 0.8751885294914246, "rewards/accuracies": 1.0, "rewards/chosen": 1.564063310623169, "rewards/margins": 8.447504997253418, "rewards/rejected": -6.883441925048828, "step": 4513 }, { "epoch": 0.7523333333333333, "grad_norm": 105.93355560302734, "learning_rate": 3.0482408381389746e-08, "logits/chosen": 2.950493097305298, "logits/rejected": 3.2073707580566406, "logps/chosen": -45.1781005859375, "logps/rejected": -102.46823120117188, "loss": 1.0917, "nll_loss": 0.6743000149726868, "rewards/accuracies": 1.0, "rewards/chosen": -0.4219524562358856, "rewards/margins": 1.0921558141708374, "rewards/rejected": -1.5141083002090454, "step": 4514 }, { "epoch": 0.7525, "grad_norm": 28.574581146240234, "learning_rate": 3.044361608210775e-08, "logits/chosen": 2.731437921524048, "logits/rejected": 2.7376890182495117, "logps/chosen": -88.05389404296875, "logps/rejected": -103.30286407470703, "loss": 1.0002, "nll_loss": 0.9783765077590942, "rewards/accuracies": 1.0, "rewards/chosen": 1.6603134870529175, "rewards/margins": 5.969501972198486, "rewards/rejected": -4.309188365936279, "step": 4515 }, { "epoch": 0.7526666666666667, "grad_norm": 154.6287841796875, "learning_rate": 3.0404844049886134e-08, "logits/chosen": 2.417591094970703, "logits/rejected": 2.725872039794922, "logps/chosen": -106.6633071899414, "logps/rejected": -101.58523559570312, "loss": 2.5016, "nll_loss": 1.8390225172042847, "rewards/accuracies": 1.0, "rewards/chosen": -2.9936089515686035, "rewards/margins": 0.9326019287109375, "rewards/rejected": -3.926210880279541, "step": 4516 }, { "epoch": 0.7528333333333334, "grad_norm": 25.81608772277832, "learning_rate": 3.036609229602215e-08, "logits/chosen": 2.608518600463867, "logits/rejected": 2.4939825534820557, "logps/chosen": -85.5523910522461, "logps/rejected": -107.74723052978516, "loss": 0.8962, "nll_loss": 0.8641656637191772, "rewards/accuracies": 1.0, "rewards/chosen": 1.6943390369415283, "rewards/margins": 5.311493873596191, "rewards/rejected": -3.617155075073242, "step": 4517 }, { "epoch": 0.753, "grad_norm": 27.35323143005371, "learning_rate": 3.032736083180716e-08, "logits/chosen": 2.4533188343048096, "logits/rejected": 2.241459369659424, "logps/chosen": -66.15206146240234, "logps/rejected": -108.90913391113281, "loss": 0.8731, "nll_loss": 0.8591175675392151, "rewards/accuracies": 1.0, "rewards/chosen": 1.8527641296386719, "rewards/margins": 6.934536933898926, "rewards/rejected": -5.081772804260254, "step": 4518 }, { "epoch": 0.7531666666666667, "grad_norm": 159.76556396484375, "learning_rate": 3.02886496685265e-08, "logits/chosen": 2.8505916595458984, "logits/rejected": 2.8916587829589844, "logps/chosen": -27.3631534576416, "logps/rejected": -17.194826126098633, "loss": 4.0237, "nll_loss": 0.4209716320037842, "rewards/accuracies": 0.0, "rewards/chosen": 1.6418169736862183, "rewards/margins": -2.9767985343933105, "rewards/rejected": 4.618615627288818, "step": 4519 }, { "epoch": 0.7533333333333333, "grad_norm": 16.528221130371094, "learning_rate": 3.024995881745972e-08, "logits/chosen": 1.6930102109909058, "logits/rejected": 1.604114055633545, "logps/chosen": -158.4740753173828, "logps/rejected": -118.91165161132812, "loss": 0.5878, "nll_loss": 0.561964750289917, "rewards/accuracies": 1.0, "rewards/chosen": 1.566066026687622, "rewards/margins": 5.650839805603027, "rewards/rejected": -4.084773540496826, "step": 4520 }, { "epoch": 0.7535, "grad_norm": 25.52264404296875, "learning_rate": 3.02112882898804e-08, "logits/chosen": 2.6448917388916016, "logits/rejected": 2.8218891620635986, "logps/chosen": -57.107025146484375, "logps/rejected": -243.21212768554688, "loss": 0.6922, "nll_loss": 0.6798455119132996, "rewards/accuracies": 1.0, "rewards/chosen": 1.5475540161132812, "rewards/margins": 9.377967834472656, "rewards/rejected": -7.830414295196533, "step": 4521 }, { "epoch": 0.7536666666666667, "grad_norm": 36.20365905761719, "learning_rate": 3.017263809705625e-08, "logits/chosen": 1.9826138019561768, "logits/rejected": 2.383364200592041, "logps/chosen": -26.9583683013916, "logps/rejected": -217.84658813476562, "loss": 0.7315, "nll_loss": 0.7286046147346497, "rewards/accuracies": 1.0, "rewards/chosen": 3.0186476707458496, "rewards/margins": 12.407608032226562, "rewards/rejected": -9.388959884643555, "step": 4522 }, { "epoch": 0.7538333333333334, "grad_norm": 21.629741668701172, "learning_rate": 3.013400825024891e-08, "logits/chosen": 2.8326611518859863, "logits/rejected": 2.991101026535034, "logps/chosen": -48.294456481933594, "logps/rejected": -269.0851745605469, "loss": 0.6073, "nll_loss": 0.6036805510520935, "rewards/accuracies": 1.0, "rewards/chosen": 2.7985787391662598, "rewards/margins": 12.074995040893555, "rewards/rejected": -9.276415824890137, "step": 4523 }, { "epoch": 0.754, "grad_norm": 26.080074310302734, "learning_rate": 3.009539876071426e-08, "logits/chosen": 2.7754530906677246, "logits/rejected": 2.7161741256713867, "logps/chosen": -53.51569366455078, "logps/rejected": -70.01485443115234, "loss": 0.6798, "nll_loss": 0.6526303887367249, "rewards/accuracies": 1.0, "rewards/chosen": 1.0257195234298706, "rewards/margins": 5.904043674468994, "rewards/rejected": -4.878324031829834, "step": 4524 }, { "epoch": 0.7541666666666667, "grad_norm": 173.49594116210938, "learning_rate": 3.005680963970216e-08, "logits/chosen": 3.1128153800964355, "logits/rejected": 3.161165952682495, "logps/chosen": -68.60604858398438, "logps/rejected": -50.98796844482422, "loss": 2.2744, "nll_loss": 0.7622894048690796, "rewards/accuracies": 0.0, "rewards/chosen": 2.0116937160491943, "rewards/margins": -0.4591643810272217, "rewards/rejected": 2.470858097076416, "step": 4525 }, { "epoch": 0.7543333333333333, "grad_norm": 29.957181930541992, "learning_rate": 3.001824089845655e-08, "logits/chosen": 2.8600964546203613, "logits/rejected": 2.7408814430236816, "logps/chosen": -56.26148223876953, "logps/rejected": -229.53646850585938, "loss": 0.8105, "nll_loss": 0.8037354350090027, "rewards/accuracies": 1.0, "rewards/chosen": 2.1641221046447754, "rewards/margins": 10.529455184936523, "rewards/rejected": -8.36533260345459, "step": 4526 }, { "epoch": 0.7545, "grad_norm": 32.62266540527344, "learning_rate": 2.9979692548215475e-08, "logits/chosen": 2.1208388805389404, "logits/rejected": 2.614158868789673, "logps/chosen": -77.79032897949219, "logps/rejected": -288.13848876953125, "loss": 1.0053, "nll_loss": 0.9973118305206299, "rewards/accuracies": 1.0, "rewards/chosen": 2.166980743408203, "rewards/margins": 8.440346717834473, "rewards/rejected": -6.2733659744262695, "step": 4527 }, { "epoch": 0.7546666666666667, "grad_norm": 34.06987762451172, "learning_rate": 2.994116460021093e-08, "logits/chosen": 2.283353328704834, "logits/rejected": 2.234670400619507, "logps/chosen": -22.250118255615234, "logps/rejected": -109.90003967285156, "loss": 0.6203, "nll_loss": 0.601354718208313, "rewards/accuracies": 1.0, "rewards/chosen": 2.9222545623779297, "rewards/margins": 6.714836120605469, "rewards/rejected": -3.79258131980896, "step": 4528 }, { "epoch": 0.7548333333333334, "grad_norm": 32.200809478759766, "learning_rate": 2.990265706566908e-08, "logits/chosen": 1.6825213432312012, "logits/rejected": 2.6361489295959473, "logps/chosen": -30.988740921020508, "logps/rejected": -260.4804382324219, "loss": 0.7156, "nll_loss": 0.7042896747589111, "rewards/accuracies": 1.0, "rewards/chosen": 1.795188069343567, "rewards/margins": 7.870589256286621, "rewards/rejected": -6.075401306152344, "step": 4529 }, { "epoch": 0.755, "grad_norm": 40.60259246826172, "learning_rate": 2.986416995581008e-08, "logits/chosen": 2.9617953300476074, "logits/rejected": 2.9241650104522705, "logps/chosen": -84.45156860351562, "logps/rejected": -111.60557556152344, "loss": 1.4116, "nll_loss": 1.3844518661499023, "rewards/accuracies": 1.0, "rewards/chosen": 1.0677398443222046, "rewards/margins": 5.836822509765625, "rewards/rejected": -4.769082546234131, "step": 4530 }, { "epoch": 0.7551666666666667, "grad_norm": 36.6353759765625, "learning_rate": 2.982570328184818e-08, "logits/chosen": 1.946068525314331, "logits/rejected": 2.447326183319092, "logps/chosen": -43.74593734741211, "logps/rejected": -188.86737060546875, "loss": 0.8672, "nll_loss": 0.8577633500099182, "rewards/accuracies": 1.0, "rewards/chosen": 1.9107013940811157, "rewards/margins": 8.576464653015137, "rewards/rejected": -6.665762901306152, "step": 4531 }, { "epoch": 0.7553333333333333, "grad_norm": 52.86941909790039, "learning_rate": 2.978725705499159e-08, "logits/chosen": 2.868557929992676, "logits/rejected": 3.148155450820923, "logps/chosen": -11.841772079467773, "logps/rejected": -175.59677124023438, "loss": 0.6012, "nll_loss": 0.5920886397361755, "rewards/accuracies": 1.0, "rewards/chosen": 1.9603620767593384, "rewards/margins": 8.622321128845215, "rewards/rejected": -6.661959171295166, "step": 4532 }, { "epoch": 0.7555, "grad_norm": 23.138736724853516, "learning_rate": 2.9748831286442653e-08, "logits/chosen": 1.8401800394058228, "logits/rejected": 2.1972415447235107, "logps/chosen": -68.29447937011719, "logps/rejected": -209.20916748046875, "loss": 0.7499, "nll_loss": 0.7343490719795227, "rewards/accuracies": 1.0, "rewards/chosen": 1.2825416326522827, "rewards/margins": 13.14329719543457, "rewards/rejected": -11.860755920410156, "step": 4533 }, { "epoch": 0.7556666666666667, "grad_norm": 18.292661666870117, "learning_rate": 2.97104259873977e-08, "logits/chosen": 2.291409730911255, "logits/rejected": 2.115196466445923, "logps/chosen": -71.18065643310547, "logps/rejected": -100.54486083984375, "loss": 0.6682, "nll_loss": 0.6530333757400513, "rewards/accuracies": 1.0, "rewards/chosen": 3.1175193786621094, "rewards/margins": 7.137209415435791, "rewards/rejected": -4.019690036773682, "step": 4534 }, { "epoch": 0.7558333333333334, "grad_norm": 24.415586471557617, "learning_rate": 2.967204116904717e-08, "logits/chosen": 2.1783652305603027, "logits/rejected": 2.2226672172546387, "logps/chosen": -40.66447448730469, "logps/rejected": -64.29356384277344, "loss": 0.5394, "nll_loss": 0.5213394165039062, "rewards/accuracies": 1.0, "rewards/chosen": 1.751910924911499, "rewards/margins": 6.344625473022461, "rewards/rejected": -4.592714309692383, "step": 4535 }, { "epoch": 0.756, "grad_norm": 215.2652130126953, "learning_rate": 2.9633676842575385e-08, "logits/chosen": 2.0677263736724854, "logits/rejected": 2.210344076156616, "logps/chosen": -43.92204284667969, "logps/rejected": -57.45655059814453, "loss": 3.1157, "nll_loss": 0.5559751987457275, "rewards/accuracies": 0.0, "rewards/chosen": 0.5051963925361633, "rewards/margins": -2.0812814235687256, "rewards/rejected": 2.586477756500244, "step": 4536 }, { "epoch": 0.7561666666666667, "grad_norm": 33.6132698059082, "learning_rate": 2.959533301916084e-08, "logits/chosen": 2.071554183959961, "logits/rejected": 2.165710926055908, "logps/chosen": -28.66802978515625, "logps/rejected": -118.75818634033203, "loss": 0.579, "nll_loss": 0.551308274269104, "rewards/accuracies": 1.0, "rewards/chosen": 1.2542085647583008, "rewards/margins": 5.593372821807861, "rewards/rejected": -4.3391642570495605, "step": 4537 }, { "epoch": 0.7563333333333333, "grad_norm": 24.669775009155273, "learning_rate": 2.9557009709976e-08, "logits/chosen": 3.464590311050415, "logits/rejected": 3.571869373321533, "logps/chosen": -186.2845916748047, "logps/rejected": -175.88345336914062, "loss": 0.9105, "nll_loss": 0.8913137912750244, "rewards/accuracies": 1.0, "rewards/chosen": 1.2208679914474487, "rewards/margins": 7.044778823852539, "rewards/rejected": -5.823910713195801, "step": 4538 }, { "epoch": 0.7565, "grad_norm": 44.57868194580078, "learning_rate": 2.9518706926187385e-08, "logits/chosen": 2.490666627883911, "logits/rejected": 2.511221408843994, "logps/chosen": -83.48529052734375, "logps/rejected": -106.97972106933594, "loss": 1.1747, "nll_loss": 1.0435659885406494, "rewards/accuracies": 1.0, "rewards/chosen": 1.6175987720489502, "rewards/margins": 3.375037431716919, "rewards/rejected": -1.7574386596679688, "step": 4539 }, { "epoch": 0.7566666666666667, "grad_norm": 16.453201293945312, "learning_rate": 2.948042467895544e-08, "logits/chosen": 2.4455859661102295, "logits/rejected": 2.241823673248291, "logps/chosen": -220.14608764648438, "logps/rejected": -118.68190002441406, "loss": 0.9693, "nll_loss": 0.9530133008956909, "rewards/accuracies": 1.0, "rewards/chosen": 2.793487548828125, "rewards/margins": 6.82086181640625, "rewards/rejected": -4.027374267578125, "step": 4540 }, { "epoch": 0.7568333333333334, "grad_norm": 31.92212677001953, "learning_rate": 2.9442162979434737e-08, "logits/chosen": 2.0428566932678223, "logits/rejected": 2.2597107887268066, "logps/chosen": -45.0589714050293, "logps/rejected": -154.74368286132812, "loss": 0.7029, "nll_loss": 0.6827117204666138, "rewards/accuracies": 1.0, "rewards/chosen": 1.0618259906768799, "rewards/margins": 7.915616035461426, "rewards/rejected": -6.853790283203125, "step": 4541 }, { "epoch": 0.757, "grad_norm": 147.17518615722656, "learning_rate": 2.9403921838773815e-08, "logits/chosen": 1.967202067375183, "logits/rejected": 1.694799780845642, "logps/chosen": -478.6024169921875, "logps/rejected": -366.73504638671875, "loss": 1.8217, "nll_loss": 1.3872535228729248, "rewards/accuracies": 1.0, "rewards/chosen": -3.286041259765625, "rewards/margins": 10.587411880493164, "rewards/rejected": -13.873453140258789, "step": 4542 }, { "epoch": 0.7571666666666667, "grad_norm": 105.51114654541016, "learning_rate": 2.936570126811524e-08, "logits/chosen": 2.586810350418091, "logits/rejected": 2.857494831085205, "logps/chosen": -69.66438293457031, "logps/rejected": -220.01687622070312, "loss": 1.7158, "nll_loss": 1.3932877779006958, "rewards/accuracies": 1.0, "rewards/chosen": -2.224266767501831, "rewards/margins": 2.6211740970611572, "rewards/rejected": -4.845440864562988, "step": 4543 }, { "epoch": 0.7573333333333333, "grad_norm": 46.87550354003906, "learning_rate": 2.9327501278595525e-08, "logits/chosen": 3.5558085441589355, "logits/rejected": 3.6250507831573486, "logps/chosen": -17.097789764404297, "logps/rejected": -40.89460754394531, "loss": 0.7552, "nll_loss": 0.6106353402137756, "rewards/accuracies": 1.0, "rewards/chosen": 3.5586998462677, "rewards/margins": 4.8158464431762695, "rewards/rejected": -1.2571464776992798, "step": 4544 }, { "epoch": 0.7575, "grad_norm": 103.73760223388672, "learning_rate": 2.928932188134525e-08, "logits/chosen": 2.497237205505371, "logits/rejected": 2.4737977981567383, "logps/chosen": -58.713775634765625, "logps/rejected": -136.5641326904297, "loss": 1.1414, "nll_loss": 0.5487268567085266, "rewards/accuracies": 1.0, "rewards/chosen": 2.396252393722534, "rewards/margins": 1.56976318359375, "rewards/rejected": 0.826489269733429, "step": 4545 }, { "epoch": 0.7576666666666667, "grad_norm": 26.043424606323242, "learning_rate": 2.9251163087488985e-08, "logits/chosen": 2.2108426094055176, "logits/rejected": 2.4510974884033203, "logps/chosen": -63.54594421386719, "logps/rejected": -216.018310546875, "loss": 0.8091, "nll_loss": 0.7943243384361267, "rewards/accuracies": 1.0, "rewards/chosen": 1.9225976467132568, "rewards/margins": 6.737209320068359, "rewards/rejected": -4.814611911773682, "step": 4546 }, { "epoch": 0.7578333333333334, "grad_norm": 23.4030818939209, "learning_rate": 2.921302490814527e-08, "logits/chosen": 2.6493022441864014, "logits/rejected": 2.6381585597991943, "logps/chosen": -72.41500091552734, "logps/rejected": -98.83465576171875, "loss": 0.7104, "nll_loss": 0.689666748046875, "rewards/accuracies": 1.0, "rewards/chosen": 1.6447739601135254, "rewards/margins": 6.07833194732666, "rewards/rejected": -4.433557987213135, "step": 4547 }, { "epoch": 0.758, "grad_norm": 29.43911361694336, "learning_rate": 2.917490735442669e-08, "logits/chosen": 2.0821938514709473, "logits/rejected": 2.151205062866211, "logps/chosen": -97.90068817138672, "logps/rejected": -110.68006896972656, "loss": 1.0501, "nll_loss": 1.0197988748550415, "rewards/accuracies": 1.0, "rewards/chosen": 1.8879555463790894, "rewards/margins": 5.469057083129883, "rewards/rejected": -3.581101417541504, "step": 4548 }, { "epoch": 0.7581666666666667, "grad_norm": 83.45755767822266, "learning_rate": 2.913681043743974e-08, "logits/chosen": 2.7443296909332275, "logits/rejected": 2.9516096115112305, "logps/chosen": -26.72559356689453, "logps/rejected": -263.6900634765625, "loss": 1.2556, "nll_loss": 1.2147997617721558, "rewards/accuracies": 1.0, "rewards/chosen": 0.6905364990234375, "rewards/margins": 5.062613010406494, "rewards/rejected": -4.372076511383057, "step": 4549 }, { "epoch": 0.7583333333333333, "grad_norm": 26.38677406311035, "learning_rate": 2.9098734168284965e-08, "logits/chosen": 1.8211655616760254, "logits/rejected": 2.472425937652588, "logps/chosen": -67.7440185546875, "logps/rejected": -213.23898315429688, "loss": 0.7382, "nll_loss": 0.7206809520721436, "rewards/accuracies": 1.0, "rewards/chosen": 1.16832435131073, "rewards/margins": 9.680235862731934, "rewards/rejected": -8.511911392211914, "step": 4550 }, { "epoch": 0.7585, "grad_norm": 18.495304107666016, "learning_rate": 2.9060678558056872e-08, "logits/chosen": 2.448367118835449, "logits/rejected": 2.349809408187866, "logps/chosen": -222.76876831054688, "logps/rejected": -188.04385375976562, "loss": 0.8818, "nll_loss": 0.8701905608177185, "rewards/accuracies": 1.0, "rewards/chosen": 1.5964736938476562, "rewards/margins": 10.147269248962402, "rewards/rejected": -8.550795555114746, "step": 4551 }, { "epoch": 0.7586666666666667, "grad_norm": 356.4342956542969, "learning_rate": 2.902264361784399e-08, "logits/chosen": 2.546271562576294, "logits/rejected": 2.46810245513916, "logps/chosen": -147.20277404785156, "logps/rejected": -114.90990447998047, "loss": 4.8544, "nll_loss": 2.230344295501709, "rewards/accuracies": 0.0, "rewards/chosen": -7.0780792236328125, "rewards/margins": -1.6375555992126465, "rewards/rejected": -5.440523624420166, "step": 4552 }, { "epoch": 0.7588333333333334, "grad_norm": 153.74891662597656, "learning_rate": 2.898462935872872e-08, "logits/chosen": 2.7878496646881104, "logits/rejected": 2.829204559326172, "logps/chosen": -34.85249328613281, "logps/rejected": -130.30868530273438, "loss": 2.9319, "nll_loss": 2.904374361038208, "rewards/accuracies": 1.0, "rewards/chosen": 0.7210456728935242, "rewards/margins": 7.621954917907715, "rewards/rejected": -6.900909423828125, "step": 4553 }, { "epoch": 0.759, "grad_norm": 26.95816421508789, "learning_rate": 2.8946635791787543e-08, "logits/chosen": 2.821625232696533, "logits/rejected": 2.623450756072998, "logps/chosen": -61.01483154296875, "logps/rejected": -55.556278228759766, "loss": 0.7891, "nll_loss": 0.7532695531845093, "rewards/accuracies": 1.0, "rewards/chosen": 1.4933388233184814, "rewards/margins": 5.079596519470215, "rewards/rejected": -3.5862579345703125, "step": 4554 }, { "epoch": 0.7591666666666667, "grad_norm": 33.25082015991211, "learning_rate": 2.8908662928090866e-08, "logits/chosen": 0.3327046036720276, "logits/rejected": 1.710412859916687, "logps/chosen": -27.378929138183594, "logps/rejected": -322.986328125, "loss": 0.7053, "nll_loss": 0.6844732761383057, "rewards/accuracies": 1.0, "rewards/chosen": 0.9997467994689941, "rewards/margins": 8.502087593078613, "rewards/rejected": -7.502340793609619, "step": 4555 }, { "epoch": 0.7593333333333333, "grad_norm": 25.484073638916016, "learning_rate": 2.88707107787031e-08, "logits/chosen": 2.451251268386841, "logits/rejected": 2.3325753211975098, "logps/chosen": -111.40937805175781, "logps/rejected": -71.70581817626953, "loss": 1.1299, "nll_loss": 1.1030631065368652, "rewards/accuracies": 1.0, "rewards/chosen": 2.30344557762146, "rewards/margins": 5.85443115234375, "rewards/rejected": -3.550985336303711, "step": 4556 }, { "epoch": 0.7595, "grad_norm": 22.72011947631836, "learning_rate": 2.8832779354682536e-08, "logits/chosen": 1.2151882648468018, "logits/rejected": 1.8567843437194824, "logps/chosen": -82.84249877929688, "logps/rejected": -184.9283447265625, "loss": 0.7673, "nll_loss": 0.7531136274337769, "rewards/accuracies": 1.0, "rewards/chosen": 1.9056962728500366, "rewards/margins": 6.842717170715332, "rewards/rejected": -4.937020778656006, "step": 4557 }, { "epoch": 0.7596666666666667, "grad_norm": 248.9977569580078, "learning_rate": 2.8794868667081507e-08, "logits/chosen": 1.3154951333999634, "logits/rejected": 1.3502540588378906, "logps/chosen": -58.45966339111328, "logps/rejected": -61.58408737182617, "loss": 2.9565, "nll_loss": 0.8857525587081909, "rewards/accuracies": 0.0, "rewards/chosen": 1.5072166919708252, "rewards/margins": -1.3159451484680176, "rewards/rejected": 2.8231618404388428, "step": 4558 }, { "epoch": 0.7598333333333334, "grad_norm": 29.835186004638672, "learning_rate": 2.875697872694628e-08, "logits/chosen": 2.679640531539917, "logits/rejected": 2.723712921142578, "logps/chosen": -63.36076736450195, "logps/rejected": -64.86678314208984, "loss": 0.9862, "nll_loss": 0.9456831812858582, "rewards/accuracies": 1.0, "rewards/chosen": 1.749286413192749, "rewards/margins": 4.990316390991211, "rewards/rejected": -3.241029739379883, "step": 4559 }, { "epoch": 0.76, "grad_norm": 45.31850051879883, "learning_rate": 2.87191095453171e-08, "logits/chosen": 2.099517345428467, "logits/rejected": 2.407813549041748, "logps/chosen": -20.8189754486084, "logps/rejected": -287.37078857421875, "loss": 0.774, "nll_loss": 0.7710731029510498, "rewards/accuracies": 1.0, "rewards/chosen": 3.2398841381073, "rewards/margins": 10.22506046295166, "rewards/rejected": -6.985176086425781, "step": 4560 }, { "epoch": 0.7601666666666667, "grad_norm": 23.266958236694336, "learning_rate": 2.8681261133228063e-08, "logits/chosen": 2.000385046005249, "logits/rejected": 2.495163917541504, "logps/chosen": -55.591026306152344, "logps/rejected": -584.476318359375, "loss": 0.6921, "nll_loss": 0.677939236164093, "rewards/accuracies": 1.0, "rewards/chosen": 1.37633216381073, "rewards/margins": 12.250541687011719, "rewards/rejected": -10.8742094039917, "step": 4561 }, { "epoch": 0.7603333333333333, "grad_norm": 19.27504539489746, "learning_rate": 2.864343350170735e-08, "logits/chosen": 2.0275981426239014, "logits/rejected": 2.3619401454925537, "logps/chosen": -17.062410354614258, "logps/rejected": -141.6175537109375, "loss": 0.3465, "nll_loss": 0.3281233012676239, "rewards/accuracies": 1.0, "rewards/chosen": 1.8619215488433838, "rewards/margins": 6.292201042175293, "rewards/rejected": -4.430279731750488, "step": 4562 }, { "epoch": 0.7605, "grad_norm": 30.988353729248047, "learning_rate": 2.8605626661776993e-08, "logits/chosen": 2.119581937789917, "logits/rejected": 2.0611772537231445, "logps/chosen": -12.583808898925781, "logps/rejected": -70.96153259277344, "loss": 0.3571, "nll_loss": 0.29264670610427856, "rewards/accuracies": 1.0, "rewards/chosen": 1.5449714660644531, "rewards/margins": 4.257740020751953, "rewards/rejected": -2.712768316268921, "step": 4563 }, { "epoch": 0.7606666666666667, "grad_norm": 87.75802612304688, "learning_rate": 2.8567840624453054e-08, "logits/chosen": 2.416472911834717, "logits/rejected": 2.7693448066711426, "logps/chosen": -22.54917335510254, "logps/rejected": -130.19664001464844, "loss": 1.2644, "nll_loss": 1.2527319192886353, "rewards/accuracies": 1.0, "rewards/chosen": 1.7146263122558594, "rewards/margins": 8.061573028564453, "rewards/rejected": -6.3469462394714355, "step": 4564 }, { "epoch": 0.7608333333333334, "grad_norm": 26.356586456298828, "learning_rate": 2.85300754007454e-08, "logits/chosen": 1.4240230321884155, "logits/rejected": 1.896164894104004, "logps/chosen": -76.61735534667969, "logps/rejected": -162.04397583007812, "loss": 0.8761, "nll_loss": 0.8706518411636353, "rewards/accuracies": 1.0, "rewards/chosen": 3.6182785034179688, "rewards/margins": 8.804496765136719, "rewards/rejected": -5.186218738555908, "step": 4565 }, { "epoch": 0.761, "grad_norm": 26.784269332885742, "learning_rate": 2.8492331001657942e-08, "logits/chosen": 1.0034652948379517, "logits/rejected": 2.030789852142334, "logps/chosen": -51.56058883666992, "logps/rejected": -438.2939147949219, "loss": 0.6763, "nll_loss": 0.6445072889328003, "rewards/accuracies": 1.0, "rewards/chosen": 0.5228900909423828, "rewards/margins": 9.844780921936035, "rewards/rejected": -9.321890830993652, "step": 4566 }, { "epoch": 0.7611666666666667, "grad_norm": 28.017290115356445, "learning_rate": 2.8454607438188503e-08, "logits/chosen": 2.6034092903137207, "logits/rejected": 2.7575511932373047, "logps/chosen": -77.0413818359375, "logps/rejected": -216.12371826171875, "loss": 0.9114, "nll_loss": 0.8958300948143005, "rewards/accuracies": 1.0, "rewards/chosen": 1.2861642837524414, "rewards/margins": 9.689738273620605, "rewards/rejected": -8.403573989868164, "step": 4567 }, { "epoch": 0.7613333333333333, "grad_norm": 16.791290283203125, "learning_rate": 2.8416904721328816e-08, "logits/chosen": 1.6033499240875244, "logits/rejected": 2.0193095207214355, "logps/chosen": -172.98416137695312, "logps/rejected": -347.47393798828125, "loss": 0.9988, "nll_loss": 0.9941619634628296, "rewards/accuracies": 1.0, "rewards/chosen": 2.6251282691955566, "rewards/margins": 10.008100509643555, "rewards/rejected": -7.38297176361084, "step": 4568 }, { "epoch": 0.7615, "grad_norm": 37.92564010620117, "learning_rate": 2.8379222862064566e-08, "logits/chosen": 1.0753031969070435, "logits/rejected": 1.9798387289047241, "logps/chosen": -60.36951446533203, "logps/rejected": -304.3139343261719, "loss": 1.1251, "nll_loss": 1.1179537773132324, "rewards/accuracies": 1.0, "rewards/chosen": 2.0906777381896973, "rewards/margins": 11.454355239868164, "rewards/rejected": -9.363677978515625, "step": 4569 }, { "epoch": 0.7616666666666667, "grad_norm": 31.013410568237305, "learning_rate": 2.8341561871375307e-08, "logits/chosen": 2.5181331634521484, "logits/rejected": 2.7179250717163086, "logps/chosen": -74.82442474365234, "logps/rejected": -164.0181884765625, "loss": 1.0297, "nll_loss": 1.0111408233642578, "rewards/accuracies": 1.0, "rewards/chosen": 2.5637543201446533, "rewards/margins": 6.506525039672852, "rewards/rejected": -3.9427707195281982, "step": 4570 }, { "epoch": 0.7618333333333334, "grad_norm": 22.727371215820312, "learning_rate": 2.830392176023455e-08, "logits/chosen": 2.7542715072631836, "logits/rejected": 2.8203256130218506, "logps/chosen": -47.65019989013672, "logps/rejected": -227.30628967285156, "loss": 0.5767, "nll_loss": 0.5605904459953308, "rewards/accuracies": 1.0, "rewards/chosen": 1.2415558099746704, "rewards/margins": 11.528420448303223, "rewards/rejected": -10.286864280700684, "step": 4571 }, { "epoch": 0.762, "grad_norm": 32.17079544067383, "learning_rate": 2.8266302539609744e-08, "logits/chosen": 2.2263855934143066, "logits/rejected": 2.5318562984466553, "logps/chosen": -67.89854431152344, "logps/rejected": -266.75982666015625, "loss": 0.8937, "nll_loss": 0.8817992806434631, "rewards/accuracies": 1.0, "rewards/chosen": 1.6487640142440796, "rewards/margins": 8.362569808959961, "rewards/rejected": -6.71380615234375, "step": 4572 }, { "epoch": 0.7621666666666667, "grad_norm": 39.128231048583984, "learning_rate": 2.822870422046225e-08, "logits/chosen": 1.8250175714492798, "logits/rejected": 1.6684343814849854, "logps/chosen": -34.738399505615234, "logps/rejected": -44.588539123535156, "loss": 0.7674, "nll_loss": 0.7237167358398438, "rewards/accuracies": 1.0, "rewards/chosen": 1.6728638410568237, "rewards/margins": 4.851193904876709, "rewards/rejected": -3.1783299446105957, "step": 4573 }, { "epoch": 0.7623333333333333, "grad_norm": 28.99407958984375, "learning_rate": 2.8191126813747247e-08, "logits/chosen": 3.5619423389434814, "logits/rejected": 3.666231870651245, "logps/chosen": -46.2121696472168, "logps/rejected": -162.80917358398438, "loss": 0.7814, "nll_loss": 0.7702027559280396, "rewards/accuracies": 1.0, "rewards/chosen": 1.7512147426605225, "rewards/margins": 8.152641296386719, "rewards/rejected": -6.401426315307617, "step": 4574 }, { "epoch": 0.7625, "grad_norm": 146.5443115234375, "learning_rate": 2.815357033041392e-08, "logits/chosen": 1.9552562236785889, "logits/rejected": 1.9607818126678467, "logps/chosen": -25.262012481689453, "logps/rejected": -13.313202857971191, "loss": 3.3629, "nll_loss": 0.4678151309490204, "rewards/accuracies": 0.0, "rewards/chosen": 1.8804420232772827, "rewards/margins": -2.1753768920898438, "rewards/rejected": 4.055819034576416, "step": 4575 }, { "epoch": 0.7626666666666667, "grad_norm": 27.64590835571289, "learning_rate": 2.811603478140534e-08, "logits/chosen": 2.2246365547180176, "logits/rejected": 2.094947576522827, "logps/chosen": -31.515588760375977, "logps/rejected": -54.58485794067383, "loss": 0.5715, "nll_loss": 0.5529051423072815, "rewards/accuracies": 1.0, "rewards/chosen": 2.29966139793396, "rewards/margins": 6.369463920593262, "rewards/rejected": -4.069802284240723, "step": 4576 }, { "epoch": 0.7628333333333334, "grad_norm": 51.925926208496094, "learning_rate": 2.8078520177658472e-08, "logits/chosen": 2.0484442710876465, "logits/rejected": 2.318133592605591, "logps/chosen": -81.04773712158203, "logps/rejected": -239.01608276367188, "loss": 0.987, "nll_loss": 0.8622101545333862, "rewards/accuracies": 1.0, "rewards/chosen": 2.4419610500335693, "rewards/margins": 4.041244029998779, "rewards/rejected": -1.59928297996521, "step": 4577 }, { "epoch": 0.763, "grad_norm": 50.67055892944336, "learning_rate": 2.804102653010414e-08, "logits/chosen": 2.326667308807373, "logits/rejected": 2.5144569873809814, "logps/chosen": -32.240623474121094, "logps/rejected": -199.787109375, "loss": 0.9032, "nll_loss": 0.8713682293891907, "rewards/accuracies": 1.0, "rewards/chosen": 0.5992328524589539, "rewards/margins": 6.77950382232666, "rewards/rejected": -6.180271148681641, "step": 4578 }, { "epoch": 0.7631666666666667, "grad_norm": 19.945402145385742, "learning_rate": 2.8003553849667094e-08, "logits/chosen": 1.6836738586425781, "logits/rejected": 0.8901464939117432, "logps/chosen": -151.03216552734375, "logps/rejected": -77.74171447753906, "loss": 1.0474, "nll_loss": 1.0416010618209839, "rewards/accuracies": 1.0, "rewards/chosen": 3.1263840198516846, "rewards/margins": 8.527761459350586, "rewards/rejected": -5.4013776779174805, "step": 4579 }, { "epoch": 0.7633333333333333, "grad_norm": 31.50678253173828, "learning_rate": 2.7966102147265992e-08, "logits/chosen": 2.666991710662842, "logits/rejected": 2.789053440093994, "logps/chosen": -7.01639986038208, "logps/rejected": -83.51708984375, "loss": 0.3154, "nll_loss": 0.29235002398490906, "rewards/accuracies": 1.0, "rewards/chosen": 0.9184444546699524, "rewards/margins": 7.668038845062256, "rewards/rejected": -6.749594211578369, "step": 4580 }, { "epoch": 0.7635, "grad_norm": 30.84379005432129, "learning_rate": 2.792867143381339e-08, "logits/chosen": 3.03132963180542, "logits/rejected": 3.019268274307251, "logps/chosen": -68.8268814086914, "logps/rejected": -85.94358825683594, "loss": 0.857, "nll_loss": 0.8393521904945374, "rewards/accuracies": 1.0, "rewards/chosen": 1.223713755607605, "rewards/margins": 7.807117938995361, "rewards/rejected": -6.583404064178467, "step": 4581 }, { "epoch": 0.7636666666666667, "grad_norm": 31.185413360595703, "learning_rate": 2.789126172021563e-08, "logits/chosen": 3.044360399246216, "logits/rejected": 3.1205222606658936, "logps/chosen": -51.83715057373047, "logps/rejected": -151.35446166992188, "loss": 0.6965, "nll_loss": 0.6732097268104553, "rewards/accuracies": 1.0, "rewards/chosen": 1.0903576612472534, "rewards/margins": 6.400711536407471, "rewards/rejected": -5.310353755950928, "step": 4582 }, { "epoch": 0.7638333333333334, "grad_norm": 24.324647903442383, "learning_rate": 2.785387301737304e-08, "logits/chosen": 1.8608359098434448, "logits/rejected": 2.259366512298584, "logps/chosen": -104.74507141113281, "logps/rejected": -204.37689208984375, "loss": 0.86, "nll_loss": 0.8379606008529663, "rewards/accuracies": 1.0, "rewards/chosen": 0.910054087638855, "rewards/margins": 11.429039001464844, "rewards/rejected": -10.5189847946167, "step": 4583 }, { "epoch": 0.764, "grad_norm": 29.1666316986084, "learning_rate": 2.7816505336179798e-08, "logits/chosen": 2.4242048263549805, "logits/rejected": 2.3996927738189697, "logps/chosen": -55.22904968261719, "logps/rejected": -109.0274658203125, "loss": 0.6575, "nll_loss": 0.634816586971283, "rewards/accuracies": 1.0, "rewards/chosen": 1.4431320428848267, "rewards/margins": 5.960579872131348, "rewards/rejected": -4.5174479484558105, "step": 4584 }, { "epoch": 0.7641666666666667, "grad_norm": 116.05133056640625, "learning_rate": 2.7779158687523962e-08, "logits/chosen": 2.914123296737671, "logits/rejected": 3.215402603149414, "logps/chosen": -25.51344108581543, "logps/rejected": -177.809326171875, "loss": 1.3037, "nll_loss": 1.2756720781326294, "rewards/accuracies": 1.0, "rewards/chosen": 0.6936830878257751, "rewards/margins": 7.6513590812683105, "rewards/rejected": -6.957675933837891, "step": 4585 }, { "epoch": 0.7643333333333333, "grad_norm": 36.113502502441406, "learning_rate": 2.7741833082287402e-08, "logits/chosen": 2.642455577850342, "logits/rejected": 2.659078359603882, "logps/chosen": -37.65458679199219, "logps/rejected": -189.33941650390625, "loss": 0.7145, "nll_loss": 0.71046382188797, "rewards/accuracies": 1.0, "rewards/chosen": 2.7900550365448, "rewards/margins": 10.06546401977539, "rewards/rejected": -7.27540922164917, "step": 4586 }, { "epoch": 0.7645, "grad_norm": 37.77365493774414, "learning_rate": 2.770452853134593e-08, "logits/chosen": 2.113901138305664, "logits/rejected": 2.3745975494384766, "logps/chosen": -16.250394821166992, "logps/rejected": -275.3575439453125, "loss": 0.5429, "nll_loss": 0.5416799187660217, "rewards/accuracies": 1.0, "rewards/chosen": 4.210756778717041, "rewards/margins": 11.868932723999023, "rewards/rejected": -7.658175945281982, "step": 4587 }, { "epoch": 0.7646666666666667, "grad_norm": 25.6176815032959, "learning_rate": 2.766724504556919e-08, "logits/chosen": 2.1961891651153564, "logits/rejected": 2.3074417114257812, "logps/chosen": -42.786956787109375, "logps/rejected": -165.9074249267578, "loss": 0.6257, "nll_loss": 0.6201007962226868, "rewards/accuracies": 1.0, "rewards/chosen": 2.4340507984161377, "rewards/margins": 9.621407508850098, "rewards/rejected": -7.187356472015381, "step": 4588 }, { "epoch": 0.7648333333333334, "grad_norm": 45.5870475769043, "learning_rate": 2.7629982635820703e-08, "logits/chosen": 3.266754627227783, "logits/rejected": 3.595034122467041, "logps/chosen": -24.10584831237793, "logps/rejected": -316.9777526855469, "loss": 0.903, "nll_loss": 0.8928091526031494, "rewards/accuracies": 1.0, "rewards/chosen": 4.4536004066467285, "rewards/margins": 8.733330726623535, "rewards/rejected": -4.279730319976807, "step": 4589 }, { "epoch": 0.765, "grad_norm": 28.22442054748535, "learning_rate": 2.7592741312957867e-08, "logits/chosen": 1.9884756803512573, "logits/rejected": 2.3382039070129395, "logps/chosen": -28.51544952392578, "logps/rejected": -171.95071411132812, "loss": 0.5802, "nll_loss": 0.5483739376068115, "rewards/accuracies": 1.0, "rewards/chosen": 1.2969379425048828, "rewards/margins": 5.278137683868408, "rewards/rejected": -3.9811997413635254, "step": 4590 }, { "epoch": 0.7651666666666667, "grad_norm": 27.493310928344727, "learning_rate": 2.7555521087831855e-08, "logits/chosen": 3.3117287158966064, "logits/rejected": 3.397275924682617, "logps/chosen": -20.730445861816406, "logps/rejected": -116.60012817382812, "loss": 0.5011, "nll_loss": 0.4821033775806427, "rewards/accuracies": 1.0, "rewards/chosen": 1.2843327522277832, "rewards/margins": 6.811704635620117, "rewards/rejected": -5.527371883392334, "step": 4591 }, { "epoch": 0.7653333333333333, "grad_norm": 68.5268783569336, "learning_rate": 2.7518321971287772e-08, "logits/chosen": 1.9256253242492676, "logits/rejected": 2.2609236240386963, "logps/chosen": -61.56903076171875, "logps/rejected": -367.4379577636719, "loss": 2.1377, "nll_loss": 2.123070001602173, "rewards/accuracies": 1.0, "rewards/chosen": 1.4314018487930298, "rewards/margins": 7.930339813232422, "rewards/rejected": -6.498938083648682, "step": 4592 }, { "epoch": 0.7655, "grad_norm": 31.520645141601562, "learning_rate": 2.7481143974164545e-08, "logits/chosen": 2.3865606784820557, "logits/rejected": 2.502659797668457, "logps/chosen": -23.57004737854004, "logps/rejected": -336.0262756347656, "loss": 0.5549, "nll_loss": 0.5356828570365906, "rewards/accuracies": 1.0, "rewards/chosen": 1.088319182395935, "rewards/margins": 8.316269874572754, "rewards/rejected": -7.2279510498046875, "step": 4593 }, { "epoch": 0.7656666666666667, "grad_norm": 128.64694213867188, "learning_rate": 2.7443987107294974e-08, "logits/chosen": 2.3660671710968018, "logits/rejected": 2.3795180320739746, "logps/chosen": -80.67266845703125, "logps/rejected": -106.8651351928711, "loss": 0.8909, "nll_loss": 0.8674480319023132, "rewards/accuracies": 1.0, "rewards/chosen": 1.076593041419983, "rewards/margins": 6.402627468109131, "rewards/rejected": -5.3260345458984375, "step": 4594 }, { "epoch": 0.7658333333333334, "grad_norm": 28.191234588623047, "learning_rate": 2.7406851381505635e-08, "logits/chosen": 2.6445186138153076, "logits/rejected": 2.9006173610687256, "logps/chosen": -12.586869239807129, "logps/rejected": -191.5421142578125, "loss": 0.3634, "nll_loss": 0.3496352434158325, "rewards/accuracies": 1.0, "rewards/chosen": 1.8074510097503662, "rewards/margins": 7.013602256774902, "rewards/rejected": -5.206151008605957, "step": 4595 }, { "epoch": 0.766, "grad_norm": 19.376794815063477, "learning_rate": 2.7369736807617015e-08, "logits/chosen": 1.7652570009231567, "logits/rejected": 1.863148808479309, "logps/chosen": -39.72908401489258, "logps/rejected": -160.09449768066406, "loss": 0.4511, "nll_loss": 0.43658336997032166, "rewards/accuracies": 1.0, "rewards/chosen": 1.4544823169708252, "rewards/margins": 7.862322807312012, "rewards/rejected": -6.407840728759766, "step": 4596 }, { "epoch": 0.7661666666666667, "grad_norm": 103.95941925048828, "learning_rate": 2.73326433964434e-08, "logits/chosen": 1.8573288917541504, "logits/rejected": 1.6361804008483887, "logps/chosen": -72.71712493896484, "logps/rejected": -12.501219749450684, "loss": 1.8933, "nll_loss": 0.7819045782089233, "rewards/accuracies": 1.0, "rewards/chosen": 2.9394989013671875, "rewards/margins": 0.5247633457183838, "rewards/rejected": 2.4147355556488037, "step": 4597 }, { "epoch": 0.7663333333333333, "grad_norm": 22.161327362060547, "learning_rate": 2.729557115879294e-08, "logits/chosen": 2.3552680015563965, "logits/rejected": 2.4884631633758545, "logps/chosen": -108.19776916503906, "logps/rejected": -317.2923583984375, "loss": 0.9534, "nll_loss": 0.9491032958030701, "rewards/accuracies": 1.0, "rewards/chosen": 2.738826036453247, "rewards/margins": 9.855015754699707, "rewards/rejected": -7.116189479827881, "step": 4598 }, { "epoch": 0.7665, "grad_norm": 42.94673156738281, "learning_rate": 2.7258520105467565e-08, "logits/chosen": 2.057603359222412, "logits/rejected": 2.104628086090088, "logps/chosen": -73.26029205322266, "logps/rejected": -102.09304809570312, "loss": 1.179, "nll_loss": 1.144692301750183, "rewards/accuracies": 1.0, "rewards/chosen": 1.3145179748535156, "rewards/margins": 5.133185863494873, "rewards/rejected": -3.8186678886413574, "step": 4599 }, { "epoch": 0.7666666666666667, "grad_norm": 26.34986114501953, "learning_rate": 2.722149024726307e-08, "logits/chosen": 2.2342653274536133, "logits/rejected": 2.6932194232940674, "logps/chosen": -39.21502685546875, "logps/rejected": -291.0227355957031, "loss": 0.6745, "nll_loss": 0.6535837054252625, "rewards/accuracies": 1.0, "rewards/chosen": 1.255548119544983, "rewards/margins": 6.434561729431152, "rewards/rejected": -5.179013729095459, "step": 4600 }, { "epoch": 0.7668333333333334, "grad_norm": 11.108457565307617, "learning_rate": 2.7184481594969066e-08, "logits/chosen": 1.8837884664535522, "logits/rejected": 1.7253636121749878, "logps/chosen": -264.7547607421875, "logps/rejected": -237.9767303466797, "loss": 0.7798, "nll_loss": 0.7764069437980652, "rewards/accuracies": 1.0, "rewards/chosen": 3.507251262664795, "rewards/margins": 9.571840286254883, "rewards/rejected": -6.064589023590088, "step": 4601 }, { "epoch": 0.767, "grad_norm": 329.7371520996094, "learning_rate": 2.7147494159369034e-08, "logits/chosen": 2.325856924057007, "logits/rejected": 1.9845713376998901, "logps/chosen": -107.49822235107422, "logps/rejected": -32.04032897949219, "loss": 3.8873, "nll_loss": 1.604451060295105, "rewards/accuracies": 0.0, "rewards/chosen": -0.6848739981651306, "rewards/margins": -1.9598069190979004, "rewards/rejected": 1.274932861328125, "step": 4602 }, { "epoch": 0.7671666666666667, "grad_norm": 51.47330856323242, "learning_rate": 2.7110527951240137e-08, "logits/chosen": 2.5563485622406006, "logits/rejected": 2.6753954887390137, "logps/chosen": -72.44530487060547, "logps/rejected": -140.79037475585938, "loss": 1.1077, "nll_loss": 1.065372109413147, "rewards/accuracies": 1.0, "rewards/chosen": 0.9772750735282898, "rewards/margins": 4.775277137756348, "rewards/rejected": -3.798002004623413, "step": 4603 }, { "epoch": 0.7673333333333333, "grad_norm": 27.388517379760742, "learning_rate": 2.7073582981353494e-08, "logits/chosen": 3.383819580078125, "logits/rejected": 3.553586721420288, "logps/chosen": -70.87680053710938, "logps/rejected": -282.74072265625, "loss": 0.8552, "nll_loss": 0.8437715172767639, "rewards/accuracies": 1.0, "rewards/chosen": 1.5972076654434204, "rewards/margins": 11.962480545043945, "rewards/rejected": -10.365272521972656, "step": 4604 }, { "epoch": 0.7675, "grad_norm": 26.384967803955078, "learning_rate": 2.703665926047397e-08, "logits/chosen": 3.30936598777771, "logits/rejected": 3.6099202632904053, "logps/chosen": -23.41571617126465, "logps/rejected": -103.49199676513672, "loss": 0.5185, "nll_loss": 0.5090372562408447, "rewards/accuracies": 1.0, "rewards/chosen": 2.4805071353912354, "rewards/margins": 7.548360824584961, "rewards/rejected": -5.0678534507751465, "step": 4605 }, { "epoch": 0.7676666666666667, "grad_norm": 180.27880859375, "learning_rate": 2.6999756799360297e-08, "logits/chosen": 2.5909981727600098, "logits/rejected": 2.471619129180908, "logps/chosen": -42.8088264465332, "logps/rejected": -12.897638320922852, "loss": 4.0075, "nll_loss": 0.7783423066139221, "rewards/accuracies": 0.0, "rewards/chosen": 2.110308885574341, "rewards/margins": -2.4873201847076416, "rewards/rejected": 4.597629070281982, "step": 4606 }, { "epoch": 0.7678333333333334, "grad_norm": 29.486356735229492, "learning_rate": 2.696287560876489e-08, "logits/chosen": 2.6059627532958984, "logits/rejected": 2.648874044418335, "logps/chosen": -92.3532943725586, "logps/rejected": -208.92660522460938, "loss": 1.2626, "nll_loss": 1.2480175495147705, "rewards/accuracies": 1.0, "rewards/chosen": 1.7353966236114502, "rewards/margins": 6.906261444091797, "rewards/rejected": -5.170865058898926, "step": 4607 }, { "epoch": 0.768, "grad_norm": 28.619022369384766, "learning_rate": 2.692601569943407e-08, "logits/chosen": 2.6639273166656494, "logits/rejected": 2.800366163253784, "logps/chosen": -67.28144836425781, "logps/rejected": -149.86300659179688, "loss": 0.9165, "nll_loss": 0.8737850189208984, "rewards/accuracies": 1.0, "rewards/chosen": 1.4263360500335693, "rewards/margins": 4.7890119552612305, "rewards/rejected": -3.362675666809082, "step": 4608 }, { "epoch": 0.7681666666666667, "grad_norm": 52.170162200927734, "learning_rate": 2.688917708210794e-08, "logits/chosen": 2.800788164138794, "logits/rejected": 2.820279598236084, "logps/chosen": -24.604503631591797, "logps/rejected": -60.57898712158203, "loss": 0.9666, "nll_loss": 0.9112778902053833, "rewards/accuracies": 1.0, "rewards/chosen": 0.17246055603027344, "rewards/margins": 4.915546894073486, "rewards/rejected": -4.743086338043213, "step": 4609 }, { "epoch": 0.7683333333333333, "grad_norm": 23.076398849487305, "learning_rate": 2.6852359767520383e-08, "logits/chosen": 2.4191699028015137, "logits/rejected": 2.614283323287964, "logps/chosen": -27.574806213378906, "logps/rejected": -182.25457763671875, "loss": 0.4322, "nll_loss": 0.41156426072120667, "rewards/accuracies": 1.0, "rewards/chosen": 2.127530813217163, "rewards/margins": 6.140880584716797, "rewards/rejected": -4.013350009918213, "step": 4610 }, { "epoch": 0.7685, "grad_norm": 27.329368591308594, "learning_rate": 2.681556376639912e-08, "logits/chosen": 2.275094985961914, "logits/rejected": 2.269594430923462, "logps/chosen": -61.26124572753906, "logps/rejected": -148.9541015625, "loss": 0.6844, "nll_loss": 0.6315591335296631, "rewards/accuracies": 1.0, "rewards/chosen": 2.862147808074951, "rewards/margins": 5.453559875488281, "rewards/rejected": -2.591412305831909, "step": 4611 }, { "epoch": 0.7686666666666667, "grad_norm": 69.41035461425781, "learning_rate": 2.677878908946555e-08, "logits/chosen": 2.8769776821136475, "logits/rejected": 3.0137245655059814, "logps/chosen": -7.738081932067871, "logps/rejected": -172.16903686523438, "loss": 0.6122, "nll_loss": 0.5952370166778564, "rewards/accuracies": 1.0, "rewards/chosen": 1.3744800090789795, "rewards/margins": 7.115002632141113, "rewards/rejected": -5.740522861480713, "step": 4612 }, { "epoch": 0.7688333333333334, "grad_norm": 148.5750274658203, "learning_rate": 2.6742035747434965e-08, "logits/chosen": 2.5939929485321045, "logits/rejected": 2.7172553539276123, "logps/chosen": -100.06309509277344, "logps/rejected": -9.93710994720459, "loss": 2.5326, "nll_loss": 1.0995945930480957, "rewards/accuracies": 0.0, "rewards/chosen": 2.3319427967071533, "rewards/margins": -0.24639534950256348, "rewards/rejected": 2.578338146209717, "step": 4613 }, { "epoch": 0.769, "grad_norm": 27.984310150146484, "learning_rate": 2.6705303751016406e-08, "logits/chosen": 2.057539224624634, "logits/rejected": 1.8746027946472168, "logps/chosen": -31.629737854003906, "logps/rejected": -54.60987854003906, "loss": 0.5735, "nll_loss": 0.5549077391624451, "rewards/accuracies": 1.0, "rewards/chosen": 2.2882466316223145, "rewards/margins": 6.360550880432129, "rewards/rejected": -4.0723042488098145, "step": 4614 }, { "epoch": 0.7691666666666667, "grad_norm": 39.46418762207031, "learning_rate": 2.666859311091273e-08, "logits/chosen": 2.627443313598633, "logits/rejected": 2.7743418216705322, "logps/chosen": -36.84236145019531, "logps/rejected": -252.27035522460938, "loss": 0.8921, "nll_loss": 0.8771992921829224, "rewards/accuracies": 1.0, "rewards/chosen": 1.4177166223526, "rewards/margins": 7.915589809417725, "rewards/rejected": -6.497873306274414, "step": 4615 }, { "epoch": 0.7693333333333333, "grad_norm": 45.27139663696289, "learning_rate": 2.663190383782048e-08, "logits/chosen": 1.8706623315811157, "logits/rejected": 2.7127957344055176, "logps/chosen": -22.001060485839844, "logps/rejected": -178.30581665039062, "loss": 0.8216, "nll_loss": 0.7857521772384644, "rewards/accuracies": 1.0, "rewards/chosen": 0.9441956281661987, "rewards/margins": 5.158514499664307, "rewards/rejected": -4.214318752288818, "step": 4616 }, { "epoch": 0.7695, "grad_norm": 91.13150787353516, "learning_rate": 2.659523594243004e-08, "logits/chosen": 3.5954430103302, "logits/rejected": 3.6656973361968994, "logps/chosen": -14.56648063659668, "logps/rejected": -62.29225540161133, "loss": 0.7813, "nll_loss": 0.7666569352149963, "rewards/accuracies": 1.0, "rewards/chosen": 2.1899492740631104, "rewards/margins": 6.7202653884887695, "rewards/rejected": -4.53031587600708, "step": 4617 }, { "epoch": 0.7696666666666667, "grad_norm": 24.4045467376709, "learning_rate": 2.6558589435425573e-08, "logits/chosen": 2.6861109733581543, "logits/rejected": 3.1242175102233887, "logps/chosen": -63.5230827331543, "logps/rejected": -214.88290405273438, "loss": 0.6945, "nll_loss": 0.6830438375473022, "rewards/accuracies": 1.0, "rewards/chosen": 1.6550571918487549, "rewards/margins": 8.741779327392578, "rewards/rejected": -7.086722373962402, "step": 4618 }, { "epoch": 0.7698333333333334, "grad_norm": 65.6714096069336, "learning_rate": 2.6521964327485013e-08, "logits/chosen": 1.5296165943145752, "logits/rejected": 1.8521473407745361, "logps/chosen": -6.482402801513672, "logps/rejected": -55.358238220214844, "loss": 0.4711, "nll_loss": 0.4321601986885071, "rewards/accuracies": 1.0, "rewards/chosen": 1.6882301568984985, "rewards/margins": 5.023757457733154, "rewards/rejected": -3.335527181625366, "step": 4619 }, { "epoch": 0.77, "grad_norm": 141.59292602539062, "learning_rate": 2.6485360629279985e-08, "logits/chosen": 2.4514963626861572, "logits/rejected": 2.820132255554199, "logps/chosen": -9.414990425109863, "logps/rejected": -54.094268798828125, "loss": 0.5846, "nll_loss": 0.31383296847343445, "rewards/accuracies": 1.0, "rewards/chosen": -0.4544937312602997, "rewards/margins": 1.7336483001708984, "rewards/rejected": -2.1881420612335205, "step": 4620 }, { "epoch": 0.7701666666666667, "grad_norm": 29.434097290039062, "learning_rate": 2.6448778351475944e-08, "logits/chosen": 3.283679962158203, "logits/rejected": 3.188389778137207, "logps/chosen": -96.12651062011719, "logps/rejected": -86.01805114746094, "loss": 1.1384, "nll_loss": 1.1177499294281006, "rewards/accuracies": 1.0, "rewards/chosen": 1.1634186506271362, "rewards/margins": 6.798483848571777, "rewards/rejected": -5.635065078735352, "step": 4621 }, { "epoch": 0.7703333333333333, "grad_norm": 115.37039947509766, "learning_rate": 2.6412217504732092e-08, "logits/chosen": 2.670914888381958, "logits/rejected": 2.758270502090454, "logps/chosen": -66.69732666015625, "logps/rejected": -73.5442886352539, "loss": 3.1792, "nll_loss": 3.0316970348358154, "rewards/accuracies": 1.0, "rewards/chosen": -0.33975642919540405, "rewards/margins": 2.7430148124694824, "rewards/rejected": -3.0827713012695312, "step": 4622 }, { "epoch": 0.7705, "grad_norm": 192.35037231445312, "learning_rate": 2.6375678099701427e-08, "logits/chosen": 2.0102860927581787, "logits/rejected": 2.2891461849212646, "logps/chosen": -96.90758514404297, "logps/rejected": -128.43576049804688, "loss": 3.1006, "nll_loss": 2.484809637069702, "rewards/accuracies": 1.0, "rewards/chosen": -2.977403402328491, "rewards/margins": 1.1236655712127686, "rewards/rejected": -4.10106897354126, "step": 4623 }, { "epoch": 0.7706666666666667, "grad_norm": 33.37943649291992, "learning_rate": 2.633916014703057e-08, "logits/chosen": 2.251131296157837, "logits/rejected": 2.374732255935669, "logps/chosen": -40.30911636352539, "logps/rejected": -322.6365966796875, "loss": 0.6853, "nll_loss": 0.671818733215332, "rewards/accuracies": 1.0, "rewards/chosen": 1.477669596672058, "rewards/margins": 8.587701797485352, "rewards/rejected": -7.110032558441162, "step": 4624 }, { "epoch": 0.7708333333333334, "grad_norm": 46.765193939208984, "learning_rate": 2.6302663657360035e-08, "logits/chosen": 1.7975971698760986, "logits/rejected": 2.0378406047821045, "logps/chosen": -11.191473007202148, "logps/rejected": -195.52398681640625, "loss": 0.4376, "nll_loss": 0.43044131994247437, "rewards/accuracies": 1.0, "rewards/chosen": 2.1016478538513184, "rewards/margins": 10.26723861694336, "rewards/rejected": -8.1655912399292, "step": 4625 }, { "epoch": 0.771, "grad_norm": 57.28536605834961, "learning_rate": 2.6266188641323994e-08, "logits/chosen": 1.414038896560669, "logits/rejected": 1.3412024974822998, "logps/chosen": -138.36231994628906, "logps/rejected": -69.2027359008789, "loss": 1.1161, "nll_loss": 0.7861494421958923, "rewards/accuracies": 1.0, "rewards/chosen": 2.9338912963867188, "rewards/margins": 3.0054543018341064, "rewards/rejected": -0.07156296074390411, "step": 4626 }, { "epoch": 0.7711666666666667, "grad_norm": 36.797115325927734, "learning_rate": 2.6229735109550433e-08, "logits/chosen": 2.382014036178589, "logits/rejected": 2.8570659160614014, "logps/chosen": -74.41886138916016, "logps/rejected": -472.2318115234375, "loss": 1.3637, "nll_loss": 1.3530703783035278, "rewards/accuracies": 1.0, "rewards/chosen": 1.6657829284667969, "rewards/margins": 11.291805267333984, "rewards/rejected": -9.626022338867188, "step": 4627 }, { "epoch": 0.7713333333333333, "grad_norm": 83.48477172851562, "learning_rate": 2.6193303072660977e-08, "logits/chosen": 3.072589635848999, "logits/rejected": 2.9924142360687256, "logps/chosen": -51.16850280761719, "logps/rejected": -34.63630676269531, "loss": 1.304, "nll_loss": 0.7309785485267639, "rewards/accuracies": 1.0, "rewards/chosen": 1.2673122882843018, "rewards/margins": 1.0951721668243408, "rewards/rejected": 0.17214013636112213, "step": 4628 }, { "epoch": 0.7715, "grad_norm": 64.3587417602539, "learning_rate": 2.615689254127108e-08, "logits/chosen": 2.6502108573913574, "logits/rejected": 2.219449996948242, "logps/chosen": -53.213829040527344, "logps/rejected": -32.65850067138672, "loss": 1.7238, "nll_loss": 1.565112590789795, "rewards/accuracies": 1.0, "rewards/chosen": 0.6070019006729126, "rewards/margins": 2.6443567276000977, "rewards/rejected": -2.0373547077178955, "step": 4629 }, { "epoch": 0.7716666666666666, "grad_norm": 37.60184097290039, "learning_rate": 2.612050352598989e-08, "logits/chosen": 1.9416954517364502, "logits/rejected": 1.6304621696472168, "logps/chosen": -96.3959732055664, "logps/rejected": -142.42984008789062, "loss": 1.1207, "nll_loss": 1.1079996824264526, "rewards/accuracies": 1.0, "rewards/chosen": 1.7113289833068848, "rewards/margins": 7.504127502441406, "rewards/rejected": -5.7927985191345215, "step": 4630 }, { "epoch": 0.7718333333333334, "grad_norm": 23.309768676757812, "learning_rate": 2.6084136037420302e-08, "logits/chosen": 1.5811063051223755, "logits/rejected": 1.906029224395752, "logps/chosen": -39.62982177734375, "logps/rejected": -300.4935302734375, "loss": 0.5455, "nll_loss": 0.5428743362426758, "rewards/accuracies": 1.0, "rewards/chosen": 3.092982530593872, "rewards/margins": 12.389432907104492, "rewards/rejected": -9.2964506149292, "step": 4631 }, { "epoch": 0.772, "grad_norm": 28.80760955810547, "learning_rate": 2.604779008615895e-08, "logits/chosen": 1.700402021408081, "logits/rejected": 1.3391563892364502, "logps/chosen": -90.97234344482422, "logps/rejected": -131.8177947998047, "loss": 0.9909, "nll_loss": 0.9677909016609192, "rewards/accuracies": 1.0, "rewards/chosen": 1.4907509088516235, "rewards/margins": 5.89212703704834, "rewards/rejected": -4.401376247406006, "step": 4632 }, { "epoch": 0.7721666666666667, "grad_norm": 38.92247772216797, "learning_rate": 2.6011465682796107e-08, "logits/chosen": 2.538951873779297, "logits/rejected": 2.3844118118286133, "logps/chosen": -45.27721405029297, "logps/rejected": -121.81509399414062, "loss": 0.9004, "nll_loss": 0.8707157373428345, "rewards/accuracies": 1.0, "rewards/chosen": 0.6467670798301697, "rewards/margins": 7.208849906921387, "rewards/rejected": -6.562082767486572, "step": 4633 }, { "epoch": 0.7723333333333333, "grad_norm": 59.508026123046875, "learning_rate": 2.5975162837915876e-08, "logits/chosen": 2.2945170402526855, "logits/rejected": 2.2066657543182373, "logps/chosen": -75.37442016601562, "logps/rejected": -76.23973083496094, "loss": 1.2403, "nll_loss": 1.0616114139556885, "rewards/accuracies": 1.0, "rewards/chosen": -0.08623581379652023, "rewards/margins": 2.3663454055786133, "rewards/rejected": -2.4525811672210693, "step": 4634 }, { "epoch": 0.7725, "grad_norm": 28.124603271484375, "learning_rate": 2.5938881562096027e-08, "logits/chosen": 2.5790867805480957, "logits/rejected": 2.5844619274139404, "logps/chosen": -92.8632583618164, "logps/rejected": -181.45730590820312, "loss": 1.1277, "nll_loss": 1.1188344955444336, "rewards/accuracies": 1.0, "rewards/chosen": 1.8750367164611816, "rewards/margins": 10.036535263061523, "rewards/rejected": -8.161498069763184, "step": 4635 }, { "epoch": 0.7726666666666666, "grad_norm": 41.306175231933594, "learning_rate": 2.5902621865908046e-08, "logits/chosen": 3.0401222705841064, "logits/rejected": 2.9004464149475098, "logps/chosen": -33.621337890625, "logps/rejected": -46.64181137084961, "loss": 0.8237, "nll_loss": 0.7818916440010071, "rewards/accuracies": 1.0, "rewards/chosen": 1.4304847717285156, "rewards/margins": 4.821416854858398, "rewards/rejected": -3.3909318447113037, "step": 4636 }, { "epoch": 0.7728333333333334, "grad_norm": 24.896835327148438, "learning_rate": 2.5866383759917155e-08, "logits/chosen": 2.5877082347869873, "logits/rejected": 2.478470802307129, "logps/chosen": -74.89778900146484, "logps/rejected": -76.12248992919922, "loss": 0.9121, "nll_loss": 0.9023830890655518, "rewards/accuracies": 1.0, "rewards/chosen": 2.8445956707000732, "rewards/margins": 7.5866241455078125, "rewards/rejected": -4.742028713226318, "step": 4637 }, { "epoch": 0.773, "grad_norm": 17.757301330566406, "learning_rate": 2.5830167254682257e-08, "logits/chosen": 1.2454099655151367, "logits/rejected": 1.3605988025665283, "logps/chosen": -153.55526733398438, "logps/rejected": -209.97276306152344, "loss": 0.7965, "nll_loss": 0.779468297958374, "rewards/accuracies": 1.0, "rewards/chosen": 1.456512451171875, "rewards/margins": 6.832259178161621, "rewards/rejected": -5.375746726989746, "step": 4638 }, { "epoch": 0.7731666666666667, "grad_norm": 414.72906494140625, "learning_rate": 2.5793972360756e-08, "logits/chosen": 2.742483139038086, "logits/rejected": 2.0788521766662598, "logps/chosen": -176.31373596191406, "logps/rejected": -63.950286865234375, "loss": 2.5683, "nll_loss": 1.2416459321975708, "rewards/accuracies": 0.0, "rewards/chosen": -2.549992561340332, "rewards/margins": -0.7780977487564087, "rewards/rejected": -1.7718948125839233, "step": 4639 }, { "epoch": 0.7733333333333333, "grad_norm": 22.90376853942871, "learning_rate": 2.575779908868465e-08, "logits/chosen": 1.847211241722107, "logits/rejected": 1.65060293674469, "logps/chosen": -75.68436431884766, "logps/rejected": -150.7533416748047, "loss": 0.8259, "nll_loss": 0.8138104677200317, "rewards/accuracies": 1.0, "rewards/chosen": 1.5879143476486206, "rewards/margins": 8.70538330078125, "rewards/rejected": -7.11746883392334, "step": 4640 }, { "epoch": 0.7735, "grad_norm": 27.128873825073242, "learning_rate": 2.5721647449008265e-08, "logits/chosen": 2.7868800163269043, "logits/rejected": 2.846318244934082, "logps/chosen": -43.68881607055664, "logps/rejected": -344.5888366699219, "loss": 0.7054, "nll_loss": 0.7046583294868469, "rewards/accuracies": 1.0, "rewards/chosen": 4.598052024841309, "rewards/margins": 13.145009994506836, "rewards/rejected": -8.546957969665527, "step": 4641 }, { "epoch": 0.7736666666666666, "grad_norm": 33.8100471496582, "learning_rate": 2.5685517452260564e-08, "logits/chosen": 1.9788535833358765, "logits/rejected": 2.469205141067505, "logps/chosen": -34.771095275878906, "logps/rejected": -182.13516235351562, "loss": 0.761, "nll_loss": 0.7558935284614563, "rewards/accuracies": 1.0, "rewards/chosen": 2.765018939971924, "rewards/margins": 8.982070922851562, "rewards/rejected": -6.217052459716797, "step": 4642 }, { "epoch": 0.7738333333333334, "grad_norm": 28.958410263061523, "learning_rate": 2.5649409108968977e-08, "logits/chosen": 1.0892056226730347, "logits/rejected": 2.2864181995391846, "logps/chosen": -9.305708885192871, "logps/rejected": -366.87164306640625, "loss": 0.3084, "nll_loss": 0.2908034026622772, "rewards/accuracies": 1.0, "rewards/chosen": 1.2462714910507202, "rewards/margins": 7.5264573097229, "rewards/rejected": -6.280185699462891, "step": 4643 }, { "epoch": 0.774, "grad_norm": 27.164831161499023, "learning_rate": 2.561332242965457e-08, "logits/chosen": 2.8496036529541016, "logits/rejected": 2.878324508666992, "logps/chosen": -53.004817962646484, "logps/rejected": -134.5434112548828, "loss": 0.7114, "nll_loss": 0.6974318027496338, "rewards/accuracies": 1.0, "rewards/chosen": 1.4578931331634521, "rewards/margins": 8.194948196411133, "rewards/rejected": -6.73705530166626, "step": 4644 }, { "epoch": 0.7741666666666667, "grad_norm": 22.16531753540039, "learning_rate": 2.5577257424832145e-08, "logits/chosen": 1.2622628211975098, "logits/rejected": 1.937339186668396, "logps/chosen": -63.133567810058594, "logps/rejected": -223.32716369628906, "loss": 0.7068, "nll_loss": 0.7014840841293335, "rewards/accuracies": 1.0, "rewards/chosen": 2.6144633293151855, "rewards/margins": 9.089889526367188, "rewards/rejected": -6.475425720214844, "step": 4645 }, { "epoch": 0.7743333333333333, "grad_norm": 23.016414642333984, "learning_rate": 2.554121410501019e-08, "logits/chosen": 1.6767783164978027, "logits/rejected": 2.294585704803467, "logps/chosen": -15.51749038696289, "logps/rejected": -242.8030242919922, "loss": 0.3473, "nll_loss": 0.33733677864074707, "rewards/accuracies": 1.0, "rewards/chosen": 1.7286396026611328, "rewards/margins": 11.517627716064453, "rewards/rejected": -9.78898811340332, "step": 4646 }, { "epoch": 0.7745, "grad_norm": 65.38212585449219, "learning_rate": 2.5505192480690864e-08, "logits/chosen": 2.569143295288086, "logits/rejected": 2.7238669395446777, "logps/chosen": -30.911916732788086, "logps/rejected": -71.13980865478516, "loss": 0.9273, "nll_loss": 0.6439982056617737, "rewards/accuracies": 1.0, "rewards/chosen": 0.969426155090332, "rewards/margins": 2.0013933181762695, "rewards/rejected": -1.0319671630859375, "step": 4647 }, { "epoch": 0.7746666666666666, "grad_norm": 17.418001174926758, "learning_rate": 2.5469192562370022e-08, "logits/chosen": 2.990380048751831, "logits/rejected": 2.979541063308716, "logps/chosen": -246.52947998046875, "logps/rejected": -130.6071319580078, "loss": 0.8307, "nll_loss": 0.7926992177963257, "rewards/accuracies": 1.0, "rewards/chosen": 2.4931886196136475, "rewards/margins": 5.551746368408203, "rewards/rejected": -3.0585579872131348, "step": 4648 }, { "epoch": 0.7748333333333334, "grad_norm": 32.13876724243164, "learning_rate": 2.5433214360537124e-08, "logits/chosen": 1.8764878511428833, "logits/rejected": 1.5096046924591064, "logps/chosen": -69.43025207519531, "logps/rejected": -65.16313171386719, "loss": 0.9473, "nll_loss": 0.9257367849349976, "rewards/accuracies": 1.0, "rewards/chosen": 1.6136049032211304, "rewards/margins": 6.005547523498535, "rewards/rejected": -4.391942501068115, "step": 4649 }, { "epoch": 0.775, "grad_norm": 33.27042007446289, "learning_rate": 2.5397257885675393e-08, "logits/chosen": 2.0616657733917236, "logits/rejected": 1.8768696784973145, "logps/chosen": -105.24430084228516, "logps/rejected": -67.89398956298828, "loss": 1.1333, "nll_loss": 1.107834815979004, "rewards/accuracies": 1.0, "rewards/chosen": 1.4526580572128296, "rewards/margins": 5.697581768035889, "rewards/rejected": -4.2449235916137695, "step": 4650 }, { "epoch": 0.7751666666666667, "grad_norm": 14.797056198120117, "learning_rate": 2.536132314826166e-08, "logits/chosen": 1.1663472652435303, "logits/rejected": 2.0761866569519043, "logps/chosen": -140.30715942382812, "logps/rejected": -343.78912353515625, "loss": 0.6014, "nll_loss": 0.5895259380340576, "rewards/accuracies": 1.0, "rewards/chosen": 1.5721254348754883, "rewards/margins": 9.720184326171875, "rewards/rejected": -8.148058891296387, "step": 4651 }, { "epoch": 0.7753333333333333, "grad_norm": 24.921777725219727, "learning_rate": 2.5325410158766504e-08, "logits/chosen": 1.4435878992080688, "logits/rejected": 2.1311533451080322, "logps/chosen": -86.15705108642578, "logps/rejected": -364.03265380859375, "loss": 0.7631, "nll_loss": 0.7427331805229187, "rewards/accuracies": 1.0, "rewards/chosen": 1.0494911670684814, "rewards/margins": 7.719999313354492, "rewards/rejected": -6.670507907867432, "step": 4652 }, { "epoch": 0.7755, "grad_norm": 15.929131507873535, "learning_rate": 2.5289518927654018e-08, "logits/chosen": 1.3823643922805786, "logits/rejected": 1.4324678182601929, "logps/chosen": -120.20113372802734, "logps/rejected": -183.4405059814453, "loss": 0.6444, "nll_loss": 0.6427867412567139, "rewards/accuracies": 1.0, "rewards/chosen": 3.6915993690490723, "rewards/margins": 12.126667022705078, "rewards/rejected": -8.435067176818848, "step": 4653 }, { "epoch": 0.7756666666666666, "grad_norm": 18.69548225402832, "learning_rate": 2.52536494653821e-08, "logits/chosen": 2.6608951091766357, "logits/rejected": 2.9745330810546875, "logps/chosen": -69.64903259277344, "logps/rejected": -546.7174072265625, "loss": 0.6926, "nll_loss": 0.6895943880081177, "rewards/accuracies": 1.0, "rewards/chosen": 2.9700489044189453, "rewards/margins": 11.70378589630127, "rewards/rejected": -8.733736991882324, "step": 4654 }, { "epoch": 0.7758333333333334, "grad_norm": 26.32942008972168, "learning_rate": 2.5217801782402236e-08, "logits/chosen": 2.195126533508301, "logits/rejected": 2.381852626800537, "logps/chosen": -43.53043746948242, "logps/rejected": -163.59144592285156, "loss": 0.637, "nll_loss": 0.6308758854866028, "rewards/accuracies": 1.0, "rewards/chosen": 2.3597028255462646, "rewards/margins": 9.315462112426758, "rewards/rejected": -6.955759048461914, "step": 4655 }, { "epoch": 0.776, "grad_norm": 22.174123764038086, "learning_rate": 2.5181975889159613e-08, "logits/chosen": 2.9840877056121826, "logits/rejected": 2.9727630615234375, "logps/chosen": -63.642173767089844, "logps/rejected": -129.8956298828125, "loss": 0.7, "nll_loss": 0.6917627453804016, "rewards/accuracies": 1.0, "rewards/chosen": 2.0871360301971436, "rewards/margins": 8.561406135559082, "rewards/rejected": -6.474270343780518, "step": 4656 }, { "epoch": 0.7761666666666667, "grad_norm": 30.393220901489258, "learning_rate": 2.5146171796092975e-08, "logits/chosen": 1.4064337015151978, "logits/rejected": 1.3597899675369263, "logps/chosen": -30.247222900390625, "logps/rejected": -66.9302749633789, "loss": 0.5583, "nll_loss": 0.512664794921875, "rewards/accuracies": 1.0, "rewards/chosen": 1.4680169820785522, "rewards/margins": 4.706584930419922, "rewards/rejected": -3.238568067550659, "step": 4657 }, { "epoch": 0.7763333333333333, "grad_norm": 26.65452003479004, "learning_rate": 2.511038951363481e-08, "logits/chosen": 1.5382119417190552, "logits/rejected": 1.6246061325073242, "logps/chosen": -98.56260681152344, "logps/rejected": -126.19728088378906, "loss": 1.0151, "nll_loss": 1.005741000175476, "rewards/accuracies": 1.0, "rewards/chosen": 1.9280518293380737, "rewards/margins": 8.454660415649414, "rewards/rejected": -6.526608467102051, "step": 4658 }, { "epoch": 0.7765, "grad_norm": 21.991226196289062, "learning_rate": 2.5074629052211215e-08, "logits/chosen": 2.2245562076568604, "logits/rejected": 2.18815279006958, "logps/chosen": -83.62571716308594, "logps/rejected": -207.87689208984375, "loss": 0.7557, "nll_loss": 0.7400505542755127, "rewards/accuracies": 1.0, "rewards/chosen": 2.344595432281494, "rewards/margins": 6.648001194000244, "rewards/rejected": -4.30340576171875, "step": 4659 }, { "epoch": 0.7766666666666666, "grad_norm": 26.838708877563477, "learning_rate": 2.5038890422241955e-08, "logits/chosen": 2.34456467628479, "logits/rejected": 2.456674575805664, "logps/chosen": -23.688980102539062, "logps/rejected": -141.58287048339844, "loss": 0.5031, "nll_loss": 0.4935204088687897, "rewards/accuracies": 1.0, "rewards/chosen": 2.214219331741333, "rewards/margins": 7.643609046936035, "rewards/rejected": -5.429389476776123, "step": 4660 }, { "epoch": 0.7768333333333334, "grad_norm": 320.98345947265625, "learning_rate": 2.5003173634140353e-08, "logits/chosen": 0.547968864440918, "logits/rejected": 1.4609063863754272, "logps/chosen": -16.755802154541016, "logps/rejected": -233.5609130859375, "loss": 2.4338, "nll_loss": 2.393686056137085, "rewards/accuracies": 1.0, "rewards/chosen": 0.27466699481010437, "rewards/margins": 7.814289569854736, "rewards/rejected": -7.539622783660889, "step": 4661 }, { "epoch": 0.777, "grad_norm": 21.374061584472656, "learning_rate": 2.4967478698313448e-08, "logits/chosen": 1.8993234634399414, "logits/rejected": 2.2424535751342773, "logps/chosen": -52.764747619628906, "logps/rejected": -156.4710235595703, "loss": 0.6037, "nll_loss": 0.5928623676300049, "rewards/accuracies": 1.0, "rewards/chosen": 2.4065651893615723, "rewards/margins": 7.295591354370117, "rewards/rejected": -4.889026165008545, "step": 4662 }, { "epoch": 0.7771666666666667, "grad_norm": 37.86899948120117, "learning_rate": 2.4931805625161893e-08, "logits/chosen": 1.2842955589294434, "logits/rejected": 2.1382601261138916, "logps/chosen": -57.37553024291992, "logps/rejected": -177.715087890625, "loss": 1.0893, "nll_loss": 1.082557201385498, "rewards/accuracies": 1.0, "rewards/chosen": 2.2615811824798584, "rewards/margins": 9.044720649719238, "rewards/rejected": -6.783139228820801, "step": 4663 }, { "epoch": 0.7773333333333333, "grad_norm": 31.570270538330078, "learning_rate": 2.4896154425079997e-08, "logits/chosen": 1.3785583972930908, "logits/rejected": 2.0831027030944824, "logps/chosen": -36.17680358886719, "logps/rejected": -229.67205810546875, "loss": 0.736, "nll_loss": 0.7235360145568848, "rewards/accuracies": 1.0, "rewards/chosen": 1.5573548078536987, "rewards/margins": 8.595827102661133, "rewards/rejected": -7.0384721755981445, "step": 4664 }, { "epoch": 0.7775, "grad_norm": 35.79064178466797, "learning_rate": 2.4860525108455598e-08, "logits/chosen": 1.9585342407226562, "logits/rejected": 2.1484763622283936, "logps/chosen": -32.61915588378906, "logps/rejected": -135.43765258789062, "loss": 0.943, "nll_loss": 0.9319759011268616, "rewards/accuracies": 1.0, "rewards/chosen": 3.3059310913085938, "rewards/margins": 7.688395977020264, "rewards/rejected": -4.38246488571167, "step": 4665 }, { "epoch": 0.7776666666666666, "grad_norm": 28.38429069519043, "learning_rate": 2.4824917685670255e-08, "logits/chosen": 2.295307159423828, "logits/rejected": 2.3917672634124756, "logps/chosen": -150.88462829589844, "logps/rejected": -289.5970458984375, "loss": 1.4262, "nll_loss": 1.3970801830291748, "rewards/accuracies": 1.0, "rewards/chosen": 1.3502976894378662, "rewards/margins": 5.442042350769043, "rewards/rejected": -4.091744899749756, "step": 4666 }, { "epoch": 0.7778333333333334, "grad_norm": 105.58909606933594, "learning_rate": 2.478933216709913e-08, "logits/chosen": 3.0345661640167236, "logits/rejected": 2.8900046348571777, "logps/chosen": -181.74449157714844, "logps/rejected": -135.93258666992188, "loss": 1.7151, "nll_loss": 1.489708662033081, "rewards/accuracies": 1.0, "rewards/chosen": -1.08299720287323, "rewards/margins": 2.259857177734375, "rewards/rejected": -3.3428544998168945, "step": 4667 }, { "epoch": 0.778, "grad_norm": 43.135040283203125, "learning_rate": 2.4753768563110968e-08, "logits/chosen": 2.188926935195923, "logits/rejected": 2.322124719619751, "logps/chosen": -62.03278350830078, "logps/rejected": -208.84962463378906, "loss": 1.1739, "nll_loss": 1.1704298257827759, "rewards/accuracies": 1.0, "rewards/chosen": 3.2554421424865723, "rewards/margins": 9.606283187866211, "rewards/rejected": -6.350841045379639, "step": 4668 }, { "epoch": 0.7781666666666667, "grad_norm": 15.563392639160156, "learning_rate": 2.4718226884068182e-08, "logits/chosen": 2.9327547550201416, "logits/rejected": 3.0656321048736572, "logps/chosen": -217.56378173828125, "logps/rejected": -384.5006103515625, "loss": 0.9668, "nll_loss": 0.96267169713974, "rewards/accuracies": 1.0, "rewards/chosen": 2.9153993129730225, "rewards/margins": 9.514985084533691, "rewards/rejected": -6.59958553314209, "step": 4669 }, { "epoch": 0.7783333333333333, "grad_norm": 54.21376037597656, "learning_rate": 2.4682707140326707e-08, "logits/chosen": 1.603545069694519, "logits/rejected": 2.2246856689453125, "logps/chosen": -9.533238410949707, "logps/rejected": -159.3785858154297, "loss": 0.4232, "nll_loss": 0.41448861360549927, "rewards/accuracies": 1.0, "rewards/chosen": 2.116861343383789, "rewards/margins": 8.118948936462402, "rewards/rejected": -6.002087593078613, "step": 4670 }, { "epoch": 0.7785, "grad_norm": 20.074710845947266, "learning_rate": 2.4647209342236187e-08, "logits/chosen": 2.4616127014160156, "logits/rejected": 2.7739036083221436, "logps/chosen": -49.76683044433594, "logps/rejected": -351.602783203125, "loss": 0.555, "nll_loss": 0.5468882322311401, "rewards/accuracies": 1.0, "rewards/chosen": 2.2834465503692627, "rewards/margins": 8.072548866271973, "rewards/rejected": -5.789102554321289, "step": 4671 }, { "epoch": 0.7786666666666666, "grad_norm": 26.934757232666016, "learning_rate": 2.461173350013981e-08, "logits/chosen": 2.3034920692443848, "logits/rejected": 2.373068332672119, "logps/chosen": -45.089996337890625, "logps/rejected": -179.59774780273438, "loss": 0.5565, "nll_loss": 0.5432529449462891, "rewards/accuracies": 1.0, "rewards/chosen": 1.7325220108032227, "rewards/margins": 7.2528886795043945, "rewards/rejected": -5.520366668701172, "step": 4672 }, { "epoch": 0.7788333333333334, "grad_norm": 20.178865432739258, "learning_rate": 2.457627962437442e-08, "logits/chosen": 2.6402909755706787, "logits/rejected": 2.485602378845215, "logps/chosen": -114.30740356445312, "logps/rejected": -74.1793212890625, "loss": 0.9333, "nll_loss": 0.9293285608291626, "rewards/accuracies": 1.0, "rewards/chosen": 3.1006577014923096, "rewards/margins": 9.375711441040039, "rewards/rejected": -6.27505350112915, "step": 4673 }, { "epoch": 0.779, "grad_norm": 23.89954376220703, "learning_rate": 2.4540847725270375e-08, "logits/chosen": 3.0891690254211426, "logits/rejected": 3.032090902328491, "logps/chosen": -26.346302032470703, "logps/rejected": -202.42453002929688, "loss": 0.4142, "nll_loss": 0.41166093945503235, "rewards/accuracies": 1.0, "rewards/chosen": 4.816319942474365, "rewards/margins": 10.645885467529297, "rewards/rejected": -5.829565525054932, "step": 4674 }, { "epoch": 0.7791666666666667, "grad_norm": 52.69150161743164, "learning_rate": 2.4505437813151696e-08, "logits/chosen": 1.331112265586853, "logits/rejected": 1.9443418979644775, "logps/chosen": -29.9658203125, "logps/rejected": -354.728271484375, "loss": 1.1537, "nll_loss": 1.1525315046310425, "rewards/accuracies": 1.0, "rewards/chosen": 3.912060499191284, "rewards/margins": 14.860860824584961, "rewards/rejected": -10.948800086975098, "step": 4675 }, { "epoch": 0.7793333333333333, "grad_norm": 32.74936294555664, "learning_rate": 2.4470049898335988e-08, "logits/chosen": 1.8898993730545044, "logits/rejected": 2.2780802249908447, "logps/chosen": -57.544837951660156, "logps/rejected": -266.72186279296875, "loss": 1.1992, "nll_loss": 1.1988507509231567, "rewards/accuracies": 1.0, "rewards/chosen": 5.0794782638549805, "rewards/margins": 16.95965576171875, "rewards/rejected": -11.880176544189453, "step": 4676 }, { "epoch": 0.7795, "grad_norm": 54.40336608886719, "learning_rate": 2.4434683991134475e-08, "logits/chosen": 3.234792709350586, "logits/rejected": 3.5371720790863037, "logps/chosen": -8.012443542480469, "logps/rejected": -149.38116455078125, "loss": 0.4273, "nll_loss": 0.42170754075050354, "rewards/accuracies": 1.0, "rewards/chosen": 2.5311856269836426, "rewards/margins": 9.064453125, "rewards/rejected": -6.533267498016357, "step": 4677 }, { "epoch": 0.7796666666666666, "grad_norm": 36.9011116027832, "learning_rate": 2.4399340101851862e-08, "logits/chosen": 1.2771601676940918, "logits/rejected": 2.0724565982818604, "logps/chosen": -56.92829132080078, "logps/rejected": -180.1783447265625, "loss": 1.0804, "nll_loss": 1.0741188526153564, "rewards/accuracies": 1.0, "rewards/chosen": 2.306304931640625, "rewards/margins": 9.335769653320312, "rewards/rejected": -7.0294647216796875, "step": 4678 }, { "epoch": 0.7798333333333334, "grad_norm": 19.388187408447266, "learning_rate": 2.4364018240786544e-08, "logits/chosen": 3.040712356567383, "logits/rejected": 2.9398412704467773, "logps/chosen": -152.1260528564453, "logps/rejected": -93.23069763183594, "loss": 0.9178, "nll_loss": 0.8948591947555542, "rewards/accuracies": 1.0, "rewards/chosen": 2.086552381515503, "rewards/margins": 5.971118927001953, "rewards/rejected": -3.8845667839050293, "step": 4679 }, { "epoch": 0.78, "grad_norm": 45.78532791137695, "learning_rate": 2.4328718418230464e-08, "logits/chosen": 2.7445688247680664, "logits/rejected": 2.7891845703125, "logps/chosen": -40.83222579956055, "logps/rejected": -218.8550567626953, "loss": 1.2042, "nll_loss": 1.2009477615356445, "rewards/accuracies": 1.0, "rewards/chosen": 2.910597085952759, "rewards/margins": 11.331991195678711, "rewards/rejected": -8.421394348144531, "step": 4680 }, { "epoch": 0.7801666666666667, "grad_norm": 23.15081214904785, "learning_rate": 2.4293440644469176e-08, "logits/chosen": 1.478451132774353, "logits/rejected": 2.3559393882751465, "logps/chosen": -24.405763626098633, "logps/rejected": -211.72518920898438, "loss": 0.4445, "nll_loss": 0.4358171820640564, "rewards/accuracies": 1.0, "rewards/chosen": 2.301328182220459, "rewards/margins": 7.851725101470947, "rewards/rejected": -5.550396919250488, "step": 4681 }, { "epoch": 0.7803333333333333, "grad_norm": 33.90287780761719, "learning_rate": 2.425818492978171e-08, "logits/chosen": 3.199978828430176, "logits/rejected": 3.268249034881592, "logps/chosen": -125.06057739257812, "logps/rejected": -269.54547119140625, "loss": 1.4567, "nll_loss": 1.437477707862854, "rewards/accuracies": 1.0, "rewards/chosen": 1.7605957984924316, "rewards/margins": 6.2109575271606445, "rewards/rejected": -4.450361728668213, "step": 4682 }, { "epoch": 0.7805, "grad_norm": 36.48506164550781, "learning_rate": 2.4222951284440773e-08, "logits/chosen": 2.464911937713623, "logits/rejected": 2.4314849376678467, "logps/chosen": -28.75406837463379, "logps/rejected": -161.29595947265625, "loss": 0.5876, "nll_loss": 0.5228012204170227, "rewards/accuracies": 1.0, "rewards/chosen": 0.19208946824073792, "rewards/margins": 4.2803168296813965, "rewards/rejected": -4.088227272033691, "step": 4683 }, { "epoch": 0.7806666666666666, "grad_norm": 24.964717864990234, "learning_rate": 2.4187739718712595e-08, "logits/chosen": 1.0426955223083496, "logits/rejected": 1.9309930801391602, "logps/chosen": -74.33949279785156, "logps/rejected": -256.94049072265625, "loss": 0.7962, "nll_loss": 0.7825209498405457, "rewards/accuracies": 1.0, "rewards/chosen": 2.1032638549804688, "rewards/margins": 6.85373067855835, "rewards/rejected": -4.750466823577881, "step": 4684 }, { "epoch": 0.7808333333333334, "grad_norm": 25.45345687866211, "learning_rate": 2.415255024285702e-08, "logits/chosen": 3.0066821575164795, "logits/rejected": 2.755309820175171, "logps/chosen": -58.371421813964844, "logps/rejected": -143.22418212890625, "loss": 0.7214, "nll_loss": 0.7118465900421143, "rewards/accuracies": 1.0, "rewards/chosen": 1.8534889221191406, "rewards/margins": 8.847297668457031, "rewards/rejected": -6.993808269500732, "step": 4685 }, { "epoch": 0.781, "grad_norm": 105.57801818847656, "learning_rate": 2.4117382867127344e-08, "logits/chosen": 2.9294092655181885, "logits/rejected": 2.9309771060943604, "logps/chosen": -82.26143646240234, "logps/rejected": -73.42211151123047, "loss": 2.1409, "nll_loss": 1.9586056470870972, "rewards/accuracies": 1.0, "rewards/chosen": -1.386980414390564, "rewards/margins": 3.474153995513916, "rewards/rejected": -4.8611345291137695, "step": 4686 }, { "epoch": 0.7811666666666667, "grad_norm": 28.603496551513672, "learning_rate": 2.408223760177054e-08, "logits/chosen": 1.0921510457992554, "logits/rejected": 1.9117956161499023, "logps/chosen": -25.917715072631836, "logps/rejected": -438.0268859863281, "loss": 0.5366, "nll_loss": 0.5183542966842651, "rewards/accuracies": 1.0, "rewards/chosen": 1.0935992002487183, "rewards/margins": 12.07027530670166, "rewards/rejected": -10.976675987243652, "step": 4687 }, { "epoch": 0.7813333333333333, "grad_norm": 25.374500274658203, "learning_rate": 2.4047114457027086e-08, "logits/chosen": 3.0438907146453857, "logits/rejected": 3.0892179012298584, "logps/chosen": -61.84148406982422, "logps/rejected": -203.6099853515625, "loss": 0.7504, "nll_loss": 0.7450781464576721, "rewards/accuracies": 1.0, "rewards/chosen": 2.484771728515625, "rewards/margins": 9.658673286437988, "rewards/rejected": -7.173901557922363, "step": 4688 }, { "epoch": 0.7815, "grad_norm": 30.36302375793457, "learning_rate": 2.4012013443131018e-08, "logits/chosen": 2.405705690383911, "logits/rejected": 2.017218828201294, "logps/chosen": -67.47894287109375, "logps/rejected": -97.88187408447266, "loss": 0.8675, "nll_loss": 0.8541637659072876, "rewards/accuracies": 1.0, "rewards/chosen": 1.7156822681427002, "rewards/margins": 7.263106346130371, "rewards/rejected": -5.547423839569092, "step": 4689 }, { "epoch": 0.7816666666666666, "grad_norm": 35.048160552978516, "learning_rate": 2.397693457030997e-08, "logits/chosen": 3.229515552520752, "logits/rejected": 3.2253265380859375, "logps/chosen": -42.73695373535156, "logps/rejected": -148.4472198486328, "loss": 0.8014, "nll_loss": 0.7914251685142517, "rewards/accuracies": 1.0, "rewards/chosen": 2.0519683361053467, "rewards/margins": 7.710787773132324, "rewards/rejected": -5.658819198608398, "step": 4690 }, { "epoch": 0.7818333333333334, "grad_norm": 106.80463409423828, "learning_rate": 2.394187784878502e-08, "logits/chosen": 2.5815181732177734, "logits/rejected": 2.660402774810791, "logps/chosen": -61.08794403076172, "logps/rejected": -38.35371780395508, "loss": 1.5217, "nll_loss": 0.6170498728752136, "rewards/accuracies": 1.0, "rewards/chosen": 1.2935394048690796, "rewards/margins": 0.34519684314727783, "rewards/rejected": 0.9483425617218018, "step": 4691 }, { "epoch": 0.782, "grad_norm": 44.86079788208008, "learning_rate": 2.3906843288770883e-08, "logits/chosen": 2.746248722076416, "logits/rejected": 3.031008720397949, "logps/chosen": -15.687167167663574, "logps/rejected": -545.4639892578125, "loss": 0.6552, "nll_loss": 0.6536320447921753, "rewards/accuracies": 1.0, "rewards/chosen": 4.450153350830078, "rewards/margins": 11.069862365722656, "rewards/rejected": -6.61970853805542, "step": 4692 }, { "epoch": 0.7821666666666667, "grad_norm": 37.80628967285156, "learning_rate": 2.3871830900475785e-08, "logits/chosen": 3.382254123687744, "logits/rejected": 3.352262020111084, "logps/chosen": -50.435001373291016, "logps/rejected": -90.25425720214844, "loss": 0.7747, "nll_loss": 0.6724666953086853, "rewards/accuracies": 1.0, "rewards/chosen": 1.6727635860443115, "rewards/margins": 3.744290828704834, "rewards/rejected": -2.0715272426605225, "step": 4693 }, { "epoch": 0.7823333333333333, "grad_norm": 33.43196105957031, "learning_rate": 2.3836840694101545e-08, "logits/chosen": 2.987973213195801, "logits/rejected": 3.181530714035034, "logps/chosen": -63.514366149902344, "logps/rejected": -127.43231201171875, "loss": 0.88, "nll_loss": 0.8468582034111023, "rewards/accuracies": 1.0, "rewards/chosen": 2.2640678882598877, "rewards/margins": 5.564731597900391, "rewards/rejected": -3.300663948059082, "step": 4694 }, { "epoch": 0.7825, "grad_norm": 26.537424087524414, "learning_rate": 2.3801872679843383e-08, "logits/chosen": 2.3573625087738037, "logits/rejected": 2.7924792766571045, "logps/chosen": -39.455108642578125, "logps/rejected": -248.66165161132812, "loss": 0.6797, "nll_loss": 0.6687306761741638, "rewards/accuracies": 1.0, "rewards/chosen": 1.6676697731018066, "rewards/margins": 9.154504776000977, "rewards/rejected": -7.48683500289917, "step": 4695 }, { "epoch": 0.7826666666666666, "grad_norm": 52.70030975341797, "learning_rate": 2.376692686789018e-08, "logits/chosen": 1.642931342124939, "logits/rejected": 1.997349739074707, "logps/chosen": -24.056880950927734, "logps/rejected": -288.8843078613281, "loss": 0.6898, "nll_loss": 0.6682467460632324, "rewards/accuracies": 1.0, "rewards/chosen": 1.025064468383789, "rewards/margins": 7.207319259643555, "rewards/rejected": -6.182254791259766, "step": 4696 }, { "epoch": 0.7828333333333334, "grad_norm": 412.5520935058594, "learning_rate": 2.3732003268424306e-08, "logits/chosen": 2.0950098037719727, "logits/rejected": 1.507917881011963, "logps/chosen": -118.1998291015625, "logps/rejected": -12.837003707885742, "loss": 5.145, "nll_loss": 1.218554973602295, "rewards/accuracies": 0.0, "rewards/chosen": -2.0183136463165283, "rewards/margins": -3.8235952854156494, "rewards/rejected": 1.805281639099121, "step": 4697 }, { "epoch": 0.783, "grad_norm": 27.745973587036133, "learning_rate": 2.3697101891621697e-08, "logits/chosen": 2.6151938438415527, "logits/rejected": 2.6023688316345215, "logps/chosen": -24.592628479003906, "logps/rejected": -296.68310546875, "loss": 0.4924, "nll_loss": 0.4918525218963623, "rewards/accuracies": 1.0, "rewards/chosen": 5.5845255851745605, "rewards/margins": 13.346038818359375, "rewards/rejected": -7.761512756347656, "step": 4698 }, { "epoch": 0.7831666666666667, "grad_norm": 37.22665786743164, "learning_rate": 2.3662222747651695e-08, "logits/chosen": -0.03602011874318123, "logits/rejected": 1.9516328573226929, "logps/chosen": -36.0109977722168, "logps/rejected": -517.2655029296875, "loss": 0.7572, "nll_loss": 0.7502289414405823, "rewards/accuracies": 1.0, "rewards/chosen": 2.09112811088562, "rewards/margins": 11.224029541015625, "rewards/rejected": -9.132901191711426, "step": 4699 }, { "epoch": 0.7833333333333333, "grad_norm": 29.750024795532227, "learning_rate": 2.3627365846677304e-08, "logits/chosen": 2.869776964187622, "logits/rejected": 2.942140579223633, "logps/chosen": -12.121894836425781, "logps/rejected": -110.15037536621094, "loss": 0.3528, "nll_loss": 0.31081777811050415, "rewards/accuracies": 1.0, "rewards/chosen": 1.0010708570480347, "rewards/margins": 4.7836384773254395, "rewards/rejected": -3.7825677394866943, "step": 4700 }, { "epoch": 0.7835, "grad_norm": 32.161895751953125, "learning_rate": 2.359253119885497e-08, "logits/chosen": 2.5818352699279785, "logits/rejected": 2.6799476146698, "logps/chosen": -33.96891403198242, "logps/rejected": -233.69618225097656, "loss": 0.6454, "nll_loss": 0.606587827205658, "rewards/accuracies": 1.0, "rewards/chosen": 0.935717761516571, "rewards/margins": 4.966697692871094, "rewards/rejected": -4.030980110168457, "step": 4701 }, { "epoch": 0.7836666666666666, "grad_norm": 20.85154151916504, "learning_rate": 2.3557718814334704e-08, "logits/chosen": 1.6165132522583008, "logits/rejected": 2.1213018894195557, "logps/chosen": -119.13172912597656, "logps/rejected": -371.6044921875, "loss": 0.9733, "nll_loss": 0.9685507416725159, "rewards/accuracies": 1.0, "rewards/chosen": 2.499276876449585, "rewards/margins": 11.753994941711426, "rewards/rejected": -9.254717826843262, "step": 4702 }, { "epoch": 0.7838333333333334, "grad_norm": 31.51940155029297, "learning_rate": 2.352292870325995e-08, "logits/chosen": 2.7859408855438232, "logits/rejected": 3.066053628921509, "logps/chosen": -17.637226104736328, "logps/rejected": -418.7088623046875, "loss": 0.5085, "nll_loss": 0.5039207935333252, "rewards/accuracies": 1.0, "rewards/chosen": 4.199835777282715, "rewards/margins": 9.43353271484375, "rewards/rejected": -5.233697414398193, "step": 4703 }, { "epoch": 0.784, "grad_norm": 36.60379409790039, "learning_rate": 2.3488160875767716e-08, "logits/chosen": 1.9930304288864136, "logits/rejected": 2.222161054611206, "logps/chosen": -39.4050407409668, "logps/rejected": -154.77659606933594, "loss": 0.9626, "nll_loss": 0.9382151365280151, "rewards/accuracies": 1.0, "rewards/chosen": 0.9657257199287415, "rewards/margins": 6.543632984161377, "rewards/rejected": -5.577907085418701, "step": 4704 }, { "epoch": 0.7841666666666667, "grad_norm": 214.4287109375, "learning_rate": 2.3453415341988548e-08, "logits/chosen": 3.1900992393493652, "logits/rejected": 3.2634100914001465, "logps/chosen": -214.0580291748047, "logps/rejected": -333.95355224609375, "loss": 1.6515, "nll_loss": 1.1148855686187744, "rewards/accuracies": 1.0, "rewards/chosen": -3.552851915359497, "rewards/margins": 2.6170151233673096, "rewards/rejected": -6.169867038726807, "step": 4705 }, { "epoch": 0.7843333333333333, "grad_norm": 67.79071807861328, "learning_rate": 2.3418692112046455e-08, "logits/chosen": 2.6357359886169434, "logits/rejected": 2.8947653770446777, "logps/chosen": -9.131208419799805, "logps/rejected": -150.468017578125, "loss": 0.5823, "nll_loss": 0.5707005262374878, "rewards/accuracies": 1.0, "rewards/chosen": 2.4989893436431885, "rewards/margins": 7.189332008361816, "rewards/rejected": -4.690342903137207, "step": 4706 }, { "epoch": 0.7845, "grad_norm": 44.879764556884766, "learning_rate": 2.3383991196058918e-08, "logits/chosen": 2.3895092010498047, "logits/rejected": 2.3355960845947266, "logps/chosen": -20.719139099121094, "logps/rejected": -244.06158447265625, "loss": 0.7097, "nll_loss": 0.6906378865242004, "rewards/accuracies": 1.0, "rewards/chosen": 1.0731953382492065, "rewards/margins": 8.762371063232422, "rewards/rejected": -7.689175605773926, "step": 4707 }, { "epoch": 0.7846666666666666, "grad_norm": 31.547548294067383, "learning_rate": 2.3349312604136972e-08, "logits/chosen": 1.5542725324630737, "logits/rejected": 2.3501508235931396, "logps/chosen": -12.744756698608398, "logps/rejected": -183.14706420898438, "loss": 0.3745, "nll_loss": 0.36413589119911194, "rewards/accuracies": 1.0, "rewards/chosen": 1.7528390884399414, "rewards/margins": 8.832974433898926, "rewards/rejected": -7.080135345458984, "step": 4708 }, { "epoch": 0.7848333333333334, "grad_norm": 40.29305648803711, "learning_rate": 2.331465634638513e-08, "logits/chosen": 2.3819782733917236, "logits/rejected": 2.48125958442688, "logps/chosen": -36.694644927978516, "logps/rejected": -239.64378356933594, "loss": 1.115, "nll_loss": 1.1119589805603027, "rewards/accuracies": 1.0, "rewards/chosen": 2.9935269355773926, "rewards/margins": 10.974885940551758, "rewards/rejected": -7.981358528137207, "step": 4709 }, { "epoch": 0.785, "grad_norm": 44.758907318115234, "learning_rate": 2.328002243290138e-08, "logits/chosen": 2.6455514430999756, "logits/rejected": 2.7623486518859863, "logps/chosen": -43.65550994873047, "logps/rejected": -175.61517333984375, "loss": 0.8633, "nll_loss": 0.8559902906417847, "rewards/accuracies": 1.0, "rewards/chosen": 2.55915904045105, "rewards/margins": 8.098336219787598, "rewards/rejected": -5.539176940917969, "step": 4710 }, { "epoch": 0.7851666666666667, "grad_norm": 152.51490783691406, "learning_rate": 2.324541087377726e-08, "logits/chosen": 2.484588146209717, "logits/rejected": 2.550654411315918, "logps/chosen": -20.797256469726562, "logps/rejected": -56.72843551635742, "loss": 1.5444, "nll_loss": 0.5199313759803772, "rewards/accuracies": 0.0, "rewards/chosen": 0.2426973581314087, "rewards/margins": -0.1541711986064911, "rewards/rejected": 0.3968685567378998, "step": 4711 }, { "epoch": 0.7853333333333333, "grad_norm": 22.11802864074707, "learning_rate": 2.3210821679097668e-08, "logits/chosen": 2.986637830734253, "logits/rejected": 3.0376710891723633, "logps/chosen": -91.52687072753906, "logps/rejected": -268.94720458984375, "loss": 0.9127, "nll_loss": 0.9062066674232483, "rewards/accuracies": 1.0, "rewards/chosen": 2.2393577098846436, "rewards/margins": 9.548894882202148, "rewards/rejected": -7.309536933898926, "step": 4712 }, { "epoch": 0.7855, "grad_norm": 32.62908935546875, "learning_rate": 2.3176254858941125e-08, "logits/chosen": 2.1923954486846924, "logits/rejected": 2.0788443088531494, "logps/chosen": -65.12812805175781, "logps/rejected": -87.11968231201172, "loss": 0.9533, "nll_loss": 0.9304019212722778, "rewards/accuracies": 1.0, "rewards/chosen": 1.0004570484161377, "rewards/margins": 6.797926902770996, "rewards/rejected": -5.797469615936279, "step": 4713 }, { "epoch": 0.7856666666666666, "grad_norm": 29.334741592407227, "learning_rate": 2.3141710423379545e-08, "logits/chosen": 2.3908846378326416, "logits/rejected": 2.4215307235717773, "logps/chosen": -97.29174041748047, "logps/rejected": -176.1384735107422, "loss": 1.0939, "nll_loss": 1.081019401550293, "rewards/accuracies": 1.0, "rewards/chosen": 1.4718894958496094, "rewards/margins": 9.580531120300293, "rewards/rejected": -8.108641624450684, "step": 4714 }, { "epoch": 0.7858333333333334, "grad_norm": 26.36034393310547, "learning_rate": 2.3107188382478383e-08, "logits/chosen": 2.388633966445923, "logits/rejected": 2.580918550491333, "logps/chosen": -52.90080261230469, "logps/rejected": -346.54425048828125, "loss": 0.6503, "nll_loss": 0.637359082698822, "rewards/accuracies": 1.0, "rewards/chosen": 1.4501153230667114, "rewards/margins": 11.155562400817871, "rewards/rejected": -9.70544719696045, "step": 4715 }, { "epoch": 0.786, "grad_norm": 32.960784912109375, "learning_rate": 2.3072688746296487e-08, "logits/chosen": 2.0454959869384766, "logits/rejected": 2.2851622104644775, "logps/chosen": -69.27003479003906, "logps/rejected": -391.87322998046875, "loss": 1.2359, "nll_loss": 1.215263843536377, "rewards/accuracies": 1.0, "rewards/chosen": 0.9635658264160156, "rewards/margins": 14.237601280212402, "rewards/rejected": -13.274035453796387, "step": 4716 }, { "epoch": 0.7861666666666667, "grad_norm": 35.607913970947266, "learning_rate": 2.3038211524886232e-08, "logits/chosen": 2.8109328746795654, "logits/rejected": 2.9662420749664307, "logps/chosen": -17.515605926513672, "logps/rejected": -138.04652404785156, "loss": 0.5753, "nll_loss": 0.5650195479393005, "rewards/accuracies": 1.0, "rewards/chosen": 2.146138906478882, "rewards/margins": 7.510957717895508, "rewards/rejected": -5.364818572998047, "step": 4717 }, { "epoch": 0.7863333333333333, "grad_norm": 29.424392700195312, "learning_rate": 2.3003756728293467e-08, "logits/chosen": 1.961695909500122, "logits/rejected": 1.9394909143447876, "logps/chosen": -23.568830490112305, "logps/rejected": -105.37604522705078, "loss": 0.4401, "nll_loss": 0.4134882092475891, "rewards/accuracies": 1.0, "rewards/chosen": 0.9303018450737, "rewards/margins": 6.167991638183594, "rewards/rejected": -5.237689971923828, "step": 4718 }, { "epoch": 0.7865, "grad_norm": 30.132978439331055, "learning_rate": 2.2969324366557518e-08, "logits/chosen": 2.807539939880371, "logits/rejected": 3.025524377822876, "logps/chosen": -23.558696746826172, "logps/rejected": -214.92263793945312, "loss": 0.5287, "nll_loss": 0.5235266089439392, "rewards/accuracies": 1.0, "rewards/chosen": 2.4281094074249268, "rewards/margins": 10.385513305664062, "rewards/rejected": -7.957403659820557, "step": 4719 }, { "epoch": 0.7866666666666666, "grad_norm": 67.13517761230469, "learning_rate": 2.2934914449711086e-08, "logits/chosen": 3.010519027709961, "logits/rejected": 3.1533122062683105, "logps/chosen": -59.90477752685547, "logps/rejected": -232.57418823242188, "loss": 2.1579, "nll_loss": 2.1394565105438232, "rewards/accuracies": 1.0, "rewards/chosen": 1.626355767250061, "rewards/margins": 6.349817752838135, "rewards/rejected": -4.723462104797363, "step": 4720 }, { "epoch": 0.7868333333333334, "grad_norm": 44.66743469238281, "learning_rate": 2.2900526987780434e-08, "logits/chosen": 3.009683609008789, "logits/rejected": 3.0087432861328125, "logps/chosen": -108.65260314941406, "logps/rejected": -50.478782653808594, "loss": 1.4378, "nll_loss": 1.3581576347351074, "rewards/accuracies": 1.0, "rewards/chosen": 2.6416964530944824, "rewards/margins": 4.786869049072266, "rewards/rejected": -2.1451728343963623, "step": 4721 }, { "epoch": 0.787, "grad_norm": 10.641277313232422, "learning_rate": 2.2866161990785228e-08, "logits/chosen": 2.654672384262085, "logits/rejected": 2.7031168937683105, "logps/chosen": -176.01904296875, "logps/rejected": -143.47080993652344, "loss": 0.5867, "nll_loss": 0.5847808718681335, "rewards/accuracies": 1.0, "rewards/chosen": 4.121267795562744, "rewards/margins": 10.726070404052734, "rewards/rejected": -6.604802131652832, "step": 4722 }, { "epoch": 0.7871666666666667, "grad_norm": 26.227981567382812, "learning_rate": 2.2831819468738644e-08, "logits/chosen": 2.2191216945648193, "logits/rejected": 1.9901549816131592, "logps/chosen": -34.767433166503906, "logps/rejected": -116.62438201904297, "loss": 0.5556, "nll_loss": 0.5348835587501526, "rewards/accuracies": 1.0, "rewards/chosen": 2.738826036453247, "rewards/margins": 6.488858222961426, "rewards/rejected": -3.7500321865081787, "step": 4723 }, { "epoch": 0.7873333333333333, "grad_norm": 20.960983276367188, "learning_rate": 2.2797499431647215e-08, "logits/chosen": 3.636457920074463, "logits/rejected": 3.7564427852630615, "logps/chosen": -39.92976379394531, "logps/rejected": -195.60557556152344, "loss": 0.5408, "nll_loss": 0.5323967337608337, "rewards/accuracies": 1.0, "rewards/chosen": 1.9080768823623657, "rewards/margins": 10.75359058380127, "rewards/rejected": -8.845513343811035, "step": 4724 }, { "epoch": 0.7875, "grad_norm": 25.492856979370117, "learning_rate": 2.2763201889510986e-08, "logits/chosen": 2.117969036102295, "logits/rejected": 2.5213775634765625, "logps/chosen": -74.95184326171875, "logps/rejected": -337.3309326171875, "loss": 0.7501, "nll_loss": 0.7070926427841187, "rewards/accuracies": 1.0, "rewards/chosen": 0.3183448910713196, "rewards/margins": 5.7746806144714355, "rewards/rejected": -5.456335544586182, "step": 4725 }, { "epoch": 0.7876666666666666, "grad_norm": 95.6971435546875, "learning_rate": 2.272892685232345e-08, "logits/chosen": 1.2209808826446533, "logits/rejected": 2.3488943576812744, "logps/chosen": -81.6322250366211, "logps/rejected": -317.3948974609375, "loss": 1.9458, "nll_loss": 1.774613380432129, "rewards/accuracies": 1.0, "rewards/chosen": -1.537543535232544, "rewards/margins": 5.792723655700684, "rewards/rejected": -7.330267429351807, "step": 4726 }, { "epoch": 0.7878333333333334, "grad_norm": 22.606420516967773, "learning_rate": 2.2694674330071562e-08, "logits/chosen": 1.6007922887802124, "logits/rejected": 2.2511467933654785, "logps/chosen": -19.933528900146484, "logps/rejected": -301.2667236328125, "loss": 0.4021, "nll_loss": 0.39085355401039124, "rewards/accuracies": 1.0, "rewards/chosen": 1.592210054397583, "rewards/margins": 14.376465797424316, "rewards/rejected": -12.784255981445312, "step": 4727 }, { "epoch": 0.788, "grad_norm": 29.684017181396484, "learning_rate": 2.2660444332735617e-08, "logits/chosen": 2.890916109085083, "logits/rejected": 3.035676956176758, "logps/chosen": -103.62055969238281, "logps/rejected": -116.92415618896484, "loss": 1.1107, "nll_loss": 1.102346420288086, "rewards/accuracies": 1.0, "rewards/chosen": 2.5083909034729004, "rewards/margins": 7.811782360076904, "rewards/rejected": -5.303391456604004, "step": 4728 }, { "epoch": 0.7881666666666667, "grad_norm": 140.9199981689453, "learning_rate": 2.262623687028945e-08, "logits/chosen": 0.8934005498886108, "logits/rejected": 1.7833646535873413, "logps/chosen": -51.03291702270508, "logps/rejected": -155.38096618652344, "loss": 2.0507, "nll_loss": 1.7010971307754517, "rewards/accuracies": 1.0, "rewards/chosen": -1.8921726942062378, "rewards/margins": 1.809045672416687, "rewards/rejected": -3.701218366622925, "step": 4729 }, { "epoch": 0.7883333333333333, "grad_norm": 34.08424758911133, "learning_rate": 2.25920519527003e-08, "logits/chosen": 2.173457145690918, "logits/rejected": 2.3163514137268066, "logps/chosen": -10.006973266601562, "logps/rejected": -199.99658203125, "loss": 0.4213, "nll_loss": 0.4169571101665497, "rewards/accuracies": 1.0, "rewards/chosen": 2.6894888877868652, "rewards/margins": 9.891668319702148, "rewards/rejected": -7.202178955078125, "step": 4730 }, { "epoch": 0.7885, "grad_norm": 18.57888412475586, "learning_rate": 2.2557889589928815e-08, "logits/chosen": 1.8861143589019775, "logits/rejected": 2.286937713623047, "logps/chosen": -64.75161743164062, "logps/rejected": -204.20252990722656, "loss": 0.5814, "nll_loss": 0.5679965615272522, "rewards/accuracies": 1.0, "rewards/chosen": 1.4297196865081787, "rewards/margins": 9.484890937805176, "rewards/rejected": -8.055171012878418, "step": 4731 }, { "epoch": 0.7886666666666666, "grad_norm": 30.15082359313965, "learning_rate": 2.2523749791929126e-08, "logits/chosen": 2.8807599544525146, "logits/rejected": 2.9115190505981445, "logps/chosen": -30.74364471435547, "logps/rejected": -80.79678344726562, "loss": 0.5908, "nll_loss": 0.5589753985404968, "rewards/accuracies": 1.0, "rewards/chosen": 1.3257187604904175, "rewards/margins": 5.269741058349609, "rewards/rejected": -3.9440224170684814, "step": 4732 }, { "epoch": 0.7888333333333334, "grad_norm": 28.214874267578125, "learning_rate": 2.2489632568648697e-08, "logits/chosen": 2.591399908065796, "logits/rejected": 2.682311773300171, "logps/chosen": -78.03791046142578, "logps/rejected": -140.58309936523438, "loss": 0.9085, "nll_loss": 0.8867945075035095, "rewards/accuracies": 1.0, "rewards/chosen": 1.0012222528457642, "rewards/margins": 7.256743431091309, "rewards/rejected": -6.255521297454834, "step": 4733 }, { "epoch": 0.789, "grad_norm": 105.51839447021484, "learning_rate": 2.245553793002849e-08, "logits/chosen": 1.7133164405822754, "logits/rejected": 1.6623048782348633, "logps/chosen": -103.23204803466797, "logps/rejected": -81.52677154541016, "loss": 1.714, "nll_loss": 1.2289528846740723, "rewards/accuracies": 1.0, "rewards/chosen": 1.3884193897247314, "rewards/margins": 1.4171921014785767, "rewards/rejected": -0.028772735968232155, "step": 4734 }, { "epoch": 0.7891666666666667, "grad_norm": 26.09694480895996, "learning_rate": 2.2421465886002854e-08, "logits/chosen": 1.5281157493591309, "logits/rejected": 2.42999529838562, "logps/chosen": -42.53347396850586, "logps/rejected": -225.89712524414062, "loss": 0.5814, "nll_loss": 0.5671130418777466, "rewards/accuracies": 1.0, "rewards/chosen": 1.5061439275741577, "rewards/margins": 7.534301280975342, "rewards/rejected": -6.0281572341918945, "step": 4735 }, { "epoch": 0.7893333333333333, "grad_norm": 25.618335723876953, "learning_rate": 2.2387416446499618e-08, "logits/chosen": 1.2516655921936035, "logits/rejected": 2.5396921634674072, "logps/chosen": -80.29192352294922, "logps/rejected": -256.883056640625, "loss": 0.8839, "nll_loss": 0.8633539080619812, "rewards/accuracies": 1.0, "rewards/chosen": 0.9788200259208679, "rewards/margins": 8.924566268920898, "rewards/rejected": -7.945746421813965, "step": 4736 }, { "epoch": 0.7895, "grad_norm": 41.29881286621094, "learning_rate": 2.23533896214399e-08, "logits/chosen": 3.3344969749450684, "logits/rejected": 3.430349588394165, "logps/chosen": -30.806171417236328, "logps/rejected": -368.35516357421875, "loss": 0.8809, "nll_loss": 0.8801762461662292, "rewards/accuracies": 1.0, "rewards/chosen": 5.1987409591674805, "rewards/margins": 12.558168411254883, "rewards/rejected": -7.359427452087402, "step": 4737 }, { "epoch": 0.7896666666666666, "grad_norm": 50.963436126708984, "learning_rate": 2.2319385420738325e-08, "logits/chosen": 3.0722572803497314, "logits/rejected": 2.9779486656188965, "logps/chosen": -48.264678955078125, "logps/rejected": -30.44548797607422, "loss": 0.9027, "nll_loss": 0.7541356682777405, "rewards/accuracies": 1.0, "rewards/chosen": 1.584621548652649, "rewards/margins": 3.209054946899414, "rewards/rejected": -1.6244332790374756, "step": 4738 }, { "epoch": 0.7898333333333334, "grad_norm": 25.962299346923828, "learning_rate": 2.228540385430291e-08, "logits/chosen": 2.130709171295166, "logits/rejected": 2.906903028488159, "logps/chosen": -75.28250122070312, "logps/rejected": -518.298583984375, "loss": 0.8647, "nll_loss": 0.8554829955101013, "rewards/accuracies": 1.0, "rewards/chosen": 1.7967681884765625, "rewards/margins": 10.990747451782227, "rewards/rejected": -9.193979263305664, "step": 4739 }, { "epoch": 0.79, "grad_norm": 26.361671447753906, "learning_rate": 2.225144493203509e-08, "logits/chosen": 2.4073171615600586, "logits/rejected": 2.5342013835906982, "logps/chosen": -25.582828521728516, "logps/rejected": -174.4476318359375, "loss": 0.4117, "nll_loss": 0.39973166584968567, "rewards/accuracies": 1.0, "rewards/chosen": 1.7025768756866455, "rewards/margins": 7.785797119140625, "rewards/rejected": -6.083220481872559, "step": 4740 }, { "epoch": 0.7901666666666667, "grad_norm": 33.62925720214844, "learning_rate": 2.221750866382962e-08, "logits/chosen": 2.4337058067321777, "logits/rejected": 2.347973585128784, "logps/chosen": -58.26312255859375, "logps/rejected": -41.77556610107422, "loss": 1.0141, "nll_loss": 0.9710520505905151, "rewards/accuracies": 1.0, "rewards/chosen": 4.120175361633301, "rewards/margins": 6.855055809020996, "rewards/rejected": -2.734880208969116, "step": 4741 }, { "epoch": 0.7903333333333333, "grad_norm": 44.480220794677734, "learning_rate": 2.2183595059574744e-08, "logits/chosen": 1.3407546281814575, "logits/rejected": 2.1912009716033936, "logps/chosen": -49.50096893310547, "logps/rejected": -240.23292541503906, "loss": 1.0435, "nll_loss": 1.031270146369934, "rewards/accuracies": 1.0, "rewards/chosen": 3.5593178272247314, "rewards/margins": 7.765475273132324, "rewards/rejected": -4.206157207489014, "step": 4742 }, { "epoch": 0.7905, "grad_norm": 31.12467384338379, "learning_rate": 2.214970412915208e-08, "logits/chosen": 2.346080780029297, "logits/rejected": 2.208583116531372, "logps/chosen": -88.31185913085938, "logps/rejected": -345.9296875, "loss": 1.0519, "nll_loss": 1.0389630794525146, "rewards/accuracies": 1.0, "rewards/chosen": 1.451917290687561, "rewards/margins": 10.711953163146973, "rewards/rejected": -9.260035514831543, "step": 4743 }, { "epoch": 0.7906666666666666, "grad_norm": 166.65708923339844, "learning_rate": 2.211583588243665e-08, "logits/chosen": 1.9998667240142822, "logits/rejected": 2.1416690349578857, "logps/chosen": -31.988014221191406, "logps/rejected": -31.92823028564453, "loss": 3.8579, "nll_loss": 0.6151541471481323, "rewards/accuracies": 0.0, "rewards/chosen": 1.6249959468841553, "rewards/margins": -2.5961740016937256, "rewards/rejected": 4.221169948577881, "step": 4744 }, { "epoch": 0.7908333333333334, "grad_norm": 19.246349334716797, "learning_rate": 2.2081990329296806e-08, "logits/chosen": 2.136162042617798, "logits/rejected": 2.441403865814209, "logps/chosen": -81.51410675048828, "logps/rejected": -181.57730102539062, "loss": 0.699, "nll_loss": 0.6967018246650696, "rewards/accuracies": 1.0, "rewards/chosen": 3.338003635406494, "rewards/margins": 11.29298210144043, "rewards/rejected": -7.954978942871094, "step": 4745 }, { "epoch": 0.791, "grad_norm": 194.42356872558594, "learning_rate": 2.2048167479594336e-08, "logits/chosen": 2.820814609527588, "logits/rejected": 2.818603754043579, "logps/chosen": -72.03394317626953, "logps/rejected": -62.909912109375, "loss": 2.1723, "nll_loss": 0.7062152624130249, "rewards/accuracies": 0.0, "rewards/chosen": 2.60038161277771, "rewards/margins": -0.21215200424194336, "rewards/rejected": 2.8125336170196533, "step": 4746 }, { "epoch": 0.7911666666666667, "grad_norm": 27.599618911743164, "learning_rate": 2.2014367343184427e-08, "logits/chosen": 1.9407382011413574, "logits/rejected": 1.9378060102462769, "logps/chosen": -18.5549373626709, "logps/rejected": -80.40107727050781, "loss": 0.4938, "nll_loss": 0.4757676124572754, "rewards/accuracies": 1.0, "rewards/chosen": 1.8507144451141357, "rewards/margins": 6.329689979553223, "rewards/rejected": -4.478975296020508, "step": 4747 }, { "epoch": 0.7913333333333333, "grad_norm": 41.22445297241211, "learning_rate": 2.198058992991564e-08, "logits/chosen": 2.2537412643432617, "logits/rejected": 2.78794527053833, "logps/chosen": -76.87370300292969, "logps/rejected": -243.05726623535156, "loss": 1.2708, "nll_loss": 1.2398983240127563, "rewards/accuracies": 1.0, "rewards/chosen": 0.5380561947822571, "rewards/margins": 8.872013092041016, "rewards/rejected": -8.333956718444824, "step": 4748 }, { "epoch": 0.7915, "grad_norm": 35.799983978271484, "learning_rate": 2.194683524962986e-08, "logits/chosen": 2.9371895790100098, "logits/rejected": 3.1347548961639404, "logps/chosen": -48.91474533081055, "logps/rejected": -149.7692108154297, "loss": 0.8729, "nll_loss": 0.8581533432006836, "rewards/accuracies": 1.0, "rewards/chosen": 1.6273632049560547, "rewards/margins": 7.019482612609863, "rewards/rejected": -5.392119407653809, "step": 4749 }, { "epoch": 0.7916666666666666, "grad_norm": 23.523645401000977, "learning_rate": 2.1913103312162394e-08, "logits/chosen": 2.557896852493286, "logits/rejected": 2.960575580596924, "logps/chosen": -97.69325256347656, "logps/rejected": -349.8220520019531, "loss": 0.8888, "nll_loss": 0.8722610473632812, "rewards/accuracies": 1.0, "rewards/chosen": 2.7485368251800537, "rewards/margins": 6.7865705490112305, "rewards/rejected": -4.038033962249756, "step": 4750 }, { "epoch": 0.7918333333333333, "grad_norm": 47.44052505493164, "learning_rate": 2.1879394127341954e-08, "logits/chosen": 2.229541063308716, "logits/rejected": 2.3574225902557373, "logps/chosen": -44.879486083984375, "logps/rejected": -123.6158447265625, "loss": 0.766, "nll_loss": 0.6321054100990295, "rewards/accuracies": 1.0, "rewards/chosen": 1.458979845046997, "rewards/margins": 3.268146514892578, "rewards/rejected": -1.8091667890548706, "step": 4751 }, { "epoch": 0.792, "grad_norm": 28.431325912475586, "learning_rate": 2.1845707704990556e-08, "logits/chosen": 2.6745998859405518, "logits/rejected": 2.676318883895874, "logps/chosen": -112.63664245605469, "logps/rejected": -127.57872772216797, "loss": 1.1476, "nll_loss": 1.1377437114715576, "rewards/accuracies": 1.0, "rewards/chosen": 2.1079957485198975, "rewards/margins": 7.6777849197387695, "rewards/rejected": -5.569788932800293, "step": 4752 }, { "epoch": 0.7921666666666667, "grad_norm": 17.433826446533203, "learning_rate": 2.1812044054923662e-08, "logits/chosen": 1.9156895875930786, "logits/rejected": 0.7765379548072815, "logps/chosen": -187.68319702148438, "logps/rejected": -84.0168228149414, "loss": 0.847, "nll_loss": 0.8378713726997375, "rewards/accuracies": 1.0, "rewards/chosen": 1.9424042701721191, "rewards/margins": 8.479092597961426, "rewards/rejected": -6.536688327789307, "step": 4753 }, { "epoch": 0.7923333333333333, "grad_norm": 74.7397689819336, "learning_rate": 2.177840318694999e-08, "logits/chosen": 2.9154365062713623, "logits/rejected": 2.9045510292053223, "logps/chosen": -15.387848854064941, "logps/rejected": -74.08721160888672, "loss": 0.7099, "nll_loss": 0.6994476914405823, "rewards/accuracies": 1.0, "rewards/chosen": 2.03088116645813, "rewards/margins": 7.563892364501953, "rewards/rejected": -5.533010959625244, "step": 4754 }, { "epoch": 0.7925, "grad_norm": 24.50350570678711, "learning_rate": 2.174478511087171e-08, "logits/chosen": 2.519723415374756, "logits/rejected": 2.574111223220825, "logps/chosen": -83.97233581542969, "logps/rejected": -202.43836975097656, "loss": 0.8832, "nll_loss": 0.8568605780601501, "rewards/accuracies": 1.0, "rewards/chosen": 2.2901597023010254, "rewards/margins": 5.889569282531738, "rewards/rejected": -3.599409818649292, "step": 4755 }, { "epoch": 0.7926666666666666, "grad_norm": 39.91724395751953, "learning_rate": 2.1711189836484312e-08, "logits/chosen": 2.8624303340911865, "logits/rejected": 2.9517621994018555, "logps/chosen": -38.768856048583984, "logps/rejected": -258.13482666015625, "loss": 0.9987, "nll_loss": 0.9692214131355286, "rewards/accuracies": 1.0, "rewards/chosen": 0.6701927781105042, "rewards/margins": 6.781044006347656, "rewards/rejected": -6.110851287841797, "step": 4756 }, { "epoch": 0.7928333333333333, "grad_norm": 35.13833999633789, "learning_rate": 2.1677617373576696e-08, "logits/chosen": 1.481101393699646, "logits/rejected": 2.169964551925659, "logps/chosen": -17.56101417541504, "logps/rejected": -322.8253173828125, "loss": 0.6762, "nll_loss": 0.6754237413406372, "rewards/accuracies": 1.0, "rewards/chosen": 4.359659671783447, "rewards/margins": 15.605392456054688, "rewards/rejected": -11.245732307434082, "step": 4757 }, { "epoch": 0.793, "grad_norm": 25.554927825927734, "learning_rate": 2.1644067731931003e-08, "logits/chosen": 2.219482898712158, "logits/rejected": 2.666916847229004, "logps/chosen": -16.67780113220215, "logps/rejected": -241.20291137695312, "loss": 0.413, "nll_loss": 0.38785579800605774, "rewards/accuracies": 1.0, "rewards/chosen": 0.7526251077651978, "rewards/margins": 9.7169828414917, "rewards/rejected": -8.964357376098633, "step": 4758 }, { "epoch": 0.7931666666666667, "grad_norm": 27.92268180847168, "learning_rate": 2.161054092132284e-08, "logits/chosen": 1.4932236671447754, "logits/rejected": 1.4629502296447754, "logps/chosen": -51.7358283996582, "logps/rejected": -93.46713256835938, "loss": 0.7731, "nll_loss": 0.7608210444450378, "rewards/accuracies": 1.0, "rewards/chosen": 2.211242437362671, "rewards/margins": 7.055183410644531, "rewards/rejected": -4.843940734863281, "step": 4759 }, { "epoch": 0.7933333333333333, "grad_norm": 14.889509201049805, "learning_rate": 2.157703695152109e-08, "logits/chosen": 2.9444961547851562, "logits/rejected": 2.903266429901123, "logps/chosen": -228.52178955078125, "logps/rejected": -234.7236785888672, "loss": 0.8978, "nll_loss": 0.8926631212234497, "rewards/accuracies": 1.0, "rewards/chosen": 2.976706027984619, "rewards/margins": 8.7719087600708, "rewards/rejected": -5.795202732086182, "step": 4760 }, { "epoch": 0.7935, "grad_norm": 27.582334518432617, "learning_rate": 2.154355583228805e-08, "logits/chosen": 2.724485397338867, "logits/rejected": 3.0781915187835693, "logps/chosen": -43.73768997192383, "logps/rejected": -532.3090209960938, "loss": 0.7201, "nll_loss": 0.7054466605186462, "rewards/accuracies": 1.0, "rewards/chosen": 1.3165619373321533, "rewards/margins": 11.294708251953125, "rewards/rejected": -9.97814655303955, "step": 4761 }, { "epoch": 0.7936666666666666, "grad_norm": 62.88485336303711, "learning_rate": 2.151009757337925e-08, "logits/chosen": 2.906733274459839, "logits/rejected": 3.065735340118408, "logps/chosen": -22.981491088867188, "logps/rejected": -57.4001579284668, "loss": 0.8397, "nll_loss": 0.5745373368263245, "rewards/accuracies": 1.0, "rewards/chosen": 1.2619949579238892, "rewards/margins": 2.242143154144287, "rewards/rejected": -0.9801483154296875, "step": 4762 }, { "epoch": 0.7938333333333333, "grad_norm": 31.89027976989746, "learning_rate": 2.147666218454366e-08, "logits/chosen": 2.0885040760040283, "logits/rejected": 2.5825629234313965, "logps/chosen": -34.64189147949219, "logps/rejected": -402.2228698730469, "loss": 0.6778, "nll_loss": 0.6536206007003784, "rewards/accuracies": 1.0, "rewards/chosen": 0.7908439636230469, "rewards/margins": 10.310341835021973, "rewards/rejected": -9.519497871398926, "step": 4763 }, { "epoch": 0.794, "grad_norm": 21.19320297241211, "learning_rate": 2.1443249675523534e-08, "logits/chosen": 2.329003095626831, "logits/rejected": 2.4335174560546875, "logps/chosen": -46.01240539550781, "logps/rejected": -136.96835327148438, "loss": 0.5748, "nll_loss": 0.5680543780326843, "rewards/accuracies": 1.0, "rewards/chosen": 2.1422829627990723, "rewards/margins": 10.540950775146484, "rewards/rejected": -8.39866828918457, "step": 4764 }, { "epoch": 0.7941666666666667, "grad_norm": 61.71160125732422, "learning_rate": 2.140986005605452e-08, "logits/chosen": 2.3748600482940674, "logits/rejected": 2.3604235649108887, "logps/chosen": -63.97322082519531, "logps/rejected": -190.19143676757812, "loss": 1.7984, "nll_loss": 1.777033805847168, "rewards/accuracies": 1.0, "rewards/chosen": 1.3121888637542725, "rewards/margins": 6.219830513000488, "rewards/rejected": -4.907641887664795, "step": 4765 }, { "epoch": 0.7943333333333333, "grad_norm": 25.629819869995117, "learning_rate": 2.1376493335865485e-08, "logits/chosen": 2.3081350326538086, "logits/rejected": 2.132028102874756, "logps/chosen": -68.61060333251953, "logps/rejected": -106.547607421875, "loss": 0.8531, "nll_loss": 0.8367147445678711, "rewards/accuracies": 1.0, "rewards/chosen": 2.057353973388672, "rewards/margins": 6.503238677978516, "rewards/rejected": -4.445884704589844, "step": 4766 }, { "epoch": 0.7945, "grad_norm": 50.31364822387695, "learning_rate": 2.1343149524678726e-08, "logits/chosen": 2.3482565879821777, "logits/rejected": 2.466081380844116, "logps/chosen": -13.55940055847168, "logps/rejected": -94.53685760498047, "loss": 0.4595, "nll_loss": 0.45197999477386475, "rewards/accuracies": 1.0, "rewards/chosen": 2.10384202003479, "rewards/margins": 9.070147514343262, "rewards/rejected": -6.966305732727051, "step": 4767 }, { "epoch": 0.7946666666666666, "grad_norm": 20.777246475219727, "learning_rate": 2.1309828632209826e-08, "logits/chosen": 2.6780998706817627, "logits/rejected": 2.9857606887817383, "logps/chosen": -113.02337646484375, "logps/rejected": -80.8595199584961, "loss": 0.7424, "nll_loss": 0.715337872505188, "rewards/accuracies": 1.0, "rewards/chosen": 3.7176547050476074, "rewards/margins": 6.999520301818848, "rewards/rejected": -3.281865358352661, "step": 4768 }, { "epoch": 0.7948333333333333, "grad_norm": 29.598976135253906, "learning_rate": 2.1276530668167724e-08, "logits/chosen": 2.0846457481384277, "logits/rejected": 2.302931547164917, "logps/chosen": -45.557456970214844, "logps/rejected": -223.33396911621094, "loss": 0.7995, "nll_loss": 0.7854733467102051, "rewards/accuracies": 1.0, "rewards/chosen": 1.440958023071289, "rewards/margins": 8.11534309387207, "rewards/rejected": -6.674385070800781, "step": 4769 }, { "epoch": 0.795, "grad_norm": 26.499799728393555, "learning_rate": 2.1243255642254575e-08, "logits/chosen": 1.1741822957992554, "logits/rejected": 1.915718674659729, "logps/chosen": -111.99995422363281, "logps/rejected": -428.9003601074219, "loss": 1.0779, "nll_loss": 1.0666663646697998, "rewards/accuracies": 1.0, "rewards/chosen": 1.606562852859497, "rewards/margins": 10.050310134887695, "rewards/rejected": -8.443747520446777, "step": 4770 }, { "epoch": 0.7951666666666667, "grad_norm": 72.10376739501953, "learning_rate": 2.1210003564165967e-08, "logits/chosen": 1.3308286666870117, "logits/rejected": 1.4460701942443848, "logps/chosen": -17.934154510498047, "logps/rejected": -6.094461441040039, "loss": 1.3558, "nll_loss": 0.3146342933177948, "rewards/accuracies": 0.0, "rewards/chosen": 0.8507915735244751, "rewards/margins": -0.03094327449798584, "rewards/rejected": 0.8817348480224609, "step": 4771 }, { "epoch": 0.7953333333333333, "grad_norm": 279.7102966308594, "learning_rate": 2.117677444359076e-08, "logits/chosen": 2.271420955657959, "logits/rejected": 2.519747257232666, "logps/chosen": -93.16070556640625, "logps/rejected": -165.89761352539062, "loss": 2.9976, "nll_loss": 2.1172893047332764, "rewards/accuracies": 1.0, "rewards/chosen": -3.262822151184082, "rewards/margins": 0.32028675079345703, "rewards/rejected": -3.583108901977539, "step": 4772 }, { "epoch": 0.7955, "grad_norm": 26.453474044799805, "learning_rate": 2.1143568290211112e-08, "logits/chosen": 2.634561538696289, "logits/rejected": 2.931304454803467, "logps/chosen": -68.08497619628906, "logps/rejected": -302.6199645996094, "loss": 0.816, "nll_loss": 0.8105355501174927, "rewards/accuracies": 1.0, "rewards/chosen": 2.370295763015747, "rewards/margins": 10.268141746520996, "rewards/rejected": -7.897846221923828, "step": 4773 }, { "epoch": 0.7956666666666666, "grad_norm": 76.8197021484375, "learning_rate": 2.1110385113702535e-08, "logits/chosen": 2.6374220848083496, "logits/rejected": 2.393078327178955, "logps/chosen": -164.20082092285156, "logps/rejected": -57.69548034667969, "loss": 1.1261, "nll_loss": 0.8463959693908691, "rewards/accuracies": 1.0, "rewards/chosen": 0.5970382690429688, "rewards/margins": 1.878255844116211, "rewards/rejected": -1.2812175750732422, "step": 4774 }, { "epoch": 0.7958333333333333, "grad_norm": 37.201393127441406, "learning_rate": 2.107722492373375e-08, "logits/chosen": 2.7965898513793945, "logits/rejected": 2.7728967666625977, "logps/chosen": -110.0423355102539, "logps/rejected": -140.1421661376953, "loss": 1.2883, "nll_loss": 1.250481128692627, "rewards/accuracies": 1.0, "rewards/chosen": 0.40917131304740906, "rewards/margins": 6.288427352905273, "rewards/rejected": -5.879256248474121, "step": 4775 }, { "epoch": 0.796, "grad_norm": 32.07796859741211, "learning_rate": 2.1044087729966854e-08, "logits/chosen": 0.7599745988845825, "logits/rejected": 1.544266939163208, "logps/chosen": -25.185100555419922, "logps/rejected": -315.37799072265625, "loss": 0.5383, "nll_loss": 0.5358531475067139, "rewards/accuracies": 1.0, "rewards/chosen": 3.3521816730499268, "rewards/margins": 10.58545207977295, "rewards/rejected": -7.233270645141602, "step": 4776 }, { "epoch": 0.7961666666666667, "grad_norm": 55.30732345581055, "learning_rate": 2.1010973542057252e-08, "logits/chosen": 1.9875239133834839, "logits/rejected": 0.9306775331497192, "logps/chosen": -181.14939880371094, "logps/rejected": -51.519683837890625, "loss": 1.3362, "nll_loss": 1.2239824533462524, "rewards/accuracies": 1.0, "rewards/chosen": 0.7397689819335938, "rewards/margins": 3.1732630729675293, "rewards/rejected": -2.4334940910339355, "step": 4777 }, { "epoch": 0.7963333333333333, "grad_norm": 64.69747924804688, "learning_rate": 2.0977882369653645e-08, "logits/chosen": 1.957869529724121, "logits/rejected": 2.341845750808716, "logps/chosen": -24.571868896484375, "logps/rejected": -213.87539672851562, "loss": 1.0001, "nll_loss": 0.9828746914863586, "rewards/accuracies": 1.0, "rewards/chosen": 1.1565560102462769, "rewards/margins": 9.627967834472656, "rewards/rejected": -8.47141170501709, "step": 4778 }, { "epoch": 0.7965, "grad_norm": 24.95986557006836, "learning_rate": 2.0944814222397944e-08, "logits/chosen": 2.781801700592041, "logits/rejected": 2.952503204345703, "logps/chosen": -103.80296325683594, "logps/rejected": -323.88934326171875, "loss": 1.001, "nll_loss": 0.9885994791984558, "rewards/accuracies": 1.0, "rewards/chosen": 1.8492333889007568, "rewards/margins": 7.262249946594238, "rewards/rejected": -5.413016319274902, "step": 4779 }, { "epoch": 0.7966666666666666, "grad_norm": 23.051254272460938, "learning_rate": 2.091176910992545e-08, "logits/chosen": 2.6509220600128174, "logits/rejected": 2.789888858795166, "logps/chosen": -77.01460266113281, "logps/rejected": -196.57867431640625, "loss": 0.9467, "nll_loss": 0.9392024874687195, "rewards/accuracies": 1.0, "rewards/chosen": 3.4989235401153564, "rewards/margins": 8.332114219665527, "rewards/rejected": -4.83319091796875, "step": 4780 }, { "epoch": 0.7968333333333333, "grad_norm": 34.73747634887695, "learning_rate": 2.087874704186471e-08, "logits/chosen": 2.856778144836426, "logits/rejected": 2.9834070205688477, "logps/chosen": -10.406254768371582, "logps/rejected": -62.99646759033203, "loss": 0.4286, "nll_loss": 0.34687522053718567, "rewards/accuracies": 1.0, "rewards/chosen": 1.567806601524353, "rewards/margins": 3.9751439094543457, "rewards/rejected": -2.407337188720703, "step": 4781 }, { "epoch": 0.797, "grad_norm": 11.120309829711914, "learning_rate": 2.0845748027837585e-08, "logits/chosen": 2.49106764793396, "logits/rejected": 2.3653016090393066, "logps/chosen": -143.19076538085938, "logps/rejected": -166.18507385253906, "loss": 0.5877, "nll_loss": 0.5844520926475525, "rewards/accuracies": 1.0, "rewards/chosen": 4.433291912078857, "rewards/margins": 10.063405990600586, "rewards/rejected": -5.63011360168457, "step": 4782 }, { "epoch": 0.7971666666666667, "grad_norm": 25.16899871826172, "learning_rate": 2.081277207745915e-08, "logits/chosen": 0.6256349682807922, "logits/rejected": 1.40229070186615, "logps/chosen": -68.31539916992188, "logps/rejected": -257.027587890625, "loss": 0.6767, "nll_loss": 0.6444849967956543, "rewards/accuracies": 1.0, "rewards/chosen": 0.5339187979698181, "rewards/margins": 7.09111213684082, "rewards/rejected": -6.557193279266357, "step": 4783 }, { "epoch": 0.7973333333333333, "grad_norm": 25.118473052978516, "learning_rate": 2.0779819200337824e-08, "logits/chosen": 1.7379781007766724, "logits/rejected": 2.531919240951538, "logps/chosen": -32.737030029296875, "logps/rejected": -227.62484741210938, "loss": 0.5282, "nll_loss": 0.5196353793144226, "rewards/accuracies": 1.0, "rewards/chosen": 1.937280297279358, "rewards/margins": 9.113219261169434, "rewards/rejected": -7.175938606262207, "step": 4784 }, { "epoch": 0.7975, "grad_norm": 26.869258880615234, "learning_rate": 2.0746889406075286e-08, "logits/chosen": 0.8886933922767639, "logits/rejected": 2.437997817993164, "logps/chosen": -26.486122131347656, "logps/rejected": -434.6806640625, "loss": 0.5389, "nll_loss": 0.5297224521636963, "rewards/accuracies": 1.0, "rewards/chosen": 1.7931121587753296, "rewards/margins": 11.82167911529541, "rewards/rejected": -10.02856731414795, "step": 4785 }, { "epoch": 0.7976666666666666, "grad_norm": 23.11106300354004, "learning_rate": 2.071398270426652e-08, "logits/chosen": 1.984121322631836, "logits/rejected": 2.211930990219116, "logps/chosen": -56.13066482543945, "logps/rejected": -202.39430236816406, "loss": 0.5809, "nll_loss": 0.5613066554069519, "rewards/accuracies": 1.0, "rewards/chosen": 1.0161281824111938, "rewards/margins": 9.881906509399414, "rewards/rejected": -8.865777969360352, "step": 4786 }, { "epoch": 0.7978333333333333, "grad_norm": 38.2921028137207, "learning_rate": 2.06810991044997e-08, "logits/chosen": 2.922919511795044, "logits/rejected": 3.005448818206787, "logps/chosen": -18.063430786132812, "logps/rejected": -102.03722381591797, "loss": 0.6695, "nll_loss": 0.6690159440040588, "rewards/accuracies": 1.0, "rewards/chosen": 4.985299587249756, "rewards/margins": 14.089239120483398, "rewards/rejected": -9.103939056396484, "step": 4787 }, { "epoch": 0.798, "grad_norm": 30.907873153686523, "learning_rate": 2.064823861635633e-08, "logits/chosen": 3.320920467376709, "logits/rejected": 3.58967661857605, "logps/chosen": -24.13226318359375, "logps/rejected": -85.44967651367188, "loss": 0.6365, "nll_loss": 0.5745777487754822, "rewards/accuracies": 1.0, "rewards/chosen": 1.5065762996673584, "rewards/margins": 4.307191371917725, "rewards/rejected": -2.800615072250366, "step": 4788 }, { "epoch": 0.7981666666666667, "grad_norm": 23.335912704467773, "learning_rate": 2.061540124941117e-08, "logits/chosen": 3.309964418411255, "logits/rejected": 3.037619113922119, "logps/chosen": -331.10791015625, "logps/rejected": -155.6473388671875, "loss": 1.0344, "nll_loss": 0.9943179488182068, "rewards/accuracies": 1.0, "rewards/chosen": 2.1074187755584717, "rewards/margins": 5.230356693267822, "rewards/rejected": -3.1229379177093506, "step": 4789 }, { "epoch": 0.7983333333333333, "grad_norm": 317.5543518066406, "learning_rate": 2.0582587013232266e-08, "logits/chosen": 2.6588056087493896, "logits/rejected": 2.8364975452423096, "logps/chosen": -16.111831665039062, "logps/rejected": -127.64329528808594, "loss": 2.3452, "nll_loss": 2.301689863204956, "rewards/accuracies": 1.0, "rewards/chosen": 0.2714899182319641, "rewards/margins": 5.908053398132324, "rewards/rejected": -5.636563301086426, "step": 4790 }, { "epoch": 0.7985, "grad_norm": 60.81728744506836, "learning_rate": 2.0549795917380863e-08, "logits/chosen": 3.427812099456787, "logits/rejected": 3.5247249603271484, "logps/chosen": -12.574679374694824, "logps/rejected": -57.98234558105469, "loss": 0.6897, "nll_loss": 0.6618252992630005, "rewards/accuracies": 1.0, "rewards/chosen": 1.8006069660186768, "rewards/margins": 5.569126129150391, "rewards/rejected": -3.7685189247131348, "step": 4791 }, { "epoch": 0.7986666666666666, "grad_norm": 22.170812606811523, "learning_rate": 2.0517027971411503e-08, "logits/chosen": 1.258075475692749, "logits/rejected": 1.2705744504928589, "logps/chosen": -37.55860900878906, "logps/rejected": -100.73727416992188, "loss": 0.4933, "nll_loss": 0.4754253923892975, "rewards/accuracies": 1.0, "rewards/chosen": 1.2698711156845093, "rewards/margins": 7.125733852386475, "rewards/rejected": -5.855862617492676, "step": 4792 }, { "epoch": 0.7988333333333333, "grad_norm": 224.1832275390625, "learning_rate": 2.0484283184872e-08, "logits/chosen": 2.7531723976135254, "logits/rejected": 2.8840491771698, "logps/chosen": -68.02418518066406, "logps/rejected": -24.97738265991211, "loss": 4.5773, "nll_loss": 0.8195685148239136, "rewards/accuracies": 0.0, "rewards/chosen": 2.1294121742248535, "rewards/margins": -3.0338993072509766, "rewards/rejected": 5.16331148147583, "step": 4793 }, { "epoch": 0.799, "grad_norm": 25.92862892150879, "learning_rate": 2.0451561567303376e-08, "logits/chosen": 2.7524800300598145, "logits/rejected": 2.9940972328186035, "logps/chosen": -45.63426971435547, "logps/rejected": -216.34091186523438, "loss": 0.6117, "nll_loss": 0.6004508137702942, "rewards/accuracies": 1.0, "rewards/chosen": 1.6031287908554077, "rewards/margins": 9.972769737243652, "rewards/rejected": -8.369641304016113, "step": 4794 }, { "epoch": 0.7991666666666667, "grad_norm": 31.67070770263672, "learning_rate": 2.041886312823996e-08, "logits/chosen": 2.687575578689575, "logits/rejected": 2.7051358222961426, "logps/chosen": -97.67928314208984, "logps/rejected": -136.74871826171875, "loss": 0.9859, "nll_loss": 0.9767927527427673, "rewards/accuracies": 1.0, "rewards/chosen": 1.9079856872558594, "rewards/margins": 8.668495178222656, "rewards/rejected": -6.760509014129639, "step": 4795 }, { "epoch": 0.7993333333333333, "grad_norm": 36.378334045410156, "learning_rate": 2.038618787720925e-08, "logits/chosen": 2.6896183490753174, "logits/rejected": 2.6845626831054688, "logps/chosen": -199.61439514160156, "logps/rejected": -303.0626525878906, "loss": 1.2989, "nll_loss": 1.239841103553772, "rewards/accuracies": 1.0, "rewards/chosen": 0.3710830807685852, "rewards/margins": 4.329420566558838, "rewards/rejected": -3.9583375453948975, "step": 4796 }, { "epoch": 0.7995, "grad_norm": 28.255109786987305, "learning_rate": 2.035353582373205e-08, "logits/chosen": 2.494398593902588, "logits/rejected": 2.6040732860565186, "logps/chosen": -39.629329681396484, "logps/rejected": -173.72911071777344, "loss": 0.6511, "nll_loss": 0.6290369629859924, "rewards/accuracies": 1.0, "rewards/chosen": 1.8734197616577148, "rewards/margins": 5.962970733642578, "rewards/rejected": -4.089550971984863, "step": 4797 }, { "epoch": 0.7996666666666666, "grad_norm": 45.0821418762207, "learning_rate": 2.032090697732237e-08, "logits/chosen": 2.2991745471954346, "logits/rejected": 2.1960997581481934, "logps/chosen": -82.55433654785156, "logps/rejected": -97.11213684082031, "loss": 1.4648, "nll_loss": 1.3759057521820068, "rewards/accuracies": 1.0, "rewards/chosen": 2.27646803855896, "rewards/margins": 4.3624725341796875, "rewards/rejected": -2.0860047340393066, "step": 4798 }, { "epoch": 0.7998333333333333, "grad_norm": 40.59224319458008, "learning_rate": 2.0288301347487524e-08, "logits/chosen": 2.81379771232605, "logits/rejected": 2.836174249649048, "logps/chosen": -32.755821228027344, "logps/rejected": -153.5992889404297, "loss": 0.8737, "nll_loss": 0.8619952201843262, "rewards/accuracies": 1.0, "rewards/chosen": 2.0765857696533203, "rewards/margins": 7.204871654510498, "rewards/rejected": -5.128285884857178, "step": 4799 }, { "epoch": 0.8, "grad_norm": 46.16620635986328, "learning_rate": 2.025571894372794e-08, "logits/chosen": 2.161745548248291, "logits/rejected": 2.381558895111084, "logps/chosen": -63.463829040527344, "logps/rejected": -146.5230712890625, "loss": 0.7826, "nll_loss": 0.7555216550827026, "rewards/accuracies": 1.0, "rewards/chosen": 1.8448768854141235, "rewards/margins": 5.631002902984619, "rewards/rejected": -3.786126136779785, "step": 4800 }, { "epoch": 0.8001666666666667, "grad_norm": 27.619770050048828, "learning_rate": 2.0223159775537367e-08, "logits/chosen": 1.89761221408844, "logits/rejected": 1.5414268970489502, "logps/chosen": -84.46881866455078, "logps/rejected": -65.21887969970703, "loss": 0.9985, "nll_loss": 0.9598730206489563, "rewards/accuracies": 1.0, "rewards/chosen": 2.138998508453369, "rewards/margins": 5.296109676361084, "rewards/rejected": -3.157111167907715, "step": 4801 }, { "epoch": 0.8003333333333333, "grad_norm": 51.81922912597656, "learning_rate": 2.019062385240279e-08, "logits/chosen": 2.3131296634674072, "logits/rejected": 2.4363062381744385, "logps/chosen": -13.00652027130127, "logps/rejected": -182.92337036132812, "loss": 0.5722, "nll_loss": 0.5655008554458618, "rewards/accuracies": 1.0, "rewards/chosen": 2.1568219661712646, "rewards/margins": 9.95883846282959, "rewards/rejected": -7.802016258239746, "step": 4802 }, { "epoch": 0.8005, "grad_norm": 65.85245513916016, "learning_rate": 2.0158111183804406e-08, "logits/chosen": 2.7260029315948486, "logits/rejected": 2.836430311203003, "logps/chosen": -148.63775634765625, "logps/rejected": -181.95416259765625, "loss": 1.6464, "nll_loss": 1.5013916492462158, "rewards/accuracies": 1.0, "rewards/chosen": -1.2697480916976929, "rewards/margins": 5.079204082489014, "rewards/rejected": -6.348952293395996, "step": 4803 }, { "epoch": 0.8006666666666666, "grad_norm": 25.294078826904297, "learning_rate": 2.0125621779215575e-08, "logits/chosen": 3.260733127593994, "logits/rejected": 3.134632110595703, "logps/chosen": -144.80422973632812, "logps/rejected": -180.99288940429688, "loss": 1.239, "nll_loss": 1.2168421745300293, "rewards/accuracies": 1.0, "rewards/chosen": 1.668025255203247, "rewards/margins": 5.93912410736084, "rewards/rejected": -4.271098613739014, "step": 4804 }, { "epoch": 0.8008333333333333, "grad_norm": 189.53219604492188, "learning_rate": 2.0093155648102965e-08, "logits/chosen": 2.433133363723755, "logits/rejected": 2.3969473838806152, "logps/chosen": -37.76263427734375, "logps/rejected": -19.93532943725586, "loss": 3.429, "nll_loss": 0.686593234539032, "rewards/accuracies": 0.0, "rewards/chosen": 2.4451119899749756, "rewards/margins": -1.880892038345337, "rewards/rejected": 4.3260040283203125, "step": 4805 }, { "epoch": 0.801, "grad_norm": 23.169767379760742, "learning_rate": 2.0060712799926405e-08, "logits/chosen": 1.8306760787963867, "logits/rejected": 1.2941292524337769, "logps/chosen": -95.06689453125, "logps/rejected": -70.8257064819336, "loss": 0.9205, "nll_loss": 0.9053988456726074, "rewards/accuracies": 1.0, "rewards/chosen": 2.333517551422119, "rewards/margins": 6.714477062225342, "rewards/rejected": -4.380959510803223, "step": 4806 }, { "epoch": 0.8011666666666667, "grad_norm": 30.130266189575195, "learning_rate": 2.0028293244139016e-08, "logits/chosen": 1.8376260995864868, "logits/rejected": 2.456186294555664, "logps/chosen": -15.21023941040039, "logps/rejected": -45.98255157470703, "loss": 0.4834, "nll_loss": 0.44736000895500183, "rewards/accuracies": 1.0, "rewards/chosen": 0.9577226638793945, "rewards/margins": 5.122528553009033, "rewards/rejected": -4.164805889129639, "step": 4807 }, { "epoch": 0.8013333333333333, "grad_norm": 37.31787872314453, "learning_rate": 1.9995896990187e-08, "logits/chosen": 3.0886447429656982, "logits/rejected": 3.0708088874816895, "logps/chosen": -82.21575164794922, "logps/rejected": -64.107421875, "loss": 1.6886, "nll_loss": 1.6778720617294312, "rewards/accuracies": 1.0, "rewards/chosen": 5.276031017303467, "rewards/margins": 9.474063873291016, "rewards/rejected": -4.198033332824707, "step": 4808 }, { "epoch": 0.8015, "grad_norm": 30.47842025756836, "learning_rate": 1.9963524047509896e-08, "logits/chosen": 1.893225908279419, "logits/rejected": 2.009143352508545, "logps/chosen": -47.26509475708008, "logps/rejected": -124.09772491455078, "loss": 0.7243, "nll_loss": 0.716137707233429, "rewards/accuracies": 1.0, "rewards/chosen": 1.9872890710830688, "rewards/margins": 9.171082496643066, "rewards/rejected": -7.183793067932129, "step": 4809 }, { "epoch": 0.8016666666666666, "grad_norm": 34.70720672607422, "learning_rate": 1.9931174425540387e-08, "logits/chosen": 2.337425470352173, "logits/rejected": 2.357081174850464, "logps/chosen": -61.4625244140625, "logps/rejected": -294.4425964355469, "loss": 1.069, "nll_loss": 1.0596988201141357, "rewards/accuracies": 1.0, "rewards/chosen": 1.8327233791351318, "rewards/margins": 9.34405517578125, "rewards/rejected": -7.511331558227539, "step": 4810 }, { "epoch": 0.8018333333333333, "grad_norm": 39.10330581665039, "learning_rate": 1.9898848133704415e-08, "logits/chosen": 2.022052049636841, "logits/rejected": 1.8203275203704834, "logps/chosen": -23.42426109313965, "logps/rejected": -70.53306579589844, "loss": 0.7278, "nll_loss": 0.7098261117935181, "rewards/accuracies": 1.0, "rewards/chosen": 2.2612059116363525, "rewards/margins": 6.415195465087891, "rewards/rejected": -4.153989315032959, "step": 4811 }, { "epoch": 0.802, "grad_norm": 58.590232849121094, "learning_rate": 1.9866545181421013e-08, "logits/chosen": 1.0849151611328125, "logits/rejected": 2.2198171615600586, "logps/chosen": -30.815101623535156, "logps/rejected": -305.15106201171875, "loss": 1.1979, "nll_loss": 1.1851961612701416, "rewards/accuracies": 1.0, "rewards/chosen": 2.3845696449279785, "rewards/margins": 7.013802528381348, "rewards/rejected": -4.629232883453369, "step": 4812 }, { "epoch": 0.8021666666666667, "grad_norm": 23.744159698486328, "learning_rate": 1.9834265578102537e-08, "logits/chosen": 1.0204546451568604, "logits/rejected": 1.673994541168213, "logps/chosen": -44.98811340332031, "logps/rejected": -252.33889770507812, "loss": 0.5214, "nll_loss": 0.49986785650253296, "rewards/accuracies": 1.0, "rewards/chosen": 2.240903615951538, "rewards/margins": 6.138129711151123, "rewards/rejected": -3.897226095199585, "step": 4813 }, { "epoch": 0.8023333333333333, "grad_norm": 26.735416412353516, "learning_rate": 1.9802009333154467e-08, "logits/chosen": 1.8409147262573242, "logits/rejected": 2.0035452842712402, "logps/chosen": -111.5461196899414, "logps/rejected": -240.37954711914062, "loss": 1.0488, "nll_loss": 1.0424869060516357, "rewards/accuracies": 1.0, "rewards/chosen": 2.338735342025757, "rewards/margins": 9.094817161560059, "rewards/rejected": -6.756082057952881, "step": 4814 }, { "epoch": 0.8025, "grad_norm": 27.233545303344727, "learning_rate": 1.9769776455975516e-08, "logits/chosen": 0.9258133172988892, "logits/rejected": 1.5558370351791382, "logps/chosen": -103.69255065917969, "logps/rejected": -393.558837890625, "loss": 0.9589, "nll_loss": 0.9341670274734497, "rewards/accuracies": 1.0, "rewards/chosen": 0.7731162905693054, "rewards/margins": 9.03872013092041, "rewards/rejected": -8.265604019165039, "step": 4815 }, { "epoch": 0.8026666666666666, "grad_norm": 27.436079025268555, "learning_rate": 1.9737566955957584e-08, "logits/chosen": 2.889634847640991, "logits/rejected": 3.168369770050049, "logps/chosen": -69.12259674072266, "logps/rejected": -228.8426055908203, "loss": 0.8903, "nll_loss": 0.8861871361732483, "rewards/accuracies": 1.0, "rewards/chosen": 2.8129937648773193, "rewards/margins": 9.668213844299316, "rewards/rejected": -6.855220317840576, "step": 4816 }, { "epoch": 0.8028333333333333, "grad_norm": 24.88592529296875, "learning_rate": 1.9705380842485696e-08, "logits/chosen": 2.6980178356170654, "logits/rejected": 2.7301206588745117, "logps/chosen": -59.55662155151367, "logps/rejected": -195.34765625, "loss": 0.6987, "nll_loss": 0.6925188899040222, "rewards/accuracies": 1.0, "rewards/chosen": 2.3023929595947266, "rewards/margins": 9.392337799072266, "rewards/rejected": -7.089944362640381, "step": 4817 }, { "epoch": 0.803, "grad_norm": 36.295684814453125, "learning_rate": 1.9673218124938128e-08, "logits/chosen": 2.534501314163208, "logits/rejected": 2.480098247528076, "logps/chosen": -68.6703109741211, "logps/rejected": -49.909393310546875, "loss": 0.9892, "nll_loss": 0.9406893253326416, "rewards/accuracies": 1.0, "rewards/chosen": 1.0582183599472046, "rewards/margins": 4.518795967102051, "rewards/rejected": -3.4605774879455566, "step": 4818 }, { "epoch": 0.8031666666666667, "grad_norm": 23.634572982788086, "learning_rate": 1.9641078812686372e-08, "logits/chosen": 2.3932220935821533, "logits/rejected": 2.43727970123291, "logps/chosen": -87.96369934082031, "logps/rejected": -208.87445068359375, "loss": 0.8618, "nll_loss": 0.8540164828300476, "rewards/accuracies": 1.0, "rewards/chosen": 1.9676010608673096, "rewards/margins": 10.825227737426758, "rewards/rejected": -8.857626914978027, "step": 4819 }, { "epoch": 0.8033333333333333, "grad_norm": 57.46195983886719, "learning_rate": 1.9608962915094994e-08, "logits/chosen": 0.993298351764679, "logits/rejected": 0.8642166256904602, "logps/chosen": -106.46969604492188, "logps/rejected": -65.12065887451172, "loss": 0.9021, "nll_loss": 0.6696207523345947, "rewards/accuracies": 1.0, "rewards/chosen": 0.6945526003837585, "rewards/margins": 2.1660964488983154, "rewards/rejected": -1.4715439081192017, "step": 4820 }, { "epoch": 0.8035, "grad_norm": 23.62620735168457, "learning_rate": 1.9576870441521832e-08, "logits/chosen": 1.3759377002716064, "logits/rejected": 1.9838616847991943, "logps/chosen": -54.910133361816406, "logps/rejected": -252.5626983642578, "loss": 0.635, "nll_loss": 0.6169677376747131, "rewards/accuracies": 1.0, "rewards/chosen": 1.29705810546875, "rewards/margins": 6.94598388671875, "rewards/rejected": -5.64892578125, "step": 4821 }, { "epoch": 0.8036666666666666, "grad_norm": 27.561962127685547, "learning_rate": 1.9544801401317835e-08, "logits/chosen": 1.7376965284347534, "logits/rejected": 1.8106839656829834, "logps/chosen": -82.3014907836914, "logps/rejected": -168.9994354248047, "loss": 0.9258, "nll_loss": 0.9144611954689026, "rewards/accuracies": 1.0, "rewards/chosen": 1.7912483215332031, "rewards/margins": 7.750722885131836, "rewards/rejected": -5.959474563598633, "step": 4822 }, { "epoch": 0.8038333333333333, "grad_norm": 38.6366081237793, "learning_rate": 1.9512755803827198e-08, "logits/chosen": 1.9320807456970215, "logits/rejected": 2.349351167678833, "logps/chosen": -16.27164649963379, "logps/rejected": -273.951904296875, "loss": 0.5436, "nll_loss": 0.5423881411552429, "rewards/accuracies": 1.0, "rewards/chosen": 4.2086310386657715, "rewards/margins": 11.726243019104004, "rewards/rejected": -7.517611980438232, "step": 4823 }, { "epoch": 0.804, "grad_norm": 33.94535446166992, "learning_rate": 1.948073365838717e-08, "logits/chosen": 1.82023024559021, "logits/rejected": 1.8536605834960938, "logps/chosen": -34.2325439453125, "logps/rejected": -47.2110595703125, "loss": 0.6901, "nll_loss": 0.5433736443519592, "rewards/accuracies": 1.0, "rewards/chosen": 1.3456547260284424, "rewards/margins": 3.0869972705841064, "rewards/rejected": -1.741342544555664, "step": 4824 }, { "epoch": 0.8041666666666667, "grad_norm": 103.13861846923828, "learning_rate": 1.9448734974328283e-08, "logits/chosen": 3.02858567237854, "logits/rejected": 3.0246617794036865, "logps/chosen": -77.77789306640625, "logps/rejected": -90.02581787109375, "loss": 1.3247, "nll_loss": 0.9722237586975098, "rewards/accuracies": 1.0, "rewards/chosen": -0.5832168459892273, "rewards/margins": 1.3353424072265625, "rewards/rejected": -1.9185593128204346, "step": 4825 }, { "epoch": 0.8043333333333333, "grad_norm": 34.794349670410156, "learning_rate": 1.9416759760974165e-08, "logits/chosen": 2.053204298019409, "logits/rejected": 2.233696460723877, "logps/chosen": -46.859134674072266, "logps/rejected": -190.2715301513672, "loss": 0.7167, "nll_loss": 0.6791179180145264, "rewards/accuracies": 1.0, "rewards/chosen": 0.6475403308868408, "rewards/margins": 5.3461408615112305, "rewards/rejected": -4.698600769042969, "step": 4826 }, { "epoch": 0.8045, "grad_norm": 16.041318893432617, "learning_rate": 1.9384808027641662e-08, "logits/chosen": 1.9276902675628662, "logits/rejected": 2.082301378250122, "logps/chosen": -45.55512237548828, "logps/rejected": -299.90948486328125, "loss": 0.4252, "nll_loss": 0.4141375720500946, "rewards/accuracies": 1.0, "rewards/chosen": 1.610116958618164, "rewards/margins": 10.140634536743164, "rewards/rejected": -8.530517578125, "step": 4827 }, { "epoch": 0.8046666666666666, "grad_norm": 29.72763442993164, "learning_rate": 1.9352879783640696e-08, "logits/chosen": 1.9292844533920288, "logits/rejected": 2.3542728424072266, "logps/chosen": -14.845025062561035, "logps/rejected": -45.52184295654297, "loss": 0.4725, "nll_loss": 0.4366183280944824, "rewards/accuracies": 1.0, "rewards/chosen": 0.9942441582679749, "rewards/margins": 5.112979412078857, "rewards/rejected": -4.118735313415527, "step": 4828 }, { "epoch": 0.8048333333333333, "grad_norm": 23.691102981567383, "learning_rate": 1.93209750382744e-08, "logits/chosen": 1.8638057708740234, "logits/rejected": 1.9650275707244873, "logps/chosen": -91.00848388671875, "logps/rejected": -93.99858856201172, "loss": 0.8895, "nll_loss": 0.8835775852203369, "rewards/accuracies": 1.0, "rewards/chosen": 2.2912118434906006, "rewards/margins": 10.20975399017334, "rewards/rejected": -7.91854190826416, "step": 4829 }, { "epoch": 0.805, "grad_norm": 19.356685638427734, "learning_rate": 1.9289093800839064e-08, "logits/chosen": 1.6445201635360718, "logits/rejected": 1.6392635107040405, "logps/chosen": -63.54616928100586, "logps/rejected": -102.56732177734375, "loss": 0.6807, "nll_loss": 0.6760230660438538, "rewards/accuracies": 1.0, "rewards/chosen": 2.9305670261383057, "rewards/margins": 8.998160362243652, "rewards/rejected": -6.067593097686768, "step": 4830 }, { "epoch": 0.8051666666666667, "grad_norm": 33.75218200683594, "learning_rate": 1.9257236080624107e-08, "logits/chosen": 2.6934611797332764, "logits/rejected": 2.6463122367858887, "logps/chosen": -35.1800422668457, "logps/rejected": -43.48168182373047, "loss": 0.7698, "nll_loss": 0.7329176068305969, "rewards/accuracies": 1.0, "rewards/chosen": 5.348264694213867, "rewards/margins": 8.227964401245117, "rewards/rejected": -2.879699468612671, "step": 4831 }, { "epoch": 0.8053333333333333, "grad_norm": 39.43833923339844, "learning_rate": 1.922540188691214e-08, "logits/chosen": 2.3919379711151123, "logits/rejected": 2.478621006011963, "logps/chosen": -12.951112747192383, "logps/rejected": -281.9983215332031, "loss": 0.2903, "nll_loss": 0.287802517414093, "rewards/accuracies": 1.0, "rewards/chosen": 3.1038222312927246, "rewards/margins": 13.589473724365234, "rewards/rejected": -10.485651016235352, "step": 4832 }, { "epoch": 0.8055, "grad_norm": 114.48636627197266, "learning_rate": 1.9193591228978812e-08, "logits/chosen": 2.331158399581909, "logits/rejected": 2.308716297149658, "logps/chosen": -107.85153198242188, "logps/rejected": -32.61237335205078, "loss": 1.9521, "nll_loss": 1.185181736946106, "rewards/accuracies": 1.0, "rewards/chosen": 1.7383179664611816, "rewards/margins": 0.8047230243682861, "rewards/rejected": 0.9335949420928955, "step": 4833 }, { "epoch": 0.8056666666666666, "grad_norm": 26.393184661865234, "learning_rate": 1.916180411609304e-08, "logits/chosen": 4.250010013580322, "logits/rejected": 4.416243076324463, "logps/chosen": -27.800615310668945, "logps/rejected": -157.38742065429688, "loss": 0.5152, "nll_loss": 0.5054657459259033, "rewards/accuracies": 1.0, "rewards/chosen": 1.993316888809204, "rewards/margins": 7.894656181335449, "rewards/rejected": -5.901339054107666, "step": 4834 }, { "epoch": 0.8058333333333333, "grad_norm": 62.451141357421875, "learning_rate": 1.913004055751679e-08, "logits/chosen": 2.537221908569336, "logits/rejected": 2.492692470550537, "logps/chosen": -39.09032440185547, "logps/rejected": -50.723106384277344, "loss": 1.3923, "nll_loss": 1.3479422330856323, "rewards/accuracies": 1.0, "rewards/chosen": 1.1975983381271362, "rewards/margins": 4.68280553817749, "rewards/rejected": -3.4852073192596436, "step": 4835 }, { "epoch": 0.806, "grad_norm": 32.3419189453125, "learning_rate": 1.9098300562505266e-08, "logits/chosen": 2.072676658630371, "logits/rejected": 2.3107500076293945, "logps/chosen": -51.34946823120117, "logps/rejected": -57.14805221557617, "loss": 0.8271, "nll_loss": 0.7899918556213379, "rewards/accuracies": 1.0, "rewards/chosen": 0.7330921292304993, "rewards/margins": 5.253519535064697, "rewards/rejected": -4.520427227020264, "step": 4836 }, { "epoch": 0.8061666666666667, "grad_norm": 41.29873275756836, "learning_rate": 1.906658414030665e-08, "logits/chosen": 1.7451428174972534, "logits/rejected": 1.5913965702056885, "logps/chosen": -115.60049438476562, "logps/rejected": -112.87979125976562, "loss": 1.2729, "nll_loss": 1.1676818132400513, "rewards/accuracies": 1.0, "rewards/chosen": 0.6326126456260681, "rewards/margins": 3.239647150039673, "rewards/rejected": -2.60703444480896, "step": 4837 }, { "epoch": 0.8063333333333333, "grad_norm": 27.27815818786621, "learning_rate": 1.9034891300162403e-08, "logits/chosen": 1.4410741329193115, "logits/rejected": 2.3518829345703125, "logps/chosen": -20.251920700073242, "logps/rejected": -183.1431121826172, "loss": 0.4381, "nll_loss": 0.4308919310569763, "rewards/accuracies": 1.0, "rewards/chosen": 2.6091647148132324, "rewards/margins": 8.115323066711426, "rewards/rejected": -5.506158351898193, "step": 4838 }, { "epoch": 0.8065, "grad_norm": 36.19757080078125, "learning_rate": 1.9003222051307045e-08, "logits/chosen": 1.552749514579773, "logits/rejected": 2.1031250953674316, "logps/chosen": -49.31869125366211, "logps/rejected": -256.48828125, "loss": 1.0278, "nll_loss": 1.0274726152420044, "rewards/accuracies": 1.0, "rewards/chosen": 5.415932655334473, "rewards/margins": 15.226634979248047, "rewards/rejected": -9.810702323913574, "step": 4839 }, { "epoch": 0.8066666666666666, "grad_norm": 30.09048080444336, "learning_rate": 1.8971576402968248e-08, "logits/chosen": 1.6886335611343384, "logits/rejected": 1.6563054323196411, "logps/chosen": -76.22759246826172, "logps/rejected": -105.34370422363281, "loss": 1.0858, "nll_loss": 1.0587166547775269, "rewards/accuracies": 1.0, "rewards/chosen": 2.027646780014038, "rewards/margins": 5.707095146179199, "rewards/rejected": -3.679448127746582, "step": 4840 }, { "epoch": 0.8068333333333333, "grad_norm": 32.93644714355469, "learning_rate": 1.893995436436676e-08, "logits/chosen": 2.062978744506836, "logits/rejected": 2.127336263656616, "logps/chosen": -48.89349365234375, "logps/rejected": -92.9312744140625, "loss": 0.7013, "nll_loss": 0.6433354020118713, "rewards/accuracies": 1.0, "rewards/chosen": 1.949388861656189, "rewards/margins": 4.657984733581543, "rewards/rejected": -2.7085959911346436, "step": 4841 }, { "epoch": 0.807, "grad_norm": 21.023147583007812, "learning_rate": 1.8908355944716513e-08, "logits/chosen": 2.3576459884643555, "logits/rejected": 3.0061194896698, "logps/chosen": -34.03557586669922, "logps/rejected": -355.0627746582031, "loss": 0.4969, "nll_loss": 0.4932692050933838, "rewards/accuracies": 1.0, "rewards/chosen": 2.9812684059143066, "rewards/margins": 9.820877075195312, "rewards/rejected": -6.839608669281006, "step": 4842 }, { "epoch": 0.8071666666666667, "grad_norm": 20.523937225341797, "learning_rate": 1.887678115322451e-08, "logits/chosen": 2.6066677570343018, "logits/rejected": 2.6302690505981445, "logps/chosen": -113.98772430419922, "logps/rejected": -72.35072326660156, "loss": 0.9308, "nll_loss": 0.9267293810844421, "rewards/accuracies": 1.0, "rewards/chosen": 3.1326255798339844, "rewards/margins": 9.22481918334961, "rewards/rejected": -6.092193603515625, "step": 4843 }, { "epoch": 0.8073333333333333, "grad_norm": 45.45710372924805, "learning_rate": 1.8845229999090927e-08, "logits/chosen": 1.6546462774276733, "logits/rejected": 1.7871534824371338, "logps/chosen": -6.843505859375, "logps/rejected": -123.45011901855469, "loss": 0.3609, "nll_loss": 0.3258812427520752, "rewards/accuracies": 1.0, "rewards/chosen": 1.212211012840271, "rewards/margins": 5.099547863006592, "rewards/rejected": -3.8873367309570312, "step": 4844 }, { "epoch": 0.8075, "grad_norm": 32.77421188354492, "learning_rate": 1.8813702491508953e-08, "logits/chosen": 0.36607763171195984, "logits/rejected": 1.7385988235473633, "logps/chosen": -40.63024139404297, "logps/rejected": -375.2928466796875, "loss": 0.891, "nll_loss": 0.8644731640815735, "rewards/accuracies": 1.0, "rewards/chosen": 0.6908180713653564, "rewards/margins": 9.55494499206543, "rewards/rejected": -8.864127159118652, "step": 4845 }, { "epoch": 0.8076666666666666, "grad_norm": 30.206987380981445, "learning_rate": 1.878219863966498e-08, "logits/chosen": 2.3811097145080566, "logits/rejected": 2.580472946166992, "logps/chosen": -22.8780460357666, "logps/rejected": -337.8580322265625, "loss": 0.5374, "nll_loss": 0.5199555158615112, "rewards/accuracies": 1.0, "rewards/chosen": 1.1575193405151367, "rewards/margins": 8.568645477294922, "rewards/rejected": -7.411126613616943, "step": 4846 }, { "epoch": 0.8078333333333333, "grad_norm": 24.113365173339844, "learning_rate": 1.8750718452738478e-08, "logits/chosen": 3.1343071460723877, "logits/rejected": 3.1946821212768555, "logps/chosen": -43.984352111816406, "logps/rejected": -230.95321655273438, "loss": 0.5897, "nll_loss": 0.5787414908409119, "rewards/accuracies": 1.0, "rewards/chosen": 1.612498164176941, "rewards/margins": 11.14684772491455, "rewards/rejected": -9.53434944152832, "step": 4847 }, { "epoch": 0.808, "grad_norm": 14.999411582946777, "learning_rate": 1.871926193990202e-08, "logits/chosen": 2.4338090419769287, "logits/rejected": 2.643441677093506, "logps/chosen": -118.39083099365234, "logps/rejected": -318.8481750488281, "loss": 0.6374, "nll_loss": 0.629738450050354, "rewards/accuracies": 1.0, "rewards/chosen": 3.621814727783203, "rewards/margins": 8.391746520996094, "rewards/rejected": -4.769931316375732, "step": 4848 }, { "epoch": 0.8081666666666667, "grad_norm": 28.83751106262207, "learning_rate": 1.868782911032125e-08, "logits/chosen": 2.9123971462249756, "logits/rejected": 3.043565034866333, "logps/chosen": -47.314632415771484, "logps/rejected": -155.80331420898438, "loss": 0.6773, "nll_loss": 0.6393869519233704, "rewards/accuracies": 1.0, "rewards/chosen": 1.4108209609985352, "rewards/margins": 4.978549003601074, "rewards/rejected": -3.56772780418396, "step": 4849 }, { "epoch": 0.8083333333333333, "grad_norm": 28.01056480407715, "learning_rate": 1.8656419973154958e-08, "logits/chosen": 2.6122817993164062, "logits/rejected": 2.988283395767212, "logps/chosen": -29.658374786376953, "logps/rejected": -59.24508285522461, "loss": 0.4466, "nll_loss": 0.42983150482177734, "rewards/accuracies": 1.0, "rewards/chosen": 1.8216352462768555, "rewards/margins": 6.475051403045654, "rewards/rejected": -4.653416156768799, "step": 4850 }, { "epoch": 0.8085, "grad_norm": 18.706085205078125, "learning_rate": 1.8625034537555018e-08, "logits/chosen": 1.4182980060577393, "logits/rejected": 1.6463899612426758, "logps/chosen": -105.87771606445312, "logps/rejected": -131.72412109375, "loss": 0.8032, "nll_loss": 0.7960730195045471, "rewards/accuracies": 1.0, "rewards/chosen": 2.4192001819610596, "rewards/margins": 8.289803504943848, "rewards/rejected": -5.870603561401367, "step": 4851 }, { "epoch": 0.8086666666666666, "grad_norm": 41.409786224365234, "learning_rate": 1.8593672812666384e-08, "logits/chosen": 2.224768877029419, "logits/rejected": 2.43575119972229, "logps/chosen": -61.057594299316406, "logps/rejected": -124.82240295410156, "loss": 1.2403, "nll_loss": 1.1972076892852783, "rewards/accuracies": 1.0, "rewards/chosen": 0.4915611445903778, "rewards/margins": 5.137060642242432, "rewards/rejected": -4.6454997062683105, "step": 4852 }, { "epoch": 0.8088333333333333, "grad_norm": 39.70250701904297, "learning_rate": 1.8562334807627143e-08, "logits/chosen": 2.241225004196167, "logits/rejected": 2.739496946334839, "logps/chosen": -30.220317840576172, "logps/rejected": -284.33013916015625, "loss": 0.6042, "nll_loss": 0.5811598896980286, "rewards/accuracies": 1.0, "rewards/chosen": 0.8426617383956909, "rewards/margins": 9.124781608581543, "rewards/rejected": -8.282119750976562, "step": 4853 }, { "epoch": 0.809, "grad_norm": 62.1616325378418, "learning_rate": 1.8531020531568376e-08, "logits/chosen": 2.4959096908569336, "logits/rejected": 2.6245369911193848, "logps/chosen": -6.6101603507995605, "logps/rejected": -207.77609252929688, "loss": 0.5178, "nll_loss": 0.5084739327430725, "rewards/accuracies": 1.0, "rewards/chosen": 1.9656723737716675, "rewards/margins": 8.184992790222168, "rewards/rejected": -6.219320774078369, "step": 4854 }, { "epoch": 0.8091666666666667, "grad_norm": 23.2716064453125, "learning_rate": 1.8499729993614345e-08, "logits/chosen": 2.096247434616089, "logits/rejected": 2.066819667816162, "logps/chosen": -144.48117065429688, "logps/rejected": -124.05945587158203, "loss": 0.8216, "nll_loss": 0.7809793949127197, "rewards/accuracies": 1.0, "rewards/chosen": 2.3515594005584717, "rewards/margins": 5.384998321533203, "rewards/rejected": -3.0334389209747314, "step": 4855 }, { "epoch": 0.8093333333333333, "grad_norm": 16.421171188354492, "learning_rate": 1.8468463202882356e-08, "logits/chosen": 2.6463069915771484, "logits/rejected": 2.720088005065918, "logps/chosen": -83.18610382080078, "logps/rejected": -199.85650634765625, "loss": 0.6009, "nll_loss": 0.5984610915184021, "rewards/accuracies": 1.0, "rewards/chosen": 3.1634111404418945, "rewards/margins": 12.269848823547363, "rewards/rejected": -9.106437683105469, "step": 4856 }, { "epoch": 0.8095, "grad_norm": 31.687074661254883, "learning_rate": 1.8437220168482836e-08, "logits/chosen": 2.0126490592956543, "logits/rejected": 2.454416275024414, "logps/chosen": -17.311607360839844, "logps/rejected": -522.5521850585938, "loss": 0.643, "nll_loss": 0.6411707401275635, "rewards/accuracies": 1.0, "rewards/chosen": 3.494318723678589, "rewards/margins": 12.41624927520752, "rewards/rejected": -8.921930313110352, "step": 4857 }, { "epoch": 0.8096666666666666, "grad_norm": 30.885934829711914, "learning_rate": 1.8406000899519204e-08, "logits/chosen": 2.5662591457366943, "logits/rejected": 2.7097935676574707, "logps/chosen": -29.549041748046875, "logps/rejected": -341.17620849609375, "loss": 0.7089, "nll_loss": 0.7035486102104187, "rewards/accuracies": 1.0, "rewards/chosen": 2.3883144855499268, "rewards/margins": 10.4688138961792, "rewards/rejected": -8.080499649047852, "step": 4858 }, { "epoch": 0.8098333333333333, "grad_norm": 200.97450256347656, "learning_rate": 1.8374805405088013e-08, "logits/chosen": 2.6181840896606445, "logits/rejected": 2.5382931232452393, "logps/chosen": -77.86261749267578, "logps/rejected": -29.0900821685791, "loss": 3.463, "nll_loss": 0.778626024723053, "rewards/accuracies": 0.0, "rewards/chosen": 2.732624053955078, "rewards/margins": -1.7512335777282715, "rewards/rejected": 4.48385763168335, "step": 4859 }, { "epoch": 0.81, "grad_norm": 86.77630615234375, "learning_rate": 1.8343633694278894e-08, "logits/chosen": 1.5838655233383179, "logits/rejected": 2.2537739276885986, "logps/chosen": -126.5638427734375, "logps/rejected": -173.67453002929688, "loss": 1.9486, "nll_loss": 1.808054804801941, "rewards/accuracies": 1.0, "rewards/chosen": -1.0552047491073608, "rewards/margins": 3.832899570465088, "rewards/rejected": -4.888104438781738, "step": 4860 }, { "epoch": 0.8101666666666667, "grad_norm": 27.281421661376953, "learning_rate": 1.831248577617457e-08, "logits/chosen": 2.694498062133789, "logits/rejected": 2.780383348464966, "logps/chosen": -67.79651641845703, "logps/rejected": -229.1591796875, "loss": 0.9779, "nll_loss": 0.968521773815155, "rewards/accuracies": 1.0, "rewards/chosen": 1.826280951499939, "rewards/margins": 9.200347900390625, "rewards/rejected": -7.3740668296813965, "step": 4861 }, { "epoch": 0.8103333333333333, "grad_norm": 21.92300796508789, "learning_rate": 1.8281361659850726e-08, "logits/chosen": 0.26127898693084717, "logits/rejected": 1.1372699737548828, "logps/chosen": -64.9176025390625, "logps/rejected": -390.3795166015625, "loss": 0.7317, "nll_loss": 0.7213066220283508, "rewards/accuracies": 1.0, "rewards/chosen": 1.661669135093689, "rewards/margins": 11.380772590637207, "rewards/rejected": -9.719103813171387, "step": 4862 }, { "epoch": 0.8105, "grad_norm": 25.237945556640625, "learning_rate": 1.8250261354376217e-08, "logits/chosen": 1.8021719455718994, "logits/rejected": 2.4306344985961914, "logps/chosen": -95.79499816894531, "logps/rejected": -454.10443115234375, "loss": 0.992, "nll_loss": 0.9774999618530273, "rewards/accuracies": 1.0, "rewards/chosen": 1.31182861328125, "rewards/margins": 12.708551406860352, "rewards/rejected": -11.396722793579102, "step": 4863 }, { "epoch": 0.8106666666666666, "grad_norm": 26.40884017944336, "learning_rate": 1.8219184868812932e-08, "logits/chosen": 1.2537670135498047, "logits/rejected": 1.8329442739486694, "logps/chosen": -58.1053466796875, "logps/rejected": -351.86334228515625, "loss": 0.883, "nll_loss": 0.8803839087486267, "rewards/accuracies": 1.0, "rewards/chosen": 3.0438456535339355, "rewards/margins": 14.586631774902344, "rewards/rejected": -11.54278564453125, "step": 4864 }, { "epoch": 0.8108333333333333, "grad_norm": 70.20201873779297, "learning_rate": 1.8188132212215833e-08, "logits/chosen": 2.441809892654419, "logits/rejected": 2.64737868309021, "logps/chosen": -46.62812805175781, "logps/rejected": -391.2693786621094, "loss": 1.671, "nll_loss": 1.6652902364730835, "rewards/accuracies": 1.0, "rewards/chosen": 2.314014434814453, "rewards/margins": 10.296586036682129, "rewards/rejected": -7.982571601867676, "step": 4865 }, { "epoch": 0.811, "grad_norm": 29.48185157775879, "learning_rate": 1.8157103393632868e-08, "logits/chosen": 2.568912982940674, "logits/rejected": 2.87362003326416, "logps/chosen": -67.24201965332031, "logps/rejected": -175.0802764892578, "loss": 0.9925, "nll_loss": 0.9745221138000488, "rewards/accuracies": 1.0, "rewards/chosen": 2.0747926235198975, "rewards/margins": 6.353363037109375, "rewards/rejected": -4.278570652008057, "step": 4866 }, { "epoch": 0.8111666666666667, "grad_norm": 30.292173385620117, "learning_rate": 1.8126098422105106e-08, "logits/chosen": 1.4383097887039185, "logits/rejected": 2.1182754039764404, "logps/chosen": -118.54841613769531, "logps/rejected": -220.17449951171875, "loss": 1.1471, "nll_loss": 1.1398885250091553, "rewards/accuracies": 1.0, "rewards/chosen": 2.038710117340088, "rewards/margins": 11.828191757202148, "rewards/rejected": -9.789481163024902, "step": 4867 }, { "epoch": 0.8113333333333334, "grad_norm": 28.869565963745117, "learning_rate": 1.8095117306666662e-08, "logits/chosen": 1.0833585262298584, "logits/rejected": 2.5044968128204346, "logps/chosen": -85.98563385009766, "logps/rejected": -364.7947082519531, "loss": 0.9823, "nll_loss": 0.9661307334899902, "rewards/accuracies": 1.0, "rewards/chosen": 1.199414849281311, "rewards/margins": 11.06922721862793, "rewards/rejected": -9.86981201171875, "step": 4868 }, { "epoch": 0.8115, "grad_norm": 23.2918643951416, "learning_rate": 1.806416005634471e-08, "logits/chosen": 2.521885633468628, "logits/rejected": 2.430864095687866, "logps/chosen": -15.045699119567871, "logps/rejected": -108.97586059570312, "loss": 0.3359, "nll_loss": 0.32012125849723816, "rewards/accuracies": 1.0, "rewards/chosen": 1.8173134326934814, "rewards/margins": 6.610257148742676, "rewards/rejected": -4.792943477630615, "step": 4869 }, { "epoch": 0.8116666666666666, "grad_norm": 67.62337493896484, "learning_rate": 1.803322668015941e-08, "logits/chosen": 2.744396209716797, "logits/rejected": 2.809185028076172, "logps/chosen": -68.24398803710938, "logps/rejected": -52.92658233642578, "loss": 1.237, "nll_loss": 0.8028702735900879, "rewards/accuracies": 1.0, "rewards/chosen": 2.3577637672424316, "rewards/margins": 2.1370859146118164, "rewards/rejected": 0.22067797183990479, "step": 4870 }, { "epoch": 0.8118333333333333, "grad_norm": 44.34667205810547, "learning_rate": 1.8002317187124017e-08, "logits/chosen": 2.56740403175354, "logits/rejected": 2.5186402797698975, "logps/chosen": -11.793784141540527, "logps/rejected": -79.2164306640625, "loss": 0.5304, "nll_loss": 0.5127731561660767, "rewards/accuracies": 1.0, "rewards/chosen": 2.1227879524230957, "rewards/margins": 6.398262977600098, "rewards/rejected": -4.275475025177002, "step": 4871 }, { "epoch": 0.812, "grad_norm": 26.27899169921875, "learning_rate": 1.797143158624481e-08, "logits/chosen": 1.6163091659545898, "logits/rejected": 2.1198298931121826, "logps/chosen": -101.73165893554688, "logps/rejected": -274.3910217285156, "loss": 1.0449, "nll_loss": 1.0380780696868896, "rewards/accuracies": 1.0, "rewards/chosen": 2.119187116622925, "rewards/margins": 10.137214660644531, "rewards/rejected": -8.018027305603027, "step": 4872 }, { "epoch": 0.8121666666666667, "grad_norm": 27.32356834411621, "learning_rate": 1.794056988652113e-08, "logits/chosen": 1.8254015445709229, "logits/rejected": 1.9646549224853516, "logps/chosen": -81.86506652832031, "logps/rejected": -169.4589080810547, "loss": 0.9204, "nll_loss": 0.9096118211746216, "rewards/accuracies": 1.0, "rewards/chosen": 1.8348907232284546, "rewards/margins": 7.840312480926514, "rewards/rejected": -6.0054216384887695, "step": 4873 }, { "epoch": 0.8123333333333334, "grad_norm": 133.2279510498047, "learning_rate": 1.7909732096945353e-08, "logits/chosen": 1.7696912288665771, "logits/rejected": 1.8435739278793335, "logps/chosen": -19.682010650634766, "logps/rejected": -26.737560272216797, "loss": 1.689, "nll_loss": 0.3578547537326813, "rewards/accuracies": 0.0, "rewards/chosen": 2.0648891925811768, "rewards/margins": -0.15537524223327637, "rewards/rejected": 2.220264434814453, "step": 4874 }, { "epoch": 0.8125, "grad_norm": 23.84518814086914, "learning_rate": 1.7878918226502815e-08, "logits/chosen": 2.4049408435821533, "logits/rejected": 2.6535751819610596, "logps/chosen": -79.42657470703125, "logps/rejected": -382.04864501953125, "loss": 0.9618, "nll_loss": 0.9569467902183533, "rewards/accuracies": 1.0, "rewards/chosen": 2.509068250656128, "rewards/margins": 10.220757484436035, "rewards/rejected": -7.711688995361328, "step": 4875 }, { "epoch": 0.8126666666666666, "grad_norm": 48.33464431762695, "learning_rate": 1.784812828417197e-08, "logits/chosen": 3.008413314819336, "logits/rejected": 3.229504108428955, "logps/chosen": -8.660499572753906, "logps/rejected": -574.4108276367188, "loss": 0.3984, "nll_loss": 0.3936590850353241, "rewards/accuracies": 1.0, "rewards/chosen": 2.5206375122070312, "rewards/margins": 10.531733512878418, "rewards/rejected": -8.011096000671387, "step": 4876 }, { "epoch": 0.8128333333333333, "grad_norm": 32.3972053527832, "learning_rate": 1.7817362278924263e-08, "logits/chosen": 0.8308451175689697, "logits/rejected": 2.1165292263031006, "logps/chosen": -89.41572570800781, "logps/rejected": -330.0965576171875, "loss": 0.8634, "nll_loss": 0.835660994052887, "rewards/accuracies": 1.0, "rewards/chosen": 0.957003116607666, "rewards/margins": 5.881093978881836, "rewards/rejected": -4.92409086227417, "step": 4877 }, { "epoch": 0.813, "grad_norm": 17.05297088623047, "learning_rate": 1.77866202197242e-08, "logits/chosen": 2.1818859577178955, "logits/rejected": 2.2009286880493164, "logps/chosen": -157.95263671875, "logps/rejected": -99.30621337890625, "loss": 0.7185, "nll_loss": 0.7083076238632202, "rewards/accuracies": 1.0, "rewards/chosen": 1.7616472244262695, "rewards/margins": 8.74382495880127, "rewards/rejected": -6.982177734375, "step": 4878 }, { "epoch": 0.8131666666666667, "grad_norm": 20.4144229888916, "learning_rate": 1.775590211552922e-08, "logits/chosen": 3.4984610080718994, "logits/rejected": 3.6912577152252197, "logps/chosen": -74.43556213378906, "logps/rejected": -523.2030029296875, "loss": 0.6954, "nll_loss": 0.689218282699585, "rewards/accuracies": 1.0, "rewards/chosen": 2.248430013656616, "rewards/margins": 9.852568626403809, "rewards/rejected": -7.604138374328613, "step": 4879 }, { "epoch": 0.8133333333333334, "grad_norm": 64.1901626586914, "learning_rate": 1.772520797528988e-08, "logits/chosen": 1.445967197418213, "logits/rejected": 1.5337527990341187, "logps/chosen": -11.927584648132324, "logps/rejected": -26.403461456298828, "loss": 0.9432, "nll_loss": 0.2293766438961029, "rewards/accuracies": 1.0, "rewards/chosen": 2.8191909790039062, "rewards/margins": 1.4521404504776, "rewards/rejected": 1.3670505285263062, "step": 4880 }, { "epoch": 0.8135, "grad_norm": 52.06245803833008, "learning_rate": 1.7694537807949704e-08, "logits/chosen": 1.420753836631775, "logits/rejected": 2.047788619995117, "logps/chosen": -36.96671676635742, "logps/rejected": -257.57806396484375, "loss": 1.0329, "nll_loss": 1.026853084564209, "rewards/accuracies": 1.0, "rewards/chosen": 2.2464985847473145, "rewards/margins": 10.60760498046875, "rewards/rejected": -8.361106872558594, "step": 4881 }, { "epoch": 0.8136666666666666, "grad_norm": 32.85615539550781, "learning_rate": 1.7663891622445272e-08, "logits/chosen": 1.2241263389587402, "logits/rejected": 2.1797358989715576, "logps/chosen": -38.43566131591797, "logps/rejected": -233.59466552734375, "loss": 0.7688, "nll_loss": 0.7536404728889465, "rewards/accuracies": 1.0, "rewards/chosen": 1.2695693969726562, "rewards/margins": 10.345580101013184, "rewards/rejected": -9.076010704040527, "step": 4882 }, { "epoch": 0.8138333333333333, "grad_norm": 30.364486694335938, "learning_rate": 1.7633269427706087e-08, "logits/chosen": 1.6309713125228882, "logits/rejected": 1.5355281829833984, "logps/chosen": -67.53948211669922, "logps/rejected": -93.42245483398438, "loss": 0.9664, "nll_loss": 0.938048243522644, "rewards/accuracies": 1.0, "rewards/chosen": 0.9963729977607727, "rewards/margins": 5.733896255493164, "rewards/rejected": -4.737523078918457, "step": 4883 }, { "epoch": 0.814, "grad_norm": 58.618167877197266, "learning_rate": 1.7602671232654755e-08, "logits/chosen": 2.90341854095459, "logits/rejected": 3.181671142578125, "logps/chosen": -4.894089221954346, "logps/rejected": -113.08251953125, "loss": 0.3936, "nll_loss": 0.37646839022636414, "rewards/accuracies": 1.0, "rewards/chosen": 1.6588610410690308, "rewards/margins": 6.494168758392334, "rewards/rejected": -4.835307598114014, "step": 4884 }, { "epoch": 0.8141666666666667, "grad_norm": 182.69117736816406, "learning_rate": 1.7572097046206856e-08, "logits/chosen": 2.7163050174713135, "logits/rejected": 2.715104818344116, "logps/chosen": -209.98602294921875, "logps/rejected": -293.66796875, "loss": 2.7204, "nll_loss": 2.2579140663146973, "rewards/accuracies": 1.0, "rewards/chosen": -2.1746156215667725, "rewards/margins": 1.2961058616638184, "rewards/rejected": -3.470721483230591, "step": 4885 }, { "epoch": 0.8143333333333334, "grad_norm": 185.23626708984375, "learning_rate": 1.7541546877271008e-08, "logits/chosen": 2.0010693073272705, "logits/rejected": 2.026538848876953, "logps/chosen": -30.240320205688477, "logps/rejected": -11.552057266235352, "loss": 4.1964, "nll_loss": 0.6434110403060913, "rewards/accuracies": 0.0, "rewards/chosen": 1.0121358633041382, "rewards/margins": -3.0375285148620605, "rewards/rejected": 4.049664497375488, "step": 4886 }, { "epoch": 0.8145, "grad_norm": 33.31019592285156, "learning_rate": 1.7511020734748728e-08, "logits/chosen": 2.923152208328247, "logits/rejected": 2.9857234954833984, "logps/chosen": -96.97004699707031, "logps/rejected": -107.87130737304688, "loss": 1.2076, "nll_loss": 1.1825615167617798, "rewards/accuracies": 1.0, "rewards/chosen": 1.416399359703064, "rewards/margins": 5.7329936027526855, "rewards/rejected": -4.316594123840332, "step": 4887 }, { "epoch": 0.8146666666666667, "grad_norm": 22.670995712280273, "learning_rate": 1.7480518627534634e-08, "logits/chosen": 2.307727098464966, "logits/rejected": 2.3746073246002197, "logps/chosen": -11.141828536987305, "logps/rejected": -138.965576171875, "loss": 0.3145, "nll_loss": 0.27854567766189575, "rewards/accuracies": 1.0, "rewards/chosen": 1.8019325733184814, "rewards/margins": 5.204648017883301, "rewards/rejected": -3.4027156829833984, "step": 4888 }, { "epoch": 0.8148333333333333, "grad_norm": 54.29316711425781, "learning_rate": 1.745004056451632e-08, "logits/chosen": 0.9888814091682434, "logits/rejected": 1.7307885885238647, "logps/chosen": -27.054723739624023, "logps/rejected": -279.225830078125, "loss": 1.0043, "nll_loss": 1.0020267963409424, "rewards/accuracies": 1.0, "rewards/chosen": 3.220229387283325, "rewards/margins": 14.34633731842041, "rewards/rejected": -11.126108169555664, "step": 4889 }, { "epoch": 0.815, "grad_norm": 39.858489990234375, "learning_rate": 1.741958655457436e-08, "logits/chosen": 2.5744340419769287, "logits/rejected": 2.549147129058838, "logps/chosen": -21.712512969970703, "logps/rejected": -100.2954330444336, "loss": 0.692, "nll_loss": 0.6579548716545105, "rewards/accuracies": 1.0, "rewards/chosen": 0.7938485145568848, "rewards/margins": 5.430183410644531, "rewards/rejected": -4.6363348960876465, "step": 4890 }, { "epoch": 0.8151666666666667, "grad_norm": 66.73081970214844, "learning_rate": 1.73891566065823e-08, "logits/chosen": 2.273186683654785, "logits/rejected": 2.2718276977539062, "logps/chosen": -53.71078109741211, "logps/rejected": -147.7735595703125, "loss": 1.9229, "nll_loss": 1.9182422161102295, "rewards/accuracies": 1.0, "rewards/chosen": 2.9198012351989746, "rewards/margins": 9.023820877075195, "rewards/rejected": -6.1040191650390625, "step": 4891 }, { "epoch": 0.8153333333333334, "grad_norm": 97.50509643554688, "learning_rate": 1.7358750729406703e-08, "logits/chosen": 2.269113302230835, "logits/rejected": 2.5190937519073486, "logps/chosen": -42.38819122314453, "logps/rejected": -204.47299194335938, "loss": 2.0292, "nll_loss": 2.0184853076934814, "rewards/accuracies": 1.0, "rewards/chosen": 1.8394196033477783, "rewards/margins": 7.847230911254883, "rewards/rejected": -6.007811546325684, "step": 4892 }, { "epoch": 0.8155, "grad_norm": 57.10873031616211, "learning_rate": 1.732836893190711e-08, "logits/chosen": 1.930118441581726, "logits/rejected": 2.0383975505828857, "logps/chosen": -91.11454010009766, "logps/rejected": -115.04356384277344, "loss": 1.1418, "nll_loss": 1.11115300655365, "rewards/accuracies": 1.0, "rewards/chosen": 0.7197028994560242, "rewards/margins": 6.081690788269043, "rewards/rejected": -5.361988067626953, "step": 4893 }, { "epoch": 0.8156666666666667, "grad_norm": 42.182594299316406, "learning_rate": 1.7298011222936058e-08, "logits/chosen": 2.1976022720336914, "logits/rejected": 2.43764066696167, "logps/chosen": -60.636009216308594, "logps/rejected": -247.21563720703125, "loss": 0.7035, "nll_loss": 0.6251135468482971, "rewards/accuracies": 1.0, "rewards/chosen": 1.062233805656433, "rewards/margins": 3.789727210998535, "rewards/rejected": -2.7274932861328125, "step": 4894 }, { "epoch": 0.8158333333333333, "grad_norm": 25.549827575683594, "learning_rate": 1.726767761133908e-08, "logits/chosen": 0.4686034619808197, "logits/rejected": 2.253602981567383, "logps/chosen": -10.772563934326172, "logps/rejected": -327.24896240234375, "loss": 0.3579, "nll_loss": 0.3366425931453705, "rewards/accuracies": 1.0, "rewards/chosen": 0.9957769513130188, "rewards/margins": 7.390437126159668, "rewards/rejected": -6.394659996032715, "step": 4895 }, { "epoch": 0.816, "grad_norm": 28.006284713745117, "learning_rate": 1.723736810595461e-08, "logits/chosen": 2.656109094619751, "logits/rejected": 2.866358757019043, "logps/chosen": -35.97663497924805, "logps/rejected": -262.4084167480469, "loss": 0.6097, "nll_loss": 0.5897809267044067, "rewards/accuracies": 1.0, "rewards/chosen": 1.1735153198242188, "rewards/margins": 6.81317138671875, "rewards/rejected": -5.639656066894531, "step": 4896 }, { "epoch": 0.8161666666666667, "grad_norm": 23.621503829956055, "learning_rate": 1.720708271561413e-08, "logits/chosen": 1.469873309135437, "logits/rejected": 1.7307335138320923, "logps/chosen": -54.2606315612793, "logps/rejected": -132.34628295898438, "loss": 0.6571, "nll_loss": 0.6236854791641235, "rewards/accuracies": 1.0, "rewards/chosen": 1.2906696796417236, "rewards/margins": 5.18112850189209, "rewards/rejected": -3.8904590606689453, "step": 4897 }, { "epoch": 0.8163333333333334, "grad_norm": 67.57036590576172, "learning_rate": 1.7176821449142077e-08, "logits/chosen": 3.504317283630371, "logits/rejected": 3.5306451320648193, "logps/chosen": -35.2846565246582, "logps/rejected": -53.66730499267578, "loss": 1.2407, "nll_loss": 1.1761552095413208, "rewards/accuracies": 1.0, "rewards/chosen": 0.5158149600028992, "rewards/margins": 4.048770427703857, "rewards/rejected": -3.5329554080963135, "step": 4898 }, { "epoch": 0.8165, "grad_norm": 18.193164825439453, "learning_rate": 1.7146584315355883e-08, "logits/chosen": 2.557321548461914, "logits/rejected": 2.5128519535064697, "logps/chosen": -9.025014877319336, "logps/rejected": -80.48811340332031, "loss": 0.2099, "nll_loss": 0.20055589079856873, "rewards/accuracies": 1.0, "rewards/chosen": 2.8187379837036133, "rewards/margins": 7.643314838409424, "rewards/rejected": -4.8245768547058105, "step": 4899 }, { "epoch": 0.8166666666666667, "grad_norm": 26.638248443603516, "learning_rate": 1.7116371323065882e-08, "logits/chosen": 0.6600687503814697, "logits/rejected": 1.583725929260254, "logps/chosen": -96.57106018066406, "logps/rejected": -403.1040344238281, "loss": 0.7756, "nll_loss": 0.7604019045829773, "rewards/accuracies": 1.0, "rewards/chosen": 1.2718110084533691, "rewards/margins": 10.17431640625, "rewards/rejected": -8.902505874633789, "step": 4900 }, { "epoch": 0.8168333333333333, "grad_norm": 31.85724639892578, "learning_rate": 1.708618248107543e-08, "logits/chosen": 2.424736499786377, "logits/rejected": 2.4062089920043945, "logps/chosen": -53.9358024597168, "logps/rejected": -180.564697265625, "loss": 0.8055, "nll_loss": 0.7491083145141602, "rewards/accuracies": 1.0, "rewards/chosen": 0.710065484046936, "rewards/margins": 4.263779640197754, "rewards/rejected": -3.5537142753601074, "step": 4901 }, { "epoch": 0.817, "grad_norm": 23.668540954589844, "learning_rate": 1.705601779818082e-08, "logits/chosen": 2.590308904647827, "logits/rejected": 2.6020538806915283, "logps/chosen": -52.690948486328125, "logps/rejected": -135.33969116210938, "loss": 0.6021, "nll_loss": 0.5854549407958984, "rewards/accuracies": 1.0, "rewards/chosen": 1.590876817703247, "rewards/margins": 6.644203186035156, "rewards/rejected": -5.053326606750488, "step": 4902 }, { "epoch": 0.8171666666666667, "grad_norm": 25.272716522216797, "learning_rate": 1.7025877283171363e-08, "logits/chosen": 1.790199637413025, "logits/rejected": 2.2850513458251953, "logps/chosen": -84.25210571289062, "logps/rejected": -123.02241516113281, "loss": 0.8474, "nll_loss": 0.8179816603660583, "rewards/accuracies": 1.0, "rewards/chosen": 1.4447205066680908, "rewards/margins": 5.414671421051025, "rewards/rejected": -3.9699509143829346, "step": 4903 }, { "epoch": 0.8173333333333334, "grad_norm": 49.27442169189453, "learning_rate": 1.699576094482923e-08, "logits/chosen": 2.897350549697876, "logits/rejected": 2.990293264389038, "logps/chosen": -38.17169189453125, "logps/rejected": -92.20879364013672, "loss": 0.8755, "nll_loss": 0.7484647035598755, "rewards/accuracies": 1.0, "rewards/chosen": 1.1810439825057983, "rewards/margins": 3.1932668685913086, "rewards/rejected": -2.0122230052948, "step": 4904 }, { "epoch": 0.8175, "grad_norm": 33.85188674926758, "learning_rate": 1.69656687919296e-08, "logits/chosen": 1.2285202741622925, "logits/rejected": 2.9782633781433105, "logps/chosen": -82.96580505371094, "logps/rejected": -532.2391357421875, "loss": 0.9269, "nll_loss": 0.9218422770500183, "rewards/accuracies": 1.0, "rewards/chosen": 2.390139102935791, "rewards/margins": 12.169057846069336, "rewards/rejected": -9.778918266296387, "step": 4905 }, { "epoch": 0.8176666666666667, "grad_norm": 37.65819549560547, "learning_rate": 1.693560083324064e-08, "logits/chosen": 3.685150384902954, "logits/rejected": 3.5998525619506836, "logps/chosen": -40.03805160522461, "logps/rejected": -64.53568267822266, "loss": 0.8389, "nll_loss": 0.8341259956359863, "rewards/accuracies": 1.0, "rewards/chosen": 4.0901288986206055, "rewards/margins": 9.321165084838867, "rewards/rejected": -5.23103666305542, "step": 4906 }, { "epoch": 0.8178333333333333, "grad_norm": 35.0289306640625, "learning_rate": 1.6905557077523424e-08, "logits/chosen": 2.403026819229126, "logits/rejected": 2.332280397415161, "logps/chosen": -44.097618103027344, "logps/rejected": -102.10846710205078, "loss": 0.8146, "nll_loss": 0.7603036761283875, "rewards/accuracies": 1.0, "rewards/chosen": 1.132910966873169, "rewards/margins": 4.350867748260498, "rewards/rejected": -3.217956781387329, "step": 4907 }, { "epoch": 0.818, "grad_norm": 26.840835571289062, "learning_rate": 1.6875537533531948e-08, "logits/chosen": 2.477330446243286, "logits/rejected": 2.463502883911133, "logps/chosen": -104.9228286743164, "logps/rejected": -165.826171875, "loss": 1.1319, "nll_loss": 1.116200566291809, "rewards/accuracies": 1.0, "rewards/chosen": 1.2949464321136475, "rewards/margins": 8.1403226852417, "rewards/rejected": -6.845376014709473, "step": 4908 }, { "epoch": 0.8181666666666667, "grad_norm": 24.997459411621094, "learning_rate": 1.6845542210013197e-08, "logits/chosen": 2.64326810836792, "logits/rejected": 2.7536747455596924, "logps/chosen": -61.25285339355469, "logps/rejected": -290.4915771484375, "loss": 0.6236, "nll_loss": 0.6187155842781067, "rewards/accuracies": 1.0, "rewards/chosen": 2.4375054836273193, "rewards/margins": 11.797070503234863, "rewards/rejected": -9.359564781188965, "step": 4909 }, { "epoch": 0.8183333333333334, "grad_norm": 23.3583984375, "learning_rate": 1.6815571115707105e-08, "logits/chosen": 3.1925389766693115, "logits/rejected": 3.2762537002563477, "logps/chosen": -64.17330932617188, "logps/rejected": -289.22808837890625, "loss": 0.7279, "nll_loss": 0.7130367755889893, "rewards/accuracies": 1.0, "rewards/chosen": 1.3495498895645142, "rewards/margins": 8.257916450500488, "rewards/rejected": -6.908366680145264, "step": 4910 }, { "epoch": 0.8185, "grad_norm": 21.3049259185791, "learning_rate": 1.6785624259346554e-08, "logits/chosen": 2.4908525943756104, "logits/rejected": 2.711198091506958, "logps/chosen": -60.54832077026367, "logps/rejected": -426.5125732421875, "loss": 0.7084, "nll_loss": 0.7040502429008484, "rewards/accuracies": 1.0, "rewards/chosen": 2.5845143795013428, "rewards/margins": 10.860174179077148, "rewards/rejected": -8.275659561157227, "step": 4911 }, { "epoch": 0.8186666666666667, "grad_norm": 20.147584915161133, "learning_rate": 1.6755701649657272e-08, "logits/chosen": 2.7640020847320557, "logits/rejected": 2.871516227722168, "logps/chosen": -74.68975067138672, "logps/rejected": -310.9725341796875, "loss": 0.749, "nll_loss": 0.7395025491714478, "rewards/accuracies": 1.0, "rewards/chosen": 1.9535773992538452, "rewards/margins": 8.085826873779297, "rewards/rejected": -6.132249355316162, "step": 4912 }, { "epoch": 0.8188333333333333, "grad_norm": 262.0027770996094, "learning_rate": 1.6725803295358033e-08, "logits/chosen": 1.99042809009552, "logits/rejected": 1.4417587518692017, "logps/chosen": -130.16575622558594, "logps/rejected": -48.995216369628906, "loss": 2.362, "nll_loss": 1.0582581758499146, "rewards/accuracies": 0.0, "rewards/chosen": -0.9937965273857117, "rewards/margins": -0.770494818687439, "rewards/rejected": -0.2233017086982727, "step": 4913 }, { "epoch": 0.819, "grad_norm": 16.97956085205078, "learning_rate": 1.6695929205160485e-08, "logits/chosen": 1.8880571126937866, "logits/rejected": 2.4744749069213867, "logps/chosen": -144.968505859375, "logps/rejected": -479.02783203125, "loss": 0.9045, "nll_loss": 0.9004254341125488, "rewards/accuracies": 1.0, "rewards/chosen": 2.6218814849853516, "rewards/margins": 11.627137184143066, "rewards/rejected": -9.005255699157715, "step": 4914 }, { "epoch": 0.8191666666666667, "grad_norm": 30.649322509765625, "learning_rate": 1.666607938776924e-08, "logits/chosen": 1.2764860391616821, "logits/rejected": 2.3247694969177246, "logps/chosen": -57.69242858886719, "logps/rejected": -271.61602783203125, "loss": 0.8797, "nll_loss": 0.8741276860237122, "rewards/accuracies": 1.0, "rewards/chosen": 2.291339874267578, "rewards/margins": 14.377570152282715, "rewards/rejected": -12.086230278015137, "step": 4915 }, { "epoch": 0.8193333333333334, "grad_norm": 28.901145935058594, "learning_rate": 1.663625385188182e-08, "logits/chosen": 1.5976934432983398, "logits/rejected": 2.2790911197662354, "logps/chosen": -79.097900390625, "logps/rejected": -319.0898742675781, "loss": 1.0588, "nll_loss": 1.0546388626098633, "rewards/accuracies": 1.0, "rewards/chosen": 2.615255117416382, "rewards/margins": 11.45406436920166, "rewards/rejected": -8.8388090133667, "step": 4916 }, { "epoch": 0.8195, "grad_norm": 29.91408920288086, "learning_rate": 1.6606452606188636e-08, "logits/chosen": 2.8359012603759766, "logits/rejected": 2.653134822845459, "logps/chosen": -71.36617279052734, "logps/rejected": -75.20550537109375, "loss": 0.7713, "nll_loss": 0.7592145204544067, "rewards/accuracies": 1.0, "rewards/chosen": 1.589208960533142, "rewards/margins": 8.345947265625, "rewards/rejected": -6.756738662719727, "step": 4917 }, { "epoch": 0.8196666666666667, "grad_norm": 20.193750381469727, "learning_rate": 1.657667565937306e-08, "logits/chosen": 2.3750107288360596, "logits/rejected": 2.6242637634277344, "logps/chosen": -49.41936492919922, "logps/rejected": -149.45263671875, "loss": 0.5461, "nll_loss": 0.5430699586868286, "rewards/accuracies": 1.0, "rewards/chosen": 3.1026458740234375, "rewards/margins": 10.248315811157227, "rewards/rejected": -7.145669460296631, "step": 4918 }, { "epoch": 0.8198333333333333, "grad_norm": 33.323490142822266, "learning_rate": 1.6546923020111415e-08, "logits/chosen": 0.7880572080612183, "logits/rejected": 1.4746936559677124, "logps/chosen": -81.26573181152344, "logps/rejected": -226.54891967773438, "loss": 1.1965, "nll_loss": 1.1950843334197998, "rewards/accuracies": 1.0, "rewards/chosen": 4.705539226531982, "rewards/margins": 11.33770751953125, "rewards/rejected": -6.632168769836426, "step": 4919 }, { "epoch": 0.82, "grad_norm": 51.926422119140625, "learning_rate": 1.65171946970729e-08, "logits/chosen": 1.1950451135635376, "logits/rejected": 1.8923568725585938, "logps/chosen": -66.25691986083984, "logps/rejected": -339.9096374511719, "loss": 1.6915, "nll_loss": 1.6564232110977173, "rewards/accuracies": 1.0, "rewards/chosen": 1.2353423833847046, "rewards/margins": 5.092495918273926, "rewards/rejected": -3.8571534156799316, "step": 4920 }, { "epoch": 0.8201666666666667, "grad_norm": 10.960786819458008, "learning_rate": 1.6487490698919593e-08, "logits/chosen": 2.976173162460327, "logits/rejected": 2.859907627105713, "logps/chosen": -145.61578369140625, "logps/rejected": -158.83309936523438, "loss": 0.5865, "nll_loss": 0.5848023295402527, "rewards/accuracies": 1.0, "rewards/chosen": 3.681528091430664, "rewards/margins": 11.62435531616211, "rewards/rejected": -7.942826747894287, "step": 4921 }, { "epoch": 0.8203333333333334, "grad_norm": 24.27281951904297, "learning_rate": 1.645781103430656e-08, "logits/chosen": 1.713903784751892, "logits/rejected": 1.8787951469421387, "logps/chosen": -86.48484802246094, "logps/rejected": -139.39886474609375, "loss": 0.6622, "nll_loss": 0.655188262462616, "rewards/accuracies": 1.0, "rewards/chosen": 2.6555886268615723, "rewards/margins": 8.149396896362305, "rewards/rejected": -5.493808746337891, "step": 4922 }, { "epoch": 0.8205, "grad_norm": 170.17042541503906, "learning_rate": 1.642815571188172e-08, "logits/chosen": 2.2290163040161133, "logits/rejected": 2.294579029083252, "logps/chosen": -50.41802978515625, "logps/rejected": -59.387184143066406, "loss": 2.151, "nll_loss": 0.6813247203826904, "rewards/accuracies": 0.0, "rewards/chosen": 1.6367219686508179, "rewards/margins": -0.4852689504623413, "rewards/rejected": 2.121990919113159, "step": 4923 }, { "epoch": 0.8206666666666667, "grad_norm": 26.314006805419922, "learning_rate": 1.6398524740285978e-08, "logits/chosen": 3.719691514968872, "logits/rejected": 3.6780779361724854, "logps/chosen": -28.945270538330078, "logps/rejected": -98.07366943359375, "loss": 0.5401, "nll_loss": 0.5078118443489075, "rewards/accuracies": 1.0, "rewards/chosen": 1.1366599798202515, "rewards/margins": 5.278364181518555, "rewards/rejected": -4.141704082489014, "step": 4924 }, { "epoch": 0.8208333333333333, "grad_norm": 28.746675491333008, "learning_rate": 1.636891812815302e-08, "logits/chosen": 2.0416932106018066, "logits/rejected": 1.0313950777053833, "logps/chosen": -137.9250946044922, "logps/rejected": -72.70938110351562, "loss": 0.9769, "nll_loss": 0.9195006489753723, "rewards/accuracies": 1.0, "rewards/chosen": 1.6647446155548096, "rewards/margins": 4.501476287841797, "rewards/rejected": -2.8367316722869873, "step": 4925 }, { "epoch": 0.821, "grad_norm": 33.26896286010742, "learning_rate": 1.6339335884109517e-08, "logits/chosen": 2.5866763591766357, "logits/rejected": 2.723160743713379, "logps/chosen": -87.96609497070312, "logps/rejected": -303.6673278808594, "loss": 1.4032, "nll_loss": 1.3962873220443726, "rewards/accuracies": 1.0, "rewards/chosen": 2.072274923324585, "rewards/margins": 12.610864639282227, "rewards/rejected": -10.538589477539062, "step": 4926 }, { "epoch": 0.8211666666666667, "grad_norm": 272.1694030761719, "learning_rate": 1.6309778016775055e-08, "logits/chosen": 2.7126331329345703, "logits/rejected": 2.652939558029175, "logps/chosen": -86.1424789428711, "logps/rejected": -58.1053581237793, "loss": 3.1053, "nll_loss": 1.266801118850708, "rewards/accuracies": 0.0, "rewards/chosen": 1.0036629438400269, "rewards/margins": -1.1321781873703003, "rewards/rejected": 2.135841131210327, "step": 4927 }, { "epoch": 0.8213333333333334, "grad_norm": 39.878990173339844, "learning_rate": 1.6280244534762078e-08, "logits/chosen": 2.848536729812622, "logits/rejected": 2.940650463104248, "logps/chosen": -39.113521575927734, "logps/rejected": -200.75564575195312, "loss": 0.9783, "nll_loss": 0.977837860584259, "rewards/accuracies": 1.0, "rewards/chosen": 5.341398239135742, "rewards/margins": 13.682888984680176, "rewards/rejected": -8.341490745544434, "step": 4928 }, { "epoch": 0.8215, "grad_norm": 39.11045837402344, "learning_rate": 1.6250735446675912e-08, "logits/chosen": 2.7239785194396973, "logits/rejected": 2.84456205368042, "logps/chosen": -27.881807327270508, "logps/rejected": -89.43737030029297, "loss": 0.7135, "nll_loss": 0.6970452070236206, "rewards/accuracies": 1.0, "rewards/chosen": 1.3736032247543335, "rewards/margins": 7.146849155426025, "rewards/rejected": -5.773245811462402, "step": 4929 }, { "epoch": 0.8216666666666667, "grad_norm": 27.259586334228516, "learning_rate": 1.6221250761114803e-08, "logits/chosen": 2.8540964126586914, "logits/rejected": 3.0790796279907227, "logps/chosen": -60.42820739746094, "logps/rejected": -211.88726806640625, "loss": 0.7993, "nll_loss": 0.7847820520401001, "rewards/accuracies": 1.0, "rewards/chosen": 1.594365119934082, "rewards/margins": 7.111081123352051, "rewards/rejected": -5.516716003417969, "step": 4930 }, { "epoch": 0.8218333333333333, "grad_norm": 27.489917755126953, "learning_rate": 1.6191790486669887e-08, "logits/chosen": 2.324988842010498, "logits/rejected": 2.391371011734009, "logps/chosen": -101.111572265625, "logps/rejected": -354.86700439453125, "loss": 1.108, "nll_loss": 1.0990387201309204, "rewards/accuracies": 1.0, "rewards/chosen": 1.9010871648788452, "rewards/margins": 8.773639678955078, "rewards/rejected": -6.872552871704102, "step": 4931 }, { "epoch": 0.822, "grad_norm": 34.903663635253906, "learning_rate": 1.61623546319252e-08, "logits/chosen": 0.9888160824775696, "logits/rejected": 2.167665481567383, "logps/chosen": -19.030441284179688, "logps/rejected": -272.78302001953125, "loss": 0.5216, "nll_loss": 0.5143362879753113, "rewards/accuracies": 1.0, "rewards/chosen": 2.5676045417785645, "rewards/margins": 8.11563777923584, "rewards/rejected": -5.548033237457275, "step": 4932 }, { "epoch": 0.8221666666666667, "grad_norm": 40.897159576416016, "learning_rate": 1.6132943205457607e-08, "logits/chosen": 2.987179756164551, "logits/rejected": 3.058259963989258, "logps/chosen": -24.13538932800293, "logps/rejected": -275.69287109375, "loss": 0.5205, "nll_loss": 0.5135188698768616, "rewards/accuracies": 1.0, "rewards/chosen": 3.0034148693084717, "rewards/margins": 8.176916122436523, "rewards/rejected": -5.173501491546631, "step": 4933 }, { "epoch": 0.8223333333333334, "grad_norm": 26.43620491027832, "learning_rate": 1.610355621583691e-08, "logits/chosen": 1.5279455184936523, "logits/rejected": 1.9003289937973022, "logps/chosen": -77.27186584472656, "logps/rejected": -157.6498260498047, "loss": 0.886, "nll_loss": 0.8780894875526428, "rewards/accuracies": 1.0, "rewards/chosen": 3.5528275966644287, "rewards/margins": 8.299631118774414, "rewards/rejected": -4.746803283691406, "step": 4934 }, { "epoch": 0.8225, "grad_norm": 43.18136215209961, "learning_rate": 1.607419367162577e-08, "logits/chosen": 2.2980520725250244, "logits/rejected": 2.488285779953003, "logps/chosen": -21.128767013549805, "logps/rejected": -145.15206909179688, "loss": 0.6877, "nll_loss": 0.5869102478027344, "rewards/accuracies": 1.0, "rewards/chosen": 2.7777693271636963, "rewards/margins": 4.629701137542725, "rewards/rejected": -1.8519318103790283, "step": 4935 }, { "epoch": 0.8226666666666667, "grad_norm": 119.37553405761719, "learning_rate": 1.6044855581379724e-08, "logits/chosen": 2.433281660079956, "logits/rejected": 2.4016103744506836, "logps/chosen": -34.890384674072266, "logps/rejected": -74.32014465332031, "loss": 2.0339, "nll_loss": 0.7120485305786133, "rewards/accuracies": 1.0, "rewards/chosen": 3.995551824569702, "rewards/margins": 0.5307900905609131, "rewards/rejected": 3.464761734008789, "step": 4936 }, { "epoch": 0.8228333333333333, "grad_norm": 286.099609375, "learning_rate": 1.6015541953647216e-08, "logits/chosen": 1.4764825105667114, "logits/rejected": 1.5958189964294434, "logps/chosen": -71.83440399169922, "logps/rejected": -60.6032600402832, "loss": 6.8339, "nll_loss": 1.3060799837112427, "rewards/accuracies": 0.0, "rewards/chosen": -0.1687309294939041, "rewards/margins": -5.2441253662109375, "rewards/rejected": 5.075394630432129, "step": 4937 }, { "epoch": 0.823, "grad_norm": 36.78449249267578, "learning_rate": 1.5986252796969478e-08, "logits/chosen": 2.1164982318878174, "logits/rejected": 2.1814382076263428, "logps/chosen": -14.267745971679688, "logps/rejected": -90.57127380371094, "loss": 0.4693, "nll_loss": 0.3479938209056854, "rewards/accuracies": 1.0, "rewards/chosen": 1.1274160146713257, "rewards/margins": 3.2278356552124023, "rewards/rejected": -2.100419759750366, "step": 4938 }, { "epoch": 0.8231666666666667, "grad_norm": 26.999006271362305, "learning_rate": 1.595698811988071e-08, "logits/chosen": 0.8302217721939087, "logits/rejected": 1.7571167945861816, "logps/chosen": -60.43583679199219, "logps/rejected": -259.70458984375, "loss": 0.8393, "nll_loss": 0.8278881907463074, "rewards/accuracies": 1.0, "rewards/chosen": 1.5793198347091675, "rewards/margins": 9.819002151489258, "rewards/rejected": -8.2396821975708, "step": 4939 }, { "epoch": 0.8233333333333334, "grad_norm": 24.226835250854492, "learning_rate": 1.592774793090792e-08, "logits/chosen": 1.9440627098083496, "logits/rejected": 2.1463382244110107, "logps/chosen": -28.840843200683594, "logps/rejected": -272.25567626953125, "loss": 0.492, "nll_loss": 0.4806806147098541, "rewards/accuracies": 1.0, "rewards/chosen": 1.5802011489868164, "rewards/margins": 10.110130310058594, "rewards/rejected": -8.529929161071777, "step": 4940 }, { "epoch": 0.8235, "grad_norm": 40.82550811767578, "learning_rate": 1.5898532238571027e-08, "logits/chosen": 2.337893009185791, "logits/rejected": 2.4042201042175293, "logps/chosen": -30.19625473022461, "logps/rejected": -83.86582946777344, "loss": 0.7729, "nll_loss": 0.7549064755439758, "rewards/accuracies": 1.0, "rewards/chosen": 1.3173179626464844, "rewards/margins": 6.85619592666626, "rewards/rejected": -5.538877964019775, "step": 4941 }, { "epoch": 0.8236666666666667, "grad_norm": 16.133441925048828, "learning_rate": 1.5869341051382723e-08, "logits/chosen": 1.5850199460983276, "logits/rejected": 1.8586854934692383, "logps/chosen": -154.11192321777344, "logps/rejected": -256.08087158203125, "loss": 0.8535, "nll_loss": 0.8514469265937805, "rewards/accuracies": 1.0, "rewards/chosen": 3.318716526031494, "rewards/margins": 12.989154815673828, "rewards/rejected": -9.670437812805176, "step": 4942 }, { "epoch": 0.8238333333333333, "grad_norm": 100.4600601196289, "learning_rate": 1.584017437784867e-08, "logits/chosen": 2.9815683364868164, "logits/rejected": 2.907057285308838, "logps/chosen": -43.087242126464844, "logps/rejected": -35.94346237182617, "loss": 1.6388, "nll_loss": 1.538830041885376, "rewards/accuracies": 1.0, "rewards/chosen": 0.13962821662425995, "rewards/margins": 3.3133485317230225, "rewards/rejected": -3.173720359802246, "step": 4943 }, { "epoch": 0.824, "grad_norm": 173.4539031982422, "learning_rate": 1.5811032226467303e-08, "logits/chosen": 1.7968623638153076, "logits/rejected": 1.941036343574524, "logps/chosen": -77.9625244140625, "logps/rejected": -32.90837860107422, "loss": 2.4842, "nll_loss": 0.9745315313339233, "rewards/accuracies": 0.0, "rewards/chosen": 1.1242073774337769, "rewards/margins": -0.672569751739502, "rewards/rejected": 1.7967771291732788, "step": 4944 }, { "epoch": 0.8241666666666667, "grad_norm": 189.3555145263672, "learning_rate": 1.5781914605729997e-08, "logits/chosen": 2.2572178840637207, "logits/rejected": 2.1880640983581543, "logps/chosen": -76.5924301147461, "logps/rejected": -12.109663009643555, "loss": 2.9177, "nll_loss": 1.2765406370162964, "rewards/accuracies": 0.0, "rewards/chosen": 1.0968879461288452, "rewards/margins": -0.8574439287185669, "rewards/rejected": 1.954331874847412, "step": 4945 }, { "epoch": 0.8243333333333334, "grad_norm": 35.64766311645508, "learning_rate": 1.575282152412085e-08, "logits/chosen": 3.113656759262085, "logits/rejected": 3.0502753257751465, "logps/chosen": -12.78309154510498, "logps/rejected": -152.68331909179688, "loss": 0.467, "nll_loss": 0.45653897523880005, "rewards/accuracies": 1.0, "rewards/chosen": 1.716352105140686, "rewards/margins": 8.777359008789062, "rewards/rejected": -7.061006546020508, "step": 4946 }, { "epoch": 0.8245, "grad_norm": 136.6981658935547, "learning_rate": 1.5723752990116945e-08, "logits/chosen": 2.8267199993133545, "logits/rejected": 2.800995111465454, "logps/chosen": -52.72555160522461, "logps/rejected": -33.88039779663086, "loss": 1.967, "nll_loss": 0.9586464166641235, "rewards/accuracies": 1.0, "rewards/chosen": 1.3492634296417236, "rewards/margins": 0.17574775218963623, "rewards/rejected": 1.1735156774520874, "step": 4947 }, { "epoch": 0.8246666666666667, "grad_norm": 208.5814971923828, "learning_rate": 1.5694709012188135e-08, "logits/chosen": 2.0196444988250732, "logits/rejected": 2.4092466831207275, "logps/chosen": -24.93410873413086, "logps/rejected": -68.43487548828125, "loss": 2.5307, "nll_loss": 0.4298984706401825, "rewards/accuracies": 0.0, "rewards/chosen": 0.6043747067451477, "rewards/margins": -1.5329766273498535, "rewards/rejected": 2.1373512744903564, "step": 4948 }, { "epoch": 0.8248333333333333, "grad_norm": 35.42551040649414, "learning_rate": 1.566568959879717e-08, "logits/chosen": 2.5436973571777344, "logits/rejected": 2.438555955886841, "logps/chosen": -55.368499755859375, "logps/rejected": -54.082489013671875, "loss": 0.6771, "nll_loss": 0.5649846792221069, "rewards/accuracies": 1.0, "rewards/chosen": 0.3771873712539673, "rewards/margins": 3.1061010360717773, "rewards/rejected": -2.7289137840270996, "step": 4949 }, { "epoch": 0.825, "grad_norm": 27.124977111816406, "learning_rate": 1.563669475839956e-08, "logits/chosen": 2.6589066982269287, "logits/rejected": 2.4869801998138428, "logps/chosen": -75.1748275756836, "logps/rejected": -153.21287536621094, "loss": 0.8425, "nll_loss": 0.7913140654563904, "rewards/accuracies": 1.0, "rewards/chosen": 1.0683754682540894, "rewards/margins": 4.430110454559326, "rewards/rejected": -3.3617351055145264, "step": 4950 }, { "epoch": 0.8251666666666667, "grad_norm": 19.672149658203125, "learning_rate": 1.560772449944372e-08, "logits/chosen": 2.622232675552368, "logits/rejected": 2.9889063835144043, "logps/chosen": -40.96831130981445, "logps/rejected": -457.8677062988281, "loss": 0.4706, "nll_loss": 0.4552035331726074, "rewards/accuracies": 1.0, "rewards/chosen": 1.244454264640808, "rewards/margins": 15.592571258544922, "rewards/rejected": -14.348116874694824, "step": 4951 }, { "epoch": 0.8253333333333334, "grad_norm": 34.52259826660156, "learning_rate": 1.5578778830370898e-08, "logits/chosen": 2.5118613243103027, "logits/rejected": 2.623866081237793, "logps/chosen": -16.746200561523438, "logps/rejected": -58.84096145629883, "loss": 0.5847, "nll_loss": 0.5401999950408936, "rewards/accuracies": 1.0, "rewards/chosen": 2.0209100246429443, "rewards/margins": 5.0467705726623535, "rewards/rejected": -3.025860548019409, "step": 4952 }, { "epoch": 0.8255, "grad_norm": 29.14655113220215, "learning_rate": 1.554985775961519e-08, "logits/chosen": 1.6821106672286987, "logits/rejected": 1.2169303894042969, "logps/chosen": -155.99417114257812, "logps/rejected": -153.52740478515625, "loss": 1.0689, "nll_loss": 1.0399612188339233, "rewards/accuracies": 1.0, "rewards/chosen": 1.8192840814590454, "rewards/margins": 5.526949405670166, "rewards/rejected": -3.707665205001831, "step": 4953 }, { "epoch": 0.8256666666666667, "grad_norm": 26.81466293334961, "learning_rate": 1.5520961295603462e-08, "logits/chosen": 2.8870832920074463, "logits/rejected": 2.9753823280334473, "logps/chosen": -52.79195022583008, "logps/rejected": -136.4191436767578, "loss": 0.7078, "nll_loss": 0.6946309804916382, "rewards/accuracies": 1.0, "rewards/chosen": 1.4791797399520874, "rewards/margins": 8.40380859375, "rewards/rejected": -6.924628734588623, "step": 4954 }, { "epoch": 0.8258333333333333, "grad_norm": 112.98731994628906, "learning_rate": 1.549208944675545e-08, "logits/chosen": 1.9190800189971924, "logits/rejected": 2.497967004776001, "logps/chosen": -22.275371551513672, "logps/rejected": -94.47901916503906, "loss": 1.262, "nll_loss": 0.6020370125770569, "rewards/accuracies": 1.0, "rewards/chosen": 2.5669517517089844, "rewards/margins": 1.4786125421524048, "rewards/rejected": 1.0883392095565796, "step": 4955 }, { "epoch": 0.826, "grad_norm": 247.38465881347656, "learning_rate": 1.5463242221483742e-08, "logits/chosen": 2.7707602977752686, "logits/rejected": 2.924583911895752, "logps/chosen": -71.35102844238281, "logps/rejected": -32.74966049194336, "loss": 5.3887, "nll_loss": 0.8701344728469849, "rewards/accuracies": 0.0, "rewards/chosen": 1.998300313949585, "rewards/margins": -3.8386385440826416, "rewards/rejected": 5.836938858032227, "step": 4956 }, { "epoch": 0.8261666666666667, "grad_norm": 30.186038970947266, "learning_rate": 1.543441962819372e-08, "logits/chosen": 1.912144422531128, "logits/rejected": 1.927432656288147, "logps/chosen": -18.930139541625977, "logps/rejected": -81.09521484375, "loss": 0.5031, "nll_loss": 0.4853881895542145, "rewards/accuracies": 1.0, "rewards/chosen": 1.8131941556930542, "rewards/margins": 6.361583232879639, "rewards/rejected": -4.548388957977295, "step": 4957 }, { "epoch": 0.8263333333333334, "grad_norm": 26.375709533691406, "learning_rate": 1.5405621675283608e-08, "logits/chosen": 2.8908050060272217, "logits/rejected": 3.214141607284546, "logps/chosen": -87.98580932617188, "logps/rejected": -492.7227478027344, "loss": 0.9806, "nll_loss": 0.9776201248168945, "rewards/accuracies": 1.0, "rewards/chosen": 2.9989335536956787, "rewards/margins": 11.243602752685547, "rewards/rejected": -8.244668960571289, "step": 4958 }, { "epoch": 0.8265, "grad_norm": 81.88813781738281, "learning_rate": 1.53768483711444e-08, "logits/chosen": 1.5861457586288452, "logits/rejected": 1.8983640670776367, "logps/chosen": -36.813594818115234, "logps/rejected": -254.00650024414062, "loss": 1.6047, "nll_loss": 1.6005909442901611, "rewards/accuracies": 1.0, "rewards/chosen": 2.6515843868255615, "rewards/margins": 11.01065444946289, "rewards/rejected": -8.35906982421875, "step": 4959 }, { "epoch": 0.8266666666666667, "grad_norm": 34.82741928100586, "learning_rate": 1.534809972415998e-08, "logits/chosen": 2.0088565349578857, "logits/rejected": 2.36683988571167, "logps/chosen": -34.42512893676758, "logps/rejected": -344.1025390625, "loss": 0.8042, "nll_loss": 0.7823894023895264, "rewards/accuracies": 1.0, "rewards/chosen": 1.020236611366272, "rewards/margins": 6.954593658447266, "rewards/rejected": -5.934357166290283, "step": 4960 }, { "epoch": 0.8268333333333333, "grad_norm": 30.97862434387207, "learning_rate": 1.5319375742706986e-08, "logits/chosen": 2.689401149749756, "logits/rejected": 2.839103937149048, "logps/chosen": -25.892932891845703, "logps/rejected": -234.15560913085938, "loss": 0.5961, "nll_loss": 0.58847576379776, "rewards/accuracies": 1.0, "rewards/chosen": 2.2769947052001953, "rewards/margins": 8.284769058227539, "rewards/rejected": -6.007774353027344, "step": 4961 }, { "epoch": 0.827, "grad_norm": 32.31245422363281, "learning_rate": 1.5290676435154948e-08, "logits/chosen": 0.6152282953262329, "logits/rejected": 1.9604038000106812, "logps/chosen": -18.039268493652344, "logps/rejected": -295.999755859375, "loss": 0.4966, "nll_loss": 0.48754778504371643, "rewards/accuracies": 1.0, "rewards/chosen": 2.6065266132354736, "rewards/margins": 7.650712966918945, "rewards/rejected": -5.044186592102051, "step": 4962 }, { "epoch": 0.8271666666666667, "grad_norm": 21.764482498168945, "learning_rate": 1.52620018098661e-08, "logits/chosen": 1.724089503288269, "logits/rejected": 2.2386345863342285, "logps/chosen": -113.43620300292969, "logps/rejected": -331.4909973144531, "loss": 0.9124, "nll_loss": 0.9074896574020386, "rewards/accuracies": 1.0, "rewards/chosen": 2.5426042079925537, "rewards/margins": 9.833015441894531, "rewards/rejected": -7.290411472320557, "step": 4963 }, { "epoch": 0.8273333333333334, "grad_norm": 27.35565757751465, "learning_rate": 1.5233351875195543e-08, "logits/chosen": 2.939091682434082, "logits/rejected": 2.8028995990753174, "logps/chosen": -81.14323425292969, "logps/rejected": -139.9108428955078, "loss": 0.8604, "nll_loss": 0.8541392087936401, "rewards/accuracies": 1.0, "rewards/chosen": 2.4109742641448975, "rewards/margins": 8.770625114440918, "rewards/rejected": -6.3596510887146, "step": 4964 }, { "epoch": 0.8275, "grad_norm": 133.4253387451172, "learning_rate": 1.5204726639491215e-08, "logits/chosen": 1.9548531770706177, "logits/rejected": 1.9875855445861816, "logps/chosen": -55.19306945800781, "logps/rejected": -89.43598175048828, "loss": 1.7243, "nll_loss": 0.951604425907135, "rewards/accuracies": 1.0, "rewards/chosen": 1.9372353553771973, "rewards/margins": 0.8763077259063721, "rewards/rejected": 1.0609276294708252, "step": 4965 }, { "epoch": 0.8276666666666667, "grad_norm": 34.27503204345703, "learning_rate": 1.517612611109381e-08, "logits/chosen": 2.749277114868164, "logits/rejected": 2.5814614295959473, "logps/chosen": -91.74505615234375, "logps/rejected": -65.51435852050781, "loss": 1.0631, "nll_loss": 1.0425572395324707, "rewards/accuracies": 1.0, "rewards/chosen": 1.1946090459823608, "rewards/margins": 6.584342956542969, "rewards/rejected": -5.389733791351318, "step": 4966 }, { "epoch": 0.8278333333333333, "grad_norm": 39.721675872802734, "learning_rate": 1.514755029833682e-08, "logits/chosen": 2.1094701290130615, "logits/rejected": 2.523961067199707, "logps/chosen": -39.15311050415039, "logps/rejected": -594.062744140625, "loss": 1.0386, "nll_loss": 1.03034508228302, "rewards/accuracies": 1.0, "rewards/chosen": 1.8824058771133423, "rewards/margins": 13.9433012008667, "rewards/rejected": -12.060894966125488, "step": 4967 }, { "epoch": 0.828, "grad_norm": 21.58188819885254, "learning_rate": 1.5118999209546555e-08, "logits/chosen": 0.5144894123077393, "logits/rejected": 2.387827157974243, "logps/chosen": -13.32634162902832, "logps/rejected": -355.22882080078125, "loss": 0.3282, "nll_loss": 0.3172938823699951, "rewards/accuracies": 1.0, "rewards/chosen": 1.5976080894470215, "rewards/margins": 12.94538688659668, "rewards/rejected": -11.347779273986816, "step": 4968 }, { "epoch": 0.8281666666666667, "grad_norm": 29.81810760498047, "learning_rate": 1.5090472853042136e-08, "logits/chosen": 2.6469931602478027, "logits/rejected": 2.716702461242676, "logps/chosen": -112.89202880859375, "logps/rejected": -176.7606201171875, "loss": 1.1129, "nll_loss": 1.0960392951965332, "rewards/accuracies": 1.0, "rewards/chosen": 1.1693238019943237, "rewards/margins": 9.232769012451172, "rewards/rejected": -8.063445091247559, "step": 4969 }, { "epoch": 0.8283333333333334, "grad_norm": 31.947952270507812, "learning_rate": 1.506197123713545e-08, "logits/chosen": 2.9690682888031006, "logits/rejected": 2.875887155532837, "logps/chosen": -18.879638671875, "logps/rejected": -176.15316772460938, "loss": 0.4698, "nll_loss": 0.46047893166542053, "rewards/accuracies": 1.0, "rewards/chosen": 1.982967734336853, "rewards/margins": 8.095662117004395, "rewards/rejected": -6.112694263458252, "step": 4970 }, { "epoch": 0.8285, "grad_norm": 14.628509521484375, "learning_rate": 1.5033494370131162e-08, "logits/chosen": 2.545677900314331, "logits/rejected": 2.5747156143188477, "logps/chosen": -117.29585266113281, "logps/rejected": -324.1264343261719, "loss": 0.6289, "nll_loss": 0.6239141821861267, "rewards/accuracies": 1.0, "rewards/chosen": 3.7313125133514404, "rewards/margins": 9.029069900512695, "rewards/rejected": -5.297757148742676, "step": 4971 }, { "epoch": 0.8286666666666667, "grad_norm": 24.461769104003906, "learning_rate": 1.500504226032676e-08, "logits/chosen": 2.738536834716797, "logits/rejected": 2.9089412689208984, "logps/chosen": -71.81253051757812, "logps/rejected": -463.59075927734375, "loss": 0.9291, "nll_loss": 0.9206735491752625, "rewards/accuracies": 1.0, "rewards/chosen": 1.9282715320587158, "rewards/margins": 9.391138076782227, "rewards/rejected": -7.462866306304932, "step": 4972 }, { "epoch": 0.8288333333333333, "grad_norm": 42.449832916259766, "learning_rate": 1.497661491601252e-08, "logits/chosen": 2.480316162109375, "logits/rejected": 2.6134533882141113, "logps/chosen": -16.809823989868164, "logps/rejected": -49.338836669921875, "loss": 0.6792, "nll_loss": 0.6225860714912415, "rewards/accuracies": 1.0, "rewards/chosen": 1.1877079010009766, "rewards/margins": 4.303097724914551, "rewards/rejected": -3.115389585494995, "step": 4973 }, { "epoch": 0.829, "grad_norm": 52.05221176147461, "learning_rate": 1.494821234547149e-08, "logits/chosen": 2.0727405548095703, "logits/rejected": 2.30338454246521, "logps/chosen": -18.504507064819336, "logps/rejected": -112.24713134765625, "loss": 0.7484, "nll_loss": 0.740180253982544, "rewards/accuracies": 1.0, "rewards/chosen": 1.9603151082992554, "rewards/margins": 9.194510459899902, "rewards/rejected": -7.234195709228516, "step": 4974 }, { "epoch": 0.8291666666666667, "grad_norm": 24.40326690673828, "learning_rate": 1.4919834556979472e-08, "logits/chosen": 2.0809237957000732, "logits/rejected": 1.390808343887329, "logps/chosen": -80.91535186767578, "logps/rejected": -37.691837310791016, "loss": 0.8052, "nll_loss": 0.7423426508903503, "rewards/accuracies": 1.0, "rewards/chosen": 1.479130744934082, "rewards/margins": 4.280540466308594, "rewards/rejected": -2.801409959793091, "step": 4975 }, { "epoch": 0.8293333333333334, "grad_norm": 26.70008087158203, "learning_rate": 1.4891481558805074e-08, "logits/chosen": 2.9299099445343018, "logits/rejected": 3.058809757232666, "logps/chosen": -30.10077667236328, "logps/rejected": -377.1210632324219, "loss": 0.5632, "nll_loss": 0.5472868084907532, "rewards/accuracies": 1.0, "rewards/chosen": 1.2362136840820312, "rewards/margins": 9.076667785644531, "rewards/rejected": -7.8404541015625, "step": 4976 }, { "epoch": 0.8295, "grad_norm": 28.902982711791992, "learning_rate": 1.486315335920969e-08, "logits/chosen": 1.9481921195983887, "logits/rejected": 1.6659722328186035, "logps/chosen": -58.90370178222656, "logps/rejected": -69.94636535644531, "loss": 0.8036, "nll_loss": 0.7649831175804138, "rewards/accuracies": 1.0, "rewards/chosen": 1.9966179132461548, "rewards/margins": 5.21842622756958, "rewards/rejected": -3.221808433532715, "step": 4977 }, { "epoch": 0.8296666666666667, "grad_norm": 26.91926383972168, "learning_rate": 1.4834849966447482e-08, "logits/chosen": 1.2833552360534668, "logits/rejected": 1.5352957248687744, "logps/chosen": -77.55487060546875, "logps/rejected": -198.97149658203125, "loss": 0.7205, "nll_loss": 0.6803059577941895, "rewards/accuracies": 1.0, "rewards/chosen": 1.3406037092208862, "rewards/margins": 4.872803211212158, "rewards/rejected": -3.5321993827819824, "step": 4978 }, { "epoch": 0.8298333333333333, "grad_norm": 31.007064819335938, "learning_rate": 1.4806571388765399e-08, "logits/chosen": 2.056980609893799, "logits/rejected": 2.339848279953003, "logps/chosen": -142.007568359375, "logps/rejected": -227.8433837890625, "loss": 1.1951, "nll_loss": 1.1639964580535889, "rewards/accuracies": 1.0, "rewards/chosen": 1.0028564929962158, "rewards/margins": 5.447339057922363, "rewards/rejected": -4.444482326507568, "step": 4979 }, { "epoch": 0.83, "grad_norm": 28.947402954101562, "learning_rate": 1.477831763440308e-08, "logits/chosen": 2.2406582832336426, "logits/rejected": 2.061286211013794, "logps/chosen": -53.45180130004883, "logps/rejected": -93.66270446777344, "loss": 0.8305, "nll_loss": 0.7977879643440247, "rewards/accuracies": 1.0, "rewards/chosen": 0.9483288526535034, "rewards/margins": 5.358795166015625, "rewards/rejected": -4.410466194152832, "step": 4980 }, { "epoch": 0.8301666666666667, "grad_norm": 30.986970901489258, "learning_rate": 1.4750088711593034e-08, "logits/chosen": 2.9636545181274414, "logits/rejected": 2.837265968322754, "logps/chosen": -94.12910461425781, "logps/rejected": -254.3462677001953, "loss": 1.0607, "nll_loss": 1.0458788871765137, "rewards/accuracies": 1.0, "rewards/chosen": 1.2867707014083862, "rewards/margins": 10.319147109985352, "rewards/rejected": -9.032376289367676, "step": 4981 }, { "epoch": 0.8303333333333334, "grad_norm": 25.40505599975586, "learning_rate": 1.4721884628560488e-08, "logits/chosen": 2.544610023498535, "logits/rejected": 2.1653974056243896, "logps/chosen": -83.25123596191406, "logps/rejected": -85.50873565673828, "loss": 0.8551, "nll_loss": 0.8325122594833374, "rewards/accuracies": 1.0, "rewards/chosen": 1.5598313808441162, "rewards/margins": 5.911694526672363, "rewards/rejected": -4.351863384246826, "step": 4982 }, { "epoch": 0.8305, "grad_norm": 53.06403732299805, "learning_rate": 1.4693705393523448e-08, "logits/chosen": 2.4684417247772217, "logits/rejected": 2.6223649978637695, "logps/chosen": -37.81772232055664, "logps/rejected": -58.4886589050293, "loss": 1.3634, "nll_loss": 1.3506327867507935, "rewards/accuracies": 1.0, "rewards/chosen": 2.5095250606536865, "rewards/margins": 7.046109199523926, "rewards/rejected": -4.536584377288818, "step": 4983 }, { "epoch": 0.8306666666666667, "grad_norm": 16.96236801147461, "learning_rate": 1.4665551014692623e-08, "logits/chosen": 1.9478503465652466, "logits/rejected": 2.5536394119262695, "logps/chosen": -13.928038597106934, "logps/rejected": -524.8927001953125, "loss": 0.2473, "nll_loss": 0.24013859033584595, "rewards/accuracies": 1.0, "rewards/chosen": 2.1198887825012207, "rewards/margins": 9.386825561523438, "rewards/rejected": -7.266937255859375, "step": 4984 }, { "epoch": 0.8308333333333333, "grad_norm": 24.262474060058594, "learning_rate": 1.4637421500271551e-08, "logits/chosen": 1.4701157808303833, "logits/rejected": 2.403007984161377, "logps/chosen": -42.21950149536133, "logps/rejected": -222.91452026367188, "loss": 0.5774, "nll_loss": 0.562926709651947, "rewards/accuracies": 1.0, "rewards/chosen": 1.5375412702560425, "rewards/margins": 7.267437934875488, "rewards/rejected": -5.729896545410156, "step": 4985 }, { "epoch": 0.831, "grad_norm": 233.34901428222656, "learning_rate": 1.4609316858456489e-08, "logits/chosen": 2.514730930328369, "logits/rejected": 2.7734663486480713, "logps/chosen": -107.14700317382812, "logps/rejected": -13.161333084106445, "loss": 3.9, "nll_loss": 1.0504608154296875, "rewards/accuracies": 0.0, "rewards/chosen": 2.2368927001953125, "rewards/margins": -2.041464328765869, "rewards/rejected": 4.278357028961182, "step": 4986 }, { "epoch": 0.8311666666666667, "grad_norm": 98.55927276611328, "learning_rate": 1.4581237097436482e-08, "logits/chosen": 1.9055259227752686, "logits/rejected": 2.8998799324035645, "logps/chosen": -23.62490463256836, "logps/rejected": -507.46954345703125, "loss": 1.4877, "nll_loss": 1.476556658744812, "rewards/accuracies": 1.0, "rewards/chosen": 1.6045112609863281, "rewards/margins": 9.591651916503906, "rewards/rejected": -7.987140655517578, "step": 4987 }, { "epoch": 0.8313333333333334, "grad_norm": 97.13651275634766, "learning_rate": 1.4553182225393257e-08, "logits/chosen": 1.8305535316467285, "logits/rejected": 2.258333921432495, "logps/chosen": -97.66902923583984, "logps/rejected": -378.814453125, "loss": 2.5994, "nll_loss": 2.325453519821167, "rewards/accuracies": 1.0, "rewards/chosen": -2.2879388332366943, "rewards/margins": 4.402093887329102, "rewards/rejected": -6.690032958984375, "step": 4988 }, { "epoch": 0.8315, "grad_norm": 141.99276733398438, "learning_rate": 1.452515225050136e-08, "logits/chosen": 1.1777715682983398, "logits/rejected": 2.4810354709625244, "logps/chosen": -37.829689025878906, "logps/rejected": -226.151123046875, "loss": 0.9642, "nll_loss": 0.4788568615913391, "rewards/accuracies": 1.0, "rewards/chosen": 1.1601914167404175, "rewards/margins": 1.3252854347229004, "rewards/rejected": -0.16509400308132172, "step": 4989 }, { "epoch": 0.8316666666666667, "grad_norm": 31.928552627563477, "learning_rate": 1.4497147180928027e-08, "logits/chosen": 2.1582913398742676, "logits/rejected": 1.893019437789917, "logps/chosen": -80.8492431640625, "logps/rejected": -127.25083923339844, "loss": 0.9486, "nll_loss": 0.9293015599250793, "rewards/accuracies": 1.0, "rewards/chosen": 1.0533889532089233, "rewards/margins": 8.109674453735352, "rewards/rejected": -7.056285858154297, "step": 4990 }, { "epoch": 0.8318333333333333, "grad_norm": 71.85832214355469, "learning_rate": 1.4469167024833306e-08, "logits/chosen": 1.7014304399490356, "logits/rejected": 1.9263142347335815, "logps/chosen": -32.87868118286133, "logps/rejected": -107.14948272705078, "loss": 1.2031, "nll_loss": 1.1742388010025024, "rewards/accuracies": 1.0, "rewards/chosen": 0.8074566125869751, "rewards/margins": 6.068519592285156, "rewards/rejected": -5.261063098907471, "step": 4991 }, { "epoch": 0.832, "grad_norm": 28.470672607421875, "learning_rate": 1.4441211790369889e-08, "logits/chosen": 3.906055212020874, "logits/rejected": 3.710989236831665, "logps/chosen": -51.472984313964844, "logps/rejected": -103.83450317382812, "loss": 0.8188, "nll_loss": 0.7918920516967773, "rewards/accuracies": 1.0, "rewards/chosen": 2.040756940841675, "rewards/margins": 5.72986364364624, "rewards/rejected": -3.6891067028045654, "step": 4992 }, { "epoch": 0.8321666666666667, "grad_norm": 129.39418029785156, "learning_rate": 1.4413281485683292e-08, "logits/chosen": 3.1001806259155273, "logits/rejected": 3.1607604026794434, "logps/chosen": -44.375343322753906, "logps/rejected": -28.064952850341797, "loss": 1.7011, "nll_loss": 0.5220628976821899, "rewards/accuracies": 0.0, "rewards/chosen": 1.5128601789474487, "rewards/margins": -0.06994473934173584, "rewards/rejected": 1.5828049182891846, "step": 4993 }, { "epoch": 0.8323333333333334, "grad_norm": 73.23295593261719, "learning_rate": 1.438537611891173e-08, "logits/chosen": 1.526916742324829, "logits/rejected": 2.328206777572632, "logps/chosen": -14.500629425048828, "logps/rejected": -235.9878387451172, "loss": 0.818, "nll_loss": 0.8055904507637024, "rewards/accuracies": 1.0, "rewards/chosen": 1.598475694656372, "rewards/margins": 7.987375259399414, "rewards/rejected": -6.388899803161621, "step": 4994 }, { "epoch": 0.8325, "grad_norm": 94.81830596923828, "learning_rate": 1.4357495698186183e-08, "logits/chosen": 2.615412950515747, "logits/rejected": 2.465128183364868, "logps/chosen": -126.51026916503906, "logps/rejected": -9.491560935974121, "loss": 2.0401, "nll_loss": 0.9371130466461182, "rewards/accuracies": 1.0, "rewards/chosen": 3.4590911865234375, "rewards/margins": 0.7725014686584473, "rewards/rejected": 2.6865897178649902, "step": 4995 }, { "epoch": 0.8326666666666667, "grad_norm": 61.0550422668457, "learning_rate": 1.432964023163028e-08, "logits/chosen": 2.596764326095581, "logits/rejected": 2.7282488346099854, "logps/chosen": -63.86851119995117, "logps/rejected": -212.79241943359375, "loss": 1.8269, "nll_loss": 1.8248145580291748, "rewards/accuracies": 1.0, "rewards/chosen": 3.6677684783935547, "rewards/margins": 10.601505279541016, "rewards/rejected": -6.933736324310303, "step": 4996 }, { "epoch": 0.8328333333333333, "grad_norm": 287.5101013183594, "learning_rate": 1.4301809727360458e-08, "logits/chosen": 1.7303704023361206, "logits/rejected": 1.8974530696868896, "logps/chosen": -87.9713134765625, "logps/rejected": -84.34115600585938, "loss": 2.5065, "nll_loss": 0.9667178988456726, "rewards/accuracies": 0.0, "rewards/chosen": 0.5169571042060852, "rewards/margins": -0.8514809012413025, "rewards/rejected": 1.3684380054473877, "step": 4997 }, { "epoch": 0.833, "grad_norm": 40.53066635131836, "learning_rate": 1.4274004193485877e-08, "logits/chosen": 0.2743454873561859, "logits/rejected": 1.2417917251586914, "logps/chosen": -27.19220733642578, "logps/rejected": -438.0714111328125, "loss": 0.7408, "nll_loss": 0.7349244356155396, "rewards/accuracies": 1.0, "rewards/chosen": 2.251497745513916, "rewards/margins": 11.1256103515625, "rewards/rejected": -8.874112129211426, "step": 4998 }, { "epoch": 0.8331666666666667, "grad_norm": 19.0809383392334, "learning_rate": 1.4246223638108379e-08, "logits/chosen": 2.15963077545166, "logits/rejected": 2.5039024353027344, "logps/chosen": -124.42306518554688, "logps/rejected": -383.8253173828125, "loss": 0.7751, "nll_loss": 0.7728140950202942, "rewards/accuracies": 1.0, "rewards/chosen": 3.4328629970550537, "rewards/margins": 10.675337791442871, "rewards/rejected": -7.242474555969238, "step": 4999 }, { "epoch": 0.8333333333333334, "grad_norm": 39.48408889770508, "learning_rate": 1.4218468069322575e-08, "logits/chosen": 2.717552423477173, "logits/rejected": 2.832937240600586, "logps/chosen": -55.30912780761719, "logps/rejected": -126.80136108398438, "loss": 0.8076, "nll_loss": 0.7090913653373718, "rewards/accuracies": 1.0, "rewards/chosen": 3.130018711090088, "rewards/margins": 4.969402313232422, "rewards/rejected": -1.8393837213516235, "step": 5000 }, { "epoch": 0.8335, "grad_norm": 30.28285026550293, "learning_rate": 1.4190737495215743e-08, "logits/chosen": 2.637503147125244, "logits/rejected": 2.6552846431732178, "logps/chosen": -39.20457458496094, "logps/rejected": -142.11651611328125, "loss": 0.6985, "nll_loss": 0.675940990447998, "rewards/accuracies": 1.0, "rewards/chosen": 1.2555755376815796, "rewards/margins": 6.10720157623291, "rewards/rejected": -4.851625919342041, "step": 5001 }, { "epoch": 0.8336666666666667, "grad_norm": 27.39492416381836, "learning_rate": 1.416303192386793e-08, "logits/chosen": 2.172539472579956, "logits/rejected": 2.1141934394836426, "logps/chosen": -102.26950073242188, "logps/rejected": -222.3735809326172, "loss": 1.1849, "nll_loss": 1.1755115985870361, "rewards/accuracies": 1.0, "rewards/chosen": 1.7783318758010864, "rewards/margins": 10.215177536010742, "rewards/rejected": -8.436845779418945, "step": 5002 }, { "epoch": 0.8338333333333333, "grad_norm": 22.924821853637695, "learning_rate": 1.4135351363351889e-08, "logits/chosen": 1.5935739278793335, "logits/rejected": 2.7969727516174316, "logps/chosen": -87.01041412353516, "logps/rejected": -322.81414794921875, "loss": 1.0099, "nll_loss": 1.000119686126709, "rewards/accuracies": 1.0, "rewards/chosen": 2.2655022144317627, "rewards/margins": 7.529256820678711, "rewards/rejected": -5.263754844665527, "step": 5003 }, { "epoch": 0.834, "grad_norm": 23.4881649017334, "learning_rate": 1.4107695821733023e-08, "logits/chosen": 2.8174779415130615, "logits/rejected": 2.656890392303467, "logps/chosen": -202.13107299804688, "logps/rejected": -101.82210540771484, "loss": 1.006, "nll_loss": 0.9579671621322632, "rewards/accuracies": 1.0, "rewards/chosen": 1.7039262056350708, "rewards/margins": 4.759296417236328, "rewards/rejected": -3.055370330810547, "step": 5004 }, { "epoch": 0.8341666666666666, "grad_norm": 26.657773971557617, "learning_rate": 1.4080065307069523e-08, "logits/chosen": 1.8827983140945435, "logits/rejected": 1.74441397190094, "logps/chosen": -47.15653991699219, "logps/rejected": -117.64686584472656, "loss": 0.7762, "nll_loss": 0.7605894804000854, "rewards/accuracies": 1.0, "rewards/chosen": 1.5930957794189453, "rewards/margins": 6.832580089569092, "rewards/rejected": -5.2394843101501465, "step": 5005 }, { "epoch": 0.8343333333333334, "grad_norm": 42.568172454833984, "learning_rate": 1.4052459827412244e-08, "logits/chosen": 0.5203291773796082, "logits/rejected": 1.6897263526916504, "logps/chosen": -78.52175903320312, "logps/rejected": -300.92236328125, "loss": 1.4159, "nll_loss": 1.3775746822357178, "rewards/accuracies": 1.0, "rewards/chosen": 0.6939637064933777, "rewards/margins": 5.188040256500244, "rewards/rejected": -4.494076728820801, "step": 5006 }, { "epoch": 0.8345, "grad_norm": 30.54026985168457, "learning_rate": 1.4024879390804788e-08, "logits/chosen": 0.42002472281455994, "logits/rejected": 2.7704713344573975, "logps/chosen": -30.461170196533203, "logps/rejected": -567.19970703125, "loss": 0.5404, "nll_loss": 0.5162909626960754, "rewards/accuracies": 1.0, "rewards/chosen": 0.8624893426895142, "rewards/margins": 7.1483540534973145, "rewards/rejected": -6.28586483001709, "step": 5007 }, { "epoch": 0.8346666666666667, "grad_norm": 32.88626480102539, "learning_rate": 1.3997324005283406e-08, "logits/chosen": 2.1853742599487305, "logits/rejected": 2.026488780975342, "logps/chosen": -46.79010772705078, "logps/rejected": -94.02366638183594, "loss": 0.6665, "nll_loss": 0.615659236907959, "rewards/accuracies": 1.0, "rewards/chosen": 2.1597275733947754, "rewards/margins": 4.97756290435791, "rewards/rejected": -2.817835569381714, "step": 5008 }, { "epoch": 0.8348333333333333, "grad_norm": 57.063236236572266, "learning_rate": 1.3969793678877074e-08, "logits/chosen": 2.069777250289917, "logits/rejected": 2.331486701965332, "logps/chosen": -21.82326316833496, "logps/rejected": -63.53424072265625, "loss": 0.9329, "nll_loss": 0.9093027114868164, "rewards/accuracies": 1.0, "rewards/chosen": 1.299999475479126, "rewards/margins": 5.934841156005859, "rewards/rejected": -4.634841442108154, "step": 5009 }, { "epoch": 0.835, "grad_norm": 17.30122947692871, "learning_rate": 1.3942288419607474e-08, "logits/chosen": 2.384862184524536, "logits/rejected": 2.4323859214782715, "logps/chosen": -195.34324645996094, "logps/rejected": -302.0408935546875, "loss": 0.9682, "nll_loss": 0.9622818231582642, "rewards/accuracies": 1.0, "rewards/chosen": 2.2536637783050537, "rewards/margins": 10.671398162841797, "rewards/rejected": -8.417734146118164, "step": 5010 }, { "epoch": 0.8351666666666666, "grad_norm": 28.24090003967285, "learning_rate": 1.3914808235488996e-08, "logits/chosen": 1.826039433479309, "logits/rejected": 1.991191029548645, "logps/chosen": -116.60199737548828, "logps/rejected": -126.9178695678711, "loss": 1.0363, "nll_loss": 1.0318762063980103, "rewards/accuracies": 1.0, "rewards/chosen": 4.143540382385254, "rewards/margins": 9.458906173706055, "rewards/rejected": -5.315365314483643, "step": 5011 }, { "epoch": 0.8353333333333334, "grad_norm": 28.493505477905273, "learning_rate": 1.3887353134528678e-08, "logits/chosen": 2.7567756175994873, "logits/rejected": 2.855945110321045, "logps/chosen": -25.92086410522461, "logps/rejected": -121.88685607910156, "loss": 0.6188, "nll_loss": 0.6028107404708862, "rewards/accuracies": 1.0, "rewards/chosen": 4.484487056732178, "rewards/margins": 8.306684494018555, "rewards/rejected": -3.822197675704956, "step": 5012 }, { "epoch": 0.8355, "grad_norm": 24.78487205505371, "learning_rate": 1.385992312472628e-08, "logits/chosen": 0.21477054059505463, "logits/rejected": 1.263993501663208, "logps/chosen": -63.84959030151367, "logps/rejected": -351.5216369628906, "loss": 0.7706, "nll_loss": 0.7601141333580017, "rewards/accuracies": 1.0, "rewards/chosen": 1.7200603485107422, "rewards/margins": 8.748558044433594, "rewards/rejected": -7.028497695922852, "step": 5013 }, { "epoch": 0.8356666666666667, "grad_norm": 25.53000259399414, "learning_rate": 1.3832518214074262e-08, "logits/chosen": 1.8869054317474365, "logits/rejected": 2.4782872200012207, "logps/chosen": -54.30604553222656, "logps/rejected": -182.2999725341797, "loss": 0.6733, "nll_loss": 0.6704450845718384, "rewards/accuracies": 1.0, "rewards/chosen": 3.001974582672119, "rewards/margins": 11.547327041625977, "rewards/rejected": -8.545352935791016, "step": 5014 }, { "epoch": 0.8358333333333333, "grad_norm": 101.23982238769531, "learning_rate": 1.3805138410557782e-08, "logits/chosen": 2.3232603073120117, "logits/rejected": 1.8502711057662964, "logps/chosen": -56.86090087890625, "logps/rejected": -9.122584342956543, "loss": 2.1669, "nll_loss": 0.8747832179069519, "rewards/accuracies": 1.0, "rewards/chosen": 2.706080675125122, "rewards/margins": 0.1153571605682373, "rewards/rejected": 2.5907235145568848, "step": 5015 }, { "epoch": 0.836, "grad_norm": 84.3620834350586, "learning_rate": 1.3777783722154602e-08, "logits/chosen": 1.8552610874176025, "logits/rejected": 2.5945515632629395, "logps/chosen": -21.327890396118164, "logps/rejected": -139.0260009765625, "loss": 0.9019, "nll_loss": 0.5201925039291382, "rewards/accuracies": 1.0, "rewards/chosen": 1.010175108909607, "rewards/margins": 1.6161870956420898, "rewards/rejected": -0.6060119867324829, "step": 5016 }, { "epoch": 0.8361666666666666, "grad_norm": 30.390228271484375, "learning_rate": 1.3750454156835246e-08, "logits/chosen": 2.702542304992676, "logits/rejected": 3.0000784397125244, "logps/chosen": -92.3597183227539, "logps/rejected": -362.2368469238281, "loss": 1.0949, "nll_loss": 1.073950171470642, "rewards/accuracies": 1.0, "rewards/chosen": 0.9346336722373962, "rewards/margins": 9.140934944152832, "rewards/rejected": -8.20630168914795, "step": 5017 }, { "epoch": 0.8363333333333334, "grad_norm": 40.17919921875, "learning_rate": 1.37231497225629e-08, "logits/chosen": 2.8634955883026123, "logits/rejected": 3.066776990890503, "logps/chosen": -135.82798767089844, "logps/rejected": -540.0441284179688, "loss": 1.5821, "nll_loss": 1.5793951749801636, "rewards/accuracies": 1.0, "rewards/chosen": 3.0636870861053467, "rewards/margins": 11.436810493469238, "rewards/rejected": -8.373123168945312, "step": 5018 }, { "epoch": 0.8365, "grad_norm": 33.48660659790039, "learning_rate": 1.369587042729341e-08, "logits/chosen": 1.3944034576416016, "logits/rejected": 2.3058547973632812, "logps/chosen": -49.201908111572266, "logps/rejected": -275.3214416503906, "loss": 0.9474, "nll_loss": 0.9461905360221863, "rewards/accuracies": 1.0, "rewards/chosen": 4.626630783081055, "rewards/margins": 11.660627365112305, "rewards/rejected": -7.03399658203125, "step": 5019 }, { "epoch": 0.8366666666666667, "grad_norm": 23.36210060119629, "learning_rate": 1.3668616278975342e-08, "logits/chosen": 2.6563351154327393, "logits/rejected": 2.8279500007629395, "logps/chosen": -40.329063415527344, "logps/rejected": -319.888916015625, "loss": 0.5981, "nll_loss": 0.5930745005607605, "rewards/accuracies": 1.0, "rewards/chosen": 2.4543519020080566, "rewards/margins": 10.303298950195312, "rewards/rejected": -7.848947048187256, "step": 5020 }, { "epoch": 0.8368333333333333, "grad_norm": 179.69070434570312, "learning_rate": 1.3641387285549843e-08, "logits/chosen": 2.844451904296875, "logits/rejected": 2.8171355724334717, "logps/chosen": -157.09970092773438, "logps/rejected": -136.6019287109375, "loss": 2.3531, "nll_loss": 1.5554425716400146, "rewards/accuracies": 1.0, "rewards/chosen": -4.327401638031006, "rewards/margins": 1.3943195343017578, "rewards/rejected": -5.721721172332764, "step": 5021 }, { "epoch": 0.837, "grad_norm": 61.27266311645508, "learning_rate": 1.3614183454950823e-08, "logits/chosen": 2.697880744934082, "logits/rejected": 2.7986185550689697, "logps/chosen": -5.207290172576904, "logps/rejected": -42.70844650268555, "loss": 0.4537, "nll_loss": 0.40056082606315613, "rewards/accuracies": 1.0, "rewards/chosen": 1.9287605285644531, "rewards/margins": 4.764327049255371, "rewards/rejected": -2.835566759109497, "step": 5022 }, { "epoch": 0.8371666666666666, "grad_norm": 25.17074966430664, "learning_rate": 1.3587004795104817e-08, "logits/chosen": 0.847513735294342, "logits/rejected": 1.3783729076385498, "logps/chosen": -58.33666229248047, "logps/rejected": -258.4411926269531, "loss": 0.8519, "nll_loss": 0.8454588055610657, "rewards/accuracies": 1.0, "rewards/chosen": 2.250174045562744, "rewards/margins": 9.324749946594238, "rewards/rejected": -7.074575901031494, "step": 5023 }, { "epoch": 0.8373333333333334, "grad_norm": 25.78445816040039, "learning_rate": 1.3559851313931059e-08, "logits/chosen": 1.8647814989089966, "logits/rejected": 2.087799310684204, "logps/chosen": -43.76153564453125, "logps/rejected": -201.63800048828125, "loss": 0.6629, "nll_loss": 0.6435519456863403, "rewards/accuracies": 1.0, "rewards/chosen": 1.0142033100128174, "rewards/margins": 9.409987449645996, "rewards/rejected": -8.395784378051758, "step": 5024 }, { "epoch": 0.8375, "grad_norm": 26.07699203491211, "learning_rate": 1.3532723019341374e-08, "logits/chosen": 1.3095706701278687, "logits/rejected": 1.825118064880371, "logps/chosen": -58.168312072753906, "logps/rejected": -351.44073486328125, "loss": 0.884, "nll_loss": 0.8813380002975464, "rewards/accuracies": 1.0, "rewards/chosen": 3.0375490188598633, "rewards/margins": 14.538073539733887, "rewards/rejected": -11.500524520874023, "step": 5025 }, { "epoch": 0.8376666666666667, "grad_norm": 31.1207275390625, "learning_rate": 1.3505619919240319e-08, "logits/chosen": 2.261029005050659, "logits/rejected": 2.5471303462982178, "logps/chosen": -15.932268142700195, "logps/rejected": -273.5513916015625, "loss": 0.4333, "nll_loss": 0.4306018650531769, "rewards/accuracies": 1.0, "rewards/chosen": 3.0168142318725586, "rewards/margins": 12.93541145324707, "rewards/rejected": -9.918597221374512, "step": 5026 }, { "epoch": 0.8378333333333333, "grad_norm": 28.327964782714844, "learning_rate": 1.3478542021525085e-08, "logits/chosen": 2.5504660606384277, "logits/rejected": 2.7223782539367676, "logps/chosen": -24.682754516601562, "logps/rejected": -296.496337890625, "loss": 0.4942, "nll_loss": 0.49365517497062683, "rewards/accuracies": 1.0, "rewards/chosen": 5.5755133628845215, "rewards/margins": 13.318349838256836, "rewards/rejected": -7.742835998535156, "step": 5027 }, { "epoch": 0.838, "grad_norm": 32.178436279296875, "learning_rate": 1.3451489334085553e-08, "logits/chosen": 2.7379703521728516, "logits/rejected": 2.6913492679595947, "logps/chosen": -12.162301063537598, "logps/rejected": -49.8661003112793, "loss": 0.3816, "nll_loss": 0.337841659784317, "rewards/accuracies": 1.0, "rewards/chosen": 1.066603183746338, "rewards/margins": 4.694851875305176, "rewards/rejected": -3.628248929977417, "step": 5028 }, { "epoch": 0.8381666666666666, "grad_norm": 143.8618927001953, "learning_rate": 1.3424461864804181e-08, "logits/chosen": 1.5442290306091309, "logits/rejected": 1.36536705493927, "logps/chosen": -68.66592407226562, "logps/rejected": -52.65370178222656, "loss": 1.522, "nll_loss": 0.8174513578414917, "rewards/accuracies": 1.0, "rewards/chosen": 1.0937111377716064, "rewards/margins": 0.7131813764572144, "rewards/rejected": 0.3805297911167145, "step": 5029 }, { "epoch": 0.8383333333333334, "grad_norm": 24.210205078125, "learning_rate": 1.3397459621556128e-08, "logits/chosen": 3.3259634971618652, "logits/rejected": 3.368408441543579, "logps/chosen": -47.61208724975586, "logps/rejected": -174.609130859375, "loss": 0.682, "nll_loss": 0.6801726818084717, "rewards/accuracies": 1.0, "rewards/chosen": 7.002758979797363, "rewards/margins": 12.980371475219727, "rewards/rejected": -5.977612495422363, "step": 5030 }, { "epoch": 0.8385, "grad_norm": 11.507716178894043, "learning_rate": 1.3370482612209222e-08, "logits/chosen": 2.106956720352173, "logits/rejected": 2.1290218830108643, "logps/chosen": -156.28509521484375, "logps/rejected": -208.81639099121094, "loss": 0.5965, "nll_loss": 0.5942399501800537, "rewards/accuracies": 1.0, "rewards/chosen": 5.4770355224609375, "rewards/margins": 11.35697078704834, "rewards/rejected": -5.879935264587402, "step": 5031 }, { "epoch": 0.8386666666666667, "grad_norm": 19.161481857299805, "learning_rate": 1.3343530844623929e-08, "logits/chosen": 2.516204833984375, "logits/rejected": 2.545191526412964, "logps/chosen": -79.7427978515625, "logps/rejected": -188.117919921875, "loss": 0.6905, "nll_loss": 0.687437891960144, "rewards/accuracies": 1.0, "rewards/chosen": 2.9129717350006104, "rewards/margins": 12.036364555358887, "rewards/rejected": -9.123393058776855, "step": 5032 }, { "epoch": 0.8388333333333333, "grad_norm": 21.01244354248047, "learning_rate": 1.3316604326653302e-08, "logits/chosen": 2.4832162857055664, "logits/rejected": 2.588027000427246, "logps/chosen": -90.71900177001953, "logps/rejected": -174.23768615722656, "loss": 0.7849, "nll_loss": 0.7753760814666748, "rewards/accuracies": 1.0, "rewards/chosen": 2.565467119216919, "rewards/margins": 7.545271873474121, "rewards/rejected": -4.979804992675781, "step": 5033 }, { "epoch": 0.839, "grad_norm": 33.1271858215332, "learning_rate": 1.328970306614311e-08, "logits/chosen": 2.0330114364624023, "logits/rejected": 2.0543551445007324, "logps/chosen": -85.26356506347656, "logps/rejected": -70.94700622558594, "loss": 1.0429, "nll_loss": 0.991436779499054, "rewards/accuracies": 1.0, "rewards/chosen": 0.9385574460029602, "rewards/margins": 4.412501811981201, "rewards/rejected": -3.4739444255828857, "step": 5034 }, { "epoch": 0.8391666666666666, "grad_norm": 24.85455894470215, "learning_rate": 1.3262827070931714e-08, "logits/chosen": 1.8294153213500977, "logits/rejected": 1.9706231355667114, "logps/chosen": -33.37613296508789, "logps/rejected": -151.61557006835938, "loss": 0.5729, "nll_loss": 0.5656972527503967, "rewards/accuracies": 1.0, "rewards/chosen": 2.1911580562591553, "rewards/margins": 8.784577369689941, "rewards/rejected": -6.593419075012207, "step": 5035 }, { "epoch": 0.8393333333333334, "grad_norm": 37.10083770751953, "learning_rate": 1.3235976348850164e-08, "logits/chosen": 2.6402087211608887, "logits/rejected": 2.7802932262420654, "logps/chosen": -16.49678611755371, "logps/rejected": -210.67588806152344, "loss": 0.6157, "nll_loss": 0.6109921336174011, "rewards/accuracies": 1.0, "rewards/chosen": 3.984517812728882, "rewards/margins": 9.269379615783691, "rewards/rejected": -5.284862041473389, "step": 5036 }, { "epoch": 0.8395, "grad_norm": 51.5920524597168, "learning_rate": 1.3209150907722123e-08, "logits/chosen": 2.8387808799743652, "logits/rejected": 2.8711953163146973, "logps/chosen": -19.397348403930664, "logps/rejected": -71.56446838378906, "loss": 0.8884, "nll_loss": 0.8816976547241211, "rewards/accuracies": 1.0, "rewards/chosen": 3.028123617172241, "rewards/margins": 8.25462818145752, "rewards/rejected": -5.226504802703857, "step": 5037 }, { "epoch": 0.8396666666666667, "grad_norm": 21.970985412597656, "learning_rate": 1.3182350755363825e-08, "logits/chosen": 1.3303078413009644, "logits/rejected": 1.2699105739593506, "logps/chosen": -52.80702209472656, "logps/rejected": -97.5475845336914, "loss": 0.6014, "nll_loss": 0.5933372378349304, "rewards/accuracies": 1.0, "rewards/chosen": 2.2267043590545654, "rewards/margins": 8.127367973327637, "rewards/rejected": -5.900663375854492, "step": 5038 }, { "epoch": 0.8398333333333333, "grad_norm": 33.390560150146484, "learning_rate": 1.3155575899584225e-08, "logits/chosen": 0.7630746960639954, "logits/rejected": 1.8718383312225342, "logps/chosen": -54.94377517700195, "logps/rejected": -286.55517578125, "loss": 0.865, "nll_loss": 0.858496367931366, "rewards/accuracies": 1.0, "rewards/chosen": 2.140115737915039, "rewards/margins": 10.770367622375488, "rewards/rejected": -8.63025188446045, "step": 5039 }, { "epoch": 0.84, "grad_norm": 29.28778839111328, "learning_rate": 1.3128826348184884e-08, "logits/chosen": 2.3155934810638428, "logits/rejected": 2.5745043754577637, "logps/chosen": -53.314453125, "logps/rejected": -202.44886779785156, "loss": 0.78, "nll_loss": 0.7616350650787354, "rewards/accuracies": 1.0, "rewards/chosen": 1.2666046619415283, "rewards/margins": 6.9129438400268555, "rewards/rejected": -5.646339416503906, "step": 5040 }, { "epoch": 0.8401666666666666, "grad_norm": 32.605552673339844, "learning_rate": 1.3102102108959989e-08, "logits/chosen": 2.398313045501709, "logits/rejected": 2.448775291442871, "logps/chosen": -37.856319427490234, "logps/rejected": -34.48212432861328, "loss": 0.7029, "nll_loss": 0.5567105412483215, "rewards/accuracies": 1.0, "rewards/chosen": 2.6417813301086426, "rewards/margins": 4.033154487609863, "rewards/rejected": -1.3913732767105103, "step": 5041 }, { "epoch": 0.8403333333333334, "grad_norm": 32.9068489074707, "learning_rate": 1.3075403189696288e-08, "logits/chosen": 2.535675525665283, "logits/rejected": 2.556196451187134, "logps/chosen": -75.2030029296875, "logps/rejected": -125.11978912353516, "loss": 1.0674, "nll_loss": 0.989513099193573, "rewards/accuracies": 1.0, "rewards/chosen": 1.669891357421875, "rewards/margins": 4.110166549682617, "rewards/rejected": -2.440274953842163, "step": 5042 }, { "epoch": 0.8405, "grad_norm": 15.620347023010254, "learning_rate": 1.3048729598173247e-08, "logits/chosen": 2.6440985202789307, "logits/rejected": 2.56441068649292, "logps/chosen": -171.78245544433594, "logps/rejected": -84.14750671386719, "loss": 0.6861, "nll_loss": 0.676308810710907, "rewards/accuracies": 1.0, "rewards/chosen": 3.2978198528289795, "rewards/margins": 7.854297637939453, "rewards/rejected": -4.556478023529053, "step": 5043 }, { "epoch": 0.8406666666666667, "grad_norm": 19.83200454711914, "learning_rate": 1.3022081342162893e-08, "logits/chosen": 3.262021780014038, "logits/rejected": 3.186521053314209, "logps/chosen": -89.22022247314453, "logps/rejected": -166.5821533203125, "loss": 0.8293, "nll_loss": 0.8261132836341858, "rewards/accuracies": 1.0, "rewards/chosen": 3.7319681644439697, "rewards/margins": 9.690620422363281, "rewards/rejected": -5.958652019500732, "step": 5044 }, { "epoch": 0.8408333333333333, "grad_norm": 30.114585876464844, "learning_rate": 1.2995458429429917e-08, "logits/chosen": 3.1673974990844727, "logits/rejected": 3.237257242202759, "logps/chosen": -81.27523803710938, "logps/rejected": -303.8717041015625, "loss": 1.0554, "nll_loss": 1.0419901609420776, "rewards/accuracies": 1.0, "rewards/chosen": 1.4071136713027954, "rewards/margins": 9.38220500946045, "rewards/rejected": -7.975091457366943, "step": 5045 }, { "epoch": 0.841, "grad_norm": 24.062705993652344, "learning_rate": 1.2968860867731567e-08, "logits/chosen": 2.97006893157959, "logits/rejected": 3.141284465789795, "logps/chosen": -36.724700927734375, "logps/rejected": -503.3829040527344, "loss": 0.5726, "nll_loss": 0.5649954080581665, "rewards/accuracies": 1.0, "rewards/chosen": 1.9762616157531738, "rewards/margins": 11.572853088378906, "rewards/rejected": -9.596590995788574, "step": 5046 }, { "epoch": 0.8411666666666666, "grad_norm": 96.57597351074219, "learning_rate": 1.2942288664817724e-08, "logits/chosen": 2.896284341812134, "logits/rejected": 2.9430160522460938, "logps/chosen": -42.08317565917969, "logps/rejected": -98.1630630493164, "loss": 1.5715, "nll_loss": 1.202376365661621, "rewards/accuracies": 1.0, "rewards/chosen": 1.8710952997207642, "rewards/margins": 2.1110031604766846, "rewards/rejected": -0.2399078607559204, "step": 5047 }, { "epoch": 0.8413333333333334, "grad_norm": 27.111738204956055, "learning_rate": 1.2915741828430926e-08, "logits/chosen": 0.29462847113609314, "logits/rejected": 2.3340511322021484, "logps/chosen": -18.097578048706055, "logps/rejected": -377.8680419921875, "loss": 0.4191, "nll_loss": 0.41130855679512024, "rewards/accuracies": 1.0, "rewards/chosen": 1.9335561990737915, "rewards/margins": 16.381271362304688, "rewards/rejected": -14.447715759277344, "step": 5048 }, { "epoch": 0.8415, "grad_norm": 141.8604278564453, "learning_rate": 1.2889220366306275e-08, "logits/chosen": 1.7513004541397095, "logits/rejected": 1.590952754020691, "logps/chosen": -98.2514877319336, "logps/rejected": -39.735477447509766, "loss": 1.9408, "nll_loss": 1.0679510831832886, "rewards/accuracies": 1.0, "rewards/chosen": 0.8689277768135071, "rewards/margins": 0.28557318449020386, "rewards/rejected": 0.5833545923233032, "step": 5049 }, { "epoch": 0.8416666666666667, "grad_norm": 603.356201171875, "learning_rate": 1.2862724286171466e-08, "logits/chosen": 1.860280156135559, "logits/rejected": 1.7798951864242554, "logps/chosen": -409.31781005859375, "logps/rejected": -106.61003112792969, "loss": 5.7152, "nll_loss": 1.1032823324203491, "rewards/accuracies": 0.0, "rewards/chosen": -3.8199801445007324, "rewards/margins": -4.56363582611084, "rewards/rejected": 0.7436554431915283, "step": 5050 }, { "epoch": 0.8418333333333333, "grad_norm": 152.42678833007812, "learning_rate": 1.2836253595746827e-08, "logits/chosen": 2.4244322776794434, "logits/rejected": 2.5467119216918945, "logps/chosen": -211.3375244140625, "logps/rejected": -137.36880493164062, "loss": 2.4004, "nll_loss": 1.6256731748580933, "rewards/accuracies": 1.0, "rewards/chosen": -4.681805610656738, "rewards/margins": 2.148345947265625, "rewards/rejected": -6.830151557922363, "step": 5051 }, { "epoch": 0.842, "grad_norm": 26.306047439575195, "learning_rate": 1.2809808302745295e-08, "logits/chosen": 1.221207618713379, "logits/rejected": 2.0669970512390137, "logps/chosen": -69.03932189941406, "logps/rejected": -368.2776184082031, "loss": 0.8173, "nll_loss": 0.8122274279594421, "rewards/accuracies": 1.0, "rewards/chosen": 2.5082650184631348, "rewards/margins": 9.649467468261719, "rewards/rejected": -7.141201972961426, "step": 5052 }, { "epoch": 0.8421666666666666, "grad_norm": 16.661787033081055, "learning_rate": 1.2783388414872399e-08, "logits/chosen": 2.960237979888916, "logits/rejected": 2.8145384788513184, "logps/chosen": -165.1609344482422, "logps/rejected": -102.4871826171875, "loss": 0.905, "nll_loss": 0.8976137638092041, "rewards/accuracies": 1.0, "rewards/chosen": 2.6329331398010254, "rewards/margins": 8.038881301879883, "rewards/rejected": -5.405948162078857, "step": 5053 }, { "epoch": 0.8423333333333334, "grad_norm": 70.4345932006836, "learning_rate": 1.2756993939826233e-08, "logits/chosen": 2.927482843399048, "logits/rejected": 2.858976125717163, "logps/chosen": -50.20387268066406, "logps/rejected": -54.134521484375, "loss": 1.9059, "nll_loss": 1.6734625101089478, "rewards/accuracies": 1.0, "rewards/chosen": 1.8284072875976562, "rewards/margins": 2.775576114654541, "rewards/rejected": -0.94716876745224, "step": 5054 }, { "epoch": 0.8425, "grad_norm": 30.330501556396484, "learning_rate": 1.2730624885297536e-08, "logits/chosen": 3.2514138221740723, "logits/rejected": 3.2646517753601074, "logps/chosen": -81.2559814453125, "logps/rejected": -303.31024169921875, "loss": 1.0551, "nll_loss": 1.041743278503418, "rewards/accuracies": 1.0, "rewards/chosen": 1.4090393781661987, "rewards/margins": 9.327984809875488, "rewards/rejected": -7.9189453125, "step": 5055 }, { "epoch": 0.8426666666666667, "grad_norm": 23.67348289489746, "learning_rate": 1.2704281258969596e-08, "logits/chosen": 2.388721227645874, "logits/rejected": 2.6694300174713135, "logps/chosen": -59.17483139038086, "logps/rejected": -289.458984375, "loss": 0.7141, "nll_loss": 0.7044624090194702, "rewards/accuracies": 1.0, "rewards/chosen": 1.9109928607940674, "rewards/margins": 8.164661407470703, "rewards/rejected": -6.253668785095215, "step": 5056 }, { "epoch": 0.8428333333333333, "grad_norm": 42.85346603393555, "learning_rate": 1.2677963068518337e-08, "logits/chosen": 2.719592571258545, "logits/rejected": 3.0436108112335205, "logps/chosen": -17.648799896240234, "logps/rejected": -110.07709503173828, "loss": 0.6606, "nll_loss": 0.6536594033241272, "rewards/accuracies": 1.0, "rewards/chosen": 2.0998952388763428, "rewards/margins": 10.021767616271973, "rewards/rejected": -7.921872138977051, "step": 5057 }, { "epoch": 0.843, "grad_norm": 31.147144317626953, "learning_rate": 1.2651670321612263e-08, "logits/chosen": 1.6423078775405884, "logits/rejected": 1.6697334051132202, "logps/chosen": -116.46366119384766, "logps/rejected": -139.59814453125, "loss": 1.284, "nll_loss": 1.265909194946289, "rewards/accuracies": 1.0, "rewards/chosen": 1.2478065490722656, "rewards/margins": 7.05353307723999, "rewards/rejected": -5.805726528167725, "step": 5058 }, { "epoch": 0.8431666666666666, "grad_norm": 26.088502883911133, "learning_rate": 1.2625403025912396e-08, "logits/chosen": 2.792433500289917, "logits/rejected": 3.0872113704681396, "logps/chosen": -58.72013854980469, "logps/rejected": -336.19256591796875, "loss": 0.7101, "nll_loss": 0.6990493535995483, "rewards/accuracies": 1.0, "rewards/chosen": 1.5824006795883179, "rewards/margins": 11.434396743774414, "rewards/rejected": -9.851996421813965, "step": 5059 }, { "epoch": 0.8433333333333334, "grad_norm": 19.580669403076172, "learning_rate": 1.2599161189072427e-08, "logits/chosen": 1.4315979480743408, "logits/rejected": 0.8290650844573975, "logps/chosen": -88.04834747314453, "logps/rejected": -56.85050964355469, "loss": 0.7278, "nll_loss": 0.6932940483093262, "rewards/accuracies": 1.0, "rewards/chosen": 2.8235924243927, "rewards/margins": 5.962882041931152, "rewards/rejected": -3.1392898559570312, "step": 5060 }, { "epoch": 0.8435, "grad_norm": 22.994033813476562, "learning_rate": 1.2572944818738584e-08, "logits/chosen": 1.8838850259780884, "logits/rejected": 2.107341766357422, "logps/chosen": -70.89012145996094, "logps/rejected": -241.32113647460938, "loss": 0.6752, "nll_loss": 0.6625245809555054, "rewards/accuracies": 1.0, "rewards/chosen": 1.4453620910644531, "rewards/margins": 11.372381210327148, "rewards/rejected": -9.927019119262695, "step": 5061 }, { "epoch": 0.8436666666666667, "grad_norm": 29.973007202148438, "learning_rate": 1.2546753922549735e-08, "logits/chosen": 2.7600698471069336, "logits/rejected": 2.8232946395874023, "logps/chosen": -55.018768310546875, "logps/rejected": -87.19850158691406, "loss": 0.7715, "nll_loss": 0.7434969544410706, "rewards/accuracies": 1.0, "rewards/chosen": 1.0809814929962158, "rewards/margins": 5.6559038162231445, "rewards/rejected": -4.57492208480835, "step": 5062 }, { "epoch": 0.8438333333333333, "grad_norm": 27.818777084350586, "learning_rate": 1.2520588508137209e-08, "logits/chosen": 2.6916985511779785, "logits/rejected": 2.9343526363372803, "logps/chosen": -30.850482940673828, "logps/rejected": -240.63877868652344, "loss": 0.5866, "nll_loss": 0.5713053345680237, "rewards/accuracies": 1.0, "rewards/chosen": 1.396041989326477, "rewards/margins": 7.50271463394165, "rewards/rejected": -6.106672763824463, "step": 5063 }, { "epoch": 0.844, "grad_norm": 30.985370635986328, "learning_rate": 1.2494448583125018e-08, "logits/chosen": 2.7825076580047607, "logits/rejected": 3.036048650741577, "logps/chosen": -47.011962890625, "logps/rejected": -211.2247314453125, "loss": 0.8037, "nll_loss": 0.7968131303787231, "rewards/accuracies": 1.0, "rewards/chosen": 2.1110992431640625, "rewards/margins": 9.918939590454102, "rewards/rejected": -7.807840347290039, "step": 5064 }, { "epoch": 0.8441666666666666, "grad_norm": 23.871557235717773, "learning_rate": 1.24683341551297e-08, "logits/chosen": 2.9920568466186523, "logits/rejected": 3.040330171585083, "logps/chosen": -116.92398071289062, "logps/rejected": -152.906494140625, "loss": 1.022, "nll_loss": 1.007965326309204, "rewards/accuracies": 1.0, "rewards/chosen": 1.9594131708145142, "rewards/margins": 6.819384574890137, "rewards/rejected": -4.859971523284912, "step": 5065 }, { "epoch": 0.8443333333333334, "grad_norm": 21.71895408630371, "learning_rate": 1.2442245231760396e-08, "logits/chosen": 2.712388753890991, "logits/rejected": 2.770206928253174, "logps/chosen": -14.960067749023438, "logps/rejected": -188.2747344970703, "loss": 0.3113, "nll_loss": 0.2992013990879059, "rewards/accuracies": 1.0, "rewards/chosen": 2.7042815685272217, "rewards/margins": 7.211421012878418, "rewards/rejected": -4.507139682769775, "step": 5066 }, { "epoch": 0.8445, "grad_norm": 79.09848022460938, "learning_rate": 1.2416181820618743e-08, "logits/chosen": 2.189903736114502, "logits/rejected": 2.1222164630889893, "logps/chosen": -9.302108764648438, "logps/rejected": -99.29544830322266, "loss": 0.678, "nll_loss": 0.6644362807273865, "rewards/accuracies": 1.0, "rewards/chosen": 2.1265616416931152, "rewards/margins": 6.858452796936035, "rewards/rejected": -4.73189115524292, "step": 5067 }, { "epoch": 0.8446666666666667, "grad_norm": 30.311710357666016, "learning_rate": 1.2390143929299023e-08, "logits/chosen": 2.7306466102600098, "logits/rejected": 2.982402801513672, "logps/chosen": -27.821359634399414, "logps/rejected": -240.4058380126953, "loss": 0.5815, "nll_loss": 0.5564272403717041, "rewards/accuracies": 1.0, "rewards/chosen": 0.8483713269233704, "rewards/margins": 6.814674377441406, "rewards/rejected": -5.966302871704102, "step": 5068 }, { "epoch": 0.8448333333333333, "grad_norm": 26.748138427734375, "learning_rate": 1.2364131565388058e-08, "logits/chosen": 2.5428895950317383, "logits/rejected": 2.646827220916748, "logps/chosen": -27.91336441040039, "logps/rejected": -328.9354248046875, "loss": 0.5755, "nll_loss": 0.569660484790802, "rewards/accuracies": 1.0, "rewards/chosen": 2.23386549949646, "rewards/margins": 14.876909255981445, "rewards/rejected": -12.643043518066406, "step": 5069 }, { "epoch": 0.845, "grad_norm": 21.02984619140625, "learning_rate": 1.2338144736465239e-08, "logits/chosen": 2.5954442024230957, "logits/rejected": 2.9648163318634033, "logps/chosen": -113.2273178100586, "logps/rejected": -82.04246520996094, "loss": 0.7415, "nll_loss": 0.7166286110877991, "rewards/accuracies": 1.0, "rewards/chosen": 3.6972603797912598, "rewards/margins": 7.0974202156066895, "rewards/rejected": -3.4001598358154297, "step": 5070 }, { "epoch": 0.8451666666666666, "grad_norm": 61.415096282958984, "learning_rate": 1.2312183450102464e-08, "logits/chosen": 1.5579426288604736, "logits/rejected": 2.549823522567749, "logps/chosen": -37.23770523071289, "logps/rejected": -209.98377990722656, "loss": 1.2877, "nll_loss": 1.2840584516525269, "rewards/accuracies": 1.0, "rewards/chosen": 2.8140499591827393, "rewards/margins": 10.536415100097656, "rewards/rejected": -7.722365379333496, "step": 5071 }, { "epoch": 0.8453333333333334, "grad_norm": 31.624807357788086, "learning_rate": 1.228624771386424e-08, "logits/chosen": 2.4981460571289062, "logits/rejected": 2.5738298892974854, "logps/chosen": -20.862504959106445, "logps/rejected": -137.84881591796875, "loss": 0.5463, "nll_loss": 0.5349360108375549, "rewards/accuracies": 1.0, "rewards/chosen": 2.3986525535583496, "rewards/margins": 7.20806884765625, "rewards/rejected": -4.8094162940979, "step": 5072 }, { "epoch": 0.8455, "grad_norm": 22.25510025024414, "learning_rate": 1.226033753530763e-08, "logits/chosen": 2.4330129623413086, "logits/rejected": 2.897385597229004, "logps/chosen": -31.572689056396484, "logps/rejected": -328.2094421386719, "loss": 0.5252, "nll_loss": 0.5092369318008423, "rewards/accuracies": 1.0, "rewards/chosen": 1.3948822021484375, "rewards/margins": 7.210943698883057, "rewards/rejected": -5.816061496734619, "step": 5073 }, { "epoch": 0.8456666666666667, "grad_norm": 57.18408966064453, "learning_rate": 1.2234452921982263e-08, "logits/chosen": 2.4137954711914062, "logits/rejected": 2.647124767303467, "logps/chosen": -24.563716888427734, "logps/rejected": -135.97238159179688, "loss": 0.9624, "nll_loss": 0.909767210483551, "rewards/accuracies": 1.0, "rewards/chosen": 0.6172782778739929, "rewards/margins": 4.426639556884766, "rewards/rejected": -3.809361219406128, "step": 5074 }, { "epoch": 0.8458333333333333, "grad_norm": 30.109336853027344, "learning_rate": 1.220859388143024e-08, "logits/chosen": 1.7889975309371948, "logits/rejected": 2.6131205558776855, "logps/chosen": -31.413549423217773, "logps/rejected": -190.13516235351562, "loss": 0.5759, "nll_loss": 0.5416128635406494, "rewards/accuracies": 1.0, "rewards/chosen": 0.38777485489845276, "rewards/margins": 10.196317672729492, "rewards/rejected": -9.80854320526123, "step": 5075 }, { "epoch": 0.846, "grad_norm": 36.28724670410156, "learning_rate": 1.218276042118629e-08, "logits/chosen": 2.7565059661865234, "logits/rejected": 2.8804664611816406, "logps/chosen": -19.620466232299805, "logps/rejected": -215.85354614257812, "loss": 0.5718, "nll_loss": 0.5605847835540771, "rewards/accuracies": 1.0, "rewards/chosen": 1.735131025314331, "rewards/margins": 7.919878005981445, "rewards/rejected": -6.184747219085693, "step": 5076 }, { "epoch": 0.8461666666666666, "grad_norm": 30.659215927124023, "learning_rate": 1.2156952548777655e-08, "logits/chosen": 1.827347993850708, "logits/rejected": 1.712415099143982, "logps/chosen": -116.06866455078125, "logps/rejected": -141.59689331054688, "loss": 1.2786, "nll_loss": 1.2616158723831177, "rewards/accuracies": 1.0, "rewards/chosen": 1.287306308746338, "rewards/margins": 7.29290771484375, "rewards/rejected": -6.005601406097412, "step": 5077 }, { "epoch": 0.8463333333333334, "grad_norm": 48.5043830871582, "learning_rate": 1.2131170271724145e-08, "logits/chosen": 2.8532512187957764, "logits/rejected": 2.7475950717926025, "logps/chosen": -34.73738479614258, "logps/rejected": -54.57199478149414, "loss": 0.9892, "nll_loss": 0.9141417145729065, "rewards/accuracies": 1.0, "rewards/chosen": 1.2963674068450928, "rewards/margins": 3.9545700550079346, "rewards/rejected": -2.658202648162842, "step": 5078 }, { "epoch": 0.8465, "grad_norm": 24.203819274902344, "learning_rate": 1.2105413597538105e-08, "logits/chosen": 2.2374024391174316, "logits/rejected": 2.103689432144165, "logps/chosen": -90.38823699951172, "logps/rejected": -98.32820892333984, "loss": 1.0318, "nll_loss": 1.0271388292312622, "rewards/accuracies": 1.0, "rewards/chosen": 3.1265757083892822, "rewards/margins": 8.935945510864258, "rewards/rejected": -5.8093695640563965, "step": 5079 }, { "epoch": 0.8466666666666667, "grad_norm": 22.623592376708984, "learning_rate": 1.2079682533724378e-08, "logits/chosen": 1.322622537612915, "logits/rejected": 1.4174416065216064, "logps/chosen": -75.68345642089844, "logps/rejected": -140.91433715820312, "loss": 0.7537, "nll_loss": 0.7419946193695068, "rewards/accuracies": 1.0, "rewards/chosen": 2.153656005859375, "rewards/margins": 7.15395450592041, "rewards/rejected": -5.000298500061035, "step": 5080 }, { "epoch": 0.8468333333333333, "grad_norm": 17.522518157958984, "learning_rate": 1.2053977087780398e-08, "logits/chosen": 2.0558881759643555, "logits/rejected": 1.885585069656372, "logps/chosen": -132.0452880859375, "logps/rejected": -128.25277709960938, "loss": 0.768, "nll_loss": 0.7632675766944885, "rewards/accuracies": 1.0, "rewards/chosen": 2.98529052734375, "rewards/margins": 8.96754264831543, "rewards/rejected": -5.9822516441345215, "step": 5081 }, { "epoch": 0.847, "grad_norm": 262.0534362792969, "learning_rate": 1.2028297267196109e-08, "logits/chosen": 2.9412841796875, "logits/rejected": 3.1207995414733887, "logps/chosen": -24.874099731445312, "logps/rejected": -98.42505645751953, "loss": 2.361, "nll_loss": 0.777315616607666, "rewards/accuracies": 0.0, "rewards/chosen": 0.6546744108200073, "rewards/margins": -0.8783624172210693, "rewards/rejected": 1.5330368280410767, "step": 5082 }, { "epoch": 0.8471666666666666, "grad_norm": 26.106815338134766, "learning_rate": 1.2002643079454022e-08, "logits/chosen": 2.8302409648895264, "logits/rejected": 3.0119659900665283, "logps/chosen": -82.30506896972656, "logps/rejected": -281.84539794921875, "loss": 0.8751, "nll_loss": 0.8573445677757263, "rewards/accuracies": 1.0, "rewards/chosen": 1.782915472984314, "rewards/margins": 6.3651299476623535, "rewards/rejected": -4.58221435546875, "step": 5083 }, { "epoch": 0.8473333333333334, "grad_norm": 14.528467178344727, "learning_rate": 1.1977014532029107e-08, "logits/chosen": 2.3181142807006836, "logits/rejected": 2.3701155185699463, "logps/chosen": -190.0546875, "logps/rejected": -147.50601196289062, "loss": 0.6745, "nll_loss": 0.6692067384719849, "rewards/accuracies": 1.0, "rewards/chosen": 3.1062774658203125, "rewards/margins": 8.683927536010742, "rewards/rejected": -5.5776495933532715, "step": 5084 }, { "epoch": 0.8475, "grad_norm": 264.7221984863281, "learning_rate": 1.195141163238892e-08, "logits/chosen": 2.952023983001709, "logits/rejected": 2.8821704387664795, "logps/chosen": -63.15718460083008, "logps/rejected": -20.6796817779541, "loss": 3.3815, "nll_loss": 1.1483123302459717, "rewards/accuracies": 0.0, "rewards/chosen": -0.6264083981513977, "rewards/margins": -1.8897535800933838, "rewards/rejected": 1.2633452415466309, "step": 5085 }, { "epoch": 0.8476666666666667, "grad_norm": 29.62980842590332, "learning_rate": 1.1925834387993538e-08, "logits/chosen": 2.482588052749634, "logits/rejected": 2.6165668964385986, "logps/chosen": -23.91451644897461, "logps/rejected": -205.07037353515625, "loss": 0.5607, "nll_loss": 0.5435118079185486, "rewards/accuracies": 1.0, "rewards/chosen": 1.5958667993545532, "rewards/margins": 6.5321855545043945, "rewards/rejected": -4.936318874359131, "step": 5086 }, { "epoch": 0.8478333333333333, "grad_norm": 227.79676818847656, "learning_rate": 1.1900282806295592e-08, "logits/chosen": 3.411198854446411, "logits/rejected": 3.359123706817627, "logps/chosen": -46.50010681152344, "logps/rejected": -66.3202896118164, "loss": 3.2255, "nll_loss": 1.2236870527267456, "rewards/accuracies": 0.0, "rewards/chosen": 2.2708823680877686, "rewards/margins": -1.0399606227874756, "rewards/rejected": 3.310842990875244, "step": 5087 }, { "epoch": 0.848, "grad_norm": 50.56968688964844, "learning_rate": 1.1874756894740135e-08, "logits/chosen": 1.6545644998550415, "logits/rejected": 2.6572091579437256, "logps/chosen": -102.67457580566406, "logps/rejected": -287.9089050292969, "loss": 1.4152, "nll_loss": 1.4065009355545044, "rewards/accuracies": 1.0, "rewards/chosen": 1.9425981044769287, "rewards/margins": 8.72722053527832, "rewards/rejected": -6.7846221923828125, "step": 5088 }, { "epoch": 0.8481666666666666, "grad_norm": 53.669105529785156, "learning_rate": 1.1849256660764828e-08, "logits/chosen": 2.3746988773345947, "logits/rejected": 2.684894561767578, "logps/chosen": -39.71847152709961, "logps/rejected": -168.6275634765625, "loss": 0.9438, "nll_loss": 0.8634451031684875, "rewards/accuracies": 1.0, "rewards/chosen": 0.2800022065639496, "rewards/margins": 3.687117576599121, "rewards/rejected": -3.4071154594421387, "step": 5089 }, { "epoch": 0.8483333333333334, "grad_norm": 53.58294677734375, "learning_rate": 1.1823782111799841e-08, "logits/chosen": 2.292088031768799, "logits/rejected": 2.308025598526001, "logps/chosen": -91.69347381591797, "logps/rejected": -29.928142547607422, "loss": 1.4764, "nll_loss": 0.9651946425437927, "rewards/accuracies": 1.0, "rewards/chosen": 3.3799102306365967, "rewards/margins": 2.5387301445007324, "rewards/rejected": 0.8411800265312195, "step": 5090 }, { "epoch": 0.8485, "grad_norm": 22.466075897216797, "learning_rate": 1.1798333255267855e-08, "logits/chosen": 2.168731212615967, "logits/rejected": 2.686112403869629, "logps/chosen": -12.526546478271484, "logps/rejected": -152.1455841064453, "loss": 0.3708, "nll_loss": 0.34795957803726196, "rewards/accuracies": 1.0, "rewards/chosen": 1.1170347929000854, "rewards/margins": 6.2887420654296875, "rewards/rejected": -5.1717071533203125, "step": 5091 }, { "epoch": 0.8486666666666667, "grad_norm": 28.24310874938965, "learning_rate": 1.177291009858402e-08, "logits/chosen": 1.4379229545593262, "logits/rejected": 1.445936679840088, "logps/chosen": -55.97455978393555, "logps/rejected": -69.79106140136719, "loss": 0.7118, "nll_loss": 0.6826165318489075, "rewards/accuracies": 1.0, "rewards/chosen": 0.7987174987792969, "rewards/margins": 6.017770767211914, "rewards/rejected": -5.219053268432617, "step": 5092 }, { "epoch": 0.8488333333333333, "grad_norm": 60.53874588012695, "learning_rate": 1.1747512649156055e-08, "logits/chosen": 3.1889429092407227, "logits/rejected": 3.132716655731201, "logps/chosen": -55.63716125488281, "logps/rejected": -105.77677917480469, "loss": 1.8182, "nll_loss": 1.7947471141815186, "rewards/accuracies": 1.0, "rewards/chosen": 1.863826036453247, "rewards/margins": 5.869331359863281, "rewards/rejected": -4.005505561828613, "step": 5093 }, { "epoch": 0.849, "grad_norm": 35.39863204956055, "learning_rate": 1.172214091438416e-08, "logits/chosen": 1.6231714487075806, "logits/rejected": 2.2637739181518555, "logps/chosen": -45.2922477722168, "logps/rejected": -163.8565673828125, "loss": 0.9944, "nll_loss": 0.9636648297309875, "rewards/accuracies": 1.0, "rewards/chosen": 0.8806740641593933, "rewards/margins": 5.626593112945557, "rewards/rejected": -4.745919227600098, "step": 5094 }, { "epoch": 0.8491666666666666, "grad_norm": 53.75872802734375, "learning_rate": 1.1696794901661078e-08, "logits/chosen": 2.7589101791381836, "logits/rejected": 2.928140640258789, "logps/chosen": -43.67804718017578, "logps/rejected": -166.70223999023438, "loss": 0.8247, "nll_loss": 0.8088527917861938, "rewards/accuracies": 1.0, "rewards/chosen": 1.2740074396133423, "rewards/margins": 8.15943717956543, "rewards/rejected": -6.885429382324219, "step": 5095 }, { "epoch": 0.8493333333333334, "grad_norm": 22.663978576660156, "learning_rate": 1.1671474618371979e-08, "logits/chosen": 2.6191623210906982, "logits/rejected": 2.457735300064087, "logps/chosen": -10.05056095123291, "logps/rejected": -46.6176643371582, "loss": 0.2537, "nll_loss": 0.2233457714319229, "rewards/accuracies": 1.0, "rewards/chosen": 2.7442290782928467, "rewards/margins": 6.052044868469238, "rewards/rejected": -3.3078160285949707, "step": 5096 }, { "epoch": 0.8495, "grad_norm": 97.05767822265625, "learning_rate": 1.1646180071894606e-08, "logits/chosen": 2.824925422668457, "logits/rejected": 2.7880990505218506, "logps/chosen": -66.43997192382812, "logps/rejected": -42.88172912597656, "loss": 1.4604, "nll_loss": 0.7636778354644775, "rewards/accuracies": 1.0, "rewards/chosen": 1.532392978668213, "rewards/margins": 0.8987041711807251, "rewards/rejected": 0.6336888074874878, "step": 5097 }, { "epoch": 0.8496666666666667, "grad_norm": 28.017906188964844, "learning_rate": 1.1620911269599199e-08, "logits/chosen": 2.217689037322998, "logits/rejected": 2.2627172470092773, "logps/chosen": -68.01728820800781, "logps/rejected": -316.4819641113281, "loss": 0.9926, "nll_loss": 0.9857578277587891, "rewards/accuracies": 1.0, "rewards/chosen": 2.1027450561523438, "rewards/margins": 10.259696960449219, "rewards/rejected": -8.156951904296875, "step": 5098 }, { "epoch": 0.8498333333333333, "grad_norm": 26.064794540405273, "learning_rate": 1.1595668218848465e-08, "logits/chosen": 1.8652832508087158, "logits/rejected": 2.867550849914551, "logps/chosen": -73.54598236083984, "logps/rejected": -305.1517028808594, "loss": 0.9189, "nll_loss": 0.9079750776290894, "rewards/accuracies": 1.0, "rewards/chosen": 1.6090021133422852, "rewards/margins": 10.104873657226562, "rewards/rejected": -8.495871543884277, "step": 5099 }, { "epoch": 0.85, "grad_norm": 23.93058967590332, "learning_rate": 1.1570450926997655e-08, "logits/chosen": 1.9552701711654663, "logits/rejected": 2.4750866889953613, "logps/chosen": -92.9524154663086, "logps/rejected": -535.4852294921875, "loss": 0.9332, "nll_loss": 0.9295242428779602, "rewards/accuracies": 1.0, "rewards/chosen": 2.7191970348358154, "rewards/margins": 12.637147903442383, "rewards/rejected": -9.917950630187988, "step": 5100 }, { "epoch": 0.8501666666666666, "grad_norm": 28.843856811523438, "learning_rate": 1.1545259401394425e-08, "logits/chosen": 1.8399320840835571, "logits/rejected": 2.4402856826782227, "logps/chosen": -59.471229553222656, "logps/rejected": -158.6636962890625, "loss": 0.9193, "nll_loss": 0.9010794162750244, "rewards/accuracies": 1.0, "rewards/chosen": 2.2225067615509033, "rewards/margins": 6.383166313171387, "rewards/rejected": -4.1606597900390625, "step": 5101 }, { "epoch": 0.8503333333333334, "grad_norm": 28.077974319458008, "learning_rate": 1.1520093649379015e-08, "logits/chosen": 1.9145468473434448, "logits/rejected": 2.4699547290802, "logps/chosen": -41.742855072021484, "logps/rejected": -302.8493957519531, "loss": 0.6666, "nll_loss": 0.642197847366333, "rewards/accuracies": 1.0, "rewards/chosen": 1.0343598127365112, "rewards/margins": 6.197197437286377, "rewards/rejected": -5.162837505340576, "step": 5102 }, { "epoch": 0.8505, "grad_norm": 36.730899810791016, "learning_rate": 1.1494953678284103e-08, "logits/chosen": 0.1924019753932953, "logits/rejected": 1.5902608633041382, "logps/chosen": -63.70797348022461, "logps/rejected": -558.3157958984375, "loss": 1.066, "nll_loss": 1.0443931818008423, "rewards/accuracies": 1.0, "rewards/chosen": 0.8792301416397095, "rewards/margins": 17.274330139160156, "rewards/rejected": -16.395099639892578, "step": 5103 }, { "epoch": 0.8506666666666667, "grad_norm": 30.860675811767578, "learning_rate": 1.1469839495434919e-08, "logits/chosen": 2.448983669281006, "logits/rejected": 2.475165605545044, "logps/chosen": -111.683349609375, "logps/rejected": -131.8330841064453, "loss": 1.1767, "nll_loss": 1.1513746976852417, "rewards/accuracies": 1.0, "rewards/chosen": 2.4110443592071533, "rewards/margins": 6.032766819000244, "rewards/rejected": -3.621722459793091, "step": 5104 }, { "epoch": 0.8508333333333333, "grad_norm": 24.460081100463867, "learning_rate": 1.1444751108149075e-08, "logits/chosen": 1.7263613939285278, "logits/rejected": 2.4601495265960693, "logps/chosen": -95.57907104492188, "logps/rejected": -174.57693481445312, "loss": 0.9836, "nll_loss": 0.9654449820518494, "rewards/accuracies": 1.0, "rewards/chosen": 1.8088196516036987, "rewards/margins": 6.314611911773682, "rewards/rejected": -4.505792140960693, "step": 5105 }, { "epoch": 0.851, "grad_norm": 35.46180725097656, "learning_rate": 1.1419688523736759e-08, "logits/chosen": 3.1059699058532715, "logits/rejected": 3.0007054805755615, "logps/chosen": -63.262062072753906, "logps/rejected": -131.2398681640625, "loss": 0.8686, "nll_loss": 0.843494176864624, "rewards/accuracies": 1.0, "rewards/chosen": 2.2892982959747314, "rewards/margins": 5.970717430114746, "rewards/rejected": -3.6814193725585938, "step": 5106 }, { "epoch": 0.8511666666666666, "grad_norm": 29.950464248657227, "learning_rate": 1.1394651749500594e-08, "logits/chosen": 2.5482776165008545, "logits/rejected": 2.6463396549224854, "logps/chosen": -111.03201293945312, "logps/rejected": -159.33534240722656, "loss": 1.0532, "nll_loss": 1.0376824140548706, "rewards/accuracies": 1.0, "rewards/chosen": 1.327835202217102, "rewards/margins": 7.820592403411865, "rewards/rejected": -6.492757320404053, "step": 5107 }, { "epoch": 0.8513333333333334, "grad_norm": 37.92477035522461, "learning_rate": 1.1369640792735713e-08, "logits/chosen": 3.381150722503662, "logits/rejected": 3.324814796447754, "logps/chosen": -48.240318298339844, "logps/rejected": -166.8029022216797, "loss": 0.9213, "nll_loss": 0.8770966529846191, "rewards/accuracies": 1.0, "rewards/chosen": 1.9159477949142456, "rewards/margins": 4.997291088104248, "rewards/rejected": -3.081343173980713, "step": 5108 }, { "epoch": 0.8515, "grad_norm": 23.907474517822266, "learning_rate": 1.1344655660729674e-08, "logits/chosen": 2.6688127517700195, "logits/rejected": 2.7217133045196533, "logps/chosen": -89.06587219238281, "logps/rejected": -255.29327392578125, "loss": 0.9003, "nll_loss": 0.8906585574150085, "rewards/accuracies": 1.0, "rewards/chosen": 1.7357276678085327, "rewards/margins": 10.13828182220459, "rewards/rejected": -8.402554512023926, "step": 5109 }, { "epoch": 0.8516666666666667, "grad_norm": 15.763911247253418, "learning_rate": 1.1319696360762566e-08, "logits/chosen": 2.41436767578125, "logits/rejected": 2.363435745239258, "logps/chosen": -247.05697631835938, "logps/rejected": -320.3453369140625, "loss": 0.8533, "nll_loss": 0.8489929437637329, "rewards/accuracies": 1.0, "rewards/chosen": 2.5318546295166016, "rewards/margins": 12.621132850646973, "rewards/rejected": -10.089278221130371, "step": 5110 }, { "epoch": 0.8518333333333333, "grad_norm": 29.000164031982422, "learning_rate": 1.1294762900106925e-08, "logits/chosen": 2.597032070159912, "logits/rejected": 2.795654773712158, "logps/chosen": -88.37540435791016, "logps/rejected": -240.89674377441406, "loss": 1.2338, "nll_loss": 1.2274360656738281, "rewards/accuracies": 1.0, "rewards/chosen": 2.7004618644714355, "rewards/margins": 8.345124244689941, "rewards/rejected": -5.644662380218506, "step": 5111 }, { "epoch": 0.852, "grad_norm": 19.251544952392578, "learning_rate": 1.1269855286027796e-08, "logits/chosen": 2.644611120223999, "logits/rejected": 2.73366641998291, "logps/chosen": -123.73268127441406, "logps/rejected": -79.81632232666016, "loss": 0.6628, "nll_loss": 0.6345265507698059, "rewards/accuracies": 1.0, "rewards/chosen": 1.0634323358535767, "rewards/margins": 5.643139839172363, "rewards/rejected": -4.579707622528076, "step": 5112 }, { "epoch": 0.8521666666666666, "grad_norm": 22.73198890686035, "learning_rate": 1.1244973525782597e-08, "logits/chosen": 2.3793890476226807, "logits/rejected": 2.3079237937927246, "logps/chosen": -53.831111907958984, "logps/rejected": -128.14248657226562, "loss": 0.6045, "nll_loss": 0.5851207971572876, "rewards/accuracies": 1.0, "rewards/chosen": 2.4750194549560547, "rewards/margins": 6.429478645324707, "rewards/rejected": -3.9544594287872314, "step": 5113 }, { "epoch": 0.8523333333333334, "grad_norm": 91.18299865722656, "learning_rate": 1.1220117626621317e-08, "logits/chosen": 2.5031545162200928, "logits/rejected": 2.6943204402923584, "logps/chosen": -22.263229370117188, "logps/rejected": -138.96469116210938, "loss": 1.1217, "nll_loss": 1.1131614446640015, "rewards/accuracies": 1.0, "rewards/chosen": 2.0545387268066406, "rewards/margins": 8.291985511779785, "rewards/rejected": -6.2374467849731445, "step": 5114 }, { "epoch": 0.8525, "grad_norm": 25.574188232421875, "learning_rate": 1.1195287595786351e-08, "logits/chosen": 2.140324115753174, "logits/rejected": 2.270413637161255, "logps/chosen": -32.104331970214844, "logps/rejected": -83.97415924072266, "loss": 0.4663, "nll_loss": 0.4458934962749481, "rewards/accuracies": 1.0, "rewards/chosen": 1.1756668090820312, "rewards/margins": 6.636634826660156, "rewards/rejected": -5.460968017578125, "step": 5115 }, { "epoch": 0.8526666666666667, "grad_norm": 65.10804748535156, "learning_rate": 1.1170483440512612e-08, "logits/chosen": 2.0629420280456543, "logits/rejected": 1.9944111108779907, "logps/chosen": -53.71894454956055, "logps/rejected": -114.62578582763672, "loss": 1.8613, "nll_loss": 1.8523776531219482, "rewards/accuracies": 1.0, "rewards/chosen": 1.9767497777938843, "rewards/margins": 8.338664054870605, "rewards/rejected": -6.361914157867432, "step": 5116 }, { "epoch": 0.8528333333333333, "grad_norm": 128.74720764160156, "learning_rate": 1.1145705168027375e-08, "logits/chosen": 2.2484116554260254, "logits/rejected": 1.839155673980713, "logps/chosen": -51.15364074707031, "logps/rejected": -46.36909484863281, "loss": 1.8643, "nll_loss": 1.1367474794387817, "rewards/accuracies": 1.0, "rewards/chosen": 1.8093292713165283, "rewards/margins": 0.9381683468818665, "rewards/rejected": 0.8711609244346619, "step": 5117 }, { "epoch": 0.853, "grad_norm": 24.747203826904297, "learning_rate": 1.1120952785550476e-08, "logits/chosen": 1.6697584390640259, "logits/rejected": 1.503033995628357, "logps/chosen": -52.19110107421875, "logps/rejected": -74.57624816894531, "loss": 0.6795, "nll_loss": 0.644334614276886, "rewards/accuracies": 1.0, "rewards/chosen": 2.289668321609497, "rewards/margins": 5.536505699157715, "rewards/rejected": -3.246837615966797, "step": 5118 }, { "epoch": 0.8531666666666666, "grad_norm": 23.431184768676758, "learning_rate": 1.1096226300294142e-08, "logits/chosen": 1.4066654443740845, "logits/rejected": 2.280801773071289, "logps/chosen": -46.449729919433594, "logps/rejected": -195.26548767089844, "loss": 0.6613, "nll_loss": 0.6542216539382935, "rewards/accuracies": 1.0, "rewards/chosen": 2.5254127979278564, "rewards/margins": 8.200884819030762, "rewards/rejected": -5.675472259521484, "step": 5119 }, { "epoch": 0.8533333333333334, "grad_norm": 25.117284774780273, "learning_rate": 1.1071525719463092e-08, "logits/chosen": 0.9288985729217529, "logits/rejected": 1.465327262878418, "logps/chosen": -124.36634826660156, "logps/rejected": -227.5496826171875, "loss": 0.9602, "nll_loss": 0.9493615031242371, "rewards/accuracies": 1.0, "rewards/chosen": 1.6053177118301392, "rewards/margins": 10.631580352783203, "rewards/rejected": -9.026262283325195, "step": 5120 }, { "epoch": 0.8535, "grad_norm": 226.82821655273438, "learning_rate": 1.1046851050254501e-08, "logits/chosen": 2.898667573928833, "logits/rejected": 2.961080551147461, "logps/chosen": -75.18114471435547, "logps/rejected": -16.69450569152832, "loss": 4.8335, "nll_loss": 1.2530192136764526, "rewards/accuracies": 0.0, "rewards/chosen": 0.8236343264579773, "rewards/margins": -3.0981101989746094, "rewards/rejected": 3.9217445850372314, "step": 5121 }, { "epoch": 0.8536666666666667, "grad_norm": 18.000106811523438, "learning_rate": 1.1022202299857941e-08, "logits/chosen": 2.795109510421753, "logits/rejected": 2.8448684215545654, "logps/chosen": -145.3325653076172, "logps/rejected": -50.60966491699219, "loss": 0.72, "nll_loss": 0.6920599341392517, "rewards/accuracies": 1.0, "rewards/chosen": 2.4711196422576904, "rewards/margins": 5.9508209228515625, "rewards/rejected": -3.479701280593872, "step": 5122 }, { "epoch": 0.8538333333333333, "grad_norm": 27.46475601196289, "learning_rate": 1.0997579475455465e-08, "logits/chosen": 2.454742908477783, "logits/rejected": 2.4593796730041504, "logps/chosen": -77.79734802246094, "logps/rejected": -146.79225158691406, "loss": 0.9269, "nll_loss": 0.9152628183364868, "rewards/accuracies": 1.0, "rewards/chosen": 1.564610481262207, "rewards/margins": 9.074563980102539, "rewards/rejected": -7.509953498840332, "step": 5123 }, { "epoch": 0.854, "grad_norm": 47.45281219482422, "learning_rate": 1.097298258422159e-08, "logits/chosen": 2.1347782611846924, "logits/rejected": 2.030229091644287, "logps/chosen": -24.855365753173828, "logps/rejected": -87.02163696289062, "loss": 0.6832, "nll_loss": 0.6717665791511536, "rewards/accuracies": 1.0, "rewards/chosen": 2.4945034980773926, "rewards/margins": 7.221250534057617, "rewards/rejected": -4.726747035980225, "step": 5124 }, { "epoch": 0.8541666666666666, "grad_norm": 30.12635612487793, "learning_rate": 1.0948411633323284e-08, "logits/chosen": 2.1432204246520996, "logits/rejected": 2.467705488204956, "logps/chosen": -43.878936767578125, "logps/rejected": -169.236083984375, "loss": 0.7825, "nll_loss": 0.7698058485984802, "rewards/accuracies": 1.0, "rewards/chosen": 2.110572099685669, "rewards/margins": 6.989931106567383, "rewards/rejected": -4.879358768463135, "step": 5125 }, { "epoch": 0.8543333333333333, "grad_norm": 32.65489959716797, "learning_rate": 1.0923866629919875e-08, "logits/chosen": 2.148427724838257, "logits/rejected": 2.487226724624634, "logps/chosen": -9.332413673400879, "logps/rejected": -80.81869506835938, "loss": 0.4169, "nll_loss": 0.38885053992271423, "rewards/accuracies": 1.0, "rewards/chosen": 1.4828288555145264, "rewards/margins": 5.4984025955200195, "rewards/rejected": -4.015573978424072, "step": 5126 }, { "epoch": 0.8545, "grad_norm": 95.68143463134766, "learning_rate": 1.089934758116322e-08, "logits/chosen": 2.5434975624084473, "logits/rejected": 2.521836757659912, "logps/chosen": -162.43603515625, "logps/rejected": -252.26336669921875, "loss": 1.4765, "nll_loss": 1.2591941356658936, "rewards/accuracies": 1.0, "rewards/chosen": -1.0623321533203125, "rewards/margins": 2.298511028289795, "rewards/rejected": -3.3608431816101074, "step": 5127 }, { "epoch": 0.8546666666666667, "grad_norm": 27.89240837097168, "learning_rate": 1.0874854494197571e-08, "logits/chosen": 2.699462652206421, "logits/rejected": 2.6175148487091064, "logps/chosen": -98.82633209228516, "logps/rejected": -128.66781616210938, "loss": 1.0211, "nll_loss": 1.0084319114685059, "rewards/accuracies": 1.0, "rewards/chosen": 1.518470048904419, "rewards/margins": 8.30557632446289, "rewards/rejected": -6.787106513977051, "step": 5128 }, { "epoch": 0.8548333333333333, "grad_norm": 32.705440521240234, "learning_rate": 1.0850387376159664e-08, "logits/chosen": 2.4898462295532227, "logits/rejected": 2.7836196422576904, "logps/chosen": -23.503219604492188, "logps/rejected": -124.02235412597656, "loss": 0.6248, "nll_loss": 0.5875805020332336, "rewards/accuracies": 1.0, "rewards/chosen": 0.596230685710907, "rewards/margins": 5.448153972625732, "rewards/rejected": -4.85192346572876, "step": 5129 }, { "epoch": 0.855, "grad_norm": 24.850229263305664, "learning_rate": 1.0825946234178573e-08, "logits/chosen": 1.0780775547027588, "logits/rejected": 2.2074079513549805, "logps/chosen": -66.8503646850586, "logps/rejected": -336.58551025390625, "loss": 0.8659, "nll_loss": 0.8570559620857239, "rewards/accuracies": 1.0, "rewards/chosen": 1.9384148120880127, "rewards/margins": 8.586684226989746, "rewards/rejected": -6.6482696533203125, "step": 5130 }, { "epoch": 0.8551666666666666, "grad_norm": 30.57651138305664, "learning_rate": 1.0801531075375881e-08, "logits/chosen": 2.655829906463623, "logits/rejected": 2.833399772644043, "logps/chosen": -85.97088623046875, "logps/rejected": -246.04457092285156, "loss": 1.0133, "nll_loss": 0.9996612668037415, "rewards/accuracies": 1.0, "rewards/chosen": 1.610907793045044, "rewards/margins": 7.293612480163574, "rewards/rejected": -5.682704448699951, "step": 5131 }, { "epoch": 0.8553333333333333, "grad_norm": 63.13911819458008, "learning_rate": 1.0777141906865583e-08, "logits/chosen": 2.244283437728882, "logits/rejected": 2.5076048374176025, "logps/chosen": -51.09501266479492, "logps/rejected": -298.1199035644531, "loss": 1.8324, "nll_loss": 1.8248215913772583, "rewards/accuracies": 1.0, "rewards/chosen": 2.191499710083008, "rewards/margins": 8.477333068847656, "rewards/rejected": -6.285833835601807, "step": 5132 }, { "epoch": 0.8555, "grad_norm": 18.7597713470459, "learning_rate": 1.075277873575412e-08, "logits/chosen": 2.0803422927856445, "logits/rejected": 2.593144655227661, "logps/chosen": -55.88213348388672, "logps/rejected": -151.07261657714844, "loss": 0.5604, "nll_loss": 0.553288459777832, "rewards/accuracies": 1.0, "rewards/chosen": 2.178170919418335, "rewards/margins": 8.87032699584961, "rewards/rejected": -6.6921563148498535, "step": 5133 }, { "epoch": 0.8556666666666667, "grad_norm": 24.205257415771484, "learning_rate": 1.0728441569140289e-08, "logits/chosen": 2.8899924755096436, "logits/rejected": 2.64455246925354, "logps/chosen": -138.42141723632812, "logps/rejected": -65.58830261230469, "loss": 0.9241, "nll_loss": 0.8988402485847473, "rewards/accuracies": 1.0, "rewards/chosen": 0.9144729971885681, "rewards/margins": 6.371016502380371, "rewards/rejected": -5.456543445587158, "step": 5134 }, { "epoch": 0.8558333333333333, "grad_norm": 23.725313186645508, "learning_rate": 1.0704130414115387e-08, "logits/chosen": 1.4078351259231567, "logits/rejected": 2.1001551151275635, "logps/chosen": -54.40418243408203, "logps/rejected": -252.53103637695312, "loss": 0.6283, "nll_loss": 0.6112829446792603, "rewards/accuracies": 1.0, "rewards/chosen": 1.3476532697677612, "rewards/margins": 6.993412971496582, "rewards/rejected": -5.645759582519531, "step": 5135 }, { "epoch": 0.856, "grad_norm": 31.28285026550293, "learning_rate": 1.067984527776309e-08, "logits/chosen": 2.837639808654785, "logits/rejected": 3.0440433025360107, "logps/chosen": -61.070945739746094, "logps/rejected": -211.63873291015625, "loss": 0.8084, "nll_loss": 0.7931291460990906, "rewards/accuracies": 1.0, "rewards/chosen": 1.5300911664962769, "rewards/margins": 7.02195405960083, "rewards/rejected": -5.491862773895264, "step": 5136 }, { "epoch": 0.8561666666666666, "grad_norm": 75.80364227294922, "learning_rate": 1.0655586167159525e-08, "logits/chosen": 2.8466956615448, "logits/rejected": 2.771049976348877, "logps/chosen": -68.50546264648438, "logps/rejected": -77.36587524414062, "loss": 1.2037, "nll_loss": 1.1811286211013794, "rewards/accuracies": 1.0, "rewards/chosen": 1.1134017705917358, "rewards/margins": 6.323483943939209, "rewards/rejected": -5.210082054138184, "step": 5137 }, { "epoch": 0.8563333333333333, "grad_norm": 37.01739501953125, "learning_rate": 1.0631353089373185e-08, "logits/chosen": 1.6438868045806885, "logits/rejected": 1.5582702159881592, "logps/chosen": -37.171791076660156, "logps/rejected": -84.38545227050781, "loss": 0.8429, "nll_loss": 0.8080823421478271, "rewards/accuracies": 1.0, "rewards/chosen": 0.8916946649551392, "rewards/margins": 5.234850883483887, "rewards/rejected": -4.343156337738037, "step": 5138 }, { "epoch": 0.8565, "grad_norm": 32.32474136352539, "learning_rate": 1.0607146051465011e-08, "logits/chosen": 2.5827035903930664, "logits/rejected": 2.5190269947052, "logps/chosen": -36.40596389770508, "logps/rejected": -53.9149169921875, "loss": 0.7873, "nll_loss": 0.7584574818611145, "rewards/accuracies": 1.0, "rewards/chosen": 1.4848953485488892, "rewards/margins": 5.452865123748779, "rewards/rejected": -3.9679696559906006, "step": 5139 }, { "epoch": 0.8566666666666667, "grad_norm": 29.349807739257812, "learning_rate": 1.0582965060488358e-08, "logits/chosen": 1.5398586988449097, "logits/rejected": 1.7063863277435303, "logps/chosen": -10.012712478637695, "logps/rejected": -61.27614212036133, "loss": 0.2846, "nll_loss": 0.2503177523612976, "rewards/accuracies": 1.0, "rewards/chosen": 1.1077769994735718, "rewards/margins": 5.152702808380127, "rewards/rejected": -4.044925689697266, "step": 5140 }, { "epoch": 0.8568333333333333, "grad_norm": 22.405025482177734, "learning_rate": 1.0558810123488992e-08, "logits/chosen": 2.44290828704834, "logits/rejected": 2.4119770526885986, "logps/chosen": -114.92240142822266, "logps/rejected": -128.97329711914062, "loss": 0.9647, "nll_loss": 0.9497719407081604, "rewards/accuracies": 1.0, "rewards/chosen": 2.9414210319519043, "rewards/margins": 7.071928024291992, "rewards/rejected": -4.130506992340088, "step": 5141 }, { "epoch": 0.857, "grad_norm": 32.60691833496094, "learning_rate": 1.0534681247505106e-08, "logits/chosen": 2.0362296104431152, "logits/rejected": 2.465040922164917, "logps/chosen": -18.063312530517578, "logps/rejected": -521.2152099609375, "loss": 0.6709, "nll_loss": 0.669011652469635, "rewards/accuracies": 1.0, "rewards/chosen": 3.4191484451293945, "rewards/margins": 12.207381248474121, "rewards/rejected": -8.788232803344727, "step": 5142 }, { "epoch": 0.8571666666666666, "grad_norm": 22.913986206054688, "learning_rate": 1.0510578439567219e-08, "logits/chosen": 1.3659100532531738, "logits/rejected": 1.7425408363342285, "logps/chosen": -85.84435272216797, "logps/rejected": -355.32275390625, "loss": 0.841, "nll_loss": 0.8334403038024902, "rewards/accuracies": 1.0, "rewards/chosen": 1.988111138343811, "rewards/margins": 10.410245895385742, "rewards/rejected": -8.422134399414062, "step": 5143 }, { "epoch": 0.8573333333333333, "grad_norm": 26.71038818359375, "learning_rate": 1.0486501706698336e-08, "logits/chosen": 2.8396525382995605, "logits/rejected": 2.9166629314422607, "logps/chosen": -117.55113220214844, "logps/rejected": -86.2042236328125, "loss": 1.1456, "nll_loss": 1.1195344924926758, "rewards/accuracies": 1.0, "rewards/chosen": 1.643096923828125, "rewards/margins": 5.642389297485352, "rewards/rejected": -3.9992923736572266, "step": 5144 }, { "epoch": 0.8575, "grad_norm": 26.305349349975586, "learning_rate": 1.0462451055913845e-08, "logits/chosen": 2.534527540206909, "logits/rejected": 2.464939594268799, "logps/chosen": -7.548459529876709, "logps/rejected": -103.40054321289062, "loss": 0.2975, "nll_loss": 0.2903253734111786, "rewards/accuracies": 1.0, "rewards/chosen": 2.5995583534240723, "rewards/margins": 8.134511947631836, "rewards/rejected": -5.534953594207764, "step": 5145 }, { "epoch": 0.8576666666666667, "grad_norm": 29.662935256958008, "learning_rate": 1.0438426494221552e-08, "logits/chosen": 2.421541690826416, "logits/rejected": 2.62165904045105, "logps/chosen": -3.4742467403411865, "logps/rejected": -103.56852722167969, "loss": 0.1901, "nll_loss": 0.15792030096054077, "rewards/accuracies": 1.0, "rewards/chosen": 0.9947928190231323, "rewards/margins": 5.3585638999938965, "rewards/rejected": -4.363770961761475, "step": 5146 }, { "epoch": 0.8578333333333333, "grad_norm": 33.75948715209961, "learning_rate": 1.041442802862158e-08, "logits/chosen": 1.7149322032928467, "logits/rejected": 2.6704061031341553, "logps/chosen": -37.88365936279297, "logps/rejected": -452.97210693359375, "loss": 1.0028, "nll_loss": 0.9969385266304016, "rewards/accuracies": 1.0, "rewards/chosen": 3.111156463623047, "rewards/margins": 8.509538650512695, "rewards/rejected": -5.398382663726807, "step": 5147 }, { "epoch": 0.858, "grad_norm": 28.223939895629883, "learning_rate": 1.0390455666106546e-08, "logits/chosen": 1.4357112646102905, "logits/rejected": 2.303426742553711, "logps/chosen": -84.64627838134766, "logps/rejected": -221.92233276367188, "loss": 1.0049, "nll_loss": 0.9958386421203613, "rewards/accuracies": 1.0, "rewards/chosen": 2.4772629737854004, "rewards/margins": 7.651304721832275, "rewards/rejected": -5.174041748046875, "step": 5148 }, { "epoch": 0.8581666666666666, "grad_norm": 40.32651901245117, "learning_rate": 1.0366509413661406e-08, "logits/chosen": 2.8299782276153564, "logits/rejected": 3.0402846336364746, "logps/chosen": -55.01411437988281, "logps/rejected": -326.31634521484375, "loss": 1.1111, "nll_loss": 1.1002821922302246, "rewards/accuracies": 1.0, "rewards/chosen": 1.6078202724456787, "rewards/margins": 10.247971534729004, "rewards/rejected": -8.640151023864746, "step": 5149 }, { "epoch": 0.8583333333333333, "grad_norm": 23.996809005737305, "learning_rate": 1.0342589278263558e-08, "logits/chosen": 0.7183679938316345, "logits/rejected": 1.94092857837677, "logps/chosen": -74.40026092529297, "logps/rejected": -727.0191040039062, "loss": 0.7737, "nll_loss": 0.7591862082481384, "rewards/accuracies": 1.0, "rewards/chosen": 1.304265022277832, "rewards/margins": 10.050414085388184, "rewards/rejected": -8.746149063110352, "step": 5150 }, { "epoch": 0.8585, "grad_norm": 22.505104064941406, "learning_rate": 1.0318695266882693e-08, "logits/chosen": 1.627179741859436, "logits/rejected": 2.338893175125122, "logps/chosen": -67.79743957519531, "logps/rejected": -208.92822265625, "loss": 0.7538, "nll_loss": 0.7450268268585205, "rewards/accuracies": 1.0, "rewards/chosen": 1.8435639142990112, "rewards/margins": 9.82970905303955, "rewards/rejected": -7.98614501953125, "step": 5151 }, { "epoch": 0.8586666666666667, "grad_norm": 20.61873435974121, "learning_rate": 1.0294827386480987e-08, "logits/chosen": 2.994610548019409, "logits/rejected": 2.9639384746551514, "logps/chosen": -64.945068359375, "logps/rejected": -224.942626953125, "loss": 0.6468, "nll_loss": 0.6430204510688782, "rewards/accuracies": 1.0, "rewards/chosen": 2.8847100734710693, "rewards/margins": 9.800475120544434, "rewards/rejected": -6.915765285491943, "step": 5152 }, { "epoch": 0.8588333333333333, "grad_norm": 98.49097442626953, "learning_rate": 1.0270985644012975e-08, "logits/chosen": 2.5238864421844482, "logits/rejected": 2.545727014541626, "logps/chosen": -58.96352767944336, "logps/rejected": -138.31234741210938, "loss": 1.1041, "nll_loss": 0.5510610342025757, "rewards/accuracies": 1.0, "rewards/chosen": 2.371277332305908, "rewards/margins": 1.7196094989776611, "rewards/rejected": 0.6516677737236023, "step": 5153 }, { "epoch": 0.859, "grad_norm": 32.234439849853516, "learning_rate": 1.0247170046425568e-08, "logits/chosen": 2.2342369556427, "logits/rejected": 2.3104684352874756, "logps/chosen": -62.85620880126953, "logps/rejected": -167.43101501464844, "loss": 0.9722, "nll_loss": 0.9670185446739197, "rewards/accuracies": 1.0, "rewards/chosen": 2.34806227684021, "rewards/margins": 13.248292922973633, "rewards/rejected": -10.900230407714844, "step": 5154 }, { "epoch": 0.8591666666666666, "grad_norm": 22.387226104736328, "learning_rate": 1.0223380600658038e-08, "logits/chosen": 2.9423084259033203, "logits/rejected": 2.93264102935791, "logps/chosen": -88.81629180908203, "logps/rejected": -229.89199829101562, "loss": 0.8267, "nll_loss": 0.8223729729652405, "rewards/accuracies": 1.0, "rewards/chosen": 2.7091453075408936, "rewards/margins": 9.764537811279297, "rewards/rejected": -7.055392742156982, "step": 5155 }, { "epoch": 0.8593333333333333, "grad_norm": 36.47602462768555, "learning_rate": 1.0199617313642062e-08, "logits/chosen": 2.735281467437744, "logits/rejected": 2.6166839599609375, "logps/chosen": -16.795568466186523, "logps/rejected": -73.81687927246094, "loss": 0.5469, "nll_loss": 0.5417925119400024, "rewards/accuracies": 1.0, "rewards/chosen": 2.582697868347168, "rewards/margins": 9.214851379394531, "rewards/rejected": -6.632153511047363, "step": 5156 }, { "epoch": 0.8595, "grad_norm": 31.125831604003906, "learning_rate": 1.017588019230171e-08, "logits/chosen": 3.221465826034546, "logits/rejected": 3.2277779579162598, "logps/chosen": -15.139811515808105, "logps/rejected": -188.30606079101562, "loss": 0.5033, "nll_loss": 0.488381028175354, "rewards/accuracies": 1.0, "rewards/chosen": 1.4178285598754883, "rewards/margins": 7.513705730438232, "rewards/rejected": -6.095877170562744, "step": 5157 }, { "epoch": 0.8596666666666667, "grad_norm": 39.13685607910156, "learning_rate": 1.015216924355341e-08, "logits/chosen": 3.2592225074768066, "logits/rejected": 3.1190967559814453, "logps/chosen": -121.61102294921875, "logps/rejected": -31.9316463470459, "loss": 1.0729, "nll_loss": 0.8329522609710693, "rewards/accuracies": 1.0, "rewards/chosen": 2.423483371734619, "rewards/margins": 3.163290500640869, "rewards/rejected": -0.73980712890625, "step": 5158 }, { "epoch": 0.8598333333333333, "grad_norm": 23.782310485839844, "learning_rate": 1.0128484474305931e-08, "logits/chosen": -0.5843551754951477, "logits/rejected": 0.16583476960659027, "logps/chosen": -28.750886917114258, "logps/rejected": -574.20849609375, "loss": 0.5005, "nll_loss": 0.487303227186203, "rewards/accuracies": 1.0, "rewards/chosen": 1.3933264017105103, "rewards/margins": 11.60743236541748, "rewards/rejected": -10.214105606079102, "step": 5159 }, { "epoch": 0.86, "grad_norm": 112.95020294189453, "learning_rate": 1.0104825891460478e-08, "logits/chosen": 3.027193546295166, "logits/rejected": 3.1133651733398438, "logps/chosen": -31.396011352539062, "logps/rejected": -288.10028076171875, "loss": 1.9926, "nll_loss": 1.962250828742981, "rewards/accuracies": 1.0, "rewards/chosen": 0.5098983645439148, "rewards/margins": 11.40539836883545, "rewards/rejected": -10.895500183105469, "step": 5160 }, { "epoch": 0.8601666666666666, "grad_norm": 30.914287567138672, "learning_rate": 1.0081193501910579e-08, "logits/chosen": 2.5945732593536377, "logits/rejected": 2.734846830368042, "logps/chosen": -6.820071697235107, "logps/rejected": -88.5203628540039, "loss": 0.3052, "nll_loss": 0.28416964411735535, "rewards/accuracies": 1.0, "rewards/chosen": 0.9380772113800049, "rewards/margins": 8.18799877166748, "rewards/rejected": -7.249921798706055, "step": 5161 }, { "epoch": 0.8603333333333333, "grad_norm": 129.67465209960938, "learning_rate": 1.0057587312542148e-08, "logits/chosen": 3.2144155502319336, "logits/rejected": 3.261023759841919, "logps/chosen": -29.445411682128906, "logps/rejected": -107.59935760498047, "loss": 1.7196, "nll_loss": 1.5497584342956543, "rewards/accuracies": 1.0, "rewards/chosen": -0.3682933747768402, "rewards/margins": 2.4753174781799316, "rewards/rejected": -2.8436107635498047, "step": 5162 }, { "epoch": 0.8605, "grad_norm": 38.201087951660156, "learning_rate": 1.0034007330233484e-08, "logits/chosen": 1.7735062837600708, "logits/rejected": 1.9653596878051758, "logps/chosen": -31.858381271362305, "logps/rejected": -81.85828399658203, "loss": 0.7523, "nll_loss": 0.692573606967926, "rewards/accuracies": 1.0, "rewards/chosen": 0.8606829643249512, "rewards/margins": 4.156649589538574, "rewards/rejected": -3.295966386795044, "step": 5163 }, { "epoch": 0.8606666666666667, "grad_norm": 46.21933364868164, "learning_rate": 1.001045356185518e-08, "logits/chosen": 1.2480554580688477, "logits/rejected": 2.100205659866333, "logps/chosen": -37.07802200317383, "logps/rejected": -328.1357116699219, "loss": 0.8256, "nll_loss": 0.8239561319351196, "rewards/accuracies": 1.0, "rewards/chosen": 3.631052017211914, "rewards/margins": 12.028870582580566, "rewards/rejected": -8.397818565368652, "step": 5164 }, { "epoch": 0.8608333333333333, "grad_norm": 18.59735107421875, "learning_rate": 9.98692601427028e-09, "logits/chosen": 2.324510335922241, "logits/rejected": 2.5086207389831543, "logps/chosen": -61.488624572753906, "logps/rejected": -267.6174011230469, "loss": 0.611, "nll_loss": 0.60282963514328, "rewards/accuracies": 1.0, "rewards/chosen": 2.2746033668518066, "rewards/margins": 8.021295547485352, "rewards/rejected": -5.746692657470703, "step": 5165 }, { "epoch": 0.861, "grad_norm": 32.566410064697266, "learning_rate": 9.963424694334122e-09, "logits/chosen": 1.8519055843353271, "logits/rejected": 1.6983191967010498, "logps/chosen": -63.28010940551758, "logps/rejected": -108.60964965820312, "loss": 0.8886, "nll_loss": 0.8437347412109375, "rewards/accuracies": 1.0, "rewards/chosen": 1.0250049829483032, "rewards/margins": 4.651499271392822, "rewards/rejected": -3.6264941692352295, "step": 5166 }, { "epoch": 0.8611666666666666, "grad_norm": 42.38642120361328, "learning_rate": 9.93994960889446e-09, "logits/chosen": -0.003953407518565655, "logits/rejected": 1.075546383857727, "logps/chosen": -21.81618309020996, "logps/rejected": -307.0382080078125, "loss": 0.7529, "nll_loss": 0.7522820234298706, "rewards/accuracies": 1.0, "rewards/chosen": 4.421254634857178, "rewards/margins": 16.065074920654297, "rewards/rejected": -11.643820762634277, "step": 5167 }, { "epoch": 0.8613333333333333, "grad_norm": 20.51045799255371, "learning_rate": 9.916500764791325e-09, "logits/chosen": 1.1695010662078857, "logits/rejected": 1.7236093282699585, "logps/chosen": -66.35916137695312, "logps/rejected": -319.0576171875, "loss": 0.667, "nll_loss": 0.6635915637016296, "rewards/accuracies": 1.0, "rewards/chosen": 2.772416830062866, "rewards/margins": 15.10368824005127, "rewards/rejected": -12.331271171569824, "step": 5168 }, { "epoch": 0.8615, "grad_norm": 22.642553329467773, "learning_rate": 9.893078168857172e-09, "logits/chosen": 1.7725796699523926, "logits/rejected": 2.0940632820129395, "logps/chosen": -48.8851318359375, "logps/rejected": -119.22943115234375, "loss": 0.6413, "nll_loss": 0.6187991499900818, "rewards/accuracies": 1.0, "rewards/chosen": 2.214285373687744, "rewards/margins": 6.078779697418213, "rewards/rejected": -3.8644943237304688, "step": 5169 }, { "epoch": 0.8616666666666667, "grad_norm": 51.90217590332031, "learning_rate": 9.869681827916776e-09, "logits/chosen": 1.4857122898101807, "logits/rejected": 2.7666118144989014, "logps/chosen": -19.42108154296875, "logps/rejected": -291.10321044921875, "loss": 0.7839, "nll_loss": 0.776843249797821, "rewards/accuracies": 1.0, "rewards/chosen": 2.2701191902160645, "rewards/margins": 8.558309555053711, "rewards/rejected": -6.288189888000488, "step": 5170 }, { "epoch": 0.8618333333333333, "grad_norm": 28.34087562561035, "learning_rate": 9.846311748787296e-09, "logits/chosen": 2.4393529891967773, "logits/rejected": 2.380880117416382, "logps/chosen": -11.920930862426758, "logps/rejected": -32.072017669677734, "loss": 0.3431, "nll_loss": 0.2591506838798523, "rewards/accuracies": 1.0, "rewards/chosen": 4.481820583343506, "rewards/margins": 6.460058689117432, "rewards/rejected": -1.9782381057739258, "step": 5171 }, { "epoch": 0.862, "grad_norm": 18.597036361694336, "learning_rate": 9.822967938278171e-09, "logits/chosen": 1.9390867948532104, "logits/rejected": 2.3095123767852783, "logps/chosen": -16.577375411987305, "logps/rejected": -143.51724243164062, "loss": 0.335, "nll_loss": 0.31879568099975586, "rewards/accuracies": 1.0, "rewards/chosen": 1.9104251861572266, "rewards/margins": 6.530673503875732, "rewards/rejected": -4.620248317718506, "step": 5172 }, { "epoch": 0.8621666666666666, "grad_norm": 50.59420394897461, "learning_rate": 9.799650403191239e-09, "logits/chosen": 2.317363739013672, "logits/rejected": 2.02482533454895, "logps/chosen": -61.72333526611328, "logps/rejected": -69.2856674194336, "loss": 0.9262, "nll_loss": 0.7527235150337219, "rewards/accuracies": 1.0, "rewards/chosen": 0.9363433718681335, "rewards/margins": 2.671865940093994, "rewards/rejected": -1.7355226278305054, "step": 5173 }, { "epoch": 0.8623333333333333, "grad_norm": 49.66645431518555, "learning_rate": 9.776359150320679e-09, "logits/chosen": 2.097846746444702, "logits/rejected": 2.218055248260498, "logps/chosen": -5.625977993011475, "logps/rejected": -60.152069091796875, "loss": 0.3419, "nll_loss": 0.31255432963371277, "rewards/accuracies": 1.0, "rewards/chosen": 2.089303731918335, "rewards/margins": 5.646788597106934, "rewards/rejected": -3.5574851036071777, "step": 5174 }, { "epoch": 0.8625, "grad_norm": 41.19363021850586, "learning_rate": 9.753094186453027e-09, "logits/chosen": 1.4240179061889648, "logits/rejected": 2.0062623023986816, "logps/chosen": -30.381694793701172, "logps/rejected": -109.6275863647461, "loss": 0.6147, "nll_loss": 0.4747140109539032, "rewards/accuracies": 1.0, "rewards/chosen": 1.3679325580596924, "rewards/margins": 3.178374767303467, "rewards/rejected": -1.810442328453064, "step": 5175 }, { "epoch": 0.8626666666666667, "grad_norm": 35.46132278442383, "learning_rate": 9.729855518367081e-09, "logits/chosen": 3.973804473876953, "logits/rejected": 3.9855618476867676, "logps/chosen": -29.892738342285156, "logps/rejected": -187.14743041992188, "loss": 0.7797, "nll_loss": 0.766480565071106, "rewards/accuracies": 1.0, "rewards/chosen": 1.711484670639038, "rewards/margins": 7.204209327697754, "rewards/rejected": -5.492724895477295, "step": 5176 }, { "epoch": 0.8628333333333333, "grad_norm": 27.735109329223633, "learning_rate": 9.70664315283406e-09, "logits/chosen": 2.052823543548584, "logits/rejected": 2.3322834968566895, "logps/chosen": -35.5587272644043, "logps/rejected": -222.00503540039062, "loss": 0.6526, "nll_loss": 0.6349773406982422, "rewards/accuracies": 1.0, "rewards/chosen": 1.0860191583633423, "rewards/margins": 13.327438354492188, "rewards/rejected": -12.241418838500977, "step": 5177 }, { "epoch": 0.863, "grad_norm": 142.88925170898438, "learning_rate": 9.683457096617486e-09, "logits/chosen": 1.9954087734222412, "logits/rejected": 2.232060432434082, "logps/chosen": -12.973997116088867, "logps/rejected": -76.18930053710938, "loss": 1.3141, "nll_loss": 1.2973997592926025, "rewards/accuracies": 1.0, "rewards/chosen": 1.430580496788025, "rewards/margins": 6.859138011932373, "rewards/rejected": -5.428557395935059, "step": 5178 }, { "epoch": 0.8631666666666666, "grad_norm": 30.12110137939453, "learning_rate": 9.660297356473246e-09, "logits/chosen": 2.2036314010620117, "logits/rejected": 2.4354169368743896, "logps/chosen": -43.73985290527344, "logps/rejected": -171.909912109375, "loss": 0.7786, "nll_loss": 0.7673659920692444, "rewards/accuracies": 1.0, "rewards/chosen": 2.1244804859161377, "rewards/margins": 7.271222114562988, "rewards/rejected": -5.1467413902282715, "step": 5179 }, { "epoch": 0.8633333333333333, "grad_norm": 47.505035400390625, "learning_rate": 9.637163939149484e-09, "logits/chosen": 2.8133437633514404, "logits/rejected": 3.0214855670928955, "logps/chosen": -16.34001922607422, "logps/rejected": -546.0026245117188, "loss": 0.6825, "nll_loss": 0.6808341145515442, "rewards/accuracies": 1.0, "rewards/chosen": 4.384868144989014, "rewards/margins": 11.058440208435059, "rewards/rejected": -6.673572063446045, "step": 5180 }, { "epoch": 0.8635, "grad_norm": 28.542743682861328, "learning_rate": 9.614056851386742e-09, "logits/chosen": 2.666743040084839, "logits/rejected": 2.6188361644744873, "logps/chosen": -55.24705123901367, "logps/rejected": -108.02484130859375, "loss": 0.8245, "nll_loss": 0.8124565482139587, "rewards/accuracies": 1.0, "rewards/chosen": 1.9962857961654663, "rewards/margins": 7.151727199554443, "rewards/rejected": -5.1554412841796875, "step": 5181 }, { "epoch": 0.8636666666666667, "grad_norm": 38.90363693237305, "learning_rate": 9.590976099917858e-09, "logits/chosen": 3.344820976257324, "logits/rejected": 3.4374632835388184, "logps/chosen": -51.29066467285156, "logps/rejected": -130.63079833984375, "loss": 0.9869, "nll_loss": 0.9677483439445496, "rewards/accuracies": 1.0, "rewards/chosen": 1.2818039655685425, "rewards/margins": 6.608578681945801, "rewards/rejected": -5.326774597167969, "step": 5182 }, { "epoch": 0.8638333333333333, "grad_norm": 30.73770523071289, "learning_rate": 9.567921691468072e-09, "logits/chosen": 2.880336284637451, "logits/rejected": 3.2202866077423096, "logps/chosen": -41.910972595214844, "logps/rejected": -322.0984802246094, "loss": 0.9346, "nll_loss": 0.9313549399375916, "rewards/accuracies": 1.0, "rewards/chosen": 4.73212194442749, "rewards/margins": 10.334085464477539, "rewards/rejected": -5.601963043212891, "step": 5183 }, { "epoch": 0.864, "grad_norm": 38.041748046875, "learning_rate": 9.544893632754813e-09, "logits/chosen": 1.8343242406845093, "logits/rejected": 2.4608094692230225, "logps/chosen": -41.31588363647461, "logps/rejected": -216.04376220703125, "loss": 0.8563, "nll_loss": 0.843181312084198, "rewards/accuracies": 1.0, "rewards/chosen": 1.6737923622131348, "rewards/margins": 7.3039727210998535, "rewards/rejected": -5.630180358886719, "step": 5184 }, { "epoch": 0.8641666666666666, "grad_norm": 26.001466751098633, "learning_rate": 9.521891930487925e-09, "logits/chosen": 3.0021774768829346, "logits/rejected": 3.191204786300659, "logps/chosen": -86.44064331054688, "logps/rejected": -494.6771545410156, "loss": 0.9629, "nll_loss": 0.9604516625404358, "rewards/accuracies": 1.0, "rewards/chosen": 3.1534502506256104, "rewards/margins": 11.593559265136719, "rewards/rejected": -8.440109252929688, "step": 5185 }, { "epoch": 0.8643333333333333, "grad_norm": 31.29819679260254, "learning_rate": 9.498916591369566e-09, "logits/chosen": 1.9781255722045898, "logits/rejected": 2.395707607269287, "logps/chosen": -62.638118743896484, "logps/rejected": -461.75921630859375, "loss": 0.9524, "nll_loss": 0.9348973631858826, "rewards/accuracies": 1.0, "rewards/chosen": 1.1013813018798828, "rewards/margins": 9.850069046020508, "rewards/rejected": -8.748687744140625, "step": 5186 }, { "epoch": 0.8645, "grad_norm": 31.387197494506836, "learning_rate": 9.475967622094205e-09, "logits/chosen": 2.5896642208099365, "logits/rejected": 2.721256732940674, "logps/chosen": -72.72492980957031, "logps/rejected": -94.97265625, "loss": 0.8953, "nll_loss": 0.8657729625701904, "rewards/accuracies": 1.0, "rewards/chosen": 0.7697609066963196, "rewards/margins": 6.025025844573975, "rewards/rejected": -5.255264759063721, "step": 5187 }, { "epoch": 0.8646666666666667, "grad_norm": 18.196361541748047, "learning_rate": 9.453045029348583e-09, "logits/chosen": 2.19411301612854, "logits/rejected": 2.4708187580108643, "logps/chosen": -106.10887145996094, "logps/rejected": -241.96702575683594, "loss": 0.7421, "nll_loss": 0.7368671298027039, "rewards/accuracies": 1.0, "rewards/chosen": 2.3371338844299316, "rewards/margins": 11.772314071655273, "rewards/rejected": -9.435179710388184, "step": 5188 }, { "epoch": 0.8648333333333333, "grad_norm": 26.538175582885742, "learning_rate": 9.430148819811811e-09, "logits/chosen": 2.678760290145874, "logits/rejected": 3.0984246730804443, "logps/chosen": -59.12397766113281, "logps/rejected": -338.594482421875, "loss": 0.7152, "nll_loss": 0.7038569450378418, "rewards/accuracies": 1.0, "rewards/chosen": 1.5420167446136475, "rewards/margins": 11.634204864501953, "rewards/rejected": -10.092187881469727, "step": 5189 }, { "epoch": 0.865, "grad_norm": 10.697147369384766, "learning_rate": 9.40727900015531e-09, "logits/chosen": 1.8683825731277466, "logits/rejected": 1.8574068546295166, "logps/chosen": -116.56968688964844, "logps/rejected": -168.26724243164062, "loss": 0.4806, "nll_loss": 0.4777446389198303, "rewards/accuracies": 1.0, "rewards/chosen": 4.1257219314575195, "rewards/margins": 10.057699203491211, "rewards/rejected": -5.93197774887085, "step": 5190 }, { "epoch": 0.8651666666666666, "grad_norm": 28.07271385192871, "learning_rate": 9.384435577042793e-09, "logits/chosen": 2.301400899887085, "logits/rejected": 2.8253490924835205, "logps/chosen": -110.39739990234375, "logps/rejected": -384.6791687011719, "loss": 1.2556, "nll_loss": 1.2545157670974731, "rewards/accuracies": 1.0, "rewards/chosen": 3.9748568534851074, "rewards/margins": 12.80854606628418, "rewards/rejected": -8.833688735961914, "step": 5191 }, { "epoch": 0.8653333333333333, "grad_norm": 24.199594497680664, "learning_rate": 9.361618557130268e-09, "logits/chosen": 2.1176705360412598, "logits/rejected": 2.785912275314331, "logps/chosen": -80.31318664550781, "logps/rejected": -81.6539306640625, "loss": 0.8204, "nll_loss": 0.8112443089485168, "rewards/accuracies": 1.0, "rewards/chosen": 2.817584276199341, "rewards/margins": 7.69327449798584, "rewards/rejected": -4.87568998336792, "step": 5192 }, { "epoch": 0.8655, "grad_norm": 31.78109359741211, "learning_rate": 9.338827947066075e-09, "logits/chosen": 3.42899227142334, "logits/rejected": 3.7885355949401855, "logps/chosen": -24.227956771850586, "logps/rejected": -85.19859313964844, "loss": 0.641, "nll_loss": 0.5768560171127319, "rewards/accuracies": 1.0, "rewards/chosen": 1.497006893157959, "rewards/margins": 4.272513389587402, "rewards/rejected": -2.7755067348480225, "step": 5193 }, { "epoch": 0.8656666666666667, "grad_norm": 34.665042877197266, "learning_rate": 9.316063753490843e-09, "logits/chosen": 2.7692201137542725, "logits/rejected": 2.849977970123291, "logps/chosen": -38.0350341796875, "logps/rejected": -249.53395080566406, "loss": 0.8065, "nll_loss": 0.7923965454101562, "rewards/accuracies": 1.0, "rewards/chosen": 1.3279885053634644, "rewards/margins": 9.823055267333984, "rewards/rejected": -8.49506664276123, "step": 5194 }, { "epoch": 0.8658333333333333, "grad_norm": 30.818561553955078, "learning_rate": 9.293325983037547e-09, "logits/chosen": 2.668750762939453, "logits/rejected": 2.8178794384002686, "logps/chosen": -60.143184661865234, "logps/rejected": -298.9159851074219, "loss": 0.8546, "nll_loss": 0.8470871448516846, "rewards/accuracies": 1.0, "rewards/chosen": 1.969417929649353, "rewards/margins": 12.025506973266602, "rewards/rejected": -10.056089401245117, "step": 5195 }, { "epoch": 0.866, "grad_norm": 77.14083099365234, "learning_rate": 9.270614642331375e-09, "logits/chosen": 2.6732020378112793, "logits/rejected": 2.4892196655273438, "logps/chosen": -147.16854858398438, "logps/rejected": -164.22921752929688, "loss": 1.364, "nll_loss": 1.2909520864486694, "rewards/accuracies": 1.0, "rewards/chosen": -0.4055221676826477, "rewards/margins": 5.623671054840088, "rewards/rejected": -6.02919340133667, "step": 5196 }, { "epoch": 0.8661666666666666, "grad_norm": 21.52577781677246, "learning_rate": 9.247929737989901e-09, "logits/chosen": 2.5891621112823486, "logits/rejected": 2.894711971282959, "logps/chosen": -121.78410339355469, "logps/rejected": -206.34222412109375, "loss": 0.9795, "nll_loss": 0.9742729067802429, "rewards/accuracies": 1.0, "rewards/chosen": 2.4378433227539062, "rewards/margins": 9.752525329589844, "rewards/rejected": -7.3146820068359375, "step": 5197 }, { "epoch": 0.8663333333333333, "grad_norm": 264.00390625, "learning_rate": 9.225271276622948e-09, "logits/chosen": 1.618530511856079, "logits/rejected": 2.0499093532562256, "logps/chosen": -44.362083435058594, "logps/rejected": -63.960811614990234, "loss": 4.6401, "nll_loss": 0.7272472977638245, "rewards/accuracies": 0.0, "rewards/chosen": 0.2669486999511719, "rewards/margins": -3.5368740558624268, "rewards/rejected": 3.8038227558135986, "step": 5198 }, { "epoch": 0.8665, "grad_norm": 48.69961166381836, "learning_rate": 9.202639264832668e-09, "logits/chosen": 2.7500689029693604, "logits/rejected": 2.7517802715301514, "logps/chosen": -14.545295715332031, "logps/rejected": -98.63843536376953, "loss": 0.5391, "nll_loss": 0.4407665431499481, "rewards/accuracies": 1.0, "rewards/chosen": 0.43847236037254333, "rewards/margins": 3.31793475151062, "rewards/rejected": -2.879462480545044, "step": 5199 }, { "epoch": 0.8666666666666667, "grad_norm": 486.1026306152344, "learning_rate": 9.180033709213453e-09, "logits/chosen": 2.064863920211792, "logits/rejected": 1.8428094387054443, "logps/chosen": -138.12481689453125, "logps/rejected": -82.92486572265625, "loss": 4.4046, "nll_loss": 1.6443430185317993, "rewards/accuracies": 0.0, "rewards/chosen": -3.621936082839966, "rewards/margins": -2.5669891834259033, "rewards/rejected": -1.0549468994140625, "step": 5200 }, { "epoch": 0.8668333333333333, "grad_norm": 23.602691650390625, "learning_rate": 9.157454616352034e-09, "logits/chosen": 2.1293396949768066, "logits/rejected": 2.2008495330810547, "logps/chosen": -69.5653076171875, "logps/rejected": -404.90106201171875, "loss": 0.7049, "nll_loss": 0.6887654066085815, "rewards/accuracies": 1.0, "rewards/chosen": 1.1837981939315796, "rewards/margins": 10.153411865234375, "rewards/rejected": -8.969614028930664, "step": 5201 }, { "epoch": 0.867, "grad_norm": 60.51043701171875, "learning_rate": 9.134901992827427e-09, "logits/chosen": 2.641505718231201, "logits/rejected": 2.7175066471099854, "logps/chosen": -8.075058937072754, "logps/rejected": -191.01747131347656, "loss": 0.5483, "nll_loss": 0.5383371710777283, "rewards/accuracies": 1.0, "rewards/chosen": 2.3994343280792236, "rewards/margins": 7.458985328674316, "rewards/rejected": -5.059551239013672, "step": 5202 }, { "epoch": 0.8671666666666666, "grad_norm": 31.79708480834961, "learning_rate": 9.112375845210906e-09, "logits/chosen": 1.7414029836654663, "logits/rejected": 2.8496882915496826, "logps/chosen": -50.247520446777344, "logps/rejected": -301.87371826171875, "loss": 0.8372, "nll_loss": 0.8237299919128418, "rewards/accuracies": 1.0, "rewards/chosen": 1.770707130432129, "rewards/margins": 7.052677631378174, "rewards/rejected": -5.281970500946045, "step": 5203 }, { "epoch": 0.8673333333333333, "grad_norm": 189.6464080810547, "learning_rate": 9.089876180066091e-09, "logits/chosen": 2.0286617279052734, "logits/rejected": 2.107299327850342, "logps/chosen": -29.995786666870117, "logps/rejected": -23.257217407226562, "loss": 4.7445, "nll_loss": 0.5262419581413269, "rewards/accuracies": 0.0, "rewards/chosen": 1.2596564292907715, "rewards/margins": -3.670379161834717, "rewards/rejected": 4.930035591125488, "step": 5204 }, { "epoch": 0.8675, "grad_norm": 44.49042510986328, "learning_rate": 9.067403003948782e-09, "logits/chosen": 2.9995133876800537, "logits/rejected": 3.1394522190093994, "logps/chosen": -25.22512435913086, "logps/rejected": -95.38111114501953, "loss": 0.5666, "nll_loss": 0.4425460398197174, "rewards/accuracies": 1.0, "rewards/chosen": 1.4074032306671143, "rewards/margins": 3.3612568378448486, "rewards/rejected": -1.9538536071777344, "step": 5205 }, { "epoch": 0.8676666666666667, "grad_norm": 163.13487243652344, "learning_rate": 9.04495632340716e-09, "logits/chosen": 2.4775216579437256, "logits/rejected": 2.322119951248169, "logps/chosen": -72.17717742919922, "logps/rejected": -90.5130844116211, "loss": 1.7036, "nll_loss": 0.9753672480583191, "rewards/accuracies": 1.0, "rewards/chosen": 1.8966965675354004, "rewards/margins": 0.9777222275733948, "rewards/rejected": 0.9189743399620056, "step": 5206 }, { "epoch": 0.8678333333333333, "grad_norm": 130.66983032226562, "learning_rate": 9.022536144981652e-09, "logits/chosen": 1.7488325834274292, "logits/rejected": 2.009366512298584, "logps/chosen": -22.347530364990234, "logps/rejected": -24.434650421142578, "loss": 1.8173, "nll_loss": 0.406318724155426, "rewards/accuracies": 0.0, "rewards/chosen": 1.6384929418563843, "rewards/margins": -0.39291226863861084, "rewards/rejected": 2.031405210494995, "step": 5207 }, { "epoch": 0.868, "grad_norm": 30.749122619628906, "learning_rate": 9.000142475204964e-09, "logits/chosen": 1.8849611282348633, "logits/rejected": 2.772096872329712, "logps/chosen": -55.74812316894531, "logps/rejected": -345.4520263671875, "loss": 0.7341, "nll_loss": 0.7240015268325806, "rewards/accuracies": 1.0, "rewards/chosen": 1.6635987758636475, "rewards/margins": 12.341264724731445, "rewards/rejected": -10.677665710449219, "step": 5208 }, { "epoch": 0.8681666666666666, "grad_norm": 18.068262100219727, "learning_rate": 8.977775320602044e-09, "logits/chosen": 2.6218268871307373, "logits/rejected": 2.6304104328155518, "logps/chosen": -134.3155517578125, "logps/rejected": -180.33572387695312, "loss": 0.7735, "nll_loss": 0.7631566524505615, "rewards/accuracies": 1.0, "rewards/chosen": 2.2695252895355225, "rewards/margins": 7.397760391235352, "rewards/rejected": -5.128235340118408, "step": 5209 }, { "epoch": 0.8683333333333333, "grad_norm": 26.40760040283203, "learning_rate": 8.955434687690156e-09, "logits/chosen": 2.8758480548858643, "logits/rejected": 2.863243579864502, "logps/chosen": -59.18575668334961, "logps/rejected": -58.167030334472656, "loss": 0.8288, "nll_loss": 0.7891432642936707, "rewards/accuracies": 1.0, "rewards/chosen": 2.152122974395752, "rewards/margins": 5.2953596115112305, "rewards/rejected": -3.1432363986968994, "step": 5210 }, { "epoch": 0.8685, "grad_norm": 45.45246887207031, "learning_rate": 8.933120582978827e-09, "logits/chosen": 0.8391011953353882, "logits/rejected": 1.63968026638031, "logps/chosen": -99.66868591308594, "logps/rejected": -372.38958740234375, "loss": 1.3172, "nll_loss": 1.2616287469863892, "rewards/accuracies": 1.0, "rewards/chosen": -0.1655876338481903, "rewards/margins": 19.22601890563965, "rewards/rejected": -19.3916072845459, "step": 5211 }, { "epoch": 0.8686666666666667, "grad_norm": 33.0131721496582, "learning_rate": 8.910833012969877e-09, "logits/chosen": 1.6228458881378174, "logits/rejected": 1.3096977472305298, "logps/chosen": -99.37698364257812, "logps/rejected": -63.929969787597656, "loss": 0.973, "nll_loss": 0.8794422149658203, "rewards/accuracies": 1.0, "rewards/chosen": 2.138745069503784, "rewards/margins": 4.22011661529541, "rewards/rejected": -2.081371307373047, "step": 5212 }, { "epoch": 0.8688333333333333, "grad_norm": 75.96381378173828, "learning_rate": 8.888571984157323e-09, "logits/chosen": 1.5798027515411377, "logits/rejected": 1.5248194932937622, "logps/chosen": -105.69300842285156, "logps/rejected": -128.93475341796875, "loss": 1.4654, "nll_loss": 1.1009687185287476, "rewards/accuracies": 1.0, "rewards/chosen": 2.3273117542266846, "rewards/margins": 2.434528350830078, "rewards/rejected": -0.10721664875745773, "step": 5213 }, { "epoch": 0.869, "grad_norm": 21.872333526611328, "learning_rate": 8.86633750302752e-09, "logits/chosen": 2.259510040283203, "logits/rejected": 2.29815673828125, "logps/chosen": -112.27201843261719, "logps/rejected": -279.2102355957031, "loss": 0.9904, "nll_loss": 0.9848422408103943, "rewards/accuracies": 1.0, "rewards/chosen": 2.298536777496338, "rewards/margins": 10.835683822631836, "rewards/rejected": -8.53714656829834, "step": 5214 }, { "epoch": 0.8691666666666666, "grad_norm": 27.942651748657227, "learning_rate": 8.844129576059067e-09, "logits/chosen": 3.042386293411255, "logits/rejected": 3.079019546508789, "logps/chosen": -38.101898193359375, "logps/rejected": -123.17506408691406, "loss": 0.6603, "nll_loss": 0.6246212720870972, "rewards/accuracies": 1.0, "rewards/chosen": 1.2230952978134155, "rewards/margins": 5.061761856079102, "rewards/rejected": -3.8386666774749756, "step": 5215 }, { "epoch": 0.8693333333333333, "grad_norm": 26.29693603515625, "learning_rate": 8.821948209722818e-09, "logits/chosen": 0.9255287051200867, "logits/rejected": 1.9809892177581787, "logps/chosen": -9.705211639404297, "logps/rejected": -237.3343048095703, "loss": 0.3442, "nll_loss": 0.32350704073905945, "rewards/accuracies": 1.0, "rewards/chosen": 1.0922149419784546, "rewards/margins": 6.8188676834106445, "rewards/rejected": -5.7266526222229, "step": 5216 }, { "epoch": 0.8695, "grad_norm": 30.91493034362793, "learning_rate": 8.79979341048187e-09, "logits/chosen": 2.4880211353302, "logits/rejected": 2.78385853767395, "logps/chosen": -62.67015838623047, "logps/rejected": -337.1968078613281, "loss": 0.9514, "nll_loss": 0.9353755116462708, "rewards/accuracies": 1.0, "rewards/chosen": 1.3373650312423706, "rewards/margins": 7.380648612976074, "rewards/rejected": -6.043283462524414, "step": 5217 }, { "epoch": 0.8696666666666667, "grad_norm": 51.727378845214844, "learning_rate": 8.777665184791616e-09, "logits/chosen": 2.7106053829193115, "logits/rejected": 2.6376805305480957, "logps/chosen": -55.244869232177734, "logps/rejected": -71.66862487792969, "loss": 1.1884, "nll_loss": 1.1754225492477417, "rewards/accuracies": 1.0, "rewards/chosen": 1.7557109594345093, "rewards/margins": 7.198405742645264, "rewards/rejected": -5.442694664001465, "step": 5218 }, { "epoch": 0.8698333333333333, "grad_norm": 157.9076385498047, "learning_rate": 8.755563539099687e-09, "logits/chosen": 2.6335010528564453, "logits/rejected": 2.7062184810638428, "logps/chosen": -65.39369201660156, "logps/rejected": -80.1100082397461, "loss": 1.7006, "nll_loss": 0.8958039879798889, "rewards/accuracies": 1.0, "rewards/chosen": 1.3008660078048706, "rewards/margins": 0.5689353942871094, "rewards/rejected": 0.7319306135177612, "step": 5219 }, { "epoch": 0.87, "grad_norm": 31.488784790039062, "learning_rate": 8.733488479845996e-09, "logits/chosen": 0.040098775178194046, "logits/rejected": 1.9567790031433105, "logps/chosen": -49.702186584472656, "logps/rejected": -655.9329223632812, "loss": 0.6776, "nll_loss": 0.6539761424064636, "rewards/accuracies": 1.0, "rewards/chosen": 0.7730892300605774, "rewards/margins": 16.668304443359375, "rewards/rejected": -15.895215034484863, "step": 5220 }, { "epoch": 0.8701666666666666, "grad_norm": 21.626245498657227, "learning_rate": 8.711440013462635e-09, "logits/chosen": 1.8218556642532349, "logits/rejected": 2.582467794418335, "logps/chosen": -73.802490234375, "logps/rejected": -61.50913619995117, "loss": 0.6892, "nll_loss": 0.677087128162384, "rewards/accuracies": 1.0, "rewards/chosen": 1.8514024019241333, "rewards/margins": 7.277985095977783, "rewards/rejected": -5.4265828132629395, "step": 5221 }, { "epoch": 0.8703333333333333, "grad_norm": 24.35397720336914, "learning_rate": 8.689418146374038e-09, "logits/chosen": 1.948067307472229, "logits/rejected": 1.8838887214660645, "logps/chosen": -23.684240341186523, "logps/rejected": -68.69733428955078, "loss": 0.4556, "nll_loss": 0.4385969936847687, "rewards/accuracies": 1.0, "rewards/chosen": 1.459044098854065, "rewards/margins": 6.708224773406982, "rewards/rejected": -5.249180793762207, "step": 5222 }, { "epoch": 0.8705, "grad_norm": 25.405370712280273, "learning_rate": 8.667422884996823e-09, "logits/chosen": 2.6111834049224854, "logits/rejected": 2.783240556716919, "logps/chosen": -66.47271728515625, "logps/rejected": -82.25713348388672, "loss": 0.7263, "nll_loss": 0.714760422706604, "rewards/accuracies": 1.0, "rewards/chosen": 1.5958832502365112, "rewards/margins": 8.535186767578125, "rewards/rejected": -6.939303398132324, "step": 5223 }, { "epoch": 0.8706666666666667, "grad_norm": 26.809701919555664, "learning_rate": 8.645454235739902e-09, "logits/chosen": 1.5175942182540894, "logits/rejected": 1.6753560304641724, "logps/chosen": -13.843297004699707, "logps/rejected": -265.31756591796875, "loss": 0.3773, "nll_loss": 0.374143123626709, "rewards/accuracies": 1.0, "rewards/chosen": 3.074612855911255, "rewards/margins": 10.120619773864746, "rewards/rejected": -7.04600715637207, "step": 5224 }, { "epoch": 0.8708333333333333, "grad_norm": 66.0437240600586, "learning_rate": 8.623512205004425e-09, "logits/chosen": 0.2765985131263733, "logits/rejected": 1.3979145288467407, "logps/chosen": -55.696197509765625, "logps/rejected": -304.3819580078125, "loss": 1.863, "nll_loss": 1.8565396070480347, "rewards/accuracies": 1.0, "rewards/chosen": 2.51885986328125, "rewards/margins": 8.440147399902344, "rewards/rejected": -5.921287536621094, "step": 5225 }, { "epoch": 0.871, "grad_norm": 33.667015075683594, "learning_rate": 8.601596799183719e-09, "logits/chosen": 2.581169366836548, "logits/rejected": 2.454930305480957, "logps/chosen": -53.28227615356445, "logps/rejected": -67.74942779541016, "loss": 0.8054, "nll_loss": 0.7504546046257019, "rewards/accuracies": 1.0, "rewards/chosen": 1.2363865375518799, "rewards/margins": 4.371905326843262, "rewards/rejected": -3.135519027709961, "step": 5226 }, { "epoch": 0.8711666666666666, "grad_norm": 20.31035041809082, "learning_rate": 8.579708024663456e-09, "logits/chosen": 2.2359893321990967, "logits/rejected": 2.2698583602905273, "logps/chosen": -40.820045471191406, "logps/rejected": -68.65602111816406, "loss": 0.4909, "nll_loss": 0.4859529137611389, "rewards/accuracies": 1.0, "rewards/chosen": 2.9973366260528564, "rewards/margins": 8.851974487304688, "rewards/rejected": -5.854637622833252, "step": 5227 }, { "epoch": 0.8713333333333333, "grad_norm": 23.6932430267334, "learning_rate": 8.55784588782147e-09, "logits/chosen": 1.0726535320281982, "logits/rejected": 1.0538097620010376, "logps/chosen": -74.38613891601562, "logps/rejected": -124.25191497802734, "loss": 0.8004, "nll_loss": 0.7830120921134949, "rewards/accuracies": 1.0, "rewards/chosen": 2.7010316848754883, "rewards/margins": 6.722451210021973, "rewards/rejected": -4.021419525146484, "step": 5228 }, { "epoch": 0.8715, "grad_norm": 250.61044311523438, "learning_rate": 8.536010395027905e-09, "logits/chosen": 2.602353096008301, "logits/rejected": 2.656619071960449, "logps/chosen": -135.75282287597656, "logps/rejected": -57.70777130126953, "loss": 2.5165, "nll_loss": 1.0130807161331177, "rewards/accuracies": 0.0, "rewards/chosen": 1.0887924432754517, "rewards/margins": -0.6666358709335327, "rewards/rejected": 1.7554283142089844, "step": 5229 }, { "epoch": 0.8716666666666667, "grad_norm": 88.02685546875, "learning_rate": 8.51420155264505e-09, "logits/chosen": 2.6642613410949707, "logits/rejected": 2.5562784671783447, "logps/chosen": -105.90370178222656, "logps/rejected": -41.957820892333984, "loss": 2.0125, "nll_loss": 1.4708846807479858, "rewards/accuracies": 1.0, "rewards/chosen": 3.705275058746338, "rewards/margins": 2.668004274368286, "rewards/rejected": 1.0372707843780518, "step": 5230 }, { "epoch": 0.8718333333333333, "grad_norm": 51.45111083984375, "learning_rate": 8.492419367027493e-09, "logits/chosen": 3.139270782470703, "logits/rejected": 3.1401336193084717, "logps/chosen": -98.75276947021484, "logps/rejected": -121.61231994628906, "loss": 1.3098, "nll_loss": 1.2993786334991455, "rewards/accuracies": 1.0, "rewards/chosen": 2.9772136211395264, "rewards/margins": 7.572757720947266, "rewards/rejected": -4.59554386138916, "step": 5231 }, { "epoch": 0.872, "grad_norm": 68.88284301757812, "learning_rate": 8.470663844522052e-09, "logits/chosen": 2.823490619659424, "logits/rejected": 2.8654139041900635, "logps/chosen": -65.44071960449219, "logps/rejected": -70.39236450195312, "loss": 0.9888, "nll_loss": 0.617365300655365, "rewards/accuracies": 1.0, "rewards/chosen": 1.9135675430297852, "rewards/margins": 2.1376023292541504, "rewards/rejected": -0.22403490543365479, "step": 5232 }, { "epoch": 0.8721666666666666, "grad_norm": 19.835912704467773, "learning_rate": 8.448934991467772e-09, "logits/chosen": 3.085145950317383, "logits/rejected": 3.111708402633667, "logps/chosen": -69.32347106933594, "logps/rejected": -336.63336181640625, "loss": 0.6603, "nll_loss": 0.6539950966835022, "rewards/accuracies": 1.0, "rewards/chosen": 2.2241456508636475, "rewards/margins": 9.561123847961426, "rewards/rejected": -7.336978435516357, "step": 5233 }, { "epoch": 0.8723333333333333, "grad_norm": 29.414701461791992, "learning_rate": 8.427232814195883e-09, "logits/chosen": 3.7931039333343506, "logits/rejected": 3.823810577392578, "logps/chosen": -24.552982330322266, "logps/rejected": -123.11387634277344, "loss": 0.5821, "nll_loss": 0.5709995627403259, "rewards/accuracies": 1.0, "rewards/chosen": 2.202177047729492, "rewards/margins": 7.251837730407715, "rewards/rejected": -5.049660682678223, "step": 5234 }, { "epoch": 0.8725, "grad_norm": 15.194427490234375, "learning_rate": 8.405557319029911e-09, "logits/chosen": 2.4874966144561768, "logits/rejected": 2.4610791206359863, "logps/chosen": -165.38760375976562, "logps/rejected": -179.11248779296875, "loss": 0.7301, "nll_loss": 0.7222167253494263, "rewards/accuracies": 1.0, "rewards/chosen": 2.130972385406494, "rewards/margins": 8.443489074707031, "rewards/rejected": -6.312516212463379, "step": 5235 }, { "epoch": 0.8726666666666667, "grad_norm": 28.61815071105957, "learning_rate": 8.383908512285554e-09, "logits/chosen": 0.7262478470802307, "logits/rejected": 1.6835811138153076, "logps/chosen": -58.148380279541016, "logps/rejected": -430.723388671875, "loss": 0.7955, "nll_loss": 0.7753117084503174, "rewards/accuracies": 1.0, "rewards/chosen": 0.9399997591972351, "rewards/margins": 11.659881591796875, "rewards/rejected": -10.719882011413574, "step": 5236 }, { "epoch": 0.8728333333333333, "grad_norm": 26.417654037475586, "learning_rate": 8.362286400270779e-09, "logits/chosen": 1.394494652748108, "logits/rejected": 2.110836982727051, "logps/chosen": -66.93672180175781, "logps/rejected": -268.076416015625, "loss": 0.8039, "nll_loss": 0.7968657612800598, "rewards/accuracies": 1.0, "rewards/chosen": 2.042210340499878, "rewards/margins": 11.040315628051758, "rewards/rejected": -8.9981050491333, "step": 5237 }, { "epoch": 0.873, "grad_norm": 23.576021194458008, "learning_rate": 8.340690989285726e-09, "logits/chosen": 2.3877148628234863, "logits/rejected": 2.560960531234741, "logps/chosen": -58.832786560058594, "logps/rejected": -291.9822998046875, "loss": 0.7093, "nll_loss": 0.7003903388977051, "rewards/accuracies": 1.0, "rewards/chosen": 1.9451974630355835, "rewards/margins": 8.451197624206543, "rewards/rejected": -6.50600004196167, "step": 5238 }, { "epoch": 0.8731666666666666, "grad_norm": 51.37892150878906, "learning_rate": 8.319122285622793e-09, "logits/chosen": 2.77181339263916, "logits/rejected": 2.612161636352539, "logps/chosen": -20.503644943237305, "logps/rejected": -10.292076110839844, "loss": 1.481, "nll_loss": 0.7322731018066406, "rewards/accuracies": 1.0, "rewards/chosen": 3.5079848766326904, "rewards/margins": 1.7588319778442383, "rewards/rejected": 1.7491528987884521, "step": 5239 }, { "epoch": 0.8733333333333333, "grad_norm": 18.580631256103516, "learning_rate": 8.297580295566574e-09, "logits/chosen": 0.5023140907287598, "logits/rejected": 1.627023696899414, "logps/chosen": -32.81207275390625, "logps/rejected": -338.674560546875, "loss": 0.4222, "nll_loss": 0.4101509153842926, "rewards/accuracies": 1.0, "rewards/chosen": 1.4921127557754517, "rewards/margins": 10.08122730255127, "rewards/rejected": -8.58911418914795, "step": 5240 }, { "epoch": 0.8735, "grad_norm": 118.12852478027344, "learning_rate": 8.276065025393907e-09, "logits/chosen": 3.338454246520996, "logits/rejected": 3.0261597633361816, "logps/chosen": -48.73838424682617, "logps/rejected": -5.956496238708496, "loss": 1.9358, "nll_loss": 0.79899001121521, "rewards/accuracies": 1.0, "rewards/chosen": 1.4967069625854492, "rewards/margins": 0.002493143081665039, "rewards/rejected": 1.4942138195037842, "step": 5241 }, { "epoch": 0.8736666666666667, "grad_norm": 25.131113052368164, "learning_rate": 8.254576481373798e-09, "logits/chosen": 2.909416675567627, "logits/rejected": 2.912130832672119, "logps/chosen": -103.3763656616211, "logps/rejected": -325.439208984375, "loss": 0.9958, "nll_loss": 0.9845368266105652, "rewards/accuracies": 1.0, "rewards/chosen": 1.8918930292129517, "rewards/margins": 7.459895610809326, "rewards/rejected": -5.568002700805664, "step": 5242 }, { "epoch": 0.8738333333333334, "grad_norm": 33.415313720703125, "learning_rate": 8.233114669767505e-09, "logits/chosen": 2.7570157051086426, "logits/rejected": 2.9431111812591553, "logps/chosen": -28.881114959716797, "logps/rejected": -93.69263458251953, "loss": 0.6265, "nll_loss": 0.6016899347305298, "rewards/accuracies": 1.0, "rewards/chosen": 0.7742267847061157, "rewards/margins": 7.552741050720215, "rewards/rejected": -6.778514385223389, "step": 5243 }, { "epoch": 0.874, "grad_norm": 155.267578125, "learning_rate": 8.21167959682848e-09, "logits/chosen": 2.62265682220459, "logits/rejected": 2.760667562484741, "logps/chosen": -18.603805541992188, "logps/rejected": -18.086135864257812, "loss": 3.7196, "nll_loss": 0.6001226902008057, "rewards/accuracies": 0.0, "rewards/chosen": 1.5266640186309814, "rewards/margins": -2.476397752761841, "rewards/rejected": 4.003061771392822, "step": 5244 }, { "epoch": 0.8741666666666666, "grad_norm": 20.58220672607422, "learning_rate": 8.190271268802395e-09, "logits/chosen": 1.275984525680542, "logits/rejected": 2.169341564178467, "logps/chosen": -64.80113220214844, "logps/rejected": -179.7145233154297, "loss": 0.647, "nll_loss": 0.6415953636169434, "rewards/accuracies": 1.0, "rewards/chosen": 2.4609971046447754, "rewards/margins": 9.378753662109375, "rewards/rejected": -6.917757034301758, "step": 5245 }, { "epoch": 0.8743333333333333, "grad_norm": 12.86387825012207, "learning_rate": 8.168889691927139e-09, "logits/chosen": 2.2030560970306396, "logits/rejected": 2.0951597690582275, "logps/chosen": -126.40155792236328, "logps/rejected": -135.37741088867188, "loss": 0.5145, "nll_loss": 0.5035918354988098, "rewards/accuracies": 1.0, "rewards/chosen": 1.9116113185882568, "rewards/margins": 7.554520606994629, "rewards/rejected": -5.642909526824951, "step": 5246 }, { "epoch": 0.8745, "grad_norm": 20.421361923217773, "learning_rate": 8.14753487243276e-09, "logits/chosen": 1.7831650972366333, "logits/rejected": 1.8204991817474365, "logps/chosen": -136.6487579345703, "logps/rejected": -283.6876220703125, "loss": 0.8454, "nll_loss": 0.8435107469558716, "rewards/accuracies": 1.0, "rewards/chosen": 5.054237365722656, "rewards/margins": 11.238484382629395, "rewards/rejected": -6.184247016906738, "step": 5247 }, { "epoch": 0.8746666666666667, "grad_norm": 32.4484977722168, "learning_rate": 8.126206816541547e-09, "logits/chosen": 2.44551682472229, "logits/rejected": 2.7334747314453125, "logps/chosen": -68.43782806396484, "logps/rejected": -217.5513916015625, "loss": 1.1046, "nll_loss": 1.0863145589828491, "rewards/accuracies": 1.0, "rewards/chosen": 1.3973228931427002, "rewards/margins": 6.571112632751465, "rewards/rejected": -5.173789978027344, "step": 5248 }, { "epoch": 0.8748333333333334, "grad_norm": 18.272993087768555, "learning_rate": 8.104905530467987e-09, "logits/chosen": 2.262105941772461, "logits/rejected": 2.5360937118530273, "logps/chosen": -40.185272216796875, "logps/rejected": -325.7847595214844, "loss": 0.4546, "nll_loss": 0.4515198767185211, "rewards/accuracies": 1.0, "rewards/chosen": 3.626988172531128, "rewards/margins": 9.73967170715332, "rewards/rejected": -6.112683296203613, "step": 5249 }, { "epoch": 0.875, "grad_norm": 22.664161682128906, "learning_rate": 8.083631020418791e-09, "logits/chosen": 2.479004144668579, "logits/rejected": 2.529360294342041, "logps/chosen": -61.62083435058594, "logps/rejected": -226.19375610351562, "loss": 0.6491, "nll_loss": 0.6352663040161133, "rewards/accuracies": 1.0, "rewards/chosen": 1.3299249410629272, "rewards/margins": 11.955095291137695, "rewards/rejected": -10.625170707702637, "step": 5250 }, { "epoch": 0.8751666666666666, "grad_norm": 54.55390167236328, "learning_rate": 8.062383292592789e-09, "logits/chosen": 2.1830532550811768, "logits/rejected": 2.176947593688965, "logps/chosen": -50.05559539794922, "logps/rejected": -51.915550231933594, "loss": 1.0713, "nll_loss": 0.758418083190918, "rewards/accuracies": 1.0, "rewards/chosen": 1.850115180015564, "rewards/margins": 2.3655846118927, "rewards/rejected": -0.5154693722724915, "step": 5251 }, { "epoch": 0.8753333333333333, "grad_norm": 29.011817932128906, "learning_rate": 8.041162353181085e-09, "logits/chosen": 2.6787900924682617, "logits/rejected": 2.6837446689605713, "logps/chosen": -72.03530883789062, "logps/rejected": -168.91171264648438, "loss": 0.9785, "nll_loss": 0.9604707956314087, "rewards/accuracies": 1.0, "rewards/chosen": 1.1708450317382812, "rewards/margins": 7.436296463012695, "rewards/rejected": -6.265451431274414, "step": 5252 }, { "epoch": 0.8755, "grad_norm": 21.410058975219727, "learning_rate": 8.019968208366957e-09, "logits/chosen": 2.226083517074585, "logits/rejected": 2.3463094234466553, "logps/chosen": -91.21430206298828, "logps/rejected": -144.669921875, "loss": 0.8042, "nll_loss": 0.8001253008842468, "rewards/accuracies": 1.0, "rewards/chosen": 2.6461634635925293, "rewards/margins": 10.61392879486084, "rewards/rejected": -7.9677653312683105, "step": 5253 }, { "epoch": 0.8756666666666667, "grad_norm": 134.7574005126953, "learning_rate": 7.998800864325873e-09, "logits/chosen": 2.6613428592681885, "logits/rejected": 2.5881056785583496, "logps/chosen": -38.11582946777344, "logps/rejected": -43.17554473876953, "loss": 1.7914, "nll_loss": 0.8864147067070007, "rewards/accuracies": 1.0, "rewards/chosen": 0.4899688959121704, "rewards/margins": 0.12546426057815552, "rewards/rejected": 0.3645046353340149, "step": 5254 }, { "epoch": 0.8758333333333334, "grad_norm": 25.151023864746094, "learning_rate": 7.977660327225467e-09, "logits/chosen": 1.913852572441101, "logits/rejected": 2.422999382019043, "logps/chosen": -41.8997802734375, "logps/rejected": -277.243896484375, "loss": 0.6192, "nll_loss": 0.6072431206703186, "rewards/accuracies": 1.0, "rewards/chosen": 2.0472123622894287, "rewards/margins": 7.15008544921875, "rewards/rejected": -5.1028733253479, "step": 5255 }, { "epoch": 0.876, "grad_norm": 30.46088218688965, "learning_rate": 7.9565466032256e-09, "logits/chosen": 2.5546770095825195, "logits/rejected": 2.555701732635498, "logps/chosen": -22.506505966186523, "logps/rejected": -143.072265625, "loss": 0.5438, "nll_loss": 0.5115115642547607, "rewards/accuracies": 1.0, "rewards/chosen": 0.4673612713813782, "rewards/margins": 7.696712493896484, "rewards/rejected": -7.229351043701172, "step": 5256 }, { "epoch": 0.8761666666666666, "grad_norm": 21.061655044555664, "learning_rate": 7.935459698478298e-09, "logits/chosen": 1.485910177230835, "logits/rejected": 1.972930908203125, "logps/chosen": -106.04150390625, "logps/rejected": -432.57037353515625, "loss": 0.8489, "nll_loss": 0.8349725604057312, "rewards/accuracies": 1.0, "rewards/chosen": 1.392575979232788, "rewards/margins": 8.312822341918945, "rewards/rejected": -6.920246124267578, "step": 5257 }, { "epoch": 0.8763333333333333, "grad_norm": 24.64811897277832, "learning_rate": 7.914399619127798e-09, "logits/chosen": 2.7362117767333984, "logits/rejected": 2.741331100463867, "logps/chosen": -199.7678985595703, "logps/rejected": -250.92979431152344, "loss": 0.8887, "nll_loss": 0.8573729395866394, "rewards/accuracies": 1.0, "rewards/chosen": 0.4699234366416931, "rewards/margins": 11.151540756225586, "rewards/rejected": -10.681617736816406, "step": 5258 }, { "epoch": 0.8765, "grad_norm": 24.100679397583008, "learning_rate": 7.89336637131046e-09, "logits/chosen": 2.108980417251587, "logits/rejected": 2.2995336055755615, "logps/chosen": -95.5285873413086, "logps/rejected": -200.5403289794922, "loss": 1.0516, "nll_loss": 1.049764633178711, "rewards/accuracies": 1.0, "rewards/chosen": 3.4290382862091064, "rewards/margins": 12.755910873413086, "rewards/rejected": -9.326872825622559, "step": 5259 }, { "epoch": 0.8766666666666667, "grad_norm": 27.54085922241211, "learning_rate": 7.872359961154907e-09, "logits/chosen": 2.717938184738159, "logits/rejected": 2.8781321048736572, "logps/chosen": -54.3297233581543, "logps/rejected": -385.46307373046875, "loss": 0.8132, "nll_loss": 0.7989665269851685, "rewards/accuracies": 1.0, "rewards/chosen": 4.57785701751709, "rewards/margins": 8.542059898376465, "rewards/rejected": -3.964202880859375, "step": 5260 }, { "epoch": 0.8768333333333334, "grad_norm": 23.292560577392578, "learning_rate": 7.85138039478188e-09, "logits/chosen": 2.021177053451538, "logits/rejected": 2.1823408603668213, "logps/chosen": -46.86393737792969, "logps/rejected": -444.192138671875, "loss": 0.6241, "nll_loss": 0.6166307330131531, "rewards/accuracies": 1.0, "rewards/chosen": 1.9793541431427002, "rewards/margins": 11.263326644897461, "rewards/rejected": -9.28397274017334, "step": 5261 }, { "epoch": 0.877, "grad_norm": 52.42024612426758, "learning_rate": 7.830427678304352e-09, "logits/chosen": 0.7440093755722046, "logits/rejected": 1.8205112218856812, "logps/chosen": -29.50120735168457, "logps/rejected": -226.8516845703125, "loss": 0.9515, "nll_loss": 0.9219127893447876, "rewards/accuracies": 1.0, "rewards/chosen": 1.4417451620101929, "rewards/margins": 5.406694412231445, "rewards/rejected": -3.964949131011963, "step": 5262 }, { "epoch": 0.8771666666666667, "grad_norm": 16.216157913208008, "learning_rate": 7.80950181782739e-09, "logits/chosen": 1.6060914993286133, "logits/rejected": 1.7554835081100464, "logps/chosen": -155.05596923828125, "logps/rejected": -257.59246826171875, "loss": 0.8588, "nll_loss": 0.8566627502441406, "rewards/accuracies": 1.0, "rewards/chosen": 3.2243118286132812, "rewards/margins": 13.04590892791748, "rewards/rejected": -9.8215970993042, "step": 5263 }, { "epoch": 0.8773333333333333, "grad_norm": 23.459543228149414, "learning_rate": 7.788602819448332e-09, "logits/chosen": 3.5744569301605225, "logits/rejected": 3.745697259902954, "logps/chosen": -92.9053955078125, "logps/rejected": -236.33094787597656, "loss": 0.9285, "nll_loss": 0.9198554754257202, "rewards/accuracies": 1.0, "rewards/chosen": 2.1934478282928467, "rewards/margins": 7.9168291091918945, "rewards/rejected": -5.723381042480469, "step": 5264 }, { "epoch": 0.8775, "grad_norm": 22.61062240600586, "learning_rate": 7.767730689256614e-09, "logits/chosen": 0.7639524340629578, "logits/rejected": 2.3713197708129883, "logps/chosen": -32.24641036987305, "logps/rejected": -319.05047607421875, "loss": 0.4762, "nll_loss": 0.46066299080848694, "rewards/accuracies": 1.0, "rewards/chosen": 1.32636296749115, "rewards/margins": 7.702293872833252, "rewards/rejected": -6.3759307861328125, "step": 5265 }, { "epoch": 0.8776666666666667, "grad_norm": 26.11496353149414, "learning_rate": 7.746885433333894e-09, "logits/chosen": 2.727806329727173, "logits/rejected": 2.9158451557159424, "logps/chosen": -28.05839729309082, "logps/rejected": -373.6806640625, "loss": 0.4817, "nll_loss": 0.4599737524986267, "rewards/accuracies": 1.0, "rewards/chosen": 0.8596544861793518, "rewards/margins": 12.971617698669434, "rewards/rejected": -12.111963272094727, "step": 5266 }, { "epoch": 0.8778333333333334, "grad_norm": 142.05418395996094, "learning_rate": 7.726067057753993e-09, "logits/chosen": 2.5234997272491455, "logits/rejected": 2.5714986324310303, "logps/chosen": -5.586519241333008, "logps/rejected": -58.54218292236328, "loss": 2.2449, "nll_loss": 0.39903706312179565, "rewards/accuracies": 0.0, "rewards/chosen": 1.2632482051849365, "rewards/margins": -1.0774669647216797, "rewards/rejected": 2.340715169906616, "step": 5267 }, { "epoch": 0.878, "grad_norm": 35.4452018737793, "learning_rate": 7.705275568582847e-09, "logits/chosen": 1.797733187675476, "logits/rejected": 2.1741256713867188, "logps/chosen": -28.8771915435791, "logps/rejected": -120.9061050415039, "loss": 0.5806, "nll_loss": 0.5553306341171265, "rewards/accuracies": 1.0, "rewards/chosen": 1.2332922220230103, "rewards/margins": 5.787248134613037, "rewards/rejected": -4.553956031799316, "step": 5268 }, { "epoch": 0.8781666666666667, "grad_norm": 62.32865905761719, "learning_rate": 7.68451097187861e-09, "logits/chosen": 3.0795507431030273, "logits/rejected": 2.7858052253723145, "logps/chosen": -45.41615295410156, "logps/rejected": -78.20391845703125, "loss": 1.5268, "nll_loss": 1.513871669769287, "rewards/accuracies": 1.0, "rewards/chosen": 2.4075608253479004, "rewards/margins": 6.995776653289795, "rewards/rejected": -4.5882158279418945, "step": 5269 }, { "epoch": 0.8783333333333333, "grad_norm": 15.119308471679688, "learning_rate": 7.663773273691599e-09, "logits/chosen": 2.793564796447754, "logits/rejected": 2.7676169872283936, "logps/chosen": -75.70805358886719, "logps/rejected": -53.66545486450195, "loss": 0.4811, "nll_loss": 0.453341543674469, "rewards/accuracies": 1.0, "rewards/chosen": 2.9024887084960938, "rewards/margins": 6.297413349151611, "rewards/rejected": -3.3949246406555176, "step": 5270 }, { "epoch": 0.8785, "grad_norm": 32.81357955932617, "learning_rate": 7.6430624800643e-09, "logits/chosen": 2.6052067279815674, "logits/rejected": 2.749013900756836, "logps/chosen": -137.99307250976562, "logps/rejected": -209.73956298828125, "loss": 0.8781, "nll_loss": 0.8165269494056702, "rewards/accuracies": 1.0, "rewards/chosen": 0.7842666506767273, "rewards/margins": 4.100680828094482, "rewards/rejected": -3.3164141178131104, "step": 5271 }, { "epoch": 0.8786666666666667, "grad_norm": 132.84291076660156, "learning_rate": 7.622378597031287e-09, "logits/chosen": 2.8882830142974854, "logits/rejected": 2.996338367462158, "logps/chosen": -48.837074279785156, "logps/rejected": -61.055580139160156, "loss": 1.8213, "nll_loss": 0.904390275478363, "rewards/accuracies": 1.0, "rewards/chosen": 3.600177764892578, "rewards/margins": 1.3110640048980713, "rewards/rejected": 2.289113759994507, "step": 5272 }, { "epoch": 0.8788333333333334, "grad_norm": 18.882732391357422, "learning_rate": 7.601721630619384e-09, "logits/chosen": 2.768728733062744, "logits/rejected": 3.0876057147979736, "logps/chosen": -62.6588134765625, "logps/rejected": -323.90631103515625, "loss": 0.5627, "nll_loss": 0.5545027852058411, "rewards/accuracies": 1.0, "rewards/chosen": 1.909391164779663, "rewards/margins": 9.764487266540527, "rewards/rejected": -7.855096340179443, "step": 5273 }, { "epoch": 0.879, "grad_norm": 25.218456268310547, "learning_rate": 7.581091586847522e-09, "logits/chosen": 2.469348192214966, "logits/rejected": 2.500248670578003, "logps/chosen": -153.29092407226562, "logps/rejected": -262.2689208984375, "loss": 1.3494, "nll_loss": 1.344657063484192, "rewards/accuracies": 1.0, "rewards/chosen": 2.4568710327148438, "rewards/margins": 11.204307556152344, "rewards/rejected": -8.7474365234375, "step": 5274 }, { "epoch": 0.8791666666666667, "grad_norm": 40.422813415527344, "learning_rate": 7.560488471726822e-09, "logits/chosen": 2.511017322540283, "logits/rejected": 2.4448671340942383, "logps/chosen": -8.842957496643066, "logps/rejected": -120.47340393066406, "loss": 0.361, "nll_loss": 0.34011366963386536, "rewards/accuracies": 1.0, "rewards/chosen": 2.341082811355591, "rewards/margins": 6.256773948669434, "rewards/rejected": -3.9156908988952637, "step": 5275 }, { "epoch": 0.8793333333333333, "grad_norm": 26.123992919921875, "learning_rate": 7.539912291260508e-09, "logits/chosen": 1.127261757850647, "logits/rejected": 0.769943356513977, "logps/chosen": -98.22151184082031, "logps/rejected": -90.4792709350586, "loss": 0.9179, "nll_loss": 0.8929227590560913, "rewards/accuracies": 1.0, "rewards/chosen": 1.805983066558838, "rewards/margins": 5.755789756774902, "rewards/rejected": -3.9498069286346436, "step": 5276 }, { "epoch": 0.8795, "grad_norm": 120.34876251220703, "learning_rate": 7.519363051443994e-09, "logits/chosen": 2.8346364498138428, "logits/rejected": 2.919212579727173, "logps/chosen": -29.58774757385254, "logps/rejected": -49.3028564453125, "loss": 1.2465, "nll_loss": 0.6295265555381775, "rewards/accuracies": 1.0, "rewards/chosen": 0.10504703968763351, "rewards/margins": 0.6298189163208008, "rewards/rejected": -0.5247718691825867, "step": 5277 }, { "epoch": 0.8796666666666667, "grad_norm": 27.006078720092773, "learning_rate": 7.498840758264846e-09, "logits/chosen": 2.8023366928100586, "logits/rejected": 2.8060109615325928, "logps/chosen": -45.825408935546875, "logps/rejected": -213.79052734375, "loss": 0.614, "nll_loss": 0.6029658913612366, "rewards/accuracies": 1.0, "rewards/chosen": 1.584014892578125, "rewards/margins": 9.698617935180664, "rewards/rejected": -8.114603042602539, "step": 5278 }, { "epoch": 0.8798333333333334, "grad_norm": 20.430448532104492, "learning_rate": 7.478345417702769e-09, "logits/chosen": 2.8503472805023193, "logits/rejected": 3.020991086959839, "logps/chosen": -81.95027160644531, "logps/rejected": -283.26861572265625, "loss": 0.783, "nll_loss": 0.7731159329414368, "rewards/accuracies": 1.0, "rewards/chosen": 1.6797189712524414, "rewards/margins": 12.542411804199219, "rewards/rejected": -10.862692832946777, "step": 5279 }, { "epoch": 0.88, "grad_norm": 50.349754333496094, "learning_rate": 7.457877035729587e-09, "logits/chosen": 2.9022793769836426, "logits/rejected": 2.824171543121338, "logps/chosen": -16.88113021850586, "logps/rejected": -63.21491622924805, "loss": 0.5505, "nll_loss": 0.46892037987709045, "rewards/accuracies": 1.0, "rewards/chosen": 1.2246065139770508, "rewards/margins": 3.816744804382324, "rewards/rejected": -2.5921382904052734, "step": 5280 }, { "epoch": 0.8801666666666667, "grad_norm": 65.96554565429688, "learning_rate": 7.4374356183093175e-09, "logits/chosen": 2.4638969898223877, "logits/rejected": 2.4810702800750732, "logps/chosen": -60.65930938720703, "logps/rejected": -58.933921813964844, "loss": 2.2684, "nll_loss": 2.2466413974761963, "rewards/accuracies": 1.0, "rewards/chosen": 1.3573358058929443, "rewards/margins": 6.082195281982422, "rewards/rejected": -4.724859714508057, "step": 5281 }, { "epoch": 0.8803333333333333, "grad_norm": 58.82879638671875, "learning_rate": 7.417021171398097e-09, "logits/chosen": 2.5128402709960938, "logits/rejected": 2.8218326568603516, "logps/chosen": -62.276771545410156, "logps/rejected": -224.5791015625, "loss": 1.7076, "nll_loss": 1.68315589427948, "rewards/accuracies": 1.0, "rewards/chosen": 1.3862075805664062, "rewards/margins": 5.7825164794921875, "rewards/rejected": -4.396308898925781, "step": 5282 }, { "epoch": 0.8805, "grad_norm": 24.354969024658203, "learning_rate": 7.396633700944199e-09, "logits/chosen": 1.57387113571167, "logits/rejected": 1.5841642618179321, "logps/chosen": -49.19718933105469, "logps/rejected": -137.861083984375, "loss": 0.7521, "nll_loss": 0.7454120516777039, "rewards/accuracies": 1.0, "rewards/chosen": 3.0251870155334473, "rewards/margins": 8.25538444519043, "rewards/rejected": -5.230196952819824, "step": 5283 }, { "epoch": 0.8806666666666667, "grad_norm": 37.315486907958984, "learning_rate": 7.3762732128880625e-09, "logits/chosen": 2.6943695545196533, "logits/rejected": 3.011486768722534, "logps/chosen": -32.67218017578125, "logps/rejected": -125.18690490722656, "loss": 0.6649, "nll_loss": 0.6534435749053955, "rewards/accuracies": 1.0, "rewards/chosen": 1.6456708908081055, "rewards/margins": 8.225300788879395, "rewards/rejected": -6.579629898071289, "step": 5284 }, { "epoch": 0.8808333333333334, "grad_norm": 35.905155181884766, "learning_rate": 7.355939713162218e-09, "logits/chosen": 2.7004597187042236, "logits/rejected": 2.9055793285369873, "logps/chosen": -89.20258331298828, "logps/rejected": -292.7478942871094, "loss": 1.0657, "nll_loss": 1.0619356632232666, "rewards/accuracies": 1.0, "rewards/chosen": 2.6799769401550293, "rewards/margins": 11.391702651977539, "rewards/rejected": -8.711725234985352, "step": 5285 }, { "epoch": 0.881, "grad_norm": 32.14225387573242, "learning_rate": 7.335633207691361e-09, "logits/chosen": 2.648629903793335, "logits/rejected": 2.883523941040039, "logps/chosen": -59.31559371948242, "logps/rejected": -164.64405822753906, "loss": 0.9757, "nll_loss": 0.9567031860351562, "rewards/accuracies": 1.0, "rewards/chosen": 1.9438602924346924, "rewards/margins": 6.242536544799805, "rewards/rejected": -4.298676490783691, "step": 5286 }, { "epoch": 0.8811666666666667, "grad_norm": 44.23765563964844, "learning_rate": 7.315353702392335e-09, "logits/chosen": 2.710907459259033, "logits/rejected": 2.5700974464416504, "logps/chosen": -117.20361328125, "logps/rejected": -178.61285400390625, "loss": 1.2901, "nll_loss": 1.26025390625, "rewards/accuracies": 1.0, "rewards/chosen": 0.57501220703125, "rewards/margins": 7.227317810058594, "rewards/rejected": -6.652305603027344, "step": 5287 }, { "epoch": 0.8813333333333333, "grad_norm": 50.414329528808594, "learning_rate": 7.295101203174103e-09, "logits/chosen": 1.6247512102127075, "logits/rejected": 1.7851773500442505, "logps/chosen": -82.55754852294922, "logps/rejected": -110.82260131835938, "loss": 1.038, "nll_loss": 0.9381540417671204, "rewards/accuracies": 1.0, "rewards/chosen": 1.5759438276290894, "rewards/margins": 3.74798583984375, "rewards/rejected": -2.17204213142395, "step": 5288 }, { "epoch": 0.8815, "grad_norm": 47.97669219970703, "learning_rate": 7.274875715937745e-09, "logits/chosen": 1.7253602743148804, "logits/rejected": 2.370504856109619, "logps/chosen": -17.852067947387695, "logps/rejected": -176.09246826171875, "loss": 0.716, "nll_loss": 0.714082658290863, "rewards/accuracies": 1.0, "rewards/chosen": 3.995856761932373, "rewards/margins": 10.692571640014648, "rewards/rejected": -6.696714401245117, "step": 5289 }, { "epoch": 0.8816666666666667, "grad_norm": 31.819059371948242, "learning_rate": 7.254677246576468e-09, "logits/chosen": 2.862351179122925, "logits/rejected": 3.028000831604004, "logps/chosen": -74.24108123779297, "logps/rejected": -203.52078247070312, "loss": 1.0012, "nll_loss": 0.989881157875061, "rewards/accuracies": 1.0, "rewards/chosen": 1.5633316040039062, "rewards/margins": 9.448562622070312, "rewards/rejected": -7.885231018066406, "step": 5290 }, { "epoch": 0.8818333333333334, "grad_norm": 399.8896484375, "learning_rate": 7.234505800975654e-09, "logits/chosen": 2.5099706649780273, "logits/rejected": 2.6267576217651367, "logps/chosen": -77.88194274902344, "logps/rejected": -32.21122741699219, "loss": 6.8653, "nll_loss": 2.884516477584839, "rewards/accuracies": 0.0, "rewards/chosen": -2.956623077392578, "rewards/margins": -3.909860134124756, "rewards/rejected": 0.9532371759414673, "step": 5291 }, { "epoch": 0.882, "grad_norm": 33.12112808227539, "learning_rate": 7.21436138501278e-09, "logits/chosen": 2.6736950874328613, "logits/rejected": 2.638195753097534, "logps/chosen": -113.61052703857422, "logps/rejected": -64.38999938964844, "loss": 0.9765, "nll_loss": 0.8875821828842163, "rewards/accuracies": 1.0, "rewards/chosen": 1.4042999744415283, "rewards/margins": 3.795888662338257, "rewards/rejected": -2.3915886878967285, "step": 5292 }, { "epoch": 0.8821666666666667, "grad_norm": 225.43235778808594, "learning_rate": 7.1942440045574035e-09, "logits/chosen": 2.375826597213745, "logits/rejected": 2.205728530883789, "logps/chosen": -79.94989013671875, "logps/rejected": -26.17903709411621, "loss": 4.8028, "nll_loss": 0.8690204620361328, "rewards/accuracies": 0.0, "rewards/chosen": 1.6315011978149414, "rewards/margins": -3.306452751159668, "rewards/rejected": 4.937953948974609, "step": 5293 }, { "epoch": 0.8823333333333333, "grad_norm": 28.81361198425293, "learning_rate": 7.174153665471272e-09, "logits/chosen": 2.758427858352661, "logits/rejected": 2.7724063396453857, "logps/chosen": -71.68748474121094, "logps/rejected": -167.95208740234375, "loss": 0.9735, "nll_loss": 0.9558331370353699, "rewards/accuracies": 1.0, "rewards/chosen": 1.20562744140625, "rewards/margins": 7.375115871429443, "rewards/rejected": -6.169488430023193, "step": 5294 }, { "epoch": 0.8825, "grad_norm": 92.27344512939453, "learning_rate": 7.154090373608235e-09, "logits/chosen": 2.016808032989502, "logits/rejected": 1.8390501737594604, "logps/chosen": -160.48306274414062, "logps/rejected": -65.880859375, "loss": 1.4396, "nll_loss": 0.8105204701423645, "rewards/accuracies": 1.0, "rewards/chosen": 2.425567626953125, "rewards/margins": 1.5112693309783936, "rewards/rejected": 0.9142982363700867, "step": 5295 }, { "epoch": 0.8826666666666667, "grad_norm": 33.352081298828125, "learning_rate": 7.134054134814249e-09, "logits/chosen": 2.586941719055176, "logits/rejected": 2.6286849975585938, "logps/chosen": -49.03107452392578, "logps/rejected": -98.941650390625, "loss": 0.8071, "nll_loss": 0.7661104202270508, "rewards/accuracies": 1.0, "rewards/chosen": 1.00023353099823, "rewards/margins": 4.820688247680664, "rewards/rejected": -3.8204548358917236, "step": 5296 }, { "epoch": 0.8828333333333334, "grad_norm": 25.650617599487305, "learning_rate": 7.1140449549273895e-09, "logits/chosen": 2.203446388244629, "logits/rejected": 2.4594578742980957, "logps/chosen": -89.09872436523438, "logps/rejected": -553.0607299804688, "loss": 0.93, "nll_loss": 0.928111732006073, "rewards/accuracies": 1.0, "rewards/chosen": 3.360224962234497, "rewards/margins": 13.851542472839355, "rewards/rejected": -10.491317749023438, "step": 5297 }, { "epoch": 0.883, "grad_norm": 32.22582244873047, "learning_rate": 7.094062839777837e-09, "logits/chosen": 1.6561781167984009, "logits/rejected": 2.8289709091186523, "logps/chosen": -52.60948944091797, "logps/rejected": -302.87322998046875, "loss": 0.8779, "nll_loss": 0.8624506592750549, "rewards/accuracies": 1.0, "rewards/chosen": 1.5345101356506348, "rewards/margins": 6.916431903839111, "rewards/rejected": -5.381921768188477, "step": 5298 }, { "epoch": 0.8831666666666667, "grad_norm": 161.66586303710938, "learning_rate": 7.074107795187922e-09, "logits/chosen": 3.8690011501312256, "logits/rejected": 3.744486093521118, "logps/chosen": -55.833595275878906, "logps/rejected": -121.7128677368164, "loss": 1.4863, "nll_loss": 0.6417655348777771, "rewards/accuracies": 1.0, "rewards/chosen": 1.4173517227172852, "rewards/margins": 0.5268800854682922, "rewards/rejected": 0.8904716372489929, "step": 5299 }, { "epoch": 0.8833333333333333, "grad_norm": 39.80784225463867, "learning_rate": 7.0541798269720735e-09, "logits/chosen": 2.6512162685394287, "logits/rejected": 2.761784076690674, "logps/chosen": -36.84388732910156, "logps/rejected": -164.55482482910156, "loss": 0.7325, "nll_loss": 0.6951677799224854, "rewards/accuracies": 1.0, "rewards/chosen": 0.5188560485839844, "rewards/margins": 5.599908351898193, "rewards/rejected": -5.081052303314209, "step": 5300 }, { "epoch": 0.8835, "grad_norm": 214.4221649169922, "learning_rate": 7.034278940936789e-09, "logits/chosen": 2.7871837615966797, "logits/rejected": 2.8788034915924072, "logps/chosen": -66.06620025634766, "logps/rejected": -19.769773483276367, "loss": 3.5198, "nll_loss": 0.7593815326690674, "rewards/accuracies": 0.0, "rewards/chosen": 1.8183397054672241, "rewards/margins": -2.0266222953796387, "rewards/rejected": 3.8449618816375732, "step": 5301 }, { "epoch": 0.8836666666666667, "grad_norm": 197.11355590820312, "learning_rate": 7.01440514288073e-09, "logits/chosen": 2.219048023223877, "logits/rejected": 2.169975996017456, "logps/chosen": -32.697078704833984, "logps/rejected": -65.03776550292969, "loss": 2.3671, "nll_loss": 2.335505962371826, "rewards/accuracies": 1.0, "rewards/chosen": 0.7546497583389282, "rewards/margins": 5.726810455322266, "rewards/rejected": -4.972160816192627, "step": 5302 }, { "epoch": 0.8838333333333334, "grad_norm": 27.254047393798828, "learning_rate": 6.994558438594644e-09, "logits/chosen": 1.8612971305847168, "logits/rejected": 2.0348129272460938, "logps/chosen": -19.695545196533203, "logps/rejected": -91.4708251953125, "loss": 0.4588, "nll_loss": 0.44762593507766724, "rewards/accuracies": 1.0, "rewards/chosen": 1.8002796173095703, "rewards/margins": 7.663178443908691, "rewards/rejected": -5.862898826599121, "step": 5303 }, { "epoch": 0.884, "grad_norm": 20.75889778137207, "learning_rate": 6.974738833861382e-09, "logits/chosen": 1.1633806228637695, "logits/rejected": 2.076519012451172, "logps/chosen": -110.24002075195312, "logps/rejected": -298.6627197265625, "loss": 0.8749, "nll_loss": 0.8680317997932434, "rewards/accuracies": 1.0, "rewards/chosen": 2.1151123046875, "rewards/margins": 9.663214683532715, "rewards/rejected": -7.548102378845215, "step": 5304 }, { "epoch": 0.8841666666666667, "grad_norm": 89.5851821899414, "learning_rate": 6.9549463344559134e-09, "logits/chosen": 1.956200122833252, "logits/rejected": 2.629105567932129, "logps/chosen": -68.72364807128906, "logps/rejected": -248.92135620117188, "loss": 1.9813, "nll_loss": 1.9089902639389038, "rewards/accuracies": 1.0, "rewards/chosen": -0.45142674446105957, "rewards/margins": 6.7573041915893555, "rewards/rejected": -7.208731174468994, "step": 5305 }, { "epoch": 0.8843333333333333, "grad_norm": 171.3928680419922, "learning_rate": 6.935180946145269e-09, "logits/chosen": 2.741466999053955, "logits/rejected": 2.735250949859619, "logps/chosen": -41.52901077270508, "logps/rejected": -15.051188468933105, "loss": 4.1068, "nll_loss": 0.5191125869750977, "rewards/accuracies": 0.0, "rewards/chosen": 1.1723175048828125, "rewards/margins": -3.0374855995178223, "rewards/rejected": 4.209803104400635, "step": 5306 }, { "epoch": 0.8845, "grad_norm": 54.49110412597656, "learning_rate": 6.915442674688632e-09, "logits/chosen": 1.7060500383377075, "logits/rejected": 2.5423834323883057, "logps/chosen": -3.909702777862549, "logps/rejected": -204.67156982421875, "loss": 0.3168, "nll_loss": 0.2792644798755646, "rewards/accuracies": 1.0, "rewards/chosen": 0.778566837310791, "rewards/margins": 5.125545501708984, "rewards/rejected": -4.346978664398193, "step": 5307 }, { "epoch": 0.8846666666666667, "grad_norm": 31.1617431640625, "learning_rate": 6.895731525837245e-09, "logits/chosen": 0.12311039865016937, "logits/rejected": 2.365717887878418, "logps/chosen": -34.07474136352539, "logps/rejected": -770.385986328125, "loss": 0.7111, "nll_loss": 0.7098904252052307, "rewards/accuracies": 1.0, "rewards/chosen": 3.9830899238586426, "rewards/margins": 12.477279663085938, "rewards/rejected": -8.494189262390137, "step": 5308 }, { "epoch": 0.8848333333333334, "grad_norm": 22.259990692138672, "learning_rate": 6.876047505334481e-09, "logits/chosen": 1.7160664796829224, "logits/rejected": 2.4392011165618896, "logps/chosen": -89.56371307373047, "logps/rejected": -328.6105041503906, "loss": 0.7773, "nll_loss": 0.7721010446548462, "rewards/accuracies": 1.0, "rewards/chosen": 2.4515817165374756, "rewards/margins": 9.698820114135742, "rewards/rejected": -7.2472381591796875, "step": 5309 }, { "epoch": 0.885, "grad_norm": 30.75371742248535, "learning_rate": 6.856390618915775e-09, "logits/chosen": 0.033956725150346756, "logits/rejected": 1.9836825132369995, "logps/chosen": -49.94998550415039, "logps/rejected": -665.8480224609375, "loss": 0.6813, "nll_loss": 0.6572366952896118, "rewards/accuracies": 1.0, "rewards/chosen": 0.7483093738555908, "rewards/margins": 17.635034561157227, "rewards/rejected": -16.8867244720459, "step": 5310 }, { "epoch": 0.8851666666666667, "grad_norm": 15.243669509887695, "learning_rate": 6.8367608723086775e-09, "logits/chosen": 1.0537031888961792, "logits/rejected": 1.2638224363327026, "logps/chosen": -227.90576171875, "logps/rejected": -264.79412841796875, "loss": 0.8452, "nll_loss": 0.8409807085990906, "rewards/accuracies": 1.0, "rewards/chosen": 2.6513657569885254, "rewards/margins": 10.125248908996582, "rewards/rejected": -7.473883152008057, "step": 5311 }, { "epoch": 0.8853333333333333, "grad_norm": 40.40380096435547, "learning_rate": 6.8171582712328254e-09, "logits/chosen": 2.097247838973999, "logits/rejected": 1.4545880556106567, "logps/chosen": -76.94917297363281, "logps/rejected": -34.43172836303711, "loss": 1.2656, "nll_loss": 1.183833360671997, "rewards/accuracies": 1.0, "rewards/chosen": 1.3302650451660156, "rewards/margins": 3.866990566253662, "rewards/rejected": -2.5367255210876465, "step": 5312 }, { "epoch": 0.8855, "grad_norm": 28.98128890991211, "learning_rate": 6.797582821399972e-09, "logits/chosen": 2.7629928588867188, "logits/rejected": 2.8358116149902344, "logps/chosen": -57.55160903930664, "logps/rejected": -187.86688232421875, "loss": 0.8381, "nll_loss": 0.822165846824646, "rewards/accuracies": 1.0, "rewards/chosen": 1.3599056005477905, "rewards/margins": 7.282006740570068, "rewards/rejected": -5.922101020812988, "step": 5313 }, { "epoch": 0.8856666666666667, "grad_norm": 34.440513610839844, "learning_rate": 6.778034528513876e-09, "logits/chosen": 3.0062432289123535, "logits/rejected": 3.1607885360717773, "logps/chosen": -81.77739715576172, "logps/rejected": -105.87662506103516, "loss": 0.9498, "nll_loss": 0.9086377620697021, "rewards/accuracies": 1.0, "rewards/chosen": 0.9138344526290894, "rewards/margins": 4.829079627990723, "rewards/rejected": -3.915245294570923, "step": 5314 }, { "epoch": 0.8858333333333334, "grad_norm": 22.254182815551758, "learning_rate": 6.758513398270482e-09, "logits/chosen": 1.7102024555206299, "logits/rejected": 2.2111124992370605, "logps/chosen": -97.23512268066406, "logps/rejected": -303.99517822265625, "loss": 0.8151, "nll_loss": 0.8035960793495178, "rewards/accuracies": 1.0, "rewards/chosen": 1.6712770462036133, "rewards/margins": 8.006444931030273, "rewards/rejected": -6.335167407989502, "step": 5315 }, { "epoch": 0.886, "grad_norm": 19.37120246887207, "learning_rate": 6.739019436357774e-09, "logits/chosen": 2.6945176124572754, "logits/rejected": 2.4735684394836426, "logps/chosen": -152.665771484375, "logps/rejected": -307.72515869140625, "loss": 0.7625, "nll_loss": 0.752048134803772, "rewards/accuracies": 1.0, "rewards/chosen": 2.4771087169647217, "rewards/margins": 7.371594429016113, "rewards/rejected": -4.8944854736328125, "step": 5316 }, { "epoch": 0.8861666666666667, "grad_norm": 23.160146713256836, "learning_rate": 6.719552648455829e-09, "logits/chosen": 2.7276508808135986, "logits/rejected": 2.999424695968628, "logps/chosen": -41.04814147949219, "logps/rejected": -218.41790771484375, "loss": 0.6077, "nll_loss": 0.6036491394042969, "rewards/accuracies": 1.0, "rewards/chosen": 2.794813871383667, "rewards/margins": 9.731490135192871, "rewards/rejected": -6.936676025390625, "step": 5317 }, { "epoch": 0.8863333333333333, "grad_norm": 10.27910041809082, "learning_rate": 6.7001130402367766e-09, "logits/chosen": 1.2778908014297485, "logits/rejected": 1.3817706108093262, "logps/chosen": -111.82826232910156, "logps/rejected": -211.5342254638672, "loss": 0.4711, "nll_loss": 0.46401771903038025, "rewards/accuracies": 1.0, "rewards/chosen": 4.028178691864014, "rewards/margins": 8.833502769470215, "rewards/rejected": -4.805324077606201, "step": 5318 }, { "epoch": 0.8865, "grad_norm": 29.23273468017578, "learning_rate": 6.680700617364876e-09, "logits/chosen": 2.404024839401245, "logits/rejected": 2.6688435077667236, "logps/chosen": -62.95038604736328, "logps/rejected": -343.67034912109375, "loss": 0.8586, "nll_loss": 0.8393386602401733, "rewards/accuracies": 1.0, "rewards/chosen": 1.1127303838729858, "rewards/margins": 7.217296123504639, "rewards/rejected": -6.104565620422363, "step": 5319 }, { "epoch": 0.8866666666666667, "grad_norm": 41.04399108886719, "learning_rate": 6.661315385496424e-09, "logits/chosen": 3.045502185821533, "logits/rejected": 3.1009891033172607, "logps/chosen": -37.55501937866211, "logps/rejected": -133.7059326171875, "loss": 1.1415, "nll_loss": 1.1380311250686646, "rewards/accuracies": 1.0, "rewards/chosen": 3.7654712200164795, "rewards/margins": 9.581753730773926, "rewards/rejected": -5.816282749176025, "step": 5320 }, { "epoch": 0.8868333333333334, "grad_norm": 70.3553466796875, "learning_rate": 6.6419573502798365e-09, "logits/chosen": 1.746599555015564, "logits/rejected": 1.4224050045013428, "logps/chosen": -27.41636848449707, "logps/rejected": -43.5511360168457, "loss": 1.3011, "nll_loss": 1.2461986541748047, "rewards/accuracies": 1.0, "rewards/chosen": 1.6493597030639648, "rewards/margins": 4.567775726318359, "rewards/rejected": -2.9184160232543945, "step": 5321 }, { "epoch": 0.887, "grad_norm": 20.507373809814453, "learning_rate": 6.622626517355556e-09, "logits/chosen": 0.9974578619003296, "logits/rejected": 1.7533528804779053, "logps/chosen": -111.79702758789062, "logps/rejected": -358.33203125, "loss": 0.9541, "nll_loss": 0.9474324584007263, "rewards/accuracies": 1.0, "rewards/chosen": 2.113542318344116, "rewards/margins": 10.40721607208252, "rewards/rejected": -8.293673515319824, "step": 5322 }, { "epoch": 0.8871666666666667, "grad_norm": 24.652019500732422, "learning_rate": 6.603322892356133e-09, "logits/chosen": 0.917924702167511, "logits/rejected": 1.8613438606262207, "logps/chosen": -38.4583854675293, "logps/rejected": -357.03240966796875, "loss": 0.5464, "nll_loss": 0.5268272161483765, "rewards/accuracies": 1.0, "rewards/chosen": 1.2995182275772095, "rewards/margins": 6.473132610321045, "rewards/rejected": -5.173614501953125, "step": 5323 }, { "epoch": 0.8873333333333333, "grad_norm": 31.048717498779297, "learning_rate": 6.584046480906169e-09, "logits/chosen": 1.1413776874542236, "logits/rejected": 2.0245015621185303, "logps/chosen": -47.83152389526367, "logps/rejected": -229.3226776123047, "loss": 0.88, "nll_loss": 0.8541343808174133, "rewards/accuracies": 1.0, "rewards/chosen": 0.7807850241661072, "rewards/margins": 6.857507705688477, "rewards/rejected": -6.076722621917725, "step": 5324 }, { "epoch": 0.8875, "grad_norm": 26.9903621673584, "learning_rate": 6.56479728862237e-09, "logits/chosen": 2.3722758293151855, "logits/rejected": 2.6343791484832764, "logps/chosen": -88.508544921875, "logps/rejected": -553.4212646484375, "loss": 0.9384, "nll_loss": 0.9316688776016235, "rewards/accuracies": 1.0, "rewards/chosen": 2.0677177906036377, "rewards/margins": 14.485185623168945, "rewards/rejected": -12.417468070983887, "step": 5325 }, { "epoch": 0.8876666666666667, "grad_norm": 35.78529739379883, "learning_rate": 6.545575321113484e-09, "logits/chosen": 2.588646173477173, "logits/rejected": 2.8082776069641113, "logps/chosen": -31.64748764038086, "logps/rejected": -60.189697265625, "loss": 0.8183, "nll_loss": 0.7718901038169861, "rewards/accuracies": 1.0, "rewards/chosen": 0.7730832099914551, "rewards/margins": 4.626755714416504, "rewards/rejected": -3.853672742843628, "step": 5326 }, { "epoch": 0.8878333333333334, "grad_norm": 157.25808715820312, "learning_rate": 6.526380583980306e-09, "logits/chosen": 1.1618633270263672, "logits/rejected": 1.8937296867370605, "logps/chosen": -37.26771545410156, "logps/rejected": -257.1546630859375, "loss": 3.4857, "nll_loss": 3.3879740238189697, "rewards/accuracies": 1.0, "rewards/chosen": -0.8373958468437195, "rewards/margins": 7.133222579956055, "rewards/rejected": -7.97061824798584, "step": 5327 }, { "epoch": 0.888, "grad_norm": 53.07658767700195, "learning_rate": 6.507213082815743e-09, "logits/chosen": 2.11435604095459, "logits/rejected": 2.1454336643218994, "logps/chosen": -74.7152099609375, "logps/rejected": -40.67251968383789, "loss": 1.3797, "nll_loss": 1.2881932258605957, "rewards/accuracies": 1.0, "rewards/chosen": 0.3025802671909332, "rewards/margins": 3.4376091957092285, "rewards/rejected": -3.135028839111328, "step": 5328 }, { "epoch": 0.8881666666666667, "grad_norm": 82.91447448730469, "learning_rate": 6.488072823204749e-09, "logits/chosen": 1.9346020221710205, "logits/rejected": 2.053676128387451, "logps/chosen": -58.239078521728516, "logps/rejected": -435.984130859375, "loss": 2.1629, "nll_loss": 2.008244276046753, "rewards/accuracies": 1.0, "rewards/chosen": -1.3178943395614624, "rewards/margins": 4.165394306182861, "rewards/rejected": -5.483288764953613, "step": 5329 }, { "epoch": 0.8883333333333333, "grad_norm": 23.978927612304688, "learning_rate": 6.468959810724328e-09, "logits/chosen": 2.7464098930358887, "logits/rejected": 3.244133472442627, "logps/chosen": -57.156532287597656, "logps/rejected": -400.7032165527344, "loss": 0.6167, "nll_loss": 0.6016477346420288, "rewards/accuracies": 1.0, "rewards/chosen": 1.3372926712036133, "rewards/margins": 7.853097915649414, "rewards/rejected": -6.515805244445801, "step": 5330 }, { "epoch": 0.8885, "grad_norm": 18.77119255065918, "learning_rate": 6.449874050943549e-09, "logits/chosen": 3.352071762084961, "logits/rejected": 3.2099833488464355, "logps/chosen": -74.23814392089844, "logps/rejected": -71.00200653076172, "loss": 0.7126, "nll_loss": 0.7070299983024597, "rewards/accuracies": 1.0, "rewards/chosen": 3.132603406906128, "rewards/margins": 8.620707511901855, "rewards/rejected": -5.488104343414307, "step": 5331 }, { "epoch": 0.8886666666666667, "grad_norm": 29.893564224243164, "learning_rate": 6.430815549423541e-09, "logits/chosen": 2.611360549926758, "logits/rejected": 2.5350704193115234, "logps/chosen": -125.88796997070312, "logps/rejected": -52.011077880859375, "loss": 1.1588, "nll_loss": 1.13412606716156, "rewards/accuracies": 1.0, "rewards/chosen": 1.6271896362304688, "rewards/margins": 5.743659973144531, "rewards/rejected": -4.1164703369140625, "step": 5332 }, { "epoch": 0.8888333333333334, "grad_norm": 28.914241790771484, "learning_rate": 6.411784311717505e-09, "logits/chosen": 2.830483913421631, "logits/rejected": 2.751383066177368, "logps/chosen": -81.85751342773438, "logps/rejected": -130.42379760742188, "loss": 1.0465, "nll_loss": 1.0232188701629639, "rewards/accuracies": 1.0, "rewards/chosen": 2.7399346828460693, "rewards/margins": 6.386660099029541, "rewards/rejected": -3.6467254161834717, "step": 5333 }, { "epoch": 0.889, "grad_norm": 28.59699058532715, "learning_rate": 6.392780343370685e-09, "logits/chosen": 3.290947437286377, "logits/rejected": 3.3268020153045654, "logps/chosen": -54.6001091003418, "logps/rejected": -169.36361694335938, "loss": 0.884, "nll_loss": 0.8806470036506653, "rewards/accuracies": 1.0, "rewards/chosen": 5.954680442810059, "rewards/margins": 11.3980712890625, "rewards/rejected": -5.4433913230896, "step": 5334 }, { "epoch": 0.8891666666666667, "grad_norm": 172.7418670654297, "learning_rate": 6.373803649920384e-09, "logits/chosen": 3.2030937671661377, "logits/rejected": 3.1549837589263916, "logps/chosen": -32.645263671875, "logps/rejected": -11.232135772705078, "loss": 4.8826, "nll_loss": 0.5829511284828186, "rewards/accuracies": 0.0, "rewards/chosen": 1.1225289106369019, "rewards/margins": -3.777714252471924, "rewards/rejected": 4.900243282318115, "step": 5335 }, { "epoch": 0.8893333333333333, "grad_norm": 166.150390625, "learning_rate": 6.354854236895935e-09, "logits/chosen": 2.769059896469116, "logits/rejected": 2.7122554779052734, "logps/chosen": -22.690990447998047, "logps/rejected": -52.953834533691406, "loss": 1.6827, "nll_loss": 1.3347638845443726, "rewards/accuracies": 1.0, "rewards/chosen": 0.02869853936135769, "rewards/margins": 1.4299559593200684, "rewards/rejected": -1.4012573957443237, "step": 5336 }, { "epoch": 0.8895, "grad_norm": 25.5213680267334, "learning_rate": 6.335932109818753e-09, "logits/chosen": 0.963771402835846, "logits/rejected": 1.172007441520691, "logps/chosen": -42.02088165283203, "logps/rejected": -151.1934051513672, "loss": 0.6705, "nll_loss": 0.6565762758255005, "rewards/accuracies": 1.0, "rewards/chosen": 1.331446886062622, "rewards/margins": 10.127331733703613, "rewards/rejected": -8.79588508605957, "step": 5337 }, { "epoch": 0.8896666666666667, "grad_norm": 109.05818939208984, "learning_rate": 6.3170372742023194e-09, "logits/chosen": 2.377990245819092, "logits/rejected": 2.4421753883361816, "logps/chosen": -58.01744079589844, "logps/rejected": -109.22373962402344, "loss": 1.1614, "nll_loss": 0.7947594523429871, "rewards/accuracies": 1.0, "rewards/chosen": -0.60158771276474, "rewards/margins": 1.2768990993499756, "rewards/rejected": -1.8784867525100708, "step": 5338 }, { "epoch": 0.8898333333333334, "grad_norm": 21.54134750366211, "learning_rate": 6.298169735552094e-09, "logits/chosen": 2.323791980743408, "logits/rejected": 2.4545094966888428, "logps/chosen": -146.23080444335938, "logps/rejected": -202.98406982421875, "loss": 1.0078, "nll_loss": 1.0015809535980225, "rewards/accuracies": 1.0, "rewards/chosen": 2.2278993129730225, "rewards/margins": 9.621530532836914, "rewards/rejected": -7.393631458282471, "step": 5339 }, { "epoch": 0.89, "grad_norm": 172.88565063476562, "learning_rate": 6.279329499365649e-09, "logits/chosen": 1.6632086038589478, "logits/rejected": 1.5734333992004395, "logps/chosen": -68.38117980957031, "logps/rejected": -11.826517105102539, "loss": 2.4488, "nll_loss": 0.7683278918266296, "rewards/accuracies": 0.0, "rewards/chosen": 1.7029403448104858, "rewards/margins": -0.7555509805679321, "rewards/rejected": 2.458491325378418, "step": 5340 }, { "epoch": 0.8901666666666667, "grad_norm": 25.002172470092773, "learning_rate": 6.260516571132557e-09, "logits/chosen": 2.5563735961914062, "logits/rejected": 3.112922191619873, "logps/chosen": -26.636201858520508, "logps/rejected": -639.295166015625, "loss": 0.4419, "nll_loss": 0.43665891885757446, "rewards/accuracies": 1.0, "rewards/chosen": 2.38364315032959, "rewards/margins": 10.103784561157227, "rewards/rejected": -7.720141887664795, "step": 5341 }, { "epoch": 0.8903333333333333, "grad_norm": 27.9189395904541, "learning_rate": 6.2417309563344896e-09, "logits/chosen": 1.768545389175415, "logits/rejected": 1.811815619468689, "logps/chosen": -116.33906555175781, "logps/rejected": -130.8561248779297, "loss": 1.0329, "nll_loss": 1.029549241065979, "rewards/accuracies": 1.0, "rewards/chosen": 4.169833660125732, "rewards/margins": 9.879024505615234, "rewards/rejected": -5.709190368652344, "step": 5342 }, { "epoch": 0.8905, "grad_norm": 28.24998664855957, "learning_rate": 6.22297266044508e-09, "logits/chosen": 1.7623772621154785, "logits/rejected": 2.2903645038604736, "logps/chosen": -76.44783020019531, "logps/rejected": -218.949951171875, "loss": 0.8702, "nll_loss": 0.8589643239974976, "rewards/accuracies": 1.0, "rewards/chosen": 1.7046021223068237, "rewards/margins": 7.998781204223633, "rewards/rejected": -6.2941789627075195, "step": 5343 }, { "epoch": 0.8906666666666667, "grad_norm": 132.44618225097656, "learning_rate": 6.2042416889300675e-09, "logits/chosen": 2.413367748260498, "logits/rejected": 2.287426471710205, "logps/chosen": -32.529685974121094, "logps/rejected": -10.880106925964355, "loss": 2.3606, "nll_loss": 0.5513505935668945, "rewards/accuracies": 0.0, "rewards/chosen": 1.4395751953125, "rewards/margins": -0.9892220497131348, "rewards/rejected": 2.4287972450256348, "step": 5344 }, { "epoch": 0.8908333333333334, "grad_norm": 26.89319610595703, "learning_rate": 6.185538047247207e-09, "logits/chosen": 2.937307119369507, "logits/rejected": 3.2114200592041016, "logps/chosen": -33.22371292114258, "logps/rejected": -231.32266235351562, "loss": 0.5459, "nll_loss": 0.5358663201332092, "rewards/accuracies": 1.0, "rewards/chosen": 1.8164258003234863, "rewards/margins": 8.226112365722656, "rewards/rejected": -6.409687042236328, "step": 5345 }, { "epoch": 0.891, "grad_norm": 30.949899673461914, "learning_rate": 6.166861740846297e-09, "logits/chosen": 2.4545321464538574, "logits/rejected": 2.620414972305298, "logps/chosen": -57.574092864990234, "logps/rejected": -296.70196533203125, "loss": 0.866, "nll_loss": 0.8466777801513672, "rewards/accuracies": 1.0, "rewards/chosen": 1.4764232635498047, "rewards/margins": 6.2980055809021, "rewards/rejected": -4.821582317352295, "step": 5346 }, { "epoch": 0.8911666666666667, "grad_norm": 33.06425857543945, "learning_rate": 6.1482127751691595e-09, "logits/chosen": 1.9650629758834839, "logits/rejected": 2.2660372257232666, "logps/chosen": -55.699729919433594, "logps/rejected": -165.39385986328125, "loss": 0.9123, "nll_loss": 0.8983827829360962, "rewards/accuracies": 1.0, "rewards/chosen": 2.8640129566192627, "rewards/margins": 7.126921653747559, "rewards/rejected": -4.262908935546875, "step": 5347 }, { "epoch": 0.8913333333333333, "grad_norm": 18.605314254760742, "learning_rate": 6.129591155649649e-09, "logits/chosen": 2.9854118824005127, "logits/rejected": 3.058588981628418, "logps/chosen": -159.11465454101562, "logps/rejected": -283.95361328125, "loss": 0.9864, "nll_loss": 0.9821891188621521, "rewards/accuracies": 1.0, "rewards/chosen": 2.550769090652466, "rewards/margins": 13.217684745788574, "rewards/rejected": -10.666915893554688, "step": 5348 }, { "epoch": 0.8915, "grad_norm": 43.15635299682617, "learning_rate": 6.11099688771366e-09, "logits/chosen": 2.8286292552948, "logits/rejected": 2.859341621398926, "logps/chosen": -19.85172462463379, "logps/rejected": -51.675994873046875, "loss": 0.6931, "nll_loss": 0.6403782963752747, "rewards/accuracies": 1.0, "rewards/chosen": 1.8741545677185059, "rewards/margins": 4.757237434387207, "rewards/rejected": -2.883082628250122, "step": 5349 }, { "epoch": 0.8916666666666667, "grad_norm": 32.14848327636719, "learning_rate": 6.092429976779112e-09, "logits/chosen": 1.357055902481079, "logits/rejected": 1.1886305809020996, "logps/chosen": -107.99162292480469, "logps/rejected": -50.387535095214844, "loss": 0.9617, "nll_loss": 0.8709002137184143, "rewards/accuracies": 1.0, "rewards/chosen": 1.5846366882324219, "rewards/margins": 3.876905918121338, "rewards/rejected": -2.292269229888916, "step": 5350 }, { "epoch": 0.8918333333333334, "grad_norm": 23.884014129638672, "learning_rate": 6.073890428255979e-09, "logits/chosen": 2.6692922115325928, "logits/rejected": 2.445355176925659, "logps/chosen": -118.47773742675781, "logps/rejected": -62.645626068115234, "loss": 1.1434, "nll_loss": 1.1283595561981201, "rewards/accuracies": 1.0, "rewards/chosen": 2.4257049560546875, "rewards/margins": 6.771546840667725, "rewards/rejected": -4.345841884613037, "step": 5351 }, { "epoch": 0.892, "grad_norm": 55.92215347290039, "learning_rate": 6.055378247546217e-09, "logits/chosen": 1.4984865188598633, "logits/rejected": 2.163032293319702, "logps/chosen": -21.47520637512207, "logps/rejected": -107.49295043945312, "loss": 0.5758, "nll_loss": 0.5506463050842285, "rewards/accuracies": 1.0, "rewards/chosen": 1.1957685947418213, "rewards/margins": 5.824001312255859, "rewards/rejected": -4.628232479095459, "step": 5352 }, { "epoch": 0.8921666666666667, "grad_norm": 93.77994537353516, "learning_rate": 6.036893440043833e-09, "logits/chosen": 2.7216637134552, "logits/rejected": 2.6859259605407715, "logps/chosen": -31.77576446533203, "logps/rejected": -236.7969512939453, "loss": 1.5246, "nll_loss": 1.5131317377090454, "rewards/accuracies": 1.0, "rewards/chosen": 1.5213279724121094, "rewards/margins": 12.360904693603516, "rewards/rejected": -10.839576721191406, "step": 5353 }, { "epoch": 0.8923333333333333, "grad_norm": 74.31005096435547, "learning_rate": 6.018436011134876e-09, "logits/chosen": 2.4038681983947754, "logits/rejected": 2.529416561126709, "logps/chosen": -129.26206970214844, "logps/rejected": -452.3478698730469, "loss": 1.4968, "nll_loss": 1.1439120769500732, "rewards/accuracies": 1.0, "rewards/chosen": -2.883923292160034, "rewards/margins": 7.005609512329102, "rewards/rejected": -9.889533042907715, "step": 5354 }, { "epoch": 0.8925, "grad_norm": 46.75320816040039, "learning_rate": 6.000005966197385e-09, "logits/chosen": 2.119096279144287, "logits/rejected": 1.9492684602737427, "logps/chosen": -61.48978805541992, "logps/rejected": -70.0992660522461, "loss": 1.2388, "nll_loss": 1.160184621810913, "rewards/accuracies": 1.0, "rewards/chosen": 1.0181125402450562, "rewards/margins": 3.782430648803711, "rewards/rejected": -2.7643182277679443, "step": 5355 }, { "epoch": 0.8926666666666667, "grad_norm": 25.298147201538086, "learning_rate": 5.981603310601413e-09, "logits/chosen": 2.5513415336608887, "logits/rejected": 2.667081356048584, "logps/chosen": -62.40095138549805, "logps/rejected": -82.72481536865234, "loss": 0.7458, "nll_loss": 0.717252254486084, "rewards/accuracies": 1.0, "rewards/chosen": 2.5729079246520996, "rewards/margins": 6.013182640075684, "rewards/rejected": -3.440274953842163, "step": 5356 }, { "epoch": 0.8928333333333334, "grad_norm": 32.032649993896484, "learning_rate": 5.963228049709079e-09, "logits/chosen": 2.213165283203125, "logits/rejected": 2.7179949283599854, "logps/chosen": -51.11711120605469, "logps/rejected": -136.1165008544922, "loss": 0.838, "nll_loss": 0.7987047433853149, "rewards/accuracies": 1.0, "rewards/chosen": 1.0307754278182983, "rewards/margins": 4.897504806518555, "rewards/rejected": -3.866729259490967, "step": 5357 }, { "epoch": 0.893, "grad_norm": 25.02528190612793, "learning_rate": 5.944880188874479e-09, "logits/chosen": 2.680413246154785, "logits/rejected": 2.712555408477783, "logps/chosen": -83.18698120117188, "logps/rejected": -150.8719024658203, "loss": 0.8877, "nll_loss": 0.8756523728370667, "rewards/accuracies": 1.0, "rewards/chosen": 2.0037407875061035, "rewards/margins": 7.146705627441406, "rewards/rejected": -5.142964839935303, "step": 5358 }, { "epoch": 0.8931666666666667, "grad_norm": 32.682769775390625, "learning_rate": 5.92655973344377e-09, "logits/chosen": 3.0962440967559814, "logits/rejected": 3.255483388900757, "logps/chosen": -31.284996032714844, "logps/rejected": -311.87506103515625, "loss": 0.7148, "nll_loss": 0.695222020149231, "rewards/accuracies": 1.0, "rewards/chosen": 1.2181789875030518, "rewards/margins": 6.654385566711426, "rewards/rejected": -5.436206340789795, "step": 5359 }, { "epoch": 0.8933333333333333, "grad_norm": 29.36570930480957, "learning_rate": 5.908266688755048e-09, "logits/chosen": 2.641237497329712, "logits/rejected": 2.733102560043335, "logps/chosen": -27.69864273071289, "logps/rejected": -113.54315185546875, "loss": 0.5203, "nll_loss": 0.4859410226345062, "rewards/accuracies": 1.0, "rewards/chosen": 1.9763576984405518, "rewards/margins": 5.376628875732422, "rewards/rejected": -3.400270938873291, "step": 5360 }, { "epoch": 0.8935, "grad_norm": 38.684669494628906, "learning_rate": 5.890001060138483e-09, "logits/chosen": 2.3586721420288086, "logits/rejected": 2.2353053092956543, "logps/chosen": -30.413251876831055, "logps/rejected": -51.59764099121094, "loss": 0.8415, "nll_loss": 0.7798269987106323, "rewards/accuracies": 1.0, "rewards/chosen": 1.7236404418945312, "rewards/margins": 4.460349082946777, "rewards/rejected": -2.736708879470825, "step": 5361 }, { "epoch": 0.8936666666666667, "grad_norm": 53.60724639892578, "learning_rate": 5.87176285291624e-09, "logits/chosen": 0.8548325300216675, "logits/rejected": 2.046060085296631, "logps/chosen": -10.470993995666504, "logps/rejected": -357.3374328613281, "loss": 0.4813, "nll_loss": 0.4759542644023895, "rewards/accuracies": 1.0, "rewards/chosen": 2.304779529571533, "rewards/margins": 12.582931518554688, "rewards/rejected": -10.278152465820312, "step": 5362 }, { "epoch": 0.8938333333333334, "grad_norm": 33.257904052734375, "learning_rate": 5.8535520724025236e-09, "logits/chosen": 3.1190528869628906, "logits/rejected": 3.1652204990386963, "logps/chosen": -50.99866485595703, "logps/rejected": -151.49009704589844, "loss": 0.6831, "nll_loss": 0.6623203158378601, "rewards/accuracies": 1.0, "rewards/chosen": 1.1742061376571655, "rewards/margins": 6.498123645782471, "rewards/rejected": -5.323917388916016, "step": 5363 }, { "epoch": 0.894, "grad_norm": 40.40052795410156, "learning_rate": 5.835368723903456e-09, "logits/chosen": 2.811514139175415, "logits/rejected": 2.8336362838745117, "logps/chosen": -13.790557861328125, "logps/rejected": -57.78294372558594, "loss": 0.6873, "nll_loss": 0.656693160533905, "rewards/accuracies": 1.0, "rewards/chosen": 1.247182846069336, "rewards/margins": 5.34991455078125, "rewards/rejected": -4.102731704711914, "step": 5364 }, { "epoch": 0.8941666666666667, "grad_norm": 56.201969146728516, "learning_rate": 5.817212812717276e-09, "logits/chosen": 2.232243061065674, "logits/rejected": 1.5821014642715454, "logps/chosen": -58.52440643310547, "logps/rejected": -72.56417846679688, "loss": 1.3614, "nll_loss": 1.3005423545837402, "rewards/accuracies": 1.0, "rewards/chosen": 0.5038833618164062, "rewards/margins": 4.161011695861816, "rewards/rejected": -3.6571285724639893, "step": 5365 }, { "epoch": 0.8943333333333333, "grad_norm": 27.826147079467773, "learning_rate": 5.799084344134175e-09, "logits/chosen": 2.7047014236450195, "logits/rejected": 2.8833017349243164, "logps/chosen": -38.67455291748047, "logps/rejected": -309.06573486328125, "loss": 0.5859, "nll_loss": 0.5772320628166199, "rewards/accuracies": 1.0, "rewards/chosen": 1.8889182806015015, "rewards/margins": 9.04414176940918, "rewards/rejected": -7.155223369598389, "step": 5366 }, { "epoch": 0.8945, "grad_norm": 24.3165225982666, "learning_rate": 5.780983323436373e-09, "logits/chosen": 2.852870464324951, "logits/rejected": 2.870573043823242, "logps/chosen": -68.54737854003906, "logps/rejected": -144.33396911621094, "loss": 0.7389, "nll_loss": 0.7292273640632629, "rewards/accuracies": 1.0, "rewards/chosen": 2.9487671852111816, "rewards/margins": 7.673374176025391, "rewards/rejected": -4.724606990814209, "step": 5367 }, { "epoch": 0.8946666666666667, "grad_norm": 76.38188171386719, "learning_rate": 5.762909755898026e-09, "logits/chosen": 2.326691150665283, "logits/rejected": 3.0628132820129395, "logps/chosen": -8.457206726074219, "logps/rejected": -164.8940887451172, "loss": 0.5863, "nll_loss": 0.5638138055801392, "rewards/accuracies": 1.0, "rewards/chosen": 1.8868697881698608, "rewards/margins": 5.947531223297119, "rewards/rejected": -4.060661315917969, "step": 5368 }, { "epoch": 0.8948333333333334, "grad_norm": 128.69725036621094, "learning_rate": 5.7448636467853585e-09, "logits/chosen": 1.6872256994247437, "logits/rejected": 1.9195603132247925, "logps/chosen": -21.933406829833984, "logps/rejected": -24.6744441986084, "loss": 1.7758, "nll_loss": 0.3987891674041748, "rewards/accuracies": 0.0, "rewards/chosen": 1.6799054145812988, "rewards/margins": -0.32752060890197754, "rewards/rejected": 2.0074260234832764, "step": 5369 }, { "epoch": 0.895, "grad_norm": 21.277530670166016, "learning_rate": 5.726845001356573e-09, "logits/chosen": 1.116448998451233, "logits/rejected": 1.8938117027282715, "logps/chosen": -39.11181640625, "logps/rejected": -329.67095947265625, "loss": 0.5319, "nll_loss": 0.5146290063858032, "rewards/accuracies": 1.0, "rewards/chosen": 1.5546784400939941, "rewards/margins": 6.541833877563477, "rewards/rejected": -4.987155437469482, "step": 5370 }, { "epoch": 0.8951666666666667, "grad_norm": 29.768993377685547, "learning_rate": 5.7088538248618924e-09, "logits/chosen": 2.592259407043457, "logits/rejected": 2.5925047397613525, "logps/chosen": -56.194034576416016, "logps/rejected": -90.9666748046875, "loss": 0.7563, "nll_loss": 0.7113168239593506, "rewards/accuracies": 1.0, "rewards/chosen": 1.9517736434936523, "rewards/margins": 5.011280059814453, "rewards/rejected": -3.0595061779022217, "step": 5371 }, { "epoch": 0.8953333333333333, "grad_norm": 28.97800064086914, "learning_rate": 5.690890122543468e-09, "logits/chosen": 1.426648497581482, "logits/rejected": 1.5004420280456543, "logps/chosen": -20.484407424926758, "logps/rejected": -150.35792541503906, "loss": 0.5162, "nll_loss": 0.48772403597831726, "rewards/accuracies": 1.0, "rewards/chosen": 1.7323458194732666, "rewards/margins": 5.531455993652344, "rewards/rejected": -3.7991104125976562, "step": 5372 }, { "epoch": 0.8955, "grad_norm": 34.33513641357422, "learning_rate": 5.672953899635524e-09, "logits/chosen": 2.2569122314453125, "logits/rejected": 2.4620838165283203, "logps/chosen": -34.88425064086914, "logps/rejected": -402.4924011230469, "loss": 0.6818, "nll_loss": 0.6581934094429016, "rewards/accuracies": 1.0, "rewards/chosen": 0.7666080594062805, "rewards/margins": 10.313058853149414, "rewards/rejected": -9.5464506149292, "step": 5373 }, { "epoch": 0.8956666666666667, "grad_norm": 27.400192260742188, "learning_rate": 5.655045161364247e-09, "logits/chosen": 1.2830593585968018, "logits/rejected": 2.072150945663452, "logps/chosen": -84.47080993652344, "logps/rejected": -323.7210693359375, "loss": 0.931, "nll_loss": 0.9282505512237549, "rewards/accuracies": 1.0, "rewards/chosen": 2.980281114578247, "rewards/margins": 14.243803024291992, "rewards/rejected": -11.263522148132324, "step": 5374 }, { "epoch": 0.8958333333333334, "grad_norm": 22.479867935180664, "learning_rate": 5.6371639129478064e-09, "logits/chosen": 2.871283531188965, "logits/rejected": 2.804731607437134, "logps/chosen": -58.104736328125, "logps/rejected": -100.65843200683594, "loss": 0.6845, "nll_loss": 0.6602811217308044, "rewards/accuracies": 1.0, "rewards/chosen": 1.8474899530410767, "rewards/margins": 5.817709445953369, "rewards/rejected": -3.970219612121582, "step": 5375 }, { "epoch": 0.896, "grad_norm": 24.324729919433594, "learning_rate": 5.619310159596357e-09, "logits/chosen": 2.0330357551574707, "logits/rejected": 2.679622173309326, "logps/chosen": -68.85486602783203, "logps/rejected": -179.4529266357422, "loss": 0.8781, "nll_loss": 0.8715806603431702, "rewards/accuracies": 1.0, "rewards/chosen": 2.181187391281128, "rewards/margins": 9.49658203125, "rewards/rejected": -7.315394878387451, "step": 5376 }, { "epoch": 0.8961666666666667, "grad_norm": 33.38165283203125, "learning_rate": 5.601483906512061e-09, "logits/chosen": 2.9463346004486084, "logits/rejected": 2.9128799438476562, "logps/chosen": -24.827699661254883, "logps/rejected": -123.01654052734375, "loss": 0.5848, "nll_loss": 0.5517265796661377, "rewards/accuracies": 1.0, "rewards/chosen": 0.8173018097877502, "rewards/margins": 5.440577030181885, "rewards/rejected": -4.623275279998779, "step": 5377 }, { "epoch": 0.8963333333333333, "grad_norm": 18.7092227935791, "learning_rate": 5.583685158889062e-09, "logits/chosen": 2.529754161834717, "logits/rejected": 2.7086455821990967, "logps/chosen": -6.638113498687744, "logps/rejected": -220.54197692871094, "loss": 0.1835, "nll_loss": 0.16595283150672913, "rewards/accuracies": 1.0, "rewards/chosen": 1.2494564056396484, "rewards/margins": 7.1236491203308105, "rewards/rejected": -5.874192714691162, "step": 5378 }, { "epoch": 0.8965, "grad_norm": 48.04463577270508, "learning_rate": 5.565913921913512e-09, "logits/chosen": 1.5675194263458252, "logits/rejected": 1.999685287475586, "logps/chosen": -15.206838607788086, "logps/rejected": -160.04745483398438, "loss": 0.5886, "nll_loss": 0.5848783850669861, "rewards/accuracies": 1.0, "rewards/chosen": 4.184300899505615, "rewards/margins": 9.738082885742188, "rewards/rejected": -5.553781509399414, "step": 5379 }, { "epoch": 0.8966666666666666, "grad_norm": 36.7558479309082, "learning_rate": 5.548170200763469e-09, "logits/chosen": 2.5584700107574463, "logits/rejected": 2.921555757522583, "logps/chosen": -32.92523193359375, "logps/rejected": -362.11822509765625, "loss": 0.8257, "nll_loss": 0.8231307864189148, "rewards/accuracies": 1.0, "rewards/chosen": 3.591435194015503, "rewards/margins": 10.115742683410645, "rewards/rejected": -6.5243072509765625, "step": 5380 }, { "epoch": 0.8968333333333334, "grad_norm": 32.64939498901367, "learning_rate": 5.530454000609075e-09, "logits/chosen": 1.134888768196106, "logits/rejected": 2.3736746311187744, "logps/chosen": -18.9390926361084, "logps/rejected": -216.24520874023438, "loss": 0.633, "nll_loss": 0.631303071975708, "rewards/accuracies": 1.0, "rewards/chosen": 4.713855743408203, "rewards/margins": 11.13931655883789, "rewards/rejected": -6.425461292266846, "step": 5381 }, { "epoch": 0.897, "grad_norm": 30.925596237182617, "learning_rate": 5.512765326612379e-09, "logits/chosen": 2.1344337463378906, "logits/rejected": 2.280930280685425, "logps/chosen": -29.92490005493164, "logps/rejected": -107.0490951538086, "loss": 0.7289, "nll_loss": 0.7124975919723511, "rewards/accuracies": 1.0, "rewards/chosen": 1.31232488155365, "rewards/margins": 7.303854942321777, "rewards/rejected": -5.991529941558838, "step": 5382 }, { "epoch": 0.8971666666666667, "grad_norm": 30.126020431518555, "learning_rate": 5.495104183927457e-09, "logits/chosen": 1.2559680938720703, "logits/rejected": 1.9906184673309326, "logps/chosen": -52.079254150390625, "logps/rejected": -197.0643310546875, "loss": 0.7155, "nll_loss": 0.7037737965583801, "rewards/accuracies": 1.0, "rewards/chosen": 1.662756323814392, "rewards/margins": 7.861221790313721, "rewards/rejected": -6.198465347290039, "step": 5383 }, { "epoch": 0.8973333333333333, "grad_norm": 10.08424186706543, "learning_rate": 5.477470577700327e-09, "logits/chosen": 1.1132655143737793, "logits/rejected": 1.4067527055740356, "logps/chosen": -109.96075439453125, "logps/rejected": -211.04733276367188, "loss": 0.4635, "nll_loss": 0.45626866817474365, "rewards/accuracies": 1.0, "rewards/chosen": 4.214929103851318, "rewards/margins": 8.971563339233398, "rewards/rejected": -4.756634712219238, "step": 5384 }, { "epoch": 0.8975, "grad_norm": 27.57510757446289, "learning_rate": 5.45986451306899e-09, "logits/chosen": 3.2616019248962402, "logits/rejected": 3.2548985481262207, "logps/chosen": -76.99128723144531, "logps/rejected": -122.22163391113281, "loss": 0.9047, "nll_loss": 0.8952476382255554, "rewards/accuracies": 1.0, "rewards/chosen": 2.009861707687378, "rewards/margins": 7.8984880447387695, "rewards/rejected": -5.888626575469971, "step": 5385 }, { "epoch": 0.8976666666666666, "grad_norm": 41.33115768432617, "learning_rate": 5.442285995163443e-09, "logits/chosen": 2.689305305480957, "logits/rejected": 2.8219406604766846, "logps/chosen": -8.42435073852539, "logps/rejected": -290.0323181152344, "loss": 0.3684, "nll_loss": 0.3662761151790619, "rewards/accuracies": 1.0, "rewards/chosen": 3.6680245399475098, "rewards/margins": 10.571250915527344, "rewards/rejected": -6.903225898742676, "step": 5386 }, { "epoch": 0.8978333333333334, "grad_norm": 30.608301162719727, "learning_rate": 5.424735029105654e-09, "logits/chosen": 2.014192819595337, "logits/rejected": 1.735381007194519, "logps/chosen": -74.09735107421875, "logps/rejected": -114.84956359863281, "loss": 0.829, "nll_loss": 0.7560954689979553, "rewards/accuracies": 1.0, "rewards/chosen": 3.353634834289551, "rewards/margins": 5.571757793426514, "rewards/rejected": -2.218122959136963, "step": 5387 }, { "epoch": 0.898, "grad_norm": 37.71918487548828, "learning_rate": 5.407211620009544e-09, "logits/chosen": 2.4000766277313232, "logits/rejected": 2.629051446914673, "logps/chosen": -61.7703857421875, "logps/rejected": -101.12821960449219, "loss": 1.0887, "nll_loss": 1.0650066137313843, "rewards/accuracies": 1.0, "rewards/chosen": 1.1708351373672485, "rewards/margins": 6.018444061279297, "rewards/rejected": -4.847609043121338, "step": 5388 }, { "epoch": 0.8981666666666667, "grad_norm": 25.509061813354492, "learning_rate": 5.389715772981007e-09, "logits/chosen": 2.8109991550445557, "logits/rejected": 2.9212043285369873, "logps/chosen": -67.45819854736328, "logps/rejected": -225.49636840820312, "loss": 0.7931, "nll_loss": 0.7753815650939941, "rewards/accuracies": 1.0, "rewards/chosen": 1.0748337507247925, "rewards/margins": 10.037776947021484, "rewards/rejected": -8.962943077087402, "step": 5389 }, { "epoch": 0.8983333333333333, "grad_norm": 138.65614318847656, "learning_rate": 5.3722474931179205e-09, "logits/chosen": 3.1044366359710693, "logits/rejected": 3.2721614837646484, "logps/chosen": -34.340599060058594, "logps/rejected": -49.88060760498047, "loss": 2.1372, "nll_loss": 1.560936450958252, "rewards/accuracies": 1.0, "rewards/chosen": 1.5174095630645752, "rewards/margins": 1.2222683429718018, "rewards/rejected": 0.29514122009277344, "step": 5390 }, { "epoch": 0.8985, "grad_norm": 26.621654510498047, "learning_rate": 5.354806785510113e-09, "logits/chosen": 2.3064751625061035, "logits/rejected": 2.581493854522705, "logps/chosen": -88.11216735839844, "logps/rejected": -552.2744750976562, "loss": 0.9339, "nll_loss": 0.9274965524673462, "rewards/accuracies": 1.0, "rewards/chosen": 2.1073555946350098, "rewards/margins": 14.410144805908203, "rewards/rejected": -12.302789688110352, "step": 5391 }, { "epoch": 0.8986666666666666, "grad_norm": 34.075199127197266, "learning_rate": 5.337393655239408e-09, "logits/chosen": 2.4454541206359863, "logits/rejected": 2.3999409675598145, "logps/chosen": -201.36935424804688, "logps/rejected": -296.28216552734375, "loss": 0.866, "nll_loss": 0.8087121844291687, "rewards/accuracies": 1.0, "rewards/chosen": 0.05630188062787056, "rewards/margins": 4.869894504547119, "rewards/rejected": -4.813592433929443, "step": 5392 }, { "epoch": 0.8988333333333334, "grad_norm": 39.05912780761719, "learning_rate": 5.320008107379548e-09, "logits/chosen": 1.9751012325286865, "logits/rejected": 2.0082602500915527, "logps/chosen": -55.5213508605957, "logps/rejected": -62.061859130859375, "loss": 1.0007, "nll_loss": 0.9572647213935852, "rewards/accuracies": 1.0, "rewards/chosen": 0.9717518091201782, "rewards/margins": 4.711202621459961, "rewards/rejected": -3.7394509315490723, "step": 5393 }, { "epoch": 0.899, "grad_norm": 347.89691162109375, "learning_rate": 5.3026501469962794e-09, "logits/chosen": 4.211137294769287, "logits/rejected": 4.054203987121582, "logps/chosen": -211.45816040039062, "logps/rejected": -92.71653747558594, "loss": 3.6308, "nll_loss": 1.201466679573059, "rewards/accuracies": 0.0, "rewards/chosen": 0.34544986486434937, "rewards/margins": -1.9509413242340088, "rewards/rejected": 2.296391248703003, "step": 5394 }, { "epoch": 0.8991666666666667, "grad_norm": 21.03601837158203, "learning_rate": 5.285319779147279e-09, "logits/chosen": 2.1111295223236084, "logits/rejected": 2.421206474304199, "logps/chosen": -45.45918273925781, "logps/rejected": -133.3828582763672, "loss": 0.5674, "nll_loss": 0.5612244606018066, "rewards/accuracies": 1.0, "rewards/chosen": 2.1976051330566406, "rewards/margins": 10.237723350524902, "rewards/rejected": -8.040118217468262, "step": 5395 }, { "epoch": 0.8993333333333333, "grad_norm": 24.560651779174805, "learning_rate": 5.2680170088822415e-09, "logits/chosen": 2.8578436374664307, "logits/rejected": 2.8091177940368652, "logps/chosen": -47.22334671020508, "logps/rejected": -125.783447265625, "loss": 0.7265, "nll_loss": 0.7155051827430725, "rewards/accuracies": 1.0, "rewards/chosen": 2.041074752807617, "rewards/margins": 7.361499309539795, "rewards/rejected": -5.320424556732178, "step": 5396 }, { "epoch": 0.8995, "grad_norm": 42.84656524658203, "learning_rate": 5.250741841242734e-09, "logits/chosen": 2.577831506729126, "logits/rejected": 2.431187629699707, "logps/chosen": -5.73360538482666, "logps/rejected": -37.08344268798828, "loss": 0.3388, "nll_loss": 0.2606184482574463, "rewards/accuracies": 1.0, "rewards/chosen": 1.4192712306976318, "rewards/margins": 3.9757227897644043, "rewards/rejected": -2.5564515590667725, "step": 5397 }, { "epoch": 0.8996666666666666, "grad_norm": 13.93200397491455, "learning_rate": 5.233494281262341e-09, "logits/chosen": 1.2986092567443848, "logits/rejected": 1.5946547985076904, "logps/chosen": -154.4708709716797, "logps/rejected": -383.0838928222656, "loss": 0.7079, "nll_loss": 0.7053464651107788, "rewards/accuracies": 1.0, "rewards/chosen": 3.034682035446167, "rewards/margins": 13.437875747680664, "rewards/rejected": -10.403193473815918, "step": 5398 }, { "epoch": 0.8998333333333334, "grad_norm": 147.5499725341797, "learning_rate": 5.216274333966586e-09, "logits/chosen": 3.436912775039673, "logits/rejected": 3.3686585426330566, "logps/chosen": -44.70256805419922, "logps/rejected": -24.80648422241211, "loss": 1.6465, "nll_loss": 0.8278254270553589, "rewards/accuracies": 1.0, "rewards/chosen": 0.31178590655326843, "rewards/margins": 0.2466592937707901, "rewards/rejected": 0.06512661278247833, "step": 5399 }, { "epoch": 0.9, "grad_norm": 32.12205505371094, "learning_rate": 5.199082004372957e-09, "logits/chosen": 2.5720012187957764, "logits/rejected": 2.5414042472839355, "logps/chosen": -44.117889404296875, "logps/rejected": -40.655799865722656, "loss": 0.7788, "nll_loss": 0.7115788459777832, "rewards/accuracies": 1.0, "rewards/chosen": 0.7752792835235596, "rewards/margins": 3.9559879302978516, "rewards/rejected": -3.180708646774292, "step": 5400 }, { "epoch": 0.9001666666666667, "grad_norm": 50.322479248046875, "learning_rate": 5.181917297490879e-09, "logits/chosen": 1.8824691772460938, "logits/rejected": 2.289285182952881, "logps/chosen": -4.222103595733643, "logps/rejected": -97.55715942382812, "loss": 0.3134, "nll_loss": 0.3015788197517395, "rewards/accuracies": 1.0, "rewards/chosen": 1.7132781744003296, "rewards/margins": 7.65432596206665, "rewards/rejected": -5.941047668457031, "step": 5401 }, { "epoch": 0.9003333333333333, "grad_norm": 115.1636734008789, "learning_rate": 5.164780218321729e-09, "logits/chosen": 1.969150424003601, "logits/rejected": 1.9625815153121948, "logps/chosen": -128.98184204101562, "logps/rejected": -74.38746643066406, "loss": 1.9439, "nll_loss": 1.4492340087890625, "rewards/accuracies": 1.0, "rewards/chosen": -3.1914005279541016, "rewards/margins": 2.158475875854492, "rewards/rejected": -5.349876403808594, "step": 5402 }, { "epoch": 0.9005, "grad_norm": 52.989864349365234, "learning_rate": 5.1476707718588475e-09, "logits/chosen": 0.8130334615707397, "logits/rejected": 1.967426061630249, "logps/chosen": -10.235921859741211, "logps/rejected": -354.39239501953125, "loss": 0.4705, "nll_loss": 0.4652691185474396, "rewards/accuracies": 1.0, "rewards/chosen": 2.328286647796631, "rewards/margins": 12.311935424804688, "rewards/rejected": -9.983649253845215, "step": 5403 }, { "epoch": 0.9006666666666666, "grad_norm": 73.14583587646484, "learning_rate": 5.1305889630875344e-09, "logits/chosen": 1.7483797073364258, "logits/rejected": 2.475992202758789, "logps/chosen": -52.86479187011719, "logps/rejected": -71.88243103027344, "loss": 1.9042, "nll_loss": 1.8880282640457153, "rewards/accuracies": 1.0, "rewards/chosen": 3.13897705078125, "rewards/margins": 7.135358810424805, "rewards/rejected": -3.9963815212249756, "step": 5404 }, { "epoch": 0.9008333333333334, "grad_norm": 30.89788055419922, "learning_rate": 5.113534796984975e-09, "logits/chosen": 2.008849620819092, "logits/rejected": 2.378368616104126, "logps/chosen": -80.21121215820312, "logps/rejected": -167.82687377929688, "loss": 0.9498, "nll_loss": 0.9326885342597961, "rewards/accuracies": 1.0, "rewards/chosen": 1.944284200668335, "rewards/margins": 6.419349670410156, "rewards/rejected": -4.4750657081604, "step": 5405 }, { "epoch": 0.901, "grad_norm": 14.141751289367676, "learning_rate": 5.096508278520384e-09, "logits/chosen": 1.36674165725708, "logits/rejected": 0.9605479836463928, "logps/chosen": -163.76055908203125, "logps/rejected": -81.73916625976562, "loss": 0.5825, "nll_loss": 0.5686129927635193, "rewards/accuracies": 1.0, "rewards/chosen": 2.0769989490509033, "rewards/margins": 6.825156211853027, "rewards/rejected": -4.748157501220703, "step": 5406 }, { "epoch": 0.9011666666666667, "grad_norm": 27.539316177368164, "learning_rate": 5.079509412654859e-09, "logits/chosen": 2.8851256370544434, "logits/rejected": 2.8971927165985107, "logps/chosen": -62.055660247802734, "logps/rejected": -67.56405639648438, "loss": 0.9283, "nll_loss": 0.9125832915306091, "rewards/accuracies": 1.0, "rewards/chosen": 1.595306158065796, "rewards/margins": 6.771177291870117, "rewards/rejected": -5.1758713722229, "step": 5407 }, { "epoch": 0.9013333333333333, "grad_norm": 44.95806884765625, "learning_rate": 5.062538204341471e-09, "logits/chosen": 1.5268855094909668, "logits/rejected": 2.652980089187622, "logps/chosen": -101.83155822753906, "logps/rejected": -290.3134765625, "loss": 1.1022, "nll_loss": 1.0833144187927246, "rewards/accuracies": 1.0, "rewards/chosen": 1.023718237876892, "rewards/margins": 8.705591201782227, "rewards/rejected": -7.681872844696045, "step": 5408 }, { "epoch": 0.9015, "grad_norm": 26.810026168823242, "learning_rate": 5.045594658525232e-09, "logits/chosen": 1.712009310722351, "logits/rejected": 1.3960847854614258, "logps/chosen": -53.893638610839844, "logps/rejected": -54.0782356262207, "loss": 0.7492, "nll_loss": 0.7185817956924438, "rewards/accuracies": 1.0, "rewards/chosen": 1.6462936401367188, "rewards/margins": 5.391695499420166, "rewards/rejected": -3.7454018592834473, "step": 5409 }, { "epoch": 0.9016666666666666, "grad_norm": 47.778282165527344, "learning_rate": 5.028678780143059e-09, "logits/chosen": 2.630558490753174, "logits/rejected": 2.7926909923553467, "logps/chosen": -24.444686889648438, "logps/rejected": -197.30482482910156, "loss": 0.8423, "nll_loss": 0.8148229122161865, "rewards/accuracies": 1.0, "rewards/chosen": 0.786459743976593, "rewards/margins": 6.302356719970703, "rewards/rejected": -5.515896797180176, "step": 5410 }, { "epoch": 0.9018333333333334, "grad_norm": 28.300678253173828, "learning_rate": 5.011790574123842e-09, "logits/chosen": 2.721801996231079, "logits/rejected": 2.7022366523742676, "logps/chosen": -100.34274291992188, "logps/rejected": -67.5682601928711, "loss": 1.0186, "nll_loss": 1.003427267074585, "rewards/accuracies": 1.0, "rewards/chosen": 1.5185409784317017, "rewards/margins": 7.0174384117126465, "rewards/rejected": -5.498897552490234, "step": 5411 }, { "epoch": 0.902, "grad_norm": 29.812467575073242, "learning_rate": 4.994930045388412e-09, "logits/chosen": 3.0346407890319824, "logits/rejected": 3.1838314533233643, "logps/chosen": -41.08760452270508, "logps/rejected": -109.5204849243164, "loss": 0.7572, "nll_loss": 0.7337071299552917, "rewards/accuracies": 1.0, "rewards/chosen": 1.0398712158203125, "rewards/margins": 6.287709712982178, "rewards/rejected": -5.247838497161865, "step": 5412 }, { "epoch": 0.9021666666666667, "grad_norm": 36.1980094909668, "learning_rate": 4.978097198849518e-09, "logits/chosen": 2.5762906074523926, "logits/rejected": 2.8723740577697754, "logps/chosen": -36.01885986328125, "logps/rejected": -299.1517028808594, "loss": 0.9177, "nll_loss": 0.9004713892936707, "rewards/accuracies": 1.0, "rewards/chosen": 1.2377350330352783, "rewards/margins": 7.326746940612793, "rewards/rejected": -6.089012145996094, "step": 5413 }, { "epoch": 0.9023333333333333, "grad_norm": 27.686443328857422, "learning_rate": 4.961292039411835e-09, "logits/chosen": 2.2267448902130127, "logits/rejected": 2.770423173904419, "logps/chosen": -63.98579406738281, "logps/rejected": -360.076416015625, "loss": 0.8601, "nll_loss": 0.8531440496444702, "rewards/accuracies": 1.0, "rewards/chosen": 2.1213128566741943, "rewards/margins": 9.367437362670898, "rewards/rejected": -7.246124267578125, "step": 5414 }, { "epoch": 0.9025, "grad_norm": 28.26698875427246, "learning_rate": 4.944514571971981e-09, "logits/chosen": 2.6557559967041016, "logits/rejected": 2.839904308319092, "logps/chosen": -164.6370086669922, "logps/rejected": -289.2290344238281, "loss": 1.2757, "nll_loss": 1.266438364982605, "rewards/accuracies": 1.0, "rewards/chosen": 1.7476211786270142, "rewards/margins": 10.670122146606445, "rewards/rejected": -8.922500610351562, "step": 5415 }, { "epoch": 0.9026666666666666, "grad_norm": 30.996959686279297, "learning_rate": 4.927764801418521e-09, "logits/chosen": 2.6001017093658447, "logits/rejected": 2.559873342514038, "logps/chosen": -46.629154205322266, "logps/rejected": -142.848388671875, "loss": 0.7236, "nll_loss": 0.6959574818611145, "rewards/accuracies": 1.0, "rewards/chosen": 0.7587658762931824, "rewards/margins": 6.395176410675049, "rewards/rejected": -5.636410713195801, "step": 5416 }, { "epoch": 0.9028333333333334, "grad_norm": 90.06546783447266, "learning_rate": 4.911042732631954e-09, "logits/chosen": 2.267220973968506, "logits/rejected": 2.61281681060791, "logps/chosen": -18.96894645690918, "logps/rejected": -103.34923553466797, "loss": 0.4579, "nll_loss": 0.43111246824264526, "rewards/accuracies": 1.0, "rewards/chosen": 1.4941977262496948, "rewards/margins": 5.582174301147461, "rewards/rejected": -4.087976455688477, "step": 5417 }, { "epoch": 0.903, "grad_norm": 200.53292846679688, "learning_rate": 4.8943483704846465e-09, "logits/chosen": 2.9193315505981445, "logits/rejected": 3.0504908561706543, "logps/chosen": -33.94341278076172, "logps/rejected": -34.69733428955078, "loss": 3.0249, "nll_loss": 0.6788682341575623, "rewards/accuracies": 0.0, "rewards/chosen": 1.0811131000518799, "rewards/margins": -1.7131564617156982, "rewards/rejected": 2.794269561767578, "step": 5418 }, { "epoch": 0.9031666666666667, "grad_norm": 29.43828773498535, "learning_rate": 4.877681719840954e-09, "logits/chosen": 2.4305167198181152, "logits/rejected": 2.488645315170288, "logps/chosen": -14.258415222167969, "logps/rejected": -232.97007751464844, "loss": 0.3265, "nll_loss": 0.3099655210971832, "rewards/accuracies": 1.0, "rewards/chosen": 1.2538275718688965, "rewards/margins": 7.522571563720703, "rewards/rejected": -6.268743991851807, "step": 5419 }, { "epoch": 0.9033333333333333, "grad_norm": 35.34595489501953, "learning_rate": 4.861042785557145e-09, "logits/chosen": 1.5669856071472168, "logits/rejected": 2.2512118816375732, "logps/chosen": -25.402631759643555, "logps/rejected": -192.78665161132812, "loss": 0.6625, "nll_loss": 0.6513495445251465, "rewards/accuracies": 1.0, "rewards/chosen": 1.7628008127212524, "rewards/margins": 7.762640476226807, "rewards/rejected": -5.999839782714844, "step": 5420 }, { "epoch": 0.9035, "grad_norm": 533.1978149414062, "learning_rate": 4.844431572481411e-09, "logits/chosen": 3.847404956817627, "logits/rejected": 3.9690439701080322, "logps/chosen": -166.27127075195312, "logps/rejected": -30.055143356323242, "loss": 7.8058, "nll_loss": 1.933387279510498, "rewards/accuracies": 0.0, "rewards/chosen": -4.097448825836182, "rewards/margins": -5.849034309387207, "rewards/rejected": 1.7515852451324463, "step": 5421 }, { "epoch": 0.9036666666666666, "grad_norm": 125.81304931640625, "learning_rate": 4.8278480854538336e-09, "logits/chosen": 2.1099205017089844, "logits/rejected": 2.046116590499878, "logps/chosen": -50.25648498535156, "logps/rejected": -43.966495513916016, "loss": 1.4958, "nll_loss": 0.6526817083358765, "rewards/accuracies": 1.0, "rewards/chosen": 0.9310245513916016, "rewards/margins": 0.37217938899993896, "rewards/rejected": 0.5588451623916626, "step": 5422 }, { "epoch": 0.9038333333333334, "grad_norm": 23.242656707763672, "learning_rate": 4.81129232930646e-09, "logits/chosen": 2.294201374053955, "logits/rejected": 2.3569960594177246, "logps/chosen": -33.994686126708984, "logps/rejected": -163.29132080078125, "loss": 0.5617, "nll_loss": 0.5395982265472412, "rewards/accuracies": 1.0, "rewards/chosen": 2.27900767326355, "rewards/margins": 6.14680290222168, "rewards/rejected": -3.867794990539551, "step": 5423 }, { "epoch": 0.904, "grad_norm": 48.54518127441406, "learning_rate": 4.794764308863241e-09, "logits/chosen": 2.813433885574341, "logits/rejected": 2.8750452995300293, "logps/chosen": -24.914894104003906, "logps/rejected": -139.76547241210938, "loss": 0.7819, "nll_loss": 0.7785905003547668, "rewards/accuracies": 1.0, "rewards/chosen": 3.3748843669891357, "rewards/margins": 9.59683609008789, "rewards/rejected": -6.221951961517334, "step": 5424 }, { "epoch": 0.9041666666666667, "grad_norm": 29.902570724487305, "learning_rate": 4.77826402894006e-09, "logits/chosen": 1.7868530750274658, "logits/rejected": 1.6048048734664917, "logps/chosen": -29.203596115112305, "logps/rejected": -71.63238525390625, "loss": 0.609, "nll_loss": 0.5959917306900024, "rewards/accuracies": 1.0, "rewards/chosen": 1.5004198551177979, "rewards/margins": 7.976384162902832, "rewards/rejected": -6.475964546203613, "step": 5425 }, { "epoch": 0.9043333333333333, "grad_norm": 36.542823791503906, "learning_rate": 4.761791494344669e-09, "logits/chosen": 1.6980692148208618, "logits/rejected": 1.9744271039962769, "logps/chosen": -57.558685302734375, "logps/rejected": -143.86456298828125, "loss": 1.008, "nll_loss": 0.9923912286758423, "rewards/accuracies": 1.0, "rewards/chosen": 1.5424919128417969, "rewards/margins": 6.851478576660156, "rewards/rejected": -5.308986663818359, "step": 5426 }, { "epoch": 0.9045, "grad_norm": 44.33503341674805, "learning_rate": 4.745346709876785e-09, "logits/chosen": 2.688217878341675, "logits/rejected": 2.986091375350952, "logps/chosen": -35.660423278808594, "logps/rejected": -294.8974914550781, "loss": 0.9002, "nll_loss": 0.8697662353515625, "rewards/accuracies": 1.0, "rewards/chosen": 0.4900684654712677, "rewards/margins": 10.684246063232422, "rewards/rejected": -10.194177627563477, "step": 5427 }, { "epoch": 0.9046666666666666, "grad_norm": 22.61424446105957, "learning_rate": 4.728929680328031e-09, "logits/chosen": 1.0842238664627075, "logits/rejected": 2.320017099380493, "logps/chosen": -120.01498413085938, "logps/rejected": -246.03408813476562, "loss": 0.8479, "nll_loss": 0.8392657041549683, "rewards/accuracies": 1.0, "rewards/chosen": 1.8200181722640991, "rewards/margins": 10.524673461914062, "rewards/rejected": -8.704655647277832, "step": 5428 }, { "epoch": 0.9048333333333334, "grad_norm": 42.50450897216797, "learning_rate": 4.712540410481924e-09, "logits/chosen": 2.7487006187438965, "logits/rejected": 2.777601957321167, "logps/chosen": -103.09739685058594, "logps/rejected": -73.38951110839844, "loss": 1.0141, "nll_loss": 0.8520445227622986, "rewards/accuracies": 1.0, "rewards/chosen": 1.323689341545105, "rewards/margins": 2.9718544483184814, "rewards/rejected": -1.6481651067733765, "step": 5429 }, { "epoch": 0.905, "grad_norm": 32.62117385864258, "learning_rate": 4.696178905113912e-09, "logits/chosen": 2.024303436279297, "logits/rejected": 1.9901093244552612, "logps/chosen": -85.75349426269531, "logps/rejected": -133.2316436767578, "loss": 1.0693, "nll_loss": 1.0586851835250854, "rewards/accuracies": 1.0, "rewards/chosen": 2.1460769176483154, "rewards/margins": 7.380986213684082, "rewards/rejected": -5.2349090576171875, "step": 5430 }, { "epoch": 0.9051666666666667, "grad_norm": 29.611953735351562, "learning_rate": 4.679845168991336e-09, "logits/chosen": 2.178892135620117, "logits/rejected": 2.226536989212036, "logps/chosen": -8.10748291015625, "logps/rejected": -156.57012939453125, "loss": 0.3159, "nll_loss": 0.311826229095459, "rewards/accuracies": 1.0, "rewards/chosen": 2.622490406036377, "rewards/margins": 10.702468872070312, "rewards/rejected": -8.079977989196777, "step": 5431 }, { "epoch": 0.9053333333333333, "grad_norm": 219.72552490234375, "learning_rate": 4.663539206873468e-09, "logits/chosen": 2.4770491123199463, "logits/rejected": 2.4316909313201904, "logps/chosen": -79.024658203125, "logps/rejected": -75.4490737915039, "loss": 2.6616, "nll_loss": 2.0795958042144775, "rewards/accuracies": 1.0, "rewards/chosen": -2.5678608417510986, "rewards/margins": 0.9536774158477783, "rewards/rejected": -3.521538257598877, "step": 5432 }, { "epoch": 0.9055, "grad_norm": 33.58903503417969, "learning_rate": 4.647261023511451e-09, "logits/chosen": 2.8258039951324463, "logits/rejected": 2.7921745777130127, "logps/chosen": -28.293018341064453, "logps/rejected": -91.43028259277344, "loss": 0.613, "nll_loss": 0.5894379019737244, "rewards/accuracies": 1.0, "rewards/chosen": 0.8330364227294922, "rewards/margins": 7.385315418243408, "rewards/rejected": -6.552278995513916, "step": 5433 }, { "epoch": 0.9056666666666666, "grad_norm": 24.403812408447266, "learning_rate": 4.63101062364839e-09, "logits/chosen": 1.6500290632247925, "logits/rejected": 2.380699872970581, "logps/chosen": -50.05361557006836, "logps/rejected": -169.94696044921875, "loss": 0.697, "nll_loss": 0.6856659650802612, "rewards/accuracies": 1.0, "rewards/chosen": 1.5501736402511597, "rewards/margins": 9.592369079589844, "rewards/rejected": -8.042195320129395, "step": 5434 }, { "epoch": 0.9058333333333334, "grad_norm": 27.876625061035156, "learning_rate": 4.614788012019233e-09, "logits/chosen": 2.7462823390960693, "logits/rejected": 2.7829957008361816, "logps/chosen": -79.9970474243164, "logps/rejected": -199.423095703125, "loss": 1.0321, "nll_loss": 1.0126208066940308, "rewards/accuracies": 1.0, "rewards/chosen": 1.0010970830917358, "rewards/margins": 8.261941909790039, "rewards/rejected": -7.260844707489014, "step": 5435 }, { "epoch": 0.906, "grad_norm": 24.482351303100586, "learning_rate": 4.598593193350875e-09, "logits/chosen": 1.5490281581878662, "logits/rejected": 2.0701661109924316, "logps/chosen": -48.56982421875, "logps/rejected": -345.3076171875, "loss": 0.7061, "nll_loss": 0.6938544511795044, "rewards/accuracies": 1.0, "rewards/chosen": 1.4509724378585815, "rewards/margins": 10.863887786865234, "rewards/rejected": -9.412915229797363, "step": 5436 }, { "epoch": 0.9061666666666667, "grad_norm": 25.05552101135254, "learning_rate": 4.5824261723620746e-09, "logits/chosen": 2.5166163444519043, "logits/rejected": 2.7318592071533203, "logps/chosen": -45.19404983520508, "logps/rejected": -315.3446044921875, "loss": 0.531, "nll_loss": 0.5255122780799866, "rewards/accuracies": 1.0, "rewards/chosen": 2.261852741241455, "rewards/margins": 14.734947204589844, "rewards/rejected": -12.473094940185547, "step": 5437 }, { "epoch": 0.9063333333333333, "grad_norm": 37.80914306640625, "learning_rate": 4.566286953763554e-09, "logits/chosen": 2.0716960430145264, "logits/rejected": 1.6894580125808716, "logps/chosen": -64.40275573730469, "logps/rejected": -59.12853240966797, "loss": 1.062, "nll_loss": 1.0387542247772217, "rewards/accuracies": 1.0, "rewards/chosen": 1.3576005697250366, "rewards/margins": 5.9090681076049805, "rewards/rejected": -4.551467418670654, "step": 5438 }, { "epoch": 0.9065, "grad_norm": 35.433589935302734, "learning_rate": 4.5501755422578615e-09, "logits/chosen": 2.1293118000030518, "logits/rejected": 2.2269344329833984, "logps/chosen": -231.65676879882812, "logps/rejected": -354.48846435546875, "loss": 1.1516, "nll_loss": 1.1191149950027466, "rewards/accuracies": 1.0, "rewards/chosen": 0.4772522449493408, "rewards/margins": 6.988855361938477, "rewards/rejected": -6.511602878570557, "step": 5439 }, { "epoch": 0.9066666666666666, "grad_norm": 28.271493911743164, "learning_rate": 4.534091942539475e-09, "logits/chosen": 2.25852370262146, "logits/rejected": 2.5279107093811035, "logps/chosen": -49.94622802734375, "logps/rejected": -242.58010864257812, "loss": 0.7842, "nll_loss": 0.7684035301208496, "rewards/accuracies": 1.0, "rewards/chosen": 1.382156491279602, "rewards/margins": 7.190385341644287, "rewards/rejected": -5.808228969573975, "step": 5440 }, { "epoch": 0.9068333333333334, "grad_norm": 115.96089172363281, "learning_rate": 4.518036159294791e-09, "logits/chosen": 1.7214748859405518, "logits/rejected": 3.201643228530884, "logps/chosen": -26.791210174560547, "logps/rejected": -391.4177551269531, "loss": 1.8148, "nll_loss": 1.7860805988311768, "rewards/accuracies": 1.0, "rewards/chosen": 0.645734429359436, "rewards/margins": 6.77715539932251, "rewards/rejected": -6.131421089172363, "step": 5441 }, { "epoch": 0.907, "grad_norm": 29.07805824279785, "learning_rate": 4.502008197202067e-09, "logits/chosen": 3.008967399597168, "logits/rejected": 3.097717046737671, "logps/chosen": -89.12408447265625, "logps/rejected": -205.45565795898438, "loss": 0.9451, "nll_loss": 0.9381483197212219, "rewards/accuracies": 1.0, "rewards/chosen": 2.5331637859344482, "rewards/margins": 8.210271835327148, "rewards/rejected": -5.677107810974121, "step": 5442 }, { "epoch": 0.9071666666666667, "grad_norm": 28.901870727539062, "learning_rate": 4.486008060931468e-09, "logits/chosen": 1.6583888530731201, "logits/rejected": 2.099445104598999, "logps/chosen": -85.67510223388672, "logps/rejected": -187.89031982421875, "loss": 0.8799, "nll_loss": 0.8567508459091187, "rewards/accuracies": 1.0, "rewards/chosen": 1.7930641174316406, "rewards/margins": 5.878458499908447, "rewards/rejected": -4.085394382476807, "step": 5443 }, { "epoch": 0.9073333333333333, "grad_norm": 18.469867706298828, "learning_rate": 4.4700357551450406e-09, "logits/chosen": 2.24759578704834, "logits/rejected": 2.2877731323242188, "logps/chosen": -59.308082580566406, "logps/rejected": -160.47671508789062, "loss": 0.553, "nll_loss": 0.544110894203186, "rewards/accuracies": 1.0, "rewards/chosen": 1.9037818908691406, "rewards/margins": 8.538724899291992, "rewards/rejected": -6.634943008422852, "step": 5444 }, { "epoch": 0.9075, "grad_norm": 30.674619674682617, "learning_rate": 4.45409128449673e-09, "logits/chosen": 2.176076650619507, "logits/rejected": 2.215383291244507, "logps/chosen": -35.789459228515625, "logps/rejected": -83.996826171875, "loss": 0.6766, "nll_loss": 0.6507174968719482, "rewards/accuracies": 1.0, "rewards/chosen": 0.7044960260391235, "rewards/margins": 7.596146106719971, "rewards/rejected": -6.891650199890137, "step": 5445 }, { "epoch": 0.9076666666666666, "grad_norm": 30.265270233154297, "learning_rate": 4.438174653632409e-09, "logits/chosen": 2.6155078411102295, "logits/rejected": 2.845266103744507, "logps/chosen": -20.725215911865234, "logps/rejected": -327.94085693359375, "loss": 0.576, "nll_loss": 0.5757004618644714, "rewards/accuracies": 1.0, "rewards/chosen": 5.239424705505371, "rewards/margins": 16.178489685058594, "rewards/rejected": -10.939064025878906, "step": 5446 }, { "epoch": 0.9078333333333334, "grad_norm": 45.550140380859375, "learning_rate": 4.422285867189757e-09, "logits/chosen": 2.547783136367798, "logits/rejected": 2.3665101528167725, "logps/chosen": -49.26218032836914, "logps/rejected": -77.92086029052734, "loss": 1.0715, "nll_loss": 1.0481314659118652, "rewards/accuracies": 1.0, "rewards/chosen": 0.8199543356895447, "rewards/margins": 7.685381889343262, "rewards/rejected": -6.865427494049072, "step": 5447 }, { "epoch": 0.908, "grad_norm": 55.17985153198242, "learning_rate": 4.406424929798402e-09, "logits/chosen": 2.557440757751465, "logits/rejected": 2.834491491317749, "logps/chosen": -120.1490249633789, "logps/rejected": -319.50555419921875, "loss": 1.2506, "nll_loss": 1.1334812641143799, "rewards/accuracies": 1.0, "rewards/chosen": -1.0324363708496094, "rewards/margins": 5.269495487213135, "rewards/rejected": -6.301931858062744, "step": 5448 }, { "epoch": 0.9081666666666667, "grad_norm": 11.082884788513184, "learning_rate": 4.390591846079839e-09, "logits/chosen": 1.2981886863708496, "logits/rejected": 0.5258915424346924, "logps/chosen": -134.99069213867188, "logps/rejected": -132.2030029296875, "loss": 0.5499, "nll_loss": 0.5487426519393921, "rewards/accuracies": 1.0, "rewards/chosen": 4.808769226074219, "rewards/margins": 11.795129776000977, "rewards/rejected": -6.9863600730896, "step": 5449 }, { "epoch": 0.9083333333333333, "grad_norm": 28.555774688720703, "learning_rate": 4.3747866206474415e-09, "logits/chosen": 1.8390254974365234, "logits/rejected": 1.9368289709091187, "logps/chosen": -86.94322967529297, "logps/rejected": -123.1124267578125, "loss": 1.0734, "nll_loss": 1.0602833032608032, "rewards/accuracies": 1.0, "rewards/chosen": 2.9542076587677, "rewards/margins": 7.26418399810791, "rewards/rejected": -4.309976100921631, "step": 5450 }, { "epoch": 0.9085, "grad_norm": 21.115015029907227, "learning_rate": 4.359009258106505e-09, "logits/chosen": 2.9302151203155518, "logits/rejected": 2.997967004776001, "logps/chosen": -134.52220153808594, "logps/rejected": -72.6151123046875, "loss": 0.9292, "nll_loss": 0.9151167869567871, "rewards/accuracies": 1.0, "rewards/chosen": 1.4823899269104004, "rewards/margins": 7.454025745391846, "rewards/rejected": -5.971635818481445, "step": 5451 }, { "epoch": 0.9086666666666666, "grad_norm": 33.59102249145508, "learning_rate": 4.34325976305413e-09, "logits/chosen": 1.6163758039474487, "logits/rejected": 2.329577922821045, "logps/chosen": -29.2569522857666, "logps/rejected": -248.61070251464844, "loss": 0.6819, "nll_loss": 0.6649307608604431, "rewards/accuracies": 1.0, "rewards/chosen": 1.1001676321029663, "rewards/margins": 13.395020484924316, "rewards/rejected": -12.294853210449219, "step": 5452 }, { "epoch": 0.9088333333333334, "grad_norm": 35.11174774169922, "learning_rate": 4.327538140079367e-09, "logits/chosen": 1.769989252090454, "logits/rejected": 2.8255269527435303, "logps/chosen": -28.498037338256836, "logps/rejected": -394.465576171875, "loss": 0.6708, "nll_loss": 0.66274493932724, "rewards/accuracies": 1.0, "rewards/chosen": 1.943450927734375, "rewards/margins": 9.34222412109375, "rewards/rejected": -7.398773193359375, "step": 5453 }, { "epoch": 0.909, "grad_norm": 34.37353515625, "learning_rate": 4.311844393763109e-09, "logits/chosen": 2.666214942932129, "logits/rejected": 2.5967681407928467, "logps/chosen": -41.059242248535156, "logps/rejected": -41.017333984375, "loss": 0.8933, "nll_loss": 0.8379437923431396, "rewards/accuracies": 1.0, "rewards/chosen": 1.6653251647949219, "rewards/margins": 4.573371887207031, "rewards/rejected": -2.9080464839935303, "step": 5454 }, { "epoch": 0.9091666666666667, "grad_norm": 40.37153244018555, "learning_rate": 4.296178528678162e-09, "logits/chosen": 2.2935829162597656, "logits/rejected": 2.8175065517425537, "logps/chosen": -74.52691650390625, "logps/rejected": -476.1854248046875, "loss": 1.131, "nll_loss": 1.1123420000076294, "rewards/accuracies": 1.0, "rewards/chosen": 1.0010651350021362, "rewards/margins": 15.465109825134277, "rewards/rejected": -14.464044570922852, "step": 5455 }, { "epoch": 0.9093333333333333, "grad_norm": 38.219261169433594, "learning_rate": 4.280540549389144e-09, "logits/chosen": 2.425835371017456, "logits/rejected": 2.3407726287841797, "logps/chosen": -20.447528839111328, "logps/rejected": -106.85145568847656, "loss": 0.6325, "nll_loss": 0.6013979911804199, "rewards/accuracies": 1.0, "rewards/chosen": 1.1753654479980469, "rewards/margins": 5.329770088195801, "rewards/rejected": -4.154404640197754, "step": 5456 }, { "epoch": 0.9095, "grad_norm": 26.41909408569336, "learning_rate": 4.264930460452609e-09, "logits/chosen": 0.9398555755615234, "logits/rejected": 2.452904462814331, "logps/chosen": -37.316749572753906, "logps/rejected": -406.7806396484375, "loss": 0.6415, "nll_loss": 0.632487416267395, "rewards/accuracies": 1.0, "rewards/chosen": 1.8808457851409912, "rewards/margins": 8.629472732543945, "rewards/rejected": -6.748626708984375, "step": 5457 }, { "epoch": 0.9096666666666666, "grad_norm": 26.754627227783203, "learning_rate": 4.2493482664169526e-09, "logits/chosen": 3.027412176132202, "logits/rejected": 3.1577484607696533, "logps/chosen": -10.579740524291992, "logps/rejected": -136.05645751953125, "loss": 0.3422, "nll_loss": 0.33061689138412476, "rewards/accuracies": 1.0, "rewards/chosen": 1.6915284395217896, "rewards/margins": 7.76787805557251, "rewards/rejected": -6.07634973526001, "step": 5458 }, { "epoch": 0.9098333333333334, "grad_norm": 29.352296829223633, "learning_rate": 4.233793971822474e-09, "logits/chosen": 1.6469403505325317, "logits/rejected": 1.7130568027496338, "logps/chosen": -14.856922149658203, "logps/rejected": -49.66210174560547, "loss": 0.4382, "nll_loss": 0.4015384614467621, "rewards/accuracies": 1.0, "rewards/chosen": 1.5050535202026367, "rewards/margins": 5.073309898376465, "rewards/rejected": -3.568256378173828, "step": 5459 }, { "epoch": 0.91, "grad_norm": 33.088260650634766, "learning_rate": 4.218267581201296e-09, "logits/chosen": 1.9465057849884033, "logits/rejected": 1.865715503692627, "logps/chosen": -6.519152641296387, "logps/rejected": -73.46171569824219, "loss": 0.2584, "nll_loss": 0.25073665380477905, "rewards/accuracies": 1.0, "rewards/chosen": 2.5299630165100098, "rewards/margins": 7.994754791259766, "rewards/rejected": -5.464791774749756, "step": 5460 }, { "epoch": 0.9101666666666667, "grad_norm": 100.60136413574219, "learning_rate": 4.202769099077441e-09, "logits/chosen": 2.151890993118286, "logits/rejected": 2.308314085006714, "logps/chosen": -11.725184440612793, "logps/rejected": -140.577880859375, "loss": 0.9158, "nll_loss": 0.9019371867179871, "rewards/accuracies": 1.0, "rewards/chosen": 1.5768835544586182, "rewards/margins": 7.250144004821777, "rewards/rejected": -5.67326021194458, "step": 5461 }, { "epoch": 0.9103333333333333, "grad_norm": 23.970890045166016, "learning_rate": 4.187298529966799e-09, "logits/chosen": 3.325416326522827, "logits/rejected": 3.169843912124634, "logps/chosen": -34.06233215332031, "logps/rejected": -112.56503295898438, "loss": 0.5311, "nll_loss": 0.4936569929122925, "rewards/accuracies": 1.0, "rewards/chosen": 1.8731434345245361, "rewards/margins": 5.2117109298706055, "rewards/rejected": -3.3385677337646484, "step": 5462 }, { "epoch": 0.9105, "grad_norm": 20.485034942626953, "learning_rate": 4.1718558783771394e-09, "logits/chosen": 2.4457950592041016, "logits/rejected": 2.300448179244995, "logps/chosen": -69.19764709472656, "logps/rejected": -96.19564819335938, "loss": 0.6989, "nll_loss": 0.6718220114707947, "rewards/accuracies": 1.0, "rewards/chosen": 1.9090324640274048, "rewards/margins": 5.681102275848389, "rewards/rejected": -3.7720699310302734, "step": 5463 }, { "epoch": 0.9106666666666666, "grad_norm": 31.13184356689453, "learning_rate": 4.156441148808043e-09, "logits/chosen": 1.76591956615448, "logits/rejected": 2.0020666122436523, "logps/chosen": -69.6208267211914, "logps/rejected": -252.8815155029297, "loss": 1.0616, "nll_loss": 1.054861068725586, "rewards/accuracies": 1.0, "rewards/chosen": 2.069600820541382, "rewards/margins": 10.854291915893555, "rewards/rejected": -8.784690856933594, "step": 5464 }, { "epoch": 0.9108333333333334, "grad_norm": 21.32622718811035, "learning_rate": 4.141054345751016e-09, "logits/chosen": 2.1835107803344727, "logits/rejected": 2.5119400024414062, "logps/chosen": -45.857933044433594, "logps/rejected": -136.20201110839844, "loss": 0.5724, "nll_loss": 0.5661473274230957, "rewards/accuracies": 1.0, "rewards/chosen": 2.1577301025390625, "rewards/margins": 10.479763984680176, "rewards/rejected": -8.322033882141113, "step": 5465 }, { "epoch": 0.911, "grad_norm": 19.97530746459961, "learning_rate": 4.125695473689405e-09, "logits/chosen": 0.9633947610855103, "logits/rejected": 1.466336965560913, "logps/chosen": -38.5422477722168, "logps/rejected": -156.3447265625, "loss": 0.4592, "nll_loss": 0.4534381628036499, "rewards/accuracies": 1.0, "rewards/chosen": 2.2359397411346436, "rewards/margins": 10.885658264160156, "rewards/rejected": -8.649718284606934, "step": 5466 }, { "epoch": 0.9111666666666667, "grad_norm": 24.31049156188965, "learning_rate": 4.110364537098421e-09, "logits/chosen": 2.8725669384002686, "logits/rejected": 2.796041488647461, "logps/chosen": -52.941062927246094, "logps/rejected": -104.56596374511719, "loss": 0.679, "nll_loss": 0.6701399087905884, "rewards/accuracies": 1.0, "rewards/chosen": 2.011202335357666, "rewards/margins": 8.147380828857422, "rewards/rejected": -6.136178493499756, "step": 5467 }, { "epoch": 0.9113333333333333, "grad_norm": 52.47441482543945, "learning_rate": 4.0950615404451e-09, "logits/chosen": 3.152164936065674, "logits/rejected": 3.3389089107513428, "logps/chosen": -24.44255828857422, "logps/rejected": -229.37411499023438, "loss": 0.9439, "nll_loss": 0.9400984644889832, "rewards/accuracies": 1.0, "rewards/chosen": 3.9084389209747314, "rewards/margins": 9.526869773864746, "rewards/rejected": -5.618431091308594, "step": 5468 }, { "epoch": 0.9115, "grad_norm": 19.592540740966797, "learning_rate": 4.079786488188397e-09, "logits/chosen": 0.9107572436332703, "logits/rejected": 1.9933580160140991, "logps/chosen": -99.19218444824219, "logps/rejected": -301.4417724609375, "loss": 0.8113, "nll_loss": 0.7999369502067566, "rewards/accuracies": 1.0, "rewards/chosen": 1.7331688404083252, "rewards/margins": 7.754244804382324, "rewards/rejected": -6.02107572555542, "step": 5469 }, { "epoch": 0.9116666666666666, "grad_norm": 29.08776092529297, "learning_rate": 4.064539384779087e-09, "logits/chosen": 2.6666111946105957, "logits/rejected": 2.906895875930786, "logps/chosen": -80.53370666503906, "logps/rejected": -193.91995239257812, "loss": 1.0403, "nll_loss": 1.0194140672683716, "rewards/accuracies": 1.0, "rewards/chosen": 0.9474312663078308, "rewards/margins": 7.657961368560791, "rewards/rejected": -6.7105302810668945, "step": 5470 }, { "epoch": 0.9118333333333334, "grad_norm": 34.483367919921875, "learning_rate": 4.049320234659803e-09, "logits/chosen": 2.852285623550415, "logits/rejected": 2.839258909225464, "logps/chosen": -68.47322845458984, "logps/rejected": -63.936222076416016, "loss": 1.1673, "nll_loss": 1.1412204504013062, "rewards/accuracies": 1.0, "rewards/chosen": 2.486802816390991, "rewards/margins": 6.073347568511963, "rewards/rejected": -3.5865447521209717, "step": 5471 }, { "epoch": 0.912, "grad_norm": 36.67123031616211, "learning_rate": 4.034129042265066e-09, "logits/chosen": 2.034518241882324, "logits/rejected": 2.2544989585876465, "logps/chosen": -15.972620010375977, "logps/rejected": -257.82574462890625, "loss": 0.6209, "nll_loss": 0.6143314838409424, "rewards/accuracies": 1.0, "rewards/chosen": 3.454327344894409, "rewards/margins": 8.506731986999512, "rewards/rejected": -5.052404880523682, "step": 5472 }, { "epoch": 0.9121666666666667, "grad_norm": 33.47443389892578, "learning_rate": 4.018965812021191e-09, "logits/chosen": 2.3478496074676514, "logits/rejected": 2.5721232891082764, "logps/chosen": -18.422470092773438, "logps/rejected": -190.8839111328125, "loss": 0.3961, "nll_loss": 0.39196744561195374, "rewards/accuracies": 1.0, "rewards/chosen": 2.847198247909546, "rewards/margins": 9.421330451965332, "rewards/rejected": -6.574131965637207, "step": 5473 }, { "epoch": 0.9123333333333333, "grad_norm": 42.496551513671875, "learning_rate": 4.0038305483463985e-09, "logits/chosen": 2.800258159637451, "logits/rejected": 2.837224245071411, "logps/chosen": -70.03477478027344, "logps/rejected": -206.7548828125, "loss": 1.1712, "nll_loss": 1.1672459840774536, "rewards/accuracies": 1.0, "rewards/chosen": 3.0931198596954346, "rewards/margins": 9.299395561218262, "rewards/rejected": -6.206275939941406, "step": 5474 }, { "epoch": 0.9125, "grad_norm": 35.80393981933594, "learning_rate": 3.988723255650728e-09, "logits/chosen": 2.536372661590576, "logits/rejected": 2.2629871368408203, "logps/chosen": -28.985464096069336, "logps/rejected": -58.48948669433594, "loss": 0.6603, "nll_loss": 0.5915400981903076, "rewards/accuracies": 1.0, "rewards/chosen": 1.0700139999389648, "rewards/margins": 3.9958925247192383, "rewards/rejected": -2.9258785247802734, "step": 5475 }, { "epoch": 0.9126666666666666, "grad_norm": 28.5229549407959, "learning_rate": 3.9736439383361135e-09, "logits/chosen": 3.359198570251465, "logits/rejected": 3.5115296840667725, "logps/chosen": -56.198402404785156, "logps/rejected": -249.21868896484375, "loss": 0.9139, "nll_loss": 0.8920379877090454, "rewards/accuracies": 1.0, "rewards/chosen": 0.8336739540100098, "rewards/margins": 10.652933120727539, "rewards/rejected": -9.819258689880371, "step": 5476 }, { "epoch": 0.9128333333333334, "grad_norm": 28.086977005004883, "learning_rate": 3.958592600796262e-09, "logits/chosen": 2.406615734100342, "logits/rejected": 2.7756433486938477, "logps/chosen": -43.57030487060547, "logps/rejected": -558.5235595703125, "loss": 0.6436, "nll_loss": 0.596853494644165, "rewards/accuracies": 1.0, "rewards/chosen": 0.2919689118862152, "rewards/margins": 5.125538349151611, "rewards/rejected": -4.833569526672363, "step": 5477 }, { "epoch": 0.913, "grad_norm": 41.838539123535156, "learning_rate": 3.943569247416801e-09, "logits/chosen": 2.7786715030670166, "logits/rejected": 2.786407470703125, "logps/chosen": -21.076229095458984, "logps/rejected": -82.33132934570312, "loss": 0.8229, "nll_loss": 0.7806010842323303, "rewards/accuracies": 1.0, "rewards/chosen": 1.637040138244629, "rewards/margins": 4.922085762023926, "rewards/rejected": -3.285045623779297, "step": 5478 }, { "epoch": 0.9131666666666667, "grad_norm": 48.87248611450195, "learning_rate": 3.928573882575159e-09, "logits/chosen": 2.535888433456421, "logits/rejected": 2.7817492485046387, "logps/chosen": -13.160850524902344, "logps/rejected": -320.86370849609375, "loss": 0.5668, "nll_loss": 0.5483687520027161, "rewards/accuracies": 1.0, "rewards/chosen": 1.0160194635391235, "rewards/margins": 10.26954460144043, "rewards/rejected": -9.253524780273438, "step": 5479 }, { "epoch": 0.9133333333333333, "grad_norm": 20.889293670654297, "learning_rate": 3.9136065106406436e-09, "logits/chosen": 1.6708513498306274, "logits/rejected": 1.5117298364639282, "logps/chosen": -132.19602966308594, "logps/rejected": -95.37683868408203, "loss": 0.9362, "nll_loss": 0.9309579730033875, "rewards/accuracies": 1.0, "rewards/chosen": 2.4307053089141846, "rewards/margins": 9.567383766174316, "rewards/rejected": -7.136678218841553, "step": 5480 }, { "epoch": 0.9135, "grad_norm": 27.32109260559082, "learning_rate": 3.8986671359743765e-09, "logits/chosen": 1.3258190155029297, "logits/rejected": 2.0969512462615967, "logps/chosen": -37.03969955444336, "logps/rejected": -171.24124145507812, "loss": 0.6157, "nll_loss": 0.6072081923484802, "rewards/accuracies": 1.0, "rewards/chosen": 2.29681658744812, "rewards/margins": 7.8535661697387695, "rewards/rejected": -5.55674934387207, "step": 5481 }, { "epoch": 0.9136666666666666, "grad_norm": 34.101776123046875, "learning_rate": 3.8837557629293195e-09, "logits/chosen": 2.708667516708374, "logits/rejected": 2.714219093322754, "logps/chosen": -54.864463806152344, "logps/rejected": -138.6610107421875, "loss": 0.7702, "nll_loss": 0.7515679597854614, "rewards/accuracies": 1.0, "rewards/chosen": 1.334815263748169, "rewards/margins": 6.566478729248047, "rewards/rejected": -5.231663703918457, "step": 5482 }, { "epoch": 0.9138333333333334, "grad_norm": 249.14486694335938, "learning_rate": 3.868872395850309e-09, "logits/chosen": 2.6035280227661133, "logits/rejected": 2.6391725540161133, "logps/chosen": -56.71895980834961, "logps/rejected": -58.17333221435547, "loss": 2.0247, "nll_loss": 0.6672818660736084, "rewards/accuracies": 0.0, "rewards/chosen": 1.3353376388549805, "rewards/margins": -0.3877300024032593, "rewards/rejected": 1.7230676412582397, "step": 5483 }, { "epoch": 0.914, "grad_norm": 29.32143211364746, "learning_rate": 3.854017039074009e-09, "logits/chosen": 2.315704107284546, "logits/rejected": 2.353990316390991, "logps/chosen": -19.763248443603516, "logps/rejected": -241.685302734375, "loss": 0.5493, "nll_loss": 0.5489791631698608, "rewards/accuracies": 1.0, "rewards/chosen": 5.24122428894043, "rewards/margins": 14.651800155639648, "rewards/rejected": -9.410575866699219, "step": 5484 }, { "epoch": 0.9141666666666667, "grad_norm": 39.328834533691406, "learning_rate": 3.839189696928891e-09, "logits/chosen": 1.1013481616973877, "logits/rejected": 2.135606050491333, "logps/chosen": -33.53655242919922, "logps/rejected": -272.7879638671875, "loss": 0.6587, "nll_loss": 0.6575793027877808, "rewards/accuracies": 1.0, "rewards/chosen": 3.922614574432373, "rewards/margins": 13.499536514282227, "rewards/rejected": -9.576922416687012, "step": 5485 }, { "epoch": 0.9143333333333333, "grad_norm": 23.967853546142578, "learning_rate": 3.824390373735286e-09, "logits/chosen": 2.5361807346343994, "logits/rejected": 2.4884870052337646, "logps/chosen": -91.40493774414062, "logps/rejected": -69.35513305664062, "loss": 0.9041, "nll_loss": 0.8961266875267029, "rewards/accuracies": 1.0, "rewards/chosen": 2.43900990486145, "rewards/margins": 7.931283950805664, "rewards/rejected": -5.492274284362793, "step": 5486 }, { "epoch": 0.9145, "grad_norm": 28.86067771911621, "learning_rate": 3.809619073805381e-09, "logits/chosen": 3.2017788887023926, "logits/rejected": 3.2283413410186768, "logps/chosen": -32.089027404785156, "logps/rejected": -144.47586059570312, "loss": 0.6086, "nll_loss": 0.5942412614822388, "rewards/accuracies": 1.0, "rewards/chosen": 1.5285431146621704, "rewards/margins": 7.183908939361572, "rewards/rejected": -5.655365943908691, "step": 5487 }, { "epoch": 0.9146666666666666, "grad_norm": 43.203948974609375, "learning_rate": 3.794875801443176e-09, "logits/chosen": 2.453564405441284, "logits/rejected": 2.5959267616271973, "logps/chosen": -29.257123947143555, "logps/rejected": -180.35452270507812, "loss": 0.8005, "nll_loss": 0.7699243426322937, "rewards/accuracies": 1.0, "rewards/chosen": 1.9357703924179077, "rewards/margins": 5.520760536193848, "rewards/rejected": -3.5849902629852295, "step": 5488 }, { "epoch": 0.9148333333333334, "grad_norm": 28.352718353271484, "learning_rate": 3.7801605609444785e-09, "logits/chosen": 1.6483304500579834, "logits/rejected": 1.7182966470718384, "logps/chosen": -55.12377166748047, "logps/rejected": -127.60112762451172, "loss": 0.7097, "nll_loss": 0.6890471577644348, "rewards/accuracies": 1.0, "rewards/chosen": 1.086146593093872, "rewards/margins": 6.744523048400879, "rewards/rejected": -5.658376216888428, "step": 5489 }, { "epoch": 0.915, "grad_norm": 39.455650329589844, "learning_rate": 3.765473356596982e-09, "logits/chosen": 2.5204291343688965, "logits/rejected": 2.5081543922424316, "logps/chosen": -10.309823989868164, "logps/rejected": -111.71089172363281, "loss": 0.3952, "nll_loss": 0.3818453252315521, "rewards/accuracies": 1.0, "rewards/chosen": 1.5559241771697998, "rewards/margins": 7.457437515258789, "rewards/rejected": -5.901513576507568, "step": 5490 }, { "epoch": 0.9151666666666667, "grad_norm": 23.85783576965332, "learning_rate": 3.750814192680174e-09, "logits/chosen": 3.373866319656372, "logits/rejected": 3.5868961811065674, "logps/chosen": -85.8648910522461, "logps/rejected": -392.90008544921875, "loss": 0.7916, "nll_loss": 0.7877511382102966, "rewards/accuracies": 1.0, "rewards/chosen": 2.6692957878112793, "rewards/margins": 10.85789680480957, "rewards/rejected": -8.18860149383545, "step": 5491 }, { "epoch": 0.9153333333333333, "grad_norm": 21.801109313964844, "learning_rate": 3.736183073465393e-09, "logits/chosen": 1.7352081537246704, "logits/rejected": 2.460315704345703, "logps/chosen": -49.25651550292969, "logps/rejected": -173.47061157226562, "loss": 0.5663, "nll_loss": 0.553443968296051, "rewards/accuracies": 1.0, "rewards/chosen": 1.7752563953399658, "rewards/margins": 7.158555030822754, "rewards/rejected": -5.383298397064209, "step": 5492 }, { "epoch": 0.9155, "grad_norm": 63.7728271484375, "learning_rate": 3.721580003215807e-09, "logits/chosen": 3.5817530155181885, "logits/rejected": 3.649059534072876, "logps/chosen": -14.260785102844238, "logps/rejected": -59.64543151855469, "loss": 0.7629, "nll_loss": 0.6790849566459656, "rewards/accuracies": 1.0, "rewards/chosen": 2.4293277263641357, "rewards/margins": 4.606385231018066, "rewards/rejected": -2.1770572662353516, "step": 5493 }, { "epoch": 0.9156666666666666, "grad_norm": 26.453105926513672, "learning_rate": 3.7070049861863684e-09, "logits/chosen": 3.841703414916992, "logits/rejected": 3.8584728240966797, "logps/chosen": -55.87174606323242, "logps/rejected": -87.87894439697266, "loss": 0.6501, "nll_loss": 0.6139752864837646, "rewards/accuracies": 1.0, "rewards/chosen": 1.8426427841186523, "rewards/margins": 5.245203495025635, "rewards/rejected": -3.4025607109069824, "step": 5494 }, { "epoch": 0.9158333333333334, "grad_norm": 77.52236938476562, "learning_rate": 3.692458026623901e-09, "logits/chosen": 2.572782278060913, "logits/rejected": 2.841240882873535, "logps/chosen": -62.67697525024414, "logps/rejected": -340.61480712890625, "loss": 1.0566, "nll_loss": 0.882774293422699, "rewards/accuracies": 1.0, "rewards/chosen": 1.3733563423156738, "rewards/margins": 2.913761615753174, "rewards/rejected": -1.5404052734375, "step": 5495 }, { "epoch": 0.916, "grad_norm": 77.57279205322266, "learning_rate": 3.677939128767049e-09, "logits/chosen": 2.1539578437805176, "logits/rejected": 2.3917267322540283, "logps/chosen": -54.294517517089844, "logps/rejected": -186.95828247070312, "loss": 1.9492, "nll_loss": 1.9390897750854492, "rewards/accuracies": 1.0, "rewards/chosen": 1.6348992586135864, "rewards/margins": 11.050148963928223, "rewards/rejected": -9.415249824523926, "step": 5496 }, { "epoch": 0.9161666666666667, "grad_norm": 55.7938346862793, "learning_rate": 3.6634482968462722e-09, "logits/chosen": 1.823752522468567, "logits/rejected": 2.848066568374634, "logps/chosen": -44.92662811279297, "logps/rejected": -260.8932189941406, "loss": 1.5118, "nll_loss": 1.4975544214248657, "rewards/accuracies": 1.0, "rewards/chosen": 1.946541666984558, "rewards/margins": 6.779218673706055, "rewards/rejected": -4.832676887512207, "step": 5497 }, { "epoch": 0.9163333333333333, "grad_norm": 236.31773376464844, "learning_rate": 3.648985535083826e-09, "logits/chosen": 1.9929577112197876, "logits/rejected": 1.482648253440857, "logps/chosen": -127.87193298339844, "logps/rejected": -48.854583740234375, "loss": 2.2171, "nll_loss": 1.0396090745925903, "rewards/accuracies": 0.0, "rewards/chosen": -0.7644143104553223, "rewards/margins": -0.5551759004592896, "rewards/rejected": -0.2092384397983551, "step": 5498 }, { "epoch": 0.9165, "grad_norm": 31.479074478149414, "learning_rate": 3.634550847693829e-09, "logits/chosen": 3.072903871536255, "logits/rejected": 3.172940254211426, "logps/chosen": -18.43636131286621, "logps/rejected": -110.03851318359375, "loss": 0.4647, "nll_loss": 0.46090903878211975, "rewards/accuracies": 1.0, "rewards/chosen": 2.965806007385254, "rewards/margins": 9.57205581665039, "rewards/rejected": -6.606250286102295, "step": 5499 }, { "epoch": 0.9166666666666666, "grad_norm": 22.39805030822754, "learning_rate": 3.6201442388822058e-09, "logits/chosen": 1.1739006042480469, "logits/rejected": 1.9016355276107788, "logps/chosen": -82.40211486816406, "logps/rejected": -184.09970092773438, "loss": 0.7632, "nll_loss": 0.7491101622581482, "rewards/accuracies": 1.0, "rewards/chosen": 1.9497345685958862, "rewards/margins": 6.803891181945801, "rewards/rejected": -4.854156494140625, "step": 5500 }, { "epoch": 0.9168333333333333, "grad_norm": 25.167327880859375, "learning_rate": 3.605765712846698e-09, "logits/chosen": 2.783626079559326, "logits/rejected": 2.713407039642334, "logps/chosen": -32.208152770996094, "logps/rejected": -168.74600219726562, "loss": 0.5256, "nll_loss": 0.503252387046814, "rewards/accuracies": 1.0, "rewards/chosen": 0.8186721801757812, "rewards/margins": 9.09496784210205, "rewards/rejected": -8.27629566192627, "step": 5501 }, { "epoch": 0.917, "grad_norm": 31.762493133544922, "learning_rate": 3.5914152737768544e-09, "logits/chosen": 3.0071609020233154, "logits/rejected": 2.85441255569458, "logps/chosen": -70.28921508789062, "logps/rejected": -110.69491577148438, "loss": 1.0597, "nll_loss": 1.0186842679977417, "rewards/accuracies": 1.0, "rewards/chosen": 2.0805771350860596, "rewards/margins": 5.223453521728516, "rewards/rejected": -3.142876386642456, "step": 5502 }, { "epoch": 0.9171666666666667, "grad_norm": 28.331937789916992, "learning_rate": 3.577092925854042e-09, "logits/chosen": 2.3058340549468994, "logits/rejected": 2.1232314109802246, "logps/chosen": -108.0533447265625, "logps/rejected": -125.49535369873047, "loss": 1.1198, "nll_loss": 1.1139520406723022, "rewards/accuracies": 1.0, "rewards/chosen": 2.48020339012146, "rewards/margins": 8.806471824645996, "rewards/rejected": -6.326268196105957, "step": 5503 }, { "epoch": 0.9173333333333333, "grad_norm": 127.46147155761719, "learning_rate": 3.562798673251466e-09, "logits/chosen": 2.9098784923553467, "logits/rejected": 2.8632500171661377, "logps/chosen": -72.2311019897461, "logps/rejected": -10.666385650634766, "loss": 2.087, "nll_loss": 0.9504095315933228, "rewards/accuracies": 0.0, "rewards/chosen": 1.0289947986602783, "rewards/margins": -0.12477850914001465, "rewards/rejected": 1.153773307800293, "step": 5504 }, { "epoch": 0.9175, "grad_norm": 49.5634651184082, "learning_rate": 3.5485325201341286e-09, "logits/chosen": 2.937554359436035, "logits/rejected": 3.0764098167419434, "logps/chosen": -17.078662872314453, "logps/rejected": -235.61215209960938, "loss": 0.7843, "nll_loss": 0.7763028740882874, "rewards/accuracies": 1.0, "rewards/chosen": 3.445378541946411, "rewards/margins": 8.240920066833496, "rewards/rejected": -4.795541286468506, "step": 5505 }, { "epoch": 0.9176666666666666, "grad_norm": 31.489595413208008, "learning_rate": 3.5342944706588275e-09, "logits/chosen": 1.689131736755371, "logits/rejected": 1.3903902769088745, "logps/chosen": -35.9991340637207, "logps/rejected": -47.61585998535156, "loss": 0.6175, "nll_loss": 0.5714148283004761, "rewards/accuracies": 1.0, "rewards/chosen": 1.257242202758789, "rewards/margins": 4.649835586547852, "rewards/rejected": -3.3925936222076416, "step": 5506 }, { "epoch": 0.9178333333333333, "grad_norm": 25.083038330078125, "learning_rate": 3.5200845289741875e-09, "logits/chosen": 2.0694375038146973, "logits/rejected": 2.28055739402771, "logps/chosen": -68.53607177734375, "logps/rejected": -274.27099609375, "loss": 0.7099, "nll_loss": 0.7065574526786804, "rewards/accuracies": 1.0, "rewards/chosen": 3.8285491466522217, "rewards/margins": 9.701681137084961, "rewards/rejected": -5.873132228851318, "step": 5507 }, { "epoch": 0.918, "grad_norm": 68.3661117553711, "learning_rate": 3.505902699220664e-09, "logits/chosen": 2.6349828243255615, "logits/rejected": 2.6229236125946045, "logps/chosen": -17.686830520629883, "logps/rejected": -10.347616195678711, "loss": 1.2906, "nll_loss": 0.34013140201568604, "rewards/accuracies": 1.0, "rewards/chosen": 2.789238452911377, "rewards/margins": 0.8459835052490234, "rewards/rejected": 1.9432549476623535, "step": 5508 }, { "epoch": 0.9181666666666667, "grad_norm": 190.2187957763672, "learning_rate": 3.4917489855305073e-09, "logits/chosen": 1.684029459953308, "logits/rejected": 2.2772672176361084, "logps/chosen": -31.99051856994629, "logps/rejected": -345.1126708984375, "loss": 3.1746, "nll_loss": 2.908228874206543, "rewards/accuracies": 1.0, "rewards/chosen": -2.354008197784424, "rewards/margins": 13.578317642211914, "rewards/rejected": -15.93232536315918, "step": 5509 }, { "epoch": 0.9183333333333333, "grad_norm": 28.08201789855957, "learning_rate": 3.4776233920277396e-09, "logits/chosen": 2.0387649536132812, "logits/rejected": 1.4854607582092285, "logps/chosen": -58.64405822753906, "logps/rejected": -108.93898010253906, "loss": 0.7777, "nll_loss": 0.7616109848022461, "rewards/accuracies": 1.0, "rewards/chosen": 1.9372360706329346, "rewards/margins": 6.536242485046387, "rewards/rejected": -4.599006175994873, "step": 5510 }, { "epoch": 0.9185, "grad_norm": 38.070377349853516, "learning_rate": 3.4635259228282256e-09, "logits/chosen": 2.433457851409912, "logits/rejected": 2.634105682373047, "logps/chosen": -73.36799621582031, "logps/rejected": -153.15219116210938, "loss": 1.0195, "nll_loss": 0.965368390083313, "rewards/accuracies": 1.0, "rewards/chosen": -0.05824356526136398, "rewards/margins": 5.77801513671875, "rewards/rejected": -5.836258888244629, "step": 5511 }, { "epoch": 0.9186666666666666, "grad_norm": 237.15621948242188, "learning_rate": 3.449456582039645e-09, "logits/chosen": 2.2242777347564697, "logits/rejected": 2.0111072063446045, "logps/chosen": -260.40863037109375, "logps/rejected": -75.99359130859375, "loss": 2.4787, "nll_loss": 1.080533742904663, "rewards/accuracies": 0.0, "rewards/chosen": 1.9185638427734375, "rewards/margins": -0.28769373893737793, "rewards/rejected": 2.2062575817108154, "step": 5512 }, { "epoch": 0.9188333333333333, "grad_norm": 56.26140213012695, "learning_rate": 3.435415373761463e-09, "logits/chosen": 1.7586299180984497, "logits/rejected": 2.825355052947998, "logps/chosen": -44.40409851074219, "logps/rejected": -266.86090087890625, "loss": 1.4909, "nll_loss": 1.4801366329193115, "rewards/accuracies": 1.0, "rewards/chosen": 1.9987945556640625, "rewards/margins": 7.428239345550537, "rewards/rejected": -5.429444789886475, "step": 5513 }, { "epoch": 0.919, "grad_norm": 38.52939987182617, "learning_rate": 3.4214023020849527e-09, "logits/chosen": 2.93807315826416, "logits/rejected": 2.8111281394958496, "logps/chosen": -23.338645935058594, "logps/rejected": -52.72483825683594, "loss": 0.6416, "nll_loss": 0.5304237008094788, "rewards/accuracies": 1.0, "rewards/chosen": 6.610490798950195, "rewards/margins": 8.23779582977295, "rewards/rejected": -1.627305030822754, "step": 5514 }, { "epoch": 0.9191666666666667, "grad_norm": 25.988779067993164, "learning_rate": 3.40741737109318e-09, "logits/chosen": 2.2544851303100586, "logits/rejected": 2.1866676807403564, "logps/chosen": -97.50173950195312, "logps/rejected": -97.74358367919922, "loss": 0.9576, "nll_loss": 0.946618914604187, "rewards/accuracies": 1.0, "rewards/chosen": 2.0252463817596436, "rewards/margins": 7.357953071594238, "rewards/rejected": -5.332706451416016, "step": 5515 }, { "epoch": 0.9193333333333333, "grad_norm": 29.37876319885254, "learning_rate": 3.3934605848610076e-09, "logits/chosen": 2.4329042434692383, "logits/rejected": 2.692028045654297, "logps/chosen": -20.17937469482422, "logps/rejected": -298.6156005859375, "loss": 0.5316, "nll_loss": 0.5310361385345459, "rewards/accuracies": 1.0, "rewards/chosen": 5.623673915863037, "rewards/margins": 13.172380447387695, "rewards/rejected": -7.5487060546875, "step": 5516 }, { "epoch": 0.9195, "grad_norm": 39.49900436401367, "learning_rate": 3.3795319474551276e-09, "logits/chosen": 1.9726245403289795, "logits/rejected": 2.956552267074585, "logps/chosen": -24.25943946838379, "logps/rejected": -230.27182006835938, "loss": 0.5419, "nll_loss": 0.5273790955543518, "rewards/accuracies": 1.0, "rewards/chosen": 1.3021831512451172, "rewards/margins": 8.759906768798828, "rewards/rejected": -7.457724094390869, "step": 5517 }, { "epoch": 0.9196666666666666, "grad_norm": 23.631383895874023, "learning_rate": 3.3656314629340154e-09, "logits/chosen": 2.8833675384521484, "logits/rejected": 3.159166097640991, "logps/chosen": -56.70348358154297, "logps/rejected": -401.2518310546875, "loss": 0.6111, "nll_loss": 0.5968788266181946, "rewards/accuracies": 1.0, "rewards/chosen": 1.3825974464416504, "rewards/margins": 7.9532647132873535, "rewards/rejected": -6.570667266845703, "step": 5518 }, { "epoch": 0.9198333333333333, "grad_norm": 38.59400177001953, "learning_rate": 3.3517591353479093e-09, "logits/chosen": 2.5754106044769287, "logits/rejected": 2.6559433937072754, "logps/chosen": -20.444311141967773, "logps/rejected": -96.33719635009766, "loss": 0.6394, "nll_loss": 0.5111078023910522, "rewards/accuracies": 1.0, "rewards/chosen": 1.2343549728393555, "rewards/margins": 3.234548807144165, "rewards/rejected": -2.0001938343048096, "step": 5519 }, { "epoch": 0.92, "grad_norm": 215.2512664794922, "learning_rate": 3.3379149687388863e-09, "logits/chosen": 2.4916939735412598, "logits/rejected": 2.3992552757263184, "logps/chosen": -103.9931869506836, "logps/rejected": -39.285499572753906, "loss": 2.7503, "nll_loss": 1.299915075302124, "rewards/accuracies": 0.0, "rewards/chosen": 0.6052657961845398, "rewards/margins": -0.7003563046455383, "rewards/rejected": 1.3056221008300781, "step": 5520 }, { "epoch": 0.9201666666666667, "grad_norm": 22.646699905395508, "learning_rate": 3.324098967140809e-09, "logits/chosen": 2.0929834842681885, "logits/rejected": 2.037062644958496, "logps/chosen": -67.4852294921875, "logps/rejected": -69.1458740234375, "loss": 0.762, "nll_loss": 0.7498359084129333, "rewards/accuracies": 1.0, "rewards/chosen": 2.9904251098632812, "rewards/margins": 7.39485502243042, "rewards/rejected": -4.404429912567139, "step": 5521 }, { "epoch": 0.9203333333333333, "grad_norm": 21.155736923217773, "learning_rate": 3.3103111345793353e-09, "logits/chosen": 1.9586304426193237, "logits/rejected": 2.2109408378601074, "logps/chosen": -14.713348388671875, "logps/rejected": -88.62071228027344, "loss": 0.365, "nll_loss": 0.33439427614212036, "rewards/accuracies": 1.0, "rewards/chosen": 1.517880916595459, "rewards/margins": 5.360240459442139, "rewards/rejected": -3.8423595428466797, "step": 5522 }, { "epoch": 0.9205, "grad_norm": 120.43466186523438, "learning_rate": 3.296551475071896e-09, "logits/chosen": 2.6861793994903564, "logits/rejected": 2.9095475673675537, "logps/chosen": -141.1670684814453, "logps/rejected": -295.9263916015625, "loss": 1.5392, "nll_loss": 1.1571072340011597, "rewards/accuracies": 1.0, "rewards/chosen": 1.4983367919921875, "rewards/margins": 1.8713653087615967, "rewards/rejected": -0.37302857637405396, "step": 5523 }, { "epoch": 0.9206666666666666, "grad_norm": 188.14688110351562, "learning_rate": 3.282819992627717e-09, "logits/chosen": 1.7422394752502441, "logits/rejected": 2.0614888668060303, "logps/chosen": -143.038330078125, "logps/rejected": -221.314208984375, "loss": 2.2861, "nll_loss": 1.7233531475067139, "rewards/accuracies": 1.0, "rewards/chosen": -2.688243865966797, "rewards/margins": 1.1060280799865723, "rewards/rejected": -3.794271945953369, "step": 5524 }, { "epoch": 0.9208333333333333, "grad_norm": 22.55390739440918, "learning_rate": 3.269116691247842e-09, "logits/chosen": 2.6903982162475586, "logits/rejected": 2.7508254051208496, "logps/chosen": -63.97819900512695, "logps/rejected": -169.75790405273438, "loss": 0.7262, "nll_loss": 0.7108688950538635, "rewards/accuracies": 1.0, "rewards/chosen": 1.2404766082763672, "rewards/margins": 8.798688888549805, "rewards/rejected": -7.5582122802734375, "step": 5525 }, { "epoch": 0.921, "grad_norm": 35.06074523925781, "learning_rate": 3.2554415749250886e-09, "logits/chosen": 2.6694176197052, "logits/rejected": 3.0107083320617676, "logps/chosen": -29.269756317138672, "logps/rejected": -332.104736328125, "loss": 0.7347, "nll_loss": 0.713896632194519, "rewards/accuracies": 1.0, "rewards/chosen": 0.9024181365966797, "rewards/margins": 8.70566177368164, "rewards/rejected": -7.803244113922119, "step": 5526 }, { "epoch": 0.9211666666666667, "grad_norm": 25.966350555419922, "learning_rate": 3.2417946476440362e-09, "logits/chosen": 2.7856247425079346, "logits/rejected": 3.1026830673217773, "logps/chosen": -98.2126235961914, "logps/rejected": -282.2353210449219, "loss": 1.1572, "nll_loss": 1.155442714691162, "rewards/accuracies": 1.0, "rewards/chosen": 3.4711310863494873, "rewards/margins": 12.45796012878418, "rewards/rejected": -8.986828804016113, "step": 5527 }, { "epoch": 0.9213333333333333, "grad_norm": 16.318359375, "learning_rate": 3.2281759133810702e-09, "logits/chosen": 1.714428186416626, "logits/rejected": 1.8840153217315674, "logps/chosen": -111.00554656982422, "logps/rejected": -185.2272491455078, "loss": 0.6759, "nll_loss": 0.6568375825881958, "rewards/accuracies": 1.0, "rewards/chosen": 1.9128106832504272, "rewards/margins": 6.232790946960449, "rewards/rejected": -4.319980144500732, "step": 5528 }, { "epoch": 0.9215, "grad_norm": 31.668933868408203, "learning_rate": 3.214585376104384e-09, "logits/chosen": 1.6552294492721558, "logits/rejected": 1.4558265209197998, "logps/chosen": -89.52151489257812, "logps/rejected": -65.63017272949219, "loss": 1.06, "nll_loss": 1.0409477949142456, "rewards/accuracies": 1.0, "rewards/chosen": 1.3661361932754517, "rewards/margins": 6.428934574127197, "rewards/rejected": -5.062798500061035, "step": 5529 }, { "epoch": 0.9216666666666666, "grad_norm": 23.04365348815918, "learning_rate": 3.20102303977392e-09, "logits/chosen": 2.5831797122955322, "logits/rejected": 2.6969962120056152, "logps/chosen": -98.92044067382812, "logps/rejected": -251.6924285888672, "loss": 1.0155, "nll_loss": 1.0093923807144165, "rewards/accuracies": 1.0, "rewards/chosen": 2.4068763256073, "rewards/margins": 8.775856018066406, "rewards/rejected": -6.368979454040527, "step": 5530 }, { "epoch": 0.9218333333333333, "grad_norm": 22.252714157104492, "learning_rate": 3.1874889083414404e-09, "logits/chosen": 2.3735525608062744, "logits/rejected": 2.6707282066345215, "logps/chosen": -78.05516052246094, "logps/rejected": -350.3205261230469, "loss": 0.8666, "nll_loss": 0.8577491044998169, "rewards/accuracies": 1.0, "rewards/chosen": 1.7760895490646362, "rewards/margins": 10.826947212219238, "rewards/rejected": -9.050857543945312, "step": 5531 }, { "epoch": 0.922, "grad_norm": 47.04146194458008, "learning_rate": 3.173982985750423e-09, "logits/chosen": 2.421811819076538, "logits/rejected": 2.3924243450164795, "logps/chosen": -49.305511474609375, "logps/rejected": -232.93174743652344, "loss": 1.2918, "nll_loss": 1.264243721961975, "rewards/accuracies": 1.0, "rewards/chosen": 0.5853252410888672, "rewards/margins": 11.076590538024902, "rewards/rejected": -10.491265296936035, "step": 5532 }, { "epoch": 0.9221666666666667, "grad_norm": 280.27679443359375, "learning_rate": 3.1605052759361985e-09, "logits/chosen": 1.5527029037475586, "logits/rejected": 1.8320897817611694, "logps/chosen": -20.994384765625, "logps/rejected": -47.85198211669922, "loss": 6.544, "nll_loss": 0.3558370769023895, "rewards/accuracies": 0.0, "rewards/chosen": 1.9414823055267334, "rewards/margins": -5.523364067077637, "rewards/rejected": 7.464846611022949, "step": 5533 }, { "epoch": 0.9223333333333333, "grad_norm": 20.35579490661621, "learning_rate": 3.147055782825847e-09, "logits/chosen": 0.8781648874282837, "logits/rejected": 2.165071487426758, "logps/chosen": -89.60179138183594, "logps/rejected": -420.66961669921875, "loss": 0.8175, "nll_loss": 0.8072234988212585, "rewards/accuracies": 1.0, "rewards/chosen": 1.6238434314727783, "rewards/margins": 11.271102905273438, "rewards/rejected": -9.647259712219238, "step": 5534 }, { "epoch": 0.9225, "grad_norm": 21.277368545532227, "learning_rate": 3.1336345103382342e-09, "logits/chosen": 2.243911027908325, "logits/rejected": 2.2864830493927, "logps/chosen": -37.16691207885742, "logps/rejected": -63.46342468261719, "loss": 0.5331, "nll_loss": 0.5091357231140137, "rewards/accuracies": 1.0, "rewards/chosen": 1.2688137292861938, "rewards/margins": 5.879030227661133, "rewards/rejected": -4.6102166175842285, "step": 5535 }, { "epoch": 0.9226666666666666, "grad_norm": 267.9814147949219, "learning_rate": 3.120241462383966e-09, "logits/chosen": 2.618028163909912, "logits/rejected": 2.605165958404541, "logps/chosen": -95.63140869140625, "logps/rejected": -54.94396209716797, "loss": 4.354, "nll_loss": 1.195392370223999, "rewards/accuracies": 0.0, "rewards/chosen": 3.39591383934021, "rewards/margins": -2.1211349964141846, "rewards/rejected": 5.5170488357543945, "step": 5536 }, { "epoch": 0.9228333333333333, "grad_norm": 57.817962646484375, "learning_rate": 3.106876642865486e-09, "logits/chosen": 2.943075180053711, "logits/rejected": 3.083582639694214, "logps/chosen": -19.956981658935547, "logps/rejected": -164.3644561767578, "loss": 0.9542, "nll_loss": 0.9503323435783386, "rewards/accuracies": 1.0, "rewards/chosen": 3.105405330657959, "rewards/margins": 9.382112503051758, "rewards/rejected": -6.276707172393799, "step": 5537 }, { "epoch": 0.923, "grad_norm": 61.54798889160156, "learning_rate": 3.0935400556769576e-09, "logits/chosen": 1.9619247913360596, "logits/rejected": 2.2428665161132812, "logps/chosen": -118.5878677368164, "logps/rejected": -296.14752197265625, "loss": 1.3847, "nll_loss": 1.2751384973526, "rewards/accuracies": 1.0, "rewards/chosen": -1.0068382024765015, "rewards/margins": 7.4844584465026855, "rewards/rejected": -8.491296768188477, "step": 5538 }, { "epoch": 0.9231666666666667, "grad_norm": 26.92151641845703, "learning_rate": 3.0802317047043837e-09, "logits/chosen": 1.0479055643081665, "logits/rejected": 1.9760686159133911, "logps/chosen": -63.67763900756836, "logps/rejected": -228.25564575195312, "loss": 0.8709, "nll_loss": 0.8605086207389832, "rewards/accuracies": 1.0, "rewards/chosen": 2.205888032913208, "rewards/margins": 7.405535697937012, "rewards/rejected": -5.199647426605225, "step": 5539 }, { "epoch": 0.9233333333333333, "grad_norm": 33.53010177612305, "learning_rate": 3.0669515938254397e-09, "logits/chosen": 0.8739027380943298, "logits/rejected": 1.7727068662643433, "logps/chosen": -48.825443267822266, "logps/rejected": -270.39862060546875, "loss": 0.8808, "nll_loss": 0.8565868139266968, "rewards/accuracies": 1.0, "rewards/chosen": 0.9814003109931946, "rewards/margins": 6.261558532714844, "rewards/rejected": -5.280158042907715, "step": 5540 }, { "epoch": 0.9235, "grad_norm": 29.091079711914062, "learning_rate": 3.0536997269096753e-09, "logits/chosen": 1.9934602975845337, "logits/rejected": 2.0230870246887207, "logps/chosen": -86.65886688232422, "logps/rejected": -119.0219497680664, "loss": 1.0187, "nll_loss": 1.00766122341156, "rewards/accuracies": 1.0, "rewards/chosen": 2.2224228382110596, "rewards/margins": 7.256936073303223, "rewards/rejected": -5.034512996673584, "step": 5541 }, { "epoch": 0.9236666666666666, "grad_norm": 64.10986328125, "learning_rate": 3.040476107818346e-09, "logits/chosen": 2.259636163711548, "logits/rejected": 1.650216817855835, "logps/chosen": -47.87303924560547, "logps/rejected": -48.680442810058594, "loss": 1.7518, "nll_loss": 1.7097508907318115, "rewards/accuracies": 1.0, "rewards/chosen": 2.4636971950531006, "rewards/margins": 5.470958709716797, "rewards/rejected": -3.007261276245117, "step": 5542 }, { "epoch": 0.9238333333333333, "grad_norm": 36.171302795410156, "learning_rate": 3.0272807404045144e-09, "logits/chosen": 2.058309555053711, "logits/rejected": 2.5478644371032715, "logps/chosen": -13.776955604553223, "logps/rejected": -265.0417175292969, "loss": 0.4656, "nll_loss": 0.4305298924446106, "rewards/accuracies": 1.0, "rewards/chosen": 0.3318560719490051, "rewards/margins": 8.720466613769531, "rewards/rejected": -8.38861083984375, "step": 5543 }, { "epoch": 0.924, "grad_norm": 72.92417907714844, "learning_rate": 3.014113628512982e-09, "logits/chosen": 3.1163783073425293, "logits/rejected": 3.114514112472534, "logps/chosen": -54.68018341064453, "logps/rejected": -66.1358413696289, "loss": 2.0699, "nll_loss": 2.0251917839050293, "rewards/accuracies": 1.0, "rewards/chosen": 0.49023327231407166, "rewards/margins": 4.9155473709106445, "rewards/rejected": -4.425313949584961, "step": 5544 }, { "epoch": 0.9241666666666667, "grad_norm": 121.9704360961914, "learning_rate": 3.0009747759803138e-09, "logits/chosen": 0.8051092624664307, "logits/rejected": 1.5303128957748413, "logps/chosen": -55.08116149902344, "logps/rejected": -200.12118530273438, "loss": 2.9304, "nll_loss": 2.899008274078369, "rewards/accuracies": 1.0, "rewards/chosen": 0.5983344912528992, "rewards/margins": 6.239810943603516, "rewards/rejected": -5.641476631164551, "step": 5545 }, { "epoch": 0.9243333333333333, "grad_norm": 36.442176818847656, "learning_rate": 2.9878641866348807e-09, "logits/chosen": 2.2928318977355957, "logits/rejected": 2.4235010147094727, "logps/chosen": -29.529451370239258, "logps/rejected": -164.94630432128906, "loss": 0.5961, "nll_loss": 0.5368991494178772, "rewards/accuracies": 1.0, "rewards/chosen": 0.11455117166042328, "rewards/margins": 4.567812919616699, "rewards/rejected": -4.453261852264404, "step": 5546 }, { "epoch": 0.9245, "grad_norm": 112.16038513183594, "learning_rate": 2.9747818642967826e-09, "logits/chosen": 3.1205477714538574, "logits/rejected": 2.9893600940704346, "logps/chosen": -67.37493133544922, "logps/rejected": -99.45082092285156, "loss": 2.5682, "nll_loss": 2.4953677654266357, "rewards/accuracies": 1.0, "rewards/chosen": 0.04279327392578125, "rewards/margins": 4.0291242599487305, "rewards/rejected": -3.9863312244415283, "step": 5547 }, { "epoch": 0.9246666666666666, "grad_norm": 127.6135025024414, "learning_rate": 2.961727812777903e-09, "logits/chosen": 1.8267340660095215, "logits/rejected": 2.170842409133911, "logps/chosen": -45.602352142333984, "logps/rejected": -119.58784484863281, "loss": 2.061, "nll_loss": 1.9827109575271606, "rewards/accuracies": 1.0, "rewards/chosen": -0.16581840813159943, "rewards/margins": 4.079856872558594, "rewards/rejected": -4.245675086975098, "step": 5548 }, { "epoch": 0.9248333333333333, "grad_norm": 20.372142791748047, "learning_rate": 2.9487020358818672e-09, "logits/chosen": 2.404928684234619, "logits/rejected": 2.97233510017395, "logps/chosen": -15.159836769104004, "logps/rejected": -148.19854736328125, "loss": 0.2985, "nll_loss": 0.28603464365005493, "rewards/accuracies": 1.0, "rewards/chosen": 1.9162613153457642, "rewards/margins": 7.112238883972168, "rewards/rejected": -5.195977687835693, "step": 5549 }, { "epoch": 0.925, "grad_norm": 29.042842864990234, "learning_rate": 2.9357045374040823e-09, "logits/chosen": 1.1641870737075806, "logits/rejected": 2.5540969371795654, "logps/chosen": -85.22584533691406, "logps/rejected": -366.5693664550781, "loss": 0.9719, "nll_loss": 0.9575936794281006, "rewards/accuracies": 1.0, "rewards/chosen": 1.2753938436508179, "rewards/margins": 11.322671890258789, "rewards/rejected": -10.04727840423584, "step": 5550 }, { "epoch": 0.9251666666666667, "grad_norm": 36.720149993896484, "learning_rate": 2.9227353211317084e-09, "logits/chosen": 2.231778860092163, "logits/rejected": 2.2435812950134277, "logps/chosen": -131.6763916015625, "logps/rejected": -270.28851318359375, "loss": 1.2259, "nll_loss": 1.197058081626892, "rewards/accuracies": 1.0, "rewards/chosen": 0.5611511468887329, "rewards/margins": 7.880744934082031, "rewards/rejected": -7.319593906402588, "step": 5551 }, { "epoch": 0.9253333333333333, "grad_norm": 83.26640319824219, "learning_rate": 2.909794390843656e-09, "logits/chosen": 2.3140599727630615, "logits/rejected": 1.9612153768539429, "logps/chosen": -66.57845306396484, "logps/rejected": -42.075950622558594, "loss": 1.5144, "nll_loss": 1.331568956375122, "rewards/accuracies": 1.0, "rewards/chosen": -0.16127091646194458, "rewards/margins": 2.3301162719726562, "rewards/rejected": -2.491387128829956, "step": 5552 }, { "epoch": 0.9255, "grad_norm": 12.175922393798828, "learning_rate": 2.896881750310598e-09, "logits/chosen": 2.1720261573791504, "logits/rejected": 2.224898099899292, "logps/chosen": -144.35836791992188, "logps/rejected": -205.65017700195312, "loss": 0.6314, "nll_loss": 0.6303858757019043, "rewards/accuracies": 1.0, "rewards/chosen": 4.277932643890381, "rewards/margins": 12.371847152709961, "rewards/rejected": -8.093914031982422, "step": 5553 }, { "epoch": 0.9256666666666666, "grad_norm": 35.65918731689453, "learning_rate": 2.88399740329498e-09, "logits/chosen": 1.6320137977600098, "logits/rejected": 2.2692477703094482, "logps/chosen": -28.464832305908203, "logps/rejected": -197.3870849609375, "loss": 0.7716, "nll_loss": 0.769320011138916, "rewards/accuracies": 1.0, "rewards/chosen": 3.230440616607666, "rewards/margins": 11.509099960327148, "rewards/rejected": -8.278658866882324, "step": 5554 }, { "epoch": 0.9258333333333333, "grad_norm": 23.231689453125, "learning_rate": 2.8711413535509988e-09, "logits/chosen": 2.350221872329712, "logits/rejected": 2.4692625999450684, "logps/chosen": -95.0323486328125, "logps/rejected": -180.80026245117188, "loss": 0.8861, "nll_loss": 0.8799290657043457, "rewards/accuracies": 1.0, "rewards/chosen": 2.2537362575531006, "rewards/margins": 9.279435157775879, "rewards/rejected": -7.025698661804199, "step": 5555 }, { "epoch": 0.926, "grad_norm": 23.037248611450195, "learning_rate": 2.858313604824569e-09, "logits/chosen": 2.52193021774292, "logits/rejected": 2.481830596923828, "logps/chosen": -6.296387672424316, "logps/rejected": -85.36995697021484, "loss": 0.2472, "nll_loss": 0.24216875433921814, "rewards/accuracies": 1.0, "rewards/chosen": 2.585416793823242, "rewards/margins": 9.175020217895508, "rewards/rejected": -6.589603424072266, "step": 5556 }, { "epoch": 0.9261666666666667, "grad_norm": 26.55486297607422, "learning_rate": 2.845514160853413e-09, "logits/chosen": 1.2470614910125732, "logits/rejected": 1.906255841255188, "logps/chosen": -165.64059448242188, "logps/rejected": -236.00421142578125, "loss": 0.8472, "nll_loss": 0.811963677406311, "rewards/accuracies": 1.0, "rewards/chosen": 0.31492921710014343, "rewards/margins": 10.521064758300781, "rewards/rejected": -10.206135749816895, "step": 5557 }, { "epoch": 0.9263333333333333, "grad_norm": 23.071008682250977, "learning_rate": 2.8327430253669684e-09, "logits/chosen": 1.5425552129745483, "logits/rejected": 2.4577574729919434, "logps/chosen": -36.991912841796875, "logps/rejected": -268.1577453613281, "loss": 0.566, "nll_loss": 0.5604833364486694, "rewards/accuracies": 1.0, "rewards/chosen": 2.2494418621063232, "rewards/margins": 12.75476360321045, "rewards/rejected": -10.505321502685547, "step": 5558 }, { "epoch": 0.9265, "grad_norm": 43.25434494018555, "learning_rate": 2.8200002020864587e-09, "logits/chosen": 0.8229730725288391, "logits/rejected": 1.6047731637954712, "logps/chosen": -97.86125183105469, "logps/rejected": -373.8569641113281, "loss": 1.2849, "nll_loss": 1.2387499809265137, "rewards/accuracies": 1.0, "rewards/chosen": 0.0151557931676507, "rewards/margins": 19.55350112915039, "rewards/rejected": -19.538345336914062, "step": 5559 }, { "epoch": 0.9266666666666666, "grad_norm": 79.3525390625, "learning_rate": 2.8072856947248037e-09, "logits/chosen": 2.6795706748962402, "logits/rejected": 2.8382744789123535, "logps/chosen": -106.43319702148438, "logps/rejected": -125.97089385986328, "loss": 1.5271, "nll_loss": 1.156882643699646, "rewards/accuracies": 1.0, "rewards/chosen": 2.0754165649414062, "rewards/margins": 2.2629172801971436, "rewards/rejected": -0.18750077486038208, "step": 5560 }, { "epoch": 0.9268333333333333, "grad_norm": 44.07896423339844, "learning_rate": 2.794599506986739e-09, "logits/chosen": 2.703590154647827, "logits/rejected": 2.7046313285827637, "logps/chosen": -14.958000183105469, "logps/rejected": -127.10287475585938, "loss": 0.5417, "nll_loss": 0.5157930850982666, "rewards/accuracies": 1.0, "rewards/chosen": 1.0427989959716797, "rewards/margins": 5.877915859222412, "rewards/rejected": -4.835116863250732, "step": 5561 }, { "epoch": 0.927, "grad_norm": 18.121784210205078, "learning_rate": 2.781941642568686e-09, "logits/chosen": 1.958044171333313, "logits/rejected": 1.790980577468872, "logps/chosen": -175.508544921875, "logps/rejected": -159.375732421875, "loss": 0.9603, "nll_loss": 0.9538508057594299, "rewards/accuracies": 1.0, "rewards/chosen": 2.239183187484741, "rewards/margins": 9.117634773254395, "rewards/rejected": -6.878451824188232, "step": 5562 }, { "epoch": 0.9271666666666667, "grad_norm": 53.041683197021484, "learning_rate": 2.769312105158872e-09, "logits/chosen": 2.1899406909942627, "logits/rejected": 2.2019457817077637, "logps/chosen": -18.511322021484375, "logps/rejected": -108.82015228271484, "loss": 0.7486, "nll_loss": 0.7404529452323914, "rewards/accuracies": 1.0, "rewards/chosen": 1.959633708000183, "rewards/margins": 8.851129531860352, "rewards/rejected": -6.891496181488037, "step": 5563 }, { "epoch": 0.9273333333333333, "grad_norm": 46.994422912597656, "learning_rate": 2.756710898437209e-09, "logits/chosen": 1.6151257753372192, "logits/rejected": 2.1958370208740234, "logps/chosen": -45.42167663574219, "logps/rejected": -477.80303955078125, "loss": 1.2417, "nll_loss": 1.227612853050232, "rewards/accuracies": 1.0, "rewards/chosen": 1.2796413898468018, "rewards/margins": 13.086346626281738, "rewards/rejected": -11.806705474853516, "step": 5564 }, { "epoch": 0.9275, "grad_norm": 36.381690979003906, "learning_rate": 2.7441380260754045e-09, "logits/chosen": 3.3320813179016113, "logits/rejected": 3.338440418243408, "logps/chosen": -15.773305892944336, "logps/rejected": -113.52208709716797, "loss": 0.4909, "nll_loss": 0.47797897458076477, "rewards/accuracies": 1.0, "rewards/chosen": 3.1392436027526855, "rewards/margins": 7.430312156677246, "rewards/rejected": -4.2910685539245605, "step": 5565 }, { "epoch": 0.9276666666666666, "grad_norm": 16.750682830810547, "learning_rate": 2.7315934917368945e-09, "logits/chosen": 0.9393246173858643, "logits/rejected": 1.5183372497558594, "logps/chosen": -120.88667297363281, "logps/rejected": -531.6488037109375, "loss": 0.777, "nll_loss": 0.7749145030975342, "rewards/accuracies": 1.0, "rewards/chosen": 3.2555742263793945, "rewards/margins": 13.654751777648926, "rewards/rejected": -10.399177551269531, "step": 5566 }, { "epoch": 0.9278333333333333, "grad_norm": 17.43438720703125, "learning_rate": 2.7190772990768553e-09, "logits/chosen": 2.518810272216797, "logits/rejected": 2.593062162399292, "logps/chosen": -233.60348510742188, "logps/rejected": -370.90020751953125, "loss": 1.009, "nll_loss": 1.0069116353988647, "rewards/accuracies": 1.0, "rewards/chosen": 3.2302494049072266, "rewards/margins": 12.599842071533203, "rewards/rejected": -9.369592666625977, "step": 5567 }, { "epoch": 0.928, "grad_norm": 25.823143005371094, "learning_rate": 2.7065894517421806e-09, "logits/chosen": 2.2594733238220215, "logits/rejected": 2.0728721618652344, "logps/chosen": -150.44467163085938, "logps/rejected": -148.91786193847656, "loss": 1.0579, "nll_loss": 1.0447547435760498, "rewards/accuracies": 1.0, "rewards/chosen": 1.4034013748168945, "rewards/margins": 8.66949462890625, "rewards/rejected": -7.266092777252197, "step": 5568 }, { "epoch": 0.9281666666666667, "grad_norm": 48.62290954589844, "learning_rate": 2.69412995337156e-09, "logits/chosen": 1.6737903356552124, "logits/rejected": 1.3998713493347168, "logps/chosen": -143.31610107421875, "logps/rejected": -116.48492431640625, "loss": 1.5004, "nll_loss": 1.4050596952438354, "rewards/accuracies": 1.0, "rewards/chosen": -0.0067687989212572575, "rewards/margins": 3.4341092109680176, "rewards/rejected": -3.440877914428711, "step": 5569 }, { "epoch": 0.9283333333333333, "grad_norm": 71.30752563476562, "learning_rate": 2.6816988075953783e-09, "logits/chosen": 2.0433247089385986, "logits/rejected": 2.6300270557403564, "logps/chosen": -78.97557067871094, "logps/rejected": -307.3495788574219, "loss": 1.0292, "nll_loss": 0.9077652096748352, "rewards/accuracies": 1.0, "rewards/chosen": 0.8700355887413025, "rewards/margins": 3.1358652114868164, "rewards/rejected": -2.265829563140869, "step": 5570 }, { "epoch": 0.9285, "grad_norm": 21.05696678161621, "learning_rate": 2.6692960180357715e-09, "logits/chosen": 1.5061664581298828, "logits/rejected": 2.1322925090789795, "logps/chosen": -54.93782043457031, "logps/rejected": -309.15533447265625, "loss": 0.6305, "nll_loss": 0.6242933869361877, "rewards/accuracies": 1.0, "rewards/chosen": 2.174717664718628, "rewards/margins": 9.949797630310059, "rewards/rejected": -7.775079727172852, "step": 5571 }, { "epoch": 0.9286666666666666, "grad_norm": 20.144193649291992, "learning_rate": 2.6569215883066374e-09, "logits/chosen": 2.3857452869415283, "logits/rejected": 2.507963180541992, "logps/chosen": -101.13801574707031, "logps/rejected": -187.29290771484375, "loss": 0.8105, "nll_loss": 0.8026826977729797, "rewards/accuracies": 1.0, "rewards/chosen": 1.9622833728790283, "rewards/margins": 9.334609031677246, "rewards/rejected": -7.372325897216797, "step": 5572 }, { "epoch": 0.9288333333333333, "grad_norm": 19.209360122680664, "learning_rate": 2.6445755220135477e-09, "logits/chosen": 1.3829498291015625, "logits/rejected": 2.296825647354126, "logps/chosen": -11.520105361938477, "logps/rejected": -255.63294982910156, "loss": 0.2446, "nll_loss": 0.24000221490859985, "rewards/accuracies": 1.0, "rewards/chosen": 2.6456191539764404, "rewards/margins": 9.487497329711914, "rewards/rejected": -6.841878414154053, "step": 5573 }, { "epoch": 0.929, "grad_norm": 30.03722381591797, "learning_rate": 2.6322578227538807e-09, "logits/chosen": 2.2648534774780273, "logits/rejected": 2.621070146560669, "logps/chosen": -35.0345458984375, "logps/rejected": -628.214599609375, "loss": 0.7847, "nll_loss": 0.7785454392433167, "rewards/accuracies": 1.0, "rewards/chosen": 2.1234517097473145, "rewards/margins": 13.172012329101562, "rewards/rejected": -11.04856014251709, "step": 5574 }, { "epoch": 0.9291666666666667, "grad_norm": 29.028722763061523, "learning_rate": 2.6199684941166977e-09, "logits/chosen": 2.9005377292633057, "logits/rejected": 2.8268301486968994, "logps/chosen": -24.630359649658203, "logps/rejected": -64.01000213623047, "loss": 0.5878, "nll_loss": 0.5727990865707397, "rewards/accuracies": 1.0, "rewards/chosen": 2.587820529937744, "rewards/margins": 6.874028205871582, "rewards/rejected": -4.286207675933838, "step": 5575 }, { "epoch": 0.9293333333333333, "grad_norm": 67.81343841552734, "learning_rate": 2.6077075396828462e-09, "logits/chosen": 2.836338996887207, "logits/rejected": 2.9399328231811523, "logps/chosen": -9.730417251586914, "logps/rejected": -190.62088012695312, "loss": 0.6644, "nll_loss": 0.6486943960189819, "rewards/accuracies": 1.0, "rewards/chosen": 1.7363054752349854, "rewards/margins": 6.633256912231445, "rewards/rejected": -4.896951198577881, "step": 5576 }, { "epoch": 0.9295, "grad_norm": 16.48920440673828, "learning_rate": 2.595474963024835e-09, "logits/chosen": 2.335036277770996, "logits/rejected": 2.583073854446411, "logps/chosen": -138.1554412841797, "logps/rejected": -244.07254028320312, "loss": 0.7611, "nll_loss": 0.7508447766304016, "rewards/accuracies": 1.0, "rewards/chosen": 1.9443223476409912, "rewards/margins": 7.641756057739258, "rewards/rejected": -5.6974334716796875, "step": 5577 }, { "epoch": 0.9296666666666666, "grad_norm": 171.50875854492188, "learning_rate": 2.5832707677069576e-09, "logits/chosen": 1.9043835401535034, "logits/rejected": 1.581123948097229, "logps/chosen": -90.71822357177734, "logps/rejected": -10.962143898010254, "loss": 2.4924, "nll_loss": 1.0427380800247192, "rewards/accuracies": 0.0, "rewards/chosen": 1.3131539821624756, "rewards/margins": -0.5265066623687744, "rewards/rejected": 1.83966064453125, "step": 5578 }, { "epoch": 0.9298333333333333, "grad_norm": 24.114261627197266, "learning_rate": 2.5710949572852357e-09, "logits/chosen": 1.002577543258667, "logits/rejected": 1.7416801452636719, "logps/chosen": -66.54707336425781, "logps/rejected": -260.8505859375, "loss": 0.6532, "nll_loss": 0.6398757696151733, "rewards/accuracies": 1.0, "rewards/chosen": 1.4248886108398438, "rewards/margins": 8.220685958862305, "rewards/rejected": -6.795797824859619, "step": 5579 }, { "epoch": 0.93, "grad_norm": 41.47666549682617, "learning_rate": 2.5589475353073985e-09, "logits/chosen": 3.3244588375091553, "logits/rejected": 3.113834857940674, "logps/chosen": -74.30590057373047, "logps/rejected": -43.00732421875, "loss": 1.2499, "nll_loss": 1.1984822750091553, "rewards/accuracies": 1.0, "rewards/chosen": 1.7331398725509644, "rewards/margins": 4.7185587882995605, "rewards/rejected": -2.9854190349578857, "step": 5580 }, { "epoch": 0.9301666666666667, "grad_norm": 39.21107482910156, "learning_rate": 2.5468285053129144e-09, "logits/chosen": 2.0822107791900635, "logits/rejected": 2.45220947265625, "logps/chosen": -39.30583953857422, "logps/rejected": -588.6935424804688, "loss": 1.0423, "nll_loss": 1.0343642234802246, "rewards/accuracies": 1.0, "rewards/chosen": 1.8671331405639648, "rewards/margins": 13.391108512878418, "rewards/rejected": -11.523975372314453, "step": 5581 }, { "epoch": 0.9303333333333333, "grad_norm": 18.827333450317383, "learning_rate": 2.53473787083297e-09, "logits/chosen": 2.602858304977417, "logits/rejected": 2.721738338470459, "logps/chosen": -159.94911193847656, "logps/rejected": -359.43524169921875, "loss": 0.9925, "nll_loss": 0.9873400926589966, "rewards/accuracies": 1.0, "rewards/chosen": 2.6603851318359375, "rewards/margins": 8.971649169921875, "rewards/rejected": -6.3112640380859375, "step": 5582 }, { "epoch": 0.9305, "grad_norm": 23.421417236328125, "learning_rate": 2.522675635390492e-09, "logits/chosen": 0.661185085773468, "logits/rejected": 2.0899312496185303, "logps/chosen": -73.62051391601562, "logps/rejected": -724.3186645507812, "loss": 0.7642, "nll_loss": 0.7512296438217163, "rewards/accuracies": 1.0, "rewards/chosen": 1.382239580154419, "rewards/margins": 9.858344078063965, "rewards/rejected": -8.476104736328125, "step": 5583 }, { "epoch": 0.9306666666666666, "grad_norm": 28.489967346191406, "learning_rate": 2.5106418025001353e-09, "logits/chosen": 2.1362416744232178, "logits/rejected": 1.8811019659042358, "logps/chosen": -34.77842712402344, "logps/rejected": -57.03885269165039, "loss": 0.5398, "nll_loss": 0.5114474296569824, "rewards/accuracies": 1.0, "rewards/chosen": 2.1477534770965576, "rewards/margins": 5.750399112701416, "rewards/rejected": -3.6026456356048584, "step": 5584 }, { "epoch": 0.9308333333333333, "grad_norm": 20.60462188720703, "learning_rate": 2.4986363756682615e-09, "logits/chosen": 1.645403265953064, "logits/rejected": 1.9792773723602295, "logps/chosen": -117.86532592773438, "logps/rejected": -369.9393310546875, "loss": 0.9621, "nll_loss": 0.9582546949386597, "rewards/accuracies": 1.0, "rewards/chosen": 2.6259171962738037, "rewards/margins": 11.714119911193848, "rewards/rejected": -9.088202476501465, "step": 5585 }, { "epoch": 0.931, "grad_norm": 127.06410217285156, "learning_rate": 2.486659358392951e-09, "logits/chosen": 2.982355833053589, "logits/rejected": 2.779721260070801, "logps/chosen": -73.25537109375, "logps/rejected": -42.919368743896484, "loss": 1.8236, "nll_loss": 1.285181999206543, "rewards/accuracies": 1.0, "rewards/chosen": -0.33527299761772156, "rewards/margins": 0.747650146484375, "rewards/rejected": -1.082923173904419, "step": 5586 }, { "epoch": 0.9311666666666667, "grad_norm": 139.3586883544922, "learning_rate": 2.4747107541640444e-09, "logits/chosen": 3.0606424808502197, "logits/rejected": 2.98342227935791, "logps/chosen": -38.99992370605469, "logps/rejected": -14.39362907409668, "loss": 3.538, "nll_loss": 0.5342455506324768, "rewards/accuracies": 0.0, "rewards/chosen": 1.466827392578125, "rewards/margins": -2.3576295375823975, "rewards/rejected": 3.8244569301605225, "step": 5587 }, { "epoch": 0.9313333333333333, "grad_norm": 29.298852920532227, "learning_rate": 2.462790566463069e-09, "logits/chosen": 2.8539273738861084, "logits/rejected": 2.9748494625091553, "logps/chosen": -70.25199890136719, "logps/rejected": -167.5128631591797, "loss": 0.8972, "nll_loss": 0.8892660140991211, "rewards/accuracies": 1.0, "rewards/chosen": 2.0896928310394287, "rewards/margins": 8.439498901367188, "rewards/rejected": -6.349806308746338, "step": 5588 }, { "epoch": 0.9315, "grad_norm": 59.67448043823242, "learning_rate": 2.4508987987632677e-09, "logits/chosen": 2.846872329711914, "logits/rejected": 2.8563597202301025, "logps/chosen": -63.99964904785156, "logps/rejected": -12.821722030639648, "loss": 1.3037, "nll_loss": 0.7272686958312988, "rewards/accuracies": 1.0, "rewards/chosen": 2.2629685401916504, "rewards/margins": 1.6049656867980957, "rewards/rejected": 0.6580028533935547, "step": 5589 }, { "epoch": 0.9316666666666666, "grad_norm": 22.112394332885742, "learning_rate": 2.4390354545296254e-09, "logits/chosen": 2.1861519813537598, "logits/rejected": 2.116926431655884, "logps/chosen": -46.676414489746094, "logps/rejected": -126.40223693847656, "loss": 0.5966, "nll_loss": 0.590840756893158, "rewards/accuracies": 1.0, "rewards/chosen": 2.475264072418213, "rewards/margins": 8.843595504760742, "rewards/rejected": -6.368331432342529, "step": 5590 }, { "epoch": 0.9318333333333333, "grad_norm": 102.14813232421875, "learning_rate": 2.427200537218843e-09, "logits/chosen": 1.904224157333374, "logits/rejected": 2.8778038024902344, "logps/chosen": -24.313800811767578, "logps/rejected": -507.5755920410156, "loss": 1.5309, "nll_loss": 1.5196126699447632, "rewards/accuracies": 1.0, "rewards/chosen": 1.5356217622756958, "rewards/margins": 9.533367156982422, "rewards/rejected": -7.997745513916016, "step": 5591 }, { "epoch": 0.932, "grad_norm": 29.852903366088867, "learning_rate": 2.415394050279318e-09, "logits/chosen": 2.645237922668457, "logits/rejected": 2.5843665599823, "logps/chosen": -103.95960998535156, "logps/rejected": -130.03408813476562, "loss": 1.0275, "nll_loss": 1.0192118883132935, "rewards/accuracies": 1.0, "rewards/chosen": 2.248678684234619, "rewards/margins": 7.939846992492676, "rewards/rejected": -5.691168308258057, "step": 5592 }, { "epoch": 0.9321666666666667, "grad_norm": 42.05359649658203, "learning_rate": 2.4036159971511982e-09, "logits/chosen": 2.551669120788574, "logits/rejected": 2.62764835357666, "logps/chosen": -11.102766036987305, "logps/rejected": -272.342529296875, "loss": 0.4632, "nll_loss": 0.4626152217388153, "rewards/accuracies": 1.0, "rewards/chosen": 4.435454845428467, "rewards/margins": 15.831445693969727, "rewards/rejected": -11.395990371704102, "step": 5593 }, { "epoch": 0.9323333333333333, "grad_norm": 17.414892196655273, "learning_rate": 2.3918663812663055e-09, "logits/chosen": 2.5201845169067383, "logits/rejected": 2.4888408184051514, "logps/chosen": -55.936588287353516, "logps/rejected": -263.3619384765625, "loss": 0.5705, "nll_loss": 0.5650160312652588, "rewards/accuracies": 1.0, "rewards/chosen": 2.3076329231262207, "rewards/margins": 10.244138717651367, "rewards/rejected": -7.9365057945251465, "step": 5594 }, { "epoch": 0.9325, "grad_norm": 17.785812377929688, "learning_rate": 2.3801452060482007e-09, "logits/chosen": 2.1750035285949707, "logits/rejected": 1.9589853286743164, "logps/chosen": -10.248408317565918, "logps/rejected": -70.10546875, "loss": 0.2386, "nll_loss": 0.22774238884449005, "rewards/accuracies": 1.0, "rewards/chosen": 2.4736480712890625, "rewards/margins": 7.317137241363525, "rewards/rejected": -4.843489170074463, "step": 5595 }, { "epoch": 0.9326666666666666, "grad_norm": 61.01670455932617, "learning_rate": 2.3684524749121526e-09, "logits/chosen": 1.931125283241272, "logits/rejected": 1.9217814207077026, "logps/chosen": -57.34638977050781, "logps/rejected": -32.54608917236328, "loss": 1.3877, "nll_loss": 1.3033270835876465, "rewards/accuracies": 1.0, "rewards/chosen": 0.5670539736747742, "rewards/margins": 3.574192762374878, "rewards/rejected": -3.007138729095459, "step": 5596 }, { "epoch": 0.9328333333333333, "grad_norm": 19.790943145751953, "learning_rate": 2.3567881912651468e-09, "logits/chosen": 2.4145467281341553, "logits/rejected": 2.5430009365081787, "logps/chosen": -72.11392211914062, "logps/rejected": -212.59561157226562, "loss": 0.7111, "nll_loss": 0.7069992423057556, "rewards/accuracies": 1.0, "rewards/chosen": 2.533656358718872, "rewards/margins": 13.525670051574707, "rewards/rejected": -10.992013931274414, "step": 5597 }, { "epoch": 0.933, "grad_norm": 27.762069702148438, "learning_rate": 2.3451523585058753e-09, "logits/chosen": 3.1838767528533936, "logits/rejected": 3.0430078506469727, "logps/chosen": -105.55877685546875, "logps/rejected": -66.44770812988281, "loss": 0.9896, "nll_loss": 0.9773960709571838, "rewards/accuracies": 1.0, "rewards/chosen": 2.2595138549804688, "rewards/margins": 7.070986270904541, "rewards/rejected": -4.811472415924072, "step": 5598 }, { "epoch": 0.9331666666666667, "grad_norm": 70.01841735839844, "learning_rate": 2.3335449800247265e-09, "logits/chosen": 2.725900173187256, "logits/rejected": 2.7557458877563477, "logps/chosen": -150.53924560546875, "logps/rejected": -179.39700317382812, "loss": 1.6811, "nll_loss": 1.5205984115600586, "rewards/accuracies": 1.0, "rewards/chosen": -1.4598970413208008, "rewards/margins": 4.633338928222656, "rewards/rejected": -6.093235969543457, "step": 5599 }, { "epoch": 0.9333333333333333, "grad_norm": 15.02767276763916, "learning_rate": 2.321966059203828e-09, "logits/chosen": 2.368129253387451, "logits/rejected": 2.3737173080444336, "logps/chosen": -164.705078125, "logps/rejected": -178.72967529296875, "loss": 0.7266, "nll_loss": 0.7192361950874329, "rewards/accuracies": 1.0, "rewards/chosen": 2.1992249488830566, "rewards/margins": 8.473459243774414, "rewards/rejected": -6.274234771728516, "step": 5600 }, { "epoch": 0.9335, "grad_norm": 46.54444885253906, "learning_rate": 2.3104155994170037e-09, "logits/chosen": 1.0446168184280396, "logits/rejected": 1.65053391456604, "logps/chosen": -6.245424270629883, "logps/rejected": -314.0047607421875, "loss": 0.3332, "nll_loss": 0.3287065327167511, "rewards/accuracies": 1.0, "rewards/chosen": 2.5694823265075684, "rewards/margins": 9.856414794921875, "rewards/rejected": -7.286932468414307, "step": 5601 }, { "epoch": 0.9336666666666666, "grad_norm": 201.4200897216797, "learning_rate": 2.2988936040297725e-09, "logits/chosen": 2.513643503189087, "logits/rejected": 2.4877514839172363, "logps/chosen": -137.37896728515625, "logps/rejected": -14.341264724731445, "loss": 3.3868, "nll_loss": 1.2839157581329346, "rewards/accuracies": 0.0, "rewards/chosen": 2.123234510421753, "rewards/margins": -1.1898155212402344, "rewards/rejected": 3.3130500316619873, "step": 5602 }, { "epoch": 0.9338333333333333, "grad_norm": 29.82437515258789, "learning_rate": 2.2874000763993483e-09, "logits/chosen": 1.3578619956970215, "logits/rejected": 2.2896082401275635, "logps/chosen": -93.33271026611328, "logps/rejected": -359.2234802246094, "loss": 1.0694, "nll_loss": 1.0605990886688232, "rewards/accuracies": 1.0, "rewards/chosen": 1.75865638256073, "rewards/margins": 16.396190643310547, "rewards/rejected": -14.637535095214844, "step": 5603 }, { "epoch": 0.934, "grad_norm": 31.88448143005371, "learning_rate": 2.2759350198746975e-09, "logits/chosen": 3.0088672637939453, "logits/rejected": 3.3246493339538574, "logps/chosen": -68.359619140625, "logps/rejected": -388.6431579589844, "loss": 0.8467, "nll_loss": 0.8336538672447205, "rewards/accuracies": 1.0, "rewards/chosen": 1.3664077520370483, "rewards/margins": 10.385490417480469, "rewards/rejected": -9.019083023071289, "step": 5604 }, { "epoch": 0.9341666666666667, "grad_norm": 136.36988830566406, "learning_rate": 2.264498437796458e-09, "logits/chosen": 2.3921921253204346, "logits/rejected": 2.19140887260437, "logps/chosen": -81.20494079589844, "logps/rejected": -85.04702758789062, "loss": 1.6282, "nll_loss": 0.9783726930618286, "rewards/accuracies": 1.0, "rewards/chosen": 0.7703735828399658, "rewards/margins": 0.7507134079933167, "rewards/rejected": 0.019660189747810364, "step": 5605 }, { "epoch": 0.9343333333333333, "grad_norm": 24.182758331298828, "learning_rate": 2.253090333496954e-09, "logits/chosen": 2.7392661571502686, "logits/rejected": 2.7255165576934814, "logps/chosen": -51.84838104248047, "logps/rejected": -71.5255355834961, "loss": 0.6541, "nll_loss": 0.6322973370552063, "rewards/accuracies": 1.0, "rewards/chosen": 1.192450761795044, "rewards/margins": 6.2218427658081055, "rewards/rejected": -5.029391765594482, "step": 5606 }, { "epoch": 0.9345, "grad_norm": 231.0731201171875, "learning_rate": 2.2417107103002597e-09, "logits/chosen": 4.084583759307861, "logits/rejected": 4.19081449508667, "logps/chosen": -17.785083770751953, "logps/rejected": -163.79074096679688, "loss": 2.5783, "nll_loss": 2.5407261848449707, "rewards/accuracies": 1.0, "rewards/chosen": 0.6887056827545166, "rewards/margins": 5.18162727355957, "rewards/rejected": -4.492921352386475, "step": 5607 }, { "epoch": 0.9346666666666666, "grad_norm": 22.298969268798828, "learning_rate": 2.2303595715221e-09, "logits/chosen": 1.9560825824737549, "logits/rejected": 2.2613525390625, "logps/chosen": -12.426324844360352, "logps/rejected": -270.43658447265625, "loss": 0.3016, "nll_loss": 0.29586490988731384, "rewards/accuracies": 1.0, "rewards/chosen": 2.266655683517456, "rewards/margins": 9.899346351623535, "rewards/rejected": -7.6326904296875, "step": 5608 }, { "epoch": 0.9348333333333333, "grad_norm": 22.204015731811523, "learning_rate": 2.219036920469952e-09, "logits/chosen": 1.7027009725570679, "logits/rejected": 1.9458304643630981, "logps/chosen": -80.55824279785156, "logps/rejected": -183.58241271972656, "loss": 0.7229, "nll_loss": 0.7129048109054565, "rewards/accuracies": 1.0, "rewards/chosen": 1.6731247901916504, "rewards/margins": 9.442319869995117, "rewards/rejected": -7.769195556640625, "step": 5609 }, { "epoch": 0.935, "grad_norm": 31.528921127319336, "learning_rate": 2.207742760442943e-09, "logits/chosen": 1.742449164390564, "logits/rejected": 2.7111093997955322, "logps/chosen": -40.40340805053711, "logps/rejected": -223.63217163085938, "loss": 0.6634, "nll_loss": 0.6413239240646362, "rewards/accuracies": 1.0, "rewards/chosen": 0.8509387969970703, "rewards/margins": 7.997476577758789, "rewards/rejected": -7.146537780761719, "step": 5610 }, { "epoch": 0.9351666666666667, "grad_norm": 50.543033599853516, "learning_rate": 2.196477094731919e-09, "logits/chosen": 2.716143846511841, "logits/rejected": 2.9001152515411377, "logps/chosen": -47.61573028564453, "logps/rejected": -334.68658447265625, "loss": 1.5886, "nll_loss": 1.5871909856796265, "rewards/accuracies": 1.0, "rewards/chosen": 3.63749623298645, "rewards/margins": 12.956010818481445, "rewards/rejected": -9.318514823913574, "step": 5611 }, { "epoch": 0.9353333333333333, "grad_norm": 39.14250183105469, "learning_rate": 2.1852399266194312e-09, "logits/chosen": 2.590834379196167, "logits/rejected": 2.7050814628601074, "logps/chosen": -7.605205535888672, "logps/rejected": -229.334716796875, "loss": 0.3358, "nll_loss": 0.3306611180305481, "rewards/accuracies": 1.0, "rewards/chosen": 2.4617199897766113, "rewards/margins": 9.50295639038086, "rewards/rejected": -7.041236877441406, "step": 5612 }, { "epoch": 0.9355, "grad_norm": 27.07150650024414, "learning_rate": 2.174031259379727e-09, "logits/chosen": 2.7855942249298096, "logits/rejected": 2.845989227294922, "logps/chosen": -54.25306701660156, "logps/rejected": -386.9705810546875, "loss": 0.8106, "nll_loss": 0.7978391647338867, "rewards/accuracies": 1.0, "rewards/chosen": 4.585522651672363, "rewards/margins": 8.700475692749023, "rewards/rejected": -4.114953517913818, "step": 5613 }, { "epoch": 0.9356666666666666, "grad_norm": 77.87091064453125, "learning_rate": 2.1628510962787396e-09, "logits/chosen": 1.386804223060608, "logits/rejected": 2.1209733486175537, "logps/chosen": -24.736148834228516, "logps/rejected": -171.0284423828125, "loss": 1.1484, "nll_loss": 1.1243704557418823, "rewards/accuracies": 1.0, "rewards/chosen": 1.3917790651321411, "rewards/margins": 5.808468341827393, "rewards/rejected": -4.416689395904541, "step": 5614 }, { "epoch": 0.9358333333333333, "grad_norm": 37.93643569946289, "learning_rate": 2.151699440574095e-09, "logits/chosen": 3.0751349925994873, "logits/rejected": 3.052419662475586, "logps/chosen": -70.59930419921875, "logps/rejected": -117.24708557128906, "loss": 0.9154, "nll_loss": 0.9051191806793213, "rewards/accuracies": 1.0, "rewards/chosen": 1.6869752407073975, "rewards/margins": 8.71372127532959, "rewards/rejected": -7.026745796203613, "step": 5615 }, { "epoch": 0.936, "grad_norm": 247.06283569335938, "learning_rate": 2.1405762955151174e-09, "logits/chosen": 1.8208788633346558, "logits/rejected": 1.4914604425430298, "logps/chosen": -82.64869689941406, "logps/rejected": -35.241943359375, "loss": 3.535, "nll_loss": 0.7946990728378296, "rewards/accuracies": 0.0, "rewards/chosen": 2.2183456420898438, "rewards/margins": -1.9118027687072754, "rewards/rejected": 4.130148410797119, "step": 5616 }, { "epoch": 0.9361666666666667, "grad_norm": 159.4858856201172, "learning_rate": 2.1294816643428247e-09, "logits/chosen": 2.1927690505981445, "logits/rejected": 1.8255836963653564, "logps/chosen": -102.56306457519531, "logps/rejected": -20.04952049255371, "loss": 2.6538, "nll_loss": 1.2507692575454712, "rewards/accuracies": 0.0, "rewards/chosen": 1.131733775138855, "rewards/margins": -0.5046621561050415, "rewards/rejected": 1.6363959312438965, "step": 5617 }, { "epoch": 0.9363333333333334, "grad_norm": 45.53580856323242, "learning_rate": 2.1184155502899426e-09, "logits/chosen": 2.4546308517456055, "logits/rejected": 2.3841469287872314, "logps/chosen": -56.65940856933594, "logps/rejected": -46.20192337036133, "loss": 0.814, "nll_loss": 0.7082425951957703, "rewards/accuracies": 1.0, "rewards/chosen": 0.20743486285209656, "rewards/margins": 3.197878360748291, "rewards/rejected": -2.990443468093872, "step": 5618 }, { "epoch": 0.9365, "grad_norm": 26.09257698059082, "learning_rate": 2.107377956580847e-09, "logits/chosen": 2.8194496631622314, "logits/rejected": 2.8309388160705566, "logps/chosen": -49.605003356933594, "logps/rejected": -184.33726501464844, "loss": 0.6963, "nll_loss": 0.6795206069946289, "rewards/accuracies": 1.0, "rewards/chosen": 1.2522907257080078, "rewards/margins": 7.277628421783447, "rewards/rejected": -6.0253376960754395, "step": 5619 }, { "epoch": 0.9366666666666666, "grad_norm": 24.035966873168945, "learning_rate": 2.096368886431632e-09, "logits/chosen": 2.3820295333862305, "logits/rejected": 2.5381648540496826, "logps/chosen": -55.03913879394531, "logps/rejected": -155.7166748046875, "loss": 0.7705, "nll_loss": 0.7644326090812683, "rewards/accuracies": 1.0, "rewards/chosen": 2.1668694019317627, "rewards/margins": 10.898822784423828, "rewards/rejected": -8.731953620910645, "step": 5620 }, { "epoch": 0.9368333333333333, "grad_norm": 84.11127471923828, "learning_rate": 2.0853883430500875e-09, "logits/chosen": 1.231092095375061, "logits/rejected": 1.3204691410064697, "logps/chosen": -118.10133361816406, "logps/rejected": -120.50955200195312, "loss": 1.2766, "nll_loss": 0.9924482107162476, "rewards/accuracies": 1.0, "rewards/chosen": 1.9990525245666504, "rewards/margins": 2.631863594055176, "rewards/rejected": -0.6328110694885254, "step": 5621 }, { "epoch": 0.937, "grad_norm": 10.720256805419922, "learning_rate": 2.074436329635687e-09, "logits/chosen": 1.007736086845398, "logits/rejected": 1.4238120317459106, "logps/chosen": -111.72372436523438, "logps/rejected": -298.0126647949219, "loss": 0.4741, "nll_loss": 0.46358397603034973, "rewards/accuracies": 1.0, "rewards/chosen": 4.123971462249756, "rewards/margins": 8.48298454284668, "rewards/rejected": -4.359013557434082, "step": 5622 }, { "epoch": 0.9371666666666667, "grad_norm": 21.514869689941406, "learning_rate": 2.0635128493795563e-09, "logits/chosen": 2.4989898204803467, "logits/rejected": 2.708482265472412, "logps/chosen": -44.626529693603516, "logps/rejected": -318.7800598144531, "loss": 0.524, "nll_loss": 0.5189131498336792, "rewards/accuracies": 1.0, "rewards/chosen": 2.3186047077178955, "rewards/margins": 15.135244369506836, "rewards/rejected": -12.81663990020752, "step": 5623 }, { "epoch": 0.9373333333333334, "grad_norm": 20.09475326538086, "learning_rate": 2.05261790546456e-09, "logits/chosen": 1.3233113288879395, "logits/rejected": 1.5520964860916138, "logps/chosen": -91.00532531738281, "logps/rejected": -116.04667663574219, "loss": 0.7535, "nll_loss": 0.7398806810379028, "rewards/accuracies": 1.0, "rewards/chosen": 2.563666820526123, "rewards/margins": 6.990054130554199, "rewards/rejected": -4.426387310028076, "step": 5624 }, { "epoch": 0.9375, "grad_norm": 47.0626106262207, "learning_rate": 2.041751501065203e-09, "logits/chosen": 2.1479263305664062, "logits/rejected": 2.1811859607696533, "logps/chosen": -9.87751579284668, "logps/rejected": -171.82266235351562, "loss": 0.4573, "nll_loss": 0.4489779770374298, "rewards/accuracies": 1.0, "rewards/chosen": 1.8563638925552368, "rewards/margins": 9.989069938659668, "rewards/rejected": -8.132705688476562, "step": 5625 }, { "epoch": 0.9376666666666666, "grad_norm": 35.54753875732422, "learning_rate": 2.03091363934772e-09, "logits/chosen": 1.2696408033370972, "logits/rejected": 1.8127135038375854, "logps/chosen": -13.74316120147705, "logps/rejected": -455.83685302734375, "loss": 0.4653, "nll_loss": 0.42947375774383545, "rewards/accuracies": 1.0, "rewards/chosen": 0.2883000373840332, "rewards/margins": 11.30655288696289, "rewards/rejected": -11.0182523727417, "step": 5626 }, { "epoch": 0.9378333333333333, "grad_norm": 44.65329360961914, "learning_rate": 2.0201043234699845e-09, "logits/chosen": 3.1013894081115723, "logits/rejected": 3.238374948501587, "logps/chosen": -18.310325622558594, "logps/rejected": -75.39462280273438, "loss": 0.6402, "nll_loss": 0.5548582673072815, "rewards/accuracies": 1.0, "rewards/chosen": 1.5539051294326782, "rewards/margins": 3.9572339057922363, "rewards/rejected": -2.4033286571502686, "step": 5627 }, { "epoch": 0.938, "grad_norm": 35.58600616455078, "learning_rate": 2.0093235565815657e-09, "logits/chosen": 2.776421308517456, "logits/rejected": 2.654578447341919, "logps/chosen": -16.532323837280273, "logps/rejected": -44.6748046875, "loss": 0.4692, "nll_loss": 0.4133080840110779, "rewards/accuracies": 1.0, "rewards/chosen": 0.7787343859672546, "rewards/margins": 4.267911911010742, "rewards/rejected": -3.489177703857422, "step": 5628 }, { "epoch": 0.9381666666666667, "grad_norm": 21.976083755493164, "learning_rate": 1.9985713418237403e-09, "logits/chosen": 2.5438883304595947, "logits/rejected": 2.7242228984832764, "logps/chosen": -82.9049301147461, "logps/rejected": -321.7958068847656, "loss": 0.7611, "nll_loss": 0.7536813020706177, "rewards/accuracies": 1.0, "rewards/chosen": 1.9641761779785156, "rewards/margins": 10.229056358337402, "rewards/rejected": -8.264880180358887, "step": 5629 }, { "epoch": 0.9383333333333334, "grad_norm": 88.9751205444336, "learning_rate": 1.9878476823294464e-09, "logits/chosen": 2.167478561401367, "logits/rejected": 1.4641711711883545, "logps/chosen": -158.7678680419922, "logps/rejected": -44.811859130859375, "loss": 1.643, "nll_loss": 1.4433443546295166, "rewards/accuracies": 1.0, "rewards/chosen": -0.9505691528320312, "rewards/margins": 2.381646156311035, "rewards/rejected": -3.3322153091430664, "step": 5630 }, { "epoch": 0.9385, "grad_norm": 31.321199417114258, "learning_rate": 1.977152581223274e-09, "logits/chosen": 2.665973663330078, "logits/rejected": 2.635504961013794, "logps/chosen": -157.96385192871094, "logps/rejected": -63.82573699951172, "loss": 1.513, "nll_loss": 1.490225076675415, "rewards/accuracies": 1.0, "rewards/chosen": 1.5094223022460938, "rewards/margins": 5.890706539154053, "rewards/rejected": -4.381284236907959, "step": 5631 }, { "epoch": 0.9386666666666666, "grad_norm": 45.14276123046875, "learning_rate": 1.96648604162154e-09, "logits/chosen": 1.3989100456237793, "logits/rejected": 2.392052412033081, "logps/chosen": -25.156028747558594, "logps/rejected": -207.64645385742188, "loss": 0.7528, "nll_loss": 0.7187438011169434, "rewards/accuracies": 1.0, "rewards/chosen": 0.3457428216934204, "rewards/margins": 9.420939445495605, "rewards/rejected": -9.075196266174316, "step": 5632 }, { "epoch": 0.9388333333333333, "grad_norm": 22.915456771850586, "learning_rate": 1.955848066632215e-09, "logits/chosen": 2.0842087268829346, "logits/rejected": 2.144608974456787, "logps/chosen": -73.65272521972656, "logps/rejected": -153.0392608642578, "loss": 0.7844, "nll_loss": 0.7752918601036072, "rewards/accuracies": 1.0, "rewards/chosen": 2.0208756923675537, "rewards/margins": 7.9334211349487305, "rewards/rejected": -5.912545204162598, "step": 5633 }, { "epoch": 0.939, "grad_norm": 37.576446533203125, "learning_rate": 1.945238659354953e-09, "logits/chosen": 2.888289451599121, "logits/rejected": 2.7238869667053223, "logps/chosen": -28.1739559173584, "logps/rejected": -79.78206634521484, "loss": 0.718, "nll_loss": 0.6871697306632996, "rewards/accuracies": 1.0, "rewards/chosen": 2.0239274501800537, "rewards/margins": 5.568889617919922, "rewards/rejected": -3.544962167739868, "step": 5634 }, { "epoch": 0.9391666666666667, "grad_norm": 20.791168212890625, "learning_rate": 1.9346578228810806e-09, "logits/chosen": 0.6613875031471252, "logits/rejected": 1.5638161897659302, "logps/chosen": -52.96455383300781, "logps/rejected": -277.45611572265625, "loss": 0.5132, "nll_loss": 0.5044243335723877, "rewards/accuracies": 1.0, "rewards/chosen": 1.7841309309005737, "rewards/margins": 10.227693557739258, "rewards/rejected": -8.443562507629395, "step": 5635 }, { "epoch": 0.9393333333333334, "grad_norm": 41.31558609008789, "learning_rate": 1.9241055602935874e-09, "logits/chosen": 2.7250869274139404, "logits/rejected": 3.1423041820526123, "logps/chosen": -47.5214958190918, "logps/rejected": -181.4841766357422, "loss": 1.0402, "nll_loss": 1.011095643043518, "rewards/accuracies": 1.0, "rewards/chosen": 0.642825722694397, "rewards/margins": 6.490269660949707, "rewards/rejected": -5.8474440574646, "step": 5636 }, { "epoch": 0.9395, "grad_norm": 25.276105880737305, "learning_rate": 1.9135818746671583e-09, "logits/chosen": 2.5068647861480713, "logits/rejected": 2.5340888500213623, "logps/chosen": -63.04063034057617, "logps/rejected": -78.48793029785156, "loss": 0.7569, "nll_loss": 0.7330306768417358, "rewards/accuracies": 1.0, "rewards/chosen": 1.0780216455459595, "rewards/margins": 6.092584609985352, "rewards/rejected": -5.014563083648682, "step": 5637 }, { "epoch": 0.9396666666666667, "grad_norm": 29.699033737182617, "learning_rate": 1.9030867690681295e-09, "logits/chosen": 0.9445759057998657, "logits/rejected": 2.54721999168396, "logps/chosen": -40.714942932128906, "logps/rejected": -461.86859130859375, "loss": 0.6972, "nll_loss": 0.6785821914672852, "rewards/accuracies": 1.0, "rewards/chosen": 0.9947418570518494, "rewards/margins": 9.779342651367188, "rewards/rejected": -8.784601211547852, "step": 5638 }, { "epoch": 0.9398333333333333, "grad_norm": 21.33260726928711, "learning_rate": 1.892620246554555e-09, "logits/chosen": 1.5164719820022583, "logits/rejected": 1.4443870782852173, "logps/chosen": -44.647438049316406, "logps/rejected": -101.29261779785156, "loss": 0.5409, "nll_loss": 0.5252639651298523, "rewards/accuracies": 1.0, "rewards/chosen": 1.5452064275741577, "rewards/margins": 6.820093154907227, "rewards/rejected": -5.274886608123779, "step": 5639 }, { "epoch": 0.94, "grad_norm": 26.314250946044922, "learning_rate": 1.8821823101760947e-09, "logits/chosen": 1.3709635734558105, "logits/rejected": 2.5365328788757324, "logps/chosen": -11.590376853942871, "logps/rejected": -179.125732421875, "loss": 0.2628, "nll_loss": 0.25756391882896423, "rewards/accuracies": 1.0, "rewards/chosen": 2.3316490650177, "rewards/margins": 10.60730266571045, "rewards/rejected": -8.275653839111328, "step": 5640 }, { "epoch": 0.9401666666666667, "grad_norm": 22.04875946044922, "learning_rate": 1.8717729629741275e-09, "logits/chosen": 2.6400725841522217, "logits/rejected": 3.186739444732666, "logps/chosen": -118.56591796875, "logps/rejected": -182.16616821289062, "loss": 0.862, "nll_loss": 0.8529922962188721, "rewards/accuracies": 1.0, "rewards/chosen": 1.778712511062622, "rewards/margins": 9.45960521697998, "rewards/rejected": -7.6808929443359375, "step": 5641 }, { "epoch": 0.9403333333333334, "grad_norm": 24.10007667541504, "learning_rate": 1.8613922079816825e-09, "logits/chosen": 1.758641242980957, "logits/rejected": 2.390798807144165, "logps/chosen": -20.119293212890625, "logps/rejected": -289.49560546875, "loss": 0.3962, "nll_loss": 0.37257951498031616, "rewards/accuracies": 1.0, "rewards/chosen": 0.7863693833351135, "rewards/margins": 7.681201457977295, "rewards/rejected": -6.894832134246826, "step": 5642 }, { "epoch": 0.9405, "grad_norm": 187.35047912597656, "learning_rate": 1.8510400482234845e-09, "logits/chosen": 2.661421298980713, "logits/rejected": 2.69632887840271, "logps/chosen": -43.256996154785156, "logps/rejected": -14.995325088500977, "loss": 4.2201, "nll_loss": 0.6269130110740662, "rewards/accuracies": 0.0, "rewards/chosen": 0.9134132266044617, "rewards/margins": -3.0850257873535156, "rewards/rejected": 3.998439073562622, "step": 5643 }, { "epoch": 0.9406666666666667, "grad_norm": 23.765254974365234, "learning_rate": 1.8407164867158654e-09, "logits/chosen": 1.5004115104675293, "logits/rejected": 1.4786797761917114, "logps/chosen": -105.7342758178711, "logps/rejected": -132.72857666015625, "loss": 0.9147, "nll_loss": 0.8885233402252197, "rewards/accuracies": 1.0, "rewards/chosen": 0.8199775815010071, "rewards/margins": 6.353979110717773, "rewards/rejected": -5.534001350402832, "step": 5644 }, { "epoch": 0.9408333333333333, "grad_norm": 63.55369567871094, "learning_rate": 1.8304215264668853e-09, "logits/chosen": 2.846583366394043, "logits/rejected": 2.9571869373321533, "logps/chosen": -55.78780746459961, "logps/rejected": -147.8182830810547, "loss": 1.0427, "nll_loss": 0.9962108731269836, "rewards/accuracies": 1.0, "rewards/chosen": 1.5132725238800049, "rewards/margins": 4.738009452819824, "rewards/rejected": -3.2247371673583984, "step": 5645 }, { "epoch": 0.941, "grad_norm": 27.111183166503906, "learning_rate": 1.8201551704762451e-09, "logits/chosen": 2.609161138534546, "logits/rejected": 2.8193652629852295, "logps/chosen": -75.82408142089844, "logps/rejected": -181.46530151367188, "loss": 0.8481, "nll_loss": 0.8424897193908691, "rewards/accuracies": 1.0, "rewards/chosen": 2.564038038253784, "rewards/margins": 8.77299976348877, "rewards/rejected": -6.208961486816406, "step": 5646 }, { "epoch": 0.9411666666666667, "grad_norm": 19.22734832763672, "learning_rate": 1.8099174217353298e-09, "logits/chosen": 0.9766035676002502, "logits/rejected": 1.3263002634048462, "logps/chosen": -187.76373291015625, "logps/rejected": -224.7556915283203, "loss": 0.7528, "nll_loss": 0.7392272353172302, "rewards/accuracies": 1.0, "rewards/chosen": 1.3698577880859375, "rewards/margins": 8.49236011505127, "rewards/rejected": -7.122502326965332, "step": 5647 }, { "epoch": 0.9413333333333334, "grad_norm": 92.75433349609375, "learning_rate": 1.7997082832271415e-09, "logits/chosen": 2.471426010131836, "logits/rejected": 2.57694935798645, "logps/chosen": -39.31245040893555, "logps/rejected": -91.43580627441406, "loss": 1.0733, "nll_loss": 0.6142569780349731, "rewards/accuracies": 1.0, "rewards/chosen": 1.5235470533370972, "rewards/margins": 1.6079113483428955, "rewards/rejected": -0.08436432480812073, "step": 5648 }, { "epoch": 0.9415, "grad_norm": 34.0018196105957, "learning_rate": 1.7895277579264012e-09, "logits/chosen": 2.1712353229522705, "logits/rejected": 2.8677709102630615, "logps/chosen": -73.31718444824219, "logps/rejected": -252.81643676757812, "loss": 0.974, "nll_loss": 0.9521712064743042, "rewards/accuracies": 1.0, "rewards/chosen": 0.8611969947814941, "rewards/margins": 7.966444492340088, "rewards/rejected": -7.105247497558594, "step": 5649 }, { "epoch": 0.9416666666666667, "grad_norm": 21.979707717895508, "learning_rate": 1.7793758487994692e-09, "logits/chosen": 1.7323546409606934, "logits/rejected": 2.1101233959198, "logps/chosen": -26.990318298339844, "logps/rejected": -115.78673553466797, "loss": 0.4082, "nll_loss": 0.3969164788722992, "rewards/accuracies": 1.0, "rewards/chosen": 1.8006759881973267, "rewards/margins": 7.549922943115234, "rewards/rejected": -5.749247074127197, "step": 5650 }, { "epoch": 0.9418333333333333, "grad_norm": 21.676551818847656, "learning_rate": 1.7692525588043682e-09, "logits/chosen": 1.999346375465393, "logits/rejected": 2.4158546924591064, "logps/chosen": -25.69672393798828, "logps/rejected": -111.50723266601562, "loss": 0.441, "nll_loss": 0.4282788038253784, "rewards/accuracies": 1.0, "rewards/chosen": 2.549257755279541, "rewards/margins": 7.085267066955566, "rewards/rejected": -4.536009311676025, "step": 5651 }, { "epoch": 0.942, "grad_norm": 41.926177978515625, "learning_rate": 1.7591578908907723e-09, "logits/chosen": 2.299133539199829, "logits/rejected": 1.893357515335083, "logps/chosen": -91.53269958496094, "logps/rejected": -57.45085144042969, "loss": 1.2285, "nll_loss": 1.1441587209701538, "rewards/accuracies": 1.0, "rewards/chosen": 0.5158111453056335, "rewards/margins": 3.5707576274871826, "rewards/rejected": -3.0549464225769043, "step": 5652 }, { "epoch": 0.9421666666666667, "grad_norm": 27.751747131347656, "learning_rate": 1.7490918480000173e-09, "logits/chosen": 2.5227138996124268, "logits/rejected": 2.342710494995117, "logps/chosen": -93.42060089111328, "logps/rejected": -75.89309692382812, "loss": 0.9116, "nll_loss": 0.8897200226783752, "rewards/accuracies": 1.0, "rewards/chosen": 2.462580919265747, "rewards/margins": 6.293464660644531, "rewards/rejected": -3.830883502960205, "step": 5653 }, { "epoch": 0.9423333333333334, "grad_norm": 27.771892547607422, "learning_rate": 1.7390544330651235e-09, "logits/chosen": 2.5615317821502686, "logits/rejected": 2.508845567703247, "logps/chosen": -11.024374008178711, "logps/rejected": -209.55264282226562, "loss": 0.3696, "nll_loss": 0.3556250333786011, "rewards/accuracies": 1.0, "rewards/chosen": 1.3635371923446655, "rewards/margins": 8.1741361618042, "rewards/rejected": -6.810598850250244, "step": 5654 }, { "epoch": 0.9425, "grad_norm": 51.394935607910156, "learning_rate": 1.729045649010752e-09, "logits/chosen": 3.029785394668579, "logits/rejected": 3.0006182193756104, "logps/chosen": -22.124561309814453, "logps/rejected": -188.3492431640625, "loss": 0.7912, "nll_loss": 0.7901628613471985, "rewards/accuracies": 1.0, "rewards/chosen": 4.2200236320495605, "rewards/margins": 12.18310546875, "rewards/rejected": -7.963081359863281, "step": 5655 }, { "epoch": 0.9426666666666667, "grad_norm": 63.10824203491211, "learning_rate": 1.7190654987532361e-09, "logits/chosen": 1.6568669080734253, "logits/rejected": 1.1251215934753418, "logps/chosen": -61.41748046875, "logps/rejected": -19.53753089904785, "loss": 1.3216, "nll_loss": 0.5634631514549255, "rewards/accuracies": 1.0, "rewards/chosen": 2.781947612762451, "rewards/margins": 1.3407950401306152, "rewards/rejected": 1.441152572631836, "step": 5656 }, { "epoch": 0.9428333333333333, "grad_norm": 56.53741455078125, "learning_rate": 1.7091139852005055e-09, "logits/chosen": 2.456026077270508, "logits/rejected": 2.5472235679626465, "logps/chosen": -9.843287467956543, "logps/rejected": -29.163875579833984, "loss": 0.8183, "nll_loss": 0.3175254166126251, "rewards/accuracies": 1.0, "rewards/chosen": 1.3449616432189941, "rewards/margins": 1.384680986404419, "rewards/rejected": -0.03971939533948898, "step": 5657 }, { "epoch": 0.943, "grad_norm": 28.97431755065918, "learning_rate": 1.6991911112522406e-09, "logits/chosen": 2.670872688293457, "logits/rejected": 3.0094544887542725, "logps/chosen": -85.28883361816406, "logps/rejected": -351.752197265625, "loss": 1.1282, "nll_loss": 1.107647180557251, "rewards/accuracies": 1.0, "rewards/chosen": 0.8855392932891846, "rewards/margins": 9.824409484863281, "rewards/rejected": -8.938870429992676, "step": 5658 }, { "epoch": 0.9431666666666667, "grad_norm": 29.06032371520996, "learning_rate": 1.6892968797996954e-09, "logits/chosen": 2.8530101776123047, "logits/rejected": 3.1243903636932373, "logps/chosen": -10.25149917602539, "logps/rejected": -383.41656494140625, "loss": 0.3455, "nll_loss": 0.32035940885543823, "rewards/accuracies": 1.0, "rewards/chosen": 0.6794148087501526, "rewards/margins": 8.776289939880371, "rewards/rejected": -8.096875190734863, "step": 5659 }, { "epoch": 0.9433333333333334, "grad_norm": 28.952259063720703, "learning_rate": 1.6794312937258415e-09, "logits/chosen": 0.7562775611877441, "logits/rejected": 2.941049814224243, "logps/chosen": -8.618501663208008, "logps/rejected": -239.24591064453125, "loss": 0.3292, "nll_loss": 0.3078036308288574, "rewards/accuracies": 1.0, "rewards/chosen": 0.8555521965026855, "rewards/margins": 8.852983474731445, "rewards/rejected": -7.997430801391602, "step": 5660 }, { "epoch": 0.9435, "grad_norm": 30.269723892211914, "learning_rate": 1.669594355905246e-09, "logits/chosen": 3.2816925048828125, "logits/rejected": 3.102994918823242, "logps/chosen": -97.24313354492188, "logps/rejected": -87.61199188232422, "loss": 1.1513, "nll_loss": 1.1307340860366821, "rewards/accuracies": 1.0, "rewards/chosen": 1.0517562627792358, "rewards/margins": 6.84621524810791, "rewards/rejected": -5.794458866119385, "step": 5661 }, { "epoch": 0.9436666666666667, "grad_norm": 68.32671356201172, "learning_rate": 1.659786069204161e-09, "logits/chosen": 2.742736339569092, "logits/rejected": 2.7222397327423096, "logps/chosen": -149.81695556640625, "logps/rejected": -182.0425262451172, "loss": 1.663, "nll_loss": 1.5133026838302612, "rewards/accuracies": 1.0, "rewards/chosen": -1.3876678943634033, "rewards/margins": 4.970120429992676, "rewards/rejected": -6.357788562774658, "step": 5662 }, { "epoch": 0.9438333333333333, "grad_norm": 23.967453002929688, "learning_rate": 1.6500064364805e-09, "logits/chosen": 2.351820945739746, "logits/rejected": 2.6696853637695312, "logps/chosen": -80.34127044677734, "logps/rejected": -381.00115966796875, "loss": 0.973, "nll_loss": 0.9679670929908752, "rewards/accuracies": 1.0, "rewards/chosen": 2.4175987243652344, "rewards/margins": 10.024538040161133, "rewards/rejected": -7.606939792633057, "step": 5663 }, { "epoch": 0.944, "grad_norm": 23.164762496948242, "learning_rate": 1.640255460583817e-09, "logits/chosen": 2.3667242527008057, "logits/rejected": 2.4937009811401367, "logps/chosen": -114.99104309082031, "logps/rejected": -131.69798278808594, "loss": 0.8401, "nll_loss": 0.8155392408370972, "rewards/accuracies": 1.0, "rewards/chosen": 1.8061158657073975, "rewards/margins": 5.79545783996582, "rewards/rejected": -3.9893417358398438, "step": 5664 }, { "epoch": 0.9441666666666667, "grad_norm": 41.97072219848633, "learning_rate": 1.6305331443552839e-09, "logits/chosen": 2.736757755279541, "logits/rejected": 2.7767937183380127, "logps/chosen": -68.42137908935547, "logps/rejected": -206.85403442382812, "loss": 1.1439, "nll_loss": 1.1403563022613525, "rewards/accuracies": 1.0, "rewards/chosen": 3.2544593811035156, "rewards/margins": 9.470651626586914, "rewards/rejected": -6.21619176864624, "step": 5665 }, { "epoch": 0.9443333333333334, "grad_norm": 51.20109939575195, "learning_rate": 1.620839490627779e-09, "logits/chosen": 2.1671905517578125, "logits/rejected": 2.3777620792388916, "logps/chosen": -31.07082748413086, "logps/rejected": -161.021240234375, "loss": 0.8893, "nll_loss": 0.8877379298210144, "rewards/accuracies": 1.0, "rewards/chosen": 4.036635398864746, "rewards/margins": 11.157341003417969, "rewards/rejected": -7.120705604553223, "step": 5666 }, { "epoch": 0.9445, "grad_norm": 21.745025634765625, "learning_rate": 1.6111745022257872e-09, "logits/chosen": 3.2788219451904297, "logits/rejected": 3.632817506790161, "logps/chosen": -56.54298400878906, "logps/rejected": -466.7769775390625, "loss": 0.6747, "nll_loss": 0.6574766039848328, "rewards/accuracies": 1.0, "rewards/chosen": 1.0613059997558594, "rewards/margins": 14.593420028686523, "rewards/rejected": -13.532114028930664, "step": 5667 }, { "epoch": 0.9446666666666667, "grad_norm": 31.049579620361328, "learning_rate": 1.6015381819654562e-09, "logits/chosen": 2.0115787982940674, "logits/rejected": 1.7760200500488281, "logps/chosen": -100.62051391601562, "logps/rejected": -155.76426696777344, "loss": 0.9997, "nll_loss": 0.9864756464958191, "rewards/accuracies": 1.0, "rewards/chosen": 2.10495924949646, "rewards/margins": 6.907452583312988, "rewards/rejected": -4.802493572235107, "step": 5668 }, { "epoch": 0.9448333333333333, "grad_norm": 27.595285415649414, "learning_rate": 1.591930532654573e-09, "logits/chosen": 1.711650013923645, "logits/rejected": 2.164158344268799, "logps/chosen": -66.55976867675781, "logps/rejected": -212.30587768554688, "loss": 0.8231, "nll_loss": 0.801925003528595, "rewards/accuracies": 1.0, "rewards/chosen": 1.081939697265625, "rewards/margins": 6.563108921051025, "rewards/rejected": -5.4811692237854, "step": 5669 }, { "epoch": 0.945, "grad_norm": 131.8148956298828, "learning_rate": 1.582351557092576e-09, "logits/chosen": 2.394502639770508, "logits/rejected": 2.4905316829681396, "logps/chosen": -35.1737174987793, "logps/rejected": -120.9613265991211, "loss": 1.3227, "nll_loss": 0.9256242513656616, "rewards/accuracies": 1.0, "rewards/chosen": 0.2315235137939453, "rewards/margins": 1.3059139251708984, "rewards/rejected": -1.0743904113769531, "step": 5670 }, { "epoch": 0.9451666666666667, "grad_norm": 35.860260009765625, "learning_rate": 1.5728012580705551e-09, "logits/chosen": 2.6229469776153564, "logits/rejected": 2.81963849067688, "logps/chosen": -51.259925842285156, "logps/rejected": -320.8565368652344, "loss": 0.7719, "nll_loss": 0.7538225054740906, "rewards/accuracies": 1.0, "rewards/chosen": 1.046168565750122, "rewards/margins": 8.517268180847168, "rewards/rejected": -7.471099853515625, "step": 5671 }, { "epoch": 0.9453333333333334, "grad_norm": 22.646928787231445, "learning_rate": 1.563279638371251e-09, "logits/chosen": 2.206648111343384, "logits/rejected": 2.1732442378997803, "logps/chosen": -85.53225708007812, "logps/rejected": -168.352783203125, "loss": 0.9387, "nll_loss": 0.9296983480453491, "rewards/accuracies": 1.0, "rewards/chosen": 1.7610024213790894, "rewards/margins": 10.02096176147461, "rewards/rejected": -8.25995922088623, "step": 5672 }, { "epoch": 0.9455, "grad_norm": 58.84373474121094, "learning_rate": 1.5537867007690108e-09, "logits/chosen": 2.0757601261138916, "logits/rejected": 2.507444143295288, "logps/chosen": -14.281014442443848, "logps/rejected": -358.5753173828125, "loss": 0.6266, "nll_loss": 0.620913565158844, "rewards/accuracies": 1.0, "rewards/chosen": 2.778801679611206, "rewards/margins": 8.576539993286133, "rewards/rejected": -5.797738552093506, "step": 5673 }, { "epoch": 0.9456666666666667, "grad_norm": 35.79491424560547, "learning_rate": 1.5443224480298556e-09, "logits/chosen": 2.343777656555176, "logits/rejected": 2.3073348999023438, "logps/chosen": -9.51370620727539, "logps/rejected": -83.56838989257812, "loss": 0.379, "nll_loss": 0.35235944390296936, "rewards/accuracies": 1.0, "rewards/chosen": 1.148760199546814, "rewards/margins": 5.6828203201293945, "rewards/rejected": -4.534060001373291, "step": 5674 }, { "epoch": 0.9458333333333333, "grad_norm": 68.95648956298828, "learning_rate": 1.5348868829114458e-09, "logits/chosen": 2.667854070663452, "logits/rejected": 2.674288749694824, "logps/chosen": -3.1994025707244873, "logps/rejected": -82.51966857910156, "loss": 0.3656, "nll_loss": 0.26661691069602966, "rewards/accuracies": 1.0, "rewards/chosen": 0.9824388027191162, "rewards/margins": 3.4648008346557617, "rewards/rejected": -2.4823620319366455, "step": 5675 }, { "epoch": 0.946, "grad_norm": 28.486886978149414, "learning_rate": 1.5254800081630825e-09, "logits/chosen": 1.8923200368881226, "logits/rejected": 2.293818235397339, "logps/chosen": -93.28096771240234, "logps/rejected": -253.5218505859375, "loss": 1.0188, "nll_loss": 1.0139237642288208, "rewards/accuracies": 1.0, "rewards/chosen": 2.3644492626190186, "rewards/margins": 12.345866203308105, "rewards/rejected": -9.981416702270508, "step": 5676 }, { "epoch": 0.9461666666666667, "grad_norm": 25.17101287841797, "learning_rate": 1.5161018265256953e-09, "logits/chosen": 1.9540952444076538, "logits/rejected": 2.0826380252838135, "logps/chosen": -49.28825759887695, "logps/rejected": -167.32501220703125, "loss": 0.5898, "nll_loss": 0.5731191635131836, "rewards/accuracies": 1.0, "rewards/chosen": 1.3885456323623657, "rewards/margins": 6.843476295471191, "rewards/rejected": -5.454930782318115, "step": 5677 }, { "epoch": 0.9463333333333334, "grad_norm": 79.33305358886719, "learning_rate": 1.5067523407318649e-09, "logits/chosen": 0.8975378274917603, "logits/rejected": 3.3371829986572266, "logps/chosen": -12.934038162231445, "logps/rejected": -76.47035217285156, "loss": 1.1055, "nll_loss": 1.0778363943099976, "rewards/accuracies": 1.0, "rewards/chosen": 1.3135432004928589, "rewards/margins": 5.530212879180908, "rewards/rejected": -4.21666955947876, "step": 5678 }, { "epoch": 0.9465, "grad_norm": 49.454280853271484, "learning_rate": 1.4974315535058013e-09, "logits/chosen": 1.7032968997955322, "logits/rejected": 2.7570993900299072, "logps/chosen": -48.69583511352539, "logps/rejected": -316.6239013671875, "loss": 1.1657, "nll_loss": 1.1594245433807373, "rewards/accuracies": 1.0, "rewards/chosen": 2.2788937091827393, "rewards/margins": 8.968972206115723, "rewards/rejected": -6.6900787353515625, "step": 5679 }, { "epoch": 0.9466666666666667, "grad_norm": 27.43219566345215, "learning_rate": 1.4881394675633541e-09, "logits/chosen": 2.3167879581451416, "logits/rejected": 2.3854033946990967, "logps/chosen": -101.08261108398438, "logps/rejected": -357.9275817871094, "loss": 1.107, "nll_loss": 1.098724126815796, "rewards/accuracies": 1.0, "rewards/chosen": 1.9039833545684814, "rewards/margins": 9.08259391784668, "rewards/rejected": -7.178610801696777, "step": 5680 }, { "epoch": 0.9468333333333333, "grad_norm": 35.13741683959961, "learning_rate": 1.4788760856120464e-09, "logits/chosen": 2.0129270553588867, "logits/rejected": 2.0845329761505127, "logps/chosen": -128.24317932128906, "logps/rejected": -396.5902404785156, "loss": 1.0503, "nll_loss": 1.0342191457748413, "rewards/accuracies": 1.0, "rewards/chosen": 1.2269134521484375, "rewards/margins": 7.788074016571045, "rewards/rejected": -6.561160564422607, "step": 5681 }, { "epoch": 0.947, "grad_norm": 35.38927459716797, "learning_rate": 1.4696414103509636e-09, "logits/chosen": 2.6355865001678467, "logits/rejected": 2.7484257221221924, "logps/chosen": -10.890007972717285, "logps/rejected": -73.25800323486328, "loss": 0.4229, "nll_loss": 0.4033336341381073, "rewards/accuracies": 1.0, "rewards/chosen": 1.0471103191375732, "rewards/margins": 7.256004333496094, "rewards/rejected": -6.2088942527771, "step": 5682 }, { "epoch": 0.9471666666666667, "grad_norm": 88.12586975097656, "learning_rate": 1.4604354444708865e-09, "logits/chosen": 1.8053598403930664, "logits/rejected": 1.85418701171875, "logps/chosen": -79.42277526855469, "logps/rejected": -187.64508056640625, "loss": 1.7904, "nll_loss": 1.4985426664352417, "rewards/accuracies": 1.0, "rewards/chosen": -2.4978699684143066, "rewards/margins": 4.79367208480835, "rewards/rejected": -7.291542053222656, "step": 5683 }, { "epoch": 0.9473333333333334, "grad_norm": 14.591038703918457, "learning_rate": 1.4512581906542143e-09, "logits/chosen": 2.2671852111816406, "logits/rejected": 1.7856281995773315, "logps/chosen": -228.58404541015625, "logps/rejected": -167.83380126953125, "loss": 0.8054, "nll_loss": 0.7882208824157715, "rewards/accuracies": 1.0, "rewards/chosen": 2.366037130355835, "rewards/margins": 6.569350242614746, "rewards/rejected": -4.203312873840332, "step": 5684 }, { "epoch": 0.9475, "grad_norm": 50.3982048034668, "learning_rate": 1.4421096515749854e-09, "logits/chosen": 1.9611276388168335, "logits/rejected": 2.6171629428863525, "logps/chosen": -18.877899169921875, "logps/rejected": -106.33895874023438, "loss": 0.7262, "nll_loss": 0.6742107272148132, "rewards/accuracies": 1.0, "rewards/chosen": 0.25111123919487, "rewards/margins": 4.756895065307617, "rewards/rejected": -4.505784034729004, "step": 5685 }, { "epoch": 0.9476666666666667, "grad_norm": 27.821975708007812, "learning_rate": 1.4329898298988452e-09, "logits/chosen": 4.079837322235107, "logits/rejected": 4.040534496307373, "logps/chosen": -90.03924560546875, "logps/rejected": -132.4799041748047, "loss": 1.1168, "nll_loss": 1.0980396270751953, "rewards/accuracies": 1.0, "rewards/chosen": 1.4667541980743408, "rewards/margins": 6.350795745849609, "rewards/rejected": -4.884041786193848, "step": 5686 }, { "epoch": 0.9478333333333333, "grad_norm": 30.546009063720703, "learning_rate": 1.4238987282831016e-09, "logits/chosen": 3.5768425464630127, "logits/rejected": 3.4944164752960205, "logps/chosen": -88.42012023925781, "logps/rejected": -141.479736328125, "loss": 1.0692, "nll_loss": 1.0653026103973389, "rewards/accuracies": 1.0, "rewards/chosen": 3.3759615421295166, "rewards/margins": 9.310403823852539, "rewards/rejected": -5.934442520141602, "step": 5687 }, { "epoch": 0.948, "grad_norm": 273.57373046875, "learning_rate": 1.4148363493766802e-09, "logits/chosen": 1.9112865924835205, "logits/rejected": 2.1701958179473877, "logps/chosen": -71.23368835449219, "logps/rejected": -60.85607147216797, "loss": 4.0454, "nll_loss": 0.848020076751709, "rewards/accuracies": 0.0, "rewards/chosen": 1.3714646100997925, "rewards/margins": -2.5812087059020996, "rewards/rejected": 3.9526734352111816, "step": 5688 }, { "epoch": 0.9481666666666667, "grad_norm": 77.8356704711914, "learning_rate": 1.4058026958201462e-09, "logits/chosen": 0.9507861137390137, "logits/rejected": 1.8715863227844238, "logps/chosen": -34.04789733886719, "logps/rejected": -311.0260314941406, "loss": 1.2193, "nll_loss": 1.2159961462020874, "rewards/accuracies": 1.0, "rewards/chosen": 2.833219528198242, "rewards/margins": 10.99535083770752, "rewards/rejected": -8.162131309509277, "step": 5689 }, { "epoch": 0.9483333333333334, "grad_norm": 28.175479888916016, "learning_rate": 1.3967977702456945e-09, "logits/chosen": 2.19454288482666, "logits/rejected": 2.2100343704223633, "logps/chosen": -29.51021957397461, "logps/rejected": -95.6938705444336, "loss": 0.5346, "nll_loss": 0.5001731514930725, "rewards/accuracies": 1.0, "rewards/chosen": 1.540797472000122, "rewards/margins": 5.189602851867676, "rewards/rejected": -3.6488051414489746, "step": 5690 }, { "epoch": 0.9485, "grad_norm": 18.411760330200195, "learning_rate": 1.3878215752771261e-09, "logits/chosen": 1.6544827222824097, "logits/rejected": 1.622529149055481, "logps/chosen": -166.35964965820312, "logps/rejected": -155.30384826660156, "loss": 1.0116, "nll_loss": 0.9961656332015991, "rewards/accuracies": 1.0, "rewards/chosen": 2.2121078968048096, "rewards/margins": 6.657493591308594, "rewards/rejected": -4.445385932922363, "step": 5691 }, { "epoch": 0.9486666666666667, "grad_norm": 41.663604736328125, "learning_rate": 1.378874113529893e-09, "logits/chosen": 2.050581932067871, "logits/rejected": 1.950472116470337, "logps/chosen": -16.683313369750977, "logps/rejected": -87.51606750488281, "loss": 0.5275, "nll_loss": 0.42777732014656067, "rewards/accuracies": 1.0, "rewards/chosen": 0.886422336101532, "rewards/margins": 3.416512966156006, "rewards/rejected": -2.530090570449829, "step": 5692 }, { "epoch": 0.9488333333333333, "grad_norm": 18.327787399291992, "learning_rate": 1.3699553876110882e-09, "logits/chosen": 2.8265926837921143, "logits/rejected": 2.9785752296447754, "logps/chosen": -77.29397583007812, "logps/rejected": -98.09208679199219, "loss": 0.6513, "nll_loss": 0.6441164612770081, "rewards/accuracies": 1.0, "rewards/chosen": 2.3444504737854004, "rewards/margins": 8.271711349487305, "rewards/rejected": -5.927260398864746, "step": 5693 }, { "epoch": 0.949, "grad_norm": 102.63368225097656, "learning_rate": 1.3610654001193989e-09, "logits/chosen": 1.7178832292556763, "logits/rejected": 1.9882924556732178, "logps/chosen": -245.67770385742188, "logps/rejected": -311.8609619140625, "loss": 1.4066, "nll_loss": 1.247094988822937, "rewards/accuracies": 1.0, "rewards/chosen": -1.4708251953125, "rewards/margins": 4.758613586425781, "rewards/rejected": -6.229438781738281, "step": 5694 }, { "epoch": 0.9491666666666667, "grad_norm": 55.17800521850586, "learning_rate": 1.3522041536451645e-09, "logits/chosen": 1.8250693082809448, "logits/rejected": 1.714357852935791, "logps/chosen": -58.081321716308594, "logps/rejected": -72.34073638916016, "loss": 1.3501, "nll_loss": 1.2906960248947144, "rewards/accuracies": 1.0, "rewards/chosen": 0.5481918454170227, "rewards/margins": 4.182976245880127, "rewards/rejected": -3.634784460067749, "step": 5695 }, { "epoch": 0.9493333333333334, "grad_norm": 210.23948669433594, "learning_rate": 1.3433716507703196e-09, "logits/chosen": 1.9758044481277466, "logits/rejected": 2.378028631210327, "logps/chosen": -10.456541061401367, "logps/rejected": -58.0307502746582, "loss": 3.9248, "nll_loss": 0.43568921089172363, "rewards/accuracies": 0.0, "rewards/chosen": 1.3642834424972534, "rewards/margins": -2.889599323272705, "rewards/rejected": 4.253882884979248, "step": 5696 }, { "epoch": 0.9495, "grad_norm": 23.003433227539062, "learning_rate": 1.3345678940684614e-09, "logits/chosen": 3.6305062770843506, "logits/rejected": 3.8175973892211914, "logps/chosen": -81.42264556884766, "logps/rejected": -370.99981689453125, "loss": 0.86, "nll_loss": 0.8481523990631104, "rewards/accuracies": 1.0, "rewards/chosen": 1.4804086685180664, "rewards/margins": 9.227609634399414, "rewards/rejected": -7.747201442718506, "step": 5697 }, { "epoch": 0.9496666666666667, "grad_norm": 29.50568389892578, "learning_rate": 1.325792886104793e-09, "logits/chosen": 3.0567660331726074, "logits/rejected": 3.2289040088653564, "logps/chosen": -105.28643798828125, "logps/rejected": -177.51763916015625, "loss": 1.0361, "nll_loss": 1.0221985578536987, "rewards/accuracies": 1.0, "rewards/chosen": 1.34380042552948, "rewards/margins": 8.49763298034668, "rewards/rejected": -7.153832912445068, "step": 5698 }, { "epoch": 0.9498333333333333, "grad_norm": 26.15875816345215, "learning_rate": 1.317046629436136e-09, "logits/chosen": 1.9863783121109009, "logits/rejected": 2.4460411071777344, "logps/chosen": -63.820770263671875, "logps/rejected": -385.65399169921875, "loss": 0.7385, "nll_loss": 0.7335719466209412, "rewards/accuracies": 1.0, "rewards/chosen": 2.3874878883361816, "rewards/margins": 10.83245849609375, "rewards/rejected": -8.444971084594727, "step": 5699 }, { "epoch": 0.95, "grad_norm": 34.94718933105469, "learning_rate": 1.3083291266109298e-09, "logits/chosen": 1.8646272420883179, "logits/rejected": 1.798350214958191, "logps/chosen": -78.20451354980469, "logps/rejected": -78.09254455566406, "loss": 1.0736, "nll_loss": 1.0568177700042725, "rewards/accuracies": 1.0, "rewards/chosen": 1.1353874206542969, "rewards/margins": 8.385276794433594, "rewards/rejected": -7.249888896942139, "step": 5700 }, { "epoch": 0.9501666666666667, "grad_norm": 28.344194412231445, "learning_rate": 1.2996403801692646e-09, "logits/chosen": 2.4789905548095703, "logits/rejected": 2.639676809310913, "logps/chosen": -50.700653076171875, "logps/rejected": -329.384765625, "loss": 0.7276, "nll_loss": 0.7242949604988098, "rewards/accuracies": 1.0, "rewards/chosen": 2.732295513153076, "rewards/margins": 17.731403350830078, "rewards/rejected": -14.999107360839844, "step": 5701 }, { "epoch": 0.9503333333333334, "grad_norm": 20.258848190307617, "learning_rate": 1.2909803926428264e-09, "logits/chosen": 2.988252639770508, "logits/rejected": 3.223229169845581, "logps/chosen": -51.4454345703125, "logps/rejected": -180.4755859375, "loss": 0.5875, "nll_loss": 0.5716160535812378, "rewards/accuracies": 1.0, "rewards/chosen": 1.5639970302581787, "rewards/margins": 6.7240400314331055, "rewards/rejected": -5.160043239593506, "step": 5702 }, { "epoch": 0.9505, "grad_norm": 27.399776458740234, "learning_rate": 1.2823491665549191e-09, "logits/chosen": 2.310678720474243, "logits/rejected": 2.660282850265503, "logps/chosen": -93.65132141113281, "logps/rejected": -276.3803405761719, "loss": 1.104, "nll_loss": 1.0889686346054077, "rewards/accuracies": 1.0, "rewards/chosen": 1.306086778640747, "rewards/margins": 7.835846900939941, "rewards/rejected": -6.529759883880615, "step": 5703 }, { "epoch": 0.9506666666666667, "grad_norm": 48.16057586669922, "learning_rate": 1.2737467044204863e-09, "logits/chosen": 2.3665993213653564, "logits/rejected": 2.6810569763183594, "logps/chosen": -15.254467010498047, "logps/rejected": -312.37017822265625, "loss": 0.6384, "nll_loss": 0.6356028318405151, "rewards/accuracies": 1.0, "rewards/chosen": 3.178936004638672, "rewards/margins": 10.35280990600586, "rewards/rejected": -7.1738739013671875, "step": 5704 }, { "epoch": 0.9508333333333333, "grad_norm": 31.877666473388672, "learning_rate": 1.2651730087460677e-09, "logits/chosen": 2.6212880611419678, "logits/rejected": 2.720689296722412, "logps/chosen": -90.8598861694336, "logps/rejected": -204.28750610351562, "loss": 1.191, "nll_loss": 1.179998517036438, "rewards/accuracies": 1.0, "rewards/chosen": 1.5233712196350098, "rewards/margins": 12.080221176147461, "rewards/rejected": -10.556849479675293, "step": 5705 }, { "epoch": 0.951, "grad_norm": 33.36829376220703, "learning_rate": 1.2566280820298425e-09, "logits/chosen": 1.5239827632904053, "logits/rejected": 1.7437702417373657, "logps/chosen": -73.21279907226562, "logps/rejected": -111.04878234863281, "loss": 0.9882, "nll_loss": 0.9633262753486633, "rewards/accuracies": 1.0, "rewards/chosen": 0.7290886640548706, "rewards/margins": 7.501363277435303, "rewards/rejected": -6.772274494171143, "step": 5706 }, { "epoch": 0.9511666666666667, "grad_norm": 38.075984954833984, "learning_rate": 1.2481119267615859e-09, "logits/chosen": 0.34126678109169006, "logits/rejected": 2.4984614849090576, "logps/chosen": -12.120635032653809, "logps/rejected": -399.73162841796875, "loss": 0.4134, "nll_loss": 0.39098823070526123, "rewards/accuracies": 1.0, "rewards/chosen": 0.808781623840332, "rewards/margins": 8.396641731262207, "rewards/rejected": -7.587860107421875, "step": 5707 }, { "epoch": 0.9513333333333334, "grad_norm": 69.43190002441406, "learning_rate": 1.2396245454227129e-09, "logits/chosen": 2.5501675605773926, "logits/rejected": 2.733487129211426, "logps/chosen": -12.080988883972168, "logps/rejected": -323.9754943847656, "loss": 0.6779, "nll_loss": 0.6711661219596863, "rewards/accuracies": 1.0, "rewards/chosen": 2.144819974899292, "rewards/margins": 9.12469482421875, "rewards/rejected": -6.979875087738037, "step": 5708 }, { "epoch": 0.9515, "grad_norm": 20.485849380493164, "learning_rate": 1.231165940486234e-09, "logits/chosen": 0.5627833604812622, "logits/rejected": 1.6120309829711914, "logps/chosen": -6.9141154289245605, "logps/rejected": -287.5421142578125, "loss": 0.2026, "nll_loss": 0.18686798214912415, "rewards/accuracies": 1.0, "rewards/chosen": 1.1580890417099, "rewards/margins": 9.990803718566895, "rewards/rejected": -8.832715034484863, "step": 5709 }, { "epoch": 0.9516666666666667, "grad_norm": 52.092689514160156, "learning_rate": 1.222736114416789e-09, "logits/chosen": 2.1520488262176514, "logits/rejected": 2.5528786182403564, "logps/chosen": -23.809194564819336, "logps/rejected": -231.7019500732422, "loss": 0.9372, "nll_loss": 0.9157383441925049, "rewards/accuracies": 1.0, "rewards/chosen": 1.208626389503479, "rewards/margins": 6.238018035888672, "rewards/rejected": -5.029391765594482, "step": 5710 }, { "epoch": 0.9518333333333333, "grad_norm": 20.476913452148438, "learning_rate": 1.2143350696706245e-09, "logits/chosen": 2.5871875286102295, "logits/rejected": 2.75860333442688, "logps/chosen": -49.752784729003906, "logps/rejected": -335.3062744140625, "loss": 0.5794, "nll_loss": 0.5590200424194336, "rewards/accuracies": 1.0, "rewards/chosen": 1.6864793300628662, "rewards/margins": 6.091026306152344, "rewards/rejected": -4.404547214508057, "step": 5711 }, { "epoch": 0.952, "grad_norm": 22.625146865844727, "learning_rate": 1.2059628086956042e-09, "logits/chosen": 2.6737024784088135, "logits/rejected": 2.9545114040374756, "logps/chosen": -105.97982788085938, "logps/rejected": -481.4143371582031, "loss": 0.9648, "nll_loss": 0.9547733068466187, "rewards/accuracies": 1.0, "rewards/chosen": 1.6821945905685425, "rewards/margins": 8.887923240661621, "rewards/rejected": -7.205728530883789, "step": 5712 }, { "epoch": 0.9521666666666667, "grad_norm": 17.64583396911621, "learning_rate": 1.197619333931199e-09, "logits/chosen": 2.6650006771087646, "logits/rejected": 2.595108985900879, "logps/chosen": -221.25657653808594, "logps/rejected": -105.1841812133789, "loss": 0.8624, "nll_loss": 0.8194687962532043, "rewards/accuracies": 1.0, "rewards/chosen": 2.6001479625701904, "rewards/margins": 5.572369575500488, "rewards/rejected": -2.9722213745117188, "step": 5713 }, { "epoch": 0.9523333333333334, "grad_norm": 36.956275939941406, "learning_rate": 1.1893046478085089e-09, "logits/chosen": 2.596224308013916, "logits/rejected": 2.6631503105163574, "logps/chosen": -23.348731994628906, "logps/rejected": -172.513671875, "loss": 0.6667, "nll_loss": 0.6485759615898132, "rewards/accuracies": 1.0, "rewards/chosen": 1.2250083684921265, "rewards/margins": 6.88588809967041, "rewards/rejected": -5.660879611968994, "step": 5714 }, { "epoch": 0.9525, "grad_norm": 25.351322174072266, "learning_rate": 1.181018752750218e-09, "logits/chosen": 4.133666038513184, "logits/rejected": 4.136792182922363, "logps/chosen": -84.799560546875, "logps/rejected": -169.21080017089844, "loss": 0.9352, "nll_loss": 0.9318631291389465, "rewards/accuracies": 1.0, "rewards/chosen": 3.10300612449646, "rewards/margins": 9.8024320602417, "rewards/rejected": -6.699426174163818, "step": 5715 }, { "epoch": 0.9526666666666667, "grad_norm": 53.778831481933594, "learning_rate": 1.1727616511706507e-09, "logits/chosen": 2.5222606658935547, "logits/rejected": 2.8363876342773438, "logps/chosen": -9.268555641174316, "logps/rejected": -225.9955291748047, "loss": 0.4457, "nll_loss": 0.42129799723625183, "rewards/accuracies": 1.0, "rewards/chosen": 1.974848985671997, "rewards/margins": 5.873602867126465, "rewards/rejected": -3.8987536430358887, "step": 5716 }, { "epoch": 0.9528333333333333, "grad_norm": 38.57190704345703, "learning_rate": 1.1645333454757267e-09, "logits/chosen": 1.5248785018920898, "logits/rejected": 2.264197826385498, "logps/chosen": -41.22401809692383, "logps/rejected": -276.5847473144531, "loss": 1.1815, "nll_loss": 1.1778292655944824, "rewards/accuracies": 1.0, "rewards/chosen": 3.126721143722534, "rewards/margins": 9.440956115722656, "rewards/rejected": -6.314235210418701, "step": 5717 }, { "epoch": 0.953, "grad_norm": 34.96102523803711, "learning_rate": 1.1563338380629616e-09, "logits/chosen": 2.7353363037109375, "logits/rejected": 2.9423460960388184, "logps/chosen": -8.419122695922852, "logps/rejected": -128.55654907226562, "loss": 0.3815, "nll_loss": 0.36604881286621094, "rewards/accuracies": 1.0, "rewards/chosen": 1.9206182956695557, "rewards/margins": 6.613809585571289, "rewards/rejected": -4.6931915283203125, "step": 5718 }, { "epoch": 0.9531666666666667, "grad_norm": 53.79479217529297, "learning_rate": 1.1481631313215223e-09, "logits/chosen": 2.092895746231079, "logits/rejected": 2.3274550437927246, "logps/chosen": -17.679277420043945, "logps/rejected": -293.49725341796875, "loss": 0.7484, "nll_loss": 0.7366366386413574, "rewards/accuracies": 1.0, "rewards/chosen": 1.7978236675262451, "rewards/margins": 7.380099296569824, "rewards/rejected": -5.582275390625, "step": 5719 }, { "epoch": 0.9533333333333334, "grad_norm": 32.250030517578125, "learning_rate": 1.1400212276321374e-09, "logits/chosen": 1.9841666221618652, "logits/rejected": 1.347225308418274, "logps/chosen": -110.3721923828125, "logps/rejected": -41.30260467529297, "loss": 1.1677, "nll_loss": 1.1037218570709229, "rewards/accuracies": 1.0, "rewards/chosen": 1.4456062316894531, "rewards/margins": 4.275605201721191, "rewards/rejected": -2.829998731613159, "step": 5720 }, { "epoch": 0.9535, "grad_norm": 27.633968353271484, "learning_rate": 1.131908129367154e-09, "logits/chosen": 2.6164748668670654, "logits/rejected": 2.100374698638916, "logps/chosen": -81.287109375, "logps/rejected": -66.16230010986328, "loss": 0.8144, "nll_loss": 0.7891950607299805, "rewards/accuracies": 1.0, "rewards/chosen": 1.886061191558838, "rewards/margins": 5.787008285522461, "rewards/rejected": -3.900947332382202, "step": 5721 }, { "epoch": 0.9536666666666667, "grad_norm": 114.59587097167969, "learning_rate": 1.1238238388905586e-09, "logits/chosen": 2.7780940532684326, "logits/rejected": 2.960406541824341, "logps/chosen": -76.8939437866211, "logps/rejected": -161.50662231445312, "loss": 2.5642, "nll_loss": 2.196969747543335, "rewards/accuracies": 1.0, "rewards/chosen": -2.8603806495666504, "rewards/margins": 3.4695115089416504, "rewards/rejected": -6.329892158508301, "step": 5722 }, { "epoch": 0.9538333333333333, "grad_norm": 23.814247131347656, "learning_rate": 1.1157683585579002e-09, "logits/chosen": 1.2925689220428467, "logits/rejected": 1.7964309453964233, "logps/chosen": -36.8443489074707, "logps/rejected": -174.51727294921875, "loss": 0.4733, "nll_loss": 0.443907767534256, "rewards/accuracies": 1.0, "rewards/chosen": 0.4987320005893707, "rewards/margins": 8.986132621765137, "rewards/rejected": -8.487401008605957, "step": 5723 }, { "epoch": 0.954, "grad_norm": 24.417524337768555, "learning_rate": 1.1077416907163573e-09, "logits/chosen": 1.6956161260604858, "logits/rejected": 2.4719581604003906, "logps/chosen": -29.945796966552734, "logps/rejected": -307.514404296875, "loss": 0.4829, "nll_loss": 0.46070462465286255, "rewards/accuracies": 1.0, "rewards/chosen": 0.9114101529121399, "rewards/margins": 7.020689964294434, "rewards/rejected": -6.109279632568359, "step": 5724 }, { "epoch": 0.9541666666666667, "grad_norm": 25.509960174560547, "learning_rate": 1.0997438377047141e-09, "logits/chosen": 2.73130202293396, "logits/rejected": 2.718796730041504, "logps/chosen": -19.843347549438477, "logps/rejected": -177.701416015625, "loss": 0.4139, "nll_loss": 0.40496620535850525, "rewards/accuracies": 1.0, "rewards/chosen": 1.7293351888656616, "rewards/margins": 12.546699523925781, "rewards/rejected": -10.817364692687988, "step": 5725 }, { "epoch": 0.9543333333333334, "grad_norm": 28.754440307617188, "learning_rate": 1.09177480185334e-09, "logits/chosen": 1.845908761024475, "logits/rejected": 1.6478630304336548, "logps/chosen": -94.43511962890625, "logps/rejected": -141.38526916503906, "loss": 1.096, "nll_loss": 1.0854610204696655, "rewards/accuracies": 1.0, "rewards/chosen": 1.9074143171310425, "rewards/margins": 7.595755100250244, "rewards/rejected": -5.688340663909912, "step": 5726 }, { "epoch": 0.9545, "grad_norm": 29.096921920776367, "learning_rate": 1.0838345854842446e-09, "logits/chosen": 1.6573214530944824, "logits/rejected": 1.5873275995254517, "logps/chosen": -28.875585556030273, "logps/rejected": -70.35601806640625, "loss": 0.6018, "nll_loss": 0.5892977118492126, "rewards/accuracies": 1.0, "rewards/chosen": 1.533220887184143, "rewards/margins": 7.881548881530762, "rewards/rejected": -6.348328113555908, "step": 5727 }, { "epoch": 0.9546666666666667, "grad_norm": 49.244285583496094, "learning_rate": 1.0759231909109877e-09, "logits/chosen": 3.0869317054748535, "logits/rejected": 3.0273354053497314, "logps/chosen": -69.40061950683594, "logps/rejected": -68.18287658691406, "loss": 1.2159, "nll_loss": 1.1762816905975342, "rewards/accuracies": 1.0, "rewards/chosen": 1.3779984712600708, "rewards/margins": 4.922080993652344, "rewards/rejected": -3.5440826416015625, "step": 5728 }, { "epoch": 0.9548333333333333, "grad_norm": 23.276805877685547, "learning_rate": 1.0680406204387703e-09, "logits/chosen": 1.9962732791900635, "logits/rejected": 2.311152696609497, "logps/chosen": -100.84782409667969, "logps/rejected": -269.1607360839844, "loss": 0.9891, "nll_loss": 0.9791051745414734, "rewards/accuracies": 1.0, "rewards/chosen": 1.8629372119903564, "rewards/margins": 7.904356002807617, "rewards/rejected": -6.04141902923584, "step": 5729 }, { "epoch": 0.955, "grad_norm": 20.644908905029297, "learning_rate": 1.0601868763643995e-09, "logits/chosen": 0.49850010871887207, "logits/rejected": 1.1554971933364868, "logps/chosen": -80.18017578125, "logps/rejected": -418.0621032714844, "loss": 0.7212, "nll_loss": 0.7095591425895691, "rewards/accuracies": 1.0, "rewards/chosen": 1.4601653814315796, "rewards/margins": 12.156660079956055, "rewards/rejected": -10.696495056152344, "step": 5730 }, { "epoch": 0.9551666666666667, "grad_norm": 23.629121780395508, "learning_rate": 1.0523619609762447e-09, "logits/chosen": 2.754030466079712, "logits/rejected": 3.0135090351104736, "logps/chosen": -74.02790832519531, "logps/rejected": -135.0737762451172, "loss": 0.738, "nll_loss": 0.7329496145248413, "rewards/accuracies": 1.0, "rewards/chosen": 3.2769181728363037, "rewards/margins": 8.815261840820312, "rewards/rejected": -5.538343906402588, "step": 5731 }, { "epoch": 0.9553333333333334, "grad_norm": 25.023122787475586, "learning_rate": 1.0445658765543154e-09, "logits/chosen": 2.895958423614502, "logits/rejected": 3.068939685821533, "logps/chosen": -61.95213317871094, "logps/rejected": -208.78717041015625, "loss": 0.7215, "nll_loss": 0.7040014863014221, "rewards/accuracies": 1.0, "rewards/chosen": 1.3284653425216675, "rewards/margins": 6.75930118560791, "rewards/rejected": -5.430835723876953, "step": 5732 }, { "epoch": 0.9555, "grad_norm": 31.78122329711914, "learning_rate": 1.0367986253701943e-09, "logits/chosen": 2.5032997131347656, "logits/rejected": 2.4984304904937744, "logps/chosen": -112.64594268798828, "logps/rejected": -129.54412841796875, "loss": 1.1935, "nll_loss": 1.1612982749938965, "rewards/accuracies": 1.0, "rewards/chosen": 2.3147850036621094, "rewards/margins": 5.707612037658691, "rewards/rejected": -3.392827033996582, "step": 5733 }, { "epoch": 0.9556666666666667, "grad_norm": 32.059410095214844, "learning_rate": 1.0290602096870714e-09, "logits/chosen": 1.8704060316085815, "logits/rejected": 1.7897697687149048, "logps/chosen": -56.34994888305664, "logps/rejected": -132.59124755859375, "loss": 0.8766, "nll_loss": 0.8537871837615967, "rewards/accuracies": 1.0, "rewards/chosen": 0.9186794757843018, "rewards/margins": 6.729475021362305, "rewards/rejected": -5.810795783996582, "step": 5734 }, { "epoch": 0.9558333333333333, "grad_norm": 19.62138557434082, "learning_rate": 1.0213506317597543e-09, "logits/chosen": 3.2631754875183105, "logits/rejected": 3.2349257469177246, "logps/chosen": -88.6519775390625, "logps/rejected": -168.50790405273438, "loss": 0.8237, "nll_loss": 0.8208515048027039, "rewards/accuracies": 1.0, "rewards/chosen": 3.788792610168457, "rewards/margins": 9.940019607543945, "rewards/rejected": -6.1512274742126465, "step": 5735 }, { "epoch": 0.956, "grad_norm": 9.913504600524902, "learning_rate": 1.013669893834601e-09, "logits/chosen": 2.7092063426971436, "logits/rejected": 2.6284749507904053, "logps/chosen": -150.532958984375, "logps/rejected": -182.70205688476562, "loss": 0.5572, "nll_loss": 0.551402747631073, "rewards/accuracies": 1.0, "rewards/chosen": 4.74371337890625, "rewards/margins": 9.719015121459961, "rewards/rejected": -4.975302219390869, "step": 5736 }, { "epoch": 0.9561666666666667, "grad_norm": 25.645557403564453, "learning_rate": 1.0060179981495999e-09, "logits/chosen": 3.6322498321533203, "logits/rejected": 3.5904335975646973, "logps/chosen": -43.06618881225586, "logps/rejected": -147.23158264160156, "loss": 0.624, "nll_loss": 0.6065660119056702, "rewards/accuracies": 1.0, "rewards/chosen": 1.4875812530517578, "rewards/margins": 6.534373760223389, "rewards/rejected": -5.046792507171631, "step": 5737 }, { "epoch": 0.9563333333333334, "grad_norm": 9.507818222045898, "learning_rate": 9.983949469343444e-10, "logits/chosen": 1.5551687479019165, "logits/rejected": 1.587966799736023, "logps/chosen": -136.2202911376953, "logps/rejected": -221.19927978515625, "loss": 0.4923, "nll_loss": 0.49000105261802673, "rewards/accuracies": 1.0, "rewards/chosen": 3.8121018409729004, "rewards/margins": 10.312939643859863, "rewards/rejected": -6.500837802886963, "step": 5738 }, { "epoch": 0.9565, "grad_norm": 34.66318130493164, "learning_rate": 9.908007424100028e-10, "logits/chosen": 2.5271003246307373, "logits/rejected": 2.4959969520568848, "logps/chosen": -48.00716018676758, "logps/rejected": -128.06228637695312, "loss": 0.7496, "nll_loss": 0.7385717034339905, "rewards/accuracies": 1.0, "rewards/chosen": 1.5750561952590942, "rewards/margins": 8.800226211547852, "rewards/rejected": -7.225170135498047, "step": 5739 }, { "epoch": 0.9566666666666667, "grad_norm": 21.829681396484375, "learning_rate": 9.832353867893383e-10, "logits/chosen": 1.1053228378295898, "logits/rejected": 1.9746394157409668, "logps/chosen": -86.63624572753906, "logps/rejected": -284.6054382324219, "loss": 0.746, "nll_loss": 0.7404807806015015, "rewards/accuracies": 1.0, "rewards/chosen": 2.243093967437744, "rewards/margins": 11.048833847045898, "rewards/rejected": -8.805740356445312, "step": 5740 }, { "epoch": 0.9568333333333333, "grad_norm": 165.81800842285156, "learning_rate": 9.756988822767098e-10, "logits/chosen": 1.5859768390655518, "logits/rejected": 1.3051738739013672, "logps/chosen": -301.51348876953125, "logps/rejected": -287.23626708984375, "loss": 1.5582, "nll_loss": 1.137786626815796, "rewards/accuracies": 1.0, "rewards/chosen": -2.6445465087890625, "rewards/margins": 1.9494400024414062, "rewards/rejected": -4.593986511230469, "step": 5741 }, { "epoch": 0.957, "grad_norm": 53.98333740234375, "learning_rate": 9.68191231068083e-10, "logits/chosen": 2.411090612411499, "logits/rejected": 2.211988687515259, "logps/chosen": -35.011009216308594, "logps/rejected": -38.69648742675781, "loss": 0.942, "nll_loss": 0.4546884298324585, "rewards/accuracies": 1.0, "rewards/chosen": 3.299407482147217, "rewards/margins": 2.63070011138916, "rewards/rejected": 0.6687073707580566, "step": 5742 }, { "epoch": 0.9571666666666667, "grad_norm": 29.391876220703125, "learning_rate": 9.607124353510187e-10, "logits/chosen": 2.3405721187591553, "logits/rejected": 2.396895408630371, "logps/chosen": -8.805632591247559, "logps/rejected": -65.4740982055664, "loss": 0.3464, "nll_loss": 0.3386782109737396, "rewards/accuracies": 1.0, "rewards/chosen": 2.2776448726654053, "rewards/margins": 8.112237930297852, "rewards/rejected": -5.834593296051025, "step": 5743 }, { "epoch": 0.9573333333333334, "grad_norm": 24.546205520629883, "learning_rate": 9.5326249730463e-10, "logits/chosen": 2.3828465938568115, "logits/rejected": 2.573352336883545, "logps/chosen": -53.27021026611328, "logps/rejected": -251.30291748046875, "loss": 0.7281, "nll_loss": 0.7198677062988281, "rewards/accuracies": 1.0, "rewards/chosen": 2.241474151611328, "rewards/margins": 7.949413299560547, "rewards/rejected": -5.707939147949219, "step": 5744 }, { "epoch": 0.9575, "grad_norm": 21.550600051879883, "learning_rate": 9.45841419099669e-10, "logits/chosen": 2.808919906616211, "logits/rejected": 2.977407932281494, "logps/chosen": -126.10968780517578, "logps/rejected": -118.6799087524414, "loss": 0.9699, "nll_loss": 0.9341458082199097, "rewards/accuracies": 1.0, "rewards/chosen": 1.560950517654419, "rewards/margins": 5.139754295349121, "rewards/rejected": -3.5788040161132812, "step": 5745 }, { "epoch": 0.9576666666666667, "grad_norm": 40.43370056152344, "learning_rate": 9.384492028984503e-10, "logits/chosen": 1.5446449518203735, "logits/rejected": 2.8789806365966797, "logps/chosen": -14.008726119995117, "logps/rejected": -111.33547973632812, "loss": 0.5736, "nll_loss": 0.5603489875793457, "rewards/accuracies": 1.0, "rewards/chosen": 3.6218950748443604, "rewards/margins": 7.800620079040527, "rewards/rejected": -4.178724765777588, "step": 5746 }, { "epoch": 0.9578333333333333, "grad_norm": 29.371931076049805, "learning_rate": 9.310858508549068e-10, "logits/chosen": 2.8618240356445312, "logits/rejected": 2.983760356903076, "logps/chosen": -69.12158203125, "logps/rejected": -168.7225341796875, "loss": 0.8819, "nll_loss": 0.8749567270278931, "rewards/accuracies": 1.0, "rewards/chosen": 2.2027344703674316, "rewards/margins": 8.673507690429688, "rewards/rejected": -6.470773220062256, "step": 5747 }, { "epoch": 0.958, "grad_norm": 23.224777221679688, "learning_rate": 9.237513651145224e-10, "logits/chosen": 1.5452560186386108, "logits/rejected": 1.6878801584243774, "logps/chosen": -32.04471206665039, "logps/rejected": -68.5275650024414, "loss": 0.4785, "nll_loss": 0.44506537914276123, "rewards/accuracies": 1.0, "rewards/chosen": 1.205186128616333, "rewards/margins": 5.178460121154785, "rewards/rejected": -3.973273992538452, "step": 5748 }, { "epoch": 0.9581666666666667, "grad_norm": 33.4725341796875, "learning_rate": 9.164457478143873e-10, "logits/chosen": 2.978290319442749, "logits/rejected": 3.1533777713775635, "logps/chosen": -12.182909965515137, "logps/rejected": -153.29324340820312, "loss": 0.4444, "nll_loss": 0.4351039528846741, "rewards/accuracies": 1.0, "rewards/chosen": 1.7763702869415283, "rewards/margins": 8.898368835449219, "rewards/rejected": -7.1219987869262695, "step": 5749 }, { "epoch": 0.9583333333333334, "grad_norm": 146.9262237548828, "learning_rate": 9.091690010831988e-10, "logits/chosen": 2.782191753387451, "logits/rejected": 2.836702585220337, "logps/chosen": -146.2078857421875, "logps/rejected": -218.44522094726562, "loss": 2.2438, "nll_loss": 1.606679916381836, "rewards/accuracies": 1.0, "rewards/chosen": -4.485719203948975, "rewards/margins": 5.634506702423096, "rewards/rejected": -10.12022590637207, "step": 5750 }, { "epoch": 0.9585, "grad_norm": 39.980464935302734, "learning_rate": 9.019211270412275e-10, "logits/chosen": 2.562863826751709, "logits/rejected": 2.4187474250793457, "logps/chosen": -83.210693359375, "logps/rejected": -86.83316040039062, "loss": 1.1034, "nll_loss": 1.0532997846603394, "rewards/accuracies": 1.0, "rewards/chosen": 0.1503593474626541, "rewards/margins": 5.119988918304443, "rewards/rejected": -4.969629764556885, "step": 5751 }, { "epoch": 0.9586666666666667, "grad_norm": 22.00467872619629, "learning_rate": 8.947021278003175e-10, "logits/chosen": 2.4455759525299072, "logits/rejected": 2.6134955883026123, "logps/chosen": -82.68952941894531, "logps/rejected": -323.3798828125, "loss": 0.7589, "nll_loss": 0.751723051071167, "rewards/accuracies": 1.0, "rewards/chosen": 1.9857162237167358, "rewards/margins": 10.409004211425781, "rewards/rejected": -8.423288345336914, "step": 5752 }, { "epoch": 0.9588333333333333, "grad_norm": 32.5095100402832, "learning_rate": 8.875120054639196e-10, "logits/chosen": 2.040426015853882, "logits/rejected": 2.0939035415649414, "logps/chosen": -27.53162384033203, "logps/rejected": -91.92315673828125, "loss": 0.5337, "nll_loss": 0.5005750060081482, "rewards/accuracies": 1.0, "rewards/chosen": 0.8682659268379211, "rewards/margins": 5.33210563659668, "rewards/rejected": -4.463839530944824, "step": 5753 }, { "epoch": 0.959, "grad_norm": 67.04429626464844, "learning_rate": 8.803507621270579e-10, "logits/chosen": 1.8290339708328247, "logits/rejected": 3.4760210514068604, "logps/chosen": -29.22003173828125, "logps/rejected": -426.4460754394531, "loss": 1.1903, "nll_loss": 1.1688013076782227, "rewards/accuracies": 1.0, "rewards/chosen": 0.8222576379776001, "rewards/margins": 10.393836975097656, "rewards/rejected": -9.571578979492188, "step": 5754 }, { "epoch": 0.9591666666666666, "grad_norm": 24.187564849853516, "learning_rate": 8.73218399876341e-10, "logits/chosen": 3.8425018787384033, "logits/rejected": 3.905085802078247, "logps/chosen": -25.510766983032227, "logps/rejected": -93.7175064086914, "loss": 0.4256, "nll_loss": 0.41146400570869446, "rewards/accuracies": 1.0, "rewards/chosen": 2.464017629623413, "rewards/margins": 6.903670310974121, "rewards/rejected": -4.439652442932129, "step": 5755 }, { "epoch": 0.9593333333333334, "grad_norm": 26.22275161743164, "learning_rate": 8.661149207899843e-10, "logits/chosen": 2.3641486167907715, "logits/rejected": 2.6733455657958984, "logps/chosen": -35.18882369995117, "logps/rejected": -185.5836181640625, "loss": 0.5384, "nll_loss": 0.5331639647483826, "rewards/accuracies": 1.0, "rewards/chosen": 2.4249608516693115, "rewards/margins": 9.52145767211914, "rewards/rejected": -7.09649658203125, "step": 5756 }, { "epoch": 0.9595, "grad_norm": 128.02120971679688, "learning_rate": 8.590403269377655e-10, "logits/chosen": 2.465212345123291, "logits/rejected": 2.536059617996216, "logps/chosen": -29.45714569091797, "logps/rejected": -22.70602035522461, "loss": 1.6669, "nll_loss": 0.41488945484161377, "rewards/accuracies": 0.0, "rewards/chosen": 1.9416508674621582, "rewards/margins": -0.038504719734191895, "rewards/rejected": 1.98015558719635, "step": 5757 }, { "epoch": 0.9596666666666667, "grad_norm": 151.57154846191406, "learning_rate": 8.519946203810468e-10, "logits/chosen": 2.287325859069824, "logits/rejected": 2.405411720275879, "logps/chosen": -25.476276397705078, "logps/rejected": -13.761539459228516, "loss": 3.977, "nll_loss": 0.463204950094223, "rewards/accuracies": 0.0, "rewards/chosen": 1.2306808233261108, "rewards/margins": -2.939652919769287, "rewards/rejected": 4.1703338623046875, "step": 5758 }, { "epoch": 0.9598333333333333, "grad_norm": 22.424036026000977, "learning_rate": 8.44977803172775e-10, "logits/chosen": 2.386889696121216, "logits/rejected": 2.4452743530273438, "logps/chosen": -72.33370971679688, "logps/rejected": -106.72726440429688, "loss": 0.6477, "nll_loss": 0.640121340751648, "rewards/accuracies": 1.0, "rewards/chosen": 2.2371842861175537, "rewards/margins": 8.20033073425293, "rewards/rejected": -5.963146209716797, "step": 5759 }, { "epoch": 0.96, "grad_norm": 27.278894424438477, "learning_rate": 8.379898773574922e-10, "logits/chosen": 2.430093765258789, "logits/rejected": 2.7115983963012695, "logps/chosen": -104.17820739746094, "logps/rejected": -220.85787963867188, "loss": 1.2076, "nll_loss": 1.197450876235962, "rewards/accuracies": 1.0, "rewards/chosen": 1.6617004871368408, "rewards/margins": 8.984488487243652, "rewards/rejected": -7.322787761688232, "step": 5760 }, { "epoch": 0.9601666666666666, "grad_norm": 240.27362060546875, "learning_rate": 8.310308449713033e-10, "logits/chosen": 2.560274600982666, "logits/rejected": 2.4912664890289307, "logps/chosen": -62.553775787353516, "logps/rejected": -60.279239654541016, "loss": 2.5285, "nll_loss": 0.6950420141220093, "rewards/accuracies": 0.0, "rewards/chosen": 1.5797771215438843, "rewards/margins": -0.975894570350647, "rewards/rejected": 2.5556716918945312, "step": 5761 }, { "epoch": 0.9603333333333334, "grad_norm": 22.692785263061523, "learning_rate": 8.241007080419082e-10, "logits/chosen": 1.2571479082107544, "logits/rejected": 1.8509306907653809, "logps/chosen": -85.0240249633789, "logps/rejected": -351.41864013671875, "loss": 0.8322, "nll_loss": 0.8254760503768921, "rewards/accuracies": 1.0, "rewards/chosen": 2.070143938064575, "rewards/margins": 10.101866722106934, "rewards/rejected": -8.031723022460938, "step": 5762 }, { "epoch": 0.9605, "grad_norm": 26.87171745300293, "learning_rate": 8.171994685885697e-10, "logits/chosen": 2.0730690956115723, "logits/rejected": 1.371388554573059, "logps/chosen": -77.77255249023438, "logps/rejected": -69.9679946899414, "loss": 0.8498, "nll_loss": 0.8273676037788391, "rewards/accuracies": 1.0, "rewards/chosen": 1.0757232904434204, "rewards/margins": 6.2926344871521, "rewards/rejected": -5.216911315917969, "step": 5763 }, { "epoch": 0.9606666666666667, "grad_norm": 26.729534149169922, "learning_rate": 8.103271286221791e-10, "logits/chosen": 2.700240135192871, "logits/rejected": 2.778419256210327, "logps/chosen": -95.90530395507812, "logps/rejected": -245.8624725341797, "loss": 1.0492, "nll_loss": 1.0424489974975586, "rewards/accuracies": 1.0, "rewards/chosen": 2.2732741832733154, "rewards/margins": 8.589942932128906, "rewards/rejected": -6.31666898727417, "step": 5764 }, { "epoch": 0.9608333333333333, "grad_norm": 27.192594528198242, "learning_rate": 8.034836901451237e-10, "logits/chosen": 2.292593002319336, "logits/rejected": 2.3001582622528076, "logps/chosen": -17.293193817138672, "logps/rejected": -84.10960388183594, "loss": 0.4413, "nll_loss": 0.4117427170276642, "rewards/accuracies": 1.0, "rewards/chosen": 1.6451168060302734, "rewards/margins": 5.457241058349609, "rewards/rejected": -3.812124490737915, "step": 5765 }, { "epoch": 0.961, "grad_norm": 22.21851921081543, "learning_rate": 7.966691551514526e-10, "logits/chosen": 1.1482104063034058, "logits/rejected": 1.8694121837615967, "logps/chosen": -81.88093566894531, "logps/rejected": -185.21597290039062, "loss": 0.7573, "nll_loss": 0.7443721294403076, "rewards/accuracies": 1.0, "rewards/chosen": 2.001852512359619, "rewards/margins": 6.9676361083984375, "rewards/rejected": -4.965783596038818, "step": 5766 }, { "epoch": 0.9611666666666666, "grad_norm": 24.373857498168945, "learning_rate": 7.898835256267445e-10, "logits/chosen": 3.3677122592926025, "logits/rejected": 3.477386951446533, "logps/chosen": -14.228130340576172, "logps/rejected": -61.82218551635742, "loss": 0.3277, "nll_loss": 0.30930712819099426, "rewards/accuracies": 1.0, "rewards/chosen": 2.514570951461792, "rewards/margins": 6.563295364379883, "rewards/rejected": -4.04872465133667, "step": 5767 }, { "epoch": 0.9613333333333334, "grad_norm": 22.55362319946289, "learning_rate": 7.83126803548173e-10, "logits/chosen": 1.4553111791610718, "logits/rejected": 2.6059319972991943, "logps/chosen": -37.66691589355469, "logps/rejected": -263.2397155761719, "loss": 0.5474, "nll_loss": 0.530519962310791, "rewards/accuracies": 1.0, "rewards/chosen": 1.1232551336288452, "rewards/margins": 8.265290260314941, "rewards/rejected": -7.142035484313965, "step": 5768 }, { "epoch": 0.9615, "grad_norm": 22.890886306762695, "learning_rate": 7.763989908844748e-10, "logits/chosen": 2.374859571456909, "logits/rejected": 2.0990357398986816, "logps/chosen": -88.6677474975586, "logps/rejected": -92.73969268798828, "loss": 0.821, "nll_loss": 0.8134656548500061, "rewards/accuracies": 1.0, "rewards/chosen": 1.951331377029419, "rewards/margins": 9.706945419311523, "rewards/rejected": -7.755613803863525, "step": 5769 }, { "epoch": 0.9616666666666667, "grad_norm": 30.023820877075195, "learning_rate": 7.697000895959815e-10, "logits/chosen": 2.6731626987457275, "logits/rejected": 2.5370728969573975, "logps/chosen": -60.40280532836914, "logps/rejected": -62.142486572265625, "loss": 0.9031, "nll_loss": 0.8882765769958496, "rewards/accuracies": 1.0, "rewards/chosen": 1.8431119918823242, "rewards/margins": 6.719311237335205, "rewards/rejected": -4.876199245452881, "step": 5770 }, { "epoch": 0.9618333333333333, "grad_norm": 31.331722259521484, "learning_rate": 7.630301016345875e-10, "logits/chosen": 2.914738416671753, "logits/rejected": 3.0804975032806396, "logps/chosen": -73.77000427246094, "logps/rejected": -202.62103271484375, "loss": 0.994, "nll_loss": 0.9836000800132751, "rewards/accuracies": 1.0, "rewards/chosen": 1.6104393005371094, "rewards/margins": 9.405694961547852, "rewards/rejected": -7.795255184173584, "step": 5771 }, { "epoch": 0.962, "grad_norm": 28.62685775756836, "learning_rate": 7.563890289437824e-10, "logits/chosen": 1.5841155052185059, "logits/rejected": 2.2505836486816406, "logps/chosen": -7.864307403564453, "logps/rejected": -175.9842529296875, "loss": 0.3066, "nll_loss": 0.3024733364582062, "rewards/accuracies": 1.0, "rewards/chosen": 2.563878059387207, "rewards/margins": 11.134241104125977, "rewards/rejected": -8.57036304473877, "step": 5772 }, { "epoch": 0.9621666666666666, "grad_norm": 33.77637481689453, "learning_rate": 7.497768734585852e-10, "logits/chosen": 3.0035881996154785, "logits/rejected": 3.1332104206085205, "logps/chosen": -21.443269729614258, "logps/rejected": -281.54827880859375, "loss": 0.5913, "nll_loss": 0.5795478224754333, "rewards/accuracies": 1.0, "rewards/chosen": 1.6411848068237305, "rewards/margins": 7.759880065917969, "rewards/rejected": -6.118695259094238, "step": 5773 }, { "epoch": 0.9623333333333334, "grad_norm": 19.439998626708984, "learning_rate": 7.431936371056435e-10, "logits/chosen": 1.662693977355957, "logits/rejected": 1.241269826889038, "logps/chosen": -87.99544525146484, "logps/rejected": -58.27409362792969, "loss": 0.725, "nll_loss": 0.6928775906562805, "rewards/accuracies": 1.0, "rewards/chosen": 2.8288826942443848, "rewards/margins": 6.110530853271484, "rewards/rejected": -3.2816483974456787, "step": 5774 }, { "epoch": 0.9625, "grad_norm": 258.51373291015625, "learning_rate": 7.366393218031564e-10, "logits/chosen": 2.8037476539611816, "logits/rejected": 2.7975358963012695, "logps/chosen": -71.38567352294922, "logps/rejected": -32.15279006958008, "loss": 5.4659, "nll_loss": 0.8705568909645081, "rewards/accuracies": 0.0, "rewards/chosen": 1.9948357343673706, "rewards/margins": -3.9017891883850098, "rewards/rejected": 5.89662504196167, "step": 5775 }, { "epoch": 0.9626666666666667, "grad_norm": 23.012943267822266, "learning_rate": 7.301139294608738e-10, "logits/chosen": 2.5977816581726074, "logits/rejected": 2.945549249649048, "logps/chosen": -107.18233489990234, "logps/rejected": -289.77349853515625, "loss": 0.936, "nll_loss": 0.932020366191864, "rewards/accuracies": 1.0, "rewards/chosen": 2.605226993560791, "rewards/margins": 10.873846054077148, "rewards/rejected": -8.2686185836792, "step": 5776 }, { "epoch": 0.9628333333333333, "grad_norm": 18.804420471191406, "learning_rate": 7.236174619801639e-10, "logits/chosen": 0.7623004913330078, "logits/rejected": 1.2751848697662354, "logps/chosen": -49.32909393310547, "logps/rejected": -236.765625, "loss": 0.5255, "nll_loss": 0.5138447880744934, "rewards/accuracies": 1.0, "rewards/chosen": 1.5989112854003906, "rewards/margins": 7.989151954650879, "rewards/rejected": -6.390240669250488, "step": 5777 }, { "epoch": 0.963, "grad_norm": 26.22429656982422, "learning_rate": 7.171499212539122e-10, "logits/chosen": 3.067756175994873, "logits/rejected": 3.1732540130615234, "logps/chosen": -52.72776794433594, "logps/rejected": -199.89830017089844, "loss": 0.8269, "nll_loss": 0.8238714337348938, "rewards/accuracies": 1.0, "rewards/chosen": 4.202948093414307, "rewards/margins": 10.066900253295898, "rewards/rejected": -5.863951683044434, "step": 5778 }, { "epoch": 0.9631666666666666, "grad_norm": 96.29743957519531, "learning_rate": 7.107113091666339e-10, "logits/chosen": 1.0639399290084839, "logits/rejected": 2.3225438594818115, "logps/chosen": -24.910415649414062, "logps/rejected": -273.404541015625, "loss": 1.201, "nll_loss": 1.1862101554870605, "rewards/accuracies": 1.0, "rewards/chosen": 1.8919568061828613, "rewards/margins": 6.708311080932617, "rewards/rejected": -4.816354274749756, "step": 5779 }, { "epoch": 0.9633333333333334, "grad_norm": 16.844562530517578, "learning_rate": 7.043016275943614e-10, "logits/chosen": 2.2292141914367676, "logits/rejected": 2.2048912048339844, "logps/chosen": -185.73019409179688, "logps/rejected": -222.4487762451172, "loss": 0.908, "nll_loss": 0.9060008525848389, "rewards/accuracies": 1.0, "rewards/chosen": 3.24769926071167, "rewards/margins": 13.830612182617188, "rewards/rejected": -10.58291244506836, "step": 5780 }, { "epoch": 0.9635, "grad_norm": 58.12327194213867, "learning_rate": 6.979208784047453e-10, "logits/chosen": 1.9508966207504272, "logits/rejected": 1.945294976234436, "logps/chosen": -86.59501647949219, "logps/rejected": -163.54966735839844, "loss": 1.3682, "nll_loss": 1.3322312831878662, "rewards/accuracies": 1.0, "rewards/chosen": 0.28594970703125, "rewards/margins": 7.926921367645264, "rewards/rejected": -7.640971660614014, "step": 5781 }, { "epoch": 0.9636666666666667, "grad_norm": 24.669660568237305, "learning_rate": 6.915690634569538e-10, "logits/chosen": 2.6388189792633057, "logits/rejected": 2.573148250579834, "logps/chosen": -74.71109771728516, "logps/rejected": -100.11949920654297, "loss": 0.9377, "nll_loss": 0.9111109375953674, "rewards/accuracies": 1.0, "rewards/chosen": 1.9929962158203125, "rewards/margins": 5.764997482299805, "rewards/rejected": -3.772001028060913, "step": 5782 }, { "epoch": 0.9638333333333333, "grad_norm": 35.84687042236328, "learning_rate": 6.852461846017843e-10, "logits/chosen": 3.62910795211792, "logits/rejected": 3.613905668258667, "logps/chosen": -55.57275390625, "logps/rejected": -144.69239807128906, "loss": 1.1408, "nll_loss": 1.134137749671936, "rewards/accuracies": 1.0, "rewards/chosen": 2.1622536182403564, "rewards/margins": 9.11235523223877, "rewards/rejected": -6.950101375579834, "step": 5783 }, { "epoch": 0.964, "grad_norm": 27.541967391967773, "learning_rate": 6.789522436815409e-10, "logits/chosen": 2.402470350265503, "logits/rejected": 2.502915859222412, "logps/chosen": -78.38478088378906, "logps/rejected": -159.6550750732422, "loss": 0.9298, "nll_loss": 0.9114510416984558, "rewards/accuracies": 1.0, "rewards/chosen": 1.2380096912384033, "rewards/margins": 6.7603559494018555, "rewards/rejected": -5.522346019744873, "step": 5784 }, { "epoch": 0.9641666666666666, "grad_norm": 25.536405563354492, "learning_rate": 6.726872425301567e-10, "logits/chosen": 1.7337744235992432, "logits/rejected": 2.2262721061706543, "logps/chosen": -107.86650085449219, "logps/rejected": -541.6898193359375, "loss": 1.0082, "nll_loss": 0.9896008968353271, "rewards/accuracies": 1.0, "rewards/chosen": 0.9662445783615112, "rewards/margins": 13.22767162322998, "rewards/rejected": -12.26142692565918, "step": 5785 }, { "epoch": 0.9643333333333334, "grad_norm": 58.04801940917969, "learning_rate": 6.664511829730934e-10, "logits/chosen": 1.6194807291030884, "logits/rejected": 2.094008207321167, "logps/chosen": -19.7900390625, "logps/rejected": -306.91241455078125, "loss": 0.9033, "nll_loss": 0.8995472192764282, "rewards/accuracies": 1.0, "rewards/chosen": 2.719426393508911, "rewards/margins": 10.267983436584473, "rewards/rejected": -7.548556804656982, "step": 5786 }, { "epoch": 0.9645, "grad_norm": 24.000587463378906, "learning_rate": 6.602440668273757e-10, "logits/chosen": 1.886265516281128, "logits/rejected": 2.100696086883545, "logps/chosen": -68.5318603515625, "logps/rejected": -212.7101593017578, "loss": 0.7241, "nll_loss": 0.7213881015777588, "rewards/accuracies": 1.0, "rewards/chosen": 3.543541193008423, "rewards/margins": 10.026143074035645, "rewards/rejected": -6.482602119445801, "step": 5787 }, { "epoch": 0.9646666666666667, "grad_norm": 97.26116943359375, "learning_rate": 6.540658959016121e-10, "logits/chosen": 3.156074285507202, "logits/rejected": 3.282196283340454, "logps/chosen": -42.061622619628906, "logps/rejected": -200.5989990234375, "loss": 1.6534, "nll_loss": 1.6177546977996826, "rewards/accuracies": 1.0, "rewards/chosen": 0.31485748291015625, "rewards/margins": 7.271908760070801, "rewards/rejected": -6.9570512771606445, "step": 5788 }, { "epoch": 0.9648333333333333, "grad_norm": 18.734880447387695, "learning_rate": 6.47916671995996e-10, "logits/chosen": 2.3215243816375732, "logits/rejected": 2.268000364303589, "logps/chosen": -143.98936462402344, "logps/rejected": -145.92648315429688, "loss": 0.6587, "nll_loss": 0.6428096890449524, "rewards/accuracies": 1.0, "rewards/chosen": 1.1643813848495483, "rewards/margins": 8.832867622375488, "rewards/rejected": -7.66848611831665, "step": 5789 }, { "epoch": 0.965, "grad_norm": 10.375371932983398, "learning_rate": 6.417963969022389e-10, "logits/chosen": 2.217608690261841, "logits/rejected": 2.059934139251709, "logps/chosen": -142.14068603515625, "logps/rejected": -172.92196655273438, "loss": 0.5696, "nll_loss": 0.5640502572059631, "rewards/accuracies": 1.0, "rewards/chosen": 4.397702217102051, "rewards/margins": 9.458573341369629, "rewards/rejected": -5.060871124267578, "step": 5790 }, { "epoch": 0.9651666666666666, "grad_norm": 26.59710121154785, "learning_rate": 6.357050724036361e-10, "logits/chosen": 2.285372734069824, "logits/rejected": 2.5500054359436035, "logps/chosen": -70.17935180664062, "logps/rejected": -247.38865661621094, "loss": 0.8186, "nll_loss": 0.816038966178894, "rewards/accuracies": 1.0, "rewards/chosen": 3.101339101791382, "rewards/margins": 11.210160255432129, "rewards/rejected": -8.108820915222168, "step": 5791 }, { "epoch": 0.9653333333333334, "grad_norm": 30.419353485107422, "learning_rate": 6.296427002750793e-10, "logits/chosen": 2.3208532333374023, "logits/rejected": 2.2154018878936768, "logps/chosen": -11.616230964660645, "logps/rejected": -89.82605743408203, "loss": 0.3551, "nll_loss": 0.3226730525493622, "rewards/accuracies": 1.0, "rewards/chosen": 0.6851924061775208, "rewards/margins": 5.657075881958008, "rewards/rejected": -4.971883296966553, "step": 5792 }, { "epoch": 0.9655, "grad_norm": 38.67849349975586, "learning_rate": 6.236092822829886e-10, "logits/chosen": 1.1552515029907227, "logits/rejected": 2.0447285175323486, "logps/chosen": -123.16508483886719, "logps/rejected": -282.4702453613281, "loss": 1.3408, "nll_loss": 1.2964743375778198, "rewards/accuracies": 1.0, "rewards/chosen": 0.090764619410038, "rewards/margins": 6.583319664001465, "rewards/rejected": -6.492555141448975, "step": 5793 }, { "epoch": 0.9656666666666667, "grad_norm": 40.802406311035156, "learning_rate": 6.176048201853468e-10, "logits/chosen": 1.7289862632751465, "logits/rejected": 2.3186445236206055, "logps/chosen": -44.37068557739258, "logps/rejected": -210.4036865234375, "loss": 0.9446, "nll_loss": 0.944057047367096, "rewards/accuracies": 1.0, "rewards/chosen": 4.604281425476074, "rewards/margins": 13.829004287719727, "rewards/rejected": -9.224722862243652, "step": 5794 }, { "epoch": 0.9658333333333333, "grad_norm": 29.9285945892334, "learning_rate": 6.116293157317209e-10, "logits/chosen": 2.6459877490997314, "logits/rejected": 2.827815055847168, "logps/chosen": -50.22777557373047, "logps/rejected": -315.76123046875, "loss": 0.7606, "nll_loss": 0.7386437654495239, "rewards/accuracies": 1.0, "rewards/chosen": 0.8205963373184204, "rewards/margins": 8.35721492767334, "rewards/rejected": -7.536618232727051, "step": 5795 }, { "epoch": 0.966, "grad_norm": 34.0898323059082, "learning_rate": 6.056827706632184e-10, "logits/chosen": 2.13261079788208, "logits/rejected": 2.76645565032959, "logps/chosen": -106.31236267089844, "logps/rejected": -448.0107116699219, "loss": 1.5004, "nll_loss": 1.4973571300506592, "rewards/accuracies": 1.0, "rewards/chosen": 2.803194046020508, "rewards/margins": 19.300310134887695, "rewards/rejected": -16.497116088867188, "step": 5796 }, { "epoch": 0.9661666666666666, "grad_norm": 51.67146682739258, "learning_rate": 5.99765186712542e-10, "logits/chosen": 1.7270622253417969, "logits/rejected": 2.6089727878570557, "logps/chosen": -3.513744354248047, "logps/rejected": -206.14566040039062, "loss": 0.2849, "nll_loss": 0.25098177790641785, "rewards/accuracies": 1.0, "rewards/chosen": 0.8181626200675964, "rewards/margins": 5.3125505447387695, "rewards/rejected": -4.494388103485107, "step": 5797 }, { "epoch": 0.9663333333333334, "grad_norm": 28.140249252319336, "learning_rate": 5.938765656039124e-10, "logits/chosen": 3.9197261333465576, "logits/rejected": 3.8803610801696777, "logps/chosen": -21.35869598388672, "logps/rejected": -88.51161193847656, "loss": 0.4715, "nll_loss": 0.43589162826538086, "rewards/accuracies": 1.0, "rewards/chosen": 0.7070369720458984, "rewards/margins": 5.28884220123291, "rewards/rejected": -4.581805229187012, "step": 5798 }, { "epoch": 0.9665, "grad_norm": 26.956546783447266, "learning_rate": 5.88016909053135e-10, "logits/chosen": 2.8427019119262695, "logits/rejected": 2.7987067699432373, "logps/chosen": -84.35174560546875, "logps/rejected": -151.3467254638672, "loss": 0.977, "nll_loss": 0.9477724432945251, "rewards/accuracies": 1.0, "rewards/chosen": 0.8745468258857727, "rewards/margins": 5.68730354309082, "rewards/rejected": -4.812756538391113, "step": 5799 }, { "epoch": 0.9666666666666667, "grad_norm": 24.72297477722168, "learning_rate": 5.821862187675775e-10, "logits/chosen": 2.937145709991455, "logits/rejected": 2.869084119796753, "logps/chosen": -61.351356506347656, "logps/rejected": -80.50411224365234, "loss": 0.7753, "nll_loss": 0.7574241757392883, "rewards/accuracies": 1.0, "rewards/chosen": 1.6023002862930298, "rewards/margins": 6.378650188446045, "rewards/rejected": -4.776350021362305, "step": 5800 }, { "epoch": 0.9668333333333333, "grad_norm": 56.276485443115234, "learning_rate": 5.763844964461584e-10, "logits/chosen": 2.2483482360839844, "logits/rejected": 1.8806401491165161, "logps/chosen": -22.794605255126953, "logps/rejected": -84.65338134765625, "loss": 0.9202, "nll_loss": 0.8767156004905701, "rewards/accuracies": 1.0, "rewards/chosen": 0.851348876953125, "rewards/margins": 4.721187591552734, "rewards/rejected": -3.8698387145996094, "step": 5801 }, { "epoch": 0.967, "grad_norm": 52.88673782348633, "learning_rate": 5.706117437793701e-10, "logits/chosen": 2.997884511947632, "logits/rejected": 3.0659635066986084, "logps/chosen": -17.407102584838867, "logps/rejected": -56.95616149902344, "loss": 0.6754, "nll_loss": 0.4580816924571991, "rewards/accuracies": 1.0, "rewards/chosen": 0.9308578968048096, "rewards/margins": 2.397642135620117, "rewards/rejected": -1.4667843580245972, "step": 5802 }, { "epoch": 0.9671666666666666, "grad_norm": 24.328702926635742, "learning_rate": 5.648679624492447e-10, "logits/chosen": 1.8842283487319946, "logits/rejected": 1.472302794456482, "logps/chosen": -69.28680419921875, "logps/rejected": -87.61410522460938, "loss": 0.8631, "nll_loss": 0.8449608683586121, "rewards/accuracies": 1.0, "rewards/chosen": 2.5039772987365723, "rewards/margins": 6.5754265785217285, "rewards/rejected": -4.071449279785156, "step": 5803 }, { "epoch": 0.9673333333333334, "grad_norm": 23.829465866088867, "learning_rate": 5.591531541293881e-10, "logits/chosen": 2.070749521255493, "logits/rejected": 2.0271224975585938, "logps/chosen": -10.522096633911133, "logps/rejected": -43.22952651977539, "loss": 0.2679, "nll_loss": 0.23382438719272614, "rewards/accuracies": 1.0, "rewards/chosen": 3.2540299892425537, "rewards/margins": 6.409602642059326, "rewards/rejected": -3.1555726528167725, "step": 5804 }, { "epoch": 0.9675, "grad_norm": 785.9561157226562, "learning_rate": 5.534673204849572e-10, "logits/chosen": 1.8319900035858154, "logits/rejected": 1.2646043300628662, "logps/chosen": -351.2870788574219, "logps/rejected": -166.4287109375, "loss": 4.2261, "nll_loss": 1.2914966344833374, "rewards/accuracies": 0.0, "rewards/chosen": -5.877213001251221, "rewards/margins": -2.422922134399414, "rewards/rejected": -3.4542908668518066, "step": 5805 }, { "epoch": 0.9676666666666667, "grad_norm": 28.825246810913086, "learning_rate": 5.47810463172671e-10, "logits/chosen": 2.498595714569092, "logits/rejected": 3.0436503887176514, "logps/chosen": -45.654518127441406, "logps/rejected": -721.6464233398438, "loss": 0.6966, "nll_loss": 0.6814106106758118, "rewards/accuracies": 1.0, "rewards/chosen": 1.3433008193969727, "rewards/margins": 7.426748275756836, "rewards/rejected": -6.083447456359863, "step": 5806 }, { "epoch": 0.9678333333333333, "grad_norm": 66.4354248046875, "learning_rate": 5.42182583840789e-10, "logits/chosen": 1.7321730852127075, "logits/rejected": 2.21618390083313, "logps/chosen": -35.36554718017578, "logps/rejected": -160.3944091796875, "loss": 0.831, "nll_loss": 0.8224545121192932, "rewards/accuracies": 1.0, "rewards/chosen": 2.156153917312622, "rewards/margins": 7.931390762329102, "rewards/rejected": -5.7752366065979, "step": 5807 }, { "epoch": 0.968, "grad_norm": 138.88742065429688, "learning_rate": 5.365836841291438e-10, "logits/chosen": 3.035956859588623, "logits/rejected": 3.045557975769043, "logps/chosen": -38.761932373046875, "logps/rejected": -14.934432983398438, "loss": 3.4717, "nll_loss": 0.530985414981842, "rewards/accuracies": 0.0, "rewards/chosen": 1.490626573562622, "rewards/margins": -2.279750108718872, "rewards/rejected": 3.770376682281494, "step": 5808 }, { "epoch": 0.9681666666666666, "grad_norm": 26.68837547302246, "learning_rate": 5.310137656691305e-10, "logits/chosen": 3.1541996002197266, "logits/rejected": 3.0465681552886963, "logps/chosen": -50.371524810791016, "logps/rejected": -305.60260009765625, "loss": 0.6811, "nll_loss": 0.6627832055091858, "rewards/accuracies": 1.0, "rewards/chosen": 1.0146831274032593, "rewards/margins": 8.55674934387207, "rewards/rejected": -7.5420660972595215, "step": 5809 }, { "epoch": 0.9683333333333334, "grad_norm": 78.58628845214844, "learning_rate": 5.254728300836952e-10, "logits/chosen": 0.6518176794052124, "logits/rejected": 1.667379379272461, "logps/chosen": -116.17906951904297, "logps/rejected": -304.7516784667969, "loss": 1.427, "nll_loss": 1.3053826093673706, "rewards/accuracies": 1.0, "rewards/chosen": -1.1440476179122925, "rewards/margins": 5.633574485778809, "rewards/rejected": -6.777622222900391, "step": 5810 }, { "epoch": 0.9685, "grad_norm": 244.39126586914062, "learning_rate": 5.199608789873133e-10, "logits/chosen": 1.9917176961898804, "logits/rejected": 1.89112389087677, "logps/chosen": -123.82264709472656, "logps/rejected": -28.648454666137695, "loss": 3.9887, "nll_loss": 1.0405263900756836, "rewards/accuracies": 0.0, "rewards/chosen": 2.1008834838867188, "rewards/margins": -2.1613411903381348, "rewards/rejected": 4.2622246742248535, "step": 5811 }, { "epoch": 0.9686666666666667, "grad_norm": 28.85619354248047, "learning_rate": 5.144779139860333e-10, "logits/chosen": 2.576113700866699, "logits/rejected": 2.690276622772217, "logps/chosen": -79.0335922241211, "logps/rejected": -185.4700927734375, "loss": 0.983, "nll_loss": 0.9638242721557617, "rewards/accuracies": 1.0, "rewards/chosen": 0.9816941022872925, "rewards/margins": 8.143198013305664, "rewards/rejected": -7.161503791809082, "step": 5812 }, { "epoch": 0.9688333333333333, "grad_norm": 41.54606246948242, "learning_rate": 5.090239366774773e-10, "logits/chosen": 2.8045482635498047, "logits/rejected": 2.890641927719116, "logps/chosen": -8.534674644470215, "logps/rejected": -291.47479248046875, "loss": 0.3731, "nll_loss": 0.3710727393627167, "rewards/accuracies": 1.0, "rewards/chosen": 3.656991958618164, "rewards/margins": 10.704465866088867, "rewards/rejected": -7.047473907470703, "step": 5813 }, { "epoch": 0.969, "grad_norm": 316.4879455566406, "learning_rate": 5.035989486508074e-10, "logits/chosen": 2.7541067600250244, "logits/rejected": 2.696228504180908, "logps/chosen": -84.63532257080078, "logps/rejected": -58.05552291870117, "loss": 3.0095, "nll_loss": 1.244637131690979, "rewards/accuracies": 0.0, "rewards/chosen": 1.1543785333633423, "rewards/margins": -0.9864460229873657, "rewards/rejected": 2.140824556350708, "step": 5814 }, { "epoch": 0.9691666666666666, "grad_norm": 18.834671020507812, "learning_rate": 4.982029514867148e-10, "logits/chosen": 2.6398470401763916, "logits/rejected": 2.6335525512695312, "logps/chosen": -163.6316375732422, "logps/rejected": -207.50453186035156, "loss": 0.9485, "nll_loss": 0.9350378513336182, "rewards/accuracies": 1.0, "rewards/chosen": 1.41258704662323, "rewards/margins": 7.988752841949463, "rewards/rejected": -6.576165676116943, "step": 5815 }, { "epoch": 0.9693333333333334, "grad_norm": 32.87491226196289, "learning_rate": 4.928359467574639e-10, "logits/chosen": 2.5276246070861816, "logits/rejected": 2.445453643798828, "logps/chosen": -82.89857482910156, "logps/rejected": -76.21316528320312, "loss": 1.1999, "nll_loss": 1.1675854921340942, "rewards/accuracies": 1.0, "rewards/chosen": 2.2065322399139404, "rewards/margins": 5.635018348693848, "rewards/rejected": -3.428485870361328, "step": 5816 }, { "epoch": 0.9695, "grad_norm": 37.0291633605957, "learning_rate": 4.874979360268927e-10, "logits/chosen": 2.1652307510375977, "logits/rejected": 2.2953388690948486, "logps/chosen": -36.71479415893555, "logps/rejected": -228.37582397460938, "loss": 0.7344, "nll_loss": 0.719897985458374, "rewards/accuracies": 1.0, "rewards/chosen": 1.314893364906311, "rewards/margins": 8.054980278015137, "rewards/rejected": -6.740086555480957, "step": 5817 }, { "epoch": 0.9696666666666667, "grad_norm": 125.08724212646484, "learning_rate": 4.82188920850346e-10, "logits/chosen": 1.5953316688537598, "logits/rejected": 1.8889780044555664, "logps/chosen": -55.131996154785156, "logps/rejected": -145.7381591796875, "loss": 1.2092, "nll_loss": 1.1251426935195923, "rewards/accuracies": 1.0, "rewards/chosen": 1.2074860334396362, "rewards/margins": 3.7956008911132812, "rewards/rejected": -2.5881149768829346, "step": 5818 }, { "epoch": 0.9698333333333333, "grad_norm": 45.796409606933594, "learning_rate": 4.769089027747642e-10, "logits/chosen": 2.794297933578491, "logits/rejected": 3.10741925239563, "logps/chosen": -47.471866607666016, "logps/rejected": -137.79034423828125, "loss": 0.8652, "nll_loss": 0.8184805512428284, "rewards/accuracies": 1.0, "rewards/chosen": 1.3741801977157593, "rewards/margins": 4.6800031661987305, "rewards/rejected": -3.3058228492736816, "step": 5819 }, { "epoch": 0.97, "grad_norm": 205.1586456298828, "learning_rate": 4.716578833386054e-10, "logits/chosen": 2.4452478885650635, "logits/rejected": 1.7905409336090088, "logps/chosen": -287.6788330078125, "logps/rejected": -53.59587478637695, "loss": 1.7312, "nll_loss": 1.3442937135696411, "rewards/accuracies": 1.0, "rewards/chosen": -2.154040575027466, "rewards/margins": 1.6728873252868652, "rewards/rejected": -3.826927900314331, "step": 5820 }, { "epoch": 0.9701666666666666, "grad_norm": 36.07510757446289, "learning_rate": 4.664358640718902e-10, "logits/chosen": 3.1787328720092773, "logits/rejected": 3.041800022125244, "logps/chosen": -14.344961166381836, "logps/rejected": -52.810882568359375, "loss": 0.6363, "nll_loss": 0.6236939430236816, "rewards/accuracies": 1.0, "rewards/chosen": 3.356100559234619, "rewards/margins": 7.6401214599609375, "rewards/rejected": -4.284020900726318, "step": 5821 }, { "epoch": 0.9703333333333334, "grad_norm": 24.529903411865234, "learning_rate": 4.612428464961793e-10, "logits/chosen": 0.8170136213302612, "logits/rejected": 1.6543149948120117, "logps/chosen": -52.60674285888672, "logps/rejected": -312.1753234863281, "loss": 0.6012, "nll_loss": 0.5845192670822144, "rewards/accuracies": 1.0, "rewards/chosen": 1.1040252447128296, "rewards/margins": 8.919563293457031, "rewards/rejected": -7.815537929534912, "step": 5822 }, { "epoch": 0.9705, "grad_norm": 19.541370391845703, "learning_rate": 4.5607883212462896e-10, "logits/chosen": 3.1348612308502197, "logits/rejected": 3.058502674102783, "logps/chosen": -128.31033325195312, "logps/rejected": -180.2630615234375, "loss": 0.9432, "nll_loss": 0.9365717768669128, "rewards/accuracies": 1.0, "rewards/chosen": 2.0580642223358154, "rewards/margins": 10.560869216918945, "rewards/rejected": -8.50280475616455, "step": 5823 }, { "epoch": 0.9706666666666667, "grad_norm": 19.452192306518555, "learning_rate": 4.509438224618689e-10, "logits/chosen": 2.356020450592041, "logits/rejected": 2.5992982387542725, "logps/chosen": -54.91565704345703, "logps/rejected": -286.4068603515625, "loss": 0.5728, "nll_loss": 0.5661407709121704, "rewards/accuracies": 1.0, "rewards/chosen": 2.0435943603515625, "rewards/margins": 10.516329765319824, "rewards/rejected": -8.472735404968262, "step": 5824 }, { "epoch": 0.9708333333333333, "grad_norm": 24.87571144104004, "learning_rate": 4.4583781900413564e-10, "logits/chosen": 2.3885529041290283, "logits/rejected": 2.7894322872161865, "logps/chosen": -34.51902770996094, "logps/rejected": -210.38473510742188, "loss": 0.6197, "nll_loss": 0.6055969595909119, "rewards/accuracies": 1.0, "rewards/chosen": 1.2576560974121094, "rewards/margins": 10.289630889892578, "rewards/rejected": -9.031974792480469, "step": 5825 }, { "epoch": 0.971, "grad_norm": 68.250732421875, "learning_rate": 4.407608232392057e-10, "logits/chosen": 2.240201711654663, "logits/rejected": 2.1909265518188477, "logps/chosen": -33.337005615234375, "logps/rejected": -14.278975486755371, "loss": 1.9998, "nll_loss": 0.6803471446037292, "rewards/accuracies": 1.0, "rewards/chosen": 4.045481204986572, "rewards/margins": 0.5892083644866943, "rewards/rejected": 3.456272840499878, "step": 5826 }, { "epoch": 0.9711666666666666, "grad_norm": 28.42810821533203, "learning_rate": 4.357128366463736e-10, "logits/chosen": 2.034162759780884, "logits/rejected": 2.5262811183929443, "logps/chosen": -58.027854919433594, "logps/rejected": -533.9547119140625, "loss": 0.8545, "nll_loss": 0.8409834504127502, "rewards/accuracies": 1.0, "rewards/chosen": 1.3388572931289673, "rewards/margins": 8.835550308227539, "rewards/rejected": -7.496692657470703, "step": 5827 }, { "epoch": 0.9713333333333334, "grad_norm": 16.43939781188965, "learning_rate": 4.3069386069651825e-10, "logits/chosen": 1.8880157470703125, "logits/rejected": 1.717577576637268, "logps/chosen": -145.73782348632812, "logps/rejected": -212.51121520996094, "loss": 0.7551, "nll_loss": 0.7473732829093933, "rewards/accuracies": 1.0, "rewards/chosen": 2.5059967041015625, "rewards/margins": 7.960348606109619, "rewards/rejected": -5.454351902008057, "step": 5828 }, { "epoch": 0.9715, "grad_norm": 26.986289978027344, "learning_rate": 4.257038968520366e-10, "logits/chosen": 1.5163522958755493, "logits/rejected": 2.2983806133270264, "logps/chosen": -31.51030158996582, "logps/rejected": -355.0502014160156, "loss": 0.5493, "nll_loss": 0.52517169713974, "rewards/accuracies": 1.0, "rewards/chosen": 0.7071039080619812, "rewards/margins": 8.763649940490723, "rewards/rejected": -8.056546211242676, "step": 5829 }, { "epoch": 0.9716666666666667, "grad_norm": 75.44239807128906, "learning_rate": 4.207429465668877e-10, "logits/chosen": 3.186574697494507, "logits/rejected": 3.1674954891204834, "logps/chosen": -53.1906623840332, "logps/rejected": -142.6068115234375, "loss": 1.7769, "nll_loss": 1.773021936416626, "rewards/accuracies": 1.0, "rewards/chosen": 2.665975570678711, "rewards/margins": 10.307367324829102, "rewards/rejected": -7.641391277313232, "step": 5830 }, { "epoch": 0.9718333333333333, "grad_norm": 61.206111907958984, "learning_rate": 4.1581101128658203e-10, "logits/chosen": 1.8143030405044556, "logits/rejected": 2.209686040878296, "logps/chosen": -13.183490753173828, "logps/rejected": -113.26177215576172, "loss": 0.5347, "nll_loss": 0.5273396372795105, "rewards/accuracies": 1.0, "rewards/chosen": 2.7232513427734375, "rewards/margins": 8.05145263671875, "rewards/rejected": -5.328201770782471, "step": 5831 }, { "epoch": 0.972, "grad_norm": 32.432804107666016, "learning_rate": 4.1090809244814783e-10, "logits/chosen": 1.2586085796356201, "logits/rejected": 2.2324142456054688, "logps/chosen": -37.8587646484375, "logps/rejected": -235.48294067382812, "loss": 0.7555, "nll_loss": 0.7423286437988281, "rewards/accuracies": 1.0, "rewards/chosen": 1.3272590637207031, "rewards/margins": 10.592097282409668, "rewards/rejected": -9.264838218688965, "step": 5832 }, { "epoch": 0.9721666666666666, "grad_norm": 124.05040740966797, "learning_rate": 4.0603419148019793e-10, "logits/chosen": 2.4394092559814453, "logits/rejected": 2.5808491706848145, "logps/chosen": -67.70921325683594, "logps/rejected": -34.60646057128906, "loss": 2.0124, "nll_loss": 1.4719394445419312, "rewards/accuracies": 1.0, "rewards/chosen": 0.8104660511016846, "rewards/margins": 1.0563843250274658, "rewards/rejected": -0.24591827392578125, "step": 5833 }, { "epoch": 0.9723333333333334, "grad_norm": 172.04879760742188, "learning_rate": 4.011893098028629e-10, "logits/chosen": 3.2996442317962646, "logits/rejected": 3.3273720741271973, "logps/chosen": -44.184696197509766, "logps/rejected": -39.380558013916016, "loss": 2.6994, "nll_loss": 0.6052696704864502, "rewards/accuracies": 0.0, "rewards/chosen": 1.2793320417404175, "rewards/margins": -1.366475224494934, "rewards/rejected": 2.6458072662353516, "step": 5834 }, { "epoch": 0.9725, "grad_norm": 25.94898796081543, "learning_rate": 3.9637344882782475e-10, "logits/chosen": 3.4911303520202637, "logits/rejected": 3.401373863220215, "logps/chosen": -75.13002014160156, "logps/rejected": -179.3685760498047, "loss": 0.8112, "nll_loss": 0.7908422946929932, "rewards/accuracies": 1.0, "rewards/chosen": 0.8777405023574829, "rewards/margins": 9.741255760192871, "rewards/rejected": -8.86351490020752, "step": 5835 }, { "epoch": 0.9726666666666667, "grad_norm": 32.71628189086914, "learning_rate": 3.915866099583054e-10, "logits/chosen": 1.2169299125671387, "logits/rejected": 2.0007851123809814, "logps/chosen": -83.93248748779297, "logps/rejected": -275.73992919921875, "loss": 1.105, "nll_loss": 1.0900324583053589, "rewards/accuracies": 1.0, "rewards/chosen": 1.1968040466308594, "rewards/margins": 10.751338958740234, "rewards/rejected": -9.554534912109375, "step": 5836 }, { "epoch": 0.9728333333333333, "grad_norm": 127.41504669189453, "learning_rate": 3.8682879458907803e-10, "logits/chosen": 2.8808681964874268, "logits/rejected": 2.887700319290161, "logps/chosen": -29.892467498779297, "logps/rejected": -48.74406051635742, "loss": 1.2925, "nll_loss": 0.636009931564331, "rewards/accuracies": 1.0, "rewards/chosen": 0.0745750442147255, "rewards/margins": 0.5434673428535461, "rewards/rejected": -0.46889230608940125, "step": 5837 }, { "epoch": 0.973, "grad_norm": 23.489749908447266, "learning_rate": 3.8210000410645595e-10, "logits/chosen": 2.7627787590026855, "logits/rejected": 2.7487776279449463, "logps/chosen": -8.55379581451416, "logps/rejected": -195.313720703125, "loss": 0.2709, "nll_loss": 0.2673061192035675, "rewards/accuracies": 1.0, "rewards/chosen": 2.758232593536377, "rewards/margins": 10.53398323059082, "rewards/rejected": -7.775750637054443, "step": 5838 }, { "epoch": 0.9731666666666666, "grad_norm": 489.5653991699219, "learning_rate": 3.774002398883036e-10, "logits/chosen": 2.5203967094421387, "logits/rejected": 2.531691551208496, "logps/chosen": -276.06011962890625, "logps/rejected": -282.4498291015625, "loss": 4.689, "nll_loss": 1.725375771522522, "rewards/accuracies": 0.0, "rewards/chosen": -8.997420310974121, "rewards/margins": -1.6144485473632812, "rewards/rejected": -7.38297176361084, "step": 5839 }, { "epoch": 0.9733333333333334, "grad_norm": 32.7166633605957, "learning_rate": 3.7272950330400345e-10, "logits/chosen": 1.8199764490127563, "logits/rejected": 1.650580883026123, "logps/chosen": -141.3330078125, "logps/rejected": -156.15402221679688, "loss": 1.2563, "nll_loss": 1.1777751445770264, "rewards/accuracies": 1.0, "rewards/chosen": 2.0038986206054688, "rewards/margins": 4.384393692016602, "rewards/rejected": -2.3804948329925537, "step": 5840 }, { "epoch": 0.9735, "grad_norm": 86.3743667602539, "learning_rate": 3.680877957145112e-10, "logits/chosen": 2.2818603515625, "logits/rejected": 2.457595109939575, "logps/chosen": -86.20050048828125, "logps/rejected": -97.79671478271484, "loss": 1.3741, "nll_loss": 0.9472582340240479, "rewards/accuracies": 1.0, "rewards/chosen": 1.8478379249572754, "rewards/margins": 1.9108086824417114, "rewards/rejected": -0.06297073513269424, "step": 5841 }, { "epoch": 0.9736666666666667, "grad_norm": 41.38557434082031, "learning_rate": 3.634751184723006e-10, "logits/chosen": 2.6999588012695312, "logits/rejected": 2.8852407932281494, "logps/chosen": -17.398160934448242, "logps/rejected": -324.0985107421875, "loss": 0.6335, "nll_loss": 0.6213628649711609, "rewards/accuracies": 1.0, "rewards/chosen": 4.592955112457275, "rewards/margins": 8.784040451049805, "rewards/rejected": -4.1910858154296875, "step": 5842 }, { "epoch": 0.9738333333333333, "grad_norm": 37.595916748046875, "learning_rate": 3.5889147292138543e-10, "logits/chosen": 2.9198148250579834, "logits/rejected": 2.865814685821533, "logps/chosen": -27.661115646362305, "logps/rejected": -81.50318145751953, "loss": 0.7016, "nll_loss": 0.6746613383293152, "rewards/accuracies": 1.0, "rewards/chosen": 2.075211524963379, "rewards/margins": 5.792284965515137, "rewards/rejected": -3.717073440551758, "step": 5843 }, { "epoch": 0.974, "grad_norm": 102.78314971923828, "learning_rate": 3.5433686039735287e-10, "logits/chosen": 2.936854839324951, "logits/rejected": 2.9064748287200928, "logps/chosen": -43.1617431640625, "logps/rejected": -35.77372741699219, "loss": 1.642, "nll_loss": 1.541491150856018, "rewards/accuracies": 1.0, "rewards/chosen": 0.13217811286449432, "rewards/margins": 3.2889246940612793, "rewards/rejected": -3.1567466259002686, "step": 5844 }, { "epoch": 0.9741666666666666, "grad_norm": 81.80791473388672, "learning_rate": 3.498112822272858e-10, "logits/chosen": 2.6244096755981445, "logits/rejected": 2.7700531482696533, "logps/chosen": -18.721481323242188, "logps/rejected": -37.29141616821289, "loss": 1.4635, "nll_loss": 0.6240493655204773, "rewards/accuracies": 1.0, "rewards/chosen": 3.3618011474609375, "rewards/margins": 1.4314112663269043, "rewards/rejected": 1.9303898811340332, "step": 5845 }, { "epoch": 0.9743333333333334, "grad_norm": 175.49880981445312, "learning_rate": 3.453147397298517e-10, "logits/chosen": 2.496521472930908, "logits/rejected": 2.6645596027374268, "logps/chosen": -28.399595260620117, "logps/rejected": -244.53170776367188, "loss": 2.6219, "nll_loss": 2.5817813873291016, "rewards/accuracies": 1.0, "rewards/chosen": 0.1857929229736328, "rewards/margins": 6.955428123474121, "rewards/rejected": -6.769635200500488, "step": 5846 }, { "epoch": 0.9745, "grad_norm": 24.032106399536133, "learning_rate": 3.4084723421521354e-10, "logits/chosen": 2.099869728088379, "logits/rejected": 1.8304787874221802, "logps/chosen": -116.73240661621094, "logps/rejected": -119.4886474609375, "loss": 1.1184, "nll_loss": 1.1012492179870605, "rewards/accuracies": 1.0, "rewards/chosen": 2.19720458984375, "rewards/margins": 6.496563911437988, "rewards/rejected": -4.299359321594238, "step": 5847 }, { "epoch": 0.9746666666666667, "grad_norm": 21.521406173706055, "learning_rate": 3.364087669851079e-10, "logits/chosen": 1.4803169965744019, "logits/rejected": 1.3178026676177979, "logps/chosen": -44.3773193359375, "logps/rejected": -102.64630126953125, "loss": 0.5366, "nll_loss": 0.5220860838890076, "rewards/accuracies": 1.0, "rewards/chosen": 1.572218418121338, "rewards/margins": 6.982473850250244, "rewards/rejected": -5.410255432128906, "step": 5848 }, { "epoch": 0.9748333333333333, "grad_norm": 69.50468444824219, "learning_rate": 3.3199933933278914e-10, "logits/chosen": 2.7961080074310303, "logits/rejected": 2.782024383544922, "logps/chosen": -151.19654846191406, "logps/rejected": -183.24136352539062, "loss": 1.6894, "nll_loss": 1.5272376537322998, "rewards/accuracies": 1.0, "rewards/chosen": -1.5256272554397583, "rewards/margins": 4.952045440673828, "rewards/rejected": -6.477672576904297, "step": 5849 }, { "epoch": 0.975, "grad_norm": 12.709113121032715, "learning_rate": 3.276189525430628e-10, "logits/chosen": 2.624682664871216, "logits/rejected": 2.6332828998565674, "logps/chosen": -256.75677490234375, "logps/rejected": -358.2707824707031, "loss": 0.8499, "nll_loss": 0.8473823070526123, "rewards/accuracies": 1.0, "rewards/chosen": 3.1177978515625, "rewards/margins": 11.192593574523926, "rewards/rejected": -8.074795722961426, "step": 5850 }, { "epoch": 0.9751666666666666, "grad_norm": 149.3955535888672, "learning_rate": 3.232676078922636e-10, "logits/chosen": 2.790503740310669, "logits/rejected": 3.014754295349121, "logps/chosen": -54.877906799316406, "logps/rejected": -348.7308349609375, "loss": 2.5792, "nll_loss": 2.494450330734253, "rewards/accuracies": 1.0, "rewards/chosen": -0.7152854800224304, "rewards/margins": 7.348533630371094, "rewards/rejected": -8.06381893157959, "step": 5851 }, { "epoch": 0.9753333333333334, "grad_norm": 22.944297790527344, "learning_rate": 3.189453066482883e-10, "logits/chosen": 2.193150520324707, "logits/rejected": 1.992810606956482, "logps/chosen": -74.57832336425781, "logps/rejected": -74.83084106445312, "loss": 0.7926, "nll_loss": 0.768848717212677, "rewards/accuracies": 1.0, "rewards/chosen": 1.8836175203323364, "rewards/margins": 5.877761363983154, "rewards/rejected": -3.9941439628601074, "step": 5852 }, { "epoch": 0.9755, "grad_norm": 27.93329620361328, "learning_rate": 3.1465205007052963e-10, "logits/chosen": 2.3157806396484375, "logits/rejected": 2.469954252243042, "logps/chosen": -38.64173889160156, "logps/rejected": -119.352783203125, "loss": 0.6974, "nll_loss": 0.6662368178367615, "rewards/accuracies": 1.0, "rewards/chosen": 2.8597259521484375, "rewards/margins": 6.175865173339844, "rewards/rejected": -3.3161392211914062, "step": 5853 }, { "epoch": 0.9756666666666667, "grad_norm": 28.904909133911133, "learning_rate": 3.103878394099424e-10, "logits/chosen": 1.9070961475372314, "logits/rejected": 1.7756965160369873, "logps/chosen": -128.03091430664062, "logps/rejected": -150.1404266357422, "loss": 1.1518, "nll_loss": 1.1431329250335693, "rewards/accuracies": 1.0, "rewards/chosen": 1.7927124500274658, "rewards/margins": 9.621907234191895, "rewards/rejected": -7.82919454574585, "step": 5854 }, { "epoch": 0.9758333333333333, "grad_norm": 23.38251304626465, "learning_rate": 3.061526759090327e-10, "logits/chosen": 2.622368812561035, "logits/rejected": 2.749417543411255, "logps/chosen": -13.862931251525879, "logps/rejected": -127.71839904785156, "loss": 0.2961, "nll_loss": 0.288811057806015, "rewards/accuracies": 1.0, "rewards/chosen": 2.0556602478027344, "rewards/margins": 8.951847076416016, "rewards/rejected": -6.896186351776123, "step": 5855 }, { "epoch": 0.976, "grad_norm": 26.631580352783203, "learning_rate": 3.0194656080180236e-10, "logits/chosen": 0.6878515481948853, "logits/rejected": 1.8082828521728516, "logps/chosen": -34.41923522949219, "logps/rejected": -316.804931640625, "loss": 0.6231, "nll_loss": 0.6146292090415955, "rewards/accuracies": 1.0, "rewards/chosen": 1.7828407287597656, "rewards/margins": 11.24890422821045, "rewards/rejected": -9.466063499450684, "step": 5856 }, { "epoch": 0.9761666666666666, "grad_norm": 24.601991653442383, "learning_rate": 2.9776949531382656e-10, "logits/chosen": 2.266768455505371, "logits/rejected": 2.340977191925049, "logps/chosen": -63.17625427246094, "logps/rejected": -142.2534637451172, "loss": 0.7316, "nll_loss": 0.7179120182991028, "rewards/accuracies": 1.0, "rewards/chosen": 1.6995857954025269, "rewards/margins": 7.015093803405762, "rewards/rejected": -5.315507888793945, "step": 5857 }, { "epoch": 0.9763333333333334, "grad_norm": 30.20539093017578, "learning_rate": 2.936214806621984e-10, "logits/chosen": 0.7055264115333557, "logits/rejected": 2.8541197776794434, "logps/chosen": -8.73112964630127, "logps/rejected": -240.93832397460938, "loss": 0.3329, "nll_loss": 0.31182602047920227, "rewards/accuracies": 1.0, "rewards/chosen": 0.8442893028259277, "rewards/margins": 9.010961532592773, "rewards/rejected": -8.166671752929688, "step": 5858 }, { "epoch": 0.9765, "grad_norm": 258.98388671875, "learning_rate": 2.8950251805553996e-10, "logits/chosen": 3.2145512104034424, "logits/rejected": 3.2381482124328613, "logps/chosen": -218.29086303710938, "logps/rejected": -331.8568115234375, "loss": 1.7779, "nll_loss": 1.1369315385818481, "rewards/accuracies": 1.0, "rewards/chosen": -3.97613525390625, "rewards/margins": 1.984057903289795, "rewards/rejected": -5.960193157196045, "step": 5859 }, { "epoch": 0.9766666666666667, "grad_norm": 22.033645629882812, "learning_rate": 2.854126086940356e-10, "logits/chosen": 1.4277327060699463, "logits/rejected": 1.3293507099151611, "logps/chosen": -69.07514953613281, "logps/rejected": -112.81071472167969, "loss": 0.7016, "nll_loss": 0.6977288126945496, "rewards/accuracies": 1.0, "rewards/chosen": 2.955214023590088, "rewards/margins": 9.477821350097656, "rewards/rejected": -6.522607326507568, "step": 5860 }, { "epoch": 0.9768333333333333, "grad_norm": 27.5792179107666, "learning_rate": 2.813517537693877e-10, "logits/chosen": 2.2036046981811523, "logits/rejected": 2.330615758895874, "logps/chosen": -44.75181198120117, "logps/rejected": -158.0112762451172, "loss": 0.6745, "nll_loss": 0.6581149101257324, "rewards/accuracies": 1.0, "rewards/chosen": 1.3236286640167236, "rewards/margins": 7.047402381896973, "rewards/rejected": -5.723773956298828, "step": 5861 }, { "epoch": 0.977, "grad_norm": 54.726219177246094, "learning_rate": 2.7731995446481635e-10, "logits/chosen": 2.7424120903015137, "logits/rejected": 2.7503914833068848, "logps/chosen": -17.970426559448242, "logps/rejected": -37.60721206665039, "loss": 0.8384, "nll_loss": 0.7813228964805603, "rewards/accuracies": 1.0, "rewards/chosen": 1.94333016872406, "rewards/margins": 4.740488052368164, "rewards/rejected": -2.7971577644348145, "step": 5862 }, { "epoch": 0.9771666666666666, "grad_norm": 30.857929229736328, "learning_rate": 2.7331721195509305e-10, "logits/chosen": 2.7000858783721924, "logits/rejected": 2.8769798278808594, "logps/chosen": -44.713497161865234, "logps/rejected": -396.9710693359375, "loss": 0.7247, "nll_loss": 0.7097380757331848, "rewards/accuracies": 1.0, "rewards/chosen": 1.1965423822402954, "rewards/margins": 9.934617042541504, "rewards/rejected": -8.73807430267334, "step": 5863 }, { "epoch": 0.9773333333333334, "grad_norm": 45.358741760253906, "learning_rate": 2.6934352740652923e-10, "logits/chosen": 3.1837003231048584, "logits/rejected": 3.210038900375366, "logps/chosen": -20.511363983154297, "logps/rejected": -68.38057708740234, "loss": 0.8272, "nll_loss": 0.8204545378684998, "rewards/accuracies": 1.0, "rewards/chosen": 3.156848907470703, "rewards/margins": 8.335183143615723, "rewards/rejected": -5.1783342361450195, "step": 5864 }, { "epoch": 0.9775, "grad_norm": 150.40374755859375, "learning_rate": 2.6539890197695427e-10, "logits/chosen": 2.0391247272491455, "logits/rejected": 2.0949950218200684, "logps/chosen": -49.317604064941406, "logps/rejected": -32.1453857421875, "loss": 2.3388, "nll_loss": 0.7954452037811279, "rewards/accuracies": 0.0, "rewards/chosen": 1.7081397771835327, "rewards/margins": -0.5493568181991577, "rewards/rejected": 2.2574965953826904, "step": 5865 }, { "epoch": 0.9776666666666667, "grad_norm": 23.52085304260254, "learning_rate": 2.6148333681573763e-10, "logits/chosen": 2.4332003593444824, "logits/rejected": 2.6569089889526367, "logps/chosen": -72.96188354492188, "logps/rejected": -233.3675079345703, "loss": 0.6406, "nll_loss": 0.6289818286895752, "rewards/accuracies": 1.0, "rewards/chosen": 1.4672530889511108, "rewards/margins": 9.7042818069458, "rewards/rejected": -8.237029075622559, "step": 5866 }, { "epoch": 0.9778333333333333, "grad_norm": 37.083587646484375, "learning_rate": 2.57596833063789e-10, "logits/chosen": 2.2106688022613525, "logits/rejected": 1.7227790355682373, "logps/chosen": -289.6327819824219, "logps/rejected": -117.64529418945312, "loss": 1.1119, "nll_loss": 1.0807193517684937, "rewards/accuracies": 1.0, "rewards/chosen": 1.5037750005722046, "rewards/margins": 5.3362274169921875, "rewards/rejected": -3.8324525356292725, "step": 5867 }, { "epoch": 0.978, "grad_norm": 107.29953002929688, "learning_rate": 2.537393918535358e-10, "logits/chosen": 2.9997711181640625, "logits/rejected": 2.9529080390930176, "logps/chosen": -57.30234146118164, "logps/rejected": -38.02119064331055, "loss": 2.0003, "nll_loss": 1.9100781679153442, "rewards/accuracies": 1.0, "rewards/chosen": 1.068515419960022, "rewards/margins": 3.636739730834961, "rewards/rejected": -2.5682244300842285, "step": 5868 }, { "epoch": 0.9781666666666666, "grad_norm": 24.55567741394043, "learning_rate": 2.4991101430895666e-10, "logits/chosen": 2.544739246368408, "logits/rejected": 2.5233964920043945, "logps/chosen": -73.602783203125, "logps/rejected": -78.91938781738281, "loss": 0.8947, "nll_loss": 0.8867805004119873, "rewards/accuracies": 1.0, "rewards/chosen": 2.9740962982177734, "rewards/margins": 7.995815277099609, "rewards/rejected": -5.021718978881836, "step": 5869 }, { "epoch": 0.9783333333333334, "grad_norm": 72.38225555419922, "learning_rate": 2.461117015455261e-10, "logits/chosen": 2.188910722732544, "logits/rejected": 2.6241414546966553, "logps/chosen": -78.54166412353516, "logps/rejected": -306.4725341796875, "loss": 1.0313, "nll_loss": 0.9027777314186096, "rewards/accuracies": 1.0, "rewards/chosen": 0.9134262204170227, "rewards/margins": 3.0915513038635254, "rewards/rejected": -2.1781251430511475, "step": 5870 }, { "epoch": 0.9785, "grad_norm": 259.0832214355469, "learning_rate": 2.4234145467028067e-10, "logits/chosen": 1.5740807056427002, "logits/rejected": 1.8884268999099731, "logps/chosen": -273.7763671875, "logps/rejected": -423.3017883300781, "loss": 2.7482, "nll_loss": 1.6296213865280151, "rewards/accuracies": 1.0, "rewards/chosen": -5.813831329345703, "rewards/margins": 1.1177911758422852, "rewards/rejected": -6.931622505187988, "step": 5871 }, { "epoch": 0.9786666666666667, "grad_norm": 31.795629501342773, "learning_rate": 2.386002747817861e-10, "logits/chosen": 1.5855621099472046, "logits/rejected": 1.9498108625411987, "logps/chosen": -51.65001678466797, "logps/rejected": -124.89813995361328, "loss": 0.7785, "nll_loss": 0.7595590949058533, "rewards/accuracies": 1.0, "rewards/chosen": 1.9457719326019287, "rewards/margins": 6.25618839263916, "rewards/rejected": -4.3104166984558105, "step": 5872 }, { "epoch": 0.9788333333333333, "grad_norm": 33.64351272583008, "learning_rate": 2.3488816297013713e-10, "logits/chosen": 3.0135107040405273, "logits/rejected": 3.618669033050537, "logps/chosen": -51.65365219116211, "logps/rejected": -145.05931091308594, "loss": 0.9182, "nll_loss": 0.9062047004699707, "rewards/accuracies": 1.0, "rewards/chosen": 1.8200490474700928, "rewards/margins": 7.275282859802246, "rewards/rejected": -5.455234050750732, "step": 5873 }, { "epoch": 0.979, "grad_norm": 162.8438262939453, "learning_rate": 2.3120512031693518e-10, "logits/chosen": 0.9817222952842712, "logits/rejected": 2.267465114593506, "logps/chosen": -19.891311645507812, "logps/rejected": -313.56231689453125, "loss": 1.715, "nll_loss": 1.657609462738037, "rewards/accuracies": 1.0, "rewards/chosen": 0.9841399192810059, "rewards/margins": 4.252718925476074, "rewards/rejected": -3.2685792446136475, "step": 5874 }, { "epoch": 0.9791666666666666, "grad_norm": 22.432205200195312, "learning_rate": 2.2755114789534402e-10, "logits/chosen": 3.0402657985687256, "logits/rejected": 3.022274971008301, "logps/chosen": -76.2109146118164, "logps/rejected": -51.01921463012695, "loss": 0.8347, "nll_loss": 0.7938635945320129, "rewards/accuracies": 1.0, "rewards/chosen": 2.6465706825256348, "rewards/margins": 5.6837382316589355, "rewards/rejected": -3.037167549133301, "step": 5875 }, { "epoch": 0.9793333333333333, "grad_norm": 28.819000244140625, "learning_rate": 2.2392624677004535e-10, "logits/chosen": 1.4982184171676636, "logits/rejected": 2.24084734916687, "logps/chosen": -56.3655891418457, "logps/rejected": -578.1651000976562, "loss": 0.8403, "nll_loss": 0.8289057016372681, "rewards/accuracies": 1.0, "rewards/chosen": 1.4632290601730347, "rewards/margins": 23.151185989379883, "rewards/rejected": -21.687957763671875, "step": 5876 }, { "epoch": 0.9795, "grad_norm": 53.6217041015625, "learning_rate": 2.2033041799723872e-10, "logits/chosen": 2.7828001976013184, "logits/rejected": 2.774303436279297, "logps/chosen": -11.722301483154297, "logps/rejected": -207.65872192382812, "loss": 0.5347, "nll_loss": 0.5328319072723389, "rewards/accuracies": 1.0, "rewards/chosen": 3.635399341583252, "rewards/margins": 10.972135543823242, "rewards/rejected": -7.336735725402832, "step": 5877 }, { "epoch": 0.9796666666666667, "grad_norm": 21.965559005737305, "learning_rate": 2.1676366262467492e-10, "logits/chosen": 2.3519721031188965, "logits/rejected": 2.457427501678467, "logps/chosen": -113.08783721923828, "logps/rejected": -275.153076171875, "loss": 0.9977, "nll_loss": 0.991998553276062, "rewards/accuracies": 1.0, "rewards/chosen": 2.2169549465179443, "rewards/margins": 10.34838581085205, "rewards/rejected": -8.131430625915527, "step": 5878 }, { "epoch": 0.9798333333333333, "grad_norm": 23.152923583984375, "learning_rate": 2.1322598169160043e-10, "logits/chosen": 1.2175672054290771, "logits/rejected": 2.3656692504882812, "logps/chosen": -84.71357727050781, "logps/rejected": -301.4629211425781, "loss": 0.9083, "nll_loss": 0.9012082815170288, "rewards/accuracies": 1.0, "rewards/chosen": 1.9548263549804688, "rewards/margins": 12.007731437683105, "rewards/rejected": -10.052905082702637, "step": 5879 }, { "epoch": 0.98, "grad_norm": 32.017982482910156, "learning_rate": 2.097173762288351e-10, "logits/chosen": 2.8474926948547363, "logits/rejected": 3.3040974140167236, "logps/chosen": -18.255123138427734, "logps/rejected": -262.4313659667969, "loss": 0.5903, "nll_loss": 0.588874876499176, "rewards/accuracies": 1.0, "rewards/chosen": 3.8270974159240723, "rewards/margins": 11.513494491577148, "rewards/rejected": -7.686397552490234, "step": 5880 }, { "epoch": 0.9801666666666666, "grad_norm": 130.83335876464844, "learning_rate": 2.0623784725868343e-10, "logits/chosen": 2.7157864570617676, "logits/rejected": 2.603612184524536, "logps/chosen": -22.837993621826172, "logps/rejected": -5.693270683288574, "loss": 1.7893, "nll_loss": 0.5437617301940918, "rewards/accuracies": 0.0, "rewards/chosen": 1.3634953498840332, "rewards/margins": -0.19588148593902588, "rewards/rejected": 1.559376835823059, "step": 5881 }, { "epoch": 0.9803333333333333, "grad_norm": 18.97355842590332, "learning_rate": 2.0278739579501213e-10, "logits/chosen": 2.062687873840332, "logits/rejected": 2.3443586826324463, "logps/chosen": -16.480493545532227, "logps/rejected": -143.454345703125, "loss": 0.3331, "nll_loss": 0.3169325292110443, "rewards/accuracies": 1.0, "rewards/chosen": 1.9201133251190186, "rewards/margins": 6.534071922302246, "rewards/rejected": -4.613958835601807, "step": 5882 }, { "epoch": 0.9805, "grad_norm": 21.3931884765625, "learning_rate": 1.993660228431837e-10, "logits/chosen": 1.7770746946334839, "logits/rejected": 2.0916786193847656, "logps/chosen": -83.67880249023438, "logps/rejected": -160.13534545898438, "loss": 0.7497, "nll_loss": 0.7405204176902771, "rewards/accuracies": 1.0, "rewards/chosen": 2.1463775634765625, "rewards/margins": 7.7149763107299805, "rewards/rejected": -5.568598747253418, "step": 5883 }, { "epoch": 0.9806666666666667, "grad_norm": 571.8963012695312, "learning_rate": 1.9597372940012292e-10, "logits/chosen": 1.676084280014038, "logits/rejected": 1.262224555015564, "logps/chosen": -294.65802001953125, "logps/rejected": -176.8779754638672, "loss": 3.0954, "nll_loss": 0.8516127467155457, "rewards/accuracies": 0.0, "rewards/chosen": -7.780957221984863, "rewards/margins": -0.8429336547851562, "rewards/rejected": -6.938023567199707, "step": 5884 }, { "epoch": 0.9808333333333333, "grad_norm": 27.174747467041016, "learning_rate": 1.9261051645423908e-10, "logits/chosen": 1.3290642499923706, "logits/rejected": 1.2368110418319702, "logps/chosen": -104.74075317382812, "logps/rejected": -139.85015869140625, "loss": 0.9084, "nll_loss": 0.8952199816703796, "rewards/accuracies": 1.0, "rewards/chosen": 1.3681076765060425, "rewards/margins": 8.73691463470459, "rewards/rejected": -7.368807315826416, "step": 5885 }, { "epoch": 0.981, "grad_norm": 35.48173522949219, "learning_rate": 1.8927638498551503e-10, "logits/chosen": 1.7325537204742432, "logits/rejected": 2.7594990730285645, "logps/chosen": -28.528562545776367, "logps/rejected": -394.0433044433594, "loss": 0.6712, "nll_loss": 0.6634548306465149, "rewards/accuracies": 1.0, "rewards/chosen": 1.9403982162475586, "rewards/margins": 9.296943664550781, "rewards/rejected": -7.356545925140381, "step": 5886 }, { "epoch": 0.9811666666666666, "grad_norm": 59.48823165893555, "learning_rate": 1.8597133596541804e-10, "logits/chosen": 1.70353364944458, "logits/rejected": 1.5373612642288208, "logps/chosen": -69.45978546142578, "logps/rejected": -65.64230346679688, "loss": 1.0203, "nll_loss": 0.7160802483558655, "rewards/accuracies": 1.0, "rewards/chosen": 1.4886085987091064, "rewards/margins": 2.2302255630493164, "rewards/rejected": -0.7416168451309204, "step": 5887 }, { "epoch": 0.9813333333333333, "grad_norm": 33.01680374145508, "learning_rate": 1.826953703569667e-10, "logits/chosen": 0.07928357273340225, "logits/rejected": 1.9837566614151, "logps/chosen": -117.48074340820312, "logps/rejected": -424.8128356933594, "loss": 1.6947, "nll_loss": 1.678296446800232, "rewards/accuracies": 1.0, "rewards/chosen": 1.085957407951355, "rewards/margins": 14.251402854919434, "rewards/rejected": -13.165445327758789, "step": 5888 }, { "epoch": 0.9815, "grad_norm": 33.55736541748047, "learning_rate": 1.7944848911470856e-10, "logits/chosen": 2.4103519916534424, "logits/rejected": 2.5674328804016113, "logps/chosen": -20.283845901489258, "logps/rejected": -383.8516845703125, "loss": 0.7552, "nll_loss": 0.7512535452842712, "rewards/accuracies": 1.0, "rewards/chosen": 2.72536039352417, "rewards/margins": 9.872064590454102, "rewards/rejected": -7.146704196929932, "step": 5889 }, { "epoch": 0.9816666666666667, "grad_norm": 33.59649658203125, "learning_rate": 1.7623069318469795e-10, "logits/chosen": 1.7321393489837646, "logits/rejected": 2.7362053394317627, "logps/chosen": -37.3878059387207, "logps/rejected": -452.578125, "loss": 0.9899, "nll_loss": 0.9838895201683044, "rewards/accuracies": 1.0, "rewards/chosen": 3.1607418060302734, "rewards/margins": 8.519725799560547, "rewards/rejected": -5.358984470367432, "step": 5890 }, { "epoch": 0.9818333333333333, "grad_norm": 52.277076721191406, "learning_rate": 1.7304198350451826e-10, "logits/chosen": 2.5909013748168945, "logits/rejected": 2.4675307273864746, "logps/chosen": -15.058616638183594, "logps/rejected": -53.646602630615234, "loss": 0.7105, "nll_loss": 0.6844825744628906, "rewards/accuracies": 1.0, "rewards/chosen": 1.8684924840927124, "rewards/margins": 5.742867946624756, "rewards/rejected": -3.874375343322754, "step": 5891 }, { "epoch": 0.982, "grad_norm": 114.09446716308594, "learning_rate": 1.698823610032929e-10, "logits/chosen": 1.9159361124038696, "logits/rejected": 1.8683655261993408, "logps/chosen": -35.81664276123047, "logps/rejected": -257.8143310546875, "loss": 1.4813, "nll_loss": 1.3775629997253418, "rewards/accuracies": 1.0, "rewards/chosen": -0.4763343930244446, "rewards/margins": 3.579488515853882, "rewards/rejected": -4.055822849273682, "step": 5892 }, { "epoch": 0.9821666666666666, "grad_norm": 61.450687408447266, "learning_rate": 1.667518266016521e-10, "logits/chosen": 2.5912370681762695, "logits/rejected": 2.443743944168091, "logps/chosen": -161.66729736328125, "logps/rejected": -59.18910598754883, "loss": 1.0646, "nll_loss": 0.8333366513252258, "rewards/accuracies": 1.0, "rewards/chosen": 0.8503906726837158, "rewards/margins": 2.280970811843872, "rewards/rejected": -1.4305801391601562, "step": 5893 }, { "epoch": 0.9823333333333333, "grad_norm": 44.27067947387695, "learning_rate": 1.6365038121176623e-10, "logits/chosen": 1.7768189907073975, "logits/rejected": 2.122986078262329, "logps/chosen": -19.431734085083008, "logps/rejected": -80.48783111572266, "loss": 0.6428, "nll_loss": 0.607241690158844, "rewards/accuracies": 1.0, "rewards/chosen": 0.6503229737281799, "rewards/margins": 5.364344596862793, "rewards/rejected": -4.714021682739258, "step": 5894 }, { "epoch": 0.9825, "grad_norm": 32.58271789550781, "learning_rate": 1.605780257373124e-10, "logits/chosen": 3.4848036766052246, "logits/rejected": 3.417992353439331, "logps/chosen": -34.77290344238281, "logps/rejected": -82.4260482788086, "loss": 0.7005, "nll_loss": 0.6687096953392029, "rewards/accuracies": 1.0, "rewards/chosen": 2.6773319244384766, "rewards/margins": 6.0071868896484375, "rewards/rejected": -3.329854965209961, "step": 5895 }, { "epoch": 0.9826666666666667, "grad_norm": 19.30023765563965, "learning_rate": 1.5753476107350782e-10, "logits/chosen": 2.466508388519287, "logits/rejected": 2.52388596534729, "logps/chosen": -132.63133239746094, "logps/rejected": -293.49102783203125, "loss": 0.951, "nll_loss": 0.9473666548728943, "rewards/accuracies": 1.0, "rewards/chosen": 2.774257183074951, "rewards/margins": 10.161041259765625, "rewards/rejected": -7.386784553527832, "step": 5896 }, { "epoch": 0.9828333333333333, "grad_norm": 25.62708854675293, "learning_rate": 1.5452058810708768e-10, "logits/chosen": 2.5304877758026123, "logits/rejected": 2.580547571182251, "logps/chosen": -142.36257934570312, "logps/rejected": -285.21954345703125, "loss": 1.1406, "nll_loss": 1.1209653615951538, "rewards/accuracies": 1.0, "rewards/chosen": 0.9650116562843323, "rewards/margins": 7.7950286865234375, "rewards/rejected": -6.83001708984375, "step": 5897 }, { "epoch": 0.983, "grad_norm": 20.742992401123047, "learning_rate": 1.5153550771630496e-10, "logits/chosen": 1.2215055227279663, "logits/rejected": 1.8447452783584595, "logps/chosen": -89.60064697265625, "logps/rejected": -489.6530456542969, "loss": 0.8251, "nll_loss": 0.8220241665840149, "rewards/accuracies": 1.0, "rewards/chosen": 2.8056793212890625, "rewards/margins": 13.139259338378906, "rewards/rejected": -10.333580017089844, "step": 5898 }, { "epoch": 0.9831666666666666, "grad_norm": 23.111705780029297, "learning_rate": 1.4857952077094171e-10, "logits/chosen": 0.5415946245193481, "logits/rejected": 1.6885850429534912, "logps/chosen": -97.29023742675781, "logps/rejected": -374.4372253417969, "loss": 0.8681, "nll_loss": 0.8609756827354431, "rewards/accuracies": 1.0, "rewards/chosen": 1.9540451765060425, "rewards/margins": 11.444948196411133, "rewards/rejected": -9.4909029006958, "step": 5899 }, { "epoch": 0.9833333333333333, "grad_norm": 70.00817108154297, "learning_rate": 1.4565262813230894e-10, "logits/chosen": 2.089883327484131, "logits/rejected": 2.447169542312622, "logps/chosen": -33.97211837768555, "logps/rejected": -370.2718200683594, "loss": 1.1361, "nll_loss": 1.1324039697647095, "rewards/accuracies": 1.0, "rewards/chosen": 2.8857624530792236, "rewards/margins": 9.730058670043945, "rewards/rejected": -6.844296455383301, "step": 5900 }, { "epoch": 0.9835, "grad_norm": 24.086034774780273, "learning_rate": 1.4275483065321337e-10, "logits/chosen": 2.644678831100464, "logits/rejected": 2.805762529373169, "logps/chosen": -25.101301193237305, "logps/rejected": -159.97821044921875, "loss": 0.445, "nll_loss": 0.43278101086616516, "rewards/accuracies": 1.0, "rewards/chosen": 1.8169372081756592, "rewards/margins": 7.20561408996582, "rewards/rejected": -5.388676643371582, "step": 5901 }, { "epoch": 0.9836666666666667, "grad_norm": 35.0061149597168, "learning_rate": 1.398861291780351e-10, "logits/chosen": 1.6530548334121704, "logits/rejected": 1.8875110149383545, "logps/chosen": -48.62410354614258, "logps/rejected": -117.93768310546875, "loss": 0.8439, "nll_loss": 0.8241373896598816, "rewards/accuracies": 1.0, "rewards/chosen": 0.9984623789787292, "rewards/margins": 7.328639507293701, "rewards/rejected": -6.330177307128906, "step": 5902 }, { "epoch": 0.9838333333333333, "grad_norm": 28.507137298583984, "learning_rate": 1.3704652454261667e-10, "logits/chosen": 1.6580557823181152, "logits/rejected": 1.8156543970108032, "logps/chosen": -105.57191467285156, "logps/rejected": -142.93849182128906, "loss": 1.1462, "nll_loss": 1.1351819038391113, "rewards/accuracies": 1.0, "rewards/chosen": 2.004504442214966, "rewards/margins": 7.3328857421875, "rewards/rejected": -5.328381538391113, "step": 5903 }, { "epoch": 0.984, "grad_norm": 14.085370063781738, "learning_rate": 1.3423601757436288e-10, "logits/chosen": 0.7496919631958008, "logits/rejected": 1.506903886795044, "logps/chosen": -158.84722900390625, "logps/rejected": -529.637451171875, "loss": 0.6789, "nll_loss": 0.6759455800056458, "rewards/accuracies": 1.0, "rewards/chosen": 2.852993965148926, "rewards/margins": 13.668808937072754, "rewards/rejected": -10.815814971923828, "step": 5904 }, { "epoch": 0.9841666666666666, "grad_norm": 29.814495086669922, "learning_rate": 1.314546090921853e-10, "logits/chosen": 1.7437044382095337, "logits/rejected": 1.6445695161819458, "logps/chosen": -76.11714172363281, "logps/rejected": -106.16307830810547, "loss": 1.0837, "nll_loss": 1.0571825504302979, "rewards/accuracies": 1.0, "rewards/chosen": 2.038691759109497, "rewards/margins": 5.800077438354492, "rewards/rejected": -3.761385440826416, "step": 5905 }, { "epoch": 0.9843333333333333, "grad_norm": 29.18044090270996, "learning_rate": 1.2870229990651348e-10, "logits/chosen": 2.6772422790527344, "logits/rejected": 2.5031542778015137, "logps/chosen": -173.93785095214844, "logps/rejected": -33.624229431152344, "loss": 0.8195, "nll_loss": 0.7187514901161194, "rewards/accuracies": 1.0, "rewards/chosen": 1.850555419921875, "rewards/margins": 3.966381788253784, "rewards/rejected": -2.115826368331909, "step": 5906 }, { "epoch": 0.9845, "grad_norm": 186.39639282226562, "learning_rate": 1.25979090819317e-10, "logits/chosen": 2.475198745727539, "logits/rejected": 2.178969144821167, "logps/chosen": -47.50665283203125, "logps/rejected": -43.727874755859375, "loss": 2.8553, "nll_loss": 2.3753323554992676, "rewards/accuracies": 1.0, "rewards/chosen": -2.0205812454223633, "rewards/margins": 1.0808522701263428, "rewards/rejected": -3.101433515548706, "step": 5907 }, { "epoch": 0.9846666666666667, "grad_norm": 101.03448486328125, "learning_rate": 1.232849826240723e-10, "logits/chosen": 2.922752618789673, "logits/rejected": 3.034677267074585, "logps/chosen": -22.13978385925293, "logps/rejected": -179.459228515625, "loss": 1.1251, "nll_loss": 1.1069892644882202, "rewards/accuracies": 1.0, "rewards/chosen": 1.0310487747192383, "rewards/margins": 8.153714179992676, "rewards/rejected": -7.1226654052734375, "step": 5908 }, { "epoch": 0.9848333333333333, "grad_norm": 34.10044860839844, "learning_rate": 1.2061997610576247e-10, "logits/chosen": 1.524827480316162, "logits/rejected": 1.9083484411239624, "logps/chosen": -109.81932830810547, "logps/rejected": -270.01531982421875, "loss": 1.4321, "nll_loss": 1.426224946975708, "rewards/accuracies": 1.0, "rewards/chosen": 2.1591269969940186, "rewards/margins": 10.886635780334473, "rewards/rejected": -8.727508544921875, "step": 5909 }, { "epoch": 0.985, "grad_norm": 24.553850173950195, "learning_rate": 1.1798407204093307e-10, "logits/chosen": 1.997971534729004, "logits/rejected": 2.4830992221832275, "logps/chosen": -52.56103515625, "logps/rejected": -322.01336669921875, "loss": 0.674, "nll_loss": 0.6570129990577698, "rewards/accuracies": 1.0, "rewards/chosen": 1.0539788007736206, "rewards/margins": 10.39614200592041, "rewards/rejected": -9.3421630859375, "step": 5910 }, { "epoch": 0.9851666666666666, "grad_norm": 64.15089416503906, "learning_rate": 1.15377271197592e-10, "logits/chosen": 3.0948994159698486, "logits/rejected": 3.4357011318206787, "logps/chosen": -45.85836410522461, "logps/rejected": -144.3726348876953, "loss": 1.3204, "nll_loss": 1.2738432884216309, "rewards/accuracies": 1.0, "rewards/chosen": 0.29249152541160583, "rewards/margins": 5.035351753234863, "rewards/rejected": -4.742860317230225, "step": 5911 }, { "epoch": 0.9853333333333333, "grad_norm": 195.32928466796875, "learning_rate": 1.127995743353205e-10, "logits/chosen": 1.755957007408142, "logits/rejected": 1.5503302812576294, "logps/chosen": -84.33390808105469, "logps/rejected": -15.455327987670898, "loss": 3.0656, "nll_loss": 0.8349891304969788, "rewards/accuracies": 0.0, "rewards/chosen": 1.7110154628753662, "rewards/margins": -1.4309015274047852, "rewards/rejected": 3.1419169902801514, "step": 5912 }, { "epoch": 0.9855, "grad_norm": 39.118770599365234, "learning_rate": 1.1025098220518447e-10, "logits/chosen": 1.8005664348602295, "logits/rejected": 2.5537757873535156, "logps/chosen": -19.07813262939453, "logps/rejected": -225.99078369140625, "loss": 0.7083, "nll_loss": 0.7065975069999695, "rewards/accuracies": 1.0, "rewards/chosen": 4.403510570526123, "rewards/margins": 10.98065185546875, "rewards/rejected": -6.577140808105469, "step": 5913 }, { "epoch": 0.9856666666666667, "grad_norm": 12.74420166015625, "learning_rate": 1.0773149554980099e-10, "logits/chosen": 2.5678341388702393, "logits/rejected": 2.5830295085906982, "logps/chosen": -257.4487609863281, "logps/rejected": -356.1986083984375, "loss": 0.8524, "nll_loss": 0.8496659994125366, "rewards/accuracies": 1.0, "rewards/chosen": 3.0485992431640625, "rewards/margins": 10.916177749633789, "rewards/rejected": -7.867578506469727, "step": 5914 }, { "epoch": 0.9858333333333333, "grad_norm": 27.319015502929688, "learning_rate": 1.0524111510326061e-10, "logits/chosen": 1.544140100479126, "logits/rejected": 1.5388790369033813, "logps/chosen": -90.34735107421875, "logps/rejected": -138.67294311523438, "loss": 0.9104, "nll_loss": 0.9034734964370728, "rewards/accuracies": 1.0, "rewards/chosen": 2.7363176345825195, "rewards/margins": 8.16111946105957, "rewards/rejected": -5.424802303314209, "step": 5915 }, { "epoch": 0.986, "grad_norm": 33.151039123535156, "learning_rate": 1.0277984159122733e-10, "logits/chosen": 2.2179834842681885, "logits/rejected": 2.1697726249694824, "logps/chosen": -79.64726257324219, "logps/rejected": -192.41049194335938, "loss": 1.038, "nll_loss": 1.0211187601089478, "rewards/accuracies": 1.0, "rewards/chosen": 1.0972915887832642, "rewards/margins": 8.458674430847168, "rewards/rejected": -7.361382961273193, "step": 5916 }, { "epoch": 0.9861666666666666, "grad_norm": 19.524621963500977, "learning_rate": 1.0034767573083858e-10, "logits/chosen": 2.289466619491577, "logits/rejected": 2.1416308879852295, "logps/chosen": -10.57653522491455, "logps/rejected": -69.9505386352539, "loss": 0.2462, "nll_loss": 0.23503409326076508, "rewards/accuracies": 1.0, "rewards/chosen": 2.44083571434021, "rewards/margins": 7.268832206726074, "rewards/rejected": -4.827996253967285, "step": 5917 }, { "epoch": 0.9863333333333333, "grad_norm": 602.5509033203125, "learning_rate": 9.794461823077193e-11, "logits/chosen": 1.7311757802963257, "logits/rejected": 1.5753933191299438, "logps/chosen": -409.43817138671875, "logps/rejected": -104.53119659423828, "loss": 5.9289, "nll_loss": 1.103606939315796, "rewards/accuracies": 0.0, "rewards/chosen": -3.8320159912109375, "rewards/margins": -4.783555030822754, "rewards/rejected": 0.9515389204025269, "step": 5918 }, { "epoch": 0.9865, "grad_norm": 19.197980880737305, "learning_rate": 9.557066979123396e-11, "logits/chosen": 3.2390267848968506, "logits/rejected": 2.914696455001831, "logps/chosen": -12.770545959472656, "logps/rejected": -77.36312866210938, "loss": 0.2723, "nll_loss": 0.2660529613494873, "rewards/accuracies": 1.0, "rewards/chosen": 2.482184648513794, "rewards/margins": 8.52177906036377, "rewards/rejected": -6.039594650268555, "step": 5919 }, { "epoch": 0.9866666666666667, "grad_norm": 29.768125534057617, "learning_rate": 9.32258311039269e-11, "logits/chosen": 1.8796064853668213, "logits/rejected": 2.0616695880889893, "logps/chosen": -75.13917541503906, "logps/rejected": -161.54547119140625, "loss": 0.9527, "nll_loss": 0.9392396807670593, "rewards/accuracies": 1.0, "rewards/chosen": 1.6751205921173096, "rewards/margins": 7.068329811096191, "rewards/rejected": -5.393208980560303, "step": 5920 }, { "epoch": 0.9868333333333333, "grad_norm": 26.25625991821289, "learning_rate": 9.09101028520709e-11, "logits/chosen": 2.3103652000427246, "logits/rejected": 2.2151296138763428, "logps/chosen": -22.7586612701416, "logps/rejected": -140.56451416015625, "loss": 0.503, "nll_loss": 0.4947535991668701, "rewards/accuracies": 1.0, "rewards/chosen": 1.9672609567642212, "rewards/margins": 8.542327880859375, "rewards/rejected": -6.575067043304443, "step": 5921 }, { "epoch": 0.987, "grad_norm": 16.005611419677734, "learning_rate": 8.862348571043732e-11, "logits/chosen": 2.7881386280059814, "logits/rejected": 2.7460105419158936, "logps/chosen": -192.31451416015625, "logps/rejected": -221.15841674804688, "loss": 0.8959, "nll_loss": 0.8862419724464417, "rewards/accuracies": 1.0, "rewards/chosen": 2.6345462799072266, "rewards/margins": 7.564545154571533, "rewards/rejected": -4.929998874664307, "step": 5922 }, { "epoch": 0.9871666666666666, "grad_norm": 242.3354949951172, "learning_rate": 8.636598034527098e-11, "logits/chosen": 2.2840192317962646, "logits/rejected": 2.0723657608032227, "logps/chosen": -55.14556884765625, "logps/rejected": -45.48419952392578, "loss": 5.052, "nll_loss": 1.0404824018478394, "rewards/accuracies": 0.0, "rewards/chosen": -0.9729877710342407, "rewards/margins": -3.8081531524658203, "rewards/rejected": 2.835165500640869, "step": 5923 }, { "epoch": 0.9873333333333333, "grad_norm": 35.06583023071289, "learning_rate": 8.413758741435683e-11, "logits/chosen": 2.5869083404541016, "logits/rejected": 2.607708215713501, "logps/chosen": -41.670902252197266, "logps/rejected": -151.5208740234375, "loss": 0.843, "nll_loss": 0.8170764446258545, "rewards/accuracies": 1.0, "rewards/chosen": 0.7797245979309082, "rewards/margins": 6.440935134887695, "rewards/rejected": -5.661210536956787, "step": 5924 }, { "epoch": 0.9875, "grad_norm": 20.919095993041992, "learning_rate": 8.193830756699771e-11, "logits/chosen": 2.6194851398468018, "logits/rejected": 2.6166718006134033, "logps/chosen": -48.54012680053711, "logps/rejected": -247.06259155273438, "loss": 0.5886, "nll_loss": 0.5848206877708435, "rewards/accuracies": 1.0, "rewards/chosen": 2.93833589553833, "rewards/margins": 9.514347076416016, "rewards/rejected": -6.576010704040527, "step": 5925 }, { "epoch": 0.9876666666666667, "grad_norm": 45.29894256591797, "learning_rate": 7.976814144401434e-11, "logits/chosen": 2.695889472961426, "logits/rejected": 2.886948823928833, "logps/chosen": -23.534799575805664, "logps/rejected": -196.33181762695312, "loss": 0.8094, "nll_loss": 0.7844931483268738, "rewards/accuracies": 1.0, "rewards/chosen": 0.877448558807373, "rewards/margins": 6.29604434967041, "rewards/rejected": -5.418595790863037, "step": 5926 }, { "epoch": 0.9878333333333333, "grad_norm": 39.648860931396484, "learning_rate": 7.762708967774534e-11, "logits/chosen": 3.2086637020111084, "logits/rejected": 3.33725905418396, "logps/chosen": -20.014013290405273, "logps/rejected": -336.44781494140625, "loss": 0.6733, "nll_loss": 0.6671336889266968, "rewards/accuracies": 1.0, "rewards/chosen": 2.096237897872925, "rewards/margins": 11.998289108276367, "rewards/rejected": -9.902050971984863, "step": 5927 }, { "epoch": 0.988, "grad_norm": 178.54957580566406, "learning_rate": 7.551515289203614e-11, "logits/chosen": 1.9660651683807373, "logits/rejected": 2.2425830364227295, "logps/chosen": -42.06218719482422, "logps/rejected": -16.57712173461914, "loss": 3.8801, "nll_loss": 0.6471105217933655, "rewards/accuracies": 0.0, "rewards/chosen": 0.7135635614395142, "rewards/margins": -2.7402749061584473, "rewards/rejected": 3.453838348388672, "step": 5928 }, { "epoch": 0.9881666666666666, "grad_norm": 42.746482849121094, "learning_rate": 7.343233170223894e-11, "logits/chosen": 2.8340110778808594, "logits/rejected": 2.8495163917541504, "logps/chosen": -34.32430648803711, "logps/rejected": -149.56736755371094, "loss": 0.9185, "nll_loss": 0.9032713174819946, "rewards/accuracies": 1.0, "rewards/chosen": 1.9197373390197754, "rewards/margins": 6.64483118057251, "rewards/rejected": -4.725093841552734, "step": 5929 }, { "epoch": 0.9883333333333333, "grad_norm": 29.188087463378906, "learning_rate": 7.137862671526828e-11, "logits/chosen": 1.698577642440796, "logits/rejected": 1.4036513566970825, "logps/chosen": -60.9007682800293, "logps/rejected": -68.6551513671875, "loss": 0.897, "nll_loss": 0.845844030380249, "rewards/accuracies": 1.0, "rewards/chosen": 1.4997868537902832, "rewards/margins": 4.617031574249268, "rewards/rejected": -3.1172447204589844, "step": 5930 }, { "epoch": 0.9885, "grad_norm": 20.139707565307617, "learning_rate": 6.935403852950106e-11, "logits/chosen": 2.385084867477417, "logits/rejected": 2.3255558013916016, "logps/chosen": -209.855224609375, "logps/rejected": -221.93907165527344, "loss": 0.9208, "nll_loss": 0.9124137759208679, "rewards/accuracies": 1.0, "rewards/chosen": 1.7770967483520508, "rewards/margins": 13.145890235900879, "rewards/rejected": -11.368793487548828, "step": 5931 }, { "epoch": 0.9886666666666667, "grad_norm": 46.44675827026367, "learning_rate": 6.735856773486536e-11, "logits/chosen": 2.347341299057007, "logits/rejected": 2.4002838134765625, "logps/chosen": -98.2809066772461, "logps/rejected": -313.3365478515625, "loss": 1.6964, "nll_loss": 1.6657782793045044, "rewards/accuracies": 1.0, "rewards/chosen": 0.44558411836624146, "rewards/margins": 8.017633438110352, "rewards/rejected": -7.572049617767334, "step": 5932 }, { "epoch": 0.9888333333333333, "grad_norm": 35.10943603515625, "learning_rate": 6.539221491279611e-11, "logits/chosen": 2.702772617340088, "logits/rejected": 2.7135844230651855, "logps/chosen": -138.5775909423828, "logps/rejected": -208.1866455078125, "loss": 0.8903, "nll_loss": 0.8199857473373413, "rewards/accuracies": 1.0, "rewards/chosen": 0.7258148193359375, "rewards/margins": 3.886937141418457, "rewards/rejected": -3.1611223220825195, "step": 5933 }, { "epoch": 0.989, "grad_norm": 40.797027587890625, "learning_rate": 6.34549806362239e-11, "logits/chosen": 1.9812341928482056, "logits/rejected": 2.046091079711914, "logps/chosen": -61.661781311035156, "logps/rejected": -231.2694091796875, "loss": 0.8404, "nll_loss": 0.7519729137420654, "rewards/accuracies": 1.0, "rewards/chosen": -0.10767745971679688, "rewards/margins": 3.6313228607177734, "rewards/rejected": -3.7390003204345703, "step": 5934 }, { "epoch": 0.9891666666666666, "grad_norm": 79.29328918457031, "learning_rate": 6.15468654696305e-11, "logits/chosen": 1.8155665397644043, "logits/rejected": 2.1699059009552, "logps/chosen": -57.329490661621094, "logps/rejected": -438.3337097167969, "loss": 2.1126, "nll_loss": 1.9768792390823364, "rewards/accuracies": 1.0, "rewards/chosen": -1.226935625076294, "rewards/margins": 4.491311073303223, "rewards/rejected": -5.7182464599609375, "step": 5935 }, { "epoch": 0.9893333333333333, "grad_norm": 29.359092712402344, "learning_rate": 5.966786996898231e-11, "logits/chosen": 2.678696393966675, "logits/rejected": 2.719573497772217, "logps/chosen": -14.986260414123535, "logps/rejected": -129.7106475830078, "loss": 0.4234, "nll_loss": 0.3943752646446228, "rewards/accuracies": 1.0, "rewards/chosen": 1.1327093839645386, "rewards/margins": 5.477763652801514, "rewards/rejected": -4.3450541496276855, "step": 5936 }, { "epoch": 0.9895, "grad_norm": 28.68351173400879, "learning_rate": 5.7817994681774726e-11, "logits/chosen": 2.1834568977355957, "logits/rejected": 2.2700629234313965, "logps/chosen": -75.57654571533203, "logps/rejected": -191.2605743408203, "loss": 0.9366, "nll_loss": 0.9216651916503906, "rewards/accuracies": 1.0, "rewards/chosen": 1.636286973953247, "rewards/margins": 6.807059288024902, "rewards/rejected": -5.170772075653076, "step": 5937 }, { "epoch": 0.9896666666666667, "grad_norm": 31.472978591918945, "learning_rate": 5.599724014703211e-11, "logits/chosen": 2.0862486362457275, "logits/rejected": 2.4239842891693115, "logps/chosen": -227.4556884765625, "logps/rejected": -358.4203796386719, "loss": 1.1195, "nll_loss": 1.0988197326660156, "rewards/accuracies": 1.0, "rewards/chosen": 0.8973603248596191, "rewards/margins": 7.802155017852783, "rewards/rejected": -6.904794692993164, "step": 5938 }, { "epoch": 0.9898333333333333, "grad_norm": 64.67265319824219, "learning_rate": 5.4205606895263434e-11, "logits/chosen": 2.6369054317474365, "logits/rejected": 2.54213547706604, "logps/chosen": -73.56956481933594, "logps/rejected": -11.170355796813965, "loss": 2.1306, "nll_loss": 1.0361911058425903, "rewards/accuracies": 1.0, "rewards/chosen": 4.748556613922119, "rewards/margins": 1.482560396194458, "rewards/rejected": 3.265996217727661, "step": 5939 }, { "epoch": 0.99, "grad_norm": 22.920408248901367, "learning_rate": 5.244309544850667e-11, "logits/chosen": 2.762519359588623, "logits/rejected": 2.754002809524536, "logps/chosen": -28.778335571289062, "logps/rejected": -111.619384765625, "loss": 0.4652, "nll_loss": 0.4427436888217926, "rewards/accuracies": 1.0, "rewards/chosen": 1.2038002014160156, "rewards/margins": 6.072267055511475, "rewards/rejected": -4.868466854095459, "step": 5940 }, { "epoch": 0.9901666666666666, "grad_norm": 55.90353012084961, "learning_rate": 5.0709706320328785e-11, "logits/chosen": 4.527368068695068, "logits/rejected": 4.440149784088135, "logps/chosen": -47.504112243652344, "logps/rejected": -113.41697692871094, "loss": 1.0619, "nll_loss": 1.0556468963623047, "rewards/accuracies": 1.0, "rewards/chosen": 2.4986748695373535, "rewards/margins": 8.507189750671387, "rewards/rejected": -6.008514881134033, "step": 5941 }, { "epoch": 0.9903333333333333, "grad_norm": 23.908952713012695, "learning_rate": 4.900544001578133e-11, "logits/chosen": 1.9308764934539795, "logits/rejected": 2.287545680999756, "logps/chosen": -52.55975341796875, "logps/rejected": -223.61517333984375, "loss": 0.6547, "nll_loss": 0.6488858461380005, "rewards/accuracies": 1.0, "rewards/chosen": 2.2243142127990723, "rewards/margins": 9.945499420166016, "rewards/rejected": -7.721185207366943, "step": 5942 }, { "epoch": 0.9905, "grad_norm": 29.581377029418945, "learning_rate": 4.733029703146707e-11, "logits/chosen": 1.8628789186477661, "logits/rejected": 2.335479497909546, "logps/chosen": -16.984649658203125, "logps/rejected": -241.84323120117188, "loss": 0.5015, "nll_loss": 0.4995485246181488, "rewards/accuracies": 1.0, "rewards/chosen": 4.6615400314331055, "rewards/margins": 10.977703094482422, "rewards/rejected": -6.316163539886475, "step": 5943 }, { "epoch": 0.9906666666666667, "grad_norm": 25.67681884765625, "learning_rate": 4.568427785546225e-11, "logits/chosen": 2.2845664024353027, "logits/rejected": 2.5937070846557617, "logps/chosen": -52.31287384033203, "logps/rejected": -349.6543884277344, "loss": 0.6412, "nll_loss": 0.630275547504425, "rewards/accuracies": 1.0, "rewards/chosen": 1.5089081525802612, "rewards/margins": 11.525369644165039, "rewards/rejected": -10.016461372375488, "step": 5944 }, { "epoch": 0.9908333333333333, "grad_norm": 22.54117202758789, "learning_rate": 4.406738296738321e-11, "logits/chosen": 2.997366189956665, "logits/rejected": 3.346421957015991, "logps/chosen": -44.36723327636719, "logps/rejected": -183.27122497558594, "loss": 0.5057, "nll_loss": 0.49296918511390686, "rewards/accuracies": 1.0, "rewards/chosen": 1.4844169616699219, "rewards/margins": 7.892001152038574, "rewards/rejected": -6.407584190368652, "step": 5945 }, { "epoch": 0.991, "grad_norm": 21.245065689086914, "learning_rate": 4.247961283835311e-11, "logits/chosen": 2.684694766998291, "logits/rejected": 2.7107584476470947, "logps/chosen": -58.238563537597656, "logps/rejected": -100.08943939208984, "loss": 0.6154, "nll_loss": 0.606651782989502, "rewards/accuracies": 1.0, "rewards/chosen": 2.315660238265991, "rewards/margins": 7.730032920837402, "rewards/rejected": -5.414372444152832, "step": 5946 }, { "epoch": 0.9911666666666666, "grad_norm": 21.046300888061523, "learning_rate": 4.092096793102407e-11, "logits/chosen": 2.7202045917510986, "logits/rejected": 2.649420738220215, "logps/chosen": -20.89203643798828, "logps/rejected": -73.54762268066406, "loss": 0.3577, "nll_loss": 0.34820061922073364, "rewards/accuracies": 1.0, "rewards/chosen": 1.9910237789154053, "rewards/margins": 7.793323516845703, "rewards/rejected": -5.802299499511719, "step": 5947 }, { "epoch": 0.9913333333333333, "grad_norm": 28.919649124145508, "learning_rate": 3.9391448699532816e-11, "logits/chosen": 0.8122856616973877, "logits/rejected": 1.4357755184173584, "logps/chosen": -108.84559631347656, "logps/rejected": -283.82568359375, "loss": 1.1388, "nll_loss": 1.1338082551956177, "rewards/accuracies": 1.0, "rewards/chosen": 2.376861572265625, "rewards/margins": 10.208078384399414, "rewards/rejected": -7.831216335296631, "step": 5948 }, { "epoch": 0.9915, "grad_norm": 54.39328384399414, "learning_rate": 3.789105558954508e-11, "logits/chosen": 1.85898756980896, "logits/rejected": 2.5509395599365234, "logps/chosen": -10.71007251739502, "logps/rejected": -162.32456970214844, "loss": 0.5776, "nll_loss": 0.5100035071372986, "rewards/accuracies": 1.0, "rewards/chosen": 0.9076061248779297, "rewards/margins": 3.9868650436401367, "rewards/rejected": -3.079258918762207, "step": 5949 }, { "epoch": 0.9916666666666667, "grad_norm": 31.280851364135742, "learning_rate": 3.64197890382445e-11, "logits/chosen": 1.652616262435913, "logits/rejected": 2.074800968170166, "logps/chosen": -76.1161880493164, "logps/rejected": -168.58029174804688, "loss": 1.1068, "nll_loss": 1.087374210357666, "rewards/accuracies": 1.0, "rewards/chosen": 1.0238739252090454, "rewards/margins": 7.265372276306152, "rewards/rejected": -6.2414984703063965, "step": 5950 }, { "epoch": 0.9918333333333333, "grad_norm": 16.742769241333008, "learning_rate": 3.4977649474321466e-11, "logits/chosen": 2.1744697093963623, "logits/rejected": 2.163839101791382, "logps/chosen": -157.10830688476562, "logps/rejected": -102.5266342163086, "loss": 0.7129, "nll_loss": 0.7045215964317322, "rewards/accuracies": 1.0, "rewards/chosen": 1.8460800647735596, "rewards/margins": 9.150300025939941, "rewards/rejected": -7.304219722747803, "step": 5951 }, { "epoch": 0.992, "grad_norm": 34.770301818847656, "learning_rate": 3.356463731798431e-11, "logits/chosen": 2.669015884399414, "logits/rejected": 2.6350884437561035, "logps/chosen": -118.99774169921875, "logps/rejected": -122.24143981933594, "loss": 1.1502, "nll_loss": 1.1333118677139282, "rewards/accuracies": 1.0, "rewards/chosen": 1.3679351806640625, "rewards/margins": 6.764436721801758, "rewards/rejected": -5.396501541137695, "step": 5952 }, { "epoch": 0.9921666666666666, "grad_norm": 59.87370300292969, "learning_rate": 3.218075298093703e-11, "logits/chosen": 2.232269525527954, "logits/rejected": 2.221635580062866, "logps/chosen": -54.08338165283203, "logps/rejected": -134.93231201171875, "loss": 1.8696, "nll_loss": 1.8649441003799438, "rewards/accuracies": 1.0, "rewards/chosen": 3.7584664821624756, "rewards/margins": 9.195823669433594, "rewards/rejected": -5.437356948852539, "step": 5953 }, { "epoch": 0.9923333333333333, "grad_norm": 25.897144317626953, "learning_rate": 3.082599686643483e-11, "logits/chosen": 2.092223644256592, "logits/rejected": 2.4081075191497803, "logps/chosen": -18.16752052307129, "logps/rejected": -160.75570678710938, "loss": 0.4607, "nll_loss": 0.4541880190372467, "rewards/accuracies": 1.0, "rewards/chosen": 2.1046149730682373, "rewards/margins": 9.731481552124023, "rewards/rejected": -7.626866817474365, "step": 5954 }, { "epoch": 0.9925, "grad_norm": 30.330123901367188, "learning_rate": 2.950036936919531e-11, "logits/chosen": 2.315218925476074, "logits/rejected": 2.4928667545318604, "logps/chosen": -35.834693908691406, "logps/rejected": -188.54409790039062, "loss": 0.6526, "nll_loss": 0.6515398621559143, "rewards/accuracies": 1.0, "rewards/chosen": 5.708506107330322, "rewards/margins": 12.496134757995605, "rewards/rejected": -6.787628650665283, "step": 5955 }, { "epoch": 0.9926666666666667, "grad_norm": 112.8275375366211, "learning_rate": 2.8203870875487257e-11, "logits/chosen": 2.5307986736297607, "logits/rejected": 2.600856065750122, "logps/chosen": -66.59160614013672, "logps/rejected": -75.81243896484375, "loss": 3.1537, "nll_loss": 3.026890754699707, "rewards/accuracies": 1.0, "rewards/chosen": -0.329184353351593, "rewards/margins": 2.9804019927978516, "rewards/rejected": -3.3095862865448, "step": 5956 }, { "epoch": 0.9928333333333333, "grad_norm": 35.68400192260742, "learning_rate": 2.693650176308626e-11, "logits/chosen": 1.66576087474823, "logits/rejected": 2.385866641998291, "logps/chosen": -28.972436904907227, "logps/rejected": -143.83380126953125, "loss": 0.7108, "nll_loss": 0.6584644913673401, "rewards/accuracies": 1.0, "rewards/chosen": 1.2525453567504883, "rewards/margins": 4.475311279296875, "rewards/rejected": -3.2227656841278076, "step": 5957 }, { "epoch": 0.993, "grad_norm": 127.87246704101562, "learning_rate": 2.5698262401263603e-11, "logits/chosen": 2.73079514503479, "logits/rejected": 3.0191118717193604, "logps/chosen": -23.112667083740234, "logps/rejected": -168.88711547851562, "loss": 1.9509, "nll_loss": 1.9260557889938354, "rewards/accuracies": 1.0, "rewards/chosen": 1.4642269611358643, "rewards/margins": 5.727819442749023, "rewards/rejected": -4.263592720031738, "step": 5958 }, { "epoch": 0.9931666666666666, "grad_norm": 44.8580322265625, "learning_rate": 2.448915315080846e-11, "logits/chosen": 1.534956693649292, "logits/rejected": 2.5872578620910645, "logps/chosen": -49.85044479370117, "logps/rejected": -213.0174102783203, "loss": 1.0121, "nll_loss": 0.9405744075775146, "rewards/accuracies": 1.0, "rewards/chosen": 0.5412490963935852, "rewards/margins": 3.8447983264923096, "rewards/rejected": -3.303549289703369, "step": 5959 }, { "epoch": 0.9933333333333333, "grad_norm": 14.943811416625977, "learning_rate": 2.3309174364027905e-11, "logits/chosen": 2.7274768352508545, "logits/rejected": 2.7457151412963867, "logps/chosen": -140.06011962890625, "logps/rejected": -60.576717376708984, "loss": 0.676, "nll_loss": 0.6484264135360718, "rewards/accuracies": 1.0, "rewards/chosen": 4.222903728485107, "rewards/margins": 7.558812141418457, "rewards/rejected": -3.3359084129333496, "step": 5960 }, { "epoch": 0.9935, "grad_norm": 20.96588134765625, "learning_rate": 2.2158326384746907e-11, "logits/chosen": 2.310786724090576, "logits/rejected": 2.2850725650787354, "logps/chosen": -55.23996353149414, "logps/rejected": -81.15798950195312, "loss": 0.5636, "nll_loss": 0.5523996353149414, "rewards/accuracies": 1.0, "rewards/chosen": 1.8277547359466553, "rewards/margins": 7.480608940124512, "rewards/rejected": -5.6528544425964355, "step": 5961 }, { "epoch": 0.9936666666666667, "grad_norm": 28.200332641601562, "learning_rate": 2.1036609548297225e-11, "logits/chosen": 2.2017393112182617, "logits/rejected": 2.275759220123291, "logps/chosen": -33.668514251708984, "logps/rejected": -187.13656616210938, "loss": 0.5843, "nll_loss": 0.561141848564148, "rewards/accuracies": 1.0, "rewards/chosen": 0.725019097328186, "rewards/margins": 9.974442481994629, "rewards/rejected": -9.249423027038574, "step": 5962 }, { "epoch": 0.9938333333333333, "grad_norm": 26.773433685302734, "learning_rate": 1.9944024181506314e-11, "logits/chosen": 1.052916169166565, "logits/rejected": 1.3945118188858032, "logps/chosen": -42.60728454589844, "logps/rejected": -148.78543090820312, "loss": 0.6796, "nll_loss": 0.6657387018203735, "rewards/accuracies": 1.0, "rewards/chosen": 1.2728065252304077, "rewards/margins": 9.827893257141113, "rewards/rejected": -8.555087089538574, "step": 5963 }, { "epoch": 0.994, "grad_norm": 27.252849578857422, "learning_rate": 1.888057060274173e-11, "logits/chosen": 2.7857823371887207, "logits/rejected": 2.8707034587860107, "logps/chosen": -104.87593078613281, "logps/rejected": -202.3514404296875, "loss": 1.1335, "nll_loss": 1.1276981830596924, "rewards/accuracies": 1.0, "rewards/chosen": 3.533748149871826, "rewards/margins": 8.762701988220215, "rewards/rejected": -5.228953838348389, "step": 5964 }, { "epoch": 0.9941666666666666, "grad_norm": 22.233911514282227, "learning_rate": 1.7846249121855616e-11, "logits/chosen": 4.403820991516113, "logits/rejected": 4.356818675994873, "logps/chosen": -72.82575225830078, "logps/rejected": -160.33302307128906, "loss": 0.7705, "nll_loss": 0.7665867209434509, "rewards/accuracies": 1.0, "rewards/chosen": 2.5512077808380127, "rewards/margins": 12.384955406188965, "rewards/rejected": -9.833747863769531, "step": 5965 }, { "epoch": 0.9943333333333333, "grad_norm": 72.12965393066406, "learning_rate": 1.6841060040229116e-11, "logits/chosen": 2.4142346382141113, "logits/rejected": 2.3202624320983887, "logps/chosen": -26.607921600341797, "logps/rejected": -66.52953338623047, "loss": 0.8578, "nll_loss": 0.8063004612922668, "rewards/accuracies": 1.0, "rewards/chosen": 1.5392143726348877, "rewards/margins": 4.629218101501465, "rewards/rejected": -3.0900039672851562, "step": 5966 }, { "epoch": 0.9945, "grad_norm": 20.577077865600586, "learning_rate": 1.5865003650761267e-11, "logits/chosen": 2.7142579555511475, "logits/rejected": 2.896064281463623, "logps/chosen": -40.8582763671875, "logps/rejected": -268.7323303222656, "loss": 0.5152, "nll_loss": 0.5107284188270569, "rewards/accuracies": 1.0, "rewards/chosen": 2.424525499343872, "rewards/margins": 11.873072624206543, "rewards/rejected": -9.44854736328125, "step": 5967 }, { "epoch": 0.9946666666666667, "grad_norm": 22.573808670043945, "learning_rate": 1.4918080237835695e-11, "logits/chosen": 2.8122832775115967, "logits/rejected": 2.7686173915863037, "logps/chosen": -148.31114196777344, "logps/rejected": -182.08981323242188, "loss": 1.0036, "nll_loss": 0.995376706123352, "rewards/accuracies": 1.0, "rewards/chosen": 1.9430923461914062, "rewards/margins": 8.602212905883789, "rewards/rejected": -6.659120559692383, "step": 5968 }, { "epoch": 0.9948333333333333, "grad_norm": 61.36913299560547, "learning_rate": 1.400029007736503e-11, "logits/chosen": 2.7557296752929688, "logits/rejected": 2.9794301986694336, "logps/chosen": -26.519739151000977, "logps/rejected": -204.1127471923828, "loss": 1.1105, "nll_loss": 1.1049891710281372, "rewards/accuracies": 1.0, "rewards/chosen": 4.078473091125488, "rewards/margins": 9.216679573059082, "rewards/rejected": -5.138206481933594, "step": 5969 }, { "epoch": 0.995, "grad_norm": 43.50606155395508, "learning_rate": 1.311163343677979e-11, "logits/chosen": 1.8310970067977905, "logits/rejected": 3.108229398727417, "logps/chosen": -58.038002014160156, "logps/rejected": -178.86175537109375, "loss": 1.0914, "nll_loss": 1.0747780799865723, "rewards/accuracies": 1.0, "rewards/chosen": 1.3156418800354004, "rewards/margins": 6.971425533294678, "rewards/rejected": -5.655783653259277, "step": 5970 }, { "epoch": 0.9951666666666666, "grad_norm": 17.81082534790039, "learning_rate": 1.2252110575017294e-11, "logits/chosen": 1.662088394165039, "logits/rejected": 1.2857391834259033, "logps/chosen": -204.21688842773438, "logps/rejected": -230.78125, "loss": 1.1301, "nll_loss": 1.1282700300216675, "rewards/accuracies": 1.0, "rewards/chosen": 3.390280246734619, "rewards/margins": 11.93513298034668, "rewards/rejected": -8.544852256774902, "step": 5971 }, { "epoch": 0.9953333333333333, "grad_norm": 34.71459197998047, "learning_rate": 1.1421721742499446e-11, "logits/chosen": 2.431917905807495, "logits/rejected": 2.4440722465515137, "logps/chosen": -93.37613677978516, "logps/rejected": -160.0579833984375, "loss": 1.1307, "nll_loss": 1.111620545387268, "rewards/accuracies": 1.0, "rewards/chosen": 1.0667091608047485, "rewards/margins": 7.135106563568115, "rewards/rejected": -6.068397521972656, "step": 5972 }, { "epoch": 0.9955, "grad_norm": 31.584579467773438, "learning_rate": 1.0620467181210458e-11, "logits/chosen": 2.4763083457946777, "logits/rejected": 2.7203171253204346, "logps/chosen": -14.746251106262207, "logps/rejected": -192.5532989501953, "loss": 0.3746, "nll_loss": 0.368656188249588, "rewards/accuracies": 1.0, "rewards/chosen": 2.308337450027466, "rewards/margins": 9.08189582824707, "rewards/rejected": -6.773558139801025, "step": 5973 }, { "epoch": 0.9956666666666667, "grad_norm": 32.559688568115234, "learning_rate": 9.84834712458582e-12, "logits/chosen": 3.3916592597961426, "logits/rejected": 3.2379322052001953, "logps/chosen": -44.01762390136719, "logps/rejected": -98.55911254882812, "loss": 0.832, "nll_loss": 0.8305211067199707, "rewards/accuracies": 1.0, "rewards/chosen": 5.014977931976318, "rewards/margins": 11.558778762817383, "rewards/rejected": -6.543800354003906, "step": 5974 }, { "epoch": 0.9958333333333333, "grad_norm": 138.46730041503906, "learning_rate": 9.105361797623335e-12, "logits/chosen": 2.5699257850646973, "logits/rejected": 2.657046318054199, "logps/chosen": -58.62962341308594, "logps/rejected": -131.91207885742188, "loss": 2.2265, "nll_loss": 1.5033236742019653, "rewards/accuracies": 1.0, "rewards/chosen": 2.357985734939575, "rewards/margins": 1.2401589155197144, "rewards/rejected": 1.1178268194198608, "step": 5975 }, { "epoch": 0.996, "grad_norm": 29.37147331237793, "learning_rate": 8.391511416816488e-12, "logits/chosen": 2.090991258621216, "logits/rejected": 2.1117656230926514, "logps/chosen": -134.91160583496094, "logps/rejected": -43.201602935791016, "loss": 0.6874, "nll_loss": 0.6104596853256226, "rewards/accuracies": 1.0, "rewards/chosen": 1.1122833490371704, "rewards/margins": 3.8774681091308594, "rewards/rejected": -2.7651848793029785, "step": 5976 }, { "epoch": 0.9961666666666666, "grad_norm": 18.530437469482422, "learning_rate": 7.706796190143362e-12, "logits/chosen": 2.4770801067352295, "logits/rejected": 2.7825405597686768, "logps/chosen": -54.832786560058594, "logps/rejected": -323.1236267089844, "loss": 0.579, "nll_loss": 0.5711749196052551, "rewards/accuracies": 1.0, "rewards/chosen": 1.8485230207443237, "rewards/margins": 11.492393493652344, "rewards/rejected": -9.64387035369873, "step": 5977 }, { "epoch": 0.9963333333333333, "grad_norm": 22.51535987854004, "learning_rate": 7.051216317133235e-12, "logits/chosen": 2.7560150623321533, "logits/rejected": 2.751492738723755, "logps/chosen": -28.25143051147461, "logps/rejected": -112.14795684814453, "loss": 0.4558, "nll_loss": 0.4346373379230499, "rewards/accuracies": 1.0, "rewards/chosen": 1.2564908266067505, "rewards/margins": 6.177814483642578, "rewards/rejected": -4.921323776245117, "step": 5978 }, { "epoch": 0.9965, "grad_norm": 61.07416534423828, "learning_rate": 6.424771988788879e-12, "logits/chosen": 2.91740083694458, "logits/rejected": 2.7692854404449463, "logps/chosen": -44.3546028137207, "logps/rejected": -77.48518371582031, "loss": 1.4917, "nll_loss": 1.4784867763519287, "rewards/accuracies": 1.0, "rewards/chosen": 2.513715982437134, "rewards/margins": 7.030057907104492, "rewards/rejected": -4.5163421630859375, "step": 5979 }, { "epoch": 0.9966666666666667, "grad_norm": 29.824893951416016, "learning_rate": 5.827463387653164e-12, "logits/chosen": 3.367924451828003, "logits/rejected": 3.3838558197021484, "logps/chosen": -105.11357116699219, "logps/rejected": -142.62945556640625, "loss": 1.0244, "nll_loss": 1.0107076168060303, "rewards/accuracies": 1.0, "rewards/chosen": 1.6311089992523193, "rewards/margins": 7.080453872680664, "rewards/rejected": -5.449344635009766, "step": 5980 }, { "epoch": 0.9968333333333333, "grad_norm": 130.94046020507812, "learning_rate": 5.2592906877646504e-12, "logits/chosen": 2.6429152488708496, "logits/rejected": 2.568634271621704, "logps/chosen": -83.6370849609375, "logps/rejected": -16.03192138671875, "loss": 1.9695, "nll_loss": 0.7467595934867859, "rewards/accuracies": 1.0, "rewards/chosen": 2.1818466186523438, "rewards/margins": 0.09433436393737793, "rewards/rejected": 2.087512254714966, "step": 5981 }, { "epoch": 0.997, "grad_norm": 47.02477264404297, "learning_rate": 4.720254054679795e-12, "logits/chosen": 1.5267506837844849, "logits/rejected": 2.6442432403564453, "logps/chosen": -66.18358612060547, "logps/rejected": -249.4033203125, "loss": 1.3234, "nll_loss": 1.2977173328399658, "rewards/accuracies": 1.0, "rewards/chosen": 0.9688522815704346, "rewards/margins": 5.958065986633301, "rewards/rejected": -4.989213466644287, "step": 5982 }, { "epoch": 0.9971666666666666, "grad_norm": 33.901519775390625, "learning_rate": 4.210353645450748e-12, "logits/chosen": 2.4716691970825195, "logits/rejected": 2.4054272174835205, "logps/chosen": -42.17278289794922, "logps/rejected": -141.55569458007812, "loss": 0.8327, "nll_loss": 0.795712947845459, "rewards/accuracies": 1.0, "rewards/chosen": 0.9436100125312805, "rewards/margins": 5.019042015075684, "rewards/rejected": -4.075431823730469, "step": 5983 }, { "epoch": 0.9973333333333333, "grad_norm": 20.794527053833008, "learning_rate": 3.729589608647554e-12, "logits/chosen": 2.377347230911255, "logits/rejected": 2.305723190307617, "logps/chosen": -55.20157241821289, "logps/rejected": -82.24995422363281, "loss": 0.5628, "nll_loss": 0.5520156621932983, "rewards/accuracies": 1.0, "rewards/chosen": 1.8315937519073486, "rewards/margins": 7.593644142150879, "rewards/rejected": -5.762050628662109, "step": 5984 }, { "epoch": 0.9975, "grad_norm": 276.8645324707031, "learning_rate": 3.277962084369257e-12, "logits/chosen": 2.061786651611328, "logits/rejected": 1.926132321357727, "logps/chosen": -101.7015609741211, "logps/rejected": -12.227062225341797, "loss": 3.563, "nll_loss": 1.6403473615646362, "rewards/accuracies": 0.0, "rewards/chosen": 0.8730484247207642, "rewards/margins": -1.24370276927948, "rewards/rejected": 2.116751194000244, "step": 5985 }, { "epoch": 0.9976666666666667, "grad_norm": 62.70676803588867, "learning_rate": 2.855471204199489e-12, "logits/chosen": 1.0860695838928223, "logits/rejected": 1.955320954322815, "logps/chosen": -23.847721099853516, "logps/rejected": -206.22265625, "loss": 0.7121, "nll_loss": 0.5816517472267151, "rewards/accuracies": 1.0, "rewards/chosen": 0.8794838190078735, "rewards/margins": 3.061887264251709, "rewards/rejected": -2.182403564453125, "step": 5986 }, { "epoch": 0.9978333333333333, "grad_norm": 107.00530242919922, "learning_rate": 2.4621170912508815e-12, "logits/chosen": 2.5409605503082275, "logits/rejected": 3.0612337589263916, "logps/chosen": -56.982173919677734, "logps/rejected": -345.312255859375, "loss": 2.4951, "nll_loss": 2.374257802963257, "rewards/accuracies": 1.0, "rewards/chosen": -1.1349785327911377, "rewards/margins": 5.327432632446289, "rewards/rejected": -6.462411403656006, "step": 5987 }, { "epoch": 0.998, "grad_norm": 21.10828399658203, "learning_rate": 2.0978998601206555e-12, "logits/chosen": 2.4120712280273438, "logits/rejected": 2.3816184997558594, "logps/chosen": -77.46980285644531, "logps/rejected": -99.0787353515625, "loss": 0.7965, "nll_loss": 0.7825231552124023, "rewards/accuracies": 1.0, "rewards/chosen": 3.1726856231689453, "rewards/margins": 7.391355514526367, "rewards/rejected": -4.218669891357422, "step": 5988 }, { "epoch": 0.9981666666666666, "grad_norm": 23.486623764038086, "learning_rate": 1.7628196169461319e-12, "logits/chosen": 0.8686693906784058, "logits/rejected": 2.406304121017456, "logps/chosen": -62.953880310058594, "logps/rejected": -336.1663513183594, "loss": 0.7175, "nll_loss": 0.699487566947937, "rewards/accuracies": 1.0, "rewards/chosen": 1.044019341468811, "rewards/margins": 8.029109001159668, "rewards/rejected": -6.985089302062988, "step": 5989 }, { "epoch": 0.9983333333333333, "grad_norm": 28.533889770507812, "learning_rate": 1.4568764593603234e-12, "logits/chosen": 2.3786277770996094, "logits/rejected": 2.442976474761963, "logps/chosen": -16.534229278564453, "logps/rejected": -289.59234619140625, "loss": 0.4503, "nll_loss": 0.4468711018562317, "rewards/accuracies": 1.0, "rewards/chosen": 2.8436222076416016, "rewards/margins": 10.184061050415039, "rewards/rejected": -7.3404388427734375, "step": 5990 }, { "epoch": 0.9985, "grad_norm": 152.5736541748047, "learning_rate": 1.1800704765030368e-12, "logits/chosen": 2.866710662841797, "logits/rejected": 2.772596836090088, "logps/chosen": -98.17861938476562, "logps/rejected": -71.9587631225586, "loss": 2.0179, "nll_loss": 0.9175571799278259, "rewards/accuracies": 1.0, "rewards/chosen": 1.8973320722579956, "rewards/margins": 0.21786046028137207, "rewards/rejected": 1.6794716119766235, "step": 5991 }, { "epoch": 0.9986666666666667, "grad_norm": 37.57434844970703, "learning_rate": 9.324017490319747e-13, "logits/chosen": 2.762228488922119, "logits/rejected": 2.6370232105255127, "logps/chosen": -87.98335266113281, "logps/rejected": -85.97667694091797, "loss": 1.4163, "nll_loss": 1.3965610265731812, "rewards/accuracies": 1.0, "rewards/chosen": 5.209285736083984, "rewards/margins": 8.875782012939453, "rewards/rejected": -3.6664958000183105, "step": 5992 }, { "epoch": 0.9988333333333334, "grad_norm": 29.630359649658203, "learning_rate": 7.138703491005315e-13, "logits/chosen": 1.3343768119812012, "logits/rejected": 2.1343507766723633, "logps/chosen": -49.823524475097656, "logps/rejected": -232.61111450195312, "loss": 0.929, "nll_loss": 0.92265784740448, "rewards/accuracies": 1.0, "rewards/chosen": 2.4677224159240723, "rewards/margins": 8.450321197509766, "rewards/rejected": -5.982598781585693, "step": 5993 }, { "epoch": 0.999, "grad_norm": 20.93592071533203, "learning_rate": 5.244763404133046e-13, "logits/chosen": 3.578766107559204, "logits/rejected": 3.4863457679748535, "logps/chosen": -42.52460861206055, "logps/rejected": -74.5712890625, "loss": 0.5684, "nll_loss": 0.54518723487854, "rewards/accuracies": 1.0, "rewards/chosen": 1.380300521850586, "rewards/margins": 5.870652675628662, "rewards/rejected": -4.490352153778076, "step": 5994 }, { "epoch": 0.9991666666666666, "grad_norm": 28.846454620361328, "learning_rate": 3.642197781150713e-13, "logits/chosen": 3.4066622257232666, "logits/rejected": 3.628864288330078, "logps/chosen": -106.89616394042969, "logps/rejected": -287.4275207519531, "loss": 1.1684, "nll_loss": 1.1619147062301636, "rewards/accuracies": 1.0, "rewards/chosen": 2.040180206298828, "rewards/margins": 11.632285118103027, "rewards/rejected": -9.5921049118042, "step": 5995 }, { "epoch": 0.9993333333333333, "grad_norm": 23.39508056640625, "learning_rate": 2.331007089351189e-13, "logits/chosen": 1.6117140054702759, "logits/rejected": 1.6761659383773804, "logps/chosen": -61.37430953979492, "logps/rejected": -198.38607788085938, "loss": 0.7482, "nll_loss": 0.7394494414329529, "rewards/accuracies": 1.0, "rewards/chosen": 1.7558965682983398, "rewards/margins": 10.048372268676758, "rewards/rejected": -8.292475700378418, "step": 5996 }, { "epoch": 0.9995, "grad_norm": 31.298032760620117, "learning_rate": 1.3111917106511937e-13, "logits/chosen": 2.564039707183838, "logits/rejected": 2.5975892543792725, "logps/chosen": -28.499622344970703, "logps/rejected": -225.51495361328125, "loss": 0.6566, "nll_loss": 0.6477188467979431, "rewards/accuracies": 1.0, "rewards/chosen": 1.8688641786575317, "rewards/margins": 8.420567512512207, "rewards/rejected": -6.551703453063965, "step": 5997 }, { "epoch": 0.9996666666666667, "grad_norm": 22.577247619628906, "learning_rate": 5.827519421464089e-14, "logits/chosen": 1.1024210453033447, "logits/rejected": 1.6297754049301147, "logps/chosen": -76.24168395996094, "logps/rejected": -142.66200256347656, "loss": 0.7586, "nll_loss": 0.7474675178527832, "rewards/accuracies": 1.0, "rewards/chosen": 2.0978333950042725, "rewards/margins": 7.2728986740112305, "rewards/rejected": -5.175065517425537, "step": 5998 }, { "epoch": 0.9998333333333334, "grad_norm": 29.78904914855957, "learning_rate": 1.4568799611147652e-14, "logits/chosen": 3.937488079071045, "logits/rejected": 3.8901400566101074, "logps/chosen": -57.877628326416016, "logps/rejected": -143.71746826171875, "loss": 0.8095, "nll_loss": 0.782130241394043, "rewards/accuracies": 1.0, "rewards/chosen": 2.619499921798706, "rewards/margins": 6.151092529296875, "rewards/rejected": -3.531592845916748, "step": 5999 }, { "epoch": 1.0, "grad_norm": 35.613956451416016, "learning_rate": 0.0, "logits/chosen": 2.4657480716705322, "logits/rejected": 2.1924824714660645, "logps/chosen": -28.961151123046875, "logps/rejected": -58.34172058105469, "loss": 0.6615, "nll_loss": 0.5910439491271973, "rewards/accuracies": 1.0, "rewards/chosen": 1.072445273399353, "rewards/margins": 3.983546733856201, "rewards/rejected": -2.9111015796661377, "step": 6000 }, { "epoch": 1.0, "step": 6000, "total_flos": 2.7507546089846735e+20, "train_loss": 1.159107870930185, "train_runtime": 7781.4574, "train_samples_per_second": 0.771, "train_steps_per_second": 0.771 } ], "logging_steps": 1.0, "max_steps": 6000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.7507546089846735e+20, "train_batch_size": 1, "trial_name": null, "trial_params": null }