{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 9258, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "diff_generated": 0.0, "epoch": 0.0003240440699935191, "grad_norm": 3559.2297401785586, "learning_rate": 8.639308855291577e-10, "logits/chosen": -2.6053388118743896, "logits/rejected": -2.4319162368774414, "logps/chosen": -116.55142974853516, "logps/rejected": -89.49524688720703, "loss": 10.3352, "losses_ref": -89.49524688720703, "ref_logps/chosen": -116.55142974853516, "ref_logps/rejected": -89.49524688720703, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1, "u": 0.0, "weight": 1.0 }, { "diff_generated": 0.017430514097213745, "epoch": 0.0032404406999351912, "grad_norm": 3375.614561667246, "learning_rate": 8.639308855291576e-09, "logits/chosen": -2.497408390045166, "logits/rejected": -2.570788860321045, "logps/chosen": -92.1458511352539, "logps/rejected": -91.23849487304688, "loss": 5.6185, "losses_ref": -91.74820709228516, "ref_logps/chosen": -92.17684173583984, "ref_logps/rejected": -91.25592803955078, "rewards/accuracies": 0.4340277910232544, "rewards/chosen": 0.030985673889517784, "rewards/margins": 0.013555158860981464, "rewards/rejected": 0.017430514097213745, "step": 10, "u": 0.025068603456020355, "weight": 1.0016683340072632 }, { "diff_generated": -0.0066615985706448555, "epoch": 0.0064808813998703824, "grad_norm": 3139.7387168039268, "learning_rate": 1.727861771058315e-08, "logits/chosen": -2.5308899879455566, "logits/rejected": -2.5875303745269775, "logps/chosen": -100.12669372558594, "logps/rejected": -85.41898345947266, "loss": 10.963, "losses_ref": -85.79915618896484, "ref_logps/chosen": -100.17314910888672, "ref_logps/rejected": -85.41232299804688, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.04647066444158554, "rewards/margins": 0.05313226580619812, "rewards/rejected": -0.0066615985706448555, "step": 20, "u": 0.004874364472925663, "weight": 0.9997771382331848 }, { "diff_generated": -0.044504955410957336, "epoch": 0.009721322099805573, "grad_norm": 3411.2943748721727, "learning_rate": 2.591792656587473e-08, "logits/chosen": -2.5375819206237793, "logits/rejected": -2.5686402320861816, "logps/chosen": -100.92872619628906, "logps/rejected": -87.86363983154297, "loss": 12.3682, "losses_ref": -87.37126159667969, "ref_logps/chosen": -101.48959350585938, "ref_logps/rejected": -87.81913757324219, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.5608684420585632, "rewards/margins": 0.6053733825683594, "rewards/rejected": -0.044504955410957336, "step": 30, "u": -0.02490001730620861, "weight": 0.9959570169448853 }, { "diff_generated": -0.29012399911880493, "epoch": 0.012961762799740765, "grad_norm": 3355.00768506651, "learning_rate": 3.45572354211663e-08, "logits/chosen": -2.554452419281006, "logits/rejected": -2.6106112003326416, "logps/chosen": -97.12639617919922, "logps/rejected": -89.53710174560547, "loss": 10.9751, "losses_ref": -84.70036315917969, "ref_logps/chosen": -99.29696655273438, "ref_logps/rejected": -89.24697875976562, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": 2.1705615520477295, "rewards/margins": 2.4606857299804688, "rewards/rejected": -0.29012399911880493, "step": 40, "u": -0.21681609749794006, "weight": 0.9673402905464172 }, { "diff_generated": -0.6411628723144531, "epoch": 0.016202203499675955, "grad_norm": 3064.562817800374, "learning_rate": 4.319654427645788e-08, "logits/chosen": -2.4995243549346924, "logits/rejected": -2.5654492378234863, "logps/chosen": -90.70710754394531, "logps/rejected": -85.55342864990234, "loss": 14.0442, "losses_ref": -74.90791320800781, "ref_logps/chosen": -95.43073272705078, "ref_logps/rejected": -84.91226959228516, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 4.7236223220825195, "rewards/margins": 5.364785194396973, "rewards/rejected": -0.6411628723144531, "step": 50, "u": -0.3739597201347351, "weight": 0.9222582578659058 }, { "diff_generated": -2.4193122386932373, "epoch": 0.019442644199611146, "grad_norm": 3050.7408687627135, "learning_rate": 5.183585313174946e-08, "logits/chosen": -2.516291856765747, "logits/rejected": -2.5783753395080566, "logps/chosen": -81.91847229003906, "logps/rejected": -85.47832489013672, "loss": 33.34, "losses_ref": -47.07170104980469, "ref_logps/chosen": -94.7326889038086, "ref_logps/rejected": -83.05900573730469, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 12.81421947479248, "rewards/margins": 15.23353385925293, "rewards/rejected": -2.4193122386932373, "step": 60, "u": -1.1126874685287476, "weight": 0.6429678797721863 }, { "diff_generated": -3.986859083175659, "epoch": 0.02268308489954634, "grad_norm": 3040.2648567467995, "learning_rate": 6.047516198704104e-08, "logits/chosen": -2.4859328269958496, "logits/rejected": -2.544261932373047, "logps/chosen": -75.36338806152344, "logps/rejected": -89.84022521972656, "loss": 32.419, "losses_ref": -41.79503631591797, "ref_logps/chosen": -95.0781021118164, "ref_logps/rejected": -85.85337829589844, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 19.7147159576416, "rewards/margins": 23.701576232910156, "rewards/rejected": -3.986859083175659, "step": 70, "u": -1.096972107887268, "weight": 0.5654774904251099 }, { "diff_generated": -11.147028923034668, "epoch": 0.02592352559948153, "grad_norm": 1757.466149188433, "learning_rate": 6.91144708423326e-08, "logits/chosen": -2.4527060985565186, "logits/rejected": -2.5265250205993652, "logps/chosen": -59.0428466796875, "logps/rejected": -99.50745391845703, "loss": 45.6743, "losses_ref": -15.425445556640625, "ref_logps/chosen": -94.65631103515625, "ref_logps/rejected": -88.36042785644531, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 35.61347198486328, "rewards/margins": 46.760498046875, "rewards/rejected": -11.147028923034668, "step": 80, "u": -1.673259973526001, "weight": 0.23887920379638672 }, { "diff_generated": -21.959552764892578, "epoch": 0.02916396629941672, "grad_norm": 1171.535498795329, "learning_rate": 7.775377969762419e-08, "logits/chosen": -2.4958112239837646, "logits/rejected": -2.4814047813415527, "logps/chosen": -52.37348556518555, "logps/rejected": -102.47017669677734, "loss": 48.9248, "losses_ref": -2.9107580184936523, "ref_logps/chosen": -96.49531555175781, "ref_logps/rejected": -80.51063537597656, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 44.121826171875, "rewards/margins": 66.08137512207031, "rewards/rejected": -21.959552764892578, "step": 90, "u": -2.1358799934387207, "weight": 0.07833331823348999 }, { "diff_generated": -26.378650665283203, "epoch": 0.03240440699935191, "grad_norm": 981.3254617740091, "learning_rate": 8.639308855291576e-08, "logits/chosen": -2.505199432373047, "logits/rejected": -2.507596731185913, "logps/chosen": -53.363304138183594, "logps/rejected": -113.31050872802734, "loss": 48.5492, "losses_ref": -0.968902587890625, "ref_logps/chosen": -100.1287612915039, "ref_logps/rejected": -86.93186950683594, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 46.76546096801758, "rewards/margins": 73.14411163330078, "rewards/rejected": -26.378650665283203, "step": 100, "u": -2.16186261177063, "weight": 0.06941097974777222 }, { "diff_generated": -26.51546859741211, "epoch": 0.0356448476992871, "grad_norm": 840.9009121084388, "learning_rate": 9.503239740820734e-08, "logits/chosen": -2.4892473220825195, "logits/rejected": -2.5445773601531982, "logps/chosen": -46.26842498779297, "logps/rejected": -111.51081848144531, "loss": 42.2979, "losses_ref": -1.3088524341583252, "ref_logps/chosen": -96.73486328125, "ref_logps/rejected": -84.99533081054688, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 50.46643829345703, "rewards/margins": 76.98190307617188, "rewards/rejected": -26.51546859741211, "step": 110, "u": -2.185539722442627, "weight": 0.05797583609819412 }, { "diff_generated": -30.8448486328125, "epoch": 0.03888528839922229, "grad_norm": 815.4716872744215, "learning_rate": 1.0367170626349892e-07, "logits/chosen": -2.4767396450042725, "logits/rejected": -2.6058273315429688, "logps/chosen": -40.43291473388672, "logps/rejected": -119.78251647949219, "loss": 41.4793, "losses_ref": -0.2510392963886261, "ref_logps/chosen": -94.6415786743164, "ref_logps/rejected": -88.93766021728516, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 54.208656311035156, "rewards/margins": 85.05350494384766, "rewards/rejected": -30.8448486328125, "step": 120, "u": -2.2387325763702393, "weight": 0.030215347185730934 }, { "diff_generated": -30.562610626220703, "epoch": 0.04212572909915749, "grad_norm": 819.4309847490115, "learning_rate": 1.1231101511879049e-07, "logits/chosen": -2.430781126022339, "logits/rejected": -2.5365960597991943, "logps/chosen": -39.41309356689453, "logps/rejected": -114.13203430175781, "loss": 39.7267, "losses_ref": -0.43195924162864685, "ref_logps/chosen": -90.4994888305664, "ref_logps/rejected": -83.56944274902344, "rewards/accuracies": 0.9375, "rewards/chosen": 51.086395263671875, "rewards/margins": 81.64900970458984, "rewards/rejected": -30.562610626220703, "step": 130, "u": -2.133713483810425, "weight": 0.07522068917751312 }, { "diff_generated": -31.19342041015625, "epoch": 0.04536616979909268, "grad_norm": 811.2637523209452, "learning_rate": 1.2095032397408208e-07, "logits/chosen": -2.4572250843048096, "logits/rejected": -2.5759568214416504, "logps/chosen": -39.12738800048828, "logps/rejected": -124.8710708618164, "loss": 39.4256, "losses_ref": -0.022554311901330948, "ref_logps/chosen": -92.23231506347656, "ref_logps/rejected": -93.67765808105469, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 53.10492706298828, "rewards/margins": 84.29834747314453, "rewards/rejected": -31.19342041015625, "step": 140, "u": -2.2014377117156982, "weight": 0.04419802874326706 }, { "diff_generated": -33.59473419189453, "epoch": 0.04860661049902787, "grad_norm": 860.0384740581414, "learning_rate": 1.2958963282937366e-07, "logits/chosen": -2.506410837173462, "logits/rejected": -2.5207464694976807, "logps/chosen": -40.406578063964844, "logps/rejected": -118.5238037109375, "loss": 39.5802, "losses_ref": -0.040201567113399506, "ref_logps/chosen": -98.3244400024414, "ref_logps/rejected": -84.9290771484375, "rewards/accuracies": 0.9375, "rewards/chosen": 57.9178581237793, "rewards/margins": 91.5125961303711, "rewards/rejected": -33.59473419189453, "step": 150, "u": -2.156111478805542, "weight": 0.0648484081029892 }, { "diff_generated": -35.42806625366211, "epoch": 0.05184705119896306, "grad_norm": 862.2311061479475, "learning_rate": 1.382289416846652e-07, "logits/chosen": -2.4801454544067383, "logits/rejected": -2.5408148765563965, "logps/chosen": -41.032039642333984, "logps/rejected": -123.56022644042969, "loss": 37.5874, "losses_ref": -0.961107075214386, "ref_logps/chosen": -99.9718246459961, "ref_logps/rejected": -88.13216400146484, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 58.939788818359375, "rewards/margins": 94.36785125732422, "rewards/rejected": -35.42806625366211, "step": 160, "u": -2.1681969165802, "weight": 0.04808913171291351 }, { "diff_generated": -34.35420227050781, "epoch": 0.05508749189889825, "grad_norm": 770.1748867796118, "learning_rate": 1.468682505399568e-07, "logits/chosen": -2.4965875148773193, "logits/rejected": -2.5947012901306152, "logps/chosen": -34.472076416015625, "logps/rejected": -120.37129974365234, "loss": 36.0503, "losses_ref": -0.019783183932304382, "ref_logps/chosen": -94.23568725585938, "ref_logps/rejected": -86.01708984375, "rewards/accuracies": 0.96875, "rewards/chosen": 59.76360321044922, "rewards/margins": 94.11781311035156, "rewards/rejected": -34.35420227050781, "step": 170, "u": -2.2295360565185547, "weight": 0.032412897795438766 }, { "diff_generated": -35.52586364746094, "epoch": 0.05832793259883344, "grad_norm": 730.0749859427049, "learning_rate": 1.5550755939524837e-07, "logits/chosen": -2.4865972995758057, "logits/rejected": -2.5497257709503174, "logps/chosen": -36.68155288696289, "logps/rejected": -120.2018051147461, "loss": 34.2722, "losses_ref": -0.09937143325805664, "ref_logps/chosen": -95.27645111083984, "ref_logps/rejected": -84.67594909667969, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 58.59489822387695, "rewards/margins": 94.12075805664062, "rewards/rejected": -35.52586364746094, "step": 180, "u": -2.172288417816162, "weight": 0.05707244947552681 }, { "diff_generated": -36.80834197998047, "epoch": 0.06156837329876863, "grad_norm": 865.7529357983891, "learning_rate": 1.6414686825053995e-07, "logits/chosen": -2.4904823303222656, "logits/rejected": -2.515784740447998, "logps/chosen": -38.33757019042969, "logps/rejected": -126.2683334350586, "loss": 34.8775, "losses_ref": -0.006524696946144104, "ref_logps/chosen": -101.89496612548828, "ref_logps/rejected": -89.45999908447266, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 63.55739212036133, "rewards/margins": 100.36573791503906, "rewards/rejected": -36.80834197998047, "step": 190, "u": -2.2591099739074707, "weight": 0.019078662618994713 }, { "diff_generated": -35.73137664794922, "epoch": 0.06480881399870382, "grad_norm": 800.9108334177273, "learning_rate": 1.7278617710583153e-07, "logits/chosen": -2.4334239959716797, "logits/rejected": -2.5226898193359375, "logps/chosen": -32.73336410522461, "logps/rejected": -121.62447357177734, "loss": 33.73, "losses_ref": -0.04314727336168289, "ref_logps/chosen": -86.64543914794922, "ref_logps/rejected": -85.8930892944336, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 53.912071228027344, "rewards/margins": 89.6434555053711, "rewards/rejected": -35.73137664794922, "step": 200, "u": -2.1284313201904297, "weight": 0.07653830945491791 }, { "diff_generated": -37.98542404174805, "epoch": 0.06804925469863901, "grad_norm": 927.4241431650823, "learning_rate": 1.814254859611231e-07, "logits/chosen": -2.4798386096954346, "logits/rejected": -2.5692715644836426, "logps/chosen": -35.32215881347656, "logps/rejected": -126.21580505371094, "loss": 34.3679, "losses_ref": -0.030094826593995094, "ref_logps/chosen": -95.78032684326172, "ref_logps/rejected": -88.23038482666016, "rewards/accuracies": 0.96875, "rewards/chosen": 60.45817947387695, "rewards/margins": 98.44361114501953, "rewards/rejected": -37.98542404174805, "step": 210, "u": -2.230320453643799, "weight": 0.03158506378531456 }, { "diff_generated": -38.94337844848633, "epoch": 0.0712896953985742, "grad_norm": 797.8259592178767, "learning_rate": 1.900647948164147e-07, "logits/chosen": -2.4474635124206543, "logits/rejected": -2.5280601978302, "logps/chosen": -35.149330139160156, "logps/rejected": -120.66595458984375, "loss": 33.9716, "losses_ref": -0.02789616584777832, "ref_logps/chosen": -97.20127868652344, "ref_logps/rejected": -81.72257995605469, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 62.05195236206055, "rewards/margins": 100.9953384399414, "rewards/rejected": -38.94337844848633, "step": 220, "u": -2.171726942062378, "weight": 0.057657964527606964 }, { "diff_generated": -38.69519805908203, "epoch": 0.07453013609850939, "grad_norm": 869.5453620671002, "learning_rate": 1.9870410367170624e-07, "logits/chosen": -2.4745936393737793, "logits/rejected": -2.494105815887451, "logps/chosen": -33.155364990234375, "logps/rejected": -122.06390380859375, "loss": 34.0393, "losses_ref": -0.24938344955444336, "ref_logps/chosen": -92.7715835571289, "ref_logps/rejected": -83.36869812011719, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 59.61621856689453, "rewards/margins": 98.31141662597656, "rewards/rejected": -38.69519805908203, "step": 230, "u": -2.1387956142425537, "weight": 0.07319749146699905 }, { "diff_generated": -41.04708480834961, "epoch": 0.07777057679844458, "grad_norm": 817.7110572483012, "learning_rate": 2.0734341252699785e-07, "logits/chosen": -2.4703361988067627, "logits/rejected": -2.5415077209472656, "logps/chosen": -31.33148765563965, "logps/rejected": -127.82364654541016, "loss": 32.6819, "losses_ref": -0.13492469489574432, "ref_logps/chosen": -92.26710510253906, "ref_logps/rejected": -86.77656555175781, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 60.93561553955078, "rewards/margins": 101.98270416259766, "rewards/rejected": -41.04708480834961, "step": 240, "u": -2.172236442565918, "weight": 0.05711379647254944 }, { "diff_generated": -41.528465270996094, "epoch": 0.08101101749837979, "grad_norm": 779.7448937764428, "learning_rate": 2.159827213822894e-07, "logits/chosen": -2.4615213871002197, "logits/rejected": -2.578935384750366, "logps/chosen": -33.79179000854492, "logps/rejected": -130.5545196533203, "loss": 33.9777, "losses_ref": -0.02141922526061535, "ref_logps/chosen": -94.50299835205078, "ref_logps/rejected": -89.02606201171875, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 60.711204528808594, "rewards/margins": 102.23966217041016, "rewards/rejected": -41.528465270996094, "step": 250, "u": -2.1865832805633545, "weight": 0.05091395229101181 }, { "diff_generated": -40.10847091674805, "epoch": 0.08425145819831498, "grad_norm": 808.1011773378456, "learning_rate": 2.2462203023758098e-07, "logits/chosen": -2.4640259742736816, "logits/rejected": -2.5208182334899902, "logps/chosen": -35.152008056640625, "logps/rejected": -128.22848510742188, "loss": 33.6627, "losses_ref": -0.0065854983404278755, "ref_logps/chosen": -96.67770385742188, "ref_logps/rejected": -88.1200180053711, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 61.52568817138672, "rewards/margins": 101.6341552734375, "rewards/rejected": -40.10847091674805, "step": 260, "u": -2.1871533393859863, "weight": 0.05033022165298462 }, { "diff_generated": -41.47755813598633, "epoch": 0.08749189889825017, "grad_norm": 824.21173391217, "learning_rate": 2.3326133909287256e-07, "logits/chosen": -2.46863055229187, "logits/rejected": -2.5370888710021973, "logps/chosen": -32.25581741333008, "logps/rejected": -126.69731140136719, "loss": 33.1363, "losses_ref": -0.0064047775231301785, "ref_logps/chosen": -94.9454116821289, "ref_logps/rejected": -85.21976470947266, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 62.6895866394043, "rewards/margins": 104.16715240478516, "rewards/rejected": -41.47755813598633, "step": 270, "u": -2.1151492595672607, "weight": 0.08162893354892731 }, { "diff_generated": -40.3583984375, "epoch": 0.09073233959818536, "grad_norm": 714.5025449894256, "learning_rate": 2.4190064794816416e-07, "logits/chosen": -2.456831693649292, "logits/rejected": -2.5513622760772705, "logps/chosen": -31.738773345947266, "logps/rejected": -126.47242736816406, "loss": 32.5606, "losses_ref": -0.08480539917945862, "ref_logps/chosen": -94.12120056152344, "ref_logps/rejected": -86.11402130126953, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 62.38242721557617, "rewards/margins": 102.74082946777344, "rewards/rejected": -40.3583984375, "step": 280, "u": -2.1713805198669434, "weight": 0.05789435654878616 }, { "diff_generated": -41.735252380371094, "epoch": 0.09397278029812055, "grad_norm": 801.2272509358011, "learning_rate": 2.505399568034557e-07, "logits/chosen": -2.4742555618286133, "logits/rejected": -2.5228075981140137, "logps/chosen": -34.12358474731445, "logps/rejected": -126.54386138916016, "loss": 32.6423, "losses_ref": -0.2615818977355957, "ref_logps/chosen": -98.3595199584961, "ref_logps/rejected": -84.80860900878906, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 64.2359390258789, "rewards/margins": 105.97119140625, "rewards/rejected": -41.735252380371094, "step": 290, "u": -2.139753818511963, "weight": 0.07259203493595123 }, { "diff_generated": -46.503387451171875, "epoch": 0.09721322099805574, "grad_norm": 781.8410441358157, "learning_rate": 2.591792656587473e-07, "logits/chosen": -2.496601104736328, "logits/rejected": -2.5690560340881348, "logps/chosen": -29.659671783447266, "logps/rejected": -131.7477264404297, "loss": 31.4824, "losses_ref": -0.0010774282272905111, "ref_logps/chosen": -95.02106475830078, "ref_logps/rejected": -85.24435424804688, "rewards/accuracies": 0.96875, "rewards/chosen": 65.36140441894531, "rewards/margins": 111.86478424072266, "rewards/rejected": -46.503387451171875, "step": 300, "u": -2.230616807937622, "weight": 0.031264010816812515 }, { "diff_generated": -44.111000061035156, "epoch": 0.10045366169799093, "grad_norm": 760.9281935094895, "learning_rate": 2.6781857451403887e-07, "logits/chosen": -2.4643948078155518, "logits/rejected": -2.565307378768921, "logps/chosen": -29.657928466796875, "logps/rejected": -130.83432006835938, "loss": 30.1822, "losses_ref": -0.0012849947670474648, "ref_logps/chosen": -91.39910125732422, "ref_logps/rejected": -86.72331237792969, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 61.741172790527344, "rewards/margins": 105.8521728515625, "rewards/rejected": -44.111000061035156, "step": 310, "u": -2.201822280883789, "weight": 0.043777596205472946 }, { "diff_generated": -40.68602752685547, "epoch": 0.10369410239792612, "grad_norm": 762.7683248772289, "learning_rate": 2.764578833693304e-07, "logits/chosen": -2.425774574279785, "logits/rejected": -2.4849331378936768, "logps/chosen": -30.319400787353516, "logps/rejected": -122.18917083740234, "loss": 31.0654, "losses_ref": -0.008473332040011883, "ref_logps/chosen": -90.56777954101562, "ref_logps/rejected": -81.5031509399414, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 60.248374938964844, "rewards/margins": 100.93440246582031, "rewards/rejected": -40.68602752685547, "step": 320, "u": -2.115234375, "weight": 0.08154076337814331 }, { "diff_generated": -44.732337951660156, "epoch": 0.10693454309786131, "grad_norm": 798.216714500383, "learning_rate": 2.8509719222462203e-07, "logits/chosen": -2.4940147399902344, "logits/rejected": -2.6184709072113037, "logps/chosen": -31.332714080810547, "logps/rejected": -135.71978759765625, "loss": 30.2959, "losses_ref": -2.9487182473530993e-05, "ref_logps/chosen": -94.46064758300781, "ref_logps/rejected": -90.98744201660156, "rewards/accuracies": 0.9375, "rewards/chosen": 63.1279411315918, "rewards/margins": 107.86029052734375, "rewards/rejected": -44.732337951660156, "step": 330, "u": -2.15867280960083, "weight": 0.06250102818012238 }, { "diff_generated": -44.797607421875, "epoch": 0.1101749837977965, "grad_norm": 777.6758192243882, "learning_rate": 2.937365010799136e-07, "logits/chosen": -2.4762284755706787, "logits/rejected": -2.558218002319336, "logps/chosen": -30.14864158630371, "logps/rejected": -131.7941131591797, "loss": 30.8224, "losses_ref": -0.09419278800487518, "ref_logps/chosen": -93.49372100830078, "ref_logps/rejected": -86.99652862548828, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 63.345069885253906, "rewards/margins": 108.14266204833984, "rewards/rejected": -44.797607421875, "step": 340, "u": -2.2006499767303467, "weight": 0.04497247934341431 }, { "diff_generated": -44.76807403564453, "epoch": 0.11341542449773169, "grad_norm": 753.8383918643427, "learning_rate": 3.023758099352052e-07, "logits/chosen": -2.457641124725342, "logits/rejected": -2.528771162033081, "logps/chosen": -32.162479400634766, "logps/rejected": -128.0952606201172, "loss": 30.7947, "losses_ref": -0.005067890044301748, "ref_logps/chosen": -98.45516204833984, "ref_logps/rejected": -83.32720184326172, "rewards/accuracies": 0.96875, "rewards/chosen": 66.29267883300781, "rewards/margins": 111.06075286865234, "rewards/rejected": -44.76807403564453, "step": 350, "u": -2.230400323867798, "weight": 0.03150248900055885 }, { "diff_generated": -44.002403259277344, "epoch": 0.11665586519766688, "grad_norm": 745.0619513300946, "learning_rate": 3.1101511879049674e-07, "logits/chosen": -2.4722886085510254, "logits/rejected": -2.5328807830810547, "logps/chosen": -30.5725154876709, "logps/rejected": -128.28097534179688, "loss": 31.3639, "losses_ref": -0.00848553515970707, "ref_logps/chosen": -96.97710418701172, "ref_logps/rejected": -84.27857971191406, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 66.40459442138672, "rewards/margins": 110.40699768066406, "rewards/rejected": -44.002403259277344, "step": 360, "u": -2.1440012454986572, "weight": 0.06905999779701233 }, { "diff_generated": -45.63245391845703, "epoch": 0.11989630589760207, "grad_norm": 721.4188041218672, "learning_rate": 3.1965442764578835e-07, "logits/chosen": -2.5433833599090576, "logits/rejected": -2.5798544883728027, "logps/chosen": -29.05537986755371, "logps/rejected": -132.5902099609375, "loss": 29.871, "losses_ref": -0.00014184534666128457, "ref_logps/chosen": -99.68949890136719, "ref_logps/rejected": -86.9577407836914, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 70.63412475585938, "rewards/margins": 116.26658630371094, "rewards/rejected": -45.63245391845703, "step": 370, "u": -2.216233015060425, "weight": 0.03750551864504814 }, { "diff_generated": -46.33927917480469, "epoch": 0.12313674659753726, "grad_norm": 723.4759624818365, "learning_rate": 3.282937365010799e-07, "logits/chosen": -2.501493215560913, "logits/rejected": -2.5781912803649902, "logps/chosen": -30.456531524658203, "logps/rejected": -137.51571655273438, "loss": 30.3151, "losses_ref": -0.033796604722738266, "ref_logps/chosen": -94.95821380615234, "ref_logps/rejected": -91.17644500732422, "rewards/accuracies": 0.9375, "rewards/chosen": 64.5016860961914, "rewards/margins": 110.84095764160156, "rewards/rejected": -46.33927917480469, "step": 380, "u": -2.158191442489624, "weight": 0.06301557272672653 }, { "diff_generated": -46.33161163330078, "epoch": 0.12637718729747247, "grad_norm": 804.2361358345585, "learning_rate": 3.3693304535637145e-07, "logits/chosen": -2.5290114879608154, "logits/rejected": -2.6311047077178955, "logps/chosen": -32.53776931762695, "logps/rejected": -134.83871459960938, "loss": 30.3089, "losses_ref": -0.004766993690282106, "ref_logps/chosen": -99.89033508300781, "ref_logps/rejected": -88.50709533691406, "rewards/accuracies": 0.96875, "rewards/chosen": 67.3525619506836, "rewards/margins": 113.68416595458984, "rewards/rejected": -46.33161163330078, "step": 390, "u": -2.2304444313049316, "weight": 0.031454164534807205 }, { "diff_generated": -46.13669204711914, "epoch": 0.12961762799740764, "grad_norm": 713.9089688148474, "learning_rate": 3.4557235421166306e-07, "logits/chosen": -2.4912726879119873, "logits/rejected": -2.5743274688720703, "logps/chosen": -30.20980453491211, "logps/rejected": -133.94284057617188, "loss": 28.4143, "losses_ref": -0.00325656752102077, "ref_logps/chosen": -95.0901870727539, "ref_logps/rejected": -87.80616760253906, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 64.88038635253906, "rewards/margins": 111.0170669555664, "rewards/rejected": -46.13669204711914, "step": 400, "u": -2.201824903488159, "weight": 0.0437745526432991 }, { "diff_generated": -45.739585876464844, "epoch": 0.13285806869734285, "grad_norm": 708.8169001595913, "learning_rate": 3.542116630669546e-07, "logits/chosen": -2.4827613830566406, "logits/rejected": -2.5828845500946045, "logps/chosen": -27.490942001342773, "logps/rejected": -135.59054565429688, "loss": 28.7896, "losses_ref": -0.004578437190502882, "ref_logps/chosen": -93.90925598144531, "ref_logps/rejected": -89.85096740722656, "rewards/accuracies": 0.96875, "rewards/chosen": 66.41831970214844, "rewards/margins": 112.15791320800781, "rewards/rejected": -45.739585876464844, "step": 410, "u": -2.2303926944732666, "weight": 0.03151000663638115 }, { "diff_generated": -44.8867301940918, "epoch": 0.13609850939727802, "grad_norm": 729.0360218701636, "learning_rate": 3.628509719222462e-07, "logits/chosen": -2.537703275680542, "logits/rejected": -2.633138656616211, "logps/chosen": -29.31488037109375, "logps/rejected": -132.73167419433594, "loss": 29.2031, "losses_ref": -0.008955566212534904, "ref_logps/chosen": -99.2090072631836, "ref_logps/rejected": -87.84493255615234, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 69.89412689208984, "rewards/margins": 114.7808609008789, "rewards/rejected": -44.8867301940918, "step": 420, "u": -2.2448973655700684, "weight": 0.025136280804872513 }, { "diff_generated": -44.21284103393555, "epoch": 0.13933895009721323, "grad_norm": 708.8181357717095, "learning_rate": 3.7149028077753777e-07, "logits/chosen": -2.455392360687256, "logits/rejected": -2.5685718059539795, "logps/chosen": -25.873310089111328, "logps/rejected": -123.27958679199219, "loss": 29.4561, "losses_ref": -0.0032315519638359547, "ref_logps/chosen": -88.95941162109375, "ref_logps/rejected": -79.06675720214844, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 63.08610916137695, "rewards/margins": 107.2989501953125, "rewards/rejected": -44.21284103393555, "step": 430, "u": -2.0578160285949707, "weight": 0.10638221353292465 }, { "diff_generated": -44.5488395690918, "epoch": 0.1425793907971484, "grad_norm": 758.7274720374536, "learning_rate": 3.801295896328294e-07, "logits/chosen": -2.4996840953826904, "logits/rejected": -2.5589592456817627, "logps/chosen": -28.99948501586914, "logps/rejected": -127.78646087646484, "loss": 29.7841, "losses_ref": -0.19051943719387054, "ref_logps/chosen": -95.38642883300781, "ref_logps/rejected": -83.23762512207031, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 66.3869400024414, "rewards/margins": 110.935791015625, "rewards/rejected": -44.5488395690918, "step": 440, "u": -2.1401772499084473, "weight": 0.07216720283031464 }, { "diff_generated": -43.60665512084961, "epoch": 0.1458198314970836, "grad_norm": 785.0425370892705, "learning_rate": 3.887688984881209e-07, "logits/chosen": -2.51354718208313, "logits/rejected": -2.557900905609131, "logps/chosen": -27.42291831970215, "logps/rejected": -127.36767578125, "loss": 28.9717, "losses_ref": -0.005422559566795826, "ref_logps/chosen": -92.18173217773438, "ref_logps/rejected": -83.76101684570312, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 64.75880432128906, "rewards/margins": 108.36546325683594, "rewards/rejected": -43.60665512084961, "step": 450, "u": -2.115337371826172, "weight": 0.08142946660518646 }, { "diff_generated": -44.98752975463867, "epoch": 0.14906027219701878, "grad_norm": 761.1618028676306, "learning_rate": 3.974082073434125e-07, "logits/chosen": -2.5288493633270264, "logits/rejected": -2.5853981971740723, "logps/chosen": -30.04689598083496, "logps/rejected": -128.4542694091797, "loss": 29.1264, "losses_ref": -0.0021941731683909893, "ref_logps/chosen": -97.36630249023438, "ref_logps/rejected": -83.46673583984375, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 67.31941223144531, "rewards/margins": 112.30694580078125, "rewards/rejected": -44.98752975463867, "step": 460, "u": -2.1729543209075928, "weight": 0.05637165904045105 }, { "diff_generated": -46.988868713378906, "epoch": 0.152300712896954, "grad_norm": 736.8358893108898, "learning_rate": 4.060475161987041e-07, "logits/chosen": -2.527107000350952, "logits/rejected": -2.6046130657196045, "logps/chosen": -30.838703155517578, "logps/rejected": -138.16522216796875, "loss": 29.1809, "losses_ref": -0.00042680688784457743, "ref_logps/chosen": -100.99031066894531, "ref_logps/rejected": -91.17635345458984, "rewards/accuracies": 0.96875, "rewards/chosen": 70.151611328125, "rewards/margins": 117.14048767089844, "rewards/rejected": -46.988868713378906, "step": 470, "u": -2.2306249141693115, "weight": 0.03125474974513054 }, { "diff_generated": -47.64769744873047, "epoch": 0.15554115359688916, "grad_norm": 689.8783850837162, "learning_rate": 4.146868250539957e-07, "logits/chosen": -2.522904634475708, "logits/rejected": -2.5496037006378174, "logps/chosen": -26.92234230041504, "logps/rejected": -130.68350219726562, "loss": 28.5857, "losses_ref": -0.016092773526906967, "ref_logps/chosen": -95.0140380859375, "ref_logps/rejected": -83.03581237792969, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 68.09169006347656, "rewards/margins": 115.73939514160156, "rewards/rejected": -47.64769744873047, "step": 480, "u": -2.2014780044555664, "weight": 0.044157128781080246 }, { "diff_generated": -45.847373962402344, "epoch": 0.15878159429682437, "grad_norm": 742.9073843335941, "learning_rate": 4.2332613390928724e-07, "logits/chosen": -2.505765438079834, "logits/rejected": -2.5322728157043457, "logps/chosen": -29.727895736694336, "logps/rejected": -128.55931091308594, "loss": 29.5698, "losses_ref": -0.023366082459688187, "ref_logps/chosen": -99.20357513427734, "ref_logps/rejected": -82.7119369506836, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 69.4756851196289, "rewards/margins": 115.32305908203125, "rewards/rejected": -45.847373962402344, "step": 490, "u": -2.172699451446533, "weight": 0.05664912611246109 }, { "diff_generated": -47.095062255859375, "epoch": 0.16202203499675957, "grad_norm": 645.9278684155573, "learning_rate": 4.319654427645788e-07, "logits/chosen": -2.4909958839416504, "logits/rejected": -2.604588270187378, "logps/chosen": -26.1655216217041, "logps/rejected": -130.22900390625, "loss": 28.1373, "losses_ref": -0.0002788856509141624, "ref_logps/chosen": -93.60696411132812, "ref_logps/rejected": -83.1339340209961, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 67.44145202636719, "rewards/margins": 114.53651428222656, "rewards/rejected": -47.095062255859375, "step": 500, "u": -2.1442744731903076, "weight": 0.06875874847173691 }, { "diff_generated": -45.97614288330078, "epoch": 0.16526247569669475, "grad_norm": 671.3594525863219, "learning_rate": 4.406047516198704e-07, "logits/chosen": -2.4338736534118652, "logits/rejected": -2.586536169052124, "logps/chosen": -25.121789932250977, "logps/rejected": -125.8269271850586, "loss": 27.2431, "losses_ref": -0.004285829141736031, "ref_logps/chosen": -84.57170104980469, "ref_logps/rejected": -79.85078430175781, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 59.44990158081055, "rewards/margins": 105.4260482788086, "rewards/rejected": -45.97614288330078, "step": 510, "u": -2.115294933319092, "weight": 0.08147425949573517 }, { "diff_generated": -47.68573760986328, "epoch": 0.16850291639662995, "grad_norm": 727.2153684375536, "learning_rate": 4.4924406047516195e-07, "logits/chosen": -2.538134813308716, "logits/rejected": -2.5833652019500732, "logps/chosen": -30.263620376586914, "logps/rejected": -137.04959106445312, "loss": 27.8138, "losses_ref": -0.00044774659909307957, "ref_logps/chosen": -102.84830474853516, "ref_logps/rejected": -89.36383819580078, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 72.58468627929688, "rewards/margins": 120.27042388916016, "rewards/rejected": -47.68573760986328, "step": 520, "u": -2.201833724975586, "weight": 0.04376474767923355 }, { "diff_generated": -45.31121063232422, "epoch": 0.17174335709656513, "grad_norm": 688.1006559007947, "learning_rate": 4.5788336933045356e-07, "logits/chosen": -2.5154356956481934, "logits/rejected": -2.595595598220825, "logps/chosen": -26.32985496520996, "logps/rejected": -128.65464782714844, "loss": 26.9283, "losses_ref": -0.023449674248695374, "ref_logps/chosen": -92.49571228027344, "ref_logps/rejected": -83.34344482421875, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 66.16585540771484, "rewards/margins": 111.4770736694336, "rewards/rejected": -45.31121063232422, "step": 530, "u": -2.1150999069213867, "weight": 0.08168499171733856 }, { "diff_generated": -45.9770622253418, "epoch": 0.17498379779650033, "grad_norm": 661.6627036065419, "learning_rate": 4.665226781857451e-07, "logits/chosen": -2.495020627975464, "logits/rejected": -2.6251144409179688, "logps/chosen": -27.82745361328125, "logps/rejected": -132.03318786621094, "loss": 27.5039, "losses_ref": -0.3354525566101074, "ref_logps/chosen": -93.02908325195312, "ref_logps/rejected": -86.0561294555664, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 65.20162200927734, "rewards/margins": 111.1786880493164, "rewards/rejected": -45.9770622253418, "step": 540, "u": -2.1621901988983154, "weight": 0.06197571009397507 }, { "diff_generated": -43.12482452392578, "epoch": 0.1782242384964355, "grad_norm": 722.3393998366524, "learning_rate": 4.751619870410367e-07, "logits/chosen": -2.4731545448303223, "logits/rejected": -2.4869635105133057, "logps/chosen": -27.638330459594727, "logps/rejected": -121.67626953125, "loss": 27.899, "losses_ref": -0.00017084872524719685, "ref_logps/chosen": -92.68699645996094, "ref_logps/rejected": -78.55145263671875, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 65.04866790771484, "rewards/margins": 108.1734848022461, "rewards/rejected": -43.12482452392578, "step": 550, "u": -2.144278049468994, "weight": 0.06875475496053696 }, { "diff_generated": -47.44527053833008, "epoch": 0.18146467919637072, "grad_norm": 659.4609284904622, "learning_rate": 4.838012958963283e-07, "logits/chosen": -2.4913129806518555, "logits/rejected": -2.594072103500366, "logps/chosen": -24.48001480102539, "logps/rejected": -135.76724243164062, "loss": 27.7372, "losses_ref": -0.0050836303271353245, "ref_logps/chosen": -92.13008117675781, "ref_logps/rejected": -88.32197570800781, "rewards/accuracies": 0.96875, "rewards/chosen": 67.65005493164062, "rewards/margins": 115.0953369140625, "rewards/rejected": -47.44527053833008, "step": 560, "u": -2.230499744415283, "weight": 0.03139229863882065 }, { "diff_generated": -49.268489837646484, "epoch": 0.1847051198963059, "grad_norm": 696.0980480562955, "learning_rate": 4.924406047516198e-07, "logits/chosen": -2.4896411895751953, "logits/rejected": -2.5952906608581543, "logps/chosen": -26.756811141967773, "logps/rejected": -137.99081420898438, "loss": 27.2014, "losses_ref": -0.0020136612001806498, "ref_logps/chosen": -100.06852722167969, "ref_logps/rejected": -88.72233581542969, "rewards/accuracies": 0.96875, "rewards/chosen": 73.31172180175781, "rewards/margins": 122.5802001953125, "rewards/rejected": -49.268489837646484, "step": 570, "u": -2.2305784225463867, "weight": 0.031306345015764236 }, { "diff_generated": -46.57258987426758, "epoch": 0.1879455605962411, "grad_norm": 680.9889293886696, "learning_rate": 5.010799136069114e-07, "logits/chosen": -2.551598310470581, "logits/rejected": -2.584933280944824, "logps/chosen": -28.661914825439453, "logps/rejected": -135.87179565429688, "loss": 27.5338, "losses_ref": -0.00011642322351690382, "ref_logps/chosen": -98.13680267333984, "ref_logps/rejected": -89.29920959472656, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 69.47489929199219, "rewards/margins": 116.0474853515625, "rewards/rejected": -46.57258987426758, "step": 580, "u": -2.173062562942505, "weight": 0.056252289563417435 }, { "diff_generated": -48.98841094970703, "epoch": 0.19118600129617627, "grad_norm": 644.8107725127082, "learning_rate": 5.097192224622029e-07, "logits/chosen": -2.518934488296509, "logits/rejected": -2.5555710792541504, "logps/chosen": -26.301610946655273, "logps/rejected": -131.03005981445312, "loss": 27.3424, "losses_ref": -0.0015558989252895117, "ref_logps/chosen": -96.68141174316406, "ref_logps/rejected": -82.04166412353516, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 70.37979888916016, "rewards/margins": 119.36820983886719, "rewards/rejected": -48.98841094970703, "step": 590, "u": -2.2449631690979004, "weight": 0.02506338059902191 }, { "diff_generated": -46.974483489990234, "epoch": 0.19442644199611148, "grad_norm": 666.4151875482576, "learning_rate": 5.183585313174946e-07, "logits/chosen": -2.5313637256622314, "logits/rejected": -2.5866589546203613, "logps/chosen": -32.090721130371094, "logps/rejected": -131.39773559570312, "loss": 27.5002, "losses_ref": -1.6000910818547709e-06, "ref_logps/chosen": -98.82708740234375, "ref_logps/rejected": -84.42324829101562, "rewards/accuracies": 0.9375, "rewards/chosen": 66.73637390136719, "rewards/margins": 113.7108383178711, "rewards/rejected": -46.974483489990234, "step": 600, "u": -2.1586735248565674, "weight": 0.06250002980232239 }, { "diff_generated": -49.115753173828125, "epoch": 0.19766688269604665, "grad_norm": 707.0247432576331, "learning_rate": 5.269978401727861e-07, "logits/chosen": -2.507723093032837, "logits/rejected": -2.5827906131744385, "logps/chosen": -28.90814208984375, "logps/rejected": -136.62918090820312, "loss": 27.4246, "losses_ref": -1.971707388292998e-05, "ref_logps/chosen": -100.26750183105469, "ref_logps/rejected": -87.51342010498047, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 71.35935974121094, "rewards/margins": 120.47511291503906, "rewards/rejected": -49.115753173828125, "step": 610, "u": -2.187455654144287, "weight": 0.050000227987766266 }, { "diff_generated": -48.82651901245117, "epoch": 0.20090732339598186, "grad_norm": 679.191601482562, "learning_rate": 5.356371490280777e-07, "logits/chosen": -2.5194132328033447, "logits/rejected": -2.6098580360412598, "logps/chosen": -28.0224609375, "logps/rejected": -140.78091430664062, "loss": 26.5323, "losses_ref": -0.00011035500938305631, "ref_logps/chosen": -93.5048599243164, "ref_logps/rejected": -91.95439147949219, "rewards/accuracies": 0.9375, "rewards/chosen": 65.48240661621094, "rewards/margins": 114.30892181396484, "rewards/rejected": -48.82651901245117, "step": 620, "u": -2.158672571182251, "weight": 0.06250132620334625 }, { "diff_generated": -49.514381408691406, "epoch": 0.20414776409591703, "grad_norm": 716.9561895225931, "learning_rate": 5.442764578833693e-07, "logits/chosen": -2.534128189086914, "logits/rejected": -2.5818467140197754, "logps/chosen": -30.54462242126465, "logps/rejected": -139.87940979003906, "loss": 27.3582, "losses_ref": -0.016371209174394608, "ref_logps/chosen": -100.50285339355469, "ref_logps/rejected": -90.36502075195312, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 69.9582290649414, "rewards/margins": 119.47261810302734, "rewards/rejected": -49.514381408691406, "step": 630, "u": -2.2013890743255615, "weight": 0.04425160214304924 }, { "diff_generated": -47.97042465209961, "epoch": 0.20738820479585224, "grad_norm": 682.4139467028398, "learning_rate": 5.529157667386608e-07, "logits/chosen": -2.532174587249756, "logits/rejected": -2.578139066696167, "logps/chosen": -25.9278564453125, "logps/rejected": -131.57861328125, "loss": 26.2839, "losses_ref": -0.00019156280905008316, "ref_logps/chosen": -91.6938705444336, "ref_logps/rejected": -83.6081771850586, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 65.76602172851562, "rewards/margins": 113.7364501953125, "rewards/rejected": -47.97042465209961, "step": 640, "u": -2.129889488220215, "weight": 0.07500191032886505 }, { "diff_generated": -50.43115997314453, "epoch": 0.21062864549578741, "grad_norm": 682.2415820353419, "learning_rate": 5.615550755939525e-07, "logits/chosen": -2.4852089881896973, "logits/rejected": -2.5890121459960938, "logps/chosen": -26.9046573638916, "logps/rejected": -135.3916015625, "loss": 27.7744, "losses_ref": -1.791851900634356e-05, "ref_logps/chosen": -94.32743835449219, "ref_logps/rejected": -84.96044158935547, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 67.42278289794922, "rewards/margins": 117.85392761230469, "rewards/rejected": -50.43115997314453, "step": 650, "u": -2.2018465995788574, "weight": 0.04375043511390686 }, { "diff_generated": -46.6839485168457, "epoch": 0.21386908619572262, "grad_norm": 685.3497673103903, "learning_rate": 5.701943844492441e-07, "logits/chosen": -2.529299020767212, "logits/rejected": -2.5489015579223633, "logps/chosen": -29.30217933654785, "logps/rejected": -133.85888671875, "loss": 27.8461, "losses_ref": -0.002348927315324545, "ref_logps/chosen": -95.46574401855469, "ref_logps/rejected": -87.1749267578125, "rewards/accuracies": 0.9375, "rewards/chosen": 66.16357421875, "rewards/margins": 112.84751892089844, "rewards/rejected": -46.6839485168457, "step": 660, "u": -2.1586408615112305, "weight": 0.06253615021705627 }, { "diff_generated": -50.5396728515625, "epoch": 0.21710952689565782, "grad_norm": 636.5908719338354, "learning_rate": 5.788336933045357e-07, "logits/chosen": -2.5330991744995117, "logits/rejected": -2.5660250186920166, "logps/chosen": -29.539474487304688, "logps/rejected": -140.71392822265625, "loss": 26.3251, "losses_ref": -0.010962968692183495, "ref_logps/chosen": -100.68766784667969, "ref_logps/rejected": -90.17426300048828, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 71.14818572998047, "rewards/margins": 121.6878662109375, "rewards/rejected": -50.5396728515625, "step": 670, "u": -2.2449309825897217, "weight": 0.025098636746406555 }, { "diff_generated": -51.689666748046875, "epoch": 0.220349967595593, "grad_norm": 617.310775210867, "learning_rate": 5.874730021598272e-07, "logits/chosen": -2.5135836601257324, "logits/rejected": -2.585562229156494, "logps/chosen": -25.00190544128418, "logps/rejected": -142.4840545654297, "loss": 26.4555, "losses_ref": -0.2515738606452942, "ref_logps/chosen": -97.02738952636719, "ref_logps/rejected": -90.79439544677734, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 72.02549743652344, "rewards/margins": 123.71514892578125, "rewards/rejected": -51.689666748046875, "step": 680, "u": -2.239535093307495, "weight": 0.029057253152132034 }, { "diff_generated": -50.270172119140625, "epoch": 0.2235904082955282, "grad_norm": 662.4353318575914, "learning_rate": 5.961123110151188e-07, "logits/chosen": -2.519314765930176, "logits/rejected": -2.567821979522705, "logps/chosen": -27.141326904296875, "logps/rejected": -140.2618408203125, "loss": 26.7699, "losses_ref": -2.612667003631941e-06, "ref_logps/chosen": -98.13988494873047, "ref_logps/rejected": -89.99168395996094, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 70.9985580444336, "rewards/margins": 121.26872253417969, "rewards/rejected": -50.270172119140625, "step": 690, "u": -2.2018468379974365, "weight": 0.04375007748603821 }, { "diff_generated": -47.56863784790039, "epoch": 0.22683084899546338, "grad_norm": 636.8522481633842, "learning_rate": 6.047516198704104e-07, "logits/chosen": -2.5055432319641113, "logits/rejected": -2.5514261722564697, "logps/chosen": -24.244586944580078, "logps/rejected": -130.41024780273438, "loss": 26.718, "losses_ref": -0.008043577894568443, "ref_logps/chosen": -92.876708984375, "ref_logps/rejected": -82.84159851074219, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 68.63211822509766, "rewards/margins": 116.20075988769531, "rewards/rejected": -47.56863784790039, "step": 700, "u": -2.1439430713653564, "weight": 0.0691227912902832 }, { "diff_generated": -49.37560272216797, "epoch": 0.23007128969539858, "grad_norm": 594.9494944924277, "learning_rate": 6.133909287257019e-07, "logits/chosen": -2.4527430534362793, "logits/rejected": -2.5097007751464844, "logps/chosen": -25.301738739013672, "logps/rejected": -132.68112182617188, "loss": 26.4036, "losses_ref": -3.0844853426970076e-06, "ref_logps/chosen": -93.79328918457031, "ref_logps/rejected": -83.30552673339844, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 68.4915542602539, "rewards/margins": 117.86714935302734, "rewards/rejected": -49.37560272216797, "step": 710, "u": -2.144282341003418, "weight": 0.06875006854534149 }, { "diff_generated": -51.86012649536133, "epoch": 0.23331173039533376, "grad_norm": 669.4559775251164, "learning_rate": 6.220302375809935e-07, "logits/chosen": -2.5223159790039062, "logits/rejected": -2.596004009246826, "logps/chosen": -25.50338363647461, "logps/rejected": -139.2146759033203, "loss": 26.0855, "losses_ref": -0.020114298909902573, "ref_logps/chosen": -94.37274169921875, "ref_logps/rejected": -87.35454559326172, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 68.8693618774414, "rewards/margins": 120.72947692871094, "rewards/rejected": -51.86012649536133, "step": 720, "u": -2.201023578643799, "weight": 0.04462960734963417 }, { "diff_generated": -50.6171875, "epoch": 0.23655217109526896, "grad_norm": 737.1465697011254, "learning_rate": 6.306695464362851e-07, "logits/chosen": -2.4950945377349854, "logits/rejected": -2.5960280895233154, "logps/chosen": -25.795177459716797, "logps/rejected": -138.80050659179688, "loss": 26.2013, "losses_ref": -9.304036439061747e-07, "ref_logps/chosen": -94.69329071044922, "ref_logps/rejected": -88.18331909179688, "rewards/accuracies": 0.9375, "rewards/chosen": 68.89811706542969, "rewards/margins": 119.51530456542969, "rewards/rejected": -50.6171875, "step": 730, "u": -2.1586735248565674, "weight": 0.0625000149011612 }, { "diff_generated": -48.57529830932617, "epoch": 0.23979261179520414, "grad_norm": 643.2099825309971, "learning_rate": 6.393088552915767e-07, "logits/chosen": -2.4291679859161377, "logits/rejected": -2.5446364879608154, "logps/chosen": -24.468910217285156, "logps/rejected": -132.59165954589844, "loss": 26.4268, "losses_ref": -0.0013817059807479382, "ref_logps/chosen": -92.81462097167969, "ref_logps/rejected": -84.016357421875, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 68.34571838378906, "rewards/margins": 116.92100524902344, "rewards/rejected": -48.57529830932617, "step": 740, "u": -2.1154351234436035, "weight": 0.08132173866033554 }, { "diff_generated": -50.2636604309082, "epoch": 0.24303305249513935, "grad_norm": 660.445744253361, "learning_rate": 6.479481641468682e-07, "logits/chosen": -2.467881679534912, "logits/rejected": -2.5269060134887695, "logps/chosen": -25.483837127685547, "logps/rejected": -133.77403259277344, "loss": 26.7616, "losses_ref": -0.00010903090151259676, "ref_logps/chosen": -95.33479309082031, "ref_logps/rejected": -83.5103759765625, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 69.8509521484375, "rewards/margins": 120.1146011352539, "rewards/rejected": -50.2636604309082, "step": 750, "u": -2.144279956817627, "weight": 0.06875289976596832 }, { "diff_generated": -51.882415771484375, "epoch": 0.24627349319507452, "grad_norm": 596.6517283622172, "learning_rate": 6.565874730021598e-07, "logits/chosen": -2.5085771083831787, "logits/rejected": -2.6453890800476074, "logps/chosen": -26.5816593170166, "logps/rejected": -143.75991821289062, "loss": 26.4018, "losses_ref": -3.152846602461068e-06, "ref_logps/chosen": -94.84214782714844, "ref_logps/rejected": -91.87751007080078, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 68.26048278808594, "rewards/margins": 120.14290618896484, "rewards/rejected": -51.882415771484375, "step": 760, "u": -2.259411573410034, "weight": 0.018750067800283432 }, { "diff_generated": -49.163570404052734, "epoch": 0.24951393389500973, "grad_norm": 661.0098738714644, "learning_rate": 6.652267818574514e-07, "logits/chosen": -2.517294406890869, "logits/rejected": -2.5745046138763428, "logps/chosen": -26.86139488220215, "logps/rejected": -135.12640380859375, "loss": 26.2463, "losses_ref": -0.0008471701294183731, "ref_logps/chosen": -94.9187240600586, "ref_logps/rejected": -85.96281433105469, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 68.05733489990234, "rewards/margins": 117.22090148925781, "rewards/rejected": -49.163570404052734, "step": 770, "u": -2.115487813949585, "weight": 0.08126357942819595 }, { "diff_generated": -51.59791946411133, "epoch": 0.25275437459494493, "grad_norm": 587.3515590292881, "learning_rate": 6.738660907127429e-07, "logits/chosen": -2.516849994659424, "logits/rejected": -2.5866785049438477, "logps/chosen": -29.505550384521484, "logps/rejected": -143.68307495117188, "loss": 26.9824, "losses_ref": -2.568489253462758e-05, "ref_logps/chosen": -98.34056091308594, "ref_logps/rejected": -92.08514404296875, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 68.83500671386719, "rewards/margins": 120.43292236328125, "rewards/rejected": -51.59791946411133, "step": 780, "u": -2.2162375450134277, "weight": 0.037500619888305664 }, { "diff_generated": -47.421485900878906, "epoch": 0.2559948152948801, "grad_norm": 665.3356518222674, "learning_rate": 6.825053995680345e-07, "logits/chosen": -2.457528591156006, "logits/rejected": -2.594669818878174, "logps/chosen": -25.825942993164062, "logps/rejected": -132.35867309570312, "loss": 26.2666, "losses_ref": -0.0013907465618103743, "ref_logps/chosen": -91.23823547363281, "ref_logps/rejected": -84.93717193603516, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 65.41229248046875, "rewards/margins": 112.8337631225586, "rewards/rejected": -47.421485900878906, "step": 790, "u": -2.1298203468322754, "weight": 0.07507820427417755 }, { "diff_generated": -52.05354690551758, "epoch": 0.2592352559948153, "grad_norm": 643.0444700640635, "learning_rate": 6.911447084233261e-07, "logits/chosen": -2.484684705734253, "logits/rejected": -2.5525033473968506, "logps/chosen": -24.99363899230957, "logps/rejected": -136.7119140625, "loss": 25.4338, "losses_ref": -0.022313769906759262, "ref_logps/chosen": -96.20983123779297, "ref_logps/rejected": -84.65837097167969, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 71.21620178222656, "rewards/margins": 123.2697525024414, "rewards/rejected": -52.05354690551758, "step": 800, "u": -2.2015719413757324, "weight": 0.044049136340618134 }, { "diff_generated": -47.964622497558594, "epoch": 0.26247569669475046, "grad_norm": 610.0379646255323, "learning_rate": 6.997840172786177e-07, "logits/chosen": -2.5204567909240723, "logits/rejected": -2.5491814613342285, "logps/chosen": -27.75726890563965, "logps/rejected": -129.86639404296875, "loss": 25.074, "losses_ref": -0.04612641781568527, "ref_logps/chosen": -98.92430114746094, "ref_logps/rejected": -81.90177917480469, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 71.16703796386719, "rewards/margins": 119.13166809082031, "rewards/rejected": -47.964622497558594, "step": 810, "u": -2.2013754844665527, "weight": 0.04425793141126633 }, { "diff_generated": -51.2788200378418, "epoch": 0.2657161373946857, "grad_norm": 574.9023035648446, "learning_rate": 7.084233261339092e-07, "logits/chosen": -2.4982612133026123, "logits/rejected": -2.6119399070739746, "logps/chosen": -25.619009017944336, "logps/rejected": -138.2942657470703, "loss": 25.7192, "losses_ref": -0.010108882561326027, "ref_logps/chosen": -92.62557220458984, "ref_logps/rejected": -87.01544189453125, "rewards/accuracies": 0.9375, "rewards/chosen": 67.00656127929688, "rewards/margins": 118.2853775024414, "rewards/rejected": -51.2788200378418, "step": 820, "u": -2.158543109893799, "weight": 0.06264402717351913 }, { "diff_generated": -49.65804672241211, "epoch": 0.26895657809462087, "grad_norm": 611.8921644073595, "learning_rate": 7.170626349892008e-07, "logits/chosen": -2.5054807662963867, "logits/rejected": -2.5855443477630615, "logps/chosen": -25.27883529663086, "logps/rejected": -134.4387969970703, "loss": 26.0776, "losses_ref": -0.002082222606986761, "ref_logps/chosen": -90.44572448730469, "ref_logps/rejected": -84.78074645996094, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 65.16688537597656, "rewards/margins": 114.82493591308594, "rewards/rejected": -49.65804672241211, "step": 830, "u": -2.1297812461853027, "weight": 0.07512130588293076 }, { "diff_generated": -48.48400115966797, "epoch": 0.27219701879455604, "grad_norm": 661.4384956363074, "learning_rate": 7.257019438444924e-07, "logits/chosen": -2.5549542903900146, "logits/rejected": -2.538541793823242, "logps/chosen": -27.421245574951172, "logps/rejected": -135.65159606933594, "loss": 25.6821, "losses_ref": -6.91217292114743e-06, "ref_logps/chosen": -99.6714096069336, "ref_logps/rejected": -87.1675796508789, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 72.25016784667969, "rewards/margins": 120.73416900634766, "rewards/rejected": -48.48400115966797, "step": 840, "u": -2.187455654144287, "weight": 0.0500001385807991 }, { "diff_generated": -47.902313232421875, "epoch": 0.2754374594944913, "grad_norm": 624.0056138723679, "learning_rate": 7.343412526997839e-07, "logits/chosen": -2.4939754009246826, "logits/rejected": -2.5117056369781494, "logps/chosen": -26.297977447509766, "logps/rejected": -128.50059509277344, "loss": 26.1491, "losses_ref": -2.7239013888902264e-06, "ref_logps/chosen": -94.59888458251953, "ref_logps/rejected": -80.5982894897461, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 68.30091094970703, "rewards/margins": 116.2032241821289, "rewards/rejected": -47.902313232421875, "step": 850, "u": -2.101109027862549, "weight": 0.08750005066394806 }, { "diff_generated": -50.83550262451172, "epoch": 0.27867790019442645, "grad_norm": 651.6845882583268, "learning_rate": 7.429805615550755e-07, "logits/chosen": -2.523118257522583, "logits/rejected": -2.5627999305725098, "logps/chosen": -27.77315902709961, "logps/rejected": -137.4436492919922, "loss": 25.9522, "losses_ref": -0.0023150129709392786, "ref_logps/chosen": -96.98990631103516, "ref_logps/rejected": -86.60813903808594, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 69.21675109863281, "rewards/margins": 120.05224609375, "rewards/rejected": -50.83550262451172, "step": 860, "u": -2.1441612243652344, "weight": 0.06888330727815628 }, { "diff_generated": -53.04962921142578, "epoch": 0.28191834089436163, "grad_norm": 647.3557833104493, "learning_rate": 7.516198704103671e-07, "logits/chosen": -2.4396255016326904, "logits/rejected": -2.6068167686462402, "logps/chosen": -20.75543212890625, "logps/rejected": -139.11436462402344, "loss": 24.3917, "losses_ref": -0.0001939669018611312, "ref_logps/chosen": -87.64575958251953, "ref_logps/rejected": -86.06474304199219, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 66.89032745361328, "rewards/margins": 119.93994140625, "rewards/rejected": -53.04962921142578, "step": 870, "u": -2.173062562942505, "weight": 0.05625234916806221 }, { "diff_generated": -50.16646194458008, "epoch": 0.2851587815942968, "grad_norm": 652.868899692234, "learning_rate": 7.602591792656587e-07, "logits/chosen": -2.536747694015503, "logits/rejected": -2.653494358062744, "logps/chosen": -28.167068481445312, "logps/rejected": -136.5677490234375, "loss": 25.6688, "losses_ref": -0.0002300078485859558, "ref_logps/chosen": -95.93633270263672, "ref_logps/rejected": -86.40128326416016, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 67.7692642211914, "rewards/margins": 117.93571472167969, "rewards/rejected": -50.16646194458008, "step": 880, "u": -2.216236114501953, "weight": 0.03750232607126236 }, { "diff_generated": -48.77215576171875, "epoch": 0.28839922229423204, "grad_norm": 634.4173185749601, "learning_rate": 7.688984881209502e-07, "logits/chosen": -2.5125479698181152, "logits/rejected": -2.6143321990966797, "logps/chosen": -25.836898803710938, "logps/rejected": -135.23353576660156, "loss": 26.4335, "losses_ref": -0.0017788056284189224, "ref_logps/chosen": -93.00483703613281, "ref_logps/rejected": -86.46137237548828, "rewards/accuracies": 0.9375, "rewards/chosen": 67.16793060302734, "rewards/margins": 115.9400863647461, "rewards/rejected": -48.77215576171875, "step": 890, "u": -2.1585824489593506, "weight": 0.0626002699136734 }, { "diff_generated": -51.363059997558594, "epoch": 0.2916396629941672, "grad_norm": 651.5449682597222, "learning_rate": 7.775377969762419e-07, "logits/chosen": -2.512857437133789, "logits/rejected": -2.6043753623962402, "logps/chosen": -22.82807731628418, "logps/rejected": -137.58694458007812, "loss": 24.7741, "losses_ref": -0.010770822875201702, "ref_logps/chosen": -92.31722259521484, "ref_logps/rejected": -86.22389221191406, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 69.48914337158203, "rewards/margins": 120.85221099853516, "rewards/rejected": -51.363059997558594, "step": 900, "u": -2.2015607357025146, "weight": 0.04406408965587616 }, { "diff_generated": -49.646968841552734, "epoch": 0.2948801036941024, "grad_norm": 651.8929270726834, "learning_rate": 7.861771058315335e-07, "logits/chosen": -2.547717571258545, "logits/rejected": -2.6330621242523193, "logps/chosen": -24.217119216918945, "logps/rejected": -138.98812866210938, "loss": 26.0227, "losses_ref": -0.06351267546415329, "ref_logps/chosen": -96.43073272705078, "ref_logps/rejected": -89.34115600585938, "rewards/accuracies": 0.96875, "rewards/chosen": 72.21361541748047, "rewards/margins": 121.86058044433594, "rewards/rejected": -49.646968841552734, "step": 910, "u": -2.2299375534057617, "weight": 0.03198622539639473 }, { "diff_generated": -49.06929397583008, "epoch": 0.29812054439403757, "grad_norm": 645.3313956415323, "learning_rate": 7.94816414686825e-07, "logits/chosen": -2.4820826053619385, "logits/rejected": -2.5499496459960938, "logps/chosen": -23.020343780517578, "logps/rejected": -135.4531707763672, "loss": 25.0257, "losses_ref": -0.0012926750350743532, "ref_logps/chosen": -91.547607421875, "ref_logps/rejected": -86.38389587402344, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 68.52725982666016, "rewards/margins": 117.5965576171875, "rewards/rejected": -49.06929397583008, "step": 920, "u": -2.201815128326416, "weight": 0.04378523305058479 }, { "diff_generated": -52.943565368652344, "epoch": 0.3013609850939728, "grad_norm": 589.5035558054351, "learning_rate": 7.999995450631473e-07, "logits/chosen": -2.514936923980713, "logits/rejected": -2.6288318634033203, "logps/chosen": -23.644351959228516, "logps/rejected": -141.34170532226562, "loss": 25.621, "losses_ref": -2.8535801902762614e-05, "ref_logps/chosen": -91.30097961425781, "ref_logps/rejected": -88.39814758300781, "rewards/accuracies": 0.9375, "rewards/chosen": 67.6566390991211, "rewards/margins": 120.6001968383789, "rewards/rejected": -52.943565368652344, "step": 930, "u": -2.1586732864379883, "weight": 0.06250031292438507 }, { "diff_generated": -51.099365234375, "epoch": 0.304601425793908, "grad_norm": 562.7275091649057, "learning_rate": 7.999944270354383e-07, "logits/chosen": -2.4732613563537598, "logits/rejected": -2.6294915676116943, "logps/chosen": -26.57818031311035, "logps/rejected": -140.29534912109375, "loss": 25.235, "losses_ref": -1.997053686864092e-06, "ref_logps/chosen": -91.357666015625, "ref_logps/rejected": -89.19598388671875, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 64.77949523925781, "rewards/margins": 115.87886047363281, "rewards/rejected": -51.099365234375, "step": 940, "u": -2.2018468379974365, "weight": 0.04375007376074791 }, { "diff_generated": -50.940223693847656, "epoch": 0.30784186649384315, "grad_norm": 607.7827036467936, "learning_rate": 7.99983622381959e-07, "logits/chosen": -2.507047176361084, "logits/rejected": -2.5627152919769287, "logps/chosen": -26.599227905273438, "logps/rejected": -135.17007446289062, "loss": 25.5488, "losses_ref": -0.22947004437446594, "ref_logps/chosen": -97.92918395996094, "ref_logps/rejected": -84.22985076904297, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 71.32997131347656, "rewards/margins": 122.27018737792969, "rewards/rejected": -50.940223693847656, "step": 950, "u": -2.18208646774292, "weight": 0.054003261029720306 }, { "diff_generated": -51.61498260498047, "epoch": 0.31108230719377833, "grad_norm": 620.0873356568507, "learning_rate": 7.999671312563164e-07, "logits/chosen": -2.521172046661377, "logits/rejected": -2.52459716796875, "logps/chosen": -25.872112274169922, "logps/rejected": -133.9535675048828, "loss": 24.7471, "losses_ref": -0.00038078351644799113, "ref_logps/chosen": -96.575927734375, "ref_logps/rejected": -82.33859252929688, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 70.70381164550781, "rewards/margins": 122.31880187988281, "rewards/rejected": -51.61498260498047, "step": 960, "u": -2.1730563640594482, "weight": 0.056259334087371826 }, { "diff_generated": -53.22172164916992, "epoch": 0.31432274789371356, "grad_norm": 584.0586409072972, "learning_rate": 7.999449538929611e-07, "logits/chosen": -2.471055507659912, "logits/rejected": -2.5570130348205566, "logps/chosen": -24.730777740478516, "logps/rejected": -133.05520629882812, "loss": 25.3145, "losses_ref": -6.452653178712353e-05, "ref_logps/chosen": -92.06562805175781, "ref_logps/rejected": -79.83350372314453, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 67.3348617553711, "rewards/margins": 120.55656433105469, "rewards/rejected": -53.22172164916992, "step": 970, "u": -2.20184588432312, "weight": 0.04375113174319267 }, { "diff_generated": -53.71428298950195, "epoch": 0.31756318859364874, "grad_norm": 596.051340234887, "learning_rate": 7.99917090607183e-07, "logits/chosen": -2.4926838874816895, "logits/rejected": -2.6254682540893555, "logps/chosen": -22.642475128173828, "logps/rejected": -141.4960479736328, "loss": 24.9684, "losses_ref": -0.0023799485061317682, "ref_logps/chosen": -92.01348876953125, "ref_logps/rejected": -87.7817611694336, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 69.37101745605469, "rewards/margins": 123.08528900146484, "rewards/rejected": -53.71428298950195, "step": 980, "u": -2.1873419284820557, "weight": 0.0501260869204998 }, { "diff_generated": -51.06912612915039, "epoch": 0.3208036292935839, "grad_norm": 650.4725137934786, "learning_rate": 7.998835417951081e-07, "logits/chosen": -2.5319457054138184, "logits/rejected": -2.589599132537842, "logps/chosen": -25.422199249267578, "logps/rejected": -135.44515991210938, "loss": 25.2832, "losses_ref": -0.000997675582766533, "ref_logps/chosen": -95.29524230957031, "ref_logps/rejected": -84.37602996826172, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 69.87303924560547, "rewards/margins": 120.9421615600586, "rewards/rejected": -51.06912612915039, "step": 990, "u": -2.216195821762085, "weight": 0.03754685819149017 }, { "diff_generated": -53.8853759765625, "epoch": 0.32404406999351915, "grad_norm": 713.1371118025122, "learning_rate": 7.998443079336919e-07, "logits/chosen": -2.4941048622131348, "logits/rejected": -2.6063921451568604, "logps/chosen": -25.767486572265625, "logps/rejected": -147.36996459960938, "loss": 24.9082, "losses_ref": -5.925068307988113e-06, "ref_logps/chosen": -97.91233825683594, "ref_logps/rejected": -93.4845962524414, "rewards/accuracies": 0.96875, "rewards/chosen": 72.14485168457031, "rewards/margins": 126.03022766113281, "rewards/rejected": -53.8853759765625, "step": 1000, "u": -2.2306292057037354, "weight": 0.031250081956386566 }, { "diff_generated": -51.92223358154297, "epoch": 0.3272845106934543, "grad_norm": 611.4370402546909, "learning_rate": 7.997993895807128e-07, "logits/chosen": -2.554468870162964, "logits/rejected": -2.6043498516082764, "logps/chosen": -25.103681564331055, "logps/rejected": -139.8118896484375, "loss": 25.5352, "losses_ref": -0.001320630544796586, "ref_logps/chosen": -96.08708190917969, "ref_logps/rejected": -87.88965606689453, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 70.98339080810547, "rewards/margins": 122.90562438964844, "rewards/rejected": -51.92223358154297, "step": 1010, "u": -2.201784133911133, "weight": 0.043819475919008255 }, { "diff_generated": -50.5982666015625, "epoch": 0.3305249513933895, "grad_norm": 584.2638283241826, "learning_rate": 7.997487873747646e-07, "logits/chosen": -2.5191614627838135, "logits/rejected": -2.585754871368408, "logps/chosen": -22.96677017211914, "logps/rejected": -134.28956604003906, "loss": 23.536, "losses_ref": -3.434952304814942e-05, "ref_logps/chosen": -93.22602081298828, "ref_logps/rejected": -83.69129943847656, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 70.25923919677734, "rewards/margins": 120.8575210571289, "rewards/rejected": -50.5982666015625, "step": 1020, "u": -2.1442813873291016, "weight": 0.06875105202198029 }, { "diff_generated": -51.261444091796875, "epoch": 0.3337653920933247, "grad_norm": 643.4131696639599, "learning_rate": 7.996925020352465e-07, "logits/chosen": -2.512216091156006, "logits/rejected": -2.521505117416382, "logps/chosen": -27.69634437561035, "logps/rejected": -137.0533905029297, "loss": 26.2529, "losses_ref": -0.0018226455431431532, "ref_logps/chosen": -99.18524169921875, "ref_logps/rejected": -85.79194641113281, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 71.48890686035156, "rewards/margins": 122.7503433227539, "rewards/rejected": -51.261444091796875, "step": 1030, "u": -2.201768398284912, "weight": 0.043836988508701324 }, { "diff_generated": -50.166053771972656, "epoch": 0.3370058327932599, "grad_norm": 546.0749685730806, "learning_rate": 7.99630534362354e-07, "logits/chosen": -2.47261118888855, "logits/rejected": -2.5434730052948, "logps/chosen": -22.700708389282227, "logps/rejected": -134.80189514160156, "loss": 25.3072, "losses_ref": -0.0005364461103454232, "ref_logps/chosen": -89.48124694824219, "ref_logps/rejected": -84.63584899902344, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 66.7805404663086, "rewards/margins": 116.94660949707031, "rewards/rejected": -50.166053771972656, "step": 1040, "u": -2.144258499145508, "weight": 0.06877659261226654 }, { "diff_generated": -50.989837646484375, "epoch": 0.3402462734931951, "grad_norm": 573.2441692505033, "learning_rate": 7.995628852370667e-07, "logits/chosen": -2.4530699253082275, "logits/rejected": -2.552277088165283, "logps/chosen": -24.12959861755371, "logps/rejected": -137.18719482421875, "loss": 25.43, "losses_ref": -0.0008495537331327796, "ref_logps/chosen": -91.74011993408203, "ref_logps/rejected": -86.19735717773438, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 67.61051940917969, "rewards/margins": 118.600341796875, "rewards/rejected": -50.989837646484375, "step": 1050, "u": -2.1730258464813232, "weight": 0.056292999535799026 }, { "diff_generated": -53.81852340698242, "epoch": 0.34348671419313026, "grad_norm": 551.1043513632601, "learning_rate": 7.994895556211363e-07, "logits/chosen": -2.466102123260498, "logits/rejected": -2.607362985610962, "logps/chosen": -24.560415267944336, "logps/rejected": -141.72219848632812, "loss": 24.2731, "losses_ref": -0.00028797605773434043, "ref_logps/chosen": -91.91780853271484, "ref_logps/rejected": -87.90367126464844, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 67.35738372802734, "rewards/margins": 121.17591857910156, "rewards/rejected": -53.81852340698242, "step": 1060, "u": -2.2018346786499023, "weight": 0.043763622641563416 }, { "diff_generated": -53.04597091674805, "epoch": 0.34672715489306544, "grad_norm": 571.4159159668553, "learning_rate": 7.994105465570722e-07, "logits/chosen": -2.479231357574463, "logits/rejected": -2.534417152404785, "logps/chosen": -27.557628631591797, "logps/rejected": -137.225341796875, "loss": 24.8815, "losses_ref": -0.00040217855712398887, "ref_logps/chosen": -98.73329162597656, "ref_logps/rejected": -84.17935943603516, "rewards/accuracies": 0.9375, "rewards/chosen": 71.1756591796875, "rewards/margins": 124.22164154052734, "rewards/rejected": -53.04597091674805, "step": 1070, "u": -2.1586556434631348, "weight": 0.06251987814903259 }, { "diff_generated": -52.54377365112305, "epoch": 0.34996759559300067, "grad_norm": 642.7462552016408, "learning_rate": 7.993258591681279e-07, "logits/chosen": -2.4565343856811523, "logits/rejected": -2.5162532329559326, "logps/chosen": -25.28310775756836, "logps/rejected": -133.00509643554688, "loss": 25.5631, "losses_ref": -1.070001758307626e-06, "ref_logps/chosen": -91.83539581298828, "ref_logps/rejected": -80.46131896972656, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 66.55229187011719, "rewards/margins": 119.0960693359375, "rewards/rejected": -52.54377365112305, "step": 1080, "u": -2.115499973297119, "weight": 0.08125002682209015 }, { "diff_generated": -54.84162139892578, "epoch": 0.35320803629293585, "grad_norm": 557.97475074345, "learning_rate": 7.992354946582836e-07, "logits/chosen": -2.4957919120788574, "logits/rejected": -2.59224271774292, "logps/chosen": -23.85027313232422, "logps/rejected": -143.63851928710938, "loss": 24.5671, "losses_ref": -0.07038359344005585, "ref_logps/chosen": -94.2538070678711, "ref_logps/rejected": -88.79689025878906, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 70.40353393554688, "rewards/margins": 125.24515533447266, "rewards/rejected": -54.84162139892578, "step": 1090, "u": -2.201259136199951, "weight": 0.044373251497745514 }, { "diff_generated": -50.38102722167969, "epoch": 0.356448476992871, "grad_norm": 626.4601477446425, "learning_rate": 7.991394543122304e-07, "logits/chosen": -2.4803991317749023, "logits/rejected": -2.565451145172119, "logps/chosen": -25.01320457458496, "logps/rejected": -133.71278381347656, "loss": 24.8838, "losses_ref": -0.00086111732525751, "ref_logps/chosen": -92.66462707519531, "ref_logps/rejected": -83.33175659179688, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 67.65141296386719, "rewards/margins": 118.0324478149414, "rewards/rejected": -50.38102722167969, "step": 1100, "u": -2.173022747039795, "weight": 0.05629643052816391 }, { "diff_generated": -51.571044921875, "epoch": 0.3596889176928062, "grad_norm": 560.6761655202565, "learning_rate": 7.990377394953507e-07, "logits/chosen": -2.4450554847717285, "logits/rejected": -2.594871997833252, "logps/chosen": -24.059139251708984, "logps/rejected": -140.58322143554688, "loss": 24.425, "losses_ref": -0.0007773134857416153, "ref_logps/chosen": -93.8017578125, "ref_logps/rejected": -89.01216888427734, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 69.74263000488281, "rewards/margins": 121.31367492675781, "rewards/rejected": -51.571044921875, "step": 1110, "u": -2.1874217987060547, "weight": 0.050037842243909836 }, { "diff_generated": -52.709999084472656, "epoch": 0.36292935839274143, "grad_norm": 522.8230341997952, "learning_rate": 7.989303516537001e-07, "logits/chosen": -2.5109031200408936, "logits/rejected": -2.602411985397339, "logps/chosen": -20.902240753173828, "logps/rejected": -136.27841186523438, "loss": 24.8449, "losses_ref": -0.003221045481041074, "ref_logps/chosen": -90.93617248535156, "ref_logps/rejected": -83.56842041015625, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 70.033935546875, "rewards/margins": 122.74393463134766, "rewards/rejected": -52.709999084472656, "step": 1120, "u": -2.187288999557495, "weight": 0.05018278956413269 }, { "diff_generated": -54.4443359375, "epoch": 0.3661697990926766, "grad_norm": 608.4703573539997, "learning_rate": 7.98817292313986e-07, "logits/chosen": -2.5404746532440186, "logits/rejected": -2.6264424324035645, "logps/chosen": -28.353923797607422, "logps/rejected": -143.04530334472656, "loss": 24.3711, "losses_ref": -0.0017772326245903969, "ref_logps/chosen": -102.36439514160156, "ref_logps/rejected": -88.60096740722656, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 74.01045227050781, "rewards/margins": 128.4547882080078, "rewards/rejected": -54.4443359375, "step": 1130, "u": -2.2161457538604736, "weight": 0.037601880729198456 }, { "diff_generated": -49.8851432800293, "epoch": 0.3694102397926118, "grad_norm": 556.5135304257494, "learning_rate": 7.986985630835463e-07, "logits/chosen": -2.4946882724761963, "logits/rejected": -2.5596494674682617, "logps/chosen": -24.316936492919922, "logps/rejected": -135.36636352539062, "loss": 24.4909, "losses_ref": -0.013844695873558521, "ref_logps/chosen": -96.05143737792969, "ref_logps/rejected": -85.48120880126953, "rewards/accuracies": 0.96875, "rewards/chosen": 71.73450469970703, "rewards/margins": 121.61964416503906, "rewards/rejected": -49.8851432800293, "step": 1140, "u": -2.229886293411255, "weight": 0.0320565402507782 }, { "diff_generated": -49.407188415527344, "epoch": 0.37265068049254696, "grad_norm": 580.0555758431312, "learning_rate": 7.985741656503261e-07, "logits/chosen": -2.518845558166504, "logits/rejected": -2.57997465133667, "logps/chosen": -28.659387588500977, "logps/rejected": -131.66476440429688, "loss": 25.0707, "losses_ref": -0.003389369696378708, "ref_logps/chosen": -98.24806213378906, "ref_logps/rejected": -82.25758361816406, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 69.58866882324219, "rewards/margins": 118.995849609375, "rewards/rejected": -49.407188415527344, "step": 1150, "u": -2.172929525375366, "weight": 0.05639916658401489 }, { "diff_generated": -52.84662628173828, "epoch": 0.3758911211924822, "grad_norm": 569.5674422468707, "learning_rate": 7.984441017828543e-07, "logits/chosen": -2.4930014610290527, "logits/rejected": -2.6044936180114746, "logps/chosen": -25.53784942626953, "logps/rejected": -139.98287963867188, "loss": 24.346, "losses_ref": -4.5965853132656775e-06, "ref_logps/chosen": -96.56849670410156, "ref_logps/rejected": -87.13624572753906, "rewards/accuracies": 0.96875, "rewards/chosen": 71.0306396484375, "rewards/margins": 123.87727355957031, "rewards/rejected": -52.84662628173828, "step": 1160, "u": -2.2306294441223145, "weight": 0.03125005215406418 }, { "diff_generated": -50.60841369628906, "epoch": 0.37913156189241737, "grad_norm": 562.3476842737259, "learning_rate": 7.983083733302178e-07, "logits/chosen": -2.538132905960083, "logits/rejected": -2.570681095123291, "logps/chosen": -25.372121810913086, "logps/rejected": -138.02139282226562, "loss": 24.5143, "losses_ref": -5.09760866407305e-06, "ref_logps/chosen": -94.68878936767578, "ref_logps/rejected": -87.41297912597656, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 69.31666564941406, "rewards/margins": 119.9250717163086, "rewards/rejected": -50.60841369628906, "step": 1170, "u": -2.187455654144287, "weight": 0.050000082701444626 }, { "diff_generated": -50.25326919555664, "epoch": 0.38237200259235254, "grad_norm": 560.2383575313584, "learning_rate": 7.98166982222036e-07, "logits/chosen": -2.5210089683532715, "logits/rejected": -2.5609211921691895, "logps/chosen": -25.46693992614746, "logps/rejected": -132.6431427001953, "loss": 24.6057, "losses_ref": -0.005505814682692289, "ref_logps/chosen": -97.08512115478516, "ref_logps/rejected": -82.38987731933594, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 71.61817932128906, "rewards/margins": 121.8714599609375, "rewards/rejected": -50.25326919555664, "step": 1180, "u": -2.2015438079833984, "weight": 0.04408115893602371 }, { "diff_generated": -51.55100631713867, "epoch": 0.3856124432922878, "grad_norm": 551.3974517108097, "learning_rate": 7.980199304684328e-07, "logits/chosen": -2.483508825302124, "logits/rejected": -2.510715961456299, "logps/chosen": -25.69317054748535, "logps/rejected": -139.46047973632812, "loss": 24.5328, "losses_ref": -0.0007960908114910126, "ref_logps/chosen": -98.38687896728516, "ref_logps/rejected": -87.90947723388672, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 72.69371032714844, "rewards/margins": 124.2447280883789, "rewards/rejected": -51.55100631713867, "step": 1190, "u": -2.2162063121795654, "weight": 0.03753543645143509 }, { "diff_generated": -51.10063171386719, "epoch": 0.38885288399222295, "grad_norm": 559.0601762716767, "learning_rate": 7.978672201600077e-07, "logits/chosen": -2.4614675045013428, "logits/rejected": -2.575929641723633, "logps/chosen": -23.63718605041504, "logps/rejected": -137.8214111328125, "loss": 23.8937, "losses_ref": -0.0016802713507786393, "ref_logps/chosen": -92.49464416503906, "ref_logps/rejected": -86.72079467773438, "rewards/accuracies": 0.9375, "rewards/chosen": 68.85746002197266, "rewards/margins": 119.9581069946289, "rewards/rejected": -51.10063171386719, "step": 1200, "u": -2.158592700958252, "weight": 0.06258918344974518 }, { "diff_generated": -53.33112716674805, "epoch": 0.39209332469215813, "grad_norm": 558.976664007009, "learning_rate": 7.97708853467807e-07, "logits/chosen": -2.5114684104919434, "logits/rejected": -2.608743906021118, "logps/chosen": -22.37912940979004, "logps/rejected": -136.87342834472656, "loss": 23.9141, "losses_ref": -0.0029803109355270863, "ref_logps/chosen": -94.94758605957031, "ref_logps/rejected": -83.54229736328125, "rewards/accuracies": 0.96875, "rewards/chosen": 72.56846618652344, "rewards/margins": 125.89958190917969, "rewards/rejected": -53.33112716674805, "step": 1210, "u": -2.230485439300537, "weight": 0.03140822798013687 }, { "diff_generated": -52.54514694213867, "epoch": 0.3953337653920933, "grad_norm": 570.3696399396063, "learning_rate": 7.975448326432927e-07, "logits/chosen": -2.5000858306884766, "logits/rejected": -2.616079330444336, "logps/chosen": -24.4619197845459, "logps/rejected": -139.69168090820312, "loss": 24.1186, "losses_ref": -0.0010114926844835281, "ref_logps/chosen": -92.59854125976562, "ref_logps/rejected": -87.14653015136719, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 68.1366195678711, "rewards/margins": 120.68177795410156, "rewards/rejected": -52.54514694213867, "step": 1220, "u": -2.201807737350464, "weight": 0.04379352182149887 }, { "diff_generated": -54.53679275512695, "epoch": 0.39857420609202854, "grad_norm": 543.429809670915, "learning_rate": 7.973751600183094e-07, "logits/chosen": -2.518784761428833, "logits/rejected": -2.5801219940185547, "logps/chosen": -25.79349708557129, "logps/rejected": -144.06625366210938, "loss": 25.213, "losses_ref": -0.00040732818888500333, "ref_logps/chosen": -98.10910034179688, "ref_logps/rejected": -89.52947235107422, "rewards/accuracies": 0.96875, "rewards/chosen": 72.31558227539062, "rewards/margins": 126.8523941040039, "rewards/rejected": -54.53679275512695, "step": 1230, "u": -2.2306201457977295, "weight": 0.03126021847128868 }, { "diff_generated": -52.76995086669922, "epoch": 0.4018146467919637, "grad_norm": 559.798929661523, "learning_rate": 7.971998380050529e-07, "logits/chosen": -2.4895637035369873, "logits/rejected": -2.5688529014587402, "logps/chosen": -25.33633041381836, "logps/rejected": -135.40036010742188, "loss": 24.6792, "losses_ref": -0.23069611191749573, "ref_logps/chosen": -97.98286437988281, "ref_logps/rejected": -82.63040924072266, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 72.64652252197266, "rewards/margins": 125.4164810180664, "rewards/rejected": -52.76995086669922, "step": 1240, "u": -2.251986265182495, "weight": 0.023596609011292458 }, { "diff_generated": -53.05475997924805, "epoch": 0.4050550874918989, "grad_norm": 493.893056663407, "learning_rate": 7.970188690960343e-07, "logits/chosen": -2.4412055015563965, "logits/rejected": -2.5813724994659424, "logps/chosen": -20.939809799194336, "logps/rejected": -137.92222595214844, "loss": 23.4929, "losses_ref": -0.0011973511427640915, "ref_logps/chosen": -89.072265625, "ref_logps/rejected": -84.86746978759766, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 68.13246154785156, "rewards/margins": 121.1872329711914, "rewards/rejected": -53.05475997924805, "step": 1250, "u": -2.1730077266693115, "weight": 0.056313030421733856 }, { "diff_generated": -53.86749267578125, "epoch": 0.40829552819183407, "grad_norm": 571.1919107683036, "learning_rate": 7.968322558640458e-07, "logits/chosen": -2.4561567306518555, "logits/rejected": -2.5805623531341553, "logps/chosen": -24.173444747924805, "logps/rejected": -137.20947265625, "loss": 24.7757, "losses_ref": -0.0017804211238399148, "ref_logps/chosen": -94.57585144042969, "ref_logps/rejected": -83.34197235107422, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 70.40240478515625, "rewards/margins": 124.26991271972656, "rewards/rejected": -53.86749267578125, "step": 1260, "u": -2.1729800701141357, "weight": 0.05634336546063423 }, { "diff_generated": -51.295204162597656, "epoch": 0.4115359688917693, "grad_norm": 586.1857358227938, "learning_rate": 7.966400009621233e-07, "logits/chosen": -2.4896740913391113, "logits/rejected": -2.57775616645813, "logps/chosen": -24.672901153564453, "logps/rejected": -135.76600646972656, "loss": 25.0857, "losses_ref": -0.005821887403726578, "ref_logps/chosen": -92.84335327148438, "ref_logps/rejected": -84.47081756591797, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 68.17045593261719, "rewards/margins": 119.46565246582031, "rewards/rejected": -51.295204162597656, "step": 1270, "u": -2.115187168121338, "weight": 0.0815921351313591 }, { "diff_generated": -50.49216842651367, "epoch": 0.4147764095917045, "grad_norm": 524.773221147273, "learning_rate": 7.964421071235092e-07, "logits/chosen": -2.4497692584991455, "logits/rejected": -2.5709147453308105, "logps/chosen": -23.245464324951172, "logps/rejected": -127.8901596069336, "loss": 23.9401, "losses_ref": -0.0034059532918035984, "ref_logps/chosen": -87.4626235961914, "ref_logps/rejected": -77.39798736572266, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 64.21714782714844, "rewards/margins": 114.70931243896484, "rewards/rejected": -50.49216842651367, "step": 1280, "u": -2.115327835083008, "weight": 0.08143889158964157 }, { "diff_generated": -50.94807052612305, "epoch": 0.41801685029163965, "grad_norm": 552.3493908807122, "learning_rate": 7.962385771616133e-07, "logits/chosen": -2.479870557785034, "logits/rejected": -2.5021462440490723, "logps/chosen": -24.55561637878418, "logps/rejected": -131.24034118652344, "loss": 23.9531, "losses_ref": -0.003040406620129943, "ref_logps/chosen": -93.30787658691406, "ref_logps/rejected": -80.29228210449219, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 68.75225067138672, "rewards/margins": 119.70033264160156, "rewards/rejected": -50.94807052612305, "step": 1290, "u": -2.1297543048858643, "weight": 0.07515055686235428 }, { "diff_generated": -53.61127471923828, "epoch": 0.42125729099157483, "grad_norm": 527.6859831301322, "learning_rate": 7.960294139699724e-07, "logits/chosen": -2.4919817447662354, "logits/rejected": -2.580754041671753, "logps/chosen": -22.840330123901367, "logps/rejected": -142.553955078125, "loss": 23.351, "losses_ref": -1.0085510382396023e-07, "ref_logps/chosen": -95.0149917602539, "ref_logps/rejected": -88.94267272949219, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 72.17464447021484, "rewards/margins": 125.78592681884766, "rewards/rejected": -53.61127471923828, "step": 1300, "u": -2.173064708709717, "weight": 0.05624999850988388 }, { "diff_generated": -55.222808837890625, "epoch": 0.42449773169151006, "grad_norm": 575.0788238036812, "learning_rate": 7.958146205222102e-07, "logits/chosen": -2.4579484462738037, "logits/rejected": -2.54203724861145, "logps/chosen": -22.296905517578125, "logps/rejected": -139.87576293945312, "loss": 23.8415, "losses_ref": -4.046513822686393e-06, "ref_logps/chosen": -97.4652328491211, "ref_logps/rejected": -84.65293884277344, "rewards/accuracies": 0.96875, "rewards/chosen": 75.16834259033203, "rewards/margins": 130.39114379882812, "rewards/rejected": -55.222808837890625, "step": 1310, "u": -2.2306294441223145, "weight": 0.031250160187482834 }, { "diff_generated": -52.997215270996094, "epoch": 0.42773817239144524, "grad_norm": 540.076528075968, "learning_rate": 7.955941998719939e-07, "logits/chosen": -2.4463772773742676, "logits/rejected": -2.5361287593841553, "logps/chosen": -24.38715171813965, "logps/rejected": -138.02978515625, "loss": 23.5507, "losses_ref": -1.9104633963706874e-07, "ref_logps/chosen": -91.82412719726562, "ref_logps/rejected": -85.0325698852539, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 67.43697357177734, "rewards/margins": 120.4342041015625, "rewards/rejected": -52.997215270996094, "step": 1320, "u": -2.115499973297119, "weight": 0.08124999701976776 }, { "diff_generated": -54.24946212768555, "epoch": 0.4309786130913804, "grad_norm": 532.374570559775, "learning_rate": 7.953681551529918e-07, "logits/chosen": -2.430579662322998, "logits/rejected": -2.5255911350250244, "logps/chosen": -21.84467124938965, "logps/rejected": -137.99459838867188, "loss": 23.4925, "losses_ref": -0.004371698014438152, "ref_logps/chosen": -93.0247802734375, "ref_logps/rejected": -83.7451400756836, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 71.18009948730469, "rewards/margins": 125.4295654296875, "rewards/rejected": -54.24946212768555, "step": 1330, "u": -2.172853708267212, "weight": 0.05648232623934746 }, { "diff_generated": -56.7379264831543, "epoch": 0.43421905379131565, "grad_norm": 629.1562646223169, "learning_rate": 7.951364895788277e-07, "logits/chosen": -2.4883508682250977, "logits/rejected": -2.5553619861602783, "logps/chosen": -22.71754264831543, "logps/rejected": -143.79302978515625, "loss": 23.6339, "losses_ref": -1.7104119365285442e-07, "ref_logps/chosen": -95.89179992675781, "ref_logps/rejected": -87.05510711669922, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 73.17425537109375, "rewards/margins": 129.9121551513672, "rewards/rejected": -56.7379264831543, "step": 1340, "u": -2.2018468379974365, "weight": 0.04375000298023224 }, { "diff_generated": -53.1761360168457, "epoch": 0.4374594944912508, "grad_norm": 549.2730111417429, "learning_rate": 7.948992064430363e-07, "logits/chosen": -2.479661464691162, "logits/rejected": -2.592576503753662, "logps/chosen": -24.881542205810547, "logps/rejected": -139.13638305664062, "loss": 24.3426, "losses_ref": -2.509763135094545e-06, "ref_logps/chosen": -96.41204833984375, "ref_logps/rejected": -85.96025085449219, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 71.53050994873047, "rewards/margins": 124.70664978027344, "rewards/rejected": -53.1761360168457, "step": 1350, "u": -2.2018468379974365, "weight": 0.04375007376074791 }, { "diff_generated": -53.69463348388672, "epoch": 0.440699935191186, "grad_norm": 578.1317387676165, "learning_rate": 7.946563091190154e-07, "logits/chosen": -2.4729490280151367, "logits/rejected": -2.563425302505493, "logps/chosen": -24.247787475585938, "logps/rejected": -136.15133666992188, "loss": 24.7494, "losses_ref": -0.0016694276127964258, "ref_logps/chosen": -94.1891860961914, "ref_logps/rejected": -82.45668029785156, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 69.94139099121094, "rewards/margins": 123.63603210449219, "rewards/rejected": -53.69463348388672, "step": 1360, "u": -2.187392473220825, "weight": 0.050070326775312424 }, { "diff_generated": -50.28107452392578, "epoch": 0.4439403758911212, "grad_norm": 545.079547424566, "learning_rate": 7.944078010599788e-07, "logits/chosen": -2.498121976852417, "logits/rejected": -2.4967360496520996, "logps/chosen": -24.85556411743164, "logps/rejected": -133.72714233398438, "loss": 23.8484, "losses_ref": -9.963375305233058e-06, "ref_logps/chosen": -96.14152526855469, "ref_logps/rejected": -83.4460678100586, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 71.28595733642578, "rewards/margins": 121.56703186035156, "rewards/rejected": -50.28107452392578, "step": 1370, "u": -2.115499973297119, "weight": 0.08125033229589462 }, { "diff_generated": -52.63268280029297, "epoch": 0.4471808165910564, "grad_norm": 584.977752142354, "learning_rate": 7.941536857989063e-07, "logits/chosen": -2.4137930870056152, "logits/rejected": -2.4826889038085938, "logps/chosen": -24.653759002685547, "logps/rejected": -139.8968048095703, "loss": 23.9151, "losses_ref": -0.000810875091701746, "ref_logps/chosen": -94.25761413574219, "ref_logps/rejected": -87.26411437988281, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 69.6038589477539, "rewards/margins": 122.23653411865234, "rewards/rejected": -52.63268280029297, "step": 1380, "u": -2.1442465782165527, "weight": 0.06878943741321564 }, { "diff_generated": -52.39778518676758, "epoch": 0.4504212572909916, "grad_norm": 522.095104660753, "learning_rate": 7.938939669484943e-07, "logits/chosen": -2.4579412937164307, "logits/rejected": -2.556067943572998, "logps/chosen": -21.02359390258789, "logps/rejected": -136.83511352539062, "loss": 23.2373, "losses_ref": -7.89561599958688e-05, "ref_logps/chosen": -92.44002532958984, "ref_logps/rejected": -84.43733215332031, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 71.41643524169922, "rewards/margins": 123.814208984375, "rewards/rejected": -52.39778518676758, "step": 1390, "u": -2.20184588432312, "weight": 0.0437513068318367 }, { "diff_generated": -52.960174560546875, "epoch": 0.45366169799092676, "grad_norm": 527.8687486131116, "learning_rate": 7.936286482011041e-07, "logits/chosen": -2.4333748817443848, "logits/rejected": -2.498981475830078, "logps/chosen": -24.400089263916016, "logps/rejected": -139.44273376464844, "loss": 24.7374, "losses_ref": -0.00069514597998932, "ref_logps/chosen": -94.76936340332031, "ref_logps/rejected": -86.4825439453125, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 70.36927795410156, "rewards/margins": 123.32945251464844, "rewards/rejected": -52.960174560546875, "step": 1400, "u": -2.1874213218688965, "weight": 0.0500384084880352 }, { "diff_generated": -59.92344284057617, "epoch": 0.45690213869086194, "grad_norm": 534.2599238039342, "learning_rate": 7.933577333287091e-07, "logits/chosen": -2.417055130004883, "logits/rejected": -2.5821313858032227, "logps/chosen": -23.159229278564453, "logps/rejected": -147.50450134277344, "loss": 22.6167, "losses_ref": -0.0012575514847412705, "ref_logps/chosen": -91.5752944946289, "ref_logps/rejected": -87.58106231689453, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 68.41606140136719, "rewards/margins": 128.33950805664062, "rewards/rejected": -59.92344284057617, "step": 1410, "u": -2.2017815113067627, "weight": 0.04382243752479553 }, { "diff_generated": -53.30607986450195, "epoch": 0.46014257939079717, "grad_norm": 522.7862086723107, "learning_rate": 7.930812261828421e-07, "logits/chosen": -2.4632554054260254, "logits/rejected": -2.5481324195861816, "logps/chosen": -27.21381187438965, "logps/rejected": -136.55772399902344, "loss": 24.9904, "losses_ref": -0.0023922298569232225, "ref_logps/chosen": -95.427001953125, "ref_logps/rejected": -83.25163269042969, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 68.21318817138672, "rewards/margins": 121.5192642211914, "rewards/rejected": -53.30607986450195, "step": 1420, "u": -2.2017314434051514, "weight": 0.043877117335796356 }, { "diff_generated": -54.06325149536133, "epoch": 0.46338302009073234, "grad_norm": 521.0229100480946, "learning_rate": 7.92799130694539e-07, "logits/chosen": -2.4891176223754883, "logits/rejected": -2.549175500869751, "logps/chosen": -23.772846221923828, "logps/rejected": -140.67127990722656, "loss": 22.8429, "losses_ref": -0.00034526773379184306, "ref_logps/chosen": -95.93737030029297, "ref_logps/rejected": -86.60801696777344, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 72.16453552246094, "rewards/margins": 126.227783203125, "rewards/rejected": -54.06325149536133, "step": 1430, "u": -2.14426589012146, "weight": 0.06876814365386963 }, { "diff_generated": -54.022361755371094, "epoch": 0.4666234607906675, "grad_norm": 538.1163426543451, "learning_rate": 7.925114508742848e-07, "logits/chosen": -2.4775466918945312, "logits/rejected": -2.5832314491271973, "logps/chosen": -22.278833389282227, "logps/rejected": -137.95533752441406, "loss": 23.6777, "losses_ref": -1.4719394414441922e-07, "ref_logps/chosen": -90.06036376953125, "ref_logps/rejected": -83.9329605102539, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 67.78153228759766, "rewards/margins": 121.80389404296875, "rewards/rejected": -54.022361755371094, "step": 1440, "u": -2.187455654144287, "weight": 0.05000000074505806 }, { "diff_generated": -58.577171325683594, "epoch": 0.4698639014906027, "grad_norm": 512.2109426836644, "learning_rate": 7.92218190811955e-07, "logits/chosen": -2.4623990058898926, "logits/rejected": -2.6098854541778564, "logps/chosen": -23.492599487304688, "logps/rejected": -145.59201049804688, "loss": 22.9684, "losses_ref": -1.1896883734152652e-05, "ref_logps/chosen": -94.02626037597656, "ref_logps/rejected": -87.01484680175781, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 70.5336685180664, "rewards/margins": 129.11083984375, "rewards/rejected": -58.577171325683594, "step": 1450, "u": -2.2018468379974365, "weight": 0.04375017434358597 }, { "diff_generated": -55.12044143676758, "epoch": 0.47310434219053793, "grad_norm": 517.9787531892808, "learning_rate": 7.919193546767581e-07, "logits/chosen": -2.4582018852233887, "logits/rejected": -2.53832745552063, "logps/chosen": -23.89086151123047, "logps/rejected": -141.64981079101562, "loss": 23.5758, "losses_ref": -0.0004338372382335365, "ref_logps/chosen": -93.01166534423828, "ref_logps/rejected": -86.52937316894531, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 69.12080383300781, "rewards/margins": 124.24124908447266, "rewards/rejected": -55.12044143676758, "step": 1460, "u": -2.201826333999634, "weight": 0.04377306252717972 }, { "diff_generated": -54.81425857543945, "epoch": 0.4763447828904731, "grad_norm": 489.74655295683687, "learning_rate": 7.916149467171768e-07, "logits/chosen": -2.467390537261963, "logits/rejected": -2.532069444656372, "logps/chosen": -20.180316925048828, "logps/rejected": -134.23587036132812, "loss": 23.0804, "losses_ref": -8.911225449992344e-06, "ref_logps/chosen": -90.47590637207031, "ref_logps/rejected": -79.42161560058594, "rewards/accuracies": 0.9375, "rewards/chosen": 70.29559326171875, "rewards/margins": 125.1098403930664, "rewards/rejected": -54.81425857543945, "step": 1470, "u": -2.1586732864379883, "weight": 0.0625002458691597 }, { "diff_generated": -52.9602165222168, "epoch": 0.4795852235904083, "grad_norm": 495.5308876754751, "learning_rate": 7.913049712609066e-07, "logits/chosen": -2.448991298675537, "logits/rejected": -2.5381381511688232, "logps/chosen": -22.26654052734375, "logps/rejected": -135.81704711914062, "loss": 22.7017, "losses_ref": -3.1697170470579294e-06, "ref_logps/chosen": -91.15074157714844, "ref_logps/rejected": -82.85684204101562, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 68.88420104980469, "rewards/margins": 121.84442138671875, "rewards/rejected": -52.9602165222168, "step": 1480, "u": -2.115499973297119, "weight": 0.08125009387731552 }, { "diff_generated": -53.670166015625, "epoch": 0.48282566429034346, "grad_norm": 517.8427068314436, "learning_rate": 7.909894327147949e-07, "logits/chosen": -2.478231430053711, "logits/rejected": -2.5495798587799072, "logps/chosen": -23.509714126586914, "logps/rejected": -138.92987060546875, "loss": 23.4401, "losses_ref": -0.0008373827440664172, "ref_logps/chosen": -95.60778045654297, "ref_logps/rejected": -85.25968933105469, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 72.09806823730469, "rewards/margins": 125.76823425292969, "rewards/rejected": -53.670166015625, "step": 1490, "u": -2.216200351715088, "weight": 0.03754196688532829 }, { "diff_generated": -58.10505294799805, "epoch": 0.4860661049902787, "grad_norm": 546.7827706515662, "learning_rate": 7.906683355647783e-07, "logits/chosen": -2.4726247787475586, "logits/rejected": -2.5774738788604736, "logps/chosen": -22.412656784057617, "logps/rejected": -148.88279724121094, "loss": 23.0294, "losses_ref": -0.0006750643369741738, "ref_logps/chosen": -94.51060485839844, "ref_logps/rejected": -90.77774810791016, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 72.09796142578125, "rewards/margins": 130.2030029296875, "rewards/rejected": -58.10505294799805, "step": 1500, "u": -2.2162046432495117, "weight": 0.03753728047013283 }, { "diff_generated": -57.015953063964844, "epoch": 0.48930654569021387, "grad_norm": 553.5451201077923, "learning_rate": 7.903416843758187e-07, "logits/chosen": -2.5150179862976074, "logits/rejected": -2.61221981048584, "logps/chosen": -22.41795539855957, "logps/rejected": -141.7072296142578, "loss": 23.4806, "losses_ref": -0.003707682015374303, "ref_logps/chosen": -93.69750213623047, "ref_logps/rejected": -84.69126892089844, "rewards/accuracies": 0.9375, "rewards/chosen": 71.279541015625, "rewards/margins": 128.29550170898438, "rewards/rejected": -57.015953063964844, "step": 1510, "u": -2.158477306365967, "weight": 0.06271643191576004 }, { "diff_generated": -55.60862350463867, "epoch": 0.49254698639014904, "grad_norm": 523.2448385167842, "learning_rate": 7.900094837918385e-07, "logits/chosen": -2.4987995624542236, "logits/rejected": -2.5742671489715576, "logps/chosen": -26.006755828857422, "logps/rejected": -144.40855407714844, "loss": 23.9932, "losses_ref": -0.0009523486951366067, "ref_logps/chosen": -97.47042083740234, "ref_logps/rejected": -88.79991912841797, "rewards/accuracies": 0.96875, "rewards/chosen": 71.46366119384766, "rewards/margins": 127.0722885131836, "rewards/rejected": -55.60862350463867, "step": 1520, "u": -2.230584144592285, "weight": 0.031299836933612823 }, { "diff_generated": -57.468544006347656, "epoch": 0.4957874270900843, "grad_norm": 542.5668513892613, "learning_rate": 7.896717385356545e-07, "logits/chosen": -2.487705707550049, "logits/rejected": -2.624055862426758, "logps/chosen": -22.60787582397461, "logps/rejected": -146.48764038085938, "loss": 22.4425, "losses_ref": -4.9768182179832365e-06, "ref_logps/chosen": -95.36245727539062, "ref_logps/rejected": -89.01911163330078, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 72.75457763671875, "rewards/margins": 130.22311401367188, "rewards/rejected": -57.468544006347656, "step": 1530, "u": -2.2450203895568848, "weight": 0.025000065565109253 }, { "diff_generated": -55.307334899902344, "epoch": 0.49902786779001945, "grad_norm": 580.1655904703864, "learning_rate": 7.893284534089109e-07, "logits/chosen": -2.4579875469207764, "logits/rejected": -2.552023410797119, "logps/chosen": -22.494773864746094, "logps/rejected": -138.22250366210938, "loss": 23.8534, "losses_ref": -0.00047390550025738776, "ref_logps/chosen": -94.70335388183594, "ref_logps/rejected": -82.91517639160156, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 72.20856475830078, "rewards/margins": 127.51590728759766, "rewards/rejected": -55.307334899902344, "step": 1540, "u": -2.2162322998046875, "weight": 0.037506647408008575 }, { "diff_generated": -53.2509651184082, "epoch": 0.5022683084899546, "grad_norm": 505.97810942841124, "learning_rate": 7.889796332920106e-07, "logits/chosen": -2.429352283477783, "logits/rejected": -2.5697340965270996, "logps/chosen": -21.56386947631836, "logps/rejected": -135.63404846191406, "loss": 22.5935, "losses_ref": -0.002656723605468869, "ref_logps/chosen": -88.36921691894531, "ref_logps/rejected": -82.38307189941406, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 66.80535125732422, "rewards/margins": 120.05632019042969, "rewards/rejected": -53.2509651184082, "step": 1550, "u": -2.1297876834869385, "weight": 0.0751144140958786 }, { "diff_generated": -55.338844299316406, "epoch": 0.5055087491898899, "grad_norm": 517.7480424570723, "learning_rate": 7.886252831440465e-07, "logits/chosen": -2.472877025604248, "logits/rejected": -2.583745241165161, "logps/chosen": -24.70361328125, "logps/rejected": -147.2733917236328, "loss": 23.1055, "losses_ref": -0.0028544296510517597, "ref_logps/chosen": -95.76852416992188, "ref_logps/rejected": -91.93456268310547, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 71.0649185180664, "rewards/margins": 126.40376281738281, "rewards/rejected": -55.338844299316406, "step": 1560, "u": -2.2448930740356445, "weight": 0.025140201672911644 }, { "diff_generated": -56.882781982421875, "epoch": 0.508749189889825, "grad_norm": 507.17514765947175, "learning_rate": 7.882654080027304e-07, "logits/chosen": -2.4699952602386475, "logits/rejected": -2.575896978378296, "logps/chosen": -22.763708114624023, "logps/rejected": -146.9736328125, "loss": 23.5348, "losses_ref": -0.0004549544246401638, "ref_logps/chosen": -96.1058578491211, "ref_logps/rejected": -90.09082794189453, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 73.34214782714844, "rewards/margins": 130.22494506835938, "rewards/rejected": -56.882781982421875, "step": 1570, "u": -2.2162158489227295, "weight": 0.037524718791246414 }, { "diff_generated": -52.004127502441406, "epoch": 0.5119896305897602, "grad_norm": 464.5625476061178, "learning_rate": 7.879000129843218e-07, "logits/chosen": -2.5297818183898926, "logits/rejected": -2.567648410797119, "logps/chosen": -26.26664161682129, "logps/rejected": -136.30093383789062, "loss": 23.1308, "losses_ref": -0.0055158380419015884, "ref_logps/chosen": -96.4405517578125, "ref_logps/rejected": -84.29681396484375, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 70.17391204833984, "rewards/margins": 122.17803955078125, "rewards/rejected": -52.004127502441406, "step": 1580, "u": -2.1297948360443115, "weight": 0.07510620355606079 }, { "diff_generated": -55.481468200683594, "epoch": 0.5152300712896954, "grad_norm": 530.5031033607447, "learning_rate": 7.87529103283555e-07, "logits/chosen": -2.529362201690674, "logits/rejected": -2.5975449085235596, "logps/chosen": -24.188579559326172, "logps/rejected": -146.2343292236328, "loss": 23.4568, "losses_ref": -1.0383139397163177e-06, "ref_logps/chosen": -96.58631896972656, "ref_logps/rejected": -90.75286102294922, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 72.39774322509766, "rewards/margins": 127.87921142578125, "rewards/rejected": -55.481468200683594, "step": 1590, "u": -2.173064708709717, "weight": 0.05625002458691597 }, { "diff_generated": -54.71356964111328, "epoch": 0.5184705119896306, "grad_norm": 487.43724963171377, "learning_rate": 7.871526841735649e-07, "logits/chosen": -2.499135971069336, "logits/rejected": -2.5506656169891357, "logps/chosen": -22.029584884643555, "logps/rejected": -143.09744262695312, "loss": 23.0352, "losses_ref": -9.371944543090649e-06, "ref_logps/chosen": -93.57704162597656, "ref_logps/rejected": -88.3838882446289, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 71.54745483398438, "rewards/margins": 126.26102447509766, "rewards/rejected": -54.71356964111328, "step": 1600, "u": -2.1730644702911377, "weight": 0.056250352412462234 }, { "diff_generated": -58.845428466796875, "epoch": 0.5217109526895658, "grad_norm": 465.2252457978142, "learning_rate": 7.867707610058127e-07, "logits/chosen": -2.5014264583587646, "logits/rejected": -2.617633581161499, "logps/chosen": -22.12125587463379, "logps/rejected": -146.1781463623047, "loss": 24.4162, "losses_ref": -1.3118931008193613e-07, "ref_logps/chosen": -98.06317901611328, "ref_logps/rejected": -87.33271789550781, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 75.94192504882812, "rewards/margins": 134.78736877441406, "rewards/rejected": -58.845428466796875, "step": 1610, "u": -2.2018468379974365, "weight": 0.04374999925494194 }, { "diff_generated": -57.821632385253906, "epoch": 0.5249513933895009, "grad_norm": 530.9806499111314, "learning_rate": 7.863833392100093e-07, "logits/chosen": -2.4269180297851562, "logits/rejected": -2.5671417713165283, "logps/chosen": -20.6740665435791, "logps/rejected": -143.11346435546875, "loss": 23.1303, "losses_ref": -0.0014479614328593016, "ref_logps/chosen": -90.92362213134766, "ref_logps/rejected": -85.29182434082031, "rewards/accuracies": 0.96875, "rewards/chosen": 70.24955749511719, "rewards/margins": 128.07119750976562, "rewards/rejected": -57.821632385253906, "step": 1620, "u": -2.2305593490600586, "weight": 0.03132731840014458 }, { "diff_generated": -56.96986770629883, "epoch": 0.5281918340894362, "grad_norm": 527.1711533731245, "learning_rate": 7.859904242940385e-07, "logits/chosen": -2.485358476638794, "logits/rejected": -2.5643229484558105, "logps/chosen": -22.638986587524414, "logps/rejected": -146.45889282226562, "loss": 23.265, "losses_ref": -0.0023109859321266413, "ref_logps/chosen": -95.79054260253906, "ref_logps/rejected": -89.48902893066406, "rewards/accuracies": 0.96875, "rewards/chosen": 73.15155792236328, "rewards/margins": 130.1214141845703, "rewards/rejected": -56.96986770629883, "step": 1630, "u": -2.2305521965026855, "weight": 0.03133513033390045 }, { "diff_generated": -55.67280197143555, "epoch": 0.5314322747893714, "grad_norm": 631.6742557579007, "learning_rate": 7.855920218438783e-07, "logits/chosen": -2.473820447921753, "logits/rejected": -2.5379204750061035, "logps/chosen": -22.74087905883789, "logps/rejected": -139.18263244628906, "loss": 23.9599, "losses_ref": -3.2874831958906725e-05, "ref_logps/chosen": -94.07108306884766, "ref_logps/rejected": -83.50982666015625, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 71.33020782470703, "rewards/margins": 127.00301361083984, "rewards/rejected": -55.67280197143555, "step": 1640, "u": -2.187455654144287, "weight": 0.05000032112002373 }, { "diff_generated": -56.30420684814453, "epoch": 0.5346727154893065, "grad_norm": 609.874994767621, "learning_rate": 7.851881375235216e-07, "logits/chosen": -2.5344715118408203, "logits/rejected": -2.588593006134033, "logps/chosen": -21.436311721801758, "logps/rejected": -141.29171752929688, "loss": 22.9227, "losses_ref": -2.7268544045000453e-07, "ref_logps/chosen": -95.1426010131836, "ref_logps/rejected": -84.98751068115234, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 73.70628356933594, "rewards/margins": 130.010498046875, "rewards/rejected": -56.30420684814453, "step": 1650, "u": -2.115499973297119, "weight": 0.08125000447034836 }, { "diff_generated": -55.4068603515625, "epoch": 0.5379131561892417, "grad_norm": 576.0051849898757, "learning_rate": 7.847787770748959e-07, "logits/chosen": -2.5346810817718506, "logits/rejected": -2.5801634788513184, "logps/chosen": -24.148042678833008, "logps/rejected": -148.05355834960938, "loss": 24.3066, "losses_ref": -0.002318193670362234, "ref_logps/chosen": -97.6961441040039, "ref_logps/rejected": -92.64669036865234, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 73.548095703125, "rewards/margins": 128.95497131347656, "rewards/rejected": -55.4068603515625, "step": 1660, "u": -2.216139554977417, "weight": 0.03760867565870285 }, { "diff_generated": -55.42523193359375, "epoch": 0.541153596889177, "grad_norm": 480.4127839107038, "learning_rate": 7.843639463177815e-07, "logits/chosen": -2.4996581077575684, "logits/rejected": -2.627084732055664, "logps/chosen": -23.054407119750977, "logps/rejected": -146.1646270751953, "loss": 22.0121, "losses_ref": -3.36450739268912e-07, "ref_logps/chosen": -93.96330261230469, "ref_logps/rejected": -90.73939514160156, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 70.90888977050781, "rewards/margins": 126.33412170410156, "rewards/rejected": -55.42523193359375, "step": 1670, "u": -2.2018468379974365, "weight": 0.043750010430812836 }, { "diff_generated": -56.191612243652344, "epoch": 0.5443940375891121, "grad_norm": 498.8808720156564, "learning_rate": 7.839436511497288e-07, "logits/chosen": -2.4894304275512695, "logits/rejected": -2.5800139904022217, "logps/chosen": -23.490821838378906, "logps/rejected": -147.164794921875, "loss": 22.1627, "losses_ref": -0.003980209585279226, "ref_logps/chosen": -95.75300598144531, "ref_logps/rejected": -90.97319030761719, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 72.26219940185547, "rewards/margins": 128.4538116455078, "rewards/rejected": -56.191612243652344, "step": 1680, "u": -2.1730291843414307, "weight": 0.05628935620188713 }, { "diff_generated": -55.59052276611328, "epoch": 0.5476344782890473, "grad_norm": 526.1634443234373, "learning_rate": 7.835178975459744e-07, "logits/chosen": -2.4564805030822754, "logits/rejected": -2.5387251377105713, "logps/chosen": -21.98865509033203, "logps/rejected": -137.17153930664062, "loss": 22.9433, "losses_ref": -0.001532155554741621, "ref_logps/chosen": -90.76319122314453, "ref_logps/rejected": -81.58100891113281, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 68.7745361328125, "rewards/margins": 124.36505126953125, "rewards/rejected": -55.59052276611328, "step": 1690, "u": -2.2017760276794434, "weight": 0.04382842034101486 }, { "diff_generated": -59.38103103637695, "epoch": 0.5508749189889826, "grad_norm": 524.3064296692643, "learning_rate": 7.83086691559356e-07, "logits/chosen": -2.50773286819458, "logits/rejected": -2.5514986515045166, "logps/chosen": -21.255802154541016, "logps/rejected": -146.8485565185547, "loss": 23.0709, "losses_ref": -0.001003900310024619, "ref_logps/chosen": -97.35025024414062, "ref_logps/rejected": -87.467529296875, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 76.09444427490234, "rewards/margins": 135.47547912597656, "rewards/rejected": -59.38103103637695, "step": 1700, "u": -2.187408924102783, "weight": 0.05005186051130295 }, { "diff_generated": -55.77043914794922, "epoch": 0.5541153596889177, "grad_norm": 484.2893979735214, "learning_rate": 7.826500393202268e-07, "logits/chosen": -2.465657949447632, "logits/rejected": -2.5137181282043457, "logps/chosen": -24.55307388305664, "logps/rejected": -139.7406005859375, "loss": 23.3985, "losses_ref": -0.0014938053209334612, "ref_logps/chosen": -96.75263977050781, "ref_logps/rejected": -83.97016143798828, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 72.19956970214844, "rewards/margins": 127.97001647949219, "rewards/rejected": -55.77043914794922, "step": 1710, "u": -2.1874420642852783, "weight": 0.050015270709991455 }, { "diff_generated": -55.82611083984375, "epoch": 0.5573558003888529, "grad_norm": 525.7360101180998, "learning_rate": 7.82207947036368e-07, "logits/chosen": -2.4367599487304688, "logits/rejected": -2.5369248390197754, "logps/chosen": -21.981922149658203, "logps/rejected": -137.39761352539062, "loss": 22.7757, "losses_ref": -0.0003991415142081678, "ref_logps/chosen": -91.92091369628906, "ref_logps/rejected": -81.5715103149414, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 69.93899536132812, "rewards/margins": 125.76509857177734, "rewards/rejected": -55.82611083984375, "step": 1720, "u": -2.173046588897705, "weight": 0.05627021938562393 }, { "diff_generated": -54.84632110595703, "epoch": 0.560596241088788, "grad_norm": 505.5876111269165, "learning_rate": 7.817604209929007e-07, "logits/chosen": -2.483346462249756, "logits/rejected": -2.494868755340576, "logps/chosen": -25.0811824798584, "logps/rejected": -136.0178680419922, "loss": 23.4849, "losses_ref": -0.005923398770391941, "ref_logps/chosen": -98.15687561035156, "ref_logps/rejected": -81.17155456542969, "rewards/accuracies": 0.9375, "rewards/chosen": 73.07569885253906, "rewards/margins": 127.9220199584961, "rewards/rejected": -54.84632110595703, "step": 1730, "u": -2.1583805084228516, "weight": 0.06282185018062592 }, { "diff_generated": -59.43408203125, "epoch": 0.5638366817887233, "grad_norm": 477.8168424857454, "learning_rate": 7.813074675521962e-07, "logits/chosen": -2.5375099182128906, "logits/rejected": -2.572326898574829, "logps/chosen": -24.597471237182617, "logps/rejected": -143.44293212890625, "loss": 23.7144, "losses_ref": -0.002471204148605466, "ref_logps/chosen": -99.7923355102539, "ref_logps/rejected": -84.00885772705078, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 75.19486999511719, "rewards/margins": 134.6289520263672, "rewards/rejected": -59.43408203125, "step": 1740, "u": -2.2448902130126953, "weight": 0.025143280625343323 }, { "diff_generated": -56.804656982421875, "epoch": 0.5670771224886585, "grad_norm": 494.3841984274702, "learning_rate": 7.80849093153786e-07, "logits/chosen": -2.491807699203491, "logits/rejected": -2.5932672023773193, "logps/chosen": -21.136425018310547, "logps/rejected": -142.30722045898438, "loss": 22.1587, "losses_ref": -0.00046972898417152464, "ref_logps/chosen": -92.85779571533203, "ref_logps/rejected": -85.50257110595703, "rewards/accuracies": 0.9375, "rewards/chosen": 71.72136688232422, "rewards/margins": 128.52603149414062, "rewards/rejected": -56.804656982421875, "step": 1750, "u": -2.1586501598358154, "weight": 0.06252589076757431 }, { "diff_generated": -56.39727783203125, "epoch": 0.5703175631885936, "grad_norm": 460.22585311602836, "learning_rate": 7.803853043142702e-07, "logits/chosen": -2.4713480472564697, "logits/rejected": -2.581171989440918, "logps/chosen": -24.284358978271484, "logps/rejected": -141.79885864257812, "loss": 21.9982, "losses_ref": -9.926590109898825e-07, "ref_logps/chosen": -95.4493637084961, "ref_logps/rejected": -85.4015884399414, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 71.16500854492188, "rewards/margins": 127.56229400634766, "rewards/rejected": -56.39727783203125, "step": 1760, "u": -2.173064708709717, "weight": 0.056250035762786865 }, { "diff_generated": -57.44085693359375, "epoch": 0.5735580038885288, "grad_norm": 498.84117678721265, "learning_rate": 7.799161076272245e-07, "logits/chosen": -2.4647529125213623, "logits/rejected": -2.568406581878662, "logps/chosen": -22.84539031982422, "logps/rejected": -136.6644744873047, "loss": 22.619, "losses_ref": -6.464334546762984e-06, "ref_logps/chosen": -93.20040130615234, "ref_logps/rejected": -79.22361755371094, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 70.35499572753906, "rewards/margins": 127.79586029052734, "rewards/rejected": -57.44085693359375, "step": 1770, "u": -2.187455654144287, "weight": 0.0500001423060894 }, { "diff_generated": -57.65531539916992, "epoch": 0.5767984445884641, "grad_norm": 567.0598517544732, "learning_rate": 7.794415097631066e-07, "logits/chosen": -2.485158681869507, "logits/rejected": -2.5438666343688965, "logps/chosen": -22.188308715820312, "logps/rejected": -140.32151794433594, "loss": 23.3885, "losses_ref": -0.000717981078196317, "ref_logps/chosen": -94.6068344116211, "ref_logps/rejected": -82.66620635986328, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 72.41851806640625, "rewards/margins": 130.07383728027344, "rewards/rejected": -57.65531539916992, "step": 1780, "u": -2.2450015544891357, "weight": 0.025021100416779518 }, { "diff_generated": -58.910499572753906, "epoch": 0.5800388852883992, "grad_norm": 502.2065972530842, "learning_rate": 7.789615174691619e-07, "logits/chosen": -2.433727741241455, "logits/rejected": -2.560873508453369, "logps/chosen": -24.428359985351562, "logps/rejected": -149.1986541748047, "loss": 23.0511, "losses_ref": -5.546243073695223e-07, "ref_logps/chosen": -95.4670181274414, "ref_logps/rejected": -90.28814697265625, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 71.03865814208984, "rewards/margins": 129.9491424560547, "rewards/rejected": -58.910499572753906, "step": 1790, "u": -2.187455654144287, "weight": 0.050000011920928955 }, { "diff_generated": -58.59474563598633, "epoch": 0.5832793259883344, "grad_norm": 514.5693084653902, "learning_rate": 7.784761375693268e-07, "logits/chosen": -2.416273832321167, "logits/rejected": -2.5081026554107666, "logps/chosen": -23.346494674682617, "logps/rejected": -149.67361450195312, "loss": 23.1464, "losses_ref": -1.1515285223140381e-05, "ref_logps/chosen": -92.47997283935547, "ref_logps/rejected": -91.0788803100586, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 69.13347625732422, "rewards/margins": 127.72822570800781, "rewards/rejected": -58.59474563598633, "step": 1800, "u": -2.1730644702911377, "weight": 0.05625023692846298 }, { "diff_generated": -55.998878479003906, "epoch": 0.5865197666882696, "grad_norm": 519.4944457021509, "learning_rate": 7.779853769641319e-07, "logits/chosen": -2.4430603981018066, "logits/rejected": -2.5393834114074707, "logps/chosen": -24.854917526245117, "logps/rejected": -139.3914794921875, "loss": 22.3479, "losses_ref": -0.00019149412401020527, "ref_logps/chosen": -95.69673156738281, "ref_logps/rejected": -83.3926010131836, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 70.84181213378906, "rewards/margins": 126.84068298339844, "rewards/rejected": -55.998878479003906, "step": 1810, "u": -2.2018439769744873, "weight": 0.04375326260924339 }, { "diff_generated": -58.254791259765625, "epoch": 0.5897602073882048, "grad_norm": 512.5708535156688, "learning_rate": 7.774892426306042e-07, "logits/chosen": -2.466561794281006, "logits/rejected": -2.5939698219299316, "logps/chosen": -21.761241912841797, "logps/rejected": -147.48875427246094, "loss": 22.3906, "losses_ref": -0.002497596899047494, "ref_logps/chosen": -91.84721374511719, "ref_logps/rejected": -89.23395538330078, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 70.08597564697266, "rewards/margins": 128.34075927734375, "rewards/rejected": -58.254791259765625, "step": 1820, "u": -2.1298465728759766, "weight": 0.07504962384700775 }, { "diff_generated": -57.40629959106445, "epoch": 0.59300064808814, "grad_norm": 455.3768485779574, "learning_rate": 7.769877416221678e-07, "logits/chosen": -2.468407392501831, "logits/rejected": -2.515033006668091, "logps/chosen": -25.566970825195312, "logps/rejected": -142.10153198242188, "loss": 24.0047, "losses_ref": -6.456654091380187e-07, "ref_logps/chosen": -97.21444702148438, "ref_logps/rejected": -84.69524383544922, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 71.64747619628906, "rewards/margins": 129.05377197265625, "rewards/rejected": -57.40629959106445, "step": 1830, "u": -2.187455654144287, "weight": 0.05000002309679985 }, { "diff_generated": -55.58686065673828, "epoch": 0.5962410887880751, "grad_norm": 484.9968123238012, "learning_rate": 7.764808810685433e-07, "logits/chosen": -2.431522846221924, "logits/rejected": -2.5557100772857666, "logps/chosen": -19.275615692138672, "logps/rejected": -136.2754364013672, "loss": 22.7215, "losses_ref": -0.0005879181553609669, "ref_logps/chosen": -88.68095397949219, "ref_logps/rejected": -80.68856048583984, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 69.40534210205078, "rewards/margins": 124.99220275878906, "rewards/rejected": -55.58686065673828, "step": 1840, "u": -2.1298632621765137, "weight": 0.0750310942530632 }, { "diff_generated": -56.19511795043945, "epoch": 0.5994815294880104, "grad_norm": 498.45416111428847, "learning_rate": 7.759686681756468e-07, "logits/chosen": -2.4816057682037354, "logits/rejected": -2.5456433296203613, "logps/chosen": -22.139755249023438, "logps/rejected": -142.64230346679688, "loss": 22.4336, "losses_ref": -0.006184516940265894, "ref_logps/chosen": -93.66046142578125, "ref_logps/rejected": -86.44719696044922, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 71.52071380615234, "rewards/margins": 127.7158203125, "rewards/rejected": -56.19511795043945, "step": 1850, "u": -2.1727375984191895, "weight": 0.056604884564876556 }, { "diff_generated": -58.84028244018555, "epoch": 0.6027219701879456, "grad_norm": 458.8710847819926, "learning_rate": 7.754511102254876e-07, "logits/chosen": -2.4350783824920654, "logits/rejected": -2.54716420173645, "logps/chosen": -21.77615737915039, "logps/rejected": -140.6431884765625, "loss": 23.2702, "losses_ref": -5.444636826723581e-06, "ref_logps/chosen": -90.06400299072266, "ref_logps/rejected": -81.80291748046875, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 68.28785705566406, "rewards/margins": 127.12812805175781, "rewards/rejected": -58.84028244018555, "step": 1860, "u": -2.2018465995788574, "weight": 0.04375021532177925 }, { "diff_generated": -54.66754913330078, "epoch": 0.6059624108878807, "grad_norm": 487.4657979052523, "learning_rate": 7.74928214576064e-07, "logits/chosen": -2.4859158992767334, "logits/rejected": -2.5099780559539795, "logps/chosen": -23.468854904174805, "logps/rejected": -139.61276245117188, "loss": 23.0208, "losses_ref": -2.9951024771435186e-06, "ref_logps/chosen": -98.22453308105469, "ref_logps/rejected": -84.94522094726562, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 74.75566864013672, "rewards/margins": 129.42323303222656, "rewards/rejected": -54.66754913330078, "step": 1870, "u": -2.216238021850586, "weight": 0.037500061094760895 }, { "diff_generated": -60.11989212036133, "epoch": 0.609202851587816, "grad_norm": 472.01258007651603, "learning_rate": 7.743999886612591e-07, "logits/chosen": -2.461061954498291, "logits/rejected": -2.5660219192504883, "logps/chosen": -22.852632522583008, "logps/rejected": -152.16908264160156, "loss": 21.9321, "losses_ref": -5.277171112538781e-06, "ref_logps/chosen": -96.39404296875, "ref_logps/rejected": -92.0491943359375, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 73.5414047241211, "rewards/margins": 133.66128540039062, "rewards/rejected": -60.11989212036133, "step": 1880, "u": -2.2738025188446045, "weight": 0.012500083073973656 }, { "diff_generated": -61.25305938720703, "epoch": 0.6124432922877512, "grad_norm": 497.17719794133916, "learning_rate": 7.738664399907355e-07, "logits/chosen": -2.4737637042999268, "logits/rejected": -2.5895867347717285, "logps/chosen": -22.45340919494629, "logps/rejected": -151.78509521484375, "loss": 21.5031, "losses_ref": -0.002105607185512781, "ref_logps/chosen": -94.99655151367188, "ref_logps/rejected": -90.53204345703125, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 72.54315185546875, "rewards/margins": 133.79620361328125, "rewards/rejected": -61.25305938720703, "step": 1890, "u": -2.2593088150024414, "weight": 0.018863562494516373 }, { "diff_generated": -55.67549514770508, "epoch": 0.6156837329876863, "grad_norm": 473.5608592373336, "learning_rate": 7.733275761498278e-07, "logits/chosen": -2.494147777557373, "logits/rejected": -2.535625457763672, "logps/chosen": -24.380495071411133, "logps/rejected": -140.7521514892578, "loss": 22.8896, "losses_ref": -0.00022286793682724237, "ref_logps/chosen": -94.96155548095703, "ref_logps/rejected": -85.07664489746094, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 70.5810546875, "rewards/margins": 126.25655364990234, "rewards/rejected": -55.67549514770508, "step": 1900, "u": -2.17305850982666, "weight": 0.056256867945194244 }, { "diff_generated": -57.19377517700195, "epoch": 0.6189241736876215, "grad_norm": 511.7405798643593, "learning_rate": 7.727834047994353e-07, "logits/chosen": -2.463592529296875, "logits/rejected": -2.5578956604003906, "logps/chosen": -26.32914161682129, "logps/rejected": -145.5801239013672, "loss": 22.4974, "losses_ref": -2.7549589503905736e-05, "ref_logps/chosen": -98.46235656738281, "ref_logps/rejected": -88.3863525390625, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 72.13322448730469, "rewards/margins": 129.32699584960938, "rewards/rejected": -57.19377517700195, "step": 1910, "u": -2.1874547004699707, "weight": 0.05000131204724312 }, { "diff_generated": -57.825355529785156, "epoch": 0.6221646143875567, "grad_norm": 505.19909052460184, "learning_rate": 7.722339336759129e-07, "logits/chosen": -2.391700267791748, "logits/rejected": -2.5454742908477783, "logps/chosen": -23.258031845092773, "logps/rejected": -143.39334106445312, "loss": 23.1168, "losses_ref": -6.362137355608866e-05, "ref_logps/chosen": -91.22623443603516, "ref_logps/rejected": -85.56798553466797, "rewards/accuracies": 0.9375, "rewards/chosen": 67.96820068359375, "rewards/margins": 125.79354095458984, "rewards/rejected": -57.825355529785156, "step": 1920, "u": -2.158673048019409, "weight": 0.0625004917383194 }, { "diff_generated": -54.9102668762207, "epoch": 0.6254050550874919, "grad_norm": 529.4901064435244, "learning_rate": 7.71679170590961e-07, "logits/chosen": -2.515338897705078, "logits/rejected": -2.560856580734253, "logps/chosen": -24.146343231201172, "logps/rejected": -137.59536743164062, "loss": 21.9412, "losses_ref": -0.0008923925342969596, "ref_logps/chosen": -96.57925415039062, "ref_logps/rejected": -82.68508911132812, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 72.43292236328125, "rewards/margins": 127.34318542480469, "rewards/rejected": -54.9102668762207, "step": 1930, "u": -2.18741512298584, "weight": 0.050045304000377655 }, { "diff_generated": -59.83012771606445, "epoch": 0.6286454957874271, "grad_norm": 438.39465837647845, "learning_rate": 7.711191234315146e-07, "logits/chosen": -2.4719552993774414, "logits/rejected": -2.5544826984405518, "logps/chosen": -23.674427032470703, "logps/rejected": -149.69705200195312, "loss": 22.4189, "losses_ref": -1.7031243260134943e-06, "ref_logps/chosen": -98.4489974975586, "ref_logps/rejected": -89.86692810058594, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 74.77456665039062, "rewards/margins": 134.60470581054688, "rewards/rejected": -59.83012771606445, "step": 1940, "u": -2.2450203895568848, "weight": 0.025000065565109253 }, { "diff_generated": -58.504981994628906, "epoch": 0.6318859364873622, "grad_norm": 487.23718047122196, "learning_rate": 7.705538001596312e-07, "logits/chosen": -2.477814197540283, "logits/rejected": -2.5949177742004395, "logps/chosen": -20.551475524902344, "logps/rejected": -148.85836791992188, "loss": 22.8294, "losses_ref": -5.330602625974734e-09, "ref_logps/chosen": -92.30406188964844, "ref_logps/rejected": -90.35337829589844, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 71.75257873535156, "rewards/margins": 130.25755310058594, "rewards/rejected": -58.504981994628906, "step": 1950, "u": -2.187455654144287, "weight": 0.05000000074505806 }, { "diff_generated": -57.42897415161133, "epoch": 0.6351263771872975, "grad_norm": 488.39965331393387, "learning_rate": 7.699832088123774e-07, "logits/chosen": -2.504335880279541, "logits/rejected": -2.5275423526763916, "logps/chosen": -24.176315307617188, "logps/rejected": -143.69924926757812, "loss": 23.0429, "losses_ref": -3.060336439375533e-06, "ref_logps/chosen": -98.70487213134766, "ref_logps/rejected": -86.2702865600586, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 74.52854919433594, "rewards/margins": 131.95753479003906, "rewards/rejected": -57.42897415161133, "step": 1960, "u": -2.2018468379974365, "weight": 0.04375012591481209 }, { "diff_generated": -58.23870849609375, "epoch": 0.6383668178872327, "grad_norm": 504.1265747258012, "learning_rate": 7.694073575017151e-07, "logits/chosen": -2.3937363624572754, "logits/rejected": -2.508058547973633, "logps/chosen": -20.843114852905273, "logps/rejected": -140.5963134765625, "loss": 22.1691, "losses_ref": -1.2568195870699128e-06, "ref_logps/chosen": -89.77731323242188, "ref_logps/rejected": -82.35760498046875, "rewards/accuracies": 0.9375, "rewards/chosen": 68.93418884277344, "rewards/margins": 127.17289733886719, "rewards/rejected": -58.23870849609375, "step": 1970, "u": -2.1586735248565674, "weight": 0.06250002235174179 }, { "diff_generated": -56.48950958251953, "epoch": 0.6416072585871678, "grad_norm": 518.2740490404789, "learning_rate": 7.688262544143854e-07, "logits/chosen": -2.467092752456665, "logits/rejected": -2.529083728790283, "logps/chosen": -22.24001121520996, "logps/rejected": -140.2478485107422, "loss": 22.4542, "losses_ref": -8.965320375864394e-06, "ref_logps/chosen": -93.88130187988281, "ref_logps/rejected": -83.75831604003906, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 71.64128112792969, "rewards/margins": 128.1307830810547, "rewards/rejected": -56.48950958251953, "step": 1980, "u": -2.1298911571502686, "weight": 0.07500015199184418 }, { "diff_generated": -55.2055778503418, "epoch": 0.6448476992871031, "grad_norm": 469.91031715979483, "learning_rate": 7.682399078117928e-07, "logits/chosen": -2.4817662239074707, "logits/rejected": -2.5173556804656982, "logps/chosen": -21.232852935791016, "logps/rejected": -142.70101928710938, "loss": 22.9948, "losses_ref": -0.0010377921862527728, "ref_logps/chosen": -96.75160217285156, "ref_logps/rejected": -87.49544525146484, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 75.51875305175781, "rewards/margins": 130.72433471679688, "rewards/rejected": -55.2055778503418, "step": 1990, "u": -2.1442325115203857, "weight": 0.06880507618188858 }, { "diff_generated": -54.693626403808594, "epoch": 0.6480881399870383, "grad_norm": 502.7417960438683, "learning_rate": 7.67648326029888e-07, "logits/chosen": -2.483794927597046, "logits/rejected": -2.523793935775757, "logps/chosen": -24.50412368774414, "logps/rejected": -141.32821655273438, "loss": 23.343, "losses_ref": -0.000803236966021359, "ref_logps/chosen": -97.73753356933594, "ref_logps/rejected": -86.63460540771484, "rewards/accuracies": 0.9375, "rewards/chosen": 73.23341369628906, "rewards/margins": 127.92704010009766, "rewards/rejected": -54.693626403808594, "step": 2000, "u": -2.1586384773254395, "weight": 0.06253884732723236 }, { "diff_generated": -55.728248596191406, "epoch": 0.6513285806869734, "grad_norm": 495.9694491130362, "learning_rate": 7.670515174790485e-07, "logits/chosen": -2.4611029624938965, "logits/rejected": -2.5048117637634277, "logps/chosen": -23.760311126708984, "logps/rejected": -142.3408203125, "loss": 23.0292, "losses_ref": -5.852278377460607e-07, "ref_logps/chosen": -95.93404388427734, "ref_logps/rejected": -86.61258697509766, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 72.17372131347656, "rewards/margins": 127.9019775390625, "rewards/rejected": -55.728248596191406, "step": 2010, "u": -2.216238260269165, "weight": 0.037500008940696716 }, { "diff_generated": -60.43064498901367, "epoch": 0.6545690213869086, "grad_norm": 493.2141019606241, "learning_rate": 7.664494906439598e-07, "logits/chosen": -2.462008237838745, "logits/rejected": -2.5223259925842285, "logps/chosen": -21.097309112548828, "logps/rejected": -148.689697265625, "loss": 21.5231, "losses_ref": -2.144928146208258e-08, "ref_logps/chosen": -96.27225494384766, "ref_logps/rejected": -88.25904846191406, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 75.17494201660156, "rewards/margins": 135.60560607910156, "rewards/rejected": -60.43064498901367, "step": 2020, "u": -2.2018468379974365, "weight": 0.04374999925494194 }, { "diff_generated": -56.947662353515625, "epoch": 0.6578094620868438, "grad_norm": 447.1663290171413, "learning_rate": 7.658422540834943e-07, "logits/chosen": -2.4873642921447754, "logits/rejected": -2.5300660133361816, "logps/chosen": -25.486000061035156, "logps/rejected": -149.83065795898438, "loss": 23.6932, "losses_ref": -0.0004086032568011433, "ref_logps/chosen": -99.67965698242188, "ref_logps/rejected": -92.88298797607422, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 74.19366455078125, "rewards/margins": 131.14132690429688, "rewards/rejected": -56.947662353515625, "step": 2030, "u": -2.245002269744873, "weight": 0.025020133703947067 }, { "diff_generated": -55.35199737548828, "epoch": 0.661049902786779, "grad_norm": 469.0213842529389, "learning_rate": 7.6522981643059e-07, "logits/chosen": -2.463914155960083, "logits/rejected": -2.5366599559783936, "logps/chosen": -24.274606704711914, "logps/rejected": -139.7829132080078, "loss": 21.7409, "losses_ref": -0.012434705160558224, "ref_logps/chosen": -97.09610748291016, "ref_logps/rejected": -84.43091583251953, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 72.82149505615234, "rewards/margins": 128.17349243164062, "rewards/rejected": -55.35199737548828, "step": 2040, "u": -2.2155699729919434, "weight": 0.03820699453353882 }, { "diff_generated": -59.47046661376953, "epoch": 0.6642903434867142, "grad_norm": 503.1826607295146, "learning_rate": 7.646121863921278e-07, "logits/chosen": -2.4561409950256348, "logits/rejected": -2.490854263305664, "logps/chosen": -24.944446563720703, "logps/rejected": -144.05796813964844, "loss": 22.7095, "losses_ref": -1.2438914382073563e-05, "ref_logps/chosen": -102.60357666015625, "ref_logps/rejected": -84.58749389648438, "rewards/accuracies": 0.96875, "rewards/chosen": 77.65913391113281, "rewards/margins": 137.12960815429688, "rewards/rejected": -59.47046661376953, "step": 2050, "u": -2.2306289672851562, "weight": 0.03125036507844925 }, { "diff_generated": -57.042747497558594, "epoch": 0.6675307841866494, "grad_norm": 453.3027299027881, "learning_rate": 7.639893727488069e-07, "logits/chosen": -2.423924207687378, "logits/rejected": -2.5609469413757324, "logps/chosen": -21.400257110595703, "logps/rejected": -143.56591796875, "loss": 21.7158, "losses_ref": -0.0006009475910104811, "ref_logps/chosen": -92.40319061279297, "ref_logps/rejected": -86.52315521240234, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 71.00294494628906, "rewards/margins": 128.0457000732422, "rewards/rejected": -57.042747497558594, "step": 2060, "u": -2.2162106037139893, "weight": 0.037530649453401566 }, { "diff_generated": -54.49127197265625, "epoch": 0.6707712248865846, "grad_norm": 516.3445862422591, "learning_rate": 7.633613843550212e-07, "logits/chosen": -2.4732346534729004, "logits/rejected": -2.527963161468506, "logps/chosen": -23.60536766052246, "logps/rejected": -136.6197509765625, "loss": 23.2891, "losses_ref": -0.0018613319844007492, "ref_logps/chosen": -97.85231018066406, "ref_logps/rejected": -82.12848663330078, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 74.24693298339844, "rewards/margins": 128.73818969726562, "rewards/rejected": -54.49127197265625, "step": 2070, "u": -2.173004627227783, "weight": 0.05631663277745247 }, { "diff_generated": -56.165184020996094, "epoch": 0.6740116655865198, "grad_norm": 503.7437146207077, "learning_rate": 7.627282301387325e-07, "logits/chosen": -2.381239652633667, "logits/rejected": -2.4903016090393066, "logps/chosen": -21.5819091796875, "logps/rejected": -140.04544067382812, "loss": 22.1412, "losses_ref": -5.365263405110454e-06, "ref_logps/chosen": -88.96678161621094, "ref_logps/rejected": -83.88023376464844, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 67.38487243652344, "rewards/margins": 123.550048828125, "rewards/rejected": -56.165184020996094, "step": 2080, "u": -2.144282341003418, "weight": 0.06875016540288925 }, { "diff_generated": -57.1841926574707, "epoch": 0.6772521062864549, "grad_norm": 454.9895346490162, "learning_rate": 7.620899191013438e-07, "logits/chosen": -2.4103646278381348, "logits/rejected": -2.522381544113159, "logps/chosen": -24.676916122436523, "logps/rejected": -148.68307495117188, "loss": 22.9282, "losses_ref": -0.006165254861116409, "ref_logps/chosen": -94.65473937988281, "ref_logps/rejected": -91.49888610839844, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 69.97782897949219, "rewards/margins": 127.16202545166016, "rewards/rejected": -57.1841926574707, "step": 2090, "u": -2.172731876373291, "weight": 0.05661041662096977 }, { "diff_generated": -55.309730529785156, "epoch": 0.6804925469863902, "grad_norm": 489.7994554032864, "learning_rate": 7.614464603175717e-07, "logits/chosen": -2.497722625732422, "logits/rejected": -2.48801326751709, "logps/chosen": -23.2744083404541, "logps/rejected": -137.96255493164062, "loss": 21.5154, "losses_ref": -0.0013496755855157971, "ref_logps/chosen": -100.1006088256836, "ref_logps/rejected": -82.65283966064453, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 76.8261947631836, "rewards/margins": 132.1359405517578, "rewards/rejected": -55.309730529785156, "step": 2100, "u": -2.1730010509490967, "weight": 0.056320447474718094 }, { "diff_generated": -56.59611892700195, "epoch": 0.6837329876863253, "grad_norm": 467.7258584343898, "learning_rate": 7.607978629353167e-07, "logits/chosen": -2.4443843364715576, "logits/rejected": -2.535287380218506, "logps/chosen": -22.963520050048828, "logps/rejected": -145.03662109375, "loss": 22.243, "losses_ref": -0.0002075113879982382, "ref_logps/chosen": -93.62332916259766, "ref_logps/rejected": -88.44050598144531, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 70.65980529785156, "rewards/margins": 127.25592041015625, "rewards/rejected": -56.59611892700195, "step": 2110, "u": -2.1874475479125977, "weight": 0.05000927299261093 }, { "diff_generated": -59.0588493347168, "epoch": 0.6869734283862605, "grad_norm": 506.1015617033496, "learning_rate": 7.60144136175534e-07, "logits/chosen": -2.455575942993164, "logits/rejected": -2.53769588470459, "logps/chosen": -19.758596420288086, "logps/rejected": -143.96475219726562, "loss": 21.7778, "losses_ref": -0.002314900513738394, "ref_logps/chosen": -94.31592559814453, "ref_logps/rejected": -84.90589904785156, "rewards/accuracies": 0.9375, "rewards/chosen": 74.55732727050781, "rewards/margins": 133.61618041992188, "rewards/rejected": -59.0588493347168, "step": 2120, "u": -2.1585612297058105, "weight": 0.06262405216693878 }, { "diff_generated": -57.1196174621582, "epoch": 0.6902138690861958, "grad_norm": 534.2687251405287, "learning_rate": 7.594852893321015e-07, "logits/chosen": -2.422362804412842, "logits/rejected": -2.5526645183563232, "logps/chosen": -21.735645294189453, "logps/rejected": -144.26742553710938, "loss": 22.0867, "losses_ref": -0.0022257170639932156, "ref_logps/chosen": -91.74186706542969, "ref_logps/rejected": -87.14781188964844, "rewards/accuracies": 0.9375, "rewards/chosen": 70.00621032714844, "rewards/margins": 127.1258316040039, "rewards/rejected": -57.1196174621582, "step": 2130, "u": -2.158565044403076, "weight": 0.06261952221393585 }, { "diff_generated": -54.54099655151367, "epoch": 0.6934543097861309, "grad_norm": 457.63891961234555, "learning_rate": 7.588213317716883e-07, "logits/chosen": -2.361682415008545, "logits/rejected": -2.5019543170928955, "logps/chosen": -19.993026733398438, "logps/rejected": -136.1216278076172, "loss": 22.3299, "losses_ref": -1.1567156121827793e-07, "ref_logps/chosen": -89.22071838378906, "ref_logps/rejected": -81.58064270019531, "rewards/accuracies": 0.9375, "rewards/chosen": 69.22769165039062, "rewards/margins": 123.7686767578125, "rewards/rejected": -54.54099655151367, "step": 2140, "u": -2.1586735248565674, "weight": 0.0625 }, { "diff_generated": -54.2440299987793, "epoch": 0.6966947504860661, "grad_norm": 477.00173461765286, "learning_rate": 7.581522729336214e-07, "logits/chosen": -2.382094144821167, "logits/rejected": -2.4460394382476807, "logps/chosen": -21.491865158081055, "logps/rejected": -132.05075073242188, "loss": 21.9708, "losses_ref": -0.0005560126155614853, "ref_logps/chosen": -94.92799377441406, "ref_logps/rejected": -77.8067398071289, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 73.43612670898438, "rewards/margins": 127.68016052246094, "rewards/rejected": -54.2440299987793, "step": 2150, "u": -2.144258975982666, "weight": 0.06877604871988297 }, { "diff_generated": -56.922821044921875, "epoch": 0.6999351911860013, "grad_norm": 460.42405683815605, "learning_rate": 7.574781223297513e-07, "logits/chosen": -2.4442458152770996, "logits/rejected": -2.4855704307556152, "logps/chosen": -23.61258316040039, "logps/rejected": -139.59652709960938, "loss": 21.2033, "losses_ref": -0.013810291886329651, "ref_logps/chosen": -97.78433990478516, "ref_logps/rejected": -82.6737060546875, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 74.1717529296875, "rewards/margins": 131.09457397460938, "rewards/rejected": -56.922821044921875, "step": 2160, "u": -2.172224998474121, "weight": 0.05712286755442619 }, { "diff_generated": -55.76105880737305, "epoch": 0.7031756318859365, "grad_norm": 466.2392560237832, "learning_rate": 7.567988895443173e-07, "logits/chosen": -2.4225571155548096, "logits/rejected": -2.4542760848999023, "logps/chosen": -20.48651885986328, "logps/rejected": -140.50071716308594, "loss": 22.1093, "losses_ref": -0.00023919029626995325, "ref_logps/chosen": -93.83261108398438, "ref_logps/rejected": -84.73966217041016, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 73.34608459472656, "rewards/margins": 129.10714721679688, "rewards/rejected": -55.76105880737305, "step": 2170, "u": -2.129882335662842, "weight": 0.07500983029603958 }, { "diff_generated": -55.14765548706055, "epoch": 0.7064160725858717, "grad_norm": 458.56557339059754, "learning_rate": 7.561145842338102e-07, "logits/chosen": -2.4281909465789795, "logits/rejected": -2.4837698936462402, "logps/chosen": -22.63579750061035, "logps/rejected": -139.19200134277344, "loss": 21.7774, "losses_ref": -0.0003904419136233628, "ref_logps/chosen": -94.46659088134766, "ref_logps/rejected": -84.04434967041016, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 71.8307876586914, "rewards/margins": 126.97843933105469, "rewards/rejected": -55.14765548706055, "step": 2180, "u": -2.2018356323242188, "weight": 0.04376264289021492 }, { "diff_generated": -57.23699188232422, "epoch": 0.7096565132858069, "grad_norm": 511.57694347026296, "learning_rate": 7.554252161268365e-07, "logits/chosen": -2.412655830383301, "logits/rejected": -2.511845588684082, "logps/chosen": -23.021530151367188, "logps/rejected": -145.2180633544922, "loss": 21.7257, "losses_ref": -0.0007701918366365135, "ref_logps/chosen": -94.85514068603516, "ref_logps/rejected": -87.98106384277344, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 71.8336181640625, "rewards/margins": 129.0706024169922, "rewards/rejected": -57.23699188232422, "step": 2190, "u": -2.1730284690856934, "weight": 0.0562903955578804 }, { "diff_generated": -56.55291748046875, "epoch": 0.712896953985742, "grad_norm": 490.6510002260033, "learning_rate": 7.547307950239785e-07, "logits/chosen": -2.474010944366455, "logits/rejected": -2.541577100753784, "logps/chosen": -22.41421890258789, "logps/rejected": -139.7757110595703, "loss": 22.3575, "losses_ref": -0.00034674102789722383, "ref_logps/chosen": -98.89764404296875, "ref_logps/rejected": -83.22279357910156, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 76.48342895507812, "rewards/margins": 133.03634643554688, "rewards/rejected": -56.55291748046875, "step": 2200, "u": -2.201831340789795, "weight": 0.04376723989844322 }, { "diff_generated": -53.38134002685547, "epoch": 0.7161373946856773, "grad_norm": 490.56252903024705, "learning_rate": 7.540313307976563e-07, "logits/chosen": -2.436366081237793, "logits/rejected": -2.5205202102661133, "logps/chosen": -21.785724639892578, "logps/rejected": -134.70553588867188, "loss": 23.25, "losses_ref": -1.0933801604551263e-05, "ref_logps/chosen": -91.46631622314453, "ref_logps/rejected": -81.3241958618164, "rewards/accuracies": 0.90625, "rewards/chosen": 69.68058776855469, "rewards/margins": 123.06193542480469, "rewards/rejected": -53.38134002685547, "step": 2210, "u": -2.0867176055908203, "weight": 0.09375043213367462 }, { "diff_generated": -55.9515380859375, "epoch": 0.7193778353856124, "grad_norm": 428.5675957865356, "learning_rate": 7.533268333919865e-07, "logits/chosen": -2.4399867057800293, "logits/rejected": -2.5394577980041504, "logps/chosen": -23.00450897216797, "logps/rejected": -143.8893585205078, "loss": 21.8257, "losses_ref": -6.2254525801108684e-06, "ref_logps/chosen": -96.14192962646484, "ref_logps/rejected": -87.93781280517578, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 73.13742065429688, "rewards/margins": 129.08895874023438, "rewards/rejected": -55.9515380859375, "step": 2220, "u": -2.216237783432007, "weight": 0.03750023618340492 }, { "diff_generated": -57.02019119262695, "epoch": 0.7226182760855476, "grad_norm": 465.81723520228275, "learning_rate": 7.526173128226416e-07, "logits/chosen": -2.415139675140381, "logits/rejected": -2.530526638031006, "logps/chosen": -23.722383499145508, "logps/rejected": -143.73326110839844, "loss": 21.2618, "losses_ref": -4.169198655290529e-05, "ref_logps/chosen": -93.94865417480469, "ref_logps/rejected": -86.71307373046875, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 70.22628021240234, "rewards/margins": 127.24647521972656, "rewards/rejected": -57.02019119262695, "step": 2230, "u": -2.201845407485962, "weight": 0.0437517985701561 }, { "diff_generated": -56.1454963684082, "epoch": 0.7258587167854829, "grad_norm": 456.4544393860723, "learning_rate": 7.519027791767069e-07, "logits/chosen": -2.416743040084839, "logits/rejected": -2.474727153778076, "logps/chosen": -25.03844451904297, "logps/rejected": -146.59579467773438, "loss": 22.6618, "losses_ref": -2.143383608199656e-05, "ref_logps/chosen": -98.1845703125, "ref_logps/rejected": -90.4503173828125, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 73.14612579345703, "rewards/margins": 129.2916259765625, "rewards/rejected": -56.1454963684082, "step": 2240, "u": -2.2018463611602783, "weight": 0.043750692158937454 }, { "diff_generated": -53.48323440551758, "epoch": 0.729099157485418, "grad_norm": 516.2210753632854, "learning_rate": 7.511832426125375e-07, "logits/chosen": -2.4550464153289795, "logits/rejected": -2.49265193939209, "logps/chosen": -22.875, "logps/rejected": -140.51768493652344, "loss": 22.1306, "losses_ref": -1.8146038200939074e-05, "ref_logps/chosen": -96.42729187011719, "ref_logps/rejected": -87.03443908691406, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 73.55229187011719, "rewards/margins": 127.03553771972656, "rewards/rejected": -53.48323440551758, "step": 2250, "u": -2.1298909187316895, "weight": 0.07500042766332626 }, { "diff_generated": -52.945457458496094, "epoch": 0.7323395981853532, "grad_norm": 509.54662632563736, "learning_rate": 7.504587133596141e-07, "logits/chosen": -2.5048627853393555, "logits/rejected": -2.5559840202331543, "logps/chosen": -21.633480072021484, "logps/rejected": -138.461669921875, "loss": 21.5234, "losses_ref": -4.3154597051398014e-07, "ref_logps/chosen": -93.41227722167969, "ref_logps/rejected": -85.51620483398438, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 71.77879333496094, "rewards/margins": 124.72425842285156, "rewards/rejected": -52.945457458496094, "step": 2260, "u": -2.1298911571502686, "weight": 0.07500001788139343 }, { "diff_generated": -59.143280029296875, "epoch": 0.7355800388852884, "grad_norm": 497.5814124852356, "learning_rate": 7.497292017183965e-07, "logits/chosen": -2.500357151031494, "logits/rejected": -2.5774011611938477, "logps/chosen": -22.00308609008789, "logps/rejected": -143.08343505859375, "loss": 22.602, "losses_ref": -0.0005357967456802726, "ref_logps/chosen": -97.12187194824219, "ref_logps/rejected": -83.9401626586914, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 75.11878204345703, "rewards/margins": 134.26205444335938, "rewards/rejected": -59.143280029296875, "step": 2270, "u": -2.259385108947754, "weight": 0.01877937652170658 }, { "diff_generated": -57.037635803222656, "epoch": 0.7388204795852236, "grad_norm": 476.0990938511596, "learning_rate": 7.489947180601791e-07, "logits/chosen": -2.4256510734558105, "logits/rejected": -2.494957208633423, "logps/chosen": -21.480037689208984, "logps/rejected": -143.7701873779297, "loss": 20.9277, "losses_ref": -0.0008092170464806259, "ref_logps/chosen": -92.27450561523438, "ref_logps/rejected": -86.73255157470703, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 70.79447937011719, "rewards/margins": 127.83211517333984, "rewards/rejected": -57.037635803222656, "step": 2280, "u": -2.173030376434326, "weight": 0.05628802627325058 }, { "diff_generated": -58.950225830078125, "epoch": 0.7420609202851588, "grad_norm": 507.13162525453004, "learning_rate": 7.482552728269412e-07, "logits/chosen": -2.4813027381896973, "logits/rejected": -2.561781644821167, "logps/chosen": -22.769901275634766, "logps/rejected": -143.92344665527344, "loss": 21.6814, "losses_ref": -0.0013163576368242502, "ref_logps/chosen": -96.60799407958984, "ref_logps/rejected": -84.97322845458984, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 73.83808898925781, "rewards/margins": 132.78831481933594, "rewards/rejected": -58.950225830078125, "step": 2290, "u": -2.1873927116394043, "weight": 0.05006963014602661 }, { "diff_generated": -56.21698760986328, "epoch": 0.7453013609850939, "grad_norm": 480.2308129690591, "learning_rate": 7.475108765312001e-07, "logits/chosen": -2.4525883197784424, "logits/rejected": -2.490872383117676, "logps/chosen": -22.308719635009766, "logps/rejected": -140.81539916992188, "loss": 22.2788, "losses_ref": -2.739727165135264e-07, "ref_logps/chosen": -95.74879455566406, "ref_logps/rejected": -84.59840393066406, "rewards/accuracies": 0.9375, "rewards/chosen": 73.44007110595703, "rewards/margins": 129.6570587158203, "rewards/rejected": -56.21698760986328, "step": 2300, "u": -2.1586735248565674, "weight": 0.0625000074505806 }, { "diff_generated": -56.06154251098633, "epoch": 0.7485418016850292, "grad_norm": 486.9295523674511, "learning_rate": 7.467615397558613e-07, "logits/chosen": -2.464097738265991, "logits/rejected": -2.570708990097046, "logps/chosen": -22.407894134521484, "logps/rejected": -144.00759887695312, "loss": 22.8778, "losses_ref": -0.0002869610325433314, "ref_logps/chosen": -93.29719543457031, "ref_logps/rejected": -87.9460678100586, "rewards/accuracies": 0.9375, "rewards/chosen": 70.88929748535156, "rewards/margins": 126.95084381103516, "rewards/rejected": -56.06154251098633, "step": 2310, "u": -2.158660888671875, "weight": 0.06251401454210281 }, { "diff_generated": -55.63084030151367, "epoch": 0.7517822423849644, "grad_norm": 479.68801719361085, "learning_rate": 7.460072731540676e-07, "logits/chosen": -2.4591574668884277, "logits/rejected": -2.5600147247314453, "logps/chosen": -20.084136962890625, "logps/rejected": -141.91134643554688, "loss": 21.2137, "losses_ref": -1.929281461343635e-05, "ref_logps/chosen": -92.06275177001953, "ref_logps/rejected": -86.28050231933594, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 71.97862243652344, "rewards/margins": 127.60945892333984, "rewards/rejected": -55.63084030151367, "step": 2320, "u": -2.187455415725708, "weight": 0.05000050738453865 }, { "diff_generated": -56.46466064453125, "epoch": 0.7550226830848995, "grad_norm": 476.99868631752895, "learning_rate": 7.452480874490483e-07, "logits/chosen": -2.4715018272399902, "logits/rejected": -2.563992738723755, "logps/chosen": -20.5650691986084, "logps/rejected": -143.99801635742188, "loss": 21.5986, "losses_ref": -0.0008421779493801296, "ref_logps/chosen": -94.32906341552734, "ref_logps/rejected": -87.53335571289062, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 73.76399993896484, "rewards/margins": 130.22866821289062, "rewards/rejected": -56.46466064453125, "step": 2330, "u": -2.173027753829956, "weight": 0.05629073455929756 }, { "diff_generated": -58.54656219482422, "epoch": 0.7582631237848347, "grad_norm": 477.7399059877632, "learning_rate": 7.44483993433966e-07, "logits/chosen": -2.4514431953430176, "logits/rejected": -2.5479302406311035, "logps/chosen": -18.108726501464844, "logps/rejected": -143.81797790527344, "loss": 21.7014, "losses_ref": -0.0013811999233439565, "ref_logps/chosen": -89.15837097167969, "ref_logps/rejected": -85.27140808105469, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 71.04964447021484, "rewards/margins": 129.59622192382812, "rewards/rejected": -58.54656219482422, "step": 2340, "u": -2.1729941368103027, "weight": 0.05632782727479935 }, { "diff_generated": -56.10137939453125, "epoch": 0.76150356448477, "grad_norm": 511.48486644109664, "learning_rate": 7.437150019717641e-07, "logits/chosen": -2.430873394012451, "logits/rejected": -2.5499558448791504, "logps/chosen": -19.955095291137695, "logps/rejected": -138.7136993408203, "loss": 22.672, "losses_ref": -6.416399992303923e-05, "ref_logps/chosen": -91.30572509765625, "ref_logps/rejected": -82.6123046875, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 71.35063171386719, "rewards/margins": 127.4520034790039, "rewards/rejected": -56.10137939453125, "step": 2350, "u": -2.129889965057373, "weight": 0.07500138133764267 }, { "diff_generated": -57.626991271972656, "epoch": 0.7647440051847051, "grad_norm": 481.0871517222036, "learning_rate": 7.429411239950116e-07, "logits/chosen": -2.485440254211426, "logits/rejected": -2.6047866344451904, "logps/chosen": -23.08542251586914, "logps/rejected": -150.21994018554688, "loss": 22.1324, "losses_ref": -0.0004304622416384518, "ref_logps/chosen": -95.6878662109375, "ref_logps/rejected": -92.59294128417969, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 72.6024398803711, "rewards/margins": 130.2294158935547, "rewards/rejected": -57.626991271972656, "step": 2360, "u": -2.1874358654022217, "weight": 0.05002208799123764 }, { "diff_generated": -55.8709716796875, "epoch": 0.7679844458846403, "grad_norm": 457.02298028837765, "learning_rate": 7.421623705057477e-07, "logits/chosen": -2.509002208709717, "logits/rejected": -2.521097183227539, "logps/chosen": -18.914812088012695, "logps/rejected": -140.98692321777344, "loss": 21.34, "losses_ref": -0.0011177074629813433, "ref_logps/chosen": -94.02415466308594, "ref_logps/rejected": -85.115966796875, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 75.10933685302734, "rewards/margins": 130.98031616210938, "rewards/rejected": -55.8709716796875, "step": 2370, "u": -2.1298372745513916, "weight": 0.07505981624126434 }, { "diff_generated": -53.20804977416992, "epoch": 0.7712248865845756, "grad_norm": 503.14428502568614, "learning_rate": 7.413787525753261e-07, "logits/chosen": -2.441809892654419, "logits/rejected": -2.5438599586486816, "logps/chosen": -21.622966766357422, "logps/rejected": -135.86770629882812, "loss": 22.7855, "losses_ref": -0.0025614311452955008, "ref_logps/chosen": -90.37882232666016, "ref_logps/rejected": -82.65966796875, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 68.75585174560547, "rewards/margins": 121.96390533447266, "rewards/rejected": -53.20804977416992, "step": 2380, "u": -2.1010196208953857, "weight": 0.08759871870279312 }, { "diff_generated": -54.82086181640625, "epoch": 0.7744653272845107, "grad_norm": 459.212222917981, "learning_rate": 7.405902813442564e-07, "logits/chosen": -2.4724698066711426, "logits/rejected": -2.5220937728881836, "logps/chosen": -19.182483673095703, "logps/rejected": -139.30343627929688, "loss": 20.8541, "losses_ref": -0.002965776016935706, "ref_logps/chosen": -94.22468566894531, "ref_logps/rejected": -84.48258209228516, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 75.04219055175781, "rewards/margins": 129.86306762695312, "rewards/rejected": -54.82086181640625, "step": 2390, "u": -2.2017018795013428, "weight": 0.043910298496484756 }, { "diff_generated": -57.6229133605957, "epoch": 0.7777057679844459, "grad_norm": 435.42049220403754, "learning_rate": 7.39796968022047e-07, "logits/chosen": -2.416891574859619, "logits/rejected": -2.5111327171325684, "logps/chosen": -19.746204376220703, "logps/rejected": -140.6691131591797, "loss": 20.8716, "losses_ref": -0.004521545954048634, "ref_logps/chosen": -90.78548431396484, "ref_logps/rejected": -83.04621124267578, "rewards/accuracies": 0.96875, "rewards/chosen": 71.03929138183594, "rewards/margins": 128.6621856689453, "rewards/rejected": -57.6229133605957, "step": 2400, "u": -2.230410099029541, "weight": 0.031490933150053024 }, { "diff_generated": -57.4922981262207, "epoch": 0.780946208684381, "grad_norm": 481.804369528505, "learning_rate": 7.389988238870451e-07, "logits/chosen": -2.460920810699463, "logits/rejected": -2.4654626846313477, "logps/chosen": -23.994354248046875, "logps/rejected": -145.25741577148438, "loss": 21.7064, "losses_ref": -0.0070587992668151855, "ref_logps/chosen": -102.72373962402344, "ref_logps/rejected": -87.76512145996094, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 78.72938537597656, "rewards/margins": 136.22169494628906, "rewards/rejected": -57.4922981262207, "step": 2410, "u": -2.215846300125122, "weight": 0.03792215883731842 }, { "diff_generated": -59.77937698364258, "epoch": 0.7841866493843163, "grad_norm": 490.4641157632023, "learning_rate": 7.381958602862763e-07, "logits/chosen": -2.464838743209839, "logits/rejected": -2.5294761657714844, "logps/chosen": -22.61722183227539, "logps/rejected": -148.07510375976562, "loss": 21.6444, "losses_ref": -0.001273915870115161, "ref_logps/chosen": -98.81327056884766, "ref_logps/rejected": -88.29571533203125, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 76.19606018066406, "rewards/margins": 135.97543334960938, "rewards/rejected": -59.77937698364258, "step": 2420, "u": -2.216182231903076, "weight": 0.03756193816661835 }, { "diff_generated": -59.4302978515625, "epoch": 0.7874270900842515, "grad_norm": 516.6666797651599, "learning_rate": 7.373880886352832e-07, "logits/chosen": -2.513746500015259, "logits/rejected": -2.561347484588623, "logps/chosen": -24.95885467529297, "logps/rejected": -146.04820251464844, "loss": 21.7484, "losses_ref": -2.387703261774732e-06, "ref_logps/chosen": -101.25908660888672, "ref_logps/rejected": -86.61790466308594, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 76.30021667480469, "rewards/margins": 135.7305145263672, "rewards/rejected": -59.4302978515625, "step": 2430, "u": -2.216238021850586, "weight": 0.03750010207295418 }, { "diff_generated": -60.237892150878906, "epoch": 0.7906675307841866, "grad_norm": 481.8729871679236, "learning_rate": 7.365755204179637e-07, "logits/chosen": -2.3747448921203613, "logits/rejected": -2.5454201698303223, "logps/chosen": -21.511890411376953, "logps/rejected": -148.35452270507812, "loss": 21.8352, "losses_ref": -0.00043111745617352426, "ref_logps/chosen": -91.5963134765625, "ref_logps/rejected": -88.11661529541016, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 70.08443450927734, "rewards/margins": 130.3223114013672, "rewards/rejected": -60.237892150878906, "step": 2440, "u": -2.201833724975586, "weight": 0.04376457259058952 }, { "diff_generated": -57.28288650512695, "epoch": 0.7939079714841218, "grad_norm": 488.4952904683894, "learning_rate": 7.357581671864073e-07, "logits/chosen": -2.4144272804260254, "logits/rejected": -2.5472493171691895, "logps/chosen": -21.880416870117188, "logps/rejected": -145.05154418945312, "loss": 22.3205, "losses_ref": -0.001727291732095182, "ref_logps/chosen": -95.05560302734375, "ref_logps/rejected": -87.76866149902344, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 73.17517852783203, "rewards/margins": 130.45806884765625, "rewards/rejected": -57.28288650512695, "step": 2450, "u": -2.1730313301086426, "weight": 0.05628693103790283 }, { "diff_generated": -58.206886291503906, "epoch": 0.7971484121840571, "grad_norm": 439.44428631457066, "learning_rate": 7.349360405607303e-07, "logits/chosen": -2.394195079803467, "logits/rejected": -2.5290329456329346, "logps/chosen": -18.200389862060547, "logps/rejected": -139.7240753173828, "loss": 21.1708, "losses_ref": -1.836994329096342e-06, "ref_logps/chosen": -87.65174865722656, "ref_logps/rejected": -81.51720428466797, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 69.45135498046875, "rewards/margins": 127.6582260131836, "rewards/rejected": -58.206886291503906, "step": 2460, "u": -2.101109027862549, "weight": 0.08750007301568985 }, { "diff_generated": -55.12480545043945, "epoch": 0.8003888528839922, "grad_norm": 435.480304925401, "learning_rate": 7.341091522289122e-07, "logits/chosen": -2.5137763023376465, "logits/rejected": -2.5602359771728516, "logps/chosen": -20.748157501220703, "logps/rejected": -138.9665985107422, "loss": 21.1365, "losses_ref": -0.001150214346125722, "ref_logps/chosen": -95.16680908203125, "ref_logps/rejected": -83.841796875, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 74.41864776611328, "rewards/margins": 129.54345703125, "rewards/rejected": -55.12480545043945, "step": 2470, "u": -2.2017948627471924, "weight": 0.043808113783597946 }, { "diff_generated": -59.74702072143555, "epoch": 0.8036292935839274, "grad_norm": 455.6827583126531, "learning_rate": 7.332775139466278e-07, "logits/chosen": -2.542436122894287, "logits/rejected": -2.6427390575408936, "logps/chosen": -20.466482162475586, "logps/rejected": -150.67933654785156, "loss": 21.9027, "losses_ref": -0.001079038018360734, "ref_logps/chosen": -96.73770904541016, "ref_logps/rejected": -90.93230438232422, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 76.27122497558594, "rewards/margins": 136.01824951171875, "rewards/rejected": -59.74702072143555, "step": 2480, "u": -2.2593626976013184, "weight": 0.018804144114255905 }, { "diff_generated": -55.41155242919922, "epoch": 0.8068697342838627, "grad_norm": 458.2025692080539, "learning_rate": 7.324411375370809e-07, "logits/chosen": -2.4522483348846436, "logits/rejected": -2.5456674098968506, "logps/chosen": -21.490245819091797, "logps/rejected": -140.60546875, "loss": 22.3088, "losses_ref": -2.5578192435204983e-05, "ref_logps/chosen": -91.77385711669922, "ref_logps/rejected": -85.19390869140625, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 70.28361511230469, "rewards/margins": 125.6951675415039, "rewards/rejected": -55.41155242919922, "step": 2490, "u": -2.1298909187316895, "weight": 0.07500037550926208 }, { "diff_generated": -56.444732666015625, "epoch": 0.8101101749837978, "grad_norm": 493.372644152006, "learning_rate": 7.316000348908365e-07, "logits/chosen": -2.485949993133545, "logits/rejected": -2.576566457748413, "logps/chosen": -22.575986862182617, "logps/rejected": -140.38726806640625, "loss": 21.7191, "losses_ref": -0.0002884681161958724, "ref_logps/chosen": -92.9170150756836, "ref_logps/rejected": -83.9425277709961, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 70.34103393554688, "rewards/margins": 126.78575134277344, "rewards/rejected": -56.444732666015625, "step": 2500, "u": -2.129878520965576, "weight": 0.0750143826007843 }, { "diff_generated": -57.7220573425293, "epoch": 0.813350615683733, "grad_norm": 507.80135367519387, "learning_rate": 7.307542179656511e-07, "logits/chosen": -2.4682345390319824, "logits/rejected": -2.5501224994659424, "logps/chosen": -21.272693634033203, "logps/rejected": -144.73831176757812, "loss": 21.4688, "losses_ref": -0.0015492306556552649, "ref_logps/chosen": -94.3641357421875, "ref_logps/rejected": -87.01626586914062, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 73.09144592285156, "rewards/margins": 130.81350708007812, "rewards/rejected": -57.7220573425293, "step": 2510, "u": -2.129817485809326, "weight": 0.0750814825296402 }, { "diff_generated": -54.088233947753906, "epoch": 0.8165910563836681, "grad_norm": 478.4146285598937, "learning_rate": 7.29903698786303e-07, "logits/chosen": -2.4700889587402344, "logits/rejected": -2.4862189292907715, "logps/chosen": -22.107303619384766, "logps/rejected": -133.4788055419922, "loss": 21.8233, "losses_ref": -0.0017220573499798775, "ref_logps/chosen": -95.75920104980469, "ref_logps/rejected": -79.39057159423828, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 73.65189361572266, "rewards/margins": 127.74012756347656, "rewards/rejected": -54.088233947753906, "step": 2520, "u": -2.1873762607574463, "weight": 0.05008823797106743 }, { "diff_generated": -53.54823684692383, "epoch": 0.8198314970836034, "grad_norm": 476.36245565519215, "learning_rate": 7.290484894444214e-07, "logits/chosen": -2.410266160964966, "logits/rejected": -2.5156915187835693, "logps/chosen": -19.08709716796875, "logps/rejected": -136.63551330566406, "loss": 20.7257, "losses_ref": -9.177963875117712e-06, "ref_logps/chosen": -86.3339614868164, "ref_logps/rejected": -83.0872802734375, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 67.24686431884766, "rewards/margins": 120.79510498046875, "rewards/rejected": -53.54823684692383, "step": 2530, "u": -2.043544054031372, "weight": 0.11250035464763641 }, { "diff_generated": -55.499168395996094, "epoch": 0.8230719377835386, "grad_norm": 506.77751262247625, "learning_rate": 7.281886020983144e-07, "logits/chosen": -2.460317850112915, "logits/rejected": -2.4854583740234375, "logps/chosen": -23.15581703186035, "logps/rejected": -135.55416870117188, "loss": 20.8908, "losses_ref": -0.00029835925670340657, "ref_logps/chosen": -97.4378433227539, "ref_logps/rejected": -80.05500793457031, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 74.28202056884766, "rewards/margins": 129.7811737060547, "rewards/rejected": -55.499168395996094, "step": 2540, "u": -2.216226577758789, "weight": 0.037513017654418945 }, { "diff_generated": -55.99641036987305, "epoch": 0.8263123784834737, "grad_norm": 513.340136041323, "learning_rate": 7.273240489727963e-07, "logits/chosen": -2.4329352378845215, "logits/rejected": -2.4697928428649902, "logps/chosen": -22.266624450683594, "logps/rejected": -135.0956268310547, "loss": 21.4897, "losses_ref": -0.0011090862099081278, "ref_logps/chosen": -95.71326446533203, "ref_logps/rejected": -79.0992202758789, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 73.44662475585938, "rewards/margins": 129.4430389404297, "rewards/rejected": -55.99641036987305, "step": 2550, "u": -2.144231081008911, "weight": 0.0688069611787796 }, { "diff_generated": -56.29785919189453, "epoch": 0.829552819183409, "grad_norm": 454.9273628876045, "learning_rate": 7.264548423590133e-07, "logits/chosen": -2.4539313316345215, "logits/rejected": -2.537295341491699, "logps/chosen": -21.008182525634766, "logps/rejected": -139.8561248779297, "loss": 21.3602, "losses_ref": -4.210594488540664e-05, "ref_logps/chosen": -93.7306900024414, "ref_logps/rejected": -83.55828857421875, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 72.7225112915039, "rewards/margins": 129.02035522460938, "rewards/rejected": -56.29785919189453, "step": 2560, "u": -2.1730635166168213, "weight": 0.056251466274261475 }, { "diff_generated": -53.686622619628906, "epoch": 0.8327932598833442, "grad_norm": 489.15136824729063, "learning_rate": 7.255809946142695e-07, "logits/chosen": -2.439706325531006, "logits/rejected": -2.485032320022583, "logps/chosen": -22.65837287902832, "logps/rejected": -138.0975799560547, "loss": 22.6904, "losses_ref": -0.00020490979659371078, "ref_logps/chosen": -95.66709899902344, "ref_logps/rejected": -84.41097259521484, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 73.00873565673828, "rewards/margins": 126.6953353881836, "rewards/rejected": -53.686622619628906, "step": 2570, "u": -2.173058271408081, "weight": 0.056257009506225586 }, { "diff_generated": -58.009979248046875, "epoch": 0.8360337005832793, "grad_norm": 490.3665858873195, "learning_rate": 7.247025181618508e-07, "logits/chosen": -2.4753196239471436, "logits/rejected": -2.531113862991333, "logps/chosen": -22.63921356201172, "logps/rejected": -146.94308471679688, "loss": 21.4989, "losses_ref": -7.747672498226166e-05, "ref_logps/chosen": -96.47139739990234, "ref_logps/rejected": -88.93311309814453, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 73.83218383789062, "rewards/margins": 131.8421630859375, "rewards/rejected": -58.009979248046875, "step": 2580, "u": -2.2162346839904785, "weight": 0.0375036746263504 }, { "diff_generated": -56.34900665283203, "epoch": 0.8392741412832145, "grad_norm": 515.5358271111543, "learning_rate": 7.238194254908483e-07, "logits/chosen": -2.4178130626678467, "logits/rejected": -2.4952597618103027, "logps/chosen": -21.742961883544922, "logps/rejected": -139.27444458007812, "loss": 23.3113, "losses_ref": -1.4187762644723989e-05, "ref_logps/chosen": -93.23844909667969, "ref_logps/rejected": -82.92540740966797, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 71.49549102783203, "rewards/margins": 127.84449768066406, "rewards/rejected": -56.34900665283203, "step": 2590, "u": -2.1730639934539795, "weight": 0.056250639259815216 }, { "diff_generated": -56.2597541809082, "epoch": 0.8425145819831497, "grad_norm": 433.46724920592413, "learning_rate": 7.229317291559807e-07, "logits/chosen": -2.451038360595703, "logits/rejected": -2.5517618656158447, "logps/chosen": -22.46918487548828, "logps/rejected": -142.5192108154297, "loss": 21.5436, "losses_ref": -0.026391511783003807, "ref_logps/chosen": -93.1564712524414, "ref_logps/rejected": -86.25945281982422, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 70.68728637695312, "rewards/margins": 126.94703674316406, "rewards/rejected": -56.2597541809082, "step": 2600, "u": -2.2014243602752686, "weight": 0.044207897037267685 }, { "diff_generated": -55.4771842956543, "epoch": 0.8457550226830849, "grad_norm": 483.12958162627, "learning_rate": 7.22039441777416e-07, "logits/chosen": -2.425280809402466, "logits/rejected": -2.537978410720825, "logps/chosen": -20.165245056152344, "logps/rejected": -140.473388671875, "loss": 22.1426, "losses_ref": -0.012793747708201408, "ref_logps/chosen": -91.35377502441406, "ref_logps/rejected": -84.99620056152344, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 71.18852233886719, "rewards/margins": 126.66572570800781, "rewards/rejected": -55.4771842956543, "step": 2610, "u": -2.143904447555542, "weight": 0.0691661387681961 }, { "diff_generated": -58.96485137939453, "epoch": 0.8489954633830201, "grad_norm": 462.2183901664622, "learning_rate": 7.21142576040592e-07, "logits/chosen": -2.4881858825683594, "logits/rejected": -2.5900299549102783, "logps/chosen": -23.751340866088867, "logps/rejected": -145.04563903808594, "loss": 21.3726, "losses_ref": -7.155739467634703e-07, "ref_logps/chosen": -99.04013061523438, "ref_logps/rejected": -86.08078002929688, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 75.28878784179688, "rewards/margins": 134.25363159179688, "rewards/rejected": -58.96485137939453, "step": 2620, "u": -2.216238260269165, "weight": 0.03750001639127731 }, { "diff_generated": -61.1406364440918, "epoch": 0.8522359040829552, "grad_norm": 464.8238044329431, "learning_rate": 7.202411446960357e-07, "logits/chosen": -2.4685490131378174, "logits/rejected": -2.5211329460144043, "logps/chosen": -23.971097946166992, "logps/rejected": -151.80685424804688, "loss": 21.7248, "losses_ref": -0.0010434570722281933, "ref_logps/chosen": -99.79468536376953, "ref_logps/rejected": -90.6662368774414, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 75.82359313964844, "rewards/margins": 136.96421813964844, "rewards/rejected": -61.1406364440918, "step": 2630, "u": -2.259364604949951, "weight": 0.018802126869559288 }, { "diff_generated": -60.53166961669922, "epoch": 0.8554763447828905, "grad_norm": 458.67165860521243, "learning_rate": 7.193351605591825e-07, "logits/chosen": -2.470578193664551, "logits/rejected": -2.582828998565674, "logps/chosen": -20.103727340698242, "logps/rejected": -145.3340606689453, "loss": 20.5432, "losses_ref": -0.0011830010917037725, "ref_logps/chosen": -90.42179870605469, "ref_logps/rejected": -84.8023681640625, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 70.31806945800781, "rewards/margins": 130.84974670410156, "rewards/rejected": -60.53166961669922, "step": 2640, "u": -2.1730074882507324, "weight": 0.05631326511502266 }, { "diff_generated": -58.205894470214844, "epoch": 0.8587167854828257, "grad_norm": 495.1440294686424, "learning_rate": 7.184246365101939e-07, "logits/chosen": -2.5101726055145264, "logits/rejected": -2.5068435668945312, "logps/chosen": -23.52204704284668, "logps/rejected": -146.87620544433594, "loss": 22.7303, "losses_ref": -6.494811714219395e-06, "ref_logps/chosen": -101.6772689819336, "ref_logps/rejected": -88.6703109741211, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 78.15522766113281, "rewards/margins": 136.36111450195312, "rewards/rejected": -58.205894470214844, "step": 2650, "u": -2.216237783432007, "weight": 0.037500280886888504 }, { "diff_generated": -61.2960205078125, "epoch": 0.8619572261827608, "grad_norm": 467.0051884894631, "learning_rate": 7.175095854937739e-07, "logits/chosen": -2.463174819946289, "logits/rejected": -2.5186073780059814, "logps/chosen": -21.171960830688477, "logps/rejected": -148.35946655273438, "loss": 22.1944, "losses_ref": -1.2179619091057248e-07, "ref_logps/chosen": -98.50288391113281, "ref_logps/rejected": -87.06343841552734, "rewards/accuracies": 0.96875, "rewards/chosen": 77.33091735839844, "rewards/margins": 138.62693786621094, "rewards/rejected": -61.2960205078125, "step": 2660, "u": -2.2306292057037354, "weight": 0.03125 }, { "diff_generated": -57.64461135864258, "epoch": 0.8651976668826961, "grad_norm": 474.22303186662896, "learning_rate": 7.165900205189853e-07, "logits/chosen": -2.464472532272339, "logits/rejected": -2.574596643447876, "logps/chosen": -20.4440860748291, "logps/rejected": -145.9555206298828, "loss": 20.8537, "losses_ref": -3.338614718018107e-08, "ref_logps/chosen": -92.37683868408203, "ref_logps/rejected": -88.3109130859375, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 71.93275451660156, "rewards/margins": 129.57736206054688, "rewards/rejected": -57.64461135864258, "step": 2670, "u": -2.173064708709717, "weight": 0.05624999850988388 }, { "diff_generated": -59.3572998046875, "epoch": 0.8684381075826313, "grad_norm": 441.31149418382375, "learning_rate": 7.156659546590653e-07, "logits/chosen": -2.404878616333008, "logits/rejected": -2.4967124462127686, "logps/chosen": -19.269577026367188, "logps/rejected": -144.28993225097656, "loss": 21.3004, "losses_ref": -6.453227001657069e-07, "ref_logps/chosen": -93.02782440185547, "ref_logps/rejected": -84.93263244628906, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 73.75823974609375, "rewards/margins": 133.1155548095703, "rewards/rejected": -59.3572998046875, "step": 2680, "u": -2.2018470764160156, "weight": 0.043750010430812836 }, { "diff_generated": -55.28821563720703, "epoch": 0.8716785482825664, "grad_norm": 463.8652831186927, "learning_rate": 7.147374010512385e-07, "logits/chosen": -2.3714890480041504, "logits/rejected": -2.4094130992889404, "logps/chosen": -19.746137619018555, "logps/rejected": -135.70211791992188, "loss": 20.9851, "losses_ref": -0.0011874515330418944, "ref_logps/chosen": -89.21805572509766, "ref_logps/rejected": -80.41390228271484, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 69.47191619873047, "rewards/margins": 124.7601318359375, "rewards/rejected": -55.28821563720703, "step": 2690, "u": -2.1010565757751465, "weight": 0.08755816519260406 }, { "diff_generated": -58.68421173095703, "epoch": 0.8749189889825016, "grad_norm": 461.6775648644702, "learning_rate": 7.13804372896531e-07, "logits/chosen": -2.3763599395751953, "logits/rejected": -2.5035808086395264, "logps/chosen": -20.23203468322754, "logps/rejected": -143.46829223632812, "loss": 21.6863, "losses_ref": -0.0015272090677171946, "ref_logps/chosen": -90.75230407714844, "ref_logps/rejected": -84.78406524658203, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 70.52027893066406, "rewards/margins": 129.20448303222656, "rewards/rejected": -58.68421173095703, "step": 2700, "u": -2.172990322113037, "weight": 0.05633222311735153 }, { "diff_generated": -58.122169494628906, "epoch": 0.8781594296824368, "grad_norm": 453.822172883697, "learning_rate": 7.128668834595827e-07, "logits/chosen": -2.4922752380371094, "logits/rejected": -2.5195541381835938, "logps/chosen": -23.63937759399414, "logps/rejected": -143.8534393310547, "loss": 20.6594, "losses_ref": -0.004180192481726408, "ref_logps/chosen": -97.52043151855469, "ref_logps/rejected": -85.73128509521484, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 73.88105773925781, "rewards/margins": 132.00320434570312, "rewards/rejected": -58.122169494628906, "step": 2710, "u": -2.172847270965576, "weight": 0.05648912116885185 }, { "diff_generated": -56.95354461669922, "epoch": 0.881399870382372, "grad_norm": 492.4355541276597, "learning_rate": 7.119249460684583e-07, "logits/chosen": -2.406707286834717, "logits/rejected": -2.4368181228637695, "logps/chosen": -23.046157836914062, "logps/rejected": -138.7194366455078, "loss": 22.6015, "losses_ref": -8.472305125906132e-06, "ref_logps/chosen": -96.1999740600586, "ref_logps/rejected": -81.76588439941406, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 73.15381622314453, "rewards/margins": 130.10736083984375, "rewards/rejected": -56.95354461669922, "step": 2720, "u": -2.216238021850586, "weight": 0.03750025853514671 }, { "diff_generated": -57.65205001831055, "epoch": 0.8846403110823072, "grad_norm": 439.5878754716779, "learning_rate": 7.109785741144577e-07, "logits/chosen": -2.37678861618042, "logits/rejected": -2.513676881790161, "logps/chosen": -22.001630783081055, "logps/rejected": -144.2527313232422, "loss": 21.2921, "losses_ref": -7.83679115556879e-06, "ref_logps/chosen": -90.35001373291016, "ref_logps/rejected": -86.60069274902344, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 68.34837341308594, "rewards/margins": 126.00044250488281, "rewards/rejected": -57.65205001831055, "step": 2730, "u": -2.2018468379974365, "weight": 0.04375031590461731 }, { "diff_generated": -58.04121017456055, "epoch": 0.8878807517822424, "grad_norm": 434.4572641426545, "learning_rate": 7.100277810519264e-07, "logits/chosen": -2.459519863128662, "logits/rejected": -2.5082592964172363, "logps/chosen": -21.73441505432129, "logps/rejected": -146.6686248779297, "loss": 21.1753, "losses_ref": -2.30889941121859e-06, "ref_logps/chosen": -97.97511291503906, "ref_logps/rejected": -88.6274185180664, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 76.2406997680664, "rewards/margins": 134.28189086914062, "rewards/rejected": -58.04121017456055, "step": 2740, "u": -2.2018468379974365, "weight": 0.04375004023313522 }, { "diff_generated": -55.9808235168457, "epoch": 0.8911211924821776, "grad_norm": 437.56983692274633, "learning_rate": 7.090725803980633e-07, "logits/chosen": -2.398206949234009, "logits/rejected": -2.5035288333892822, "logps/chosen": -20.635684967041016, "logps/rejected": -139.47872924804688, "loss": 21.8344, "losses_ref": -2.7282279916107655e-05, "ref_logps/chosen": -90.36199951171875, "ref_logps/rejected": -83.49790954589844, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 69.726318359375, "rewards/margins": 125.70713806152344, "rewards/rejected": -55.9808235168457, "step": 2750, "u": -2.1442816257476807, "weight": 0.06875090301036835 }, { "diff_generated": -58.962196350097656, "epoch": 0.8943616331821128, "grad_norm": 453.82844426997264, "learning_rate": 7.081129857327297e-07, "logits/chosen": -2.4433138370513916, "logits/rejected": -2.53584623336792, "logps/chosen": -20.66114616394043, "logps/rejected": -144.21824645996094, "loss": 22.4838, "losses_ref": -0.00010804003977682441, "ref_logps/chosen": -96.87528991699219, "ref_logps/rejected": -85.25605010986328, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 76.21414184570312, "rewards/margins": 135.17633056640625, "rewards/rejected": -58.962196350097656, "step": 2760, "u": -2.1874513626098633, "weight": 0.05000500753521919 }, { "diff_generated": -56.82305908203125, "epoch": 0.8976020738820479, "grad_norm": 492.44022233311415, "learning_rate": 7.071490106982547e-07, "logits/chosen": -2.4296534061431885, "logits/rejected": -2.48490571975708, "logps/chosen": -22.398880004882812, "logps/rejected": -142.8795928955078, "loss": 22.0816, "losses_ref": -5.544167152038426e-07, "ref_logps/chosen": -96.34947967529297, "ref_logps/rejected": -86.05652618408203, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 73.95059967041016, "rewards/margins": 130.77365112304688, "rewards/rejected": -56.82305908203125, "step": 2770, "u": -2.1298911571502686, "weight": 0.07500001043081284 }, { "diff_generated": -60.1945915222168, "epoch": 0.9008425145819832, "grad_norm": 501.50198429090375, "learning_rate": 7.061806689992424e-07, "logits/chosen": -2.401369333267212, "logits/rejected": -2.4693374633789062, "logps/chosen": -21.959651947021484, "logps/rejected": -145.50927734375, "loss": 20.9386, "losses_ref": -1.3287276487972122e-05, "ref_logps/chosen": -94.9290771484375, "ref_logps/rejected": -85.31468200683594, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 72.96942138671875, "rewards/margins": 133.16403198242188, "rewards/rejected": -60.1945915222168, "step": 2780, "u": -2.1730642318725586, "weight": 0.056250639259815216 }, { "diff_generated": -61.94187545776367, "epoch": 0.9040829552819183, "grad_norm": 474.9848841951121, "learning_rate": 7.052079744023769e-07, "logits/chosen": -2.554199695587158, "logits/rejected": -2.5834202766418457, "logps/chosen": -25.187421798706055, "logps/rejected": -149.35482788085938, "loss": 21.6333, "losses_ref": -6.639597268076614e-05, "ref_logps/chosen": -104.10478210449219, "ref_logps/rejected": -87.41294860839844, "rewards/accuracies": 0.96875, "rewards/chosen": 78.9173583984375, "rewards/margins": 140.85923767089844, "rewards/rejected": -61.94187545776367, "step": 2790, "u": -2.2306275367736816, "weight": 0.03125188127160072 }, { "diff_generated": -59.27500534057617, "epoch": 0.9073233959818535, "grad_norm": 479.4450117078786, "learning_rate": 7.042309407362264e-07, "logits/chosen": -2.416618824005127, "logits/rejected": -2.5326590538024902, "logps/chosen": -19.899805068969727, "logps/rejected": -146.05563354492188, "loss": 21.8238, "losses_ref": -0.0020984322763979435, "ref_logps/chosen": -95.49771118164062, "ref_logps/rejected": -86.78062438964844, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 75.59790802001953, "rewards/margins": 134.87290954589844, "rewards/rejected": -59.27500534057617, "step": 2800, "u": -2.2017922401428223, "weight": 0.04381098598241806 }, { "diff_generated": -58.58484649658203, "epoch": 0.9105638366817888, "grad_norm": 464.62151462477925, "learning_rate": 7.032495818910462e-07, "logits/chosen": -2.4766998291015625, "logits/rejected": -2.5218327045440674, "logps/chosen": -19.523643493652344, "logps/rejected": -144.54153442382812, "loss": 21.0012, "losses_ref": -1.2667987903114408e-06, "ref_logps/chosen": -92.96281433105469, "ref_logps/rejected": -85.9566879272461, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 73.43919372558594, "rewards/margins": 132.02401733398438, "rewards/rejected": -58.58484649658203, "step": 2810, "u": -2.1298911571502686, "weight": 0.07500003278255463 }, { "diff_generated": -59.304412841796875, "epoch": 0.9138042773817239, "grad_norm": 481.66422343005866, "learning_rate": 7.022639118185819e-07, "logits/chosen": -2.4660236835479736, "logits/rejected": -2.4848055839538574, "logps/chosen": -22.82510757446289, "logps/rejected": -146.7274932861328, "loss": 21.1632, "losses_ref": -0.00026902236277237535, "ref_logps/chosen": -100.3491439819336, "ref_logps/rejected": -87.4230728149414, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 77.52403259277344, "rewards/margins": 136.82846069335938, "rewards/rejected": -59.304412841796875, "step": 2820, "u": -2.187443494796753, "weight": 0.05001381039619446 }, { "diff_generated": -61.32807540893555, "epoch": 0.9170447180816591, "grad_norm": 447.16863172300367, "learning_rate": 7.012739445318712e-07, "logits/chosen": -2.4933276176452637, "logits/rejected": -2.5432355403900146, "logps/chosen": -22.36863899230957, "logps/rejected": -148.42919921875, "loss": 21.3078, "losses_ref": -0.0005821407539770007, "ref_logps/chosen": -98.70513916015625, "ref_logps/rejected": -87.10112762451172, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 76.33650207519531, "rewards/margins": 137.66458129882812, "rewards/rejected": -61.32807540893555, "step": 2830, "u": -2.2162117958068848, "weight": 0.03752908110618591 }, { "diff_generated": -57.458465576171875, "epoch": 0.9202851587815943, "grad_norm": 468.00606440020357, "learning_rate": 7.002796941050435e-07, "logits/chosen": -2.468254327774048, "logits/rejected": -2.5315186977386475, "logps/chosen": -20.7177677154541, "logps/rejected": -147.50428771972656, "loss": 22.576, "losses_ref": -0.0009493259713053703, "ref_logps/chosen": -93.41374206542969, "ref_logps/rejected": -90.04581451416016, "rewards/accuracies": 0.9375, "rewards/chosen": 72.69596862792969, "rewards/margins": 130.15443420410156, "rewards/rejected": -57.458465576171875, "step": 2840, "u": -2.1586289405822754, "weight": 0.0625494197010994 }, { "diff_generated": -60.067726135253906, "epoch": 0.9235255994815295, "grad_norm": 487.44319674950174, "learning_rate": 6.992811746731213e-07, "logits/chosen": -2.475463390350342, "logits/rejected": -2.5366272926330566, "logps/chosen": -23.523868560791016, "logps/rejected": -147.427490234375, "loss": 22.1732, "losses_ref": -3.6845955037279055e-05, "ref_logps/chosen": -98.42115783691406, "ref_logps/rejected": -87.35975646972656, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 74.89728546142578, "rewards/margins": 134.9650115966797, "rewards/rejected": -60.067726135253906, "step": 2850, "u": -2.201845645904541, "weight": 0.04375119134783745 }, { "diff_generated": -61.67189407348633, "epoch": 0.9267660401814647, "grad_norm": 454.104891943021, "learning_rate": 6.98278400431818e-07, "logits/chosen": -2.506474733352661, "logits/rejected": -2.596954345703125, "logps/chosen": -23.808429718017578, "logps/rejected": -153.27609252929688, "loss": 22.1171, "losses_ref": -0.0010014523286372423, "ref_logps/chosen": -99.9022445678711, "ref_logps/rejected": -91.60420989990234, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 76.09381866455078, "rewards/margins": 137.76571655273438, "rewards/rejected": -61.67189407348633, "step": 2860, "u": -2.2449753284454346, "weight": 0.025050124153494835 }, { "diff_generated": -61.183860778808594, "epoch": 0.9300064808813999, "grad_norm": 430.2836400141675, "learning_rate": 6.972713856373369e-07, "logits/chosen": -2.499459981918335, "logits/rejected": -2.5934641361236572, "logps/chosen": -21.586721420288086, "logps/rejected": -149.8661651611328, "loss": 21.4534, "losses_ref": -3.025634896403062e-06, "ref_logps/chosen": -95.02459716796875, "ref_logps/rejected": -88.68229675292969, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 73.43787384033203, "rewards/margins": 134.62173461914062, "rewards/rejected": -61.183860778808594, "step": 2870, "u": -2.2450203895568848, "weight": 0.025000056251883507 }, { "diff_generated": -61.021156311035156, "epoch": 0.933246921581335, "grad_norm": 435.91054480995274, "learning_rate": 6.962601446061681e-07, "logits/chosen": -2.4959959983825684, "logits/rejected": -2.494209051132202, "logps/chosen": -21.066835403442383, "logps/rejected": -142.05026245117188, "loss": 21.0805, "losses_ref": -0.010031198151409626, "ref_logps/chosen": -96.79750061035156, "ref_logps/rejected": -81.02911376953125, "rewards/accuracies": 0.96875, "rewards/chosen": 75.73066711425781, "rewards/margins": 136.75180053710938, "rewards/rejected": -61.021156311035156, "step": 2880, "u": -2.230487823486328, "weight": 0.03140627592802048 }, { "diff_generated": -61.153526306152344, "epoch": 0.9364873622812703, "grad_norm": 467.78890629943817, "learning_rate": 6.952446917148853e-07, "logits/chosen": -2.469348669052124, "logits/rejected": -2.5688066482543945, "logps/chosen": -21.54801368713379, "logps/rejected": -149.8988494873047, "loss": 21.4278, "losses_ref": -0.005234680138528347, "ref_logps/chosen": -99.34354400634766, "ref_logps/rejected": -88.74533081054688, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 77.79553985595703, "rewards/margins": 138.94906616210938, "rewards/rejected": -61.153526306152344, "step": 2890, "u": -2.244741201400757, "weight": 0.025303319096565247 }, { "diff_generated": -62.5263671875, "epoch": 0.9397278029812054, "grad_norm": 468.65833498464394, "learning_rate": 6.94225041399941e-07, "logits/chosen": -2.473790407180786, "logits/rejected": -2.6006760597229004, "logps/chosen": -20.871822357177734, "logps/rejected": -153.6435546875, "loss": 20.2554, "losses_ref": -0.0003484871704131365, "ref_logps/chosen": -96.58824157714844, "ref_logps/rejected": -91.11719512939453, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 75.71641540527344, "rewards/margins": 138.2427978515625, "rewards/rejected": -62.5263671875, "step": 2900, "u": -2.1730494499206543, "weight": 0.056267015635967255 }, { "diff_generated": -59.31611251831055, "epoch": 0.9429682436811406, "grad_norm": 418.6116303978115, "learning_rate": 6.932012081574615e-07, "logits/chosen": -2.4939768314361572, "logits/rejected": -2.547677516937256, "logps/chosen": -21.716999053955078, "logps/rejected": -146.97817993164062, "loss": 21.6353, "losses_ref": -4.112573606107617e-07, "ref_logps/chosen": -93.69564056396484, "ref_logps/rejected": -87.66205596923828, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 71.97864532470703, "rewards/margins": 131.2947540283203, "rewards/rejected": -59.31611251831055, "step": 2910, "u": -2.2018468379974365, "weight": 0.043750010430812836 }, { "diff_generated": -57.04352951049805, "epoch": 0.9462086843810759, "grad_norm": 429.2864831942657, "learning_rate": 6.921732065430411e-07, "logits/chosen": -2.433727741241455, "logits/rejected": -2.556497573852539, "logps/chosen": -18.58115005493164, "logps/rejected": -142.70858764648438, "loss": 20.421, "losses_ref": -8.253007877101481e-07, "ref_logps/chosen": -90.64105987548828, "ref_logps/rejected": -85.66506958007812, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 72.0599136352539, "rewards/margins": 129.1034393310547, "rewards/rejected": -57.04352951049805, "step": 2920, "u": -2.1298911571502686, "weight": 0.07500003278255463 }, { "diff_generated": -56.113372802734375, "epoch": 0.949449125081011, "grad_norm": 448.6438982921234, "learning_rate": 6.911410511715343e-07, "logits/chosen": -2.455838441848755, "logits/rejected": -2.5022530555725098, "logps/chosen": -21.27231216430664, "logps/rejected": -138.38426208496094, "loss": 20.6897, "losses_ref": -5.750478521804325e-05, "ref_logps/chosen": -93.20436096191406, "ref_logps/rejected": -82.27088165283203, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 71.93205261230469, "rewards/margins": 128.04544067382812, "rewards/rejected": -56.113372802734375, "step": 2930, "u": -2.1298890113830566, "weight": 0.07500265538692474 }, { "diff_generated": -58.04365158081055, "epoch": 0.9526895657809462, "grad_norm": 490.29397554218724, "learning_rate": 6.901047567168491e-07, "logits/chosen": -2.5180070400238037, "logits/rejected": -2.5559816360473633, "logps/chosen": -21.59964370727539, "logps/rejected": -142.6434783935547, "loss": 21.3082, "losses_ref": -0.0004614538047462702, "ref_logps/chosen": -95.13563537597656, "ref_logps/rejected": -84.59981536865234, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 73.53599548339844, "rewards/margins": 131.5796356201172, "rewards/rejected": -58.04365158081055, "step": 2940, "u": -2.1298718452453613, "weight": 0.0750214010477066 }, { "diff_generated": -61.00908279418945, "epoch": 0.9559300064808814, "grad_norm": 487.9778393083608, "learning_rate": 6.890643379117374e-07, "logits/chosen": -2.5026142597198486, "logits/rejected": -2.5429794788360596, "logps/chosen": -21.640348434448242, "logps/rejected": -151.98902893066406, "loss": 20.7525, "losses_ref": -0.00010877321619773284, "ref_logps/chosen": -100.30354309082031, "ref_logps/rejected": -90.97994232177734, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 78.6631851196289, "rewards/margins": 139.67227172851562, "rewards/rejected": -61.00908279418945, "step": 2950, "u": -2.1730599403381348, "weight": 0.05625521019101143 }, { "diff_generated": -60.049903869628906, "epoch": 0.9591704471808166, "grad_norm": 475.56335002534007, "learning_rate": 6.880198095475866e-07, "logits/chosen": -2.5132415294647217, "logits/rejected": -2.5256381034851074, "logps/chosen": -25.432048797607422, "logps/rejected": -147.38023376464844, "loss": 21.5488, "losses_ref": -3.1123508961172774e-06, "ref_logps/chosen": -105.21751403808594, "ref_logps/rejected": -87.3303451538086, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 79.78547668457031, "rewards/margins": 139.83535766601562, "rewards/rejected": -60.049903869628906, "step": 2960, "u": -2.216238021850586, "weight": 0.037500131875276566 }, { "diff_generated": -54.8358039855957, "epoch": 0.9624108878807518, "grad_norm": 486.02730600965197, "learning_rate": 6.86971186474208e-07, "logits/chosen": -2.4802966117858887, "logits/rejected": -2.5337626934051514, "logps/chosen": -21.163768768310547, "logps/rejected": -135.5006561279297, "loss": 22.2722, "losses_ref": -0.000891751900780946, "ref_logps/chosen": -92.32938385009766, "ref_logps/rejected": -80.66484832763672, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 71.16561889648438, "rewards/margins": 126.00142669677734, "rewards/rejected": -54.8358039855957, "step": 2970, "u": -2.1730198860168457, "weight": 0.05629971623420715 }, { "diff_generated": -59.64556884765625, "epoch": 0.9656513285806869, "grad_norm": 488.1934193333916, "learning_rate": 6.859184835996271e-07, "logits/chosen": -2.4701828956604004, "logits/rejected": -2.586439371109009, "logps/chosen": -19.97806739807129, "logps/rejected": -148.86654663085938, "loss": 21.1577, "losses_ref": -2.3398897610604763e-06, "ref_logps/chosen": -94.80010223388672, "ref_logps/rejected": -89.22097778320312, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 74.82203674316406, "rewards/margins": 134.46759033203125, "rewards/rejected": -59.64556884765625, "step": 2980, "u": -2.2018468379974365, "weight": 0.04375002905726433 }, { "diff_generated": -61.30529022216797, "epoch": 0.9688917692806222, "grad_norm": 459.6357752841041, "learning_rate": 6.848617158898704e-07, "logits/chosen": -2.45888614654541, "logits/rejected": -2.5506579875946045, "logps/chosen": -18.167362213134766, "logps/rejected": -150.75900268554688, "loss": 20.1378, "losses_ref": -0.0018867189064621925, "ref_logps/chosen": -90.96756744384766, "ref_logps/rejected": -89.4537124633789, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 72.8001937866211, "rewards/margins": 134.10548400878906, "rewards/rejected": -61.30529022216797, "step": 2990, "u": -2.2018001079559326, "weight": 0.04380171000957489 }, { "diff_generated": -61.82233428955078, "epoch": 0.9721322099805574, "grad_norm": 465.8121795887825, "learning_rate": 6.838008983687538e-07, "logits/chosen": -2.4415152072906494, "logits/rejected": -2.540574789047241, "logps/chosen": -19.56886863708496, "logps/rejected": -156.75039672851562, "loss": 20.7824, "losses_ref": -0.0007329249056056142, "ref_logps/chosen": -94.28569030761719, "ref_logps/rejected": -94.92806243896484, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 74.71682739257812, "rewards/margins": 136.53915405273438, "rewards/rejected": -61.82233428955078, "step": 3000, "u": -2.2162041664123535, "weight": 0.03753752261400223 }, { "diff_generated": -61.789337158203125, "epoch": 0.9753726506804925, "grad_norm": 482.63406855650675, "learning_rate": 6.827360461176675e-07, "logits/chosen": -2.463770627975464, "logits/rejected": -2.5621752738952637, "logps/chosen": -22.280914306640625, "logps/rejected": -148.77462768554688, "loss": 22.1914, "losses_ref": -7.3342125688213855e-06, "ref_logps/chosen": -93.60955047607422, "ref_logps/rejected": -86.98530578613281, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 71.32862854003906, "rewards/margins": 133.11798095703125, "rewards/rejected": -61.789337158203125, "step": 3010, "u": -2.187455654144287, "weight": 0.05000026151537895 }, { "diff_generated": -58.69233322143555, "epoch": 0.9786130913804277, "grad_norm": 465.3136564602363, "learning_rate": 6.816671742753636e-07, "logits/chosen": -2.453428030014038, "logits/rejected": -2.5174663066864014, "logps/chosen": -22.349462509155273, "logps/rejected": -144.7392120361328, "loss": 20.2201, "losses_ref": -0.0022303853183984756, "ref_logps/chosen": -93.95580291748047, "ref_logps/rejected": -86.046875, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 71.60633087158203, "rewards/margins": 130.29867553710938, "rewards/rejected": -58.69233322143555, "step": 3020, "u": -2.1297779083251953, "weight": 0.07512475550174713 }, { "diff_generated": -58.20878982543945, "epoch": 0.981853532080363, "grad_norm": 452.30032153383627, "learning_rate": 6.80594298037739e-07, "logits/chosen": -2.4573230743408203, "logits/rejected": -2.522925615310669, "logps/chosen": -20.73061752319336, "logps/rejected": -147.1402130126953, "loss": 21.9122, "losses_ref": -0.0012742785038426518, "ref_logps/chosen": -94.61383056640625, "ref_logps/rejected": -88.93141174316406, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 73.88321685791016, "rewards/margins": 132.09201049804688, "rewards/rejected": -58.20878982543945, "step": 3030, "u": -2.187399387359619, "weight": 0.05006258562207222 }, { "diff_generated": -59.879798889160156, "epoch": 0.9850939727802981, "grad_norm": 449.0923495215593, "learning_rate": 6.795174326576201e-07, "logits/chosen": -2.517982006072998, "logits/rejected": -2.576937437057495, "logps/chosen": -21.717044830322266, "logps/rejected": -146.72311401367188, "loss": 20.9081, "losses_ref": -1.8586888472782448e-06, "ref_logps/chosen": -97.45092010498047, "ref_logps/rejected": -86.84330749511719, "rewards/accuracies": 0.96875, "rewards/chosen": 75.73387145996094, "rewards/margins": 135.61367797851562, "rewards/rejected": -59.879798889160156, "step": 3040, "u": -2.2306292057037354, "weight": 0.03125004842877388 }, { "diff_generated": -58.180763244628906, "epoch": 0.9883344134802333, "grad_norm": 464.7999063633373, "learning_rate": 6.784365934445467e-07, "logits/chosen": -2.4162662029266357, "logits/rejected": -2.5505805015563965, "logps/chosen": -19.611560821533203, "logps/rejected": -144.66844177246094, "loss": 21.0366, "losses_ref": -0.0008930475451052189, "ref_logps/chosen": -89.82169342041016, "ref_logps/rejected": -86.48768615722656, "rewards/accuracies": 0.9375, "rewards/chosen": 70.21012878417969, "rewards/margins": 128.39089965820312, "rewards/rejected": -58.180763244628906, "step": 3050, "u": -2.1586310863494873, "weight": 0.06254696100950241 }, { "diff_generated": -58.33638381958008, "epoch": 0.9915748541801686, "grad_norm": 472.00855214714693, "learning_rate": 6.77351795764553e-07, "logits/chosen": -2.5259509086608887, "logits/rejected": -2.5823984146118164, "logps/chosen": -20.90003776550293, "logps/rejected": -148.71282958984375, "loss": 21.1491, "losses_ref": -0.0017345917876809835, "ref_logps/chosen": -100.18885803222656, "ref_logps/rejected": -90.37644958496094, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 79.288818359375, "rewards/margins": 137.6251983642578, "rewards/rejected": -58.33638381958008, "step": 3060, "u": -2.2161552906036377, "weight": 0.037591852247714996 }, { "diff_generated": -56.86510467529297, "epoch": 0.9948152948801037, "grad_norm": 462.1652750726531, "learning_rate": 6.7626305503995e-07, "logits/chosen": -2.4363324642181396, "logits/rejected": -2.5213849544525146, "logps/chosen": -21.539684295654297, "logps/rejected": -139.07391357421875, "loss": 20.7354, "losses_ref": -0.0002831167366821319, "ref_logps/chosen": -93.53211975097656, "ref_logps/rejected": -82.20880889892578, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 71.99244689941406, "rewards/margins": 128.8575439453125, "rewards/rejected": -56.86510467529297, "step": 3070, "u": -2.2018346786499023, "weight": 0.04376371577382088 }, { "diff_generated": -62.3128662109375, "epoch": 0.9980557355800389, "grad_norm": 459.85247368799446, "learning_rate": 6.75170386749106e-07, "logits/chosen": -2.4534130096435547, "logits/rejected": -2.5522007942199707, "logps/chosen": -22.608396530151367, "logps/rejected": -157.0138397216797, "loss": 20.5769, "losses_ref": -0.00012439176498446614, "ref_logps/chosen": -98.05714416503906, "ref_logps/rejected": -94.70097351074219, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 75.4487533569336, "rewards/margins": 137.76162719726562, "rewards/rejected": -62.3128662109375, "step": 3080, "u": -2.28818941116333, "weight": 0.0062549663707613945 }, { "diff_generated": -59.61762619018555, "epoch": 1.0012961762799741, "grad_norm": 451.9635942553253, "learning_rate": 6.740738064262265e-07, "logits/chosen": -2.4870359897613525, "logits/rejected": -2.5867018699645996, "logps/chosen": -19.341670989990234, "logps/rejected": -147.53176879882812, "loss": 19.2801, "losses_ref": -0.0015698724891990423, "ref_logps/chosen": -93.33935546875, "ref_logps/rejected": -87.9141616821289, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 73.99769592285156, "rewards/margins": 133.61532592773438, "rewards/rejected": -59.61762619018555, "step": 3090, "u": -3.090423345565796, "weight": 0.04383974149823189 }, { "diff_generated": -66.58012390136719, "epoch": 1.0045366169799093, "grad_norm": 462.3154034742135, "learning_rate": 6.729733296611336e-07, "logits/chosen": -2.510164260864258, "logits/rejected": -2.58900785446167, "logps/chosen": -16.40743064880371, "logps/rejected": -154.39218139648438, "loss": 17.2317, "losses_ref": -0.00252619874663651, "ref_logps/chosen": -95.60277557373047, "ref_logps/rejected": -87.81204223632812, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 79.19535064697266, "rewards/margins": 145.77548217773438, "rewards/rejected": -66.58012390136719, "step": 3100, "u": -4.446268558502197, "weight": 0.025122780352830887 }, { "diff_generated": -66.8081283569336, "epoch": 1.0077770576798444, "grad_norm": 495.00120559126685, "learning_rate": 6.718689720990442e-07, "logits/chosen": -2.4792261123657227, "logits/rejected": -2.572613000869751, "logps/chosen": -16.952396392822266, "logps/rejected": -153.31338500976562, "loss": 17.2435, "losses_ref": -0.08517072349786758, "ref_logps/chosen": -94.31925964355469, "ref_logps/rejected": -86.5052490234375, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 77.36685943603516, "rewards/margins": 144.17498779296875, "rewards/rejected": -66.8081283569336, "step": 3110, "u": -4.296696662902832, "weight": 0.05492968484759331 }, { "diff_generated": -63.595611572265625, "epoch": 1.0110174983797797, "grad_norm": 467.1313773559849, "learning_rate": 6.707607494403471e-07, "logits/chosen": -2.4678194522857666, "logits/rejected": -2.552511692047119, "logps/chosen": -16.294748306274414, "logps/rejected": -148.17742919921875, "loss": 16.8622, "losses_ref": -0.003516948549076915, "ref_logps/chosen": -91.89102935791016, "ref_logps/rejected": -84.58182525634766, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 75.59628295898438, "rewards/margins": 139.19190979003906, "rewards/rejected": -63.595611572265625, "step": 3120, "u": -4.301178455352783, "weight": 0.056418467313051224 }, { "diff_generated": -65.88175201416016, "epoch": 1.0142579390797148, "grad_norm": 475.5010872008353, "learning_rate": 6.696486774403812e-07, "logits/chosen": -2.4392142295837402, "logits/rejected": -2.5472519397735596, "logps/chosen": -18.009849548339844, "logps/rejected": -154.6840362548828, "loss": 17.8148, "losses_ref": -1.152172558249731e-06, "ref_logps/chosen": -92.8963394165039, "ref_logps/rejected": -88.80229187011719, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 74.88648986816406, "rewards/margins": 140.76824951171875, "rewards/rejected": -65.88175201416016, "step": 3130, "u": -4.3748979568481445, "weight": 0.050000034272670746 }, { "diff_generated": -67.03091430664062, "epoch": 1.01749837977965, "grad_norm": 460.5138890938086, "learning_rate": 6.685327719092096e-07, "logits/chosen": -2.4010062217712402, "logits/rejected": -2.5667436122894287, "logps/chosen": -14.499166488647461, "logps/rejected": -149.5535430908203, "loss": 17.7814, "losses_ref": -9.857653640210629e-05, "ref_logps/chosen": -88.37496948242188, "ref_logps/rejected": -82.52261352539062, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 73.87580871582031, "rewards/margins": 140.90672302246094, "rewards/rejected": -67.03091430664062, "step": 3140, "u": -4.2142720222473145, "weight": 0.07500406354665756 }, { "diff_generated": -65.28272247314453, "epoch": 1.0207388204795853, "grad_norm": 421.26837650221717, "learning_rate": 6.674130487113962e-07, "logits/chosen": -2.5053372383117676, "logits/rejected": -2.55894136428833, "logps/chosen": -18.61953353881836, "logps/rejected": -150.85842895507812, "loss": 18.2352, "losses_ref": -3.157412109544566e-08, "ref_logps/chosen": -98.04133605957031, "ref_logps/rejected": -85.5757064819336, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 79.42179870605469, "rewards/margins": 144.70452880859375, "rewards/rejected": -65.28272247314453, "step": 3150, "u": -4.365433692932129, "weight": 0.04374999925494194 }, { "diff_generated": -68.14897155761719, "epoch": 1.0239792611795204, "grad_norm": 436.8557600214886, "learning_rate": 6.662895237657799e-07, "logits/chosen": -2.5241332054138184, "logits/rejected": -2.5666394233703613, "logps/chosen": -17.1142520904541, "logps/rejected": -153.32432556152344, "loss": 17.3762, "losses_ref": -8.085754416242708e-06, "ref_logps/chosen": -98.8514404296875, "ref_logps/rejected": -85.17535400390625, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 81.73719787597656, "rewards/margins": 149.88616943359375, "rewards/rejected": -68.14897155761719, "step": 3160, "u": -4.447952747344971, "weight": 0.025000324472784996 }, { "diff_generated": -64.65580749511719, "epoch": 1.0272197018794555, "grad_norm": 460.11094844811134, "learning_rate": 6.651622130452481e-07, "logits/chosen": -2.46612548828125, "logits/rejected": -2.5433506965637207, "logps/chosen": -19.915821075439453, "logps/rejected": -151.89566040039062, "loss": 17.3588, "losses_ref": -0.00018419846310280263, "ref_logps/chosen": -96.14291381835938, "ref_logps/rejected": -87.23985290527344, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 76.22709655761719, "rewards/margins": 140.88290405273438, "rewards/rejected": -64.65580749511719, "step": 3170, "u": -4.275555610656738, "weight": 0.06875447183847427 }, { "diff_generated": -68.25298309326172, "epoch": 1.030460142579391, "grad_norm": 434.93450671648856, "learning_rate": 6.640311325765096e-07, "logits/chosen": -2.4406819343566895, "logits/rejected": -2.571199417114258, "logps/chosen": -16.93312644958496, "logps/rejected": -160.1395721435547, "loss": 17.5433, "losses_ref": -0.0010875340085476637, "ref_logps/chosen": -92.92141723632812, "ref_logps/rejected": -91.88658142089844, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 75.98828887939453, "rewards/margins": 144.24127197265625, "rewards/rejected": -68.25298309326172, "step": 3180, "u": -4.345793724060059, "weight": 0.050050728023052216 }, { "diff_generated": -66.62785339355469, "epoch": 1.033700583279326, "grad_norm": 510.4120915296864, "learning_rate": 6.628962984398663e-07, "logits/chosen": -2.485319137573242, "logits/rejected": -2.5952136516571045, "logps/chosen": -17.191240310668945, "logps/rejected": -155.60009765625, "loss": 17.5711, "losses_ref": -0.0012958078878000379, "ref_logps/chosen": -96.52293395996094, "ref_logps/rejected": -88.97222900390625, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 79.33170318603516, "rewards/margins": 145.9595489501953, "rewards/rejected": -66.62785339355469, "step": 3190, "u": -4.343929290771484, "weight": 0.04381078481674194 }, { "diff_generated": -67.12268829345703, "epoch": 1.0369410239792611, "grad_norm": 452.0149194344355, "learning_rate": 6.617577267689863e-07, "logits/chosen": -2.4636361598968506, "logits/rejected": -2.5697529315948486, "logps/chosen": -16.585233688354492, "logps/rejected": -153.8497314453125, "loss": 17.8756, "losses_ref": -0.0017653731629252434, "ref_logps/chosen": -94.59716033935547, "ref_logps/rejected": -86.72703552246094, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 78.01192474365234, "rewards/margins": 145.13461303710938, "rewards/rejected": -67.12268829345703, "step": 3200, "u": -4.3374176025390625, "weight": 0.05008355900645256 }, { "diff_generated": -66.08069610595703, "epoch": 1.0401814646791965, "grad_norm": 444.5697359069676, "learning_rate": 6.606154337506721e-07, "logits/chosen": -2.514535665512085, "logits/rejected": -2.5685744285583496, "logps/chosen": -19.98543357849121, "logps/rejected": -150.83689880371094, "loss": 17.259, "losses_ref": -0.0015171390259638429, "ref_logps/chosen": -99.03950500488281, "ref_logps/rejected": -84.75621032714844, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 79.05406951904297, "rewards/margins": 145.13479614257812, "rewards/rejected": -66.08069610595703, "step": 3210, "u": -4.347279071807861, "weight": 0.050075214356184006 }, { "diff_generated": -61.97309112548828, "epoch": 1.0434219053791316, "grad_norm": 453.53740449674814, "learning_rate": 6.594694356246325e-07, "logits/chosen": -2.5020461082458496, "logits/rejected": -2.5105185508728027, "logps/chosen": -18.61556625366211, "logps/rejected": -142.10214233398438, "loss": 17.6667, "losses_ref": -6.612971503727749e-08, "ref_logps/chosen": -96.94075012207031, "ref_logps/rejected": -80.12906646728516, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 78.32518005371094, "rewards/margins": 140.2982635498047, "rewards/rejected": -61.97309112548828, "step": 3220, "u": -4.212699890136719, "weight": 0.08124999701976776 }, { "diff_generated": -65.51012420654297, "epoch": 1.0466623460790667, "grad_norm": 448.9919035981769, "learning_rate": 6.583197486832506e-07, "logits/chosen": -2.495256185531616, "logits/rejected": -2.533613920211792, "logps/chosen": -17.16736602783203, "logps/rejected": -151.90597534179688, "loss": 18.1376, "losses_ref": -1.783437937774579e-06, "ref_logps/chosen": -94.41279602050781, "ref_logps/rejected": -86.39586639404297, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 77.24542999267578, "rewards/margins": 142.7555694580078, "rewards/rejected": -65.51012420654297, "step": 3230, "u": -4.212017059326172, "weight": 0.07500006258487701 }, { "diff_generated": -65.96939086914062, "epoch": 1.0499027867790018, "grad_norm": 457.2713751712491, "learning_rate": 6.571663892713527e-07, "logits/chosen": -2.4962501525878906, "logits/rejected": -2.5798799991607666, "logps/chosen": -17.670930862426758, "logps/rejected": -152.85986328125, "loss": 16.7632, "losses_ref": -0.006728614680469036, "ref_logps/chosen": -95.16192626953125, "ref_logps/rejected": -86.8904800415039, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 77.49100494384766, "rewards/margins": 143.46038818359375, "rewards/rejected": -65.96939086914062, "step": 3240, "u": -4.393555164337158, "weight": 0.03781301528215408 }, { "diff_generated": -63.8961181640625, "epoch": 1.0531432274789372, "grad_norm": 471.8849246342818, "learning_rate": 6.560093737859755e-07, "logits/chosen": -2.5056347846984863, "logits/rejected": -2.4781854152679443, "logps/chosen": -17.58412742614746, "logps/rejected": -144.17770385742188, "loss": 17.5115, "losses_ref": -7.131105803637183e-07, "ref_logps/chosen": -95.46975708007812, "ref_logps/rejected": -80.28160095214844, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 77.8856201171875, "rewards/margins": 141.78172302246094, "rewards/rejected": -63.8961181640625, "step": 3250, "u": -4.220991134643555, "weight": 0.07500002533197403 }, { "diff_generated": -66.7835464477539, "epoch": 1.0563836681788723, "grad_norm": 457.9417057538034, "learning_rate": 6.548487186761334e-07, "logits/chosen": -2.4845380783081055, "logits/rejected": -2.5910003185272217, "logps/chosen": -17.014305114746094, "logps/rejected": -154.10397338867188, "loss": 17.4821, "losses_ref": -0.0043944017961621284, "ref_logps/chosen": -92.6029052734375, "ref_logps/rejected": -87.3204345703125, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 75.5886001586914, "rewards/margins": 142.3721466064453, "rewards/rejected": -66.7835464477539, "step": 3260, "u": -4.323208332061768, "weight": 0.0439579114317894 }, { "diff_generated": -65.97068786621094, "epoch": 1.0596241088788074, "grad_norm": 473.9528443346443, "learning_rate": 6.536844404425845e-07, "logits/chosen": -2.4793522357940674, "logits/rejected": -2.586184501647949, "logps/chosen": -16.8640193939209, "logps/rejected": -154.61196899414062, "loss": 17.15, "losses_ref": -0.03145980462431908, "ref_logps/chosen": -95.58818054199219, "ref_logps/rejected": -88.64127349853516, "rewards/accuracies": 0.9375, "rewards/chosen": 78.72415924072266, "rewards/margins": 144.69485473632812, "rewards/rejected": -65.97068786621094, "step": 3270, "u": -4.271958351135254, "weight": 0.06429260969161987 }, { "diff_generated": -65.097412109375, "epoch": 1.0628645495787428, "grad_norm": 452.4811006320355, "learning_rate": 6.525165556375959e-07, "logits/chosen": -2.444775342941284, "logits/rejected": -2.578066349029541, "logps/chosen": -15.83275032043457, "logps/rejected": -146.58653259277344, "loss": 17.2568, "losses_ref": -0.0008164413156919181, "ref_logps/chosen": -90.00611877441406, "ref_logps/rejected": -81.4891357421875, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 74.17335510253906, "rewards/margins": 139.270751953125, "rewards/rejected": -65.097412109375, "step": 3280, "u": -4.3508782386779785, "weight": 0.05003942921757698 }, { "diff_generated": -67.97452545166016, "epoch": 1.0661049902786779, "grad_norm": 481.39553728386596, "learning_rate": 6.513450808647086e-07, "logits/chosen": -2.437958002090454, "logits/rejected": -2.5370841026306152, "logps/chosen": -18.49962615966797, "logps/rejected": -152.44979858398438, "loss": 18.0844, "losses_ref": -1.7934650031747879e-06, "ref_logps/chosen": -96.98509979248047, "ref_logps/rejected": -84.47525787353516, "rewards/accuracies": 0.96875, "rewards/chosen": 78.48546600341797, "rewards/margins": 146.45999145507812, "rewards/rejected": -67.97452545166016, "step": 3290, "u": -4.41939115524292, "weight": 0.03125005215406418 }, { "diff_generated": -66.6233139038086, "epoch": 1.069345430978613, "grad_norm": 465.37946702473073, "learning_rate": 6.501700327785011e-07, "logits/chosen": -2.5347819328308105, "logits/rejected": -2.573274612426758, "logps/chosen": -16.581130981445312, "logps/rejected": -150.71554565429688, "loss": 17.4926, "losses_ref": -9.045367733051535e-06, "ref_logps/chosen": -93.96571350097656, "ref_logps/rejected": -84.09223175048828, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 77.38458251953125, "rewards/margins": 144.0078887939453, "rewards/rejected": -66.6233139038086, "step": 3300, "u": -4.263393878936768, "weight": 0.06875050067901611 }, { "diff_generated": -63.082496643066406, "epoch": 1.0725858716785484, "grad_norm": 471.1217208714754, "learning_rate": 6.489914280843528e-07, "logits/chosen": -2.508230686187744, "logits/rejected": -2.532465696334839, "logps/chosen": -18.42227554321289, "logps/rejected": -138.54238891601562, "loss": 17.4918, "losses_ref": -3.896912858181167e-06, "ref_logps/chosen": -93.45298767089844, "ref_logps/rejected": -75.45990753173828, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 75.03073120117188, "rewards/margins": 138.11322021484375, "rewards/rejected": -63.082496643066406, "step": 3310, "u": -4.203088283538818, "weight": 0.07500005513429642 }, { "diff_generated": -66.8916015625, "epoch": 1.0758263123784835, "grad_norm": 472.5438097483653, "learning_rate": 6.478092835382071e-07, "logits/chosen": -2.4746553897857666, "logits/rejected": -2.5401337146759033, "logps/chosen": -18.509883880615234, "logps/rejected": -152.7678680419922, "loss": 17.1164, "losses_ref": -0.00354144093580544, "ref_logps/chosen": -96.56227111816406, "ref_logps/rejected": -85.87627410888672, "rewards/accuracies": 0.96875, "rewards/chosen": 78.0523910522461, "rewards/margins": 144.94398498535156, "rewards/rejected": -66.8916015625, "step": 3320, "u": -4.42160701751709, "weight": 0.031427718698978424 }, { "diff_generated": -69.9482421875, "epoch": 1.0790667530784186, "grad_norm": 467.0081761853318, "learning_rate": 6.466236159463319e-07, "logits/chosen": -2.4707579612731934, "logits/rejected": -2.56321382522583, "logps/chosen": -16.863197326660156, "logps/rejected": -162.45408630371094, "loss": 17.6638, "losses_ref": -1.793040610209573e-05, "ref_logps/chosen": -95.34489440917969, "ref_logps/rejected": -92.5058364868164, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 78.481689453125, "rewards/margins": 148.42991638183594, "rewards/rejected": -69.9482421875, "step": 3330, "u": -4.3579301834106445, "weight": 0.04375045746564865 }, { "diff_generated": -71.49409484863281, "epoch": 1.082307193778354, "grad_norm": 502.1179103976336, "learning_rate": 6.45434442165082e-07, "logits/chosen": -2.4856925010681152, "logits/rejected": -2.5980522632598877, "logps/chosen": -17.530838012695312, "logps/rejected": -163.8715362548828, "loss": 17.9225, "losses_ref": -3.988393018516945e-06, "ref_logps/chosen": -96.364990234375, "ref_logps/rejected": -92.37743377685547, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 78.83414459228516, "rewards/margins": 150.3282470703125, "rewards/rejected": -71.49409484863281, "step": 3340, "u": -4.450523376464844, "weight": 0.018750127404928207 }, { "diff_generated": -68.51228332519531, "epoch": 1.085547634478289, "grad_norm": 445.6767835939639, "learning_rate": 6.442417791006585e-07, "logits/chosen": -2.4890549182891846, "logits/rejected": -2.5770602226257324, "logps/chosen": -17.04773712158203, "logps/rejected": -154.55792236328125, "loss": 17.6168, "losses_ref": -5.988636075926479e-07, "ref_logps/chosen": -95.89872741699219, "ref_logps/rejected": -86.04563903808594, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 78.85099792480469, "rewards/margins": 147.36328125, "rewards/rejected": -68.51228332519531, "step": 3350, "u": -4.234282493591309, "weight": 0.05625001713633537 }, { "diff_generated": -66.1912612915039, "epoch": 1.0887880751782242, "grad_norm": 492.68807977978, "learning_rate": 6.43045643708869e-07, "logits/chosen": -2.4714395999908447, "logits/rejected": -2.5174362659454346, "logps/chosen": -17.673574447631836, "logps/rejected": -149.83248901367188, "loss": 18.1353, "losses_ref": -0.0020296932198107243, "ref_logps/chosen": -98.73755645751953, "ref_logps/rejected": -83.6412353515625, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 81.06398010253906, "rewards/margins": 147.2552490234375, "rewards/rejected": -66.1912612915039, "step": 3360, "u": -4.327150821685791, "weight": 0.050094105303287506 }, { "diff_generated": -66.18561553955078, "epoch": 1.0920285158781595, "grad_norm": 508.9405686088096, "learning_rate": 6.418460529948861e-07, "logits/chosen": -2.453836441040039, "logits/rejected": -2.5827078819274902, "logps/chosen": -15.104484558105469, "logps/rejected": -148.48648071289062, "loss": 17.6481, "losses_ref": -0.0013367208885028958, "ref_logps/chosen": -89.31407928466797, "ref_logps/rejected": -82.3008804321289, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 74.2095947265625, "rewards/margins": 140.39520263671875, "rewards/rejected": -66.18561553955078, "step": 3370, "u": -4.214970588684082, "weight": 0.08131183683872223 }, { "diff_generated": -67.63780212402344, "epoch": 1.0952689565780946, "grad_norm": 494.30171232629164, "learning_rate": 6.406430240130064e-07, "logits/chosen": -2.4436795711517334, "logits/rejected": -2.5577890872955322, "logps/chosen": -17.331829071044922, "logps/rejected": -153.96365356445312, "loss": 17.1103, "losses_ref": -0.0005108517943881452, "ref_logps/chosen": -94.46105194091797, "ref_logps/rejected": -86.32584381103516, "rewards/accuracies": 0.9375, "rewards/chosen": 77.12922668457031, "rewards/margins": 144.76699829101562, "rewards/rejected": -67.63780212402344, "step": 3380, "u": -4.249946117401123, "weight": 0.06252266466617584 }, { "diff_generated": -64.21388244628906, "epoch": 1.0985093972780298, "grad_norm": 436.4445367228094, "learning_rate": 6.39436573866407e-07, "logits/chosen": -2.469306468963623, "logits/rejected": -2.567539691925049, "logps/chosen": -18.55048942565918, "logps/rejected": -152.05569458007812, "loss": 17.6939, "losses_ref": -1.3752096492680721e-05, "ref_logps/chosen": -96.04664611816406, "ref_logps/rejected": -87.84181213378906, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 77.49615478515625, "rewards/margins": 141.7100067138672, "rewards/rejected": -64.21388244628906, "step": 3390, "u": -4.355813980102539, "weight": 0.050000060349702835 }, { "diff_generated": -64.73839569091797, "epoch": 1.101749837977965, "grad_norm": 495.27880536840814, "learning_rate": 6.38226719706903e-07, "logits/chosen": -2.4457192420959473, "logits/rejected": -2.5553908348083496, "logps/chosen": -16.09178924560547, "logps/rejected": -146.14370727539062, "loss": 17.7363, "losses_ref": -0.00144859217107296, "ref_logps/chosen": -90.36268615722656, "ref_logps/rejected": -81.4052963256836, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 74.27088928222656, "rewards/margins": 139.00930786132812, "rewards/rejected": -64.73839569091797, "step": 3400, "u": -4.222038745880127, "weight": 0.07507114857435226 }, { "diff_generated": -65.90394592285156, "epoch": 1.1049902786779002, "grad_norm": 466.87894214692636, "learning_rate": 6.370134787347039e-07, "logits/chosen": -2.473989963531494, "logits/rejected": -2.566941738128662, "logps/chosen": -17.068649291992188, "logps/rejected": -156.55332946777344, "loss": 17.2998, "losses_ref": -1.9342860468896106e-06, "ref_logps/chosen": -94.74530029296875, "ref_logps/rejected": -90.6493911743164, "rewards/accuracies": 0.9375, "rewards/chosen": 77.67665100097656, "rewards/margins": 143.5806121826172, "rewards/rejected": -65.90394592285156, "step": 3410, "u": -4.272525787353516, "weight": 0.0625000149011612 }, { "diff_generated": -64.97685241699219, "epoch": 1.1082307193778353, "grad_norm": 481.9965436826669, "learning_rate": 6.357968681981683e-07, "logits/chosen": -2.4253671169281006, "logits/rejected": -2.472318649291992, "logps/chosen": -18.774478912353516, "logps/rejected": -151.53453063964844, "loss": 17.6054, "losses_ref": -1.1018643419902219e-07, "ref_logps/chosen": -98.19063568115234, "ref_logps/rejected": -86.55766296386719, "rewards/accuracies": 0.9375, "rewards/chosen": 79.41615295410156, "rewards/margins": 144.39300537109375, "rewards/rejected": -64.97685241699219, "step": 3420, "u": -4.315249443054199, "weight": 0.0625 }, { "diff_generated": -66.34733581542969, "epoch": 1.1114711600777705, "grad_norm": 463.2751033596651, "learning_rate": 6.345769053935595e-07, "logits/chosen": -2.4380462169647217, "logits/rejected": -2.569019317626953, "logps/chosen": -14.185934066772461, "logps/rejected": -151.55096435546875, "loss": 16.9791, "losses_ref": -1.4939736502128653e-05, "ref_logps/chosen": -87.73268127441406, "ref_logps/rejected": -85.2036361694336, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 73.54673767089844, "rewards/margins": 139.89407348632812, "rewards/rejected": -66.34733581542969, "step": 3430, "u": -4.263554573059082, "weight": 0.0687505379319191 }, { "diff_generated": -70.10304260253906, "epoch": 1.1147116007777058, "grad_norm": 477.96388369546867, "learning_rate": 6.333536076647985e-07, "logits/chosen": -2.3879170417785645, "logits/rejected": -2.544254779815674, "logps/chosen": -16.81411361694336, "logps/rejected": -159.60748291015625, "loss": 17.5568, "losses_ref": -0.010113712400197983, "ref_logps/chosen": -93.77284240722656, "ref_logps/rejected": -89.50444793701172, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 76.95872497558594, "rewards/margins": 147.06175231933594, "rewards/rejected": -70.10304260253906, "step": 3440, "u": -4.284361362457275, "weight": 0.050551921129226685 }, { "diff_generated": -68.41407012939453, "epoch": 1.117952041477641, "grad_norm": 474.8415955834364, "learning_rate": 6.321269924032188e-07, "logits/chosen": -2.439448356628418, "logits/rejected": -2.4982120990753174, "logps/chosen": -20.122116088867188, "logps/rejected": -155.7066650390625, "loss": 18.1174, "losses_ref": -0.0050384835340082645, "ref_logps/chosen": -100.1120376586914, "ref_logps/rejected": -87.29259490966797, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 79.98992156982422, "rewards/margins": 148.4039764404297, "rewards/rejected": -68.41407012939453, "step": 3450, "u": -4.3607940673828125, "weight": 0.044004492461681366 }, { "diff_generated": -68.7255859375, "epoch": 1.121192482177576, "grad_norm": 440.84411122537495, "learning_rate": 6.308970770473184e-07, "logits/chosen": -2.405679702758789, "logits/rejected": -2.4746363162994385, "logps/chosen": -17.939823150634766, "logps/rejected": -156.30960083007812, "loss": 17.7598, "losses_ref": -0.013999903574585915, "ref_logps/chosen": -96.26406860351562, "ref_logps/rejected": -87.58399963378906, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 78.32423400878906, "rewards/margins": 147.04983520507812, "rewards/rejected": -68.7255859375, "step": 3460, "u": -4.44952392578125, "weight": 0.025478944182395935 }, { "diff_generated": -66.78877258300781, "epoch": 1.1244329228775114, "grad_norm": 492.26077070106766, "learning_rate": 6.296638790825117e-07, "logits/chosen": -2.4781577587127686, "logits/rejected": -2.537199020385742, "logps/chosen": -17.374792098999023, "logps/rejected": -152.56198120117188, "loss": 17.4689, "losses_ref": -0.000983425066806376, "ref_logps/chosen": -96.94143676757812, "ref_logps/rejected": -85.77323150634766, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 79.56664276123047, "rewards/margins": 146.3554229736328, "rewards/rejected": -66.78877258300781, "step": 3470, "u": -4.366348743438721, "weight": 0.04379170760512352 }, { "diff_generated": -65.74295806884766, "epoch": 1.1276733635774465, "grad_norm": 450.3889903703623, "learning_rate": 6.284274160408812e-07, "logits/chosen": -2.4432475566864014, "logits/rejected": -2.5553131103515625, "logps/chosen": -15.783769607543945, "logps/rejected": -149.1510467529297, "loss": 17.0366, "losses_ref": -2.2255520889302716e-05, "ref_logps/chosen": -92.00151062011719, "ref_logps/rejected": -83.40808868408203, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 76.2177505493164, "rewards/margins": 141.960693359375, "rewards/rejected": -65.74295806884766, "step": 3480, "u": -4.216153621673584, "weight": 0.0812506452202797 }, { "diff_generated": -66.05118560791016, "epoch": 1.1309138042773816, "grad_norm": 506.9247184796788, "learning_rate": 6.271877055009284e-07, "logits/chosen": -2.4507365226745605, "logits/rejected": -2.552597761154175, "logps/chosen": -17.666889190673828, "logps/rejected": -152.59658813476562, "loss": 17.9093, "losses_ref": -0.0013337829150259495, "ref_logps/chosen": -96.6623764038086, "ref_logps/rejected": -86.54540252685547, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 78.9954833984375, "rewards/margins": 145.04666137695312, "rewards/rejected": -66.05118560791016, "step": 3490, "u": -4.291287422180176, "weight": 0.05006307363510132 }, { "diff_generated": -70.07727813720703, "epoch": 1.134154244977317, "grad_norm": 471.32853184759824, "learning_rate": 6.259447650873236e-07, "logits/chosen": -2.509068012237549, "logits/rejected": -2.5998129844665527, "logps/chosen": -15.926129341125488, "logps/rejected": -161.7591552734375, "loss": 18.0347, "losses_ref": -0.003403474111109972, "ref_logps/chosen": -94.38692474365234, "ref_logps/rejected": -91.68186950683594, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 78.4607925415039, "rewards/margins": 148.5380859375, "rewards/rejected": -70.07727813720703, "step": 3500, "u": -4.349414825439453, "weight": 0.050165869295597076 }, { "diff_generated": -66.88079833984375, "epoch": 1.137394685677252, "grad_norm": 440.6501931497652, "learning_rate": 6.246986124706555e-07, "logits/chosen": -2.4343461990356445, "logits/rejected": -2.565577983856201, "logps/chosen": -18.252582550048828, "logps/rejected": -157.1911163330078, "loss": 17.6839, "losses_ref": -0.004433914087712765, "ref_logps/chosen": -93.1754379272461, "ref_logps/rejected": -90.31031799316406, "rewards/accuracies": 0.9375, "rewards/chosen": 74.9228515625, "rewards/margins": 141.80364990234375, "rewards/rejected": -66.88079833984375, "step": 3510, "u": -4.272037506103516, "weight": 0.06272298842668533 }, { "diff_generated": -66.67464447021484, "epoch": 1.1406351263771872, "grad_norm": 454.5961208898324, "learning_rate": 6.234492653671797e-07, "logits/chosen": -2.4900689125061035, "logits/rejected": -2.5623250007629395, "logps/chosen": -18.45693588256836, "logps/rejected": -154.75906372070312, "loss": 17.7162, "losses_ref": -0.0011696848087012768, "ref_logps/chosen": -98.3475341796875, "ref_logps/rejected": -88.08442687988281, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 79.8906021118164, "rewards/margins": 146.56524658203125, "rewards/rejected": -66.67464447021484, "step": 3520, "u": -4.374854564666748, "weight": 0.05005534738302231 }, { "diff_generated": -65.45890045166016, "epoch": 1.1438755670771226, "grad_norm": 523.89030306214, "learning_rate": 6.221967415385675e-07, "logits/chosen": -2.4773688316345215, "logits/rejected": -2.516385555267334, "logps/chosen": -17.959131240844727, "logps/rejected": -148.49928283691406, "loss": 18.0136, "losses_ref": -4.428060947248014e-06, "ref_logps/chosen": -99.86962890625, "ref_logps/rejected": -83.0403823852539, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 81.91049194335938, "rewards/margins": 147.36940002441406, "rewards/rejected": -65.45890045166016, "step": 3530, "u": -4.355672836303711, "weight": 0.050000131130218506 }, { "diff_generated": -66.77120971679688, "epoch": 1.1471160077770577, "grad_norm": 470.31431857349844, "learning_rate": 6.209410587916524e-07, "logits/chosen": -2.426239252090454, "logits/rejected": -2.4639995098114014, "logps/chosen": -19.04548454284668, "logps/rejected": -149.74981689453125, "loss": 17.8432, "losses_ref": -1.6486468439325108e-06, "ref_logps/chosen": -100.50316619873047, "ref_logps/rejected": -82.97860717773438, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 81.45768737792969, "rewards/margins": 148.22891235351562, "rewards/rejected": -66.77120971679688, "step": 3540, "u": -4.378117084503174, "weight": 0.03750004991889 }, { "diff_generated": -66.69547271728516, "epoch": 1.1503564484769928, "grad_norm": 487.2198330207783, "learning_rate": 6.196822349781781e-07, "logits/chosen": -2.4488656520843506, "logits/rejected": -2.503800868988037, "logps/chosen": -18.600788116455078, "logps/rejected": -150.7244873046875, "loss": 17.6787, "losses_ref": -0.0007633547065779567, "ref_logps/chosen": -97.8669662475586, "ref_logps/rejected": -84.02903747558594, "rewards/accuracies": 0.96875, "rewards/chosen": 79.26618957519531, "rewards/margins": 145.961669921875, "rewards/rejected": -66.69547271728516, "step": 3550, "u": -4.396124362945557, "weight": 0.031284209340810776 }, { "diff_generated": -64.43789672851562, "epoch": 1.1535968891769282, "grad_norm": 443.73160688764654, "learning_rate": 6.184202879945437e-07, "logits/chosen": -2.4165685176849365, "logits/rejected": -2.4975438117980957, "logps/chosen": -18.127599716186523, "logps/rejected": -142.0409393310547, "loss": 18.1951, "losses_ref": -8.06269440545293e-07, "ref_logps/chosen": -92.44525146484375, "ref_logps/rejected": -77.60304260253906, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 74.3176498413086, "rewards/margins": 138.75555419921875, "rewards/rejected": -64.43789672851562, "step": 3560, "u": -4.217528343200684, "weight": 0.07500002533197403 }, { "diff_generated": -66.31949615478516, "epoch": 1.1568373298768633, "grad_norm": 396.4063188323412, "learning_rate": 6.171552357815497e-07, "logits/chosen": -2.4018328189849854, "logits/rejected": -2.540160894393921, "logps/chosen": -17.0816650390625, "logps/rejected": -151.96615600585938, "loss": 17.6642, "losses_ref": -0.0004584209527820349, "ref_logps/chosen": -93.61339569091797, "ref_logps/rejected": -85.64667510986328, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 76.53173828125, "rewards/margins": 142.85122680664062, "rewards/rejected": -66.31949615478516, "step": 3570, "u": -4.435829162597656, "weight": 0.025020593777298927 }, { "diff_generated": -67.34017944335938, "epoch": 1.1600777705767984, "grad_norm": 500.8741646869675, "learning_rate": 6.15887096324143e-07, "logits/chosen": -2.467637538909912, "logits/rejected": -2.5334534645080566, "logps/chosen": -18.958505630493164, "logps/rejected": -153.88809204101562, "loss": 17.3967, "losses_ref": -9.873149792838376e-06, "ref_logps/chosen": -97.52275848388672, "ref_logps/rejected": -86.54790496826172, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 78.56425476074219, "rewards/margins": 145.90443420410156, "rewards/rejected": -67.34017944335938, "step": 3580, "u": -4.305108547210693, "weight": 0.05625037103891373 }, { "diff_generated": -66.20716857910156, "epoch": 1.1633182112767337, "grad_norm": 449.6767901200747, "learning_rate": 6.14615887651161e-07, "logits/chosen": -2.4450440406799316, "logits/rejected": -2.5749616622924805, "logps/chosen": -16.25723648071289, "logps/rejected": -153.3737335205078, "loss": 17.0221, "losses_ref": -0.0018811358604580164, "ref_logps/chosen": -88.58432006835938, "ref_logps/rejected": -87.16657257080078, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 72.32708740234375, "rewards/margins": 138.5342559814453, "rewards/rejected": -66.20716857910156, "step": 3590, "u": -4.233583450317383, "weight": 0.06883620470762253 }, { "diff_generated": -66.19894409179688, "epoch": 1.1665586519766689, "grad_norm": 479.7835646590672, "learning_rate": 6.133416278350756e-07, "logits/chosen": -2.4543826580047607, "logits/rejected": -2.5439774990081787, "logps/chosen": -17.245370864868164, "logps/rejected": -149.97860717773438, "loss": 17.5297, "losses_ref": -0.003648832906037569, "ref_logps/chosen": -93.31538391113281, "ref_logps/rejected": -83.7796401977539, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 76.07001495361328, "rewards/margins": 142.26895141601562, "rewards/rejected": -66.19894409179688, "step": 3600, "u": -4.308576583862305, "weight": 0.05642462521791458 }, { "diff_generated": -68.85673522949219, "epoch": 1.169799092676604, "grad_norm": 493.566813054623, "learning_rate": 6.120643349917359e-07, "logits/chosen": -2.470064640045166, "logits/rejected": -2.525467872619629, "logps/chosen": -17.012256622314453, "logps/rejected": -155.98880004882812, "loss": 17.6349, "losses_ref": -0.0008439187076874077, "ref_logps/chosen": -100.3587417602539, "ref_logps/rejected": -87.13206481933594, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 83.34648132324219, "rewards/margins": 152.20321655273438, "rewards/rejected": -68.85673522949219, "step": 3610, "u": -4.288359642028809, "weight": 0.05004129558801651 }, { "diff_generated": -62.7186164855957, "epoch": 1.173039533376539, "grad_norm": 480.34216176357796, "learning_rate": 6.107840272801108e-07, "logits/chosen": -2.456737756729126, "logits/rejected": -2.5191168785095215, "logps/chosen": -18.4807186126709, "logps/rejected": -145.6920166015625, "loss": 17.6207, "losses_ref": -0.005524917971342802, "ref_logps/chosen": -92.15921783447266, "ref_logps/rejected": -82.97339630126953, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 73.6784896850586, "rewards/margins": 136.39712524414062, "rewards/rejected": -62.7186164855957, "step": 3620, "u": -4.388144493103027, "weight": 0.044021543115377426 }, { "diff_generated": -64.40690612792969, "epoch": 1.1762799740764744, "grad_norm": 462.5062963786796, "learning_rate": 6.095007229020311e-07, "logits/chosen": -2.4350109100341797, "logits/rejected": -2.576110363006592, "logps/chosen": -15.553683280944824, "logps/rejected": -151.23849487304688, "loss": 17.6157, "losses_ref": -0.00013673387002199888, "ref_logps/chosen": -91.82894134521484, "ref_logps/rejected": -86.83157348632812, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 76.27525329589844, "rewards/margins": 140.68215942382812, "rewards/rejected": -64.40690612792969, "step": 3630, "u": -4.306022644042969, "weight": 0.05625535175204277 }, { "diff_generated": -64.05938720703125, "epoch": 1.1795204147764096, "grad_norm": 453.34527622779916, "learning_rate": 6.082144401019304e-07, "logits/chosen": -2.467184066772461, "logits/rejected": -2.519963026046753, "logps/chosen": -17.702754974365234, "logps/rejected": -146.12852478027344, "loss": 17.4214, "losses_ref": -1.905074532260187e-05, "ref_logps/chosen": -93.05728912353516, "ref_logps/rejected": -82.06913757324219, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 75.35453796386719, "rewards/margins": 139.41392517089844, "rewards/rejected": -64.05938720703125, "step": 3640, "u": -4.3346781730651855, "weight": 0.050000596791505814 }, { "diff_generated": -64.80006408691406, "epoch": 1.1827608554763447, "grad_norm": 497.3921847635673, "learning_rate": 6.069251971665857e-07, "logits/chosen": -2.4072885513305664, "logits/rejected": -2.5418806076049805, "logps/chosen": -17.536151885986328, "logps/rejected": -149.25770568847656, "loss": 17.8686, "losses_ref": -0.023113643750548363, "ref_logps/chosen": -90.7900161743164, "ref_logps/rejected": -84.45764923095703, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 73.25386047363281, "rewards/margins": 138.05393981933594, "rewards/rejected": -64.80006408691406, "step": 3650, "u": -4.255960464477539, "weight": 0.06997169554233551 }, { "diff_generated": -70.54817199707031, "epoch": 1.18600129617628, "grad_norm": 501.99646571703795, "learning_rate": 6.056330124248576e-07, "logits/chosen": -2.420248508453369, "logits/rejected": -2.5905723571777344, "logps/chosen": -15.543874740600586, "logps/rejected": -158.19857788085938, "loss": 17.1204, "losses_ref": -9.355277143185958e-06, "ref_logps/chosen": -91.93193054199219, "ref_logps/rejected": -87.65039825439453, "rewards/accuracies": 0.96875, "rewards/chosen": 76.38804626464844, "rewards/margins": 146.9362335205078, "rewards/rejected": -70.54817199707031, "step": 3660, "u": -4.446316719055176, "weight": 0.031250424683094025 }, { "diff_generated": -69.34424591064453, "epoch": 1.1892417368762151, "grad_norm": 411.3228445717369, "learning_rate": 6.043379042474297e-07, "logits/chosen": -2.437598705291748, "logits/rejected": -2.5694053173065186, "logps/chosen": -18.93130874633789, "logps/rejected": -155.77711486816406, "loss": 17.2569, "losses_ref": -0.0005264817154966295, "ref_logps/chosen": -97.73585510253906, "ref_logps/rejected": -86.43287658691406, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 78.80455017089844, "rewards/margins": 148.14878845214844, "rewards/rejected": -69.34424591064453, "step": 3670, "u": -4.360543251037598, "weight": 0.05002452805638313 }, { "diff_generated": -68.2588882446289, "epoch": 1.1924821775761503, "grad_norm": 432.07562523916755, "learning_rate": 6.030398910465475e-07, "logits/chosen": -2.4059746265411377, "logits/rejected": -2.509774684906006, "logps/chosen": -17.03068733215332, "logps/rejected": -153.54539489746094, "loss": 18.2568, "losses_ref": -0.0018109262455254793, "ref_logps/chosen": -93.44799041748047, "ref_logps/rejected": -85.28651428222656, "rewards/accuracies": 0.9375, "rewards/chosen": 76.41730499267578, "rewards/margins": 144.67617797851562, "rewards/rejected": -68.2588882446289, "step": 3680, "u": -4.272830486297607, "weight": 0.06258610635995865 }, { "diff_generated": -66.36592864990234, "epoch": 1.1957226182760856, "grad_norm": 449.0600224169998, "learning_rate": 6.017389912757561e-07, "logits/chosen": -2.4729011058807373, "logits/rejected": -2.5933284759521484, "logps/chosen": -15.932981491088867, "logps/rejected": -154.79925537109375, "loss": 17.0165, "losses_ref": -0.0008205400081351399, "ref_logps/chosen": -90.65199279785156, "ref_logps/rejected": -88.43331146240234, "rewards/accuracies": 0.9375, "rewards/chosen": 74.7190170288086, "rewards/margins": 141.0849609375, "rewards/rejected": -66.36592864990234, "step": 3690, "u": -4.300013542175293, "weight": 0.06253810971975327 }, { "diff_generated": -62.134925842285156, "epoch": 1.1989630589760207, "grad_norm": 487.2705623202895, "learning_rate": 6.004352234296389e-07, "logits/chosen": -2.444080114364624, "logits/rejected": -2.518881320953369, "logps/chosen": -19.006927490234375, "logps/rejected": -147.88319396972656, "loss": 18.3132, "losses_ref": -0.011056670919060707, "ref_logps/chosen": -100.0880126953125, "ref_logps/rejected": -85.74827575683594, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 81.08108520507812, "rewards/margins": 143.2160186767578, "rewards/rejected": -62.134925842285156, "step": 3700, "u": -4.219624996185303, "weight": 0.06931637227535248 }, { "diff_generated": -62.08357620239258, "epoch": 1.2022034996759559, "grad_norm": 464.6541566589534, "learning_rate": 5.991286060435536e-07, "logits/chosen": -2.4166674613952637, "logits/rejected": -2.476719856262207, "logps/chosen": -18.68705940246582, "logps/rejected": -142.98526000976562, "loss": 18.034, "losses_ref": -9.735246749187354e-06, "ref_logps/chosen": -95.77069091796875, "ref_logps/rejected": -80.90168762207031, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 77.08363342285156, "rewards/margins": 139.16720581054688, "rewards/rejected": -62.08357620239258, "step": 3710, "u": -4.208899021148682, "weight": 0.08125011622905731 }, { "diff_generated": -65.4081802368164, "epoch": 1.2054439403758912, "grad_norm": 467.91529233329715, "learning_rate": 5.978191576933692e-07, "logits/chosen": -2.435298204421997, "logits/rejected": -2.5294342041015625, "logps/chosen": -16.750965118408203, "logps/rejected": -149.4308319091797, "loss": 17.6858, "losses_ref": -0.0006217855261638761, "ref_logps/chosen": -94.02302551269531, "ref_logps/rejected": -84.02266693115234, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 77.27205657958984, "rewards/margins": 142.6802215576172, "rewards/rejected": -65.4081802368164, "step": 3720, "u": -4.320083141326904, "weight": 0.056277960538864136 }, { "diff_generated": -70.64672088623047, "epoch": 1.2086843810758263, "grad_norm": 439.429553373978, "learning_rate": 5.965068969952017e-07, "logits/chosen": -2.457728862762451, "logits/rejected": -2.5720419883728027, "logps/chosen": -17.41990852355957, "logps/rejected": -163.0725555419922, "loss": 16.9155, "losses_ref": -0.00014942415873520076, "ref_logps/chosen": -95.34916687011719, "ref_logps/rejected": -92.42584991455078, "rewards/accuracies": 0.96875, "rewards/chosen": 77.92925262451172, "rewards/margins": 148.5759735107422, "rewards/rejected": -70.64672088623047, "step": 3730, "u": -4.437350273132324, "weight": 0.031256116926670074 }, { "diff_generated": -67.57591247558594, "epoch": 1.2119248217757614, "grad_norm": 417.0455210166249, "learning_rate": 5.951918426051502e-07, "logits/chosen": -2.49534273147583, "logits/rejected": -2.5804288387298584, "logps/chosen": -13.765565872192383, "logps/rejected": -151.92489624023438, "loss": 16.8638, "losses_ref": -0.0001484433450968936, "ref_logps/chosen": -92.38279724121094, "ref_logps/rejected": -84.34899139404297, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 78.61722564697266, "rewards/margins": 146.19314575195312, "rewards/rejected": -67.57591247558594, "step": 3740, "u": -4.319943428039551, "weight": 0.050006695091724396 }, { "diff_generated": -67.62321472167969, "epoch": 1.2151652624756968, "grad_norm": 430.78735226706414, "learning_rate": 5.938740132190306e-07, "logits/chosen": -2.4281935691833496, "logits/rejected": -2.52376127243042, "logps/chosen": -16.50156021118164, "logps/rejected": -155.07244873046875, "loss": 18.3025, "losses_ref": -2.4736641535128e-07, "ref_logps/chosen": -95.54866027832031, "ref_logps/rejected": -87.44921875, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 79.04710388183594, "rewards/margins": 146.67031860351562, "rewards/rejected": -67.62321472167969, "step": 3750, "u": -4.36559534072876, "weight": 0.04375000670552254 }, { "diff_generated": -70.21691131591797, "epoch": 1.218405703175632, "grad_norm": 444.5620289021675, "learning_rate": 5.9255342757211e-07, "logits/chosen": -2.4592199325561523, "logits/rejected": -2.579988956451416, "logps/chosen": -17.252155303955078, "logps/rejected": -153.64694213867188, "loss": 17.0841, "losses_ref": -0.0016604771371930838, "ref_logps/chosen": -95.3075942993164, "ref_logps/rejected": -83.43003845214844, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 78.05543518066406, "rewards/margins": 148.27235412597656, "rewards/rejected": -70.21691131591797, "step": 3760, "u": -4.428204536437988, "weight": 0.02507650852203369 }, { "diff_generated": -62.5455436706543, "epoch": 1.221646143875567, "grad_norm": 521.0068796064838, "learning_rate": 5.91230104438841e-07, "logits/chosen": -2.474541187286377, "logits/rejected": -2.4790170192718506, "logps/chosen": -18.36305809020996, "logps/rejected": -143.1754608154297, "loss": 17.3262, "losses_ref": -0.014308147132396698, "ref_logps/chosen": -95.47447967529297, "ref_logps/rejected": -80.62992858886719, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 77.11141967773438, "rewards/margins": 139.65696716308594, "rewards/rejected": -62.5455436706543, "step": 3770, "u": -4.210149765014648, "weight": 0.06888743489980698 }, { "diff_generated": -63.37689208984375, "epoch": 1.2248865845755024, "grad_norm": 439.35926696012797, "learning_rate": 5.899040626325945e-07, "logits/chosen": -2.484140634536743, "logits/rejected": -2.5540719032287598, "logps/chosen": -16.743610382080078, "logps/rejected": -144.51959228515625, "loss": 17.3574, "losses_ref": -0.00037624576361849904, "ref_logps/chosen": -93.28424835205078, "ref_logps/rejected": -81.1427001953125, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 76.54063415527344, "rewards/margins": 139.9175262451172, "rewards/rejected": -63.37689208984375, "step": 3780, "u": -4.222517490386963, "weight": 0.06875824928283691 }, { "diff_generated": -65.10592651367188, "epoch": 1.2281270252754375, "grad_norm": 507.5671400011117, "learning_rate": 5.885753210053917e-07, "logits/chosen": -2.4867701530456543, "logits/rejected": -2.567906141281128, "logps/chosen": -17.901391983032227, "logps/rejected": -152.5151824951172, "loss": 17.8247, "losses_ref": -0.00267465366050601, "ref_logps/chosen": -95.4359359741211, "ref_logps/rejected": -87.4092788696289, "rewards/accuracies": 0.9375, "rewards/chosen": 77.5345458984375, "rewards/margins": 142.64047241210938, "rewards/rejected": -65.10592651367188, "step": 3790, "u": -4.243159294128418, "weight": 0.06263072788715363 }, { "diff_generated": -63.96403884887695, "epoch": 1.2313674659753726, "grad_norm": 480.8556932216797, "learning_rate": 5.872438984476368e-07, "logits/chosen": -2.471346616744995, "logits/rejected": -2.505013942718506, "logps/chosen": -18.858333587646484, "logps/rejected": -141.34747314453125, "loss": 17.6185, "losses_ref": -0.001003618584945798, "ref_logps/chosen": -95.43574523925781, "ref_logps/rejected": -77.38343811035156, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 76.57740783691406, "rewards/margins": 140.5414581298828, "rewards/rejected": -63.96403884887695, "step": 3800, "u": -4.403631210327148, "weight": 0.04379910230636597 }, { "diff_generated": -68.29475402832031, "epoch": 1.2346079066753077, "grad_norm": 463.0034451448924, "learning_rate": 5.859098138878482e-07, "logits/chosen": -2.465116024017334, "logits/rejected": -2.5396251678466797, "logps/chosen": -19.610532760620117, "logps/rejected": -157.35377502441406, "loss": 18.0952, "losses_ref": -0.000537110201548785, "ref_logps/chosen": -99.69366455078125, "ref_logps/rejected": -89.05900573730469, "rewards/accuracies": 0.96875, "rewards/chosen": 80.0831298828125, "rewards/margins": 148.3778839111328, "rewards/rejected": -68.29475402832031, "step": 3810, "u": -4.445765495300293, "weight": 0.031274110078811646 }, { "diff_generated": -66.76065063476562, "epoch": 1.237848347375243, "grad_norm": 485.8388138489112, "learning_rate": 5.845730862923889e-07, "logits/chosen": -2.4166407585144043, "logits/rejected": -2.4928579330444336, "logps/chosen": -18.420665740966797, "logps/rejected": -153.03782653808594, "loss": 17.9809, "losses_ref": -0.012328693643212318, "ref_logps/chosen": -94.0389633178711, "ref_logps/rejected": -86.27717590332031, "rewards/accuracies": 0.9375, "rewards/chosen": 75.61830139160156, "rewards/margins": 142.3789520263672, "rewards/rejected": -66.76065063476562, "step": 3820, "u": -4.259571552276611, "weight": 0.06311958283185959 }, { "diff_generated": -66.66990661621094, "epoch": 1.2410887880751782, "grad_norm": 474.973404226213, "learning_rate": 5.83233734665198e-07, "logits/chosen": -2.437342405319214, "logits/rejected": -2.4982120990753174, "logps/chosen": -17.382713317871094, "logps/rejected": -150.70150756835938, "loss": 17.059, "losses_ref": -0.00036550246295519173, "ref_logps/chosen": -97.52508544921875, "ref_logps/rejected": -84.03162384033203, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 80.14237976074219, "rewards/margins": 146.81228637695312, "rewards/rejected": -66.66990661621094, "step": 3830, "u": -4.3889641761779785, "weight": 0.043765999376773834 }, { "diff_generated": -68.80332946777344, "epoch": 1.2443292287751135, "grad_norm": 423.7381461477668, "learning_rate": 5.818917780475196e-07, "logits/chosen": -2.475179672241211, "logits/rejected": -2.5922000408172607, "logps/chosen": -20.405536651611328, "logps/rejected": -158.35231018066406, "loss": 17.7137, "losses_ref": -2.940079468771728e-08, "ref_logps/chosen": -102.04740905761719, "ref_logps/rejected": -89.54898834228516, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 81.6418685913086, "rewards/margins": 150.4451904296875, "rewards/rejected": -68.80332946777344, "step": 3840, "u": -4.486301898956299, "weight": 0.01875000074505806 }, { "diff_generated": -66.65308380126953, "epoch": 1.2475696694750487, "grad_norm": 474.8560758696638, "learning_rate": 5.805472355176318e-07, "logits/chosen": -2.4920127391815186, "logits/rejected": -2.560584306716919, "logps/chosen": -17.58675765991211, "logps/rejected": -155.38302612304688, "loss": 17.4797, "losses_ref": -8.679247684995062e-07, "ref_logps/chosen": -97.6273422241211, "ref_logps/rejected": -88.72994995117188, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 80.04058074951172, "rewards/margins": 146.6936798095703, "rewards/rejected": -66.65308380126953, "step": 3850, "u": -4.330027103424072, "weight": 0.05000001937150955 }, { "diff_generated": -62.756385803222656, "epoch": 1.2508101101749838, "grad_norm": 430.1279253062814, "learning_rate": 5.792001261905767e-07, "logits/chosen": -2.454555034637451, "logits/rejected": -2.5446088314056396, "logps/chosen": -17.413721084594727, "logps/rejected": -142.61538696289062, "loss": 16.6907, "losses_ref": -3.525937700032955e-06, "ref_logps/chosen": -92.18282318115234, "ref_logps/rejected": -79.85899353027344, "rewards/accuracies": 0.90625, "rewards/chosen": 74.76910400390625, "rewards/margins": 137.52548217773438, "rewards/rejected": -62.756385803222656, "step": 3860, "u": -4.043120384216309, "weight": 0.09375004470348358 }, { "diff_generated": -64.377685546875, "epoch": 1.254050550874919, "grad_norm": 477.44499672360115, "learning_rate": 5.778504692178876e-07, "logits/chosen": -2.4434051513671875, "logits/rejected": -2.6090359687805176, "logps/chosen": -16.34039306640625, "logps/rejected": -143.81460571289062, "loss": 17.0992, "losses_ref": -5.5604403314646333e-05, "ref_logps/chosen": -90.14739227294922, "ref_logps/rejected": -79.43690490722656, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 73.80699157714844, "rewards/margins": 138.18466186523438, "rewards/rejected": -64.377685546875, "step": 3870, "u": -4.2456865310668945, "weight": 0.0750010758638382 }, { "diff_generated": -68.07923889160156, "epoch": 1.2572909915748542, "grad_norm": 403.44420896919473, "learning_rate": 5.76498283787317e-07, "logits/chosen": -2.462009906768799, "logits/rejected": -2.5193183422088623, "logps/chosen": -17.027873992919922, "logps/rejected": -157.0982666015625, "loss": 17.0411, "losses_ref": -7.056218720435936e-08, "ref_logps/chosen": -96.52754211425781, "ref_logps/rejected": -89.01904296875, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 79.49966430664062, "rewards/margins": 147.5789031982422, "rewards/rejected": -68.07923889160156, "step": 3880, "u": -4.3342485427856445, "weight": 0.05000000074505806 }, { "diff_generated": -65.75061798095703, "epoch": 1.2605314322747894, "grad_norm": 430.6980995406202, "learning_rate": 5.751435891225643e-07, "logits/chosen": -2.420855760574341, "logits/rejected": -2.534722089767456, "logps/chosen": -15.174592971801758, "logps/rejected": -149.7338409423828, "loss": 16.6038, "losses_ref": -0.0031283546704798937, "ref_logps/chosen": -89.49068450927734, "ref_logps/rejected": -83.98322296142578, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 74.31610107421875, "rewards/margins": 140.06671142578125, "rewards/rejected": -65.75061798095703, "step": 3890, "u": -4.326550006866455, "weight": 0.056396253407001495 }, { "diff_generated": -64.78160095214844, "epoch": 1.2637718729747245, "grad_norm": 454.27027889030245, "learning_rate": 5.737864044830015e-07, "logits/chosen": -2.4483044147491455, "logits/rejected": -2.5388526916503906, "logps/chosen": -18.663970947265625, "logps/rejected": -147.86756896972656, "loss": 17.4371, "losses_ref": -0.00033568666549399495, "ref_logps/chosen": -96.4852294921875, "ref_logps/rejected": -83.08597564697266, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 77.82125091552734, "rewards/margins": 142.6028594970703, "rewards/rejected": -64.78160095214844, "step": 3900, "u": -4.257702827453613, "weight": 0.0750146210193634 }, { "diff_generated": -65.7984390258789, "epoch": 1.2670123136746598, "grad_norm": 451.96926077815203, "learning_rate": 5.724267491634006e-07, "logits/chosen": -2.4495253562927246, "logits/rejected": -2.538529396057129, "logps/chosen": -17.021711349487305, "logps/rejected": -151.54751586914062, "loss": 17.0695, "losses_ref": -0.0006901304004713893, "ref_logps/chosen": -95.57925415039062, "ref_logps/rejected": -85.74907684326172, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 78.55754089355469, "rewards/margins": 144.35597229003906, "rewards/rejected": -65.7984390258789, "step": 3910, "u": -4.390527725219727, "weight": 0.0437803752720356 }, { "diff_generated": -64.09269714355469, "epoch": 1.270252754374595, "grad_norm": 412.65807296826927, "learning_rate": 5.710646424936581e-07, "logits/chosen": -2.475703716278076, "logits/rejected": -2.5469605922698975, "logps/chosen": -18.97299575805664, "logps/rejected": -147.84817504882812, "loss": 17.749, "losses_ref": -0.0020144921727478504, "ref_logps/chosen": -98.8651123046875, "ref_logps/rejected": -83.7554702758789, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 79.89212036132812, "rewards/margins": 143.98483276367188, "rewards/rejected": -64.09269714355469, "step": 3920, "u": -4.362476825714111, "weight": 0.04384419694542885 }, { "diff_generated": -66.27864074707031, "epoch": 1.27349319507453, "grad_norm": 459.39132209179314, "learning_rate": 5.697001038385212e-07, "logits/chosen": -2.4267189502716064, "logits/rejected": -2.5123836994171143, "logps/chosen": -18.38796615600586, "logps/rejected": -155.38739013671875, "loss": 17.6722, "losses_ref": -0.0027240305207669735, "ref_logps/chosen": -98.43446350097656, "ref_logps/rejected": -89.10877227783203, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 80.04649353027344, "rewards/margins": 146.32513427734375, "rewards/rejected": -66.27864074707031, "step": 3930, "u": -4.358931064605713, "weight": 0.04388491064310074 }, { "diff_generated": -66.59132385253906, "epoch": 1.2767336357744652, "grad_norm": 477.7976292344514, "learning_rate": 5.683331525973118e-07, "logits/chosen": -2.4096784591674805, "logits/rejected": -2.52586030960083, "logps/chosen": -18.428979873657227, "logps/rejected": -151.68357849121094, "loss": 17.9629, "losses_ref": -1.724712461736999e-08, "ref_logps/chosen": -94.28060913085938, "ref_logps/rejected": -85.09223937988281, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 75.85163116455078, "rewards/margins": 142.4429473876953, "rewards/rejected": -66.59132385253906, "step": 3940, "u": -4.3122944831848145, "weight": 0.05624999850988388 }, { "diff_generated": -65.66859436035156, "epoch": 1.2799740764744005, "grad_norm": 483.81987528369837, "learning_rate": 5.66963808203651e-07, "logits/chosen": -2.4494543075561523, "logits/rejected": -2.5477046966552734, "logps/chosen": -18.167545318603516, "logps/rejected": -149.3108673095703, "loss": 17.3193, "losses_ref": -0.00034220569068565965, "ref_logps/chosen": -96.05061340332031, "ref_logps/rejected": -83.64225006103516, "rewards/accuracies": 0.96875, "rewards/chosen": 77.88307189941406, "rewards/margins": 143.55166625976562, "rewards/rejected": -65.66859436035156, "step": 3950, "u": -4.404166221618652, "weight": 0.03126030042767525 }, { "diff_generated": -65.86676788330078, "epoch": 1.2832145171743357, "grad_norm": 464.5151411476406, "learning_rate": 5.65592090125183e-07, "logits/chosen": -2.4210243225097656, "logits/rejected": -2.5444581508636475, "logps/chosen": -13.709085464477539, "logps/rejected": -151.24037170410156, "loss": 16.9163, "losses_ref": -1.5870946299401112e-05, "ref_logps/chosen": -90.91244506835938, "ref_logps/rejected": -85.37360382080078, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 77.20335388183594, "rewards/margins": 143.07009887695312, "rewards/rejected": -65.86676788330078, "step": 3960, "u": -4.187623977661133, "weight": 0.07500036060810089 }, { "diff_generated": -67.22178649902344, "epoch": 1.286454957874271, "grad_norm": 458.0925132442424, "learning_rate": 5.642180178632977e-07, "logits/chosen": -2.4346470832824707, "logits/rejected": -2.551971912384033, "logps/chosen": -16.795867919921875, "logps/rejected": -153.08494567871094, "loss": 17.6376, "losses_ref": -0.0018662631046026945, "ref_logps/chosen": -93.50836181640625, "ref_logps/rejected": -85.8631591796875, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 76.7125015258789, "rewards/margins": 143.93429565429688, "rewards/rejected": -67.22178649902344, "step": 3970, "u": -4.368497848510742, "weight": 0.04383764788508415 }, { "diff_generated": -69.42669677734375, "epoch": 1.2896953985742061, "grad_norm": 451.47536908412064, "learning_rate": 5.628416109528542e-07, "logits/chosen": -2.418494939804077, "logits/rejected": -2.5364253520965576, "logps/chosen": -16.748294830322266, "logps/rejected": -157.46620178222656, "loss": 18.0641, "losses_ref": -0.0028826945926994085, "ref_logps/chosen": -93.15105438232422, "ref_logps/rejected": -88.03948974609375, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 76.40276336669922, "rewards/margins": 145.8294677734375, "rewards/rejected": -69.42669677734375, "step": 3980, "u": -4.361595153808594, "weight": 0.05014491081237793 }, { "diff_generated": -64.70690155029297, "epoch": 1.2929358392741412, "grad_norm": 454.28630354692615, "learning_rate": 5.614628889619029e-07, "logits/chosen": -2.3802146911621094, "logits/rejected": -2.545664072036743, "logps/chosen": -16.525279998779297, "logps/rejected": -149.8128204345703, "loss": 17.8829, "losses_ref": -0.005721858702600002, "ref_logps/chosen": -89.77972412109375, "ref_logps/rejected": -85.10591125488281, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 73.25444793701172, "rewards/margins": 137.9613494873047, "rewards/rejected": -64.70690155029297, "step": 3990, "u": -4.33136510848999, "weight": 0.05031445622444153 }, { "diff_generated": -64.35713195800781, "epoch": 1.2961762799740764, "grad_norm": 468.82409945638295, "learning_rate": 5.600818714914065e-07, "logits/chosen": -2.444451093673706, "logits/rejected": -2.533601760864258, "logps/chosen": -19.608978271484375, "logps/rejected": -150.1212158203125, "loss": 17.86, "losses_ref": -0.0031665258575230837, "ref_logps/chosen": -97.11079406738281, "ref_logps/rejected": -85.76406860351562, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 77.5018081665039, "rewards/margins": 141.8589324951172, "rewards/rejected": -64.35713195800781, "step": 4000, "u": -4.308414936065674, "weight": 0.056388258934020996 }, { "diff_generated": -69.15150451660156, "epoch": 1.2994167206740117, "grad_norm": 478.1275719327832, "learning_rate": 5.586985781749625e-07, "logits/chosen": -2.475505828857422, "logits/rejected": -2.670222043991089, "logps/chosen": -17.083032608032227, "logps/rejected": -163.42282104492188, "loss": 17.2895, "losses_ref": -0.0024460928980261087, "ref_logps/chosen": -93.89830780029297, "ref_logps/rejected": -94.27131652832031, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 76.81526184082031, "rewards/margins": 145.96676635742188, "rewards/rejected": -69.15150451660156, "step": 4010, "u": -4.460943698883057, "weight": 0.025116896256804466 }, { "diff_generated": -65.5688247680664, "epoch": 1.3026571613739468, "grad_norm": 438.468504539203, "learning_rate": 5.573130286785237e-07, "logits/chosen": -2.5178561210632324, "logits/rejected": -2.5199027061462402, "logps/chosen": -17.68651008605957, "logps/rejected": -150.85919189453125, "loss": 17.2796, "losses_ref": -0.0003696681815199554, "ref_logps/chosen": -98.16356658935547, "ref_logps/rejected": -85.29036712646484, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 80.47705078125, "rewards/margins": 146.04586791992188, "rewards/rejected": -65.5688247680664, "step": 4020, "u": -4.210076332092285, "weight": 0.07501642405986786 }, { "diff_generated": -65.79573059082031, "epoch": 1.3058976020738822, "grad_norm": 477.750017756696, "learning_rate": 5.559252427001178e-07, "logits/chosen": -2.386543035507202, "logits/rejected": -2.5044496059417725, "logps/chosen": -18.277788162231445, "logps/rejected": -144.05825805664062, "loss": 17.1326, "losses_ref": -0.00564918015152216, "ref_logps/chosen": -93.29234313964844, "ref_logps/rejected": -78.26251983642578, "rewards/accuracies": 0.9375, "rewards/chosen": 75.01456451416016, "rewards/margins": 140.81028747558594, "rewards/rejected": -65.79573059082031, "step": 4030, "u": -4.24915885925293, "weight": 0.06275015324354172 }, { "diff_generated": -67.20874786376953, "epoch": 1.3091380427738173, "grad_norm": 464.78850180166785, "learning_rate": 5.545352399695687e-07, "logits/chosen": -2.4252359867095947, "logits/rejected": -2.564512252807617, "logps/chosen": -17.03969955444336, "logps/rejected": -149.99732971191406, "loss": 17.5304, "losses_ref": -0.006088468246161938, "ref_logps/chosen": -92.34000396728516, "ref_logps/rejected": -82.78856658935547, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 75.30028533935547, "rewards/margins": 142.509033203125, "rewards/rejected": -67.20874786376953, "step": 4040, "u": -4.393402099609375, "weight": 0.037814319133758545 }, { "diff_generated": -64.21138000488281, "epoch": 1.3123784834737524, "grad_norm": 460.87277273602257, "learning_rate": 5.531430402482153e-07, "logits/chosen": -2.4368481636047363, "logits/rejected": -2.5155985355377197, "logps/chosen": -17.370777130126953, "logps/rejected": -152.25576782226562, "loss": 17.227, "losses_ref": -0.002279623644426465, "ref_logps/chosen": -94.12760162353516, "ref_logps/rejected": -88.04439544677734, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 76.75682067871094, "rewards/margins": 140.9682159423828, "rewards/rejected": -64.21138000488281, "step": 4050, "u": -4.247066497802734, "weight": 0.06885669380426407 }, { "diff_generated": -68.6576919555664, "epoch": 1.3156189241736875, "grad_norm": 416.17907967388055, "learning_rate": 5.517486633286299e-07, "logits/chosen": -2.417910099029541, "logits/rejected": -2.5245842933654785, "logps/chosen": -17.445659637451172, "logps/rejected": -155.00851440429688, "loss": 17.7637, "losses_ref": -4.589330160342797e-08, "ref_logps/chosen": -92.51380920410156, "ref_logps/rejected": -86.35084533691406, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 75.06814575195312, "rewards/margins": 143.725830078125, "rewards/rejected": -68.6576919555664, "step": 4060, "u": -4.233851432800293, "weight": 0.06875000149011612 }, { "diff_generated": -67.85105895996094, "epoch": 1.3188593648736229, "grad_norm": 447.42658631506015, "learning_rate": 5.503521290343384e-07, "logits/chosen": -2.4626007080078125, "logits/rejected": -2.5869998931884766, "logps/chosen": -18.858882904052734, "logps/rejected": -162.36654663085938, "loss": 17.8787, "losses_ref": -8.136340511555318e-06, "ref_logps/chosen": -96.63755798339844, "ref_logps/rejected": -94.51548767089844, "rewards/accuracies": 0.96875, "rewards/chosen": 77.77867126464844, "rewards/margins": 145.62974548339844, "rewards/rejected": -67.85105895996094, "step": 4070, "u": -4.423346519470215, "weight": 0.03125032037496567 }, { "diff_generated": -65.6728286743164, "epoch": 1.322099805573558, "grad_norm": 497.9260171796494, "learning_rate": 5.489534572195373e-07, "logits/chosen": -2.3953137397766113, "logits/rejected": -2.590769052505493, "logps/chosen": -15.14660930633545, "logps/rejected": -146.5998077392578, "loss": 17.2821, "losses_ref": -0.00026578555116429925, "ref_logps/chosen": -85.99039459228516, "ref_logps/rejected": -80.92698669433594, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 70.84378051757812, "rewards/margins": 136.5166015625, "rewards/rejected": -65.6728286743164, "step": 4080, "u": -4.190157890319824, "weight": 0.08126143366098404 }, { "diff_generated": -71.92523193359375, "epoch": 1.3253402462734931, "grad_norm": 485.0867769878626, "learning_rate": 5.47552667768811e-07, "logits/chosen": -2.394679546356201, "logits/rejected": -2.573765277862549, "logps/chosen": -14.7946195602417, "logps/rejected": -162.44032287597656, "loss": 17.2681, "losses_ref": -1.239477427361635e-07, "ref_logps/chosen": -90.20631408691406, "ref_logps/rejected": -90.51509094238281, "rewards/accuracies": 0.96875, "rewards/chosen": 75.41169738769531, "rewards/margins": 147.3369140625, "rewards/rejected": -71.92523193359375, "step": 4090, "u": -4.3911848068237305, "weight": 0.0312500037252903 }, { "diff_generated": -70.5416030883789, "epoch": 1.3285806869734285, "grad_norm": 424.5869132146945, "learning_rate": 5.46149780596851e-07, "logits/chosen": -2.434891939163208, "logits/rejected": -2.5823049545288086, "logps/chosen": -16.643686294555664, "logps/rejected": -159.75894165039062, "loss": 17.6156, "losses_ref": -0.004576454870402813, "ref_logps/chosen": -96.3367691040039, "ref_logps/rejected": -89.21734619140625, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 79.6930923461914, "rewards/margins": 150.2346649169922, "rewards/rejected": -70.5416030883789, "step": 4100, "u": -4.441787242889404, "weight": 0.02523742988705635 }, { "diff_generated": -67.7085189819336, "epoch": 1.3318211276733636, "grad_norm": 508.6914339973369, "learning_rate": 5.447448156481708e-07, "logits/chosen": -2.4687657356262207, "logits/rejected": -2.5582194328308105, "logps/chosen": -15.509539604187012, "logps/rejected": -157.65174865722656, "loss": 17.1468, "losses_ref": -0.01627708598971367, "ref_logps/chosen": -95.32611083984375, "ref_logps/rejected": -89.94322204589844, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 79.81657409667969, "rewards/margins": 147.5251007080078, "rewards/rejected": -67.7085189819336, "step": 4110, "u": -4.349822044372559, "weight": 0.04460912570357323 }, { "diff_generated": -68.07727813720703, "epoch": 1.3350615683732987, "grad_norm": 469.9815058204587, "learning_rate": 5.433377928968234e-07, "logits/chosen": -2.470069408416748, "logits/rejected": -2.5536696910858154, "logps/chosen": -18.169631958007812, "logps/rejected": -154.72691345214844, "loss": 17.7251, "losses_ref": -0.0014369834680110216, "ref_logps/chosen": -98.80369567871094, "ref_logps/rejected": -86.6496353149414, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 80.63407135009766, "rewards/margins": 148.71133422851562, "rewards/rejected": -68.07727813720703, "step": 4120, "u": -4.4524922370910645, "weight": 0.025073552504181862 }, { "diff_generated": -65.11859893798828, "epoch": 1.3383020090732338, "grad_norm": 424.64693099799103, "learning_rate": 5.41928732346117e-07, "logits/chosen": -2.4283761978149414, "logits/rejected": -2.5155179500579834, "logps/chosen": -18.113378524780273, "logps/rejected": -148.42088317871094, "loss": 17.4648, "losses_ref": -0.004977349191904068, "ref_logps/chosen": -95.88996887207031, "ref_logps/rejected": -83.30229187011719, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 77.7765884399414, "rewards/margins": 142.8951873779297, "rewards/rejected": -65.11859893798828, "step": 4130, "u": -4.399342060089111, "weight": 0.037747155874967575 }, { "diff_generated": -66.83191680908203, "epoch": 1.3415424497731692, "grad_norm": 465.3070412715531, "learning_rate": 5.405176540283311e-07, "logits/chosen": -2.426403522491455, "logits/rejected": -2.517180919647217, "logps/chosen": -17.60582733154297, "logps/rejected": -153.71560668945312, "loss": 17.6287, "losses_ref": -0.0006925761117599905, "ref_logps/chosen": -96.36148071289062, "ref_logps/rejected": -86.88368225097656, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 78.75565338134766, "rewards/margins": 145.5875701904297, "rewards/rejected": -66.83191680908203, "step": 4140, "u": -4.210391998291016, "weight": 0.0750294178724289 }, { "diff_generated": -70.90372467041016, "epoch": 1.3447828904731043, "grad_norm": 439.07340709547856, "learning_rate": 5.391045780044308e-07, "logits/chosen": -2.4661173820495605, "logits/rejected": -2.607022523880005, "logps/chosen": -18.112369537353516, "logps/rejected": -164.56256103515625, "loss": 17.2512, "losses_ref": -2.79961994920086e-07, "ref_logps/chosen": -96.12417602539062, "ref_logps/rejected": -93.6588363647461, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 78.01180267333984, "rewards/margins": 148.91552734375, "rewards/rejected": -70.90372467041016, "step": 4150, "u": -4.338640213012695, "weight": 0.043750010430812836 }, { "diff_generated": -63.74640655517578, "epoch": 1.3480233311730396, "grad_norm": 488.68903225387913, "learning_rate": 5.376895243637823e-07, "logits/chosen": -2.431675672531128, "logits/rejected": -2.4988338947296143, "logps/chosen": -18.850200653076172, "logps/rejected": -146.23924255371094, "loss": 17.7837, "losses_ref": -0.005620983429253101, "ref_logps/chosen": -94.14060974121094, "ref_logps/rejected": -82.49284362792969, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 75.29041290283203, "rewards/margins": 139.0368194580078, "rewards/rejected": -63.74640655517578, "step": 4160, "u": -4.236712455749512, "weight": 0.07528746873140335 }, { "diff_generated": -71.92745208740234, "epoch": 1.3512637718729748, "grad_norm": 476.3299931949927, "learning_rate": 5.362725132238672e-07, "logits/chosen": -2.426936149597168, "logits/rejected": -2.638857364654541, "logps/chosen": -16.467594146728516, "logps/rejected": -165.2958221435547, "loss": 17.3957, "losses_ref": -0.004750962369143963, "ref_logps/chosen": -91.93846130371094, "ref_logps/rejected": -93.3683853149414, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 75.47087097167969, "rewards/margins": 147.3983154296875, "rewards/rejected": -71.92745208740234, "step": 4170, "u": -4.38946008682251, "weight": 0.03774666786193848 }, { "diff_generated": -68.27117919921875, "epoch": 1.3545042125729099, "grad_norm": 467.16826897509446, "learning_rate": 5.348535647299964e-07, "logits/chosen": -2.4064178466796875, "logits/rejected": -2.5552902221679688, "logps/chosen": -17.2877140045166, "logps/rejected": -161.4751434326172, "loss": 17.1639, "losses_ref": -1.1598888249864103e-07, "ref_logps/chosen": -94.16259765625, "ref_logps/rejected": -93.2039566040039, "rewards/accuracies": 0.96875, "rewards/chosen": 76.87488555908203, "rewards/margins": 145.1460723876953, "rewards/rejected": -68.27117919921875, "step": 4180, "u": -4.43471097946167, "weight": 0.03125 }, { "diff_generated": -67.98145294189453, "epoch": 1.357744653272845, "grad_norm": 439.1505245186258, "learning_rate": 5.334326990550234e-07, "logits/chosen": -2.4439923763275146, "logits/rejected": -2.5619823932647705, "logps/chosen": -16.554126739501953, "logps/rejected": -155.3748016357422, "loss": 17.0193, "losses_ref": -7.341781838476891e-06, "ref_logps/chosen": -95.81574249267578, "ref_logps/rejected": -87.39334869384766, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 79.26161193847656, "rewards/margins": 147.24307250976562, "rewards/rejected": -67.98145294189453, "step": 4190, "u": -4.322827339172363, "weight": 0.05000028759241104 }, { "diff_generated": -69.60054016113281, "epoch": 1.3609850939727803, "grad_norm": 450.7519374284902, "learning_rate": 5.320099363990584e-07, "logits/chosen": -2.4455199241638184, "logits/rejected": -2.4962477684020996, "logps/chosen": -17.381702423095703, "logps/rejected": -153.2888641357422, "loss": 16.4503, "losses_ref": -0.0011503873392939568, "ref_logps/chosen": -98.63499450683594, "ref_logps/rejected": -83.68831634521484, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 81.25328063964844, "rewards/margins": 150.8538360595703, "rewards/rejected": -69.60054016113281, "step": 4200, "u": -4.400361061096191, "weight": 0.037554167211055756 }, { "diff_generated": -67.48835754394531, "epoch": 1.3642255346727155, "grad_norm": 421.7107776772206, "learning_rate": 5.305852969891799e-07, "logits/chosen": -2.4636857509613037, "logits/rejected": -2.502215623855591, "logps/chosen": -17.966630935668945, "logps/rejected": -146.44480895996094, "loss": 16.9674, "losses_ref": -2.9401869383605117e-08, "ref_logps/chosen": -96.73162841796875, "ref_logps/rejected": -78.95645141601562, "rewards/accuracies": 0.9375, "rewards/chosen": 78.76499938964844, "rewards/margins": 146.2533721923828, "rewards/rejected": -67.48835754394531, "step": 4210, "u": -4.2420148849487305, "weight": 0.0625 }, { "diff_generated": -63.33342742919922, "epoch": 1.3674659753726508, "grad_norm": 419.00702060951977, "learning_rate": 5.29158801079148e-07, "logits/chosen": -2.385185480117798, "logits/rejected": -2.465200901031494, "logps/chosen": -17.246349334716797, "logps/rejected": -141.04661560058594, "loss": 17.4708, "losses_ref": -0.0014706698711961508, "ref_logps/chosen": -95.12071228027344, "ref_logps/rejected": -77.71318054199219, "rewards/accuracies": 0.90625, "rewards/chosen": 77.87437438964844, "rewards/margins": 141.2078094482422, "rewards/rejected": -63.33342742919922, "step": 4220, "u": -4.143843650817871, "weight": 0.09382256120443344 }, { "diff_generated": -66.72733306884766, "epoch": 1.370706416072586, "grad_norm": 448.4120302430566, "learning_rate": 5.277304689491165e-07, "logits/chosen": -2.4468398094177246, "logits/rejected": -2.5708279609680176, "logps/chosen": -17.90654754638672, "logps/rejected": -146.7433624267578, "loss": 17.4143, "losses_ref": -0.0013314300449565053, "ref_logps/chosen": -93.74774932861328, "ref_logps/rejected": -80.01602172851562, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 75.84119415283203, "rewards/margins": 142.5685272216797, "rewards/rejected": -66.72733306884766, "step": 4230, "u": -4.346431732177734, "weight": 0.03756508231163025 }, { "diff_generated": -67.25651550292969, "epoch": 1.373946856772521, "grad_norm": 470.0274356799925, "learning_rate": 5.26300320905344e-07, "logits/chosen": -2.4239392280578613, "logits/rejected": -2.5337131023406982, "logps/chosen": -16.854785919189453, "logps/rejected": -150.7886505126953, "loss": 17.592, "losses_ref": -2.157007656933274e-06, "ref_logps/chosen": -96.3592300415039, "ref_logps/rejected": -83.5321273803711, "rewards/accuracies": 0.96875, "rewards/chosen": 79.50444030761719, "rewards/margins": 146.76095581054688, "rewards/rejected": -67.25651550292969, "step": 4240, "u": -4.427393913269043, "weight": 0.03125005215406418 }, { "diff_generated": -68.52471923828125, "epoch": 1.3771872974724562, "grad_norm": 466.15491091372587, "learning_rate": 5.248683772799054e-07, "logits/chosen": -2.4210152626037598, "logits/rejected": -2.4874701499938965, "logps/chosen": -19.08351707458496, "logps/rejected": -152.42335510253906, "loss": 17.0809, "losses_ref": -0.0020189809147268534, "ref_logps/chosen": -100.47675323486328, "ref_logps/rejected": -83.89862060546875, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 81.39323425292969, "rewards/margins": 149.91795349121094, "rewards/rejected": -68.52471923828125, "step": 4250, "u": -4.339241027832031, "weight": 0.04385364428162575 }, { "diff_generated": -68.41958618164062, "epoch": 1.3804277381723915, "grad_norm": 443.81081877289006, "learning_rate": 5.234346584304033e-07, "logits/chosen": -2.414257049560547, "logits/rejected": -2.509781837463379, "logps/chosen": -16.89341163635254, "logps/rejected": -152.24014282226562, "loss": 17.5102, "losses_ref": -0.001452519092708826, "ref_logps/chosen": -92.01890563964844, "ref_logps/rejected": -83.82056427001953, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 75.12548828125, "rewards/margins": 143.54507446289062, "rewards/rejected": -68.41958618164062, "step": 4260, "u": -4.205715179443359, "weight": 0.0688219666481018 }, { "diff_generated": -69.97354125976562, "epoch": 1.3836681788723266, "grad_norm": 442.71038008181597, "learning_rate": 5.21999184739678e-07, "logits/chosen": -2.4220805168151855, "logits/rejected": -2.494659662246704, "logps/chosen": -19.814462661743164, "logps/rejected": -158.04541015625, "loss": 17.7849, "losses_ref": -2.526703042349254e-07, "ref_logps/chosen": -98.62994384765625, "ref_logps/rejected": -88.07185363769531, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 78.81547546386719, "rewards/margins": 148.7890167236328, "rewards/rejected": -69.97354125976562, "step": 4270, "u": -4.459265232086182, "weight": 0.025000005960464478 }, { "diff_generated": -67.09244537353516, "epoch": 1.3869086195722617, "grad_norm": 506.04582752692124, "learning_rate": 5.205619766155182e-07, "logits/chosen": -2.443906307220459, "logits/rejected": -2.5445313453674316, "logps/chosen": -17.672672271728516, "logps/rejected": -150.82272338867188, "loss": 17.4618, "losses_ref": -0.008290953002870083, "ref_logps/chosen": -93.44664001464844, "ref_logps/rejected": -83.73028564453125, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 75.77397155761719, "rewards/margins": 142.8664093017578, "rewards/rejected": -67.09244537353516, "step": 4280, "u": -4.226373672485352, "weight": 0.06915529072284698 }, { "diff_generated": -67.34722900390625, "epoch": 1.390149060272197, "grad_norm": 445.24258122184585, "learning_rate": 5.191230544903702e-07, "logits/chosen": -2.4065587520599365, "logits/rejected": -2.4920034408569336, "logps/chosen": -15.778546333312988, "logps/rejected": -151.96900939941406, "loss": 17.0353, "losses_ref": -0.0008197773131541908, "ref_logps/chosen": -88.67896270751953, "ref_logps/rejected": -84.62178039550781, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 72.90040588378906, "rewards/margins": 140.24761962890625, "rewards/rejected": -67.34722900390625, "step": 4290, "u": -4.341623306274414, "weight": 0.05003521963953972 }, { "diff_generated": -66.68577575683594, "epoch": 1.3933895009721322, "grad_norm": 511.6361436706194, "learning_rate": 5.176824388210483e-07, "logits/chosen": -2.4076170921325684, "logits/rejected": -2.5193543434143066, "logps/chosen": -17.607181549072266, "logps/rejected": -152.07278442382812, "loss": 17.4805, "losses_ref": -0.00011498709500301629, "ref_logps/chosen": -92.15415954589844, "ref_logps/rejected": -85.38701629638672, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 74.54698181152344, "rewards/margins": 141.23277282714844, "rewards/rejected": -66.68577575683594, "step": 4300, "u": -4.220432281494141, "weight": 0.0750034898519516 }, { "diff_generated": -66.54644775390625, "epoch": 1.3966299416720673, "grad_norm": 468.88529082210414, "learning_rate": 5.162401500884432e-07, "logits/chosen": -2.427182674407959, "logits/rejected": -2.504462957382202, "logps/chosen": -17.371049880981445, "logps/rejected": -148.7251739501953, "loss": 17.4356, "losses_ref": -0.015387284569442272, "ref_logps/chosen": -95.58000946044922, "ref_logps/rejected": -82.17872619628906, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 78.2089614868164, "rewards/margins": 144.75540161132812, "rewards/rejected": -66.54644775390625, "step": 4310, "u": -4.298781394958496, "weight": 0.057079873979091644 }, { "diff_generated": -66.85215759277344, "epoch": 1.3998703823720027, "grad_norm": 469.3847642908731, "learning_rate": 5.147962087972314e-07, "logits/chosen": -2.413745403289795, "logits/rejected": -2.462054491043091, "logps/chosen": -18.344039916992188, "logps/rejected": -151.5030975341797, "loss": 17.8151, "losses_ref": -9.710121048556175e-06, "ref_logps/chosen": -97.64744567871094, "ref_logps/rejected": -84.65092468261719, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 79.30342102050781, "rewards/margins": 146.1555633544922, "rewards/rejected": -66.85215759277344, "step": 4320, "u": -4.29888916015625, "weight": 0.05625038221478462 }, { "diff_generated": -70.12738037109375, "epoch": 1.4031108230719378, "grad_norm": 468.64992596686636, "learning_rate": 5.133506354755833e-07, "logits/chosen": -2.435763359069824, "logits/rejected": -2.5422844886779785, "logps/chosen": -15.104168891906738, "logps/rejected": -155.98138427734375, "loss": 16.8697, "losses_ref": -0.0039013822097331285, "ref_logps/chosen": -91.3575439453125, "ref_logps/rejected": -85.85398864746094, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 76.25337982177734, "rewards/margins": 146.38075256347656, "rewards/rejected": -70.12738037109375, "step": 4330, "u": -4.263181686401367, "weight": 0.05643879249691963 }, { "diff_generated": -67.4532470703125, "epoch": 1.406351263771873, "grad_norm": 457.9270962811255, "learning_rate": 5.119034506748713e-07, "logits/chosen": -2.3538706302642822, "logits/rejected": -2.440309762954712, "logps/chosen": -16.107725143432617, "logps/rejected": -151.02162170410156, "loss": 17.015, "losses_ref": -0.0006191584980115294, "ref_logps/chosen": -91.54263305664062, "ref_logps/rejected": -83.56836700439453, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 75.4349136352539, "rewards/margins": 142.88815307617188, "rewards/rejected": -67.4532470703125, "step": 4340, "u": -4.243154048919678, "weight": 0.06877802312374115 }, { "diff_generated": -66.83910369873047, "epoch": 1.4095917044718083, "grad_norm": 464.18351139838603, "learning_rate": 5.104546749693781e-07, "logits/chosen": -2.4061524868011475, "logits/rejected": -2.5223965644836426, "logps/chosen": -18.637516021728516, "logps/rejected": -149.72984313964844, "loss": 17.3378, "losses_ref": -0.001131789991632104, "ref_logps/chosen": -96.01673889160156, "ref_logps/rejected": -82.8907470703125, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 77.37923431396484, "rewards/margins": 144.2183380126953, "rewards/rejected": -66.83910369873047, "step": 4350, "u": -4.342132568359375, "weight": 0.04380253329873085 }, { "diff_generated": -68.60645294189453, "epoch": 1.4128321451717434, "grad_norm": 457.0813142753334, "learning_rate": 5.09004328956004e-07, "logits/chosen": -2.4368767738342285, "logits/rejected": -2.5215744972229004, "logps/chosen": -17.591197967529297, "logps/rejected": -153.7823028564453, "loss": 17.2265, "losses_ref": -1.2529037576314295e-06, "ref_logps/chosen": -95.30029296875, "ref_logps/rejected": -85.17585754394531, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 77.70909118652344, "rewards/margins": 146.3155517578125, "rewards/rejected": -68.60645294189453, "step": 4360, "u": -4.3323163986206055, "weight": 0.05625004693865776 }, { "diff_generated": -66.69379425048828, "epoch": 1.4160725858716785, "grad_norm": 411.08045984959665, "learning_rate": 5.075524332539736e-07, "logits/chosen": -2.398942470550537, "logits/rejected": -2.455166816711426, "logps/chosen": -16.874691009521484, "logps/rejected": -152.13819885253906, "loss": 17.3265, "losses_ref": -1.1959253242821433e-05, "ref_logps/chosen": -95.65223693847656, "ref_logps/rejected": -85.44440460205078, "rewards/accuracies": 0.96875, "rewards/chosen": 78.77754211425781, "rewards/margins": 145.47132873535156, "rewards/rejected": -66.69379425048828, "step": 4370, "u": -4.406135559082031, "weight": 0.03125051409006119 }, { "diff_generated": -69.11666107177734, "epoch": 1.4193130265716136, "grad_norm": 461.40170637984556, "learning_rate": 5.060990085045432e-07, "logits/chosen": -2.4089365005493164, "logits/rejected": -2.5167927742004395, "logps/chosen": -17.30145263671875, "logps/rejected": -155.79261779785156, "loss": 17.6613, "losses_ref": -1.8280853453234158e-07, "ref_logps/chosen": -93.44625091552734, "ref_logps/rejected": -86.67595672607422, "rewards/accuracies": 0.96875, "rewards/chosen": 76.1447982788086, "rewards/margins": 145.26145935058594, "rewards/rejected": -69.11666107177734, "step": 4380, "u": -4.437478542327881, "weight": 0.0312500037252903 }, { "diff_generated": -68.13085174560547, "epoch": 1.422553467271549, "grad_norm": 466.4529799309482, "learning_rate": 5.046440753707077e-07, "logits/chosen": -2.481142520904541, "logits/rejected": -2.5197412967681885, "logps/chosen": -15.788442611694336, "logps/rejected": -153.73379516601562, "loss": 17.2616, "losses_ref": -0.0003627826808951795, "ref_logps/chosen": -96.31988525390625, "ref_logps/rejected": -85.60295104980469, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 80.53144836425781, "rewards/margins": 148.6623077392578, "rewards/rejected": -68.13085174560547, "step": 4390, "u": -4.33340311050415, "weight": 0.05626552179455757 }, { "diff_generated": -66.99527740478516, "epoch": 1.425793907971484, "grad_norm": 473.0960654748571, "learning_rate": 5.031876545369054e-07, "logits/chosen": -2.461520195007324, "logits/rejected": -2.5205090045928955, "logps/chosen": -17.834148406982422, "logps/rejected": -149.48910522460938, "loss": 17.8125, "losses_ref": -0.00037082930793985724, "ref_logps/chosen": -95.21051025390625, "ref_logps/rejected": -82.49385070800781, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 77.37635803222656, "rewards/margins": 144.37164306640625, "rewards/rejected": -66.99527740478516, "step": 4400, "u": -4.373237133026123, "weight": 0.05001600459218025 }, { "diff_generated": -69.36138153076172, "epoch": 1.4290343486714194, "grad_norm": 456.7809003773492, "learning_rate": 5.017297667087257e-07, "logits/chosen": -2.4498043060302734, "logits/rejected": -2.535263776779175, "logps/chosen": -18.219280242919922, "logps/rejected": -152.06491088867188, "loss": 17.463, "losses_ref": -0.0010456106392666698, "ref_logps/chosen": -96.76631927490234, "ref_logps/rejected": -82.70353698730469, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 78.54704284667969, "rewards/margins": 147.90841674804688, "rewards/rejected": -69.36138153076172, "step": 4410, "u": -4.371595859527588, "weight": 0.037546005100011826 }, { "diff_generated": -69.4170150756836, "epoch": 1.4322747893713546, "grad_norm": 494.26461007460443, "learning_rate": 5.002704326126135e-07, "logits/chosen": -2.489748954772949, "logits/rejected": -2.5593252182006836, "logps/chosen": -18.945158004760742, "logps/rejected": -156.34237670898438, "loss": 17.6249, "losses_ref": -0.0002525493036955595, "ref_logps/chosen": -98.93243408203125, "ref_logps/rejected": -86.92535400390625, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 79.9872817993164, "rewards/margins": 149.404296875, "rewards/rejected": -69.4170150756836, "step": 4420, "u": -4.320488929748535, "weight": 0.056261636316776276 }, { "diff_generated": -70.99942779541016, "epoch": 1.4355152300712897, "grad_norm": 501.8354939197857, "learning_rate": 4.988096729955751e-07, "logits/chosen": -2.479881763458252, "logits/rejected": -2.568162202835083, "logps/chosen": -16.15049171447754, "logps/rejected": -156.91697692871094, "loss": 17.3995, "losses_ref": -0.00015837197133805603, "ref_logps/chosen": -97.31709289550781, "ref_logps/rejected": -85.91755676269531, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 81.1666030883789, "rewards/margins": 152.16603088378906, "rewards/rejected": -70.99942779541016, "step": 4430, "u": -4.261697292327881, "weight": 0.06875661015510559 }, { "diff_generated": -66.48624420166016, "epoch": 1.4387556707712248, "grad_norm": 489.7294121744171, "learning_rate": 4.97347508624883e-07, "logits/chosen": -2.4843106269836426, "logits/rejected": -2.5230603218078613, "logps/chosen": -16.20532989501953, "logps/rejected": -149.9473114013672, "loss": 16.8241, "losses_ref": -0.0029774392023682594, "ref_logps/chosen": -94.02255249023438, "ref_logps/rejected": -83.46109008789062, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 77.81721496582031, "rewards/margins": 144.30345153808594, "rewards/rejected": -66.48624420166016, "step": 4440, "u": -4.226017951965332, "weight": 0.08138980716466904 }, { "diff_generated": -69.31263732910156, "epoch": 1.4419961114711601, "grad_norm": 434.07506473374804, "learning_rate": 4.958839602877809e-07, "logits/chosen": -2.417771816253662, "logits/rejected": -2.535672664642334, "logps/chosen": -17.75906753540039, "logps/rejected": -155.42984008789062, "loss": 17.376, "losses_ref": -2.4924074750742875e-05, "ref_logps/chosen": -93.65541076660156, "ref_logps/rejected": -86.11720275878906, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 75.8963394165039, "rewards/margins": 145.20896911621094, "rewards/rejected": -69.31263732910156, "step": 4450, "u": -4.432461261749268, "weight": 0.03750060871243477 }, { "diff_generated": -67.58025360107422, "epoch": 1.4452365521710953, "grad_norm": 433.5516392365605, "learning_rate": 4.944190487911878e-07, "logits/chosen": -2.4273107051849365, "logits/rejected": -2.530043125152588, "logps/chosen": -16.647306442260742, "logps/rejected": -154.08511352539062, "loss": 17.6515, "losses_ref": -0.002214368199929595, "ref_logps/chosen": -94.8519058227539, "ref_logps/rejected": -86.5048599243164, "rewards/accuracies": 0.9375, "rewards/chosen": 78.20460510253906, "rewards/margins": 145.78488159179688, "rewards/rejected": -67.58025360107422, "step": 4460, "u": -4.280516624450684, "weight": 0.06258859485387802 }, { "diff_generated": -65.32229614257812, "epoch": 1.4484769928710304, "grad_norm": 446.3728833764338, "learning_rate": 4.929527949614025e-07, "logits/chosen": -2.4531006813049316, "logits/rejected": -2.4834418296813965, "logps/chosen": -18.64037322998047, "logps/rejected": -147.5410614013672, "loss": 17.427, "losses_ref": -9.17950728762662e-06, "ref_logps/chosen": -100.15422058105469, "ref_logps/rejected": -82.21876525878906, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 81.51384735107422, "rewards/margins": 146.83615112304688, "rewards/rejected": -65.32229614257812, "step": 4470, "u": -4.405593395233154, "weight": 0.037500280886888504 }, { "diff_generated": -67.51651000976562, "epoch": 1.4517174335709657, "grad_norm": 476.64036777111323, "learning_rate": 4.914852196438077e-07, "logits/chosen": -2.4044480323791504, "logits/rejected": -2.51806640625, "logps/chosen": -17.459749221801758, "logps/rejected": -153.8185272216797, "loss": 17.796, "losses_ref": -0.00025501454365439713, "ref_logps/chosen": -93.2803955078125, "ref_logps/rejected": -86.3020248413086, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 75.8206558227539, "rewards/margins": 143.337158203125, "rewards/rejected": -67.51651000976562, "step": 4480, "u": -4.34688663482666, "weight": 0.043761108070611954 }, { "diff_generated": -71.80397033691406, "epoch": 1.4549578742709008, "grad_norm": 495.62260448956147, "learning_rate": 4.900163437025727e-07, "logits/chosen": -2.413210153579712, "logits/rejected": -2.5546677112579346, "logps/chosen": -17.38454818725586, "logps/rejected": -161.24551391601562, "loss": 17.9293, "losses_ref": -2.827008529493469e-07, "ref_logps/chosen": -92.62767028808594, "ref_logps/rejected": -89.44153594970703, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 75.24312591552734, "rewards/margins": 147.047119140625, "rewards/rejected": -71.80397033691406, "step": 4490, "u": -4.375165939331055, "weight": 0.04375000298023224 }, { "diff_generated": -68.02531433105469, "epoch": 1.458198314970836, "grad_norm": 504.03590321188335, "learning_rate": 4.885461880203582e-07, "logits/chosen": -2.4162282943725586, "logits/rejected": -2.5359253883361816, "logps/chosen": -17.028575897216797, "logps/rejected": -152.71405029296875, "loss": 16.9765, "losses_ref": -0.0028386306948959827, "ref_logps/chosen": -91.15119171142578, "ref_logps/rejected": -84.688720703125, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 74.12261199951172, "rewards/margins": 142.14793395996094, "rewards/rejected": -68.02531433105469, "step": 4500, "u": -4.2423601150512695, "weight": 0.056392062455415726 }, { "diff_generated": -71.33853149414062, "epoch": 1.4614387556707713, "grad_norm": 409.8261741998233, "learning_rate": 4.870747734980186e-07, "logits/chosen": -2.4482059478759766, "logits/rejected": -2.5609679222106934, "logps/chosen": -16.207927703857422, "logps/rejected": -163.0063018798828, "loss": 17.6178, "losses_ref": -0.003997477702796459, "ref_logps/chosen": -96.40123748779297, "ref_logps/rejected": -91.66776275634766, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 80.19331359863281, "rewards/margins": 151.53182983398438, "rewards/rejected": -71.33853149414062, "step": 4510, "u": -4.368565082550049, "weight": 0.043938539922237396 }, { "diff_generated": -63.6971321105957, "epoch": 1.4646791963707064, "grad_norm": 452.7392547705994, "learning_rate": 4.856021210543043e-07, "logits/chosen": -2.370863437652588, "logits/rejected": -2.4753031730651855, "logps/chosen": -17.3338565826416, "logps/rejected": -141.4090118408203, "loss": 17.7314, "losses_ref": -8.973429430625401e-07, "ref_logps/chosen": -86.48564147949219, "ref_logps/rejected": -77.71188354492188, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 69.15177917480469, "rewards/margins": 132.8489227294922, "rewards/rejected": -63.6971321105957, "step": 4520, "u": -4.074034214019775, "weight": 0.10625002533197403 }, { "diff_generated": -69.9952163696289, "epoch": 1.4679196370706415, "grad_norm": 470.16356446008257, "learning_rate": 4.841282516255653e-07, "logits/chosen": -2.497943878173828, "logits/rejected": -2.5594377517700195, "logps/chosen": -18.48931312561035, "logps/rejected": -158.583251953125, "loss": 17.7948, "losses_ref": -0.000756235618609935, "ref_logps/chosen": -101.09172821044922, "ref_logps/rejected": -88.58805847167969, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 82.6024169921875, "rewards/margins": 152.59762573242188, "rewards/rejected": -69.9952163696289, "step": 4530, "u": -4.4657979011535645, "weight": 0.02503257431089878 }, { "diff_generated": -70.44725036621094, "epoch": 1.471160077770577, "grad_norm": 446.5388305269726, "learning_rate": 4.826531861654537e-07, "logits/chosen": -2.418222665786743, "logits/rejected": -2.4776690006256104, "logps/chosen": -18.502029418945312, "logps/rejected": -159.91282653808594, "loss": 17.1606, "losses_ref": -0.0012745390413329005, "ref_logps/chosen": -98.4915542602539, "ref_logps/rejected": -89.46559143066406, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 79.98951721191406, "rewards/margins": 150.436767578125, "rewards/rejected": -70.44725036621094, "step": 4540, "u": -4.396437168121338, "weight": 0.03755884990096092 }, { "diff_generated": -72.79644012451172, "epoch": 1.474400518470512, "grad_norm": 472.8502607929755, "learning_rate": 4.811769456446243e-07, "logits/chosen": -2.4596428871154785, "logits/rejected": -2.5735411643981934, "logps/chosen": -16.844120025634766, "logps/rejected": -159.51223754882812, "loss": 17.0855, "losses_ref": -3.752495558728697e-07, "ref_logps/chosen": -95.08149719238281, "ref_logps/rejected": -86.71580505371094, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 78.23738098144531, "rewards/margins": 151.0338134765625, "rewards/rejected": -72.79644012451172, "step": 4550, "u": -4.390454292297363, "weight": 0.03750000521540642 }, { "diff_generated": -68.4251937866211, "epoch": 1.4776409591704471, "grad_norm": 501.1855634488351, "learning_rate": 4.796995510504384e-07, "logits/chosen": -2.402315378189087, "logits/rejected": -2.5745930671691895, "logps/chosen": -15.994440078735352, "logps/rejected": -153.82589721679688, "loss": 17.8596, "losses_ref": -6.664349712082185e-07, "ref_logps/chosen": -89.04400634765625, "ref_logps/rejected": -85.40069580078125, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 73.04957580566406, "rewards/margins": 141.47476196289062, "rewards/rejected": -68.4251937866211, "step": 4560, "u": -4.18105936050415, "weight": 0.07500000298023224 }, { "diff_generated": -67.21964263916016, "epoch": 1.4808813998703823, "grad_norm": 453.5729849326357, "learning_rate": 4.782210233866637e-07, "logits/chosen": -2.4193265438079834, "logits/rejected": -2.5252389907836914, "logps/chosen": -16.418968200683594, "logps/rejected": -156.79345703125, "loss": 16.8052, "losses_ref": -2.263903979837778e-06, "ref_logps/chosen": -92.21155548095703, "ref_logps/rejected": -89.57381439208984, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 75.7925796508789, "rewards/margins": 143.01222229003906, "rewards/rejected": -67.21964263916016, "step": 4570, "u": -4.222241401672363, "weight": 0.06875006854534149 }, { "diff_generated": -69.27766418457031, "epoch": 1.4841218405703176, "grad_norm": 419.96379946452873, "learning_rate": 4.76741383673177e-07, "logits/chosen": -2.446869373321533, "logits/rejected": -2.5299861431121826, "logps/chosen": -16.789928436279297, "logps/rejected": -155.53073120117188, "loss": 17.2807, "losses_ref": -0.0027264286763966084, "ref_logps/chosen": -94.69251251220703, "ref_logps/rejected": -86.25306701660156, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 77.902587890625, "rewards/margins": 147.1802520751953, "rewards/rejected": -69.27766418457031, "step": 4580, "u": -4.301576614379883, "weight": 0.05639181658625603 }, { "diff_generated": -65.70954895019531, "epoch": 1.4873622812702527, "grad_norm": 501.90105626271105, "learning_rate": 4.752606529456648e-07, "logits/chosen": -2.410231113433838, "logits/rejected": -2.5358357429504395, "logps/chosen": -15.482803344726562, "logps/rejected": -148.21693420410156, "loss": 17.3371, "losses_ref": -0.0004646036250051111, "ref_logps/chosen": -90.462646484375, "ref_logps/rejected": -82.50740051269531, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 74.97984313964844, "rewards/margins": 140.68939208984375, "rewards/rejected": -65.70954895019531, "step": 4590, "u": -4.142441272735596, "weight": 0.0875222310423851 }, { "diff_generated": -72.26192474365234, "epoch": 1.490602721970188, "grad_norm": 442.7557677460933, "learning_rate": 4.7377885225532396e-07, "logits/chosen": -2.459202289581299, "logits/rejected": -2.5650484561920166, "logps/chosen": -15.90203857421875, "logps/rejected": -164.494873046875, "loss": 16.9939, "losses_ref": -3.4513675473135663e-06, "ref_logps/chosen": -95.97832489013672, "ref_logps/rejected": -92.23296356201172, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 80.07628631591797, "rewards/margins": 152.3382110595703, "rewards/rejected": -72.26192474365234, "step": 4600, "u": -4.404824256896973, "weight": 0.037500057369470596 }, { "diff_generated": -67.0380630493164, "epoch": 1.4938431626701232, "grad_norm": 452.2425383294523, "learning_rate": 4.722960026685633e-07, "logits/chosen": -2.4154162406921387, "logits/rejected": -2.517221689224243, "logps/chosen": -16.425350189208984, "logps/rejected": -149.91822814941406, "loss": 16.4547, "losses_ref": -0.008105043321847916, "ref_logps/chosen": -91.67552185058594, "ref_logps/rejected": -82.88018035888672, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 75.25016021728516, "rewards/margins": 142.28822326660156, "rewards/rejected": -67.0380630493164, "step": 4610, "u": -4.350281238555908, "weight": 0.05040092393755913 }, { "diff_generated": -67.85414123535156, "epoch": 1.4970836033700583, "grad_norm": 441.68144740480705, "learning_rate": 4.7081212526670267e-07, "logits/chosen": -2.3970372676849365, "logits/rejected": -2.45599102973938, "logps/chosen": -19.708385467529297, "logps/rejected": -152.5522918701172, "loss": 17.0668, "losses_ref": -0.0015395600348711014, "ref_logps/chosen": -96.54747009277344, "ref_logps/rejected": -84.69816589355469, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 76.83908081054688, "rewards/margins": 144.69322204589844, "rewards/rejected": -67.85414123535156, "step": 4620, "u": -4.298979759216309, "weight": 0.05007138103246689 }, { "diff_generated": -67.05535888671875, "epoch": 1.5003240440699934, "grad_norm": 453.4773167246375, "learning_rate": 4.693272411456753e-07, "logits/chosen": -2.4733786582946777, "logits/rejected": -2.5276851654052734, "logps/chosen": -17.27235984802246, "logps/rejected": -150.65411376953125, "loss": 17.2596, "losses_ref": -0.0004150184686295688, "ref_logps/chosen": -94.43132019042969, "ref_logps/rejected": -83.59877014160156, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 77.15895080566406, "rewards/margins": 144.21432495117188, "rewards/rejected": -67.05535888671875, "step": 4630, "u": -4.372154712677002, "weight": 0.05000603199005127 }, { "diff_generated": -70.12632751464844, "epoch": 1.5035644847699285, "grad_norm": 467.84998763747177, "learning_rate": 4.6784137141572566e-07, "logits/chosen": -2.4407970905303955, "logits/rejected": -2.5051474571228027, "logps/chosen": -16.63579750061035, "logps/rejected": -156.5682373046875, "loss": 16.9536, "losses_ref": -2.0801840037165675e-06, "ref_logps/chosen": -96.26951599121094, "ref_logps/rejected": -86.44189453125, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 79.63372039794922, "rewards/margins": 149.7600555419922, "rewards/rejected": -70.12632751464844, "step": 4640, "u": -4.290956497192383, "weight": 0.056250084191560745 }, { "diff_generated": -64.9892807006836, "epoch": 1.5068049254698639, "grad_norm": 484.2847531059146, "learning_rate": 4.6635453720111096e-07, "logits/chosen": -2.423842430114746, "logits/rejected": -2.5103225708007812, "logps/chosen": -17.545272827148438, "logps/rejected": -148.31085205078125, "loss": 16.5725, "losses_ref": -1.28770659557631e-06, "ref_logps/chosen": -92.8660659790039, "ref_logps/rejected": -83.32157897949219, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 75.32079315185547, "rewards/margins": 140.31008911132812, "rewards/rejected": -64.9892807006836, "step": 4650, "u": -4.208277702331543, "weight": 0.07500003278255463 }, { "diff_generated": -71.25359344482422, "epoch": 1.5100453661697992, "grad_norm": 489.3099876004456, "learning_rate": 4.6486675963980014e-07, "logits/chosen": -2.4613852500915527, "logits/rejected": -2.613790512084961, "logps/chosen": -17.93228530883789, "logps/rejected": -161.30078125, "loss": 17.3612, "losses_ref": -0.007633232977241278, "ref_logps/chosen": -93.60543823242188, "ref_logps/rejected": -90.04718780517578, "rewards/accuracies": 0.96875, "rewards/chosen": 75.67315673828125, "rewards/margins": 146.92674255371094, "rewards/rejected": -71.25359344482422, "step": 4660, "u": -4.4339728355407715, "weight": 0.031660519540309906 }, { "diff_generated": -65.96153259277344, "epoch": 1.5132858068697344, "grad_norm": 442.07501282365024, "learning_rate": 4.633780598831733e-07, "logits/chosen": -2.4787216186523438, "logits/rejected": -2.5391018390655518, "logps/chosen": -19.442729949951172, "logps/rejected": -153.48989868164062, "loss": 16.5713, "losses_ref": -8.795756798463117e-07, "ref_logps/chosen": -98.21834564208984, "ref_logps/rejected": -87.52836608886719, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 78.77561950683594, "rewards/margins": 144.73715209960938, "rewards/rejected": -65.96153259277344, "step": 4670, "u": -4.303662300109863, "weight": 0.05625002458691597 }, { "diff_generated": -69.84078979492188, "epoch": 1.5165262475696695, "grad_norm": 494.0586391820382, "learning_rate": 4.6188845909572143e-07, "logits/chosen": -2.4449756145477295, "logits/rejected": -2.5386128425598145, "logps/chosen": -15.847773551940918, "logps/rejected": -155.44656372070312, "loss": 17.7909, "losses_ref": -0.003316181479021907, "ref_logps/chosen": -92.31333923339844, "ref_logps/rejected": -85.60575866699219, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 76.46556854248047, "rewards/margins": 146.30636596679688, "rewards/rejected": -69.84078979492188, "step": 4680, "u": -4.302974224090576, "weight": 0.05641231685876846 }, { "diff_generated": -69.53398895263672, "epoch": 1.5197666882696046, "grad_norm": 452.3262545659709, "learning_rate": 4.603979784547451e-07, "logits/chosen": -2.3991618156433105, "logits/rejected": -2.524679183959961, "logps/chosen": -17.85516357421875, "logps/rejected": -156.85577392578125, "loss": 17.0628, "losses_ref": -0.00722300773486495, "ref_logps/chosen": -93.98219299316406, "ref_logps/rejected": -87.32179260253906, "rewards/accuracies": 0.9375, "rewards/chosen": 76.12702178955078, "rewards/margins": 145.6610107421875, "rewards/rejected": -69.53398895263672, "step": 4690, "u": -4.316760063171387, "weight": 0.0628783255815506 }, { "diff_generated": -70.70417022705078, "epoch": 1.5230071289695397, "grad_norm": 483.5096682337427, "learning_rate": 4.5890663915005364e-07, "logits/chosen": -2.4551730155944824, "logits/rejected": -2.5739877223968506, "logps/chosen": -15.682329177856445, "logps/rejected": -162.4754180908203, "loss": 17.2193, "losses_ref": -0.0009247121633961797, "ref_logps/chosen": -94.73026275634766, "ref_logps/rejected": -91.77125549316406, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 79.04792785644531, "rewards/margins": 149.75210571289062, "rewards/rejected": -70.70417022705078, "step": 4700, "u": -4.416566848754883, "weight": 0.03754302114248276 }, { "diff_generated": -68.02668762207031, "epoch": 1.526247569669475, "grad_norm": 474.65671883190913, "learning_rate": 4.574144623836637e-07, "logits/chosen": -2.4455883502960205, "logits/rejected": -2.5542654991149902, "logps/chosen": -16.18857192993164, "logps/rejected": -154.2319793701172, "loss": 17.6006, "losses_ref": -0.0005734398728236556, "ref_logps/chosen": -95.07334899902344, "ref_logps/rejected": -86.20530700683594, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 78.88477325439453, "rewards/margins": 146.91143798828125, "rewards/rejected": -68.02668762207031, "step": 4710, "u": -4.256333827972412, "weight": 0.06877660006284714 }, { "diff_generated": -66.6358413696289, "epoch": 1.5294880103694104, "grad_norm": 397.73122217993006, "learning_rate": 4.5592146936949785e-07, "logits/chosen": -2.432037353515625, "logits/rejected": -2.531618118286133, "logps/chosen": -18.91817855834961, "logps/rejected": -152.69790649414062, "loss": 16.7594, "losses_ref": -0.0002670374815352261, "ref_logps/chosen": -95.0885009765625, "ref_logps/rejected": -86.06204223632812, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 76.17031860351562, "rewards/margins": 142.80618286132812, "rewards/rejected": -66.6358413696289, "step": 4720, "u": -4.290534496307373, "weight": 0.050011225044727325 }, { "diff_generated": -68.60977935791016, "epoch": 1.5327284510693455, "grad_norm": 445.9665979961661, "learning_rate": 4.544276813330835e-07, "logits/chosen": -2.460371971130371, "logits/rejected": -2.5344886779785156, "logps/chosen": -16.79252052307129, "logps/rejected": -152.0134735107422, "loss": 17.1902, "losses_ref": -3.2375027103626053e-07, "ref_logps/chosen": -98.1429214477539, "ref_logps/rejected": -83.40367889404297, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 81.35038757324219, "rewards/margins": 149.96017456054688, "rewards/rejected": -68.60977935791016, "step": 4730, "u": -4.352096080780029, "weight": 0.050000011920928955 }, { "diff_generated": -72.2760009765625, "epoch": 1.5359688917692806, "grad_norm": 476.0648268098202, "learning_rate": 4.529331195112501e-07, "logits/chosen": -2.405479907989502, "logits/rejected": -2.544811725616455, "logps/chosen": -17.25438117980957, "logps/rejected": -162.0577850341797, "loss": 17.2953, "losses_ref": -0.0028177141211926937, "ref_logps/chosen": -97.26611328125, "ref_logps/rejected": -89.78179931640625, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 80.01172637939453, "rewards/margins": 152.28775024414062, "rewards/rejected": -72.2760009765625, "step": 4740, "u": -4.424232482910156, "weight": 0.025139007717370987 }, { "diff_generated": -68.67704772949219, "epoch": 1.5392093324692158, "grad_norm": 425.2681629255773, "learning_rate": 4.5143780515182833e-07, "logits/chosen": -2.4343087673187256, "logits/rejected": -2.5154240131378174, "logps/chosen": -20.168615341186523, "logps/rejected": -156.32225036621094, "loss": 17.2894, "losses_ref": -3.488729589662398e-07, "ref_logps/chosen": -99.758544921875, "ref_logps/rejected": -87.64521789550781, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 79.58992767333984, "rewards/margins": 148.26698303222656, "rewards/rejected": -68.67704772949219, "step": 4750, "u": -4.499643325805664, "weight": 0.018750010058283806 }, { "diff_generated": -69.74950408935547, "epoch": 1.5424497731691509, "grad_norm": 480.75981177075647, "learning_rate": 4.499417595133471e-07, "logits/chosen": -2.3829503059387207, "logits/rejected": -2.4904589653015137, "logps/chosen": -17.013784408569336, "logps/rejected": -155.1744842529297, "loss": 17.4305, "losses_ref": -0.03174503520131111, "ref_logps/chosen": -93.46163177490234, "ref_logps/rejected": -85.42498779296875, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 76.44784545898438, "rewards/margins": 146.19735717773438, "rewards/rejected": -69.74950408935547, "step": 4760, "u": -4.38844633102417, "weight": 0.03943491727113724 }, { "diff_generated": -71.02115631103516, "epoch": 1.5456902138690862, "grad_norm": 473.23312074904203, "learning_rate": 4.4844500386473207e-07, "logits/chosen": -2.444065570831299, "logits/rejected": -2.5454695224761963, "logps/chosen": -17.960790634155273, "logps/rejected": -159.2787628173828, "loss": 17.4406, "losses_ref": -0.0001555870840093121, "ref_logps/chosen": -98.4795913696289, "ref_logps/rejected": -88.25760650634766, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 80.518798828125, "rewards/margins": 151.53994750976562, "rewards/rejected": -71.02115631103516, "step": 4770, "u": -4.4323649406433105, "weight": 0.03750746697187424 }, { "diff_generated": -69.52899932861328, "epoch": 1.5489306545690213, "grad_norm": 492.0938269097183, "learning_rate": 4.4694755948500276e-07, "logits/chosen": -2.413266658782959, "logits/rejected": -2.5660948753356934, "logps/chosen": -14.34319019317627, "logps/rejected": -155.9700164794922, "loss": 16.9127, "losses_ref": -0.0016228422755375504, "ref_logps/chosen": -89.77197265625, "ref_logps/rejected": -86.44102478027344, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 75.42878723144531, "rewards/margins": 144.95779418945312, "rewards/rejected": -69.52899932861328, "step": 4780, "u": -4.278973579406738, "weight": 0.056326042860746384 }, { "diff_generated": -77.00670623779297, "epoch": 1.5521710952689567, "grad_norm": 455.8030015792514, "learning_rate": 4.4544944766297037e-07, "logits/chosen": -2.4555513858795166, "logits/rejected": -2.6273884773254395, "logps/chosen": -15.951858520507812, "logps/rejected": -173.6452178955078, "loss": 16.4869, "losses_ref": -1.6469462238433152e-08, "ref_logps/chosen": -96.30865478515625, "ref_logps/rejected": -96.63851928710938, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 80.3567886352539, "rewards/margins": 157.36349487304688, "rewards/rejected": -77.00670623779297, "step": 4790, "u": -4.393777370452881, "weight": 0.01875000074505806 }, { "diff_generated": -64.95992279052734, "epoch": 1.5554115359688918, "grad_norm": 502.4897379317211, "learning_rate": 4.439506896969348e-07, "logits/chosen": -2.3634815216064453, "logits/rejected": -2.428330659866333, "logps/chosen": -16.078983306884766, "logps/rejected": -142.49655151367188, "loss": 16.9718, "losses_ref": -1.6241930467231214e-08, "ref_logps/chosen": -88.50505065917969, "ref_logps/rejected": -77.53661346435547, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 72.42605590820312, "rewards/margins": 137.385986328125, "rewards/rejected": -64.95992279052734, "step": 4800, "u": -4.174968719482422, "weight": 0.08749999850988388 }, { "diff_generated": -68.2002944946289, "epoch": 1.558651976668827, "grad_norm": 503.3697031130378, "learning_rate": 4.4245130689438206e-07, "logits/chosen": -2.3789191246032715, "logits/rejected": -2.4425642490386963, "logps/chosen": -17.81094741821289, "logps/rejected": -151.67152404785156, "loss": 17.6768, "losses_ref": -1.3876584489480592e-05, "ref_logps/chosen": -93.52096557617188, "ref_logps/rejected": -83.47123718261719, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 75.71002197265625, "rewards/margins": 143.91030883789062, "rewards/rejected": -68.2002944946289, "step": 4810, "u": -4.195111274719238, "weight": 0.07500021904706955 }, { "diff_generated": -71.60388946533203, "epoch": 1.561892417368762, "grad_norm": 464.6552704484477, "learning_rate": 4.4095132057168145e-07, "logits/chosen": -2.4288578033447266, "logits/rejected": -2.493143320083618, "logps/chosen": -19.020919799804688, "logps/rejected": -158.80801391601562, "loss": 16.9986, "losses_ref": -4.041445208713412e-06, "ref_logps/chosen": -98.4945068359375, "ref_logps/rejected": -87.20411682128906, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 79.47359466552734, "rewards/margins": 151.07748413085938, "rewards/rejected": -71.60388946533203, "step": 4820, "u": -4.346208572387695, "weight": 0.04375015199184418 }, { "diff_generated": -70.94068908691406, "epoch": 1.5651328580686974, "grad_norm": 456.0521866765002, "learning_rate": 4.3945075205378215e-07, "logits/chosen": -2.3616397380828857, "logits/rejected": -2.521225929260254, "logps/chosen": -16.200428009033203, "logps/rejected": -158.74551391601562, "loss": 17.2717, "losses_ref": -0.00018199995975010097, "ref_logps/chosen": -92.65226745605469, "ref_logps/rejected": -87.80482482910156, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 76.45184326171875, "rewards/margins": 147.39254760742188, "rewards/rejected": -70.94068908691406, "step": 4830, "u": -4.3289103507995605, "weight": 0.04375418275594711 }, { "diff_generated": -68.45477294921875, "epoch": 1.5683732987686325, "grad_norm": 463.7507709258817, "learning_rate": 4.379496226739104e-07, "logits/chosen": -2.4417545795440674, "logits/rejected": -2.5383334159851074, "logps/chosen": -16.365196228027344, "logps/rejected": -152.93263244628906, "loss": 17.3574, "losses_ref": -2.3878867523308145e-06, "ref_logps/chosen": -93.81847381591797, "ref_logps/rejected": -84.47784423828125, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 77.45327758789062, "rewards/margins": 145.90805053710938, "rewards/rejected": -68.45477294921875, "step": 4840, "u": -4.2860236167907715, "weight": 0.05625007301568985 }, { "diff_generated": -68.72309875488281, "epoch": 1.5716137394685679, "grad_norm": 507.16325174099217, "learning_rate": 4.364479537732663e-07, "logits/chosen": -2.437304735183716, "logits/rejected": -2.5395822525024414, "logps/chosen": -17.980350494384766, "logps/rejected": -155.20404052734375, "loss": 17.6529, "losses_ref": -0.0010237336391583085, "ref_logps/chosen": -94.89065551757812, "ref_logps/rejected": -86.48092651367188, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 76.91030883789062, "rewards/margins": 145.63339233398438, "rewards/rejected": -68.72309875488281, "step": 4850, "u": -4.335413932800293, "weight": 0.04379686713218689 }, { "diff_generated": -73.38167572021484, "epoch": 1.574854180168503, "grad_norm": 442.10680424593454, "learning_rate": 4.349457667007197e-07, "logits/chosen": -2.4412600994110107, "logits/rejected": -2.541186809539795, "logps/chosen": -18.689762115478516, "logps/rejected": -166.74819946289062, "loss": 16.7162, "losses_ref": -0.01046350784599781, "ref_logps/chosen": -99.40778350830078, "ref_logps/rejected": -93.36651611328125, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 80.718017578125, "rewards/margins": 154.09970092773438, "rewards/rejected": -73.38167572021484, "step": 4860, "u": -4.530239105224609, "weight": 0.012633567675948143 }, { "diff_generated": -69.97039031982422, "epoch": 1.578094620868438, "grad_norm": 457.81659951396927, "learning_rate": 4.334430828125074e-07, "logits/chosen": -2.435004472732544, "logits/rejected": -2.5484673976898193, "logps/chosen": -17.670394897460938, "logps/rejected": -155.90631103515625, "loss": 16.8956, "losses_ref": -2.0294830704870037e-08, "ref_logps/chosen": -96.66577911376953, "ref_logps/rejected": -85.93592834472656, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 78.9953842163086, "rewards/margins": 148.9657745361328, "rewards/rejected": -69.97039031982422, "step": 4870, "u": -4.273224353790283, "weight": 0.06875000149011612 }, { "diff_generated": -68.66413116455078, "epoch": 1.5813350615683732, "grad_norm": 438.34378748807904, "learning_rate": 4.319399234719297e-07, "logits/chosen": -2.381214141845703, "logits/rejected": -2.532839298248291, "logps/chosen": -14.926599502563477, "logps/rejected": -151.05838012695312, "loss": 16.2461, "losses_ref": -1.6406092484544388e-08, "ref_logps/chosen": -87.36531066894531, "ref_logps/rejected": -82.39424896240234, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 72.43870544433594, "rewards/margins": 141.1028289794922, "rewards/rejected": -68.66413116455078, "step": 4880, "u": -4.148575782775879, "weight": 0.08124999701976776 }, { "diff_generated": -73.14501190185547, "epoch": 1.5845755022683083, "grad_norm": 484.5661681960606, "learning_rate": 4.3043631004904563e-07, "logits/chosen": -2.4165432453155518, "logits/rejected": -2.50289249420166, "logps/chosen": -14.882474899291992, "logps/rejected": -160.80938720703125, "loss": 16.5932, "losses_ref": -0.0002339294005651027, "ref_logps/chosen": -92.9581527709961, "ref_logps/rejected": -87.66438293457031, "rewards/accuracies": 0.96875, "rewards/chosen": 78.07566833496094, "rewards/margins": 151.22068786621094, "rewards/rejected": -73.14501190185547, "step": 4890, "u": -4.442835807800293, "weight": 0.03126036375761032 }, { "diff_generated": -68.52119445800781, "epoch": 1.5878159429682437, "grad_norm": 423.6852586030226, "learning_rate": 4.2893226392037024e-07, "logits/chosen": -2.4660491943359375, "logits/rejected": -2.5341620445251465, "logps/chosen": -18.97785758972168, "logps/rejected": -153.42442321777344, "loss": 17.3246, "losses_ref": -0.0012148026144132018, "ref_logps/chosen": -101.24699401855469, "ref_logps/rejected": -84.9032211303711, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 82.2691421508789, "rewards/margins": 150.7903289794922, "rewards/rejected": -68.52119445800781, "step": 4900, "u": -4.488637447357178, "weight": 0.01880759373307228 }, { "diff_generated": -71.99812316894531, "epoch": 1.591056383668179, "grad_norm": 449.46646678982796, "learning_rate": 4.2742780646857015e-07, "logits/chosen": -2.453869104385376, "logits/rejected": -2.5750644207000732, "logps/chosen": -15.577032089233398, "logps/rejected": -162.82864379882812, "loss": 16.5234, "losses_ref": -0.004288672003895044, "ref_logps/chosen": -96.67981719970703, "ref_logps/rejected": -90.83052062988281, "rewards/accuracies": 0.96875, "rewards/chosen": 81.10277557373047, "rewards/margins": 153.10089111328125, "rewards/rejected": -71.99812316894531, "step": 4910, "u": -4.421360015869141, "weight": 0.0314519926905632 }, { "diff_generated": -69.86315155029297, "epoch": 1.5942968243681142, "grad_norm": 454.5353855349628, "learning_rate": 4.2592295908215953e-07, "logits/chosen": -2.421281337738037, "logits/rejected": -2.534956693649292, "logps/chosen": -18.646427154541016, "logps/rejected": -156.1486358642578, "loss": 17.8942, "losses_ref": -7.0457475409568815e-09, "ref_logps/chosen": -95.86431121826172, "ref_logps/rejected": -86.28546905517578, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 77.21788024902344, "rewards/margins": 147.08102416992188, "rewards/rejected": -69.86315155029297, "step": 4920, "u": -4.3402299880981445, "weight": 0.05000000074505806 }, { "diff_generated": -71.47618103027344, "epoch": 1.5975372650680493, "grad_norm": 445.8051593638038, "learning_rate": 4.2441774315519645e-07, "logits/chosen": -2.460808277130127, "logits/rejected": -2.5700113773345947, "logps/chosen": -16.961727142333984, "logps/rejected": -158.3717498779297, "loss": 16.2322, "losses_ref": -3.7529636642830155e-07, "ref_logps/chosen": -94.57036590576172, "ref_logps/rejected": -86.89557647705078, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 77.60865783691406, "rewards/margins": 149.08482360839844, "rewards/rejected": -71.47618103027344, "step": 4930, "u": -4.4106645584106445, "weight": 0.03750000149011612 }, { "diff_generated": -70.75782775878906, "epoch": 1.6007777057679844, "grad_norm": 420.1674727332997, "learning_rate": 4.229121800869781e-07, "logits/chosen": -2.4677226543426514, "logits/rejected": -2.551055908203125, "logps/chosen": -15.742657661437988, "logps/rejected": -157.78079223632812, "loss": 17.1654, "losses_ref": -1.1926164233955205e-06, "ref_logps/chosen": -97.43973541259766, "ref_logps/rejected": -87.02295684814453, "rewards/accuracies": 0.96875, "rewards/chosen": 81.69708251953125, "rewards/margins": 152.4549102783203, "rewards/rejected": -70.75782775878906, "step": 4940, "u": -4.393712997436523, "weight": 0.03125004097819328 }, { "diff_generated": -70.05594635009766, "epoch": 1.6040181464679195, "grad_norm": 471.1934574344933, "learning_rate": 4.2140629128173703e-07, "logits/chosen": -2.5027830600738525, "logits/rejected": -2.576869487762451, "logps/chosen": -15.281623840332031, "logps/rejected": -154.04367065429688, "loss": 17.2283, "losses_ref": -4.5796954850629845e-07, "ref_logps/chosen": -91.05293273925781, "ref_logps/rejected": -83.98772430419922, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 75.77131652832031, "rewards/margins": 145.82723999023438, "rewards/rejected": -70.05594635009766, "step": 4950, "u": -4.314549922943115, "weight": 0.050000011920928955 }, { "diff_generated": -72.00413513183594, "epoch": 1.6072585871678549, "grad_norm": 512.6807740122287, "learning_rate": 4.199000981483368e-07, "logits/chosen": -2.4834907054901123, "logits/rejected": -2.572270154953003, "logps/chosen": -19.872325897216797, "logps/rejected": -158.2327117919922, "loss": 17.4412, "losses_ref": -7.864770850574132e-06, "ref_logps/chosen": -100.17959594726562, "ref_logps/rejected": -86.22859191894531, "rewards/accuracies": 0.96875, "rewards/chosen": 80.30726623535156, "rewards/margins": 152.31138610839844, "rewards/rejected": -72.00413513183594, "step": 4960, "u": -4.382212162017822, "weight": 0.031250111758708954 }, { "diff_generated": -70.52757263183594, "epoch": 1.61049902786779, "grad_norm": 424.9704894505284, "learning_rate": 4.183936220999676e-07, "logits/chosen": -2.4406819343566895, "logits/rejected": -2.502345085144043, "logps/chosen": -17.687253952026367, "logps/rejected": -157.0961151123047, "loss": 17.5785, "losses_ref": -6.711905007250607e-05, "ref_logps/chosen": -98.40314483642578, "ref_logps/rejected": -86.56853485107422, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 80.71589660644531, "rewards/margins": 151.2434844970703, "rewards/rejected": -70.52757263183594, "step": 4970, "u": -4.28690242767334, "weight": 0.05000241845846176 }, { "diff_generated": -70.8532943725586, "epoch": 1.6137394685677253, "grad_norm": 423.2785756936077, "learning_rate": 4.168868845538414e-07, "logits/chosen": -2.443761110305786, "logits/rejected": -2.5265536308288574, "logps/chosen": -16.052059173583984, "logps/rejected": -158.07748413085938, "loss": 17.0892, "losses_ref": -8.346935942427081e-07, "ref_logps/chosen": -97.72169494628906, "ref_logps/rejected": -87.22419738769531, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 81.66963958740234, "rewards/margins": 152.52293395996094, "rewards/rejected": -70.8532943725586, "step": 4980, "u": -4.407050132751465, "weight": 0.03750001639127731 }, { "diff_generated": -64.80369567871094, "epoch": 1.6169799092676604, "grad_norm": 491.3871139681972, "learning_rate": 4.15379906930888e-07, "logits/chosen": -2.397629499435425, "logits/rejected": -2.4859225749969482, "logps/chosen": -15.061877250671387, "logps/rejected": -142.0963897705078, "loss": 16.7449, "losses_ref": -4.988124601368327e-06, "ref_logps/chosen": -89.53046417236328, "ref_logps/rejected": -77.29270935058594, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 74.46858978271484, "rewards/margins": 139.2722930908203, "rewards/rejected": -64.80369567871094, "step": 4990, "u": -4.022293567657471, "weight": 0.11250004917383194 }, { "diff_generated": -68.20692443847656, "epoch": 1.6202203499675956, "grad_norm": 470.2407142627594, "learning_rate": 4.1387271065545074e-07, "logits/chosen": -2.4599132537841797, "logits/rejected": -2.4912075996398926, "logps/chosen": -18.08317756652832, "logps/rejected": -149.76382446289062, "loss": 18.0198, "losses_ref": -0.0042698136530816555, "ref_logps/chosen": -99.14116668701172, "ref_logps/rejected": -81.5569076538086, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 81.05799865722656, "rewards/margins": 149.26492309570312, "rewards/rejected": -68.20692443847656, "step": 5000, "u": -4.370108604431152, "weight": 0.03771400451660156 }, { "diff_generated": -71.28777313232422, "epoch": 1.6234607906675307, "grad_norm": 426.3288299899426, "learning_rate": 4.123653171549807e-07, "logits/chosen": -2.466158390045166, "logits/rejected": -2.5273663997650146, "logps/chosen": -16.16830825805664, "logps/rejected": -155.9675750732422, "loss": 17.4407, "losses_ref": -3.1566725056109135e-07, "ref_logps/chosen": -96.88363647460938, "ref_logps/rejected": -84.67980194091797, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 80.71533203125, "rewards/margins": 152.0031280517578, "rewards/rejected": -71.28777313232422, "step": 5010, "u": -4.45050573348999, "weight": 0.025000005960464478 }, { "diff_generated": -70.68708038330078, "epoch": 1.626701231367466, "grad_norm": 457.7143642688739, "learning_rate": 4.108577478597335e-07, "logits/chosen": -2.3896279335021973, "logits/rejected": -2.5536961555480957, "logps/chosen": -18.48324203491211, "logps/rejected": -156.32644653320312, "loss": 17.4408, "losses_ref": -0.0021601675543934107, "ref_logps/chosen": -92.8128662109375, "ref_logps/rejected": -85.63936614990234, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 74.32962036132812, "rewards/margins": 145.01670837402344, "rewards/rejected": -70.68708038330078, "step": 5020, "u": -4.347861289978027, "weight": 0.04385722056031227 }, { "diff_generated": -65.55160522460938, "epoch": 1.6299416720674011, "grad_norm": 476.173100293017, "learning_rate": 4.093500242024637e-07, "logits/chosen": -2.522347927093506, "logits/rejected": -2.502315044403076, "logps/chosen": -17.50808334350586, "logps/rejected": -147.79000854492188, "loss": 17.3038, "losses_ref": -0.0009514664998278022, "ref_logps/chosen": -97.16732025146484, "ref_logps/rejected": -82.23841857910156, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 79.65924072265625, "rewards/margins": 145.21084594726562, "rewards/rejected": -65.55160522460938, "step": 5030, "u": -4.321379661560059, "weight": 0.05629762262105942 }, { "diff_generated": -67.8985366821289, "epoch": 1.6331821127673365, "grad_norm": 481.04827543949136, "learning_rate": 4.0784216761812044e-07, "logits/chosen": -2.4526283740997314, "logits/rejected": -2.4655888080596924, "logps/chosen": -16.992067337036133, "logps/rejected": -149.39520263671875, "loss": 17.1386, "losses_ref": -0.0008671922842040658, "ref_logps/chosen": -96.89338684082031, "ref_logps/rejected": -81.49665832519531, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 79.90131378173828, "rewards/margins": 147.7998504638672, "rewards/rejected": -67.8985366821289, "step": 5040, "u": -4.304007530212402, "weight": 0.056291352957487106 }, { "diff_generated": -70.22386169433594, "epoch": 1.6364225534672716, "grad_norm": 502.71013582975803, "learning_rate": 4.063341995435427e-07, "logits/chosen": -2.421682834625244, "logits/rejected": -2.5176949501037598, "logps/chosen": -15.038667678833008, "logps/rejected": -155.9605255126953, "loss": 16.6153, "losses_ref": -8.21582180066116e-09, "ref_logps/chosen": -90.36064147949219, "ref_logps/rejected": -85.73666381835938, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 75.32197570800781, "rewards/margins": 145.5458221435547, "rewards/rejected": -70.22386169433594, "step": 5050, "u": -4.340941429138184, "weight": 0.04374999925494194 }, { "diff_generated": -67.02069854736328, "epoch": 1.6396629941672067, "grad_norm": 464.9741379478892, "learning_rate": 4.048261414171544e-07, "logits/chosen": -2.474191665649414, "logits/rejected": -2.5061898231506348, "logps/chosen": -16.692886352539062, "logps/rejected": -148.44381713867188, "loss": 17.3252, "losses_ref": -0.00026513769989833236, "ref_logps/chosen": -95.00669860839844, "ref_logps/rejected": -81.42312622070312, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 78.31381225585938, "rewards/margins": 145.33450317382812, "rewards/rejected": -67.02069854736328, "step": 5060, "u": -4.157334804534912, "weight": 0.08751135319471359 }, { "diff_generated": -68.2977523803711, "epoch": 1.6429034348671419, "grad_norm": 441.3001977518817, "learning_rate": 4.0331801467865967e-07, "logits/chosen": -2.472256898880005, "logits/rejected": -2.558577299118042, "logps/chosen": -16.66042137145996, "logps/rejected": -149.11135864257812, "loss": 17.3723, "losses_ref": -5.643848635372706e-05, "ref_logps/chosen": -93.59695434570312, "ref_logps/rejected": -80.81361389160156, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 76.93653869628906, "rewards/margins": 145.2342987060547, "rewards/rejected": -68.2977523803711, "step": 5070, "u": -4.28251314163208, "weight": 0.05000165104866028 }, { "diff_generated": -71.2527847290039, "epoch": 1.646143875567077, "grad_norm": 463.35015092911704, "learning_rate": 4.0180984076873833e-07, "logits/chosen": -2.3988022804260254, "logits/rejected": -2.5074386596679688, "logps/chosen": -17.2117977142334, "logps/rejected": -158.65309143066406, "loss": 17.5038, "losses_ref": -0.0013229569885879755, "ref_logps/chosen": -94.10649871826172, "ref_logps/rejected": -87.40029907226562, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 76.89469909667969, "rewards/margins": 148.14749145507812, "rewards/rejected": -71.2527847290039, "step": 5080, "u": -4.235500812530518, "weight": 0.06881176680326462 }, { "diff_generated": -73.563232421875, "epoch": 1.6493843162670123, "grad_norm": 485.7493691528965, "learning_rate": 4.003016411287407e-07, "logits/chosen": -2.4517648220062256, "logits/rejected": -2.606292486190796, "logps/chosen": -16.88274383544922, "logps/rejected": -167.32257080078125, "loss": 17.0329, "losses_ref": -7.952481610118411e-06, "ref_logps/chosen": -96.70402526855469, "ref_logps/rejected": -93.75936126708984, "rewards/accuracies": 0.96875, "rewards/chosen": 79.82127380371094, "rewards/margins": 153.38449096679688, "rewards/rejected": -73.563232421875, "step": 5090, "u": -4.407740116119385, "weight": 0.03125026077032089 }, { "diff_generated": -70.79591369628906, "epoch": 1.6526247569669477, "grad_norm": 432.09735279072305, "learning_rate": 3.9879343720038276e-07, "logits/chosen": -2.442922830581665, "logits/rejected": -2.5303492546081543, "logps/chosen": -16.87449073791504, "logps/rejected": -161.27662658691406, "loss": 17.6718, "losses_ref": -7.192376187958871e-07, "ref_logps/chosen": -95.96610260009766, "ref_logps/rejected": -90.48070526123047, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 79.09161376953125, "rewards/margins": 149.8875274658203, "rewards/rejected": -70.79591369628906, "step": 5100, "u": -4.367244720458984, "weight": 0.04375002533197403 }, { "diff_generated": -71.23027038574219, "epoch": 1.6558651976668828, "grad_norm": 431.98988316072837, "learning_rate": 3.972852504254415e-07, "logits/chosen": -2.3836681842803955, "logits/rejected": -2.535038471221924, "logps/chosen": -16.863073348999023, "logps/rejected": -156.7201690673828, "loss": 16.4406, "losses_ref": -6.091965474297467e-07, "ref_logps/chosen": -91.42034149169922, "ref_logps/rejected": -85.48990631103516, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 74.5572738647461, "rewards/margins": 145.78753662109375, "rewards/rejected": -71.23027038574219, "step": 5110, "u": -4.2270708084106445, "weight": 0.06875001639127731 }, { "diff_generated": -71.12030792236328, "epoch": 1.659105638366818, "grad_norm": 460.07962332170194, "learning_rate": 3.9577710224545033e-07, "logits/chosen": -2.4391770362854004, "logits/rejected": -2.569916248321533, "logps/chosen": -18.38995361328125, "logps/rejected": -157.92996215820312, "loss": 17.1801, "losses_ref": -0.004063854459673166, "ref_logps/chosen": -96.03662872314453, "ref_logps/rejected": -86.80965423583984, "rewards/accuracies": 0.96875, "rewards/chosen": 77.64668273925781, "rewards/margins": 148.76698303222656, "rewards/rejected": -71.12030792236328, "step": 5120, "u": -4.407456398010254, "weight": 0.031440265476703644 }, { "diff_generated": -69.2224349975586, "epoch": 1.662346079066753, "grad_norm": 447.29819080459765, "learning_rate": 3.9426901410139346e-07, "logits/chosen": -2.468127727508545, "logits/rejected": -2.4933931827545166, "logps/chosen": -19.3382625579834, "logps/rejected": -158.279052734375, "loss": 17.0487, "losses_ref": -0.004990004934370518, "ref_logps/chosen": -102.7806625366211, "ref_logps/rejected": -89.0566177368164, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 83.44239807128906, "rewards/margins": 152.66485595703125, "rewards/rejected": -69.2224349975586, "step": 5130, "u": -4.480766296386719, "weight": 0.012752932496368885 }, { "diff_generated": -73.63088989257812, "epoch": 1.6655865197666881, "grad_norm": 470.50878956666423, "learning_rate": 3.9276100743340217e-07, "logits/chosen": -2.5297019481658936, "logits/rejected": -2.5885841846466064, "logps/chosen": -17.93415641784668, "logps/rejected": -163.42138671875, "loss": 17.1159, "losses_ref": -0.0012631936697289348, "ref_logps/chosen": -101.6262435913086, "ref_logps/rejected": -89.7905044555664, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 83.69209289550781, "rewards/margins": 157.32296752929688, "rewards/rejected": -73.63088989257812, "step": 5140, "u": -4.47725772857666, "weight": 0.012562265619635582 }, { "diff_generated": -71.59645080566406, "epoch": 1.6688269604666235, "grad_norm": 440.79440036964496, "learning_rate": 3.9125310368044877e-07, "logits/chosen": -2.407548189163208, "logits/rejected": -2.506441354751587, "logps/chosen": -16.486774444580078, "logps/rejected": -157.2587127685547, "loss": 16.7643, "losses_ref": -0.0005869531887583435, "ref_logps/chosen": -94.74775695800781, "ref_logps/rejected": -85.6622543334961, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 78.26097869873047, "rewards/margins": 149.85743713378906, "rewards/rejected": -71.59645080566406, "step": 5150, "u": -4.3845953941345215, "weight": 0.03752673789858818 }, { "diff_generated": -69.29833984375, "epoch": 1.6720674011665586, "grad_norm": 492.35900604525534, "learning_rate": 3.8974532428004305e-07, "logits/chosen": -2.4077913761138916, "logits/rejected": -2.4718546867370605, "logps/chosen": -17.270605087280273, "logps/rejected": -155.9961395263672, "loss": 17.1053, "losses_ref": -0.00144083215855062, "ref_logps/chosen": -98.63170623779297, "ref_logps/rejected": -86.69779205322266, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 81.36109924316406, "rewards/margins": 150.65945434570312, "rewards/rejected": -69.29833984375, "step": 5160, "u": -4.301863193511963, "weight": 0.056316327303647995 }, { "diff_generated": -67.48606872558594, "epoch": 1.675307841866494, "grad_norm": 431.33522497944324, "learning_rate": 3.8823769066792643e-07, "logits/chosen": -2.4253952503204346, "logits/rejected": -2.5132360458374023, "logps/chosen": -17.439252853393555, "logps/rejected": -148.45159912109375, "loss": 17.2726, "losses_ref": -0.00045064339064992964, "ref_logps/chosen": -91.80262756347656, "ref_logps/rejected": -80.96551513671875, "rewards/accuracies": 0.9375, "rewards/chosen": 74.36337280273438, "rewards/margins": 141.8494415283203, "rewards/rejected": -67.48606872558594, "step": 5170, "u": -4.298040866851807, "weight": 0.062519371509552 }, { "diff_generated": -75.08834838867188, "epoch": 1.678548282566429, "grad_norm": 473.1830697348528, "learning_rate": 3.867302242777681e-07, "logits/chosen": -2.4858357906341553, "logits/rejected": -2.6244637966156006, "logps/chosen": -16.95624542236328, "logps/rejected": -168.19943237304688, "loss": 17.1289, "losses_ref": -1.8262624745801759e-09, "ref_logps/chosen": -98.0115966796875, "ref_logps/rejected": -93.11107635498047, "rewards/accuracies": 0.96875, "rewards/chosen": 81.05535125732422, "rewards/margins": 156.14369201660156, "rewards/rejected": -75.08834838867188, "step": 5180, "u": -4.44937801361084, "weight": 0.03125 }, { "diff_generated": -69.98115539550781, "epoch": 1.6817887232663642, "grad_norm": 457.77313240740835, "learning_rate": 3.852229465408597e-07, "logits/chosen": -2.409611225128174, "logits/rejected": -2.5834403038024902, "logps/chosen": -17.508363723754883, "logps/rejected": -156.28579711914062, "loss": 17.3377, "losses_ref": -4.212985368212685e-05, "ref_logps/chosen": -92.58866882324219, "ref_logps/rejected": -86.30463409423828, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 75.08030700683594, "rewards/margins": 145.0614776611328, "rewards/rejected": -69.98115539550781, "step": 5190, "u": -4.236456871032715, "weight": 0.07500100880861282 }, { "diff_generated": -69.71676635742188, "epoch": 1.6850291639662993, "grad_norm": 460.02618445520545, "learning_rate": 3.8371587888581067e-07, "logits/chosen": -2.4338431358337402, "logits/rejected": -2.5173604488372803, "logps/chosen": -17.64065933227539, "logps/rejected": -157.76220703125, "loss": 17.0008, "losses_ref": -0.00027629570104181767, "ref_logps/chosen": -99.22086334228516, "ref_logps/rejected": -88.0454330444336, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 81.58020782470703, "rewards/margins": 151.29696655273438, "rewards/rejected": -69.71676635742188, "step": 5200, "u": -4.450808525085449, "weight": 0.025009319186210632 }, { "diff_generated": -69.77263641357422, "epoch": 1.6882696046662347, "grad_norm": 468.22422746219297, "learning_rate": 3.822090427382442e-07, "logits/chosen": -2.45259428024292, "logits/rejected": -2.4920616149902344, "logps/chosen": -17.049236297607422, "logps/rejected": -156.7518310546875, "loss": 16.9809, "losses_ref": -0.18010127544403076, "ref_logps/chosen": -93.86769104003906, "ref_logps/rejected": -86.97917175292969, "rewards/accuracies": 0.9375, "rewards/chosen": 76.81845092773438, "rewards/margins": 146.59109497070312, "rewards/rejected": -69.77263641357422, "step": 5210, "u": -4.296868324279785, "weight": 0.06403098255395889 }, { "diff_generated": -69.26919555664062, "epoch": 1.6915100453661698, "grad_norm": 454.43662699303434, "learning_rate": 3.807024595204916e-07, "logits/chosen": -2.4401068687438965, "logits/rejected": -2.4879443645477295, "logps/chosen": -16.294349670410156, "logps/rejected": -153.8958740234375, "loss": 17.3555, "losses_ref": -1.8101005707649165e-06, "ref_logps/chosen": -94.20816802978516, "ref_logps/rejected": -84.62667846679688, "rewards/accuracies": 0.96875, "rewards/chosen": 77.91381072998047, "rewards/margins": 147.18301391601562, "rewards/rejected": -69.26919555664062, "step": 5220, "u": -4.433684825897217, "weight": 0.03125007078051567 }, { "diff_generated": -71.1961669921875, "epoch": 1.6947504860661051, "grad_norm": 470.61477092459324, "learning_rate": 3.7919615065128905e-07, "logits/chosen": -2.5041403770446777, "logits/rejected": -2.558800458908081, "logps/chosen": -17.792478561401367, "logps/rejected": -157.79530334472656, "loss": 17.7305, "losses_ref": -0.06184719130396843, "ref_logps/chosen": -100.58880615234375, "ref_logps/rejected": -86.59913635253906, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 82.79631805419922, "rewards/margins": 153.99249267578125, "rewards/rejected": -71.1961669921875, "step": 5230, "u": -4.348549842834473, "weight": 0.044755056500434875 }, { "diff_generated": -65.75114440917969, "epoch": 1.6979909267660402, "grad_norm": 453.03870378109104, "learning_rate": 3.7769013754547155e-07, "logits/chosen": -2.4477427005767822, "logits/rejected": -2.5033042430877686, "logps/chosen": -17.491987228393555, "logps/rejected": -150.37059020996094, "loss": 16.7332, "losses_ref": -0.00048043514834716916, "ref_logps/chosen": -97.56319427490234, "ref_logps/rejected": -84.61946105957031, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 80.07121276855469, "rewards/margins": 145.82237243652344, "rewards/rejected": -65.75114440917969, "step": 5240, "u": -4.236319541931152, "weight": 0.07501258701086044 }, { "diff_generated": -69.01460266113281, "epoch": 1.7012313674659754, "grad_norm": 464.04588651328675, "learning_rate": 3.761844416136701e-07, "logits/chosen": -2.454002618789673, "logits/rejected": -2.554508686065674, "logps/chosen": -17.00436782836914, "logps/rejected": -152.9644317626953, "loss": 16.5869, "losses_ref": -0.0029619138222187757, "ref_logps/chosen": -94.6144790649414, "ref_logps/rejected": -83.9498291015625, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 77.610107421875, "rewards/margins": 146.62472534179688, "rewards/rejected": -69.01460266113281, "step": 5250, "u": -4.372044086456299, "weight": 0.043896518647670746 }, { "diff_generated": -68.21833801269531, "epoch": 1.7044718081659105, "grad_norm": 417.2026837693391, "learning_rate": 3.746790842620059e-07, "logits/chosen": -2.42793869972229, "logits/rejected": -2.5127129554748535, "logps/chosen": -15.581718444824219, "logps/rejected": -150.99046325683594, "loss": 17.0069, "losses_ref": -0.0004940610378980637, "ref_logps/chosen": -89.71794128417969, "ref_logps/rejected": -82.77213287353516, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 74.13622283935547, "rewards/margins": 142.35458374023438, "rewards/rejected": -68.21833801269531, "step": 5260, "u": -4.313251495361328, "weight": 0.05625907704234123 }, { "diff_generated": -68.33058166503906, "epoch": 1.7077122488658456, "grad_norm": 457.1306327953638, "learning_rate": 3.731740868917872e-07, "logits/chosen": -2.3772644996643066, "logits/rejected": -2.4986536502838135, "logps/chosen": -17.534826278686523, "logps/rejected": -153.14242553710938, "loss": 17.5256, "losses_ref": -2.465140980234537e-08, "ref_logps/chosen": -91.63734436035156, "ref_logps/rejected": -84.81184387207031, "rewards/accuracies": 0.9375, "rewards/chosen": 74.10250854492188, "rewards/margins": 142.43309020996094, "rewards/rejected": -68.33058166503906, "step": 5270, "u": -4.248973846435547, "weight": 0.0625 }, { "diff_generated": -70.68391418457031, "epoch": 1.710952689565781, "grad_norm": 493.4429615429504, "learning_rate": 3.716694708992039e-07, "logits/chosen": -2.459246873855591, "logits/rejected": -2.5305166244506836, "logps/chosen": -17.377029418945312, "logps/rejected": -158.16543579101562, "loss": 17.2543, "losses_ref": -0.00031063079950399697, "ref_logps/chosen": -97.60765075683594, "ref_logps/rejected": -87.48152160644531, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 80.23062896728516, "rewards/margins": 150.91453552246094, "rewards/rejected": -70.68391418457031, "step": 5280, "u": -4.373373031616211, "weight": 0.03751382231712341 }, { "diff_generated": -72.90049743652344, "epoch": 1.7141931302657163, "grad_norm": 487.7074986433388, "learning_rate": 3.701652576750242e-07, "logits/chosen": -2.4398880004882812, "logits/rejected": -2.562415838241577, "logps/chosen": -16.305221557617188, "logps/rejected": -161.3070831298828, "loss": 17.379, "losses_ref": -2.912343006755691e-07, "ref_logps/chosen": -93.390625, "ref_logps/rejected": -88.40657043457031, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 77.08540344238281, "rewards/margins": 149.9859161376953, "rewards/rejected": -72.90049743652344, "step": 5290, "u": -4.360350608825684, "weight": 0.050000011920928955 }, { "diff_generated": -70.36927795410156, "epoch": 1.7174335709656514, "grad_norm": 480.9713570803477, "learning_rate": 3.686614686042906e-07, "logits/chosen": -2.4388208389282227, "logits/rejected": -2.545295238494873, "logps/chosen": -16.006649017333984, "logps/rejected": -155.7036590576172, "loss": 17.2565, "losses_ref": -0.000997414463199675, "ref_logps/chosen": -96.17381286621094, "ref_logps/rejected": -85.33439636230469, "rewards/accuracies": 0.96875, "rewards/chosen": 80.16717529296875, "rewards/margins": 150.53643798828125, "rewards/rejected": -70.36927795410156, "step": 5300, "u": -4.416836261749268, "weight": 0.031297024339437485 }, { "diff_generated": -72.0317611694336, "epoch": 1.7206740116655865, "grad_norm": 438.97836563138696, "learning_rate": 3.6715812506601493e-07, "logits/chosen": -2.4332687854766846, "logits/rejected": -2.4583961963653564, "logps/chosen": -17.069910049438477, "logps/rejected": -159.6905059814453, "loss": 16.7191, "losses_ref": -0.00018645053205545992, "ref_logps/chosen": -100.94898986816406, "ref_logps/rejected": -87.65870666503906, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 83.87908172607422, "rewards/margins": 155.91085815429688, "rewards/rejected": -72.0317611694336, "step": 5310, "u": -4.3903889656066895, "weight": 0.03750806301832199 }, { "diff_generated": -67.04124450683594, "epoch": 1.7239144523655217, "grad_norm": 475.4394366078291, "learning_rate": 3.6565524843287526e-07, "logits/chosen": -2.452981472015381, "logits/rejected": -2.5339856147766113, "logps/chosen": -15.451299667358398, "logps/rejected": -148.46969604492188, "loss": 16.5968, "losses_ref": -1.6524964507880213e-07, "ref_logps/chosen": -92.11998748779297, "ref_logps/rejected": -81.42845916748047, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 76.66868591308594, "rewards/margins": 143.70993041992188, "rewards/rejected": -67.04124450683594, "step": 5320, "u": -4.24575138092041, "weight": 0.07500000298023224 }, { "diff_generated": -71.4760971069336, "epoch": 1.7271548930654568, "grad_norm": 440.991290221704, "learning_rate": 3.641528600709115e-07, "logits/chosen": -2.4461655616760254, "logits/rejected": -2.5163216590881348, "logps/chosen": -17.627792358398438, "logps/rejected": -157.37338256835938, "loss": 16.9069, "losses_ref": -0.0048348382115364075, "ref_logps/chosen": -96.23738098144531, "ref_logps/rejected": -85.89727783203125, "rewards/accuracies": 0.9375, "rewards/chosen": 78.6095962524414, "rewards/margins": 150.08567810058594, "rewards/rejected": -71.4760971069336, "step": 5330, "u": -4.264947891235352, "weight": 0.06274246424436569 }, { "diff_generated": -65.23731231689453, "epoch": 1.7303953337653921, "grad_norm": 460.9102888832523, "learning_rate": 3.6265098133922277e-07, "logits/chosen": -2.477548360824585, "logits/rejected": -2.515925168991089, "logps/chosen": -15.422185897827148, "logps/rejected": -144.16061401367188, "loss": 16.4254, "losses_ref": -0.008267196826636791, "ref_logps/chosen": -90.59266662597656, "ref_logps/rejected": -78.92329406738281, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 75.17047119140625, "rewards/margins": 140.40777587890625, "rewards/rejected": -65.23731231689453, "step": 5340, "u": -4.144270896911621, "weight": 0.10040758550167084 }, { "diff_generated": -70.13384246826172, "epoch": 1.7336357744653272, "grad_norm": 436.0160106234523, "learning_rate": 3.611496335896617e-07, "logits/chosen": -2.443701982498169, "logits/rejected": -2.561774730682373, "logps/chosen": -15.86700439453125, "logps/rejected": -157.41140747070312, "loss": 17.0001, "losses_ref": -1.2482225429266691e-05, "ref_logps/chosen": -93.91472625732422, "ref_logps/rejected": -87.27755737304688, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 78.0477294921875, "rewards/margins": 148.1815643310547, "rewards/rejected": -70.13384246826172, "step": 5350, "u": -4.3748579025268555, "weight": 0.0500003807246685 }, { "diff_generated": -70.87732696533203, "epoch": 1.7368762151652626, "grad_norm": 459.22489703370104, "learning_rate": 3.59648838166533e-07, "logits/chosen": -2.4528727531433105, "logits/rejected": -2.5678493976593018, "logps/chosen": -17.44341278076172, "logps/rejected": -158.6827392578125, "loss": 17.2521, "losses_ref": -5.317440923136019e-07, "ref_logps/chosen": -95.00345611572266, "ref_logps/rejected": -87.80540466308594, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 77.56005096435547, "rewards/margins": 148.43736267089844, "rewards/rejected": -70.87732696533203, "step": 5360, "u": -4.344834804534912, "weight": 0.04375001788139343 }, { "diff_generated": -66.20857238769531, "epoch": 1.7401166558651977, "grad_norm": 492.27639823140424, "learning_rate": 3.5814861640628864e-07, "logits/chosen": -2.3951048851013184, "logits/rejected": -2.4831387996673584, "logps/chosen": -18.14093017578125, "logps/rejected": -149.3103790283203, "loss": 17.3597, "losses_ref": -7.648421274097927e-07, "ref_logps/chosen": -91.5876693725586, "ref_logps/rejected": -83.10181427001953, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 73.44673919677734, "rewards/margins": 139.65530395507812, "rewards/rejected": -66.20857238769531, "step": 5370, "u": -4.144860744476318, "weight": 0.08750001341104507 }, { "diff_generated": -71.12236022949219, "epoch": 1.7433570965651328, "grad_norm": 415.92824723503844, "learning_rate": 3.5664898963722526e-07, "logits/chosen": -2.3865838050842285, "logits/rejected": -2.513049364089966, "logps/chosen": -17.46709632873535, "logps/rejected": -156.9357452392578, "loss": 16.5102, "losses_ref": -1.2130412230249021e-08, "ref_logps/chosen": -93.4074478149414, "ref_logps/rejected": -85.81340026855469, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 75.94035339355469, "rewards/margins": 147.06271362304688, "rewards/rejected": -71.12236022949219, "step": 5380, "u": -4.3460259437561035, "weight": 0.05000000074505806 }, { "diff_generated": -67.88063049316406, "epoch": 1.746597537265068, "grad_norm": 447.0975280429844, "learning_rate": 3.5514997917918016e-07, "logits/chosen": -2.412515640258789, "logits/rejected": -2.53787899017334, "logps/chosen": -14.890464782714844, "logps/rejected": -149.828125, "loss": 15.9397, "losses_ref": -2.935139242765672e-08, "ref_logps/chosen": -91.47549438476562, "ref_logps/rejected": -81.94749450683594, "rewards/accuracies": 0.9375, "rewards/chosen": 76.58503723144531, "rewards/margins": 144.46566772460938, "rewards/rejected": -67.88063049316406, "step": 5390, "u": -4.2765278816223145, "weight": 0.0625 }, { "diff_generated": -72.81483459472656, "epoch": 1.7498379779650033, "grad_norm": 429.0403541723934, "learning_rate": 3.536516063432293e-07, "logits/chosen": -2.4231066703796387, "logits/rejected": -2.5467000007629395, "logps/chosen": -16.429943084716797, "logps/rejected": -159.30337524414062, "loss": 16.9469, "losses_ref": -0.0024499078281223774, "ref_logps/chosen": -93.90434265136719, "ref_logps/rejected": -86.48854064941406, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 77.47440338134766, "rewards/margins": 150.28924560546875, "rewards/rejected": -72.81483459472656, "step": 5400, "u": -4.391019344329834, "weight": 0.03762707859277725 }, { "diff_generated": -71.17898559570312, "epoch": 1.7530784186649384, "grad_norm": 426.8218423325292, "learning_rate": 3.5215389243138326e-07, "logits/chosen": -2.41701340675354, "logits/rejected": -2.485673189163208, "logps/chosen": -19.6455135345459, "logps/rejected": -159.58499145507812, "loss": 16.824, "losses_ref": -0.0006879680440761149, "ref_logps/chosen": -102.19815063476562, "ref_logps/rejected": -88.40601348876953, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 82.55262756347656, "rewards/margins": 153.7316131591797, "rewards/rejected": -71.17898559570312, "step": 5410, "u": -4.41085958480835, "weight": 0.0375228226184845 }, { "diff_generated": -74.72703552246094, "epoch": 1.7563188593648738, "grad_norm": 483.34852960013836, "learning_rate": 3.50656858736285e-07, "logits/chosen": -2.439666271209717, "logits/rejected": -2.4910497665405273, "logps/chosen": -17.57253646850586, "logps/rejected": -167.9637451171875, "loss": 16.7944, "losses_ref": -0.00021042392472736537, "ref_logps/chosen": -101.20845031738281, "ref_logps/rejected": -93.2367172241211, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 83.63592529296875, "rewards/margins": 158.3629608154297, "rewards/rejected": -74.72703552246094, "step": 5420, "u": -4.3953704833984375, "weight": 0.02500920556485653 }, { "diff_generated": -69.79595947265625, "epoch": 1.7595593000648089, "grad_norm": 442.3153908664761, "learning_rate": 3.491605265409073e-07, "logits/chosen": -2.4521777629852295, "logits/rejected": -2.4844508171081543, "logps/chosen": -19.611408233642578, "logps/rejected": -159.6874237060547, "loss": 16.9916, "losses_ref": -0.0011415036860853434, "ref_logps/chosen": -99.37894439697266, "ref_logps/rejected": -89.89147186279297, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 79.76752471923828, "rewards/margins": 149.5634765625, "rewards/rejected": -69.79595947265625, "step": 5430, "u": -4.332057952880859, "weight": 0.05630398914217949 }, { "diff_generated": -70.03108978271484, "epoch": 1.762799740764744, "grad_norm": 499.24694029734786, "learning_rate": 3.4766491711824916e-07, "logits/chosen": -2.3980777263641357, "logits/rejected": -2.501035213470459, "logps/chosen": -17.213281631469727, "logps/rejected": -157.7755584716797, "loss": 17.2019, "losses_ref": -0.04993446543812752, "ref_logps/chosen": -93.63558197021484, "ref_logps/rejected": -87.74449157714844, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 76.42230224609375, "rewards/margins": 146.45339965820312, "rewards/rejected": -70.03108978271484, "step": 5440, "u": -4.326253414154053, "weight": 0.05067465454339981 }, { "diff_generated": -71.91776275634766, "epoch": 1.7660401814646791, "grad_norm": 449.3354623063938, "learning_rate": 3.4617005173103497e-07, "logits/chosen": -2.4744386672973633, "logits/rejected": -2.538677930831909, "logps/chosen": -17.06021499633789, "logps/rejected": -158.7646942138672, "loss": 16.6296, "losses_ref": -0.0002994223905261606, "ref_logps/chosen": -99.29143524169922, "ref_logps/rejected": -86.84693908691406, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 82.2312240600586, "rewards/margins": 154.1489715576172, "rewards/rejected": -71.91776275634766, "step": 5450, "u": -4.445778846740723, "weight": 0.025012975558638573 }, { "diff_generated": -69.34452819824219, "epoch": 1.7692806221646142, "grad_norm": 420.9992020837075, "learning_rate": 3.4467595163141056e-07, "logits/chosen": -2.4221789836883545, "logits/rejected": -2.531097888946533, "logps/chosen": -16.535367965698242, "logps/rejected": -154.95436096191406, "loss": 16.9989, "losses_ref": -8.049047755775973e-05, "ref_logps/chosen": -93.34659576416016, "ref_logps/rejected": -85.6098403930664, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 76.81124114990234, "rewards/margins": 146.15576171875, "rewards/rejected": -69.34452819824219, "step": 5460, "u": -4.314999580383301, "weight": 0.056253306567668915 }, { "diff_generated": -68.81937408447266, "epoch": 1.7725210628645496, "grad_norm": 460.36253791839636, "learning_rate": 3.4318263806064244e-07, "logits/chosen": -2.4252026081085205, "logits/rejected": -2.4815948009490967, "logps/chosen": -17.621421813964844, "logps/rejected": -155.63185119628906, "loss": 17.1712, "losses_ref": -9.521203173790127e-05, "ref_logps/chosen": -96.93167114257812, "ref_logps/rejected": -86.81249237060547, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 79.31024932861328, "rewards/margins": 148.12960815429688, "rewards/rejected": -68.81937408447266, "step": 5470, "u": -4.36357307434082, "weight": 0.04375128075480461 }, { "diff_generated": -69.7218246459961, "epoch": 1.775761503564485, "grad_norm": 436.30995540960345, "learning_rate": 3.4169013224881475e-07, "logits/chosen": -2.473205089569092, "logits/rejected": -2.5455079078674316, "logps/chosen": -17.088150024414062, "logps/rejected": -156.84713745117188, "loss": 16.8389, "losses_ref": -0.0009352788329124451, "ref_logps/chosen": -96.74797058105469, "ref_logps/rejected": -87.12532043457031, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 79.65982055664062, "rewards/margins": 149.3816680908203, "rewards/rejected": -69.7218246459961, "step": 5480, "u": -4.265133380889893, "weight": 0.06879539787769318 }, { "diff_generated": -65.35900115966797, "epoch": 1.77900194426442, "grad_norm": 480.0695222800333, "learning_rate": 3.4019845541452844e-07, "logits/chosen": -2.4091029167175293, "logits/rejected": -2.4511494636535645, "logps/chosen": -16.346660614013672, "logps/rejected": -145.93875122070312, "loss": 17.1195, "losses_ref": -0.00016973615856841207, "ref_logps/chosen": -91.6805191040039, "ref_logps/rejected": -80.57975769042969, "rewards/accuracies": 0.9375, "rewards/chosen": 75.3338623046875, "rewards/margins": 140.69284057617188, "rewards/rejected": -65.35900115966797, "step": 5490, "u": -4.253424167633057, "weight": 0.06250713765621185 }, { "diff_generated": -68.89006805419922, "epoch": 1.7822423849643552, "grad_norm": 457.98814975416997, "learning_rate": 3.387076287645985e-07, "logits/chosen": -2.422269105911255, "logits/rejected": -2.521350383758545, "logps/chosen": -16.43721580505371, "logps/rejected": -155.05738830566406, "loss": 16.6865, "losses_ref": -0.012262609787285328, "ref_logps/chosen": -94.85432434082031, "ref_logps/rejected": -86.16730499267578, "rewards/accuracies": 0.9375, "rewards/chosen": 78.41709899902344, "rewards/margins": 147.3071746826172, "rewards/rejected": -68.89006805419922, "step": 5500, "u": -4.225312232971191, "weight": 0.06295279413461685 }, { "diff_generated": -69.23516845703125, "epoch": 1.7854828256642903, "grad_norm": 413.9341227063301, "learning_rate": 3.372176734937536e-07, "logits/chosen": -2.376582145690918, "logits/rejected": -2.510887622833252, "logps/chosen": -15.711641311645508, "logps/rejected": -158.05885314941406, "loss": 16.1692, "losses_ref": -3.3730197174008936e-05, "ref_logps/chosen": -94.01397705078125, "ref_logps/rejected": -88.82369232177734, "rewards/accuracies": 0.9375, "rewards/chosen": 78.30233001708984, "rewards/margins": 147.53750610351562, "rewards/rejected": -69.23516845703125, "step": 5510, "u": -4.285754203796387, "weight": 0.06250113993883133 }, { "diff_generated": -67.3664321899414, "epoch": 1.7887232663642254, "grad_norm": 467.5054402064834, "learning_rate": 3.3572861078433376e-07, "logits/chosen": -2.4425301551818848, "logits/rejected": -2.4949724674224854, "logps/chosen": -15.836771965026855, "logps/rejected": -149.77706909179688, "loss": 16.8381, "losses_ref": -0.0006229934515431523, "ref_logps/chosen": -92.31486511230469, "ref_logps/rejected": -82.41062927246094, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 76.47808837890625, "rewards/margins": 143.84451293945312, "rewards/rejected": -67.3664321899414, "step": 5520, "u": -4.226956367492676, "weight": 0.06877995282411575 }, { "diff_generated": -68.57489013671875, "epoch": 1.7919637070641607, "grad_norm": 435.08875035945834, "learning_rate": 3.3424046180599e-07, "logits/chosen": -2.438937187194824, "logits/rejected": -2.516026258468628, "logps/chosen": -15.992448806762695, "logps/rejected": -149.87245178222656, "loss": 16.5741, "losses_ref": -1.454621241236964e-07, "ref_logps/chosen": -91.31656646728516, "ref_logps/rejected": -81.29755401611328, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 75.32411193847656, "rewards/margins": 143.8990020751953, "rewards/rejected": -68.57489013671875, "step": 5530, "u": -4.198179244995117, "weight": 0.08124999701976776 }, { "diff_generated": -68.36991882324219, "epoch": 1.7952041477640959, "grad_norm": 470.5825818556089, "learning_rate": 3.3275324771538273e-07, "logits/chosen": -2.397613525390625, "logits/rejected": -2.4743857383728027, "logps/chosen": -16.997846603393555, "logps/rejected": -155.87808227539062, "loss": 16.447, "losses_ref": -1.585126119607594e-05, "ref_logps/chosen": -92.73638916015625, "ref_logps/rejected": -87.50816345214844, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 75.7385482788086, "rewards/margins": 144.1084442138672, "rewards/rejected": -68.36991882324219, "step": 5540, "u": -4.300636291503906, "weight": 0.056250762194395065 }, { "diff_generated": -67.50434112548828, "epoch": 1.7984445884640312, "grad_norm": 434.58597111300475, "learning_rate": 3.312669896558816e-07, "logits/chosen": -2.470421314239502, "logits/rejected": -2.513103485107422, "logps/chosen": -16.30929183959961, "logps/rejected": -154.2463836669922, "loss": 16.3786, "losses_ref": -0.0007042810902930796, "ref_logps/chosen": -96.81012725830078, "ref_logps/rejected": -86.74208068847656, "rewards/accuracies": 0.9375, "rewards/chosen": 80.5008316040039, "rewards/margins": 148.0051727294922, "rewards/rejected": -67.50434112548828, "step": 5550, "u": -4.243407249450684, "weight": 0.06253115087747574 }, { "diff_generated": -69.06336212158203, "epoch": 1.8016850291639663, "grad_norm": 499.72614956659856, "learning_rate": 3.2978170875726454e-07, "logits/chosen": -2.4539406299591064, "logits/rejected": -2.540009021759033, "logps/chosen": -14.689851760864258, "logps/rejected": -154.6982879638672, "loss": 16.1144, "losses_ref": -8.424254183125868e-09, "ref_logps/chosen": -92.51824188232422, "ref_logps/rejected": -85.6349105834961, "rewards/accuracies": 0.9375, "rewards/chosen": 77.8283920288086, "rewards/margins": 146.89175415039062, "rewards/rejected": -69.06336212158203, "step": 5560, "u": -4.2520318031311035, "weight": 0.0625 }, { "diff_generated": -69.83843994140625, "epoch": 1.8049254698639015, "grad_norm": 460.6886443876183, "learning_rate": 3.2829742613541704e-07, "logits/chosen": -2.3935115337371826, "logits/rejected": -2.545502185821533, "logps/chosen": -17.24357795715332, "logps/rejected": -160.28634643554688, "loss": 16.6144, "losses_ref": -0.0011852236930280924, "ref_logps/chosen": -94.0816650390625, "ref_logps/rejected": -90.4478988647461, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 76.83808898925781, "rewards/margins": 146.67652893066406, "rewards/rejected": -69.83843994140625, "step": 5570, "u": -4.173181533813477, "weight": 0.0750543624162674 }, { "diff_generated": -68.1365966796875, "epoch": 1.8081659105638366, "grad_norm": 457.9151207111868, "learning_rate": 3.26814162892033e-07, "logits/chosen": -2.4605672359466553, "logits/rejected": -2.5549590587615967, "logps/chosen": -17.256122589111328, "logps/rejected": -154.7444610595703, "loss": 16.5853, "losses_ref": -6.146209670987446e-06, "ref_logps/chosen": -98.32067108154297, "ref_logps/rejected": -86.60784912109375, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 81.06455993652344, "rewards/margins": 149.20115661621094, "rewards/rejected": -68.1365966796875, "step": 5580, "u": -4.302131652832031, "weight": 0.05625029653310776 }, { "diff_generated": -68.90168762207031, "epoch": 1.811406351263772, "grad_norm": 402.7850425533113, "learning_rate": 3.2533194011431346e-07, "logits/chosen": -2.439361572265625, "logits/rejected": -2.5364279747009277, "logps/chosen": -15.55030632019043, "logps/rejected": -153.10989379882812, "loss": 16.0603, "losses_ref": -2.9312252181057374e-08, "ref_logps/chosen": -92.38101196289062, "ref_logps/rejected": -84.20820617675781, "rewards/accuracies": 0.9375, "rewards/chosen": 76.8307113647461, "rewards/margins": 145.73239135742188, "rewards/rejected": -68.90168762207031, "step": 5590, "u": -4.271599292755127, "weight": 0.0625 }, { "diff_generated": -72.8712387084961, "epoch": 1.814646791963707, "grad_norm": 448.7536562243639, "learning_rate": 3.2385077887466766e-07, "logits/chosen": -2.4683947563171387, "logits/rejected": -2.58646821975708, "logps/chosen": -16.766773223876953, "logps/rejected": -164.80836486816406, "loss": 16.4598, "losses_ref": -0.0009516210993751884, "ref_logps/chosen": -97.44658660888672, "ref_logps/rejected": -91.93711853027344, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 80.67982482910156, "rewards/margins": 153.55105590820312, "rewards/rejected": -72.8712387084961, "step": 5600, "u": -4.3830766677856445, "weight": 0.03754193335771561 }, { "diff_generated": -69.22303771972656, "epoch": 1.8178872326636424, "grad_norm": 486.62192104812317, "learning_rate": 3.223707002304131e-07, "logits/chosen": -2.3914060592651367, "logits/rejected": -2.5072906017303467, "logps/chosen": -17.703372955322266, "logps/rejected": -157.489501953125, "loss": 17.4487, "losses_ref": -0.000594664248637855, "ref_logps/chosen": -90.99412536621094, "ref_logps/rejected": -88.26644897460938, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 73.29075622558594, "rewards/margins": 142.5137939453125, "rewards/rejected": -69.22303771972656, "step": 5610, "u": -4.2054243087768555, "weight": 0.07502665370702744 }, { "diff_generated": -70.07087707519531, "epoch": 1.8211276733635775, "grad_norm": 458.51573979209115, "learning_rate": 3.208917252234765e-07, "logits/chosen": -2.414137601852417, "logits/rejected": -2.5393738746643066, "logps/chosen": -14.719442367553711, "logps/rejected": -154.69467163085938, "loss": 16.6353, "losses_ref": -0.0017374107846990228, "ref_logps/chosen": -91.6000747680664, "ref_logps/rejected": -84.62378692626953, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 76.8806381225586, "rewards/margins": 146.95150756835938, "rewards/rejected": -70.07087707519531, "step": 5620, "u": -4.319277286529541, "weight": 0.05008368566632271 }, { "diff_generated": -67.59291076660156, "epoch": 1.8243681140635126, "grad_norm": 460.39703583819204, "learning_rate": 3.1941387488009396e-07, "logits/chosen": -2.426354169845581, "logits/rejected": -2.5217723846435547, "logps/chosen": -16.977571487426758, "logps/rejected": -152.21470642089844, "loss": 16.6692, "losses_ref": -0.0006723630940541625, "ref_logps/chosen": -95.56573486328125, "ref_logps/rejected": -84.6218032836914, "rewards/accuracies": 0.9375, "rewards/chosen": 78.58818054199219, "rewards/margins": 146.1810760498047, "rewards/rejected": -67.59291076660156, "step": 5630, "u": -4.250881195068359, "weight": 0.06253048777580261 }, { "diff_generated": -69.4136962890625, "epoch": 1.8276085547634477, "grad_norm": 446.7409918562372, "learning_rate": 3.179371702105132e-07, "logits/chosen": -2.4898746013641357, "logits/rejected": -2.5813000202178955, "logps/chosen": -18.666357040405273, "logps/rejected": -157.55398559570312, "loss": 17.0778, "losses_ref": -0.0025139835197478533, "ref_logps/chosen": -100.08354187011719, "ref_logps/rejected": -88.14030456542969, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 81.41717529296875, "rewards/margins": 150.83087158203125, "rewards/rejected": -69.4136962890625, "step": 5640, "u": -4.346514701843262, "weight": 0.037541624158620834 }, { "diff_generated": -68.67171478271484, "epoch": 1.8308489954633829, "grad_norm": 451.5965545928365, "learning_rate": 3.164616322086936e-07, "logits/chosen": -2.4457664489746094, "logits/rejected": -2.5104620456695557, "logps/chosen": -16.123414993286133, "logps/rejected": -154.59475708007812, "loss": 17.171, "losses_ref": -7.124496903543331e-08, "ref_logps/chosen": -95.08893585205078, "ref_logps/rejected": -85.92303466796875, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 78.96553039550781, "rewards/margins": 147.63723754882812, "rewards/rejected": -68.67171478271484, "step": 5650, "u": -4.360419273376465, "weight": 0.05000000074505806 }, { "diff_generated": -75.24528503417969, "epoch": 1.8340894361633182, "grad_norm": 423.36652708511275, "learning_rate": 3.1498728185200845e-07, "logits/chosen": -2.484219551086426, "logits/rejected": -2.571659564971924, "logps/chosen": -18.681427001953125, "logps/rejected": -167.66644287109375, "loss": 16.7819, "losses_ref": -0.0029082954861223698, "ref_logps/chosen": -99.9460678100586, "ref_logps/rejected": -92.42115020751953, "rewards/accuracies": 0.96875, "rewards/chosen": 81.2646484375, "rewards/margins": 156.50994873046875, "rewards/rejected": -75.24528503417969, "step": 5660, "u": -4.3646111488342285, "weight": 0.03129839152097702 }, { "diff_generated": -68.78709411621094, "epoch": 1.8373298768632536, "grad_norm": 454.5740812362012, "learning_rate": 3.1351414010094683e-07, "logits/chosen": -2.405937671661377, "logits/rejected": -2.524935007095337, "logps/chosen": -18.181316375732422, "logps/rejected": -155.01504516601562, "loss": 16.65, "losses_ref": -6.4683889355876545e-09, "ref_logps/chosen": -93.27643585205078, "ref_logps/rejected": -86.22793579101562, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 75.09513092041016, "rewards/margins": 143.88223266601562, "rewards/rejected": -68.78709411621094, "step": 5670, "u": -4.331615447998047, "weight": 0.04374999925494194 }, { "diff_generated": -69.10374450683594, "epoch": 1.8405703175631887, "grad_norm": 434.643897722096, "learning_rate": 3.120422278988149e-07, "logits/chosen": -2.4068522453308105, "logits/rejected": -2.507352590560913, "logps/chosen": -17.3289852142334, "logps/rejected": -155.24156188964844, "loss": 17.0959, "losses_ref": -5.715831503039226e-05, "ref_logps/chosen": -95.69300842285156, "ref_logps/rejected": -86.13782501220703, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 78.36402893066406, "rewards/margins": 147.4677734375, "rewards/rejected": -69.10374450683594, "step": 5680, "u": -4.363373756408691, "weight": 0.04375145584344864 }, { "diff_generated": -68.53547668457031, "epoch": 1.8438107582631238, "grad_norm": 459.7871284739823, "learning_rate": 3.10571566171439e-07, "logits/chosen": -2.435319185256958, "logits/rejected": -2.5634605884552, "logps/chosen": -17.82388687133789, "logps/rejected": -155.66110229492188, "loss": 17.2522, "losses_ref": -0.0006606754614040256, "ref_logps/chosen": -94.27436828613281, "ref_logps/rejected": -87.12561798095703, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 76.45048522949219, "rewards/margins": 144.98594665527344, "rewards/rejected": -68.53547668457031, "step": 5690, "u": -4.3797197341918945, "weight": 0.03752906247973442 }, { "diff_generated": -68.64985656738281, "epoch": 1.847051198963059, "grad_norm": 427.58675584780747, "learning_rate": 3.0910217582686756e-07, "logits/chosen": -2.4144062995910645, "logits/rejected": -2.5840725898742676, "logps/chosen": -16.03522491455078, "logps/rejected": -154.49697875976562, "loss": 17.4557, "losses_ref": -0.0017615113174542785, "ref_logps/chosen": -88.2295150756836, "ref_logps/rejected": -85.84712219238281, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 72.19429016113281, "rewards/margins": 140.84414672851562, "rewards/rejected": -68.64985656738281, "step": 5700, "u": -4.2154765129089355, "weight": 0.06880569458007812 }, { "diff_generated": -67.02249145507812, "epoch": 1.850291639662994, "grad_norm": 412.16964669146034, "learning_rate": 3.0763407775507426e-07, "logits/chosen": -2.4929986000061035, "logits/rejected": -2.591066360473633, "logps/chosen": -16.952259063720703, "logps/rejected": -154.13525390625, "loss": 16.801, "losses_ref": -0.0003933070693165064, "ref_logps/chosen": -93.57677459716797, "ref_logps/rejected": -87.11276245117188, "rewards/accuracies": 0.90625, "rewards/chosen": 76.62451171875, "rewards/margins": 143.64700317382812, "rewards/rejected": -67.02249145507812, "step": 5710, "u": -4.159280300140381, "weight": 0.09376726299524307 }, { "diff_generated": -69.30915832519531, "epoch": 1.8535320803629294, "grad_norm": 427.6719015690686, "learning_rate": 3.0616729282766037e-07, "logits/chosen": -2.395036220550537, "logits/rejected": -2.5346271991729736, "logps/chosen": -15.799379348754883, "logps/rejected": -152.8411865234375, "loss": 17.1469, "losses_ref": -8.902359738272025e-09, "ref_logps/chosen": -90.04239654541016, "ref_logps/rejected": -83.53202819824219, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 74.24299621582031, "rewards/margins": 143.55215454101562, "rewards/rejected": -69.30915832519531, "step": 5720, "u": -4.275652885437012, "weight": 0.06875000149011612 }, { "diff_generated": -66.54766845703125, "epoch": 1.8567725210628645, "grad_norm": 482.6223148458128, "learning_rate": 3.047018418975593e-07, "logits/chosen": -2.4510700702667236, "logits/rejected": -2.4502556324005127, "logps/chosen": -18.8032283782959, "logps/rejected": -148.94503784179688, "loss": 17.357, "losses_ref": -0.0006291717290878296, "ref_logps/chosen": -98.66024780273438, "ref_logps/rejected": -82.39737701416016, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 79.85701751708984, "rewards/margins": 146.40469360351562, "rewards/rejected": -66.54766845703125, "step": 5730, "u": -4.319241523742676, "weight": 0.05003003031015396 }, { "diff_generated": -70.15180969238281, "epoch": 1.8600129617627998, "grad_norm": 436.60195450201667, "learning_rate": 3.032377457987385e-07, "logits/chosen": -2.4268975257873535, "logits/rejected": -2.542205810546875, "logps/chosen": -17.047893524169922, "logps/rejected": -156.6510772705078, "loss": 16.1966, "losses_ref": -4.815445208805613e-06, "ref_logps/chosen": -95.20332336425781, "ref_logps/rejected": -86.49925231933594, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 78.15543365478516, "rewards/margins": 148.30723571777344, "rewards/rejected": -70.15180969238281, "step": 5740, "u": -4.337141990661621, "weight": 0.0500001423060894 }, { "diff_generated": -72.12767791748047, "epoch": 1.863253402462735, "grad_norm": 487.1011768414387, "learning_rate": 3.017750253459048e-07, "logits/chosen": -2.4596199989318848, "logits/rejected": -2.529163360595703, "logps/chosen": -18.66078758239746, "logps/rejected": -162.290283203125, "loss": 17.5329, "losses_ref": -6.803600172133883e-07, "ref_logps/chosen": -98.63432312011719, "ref_logps/rejected": -90.16262817382812, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 79.9735336303711, "rewards/margins": 152.10122680664062, "rewards/rejected": -72.12767791748047, "step": 5750, "u": -4.4022650718688965, "weight": 0.03750002756714821 }, { "diff_generated": -72.43559265136719, "epoch": 1.86649384316267, "grad_norm": 437.2306151875033, "learning_rate": 3.003137013342071e-07, "logits/chosen": -2.48848819732666, "logits/rejected": -2.634258508682251, "logps/chosen": -15.372105598449707, "logps/rejected": -163.10440063476562, "loss": 16.7388, "losses_ref": -0.0001623667194508016, "ref_logps/chosen": -92.11798095703125, "ref_logps/rejected": -90.66879272460938, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 76.7458724975586, "rewards/margins": 149.1814727783203, "rewards/rejected": -72.43559265136719, "step": 5760, "u": -4.388466835021973, "weight": 0.04375699535012245 }, { "diff_generated": -72.01509857177734, "epoch": 1.8697342838626052, "grad_norm": 456.3011398041365, "learning_rate": 2.9885379453894224e-07, "logits/chosen": -2.477651596069336, "logits/rejected": -2.6130967140197754, "logps/chosen": -15.400982856750488, "logps/rejected": -157.45724487304688, "loss": 16.5494, "losses_ref": -0.0009191132267005742, "ref_logps/chosen": -94.54651641845703, "ref_logps/rejected": -85.44215393066406, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 79.1455307006836, "rewards/margins": 151.16061401367188, "rewards/rejected": -72.01509857177734, "step": 5770, "u": -4.335265159606934, "weight": 0.05004185438156128 }, { "diff_generated": -71.83040618896484, "epoch": 1.8729747245625405, "grad_norm": 453.99773682282233, "learning_rate": 2.9739532571525806e-07, "logits/chosen": -2.494704008102417, "logits/rejected": -2.625767946243286, "logps/chosen": -15.995404243469238, "logps/rejected": -157.23162841796875, "loss": 17.0838, "losses_ref": -0.0009462740272283554, "ref_logps/chosen": -95.97868347167969, "ref_logps/rejected": -85.40122985839844, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 79.98329162597656, "rewards/margins": 151.8136749267578, "rewards/rejected": -71.83040618896484, "step": 5780, "u": -4.389662265777588, "weight": 0.03754409775137901 }, { "diff_generated": -68.6485366821289, "epoch": 1.8762151652624757, "grad_norm": 475.71899486903936, "learning_rate": 2.959383155978596e-07, "logits/chosen": -2.435675621032715, "logits/rejected": -2.5268807411193848, "logps/chosen": -15.956690788269043, "logps/rejected": -156.03738403320312, "loss": 17.3861, "losses_ref": -3.328962702653371e-05, "ref_logps/chosen": -95.65718078613281, "ref_logps/rejected": -87.38885498046875, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 79.70049285888672, "rewards/margins": 148.34902954101562, "rewards/rejected": -68.6485366821289, "step": 5790, "u": -4.327332496643066, "weight": 0.05000089854001999 }, { "diff_generated": -67.33222198486328, "epoch": 1.879455605962411, "grad_norm": 458.83112990203347, "learning_rate": 2.9448278490071373e-07, "logits/chosen": -2.4544126987457275, "logits/rejected": -2.554018020629883, "logps/chosen": -16.464946746826172, "logps/rejected": -151.57635498046875, "loss": 17.5263, "losses_ref": -1.2650698977267893e-07, "ref_logps/chosen": -93.53096771240234, "ref_logps/rejected": -84.24412536621094, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 77.06602478027344, "rewards/margins": 144.39825439453125, "rewards/rejected": -67.33222198486328, "step": 5800, "u": -4.192667484283447, "weight": 0.07500000298023224 }, { "diff_generated": -73.76513671875, "epoch": 1.8826960466623461, "grad_norm": 451.9329289107445, "learning_rate": 2.930287543167544e-07, "logits/chosen": -2.510129451751709, "logits/rejected": -2.558194875717163, "logps/chosen": -17.454477310180664, "logps/rejected": -160.72348022460938, "loss": 16.6293, "losses_ref": -0.00039221724728122354, "ref_logps/chosen": -104.30326080322266, "ref_logps/rejected": -86.95834350585938, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 86.84878540039062, "rewards/margins": 160.61392211914062, "rewards/rejected": -73.76513671875, "step": 5810, "u": -4.528841018676758, "weight": 0.006267140153795481 }, { "diff_generated": -67.52911376953125, "epoch": 1.8859364873622813, "grad_norm": 454.44370150744254, "learning_rate": 2.9157624451758944e-07, "logits/chosen": -2.428351640701294, "logits/rejected": -2.445155382156372, "logps/chosen": -17.483226776123047, "logps/rejected": -150.34759521484375, "loss": 16.8459, "losses_ref": -5.854690954265607e-08, "ref_logps/chosen": -97.53578186035156, "ref_logps/rejected": -82.81848907470703, "rewards/accuracies": 0.9375, "rewards/chosen": 80.05255889892578, "rewards/margins": 147.5816650390625, "rewards/rejected": -67.52911376953125, "step": 5820, "u": -4.267212867736816, "weight": 0.0625 }, { "diff_generated": -70.92760467529297, "epoch": 1.8891769280622164, "grad_norm": 451.70117899869916, "learning_rate": 2.901252761532055e-07, "logits/chosen": -2.491132974624634, "logits/rejected": -2.526857376098633, "logps/chosen": -17.131113052368164, "logps/rejected": -158.18505859375, "loss": 16.4416, "losses_ref": -7.195662874437403e-06, "ref_logps/chosen": -100.11669921875, "ref_logps/rejected": -87.25745391845703, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 82.985595703125, "rewards/margins": 153.91317749023438, "rewards/rejected": -70.92760467529297, "step": 5830, "u": -4.349549293518066, "weight": 0.03750026598572731 }, { "diff_generated": -68.52245330810547, "epoch": 1.8924173687621515, "grad_norm": 524.9177651666605, "learning_rate": 2.8867586985167523e-07, "logits/chosen": -2.51749324798584, "logits/rejected": -2.5768675804138184, "logps/chosen": -16.798006057739258, "logps/rejected": -156.85104370117188, "loss": 16.435, "losses_ref": -4.86377757624723e-05, "ref_logps/chosen": -96.13957977294922, "ref_logps/rejected": -88.32856750488281, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 79.3415756225586, "rewards/margins": 147.86402893066406, "rewards/rejected": -68.52245330810547, "step": 5840, "u": -4.243464469909668, "weight": 0.0687512457370758 }, { "diff_generated": -69.49964141845703, "epoch": 1.8956578094620868, "grad_norm": 443.47953975122937, "learning_rate": 2.8722804621886364e-07, "logits/chosen": -2.4797089099884033, "logits/rejected": -2.560629367828369, "logps/chosen": -17.303192138671875, "logps/rejected": -153.7431182861328, "loss": 17.1681, "losses_ref": -0.00043771107448264956, "ref_logps/chosen": -92.99197387695312, "ref_logps/rejected": -84.24346923828125, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 75.68878936767578, "rewards/margins": 145.1884307861328, "rewards/rejected": -69.49964141845703, "step": 5850, "u": -4.391529560089111, "weight": 0.04376835376024246 }, { "diff_generated": -71.12813568115234, "epoch": 1.8988982501620222, "grad_norm": 469.21594692643225, "learning_rate": 2.857818258381358e-07, "logits/chosen": -2.4311540126800537, "logits/rejected": -2.511909246444702, "logps/chosen": -17.4440975189209, "logps/rejected": -162.2183380126953, "loss": 16.9258, "losses_ref": -0.0005043140263296664, "ref_logps/chosen": -99.31665802001953, "ref_logps/rejected": -91.09019470214844, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 81.87255859375, "rewards/margins": 153.00070190429688, "rewards/rejected": -71.12813568115234, "step": 5860, "u": -4.432432651519775, "weight": 0.037522491067647934 }, { "diff_generated": -69.65772247314453, "epoch": 1.9021386908619573, "grad_norm": 448.62853805445513, "learning_rate": 2.8433722927006314e-07, "logits/chosen": -2.4755847454071045, "logits/rejected": -2.5879673957824707, "logps/chosen": -18.797056198120117, "logps/rejected": -156.71347045898438, "loss": 16.861, "losses_ref": -9.900189979816787e-06, "ref_logps/chosen": -96.04248046875, "ref_logps/rejected": -87.05574798583984, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 77.24542236328125, "rewards/margins": 146.9031219482422, "rewards/rejected": -69.65772247314453, "step": 5870, "u": -4.378717422485352, "weight": 0.037500329315662384 }, { "diff_generated": -69.74181365966797, "epoch": 1.9053791315618924, "grad_norm": 445.2510424039169, "learning_rate": 2.82894277052132e-07, "logits/chosen": -2.4473252296447754, "logits/rejected": -2.5861172676086426, "logps/chosen": -16.19223976135254, "logps/rejected": -154.62640380859375, "loss": 16.6272, "losses_ref": -1.7233291146112606e-05, "ref_logps/chosen": -93.07598114013672, "ref_logps/rejected": -84.88458251953125, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 76.88375091552734, "rewards/margins": 146.62557983398438, "rewards/rejected": -69.74181365966797, "step": 5880, "u": -4.307101249694824, "weight": 0.05625048279762268 }, { "diff_generated": -72.17448425292969, "epoch": 1.9086195722618275, "grad_norm": 480.57871183808413, "learning_rate": 2.814529896984514e-07, "logits/chosen": -2.4194865226745605, "logits/rejected": -2.496985673904419, "logps/chosen": -17.08938980102539, "logps/rejected": -162.45860290527344, "loss": 16.8849, "losses_ref": -0.0012498985743150115, "ref_logps/chosen": -94.307861328125, "ref_logps/rejected": -90.28411865234375, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 77.2184829711914, "rewards/margins": 149.39297485351562, "rewards/rejected": -72.17448425292969, "step": 5890, "u": -4.246086597442627, "weight": 0.06881099194288254 }, { "diff_generated": -67.37945556640625, "epoch": 1.9118600129617627, "grad_norm": 471.3713556679269, "learning_rate": 2.8001338769946126e-07, "logits/chosen": -2.438253879547119, "logits/rejected": -2.4733853340148926, "logps/chosen": -16.475162506103516, "logps/rejected": -146.17007446289062, "loss": 16.9889, "losses_ref": -8.270395483123139e-05, "ref_logps/chosen": -95.65721893310547, "ref_logps/rejected": -78.79060363769531, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 79.18206024169922, "rewards/margins": 146.56150817871094, "rewards/rejected": -67.37945556640625, "step": 5900, "u": -4.228701591491699, "weight": 0.06875334680080414 }, { "diff_generated": -69.47562408447266, "epoch": 1.915100453661698, "grad_norm": 423.27160932440387, "learning_rate": 2.7857549152164153e-07, "logits/chosen": -2.474318027496338, "logits/rejected": -2.5566134452819824, "logps/chosen": -14.384271621704102, "logps/rejected": -156.02700805664062, "loss": 16.5497, "losses_ref": -0.0011477151419967413, "ref_logps/chosen": -91.45785522460938, "ref_logps/rejected": -86.55137634277344, "rewards/accuracies": 0.9375, "rewards/chosen": 77.07359313964844, "rewards/margins": 146.54920959472656, "rewards/rejected": -69.47562408447266, "step": 5910, "u": -4.288008213043213, "weight": 0.06255216151475906 }, { "diff_generated": -73.12073516845703, "epoch": 1.9183408943616331, "grad_norm": 450.40903172345037, "learning_rate": 2.7713932160722043e-07, "logits/chosen": -2.4164717197418213, "logits/rejected": -2.566805124282837, "logps/chosen": -17.061847686767578, "logps/rejected": -162.56674194335938, "loss": 16.6963, "losses_ref": -0.0018255922477692366, "ref_logps/chosen": -95.25648498535156, "ref_logps/rejected": -89.44600677490234, "rewards/accuracies": 0.96875, "rewards/chosen": 78.19463348388672, "rewards/margins": 151.31536865234375, "rewards/rejected": -73.12073516845703, "step": 5920, "u": -4.383121967315674, "weight": 0.03133777529001236 }, { "diff_generated": -72.05683135986328, "epoch": 1.9215813350615685, "grad_norm": 447.0661139454466, "learning_rate": 2.757048983738847e-07, "logits/chosen": -2.460322856903076, "logits/rejected": -2.580812931060791, "logps/chosen": -16.62095832824707, "logps/rejected": -163.3804931640625, "loss": 16.1758, "losses_ref": -0.001195084652863443, "ref_logps/chosen": -94.19022369384766, "ref_logps/rejected": -91.32366180419922, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 77.56925964355469, "rewards/margins": 149.6260986328125, "rewards/rejected": -72.05683135986328, "step": 5930, "u": -4.401998519897461, "weight": 0.03755543380975723 }, { "diff_generated": -69.83851623535156, "epoch": 1.9248217757615036, "grad_norm": 437.08750385278915, "learning_rate": 2.742722422144885e-07, "logits/chosen": -2.4585840702056885, "logits/rejected": -2.5811221599578857, "logps/chosen": -17.641761779785156, "logps/rejected": -161.27040100097656, "loss": 16.6886, "losses_ref": -0.00032297830330207944, "ref_logps/chosen": -96.47654724121094, "ref_logps/rejected": -91.43187713623047, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 78.83478546142578, "rewards/margins": 148.6732940673828, "rewards/rejected": -69.83851623535156, "step": 5940, "u": -4.332736015319824, "weight": 0.05626402050256729 }, { "diff_generated": -69.04411315917969, "epoch": 1.9280622164614387, "grad_norm": 460.6095520647696, "learning_rate": 2.7284137349676466e-07, "logits/chosen": -2.393308162689209, "logits/rejected": -2.4955241680145264, "logps/chosen": -15.419522285461426, "logps/rejected": -156.3408660888672, "loss": 16.5459, "losses_ref": -2.796388720582854e-08, "ref_logps/chosen": -90.29603576660156, "ref_logps/rejected": -87.29673767089844, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 74.87651062011719, "rewards/margins": 143.92063903808594, "rewards/rejected": -69.04411315917969, "step": 5950, "u": -4.220925807952881, "weight": 0.07500000298023224 }, { "diff_generated": -70.74433898925781, "epoch": 1.9313026571613738, "grad_norm": 455.37578432642556, "learning_rate": 2.7141231256303343e-07, "logits/chosen": -2.4291133880615234, "logits/rejected": -2.5476202964782715, "logps/chosen": -19.036624908447266, "logps/rejected": -159.58172607421875, "loss": 17.6372, "losses_ref": -1.3405813660938293e-05, "ref_logps/chosen": -98.10065460205078, "ref_logps/rejected": -88.83736419677734, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 79.06402587890625, "rewards/margins": 149.80838012695312, "rewards/rejected": -70.74433898925781, "step": 5960, "u": -4.429462432861328, "weight": 0.03750030696392059 }, { "diff_generated": -68.28518676757812, "epoch": 1.9345430978613092, "grad_norm": 460.5678292263605, "learning_rate": 2.69985079729915e-07, "logits/chosen": -2.4392125606536865, "logits/rejected": -2.5044326782226562, "logps/chosen": -17.118776321411133, "logps/rejected": -148.4984588623047, "loss": 17.0562, "losses_ref": -0.0002496158122085035, "ref_logps/chosen": -98.86695861816406, "ref_logps/rejected": -80.21326446533203, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 81.74818420410156, "rewards/margins": 150.0333709716797, "rewards/rejected": -68.28518676757812, "step": 5970, "u": -4.321126461029053, "weight": 0.05626310035586357 }, { "diff_generated": -70.55073547363281, "epoch": 1.9377835385612443, "grad_norm": 492.5859735668745, "learning_rate": 2.6855969528803945e-07, "logits/chosen": -2.444809675216675, "logits/rejected": -2.5491554737091064, "logps/chosen": -18.09766387939453, "logps/rejected": -157.51620483398438, "loss": 17.5105, "losses_ref": -1.5437821275554597e-05, "ref_logps/chosen": -97.93743896484375, "ref_logps/rejected": -86.9654769897461, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 79.83978271484375, "rewards/margins": 150.3905029296875, "rewards/rejected": -70.55073547363281, "step": 5980, "u": -4.346704959869385, "weight": 0.043750226497650146 }, { "diff_generated": -66.5762939453125, "epoch": 1.9410239792611796, "grad_norm": 431.79970088562357, "learning_rate": 2.6713617950175903e-07, "logits/chosen": -2.4027295112609863, "logits/rejected": -2.5291686058044434, "logps/chosen": -15.071945190429688, "logps/rejected": -148.65245056152344, "loss": 16.5865, "losses_ref": -1.4883593166814535e-06, "ref_logps/chosen": -91.81842803955078, "ref_logps/rejected": -82.0761489868164, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 76.74647521972656, "rewards/margins": 143.32278442382812, "rewards/rejected": -66.5762939453125, "step": 5990, "u": -4.325742721557617, "weight": 0.05000003054738045 }, { "diff_generated": -67.45310974121094, "epoch": 1.9442644199611148, "grad_norm": 466.72615095363875, "learning_rate": 2.657145526088593e-07, "logits/chosen": -2.3987174034118652, "logits/rejected": -2.532212018966675, "logps/chosen": -16.78557586669922, "logps/rejected": -147.83082580566406, "loss": 16.9198, "losses_ref": -2.534356724481768e-07, "ref_logps/chosen": -89.77131652832031, "ref_logps/rejected": -80.37770080566406, "rewards/accuracies": 0.90625, "rewards/chosen": 72.98574829101562, "rewards/margins": 140.43885803222656, "rewards/rejected": -67.45310974121094, "step": 6000, "u": -4.156952857971191, "weight": 0.0937500074505806 }, { "diff_generated": -69.40829467773438, "epoch": 1.9475048606610499, "grad_norm": 468.2504707444501, "learning_rate": 2.6429483482027243e-07, "logits/chosen": -2.43640398979187, "logits/rejected": -2.5646090507507324, "logps/chosen": -16.859195709228516, "logps/rejected": -153.06393432617188, "loss": 17.001, "losses_ref": -0.0001398011518176645, "ref_logps/chosen": -93.74928283691406, "ref_logps/rejected": -83.6556396484375, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 76.89009094238281, "rewards/margins": 146.2984161376953, "rewards/rejected": -69.40829467773438, "step": 6010, "u": -4.381274223327637, "weight": 0.04375586658716202 }, { "diff_generated": -66.53333282470703, "epoch": 1.950745301360985, "grad_norm": 453.98594442615945, "learning_rate": 2.628770463197889e-07, "logits/chosen": -2.504582643508911, "logits/rejected": -2.5561158657073975, "logps/chosen": -17.2025146484375, "logps/rejected": -151.67718505859375, "loss": 16.9012, "losses_ref": -2.9776818337268196e-05, "ref_logps/chosen": -98.95398712158203, "ref_logps/rejected": -85.14385223388672, "rewards/accuracies": 0.9375, "rewards/chosen": 81.75148010253906, "rewards/margins": 148.28482055664062, "rewards/rejected": -66.53333282470703, "step": 6020, "u": -4.286922931671143, "weight": 0.06250147521495819 }, { "diff_generated": -67.9385986328125, "epoch": 1.9539857420609201, "grad_norm": 437.9430910005116, "learning_rate": 2.6146120726377103e-07, "logits/chosen": -2.3654942512512207, "logits/rejected": -2.5123817920684814, "logps/chosen": -14.911382675170898, "logps/rejected": -147.7805633544922, "loss": 17.0955, "losses_ref": -0.0007130379672162235, "ref_logps/chosen": -89.0401611328125, "ref_logps/rejected": -79.84195709228516, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 74.12876892089844, "rewards/margins": 142.0673828125, "rewards/rejected": -67.9385986328125, "step": 6030, "u": -4.1883392333984375, "weight": 0.0875338613986969 }, { "diff_generated": -64.7969741821289, "epoch": 1.9572261827608555, "grad_norm": 448.64790944231504, "learning_rate": 2.600473377808667e-07, "logits/chosen": -2.430640697479248, "logits/rejected": -2.480102062225342, "logps/chosen": -16.628345489501953, "logps/rejected": -141.89987182617188, "loss": 16.4593, "losses_ref": -8.310528937727213e-05, "ref_logps/chosen": -89.28855895996094, "ref_logps/rejected": -77.10289764404297, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 72.66020965576172, "rewards/margins": 137.45718383789062, "rewards/rejected": -64.7969741821289, "step": 6040, "u": -4.257237434387207, "weight": 0.06875047832727432 }, { "diff_generated": -67.9603500366211, "epoch": 1.9604666234607908, "grad_norm": 452.8056559318795, "learning_rate": 2.5863545797172226e-07, "logits/chosen": -2.4394993782043457, "logits/rejected": -2.533747673034668, "logps/chosen": -17.315380096435547, "logps/rejected": -150.82473754882812, "loss": 17.2681, "losses_ref": -0.001021057483740151, "ref_logps/chosen": -93.55432891845703, "ref_logps/rejected": -82.8643798828125, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 76.23894500732422, "rewards/margins": 144.1992950439453, "rewards/rejected": -67.9603500366211, "step": 6050, "u": -4.3375749588012695, "weight": 0.05630000680685043 }, { "diff_generated": -67.12407684326172, "epoch": 1.963707064160726, "grad_norm": 450.7826790115826, "learning_rate": 2.5722558790869786e-07, "logits/chosen": -2.413055181503296, "logits/rejected": -2.4825711250305176, "logps/chosen": -17.044614791870117, "logps/rejected": -145.07261657714844, "loss": 16.397, "losses_ref": -2.6650173822417855e-05, "ref_logps/chosen": -91.9409408569336, "ref_logps/rejected": -77.94853210449219, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 74.89633178710938, "rewards/margins": 142.02041625976562, "rewards/rejected": -67.12407684326172, "step": 6060, "u": -4.174244403839111, "weight": 0.08125053346157074 }, { "diff_generated": -72.6106185913086, "epoch": 1.966947504860661, "grad_norm": 448.19207789067633, "learning_rate": 2.558177476355812e-07, "logits/chosen": -2.4541189670562744, "logits/rejected": -2.589430332183838, "logps/chosen": -19.113964080810547, "logps/rejected": -162.339111328125, "loss": 16.7823, "losses_ref": -0.0004977741627953947, "ref_logps/chosen": -98.77516174316406, "ref_logps/rejected": -89.72850036621094, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 79.66120147705078, "rewards/margins": 152.27182006835938, "rewards/rejected": -72.6106185913086, "step": 6070, "u": -4.469677448272705, "weight": 0.02501002512872219 }, { "diff_generated": -72.3133316040039, "epoch": 1.9701879455605962, "grad_norm": 457.84983535438994, "learning_rate": 2.544119571673031e-07, "logits/chosen": -2.467571496963501, "logits/rejected": -2.5975966453552246, "logps/chosen": -16.998483657836914, "logps/rejected": -161.43502807617188, "loss": 16.7225, "losses_ref": -0.015463406220078468, "ref_logps/chosen": -95.86445617675781, "ref_logps/rejected": -89.1217041015625, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 78.86597442626953, "rewards/margins": 151.17929077148438, "rewards/rejected": -72.3133316040039, "step": 6080, "u": -4.396916389465332, "weight": 0.03814239054918289 }, { "diff_generated": -72.62419128417969, "epoch": 1.9734283862605313, "grad_norm": 508.9547507073546, "learning_rate": 2.5300823648965267e-07, "logits/chosen": -2.4152259826660156, "logits/rejected": -2.534400463104248, "logps/chosen": -15.61021614074707, "logps/rejected": -160.32122802734375, "loss": 16.9425, "losses_ref": -1.0055341590486933e-06, "ref_logps/chosen": -92.42676544189453, "ref_logps/rejected": -87.69705200195312, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 76.8165512084961, "rewards/margins": 149.44073486328125, "rewards/rejected": -72.62419128417969, "step": 6090, "u": -4.382531642913818, "weight": 0.04375002905726433 }, { "diff_generated": -72.01034545898438, "epoch": 1.9766688269604666, "grad_norm": 439.69088541526725, "learning_rate": 2.516066055589937e-07, "logits/chosen": -2.447578191757202, "logits/rejected": -2.578108549118042, "logps/chosen": -15.061620712280273, "logps/rejected": -158.85385131835938, "loss": 16.9858, "losses_ref": -7.055670039335382e-07, "ref_logps/chosen": -92.02629852294922, "ref_logps/rejected": -86.84352111816406, "rewards/accuracies": 0.9375, "rewards/chosen": 76.96467590332031, "rewards/margins": 148.9750213623047, "rewards/rejected": -72.01034545898438, "step": 6100, "u": -4.283817291259766, "weight": 0.0625000149011612 }, { "diff_generated": -71.61358642578125, "epoch": 1.9799092676604018, "grad_norm": 412.81350587394274, "learning_rate": 2.502070843019799e-07, "logits/chosen": -2.438821315765381, "logits/rejected": -2.559591770172119, "logps/chosen": -17.955440521240234, "logps/rejected": -159.21438598632812, "loss": 16.1884, "losses_ref": -0.00022827927023172379, "ref_logps/chosen": -97.82897186279297, "ref_logps/rejected": -87.60079193115234, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 79.87352752685547, "rewards/margins": 151.48712158203125, "rewards/rejected": -71.61358642578125, "step": 6110, "u": -4.4112467765808105, "weight": 0.03751049190759659 }, { "diff_generated": -68.79540252685547, "epoch": 1.983149708360337, "grad_norm": 438.77721573717224, "learning_rate": 2.4880969261527294e-07, "logits/chosen": -2.4548709392547607, "logits/rejected": -2.555267095565796, "logps/chosen": -16.763917922973633, "logps/rejected": -156.84829711914062, "loss": 16.7081, "losses_ref": -9.546678484184667e-06, "ref_logps/chosen": -96.22209167480469, "ref_logps/rejected": -88.05290222167969, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 79.45818328857422, "rewards/margins": 148.2535858154297, "rewards/rejected": -68.79540252685547, "step": 6120, "u": -4.3454976081848145, "weight": 0.03750038146972656 }, { "diff_generated": -70.16226196289062, "epoch": 1.9863901490602722, "grad_norm": 472.14024979167294, "learning_rate": 2.4741445036525814e-07, "logits/chosen": -2.4340896606445312, "logits/rejected": -2.510011911392212, "logps/chosen": -14.988530158996582, "logps/rejected": -155.96693420410156, "loss": 16.1331, "losses_ref": -0.0012874031672254205, "ref_logps/chosen": -92.43404388427734, "ref_logps/rejected": -85.80467224121094, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 77.44551086425781, "rewards/margins": 147.60777282714844, "rewards/rejected": -70.16226196289062, "step": 6130, "u": -4.145638942718506, "weight": 0.08755604922771454 }, { "diff_generated": -66.4461669921875, "epoch": 1.9896305897602073, "grad_norm": 460.0733260209511, "learning_rate": 2.460213773877635e-07, "logits/chosen": -2.3939826488494873, "logits/rejected": -2.493049144744873, "logps/chosen": -15.80755615234375, "logps/rejected": -144.96444702148438, "loss": 16.6138, "losses_ref": -0.0007387199439108372, "ref_logps/chosen": -90.4625473022461, "ref_logps/rejected": -78.51826477050781, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 74.65499114990234, "rewards/margins": 141.10116577148438, "rewards/rejected": -66.4461669921875, "step": 6140, "u": -4.115630149841309, "weight": 0.10003437101840973 }, { "diff_generated": -73.65410614013672, "epoch": 1.9928710304601425, "grad_norm": 505.0343263835002, "learning_rate": 2.4463049348777666e-07, "logits/chosen": -2.4246139526367188, "logits/rejected": -2.5277953147888184, "logps/chosen": -15.673370361328125, "logps/rejected": -161.61569213867188, "loss": 16.5994, "losses_ref": -1.5297347388809612e-08, "ref_logps/chosen": -96.08084869384766, "ref_logps/rejected": -87.96159362792969, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 80.407470703125, "rewards/margins": 154.0615692138672, "rewards/rejected": -73.65410614013672, "step": 6150, "u": -4.396246910095215, "weight": 0.03750000149011612 }, { "diff_generated": -68.90767669677734, "epoch": 1.9961114711600778, "grad_norm": 453.67577081346997, "learning_rate": 2.4324181843916364e-07, "logits/chosen": -2.441559076309204, "logits/rejected": -2.5246987342834473, "logps/chosen": -19.356121063232422, "logps/rejected": -152.5162353515625, "loss": 16.7865, "losses_ref": -1.9142080986966903e-07, "ref_logps/chosen": -94.289306640625, "ref_logps/rejected": -83.60856628417969, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 74.93318176269531, "rewards/margins": 143.8408660888672, "rewards/rejected": -68.90767669677734, "step": 6160, "u": -4.186587333679199, "weight": 0.07500000298023224 }, { "diff_generated": -74.30818176269531, "epoch": 1.999351911860013, "grad_norm": 440.2516626198898, "learning_rate": 2.4185537198438777e-07, "logits/chosen": -2.484731674194336, "logits/rejected": -2.582129955291748, "logps/chosen": -17.606393814086914, "logps/rejected": -162.97836303710938, "loss": 16.7489, "losses_ref": -0.0005036066868342459, "ref_logps/chosen": -99.83641052246094, "ref_logps/rejected": -88.67017364501953, "rewards/accuracies": 0.96875, "rewards/chosen": 82.23001861572266, "rewards/margins": 156.5382080078125, "rewards/rejected": -74.30818176269531, "step": 6170, "u": -4.434797763824463, "weight": 0.03127221763134003 }, { "diff_generated": -73.82777404785156, "epoch": 2.0025923525599483, "grad_norm": 465.6875766799557, "learning_rate": 2.40471173834229e-07, "logits/chosen": -2.477844476699829, "logits/rejected": -2.580146074295044, "logps/chosen": -14.714004516601562, "logps/rejected": -164.83810424804688, "loss": 14.2976, "losses_ref": -0.014145202934741974, "ref_logps/chosen": -98.5716781616211, "ref_logps/rejected": -91.01033782958984, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 83.85767364501953, "rewards/margins": 157.68545532226562, "rewards/rejected": -73.82777404785156, "step": 6180, "u": -6.301093578338623, "weight": 0.019366895779967308 }, { "diff_generated": -76.43583679199219, "epoch": 2.0058327932598834, "grad_norm": 442.8113423529574, "learning_rate": 2.3908924366750385e-07, "logits/chosen": -2.419363498687744, "logits/rejected": -2.518618583679199, "logps/chosen": -13.03242301940918, "logps/rejected": -163.94589233398438, "loss": 13.0716, "losses_ref": -2.3213447093439754e-06, "ref_logps/chosen": -93.88200378417969, "ref_logps/rejected": -87.51005554199219, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 80.84957885742188, "rewards/margins": 157.28541564941406, "rewards/rejected": -76.43583679199219, "step": 6190, "u": -6.368585109710693, "weight": 0.07500007748603821 }, { "diff_generated": -82.67878723144531, "epoch": 2.0090732339598185, "grad_norm": 494.5893161029665, "learning_rate": 2.3770960113078505e-07, "logits/chosen": -2.412153720855713, "logits/rejected": -2.5851235389709473, "logps/chosen": -12.750777244567871, "logps/rejected": -172.10757446289062, "loss": 13.332, "losses_ref": -0.00504049938172102, "ref_logps/chosen": -93.04595184326172, "ref_logps/rejected": -89.42879486083984, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 80.29518127441406, "rewards/margins": 162.97393798828125, "rewards/rejected": -82.67878723144531, "step": 6200, "u": -6.533880710601807, "weight": 0.050222884863615036 }, { "diff_generated": -81.47315216064453, "epoch": 2.0123136746597536, "grad_norm": 444.69402811189104, "learning_rate": 2.3633226583812304e-07, "logits/chosen": -2.3955163955688477, "logits/rejected": -2.5131001472473145, "logps/chosen": -13.523590087890625, "logps/rejected": -170.2469482421875, "loss": 12.7886, "losses_ref": -0.018442081287503242, "ref_logps/chosen": -95.49665832519531, "ref_logps/rejected": -88.77379608154297, "rewards/accuracies": 0.96875, "rewards/chosen": 81.97306823730469, "rewards/margins": 163.4462127685547, "rewards/rejected": -81.47315216064453, "step": 6210, "u": -6.657259464263916, "weight": 0.0321735255420208 }, { "diff_generated": -77.23645782470703, "epoch": 2.0155541153596888, "grad_norm": 438.333881371591, "learning_rate": 2.3495725737076642e-07, "logits/chosen": -2.4523422718048096, "logits/rejected": -2.5315611362457275, "logps/chosen": -13.320953369140625, "logps/rejected": -159.6463623046875, "loss": 13.2749, "losses_ref": -0.0011979702394455671, "ref_logps/chosen": -97.73123931884766, "ref_logps/rejected": -82.40990447998047, "rewards/accuracies": 0.9375, "rewards/chosen": 84.41028594970703, "rewards/margins": 161.64674377441406, "rewards/rejected": -77.23645782470703, "step": 6220, "u": -6.433352470397949, "weight": 0.06255219876766205 }, { "diff_generated": -77.82986450195312, "epoch": 2.0187945560596243, "grad_norm": 421.5016639924878, "learning_rate": 2.3358459527688432e-07, "logits/chosen": -2.430393695831299, "logits/rejected": -2.5355336666107178, "logps/chosen": -13.974141120910645, "logps/rejected": -165.40695190429688, "loss": 13.4308, "losses_ref": -0.007256612181663513, "ref_logps/chosen": -99.11470794677734, "ref_logps/rejected": -87.57707214355469, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 85.14057159423828, "rewards/margins": 162.97042846679688, "rewards/rejected": -77.82986450195312, "step": 6230, "u": -6.607992649078369, "weight": 0.0378216877579689 }, { "diff_generated": -81.22264099121094, "epoch": 2.0220349967595594, "grad_norm": 485.103031244035, "learning_rate": 2.3221429907128734e-07, "logits/chosen": -2.4243063926696777, "logits/rejected": -2.553650140762329, "logps/chosen": -13.33491039276123, "logps/rejected": -173.98171997070312, "loss": 13.053, "losses_ref": -1.8116115825250745e-05, "ref_logps/chosen": -93.6789779663086, "ref_logps/rejected": -92.75906372070312, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 80.34407043457031, "rewards/margins": 161.5666961669922, "rewards/rejected": -81.22264099121094, "step": 6240, "u": -6.483554840087891, "weight": 0.050000619143247604 }, { "diff_generated": -79.91209411621094, "epoch": 2.0252754374594946, "grad_norm": 509.1077077436576, "learning_rate": 2.3084638823515136e-07, "logits/chosen": -2.411858558654785, "logits/rejected": -2.514808177947998, "logps/chosen": -12.484736442565918, "logps/rejected": -165.52288818359375, "loss": 12.819, "losses_ref": -0.00534836994484067, "ref_logps/chosen": -96.26007843017578, "ref_logps/rejected": -85.61079406738281, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 83.77534484863281, "rewards/margins": 163.6874542236328, "rewards/rejected": -79.91209411621094, "step": 6250, "u": -6.536829948425293, "weight": 0.05019962787628174 }, { "diff_generated": -78.60842895507812, "epoch": 2.0285158781594297, "grad_norm": 497.35899680965207, "learning_rate": 2.2948088221573986e-07, "logits/chosen": -2.4031450748443604, "logits/rejected": -2.511592149734497, "logps/chosen": -15.3632230758667, "logps/rejected": -166.2181396484375, "loss": 13.2077, "losses_ref": -3.695975010487018e-07, "ref_logps/chosen": -100.11011505126953, "ref_logps/rejected": -87.60970306396484, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 84.74689483642578, "rewards/margins": 163.35531616210938, "rewards/rejected": -78.60842895507812, "step": 6260, "u": -6.557508945465088, "weight": 0.037500012665987015 }, { "diff_generated": -77.8721923828125, "epoch": 2.031756318859365, "grad_norm": 471.59980342396824, "learning_rate": 2.2811780042612753e-07, "logits/chosen": -2.382750988006592, "logits/rejected": -2.492981433868408, "logps/chosen": -13.257951736450195, "logps/rejected": -166.88327026367188, "loss": 12.9873, "losses_ref": -3.5366301176509296e-07, "ref_logps/chosen": -95.4902114868164, "ref_logps/rejected": -89.01109313964844, "rewards/accuracies": 0.9375, "rewards/chosen": 82.23226165771484, "rewards/margins": 160.1044464111328, "rewards/rejected": -77.8721923828125, "step": 6270, "u": -6.430145263671875, "weight": 0.0625000074505806 }, { "diff_generated": -78.22283935546875, "epoch": 2.0349967595593, "grad_norm": 483.27826658503307, "learning_rate": 2.267571622449246e-07, "logits/chosen": -2.4012675285339355, "logits/rejected": -2.4772655963897705, "logps/chosen": -13.27515983581543, "logps/rejected": -161.35049438476562, "loss": 13.185, "losses_ref": -0.02035510167479515, "ref_logps/chosen": -92.0546875, "ref_logps/rejected": -83.12764739990234, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 78.77952575683594, "rewards/margins": 157.0023651123047, "rewards/rejected": -78.22283935546875, "step": 6280, "u": -6.3433518409729, "weight": 0.0760183110833168 }, { "diff_generated": -77.20220184326172, "epoch": 2.038237200259235, "grad_norm": 449.763223317915, "learning_rate": 2.2539898701600082e-07, "logits/chosen": -2.3824188709259033, "logits/rejected": -2.4665560722351074, "logps/chosen": -13.088310241699219, "logps/rejected": -159.70404052734375, "loss": 12.8279, "losses_ref": -1.1242273956213467e-07, "ref_logps/chosen": -95.3175277709961, "ref_logps/rejected": -82.50181579589844, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 82.22923278808594, "rewards/margins": 159.43142700195312, "rewards/rejected": -77.20220184326172, "step": 6290, "u": -6.379042625427246, "weight": 0.06875000149011612 }, { "diff_generated": -76.66944885253906, "epoch": 2.0414776409591706, "grad_norm": 477.9389997620248, "learning_rate": 2.2404329404821086e-07, "logits/chosen": -2.3812286853790283, "logits/rejected": -2.4753291606903076, "logps/chosen": -14.383306503295898, "logps/rejected": -158.17892456054688, "loss": 13.2235, "losses_ref": -1.3196420695749111e-05, "ref_logps/chosen": -93.89494323730469, "ref_logps/rejected": -81.50947570800781, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 79.51163482666016, "rewards/margins": 156.18109130859375, "rewards/rejected": -76.66944885253906, "step": 6300, "u": -6.47409725189209, "weight": 0.05000042915344238 }, { "diff_generated": -81.24650573730469, "epoch": 2.0447180816591057, "grad_norm": 513.9727361727379, "learning_rate": 2.2269010261511974e-07, "logits/chosen": -2.4193193912506104, "logits/rejected": -2.4820616245269775, "logps/chosen": -13.977895736694336, "logps/rejected": -167.22427368164062, "loss": 13.3155, "losses_ref": -0.0016800116281956434, "ref_logps/chosen": -100.33441162109375, "ref_logps/rejected": -85.97776794433594, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 86.35652160644531, "rewards/margins": 167.60301208496094, "rewards/rejected": -81.24650573730469, "step": 6310, "u": -6.495428562164307, "weight": 0.05632109194993973 }, { "diff_generated": -78.64732360839844, "epoch": 2.047958522359041, "grad_norm": 509.7672668654707, "learning_rate": 2.2133943195472874e-07, "logits/chosen": -2.389063596725464, "logits/rejected": -2.543116807937622, "logps/chosen": -12.62861156463623, "logps/rejected": -160.91986083984375, "loss": 12.6887, "losses_ref": -4.11522727006286e-08, "ref_logps/chosen": -92.83280944824219, "ref_logps/rejected": -82.27253723144531, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 80.2042007446289, "rewards/margins": 158.8515167236328, "rewards/rejected": -78.64732360839844, "step": 6320, "u": -6.228217124938965, "weight": 0.08124999701976776 }, { "diff_generated": -80.81334686279297, "epoch": 2.051198963058976, "grad_norm": 516.1515524619925, "learning_rate": 2.1999130126920158e-07, "logits/chosen": -2.4193766117095947, "logits/rejected": -2.548755168914795, "logps/chosen": -13.09967041015625, "logps/rejected": -170.51239013671875, "loss": 12.9505, "losses_ref": -0.0040515633299946785, "ref_logps/chosen": -97.01029205322266, "ref_logps/rejected": -89.69903564453125, "rewards/accuracies": 0.96875, "rewards/chosen": 83.91061401367188, "rewards/margins": 164.72396850585938, "rewards/rejected": -80.81334686279297, "step": 6330, "u": -6.651572227478027, "weight": 0.03141610696911812 }, { "diff_generated": -79.29988098144531, "epoch": 2.054439403758911, "grad_norm": 512.5347774712834, "learning_rate": 2.1864572972459228e-07, "logits/chosen": -2.3943607807159424, "logits/rejected": -2.567241668701172, "logps/chosen": -11.203665733337402, "logps/rejected": -165.3535919189453, "loss": 13.1086, "losses_ref": -0.0017887745052576065, "ref_logps/chosen": -88.66815185546875, "ref_logps/rejected": -86.05369567871094, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 77.46449279785156, "rewards/margins": 156.76437377929688, "rewards/rejected": -79.29988098144531, "step": 6340, "u": -6.356213092803955, "weight": 0.06882587820291519 }, { "diff_generated": -79.52970886230469, "epoch": 2.057679844458846, "grad_norm": 472.6827846426297, "learning_rate": 2.1730273645057173e-07, "logits/chosen": -2.3555686473846436, "logits/rejected": -2.450321674346924, "logps/chosen": -13.726099014282227, "logps/rejected": -167.40455627441406, "loss": 12.6711, "losses_ref": -0.00632984284311533, "ref_logps/chosen": -96.25859069824219, "ref_logps/rejected": -87.87483978271484, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 82.53250122070312, "rewards/margins": 162.0622100830078, "rewards/rejected": -79.52970886230469, "step": 6350, "u": -6.746617317199707, "weight": 0.019036132842302322 }, { "diff_generated": -82.3400650024414, "epoch": 2.060920285158782, "grad_norm": 521.7664594366094, "learning_rate": 2.1596234054015654e-07, "logits/chosen": -2.384187936782837, "logits/rejected": -2.5147311687469482, "logps/chosen": -13.163564682006836, "logps/rejected": -171.75308227539062, "loss": 13.3316, "losses_ref": -0.001959248911589384, "ref_logps/chosen": -94.92860412597656, "ref_logps/rejected": -89.41303253173828, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 81.76502990722656, "rewards/margins": 164.1051025390625, "rewards/rejected": -82.3400650024414, "step": 6360, "u": -6.575686454772949, "weight": 0.037587970495224 }, { "diff_generated": -79.38497161865234, "epoch": 2.064160725858717, "grad_norm": 470.4498756243637, "learning_rate": 2.1462456104943692e-07, "logits/chosen": -2.356865406036377, "logits/rejected": -2.474856376647949, "logps/chosen": -12.403861045837402, "logps/rejected": -163.8896942138672, "loss": 12.8589, "losses_ref": -1.5257610357366502e-05, "ref_logps/chosen": -93.32125091552734, "ref_logps/rejected": -84.50472259521484, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 80.91740417480469, "rewards/margins": 160.3023681640625, "rewards/rejected": -79.38497161865234, "step": 6370, "u": -6.5001540184021, "weight": 0.056250590831041336 }, { "diff_generated": -77.94718170166016, "epoch": 2.067401166558652, "grad_norm": 490.2535163314096, "learning_rate": 2.132894169973063e-07, "logits/chosen": -2.41898250579834, "logits/rejected": -2.504578113555908, "logps/chosen": -13.155477523803711, "logps/rejected": -160.1114959716797, "loss": 13.4251, "losses_ref": -0.004206720273941755, "ref_logps/chosen": -95.85162353515625, "ref_logps/rejected": -82.16432189941406, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 82.69615173339844, "rewards/margins": 160.64334106445312, "rewards/rejected": -77.94718170166016, "step": 6380, "u": -6.4894819259643555, "weight": 0.056429021060466766 }, { "diff_generated": -80.44488525390625, "epoch": 2.070641607258587, "grad_norm": 489.00207122247065, "learning_rate": 2.1195692736519013e-07, "logits/chosen": -2.4086267948150635, "logits/rejected": -2.506894588470459, "logps/chosen": -13.574335098266602, "logps/rejected": -168.43133544921875, "loss": 13.2942, "losses_ref": -0.040814243257045746, "ref_logps/chosen": -95.96758270263672, "ref_logps/rejected": -87.98646545410156, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 82.39324951171875, "rewards/margins": 162.83810424804688, "rewards/rejected": -80.44488525390625, "step": 6390, "u": -6.472268581390381, "weight": 0.039497148245573044 }, { "diff_generated": -80.75009155273438, "epoch": 2.0738820479585223, "grad_norm": 435.56787766493755, "learning_rate": 2.1062711109677757e-07, "logits/chosen": -2.4283664226531982, "logits/rejected": -2.535419464111328, "logps/chosen": -13.241666793823242, "logps/rejected": -166.3533477783203, "loss": 12.7151, "losses_ref": -0.002088053384795785, "ref_logps/chosen": -94.4981689453125, "ref_logps/rejected": -85.60326385498047, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 81.25650787353516, "rewards/margins": 162.006591796875, "rewards/rejected": -80.75009155273438, "step": 6400, "u": -6.565550327301025, "weight": 0.0438404306769371 }, { "diff_generated": -80.47855377197266, "epoch": 2.0771224886584574, "grad_norm": 480.20915565188733, "learning_rate": 2.0929998709775068e-07, "logits/chosen": -2.414332389831543, "logits/rejected": -2.4279377460479736, "logps/chosen": -13.154367446899414, "logps/rejected": -161.90716552734375, "loss": 12.9361, "losses_ref": -0.0005014360649511218, "ref_logps/chosen": -97.96044921875, "ref_logps/rejected": -81.42860412597656, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 84.80607604980469, "rewards/margins": 165.28463745117188, "rewards/rejected": -80.47855377197266, "step": 6410, "u": -6.4327850341796875, "weight": 0.06876204907894135 }, { "diff_generated": -81.22803497314453, "epoch": 2.080362929358393, "grad_norm": 515.953440725428, "learning_rate": 2.0797557423551574e-07, "logits/chosen": -2.416334629058838, "logits/rejected": -2.521374225616455, "logps/chosen": -13.194360733032227, "logps/rejected": -172.79388427734375, "loss": 13.1244, "losses_ref": -0.004450161475688219, "ref_logps/chosen": -100.98467254638672, "ref_logps/rejected": -91.56584167480469, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 87.7903060913086, "rewards/margins": 169.01834106445312, "rewards/rejected": -81.22803497314453, "step": 6420, "u": -6.475274085998535, "weight": 0.05643799155950546 }, { "diff_generated": -82.28623962402344, "epoch": 2.083603370058328, "grad_norm": 444.8669315098703, "learning_rate": 2.066538913389361e-07, "logits/chosen": -2.400272846221924, "logits/rejected": -2.5217082500457764, "logps/chosen": -13.127087593078613, "logps/rejected": -171.65762329101562, "loss": 13.2154, "losses_ref": -5.08952371092164e-06, "ref_logps/chosen": -97.62368774414062, "ref_logps/rejected": -89.37138366699219, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 84.49659729003906, "rewards/margins": 166.7828369140625, "rewards/rejected": -82.28623962402344, "step": 6430, "u": -6.559577941894531, "weight": 0.03750015050172806 }, { "diff_generated": -80.60981750488281, "epoch": 2.086843810758263, "grad_norm": 472.61054469292606, "learning_rate": 2.053349571980635e-07, "logits/chosen": -2.444110155105591, "logits/rejected": -2.498958110809326, "logps/chosen": -13.0723876953125, "logps/rejected": -164.4912567138672, "loss": 12.6799, "losses_ref": -0.001947386539541185, "ref_logps/chosen": -98.06462097167969, "ref_logps/rejected": -83.88143920898438, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 84.99222564697266, "rewards/margins": 165.60203552246094, "rewards/rejected": -80.60981750488281, "step": 6440, "u": -6.712790489196777, "weight": 0.025087004527449608 }, { "diff_generated": -78.48230743408203, "epoch": 2.0900842514581983, "grad_norm": 441.39652718067543, "learning_rate": 2.0401879056387155e-07, "logits/chosen": -2.354447841644287, "logits/rejected": -2.46140718460083, "logps/chosen": -11.920354843139648, "logps/rejected": -160.78182983398438, "loss": 13.2252, "losses_ref": -0.00024359929375350475, "ref_logps/chosen": -93.72767639160156, "ref_logps/rejected": -82.29952239990234, "rewards/accuracies": 0.9375, "rewards/chosen": 81.80731964111328, "rewards/margins": 160.28964233398438, "rewards/rejected": -78.48230743408203, "step": 6450, "u": -6.464148044586182, "weight": 0.06251000612974167 }, { "diff_generated": -77.48588562011719, "epoch": 2.0933246921581334, "grad_norm": 520.8812809619677, "learning_rate": 2.0270541014798864e-07, "logits/chosen": -2.358785390853882, "logits/rejected": -2.443070888519287, "logps/chosen": -12.780475616455078, "logps/rejected": -161.23361206054688, "loss": 13.0773, "losses_ref": -0.0012289454462006688, "ref_logps/chosen": -94.05496215820312, "ref_logps/rejected": -83.74772644042969, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 81.27449035644531, "rewards/margins": 158.7603759765625, "rewards/rejected": -77.48588562011719, "step": 6460, "u": -6.318059921264648, "weight": 0.08130304515361786 }, { "diff_generated": -79.57652282714844, "epoch": 2.0965651328580686, "grad_norm": 469.0737272469611, "learning_rate": 2.0139483462243225e-07, "logits/chosen": -2.327782392501831, "logits/rejected": -2.4817862510681152, "logps/chosen": -11.793447494506836, "logps/rejected": -166.00112915039062, "loss": 12.8989, "losses_ref": -3.7005463582318043e-08, "ref_logps/chosen": -91.22651672363281, "ref_logps/rejected": -86.42463684082031, "rewards/accuracies": 0.9375, "rewards/chosen": 79.43307495117188, "rewards/margins": 159.0095977783203, "rewards/rejected": -79.57652282714844, "step": 6470, "u": -6.46950626373291, "weight": 0.0625 }, { "diff_generated": -84.00804138183594, "epoch": 2.0998055735580037, "grad_norm": 478.75599326278547, "learning_rate": 2.00087082619343e-07, "logits/chosen": -2.361438274383545, "logits/rejected": -2.494079351425171, "logps/chosen": -12.477500915527344, "logps/rejected": -171.92276000976562, "loss": 12.8422, "losses_ref": -0.02715255320072174, "ref_logps/chosen": -95.34806060791016, "ref_logps/rejected": -87.91471862792969, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 82.87055969238281, "rewards/margins": 166.87860107421875, "rewards/rejected": -84.00804138183594, "step": 6480, "u": -6.616391181945801, "weight": 0.038793593645095825 }, { "diff_generated": -83.79143524169922, "epoch": 2.1030460142579392, "grad_norm": 481.8031524288851, "learning_rate": 1.9878217273072116e-07, "logits/chosen": -2.3388664722442627, "logits/rejected": -2.511909246444702, "logps/chosen": -11.842902183532715, "logps/rejected": -170.57864379882812, "loss": 12.9571, "losses_ref": -0.0015412219800055027, "ref_logps/chosen": -88.08900451660156, "ref_logps/rejected": -86.78721618652344, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 76.24609375, "rewards/margins": 160.03753662109375, "rewards/rejected": -83.79143524169922, "step": 6490, "u": -6.510128021240234, "weight": 0.05006871372461319 }, { "diff_generated": -83.82054901123047, "epoch": 2.1062864549578744, "grad_norm": 484.8975833780512, "learning_rate": 1.974801235081602e-07, "logits/chosen": -2.3324244022369385, "logits/rejected": -2.4709558486938477, "logps/chosen": -12.802592277526855, "logps/rejected": -167.81314086914062, "loss": 13.0662, "losses_ref": -0.0014063044218346477, "ref_logps/chosen": -90.68067932128906, "ref_logps/rejected": -83.99259948730469, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 77.87808990478516, "rewards/margins": 161.69863891601562, "rewards/rejected": -83.82054901123047, "step": 6500, "u": -6.287791728973389, "weight": 0.08131426572799683 }, { "diff_generated": -80.39985656738281, "epoch": 2.1095268956578095, "grad_norm": 519.4806520608191, "learning_rate": 1.9618095346258485e-07, "logits/chosen": -2.369863986968994, "logits/rejected": -2.460906505584717, "logps/chosen": -12.442307472229004, "logps/rejected": -161.36331176757812, "loss": 13.0987, "losses_ref": -1.0886478776228614e-06, "ref_logps/chosen": -96.80855560302734, "ref_logps/rejected": -80.96344757080078, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 84.36624908447266, "rewards/margins": 164.76611328125, "rewards/rejected": -80.39985656738281, "step": 6510, "u": -6.362314224243164, "weight": 0.07500002533197403 }, { "diff_generated": -85.63273620605469, "epoch": 2.1127673363577446, "grad_norm": 529.5689988573159, "learning_rate": 1.948846810639871e-07, "logits/chosen": -2.4165825843811035, "logits/rejected": -2.5419535636901855, "logps/chosen": -14.049939155578613, "logps/rejected": -174.29861450195312, "loss": 13.6717, "losses_ref": -0.0036067564506083727, "ref_logps/chosen": -99.15531921386719, "ref_logps/rejected": -88.66587829589844, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 85.10539245605469, "rewards/margins": 170.73812866210938, "rewards/rejected": -85.63273620605469, "step": 6520, "u": -6.568295478820801, "weight": 0.043914470821619034 }, { "diff_generated": -84.3892822265625, "epoch": 2.1160077770576797, "grad_norm": 495.94061382304113, "learning_rate": 1.9359132474116374e-07, "logits/chosen": -2.4003686904907227, "logits/rejected": -2.512028217315674, "logps/chosen": -13.148529052734375, "logps/rejected": -171.59902954101562, "loss": 13.2403, "losses_ref": -4.4650082031694183e-07, "ref_logps/chosen": -96.95204162597656, "ref_logps/rejected": -87.20973205566406, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 83.80351257324219, "rewards/margins": 168.1927947998047, "rewards/rejected": -84.3892822265625, "step": 6530, "u": -6.600459098815918, "weight": 0.037500012665987015 }, { "diff_generated": -83.33629608154297, "epoch": 2.119248217757615, "grad_norm": 502.5286805379201, "learning_rate": 1.923009028814545e-07, "logits/chosen": -2.3959062099456787, "logits/rejected": -2.577664852142334, "logps/chosen": -13.528100967407227, "logps/rejected": -170.61611938476562, "loss": 13.1637, "losses_ref": -1.0654134712240193e-05, "ref_logps/chosen": -93.49635314941406, "ref_logps/rejected": -87.2798080444336, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 79.96824645996094, "rewards/margins": 163.30455017089844, "rewards/rejected": -83.33629608154297, "step": 6540, "u": -6.5688958168029785, "weight": 0.04375031217932701 }, { "diff_generated": -80.15025329589844, "epoch": 2.1224886584575504, "grad_norm": 510.76521545503005, "learning_rate": 1.910134338304804e-07, "logits/chosen": -2.3848767280578613, "logits/rejected": -2.5160529613494873, "logps/chosen": -13.322404861450195, "logps/rejected": -164.24024963378906, "loss": 12.8367, "losses_ref": -0.000348121888237074, "ref_logps/chosen": -96.18370056152344, "ref_logps/rejected": -84.08997344970703, "rewards/accuracies": 0.9375, "rewards/chosen": 82.86128997802734, "rewards/margins": 163.0115509033203, "rewards/rejected": -80.15025329589844, "step": 6550, "u": -6.42775821685791, "weight": 0.06251399964094162 }, { "diff_generated": -84.13975524902344, "epoch": 2.1257290991574855, "grad_norm": 496.24500877500515, "learning_rate": 1.897289358918834e-07, "logits/chosen": -2.368081569671631, "logits/rejected": -2.479598045349121, "logps/chosen": -12.546297073364258, "logps/rejected": -171.93328857421875, "loss": 12.9928, "losses_ref": -0.0011568386107683182, "ref_logps/chosen": -97.453857421875, "ref_logps/rejected": -87.79353332519531, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 84.90755462646484, "rewards/margins": 169.0473175048828, "rewards/rejected": -84.13975524902344, "step": 6560, "u": -6.587471008300781, "weight": 0.03754870221018791 }, { "diff_generated": -78.54136657714844, "epoch": 2.1289695398574207, "grad_norm": 533.887128493409, "learning_rate": 1.8844742732706508e-07, "logits/chosen": -2.360599994659424, "logits/rejected": -2.4836483001708984, "logps/chosen": -12.47716236114502, "logps/rejected": -158.98513793945312, "loss": 12.3918, "losses_ref": -6.709530862281099e-05, "ref_logps/chosen": -89.12882995605469, "ref_logps/rejected": -80.44377136230469, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 76.65166473388672, "rewards/margins": 155.1930389404297, "rewards/rejected": -78.54136657714844, "step": 6570, "u": -6.196860313415527, "weight": 0.10000260174274445 }, { "diff_generated": -83.926025390625, "epoch": 2.1322099805573558, "grad_norm": 499.9450985494422, "learning_rate": 1.8716892635492906e-07, "logits/chosen": -2.401566982269287, "logits/rejected": -2.545031785964966, "logps/chosen": -12.370294570922852, "logps/rejected": -174.70640563964844, "loss": 12.6282, "losses_ref": -3.838214013285324e-08, "ref_logps/chosen": -96.84266662597656, "ref_logps/rejected": -90.7803726196289, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 84.47237396240234, "rewards/margins": 168.3983917236328, "rewards/rejected": -83.926025390625, "step": 6580, "u": -6.578399658203125, "weight": 0.03750000149011612 }, { "diff_generated": -79.67466735839844, "epoch": 2.135450421257291, "grad_norm": 470.58095351712876, "learning_rate": 1.8589345115161948e-07, "logits/chosen": -2.4138946533203125, "logits/rejected": -2.506059169769287, "logps/chosen": -12.929830551147461, "logps/rejected": -166.64724731445312, "loss": 12.697, "losses_ref": -9.55009671343987e-09, "ref_logps/chosen": -96.51829528808594, "ref_logps/rejected": -86.97258758544922, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 83.58845520019531, "rewards/margins": 163.26312255859375, "rewards/rejected": -79.67466735839844, "step": 6590, "u": -6.518712520599365, "weight": 0.05624999850988388 }, { "diff_generated": -84.69886016845703, "epoch": 2.138690861957226, "grad_norm": 460.42093157266436, "learning_rate": 1.846210198502646e-07, "logits/chosen": -2.4073596000671387, "logits/rejected": -2.526094436645508, "logps/chosen": -12.00372314453125, "logps/rejected": -172.64437866210938, "loss": 12.7592, "losses_ref": -0.04379742592573166, "ref_logps/chosen": -93.78791809082031, "ref_logps/rejected": -87.94551086425781, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 81.78419494628906, "rewards/margins": 166.48306274414062, "rewards/rejected": -84.69886016845703, "step": 6600, "u": -6.497406005859375, "weight": 0.05853024125099182 }, { "diff_generated": -86.61919403076172, "epoch": 2.141931302657161, "grad_norm": 484.96256944654857, "learning_rate": 1.8335165054071795e-07, "logits/chosen": -2.371428966522217, "logits/rejected": -2.5984182357788086, "logps/chosen": -11.764432907104492, "logps/rejected": -177.4099884033203, "loss": 12.6912, "losses_ref": -0.005594468675553799, "ref_logps/chosen": -90.90178680419922, "ref_logps/rejected": -90.79080963134766, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 79.13734436035156, "rewards/margins": 165.75656127929688, "rewards/rejected": -86.61919403076172, "step": 6610, "u": -6.5773138999938965, "weight": 0.04402199015021324 }, { "diff_generated": -83.58558654785156, "epoch": 2.1451717433570967, "grad_norm": 526.7006537364417, "learning_rate": 1.8208536126930173e-07, "logits/chosen": -2.4086477756500244, "logits/rejected": -2.5347962379455566, "logps/chosen": -13.11962890625, "logps/rejected": -171.22561645507812, "loss": 13.1348, "losses_ref": -0.0004633056523744017, "ref_logps/chosen": -95.87711334228516, "ref_logps/rejected": -87.64002990722656, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 82.75748443603516, "rewards/margins": 166.3430633544922, "rewards/rejected": -83.58558654785156, "step": 6620, "u": -6.69122838973999, "weight": 0.02501877211034298 }, { "diff_generated": -81.69627380371094, "epoch": 2.148412184057032, "grad_norm": 500.3112282939001, "learning_rate": 1.8082217003854933e-07, "logits/chosen": -2.4026248455047607, "logits/rejected": -2.5078542232513428, "logps/chosen": -12.539911270141602, "logps/rejected": -167.52072143554688, "loss": 12.9024, "losses_ref": -0.002466453704982996, "ref_logps/chosen": -94.2463150024414, "ref_logps/rejected": -85.824462890625, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 81.70640563964844, "rewards/margins": 163.40267944335938, "rewards/rejected": -81.69627380371094, "step": 6630, "u": -6.409876346588135, "weight": 0.0688544362783432 }, { "diff_generated": -80.04955291748047, "epoch": 2.151652624756967, "grad_norm": 497.75656122376796, "learning_rate": 1.7956209480695087e-07, "logits/chosen": -2.389005661010742, "logits/rejected": -2.5154757499694824, "logps/chosen": -13.171731948852539, "logps/rejected": -161.18557739257812, "loss": 12.8453, "losses_ref": -0.007567479275166988, "ref_logps/chosen": -93.38511657714844, "ref_logps/rejected": -81.13603210449219, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 80.21339416503906, "rewards/margins": 160.262939453125, "rewards/rejected": -80.04955291748047, "step": 6640, "u": -6.542353630065918, "weight": 0.03783651068806648 }, { "diff_generated": -80.24171447753906, "epoch": 2.154893065456902, "grad_norm": 512.22661502845, "learning_rate": 1.7830515348869664e-07, "logits/chosen": -2.3576130867004395, "logits/rejected": -2.528032064437866, "logps/chosen": -12.79431438446045, "logps/rejected": -163.2600555419922, "loss": 13.1941, "losses_ref": -7.352201691901428e-07, "ref_logps/chosen": -88.53156280517578, "ref_logps/rejected": -83.01834106445312, "rewards/accuracies": 0.9375, "rewards/chosen": 75.73725128173828, "rewards/margins": 155.97897338867188, "rewards/rejected": -80.24171447753906, "step": 6650, "u": -6.4445905685424805, "weight": 0.06250002980232239 }, { "diff_generated": -78.14437103271484, "epoch": 2.158133506156837, "grad_norm": 486.94817034851, "learning_rate": 1.770513639534225e-07, "logits/chosen": -2.3804211616516113, "logits/rejected": -2.4835352897644043, "logps/chosen": -12.82800006866455, "logps/rejected": -158.52310180664062, "loss": 12.9297, "losses_ref": -1.1419600014050957e-05, "ref_logps/chosen": -90.69715881347656, "ref_logps/rejected": -80.37873840332031, "rewards/accuracies": 0.90625, "rewards/chosen": 77.86915588378906, "rewards/margins": 156.01351928710938, "rewards/rejected": -78.14437103271484, "step": 6660, "u": -6.209801197052002, "weight": 0.09375027567148209 }, { "diff_generated": -79.85459899902344, "epoch": 2.1613739468567728, "grad_norm": 504.888228932194, "learning_rate": 1.7580074402595698e-07, "logits/chosen": -2.3688759803771973, "logits/rejected": -2.5014474391937256, "logps/chosen": -11.610100746154785, "logps/rejected": -164.87625122070312, "loss": 12.3143, "losses_ref": -4.962249704476562e-07, "ref_logps/chosen": -90.61629486083984, "ref_logps/rejected": -85.02165985107422, "rewards/accuracies": 0.90625, "rewards/chosen": 79.0062026977539, "rewards/margins": 158.86080932617188, "rewards/rejected": -79.85459899902344, "step": 6670, "u": -6.213131904602051, "weight": 0.0937500149011612 }, { "diff_generated": -84.20277404785156, "epoch": 2.164614387556708, "grad_norm": 459.58600259916545, "learning_rate": 1.7455331148606618e-07, "logits/chosen": -2.350407123565674, "logits/rejected": -2.4642868041992188, "logps/chosen": -12.381486892700195, "logps/rejected": -170.39273071289062, "loss": 12.3502, "losses_ref": -0.001807486405596137, "ref_logps/chosen": -92.08818817138672, "ref_logps/rejected": -86.18995666503906, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 79.70670318603516, "rewards/margins": 163.9094696044922, "rewards/rejected": -84.20277404785156, "step": 6680, "u": -6.574119567871094, "weight": 0.03757709637284279 }, { "diff_generated": -83.0318374633789, "epoch": 2.167854828256643, "grad_norm": 494.0762548465317, "learning_rate": 1.7330908406820237e-07, "logits/chosen": -2.378242254257202, "logits/rejected": -2.481672763824463, "logps/chosen": -11.903432846069336, "logps/rejected": -163.31051635742188, "loss": 12.7796, "losses_ref": -5.902011657177297e-10, "ref_logps/chosen": -93.77474975585938, "ref_logps/rejected": -80.27867126464844, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 81.87132263183594, "rewards/margins": 164.90316772460938, "rewards/rejected": -83.0318374633789, "step": 6690, "u": -6.509033203125, "weight": 0.05000000074505806 }, { "diff_generated": -81.65437316894531, "epoch": 2.171095268956578, "grad_norm": 502.7757106064382, "learning_rate": 1.7206807946125123e-07, "logits/chosen": -2.394428014755249, "logits/rejected": -2.4850785732269287, "logps/chosen": -13.18397045135498, "logps/rejected": -170.88265991210938, "loss": 13.5294, "losses_ref": -0.00011690105020534247, "ref_logps/chosen": -98.64112854003906, "ref_logps/rejected": -89.22828674316406, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 85.45716857910156, "rewards/margins": 167.11154174804688, "rewards/rejected": -81.65437316894531, "step": 6700, "u": -6.533175468444824, "weight": 0.05000308156013489 }, { "diff_generated": -84.60969543457031, "epoch": 2.1743357096565132, "grad_norm": 493.7980281327751, "learning_rate": 1.7083031530828072e-07, "logits/chosen": -2.3933663368225098, "logits/rejected": -2.4629433155059814, "logps/chosen": -13.83466911315918, "logps/rejected": -172.16818237304688, "loss": 12.7764, "losses_ref": -0.021861081942915916, "ref_logps/chosen": -104.56904602050781, "ref_logps/rejected": -87.55848693847656, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 90.734375, "rewards/margins": 175.34405517578125, "rewards/rejected": -84.60969543457031, "step": 6710, "u": -6.686763763427734, "weight": 0.01909906603395939 }, { "diff_generated": -83.56124114990234, "epoch": 2.1775761503564484, "grad_norm": 463.96518711732523, "learning_rate": 1.6959580920628937e-07, "logits/chosen": -2.364957332611084, "logits/rejected": -2.4778125286102295, "logps/chosen": -13.045331001281738, "logps/rejected": -167.7306365966797, "loss": 12.9641, "losses_ref": -0.0005527561879716814, "ref_logps/chosen": -95.67125701904297, "ref_logps/rejected": -84.16940307617188, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 82.62592315673828, "rewards/margins": 166.18714904785156, "rewards/rejected": -83.56124114990234, "step": 6720, "u": -6.551631927490234, "weight": 0.04377732053399086 }, { "diff_generated": -78.09506225585938, "epoch": 2.1808165910563835, "grad_norm": 534.711775566111, "learning_rate": 1.6836457870595783e-07, "logits/chosen": -2.340938091278076, "logits/rejected": -2.4108238220214844, "logps/chosen": -12.503395080566406, "logps/rejected": -156.5635986328125, "loss": 12.6524, "losses_ref": -0.006767577491700649, "ref_logps/chosen": -91.59136962890625, "ref_logps/rejected": -78.46855163574219, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 79.08796691894531, "rewards/margins": 157.1830291748047, "rewards/rejected": -78.09506225585938, "step": 6730, "u": -6.41765832901001, "weight": 0.06905090808868408 }, { "diff_generated": -83.96517181396484, "epoch": 2.184057031756319, "grad_norm": 531.0780111374985, "learning_rate": 1.6713664131139723e-07, "logits/chosen": -2.363882303237915, "logits/rejected": -2.435429096221924, "logps/chosen": -12.827771186828613, "logps/rejected": -169.38075256347656, "loss": 13.0853, "losses_ref": -0.0035223353188484907, "ref_logps/chosen": -98.59832763671875, "ref_logps/rejected": -85.41558837890625, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 85.77055358886719, "rewards/margins": 169.73573303222656, "rewards/rejected": -83.96517181396484, "step": 6740, "u": -6.626309394836426, "weight": 0.03765714913606644 }, { "diff_generated": -83.90516662597656, "epoch": 2.187297472456254, "grad_norm": 570.9212004915166, "learning_rate": 1.659120144799019e-07, "logits/chosen": -2.4047927856445312, "logits/rejected": -2.524055242538452, "logps/chosen": -13.002700805664062, "logps/rejected": -169.1326446533203, "loss": 12.9903, "losses_ref": -0.008439160883426666, "ref_logps/chosen": -94.8218994140625, "ref_logps/rejected": -85.22748565673828, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 81.81919860839844, "rewards/margins": 165.72434997558594, "rewards/rejected": -83.90516662597656, "step": 6750, "u": -6.468390464782715, "weight": 0.056635864078998566 }, { "diff_generated": -81.97956848144531, "epoch": 2.1905379131561893, "grad_norm": 479.3194514385537, "learning_rate": 1.6469071562170114e-07, "logits/chosen": -2.4284636974334717, "logits/rejected": -2.5151870250701904, "logps/chosen": -12.341389656066895, "logps/rejected": -165.90318298339844, "loss": 12.7468, "losses_ref": -3.743227505736968e-09, "ref_logps/chosen": -93.52154541015625, "ref_logps/rejected": -83.92363739013672, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 81.1801528930664, "rewards/margins": 163.15969848632812, "rewards/rejected": -81.97956848144531, "step": 6760, "u": -6.347648620605469, "weight": 0.06875000149011612 }, { "diff_generated": -83.8197250366211, "epoch": 2.1937783538561244, "grad_norm": 503.7998354414286, "learning_rate": 1.6347276209971024e-07, "logits/chosen": -2.3805127143859863, "logits/rejected": -2.5530269145965576, "logps/chosen": -11.702221870422363, "logps/rejected": -168.94638061523438, "loss": 12.4645, "losses_ref": -0.0016042323550209403, "ref_logps/chosen": -90.09027099609375, "ref_logps/rejected": -85.12665557861328, "rewards/accuracies": 0.9375, "rewards/chosen": 78.38804626464844, "rewards/margins": 162.20777893066406, "rewards/rejected": -83.8197250366211, "step": 6770, "u": -6.439352989196777, "weight": 0.06256560236215591 }, { "diff_generated": -80.71089935302734, "epoch": 2.1970187945560595, "grad_norm": 490.02924840371116, "learning_rate": 1.6225817122928534e-07, "logits/chosen": -2.3680498600006104, "logits/rejected": -2.483076333999634, "logps/chosen": -11.827688217163086, "logps/rejected": -166.25616455078125, "loss": 12.9312, "losses_ref": -0.0018793217604979873, "ref_logps/chosen": -91.48072814941406, "ref_logps/rejected": -85.54527282714844, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 79.65303802490234, "rewards/margins": 160.3639373779297, "rewards/rejected": -80.71089935302734, "step": 6780, "u": -6.448507785797119, "weight": 0.056329526007175446 }, { "diff_generated": -82.11483001708984, "epoch": 2.2002592352559946, "grad_norm": 484.9852882132878, "learning_rate": 1.6104696027797635e-07, "logits/chosen": -2.302804470062256, "logits/rejected": -2.517000675201416, "logps/chosen": -11.84687328338623, "logps/rejected": -165.51992797851562, "loss": 12.6721, "losses_ref": -1.3320375558123487e-07, "ref_logps/chosen": -86.8465805053711, "ref_logps/rejected": -83.40510559082031, "rewards/accuracies": 0.9375, "rewards/chosen": 74.99971771240234, "rewards/margins": 157.1145477294922, "rewards/rejected": -82.11483001708984, "step": 6790, "u": -6.432467460632324, "weight": 0.0625 }, { "diff_generated": -85.60330200195312, "epoch": 2.20349967595593, "grad_norm": 500.89423272331715, "learning_rate": 1.5983914646528193e-07, "logits/chosen": -2.3903326988220215, "logits/rejected": -2.4877140522003174, "logps/chosen": -13.389312744140625, "logps/rejected": -171.62155151367188, "loss": 12.7111, "losses_ref": -1.1886468200827949e-05, "ref_logps/chosen": -93.59146118164062, "ref_logps/rejected": -86.01826477050781, "rewards/accuracies": 0.9375, "rewards/chosen": 80.20214080810547, "rewards/margins": 165.80545043945312, "rewards/rejected": -85.60330200195312, "step": 6800, "u": -6.437894344329834, "weight": 0.06250022351741791 }, { "diff_generated": -84.71478271484375, "epoch": 2.2067401166558653, "grad_norm": 473.07032489741954, "learning_rate": 1.5863474696240365e-07, "logits/chosen": -2.424415111541748, "logits/rejected": -2.5038774013519287, "logps/chosen": -13.811508178710938, "logps/rejected": -169.12124633789062, "loss": 13.1273, "losses_ref": -0.000734654429834336, "ref_logps/chosen": -96.21209716796875, "ref_logps/rejected": -84.40647888183594, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 82.40059661865234, "rewards/margins": 167.11537170410156, "rewards/rejected": -84.71478271484375, "step": 6810, "u": -6.517383575439453, "weight": 0.05002971366047859 }, { "diff_generated": -81.48091125488281, "epoch": 2.2099805573558005, "grad_norm": 542.4283769774358, "learning_rate": 1.5743377889200388e-07, "logits/chosen": -2.375166654586792, "logits/rejected": -2.5270323753356934, "logps/chosen": -13.916537284851074, "logps/rejected": -167.28854370117188, "loss": 13.3413, "losses_ref": -0.003760767402127385, "ref_logps/chosen": -94.32147216796875, "ref_logps/rejected": -85.80763244628906, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 80.40492248535156, "rewards/margins": 161.88583374023438, "rewards/rejected": -81.48091125488281, "step": 6820, "u": -6.543093204498291, "weight": 0.05016561597585678 }, { "diff_generated": -82.93003845214844, "epoch": 2.2132209980557356, "grad_norm": 511.37752565432453, "learning_rate": 1.5623625932795994e-07, "logits/chosen": -2.41325044631958, "logits/rejected": -2.519089698791504, "logps/chosen": -13.878247261047363, "logps/rejected": -170.31137084960938, "loss": 13.2477, "losses_ref": -0.01453393418341875, "ref_logps/chosen": -97.43132781982422, "ref_logps/rejected": -87.38133239746094, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 83.55308532714844, "rewards/margins": 166.48312377929688, "rewards/rejected": -82.93003845214844, "step": 6830, "u": -6.694279670715332, "weight": 0.02572527527809143 }, { "diff_generated": -83.49250793457031, "epoch": 2.2164614387556707, "grad_norm": 526.8628217578116, "learning_rate": 1.5504220529512324e-07, "logits/chosen": -2.3933825492858887, "logits/rejected": -2.5277512073516846, "logps/chosen": -12.323593139648438, "logps/rejected": -166.7972869873047, "loss": 13.3676, "losses_ref": -0.06453749537467957, "ref_logps/chosen": -97.86778259277344, "ref_logps/rejected": -83.30477905273438, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 85.544189453125, "rewards/margins": 169.03668212890625, "rewards/rejected": -83.49250793457031, "step": 6840, "u": -6.69271183013916, "weight": 0.02855154313147068 }, { "diff_generated": -81.74871063232422, "epoch": 2.219701879455606, "grad_norm": 501.04138572115914, "learning_rate": 1.5385163376907636e-07, "logits/chosen": -2.4174468517303467, "logits/rejected": -2.477114200592041, "logps/chosen": -13.157957077026367, "logps/rejected": -164.51651000976562, "loss": 12.6108, "losses_ref": -0.0037777810357511044, "ref_logps/chosen": -97.16854858398438, "ref_logps/rejected": -82.76777648925781, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 84.0105972290039, "rewards/margins": 165.75930786132812, "rewards/rejected": -81.74871063232422, "step": 6850, "u": -6.4757208824157715, "weight": 0.056429821997880936 }, { "diff_generated": -78.98966979980469, "epoch": 2.222942320155541, "grad_norm": 504.75515042657014, "learning_rate": 1.526645616758921e-07, "logits/chosen": -2.3435864448547363, "logits/rejected": -2.469954252243042, "logps/chosen": -12.959577560424805, "logps/rejected": -161.3673553466797, "loss": 13.1083, "losses_ref": -0.0011534191435202956, "ref_logps/chosen": -90.16709899902344, "ref_logps/rejected": -82.37767791748047, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 77.20752716064453, "rewards/margins": 156.19717407226562, "rewards/rejected": -78.98966979980469, "step": 6860, "u": -6.452353000640869, "weight": 0.056289829313755035 }, { "diff_generated": -86.8541030883789, "epoch": 2.2261827608554765, "grad_norm": 486.5098764500833, "learning_rate": 1.5148100589189205e-07, "logits/chosen": -2.4341390132904053, "logits/rejected": -2.5227787494659424, "logps/chosen": -13.576004028320312, "logps/rejected": -172.723876953125, "loss": 13.1079, "losses_ref": -0.0012087022187188268, "ref_logps/chosen": -97.78997039794922, "ref_logps/rejected": -85.8697738647461, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 84.2139663696289, "rewards/margins": 171.06808471679688, "rewards/rejected": -86.8541030883789, "step": 6870, "u": -6.57666540145874, "weight": 0.0375499427318573 }, { "diff_generated": -87.36380004882812, "epoch": 2.2294232015554116, "grad_norm": 499.7333864772283, "learning_rate": 1.5030098324340808e-07, "logits/chosen": -2.41763973236084, "logits/rejected": -2.5282013416290283, "logps/chosen": -12.894952774047852, "logps/rejected": -174.74172973632812, "loss": 12.5028, "losses_ref": -0.002254816237837076, "ref_logps/chosen": -100.2215347290039, "ref_logps/rejected": -87.3779296875, "rewards/accuracies": 0.96875, "rewards/chosen": 87.32657623291016, "rewards/margins": 174.6903839111328, "rewards/rejected": -87.36380004882812, "step": 6880, "u": -6.649659156799316, "weight": 0.03135109692811966 }, { "diff_generated": -85.03706359863281, "epoch": 2.2326636422553467, "grad_norm": 503.3857547944509, "learning_rate": 1.491245105065419e-07, "logits/chosen": -2.418447732925415, "logits/rejected": -2.5524816513061523, "logps/chosen": -12.30914306640625, "logps/rejected": -173.20089721679688, "loss": 12.9675, "losses_ref": -0.005227755755186081, "ref_logps/chosen": -95.12874603271484, "ref_logps/rejected": -88.1638412475586, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 82.81959533691406, "rewards/margins": 167.85665893554688, "rewards/rejected": -85.03706359863281, "step": 6890, "u": -6.624598026275635, "weight": 0.037730418145656586 }, { "diff_generated": -86.80964660644531, "epoch": 2.235904082955282, "grad_norm": 518.1303914744179, "learning_rate": 1.4795160440692672e-07, "logits/chosen": -2.43729305267334, "logits/rejected": -2.527352809906006, "logps/chosen": -13.276708602905273, "logps/rejected": -173.63804626464844, "loss": 12.9862, "losses_ref": -0.0007822831976227462, "ref_logps/chosen": -99.302978515625, "ref_logps/rejected": -86.82841491699219, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 86.02627563476562, "rewards/margins": 172.83590698242188, "rewards/rejected": -86.80964660644531, "step": 6900, "u": -6.6816582679748535, "weight": 0.025032568722963333 }, { "diff_generated": -84.83689880371094, "epoch": 2.239144523655217, "grad_norm": 491.41988866598723, "learning_rate": 1.467822816194904e-07, "logits/chosen": -2.4127767086029053, "logits/rejected": -2.5224814414978027, "logps/chosen": -12.989709854125977, "logps/rejected": -169.14642333984375, "loss": 12.7208, "losses_ref": -0.005537012591958046, "ref_logps/chosen": -96.77596282958984, "ref_logps/rejected": -84.30952453613281, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 83.78624725341797, "rewards/margins": 168.62315368652344, "rewards/rejected": -84.83689880371094, "step": 6910, "u": -6.521003723144531, "weight": 0.050242386758327484 }, { "diff_generated": -85.5491943359375, "epoch": 2.242384964355152, "grad_norm": 550.1375066039961, "learning_rate": 1.4561655876821694e-07, "logits/chosen": -2.3261666297912598, "logits/rejected": -2.512333631515503, "logps/chosen": -12.838945388793945, "logps/rejected": -173.94326782226562, "loss": 12.9348, "losses_ref": -0.0017390226712450385, "ref_logps/chosen": -91.61780548095703, "ref_logps/rejected": -88.39408111572266, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 78.77885437011719, "rewards/margins": 164.3280487060547, "rewards/rejected": -85.5491943359375, "step": 6920, "u": -6.529515266418457, "weight": 0.05007760971784592 }, { "diff_generated": -84.65425109863281, "epoch": 2.2456254050550877, "grad_norm": 524.7811803424514, "learning_rate": 1.4445445242591138e-07, "logits/chosen": -2.37631893157959, "logits/rejected": -2.5157320499420166, "logps/chosen": -12.7593412399292, "logps/rejected": -170.8683319091797, "loss": 13.0834, "losses_ref": -2.6054085111582026e-08, "ref_logps/chosen": -96.13145446777344, "ref_logps/rejected": -86.21408081054688, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 83.37211608886719, "rewards/margins": 168.0263671875, "rewards/rejected": -84.65425109863281, "step": 6930, "u": -6.5352582931518555, "weight": 0.05000000074505806 }, { "diff_generated": -83.07829284667969, "epoch": 2.248865845755023, "grad_norm": 502.47717459419925, "learning_rate": 1.4329597911396362e-07, "logits/chosen": -2.4220199584960938, "logits/rejected": -2.498976230621338, "logps/chosen": -13.39250373840332, "logps/rejected": -166.42909240722656, "loss": 12.9699, "losses_ref": -0.00041079233051277697, "ref_logps/chosen": -100.69302368164062, "ref_logps/rejected": -83.35079193115234, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 87.30052185058594, "rewards/margins": 170.37881469726562, "rewards/rejected": -83.07829284667969, "step": 6940, "u": -6.401181697845459, "weight": 0.06876643002033234 }, { "diff_generated": -84.72123718261719, "epoch": 2.252106286454958, "grad_norm": 516.1839062498832, "learning_rate": 1.421411553021137e-07, "logits/chosen": -2.440908193588257, "logits/rejected": -2.558413028717041, "logps/chosen": -14.195680618286133, "logps/rejected": -170.11984252929688, "loss": 12.8176, "losses_ref": -1.3914039698192937e-07, "ref_logps/chosen": -101.77497863769531, "ref_logps/rejected": -85.39862060546875, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 87.57929992675781, "rewards/margins": 172.30055236816406, "rewards/rejected": -84.72123718261719, "step": 6950, "u": -6.594904899597168, "weight": 0.03750000149011612 }, { "diff_generated": -84.58776092529297, "epoch": 2.255346727154893, "grad_norm": 531.8777877793126, "learning_rate": 1.4098999740821716e-07, "logits/chosen": -2.356252908706665, "logits/rejected": -2.452484369277954, "logps/chosen": -11.869619369506836, "logps/rejected": -170.38638305664062, "loss": 12.8437, "losses_ref": -8.181816156138666e-06, "ref_logps/chosen": -94.5312728881836, "ref_logps/rejected": -85.7986068725586, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 82.66166687011719, "rewards/margins": 167.24942016601562, "rewards/rejected": -84.58776092529297, "step": 6960, "u": -6.414976596832275, "weight": 0.0687502771615982 }, { "diff_generated": -84.35314178466797, "epoch": 2.258587167854828, "grad_norm": 509.9867754579174, "learning_rate": 1.3984252179801277e-07, "logits/chosen": -2.3887689113616943, "logits/rejected": -2.5168299674987793, "logps/chosen": -13.763440132141113, "logps/rejected": -173.8634033203125, "loss": 12.9378, "losses_ref": -2.446452924687037e-07, "ref_logps/chosen": -94.25096130371094, "ref_logps/rejected": -89.51025390625, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 80.4875259399414, "rewards/margins": 164.84066772460938, "rewards/rejected": -84.35314178466797, "step": 6970, "u": -6.471514701843262, "weight": 0.056250013411045074 }, { "diff_generated": -81.13739013671875, "epoch": 2.2618276085547633, "grad_norm": 485.95806985386764, "learning_rate": 1.3869874478488846e-07, "logits/chosen": -2.355803966522217, "logits/rejected": -2.518582582473755, "logps/chosen": -12.210909843444824, "logps/rejected": -163.37709045410156, "loss": 12.4143, "losses_ref": -4.4140466570752324e-07, "ref_logps/chosen": -90.22859954833984, "ref_logps/rejected": -82.23968505859375, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 78.01768493652344, "rewards/margins": 159.1550750732422, "rewards/rejected": -81.13739013671875, "step": 6980, "u": -6.590836524963379, "weight": 0.043750010430812836 }, { "diff_generated": -80.5860824584961, "epoch": 2.2650680492546984, "grad_norm": 510.0246999329795, "learning_rate": 1.3755868262965047e-07, "logits/chosen": -2.4310669898986816, "logits/rejected": -2.4750888347625732, "logps/chosen": -12.444555282592773, "logps/rejected": -160.20654296875, "loss": 12.9388, "losses_ref": -0.011145448312163353, "ref_logps/chosen": -96.44710540771484, "ref_logps/rejected": -79.6204605102539, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 84.00254821777344, "rewards/margins": 164.588623046875, "rewards/rejected": -80.5860824584961, "step": 6990, "u": -6.405091285705566, "weight": 0.06928651034832001 }, { "diff_generated": -82.3926010131836, "epoch": 2.268308489954634, "grad_norm": 481.8003241790466, "learning_rate": 1.3642235154029172e-07, "logits/chosen": -2.41209077835083, "logits/rejected": -2.4497084617614746, "logps/chosen": -14.089653015136719, "logps/rejected": -163.5379638671875, "loss": 12.7251, "losses_ref": -1.1824274537275414e-07, "ref_logps/chosen": -100.86749267578125, "ref_logps/rejected": -81.14535522460938, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 86.77783966064453, "rewards/margins": 169.1704559326172, "rewards/rejected": -82.3926010131836, "step": 7000, "u": -6.477536201477051, "weight": 0.05624999850988388 }, { "diff_generated": -83.03773498535156, "epoch": 2.271548930654569, "grad_norm": 498.2740157382577, "learning_rate": 1.352897676717614e-07, "logits/chosen": -2.4076733589172363, "logits/rejected": -2.476492404937744, "logps/chosen": -13.60411548614502, "logps/rejected": -168.27487182617188, "loss": 13.318, "losses_ref": -1.2477959899115376e-05, "ref_logps/chosen": -99.06245422363281, "ref_logps/rejected": -85.23712158203125, "rewards/accuracies": 0.96875, "rewards/chosen": 85.45832824707031, "rewards/margins": 168.49607849121094, "rewards/rejected": -83.03773498535156, "step": 7010, "u": -6.662878513336182, "weight": 0.031250398606061935 }, { "diff_generated": -82.75672149658203, "epoch": 2.274789371354504, "grad_norm": 469.74278121718174, "learning_rate": 1.341609471257354e-07, "logits/chosen": -2.348039388656616, "logits/rejected": -2.5116515159606934, "logps/chosen": -12.133288383483887, "logps/rejected": -166.936767578125, "loss": 12.7259, "losses_ref": -1.324751337961061e-06, "ref_logps/chosen": -92.88429260253906, "ref_logps/rejected": -84.18003845214844, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 80.75099182128906, "rewards/margins": 163.50772094726562, "rewards/rejected": -82.75672149658203, "step": 7020, "u": -6.534178733825684, "weight": 0.043750010430812836 }, { "diff_generated": -85.21153259277344, "epoch": 2.2780298120544393, "grad_norm": 460.40707525635656, "learning_rate": 1.3303590595038735e-07, "logits/chosen": -2.3632588386535645, "logits/rejected": -2.5475070476531982, "logps/chosen": -13.21232795715332, "logps/rejected": -170.18663024902344, "loss": 12.9074, "losses_ref": -5.1966587477636494e-08, "ref_logps/chosen": -93.5289306640625, "ref_logps/rejected": -84.97509765625, "rewards/accuracies": 0.96875, "rewards/chosen": 80.31660461425781, "rewards/margins": 165.5281524658203, "rewards/rejected": -85.21153259277344, "step": 7030, "u": -6.636476993560791, "weight": 0.03125 }, { "diff_generated": -79.51531219482422, "epoch": 2.2812702527543745, "grad_norm": 498.87865447951674, "learning_rate": 1.3191466014016049e-07, "logits/chosen": -2.384385347366333, "logits/rejected": -2.4455251693725586, "logps/chosen": -11.143257141113281, "logps/rejected": -163.07403564453125, "loss": 12.4045, "losses_ref": -6.7265777943248395e-06, "ref_logps/chosen": -95.297607421875, "ref_logps/rejected": -83.55873107910156, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 84.15434265136719, "rewards/margins": 163.66966247558594, "rewards/rejected": -79.51531219482422, "step": 7040, "u": -6.4666948318481445, "weight": 0.05625021457672119 }, { "diff_generated": -81.31959533691406, "epoch": 2.28451069345431, "grad_norm": 476.81821446420355, "learning_rate": 1.3079722563553994e-07, "logits/chosen": -2.4070966243743896, "logits/rejected": -2.495264768600464, "logps/chosen": -12.539602279663086, "logps/rejected": -164.7382354736328, "loss": 12.565, "losses_ref": -0.006604082882404327, "ref_logps/chosen": -93.5062255859375, "ref_logps/rejected": -83.41862487792969, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 80.96662139892578, "rewards/margins": 162.28622436523438, "rewards/rejected": -81.31959533691406, "step": 7050, "u": -6.380990028381348, "weight": 0.06906723976135254 }, { "diff_generated": -84.39396667480469, "epoch": 2.287751134154245, "grad_norm": 499.95043530982576, "learning_rate": 1.2968361832282705e-07, "logits/chosen": -2.3694443702697754, "logits/rejected": -2.499861717224121, "logps/chosen": -12.672450065612793, "logps/rejected": -170.8816680908203, "loss": 12.9321, "losses_ref": -2.2672212551810844e-08, "ref_logps/chosen": -96.10008239746094, "ref_logps/rejected": -86.48768615722656, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 83.42762756347656, "rewards/margins": 167.8216094970703, "rewards/rejected": -84.39396667480469, "step": 7060, "u": -6.547879695892334, "weight": 0.05000000074505806 }, { "diff_generated": -85.82933044433594, "epoch": 2.2909915748541803, "grad_norm": 498.6979087032496, "learning_rate": 1.2857385403391226e-07, "logits/chosen": -2.3565831184387207, "logits/rejected": -2.492098093032837, "logps/chosen": -13.709028244018555, "logps/rejected": -174.83709716796875, "loss": 13.3289, "losses_ref": -0.007923029363155365, "ref_logps/chosen": -96.8045883178711, "ref_logps/rejected": -89.00776672363281, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 83.09556579589844, "rewards/margins": 168.9248809814453, "rewards/rejected": -85.82933044433594, "step": 7070, "u": -6.711148262023926, "weight": 0.02536194957792759 }, { "diff_generated": -81.84160614013672, "epoch": 2.2942320155541154, "grad_norm": 533.2120579610759, "learning_rate": 1.274679485460509e-07, "logits/chosen": -2.3976547718048096, "logits/rejected": -2.453458309173584, "logps/chosen": -13.225214004516602, "logps/rejected": -165.89492797851562, "loss": 12.7832, "losses_ref": -3.6161025036562933e-06, "ref_logps/chosen": -97.51805114746094, "ref_logps/rejected": -84.05333709716797, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 84.29283905029297, "rewards/margins": 166.13442993164062, "rewards/rejected": -81.84160614013672, "step": 7080, "u": -6.306405544281006, "weight": 0.0812501460313797 }, { "diff_generated": -82.47283935546875, "epoch": 2.2974724562540505, "grad_norm": 472.59279168597607, "learning_rate": 1.2636591758163868e-07, "logits/chosen": -2.3648295402526855, "logits/rejected": -2.542153835296631, "logps/chosen": -12.415548324584961, "logps/rejected": -167.7449188232422, "loss": 12.7337, "losses_ref": -0.0115945003926754, "ref_logps/chosen": -92.28016662597656, "ref_logps/rejected": -85.27208709716797, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 79.8646240234375, "rewards/margins": 162.33746337890625, "rewards/rejected": -82.47283935546875, "step": 7090, "u": -6.5920257568359375, "weight": 0.03803582862019539 }, { "diff_generated": -81.75079345703125, "epoch": 2.3007128969539856, "grad_norm": 505.3085147920675, "learning_rate": 1.2526777680798813e-07, "logits/chosen": -2.3778653144836426, "logits/rejected": -2.5342462062835693, "logps/chosen": -12.207781791687012, "logps/rejected": -170.48773193359375, "loss": 12.6916, "losses_ref": -0.0039069210179150105, "ref_logps/chosen": -91.88865661621094, "ref_logps/rejected": -88.7369384765625, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 79.68087005615234, "rewards/margins": 161.43167114257812, "rewards/rejected": -81.75079345703125, "step": 7100, "u": -6.36777925491333, "weight": 0.06892150640487671 }, { "diff_generated": -85.78407287597656, "epoch": 2.3039533376539207, "grad_norm": 491.3990275948269, "learning_rate": 1.241735418371057e-07, "logits/chosen": -2.3440051078796387, "logits/rejected": -2.523491144180298, "logps/chosen": -12.945859909057617, "logps/rejected": -174.6090850830078, "loss": 12.7095, "losses_ref": -0.0023649369832128286, "ref_logps/chosen": -91.75056457519531, "ref_logps/rejected": -88.82500457763672, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 78.80470275878906, "rewards/margins": 164.58877563476562, "rewards/rejected": -85.78407287597656, "step": 7110, "u": -6.561973571777344, "weight": 0.03760456293821335 }, { "diff_generated": -83.95856475830078, "epoch": 2.3071937783538563, "grad_norm": 533.5355848679819, "learning_rate": 1.2308322822547027e-07, "logits/chosen": -2.362577199935913, "logits/rejected": -2.4960556030273438, "logps/chosen": -12.309553146362305, "logps/rejected": -167.9497833251953, "loss": 12.4265, "losses_ref": -5.8301971250784845e-08, "ref_logps/chosen": -95.14002990722656, "ref_logps/rejected": -83.99121856689453, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 82.83045959472656, "rewards/margins": 166.78904724121094, "rewards/rejected": -83.95856475830078, "step": 7120, "u": -6.475826263427734, "weight": 0.05624999850988388 }, { "diff_generated": -83.14198303222656, "epoch": 2.3104342190537914, "grad_norm": 499.945946875676, "learning_rate": 1.2199685147381148e-07, "logits/chosen": -2.4278907775878906, "logits/rejected": -2.5294861793518066, "logps/chosen": -13.416807174682617, "logps/rejected": -169.05868530273438, "loss": 12.9315, "losses_ref": -0.00023093321942724288, "ref_logps/chosen": -95.8389892578125, "ref_logps/rejected": -85.91670227050781, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 82.42218780517578, "rewards/margins": 165.56417846679688, "rewards/rejected": -83.14198303222656, "step": 7130, "u": -6.518418788909912, "weight": 0.050009287893772125 }, { "diff_generated": -82.58621215820312, "epoch": 2.3136746597537265, "grad_norm": 500.1642698273991, "learning_rate": 1.2091442702688933e-07, "logits/chosen": -2.383411169052124, "logits/rejected": -2.5127146244049072, "logps/chosen": -12.741984367370605, "logps/rejected": -169.25953674316406, "loss": 13.1358, "losses_ref": -0.0027023288421332836, "ref_logps/chosen": -92.45108795166016, "ref_logps/rejected": -86.67330932617188, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 79.70909118652344, "rewards/margins": 162.29531860351562, "rewards/rejected": -82.58621215820312, "step": 7140, "u": -6.45116662979126, "weight": 0.05637402459979057 }, { "diff_generated": -79.54682922363281, "epoch": 2.3169151004536617, "grad_norm": 527.4886173738923, "learning_rate": 1.198359702732755e-07, "logits/chosen": -2.3902125358581543, "logits/rejected": -2.4888460636138916, "logps/chosen": -13.180867195129395, "logps/rejected": -162.79171752929688, "loss": 12.8312, "losses_ref": -0.0008114447700791061, "ref_logps/chosen": -95.47314453125, "ref_logps/rejected": -83.24490356445312, "rewards/accuracies": 0.9375, "rewards/chosen": 82.29228210449219, "rewards/margins": 161.839111328125, "rewards/rejected": -79.54682922363281, "step": 7150, "u": -6.421548366546631, "weight": 0.06253501027822495 }, { "diff_generated": -83.003662109375, "epoch": 2.320155541153597, "grad_norm": 504.246990999001, "learning_rate": 1.1876149654513321e-07, "logits/chosen": -2.3551647663116455, "logits/rejected": -2.5079431533813477, "logps/chosen": -13.358386039733887, "logps/rejected": -171.64193725585938, "loss": 13.2005, "losses_ref": -0.00260176626034081, "ref_logps/chosen": -94.94227600097656, "ref_logps/rejected": -88.63829803466797, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 81.58389282226562, "rewards/margins": 164.58755493164062, "rewards/rejected": -83.003662109375, "step": 7160, "u": -6.538697719573975, "weight": 0.05011763423681259 }, { "diff_generated": -83.80158996582031, "epoch": 2.323395981853532, "grad_norm": 476.0883198202353, "learning_rate": 1.1769102111800036e-07, "logits/chosen": -2.3814778327941895, "logits/rejected": -2.5466561317443848, "logps/chosen": -13.39539909362793, "logps/rejected": -170.6013946533203, "loss": 12.7603, "losses_ref": -0.007953451946377754, "ref_logps/chosen": -96.72367858886719, "ref_logps/rejected": -86.7998046875, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 83.3282699584961, "rewards/margins": 167.12985229492188, "rewards/rejected": -83.80158996582031, "step": 7170, "u": -6.736584663391113, "weight": 0.019122375175356865 }, { "diff_generated": -80.74140930175781, "epoch": 2.3266364225534675, "grad_norm": 503.6890165150702, "learning_rate": 1.166245592105719e-07, "logits/chosen": -2.3559749126434326, "logits/rejected": -2.4508726596832275, "logps/chosen": -13.094064712524414, "logps/rejected": -160.50881958007812, "loss": 12.853, "losses_ref": -3.550645999439439e-07, "ref_logps/chosen": -96.27635955810547, "ref_logps/rejected": -79.76742553710938, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 83.18229675292969, "rewards/margins": 163.9237060546875, "rewards/rejected": -80.74140930175781, "step": 7180, "u": -6.578315734863281, "weight": 0.04375000670552254 }, { "diff_generated": -83.57976531982422, "epoch": 2.3298768632534026, "grad_norm": 506.1377150659979, "learning_rate": 1.1556212598448349e-07, "logits/chosen": -2.3926408290863037, "logits/rejected": -2.502164840698242, "logps/chosen": -14.994264602661133, "logps/rejected": -171.16343688964844, "loss": 13.0739, "losses_ref": -4.4104973540015635e-07, "ref_logps/chosen": -98.24559020996094, "ref_logps/rejected": -87.58365631103516, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 83.2513198852539, "rewards/margins": 166.83108520507812, "rewards/rejected": -83.57976531982422, "step": 7190, "u": -6.625774383544922, "weight": 0.037500012665987015 }, { "diff_generated": -87.09251403808594, "epoch": 2.3331173039533377, "grad_norm": 532.4466559165295, "learning_rate": 1.1450373654409591e-07, "logits/chosen": -2.3863420486450195, "logits/rejected": -2.56119966506958, "logps/chosen": -12.581506729125977, "logps/rejected": -175.84518432617188, "loss": 13.0843, "losses_ref": -0.003380722599104047, "ref_logps/chosen": -93.87139892578125, "ref_logps/rejected": -88.75267028808594, "rewards/accuracies": 0.9375, "rewards/chosen": 81.28990173339844, "rewards/margins": 168.3824005126953, "rewards/rejected": -87.09251403808594, "step": 7200, "u": -6.45444393157959, "weight": 0.06264631450176239 }, { "diff_generated": -79.31378936767578, "epoch": 2.336357744653273, "grad_norm": 492.9742860836095, "learning_rate": 1.1344940593628063e-07, "logits/chosen": -2.340789318084717, "logits/rejected": -2.4311439990997314, "logps/chosen": -12.333671569824219, "logps/rejected": -159.327880859375, "loss": 12.8581, "losses_ref": -0.008099230006337166, "ref_logps/chosen": -94.08828735351562, "ref_logps/rejected": -80.01409149169922, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 81.7546157836914, "rewards/margins": 161.0684051513672, "rewards/rejected": -79.31378936767578, "step": 7210, "u": -6.362817764282227, "weight": 0.07535295188426971 }, { "diff_generated": -82.0297622680664, "epoch": 2.339598185353208, "grad_norm": 499.32682808368605, "learning_rate": 1.1239914915020512e-07, "logits/chosen": -2.3604369163513184, "logits/rejected": -2.5125110149383545, "logps/chosen": -12.218565940856934, "logps/rejected": -167.6261749267578, "loss": 13.0646, "losses_ref": -0.0029511586762964725, "ref_logps/chosen": -91.50608825683594, "ref_logps/rejected": -85.59638977050781, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 79.28752136230469, "rewards/margins": 161.31729125976562, "rewards/rejected": -82.0297622680664, "step": 7220, "u": -6.343608379364014, "weight": 0.08137090504169464 }, { "diff_generated": -83.73043823242188, "epoch": 2.342838626053143, "grad_norm": 490.66335871155917, "learning_rate": 1.1135298111712122e-07, "logits/chosen": -2.356489896774292, "logits/rejected": -2.4991321563720703, "logps/chosen": -12.847094535827637, "logps/rejected": -168.40830993652344, "loss": 12.8496, "losses_ref": -1.1878989347735569e-08, "ref_logps/chosen": -94.01008605957031, "ref_logps/rejected": -84.67787170410156, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 81.1629867553711, "rewards/margins": 164.89340209960938, "rewards/rejected": -83.73043823242188, "step": 7230, "u": -6.560157775878906, "weight": 0.05000000074505806 }, { "diff_generated": -81.16157531738281, "epoch": 2.346079066753078, "grad_norm": 494.03668382681167, "learning_rate": 1.1031091671015094e-07, "logits/chosen": -2.342160940170288, "logits/rejected": -2.462944507598877, "logps/chosen": -11.229791641235352, "logps/rejected": -164.2108917236328, "loss": 12.7249, "losses_ref": -0.007652191910892725, "ref_logps/chosen": -91.34001922607422, "ref_logps/rejected": -83.0493392944336, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 80.1102294921875, "rewards/margins": 161.27178955078125, "rewards/rejected": -81.16157531738281, "step": 7240, "u": -6.273160934448242, "weight": 0.08783890306949615 }, { "diff_generated": -86.33053588867188, "epoch": 2.3493195074530138, "grad_norm": 519.6081817427561, "learning_rate": 1.0927297074407662e-07, "logits/chosen": -2.3760571479797363, "logits/rejected": -2.5106639862060547, "logps/chosen": -12.724100112915039, "logps/rejected": -176.9182891845703, "loss": 12.4812, "losses_ref": -1.339400341748842e-06, "ref_logps/chosen": -94.36858367919922, "ref_logps/rejected": -90.58776092529297, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 81.64447784423828, "rewards/margins": 167.97500610351562, "rewards/rejected": -86.33053588867188, "step": 7250, "u": -6.500415802001953, "weight": 0.050000034272670746 }, { "diff_generated": -85.05168151855469, "epoch": 2.352559948152949, "grad_norm": 506.91432397616205, "learning_rate": 1.0823915797512952e-07, "logits/chosen": -2.3924827575683594, "logits/rejected": -2.5272819995880127, "logps/chosen": -11.652626991271973, "logps/rejected": -168.64675903320312, "loss": 12.5285, "losses_ref": -0.001545862527564168, "ref_logps/chosen": -94.11674499511719, "ref_logps/rejected": -83.59507751464844, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 82.46412658691406, "rewards/margins": 167.5157928466797, "rewards/rejected": -85.05168151855469, "step": 7260, "u": -6.477601051330566, "weight": 0.05631608888506889 }, { "diff_generated": -86.03199768066406, "epoch": 2.355800388852884, "grad_norm": 487.9596910661287, "learning_rate": 1.0720949310078032e-07, "logits/chosen": -2.362509250640869, "logits/rejected": -2.5025877952575684, "logps/chosen": -12.802087783813477, "logps/rejected": -173.5316925048828, "loss": 12.7406, "losses_ref": -5.450115025951163e-08, "ref_logps/chosen": -96.65104675292969, "ref_logps/rejected": -87.49970245361328, "rewards/accuracies": 0.96875, "rewards/chosen": 83.84896087646484, "rewards/margins": 169.88095092773438, "rewards/rejected": -86.03199768066406, "step": 7270, "u": -6.663240909576416, "weight": 0.03125 }, { "diff_generated": -85.66532897949219, "epoch": 2.359040829552819, "grad_norm": 529.4654595969122, "learning_rate": 1.0618399075952993e-07, "logits/chosen": -2.411695957183838, "logits/rejected": -2.5174262523651123, "logps/chosen": -12.513715744018555, "logps/rejected": -174.34246826171875, "loss": 12.7701, "losses_ref": -0.001065053860656917, "ref_logps/chosen": -97.73978424072266, "ref_logps/rejected": -88.6771469116211, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 85.22607421875, "rewards/margins": 170.89138793945312, "rewards/rejected": -85.66532897949219, "step": 7280, "u": -6.444096565246582, "weight": 0.056295327842235565 }, { "diff_generated": -83.40331268310547, "epoch": 2.3622812702527543, "grad_norm": 486.96056947331795, "learning_rate": 1.0516266553070159e-07, "logits/chosen": -2.3514552116394043, "logits/rejected": -2.4933013916015625, "logps/chosen": -13.503857612609863, "logps/rejected": -168.85653686523438, "loss": 12.5846, "losses_ref": -4.49640893407377e-08, "ref_logps/chosen": -94.25113677978516, "ref_logps/rejected": -85.4532241821289, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 80.74726867675781, "rewards/margins": 164.1505889892578, "rewards/rejected": -83.40331268310547, "step": 7290, "u": -6.5332183837890625, "weight": 0.05000000074505806 }, { "diff_generated": -85.90556335449219, "epoch": 2.3655217109526894, "grad_norm": 512.973851912908, "learning_rate": 1.041455319342336e-07, "logits/chosen": -2.3956139087677, "logits/rejected": -2.5182456970214844, "logps/chosen": -12.487277030944824, "logps/rejected": -172.7784423828125, "loss": 12.8994, "losses_ref": -4.39100585936103e-06, "ref_logps/chosen": -101.12513732910156, "ref_logps/rejected": -86.8729019165039, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 88.6378402709961, "rewards/margins": 174.5434112548828, "rewards/rejected": -85.90556335449219, "step": 7300, "u": -6.607975959777832, "weight": 0.03750017657876015 }, { "diff_generated": -84.39129638671875, "epoch": 2.368762151652625, "grad_norm": 500.970037385123, "learning_rate": 1.0313260443047247e-07, "logits/chosen": -2.295703887939453, "logits/rejected": -2.526883125305176, "logps/chosen": -11.327601432800293, "logps/rejected": -168.99575805664062, "loss": 13.0578, "losses_ref": -0.004974209703505039, "ref_logps/chosen": -88.77967834472656, "ref_logps/rejected": -84.60444641113281, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 77.45207214355469, "rewards/margins": 161.8433837890625, "rewards/rejected": -84.39129638671875, "step": 7310, "u": -6.404440879821777, "weight": 0.06897146999835968 }, { "diff_generated": -85.76993560791016, "epoch": 2.37200259235256, "grad_norm": 489.33374283528923, "learning_rate": 1.0212389741996834e-07, "logits/chosen": -2.4081289768218994, "logits/rejected": -2.583893299102783, "logps/chosen": -12.362791061401367, "logps/rejected": -174.359130859375, "loss": 12.7088, "losses_ref": -0.003301215823739767, "ref_logps/chosen": -94.69625091552734, "ref_logps/rejected": -88.58919525146484, "rewards/accuracies": 0.9375, "rewards/chosen": 82.33345794677734, "rewards/margins": 168.1033935546875, "rewards/rejected": -85.76993560791016, "step": 7320, "u": -6.429391384124756, "weight": 0.06264026463031769 }, { "diff_generated": -87.53488159179688, "epoch": 2.375243033052495, "grad_norm": 465.1530214753127, "learning_rate": 1.0111942524326891e-07, "logits/chosen": -2.370417356491089, "logits/rejected": -2.5388333797454834, "logps/chosen": -12.295100212097168, "logps/rejected": -178.72235107421875, "loss": 12.6474, "losses_ref": -1.535337673885806e-06, "ref_logps/chosen": -93.35027313232422, "ref_logps/rejected": -91.18746185302734, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 81.05517578125, "rewards/margins": 168.59005737304688, "rewards/rejected": -87.53488159179688, "step": 7330, "u": -6.5652756690979, "weight": 0.04375005513429642 }, { "diff_generated": -85.3342514038086, "epoch": 2.3784834737524303, "grad_norm": 496.91622123501764, "learning_rate": 1.0011920218071664e-07, "logits/chosen": -2.452415704727173, "logits/rejected": -2.5386643409729004, "logps/chosen": -12.925765991210938, "logps/rejected": -175.0770721435547, "loss": 12.6224, "losses_ref": -8.268243618658744e-06, "ref_logps/chosen": -95.21087646484375, "ref_logps/rejected": -89.7428207397461, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 82.28511810302734, "rewards/margins": 167.61936950683594, "rewards/rejected": -85.3342514038086, "step": 7340, "u": -6.49071741104126, "weight": 0.0562501959502697 }, { "diff_generated": -90.63404846191406, "epoch": 2.3817239144523654, "grad_norm": 488.04327837680876, "learning_rate": 9.912324245224524e-08, "logits/chosen": -2.4498038291931152, "logits/rejected": -2.5991556644439697, "logps/chosen": -13.388803482055664, "logps/rejected": -182.3604278564453, "loss": 12.4083, "losses_ref": -0.00018008516053669155, "ref_logps/chosen": -98.12663269042969, "ref_logps/rejected": -91.72637939453125, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 84.73783111572266, "rewards/margins": 175.3718719482422, "rewards/rejected": -90.63404846191406, "step": 7350, "u": -6.733044624328613, "weight": 0.012507098726928234 }, { "diff_generated": -81.8043441772461, "epoch": 2.3849643551523005, "grad_norm": 522.1737155431391, "learning_rate": 9.813156021717763e-08, "logits/chosen": -2.3892436027526855, "logits/rejected": -2.4666178226470947, "logps/chosen": -12.565677642822266, "logps/rejected": -163.40260314941406, "loss": 12.871, "losses_ref": -0.00020811586000490934, "ref_logps/chosen": -95.48179626464844, "ref_logps/rejected": -81.59825134277344, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 82.91612243652344, "rewards/margins": 164.72047424316406, "rewards/rejected": -81.8043441772461, "step": 7360, "u": -6.4513349533081055, "weight": 0.056258104741573334 }, { "diff_generated": -86.375, "epoch": 2.3882047958522357, "grad_norm": 490.9721252761537, "learning_rate": 9.714416957402468e-08, "logits/chosen": -2.383329391479492, "logits/rejected": -2.558297634124756, "logps/chosen": -12.554519653320312, "logps/rejected": -174.5645751953125, "loss": 12.7989, "losses_ref": -0.00037836996489204466, "ref_logps/chosen": -92.61976623535156, "ref_logps/rejected": -88.18955993652344, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 80.06523895263672, "rewards/margins": 166.44024658203125, "rewards/rejected": -86.375, "step": 7370, "u": -6.5053887367248535, "weight": 0.05626590922474861 }, { "diff_generated": -89.63764953613281, "epoch": 2.3914452365521712, "grad_norm": 498.76182579058303, "learning_rate": 9.616108456028462e-08, "logits/chosen": -2.383967399597168, "logits/rejected": -2.5193114280700684, "logps/chosen": -12.145334243774414, "logps/rejected": -180.26341247558594, "loss": 12.7477, "losses_ref": -1.3903306808060734e-06, "ref_logps/chosen": -91.63188171386719, "ref_logps/rejected": -90.6257553100586, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 79.48655700683594, "rewards/margins": 169.1241912841797, "rewards/rejected": -89.63764953613281, "step": 7380, "u": -6.723959445953369, "weight": 0.02500004507601261 }, { "diff_generated": -86.11238098144531, "epoch": 2.3946856772521063, "grad_norm": 494.50793530286535, "learning_rate": 9.518231915224371e-08, "logits/chosen": -2.386399030685425, "logits/rejected": -2.483867645263672, "logps/chosen": -12.247283935546875, "logps/rejected": -176.57333374023438, "loss": 12.4872, "losses_ref": -0.0006079341983422637, "ref_logps/chosen": -97.18465423583984, "ref_logps/rejected": -90.4609375, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 84.93736267089844, "rewards/margins": 171.0497589111328, "rewards/rejected": -86.11238098144531, "step": 7390, "u": -6.583271026611328, "weight": 0.04377469792962074 }, { "diff_generated": -87.58899688720703, "epoch": 2.3979261179520415, "grad_norm": 482.55961854403847, "learning_rate": 9.4207887264777e-08, "logits/chosen": -2.32859468460083, "logits/rejected": -2.561094284057617, "logps/chosen": -11.497884750366211, "logps/rejected": -175.28054809570312, "loss": 12.848, "losses_ref": -0.0022828192450106144, "ref_logps/chosen": -88.45655822753906, "ref_logps/rejected": -87.69156646728516, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 76.95866394042969, "rewards/margins": 164.5476531982422, "rewards/rejected": -87.58899688720703, "step": 7400, "u": -6.567407131195068, "weight": 0.04385367035865784 }, { "diff_generated": -86.6302261352539, "epoch": 2.4011665586519766, "grad_norm": 472.6398812524397, "learning_rate": 9.323780275115156e-08, "logits/chosen": -2.3910207748413086, "logits/rejected": -2.5345606803894043, "logps/chosen": -12.565729141235352, "logps/rejected": -172.0085906982422, "loss": 12.8935, "losses_ref": -0.003717700717970729, "ref_logps/chosen": -96.87049102783203, "ref_logps/rejected": -85.37837982177734, "rewards/accuracies": 0.96875, "rewards/chosen": 84.30476379394531, "rewards/margins": 170.9349822998047, "rewards/rejected": -86.6302261352539, "step": 7410, "u": -6.673760414123535, "weight": 0.03141096979379654 }, { "diff_generated": -81.17831420898438, "epoch": 2.4044069993519117, "grad_norm": 460.87154964636005, "learning_rate": 9.22720794028283e-08, "logits/chosen": -2.4026007652282715, "logits/rejected": -2.4874207973480225, "logps/chosen": -12.965787887573242, "logps/rejected": -165.58517456054688, "loss": 13.0799, "losses_ref": -1.249319012686101e-07, "ref_logps/chosen": -96.84405517578125, "ref_logps/rejected": -84.4068603515625, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 83.87825775146484, "rewards/margins": 165.0565643310547, "rewards/rejected": -81.17831420898438, "step": 7420, "u": -6.408229827880859, "weight": 0.06875000149011612 }, { "diff_generated": -83.51194763183594, "epoch": 2.4076474400518473, "grad_norm": 744.9085311548929, "learning_rate": 9.13107309492668e-08, "logits/chosen": -2.3378539085388184, "logits/rejected": -2.490166187286377, "logps/chosen": -12.576040267944336, "logps/rejected": -166.04745483398438, "loss": 12.4652, "losses_ref": -0.0005707393283955753, "ref_logps/chosen": -94.02613830566406, "ref_logps/rejected": -82.5354995727539, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 81.45008850097656, "rewards/margins": 164.9620361328125, "rewards/rejected": -83.51194763183594, "step": 7430, "u": -6.59079122543335, "weight": 0.04377346485853195 }, { "diff_generated": -80.52021789550781, "epoch": 2.4108878807517824, "grad_norm": 510.8672346234607, "learning_rate": 9.035377105772966e-08, "logits/chosen": -2.3841404914855957, "logits/rejected": -2.4778783321380615, "logps/chosen": -12.856233596801758, "logps/rejected": -162.56240844726562, "loss": 13.0052, "losses_ref": -0.0017416516784578562, "ref_logps/chosen": -93.31913757324219, "ref_logps/rejected": -82.04218292236328, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 80.46290588378906, "rewards/margins": 160.98312377929688, "rewards/rejected": -80.52021789550781, "step": 7440, "u": -6.387154579162598, "weight": 0.06882445514202118 }, { "diff_generated": -85.48727416992188, "epoch": 2.4141283214517175, "grad_norm": 515.1576877546976, "learning_rate": 8.940121333308849e-08, "logits/chosen": -2.3297321796417236, "logits/rejected": -2.5619239807128906, "logps/chosen": -11.624930381774902, "logps/rejected": -170.84661865234375, "loss": 12.8739, "losses_ref": -1.706665557321685e-06, "ref_logps/chosen": -85.62763977050781, "ref_logps/rejected": -85.35934448242188, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 74.00270080566406, "rewards/margins": 159.48997497558594, "rewards/rejected": -85.48727416992188, "step": 7450, "u": -6.44665002822876, "weight": 0.05625002458691597 }, { "diff_generated": -87.2186279296875, "epoch": 2.4173687621516526, "grad_norm": 464.9770061734808, "learning_rate": 8.845307131762991e-08, "logits/chosen": -2.39561128616333, "logits/rejected": -2.5099291801452637, "logps/chosen": -13.754659652709961, "logps/rejected": -172.62655639648438, "loss": 12.8239, "losses_ref": -0.0005621786694973707, "ref_logps/chosen": -99.22911071777344, "ref_logps/rejected": -85.40792083740234, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 85.47445678710938, "rewards/margins": 172.69308471679688, "rewards/rejected": -87.2186279296875, "step": 7460, "u": -6.46007776260376, "weight": 0.05627412348985672 }, { "diff_generated": -87.04798126220703, "epoch": 2.4206092028515878, "grad_norm": 480.5447502967754, "learning_rate": 8.750935849086424e-08, "logits/chosen": -2.4041030406951904, "logits/rejected": -2.5212578773498535, "logps/chosen": -13.613739013671875, "logps/rejected": -176.61572265625, "loss": 12.7459, "losses_ref": -0.005251473747193813, "ref_logps/chosen": -102.5396957397461, "ref_logps/rejected": -89.56773376464844, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 88.92596435546875, "rewards/margins": 175.97393798828125, "rewards/rejected": -87.04798126220703, "step": 7470, "u": -6.651014804840088, "weight": 0.025226224213838577 }, { "diff_generated": -86.72041320800781, "epoch": 2.423849643551523, "grad_norm": 522.3173239436194, "learning_rate": 8.657008826933223e-08, "logits/chosen": -2.357637643814087, "logits/rejected": -2.539961814880371, "logps/chosen": -12.631352424621582, "logps/rejected": -175.08822631835938, "loss": 13.1405, "losses_ref": -3.868344933266599e-08, "ref_logps/chosen": -95.20268249511719, "ref_logps/rejected": -88.36781311035156, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 82.57133483886719, "rewards/margins": 169.29177856445312, "rewards/rejected": -86.72041320800781, "step": 7480, "u": -6.551732540130615, "weight": 0.04374999925494194 }, { "diff_generated": -86.01445770263672, "epoch": 2.427090084251458, "grad_norm": 466.80659358478727, "learning_rate": 8.563527400641559e-08, "logits/chosen": -2.383613109588623, "logits/rejected": -2.5712387561798096, "logps/chosen": -12.490646362304688, "logps/rejected": -173.9608917236328, "loss": 12.3431, "losses_ref": -0.00016282778233289719, "ref_logps/chosen": -90.7940673828125, "ref_logps/rejected": -87.94645690917969, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 78.30342102050781, "rewards/margins": 164.31788635253906, "rewards/rejected": -86.01445770263672, "step": 7490, "u": -6.375924587249756, "weight": 0.07500634342432022 }, { "diff_generated": -81.46131896972656, "epoch": 2.4303305249513936, "grad_norm": 490.9905956290775, "learning_rate": 8.470492899214696e-08, "logits/chosen": -2.35518217086792, "logits/rejected": -2.424717426300049, "logps/chosen": -12.561359405517578, "logps/rejected": -160.87351989746094, "loss": 12.4984, "losses_ref": -0.001344609772786498, "ref_logps/chosen": -91.66764068603516, "ref_logps/rejected": -79.4122085571289, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 79.10627746582031, "rewards/margins": 160.56759643554688, "rewards/rejected": -81.46131896972656, "step": 7500, "u": -6.370742321014404, "weight": 0.0688091292977333 }, { "diff_generated": -87.88899993896484, "epoch": 2.4335709656513287, "grad_norm": 496.72652571908117, "learning_rate": 8.377906645302015e-08, "logits/chosen": -2.3465425968170166, "logits/rejected": -2.5052857398986816, "logps/chosen": -13.459360122680664, "logps/rejected": -179.36007690429688, "loss": 12.85, "losses_ref": -0.0046943118795752525, "ref_logps/chosen": -94.39802551269531, "ref_logps/rejected": -91.47105407714844, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 80.93867492675781, "rewards/margins": 168.82766723632812, "rewards/rejected": -87.88899993896484, "step": 7510, "u": -6.632806301116943, "weight": 0.037701454013586044 }, { "diff_generated": -81.4451675415039, "epoch": 2.436811406351264, "grad_norm": 478.270473192927, "learning_rate": 8.28576995518031e-08, "logits/chosen": -2.3640151023864746, "logits/rejected": -2.4249727725982666, "logps/chosen": -13.7388277053833, "logps/rejected": -164.45909118652344, "loss": 12.9501, "losses_ref": -0.006367249879986048, "ref_logps/chosen": -98.01344299316406, "ref_logps/rejected": -83.01393127441406, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 84.27461242675781, "rewards/margins": 165.71978759765625, "rewards/rejected": -81.4451675415039, "step": 7520, "u": -6.509137153625488, "weight": 0.05030224844813347 }, { "diff_generated": -88.09165954589844, "epoch": 2.440051847051199, "grad_norm": 530.0567812682875, "learning_rate": 8.194084138735023e-08, "logits/chosen": -2.3818490505218506, "logits/rejected": -2.541490077972412, "logps/chosen": -13.17347526550293, "logps/rejected": -179.87777709960938, "loss": 12.7952, "losses_ref": -0.0015509051736444235, "ref_logps/chosen": -92.35968780517578, "ref_logps/rejected": -91.78611755371094, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 79.18620300292969, "rewards/margins": 167.27786254882812, "rewards/rejected": -88.09165954589844, "step": 7530, "u": -6.595727443695068, "weight": 0.043817587196826935 }, { "diff_generated": -82.6982650756836, "epoch": 2.443292287751134, "grad_norm": 482.2876230032835, "learning_rate": 8.102850499441638e-08, "logits/chosen": -2.3446247577667236, "logits/rejected": -2.476062774658203, "logps/chosen": -12.425708770751953, "logps/rejected": -167.6982421875, "loss": 12.9033, "losses_ref": -5.4635460600138686e-08, "ref_logps/chosen": -91.31380462646484, "ref_logps/rejected": -84.99995422363281, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 78.88810729980469, "rewards/margins": 161.58636474609375, "rewards/rejected": -82.6982650756836, "step": 7540, "u": -6.135350227355957, "weight": 0.10625000298023224 }, { "diff_generated": -87.83369445800781, "epoch": 2.446532728451069, "grad_norm": 472.3359347546331, "learning_rate": 8.012070334347103e-08, "logits/chosen": -2.394005537033081, "logits/rejected": -2.520233154296875, "logps/chosen": -13.785612106323242, "logps/rejected": -177.16799926757812, "loss": 12.4521, "losses_ref": -0.00033257578616030514, "ref_logps/chosen": -99.60476684570312, "ref_logps/rejected": -89.33430480957031, "rewards/accuracies": 0.96875, "rewards/chosen": 85.81916046142578, "rewards/margins": 173.65286254882812, "rewards/rejected": -87.83369445800781, "step": 7550, "u": -6.665897369384766, "weight": 0.03126341477036476 }, { "diff_generated": -83.32557678222656, "epoch": 2.4497731691510047, "grad_norm": 483.4723322508994, "learning_rate": 7.921744934051515e-08, "logits/chosen": -2.3564887046813965, "logits/rejected": -2.497473955154419, "logps/chosen": -12.408846855163574, "logps/rejected": -169.51048278808594, "loss": 12.3062, "losses_ref": -0.001365851378068328, "ref_logps/chosen": -90.70404052734375, "ref_logps/rejected": -86.18492126464844, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 78.29520416259766, "rewards/margins": 161.62078857421875, "rewards/rejected": -83.32557678222656, "step": 7560, "u": -6.274251937866211, "weight": 0.08130748569965363 }, { "diff_generated": -83.88063049316406, "epoch": 2.45301360985094, "grad_norm": 497.2983036407309, "learning_rate": 7.831875582689598e-08, "logits/chosen": -2.3528811931610107, "logits/rejected": -2.4794909954071045, "logps/chosen": -12.237689018249512, "logps/rejected": -165.22021484375, "loss": 12.5008, "losses_ref": -2.523396176457027e-07, "ref_logps/chosen": -91.89834594726562, "ref_logps/rejected": -81.33958435058594, "rewards/accuracies": 0.9375, "rewards/chosen": 79.66065216064453, "rewards/margins": 163.54129028320312, "rewards/rejected": -83.88063049316406, "step": 7570, "u": -6.427340507507324, "weight": 0.0625000074505806 }, { "diff_generated": -84.7837905883789, "epoch": 2.456254050550875, "grad_norm": 534.7371450262729, "learning_rate": 7.742463557912593e-08, "logits/chosen": -2.3579623699188232, "logits/rejected": -2.508383274078369, "logps/chosen": -12.433358192443848, "logps/rejected": -170.16738891601562, "loss": 12.6894, "losses_ref": -0.0018981487955898046, "ref_logps/chosen": -93.244140625, "ref_logps/rejected": -85.38359069824219, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 80.81077575683594, "rewards/margins": 165.59457397460938, "rewards/rejected": -84.7837905883789, "step": 7580, "u": -6.416357517242432, "weight": 0.06882932037115097 }, { "diff_generated": -90.3713607788086, "epoch": 2.45949449125081, "grad_norm": 524.2400804507265, "learning_rate": 7.65351013087002e-08, "logits/chosen": -2.3619818687438965, "logits/rejected": -2.5806076526641846, "logps/chosen": -11.463775634765625, "logps/rejected": -184.11569213867188, "loss": 12.7985, "losses_ref": -0.004068558104336262, "ref_logps/chosen": -91.43190002441406, "ref_logps/rejected": -93.74433135986328, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 79.9681167602539, "rewards/margins": 170.33946228027344, "rewards/rejected": -90.3713607788086, "step": 7590, "u": -6.602597713470459, "weight": 0.04393560811877251 }, { "diff_generated": -85.97703552246094, "epoch": 2.462734931950745, "grad_norm": 525.1137645178529, "learning_rate": 7.565016566191631e-08, "logits/chosen": -2.306933641433716, "logits/rejected": -2.4763407707214355, "logps/chosen": -12.672266006469727, "logps/rejected": -170.2589874267578, "loss": 12.5481, "losses_ref": -2.8921974148943264e-07, "ref_logps/chosen": -92.51892852783203, "ref_logps/rejected": -84.2819595336914, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 79.8466567993164, "rewards/margins": 165.82369995117188, "rewards/rejected": -85.97703552246094, "step": 7600, "u": -6.497170925140381, "weight": 0.050000011920928955 }, { "diff_generated": -84.66465759277344, "epoch": 2.4659753726506803, "grad_norm": 538.5819875031116, "learning_rate": 7.47698412196939e-08, "logits/chosen": -2.407266139984131, "logits/rejected": -2.4851815700531006, "logps/chosen": -12.466886520385742, "logps/rejected": -174.0008087158203, "loss": 12.9953, "losses_ref": -0.03617560863494873, "ref_logps/chosen": -98.9618911743164, "ref_logps/rejected": -89.3361587524414, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 86.49501037597656, "rewards/margins": 171.15965270996094, "rewards/rejected": -84.66465759277344, "step": 7610, "u": -6.4572882652282715, "weight": 0.056824516505002975 }, { "diff_generated": -86.6190185546875, "epoch": 2.4692158133506155, "grad_norm": 486.0634635356683, "learning_rate": 7.389414049739682e-08, "logits/chosen": -2.4103035926818848, "logits/rejected": -2.5272722244262695, "logps/chosen": -12.675764083862305, "logps/rejected": -172.9994659423828, "loss": 12.7994, "losses_ref": -0.003231314243748784, "ref_logps/chosen": -97.57255554199219, "ref_logps/rejected": -86.38043212890625, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 84.89678955078125, "rewards/margins": 171.5157928466797, "rewards/rejected": -86.6190185546875, "step": 7620, "u": -6.525698661804199, "weight": 0.05013857036828995 }, { "diff_generated": -87.88928985595703, "epoch": 2.472456254050551, "grad_norm": 478.67004558614457, "learning_rate": 7.302307594465422e-08, "logits/chosen": -2.385324716567993, "logits/rejected": -2.594041109085083, "logps/chosen": -13.049673080444336, "logps/rejected": -181.9893798828125, "loss": 12.4213, "losses_ref": -0.00019054643053095788, "ref_logps/chosen": -95.00083923339844, "ref_logps/rejected": -94.10009765625, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 81.95115661621094, "rewards/margins": 169.84043884277344, "rewards/rejected": -87.88928985595703, "step": 7630, "u": -6.503349304199219, "weight": 0.04375634342432022 }, { "diff_generated": -85.76202392578125, "epoch": 2.475696694750486, "grad_norm": 515.7209612263493, "learning_rate": 7.215665994518367e-08, "logits/chosen": -2.3745484352111816, "logits/rejected": -2.5339818000793457, "logps/chosen": -12.694661140441895, "logps/rejected": -171.424560546875, "loss": 12.8026, "losses_ref": -2.924050477304263e-07, "ref_logps/chosen": -91.8033218383789, "ref_logps/rejected": -85.66253662109375, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 79.1086654663086, "rewards/margins": 164.8706817626953, "rewards/rejected": -85.76202392578125, "step": 7640, "u": -6.478395938873291, "weight": 0.056250013411045074 }, { "diff_generated": -88.4301528930664, "epoch": 2.4789371354504213, "grad_norm": 555.0381925773094, "learning_rate": 7.129490481661605e-08, "logits/chosen": -2.4156832695007324, "logits/rejected": -2.5395307540893555, "logps/chosen": -14.228047370910645, "logps/rejected": -182.26687622070312, "loss": 12.8529, "losses_ref": -0.03741047531366348, "ref_logps/chosen": -104.72440338134766, "ref_logps/rejected": -93.83674621582031, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 90.49635314941406, "rewards/margins": 178.92648315429688, "rewards/rejected": -88.4301528930664, "step": 7650, "u": -6.741459846496582, "weight": 0.014490276575088501 }, { "diff_generated": -80.32636260986328, "epoch": 2.4821775761503564, "grad_norm": 500.4918228868725, "learning_rate": 7.043782281031911e-08, "logits/chosen": -2.3989498615264893, "logits/rejected": -2.4414381980895996, "logps/chosen": -14.554104804992676, "logps/rejected": -162.0161895751953, "loss": 12.6795, "losses_ref": -0.002818151144310832, "ref_logps/chosen": -97.82319641113281, "ref_logps/rejected": -81.6898193359375, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 83.26908874511719, "rewards/margins": 163.59544372558594, "rewards/rejected": -80.32636260986328, "step": 7660, "u": -6.331325054168701, "weight": 0.07511264085769653 }, { "diff_generated": -86.31947326660156, "epoch": 2.4854180168502915, "grad_norm": 505.39487820480815, "learning_rate": 6.958542611122422e-08, "logits/chosen": -2.4010703563690186, "logits/rejected": -2.5161900520324707, "logps/chosen": -12.543961524963379, "logps/rejected": -170.29434204101562, "loss": 12.7568, "losses_ref": -1.1059737126117852e-08, "ref_logps/chosen": -97.9937973022461, "ref_logps/rejected": -83.97486877441406, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 85.4498291015625, "rewards/margins": 171.76930236816406, "rewards/rejected": -86.31947326660156, "step": 7670, "u": -6.538404941558838, "weight": 0.04374999925494194 }, { "diff_generated": -82.28599548339844, "epoch": 2.488658457550227, "grad_norm": 515.332660650725, "learning_rate": 6.873772683765283e-08, "logits/chosen": -2.3219847679138184, "logits/rejected": -2.484175205230713, "logps/chosen": -12.61694622039795, "logps/rejected": -168.28042602539062, "loss": 12.2396, "losses_ref": -0.0011883302358910441, "ref_logps/chosen": -91.09086608886719, "ref_logps/rejected": -85.99443054199219, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 78.47391510009766, "rewards/margins": 160.75991821289062, "rewards/rejected": -82.28599548339844, "step": 7680, "u": -6.3603620529174805, "weight": 0.07505214214324951 }, { "diff_generated": -86.8857650756836, "epoch": 2.491898898250162, "grad_norm": 520.0914704230654, "learning_rate": 6.789473704114428e-08, "logits/chosen": -2.389352321624756, "logits/rejected": -2.4951353073120117, "logps/chosen": -12.93322467803955, "logps/rejected": -173.37252807617188, "loss": 12.8485, "losses_ref": -4.2353548224127735e-07, "ref_logps/chosen": -99.44371032714844, "ref_logps/rejected": -86.48677062988281, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 86.51048278808594, "rewards/margins": 173.39625549316406, "rewards/rejected": -86.8857650756836, "step": 7690, "u": -6.696573734283447, "weight": 0.018750013783574104 }, { "diff_generated": -83.52281188964844, "epoch": 2.4951393389500973, "grad_norm": 485.94800288731153, "learning_rate": 6.7056468706284e-08, "logits/chosen": -2.368849277496338, "logits/rejected": -2.4442880153656006, "logps/chosen": -13.004542350769043, "logps/rejected": -167.32431030273438, "loss": 12.8778, "losses_ref": -4.529524488816605e-08, "ref_logps/chosen": -97.18318939208984, "ref_logps/rejected": -83.801513671875, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 84.17866516113281, "rewards/margins": 167.70147705078125, "rewards/rejected": -83.52281188964844, "step": 7700, "u": -6.416772365570068, "weight": 0.05624999850988388 }, { "diff_generated": -81.74546813964844, "epoch": 2.4983797796500324, "grad_norm": 475.0479170919597, "learning_rate": 6.622293375053422e-08, "logits/chosen": -2.3324759006500244, "logits/rejected": -2.4128990173339844, "logps/chosen": -12.462357521057129, "logps/rejected": -165.7411651611328, "loss": 13.0175, "losses_ref": -0.0005353426095098257, "ref_logps/chosen": -95.83375549316406, "ref_logps/rejected": -83.99568176269531, "rewards/accuracies": 0.9375, "rewards/chosen": 83.37139892578125, "rewards/margins": 165.11688232421875, "rewards/rejected": -81.74546813964844, "step": 7710, "u": -6.465706825256348, "weight": 0.0625208243727684 }, { "diff_generated": -84.4853515625, "epoch": 2.5016202203499676, "grad_norm": 518.0026426198627, "learning_rate": 6.539414402406316e-08, "logits/chosen": -2.3503990173339844, "logits/rejected": -2.4843220710754395, "logps/chosen": -12.903669357299805, "logps/rejected": -174.85250854492188, "loss": 12.8774, "losses_ref": -1.42912256251293e-06, "ref_logps/chosen": -98.93122863769531, "ref_logps/rejected": -90.3671646118164, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 86.0275650024414, "rewards/margins": 170.512939453125, "rewards/rejected": -84.4853515625, "step": 7720, "u": -6.619194984436035, "weight": 0.0375000536441803 }, { "diff_generated": -80.78546905517578, "epoch": 2.5048606610499027, "grad_norm": 515.8740681402575, "learning_rate": 6.457011130957747e-08, "logits/chosen": -2.3500072956085205, "logits/rejected": -2.442248821258545, "logps/chosen": -13.2356538772583, "logps/rejected": -162.047119140625, "loss": 12.8804, "losses_ref": -3.20266941855607e-08, "ref_logps/chosen": -98.41993713378906, "ref_logps/rejected": -81.26166534423828, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 85.18428039550781, "rewards/margins": 165.96974182128906, "rewards/rejected": -80.78546905517578, "step": 7730, "u": -6.437910556793213, "weight": 0.05624999850988388 }, { "diff_generated": -85.4696273803711, "epoch": 2.508101101749838, "grad_norm": 546.6845540934409, "learning_rate": 6.37508473221549e-08, "logits/chosen": -2.37815260887146, "logits/rejected": -2.4845166206359863, "logps/chosen": -13.139554977416992, "logps/rejected": -173.57278442382812, "loss": 12.9879, "losses_ref": -0.00040427473140880466, "ref_logps/chosen": -96.54368591308594, "ref_logps/rejected": -88.1031494140625, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 83.40412902832031, "rewards/margins": 168.87376403808594, "rewards/rejected": -85.4696273803711, "step": 7740, "u": -6.579657077789307, "weight": 0.043766215443611145 }, { "diff_generated": -89.28426361083984, "epoch": 2.511341542449773, "grad_norm": 494.946916076293, "learning_rate": 6.293636370907665e-08, "logits/chosen": -2.3897414207458496, "logits/rejected": -2.5804595947265625, "logps/chosen": -12.588485717773438, "logps/rejected": -178.6708984375, "loss": 12.7232, "losses_ref": -0.012705594301223755, "ref_logps/chosen": -94.24117279052734, "ref_logps/rejected": -89.38664245605469, "rewards/accuracies": 0.96875, "rewards/chosen": 81.6526870727539, "rewards/margins": 170.9369659423828, "rewards/rejected": -89.28426361083984, "step": 7750, "u": -6.6324920654296875, "weight": 0.03186144679784775 }, { "diff_generated": -89.33751678466797, "epoch": 2.5145819831497085, "grad_norm": 512.3183896963463, "learning_rate": 6.212667204966293e-08, "logits/chosen": -2.418869972229004, "logits/rejected": -2.5200929641723633, "logps/chosen": -13.113365173339844, "logps/rejected": -179.0424346923828, "loss": 13.1742, "losses_ref": -2.3532444881624315e-09, "ref_logps/chosen": -98.5973129272461, "ref_logps/rejected": -89.70491790771484, "rewards/accuracies": 0.96875, "rewards/chosen": 85.48394012451172, "rewards/margins": 174.8214569091797, "rewards/rejected": -89.33751678466797, "step": 7760, "u": -6.676608085632324, "weight": 0.03125 }, { "diff_generated": -85.3768310546875, "epoch": 2.5178224238496436, "grad_norm": 488.4091704506323, "learning_rate": 6.132178385510772e-08, "logits/chosen": -2.3752994537353516, "logits/rejected": -2.5198378562927246, "logps/chosen": -12.729066848754883, "logps/rejected": -173.28065490722656, "loss": 12.7355, "losses_ref": -0.0031602573581039906, "ref_logps/chosen": -92.53146362304688, "ref_logps/rejected": -87.90381622314453, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 79.80239868164062, "rewards/margins": 165.1792449951172, "rewards/rejected": -85.3768310546875, "step": 7770, "u": -6.568255424499512, "weight": 0.04388529807329178 }, { "diff_generated": -83.88563537597656, "epoch": 2.5210628645495787, "grad_norm": 474.70341248308773, "learning_rate": 6.052171056831547e-08, "logits/chosen": -2.3717455863952637, "logits/rejected": -2.528341770172119, "logps/chosen": -11.977242469787598, "logps/rejected": -165.62997436523438, "loss": 12.6246, "losses_ref": -0.0009697287459857762, "ref_logps/chosen": -90.6422348022461, "ref_logps/rejected": -81.74433135986328, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 78.66499328613281, "rewards/margins": 162.55062866210938, "rewards/rejected": -83.88563537597656, "step": 7780, "u": -6.627276420593262, "weight": 0.03754133731126785 }, { "diff_generated": -86.89268493652344, "epoch": 2.524303305249514, "grad_norm": 521.5084899838258, "learning_rate": 5.972646356373779e-08, "logits/chosen": -2.3941633701324463, "logits/rejected": -2.4962596893310547, "logps/chosen": -14.027194023132324, "logps/rejected": -172.46983337402344, "loss": 12.6922, "losses_ref": -1.3707585821975954e-07, "ref_logps/chosen": -97.36051940917969, "ref_logps/rejected": -85.57716369628906, "rewards/accuracies": 0.96875, "rewards/chosen": 83.33332824707031, "rewards/margins": 170.2259979248047, "rewards/rejected": -86.89268493652344, "step": 7790, "u": -6.6635541915893555, "weight": 0.0312500037252903 }, { "diff_generated": -85.21521759033203, "epoch": 2.527543745949449, "grad_norm": 511.26271392949695, "learning_rate": 5.893605414721277e-08, "logits/chosen": -2.403787136077881, "logits/rejected": -2.5516161918640137, "logps/chosen": -11.525293350219727, "logps/rejected": -168.976318359375, "loss": 12.3588, "losses_ref": -0.008900230750441551, "ref_logps/chosen": -94.09223937988281, "ref_logps/rejected": -83.76110076904297, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 82.56694793701172, "rewards/margins": 167.78216552734375, "rewards/rejected": -85.21521759033203, "step": 7800, "u": -6.569947242736816, "weight": 0.044144630432128906 }, { "diff_generated": -88.24825286865234, "epoch": 2.5307841866493845, "grad_norm": 513.7535587783697, "learning_rate": 5.815049355580317e-08, "logits/chosen": -2.3834657669067383, "logits/rejected": -2.5250654220581055, "logps/chosen": -12.82642650604248, "logps/rejected": -176.42503356933594, "loss": 12.4961, "losses_ref": -1.1369086116985727e-08, "ref_logps/chosen": -96.07859802246094, "ref_logps/rejected": -88.1767807006836, "rewards/accuracies": 0.96875, "rewards/chosen": 83.25218200683594, "rewards/margins": 171.50042724609375, "rewards/rejected": -88.24825286865234, "step": 7810, "u": -6.661282539367676, "weight": 0.03125 }, { "diff_generated": -85.78338623046875, "epoch": 2.5340246273493197, "grad_norm": 527.7786162945669, "learning_rate": 5.736979295763742e-08, "logits/chosen": -2.367727041244507, "logits/rejected": -2.4320693016052246, "logps/chosen": -13.660482406616211, "logps/rejected": -169.5161895751953, "loss": 12.9545, "losses_ref": -2.9643251764355227e-05, "ref_logps/chosen": -101.06855773925781, "ref_logps/rejected": -83.73277282714844, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 87.4080810546875, "rewards/margins": 173.1914825439453, "rewards/rejected": -85.78338623046875, "step": 7820, "u": -6.695784568786621, "weight": 0.02500077709555626 }, { "diff_generated": -87.93540954589844, "epoch": 2.537265068049255, "grad_norm": 506.13584198160663, "learning_rate": 5.659396345175049e-08, "logits/chosen": -2.348079204559326, "logits/rejected": -2.476435422897339, "logps/chosen": -13.36473274230957, "logps/rejected": -174.88262939453125, "loss": 12.3834, "losses_ref": -0.0071576847694814205, "ref_logps/chosen": -100.14743041992188, "ref_logps/rejected": -86.94721984863281, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 86.78269958496094, "rewards/margins": 174.71810913085938, "rewards/rejected": -87.93540954589844, "step": 7830, "u": -6.6284918785095215, "weight": 0.037833355367183685 }, { "diff_generated": -84.89559173583984, "epoch": 2.54050550874919, "grad_norm": 488.6686843515456, "learning_rate": 5.5823016067926234e-08, "logits/chosen": -2.34673810005188, "logits/rejected": -2.4491865634918213, "logps/chosen": -13.33026123046875, "logps/rejected": -171.094970703125, "loss": 12.5142, "losses_ref": -0.003317506518214941, "ref_logps/chosen": -96.6379165649414, "ref_logps/rejected": -86.1993637084961, "rewards/accuracies": 0.9375, "rewards/chosen": 83.30766296386719, "rewards/margins": 168.20326232910156, "rewards/rejected": -84.89559173583984, "step": 7840, "u": -6.420805931091309, "weight": 0.06264188140630722 }, { "diff_generated": -81.24967956542969, "epoch": 2.543745949449125, "grad_norm": 491.2639204565971, "learning_rate": 5.5056961766540444e-08, "logits/chosen": -2.3522627353668213, "logits/rejected": -2.464498519897461, "logps/chosen": -12.752184867858887, "logps/rejected": -168.3269500732422, "loss": 12.7157, "losses_ref": -5.747070304096269e-07, "ref_logps/chosen": -92.69832611083984, "ref_logps/rejected": -87.07726287841797, "rewards/accuracies": 0.90625, "rewards/chosen": 79.94613647460938, "rewards/margins": 161.19583129882812, "rewards/rejected": -81.24967956542969, "step": 7850, "u": -6.237468719482422, "weight": 0.0937500074505806 }, { "diff_generated": -86.9503173828125, "epoch": 2.54698639014906, "grad_norm": 528.2205381646952, "learning_rate": 5.429581143840525e-08, "logits/chosen": -2.3642375469207764, "logits/rejected": -2.516458511352539, "logps/chosen": -12.689916610717773, "logps/rejected": -173.31358337402344, "loss": 13.1316, "losses_ref": -0.002915473422035575, "ref_logps/chosen": -98.03343200683594, "ref_logps/rejected": -86.36326599121094, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 85.34352111816406, "rewards/margins": 172.29385375976562, "rewards/rejected": -86.9503173828125, "step": 7860, "u": -6.518470764160156, "weight": 0.043878473341464996 }, { "diff_generated": -82.84322357177734, "epoch": 2.5502268308489953, "grad_norm": 498.4432213297694, "learning_rate": 5.3539575904614176e-08, "logits/chosen": -2.381110429763794, "logits/rejected": -2.499281167984009, "logps/chosen": -13.256353378295898, "logps/rejected": -167.81980895996094, "loss": 12.5832, "losses_ref": -0.00034237594809383154, "ref_logps/chosen": -93.40634155273438, "ref_logps/rejected": -84.97659301757812, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 80.14997863769531, "rewards/margins": 162.99319458007812, "rewards/rejected": -82.84322357177734, "step": 7870, "u": -6.393845081329346, "weight": 0.06876394152641296 }, { "diff_generated": -86.20030212402344, "epoch": 2.5534672715489304, "grad_norm": 468.91919847693663, "learning_rate": 5.278826591638794e-08, "logits/chosen": -2.384467124938965, "logits/rejected": -2.534027338027954, "logps/chosen": -13.1602783203125, "logps/rejected": -177.34799194335938, "loss": 13.1184, "losses_ref": -1.0411190487502608e-07, "ref_logps/chosen": -95.07594299316406, "ref_logps/rejected": -91.1476821899414, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 81.91566467285156, "rewards/margins": 168.115966796875, "rewards/rejected": -86.20030212402344, "step": 7880, "u": -6.67303466796875, "weight": 0.02500000223517418 }, { "diff_generated": -87.34493255615234, "epoch": 2.556707712248866, "grad_norm": 541.1356078527222, "learning_rate": 5.204189215492252e-08, "logits/chosen": -2.356952667236328, "logits/rejected": -2.517338275909424, "logps/chosen": -12.496864318847656, "logps/rejected": -175.65017700195312, "loss": 12.5768, "losses_ref": -2.529996265820955e-07, "ref_logps/chosen": -97.91763305664062, "ref_logps/rejected": -88.30525970458984, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 85.42076110839844, "rewards/margins": 172.7657012939453, "rewards/rejected": -87.34493255615234, "step": 7890, "u": -6.555611610412598, "weight": 0.05000000447034836 }, { "diff_generated": -85.5751724243164, "epoch": 2.559948152948801, "grad_norm": 467.6387257104504, "learning_rate": 5.1300465231236145e-08, "logits/chosen": -2.372973918914795, "logits/rejected": -2.455167293548584, "logps/chosen": -13.153573989868164, "logps/rejected": -166.5150146484375, "loss": 12.7604, "losses_ref": -1.840089396409894e-07, "ref_logps/chosen": -98.97048950195312, "ref_logps/rejected": -80.93985748291016, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 85.8169174194336, "rewards/margins": 171.39208984375, "rewards/rejected": -85.5751724243164, "step": 7900, "u": -6.509287357330322, "weight": 0.05000000447034836 }, { "diff_generated": -87.47126770019531, "epoch": 2.563188593648736, "grad_norm": 480.3490266312158, "learning_rate": 5.056399568601946e-08, "logits/chosen": -2.405813694000244, "logits/rejected": -2.511786937713623, "logps/chosen": -13.059216499328613, "logps/rejected": -175.0909881591797, "loss": 13.0647, "losses_ref": -0.006940539926290512, "ref_logps/chosen": -95.51531982421875, "ref_logps/rejected": -87.61970520019531, "rewards/accuracies": 0.9375, "rewards/chosen": 82.45610046386719, "rewards/margins": 169.9273681640625, "rewards/rejected": -87.47126770019531, "step": 7910, "u": -6.41863489151001, "weight": 0.06280623376369476 }, { "diff_generated": -85.1175308227539, "epoch": 2.5664290343486713, "grad_norm": 503.66718179721994, "learning_rate": 4.983249398948502e-08, "logits/chosen": -2.4177050590515137, "logits/rejected": -2.4998488426208496, "logps/chosen": -13.102018356323242, "logps/rejected": -174.5506134033203, "loss": 12.8097, "losses_ref": -3.150193515466526e-05, "ref_logps/chosen": -99.92887115478516, "ref_logps/rejected": -89.43309020996094, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 86.82683563232422, "rewards/margins": 171.94436645507812, "rewards/rejected": -85.1175308227539, "step": 7920, "u": -6.611052513122559, "weight": 0.03750133514404297 }, { "diff_generated": -84.71986389160156, "epoch": 2.569669475048607, "grad_norm": 505.03372562716527, "learning_rate": 4.910597054121877e-08, "logits/chosen": -2.362746000289917, "logits/rejected": -2.449460744857788, "logps/chosen": -14.314231872558594, "logps/rejected": -166.46458435058594, "loss": 12.7078, "losses_ref": -0.0003298623487353325, "ref_logps/chosen": -99.32063293457031, "ref_logps/rejected": -81.74470520019531, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 85.00639343261719, "rewards/margins": 169.7262725830078, "rewards/rejected": -84.71986389160156, "step": 7930, "u": -6.521559238433838, "weight": 0.04376343637704849 }, { "diff_generated": -87.66574096679688, "epoch": 2.572909915748542, "grad_norm": 512.5633770227915, "learning_rate": 4.838443567003194e-08, "logits/chosen": -2.3789749145507812, "logits/rejected": -2.5361878871917725, "logps/chosen": -11.975103378295898, "logps/rejected": -174.80824279785156, "loss": 12.4216, "losses_ref": -0.009310315363109112, "ref_logps/chosen": -92.89860534667969, "ref_logps/rejected": -87.14249420166016, "rewards/accuracies": 0.96875, "rewards/chosen": 80.92350769042969, "rewards/margins": 168.58924865722656, "rewards/rejected": -87.66574096679688, "step": 7940, "u": -6.625628471374512, "weight": 0.03165370970964432 }, { "diff_generated": -86.88675689697266, "epoch": 2.576150356448477, "grad_norm": 533.2632256964176, "learning_rate": 4.766789963381459e-08, "logits/chosen": -2.383544921875, "logits/rejected": -2.5006332397460938, "logps/chosen": -13.377037048339844, "logps/rejected": -173.28250122070312, "loss": 12.9989, "losses_ref": -0.008450334891676903, "ref_logps/chosen": -96.12030792236328, "ref_logps/rejected": -86.395751953125, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 82.74327850341797, "rewards/margins": 169.63002014160156, "rewards/rejected": -86.88675689697266, "step": 7950, "u": -6.491290092468262, "weight": 0.056649159640073776 }, { "diff_generated": -87.16014099121094, "epoch": 2.5793907971484122, "grad_norm": 499.656898104997, "learning_rate": 4.695637261938912e-08, "logits/chosen": -2.391625165939331, "logits/rejected": -2.4923512935638428, "logps/chosen": -12.244363784790039, "logps/rejected": -173.76107788085938, "loss": 12.6396, "losses_ref": -6.377808290380926e-07, "ref_logps/chosen": -97.15655517578125, "ref_logps/rejected": -86.6009292602539, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 84.91219329833984, "rewards/margins": 172.0723419189453, "rewards/rejected": -87.16014099121094, "step": 7960, "u": -6.561807155609131, "weight": 0.03750001639127731 }, { "diff_generated": -85.76423645019531, "epoch": 2.5826312378483474, "grad_norm": 465.2509841706746, "learning_rate": 4.624986474236623e-08, "logits/chosen": -2.414842128753662, "logits/rejected": -2.5293426513671875, "logps/chosen": -12.228536605834961, "logps/rejected": -172.13262939453125, "loss": 12.4326, "losses_ref": -4.4096346130118036e-08, "ref_logps/chosen": -96.31724548339844, "ref_logps/rejected": -86.36839294433594, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 84.08869934082031, "rewards/margins": 169.85293579101562, "rewards/rejected": -85.76423645019531, "step": 7970, "u": -6.528074741363525, "weight": 0.05000000074505806 }, { "diff_generated": -83.78959655761719, "epoch": 2.5858716785482825, "grad_norm": 523.6643190508312, "learning_rate": 4.554838604700073e-08, "logits/chosen": -2.3502275943756104, "logits/rejected": -2.461812973022461, "logps/chosen": -12.568353652954102, "logps/rejected": -164.447265625, "loss": 12.6323, "losses_ref": -0.006550629623234272, "ref_logps/chosen": -90.35621643066406, "ref_logps/rejected": -80.65765380859375, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 77.78785705566406, "rewards/margins": 161.57745361328125, "rewards/rejected": -83.78959655761719, "step": 7980, "u": -6.409112453460693, "weight": 0.06904677301645279 }, { "diff_generated": -86.20465850830078, "epoch": 2.5891121192482176, "grad_norm": 526.2645841769199, "learning_rate": 4.4851946506048445e-08, "logits/chosen": -2.3885281085968018, "logits/rejected": -2.506099224090576, "logps/chosen": -12.333206176757812, "logps/rejected": -172.03591918945312, "loss": 12.9873, "losses_ref": -3.100065537608998e-09, "ref_logps/chosen": -94.87696838378906, "ref_logps/rejected": -85.83128356933594, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 82.54376220703125, "rewards/margins": 168.7484130859375, "rewards/rejected": -86.20465850830078, "step": 7990, "u": -6.473536014556885, "weight": 0.05624999850988388 }, { "diff_generated": -83.8978042602539, "epoch": 2.5923525599481527, "grad_norm": 493.1932292731906, "learning_rate": 4.4160556020625026e-08, "logits/chosen": -2.3730297088623047, "logits/rejected": -2.524240016937256, "logps/chosen": -13.095507621765137, "logps/rejected": -168.27194213867188, "loss": 12.7239, "losses_ref": -8.18051262285735e-07, "ref_logps/chosen": -91.02601623535156, "ref_logps/rejected": -84.37416076660156, "rewards/accuracies": 0.9375, "rewards/chosen": 77.93051147460938, "rewards/margins": 161.8282928466797, "rewards/rejected": -83.8978042602539, "step": 8000, "u": -6.421624660491943, "weight": 0.06250002235174179 }, { "diff_generated": -86.31498718261719, "epoch": 2.5955930006480883, "grad_norm": 501.71546954800465, "learning_rate": 4.347422442006476e-08, "logits/chosen": -2.3795294761657715, "logits/rejected": -2.50423264503479, "logps/chosen": -12.979804992675781, "logps/rejected": -172.81736755371094, "loss": 12.5756, "losses_ref": -0.00048283609794452786, "ref_logps/chosen": -98.77983856201172, "ref_logps/rejected": -86.50237274169922, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 85.80003356933594, "rewards/margins": 172.11502075195312, "rewards/rejected": -86.31498718261719, "step": 8010, "u": -6.6053948402404785, "weight": 0.04377000406384468 }, { "diff_generated": -86.91321563720703, "epoch": 2.5988334413480234, "grad_norm": 534.5271316801942, "learning_rate": 4.2792961461781064e-08, "logits/chosen": -2.4245381355285645, "logits/rejected": -2.550638437271118, "logps/chosen": -13.415209770202637, "logps/rejected": -171.35308837890625, "loss": 13.083, "losses_ref": -0.005519128870218992, "ref_logps/chosen": -96.28224182128906, "ref_logps/rejected": -84.43988037109375, "rewards/accuracies": 0.96875, "rewards/chosen": 82.86702728271484, "rewards/margins": 169.78024291992188, "rewards/rejected": -86.91321563720703, "step": 8020, "u": -6.6761980056762695, "weight": 0.03151529282331467 }, { "diff_generated": -84.75545501708984, "epoch": 2.6020738820479585, "grad_norm": 483.275655835092, "learning_rate": 4.211677683112751e-08, "logits/chosen": -2.3873836994171143, "logits/rejected": -2.5185983180999756, "logps/chosen": -12.050555229187012, "logps/rejected": -171.59332275390625, "loss": 12.2757, "losses_ref": -2.054375727311708e-07, "ref_logps/chosen": -95.49308013916016, "ref_logps/rejected": -86.83785247802734, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 83.44252014160156, "rewards/margins": 168.19798278808594, "rewards/rejected": -84.75545501708984, "step": 8030, "u": -6.334198474884033, "weight": 0.07500000298023224 }, { "diff_generated": -89.45109558105469, "epoch": 2.6053143227478937, "grad_norm": 486.574861960957, "learning_rate": 4.1445680141260594e-08, "logits/chosen": -2.4202070236206055, "logits/rejected": -2.5206031799316406, "logps/chosen": -14.027897834777832, "logps/rejected": -174.24551391601562, "loss": 13.1134, "losses_ref": -0.0034157063346356153, "ref_logps/chosen": -102.09026336669922, "ref_logps/rejected": -84.7944107055664, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 88.06236267089844, "rewards/margins": 177.51345825195312, "rewards/rejected": -89.45109558105469, "step": 8040, "u": -6.611212253570557, "weight": 0.03765515238046646 }, { "diff_generated": -88.83689880371094, "epoch": 2.6085547634478288, "grad_norm": 477.13792511325187, "learning_rate": 4.077968093300237e-08, "logits/chosen": -2.3851420879364014, "logits/rejected": -2.5064923763275146, "logps/chosen": -12.40340805053711, "logps/rejected": -178.78726196289062, "loss": 12.5005, "losses_ref": -0.0012303909752517939, "ref_logps/chosen": -96.12643432617188, "ref_logps/rejected": -89.95036315917969, "rewards/accuracies": 0.96875, "rewards/chosen": 83.72303009033203, "rewards/margins": 172.55990600585938, "rewards/rejected": -88.83689880371094, "step": 8050, "u": -6.618790626525879, "weight": 0.03129594027996063 }, { "diff_generated": -89.02252197265625, "epoch": 2.6117952041477643, "grad_norm": 491.7406817234213, "learning_rate": 4.011878867470542e-08, "logits/chosen": -2.388709306716919, "logits/rejected": -2.545560359954834, "logps/chosen": -13.7767333984375, "logps/rejected": -177.96762084960938, "loss": 12.8765, "losses_ref": -0.0017306599766016006, "ref_logps/chosen": -97.41840362548828, "ref_logps/rejected": -88.94508361816406, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 83.64167022705078, "rewards/margins": 172.66419982910156, "rewards/rejected": -89.02252197265625, "step": 8060, "u": -6.748841285705566, "weight": 0.01882075145840645 }, { "diff_generated": -86.64888000488281, "epoch": 2.6150356448476995, "grad_norm": 509.8738582350685, "learning_rate": 3.9463012762118144e-08, "logits/chosen": -2.321654796600342, "logits/rejected": -2.5281715393066406, "logps/chosen": -11.479809761047363, "logps/rejected": -175.80943298339844, "loss": 12.4087, "losses_ref": -0.024613162502646446, "ref_logps/chosen": -89.32393646240234, "ref_logps/rejected": -89.16055297851562, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 77.84413146972656, "rewards/margins": 164.49301147460938, "rewards/rejected": -86.64888000488281, "step": 8070, "u": -6.471219539642334, "weight": 0.0574127733707428 }, { "diff_generated": -86.16117858886719, "epoch": 2.6182760855476346, "grad_norm": 497.2157398866359, "learning_rate": 3.8812362518250816e-08, "logits/chosen": -2.401134967803955, "logits/rejected": -2.519824743270874, "logps/chosen": -13.755337715148926, "logps/rejected": -171.7306365966797, "loss": 13.0926, "losses_ref": -3.134549686478749e-08, "ref_logps/chosen": -96.5144271850586, "ref_logps/rejected": -85.56947326660156, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 82.75908660888672, "rewards/margins": 168.92025756835938, "rewards/rejected": -86.16117858886719, "step": 8080, "u": -6.461573600769043, "weight": 0.05624999850988388 }, { "diff_generated": -86.84434509277344, "epoch": 2.6215165262475697, "grad_norm": 450.8214177458869, "learning_rate": 3.816684719324352e-08, "logits/chosen": -2.345268487930298, "logits/rejected": -2.538722515106201, "logps/chosen": -11.970441818237305, "logps/rejected": -173.00057983398438, "loss": 12.524, "losses_ref": -0.003985968884080648, "ref_logps/chosen": -89.08036804199219, "ref_logps/rejected": -86.15625, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 77.10990905761719, "rewards/margins": 163.9542694091797, "rewards/rejected": -86.84434509277344, "step": 8090, "u": -6.583459377288818, "weight": 0.03768478333950043 }, { "diff_generated": -85.60336303710938, "epoch": 2.624756966947505, "grad_norm": 495.38888541572555, "learning_rate": 3.7526475964234286e-08, "logits/chosen": -2.3677401542663574, "logits/rejected": -2.5159714221954346, "logps/chosen": -12.289840698242188, "logps/rejected": -170.49014282226562, "loss": 12.9352, "losses_ref": -9.287772329003019e-09, "ref_logps/chosen": -95.17460632324219, "ref_logps/rejected": -84.88678741455078, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 82.884765625, "rewards/margins": 168.48812866210938, "rewards/rejected": -85.60336303710938, "step": 8100, "u": -6.533503532409668, "weight": 0.05000000074505806 }, { "diff_generated": -83.99730682373047, "epoch": 2.62799740764744, "grad_norm": 495.7291956689171, "learning_rate": 3.689125793522874e-08, "logits/chosen": -2.3391013145446777, "logits/rejected": -2.4577574729919434, "logps/chosen": -11.927130699157715, "logps/rejected": -171.8058319091797, "loss": 12.5268, "losses_ref": -0.003856272902339697, "ref_logps/chosen": -90.72027587890625, "ref_logps/rejected": -87.80851745605469, "rewards/accuracies": 0.9375, "rewards/chosen": 78.79315185546875, "rewards/margins": 162.79046630859375, "rewards/rejected": -83.99730682373047, "step": 8110, "u": -6.4522294998168945, "weight": 0.06267724931240082 }, { "diff_generated": -83.66921997070312, "epoch": 2.631237848347375, "grad_norm": 494.1546333660402, "learning_rate": 3.6261202136970814e-08, "logits/chosen": -2.3662924766540527, "logits/rejected": -2.4921998977661133, "logps/chosen": -12.41911792755127, "logps/rejected": -171.39114379882812, "loss": 12.6946, "losses_ref": -2.692143823423976e-07, "ref_logps/chosen": -96.14432525634766, "ref_logps/rejected": -87.72193908691406, "rewards/accuracies": 0.9375, "rewards/chosen": 83.72520446777344, "rewards/margins": 167.39442443847656, "rewards/rejected": -83.66921997070312, "step": 8120, "u": -6.4582319259643555, "weight": 0.0625000074505806 }, { "diff_generated": -86.19597625732422, "epoch": 2.63447828904731, "grad_norm": 482.8019019904665, "learning_rate": 3.563631752681422e-08, "logits/chosen": -2.3443899154663086, "logits/rejected": -2.5313496589660645, "logps/chosen": -12.188249588012695, "logps/rejected": -174.0465850830078, "loss": 12.8193, "losses_ref": -6.370446499204263e-07, "ref_logps/chosen": -91.61663818359375, "ref_logps/rejected": -87.85062408447266, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 79.42839050292969, "rewards/margins": 165.62435913085938, "rewards/rejected": -86.19597625732422, "step": 8130, "u": -6.591284275054932, "weight": 0.04375001788139343 }, { "diff_generated": -87.0280990600586, "epoch": 2.6377187297472457, "grad_norm": 507.050776481255, "learning_rate": 3.501661298859489e-08, "logits/chosen": -2.3417413234710693, "logits/rejected": -2.473163366317749, "logps/chosen": -13.762643814086914, "logps/rejected": -178.0719451904297, "loss": 12.8385, "losses_ref": -1.76808725882438e-07, "ref_logps/chosen": -95.24227905273438, "ref_logps/rejected": -91.04386138916016, "rewards/accuracies": 0.96875, "rewards/chosen": 81.47962188720703, "rewards/margins": 168.50772094726562, "rewards/rejected": -87.0280990600586, "step": 8140, "u": -6.665112495422363, "weight": 0.0312500074505806 }, { "diff_generated": -84.09680938720703, "epoch": 2.640959170447181, "grad_norm": 535.1578726596663, "learning_rate": 3.4402097332505074e-08, "logits/chosen": -2.3699049949645996, "logits/rejected": -2.505519151687622, "logps/chosen": -12.561290740966797, "logps/rejected": -168.15115356445312, "loss": 12.7527, "losses_ref": -1.7613732694599094e-08, "ref_logps/chosen": -91.27629089355469, "ref_logps/rejected": -84.05433654785156, "rewards/accuracies": 0.9375, "rewards/chosen": 78.71501159667969, "rewards/margins": 162.81182861328125, "rewards/rejected": -84.09680938720703, "step": 8150, "u": -6.432689666748047, "weight": 0.0625 }, { "diff_generated": -80.46368408203125, "epoch": 2.644199611147116, "grad_norm": 495.8366130046274, "learning_rate": 3.379277929496798e-08, "logits/chosen": -2.337191581726074, "logits/rejected": -2.4601244926452637, "logps/chosen": -12.33616828918457, "logps/rejected": -167.09994506835938, "loss": 12.7435, "losses_ref": -8.165208242871813e-08, "ref_logps/chosen": -93.17217254638672, "ref_logps/rejected": -86.6362533569336, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 80.83600616455078, "rewards/margins": 161.2996826171875, "rewards/rejected": -80.46368408203125, "step": 8160, "u": -6.322528839111328, "weight": 0.07500000298023224 }, { "diff_generated": -81.28134155273438, "epoch": 2.647440051847051, "grad_norm": 527.2054934900391, "learning_rate": 3.3188667538513435e-08, "logits/chosen": -2.3057284355163574, "logits/rejected": -2.496896743774414, "logps/chosen": -12.039349555969238, "logps/rejected": -169.8492889404297, "loss": 12.5938, "losses_ref": -0.005264888517558575, "ref_logps/chosen": -88.80217742919922, "ref_logps/rejected": -88.56796264648438, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 76.76283264160156, "rewards/margins": 158.04417419433594, "rewards/rejected": -81.28134155273438, "step": 8170, "u": -6.3711838722229, "weight": 0.07514555752277374 }, { "diff_generated": -85.64323425292969, "epoch": 2.6506804925469862, "grad_norm": 500.54881737349564, "learning_rate": 3.258977065165478e-08, "logits/chosen": -2.3974757194519043, "logits/rejected": -2.4821276664733887, "logps/chosen": -12.726496696472168, "logps/rejected": -169.305908203125, "loss": 12.981, "losses_ref": -2.1205362799037175e-08, "ref_logps/chosen": -96.9805679321289, "ref_logps/rejected": -83.66265106201172, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 84.25407409667969, "rewards/margins": 169.89730834960938, "rewards/rejected": -85.64323425292969, "step": 8180, "u": -6.555540561676025, "weight": 0.05000000074505806 }, { "diff_generated": -89.43001556396484, "epoch": 2.653920933246922, "grad_norm": 505.4959091442462, "learning_rate": 3.1996097148766897e-08, "logits/chosen": -2.3234241008758545, "logits/rejected": -2.5218329429626465, "logps/chosen": -12.170676231384277, "logps/rejected": -179.04788208007812, "loss": 12.4663, "losses_ref": -1.6466621673316695e-05, "ref_logps/chosen": -91.13664245605469, "ref_logps/rejected": -89.61786651611328, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 78.9659652709961, "rewards/margins": 168.39596557617188, "rewards/rejected": -89.43001556396484, "step": 8190, "u": -6.49074649810791, "weight": 0.056250572204589844 }, { "diff_generated": -84.8772964477539, "epoch": 2.657161373946857, "grad_norm": 486.00921903786366, "learning_rate": 3.1407655469964754e-08, "logits/chosen": -2.4197030067443848, "logits/rejected": -2.5114760398864746, "logps/chosen": -12.12280559539795, "logps/rejected": -176.52188110351562, "loss": 12.4027, "losses_ref": -0.0027509736828505993, "ref_logps/chosen": -94.04446411132812, "ref_logps/rejected": -91.64457702636719, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 81.92166137695312, "rewards/margins": 166.7989501953125, "rewards/rejected": -84.8772964477539, "step": 8200, "u": -6.483721733093262, "weight": 0.0501210018992424 }, { "diff_generated": -87.22736358642578, "epoch": 2.660401814646792, "grad_norm": 494.58958979856357, "learning_rate": 3.0824453980984234e-08, "logits/chosen": -2.3733162879943848, "logits/rejected": -2.483097553253174, "logps/chosen": -12.148710250854492, "logps/rejected": -173.47787475585938, "loss": 13.1226, "losses_ref": -0.012098370119929314, "ref_logps/chosen": -92.38720703125, "ref_logps/rejected": -86.25053405761719, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 80.23848724365234, "rewards/margins": 167.46585083007812, "rewards/rejected": -87.22736358642578, "step": 8210, "u": -6.378817558288574, "weight": 0.06930071115493774 }, { "diff_generated": -82.57842254638672, "epoch": 2.663642255346727, "grad_norm": 521.9251566733633, "learning_rate": 3.0246500973062184e-08, "logits/chosen": -2.374394416809082, "logits/rejected": -2.4588265419006348, "logps/chosen": -13.365102767944336, "logps/rejected": -162.84796142578125, "loss": 12.6945, "losses_ref": -0.025809219107031822, "ref_logps/chosen": -92.61255645751953, "ref_logps/rejected": -80.26952362060547, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 79.24744415283203, "rewards/margins": 161.8258819580078, "rewards/rejected": -82.57842254638672, "step": 8220, "u": -6.362309455871582, "weight": 0.06996141374111176 }, { "diff_generated": -84.21598052978516, "epoch": 2.6668826960466623, "grad_norm": 478.60892978321334, "learning_rate": 2.9673804662819324e-08, "logits/chosen": -2.367405414581299, "logits/rejected": -2.4610178470611572, "logps/chosen": -12.569252967834473, "logps/rejected": -165.32691955566406, "loss": 12.6299, "losses_ref": -0.0011514907237142324, "ref_logps/chosen": -92.96192932128906, "ref_logps/rejected": -81.11091613769531, "rewards/accuracies": 0.9375, "rewards/chosen": 80.39268493652344, "rewards/margins": 164.60867309570312, "rewards/rejected": -84.21598052978516, "step": 8230, "u": -6.457944393157959, "weight": 0.06254855543375015 }, { "diff_generated": -87.5735092163086, "epoch": 2.6701231367465974, "grad_norm": 508.32581470095147, "learning_rate": 2.9106373192143087e-08, "logits/chosen": -2.3813018798828125, "logits/rejected": -2.548152208328247, "logps/chosen": -11.875589370727539, "logps/rejected": -176.88278198242188, "loss": 12.6674, "losses_ref": -0.0047495425678789616, "ref_logps/chosen": -90.63737487792969, "ref_logps/rejected": -89.30928039550781, "rewards/accuracies": 0.9375, "rewards/chosen": 78.76178741455078, "rewards/margins": 166.33529663085938, "rewards/rejected": -87.5735092163086, "step": 8240, "u": -6.413567543029785, "weight": 0.06272226572036743 }, { "diff_generated": -86.81454467773438, "epoch": 2.6733635774465325, "grad_norm": 501.0978421599488, "learning_rate": 2.854421462807193e-08, "logits/chosen": -2.36572527885437, "logits/rejected": -2.4993362426757812, "logps/chosen": -11.884244918823242, "logps/rejected": -171.13729858398438, "loss": 12.3468, "losses_ref": -7.660739242965064e-07, "ref_logps/chosen": -97.11248016357422, "ref_logps/rejected": -84.32276153564453, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 85.22823333740234, "rewards/margins": 172.04278564453125, "rewards/rejected": -86.81454467773438, "step": 8250, "u": -6.548535346984863, "weight": 0.04375002905726433 }, { "diff_generated": -81.70663452148438, "epoch": 2.6766040181464676, "grad_norm": 492.62343461174584, "learning_rate": 2.798733696268063e-08, "logits/chosen": -2.344013214111328, "logits/rejected": -2.4782357215881348, "logps/chosen": -13.149152755737305, "logps/rejected": -164.31423950195312, "loss": 13.1615, "losses_ref": -1.994596203758192e-07, "ref_logps/chosen": -93.44435119628906, "ref_logps/rejected": -82.60762786865234, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 80.29519653320312, "rewards/margins": 162.0018310546875, "rewards/rejected": -81.70663452148438, "step": 8260, "u": -6.470333099365234, "weight": 0.05625000596046448 }, { "diff_generated": -89.9212646484375, "epoch": 2.679844458846403, "grad_norm": 477.27097344952216, "learning_rate": 2.7435748112966694e-08, "logits/chosen": -2.3386342525482178, "logits/rejected": -2.525294065475464, "logps/chosen": -11.611748695373535, "logps/rejected": -183.88877868652344, "loss": 12.5768, "losses_ref": -0.001180317485705018, "ref_logps/chosen": -99.32749938964844, "ref_logps/rejected": -93.96751403808594, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 87.71575164794922, "rewards/margins": 177.6370086669922, "rewards/rejected": -89.9212646484375, "step": 8270, "u": -6.4835076332092285, "weight": 0.05005160719156265 }, { "diff_generated": -79.57349395751953, "epoch": 2.6830848995463383, "grad_norm": 528.2810782020115, "learning_rate": 2.6889455920737903e-08, "logits/chosen": -2.3760812282562256, "logits/rejected": -2.441226005554199, "logps/chosen": -14.722673416137695, "logps/rejected": -158.25631713867188, "loss": 12.8496, "losses_ref": -0.0030767028219997883, "ref_logps/chosen": -94.90017700195312, "ref_logps/rejected": -78.68284606933594, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 80.17750549316406, "rewards/margins": 159.75100708007812, "rewards/rejected": -79.57349395751953, "step": 8280, "u": -6.374342918395996, "weight": 0.07513058185577393 }, { "diff_generated": -89.38903045654297, "epoch": 2.6863253402462735, "grad_norm": 508.05682186752637, "learning_rate": 2.6348468152500357e-08, "logits/chosen": -2.3738772869110107, "logits/rejected": -2.524718761444092, "logps/chosen": -11.598087310791016, "logps/rejected": -176.02505493164062, "loss": 12.7554, "losses_ref": -0.002667112974449992, "ref_logps/chosen": -90.2225570678711, "ref_logps/rejected": -86.63602447509766, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 78.62446594238281, "rewards/margins": 168.01351928710938, "rewards/rejected": -89.38903045654297, "step": 8290, "u": -6.515158653259277, "weight": 0.05636848136782646 }, { "diff_generated": -88.80402374267578, "epoch": 2.6895657809462086, "grad_norm": 482.00697128423803, "learning_rate": 2.5812792499348935e-08, "logits/chosen": -2.3881940841674805, "logits/rejected": -2.4838156700134277, "logps/chosen": -13.3690185546875, "logps/rejected": -177.01930236816406, "loss": 12.4884, "losses_ref": -9.388824764755554e-06, "ref_logps/chosen": -98.63578796386719, "ref_logps/rejected": -88.21526336669922, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 85.26676940917969, "rewards/margins": 174.07078552246094, "rewards/rejected": -88.80402374267578, "step": 8300, "u": -6.709897518157959, "weight": 0.025000352412462234 }, { "diff_generated": -88.19657897949219, "epoch": 2.692806221646144, "grad_norm": 462.05724084912237, "learning_rate": 2.5282436576857046e-08, "logits/chosen": -2.3803226947784424, "logits/rejected": -2.5157971382141113, "logps/chosen": -11.993725776672363, "logps/rejected": -177.28646850585938, "loss": 12.5235, "losses_ref": -0.006812377832829952, "ref_logps/chosen": -95.5112075805664, "ref_logps/rejected": -89.08988952636719, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 83.5174789428711, "rewards/margins": 171.71405029296875, "rewards/rejected": -88.19657897949219, "step": 8310, "u": -6.486662864685059, "weight": 0.05032174661755562 }, { "diff_generated": -84.83506774902344, "epoch": 2.6960466623460793, "grad_norm": 511.6195205388548, "learning_rate": 2.4757407924968878e-08, "logits/chosen": -2.3571653366088867, "logits/rejected": -2.5362257957458496, "logps/chosen": -11.663923263549805, "logps/rejected": -170.24285888671875, "loss": 12.5592, "losses_ref": -0.012650948949158192, "ref_logps/chosen": -91.58747863769531, "ref_logps/rejected": -85.40780639648438, "rewards/accuracies": 0.9375, "rewards/chosen": 79.92354583740234, "rewards/margins": 164.7586212158203, "rewards/rejected": -84.83506774902344, "step": 8320, "u": -6.4202470779418945, "weight": 0.06304998695850372 }, { "diff_generated": -87.63829040527344, "epoch": 2.6992871030460144, "grad_norm": 491.1195406215226, "learning_rate": 2.4237714007892117e-08, "logits/chosen": -2.4236512184143066, "logits/rejected": -2.51503324508667, "logps/chosen": -13.503862380981445, "logps/rejected": -179.43336486816406, "loss": 12.9538, "losses_ref": -1.1350125006526213e-10, "ref_logps/chosen": -102.19181823730469, "ref_logps/rejected": -91.7950668334961, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 88.68795013427734, "rewards/margins": 176.32623291015625, "rewards/rejected": -87.63829040527344, "step": 8330, "u": -6.6181640625, "weight": 0.03750000149011612 }, { "diff_generated": -84.88908386230469, "epoch": 2.7025275437459495, "grad_norm": 509.93968497196573, "learning_rate": 2.372336221399176e-08, "logits/chosen": -2.3671317100524902, "logits/rejected": -2.4888668060302734, "logps/chosen": -12.767683982849121, "logps/rejected": -175.57974243164062, "loss": 12.3047, "losses_ref": -0.005530247930437326, "ref_logps/chosen": -94.7016830444336, "ref_logps/rejected": -90.6906509399414, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 81.93400573730469, "rewards/margins": 166.82308959960938, "rewards/rejected": -84.88908386230469, "step": 8340, "u": -6.289109706878662, "weight": 0.08149810135364532 }, { "diff_generated": -84.93101501464844, "epoch": 2.7057679844458846, "grad_norm": 460.3913834206574, "learning_rate": 2.3214359855685095e-08, "logits/chosen": -2.3634300231933594, "logits/rejected": -2.460094451904297, "logps/chosen": -12.92186164855957, "logps/rejected": -169.9513397216797, "loss": 12.3929, "losses_ref": -0.002780128736048937, "ref_logps/chosen": -97.1541748046875, "ref_logps/rejected": -85.02031707763672, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 84.2323226928711, "rewards/margins": 169.163330078125, "rewards/rejected": -84.93101501464844, "step": 8350, "u": -6.59981632232666, "weight": 0.0376209020614624 }, { "diff_generated": -84.09104919433594, "epoch": 2.7090084251458197, "grad_norm": 522.8097767684436, "learning_rate": 2.271071416933772e-08, "logits/chosen": -2.390049934387207, "logits/rejected": -2.5383729934692383, "logps/chosen": -11.891609191894531, "logps/rejected": -169.39395141601562, "loss": 12.5033, "losses_ref": -3.2407893741037697e-05, "ref_logps/chosen": -91.81161499023438, "ref_logps/rejected": -85.30288696289062, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 79.92000579833984, "rewards/margins": 164.01104736328125, "rewards/rejected": -84.09104919433594, "step": 8360, "u": -6.485341548919678, "weight": 0.05000120401382446 }, { "diff_generated": -83.08517456054688, "epoch": 2.712248865845755, "grad_norm": 503.23447806756167, "learning_rate": 2.2212432315160855e-08, "logits/chosen": -2.3702919483184814, "logits/rejected": -2.48654842376709, "logps/chosen": -12.602476119995117, "logps/rejected": -167.88804626464844, "loss": 13.0166, "losses_ref": -0.01596415974199772, "ref_logps/chosen": -92.63908386230469, "ref_logps/rejected": -84.80287170410156, "rewards/accuracies": 0.9375, "rewards/chosen": 80.0365982055664, "rewards/margins": 163.1217803955078, "rewards/rejected": -83.08517456054688, "step": 8370, "u": -6.448731422424316, "weight": 0.06323273479938507 }, { "diff_generated": -89.09854888916016, "epoch": 2.71548930654569, "grad_norm": 492.9086390613034, "learning_rate": 2.171952137710904e-08, "logits/chosen": -2.425790309906006, "logits/rejected": -2.5577409267425537, "logps/chosen": -11.534905433654785, "logps/rejected": -177.09619140625, "loss": 12.5847, "losses_ref": -2.4892568006862348e-08, "ref_logps/chosen": -96.31239318847656, "ref_logps/rejected": -87.9976577758789, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 84.7774887084961, "rewards/margins": 173.8760528564453, "rewards/rejected": -89.09854888916016, "step": 8380, "u": -6.6341071128845215, "weight": 0.03750000149011612 }, { "diff_generated": -83.87593841552734, "epoch": 2.7187297472456255, "grad_norm": 471.84603527073546, "learning_rate": 2.1231988362780327e-08, "logits/chosen": -2.3621182441711426, "logits/rejected": -2.477931261062622, "logps/chosen": -12.505033493041992, "logps/rejected": -169.10244750976562, "loss": 12.6486, "losses_ref": -1.6721272686481825e-06, "ref_logps/chosen": -95.0557861328125, "ref_logps/rejected": -85.22648620605469, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 82.5507583618164, "rewards/margins": 166.4267120361328, "rewards/rejected": -83.87593841552734, "step": 8390, "u": -6.376302242279053, "weight": 0.0687500461935997 }, { "diff_generated": -90.33910369873047, "epoch": 2.7219701879455607, "grad_norm": 512.9739846227034, "learning_rate": 2.0749840203315584e-08, "logits/chosen": -2.392246723175049, "logits/rejected": -2.558375597000122, "logps/chosen": -13.82586669921875, "logps/rejected": -179.183349609375, "loss": 12.564, "losses_ref": -0.0003666019765660167, "ref_logps/chosen": -95.07868957519531, "ref_logps/rejected": -88.84425354003906, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 81.25282287597656, "rewards/margins": 171.5919189453125, "rewards/rejected": -90.33910369873047, "step": 8400, "u": -6.5747785568237305, "weight": 0.043763164430856705 }, { "diff_generated": -85.40348815917969, "epoch": 2.725210628645496, "grad_norm": 460.2332420013002, "learning_rate": 2.0273083753300724e-08, "logits/chosen": -2.3995018005371094, "logits/rejected": -2.485994338989258, "logps/chosen": -13.077387809753418, "logps/rejected": -169.70018005371094, "loss": 12.8605, "losses_ref": -0.000844582449644804, "ref_logps/chosen": -100.71969604492188, "ref_logps/rejected": -84.29668426513672, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 87.6423110961914, "rewards/margins": 173.04580688476562, "rewards/rejected": -85.40348815917969, "step": 8410, "u": -6.576523780822754, "weight": 0.04378564655780792 }, { "diff_generated": -87.17457580566406, "epoch": 2.728451069345431, "grad_norm": 538.4549395583889, "learning_rate": 1.980172579066899e-08, "logits/chosen": -2.4052786827087402, "logits/rejected": -2.537781238555908, "logps/chosen": -13.450469970703125, "logps/rejected": -174.35183715820312, "loss": 12.7838, "losses_ref": -3.527696662786184e-06, "ref_logps/chosen": -98.7660903930664, "ref_logps/rejected": -87.17725372314453, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 85.31561279296875, "rewards/margins": 172.49020385742188, "rewards/rejected": -87.17457580566406, "step": 8420, "u": -6.604406833648682, "weight": 0.037500061094760895 }, { "diff_generated": -91.29328918457031, "epoch": 2.731691510045366, "grad_norm": 535.2424483734745, "learning_rate": 1.9335773016604608e-08, "logits/chosen": -2.394975185394287, "logits/rejected": -2.547994375228882, "logps/chosen": -13.524676322937012, "logps/rejected": -183.4735107421875, "loss": 12.9211, "losses_ref": -6.3783551773610725e-09, "ref_logps/chosen": -98.69517517089844, "ref_logps/rejected": -92.18020629882812, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 85.17050170898438, "rewards/margins": 176.46377563476562, "rewards/rejected": -91.29328918457031, "step": 8430, "u": -6.727007865905762, "weight": 0.02500000037252903 }, { "diff_generated": -82.27983093261719, "epoch": 2.7349319507453016, "grad_norm": 526.8174489152042, "learning_rate": 1.887523205544741e-08, "logits/chosen": -2.3671412467956543, "logits/rejected": -2.4857192039489746, "logps/chosen": -13.052943229675293, "logps/rejected": -163.49301147460938, "loss": 12.8229, "losses_ref": -0.0011958193499594927, "ref_logps/chosen": -92.99552154541016, "ref_logps/rejected": -81.21318054199219, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 79.94257354736328, "rewards/margins": 162.222412109375, "rewards/rejected": -82.27983093261719, "step": 8440, "u": -6.532777309417725, "weight": 0.05005018785595894 }, { "diff_generated": -86.48809814453125, "epoch": 2.7381723914452367, "grad_norm": 478.5720153145638, "learning_rate": 1.8420109454598997e-08, "logits/chosen": -2.4062106609344482, "logits/rejected": -2.5305914878845215, "logps/chosen": -11.691521644592285, "logps/rejected": -170.15487670898438, "loss": 12.7834, "losses_ref": -0.0031040345784276724, "ref_logps/chosen": -92.42660522460938, "ref_logps/rejected": -83.66678619384766, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 80.7350845336914, "rewards/margins": 167.22317504882812, "rewards/rejected": -86.48809814453125, "step": 8450, "u": -6.624154567718506, "weight": 0.037630610167980194 }, { "diff_generated": -86.4380874633789, "epoch": 2.741412832145172, "grad_norm": 525.8424870433707, "learning_rate": 1.797041168442921e-08, "logits/chosen": -2.4024062156677246, "logits/rejected": -2.5008559226989746, "logps/chosen": -12.73488712310791, "logps/rejected": -169.62216186523438, "loss": 12.8644, "losses_ref": -0.0007443568902090192, "ref_logps/chosen": -97.05931091308594, "ref_logps/rejected": -83.18407440185547, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 84.32442474365234, "rewards/margins": 170.76251220703125, "rewards/rejected": -86.4380874633789, "step": 8460, "u": -6.547275543212891, "weight": 0.04378039762377739 }, { "diff_generated": -87.27616882324219, "epoch": 2.744653272845107, "grad_norm": 497.21888671107916, "learning_rate": 1.7526145138184377e-08, "logits/chosen": -2.4237253665924072, "logits/rejected": -2.5312697887420654, "logps/chosen": -12.734219551086426, "logps/rejected": -172.1459503173828, "loss": 13.1801, "losses_ref": -1.1529791343889428e-08, "ref_logps/chosen": -97.7723617553711, "ref_logps/rejected": -84.86976623535156, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 85.03813171386719, "rewards/margins": 172.3143310546875, "rewards/rejected": -87.27616882324219, "step": 8470, "u": -6.614732265472412, "weight": 0.03750000149011612 }, { "diff_generated": -84.35192108154297, "epoch": 2.747893713545042, "grad_norm": 527.620403277501, "learning_rate": 1.708731613189669e-08, "logits/chosen": -2.4409520626068115, "logits/rejected": -2.5005288124084473, "logps/chosen": -14.184789657592773, "logps/rejected": -166.6622314453125, "loss": 12.6912, "losses_ref": -0.018728725612163544, "ref_logps/chosen": -101.62376403808594, "ref_logps/rejected": -82.31028747558594, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 87.43898010253906, "rewards/margins": 171.79090881347656, "rewards/rejected": -84.35192108154297, "step": 8480, "u": -6.550576210021973, "weight": 0.04464550316333771 }, { "diff_generated": -85.64671325683594, "epoch": 2.751134154244977, "grad_norm": 511.1466343453559, "learning_rate": 1.6653930904293677e-08, "logits/chosen": -2.3939261436462402, "logits/rejected": -2.4896676540374756, "logps/chosen": -13.360217094421387, "logps/rejected": -173.86691284179688, "loss": 12.6242, "losses_ref": -0.004592637997120619, "ref_logps/chosen": -101.32906341552734, "ref_logps/rejected": -88.22019958496094, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 87.96885681152344, "rewards/margins": 173.61557006835938, "rewards/rejected": -85.64671325683594, "step": 8490, "u": -6.602023124694824, "weight": 0.03769877552986145 }, { "diff_generated": -85.69892883300781, "epoch": 2.7543745949449123, "grad_norm": 504.5576680791223, "learning_rate": 1.6225995616710297e-08, "logits/chosen": -2.4041481018066406, "logits/rejected": -2.4688780307769775, "logps/chosen": -13.450655937194824, "logps/rejected": -172.2867889404297, "loss": 12.4601, "losses_ref": -6.851646503491793e-06, "ref_logps/chosen": -98.85618591308594, "ref_logps/rejected": -86.58786010742188, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 85.4055404663086, "rewards/margins": 171.10446166992188, "rewards/rejected": -85.69892883300781, "step": 8500, "u": -6.673095703125, "weight": 0.02500019408762455 }, { "diff_generated": -88.74124145507812, "epoch": 2.7576150356448474, "grad_norm": 551.6726809002059, "learning_rate": 1.58035163530009e-08, "logits/chosen": -2.4708077907562256, "logits/rejected": -2.537278652191162, "logps/chosen": -13.46081256866455, "logps/rejected": -178.23049926757812, "loss": 12.9081, "losses_ref": -0.0019827443175017834, "ref_logps/chosen": -100.0168685913086, "ref_logps/rejected": -89.4892578125, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 86.5560531616211, "rewards/margins": 175.2973175048828, "rewards/rejected": -88.74124145507812, "step": 8510, "u": -6.623356819152832, "weight": 0.037584926933050156 }, { "diff_generated": -87.86695861816406, "epoch": 2.760855476344783, "grad_norm": 504.9036060217802, "learning_rate": 1.538649911945291e-08, "logits/chosen": -2.3583149909973145, "logits/rejected": -2.548267126083374, "logps/chosen": -13.576777458190918, "logps/rejected": -178.18577575683594, "loss": 13.2624, "losses_ref": -0.0017562673892825842, "ref_logps/chosen": -95.54901123046875, "ref_logps/rejected": -90.31883239746094, "rewards/accuracies": 0.96875, "rewards/chosen": 81.97222137451172, "rewards/margins": 169.8391876220703, "rewards/rejected": -87.86695861816406, "step": 8520, "u": -6.665831089019775, "weight": 0.031327299773693085 }, { "diff_generated": -80.264892578125, "epoch": 2.764095917044718, "grad_norm": 497.41903616203007, "learning_rate": 1.497494984470107e-08, "logits/chosen": -2.3654308319091797, "logits/rejected": -2.46647310256958, "logps/chosen": -14.232625961303711, "logps/rejected": -164.5033416748047, "loss": 12.9144, "losses_ref": -0.020502448081970215, "ref_logps/chosen": -99.23295593261719, "ref_logps/rejected": -84.23846435546875, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 85.00032806396484, "rewards/margins": 165.26519775390625, "rewards/rejected": -80.264892578125, "step": 8530, "u": -6.4730224609375, "weight": 0.057284872978925705 }, { "diff_generated": -81.96501159667969, "epoch": 2.7673363577446533, "grad_norm": 499.74748943064225, "learning_rate": 1.4568874379643936e-08, "logits/chosen": -2.392131805419922, "logits/rejected": -2.49577260017395, "logps/chosen": -11.832303047180176, "logps/rejected": -164.77926635742188, "loss": 12.4482, "losses_ref": -0.0021569118835031986, "ref_logps/chosen": -90.89392852783203, "ref_logps/rejected": -82.8142318725586, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 79.0616226196289, "rewards/margins": 161.02664184570312, "rewards/rejected": -81.96501159667969, "step": 8540, "u": -6.1016950607299805, "weight": 0.11259114742279053 }, { "diff_generated": -88.95906066894531, "epoch": 2.7705767984445884, "grad_norm": 536.4161948876989, "learning_rate": 1.4168278497359798e-08, "logits/chosen": -2.42805814743042, "logits/rejected": -2.557976245880127, "logps/chosen": -12.606058120727539, "logps/rejected": -181.2037811279297, "loss": 12.5847, "losses_ref": -1.0212413137367093e-08, "ref_logps/chosen": -96.47504425048828, "ref_logps/rejected": -92.24471282958984, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 83.86898040771484, "rewards/margins": 172.8280487060547, "rewards/rejected": -88.95906066894531, "step": 8550, "u": -6.537459373474121, "weight": 0.05000000074505806 }, { "diff_generated": -90.14323425292969, "epoch": 2.7738172391445235, "grad_norm": 469.26948291565867, "learning_rate": 1.3773167893025161e-08, "logits/chosen": -2.397914409637451, "logits/rejected": -2.5138306617736816, "logps/chosen": -13.796974182128906, "logps/rejected": -180.79031372070312, "loss": 12.9604, "losses_ref": -0.0008936094818636775, "ref_logps/chosen": -98.47550201416016, "ref_logps/rejected": -90.6470718383789, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 84.67852020263672, "rewards/margins": 174.82176208496094, "rewards/rejected": -90.14323425292969, "step": 8560, "u": -6.502872467041016, "weight": 0.05628952383995056 }, { "diff_generated": -84.3171615600586, "epoch": 2.777057679844459, "grad_norm": 481.68081385509606, "learning_rate": 1.3383548183833715e-08, "logits/chosen": -2.3971917629241943, "logits/rejected": -2.4937527179718018, "logps/chosen": -13.414143562316895, "logps/rejected": -165.9224853515625, "loss": 12.8354, "losses_ref": -4.630480361811351e-06, "ref_logps/chosen": -96.98786926269531, "ref_logps/rejected": -81.6053237915039, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 83.57373046875, "rewards/margins": 167.89089965820312, "rewards/rejected": -84.3171615600586, "step": 8570, "u": -6.320916175842285, "weight": 0.06875015795230865 }, { "diff_generated": -82.9990234375, "epoch": 2.780298120544394, "grad_norm": 511.05202783179914, "learning_rate": 1.2999424908916346e-08, "logits/chosen": -2.3500263690948486, "logits/rejected": -2.445317029953003, "logps/chosen": -13.810076713562012, "logps/rejected": -169.82754516601562, "loss": 12.5559, "losses_ref": -0.010504474863409996, "ref_logps/chosen": -97.06626892089844, "ref_logps/rejected": -86.8285140991211, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 83.2562026977539, "rewards/margins": 166.25521850585938, "rewards/rejected": -82.9990234375, "step": 8580, "u": -6.568633079528809, "weight": 0.044232361018657684 }, { "diff_generated": -83.91486358642578, "epoch": 2.7835385612443293, "grad_norm": 505.11501311692376, "learning_rate": 1.2620803529262357e-08, "logits/chosen": -2.3712546825408936, "logits/rejected": -2.4964489936828613, "logps/chosen": -11.956393241882324, "logps/rejected": -168.12840270996094, "loss": 12.4465, "losses_ref": -0.00022628402803093195, "ref_logps/chosen": -96.67184448242188, "ref_logps/rejected": -84.2135238647461, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 84.71544647216797, "rewards/margins": 168.6303253173828, "rewards/rejected": -83.91486358642578, "step": 8590, "u": -6.529415130615234, "weight": 0.05000894516706467 }, { "diff_generated": -86.57169342041016, "epoch": 2.7867790019442644, "grad_norm": 506.88211246823346, "learning_rate": 1.2247689427642027e-08, "logits/chosen": -2.4125804901123047, "logits/rejected": -2.5085551738739014, "logps/chosen": -13.256278991699219, "logps/rejected": -175.28707885742188, "loss": 12.6541, "losses_ref": -7.255699152608486e-09, "ref_logps/chosen": -97.22776794433594, "ref_logps/rejected": -88.71540069580078, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 83.97148895263672, "rewards/margins": 170.5431671142578, "rewards/rejected": -86.57169342041016, "step": 8600, "u": -6.509591102600098, "weight": 0.05000000074505806 }, { "diff_generated": -84.92244720458984, "epoch": 2.7900194426441995, "grad_norm": 518.6409227763658, "learning_rate": 1.1880087908529945e-08, "logits/chosen": -2.3713769912719727, "logits/rejected": -2.474369764328003, "logps/chosen": -13.483648300170898, "logps/rejected": -166.7283172607422, "loss": 12.8296, "losses_ref": -0.0012584489304572344, "ref_logps/chosen": -96.45650482177734, "ref_logps/rejected": -81.80587768554688, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 82.97285461425781, "rewards/margins": 167.8953094482422, "rewards/rejected": -84.92244720458984, "step": 8610, "u": -6.539379119873047, "weight": 0.05004796385765076 }, { "diff_generated": -84.60955810546875, "epoch": 2.7932598833441347, "grad_norm": 459.4485713200273, "learning_rate": 1.1518004198029529e-08, "logits/chosen": -2.409595012664795, "logits/rejected": -2.5200963020324707, "logps/chosen": -13.681007385253906, "logps/rejected": -168.84112548828125, "loss": 12.6365, "losses_ref": -3.974982405452465e-07, "ref_logps/chosen": -99.23111724853516, "ref_logps/rejected": -84.23157501220703, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 85.55010986328125, "rewards/margins": 170.15965270996094, "rewards/rejected": -84.60955810546875, "step": 8620, "u": -6.522212982177734, "weight": 0.050000011920928955 }, { "diff_generated": -86.31803131103516, "epoch": 2.79650032404407, "grad_norm": 511.1749130208627, "learning_rate": 1.1161443443798946e-08, "logits/chosen": -2.377769947052002, "logits/rejected": -2.535102367401123, "logps/chosen": -12.180910110473633, "logps/rejected": -173.79360961914062, "loss": 12.752, "losses_ref": -3.1645927265344653e-06, "ref_logps/chosen": -94.26895141601562, "ref_logps/rejected": -87.47557830810547, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 82.0880355834961, "rewards/margins": 168.4060516357422, "rewards/rejected": -86.31803131103516, "step": 8630, "u": -6.404803276062012, "weight": 0.06875012069940567 }, { "diff_generated": -83.41902160644531, "epoch": 2.7997407647440054, "grad_norm": 602.2522933359077, "learning_rate": 1.0810410714977747e-08, "logits/chosen": -2.3440022468566895, "logits/rejected": -2.4584457874298096, "logps/chosen": -13.622339248657227, "logps/rejected": -166.0839385986328, "loss": 12.9616, "losses_ref": -0.0007582043763250113, "ref_logps/chosen": -92.6719970703125, "ref_logps/rejected": -82.6649169921875, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 79.04966735839844, "rewards/margins": 162.46865844726562, "rewards/rejected": -83.41902160644531, "step": 8640, "u": -6.447157859802246, "weight": 0.05628180503845215 }, { "diff_generated": -91.5981674194336, "epoch": 2.8029812054439405, "grad_norm": 489.3615918457275, "learning_rate": 1.0464911002114885e-08, "logits/chosen": -2.392082691192627, "logits/rejected": -2.563253402709961, "logps/chosen": -13.450553894042969, "logps/rejected": -183.0391845703125, "loss": 12.6733, "losses_ref": -0.002970527159050107, "ref_logps/chosen": -96.15373229980469, "ref_logps/rejected": -91.44102478027344, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 82.70317840576172, "rewards/margins": 174.3013458251953, "rewards/rejected": -91.5981674194336, "step": 8650, "u": -6.729278564453125, "weight": 0.01888248324394226 }, { "diff_generated": -88.04591369628906, "epoch": 2.8062216461438756, "grad_norm": 463.19974618327154, "learning_rate": 1.0124949217097656e-08, "logits/chosen": -2.4216623306274414, "logits/rejected": -2.558814287185669, "logps/chosen": -11.761835098266602, "logps/rejected": -177.6961669921875, "loss": 12.6788, "losses_ref": -1.860668089648243e-05, "ref_logps/chosen": -95.4864730834961, "ref_logps/rejected": -89.6502456665039, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 83.72464752197266, "rewards/margins": 171.77053833007812, "rewards/rejected": -88.04591369628906, "step": 8660, "u": -6.536529541015625, "weight": 0.04375075548887253 }, { "diff_generated": -86.40773010253906, "epoch": 2.8094620868438107, "grad_norm": 472.62108278894453, "learning_rate": 9.790530193082114e-09, "logits/chosen": -2.3973047733306885, "logits/rejected": -2.4718689918518066, "logps/chosen": -14.47937297821045, "logps/rejected": -171.40640258789062, "loss": 13.1518, "losses_ref": -0.0019137548515573144, "ref_logps/chosen": -98.88545227050781, "ref_logps/rejected": -84.99864196777344, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 84.40608215332031, "rewards/margins": 170.81381225585938, "rewards/rejected": -86.40773010253906, "step": 8670, "u": -6.6227264404296875, "weight": 0.037581831216812134 }, { "diff_generated": -85.9549789428711, "epoch": 2.812702527543746, "grad_norm": 538.9199023880284, "learning_rate": 9.461658684423968e-09, "logits/chosen": -2.3717586994171143, "logits/rejected": -2.513598680496216, "logps/chosen": -14.2130126953125, "logps/rejected": -169.3182373046875, "loss": 13.0126, "losses_ref": -6.566205229319166e-06, "ref_logps/chosen": -94.47737121582031, "ref_logps/rejected": -83.3632583618164, "rewards/accuracies": 0.9375, "rewards/chosen": 80.26435852050781, "rewards/margins": 166.21934509277344, "rewards/rejected": -85.9549789428711, "step": 8680, "u": -6.473133087158203, "weight": 0.06250016391277313 }, { "diff_generated": -89.59869384765625, "epoch": 2.8159429682436814, "grad_norm": 518.7525170558285, "learning_rate": 9.138339366611526e-09, "logits/chosen": -2.4366939067840576, "logits/rejected": -2.547464609146118, "logps/chosen": -12.505029678344727, "logps/rejected": -180.10690307617188, "loss": 12.5597, "losses_ref": -1.1943488686938508e-07, "ref_logps/chosen": -98.70265197753906, "ref_logps/rejected": -90.5082015991211, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 86.19761657714844, "rewards/margins": 175.79629516601562, "rewards/rejected": -89.59869384765625, "step": 8690, "u": -6.688715934753418, "weight": 0.02500000223517418 }, { "diff_generated": -85.91513061523438, "epoch": 2.8191834089436165, "grad_norm": 491.41658674176796, "learning_rate": 8.82057683619859e-09, "logits/chosen": -2.3363218307495117, "logits/rejected": -2.519660711288452, "logps/chosen": -10.693092346191406, "logps/rejected": -171.95101928710938, "loss": 12.0402, "losses_ref": -0.0021391697227954865, "ref_logps/chosen": -86.15867614746094, "ref_logps/rejected": -86.03590393066406, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 75.465576171875, "rewards/margins": 161.38070678710938, "rewards/rejected": -85.91513061523438, "step": 8700, "u": -6.5091071128845215, "weight": 0.05634554475545883 }, { "diff_generated": -85.58265686035156, "epoch": 2.8224238496435516, "grad_norm": 486.4172883940246, "learning_rate": 8.508375610739626e-09, "logits/chosen": -2.4158897399902344, "logits/rejected": -2.500046730041504, "logps/chosen": -13.407247543334961, "logps/rejected": -168.00059509277344, "loss": 12.6608, "losses_ref": -4.46725152869476e-06, "ref_logps/chosen": -95.40724182128906, "ref_logps/rejected": -82.41795349121094, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 82.0, "rewards/margins": 167.58267211914062, "rewards/rejected": -85.58265686035156, "step": 8710, "u": -6.590667724609375, "weight": 0.04375016316771507 }, { "diff_generated": -87.30168151855469, "epoch": 2.8256642903434868, "grad_norm": 530.6312982735535, "learning_rate": 8.201740128725365e-09, "logits/chosen": -2.369098663330078, "logits/rejected": -2.536067008972168, "logps/chosen": -12.3034029006958, "logps/rejected": -173.2048797607422, "loss": 12.2594, "losses_ref": -1.9929467853785354e-08, "ref_logps/chosen": -92.73594665527344, "ref_logps/rejected": -85.90321350097656, "rewards/accuracies": 0.9375, "rewards/chosen": 80.43254089355469, "rewards/margins": 167.73423767089844, "rewards/rejected": -87.30168151855469, "step": 8720, "u": -6.42525577545166, "weight": 0.0625 }, { "diff_generated": -87.40478515625, "epoch": 2.828904731043422, "grad_norm": 471.09383996051514, "learning_rate": 7.900674749519564e-09, "logits/chosen": -2.412445545196533, "logits/rejected": -2.513763904571533, "logps/chosen": -13.007713317871094, "logps/rejected": -177.88394165039062, "loss": 12.564, "losses_ref": -1.7794054230080292e-08, "ref_logps/chosen": -94.12556457519531, "ref_logps/rejected": -90.47914123535156, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 81.11785125732422, "rewards/margins": 168.5226593017578, "rewards/rejected": -87.40478515625, "step": 8730, "u": -6.489443302154541, "weight": 0.05624999850988388 }, { "diff_generated": -83.98377990722656, "epoch": 2.832145171743357, "grad_norm": 501.12454027761083, "learning_rate": 7.605183753297283e-09, "logits/chosen": -2.4327521324157715, "logits/rejected": -2.488560438156128, "logps/chosen": -13.543904304504395, "logps/rejected": -171.91537475585938, "loss": 12.5131, "losses_ref": -5.7025822570722084e-06, "ref_logps/chosen": -99.78609466552734, "ref_logps/rejected": -87.93158721923828, "rewards/accuracies": 0.9375, "rewards/chosen": 86.2421875, "rewards/margins": 170.22598266601562, "rewards/rejected": -83.98377990722656, "step": 8740, "u": -6.419000148773193, "weight": 0.06250022351741791 }, { "diff_generated": -83.87056732177734, "epoch": 2.835385612443292, "grad_norm": 485.1619344748365, "learning_rate": 7.315271340983731e-09, "logits/chosen": -2.407285451889038, "logits/rejected": -2.5274343490600586, "logps/chosen": -11.9943265914917, "logps/rejected": -169.26734924316406, "loss": 12.3454, "losses_ref": -0.0011153435334563255, "ref_logps/chosen": -95.7374267578125, "ref_logps/rejected": -85.39678192138672, "rewards/accuracies": 0.9375, "rewards/chosen": 83.74310302734375, "rewards/margins": 167.61367797851562, "rewards/rejected": -83.87056732177734, "step": 8750, "u": -6.429316520690918, "weight": 0.06254558265209198 }, { "diff_generated": -86.22040557861328, "epoch": 2.8386260531432272, "grad_norm": 470.0720536433388, "learning_rate": 7.030941634194932e-09, "logits/chosen": -2.4146881103515625, "logits/rejected": -2.539377450942993, "logps/chosen": -13.119758605957031, "logps/rejected": -174.09194946289062, "loss": 13.0131, "losses_ref": -8.802903721516486e-06, "ref_logps/chosen": -94.57557678222656, "ref_logps/rejected": -87.87154388427734, "rewards/accuracies": 0.9375, "rewards/chosen": 81.45582580566406, "rewards/margins": 167.6762237548828, "rewards/rejected": -86.22040557861328, "step": 8760, "u": -6.445687770843506, "weight": 0.06250028312206268 }, { "diff_generated": -87.8705062866211, "epoch": 2.841866493843163, "grad_norm": 486.0851192565662, "learning_rate": 6.752198675178711e-09, "logits/chosen": -2.417034387588501, "logits/rejected": -2.5238921642303467, "logps/chosen": -11.984928131103516, "logps/rejected": -175.46932983398438, "loss": 12.2577, "losses_ref": -1.0461581601006742e-09, "ref_logps/chosen": -95.73543548583984, "ref_logps/rejected": -87.59882354736328, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 83.7505111694336, "rewards/margins": 171.6210174560547, "rewards/rejected": -87.8705062866211, "step": 8770, "u": -6.54312801361084, "weight": 0.05000000074505806 }, { "diff_generated": -82.89167785644531, "epoch": 2.845106934543098, "grad_norm": 457.19363309019275, "learning_rate": 6.479046426757584e-09, "logits/chosen": -2.3589627742767334, "logits/rejected": -2.44968581199646, "logps/chosen": -12.40269660949707, "logps/rejected": -163.45108032226562, "loss": 12.3775, "losses_ref": -0.009188800118863583, "ref_logps/chosen": -95.39823913574219, "ref_logps/rejected": -80.55940246582031, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 82.99554443359375, "rewards/margins": 165.88722229003906, "rewards/rejected": -82.89167785644531, "step": 8780, "u": -6.339357852935791, "weight": 0.06919096410274506 }, { "diff_generated": -86.87873840332031, "epoch": 2.848347375243033, "grad_norm": 503.57331972834254, "learning_rate": 6.211488772272133e-09, "logits/chosen": -2.3619651794433594, "logits/rejected": -2.5722765922546387, "logps/chosen": -11.627964973449707, "logps/rejected": -178.09188842773438, "loss": 12.2835, "losses_ref": -3.3028396018153217e-08, "ref_logps/chosen": -89.77183532714844, "ref_logps/rejected": -91.21315002441406, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 78.14387512207031, "rewards/margins": 165.0226287841797, "rewards/rejected": -86.87873840332031, "step": 8790, "u": -6.495707035064697, "weight": 0.05624999850988388 }, { "diff_generated": -88.65650939941406, "epoch": 2.851587815942968, "grad_norm": 484.89134879727567, "learning_rate": 5.9495295155260305e-09, "logits/chosen": -2.408686399459839, "logits/rejected": -2.5539755821228027, "logps/chosen": -13.315896987915039, "logps/rejected": -174.79092407226562, "loss": 12.9074, "losses_ref": -2.7582482076127235e-08, "ref_logps/chosen": -98.5407485961914, "ref_logps/rejected": -86.13442993164062, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 85.22486877441406, "rewards/margins": 173.88134765625, "rewards/rejected": -88.65650939941406, "step": 8800, "u": -6.520726203918457, "weight": 0.05000000074505806 }, { "diff_generated": -84.28498840332031, "epoch": 2.8548282566429033, "grad_norm": 507.28233972164253, "learning_rate": 5.69317238073177e-09, "logits/chosen": -2.3808228969573975, "logits/rejected": -2.4702324867248535, "logps/chosen": -12.662498474121094, "logps/rejected": -165.72178649902344, "loss": 12.5595, "losses_ref": -0.0004648033936973661, "ref_logps/chosen": -94.13968658447266, "ref_logps/rejected": -81.43679809570312, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 81.47718811035156, "rewards/margins": 165.76219177246094, "rewards/rejected": -84.28498840332031, "step": 8810, "u": -6.3492021560668945, "weight": 0.06876911222934723 }, { "diff_generated": -82.69854736328125, "epoch": 2.858068697342839, "grad_norm": 489.5156510442447, "learning_rate": 5.442421012457909e-09, "logits/chosen": -2.3480446338653564, "logits/rejected": -2.4562230110168457, "logps/chosen": -11.1311616897583, "logps/rejected": -165.80661010742188, "loss": 12.5271, "losses_ref": -0.0005079759284853935, "ref_logps/chosen": -89.9560546875, "ref_logps/rejected": -83.10804748535156, "rewards/accuracies": 0.90625, "rewards/chosen": 78.82488250732422, "rewards/margins": 161.52342224121094, "rewards/rejected": -82.69854736328125, "step": 8820, "u": -6.202415943145752, "weight": 0.09377063810825348 }, { "diff_generated": -83.34148406982422, "epoch": 2.861309138042774, "grad_norm": 510.3538193736458, "learning_rate": 5.197278975577069e-09, "logits/chosen": -2.339171886444092, "logits/rejected": -2.503340721130371, "logps/chosen": -13.35303020477295, "logps/rejected": -167.5297393798828, "loss": 12.7168, "losses_ref": -0.0007630180334672332, "ref_logps/chosen": -93.05812072753906, "ref_logps/rejected": -84.18824768066406, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 79.70509338378906, "rewards/margins": 163.0465850830078, "rewards/rejected": -83.34148406982422, "step": 8830, "u": -6.637404441833496, "weight": 0.0375315360724926 }, { "diff_generated": -88.45011901855469, "epoch": 2.864549578742709, "grad_norm": 519.6341574740505, "learning_rate": 4.957749755215346e-09, "logits/chosen": -2.3442203998565674, "logits/rejected": -2.563141345977783, "logps/chosen": -11.600044250488281, "logps/rejected": -178.43556213378906, "loss": 12.97, "losses_ref": -2.4012560828623464e-08, "ref_logps/chosen": -90.67213439941406, "ref_logps/rejected": -89.98545837402344, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 79.07207489013672, "rewards/margins": 167.52218627929688, "rewards/rejected": -88.45011901855469, "step": 8840, "u": -6.600878715515137, "weight": 0.03750000149011612 }, { "diff_generated": -82.25421905517578, "epoch": 2.8677900194426442, "grad_norm": 508.07055814248287, "learning_rate": 4.723836756702848e-09, "logits/chosen": -2.353315830230713, "logits/rejected": -2.471273183822632, "logps/chosen": -11.62907600402832, "logps/rejected": -161.6550750732422, "loss": 12.5092, "losses_ref": -1.6753695035731653e-06, "ref_logps/chosen": -93.2217788696289, "ref_logps/rejected": -79.40087127685547, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 81.59269714355469, "rewards/margins": 163.84690856933594, "rewards/rejected": -82.25421905517578, "step": 8850, "u": -6.413300514221191, "weight": 0.0687500387430191 }, { "diff_generated": -83.08013916015625, "epoch": 2.8710304601425793, "grad_norm": 543.348378628187, "learning_rate": 4.495543305524974e-09, "logits/chosen": -2.3752541542053223, "logits/rejected": -2.5042357444763184, "logps/chosen": -12.29096794128418, "logps/rejected": -165.60269165039062, "loss": 12.9747, "losses_ref": -5.240957534624613e-07, "ref_logps/chosen": -93.07635498046875, "ref_logps/rejected": -82.5225601196289, "rewards/accuracies": 0.9375, "rewards/chosen": 80.78538513183594, "rewards/margins": 163.8655242919922, "rewards/rejected": -83.08013916015625, "step": 8860, "u": -6.475934028625488, "weight": 0.0625000149011612 }, { "diff_generated": -86.92012023925781, "epoch": 2.8742709008425145, "grad_norm": 506.52607107402076, "learning_rate": 4.2728726472756934e-09, "logits/chosen": -2.382050037384033, "logits/rejected": -2.5092759132385254, "logps/chosen": -14.272692680358887, "logps/rejected": -175.8685760498047, "loss": 12.8992, "losses_ref": -0.01005796529352665, "ref_logps/chosen": -99.31636047363281, "ref_logps/rejected": -88.94844818115234, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 85.0436782836914, "rewards/margins": 171.9637908935547, "rewards/rejected": -86.92012023925781, "step": 8870, "u": -6.565062522888184, "weight": 0.03794359415769577 }, { "diff_generated": -86.80320739746094, "epoch": 2.8775113415424496, "grad_norm": 500.519894631932, "learning_rate": 4.055827947610746e-09, "logits/chosen": -2.383944272994995, "logits/rejected": -2.494957208633423, "logps/chosen": -13.14458179473877, "logps/rejected": -174.47988891601562, "loss": 12.9073, "losses_ref": -0.002788522047922015, "ref_logps/chosen": -99.15926361083984, "ref_logps/rejected": -87.67668151855469, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 86.01467895507812, "rewards/margins": 172.81790161132812, "rewards/rejected": -86.80320739746094, "step": 8880, "u": -6.616991996765137, "weight": 0.03761402145028114 }, { "diff_generated": -81.07831573486328, "epoch": 2.8807517822423847, "grad_norm": 518.0233566543808, "learning_rate": 3.844412292203092e-09, "logits/chosen": -2.3635849952697754, "logits/rejected": -2.442833662033081, "logps/chosen": -12.299894332885742, "logps/rejected": -162.95193481445312, "loss": 12.2416, "losses_ref": -0.0006439397693611681, "ref_logps/chosen": -94.99177551269531, "ref_logps/rejected": -81.8736343383789, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 82.69188690185547, "rewards/margins": 163.7701873779297, "rewards/rejected": -81.07831573486328, "step": 8890, "u": -6.533448219299316, "weight": 0.050026316195726395 }, { "diff_generated": -85.1846694946289, "epoch": 2.8839922229423203, "grad_norm": 498.04616266719586, "learning_rate": 3.638628686698908e-09, "logits/chosen": -2.3565611839294434, "logits/rejected": -2.4600491523742676, "logps/chosen": -12.45705509185791, "logps/rejected": -177.6852264404297, "loss": 12.9739, "losses_ref": -0.041970349848270416, "ref_logps/chosen": -97.85175323486328, "ref_logps/rejected": -92.50055694580078, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 85.39469909667969, "rewards/margins": 170.57937622070312, "rewards/rejected": -85.1846694946289, "step": 8900, "u": -6.387704372406006, "weight": 0.07687592506408691 }, { "diff_generated": -85.67487335205078, "epoch": 2.8872326636422554, "grad_norm": 526.337540490445, "learning_rate": 3.438480056674864e-09, "logits/chosen": -2.3919167518615723, "logits/rejected": -2.477691173553467, "logps/chosen": -12.685729026794434, "logps/rejected": -175.30699157714844, "loss": 12.8342, "losses_ref": -0.0003273399197496474, "ref_logps/chosen": -101.33018493652344, "ref_logps/rejected": -89.63212585449219, "rewards/accuracies": 0.9375, "rewards/chosen": 88.64444732666016, "rewards/margins": 174.31930541992188, "rewards/rejected": -85.67487335205078, "step": 8910, "u": -6.431562900543213, "weight": 0.06251437962055206 }, { "diff_generated": -89.34169006347656, "epoch": 2.8904731043421905, "grad_norm": 526.3476451399375, "learning_rate": 3.243969247596423e-09, "logits/chosen": -2.37211275100708, "logits/rejected": -2.491511821746826, "logps/chosen": -12.31456470489502, "logps/rejected": -179.0964813232422, "loss": 13.1078, "losses_ref": -0.0004912428557872772, "ref_logps/chosen": -96.56272888183594, "ref_logps/rejected": -89.7547836303711, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 84.24817657470703, "rewards/margins": 173.58987426757812, "rewards/rejected": -89.34169006347656, "step": 8920, "u": -6.722291469573975, "weight": 0.025020426139235497 }, { "diff_generated": -84.24349212646484, "epoch": 2.8937135450421256, "grad_norm": 527.0982706366657, "learning_rate": 3.0550990247776522e-09, "logits/chosen": -2.3859753608703613, "logits/rejected": -2.5176186561584473, "logps/chosen": -11.915032386779785, "logps/rejected": -167.55758666992188, "loss": 12.8392, "losses_ref": -3.0404958550889205e-08, "ref_logps/chosen": -94.97882080078125, "ref_logps/rejected": -83.31407165527344, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 83.06379699707031, "rewards/margins": 167.30728149414062, "rewards/rejected": -84.24349212646484, "step": 8930, "u": -6.539845943450928, "weight": 0.05000000074505806 }, { "diff_generated": -85.3631362915039, "epoch": 2.8969539857420608, "grad_norm": 490.6648318727779, "learning_rate": 2.871872073341608e-09, "logits/chosen": -2.400597095489502, "logits/rejected": -2.5779776573181152, "logps/chosen": -12.452852249145508, "logps/rejected": -173.8939208984375, "loss": 12.8271, "losses_ref": -0.000623942818492651, "ref_logps/chosen": -94.02854919433594, "ref_logps/rejected": -88.53079223632812, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 81.57572174072266, "rewards/margins": 166.9388427734375, "rewards/rejected": -85.3631362915039, "step": 8940, "u": -6.538206577301025, "weight": 0.05002504587173462 }, { "diff_generated": -87.08778381347656, "epoch": 2.9001944264419963, "grad_norm": 520.9679196841599, "learning_rate": 2.694290998182325e-09, "logits/chosen": -2.403104305267334, "logits/rejected": -2.558917999267578, "logps/chosen": -12.943342208862305, "logps/rejected": -174.18057250976562, "loss": 12.9416, "losses_ref": -0.0016796886920928955, "ref_logps/chosen": -98.12152099609375, "ref_logps/rejected": -87.09278106689453, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 85.17818450927734, "rewards/margins": 172.26596069335938, "rewards/rejected": -87.08778381347656, "step": 8950, "u": -6.579586029052734, "weight": 0.04382295534014702 }, { "diff_generated": -86.29190826416016, "epoch": 2.9034348671419314, "grad_norm": 529.2791272283124, "learning_rate": 2.52235832392782e-09, "logits/chosen": -2.38016414642334, "logits/rejected": -2.5228476524353027, "logps/chosen": -12.305742263793945, "logps/rejected": -176.33604431152344, "loss": 13.1021, "losses_ref": -7.620108954142779e-05, "ref_logps/chosen": -95.00885009765625, "ref_logps/rejected": -90.04413604736328, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 82.70310974121094, "rewards/margins": 168.99502563476562, "rewards/rejected": -86.29190826416016, "step": 8960, "u": -6.599948883056641, "weight": 0.037502940744161606 }, { "diff_generated": -87.52955627441406, "epoch": 2.9066753078418666, "grad_norm": 469.7649862031913, "learning_rate": 2.35607649490408e-09, "logits/chosen": -2.4092063903808594, "logits/rejected": -2.477895736694336, "logps/chosen": -13.770861625671387, "logps/rejected": -174.51356506347656, "loss": 13.0702, "losses_ref": -4.5696660322391836e-07, "ref_logps/chosen": -104.1186752319336, "ref_logps/rejected": -86.9840087890625, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 90.34780883789062, "rewards/margins": 177.8773651123047, "rewards/rejected": -87.52955627441406, "step": 8970, "u": -6.810555934906006, "weight": 0.012500005774199963 }, { "diff_generated": -83.99754333496094, "epoch": 2.9099157485418017, "grad_norm": 566.3807216829343, "learning_rate": 2.1954478751003313e-09, "logits/chosen": -2.3780055046081543, "logits/rejected": -2.497126340866089, "logps/chosen": -11.279232025146484, "logps/rejected": -165.64120483398438, "loss": 12.4931, "losses_ref": -0.006799762137234211, "ref_logps/chosen": -94.69685363769531, "ref_logps/rejected": -81.6436538696289, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 83.41761016845703, "rewards/margins": 167.41517639160156, "rewards/rejected": -83.99754333496094, "step": 8980, "u": -6.490334987640381, "weight": 0.05654176324605942 }, { "diff_generated": -84.52422332763672, "epoch": 2.913156189241737, "grad_norm": 472.8819765444885, "learning_rate": 2.040474748135512e-09, "logits/chosen": -2.3666622638702393, "logits/rejected": -2.467114210128784, "logps/chosen": -12.454826354980469, "logps/rejected": -170.0270538330078, "loss": 12.9191, "losses_ref": -0.0022382144816219807, "ref_logps/chosen": -97.16545104980469, "ref_logps/rejected": -85.50281524658203, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 84.71062469482422, "rewards/margins": 169.23487854003906, "rewards/rejected": -84.52422332763672, "step": 8990, "u": -6.418060302734375, "weight": 0.06884618103504181 }, { "diff_generated": -85.54656982421875, "epoch": 2.916396629941672, "grad_norm": 488.54506638633603, "learning_rate": 1.8911593172258544e-09, "logits/chosen": -2.3644773960113525, "logits/rejected": -2.4817874431610107, "logps/chosen": -12.495416641235352, "logps/rejected": -169.99362182617188, "loss": 12.6552, "losses_ref": -1.4663429581673881e-08, "ref_logps/chosen": -94.41749572753906, "ref_logps/rejected": -84.44705200195312, "rewards/accuracies": 0.9375, "rewards/chosen": 81.92208099365234, "rewards/margins": 167.46865844726562, "rewards/rejected": -85.54656982421875, "step": 9000, "u": -6.4284987449646, "weight": 0.0625 }, { "diff_generated": -87.22413635253906, "epoch": 2.919637070641607, "grad_norm": 529.4796688062927, "learning_rate": 1.7475037051532638e-09, "logits/chosen": -2.4001526832580566, "logits/rejected": -2.506021499633789, "logps/chosen": -13.3471097946167, "logps/rejected": -172.06570434570312, "loss": 12.8322, "losses_ref": -0.0021448889747262, "ref_logps/chosen": -95.56883239746094, "ref_logps/rejected": -84.8415756225586, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 82.22171783447266, "rewards/margins": 169.4458465576172, "rewards/rejected": -87.22413635253906, "step": 9010, "u": -6.472896575927734, "weight": 0.05634375661611557 }, { "diff_generated": -89.62074279785156, "epoch": 2.9228775113415426, "grad_norm": 491.15091859051904, "learning_rate": 1.609509954235566e-09, "logits/chosen": -2.3617372512817383, "logits/rejected": -2.571871280670166, "logps/chosen": -12.765652656555176, "logps/rejected": -181.67649841308594, "loss": 12.5575, "losses_ref": -0.004562483634799719, "ref_logps/chosen": -94.67340087890625, "ref_logps/rejected": -92.05574035644531, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 81.90774536132812, "rewards/margins": 171.5284881591797, "rewards/rejected": -89.62074279785156, "step": 9020, "u": -6.612214088439941, "weight": 0.03769397363066673 }, { "diff_generated": -87.36817932128906, "epoch": 2.9261179520414777, "grad_norm": 529.3732264024782, "learning_rate": 1.4771800262970203e-09, "logits/chosen": -2.3470075130462646, "logits/rejected": -2.5225512981414795, "logps/chosen": -13.415380477905273, "logps/rejected": -176.37176513671875, "loss": 12.5222, "losses_ref": -1.5120046725769498e-07, "ref_logps/chosen": -93.91580963134766, "ref_logps/rejected": -89.00359344482422, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 80.50043487548828, "rewards/margins": 167.86862182617188, "rewards/rejected": -87.36817932128906, "step": 9030, "u": -6.55454158782959, "weight": 0.04375000298023224 }, { "diff_generated": -86.6895980834961, "epoch": 2.929358392741413, "grad_norm": 483.88931269328356, "learning_rate": 1.3505158026408724e-09, "logits/chosen": -2.367731809616089, "logits/rejected": -2.493482828140259, "logps/chosen": -14.568082809448242, "logps/rejected": -176.5614013671875, "loss": 12.352, "losses_ref": -0.02299603261053562, "ref_logps/chosen": -97.7130126953125, "ref_logps/rejected": -89.87179565429688, "rewards/accuracies": 0.96875, "rewards/chosen": 83.1449203491211, "rewards/margins": 169.8345184326172, "rewards/rejected": -86.6895980834961, "step": 9040, "u": -6.6297607421875, "weight": 0.032338447868824005 }, { "diff_generated": -87.48365783691406, "epoch": 2.932598833441348, "grad_norm": 509.056014485162, "learning_rate": 1.2295190840223125e-09, "logits/chosen": -2.392026901245117, "logits/rejected": -2.5566885471343994, "logps/chosen": -12.913421630859375, "logps/rejected": -175.8246612548828, "loss": 12.3104, "losses_ref": -0.003945712000131607, "ref_logps/chosen": -98.818603515625, "ref_logps/rejected": -88.34100341796875, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 85.9051742553711, "rewards/margins": 173.38882446289062, "rewards/rejected": -87.48365783691406, "step": 9050, "u": -6.52199649810791, "weight": 0.05018042400479317 }, { "diff_generated": -86.2895736694336, "epoch": 2.935839274141283, "grad_norm": 522.5397264350086, "learning_rate": 1.1141915906228928e-09, "logits/chosen": -2.38533353805542, "logits/rejected": -2.4990665912628174, "logps/chosen": -12.127532958984375, "logps/rejected": -172.2735595703125, "loss": 12.553, "losses_ref": -2.1370703962020343e-07, "ref_logps/chosen": -95.51380920410156, "ref_logps/rejected": -85.9839859008789, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 83.38627624511719, "rewards/margins": 169.67587280273438, "rewards/rejected": -86.2895736694336, "step": 9060, "u": -6.544399261474609, "weight": 0.04375000670552254 }, { "diff_generated": -83.47368621826172, "epoch": 2.9390797148412187, "grad_norm": 563.9845256802446, "learning_rate": 1.0045349620262379e-09, "logits/chosen": -2.3863961696624756, "logits/rejected": -2.5087244510650635, "logps/chosen": -12.521696090698242, "logps/rejected": -165.8682861328125, "loss": 12.6952, "losses_ref": -0.00032701349118724465, "ref_logps/chosen": -96.12919616699219, "ref_logps/rejected": -82.39459991455078, "rewards/accuracies": 0.9375, "rewards/chosen": 83.60749816894531, "rewards/margins": 167.08120727539062, "rewards/rejected": -83.47368621826172, "step": 9070, "u": -6.404754638671875, "weight": 0.06251270323991776 }, { "diff_generated": -84.08647155761719, "epoch": 2.942320155541154, "grad_norm": 497.23346440289765, "learning_rate": 9.005507571945958e-10, "logits/chosen": -2.402247905731201, "logits/rejected": -2.4763851165771484, "logps/chosen": -13.246496200561523, "logps/rejected": -165.32626342773438, "loss": 12.3413, "losses_ref": -2.0963292968190217e-07, "ref_logps/chosen": -95.0720443725586, "ref_logps/rejected": -81.23980712890625, "rewards/accuracies": 0.9375, "rewards/chosen": 81.8255615234375, "rewards/margins": 165.9120330810547, "rewards/rejected": -84.08647155761719, "step": 9080, "u": -6.462907314300537, "weight": 0.0625000074505806 }, { "diff_generated": -85.76387023925781, "epoch": 2.945560596241089, "grad_norm": 492.894108034187, "learning_rate": 8.022404544466788e-10, "logits/chosen": -2.3979382514953613, "logits/rejected": -2.5150272846221924, "logps/chosen": -12.623950004577637, "logps/rejected": -169.61444091796875, "loss": 12.4399, "losses_ref": -0.004426004830747843, "ref_logps/chosen": -95.57967376708984, "ref_logps/rejected": -83.8505630493164, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 82.95572662353516, "rewards/margins": 168.71958923339844, "rewards/rejected": -85.76387023925781, "step": 9090, "u": -6.603558540344238, "weight": 0.04393995180726051 }, { "diff_generated": -82.2959976196289, "epoch": 2.948801036941024, "grad_norm": 537.1206136986993, "learning_rate": 7.096054514367455e-10, "logits/chosen": -2.3304896354675293, "logits/rejected": -2.522718906402588, "logps/chosen": -12.409939765930176, "logps/rejected": -164.3667755126953, "loss": 12.5513, "losses_ref": -0.0012399861589074135, "ref_logps/chosen": -86.47990417480469, "ref_logps/rejected": -82.07076263427734, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 74.06996154785156, "rewards/margins": 156.36595153808594, "rewards/rejected": -82.2959976196289, "step": 9100, "u": -6.275343894958496, "weight": 0.08755262196063995 }, { "diff_generated": -90.0444107055664, "epoch": 2.952041477640959, "grad_norm": 506.99486602906086, "learning_rate": 6.226470651346182e-10, "logits/chosen": -2.366097927093506, "logits/rejected": -2.557908535003662, "logps/chosen": -12.234922409057617, "logps/rejected": -182.9877471923828, "loss": 12.5142, "losses_ref": -1.0447491760601224e-08, "ref_logps/chosen": -92.40087890625, "ref_logps/rejected": -92.94332885742188, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 80.16595458984375, "rewards/margins": 170.21035766601562, "rewards/rejected": -90.0444107055664, "step": 9110, "u": -6.548572540283203, "weight": 0.05000000074505806 }, { "diff_generated": -89.26964569091797, "epoch": 2.9552819183408943, "grad_norm": 501.46049075234794, "learning_rate": 5.413665318070304e-10, "logits/chosen": -2.3693573474884033, "logits/rejected": -2.546677350997925, "logps/chosen": -13.509109497070312, "logps/rejected": -181.3638458251953, "loss": 13.2782, "losses_ref": -7.902246466073848e-07, "ref_logps/chosen": -96.25882720947266, "ref_logps/rejected": -92.09419250488281, "rewards/accuracies": 0.96875, "rewards/chosen": 82.74971771240234, "rewards/margins": 172.0193634033203, "rewards/rejected": -89.26964569091797, "step": 9120, "u": -6.638380527496338, "weight": 0.03125002235174179 }, { "diff_generated": -89.65789794921875, "epoch": 2.9585223590408294, "grad_norm": 535.7757933629651, "learning_rate": 4.657650069999963e-10, "logits/chosen": -2.3974645137786865, "logits/rejected": -2.5350089073181152, "logps/chosen": -12.196057319641113, "logps/rejected": -177.3319854736328, "loss": 12.9091, "losses_ref": -0.014481325633823872, "ref_logps/chosen": -94.6359634399414, "ref_logps/rejected": -87.67405700683594, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 82.43990325927734, "rewards/margins": 172.09780883789062, "rewards/rejected": -89.65789794921875, "step": 9130, "u": -6.52349853515625, "weight": 0.050685059279203415 }, { "diff_generated": -84.9540023803711, "epoch": 2.9617627997407645, "grad_norm": 522.8122611896282, "learning_rate": 3.95843565522469e-10, "logits/chosen": -2.385798215866089, "logits/rejected": -2.4658164978027344, "logps/chosen": -11.826509475708008, "logps/rejected": -171.10751342773438, "loss": 12.475, "losses_ref": -0.0056012957356870174, "ref_logps/chosen": -97.4726333618164, "ref_logps/rejected": -86.15352630615234, "rewards/accuracies": 0.96875, "rewards/chosen": 85.6461181640625, "rewards/margins": 170.6001434326172, "rewards/rejected": -84.9540023803711, "step": 9140, "u": -6.638216495513916, "weight": 0.031489625573158264 }, { "diff_generated": -83.58999633789062, "epoch": 2.9650032404407, "grad_norm": 461.03397205632257, "learning_rate": 3.3160320143097444e-10, "logits/chosen": -2.415177583694458, "logits/rejected": -2.542485237121582, "logps/chosen": -12.850624084472656, "logps/rejected": -167.17276000976562, "loss": 12.8863, "losses_ref": -4.105303560208995e-06, "ref_logps/chosen": -92.3864974975586, "ref_logps/rejected": -83.58274841308594, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 79.53587341308594, "rewards/margins": 163.1258544921875, "rewards/rejected": -83.58999633789062, "step": 9150, "u": -6.271819591522217, "weight": 0.08750005066394806 }, { "diff_generated": -83.55833435058594, "epoch": 2.968243681140635, "grad_norm": 526.4209216536375, "learning_rate": 2.7304482801548957e-10, "logits/chosen": -2.3909294605255127, "logits/rejected": -2.435844898223877, "logps/chosen": -12.864395141601562, "logps/rejected": -165.61740112304688, "loss": 12.5398, "losses_ref": -1.5140069535846123e-06, "ref_logps/chosen": -95.42951965332031, "ref_logps/rejected": -82.05905151367188, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 82.56513214111328, "rewards/margins": 166.12344360351562, "rewards/rejected": -83.55833435058594, "step": 9160, "u": -6.4903082847595215, "weight": 0.05625002458691597 }, { "diff_generated": -86.75840759277344, "epoch": 2.9714841218405703, "grad_norm": 515.474364819348, "learning_rate": 2.201692777865194e-10, "logits/chosen": -2.3582444190979004, "logits/rejected": -2.4979777336120605, "logps/chosen": -11.988899230957031, "logps/rejected": -175.92791748046875, "loss": 12.4569, "losses_ref": -0.0018433972727507353, "ref_logps/chosen": -95.80802917480469, "ref_logps/rejected": -89.16950988769531, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 83.81912994384766, "rewards/margins": 170.57754516601562, "rewards/rejected": -86.75840759277344, "step": 9170, "u": -6.5215277671813965, "weight": 0.05007495731115341 }, { "diff_generated": -84.22948455810547, "epoch": 2.9747245625405054, "grad_norm": 491.027791113081, "learning_rate": 1.729773024631953e-10, "logits/chosen": -2.3653883934020996, "logits/rejected": -2.491422653198242, "logps/chosen": -12.891985893249512, "logps/rejected": -170.76918029785156, "loss": 13.2191, "losses_ref": -0.00504010496661067, "ref_logps/chosen": -97.28108215332031, "ref_logps/rejected": -86.5396728515625, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 84.38908386230469, "rewards/margins": 168.6185760498047, "rewards/rejected": -84.22948455810547, "step": 9180, "u": -6.577776908874512, "weight": 0.04397150129079819 }, { "diff_generated": -82.56714630126953, "epoch": 2.9779650032404406, "grad_norm": 524.5628445911263, "learning_rate": 1.3146957296261696e-10, "logits/chosen": -2.296567916870117, "logits/rejected": -2.5240516662597656, "logps/chosen": -11.981468200683594, "logps/rejected": -168.29286193847656, "loss": 12.8342, "losses_ref": -1.685024031417015e-08, "ref_logps/chosen": -87.14879608154297, "ref_logps/rejected": -85.72572326660156, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 75.16732025146484, "rewards/margins": 157.7344512939453, "rewards/rejected": -82.56714630126953, "step": 9190, "u": -6.213091850280762, "weight": 0.08749999850988388 }, { "diff_generated": -90.73418426513672, "epoch": 2.981205443940376, "grad_norm": 496.7884393889243, "learning_rate": 9.564667939030435e-11, "logits/chosen": -2.4040732383728027, "logits/rejected": -2.5331547260284424, "logps/chosen": -12.428566932678223, "logps/rejected": -181.51348876953125, "loss": 12.9255, "losses_ref": -0.0026974931824952364, "ref_logps/chosen": -95.91474914550781, "ref_logps/rejected": -90.779296875, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 83.4861831665039, "rewards/margins": 174.22036743164062, "rewards/rejected": -90.73418426513672, "step": 9200, "u": -6.768782615661621, "weight": 0.018865812569856644 }, { "diff_generated": -87.49602508544922, "epoch": 2.9844458846403112, "grad_norm": 486.7750746787758, "learning_rate": 6.550913103189337e-11, "logits/chosen": -2.3700268268585205, "logits/rejected": -2.5084478855133057, "logps/chosen": -11.562259674072266, "logps/rejected": -178.05300903320312, "loss": 12.3706, "losses_ref": -0.000528900243807584, "ref_logps/chosen": -93.57711029052734, "ref_logps/rejected": -90.5569839477539, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 82.01486206054688, "rewards/margins": 169.51089477539062, "rewards/rejected": -87.49602508544922, "step": 9210, "u": -6.715609550476074, "weight": 0.018771730363368988 }, { "diff_generated": -80.7569808959961, "epoch": 2.9876863253402464, "grad_norm": 527.7795938159417, "learning_rate": 4.1057356345675085e-11, "logits/chosen": -2.3647098541259766, "logits/rejected": -2.3917651176452637, "logps/chosen": -14.38011646270752, "logps/rejected": -163.92794799804688, "loss": 13.0986, "losses_ref": -0.0022724694572389126, "ref_logps/chosen": -99.43622589111328, "ref_logps/rejected": -83.17097473144531, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 85.05610656738281, "rewards/margins": 165.81307983398438, "rewards/rejected": -80.7569808959961, "step": 9220, "u": -6.606814384460449, "weight": 0.03759920597076416 }, { "diff_generated": -90.11248779296875, "epoch": 2.9909267660401815, "grad_norm": 502.63482025081817, "learning_rate": 2.229170295673377e-11, "logits/chosen": -2.401397228240967, "logits/rejected": -2.52555513381958, "logps/chosen": -12.586407661437988, "logps/rejected": -177.9874725341797, "loss": 12.5562, "losses_ref": -4.895515992586752e-09, "ref_logps/chosen": -97.63871002197266, "ref_logps/rejected": -87.87500762939453, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 85.05230712890625, "rewards/margins": 175.164794921875, "rewards/rejected": -90.11248779296875, "step": 9230, "u": -6.748744964599609, "weight": 0.01875000074505806 }, { "diff_generated": -86.4819564819336, "epoch": 2.9941672067401166, "grad_norm": 497.2515717059789, "learning_rate": 9.212437651973103e-12, "logits/chosen": -2.4102160930633545, "logits/rejected": -2.5013763904571533, "logps/chosen": -13.198896408081055, "logps/rejected": -174.96900939941406, "loss": 12.526, "losses_ref": -0.0031384092289954424, "ref_logps/chosen": -96.73193359375, "ref_logps/rejected": -88.48704528808594, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 83.53303527832031, "rewards/margins": 170.01498413085938, "rewards/rejected": -86.4819564819336, "step": 9240, "u": -6.405519008636475, "weight": 0.06888642907142639 }, { "diff_generated": -87.38936614990234, "epoch": 2.9974076474400517, "grad_norm": 471.9558730646237, "learning_rate": 1.819746376119369e-12, "logits/chosen": -2.3839592933654785, "logits/rejected": -2.4822239875793457, "logps/chosen": -14.210899353027344, "logps/rejected": -170.20504760742188, "loss": 13.092, "losses_ref": -0.0017271274700760841, "ref_logps/chosen": -96.49150848388672, "ref_logps/rejected": -82.81568145751953, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 82.28060913085938, "rewards/margins": 169.66998291015625, "rewards/rejected": -87.38936614990234, "step": 9250, "u": -6.5622968673706055, "weight": 0.050075747072696686 } ], "logging_steps": 10, "max_steps": 9258, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }