diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,19479 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 9258, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "diff_generated": 0.0, + "epoch": 0.0003240440699935191, + "grad_norm": 3559.2297401785586, + "learning_rate": 8.639308855291577e-10, + "logits/chosen": -2.6053388118743896, + "logits/rejected": -2.4319162368774414, + "logps/chosen": -116.55142974853516, + "logps/rejected": -89.49524688720703, + "loss": 10.3352, + "losses_ref": -89.49524688720703, + "ref_logps/chosen": -116.55142974853516, + "ref_logps/rejected": -89.49524688720703, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "u": 0.0, + "weight": 1.0 + }, + { + "diff_generated": 0.017430514097213745, + "epoch": 0.0032404406999351912, + "grad_norm": 3375.614561667246, + "learning_rate": 8.639308855291576e-09, + "logits/chosen": -2.497408390045166, + "logits/rejected": -2.570788860321045, + "logps/chosen": -92.1458511352539, + "logps/rejected": -91.23849487304688, + "loss": 5.6185, + "losses_ref": -91.74820709228516, + "ref_logps/chosen": -92.17684173583984, + "ref_logps/rejected": -91.25592803955078, + "rewards/accuracies": 0.4340277910232544, + "rewards/chosen": 0.030985673889517784, + "rewards/margins": 0.013555158860981464, + "rewards/rejected": 0.017430514097213745, + "step": 10, + "u": 0.025068603456020355, + "weight": 1.0016683340072632 + }, + { + "diff_generated": -0.0066615985706448555, + "epoch": 0.0064808813998703824, + "grad_norm": 3139.7387168039268, + "learning_rate": 1.727861771058315e-08, + "logits/chosen": -2.5308899879455566, + "logits/rejected": -2.5875303745269775, + "logps/chosen": -100.12669372558594, + "logps/rejected": -85.41898345947266, + "loss": 10.963, + "losses_ref": -85.79915618896484, + "ref_logps/chosen": -100.17314910888672, + "ref_logps/rejected": -85.41232299804688, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.04647066444158554, + "rewards/margins": 0.05313226580619812, + "rewards/rejected": -0.0066615985706448555, + "step": 20, + "u": 0.004874364472925663, + "weight": 0.9997771382331848 + }, + { + "diff_generated": -0.044504955410957336, + "epoch": 0.009721322099805573, + "grad_norm": 3411.2943748721727, + "learning_rate": 2.591792656587473e-08, + "logits/chosen": -2.5375819206237793, + "logits/rejected": -2.5686402320861816, + "logps/chosen": -100.92872619628906, + "logps/rejected": -87.86363983154297, + "loss": 12.3682, + "losses_ref": -87.37126159667969, + "ref_logps/chosen": -101.48959350585938, + "ref_logps/rejected": -87.81913757324219, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.5608684420585632, + "rewards/margins": 0.6053733825683594, + "rewards/rejected": -0.044504955410957336, + "step": 30, + "u": -0.02490001730620861, + "weight": 0.9959570169448853 + }, + { + "diff_generated": -0.29012399911880493, + "epoch": 0.012961762799740765, + "grad_norm": 3355.00768506651, + "learning_rate": 3.45572354211663e-08, + "logits/chosen": -2.554452419281006, + "logits/rejected": -2.6106112003326416, + "logps/chosen": -97.12639617919922, + "logps/rejected": -89.53710174560547, + "loss": 10.9751, + "losses_ref": -84.70036315917969, + "ref_logps/chosen": -99.29696655273438, + "ref_logps/rejected": -89.24697875976562, + "rewards/accuracies": 0.9593750238418579, + "rewards/chosen": 2.1705615520477295, + "rewards/margins": 2.4606857299804688, + "rewards/rejected": -0.29012399911880493, + "step": 40, + "u": -0.21681609749794006, + "weight": 0.9673402905464172 + }, + { + "diff_generated": -0.6411628723144531, + "epoch": 0.016202203499675955, + "grad_norm": 3064.562817800374, + "learning_rate": 4.319654427645788e-08, + "logits/chosen": -2.4995243549346924, + "logits/rejected": -2.5654492378234863, + "logps/chosen": -90.70710754394531, + "logps/rejected": -85.55342864990234, + "loss": 14.0442, + "losses_ref": -74.90791320800781, + "ref_logps/chosen": -95.43073272705078, + "ref_logps/rejected": -84.91226959228516, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 4.7236223220825195, + "rewards/margins": 5.364785194396973, + "rewards/rejected": -0.6411628723144531, + "step": 50, + "u": -0.3739597201347351, + "weight": 0.9222582578659058 + }, + { + "diff_generated": -2.4193122386932373, + "epoch": 0.019442644199611146, + "grad_norm": 3050.7408687627135, + "learning_rate": 5.183585313174946e-08, + "logits/chosen": -2.516291856765747, + "logits/rejected": -2.5783753395080566, + "logps/chosen": -81.91847229003906, + "logps/rejected": -85.47832489013672, + "loss": 33.34, + "losses_ref": -47.07170104980469, + "ref_logps/chosen": -94.7326889038086, + "ref_logps/rejected": -83.05900573730469, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 12.81421947479248, + "rewards/margins": 15.23353385925293, + "rewards/rejected": -2.4193122386932373, + "step": 60, + "u": -1.1126874685287476, + "weight": 0.6429678797721863 + }, + { + "diff_generated": -3.986859083175659, + "epoch": 0.02268308489954634, + "grad_norm": 3040.2648567467995, + "learning_rate": 6.047516198704104e-08, + "logits/chosen": -2.4859328269958496, + "logits/rejected": -2.544261932373047, + "logps/chosen": -75.36338806152344, + "logps/rejected": -89.84022521972656, + "loss": 32.419, + "losses_ref": -41.79503631591797, + "ref_logps/chosen": -95.0781021118164, + "ref_logps/rejected": -85.85337829589844, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 19.7147159576416, + "rewards/margins": 23.701576232910156, + "rewards/rejected": -3.986859083175659, + "step": 70, + "u": -1.096972107887268, + "weight": 0.5654774904251099 + }, + { + "diff_generated": -11.147028923034668, + "epoch": 0.02592352559948153, + "grad_norm": 1757.466149188433, + "learning_rate": 6.91144708423326e-08, + "logits/chosen": -2.4527060985565186, + "logits/rejected": -2.5265250205993652, + "logps/chosen": -59.0428466796875, + "logps/rejected": -99.50745391845703, + "loss": 45.6743, + "losses_ref": -15.425445556640625, + "ref_logps/chosen": -94.65631103515625, + "ref_logps/rejected": -88.36042785644531, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 35.61347198486328, + "rewards/margins": 46.760498046875, + "rewards/rejected": -11.147028923034668, + "step": 80, + "u": -1.673259973526001, + "weight": 0.23887920379638672 + }, + { + "diff_generated": -21.959552764892578, + "epoch": 0.02916396629941672, + "grad_norm": 1171.535498795329, + "learning_rate": 7.775377969762419e-08, + "logits/chosen": -2.4958112239837646, + "logits/rejected": -2.4814047813415527, + "logps/chosen": -52.37348556518555, + "logps/rejected": -102.47017669677734, + "loss": 48.9248, + "losses_ref": -2.9107580184936523, + "ref_logps/chosen": -96.49531555175781, + "ref_logps/rejected": -80.51063537597656, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 44.121826171875, + "rewards/margins": 66.08137512207031, + "rewards/rejected": -21.959552764892578, + "step": 90, + "u": -2.1358799934387207, + "weight": 0.07833331823348999 + }, + { + "diff_generated": -26.378650665283203, + "epoch": 0.03240440699935191, + "grad_norm": 981.3254617740091, + "learning_rate": 8.639308855291576e-08, + "logits/chosen": -2.505199432373047, + "logits/rejected": -2.507596731185913, + "logps/chosen": -53.363304138183594, + "logps/rejected": -113.31050872802734, + "loss": 48.5492, + "losses_ref": -0.968902587890625, + "ref_logps/chosen": -100.1287612915039, + "ref_logps/rejected": -86.93186950683594, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 46.76546096801758, + "rewards/margins": 73.14411163330078, + "rewards/rejected": -26.378650665283203, + "step": 100, + "u": -2.16186261177063, + "weight": 0.06941097974777222 + }, + { + "diff_generated": -26.51546859741211, + "epoch": 0.0356448476992871, + "grad_norm": 840.9009121084388, + "learning_rate": 9.503239740820734e-08, + "logits/chosen": -2.4892473220825195, + "logits/rejected": -2.5445773601531982, + "logps/chosen": -46.26842498779297, + "logps/rejected": -111.51081848144531, + "loss": 42.2979, + "losses_ref": -1.3088524341583252, + "ref_logps/chosen": -96.73486328125, + "ref_logps/rejected": -84.99533081054688, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 50.46643829345703, + "rewards/margins": 76.98190307617188, + "rewards/rejected": -26.51546859741211, + "step": 110, + "u": -2.185539722442627, + "weight": 0.05797583609819412 + }, + { + "diff_generated": -30.8448486328125, + "epoch": 0.03888528839922229, + "grad_norm": 815.4716872744215, + "learning_rate": 1.0367170626349892e-07, + "logits/chosen": -2.4767396450042725, + "logits/rejected": -2.6058273315429688, + "logps/chosen": -40.43291473388672, + "logps/rejected": -119.78251647949219, + "loss": 41.4793, + "losses_ref": -0.2510392963886261, + "ref_logps/chosen": -94.6415786743164, + "ref_logps/rejected": -88.93766021728516, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 54.208656311035156, + "rewards/margins": 85.05350494384766, + "rewards/rejected": -30.8448486328125, + "step": 120, + "u": -2.2387325763702393, + "weight": 0.030215347185730934 + }, + { + "diff_generated": -30.562610626220703, + "epoch": 0.04212572909915749, + "grad_norm": 819.4309847490115, + "learning_rate": 1.1231101511879049e-07, + "logits/chosen": -2.430781126022339, + "logits/rejected": -2.5365960597991943, + "logps/chosen": -39.41309356689453, + "logps/rejected": -114.13203430175781, + "loss": 39.7267, + "losses_ref": -0.43195924162864685, + "ref_logps/chosen": -90.4994888305664, + "ref_logps/rejected": -83.56944274902344, + "rewards/accuracies": 0.9375, + "rewards/chosen": 51.086395263671875, + "rewards/margins": 81.64900970458984, + "rewards/rejected": -30.562610626220703, + "step": 130, + "u": -2.133713483810425, + "weight": 0.07522068917751312 + }, + { + "diff_generated": -31.19342041015625, + "epoch": 0.04536616979909268, + "grad_norm": 811.2637523209452, + "learning_rate": 1.2095032397408208e-07, + "logits/chosen": -2.4572250843048096, + "logits/rejected": -2.5759568214416504, + "logps/chosen": -39.12738800048828, + "logps/rejected": -124.8710708618164, + "loss": 39.4256, + "losses_ref": -0.022554311901330948, + "ref_logps/chosen": -92.23231506347656, + "ref_logps/rejected": -93.67765808105469, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 53.10492706298828, + "rewards/margins": 84.29834747314453, + "rewards/rejected": -31.19342041015625, + "step": 140, + "u": -2.2014377117156982, + "weight": 0.04419802874326706 + }, + { + "diff_generated": -33.59473419189453, + "epoch": 0.04860661049902787, + "grad_norm": 860.0384740581414, + "learning_rate": 1.2958963282937366e-07, + "logits/chosen": -2.506410837173462, + "logits/rejected": -2.5207464694976807, + "logps/chosen": -40.406578063964844, + "logps/rejected": -118.5238037109375, + "loss": 39.5802, + "losses_ref": -0.040201567113399506, + "ref_logps/chosen": -98.3244400024414, + "ref_logps/rejected": -84.9290771484375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 57.9178581237793, + "rewards/margins": 91.5125961303711, + "rewards/rejected": -33.59473419189453, + "step": 150, + "u": -2.156111478805542, + "weight": 0.0648484081029892 + }, + { + "diff_generated": -35.42806625366211, + "epoch": 0.05184705119896306, + "grad_norm": 862.2311061479475, + "learning_rate": 1.382289416846652e-07, + "logits/chosen": -2.4801454544067383, + "logits/rejected": -2.5408148765563965, + "logps/chosen": -41.032039642333984, + "logps/rejected": -123.56022644042969, + "loss": 37.5874, + "losses_ref": -0.961107075214386, + "ref_logps/chosen": -99.9718246459961, + "ref_logps/rejected": -88.13216400146484, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 58.939788818359375, + "rewards/margins": 94.36785125732422, + "rewards/rejected": -35.42806625366211, + "step": 160, + "u": -2.1681969165802, + "weight": 0.04808913171291351 + }, + { + "diff_generated": -34.35420227050781, + "epoch": 0.05508749189889825, + "grad_norm": 770.1748867796118, + "learning_rate": 1.468682505399568e-07, + "logits/chosen": -2.4965875148773193, + "logits/rejected": -2.5947012901306152, + "logps/chosen": -34.472076416015625, + "logps/rejected": -120.37129974365234, + "loss": 36.0503, + "losses_ref": -0.019783183932304382, + "ref_logps/chosen": -94.23568725585938, + "ref_logps/rejected": -86.01708984375, + "rewards/accuracies": 0.96875, + "rewards/chosen": 59.76360321044922, + "rewards/margins": 94.11781311035156, + "rewards/rejected": -34.35420227050781, + "step": 170, + "u": -2.2295360565185547, + "weight": 0.032412897795438766 + }, + { + "diff_generated": -35.52586364746094, + "epoch": 0.05832793259883344, + "grad_norm": 730.0749859427049, + "learning_rate": 1.5550755939524837e-07, + "logits/chosen": -2.4865972995758057, + "logits/rejected": -2.5497257709503174, + "logps/chosen": -36.68155288696289, + "logps/rejected": -120.2018051147461, + "loss": 34.2722, + "losses_ref": -0.09937143325805664, + "ref_logps/chosen": -95.27645111083984, + "ref_logps/rejected": -84.67594909667969, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 58.59489822387695, + "rewards/margins": 94.12075805664062, + "rewards/rejected": -35.52586364746094, + "step": 180, + "u": -2.172288417816162, + "weight": 0.05707244947552681 + }, + { + "diff_generated": -36.80834197998047, + "epoch": 0.06156837329876863, + "grad_norm": 865.7529357983891, + "learning_rate": 1.6414686825053995e-07, + "logits/chosen": -2.4904823303222656, + "logits/rejected": -2.515784740447998, + "logps/chosen": -38.33757019042969, + "logps/rejected": -126.2683334350586, + "loss": 34.8775, + "losses_ref": -0.006524696946144104, + "ref_logps/chosen": -101.89496612548828, + "ref_logps/rejected": -89.45999908447266, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 63.55739212036133, + "rewards/margins": 100.36573791503906, + "rewards/rejected": -36.80834197998047, + "step": 190, + "u": -2.2591099739074707, + "weight": 0.019078662618994713 + }, + { + "diff_generated": -35.73137664794922, + "epoch": 0.06480881399870382, + "grad_norm": 800.9108334177273, + "learning_rate": 1.7278617710583153e-07, + "logits/chosen": -2.4334239959716797, + "logits/rejected": -2.5226898193359375, + "logps/chosen": -32.73336410522461, + "logps/rejected": -121.62447357177734, + "loss": 33.73, + "losses_ref": -0.04314727336168289, + "ref_logps/chosen": -86.64543914794922, + "ref_logps/rejected": -85.8930892944336, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 53.912071228027344, + "rewards/margins": 89.6434555053711, + "rewards/rejected": -35.73137664794922, + "step": 200, + "u": -2.1284313201904297, + "weight": 0.07653830945491791 + }, + { + "diff_generated": -37.98542404174805, + "epoch": 0.06804925469863901, + "grad_norm": 927.4241431650823, + "learning_rate": 1.814254859611231e-07, + "logits/chosen": -2.4798386096954346, + "logits/rejected": -2.5692715644836426, + "logps/chosen": -35.32215881347656, + "logps/rejected": -126.21580505371094, + "loss": 34.3679, + "losses_ref": -0.030094826593995094, + "ref_logps/chosen": -95.78032684326172, + "ref_logps/rejected": -88.23038482666016, + "rewards/accuracies": 0.96875, + "rewards/chosen": 60.45817947387695, + "rewards/margins": 98.44361114501953, + "rewards/rejected": -37.98542404174805, + "step": 210, + "u": -2.230320453643799, + "weight": 0.03158506378531456 + }, + { + "diff_generated": -38.94337844848633, + "epoch": 0.0712896953985742, + "grad_norm": 797.8259592178767, + "learning_rate": 1.900647948164147e-07, + "logits/chosen": -2.4474635124206543, + "logits/rejected": -2.5280601978302, + "logps/chosen": -35.149330139160156, + "logps/rejected": -120.66595458984375, + "loss": 33.9716, + "losses_ref": -0.02789616584777832, + "ref_logps/chosen": -97.20127868652344, + "ref_logps/rejected": -81.72257995605469, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 62.05195236206055, + "rewards/margins": 100.9953384399414, + "rewards/rejected": -38.94337844848633, + "step": 220, + "u": -2.171726942062378, + "weight": 0.057657964527606964 + }, + { + "diff_generated": -38.69519805908203, + "epoch": 0.07453013609850939, + "grad_norm": 869.5453620671002, + "learning_rate": 1.9870410367170624e-07, + "logits/chosen": -2.4745936393737793, + "logits/rejected": -2.494105815887451, + "logps/chosen": -33.155364990234375, + "logps/rejected": -122.06390380859375, + "loss": 34.0393, + "losses_ref": -0.24938344955444336, + "ref_logps/chosen": -92.7715835571289, + "ref_logps/rejected": -83.36869812011719, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 59.61621856689453, + "rewards/margins": 98.31141662597656, + "rewards/rejected": -38.69519805908203, + "step": 230, + "u": -2.1387956142425537, + "weight": 0.07319749146699905 + }, + { + "diff_generated": -41.04708480834961, + "epoch": 0.07777057679844458, + "grad_norm": 817.7110572483012, + "learning_rate": 2.0734341252699785e-07, + "logits/chosen": -2.4703361988067627, + "logits/rejected": -2.5415077209472656, + "logps/chosen": -31.33148765563965, + "logps/rejected": -127.82364654541016, + "loss": 32.6819, + "losses_ref": -0.13492469489574432, + "ref_logps/chosen": -92.26710510253906, + "ref_logps/rejected": -86.77656555175781, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 60.93561553955078, + "rewards/margins": 101.98270416259766, + "rewards/rejected": -41.04708480834961, + "step": 240, + "u": -2.172236442565918, + "weight": 0.05711379647254944 + }, + { + "diff_generated": -41.528465270996094, + "epoch": 0.08101101749837979, + "grad_norm": 779.7448937764428, + "learning_rate": 2.159827213822894e-07, + "logits/chosen": -2.4615213871002197, + "logits/rejected": -2.578935384750366, + "logps/chosen": -33.79179000854492, + "logps/rejected": -130.5545196533203, + "loss": 33.9777, + "losses_ref": -0.02141922526061535, + "ref_logps/chosen": -94.50299835205078, + "ref_logps/rejected": -89.02606201171875, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 60.711204528808594, + "rewards/margins": 102.23966217041016, + "rewards/rejected": -41.528465270996094, + "step": 250, + "u": -2.1865832805633545, + "weight": 0.05091395229101181 + }, + { + "diff_generated": -40.10847091674805, + "epoch": 0.08425145819831498, + "grad_norm": 808.1011773378456, + "learning_rate": 2.2462203023758098e-07, + "logits/chosen": -2.4640259742736816, + "logits/rejected": -2.5208182334899902, + "logps/chosen": -35.152008056640625, + "logps/rejected": -128.22848510742188, + "loss": 33.6627, + "losses_ref": -0.0065854983404278755, + "ref_logps/chosen": -96.67770385742188, + "ref_logps/rejected": -88.1200180053711, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 61.52568817138672, + "rewards/margins": 101.6341552734375, + "rewards/rejected": -40.10847091674805, + "step": 260, + "u": -2.1871533393859863, + "weight": 0.05033022165298462 + }, + { + "diff_generated": -41.47755813598633, + "epoch": 0.08749189889825017, + "grad_norm": 824.21173391217, + "learning_rate": 2.3326133909287256e-07, + "logits/chosen": -2.46863055229187, + "logits/rejected": -2.5370888710021973, + "logps/chosen": -32.25581741333008, + "logps/rejected": -126.69731140136719, + "loss": 33.1363, + "losses_ref": -0.0064047775231301785, + "ref_logps/chosen": -94.9454116821289, + "ref_logps/rejected": -85.21976470947266, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 62.6895866394043, + "rewards/margins": 104.16715240478516, + "rewards/rejected": -41.47755813598633, + "step": 270, + "u": -2.1151492595672607, + "weight": 0.08162893354892731 + }, + { + "diff_generated": -40.3583984375, + "epoch": 0.09073233959818536, + "grad_norm": 714.5025449894256, + "learning_rate": 2.4190064794816416e-07, + "logits/chosen": -2.456831693649292, + "logits/rejected": -2.5513622760772705, + "logps/chosen": -31.738773345947266, + "logps/rejected": -126.47242736816406, + "loss": 32.5606, + "losses_ref": -0.08480539917945862, + "ref_logps/chosen": -94.12120056152344, + "ref_logps/rejected": -86.11402130126953, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 62.38242721557617, + "rewards/margins": 102.74082946777344, + "rewards/rejected": -40.3583984375, + "step": 280, + "u": -2.1713805198669434, + "weight": 0.05789435654878616 + }, + { + "diff_generated": -41.735252380371094, + "epoch": 0.09397278029812055, + "grad_norm": 801.2272509358011, + "learning_rate": 2.505399568034557e-07, + "logits/chosen": -2.4742555618286133, + "logits/rejected": -2.5228075981140137, + "logps/chosen": -34.12358474731445, + "logps/rejected": -126.54386138916016, + "loss": 32.6423, + "losses_ref": -0.2615818977355957, + "ref_logps/chosen": -98.3595199584961, + "ref_logps/rejected": -84.80860900878906, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 64.2359390258789, + "rewards/margins": 105.97119140625, + "rewards/rejected": -41.735252380371094, + "step": 290, + "u": -2.139753818511963, + "weight": 0.07259203493595123 + }, + { + "diff_generated": -46.503387451171875, + "epoch": 0.09721322099805574, + "grad_norm": 781.8410441358157, + "learning_rate": 2.591792656587473e-07, + "logits/chosen": -2.496601104736328, + "logits/rejected": -2.5690560340881348, + "logps/chosen": -29.659671783447266, + "logps/rejected": -131.7477264404297, + "loss": 31.4824, + "losses_ref": -0.0010774282272905111, + "ref_logps/chosen": -95.02106475830078, + "ref_logps/rejected": -85.24435424804688, + "rewards/accuracies": 0.96875, + "rewards/chosen": 65.36140441894531, + "rewards/margins": 111.86478424072266, + "rewards/rejected": -46.503387451171875, + "step": 300, + "u": -2.230616807937622, + "weight": 0.031264010816812515 + }, + { + "diff_generated": -44.111000061035156, + "epoch": 0.10045366169799093, + "grad_norm": 760.9281935094895, + "learning_rate": 2.6781857451403887e-07, + "logits/chosen": -2.4643948078155518, + "logits/rejected": -2.565307378768921, + "logps/chosen": -29.657928466796875, + "logps/rejected": -130.83432006835938, + "loss": 30.1822, + "losses_ref": -0.0012849947670474648, + "ref_logps/chosen": -91.39910125732422, + "ref_logps/rejected": -86.72331237792969, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 61.741172790527344, + "rewards/margins": 105.8521728515625, + "rewards/rejected": -44.111000061035156, + "step": 310, + "u": -2.201822280883789, + "weight": 0.043777596205472946 + }, + { + "diff_generated": -40.68602752685547, + "epoch": 0.10369410239792612, + "grad_norm": 762.7683248772289, + "learning_rate": 2.764578833693304e-07, + "logits/chosen": -2.425774574279785, + "logits/rejected": -2.4849331378936768, + "logps/chosen": -30.319400787353516, + "logps/rejected": -122.18917083740234, + "loss": 31.0654, + "losses_ref": -0.008473332040011883, + "ref_logps/chosen": -90.56777954101562, + "ref_logps/rejected": -81.5031509399414, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 60.248374938964844, + "rewards/margins": 100.93440246582031, + "rewards/rejected": -40.68602752685547, + "step": 320, + "u": -2.115234375, + "weight": 0.08154076337814331 + }, + { + "diff_generated": -44.732337951660156, + "epoch": 0.10693454309786131, + "grad_norm": 798.216714500383, + "learning_rate": 2.8509719222462203e-07, + "logits/chosen": -2.4940147399902344, + "logits/rejected": -2.6184709072113037, + "logps/chosen": -31.332714080810547, + "logps/rejected": -135.71978759765625, + "loss": 30.2959, + "losses_ref": -2.9487182473530993e-05, + "ref_logps/chosen": -94.46064758300781, + "ref_logps/rejected": -90.98744201660156, + "rewards/accuracies": 0.9375, + "rewards/chosen": 63.1279411315918, + "rewards/margins": 107.86029052734375, + "rewards/rejected": -44.732337951660156, + "step": 330, + "u": -2.15867280960083, + "weight": 0.06250102818012238 + }, + { + "diff_generated": -44.797607421875, + "epoch": 0.1101749837977965, + "grad_norm": 777.6758192243882, + "learning_rate": 2.937365010799136e-07, + "logits/chosen": -2.4762284755706787, + "logits/rejected": -2.558218002319336, + "logps/chosen": -30.14864158630371, + "logps/rejected": -131.7941131591797, + "loss": 30.8224, + "losses_ref": -0.09419278800487518, + "ref_logps/chosen": -93.49372100830078, + "ref_logps/rejected": -86.99652862548828, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 63.345069885253906, + "rewards/margins": 108.14266204833984, + "rewards/rejected": -44.797607421875, + "step": 340, + "u": -2.2006499767303467, + "weight": 0.04497247934341431 + }, + { + "diff_generated": -44.76807403564453, + "epoch": 0.11341542449773169, + "grad_norm": 753.8383918643427, + "learning_rate": 3.023758099352052e-07, + "logits/chosen": -2.457641124725342, + "logits/rejected": -2.528771162033081, + "logps/chosen": -32.162479400634766, + "logps/rejected": -128.0952606201172, + "loss": 30.7947, + "losses_ref": -0.005067890044301748, + "ref_logps/chosen": -98.45516204833984, + "ref_logps/rejected": -83.32720184326172, + "rewards/accuracies": 0.96875, + "rewards/chosen": 66.29267883300781, + "rewards/margins": 111.06075286865234, + "rewards/rejected": -44.76807403564453, + "step": 350, + "u": -2.230400323867798, + "weight": 0.03150248900055885 + }, + { + "diff_generated": -44.002403259277344, + "epoch": 0.11665586519766688, + "grad_norm": 745.0619513300946, + "learning_rate": 3.1101511879049674e-07, + "logits/chosen": -2.4722886085510254, + "logits/rejected": -2.5328807830810547, + "logps/chosen": -30.5725154876709, + "logps/rejected": -128.28097534179688, + "loss": 31.3639, + "losses_ref": -0.00848553515970707, + "ref_logps/chosen": -96.97710418701172, + "ref_logps/rejected": -84.27857971191406, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 66.40459442138672, + "rewards/margins": 110.40699768066406, + "rewards/rejected": -44.002403259277344, + "step": 360, + "u": -2.1440012454986572, + "weight": 0.06905999779701233 + }, + { + "diff_generated": -45.63245391845703, + "epoch": 0.11989630589760207, + "grad_norm": 721.4188041218672, + "learning_rate": 3.1965442764578835e-07, + "logits/chosen": -2.5433833599090576, + "logits/rejected": -2.5798544883728027, + "logps/chosen": -29.05537986755371, + "logps/rejected": -132.5902099609375, + "loss": 29.871, + "losses_ref": -0.00014184534666128457, + "ref_logps/chosen": -99.68949890136719, + "ref_logps/rejected": -86.9577407836914, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 70.63412475585938, + "rewards/margins": 116.26658630371094, + "rewards/rejected": -45.63245391845703, + "step": 370, + "u": -2.216233015060425, + "weight": 0.03750551864504814 + }, + { + "diff_generated": -46.33927917480469, + "epoch": 0.12313674659753726, + "grad_norm": 723.4759624818365, + "learning_rate": 3.282937365010799e-07, + "logits/chosen": -2.501493215560913, + "logits/rejected": -2.5781912803649902, + "logps/chosen": -30.456531524658203, + "logps/rejected": -137.51571655273438, + "loss": 30.3151, + "losses_ref": -0.033796604722738266, + "ref_logps/chosen": -94.95821380615234, + "ref_logps/rejected": -91.17644500732422, + "rewards/accuracies": 0.9375, + "rewards/chosen": 64.5016860961914, + "rewards/margins": 110.84095764160156, + "rewards/rejected": -46.33927917480469, + "step": 380, + "u": -2.158191442489624, + "weight": 0.06301557272672653 + }, + { + "diff_generated": -46.33161163330078, + "epoch": 0.12637718729747247, + "grad_norm": 804.2361358345585, + "learning_rate": 3.3693304535637145e-07, + "logits/chosen": -2.5290114879608154, + "logits/rejected": -2.6311047077178955, + "logps/chosen": -32.53776931762695, + "logps/rejected": -134.83871459960938, + "loss": 30.3089, + "losses_ref": -0.004766993690282106, + "ref_logps/chosen": -99.89033508300781, + "ref_logps/rejected": -88.50709533691406, + "rewards/accuracies": 0.96875, + "rewards/chosen": 67.3525619506836, + "rewards/margins": 113.68416595458984, + "rewards/rejected": -46.33161163330078, + "step": 390, + "u": -2.2304444313049316, + "weight": 0.031454164534807205 + }, + { + "diff_generated": -46.13669204711914, + "epoch": 0.12961762799740764, + "grad_norm": 713.9089688148474, + "learning_rate": 3.4557235421166306e-07, + "logits/chosen": -2.4912726879119873, + "logits/rejected": -2.5743274688720703, + "logps/chosen": -30.20980453491211, + "logps/rejected": -133.94284057617188, + "loss": 28.4143, + "losses_ref": -0.00325656752102077, + "ref_logps/chosen": -95.0901870727539, + "ref_logps/rejected": -87.80616760253906, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 64.88038635253906, + "rewards/margins": 111.0170669555664, + "rewards/rejected": -46.13669204711914, + "step": 400, + "u": -2.201824903488159, + "weight": 0.0437745526432991 + }, + { + "diff_generated": -45.739585876464844, + "epoch": 0.13285806869734285, + "grad_norm": 708.8169001595913, + "learning_rate": 3.542116630669546e-07, + "logits/chosen": -2.4827613830566406, + "logits/rejected": -2.5828845500946045, + "logps/chosen": -27.490942001342773, + "logps/rejected": -135.59054565429688, + "loss": 28.7896, + "losses_ref": -0.004578437190502882, + "ref_logps/chosen": -93.90925598144531, + "ref_logps/rejected": -89.85096740722656, + "rewards/accuracies": 0.96875, + "rewards/chosen": 66.41831970214844, + "rewards/margins": 112.15791320800781, + "rewards/rejected": -45.739585876464844, + "step": 410, + "u": -2.2303926944732666, + "weight": 0.03151000663638115 + }, + { + "diff_generated": -44.8867301940918, + "epoch": 0.13609850939727802, + "grad_norm": 729.0360218701636, + "learning_rate": 3.628509719222462e-07, + "logits/chosen": -2.537703275680542, + "logits/rejected": -2.633138656616211, + "logps/chosen": -29.31488037109375, + "logps/rejected": -132.73167419433594, + "loss": 29.2031, + "losses_ref": -0.008955566212534904, + "ref_logps/chosen": -99.2090072631836, + "ref_logps/rejected": -87.84493255615234, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 69.89412689208984, + "rewards/margins": 114.7808609008789, + "rewards/rejected": -44.8867301940918, + "step": 420, + "u": -2.2448973655700684, + "weight": 0.025136280804872513 + }, + { + "diff_generated": -44.21284103393555, + "epoch": 0.13933895009721323, + "grad_norm": 708.8181357717095, + "learning_rate": 3.7149028077753777e-07, + "logits/chosen": -2.455392360687256, + "logits/rejected": -2.5685718059539795, + "logps/chosen": -25.873310089111328, + "logps/rejected": -123.27958679199219, + "loss": 29.4561, + "losses_ref": -0.0032315519638359547, + "ref_logps/chosen": -88.95941162109375, + "ref_logps/rejected": -79.06675720214844, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": 63.08610916137695, + "rewards/margins": 107.2989501953125, + "rewards/rejected": -44.21284103393555, + "step": 430, + "u": -2.0578160285949707, + "weight": 0.10638221353292465 + }, + { + "diff_generated": -44.5488395690918, + "epoch": 0.1425793907971484, + "grad_norm": 758.7274720374536, + "learning_rate": 3.801295896328294e-07, + "logits/chosen": -2.4996840953826904, + "logits/rejected": -2.5589592456817627, + "logps/chosen": -28.99948501586914, + "logps/rejected": -127.78646087646484, + "loss": 29.7841, + "losses_ref": -0.19051943719387054, + "ref_logps/chosen": -95.38642883300781, + "ref_logps/rejected": -83.23762512207031, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 66.3869400024414, + "rewards/margins": 110.935791015625, + "rewards/rejected": -44.5488395690918, + "step": 440, + "u": -2.1401772499084473, + "weight": 0.07216720283031464 + }, + { + "diff_generated": -43.60665512084961, + "epoch": 0.1458198314970836, + "grad_norm": 785.0425370892705, + "learning_rate": 3.887688984881209e-07, + "logits/chosen": -2.51354718208313, + "logits/rejected": -2.557900905609131, + "logps/chosen": -27.42291831970215, + "logps/rejected": -127.36767578125, + "loss": 28.9717, + "losses_ref": -0.005422559566795826, + "ref_logps/chosen": -92.18173217773438, + "ref_logps/rejected": -83.76101684570312, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 64.75880432128906, + "rewards/margins": 108.36546325683594, + "rewards/rejected": -43.60665512084961, + "step": 450, + "u": -2.115337371826172, + "weight": 0.08142946660518646 + }, + { + "diff_generated": -44.98752975463867, + "epoch": 0.14906027219701878, + "grad_norm": 761.1618028676306, + "learning_rate": 3.974082073434125e-07, + "logits/chosen": -2.5288493633270264, + "logits/rejected": -2.5853981971740723, + "logps/chosen": -30.04689598083496, + "logps/rejected": -128.4542694091797, + "loss": 29.1264, + "losses_ref": -0.0021941731683909893, + "ref_logps/chosen": -97.36630249023438, + "ref_logps/rejected": -83.46673583984375, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 67.31941223144531, + "rewards/margins": 112.30694580078125, + "rewards/rejected": -44.98752975463867, + "step": 460, + "u": -2.1729543209075928, + "weight": 0.05637165904045105 + }, + { + "diff_generated": -46.988868713378906, + "epoch": 0.152300712896954, + "grad_norm": 736.8358893108898, + "learning_rate": 4.060475161987041e-07, + "logits/chosen": -2.527107000350952, + "logits/rejected": -2.6046130657196045, + "logps/chosen": -30.838703155517578, + "logps/rejected": -138.16522216796875, + "loss": 29.1809, + "losses_ref": -0.00042680688784457743, + "ref_logps/chosen": -100.99031066894531, + "ref_logps/rejected": -91.17635345458984, + "rewards/accuracies": 0.96875, + "rewards/chosen": 70.151611328125, + "rewards/margins": 117.14048767089844, + "rewards/rejected": -46.988868713378906, + "step": 470, + "u": -2.2306249141693115, + "weight": 0.03125474974513054 + }, + { + "diff_generated": -47.64769744873047, + "epoch": 0.15554115359688916, + "grad_norm": 689.8783850837162, + "learning_rate": 4.146868250539957e-07, + "logits/chosen": -2.522904634475708, + "logits/rejected": -2.5496037006378174, + "logps/chosen": -26.92234230041504, + "logps/rejected": -130.68350219726562, + "loss": 28.5857, + "losses_ref": -0.016092773526906967, + "ref_logps/chosen": -95.0140380859375, + "ref_logps/rejected": -83.03581237792969, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 68.09169006347656, + "rewards/margins": 115.73939514160156, + "rewards/rejected": -47.64769744873047, + "step": 480, + "u": -2.2014780044555664, + "weight": 0.044157128781080246 + }, + { + "diff_generated": -45.847373962402344, + "epoch": 0.15878159429682437, + "grad_norm": 742.9073843335941, + "learning_rate": 4.2332613390928724e-07, + "logits/chosen": -2.505765438079834, + "logits/rejected": -2.5322728157043457, + "logps/chosen": -29.727895736694336, + "logps/rejected": -128.55931091308594, + "loss": 29.5698, + "losses_ref": -0.023366082459688187, + "ref_logps/chosen": -99.20357513427734, + "ref_logps/rejected": -82.7119369506836, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 69.4756851196289, + "rewards/margins": 115.32305908203125, + "rewards/rejected": -45.847373962402344, + "step": 490, + "u": -2.172699451446533, + "weight": 0.05664912611246109 + }, + { + "diff_generated": -47.095062255859375, + "epoch": 0.16202203499675957, + "grad_norm": 645.9278684155573, + "learning_rate": 4.319654427645788e-07, + "logits/chosen": -2.4909958839416504, + "logits/rejected": -2.604588270187378, + "logps/chosen": -26.1655216217041, + "logps/rejected": -130.22900390625, + "loss": 28.1373, + "losses_ref": -0.0002788856509141624, + "ref_logps/chosen": -93.60696411132812, + "ref_logps/rejected": -83.1339340209961, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 67.44145202636719, + "rewards/margins": 114.53651428222656, + "rewards/rejected": -47.095062255859375, + "step": 500, + "u": -2.1442744731903076, + "weight": 0.06875874847173691 + }, + { + "diff_generated": -45.97614288330078, + "epoch": 0.16526247569669475, + "grad_norm": 671.3594525863219, + "learning_rate": 4.406047516198704e-07, + "logits/chosen": -2.4338736534118652, + "logits/rejected": -2.586536169052124, + "logps/chosen": -25.121789932250977, + "logps/rejected": -125.8269271850586, + "loss": 27.2431, + "losses_ref": -0.004285829141736031, + "ref_logps/chosen": -84.57170104980469, + "ref_logps/rejected": -79.85078430175781, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 59.44990158081055, + "rewards/margins": 105.4260482788086, + "rewards/rejected": -45.97614288330078, + "step": 510, + "u": -2.115294933319092, + "weight": 0.08147425949573517 + }, + { + "diff_generated": -47.68573760986328, + "epoch": 0.16850291639662995, + "grad_norm": 727.2153684375536, + "learning_rate": 4.4924406047516195e-07, + "logits/chosen": -2.538134813308716, + "logits/rejected": -2.5833652019500732, + "logps/chosen": -30.263620376586914, + "logps/rejected": -137.04959106445312, + "loss": 27.8138, + "losses_ref": -0.00044774659909307957, + "ref_logps/chosen": -102.84830474853516, + "ref_logps/rejected": -89.36383819580078, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 72.58468627929688, + "rewards/margins": 120.27042388916016, + "rewards/rejected": -47.68573760986328, + "step": 520, + "u": -2.201833724975586, + "weight": 0.04376474767923355 + }, + { + "diff_generated": -45.31121063232422, + "epoch": 0.17174335709656513, + "grad_norm": 688.1006559007947, + "learning_rate": 4.5788336933045356e-07, + "logits/chosen": -2.5154356956481934, + "logits/rejected": -2.595595598220825, + "logps/chosen": -26.32985496520996, + "logps/rejected": -128.65464782714844, + "loss": 26.9283, + "losses_ref": -0.023449674248695374, + "ref_logps/chosen": -92.49571228027344, + "ref_logps/rejected": -83.34344482421875, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 66.16585540771484, + "rewards/margins": 111.4770736694336, + "rewards/rejected": -45.31121063232422, + "step": 530, + "u": -2.1150999069213867, + "weight": 0.08168499171733856 + }, + { + "diff_generated": -45.9770622253418, + "epoch": 0.17498379779650033, + "grad_norm": 661.6627036065419, + "learning_rate": 4.665226781857451e-07, + "logits/chosen": -2.495020627975464, + "logits/rejected": -2.6251144409179688, + "logps/chosen": -27.82745361328125, + "logps/rejected": -132.03318786621094, + "loss": 27.5039, + "losses_ref": -0.3354525566101074, + "ref_logps/chosen": -93.02908325195312, + "ref_logps/rejected": -86.0561294555664, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 65.20162200927734, + "rewards/margins": 111.1786880493164, + "rewards/rejected": -45.9770622253418, + "step": 540, + "u": -2.1621901988983154, + "weight": 0.06197571009397507 + }, + { + "diff_generated": -43.12482452392578, + "epoch": 0.1782242384964355, + "grad_norm": 722.3393998366524, + "learning_rate": 4.751619870410367e-07, + "logits/chosen": -2.4731545448303223, + "logits/rejected": -2.4869635105133057, + "logps/chosen": -27.638330459594727, + "logps/rejected": -121.67626953125, + "loss": 27.899, + "losses_ref": -0.00017084872524719685, + "ref_logps/chosen": -92.68699645996094, + "ref_logps/rejected": -78.55145263671875, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 65.04866790771484, + "rewards/margins": 108.1734848022461, + "rewards/rejected": -43.12482452392578, + "step": 550, + "u": -2.144278049468994, + "weight": 0.06875475496053696 + }, + { + "diff_generated": -47.44527053833008, + "epoch": 0.18146467919637072, + "grad_norm": 659.4609284904622, + "learning_rate": 4.838012958963283e-07, + "logits/chosen": -2.4913129806518555, + "logits/rejected": -2.594072103500366, + "logps/chosen": -24.48001480102539, + "logps/rejected": -135.76724243164062, + "loss": 27.7372, + "losses_ref": -0.0050836303271353245, + "ref_logps/chosen": -92.13008117675781, + "ref_logps/rejected": -88.32197570800781, + "rewards/accuracies": 0.96875, + "rewards/chosen": 67.65005493164062, + "rewards/margins": 115.0953369140625, + "rewards/rejected": -47.44527053833008, + "step": 560, + "u": -2.230499744415283, + "weight": 0.03139229863882065 + }, + { + "diff_generated": -49.268489837646484, + "epoch": 0.1847051198963059, + "grad_norm": 696.0980480562955, + "learning_rate": 4.924406047516198e-07, + "logits/chosen": -2.4896411895751953, + "logits/rejected": -2.5952906608581543, + "logps/chosen": -26.756811141967773, + "logps/rejected": -137.99081420898438, + "loss": 27.2014, + "losses_ref": -0.0020136612001806498, + "ref_logps/chosen": -100.06852722167969, + "ref_logps/rejected": -88.72233581542969, + "rewards/accuracies": 0.96875, + "rewards/chosen": 73.31172180175781, + "rewards/margins": 122.5802001953125, + "rewards/rejected": -49.268489837646484, + "step": 570, + "u": -2.2305784225463867, + "weight": 0.031306345015764236 + }, + { + "diff_generated": -46.57258987426758, + "epoch": 0.1879455605962411, + "grad_norm": 680.9889293886696, + "learning_rate": 5.010799136069114e-07, + "logits/chosen": -2.551598310470581, + "logits/rejected": -2.584933280944824, + "logps/chosen": -28.661914825439453, + "logps/rejected": -135.87179565429688, + "loss": 27.5338, + "losses_ref": -0.00011642322351690382, + "ref_logps/chosen": -98.13680267333984, + "ref_logps/rejected": -89.29920959472656, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 69.47489929199219, + "rewards/margins": 116.0474853515625, + "rewards/rejected": -46.57258987426758, + "step": 580, + "u": -2.173062562942505, + "weight": 0.056252289563417435 + }, + { + "diff_generated": -48.98841094970703, + "epoch": 0.19118600129617627, + "grad_norm": 644.8107725127082, + "learning_rate": 5.097192224622029e-07, + "logits/chosen": -2.518934488296509, + "logits/rejected": -2.5555710792541504, + "logps/chosen": -26.301610946655273, + "logps/rejected": -131.03005981445312, + "loss": 27.3424, + "losses_ref": -0.0015558989252895117, + "ref_logps/chosen": -96.68141174316406, + "ref_logps/rejected": -82.04166412353516, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 70.37979888916016, + "rewards/margins": 119.36820983886719, + "rewards/rejected": -48.98841094970703, + "step": 590, + "u": -2.2449631690979004, + "weight": 0.02506338059902191 + }, + { + "diff_generated": -46.974483489990234, + "epoch": 0.19442644199611148, + "grad_norm": 666.4151875482576, + "learning_rate": 5.183585313174946e-07, + "logits/chosen": -2.5313637256622314, + "logits/rejected": -2.5866589546203613, + "logps/chosen": -32.090721130371094, + "logps/rejected": -131.39773559570312, + "loss": 27.5002, + "losses_ref": -1.6000910818547709e-06, + "ref_logps/chosen": -98.82708740234375, + "ref_logps/rejected": -84.42324829101562, + "rewards/accuracies": 0.9375, + "rewards/chosen": 66.73637390136719, + "rewards/margins": 113.7108383178711, + "rewards/rejected": -46.974483489990234, + "step": 600, + "u": -2.1586735248565674, + "weight": 0.06250002980232239 + }, + { + "diff_generated": -49.115753173828125, + "epoch": 0.19766688269604665, + "grad_norm": 707.0247432576331, + "learning_rate": 5.269978401727861e-07, + "logits/chosen": -2.507723093032837, + "logits/rejected": -2.5827906131744385, + "logps/chosen": -28.90814208984375, + "logps/rejected": -136.62918090820312, + "loss": 27.4246, + "losses_ref": -1.971707388292998e-05, + "ref_logps/chosen": -100.26750183105469, + "ref_logps/rejected": -87.51342010498047, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 71.35935974121094, + "rewards/margins": 120.47511291503906, + "rewards/rejected": -49.115753173828125, + "step": 610, + "u": -2.187455654144287, + "weight": 0.050000227987766266 + }, + { + "diff_generated": -48.82651901245117, + "epoch": 0.20090732339598186, + "grad_norm": 679.191601482562, + "learning_rate": 5.356371490280777e-07, + "logits/chosen": -2.5194132328033447, + "logits/rejected": -2.6098580360412598, + "logps/chosen": -28.0224609375, + "logps/rejected": -140.78091430664062, + "loss": 26.5323, + "losses_ref": -0.00011035500938305631, + "ref_logps/chosen": -93.5048599243164, + "ref_logps/rejected": -91.95439147949219, + "rewards/accuracies": 0.9375, + "rewards/chosen": 65.48240661621094, + "rewards/margins": 114.30892181396484, + "rewards/rejected": -48.82651901245117, + "step": 620, + "u": -2.158672571182251, + "weight": 0.06250132620334625 + }, + { + "diff_generated": -49.514381408691406, + "epoch": 0.20414776409591703, + "grad_norm": 716.9561895225931, + "learning_rate": 5.442764578833693e-07, + "logits/chosen": -2.534128189086914, + "logits/rejected": -2.5818467140197754, + "logps/chosen": -30.54462242126465, + "logps/rejected": -139.87940979003906, + "loss": 27.3582, + "losses_ref": -0.016371209174394608, + "ref_logps/chosen": -100.50285339355469, + "ref_logps/rejected": -90.36502075195312, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 69.9582290649414, + "rewards/margins": 119.47261810302734, + "rewards/rejected": -49.514381408691406, + "step": 630, + "u": -2.2013890743255615, + "weight": 0.04425160214304924 + }, + { + "diff_generated": -47.97042465209961, + "epoch": 0.20738820479585224, + "grad_norm": 682.4139467028398, + "learning_rate": 5.529157667386608e-07, + "logits/chosen": -2.532174587249756, + "logits/rejected": -2.578139066696167, + "logps/chosen": -25.9278564453125, + "logps/rejected": -131.57861328125, + "loss": 26.2839, + "losses_ref": -0.00019156280905008316, + "ref_logps/chosen": -91.6938705444336, + "ref_logps/rejected": -83.6081771850586, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 65.76602172851562, + "rewards/margins": 113.7364501953125, + "rewards/rejected": -47.97042465209961, + "step": 640, + "u": -2.129889488220215, + "weight": 0.07500191032886505 + }, + { + "diff_generated": -50.43115997314453, + "epoch": 0.21062864549578741, + "grad_norm": 682.2415820353419, + "learning_rate": 5.615550755939525e-07, + "logits/chosen": -2.4852089881896973, + "logits/rejected": -2.5890121459960938, + "logps/chosen": -26.9046573638916, + "logps/rejected": -135.3916015625, + "loss": 27.7744, + "losses_ref": -1.791851900634356e-05, + "ref_logps/chosen": -94.32743835449219, + "ref_logps/rejected": -84.96044158935547, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 67.42278289794922, + "rewards/margins": 117.85392761230469, + "rewards/rejected": -50.43115997314453, + "step": 650, + "u": -2.2018465995788574, + "weight": 0.04375043511390686 + }, + { + "diff_generated": -46.6839485168457, + "epoch": 0.21386908619572262, + "grad_norm": 685.3497673103903, + "learning_rate": 5.701943844492441e-07, + "logits/chosen": -2.529299020767212, + "logits/rejected": -2.5489015579223633, + "logps/chosen": -29.30217933654785, + "logps/rejected": -133.85888671875, + "loss": 27.8461, + "losses_ref": -0.002348927315324545, + "ref_logps/chosen": -95.46574401855469, + "ref_logps/rejected": -87.1749267578125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 66.16357421875, + "rewards/margins": 112.84751892089844, + "rewards/rejected": -46.6839485168457, + "step": 660, + "u": -2.1586408615112305, + "weight": 0.06253615021705627 + }, + { + "diff_generated": -50.5396728515625, + "epoch": 0.21710952689565782, + "grad_norm": 636.5908719338354, + "learning_rate": 5.788336933045357e-07, + "logits/chosen": -2.5330991744995117, + "logits/rejected": -2.5660250186920166, + "logps/chosen": -29.539474487304688, + "logps/rejected": -140.71392822265625, + "loss": 26.3251, + "losses_ref": -0.010962968692183495, + "ref_logps/chosen": -100.68766784667969, + "ref_logps/rejected": -90.17426300048828, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 71.14818572998047, + "rewards/margins": 121.6878662109375, + "rewards/rejected": -50.5396728515625, + "step": 670, + "u": -2.2449309825897217, + "weight": 0.025098636746406555 + }, + { + "diff_generated": -51.689666748046875, + "epoch": 0.220349967595593, + "grad_norm": 617.310775210867, + "learning_rate": 5.874730021598272e-07, + "logits/chosen": -2.5135836601257324, + "logits/rejected": -2.585562229156494, + "logps/chosen": -25.00190544128418, + "logps/rejected": -142.4840545654297, + "loss": 26.4555, + "losses_ref": -0.2515738606452942, + "ref_logps/chosen": -97.02738952636719, + "ref_logps/rejected": -90.79439544677734, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 72.02549743652344, + "rewards/margins": 123.71514892578125, + "rewards/rejected": -51.689666748046875, + "step": 680, + "u": -2.239535093307495, + "weight": 0.029057253152132034 + }, + { + "diff_generated": -50.270172119140625, + "epoch": 0.2235904082955282, + "grad_norm": 662.4353318575914, + "learning_rate": 5.961123110151188e-07, + "logits/chosen": -2.519314765930176, + "logits/rejected": -2.567821979522705, + "logps/chosen": -27.141326904296875, + "logps/rejected": -140.2618408203125, + "loss": 26.7699, + "losses_ref": -2.612667003631941e-06, + "ref_logps/chosen": -98.13988494873047, + "ref_logps/rejected": -89.99168395996094, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 70.9985580444336, + "rewards/margins": 121.26872253417969, + "rewards/rejected": -50.270172119140625, + "step": 690, + "u": -2.2018468379974365, + "weight": 0.04375007748603821 + }, + { + "diff_generated": -47.56863784790039, + "epoch": 0.22683084899546338, + "grad_norm": 636.8522481633842, + "learning_rate": 6.047516198704104e-07, + "logits/chosen": -2.5055432319641113, + "logits/rejected": -2.5514261722564697, + "logps/chosen": -24.244586944580078, + "logps/rejected": -130.41024780273438, + "loss": 26.718, + "losses_ref": -0.008043577894568443, + "ref_logps/chosen": -92.876708984375, + "ref_logps/rejected": -82.84159851074219, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 68.63211822509766, + "rewards/margins": 116.20075988769531, + "rewards/rejected": -47.56863784790039, + "step": 700, + "u": -2.1439430713653564, + "weight": 0.0691227912902832 + }, + { + "diff_generated": -49.37560272216797, + "epoch": 0.23007128969539858, + "grad_norm": 594.9494944924277, + "learning_rate": 6.133909287257019e-07, + "logits/chosen": -2.4527430534362793, + "logits/rejected": -2.5097007751464844, + "logps/chosen": -25.301738739013672, + "logps/rejected": -132.68112182617188, + "loss": 26.4036, + "losses_ref": -3.0844853426970076e-06, + "ref_logps/chosen": -93.79328918457031, + "ref_logps/rejected": -83.30552673339844, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 68.4915542602539, + "rewards/margins": 117.86714935302734, + "rewards/rejected": -49.37560272216797, + "step": 710, + "u": -2.144282341003418, + "weight": 0.06875006854534149 + }, + { + "diff_generated": -51.86012649536133, + "epoch": 0.23331173039533376, + "grad_norm": 669.4559775251164, + "learning_rate": 6.220302375809935e-07, + "logits/chosen": -2.5223159790039062, + "logits/rejected": -2.596004009246826, + "logps/chosen": -25.50338363647461, + "logps/rejected": -139.2146759033203, + "loss": 26.0855, + "losses_ref": -0.020114298909902573, + "ref_logps/chosen": -94.37274169921875, + "ref_logps/rejected": -87.35454559326172, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 68.8693618774414, + "rewards/margins": 120.72947692871094, + "rewards/rejected": -51.86012649536133, + "step": 720, + "u": -2.201023578643799, + "weight": 0.04462960734963417 + }, + { + "diff_generated": -50.6171875, + "epoch": 0.23655217109526896, + "grad_norm": 737.1465697011254, + "learning_rate": 6.306695464362851e-07, + "logits/chosen": -2.4950945377349854, + "logits/rejected": -2.5960280895233154, + "logps/chosen": -25.795177459716797, + "logps/rejected": -138.80050659179688, + "loss": 26.2013, + "losses_ref": -9.304036439061747e-07, + "ref_logps/chosen": -94.69329071044922, + "ref_logps/rejected": -88.18331909179688, + "rewards/accuracies": 0.9375, + "rewards/chosen": 68.89811706542969, + "rewards/margins": 119.51530456542969, + "rewards/rejected": -50.6171875, + "step": 730, + "u": -2.1586735248565674, + "weight": 0.0625000149011612 + }, + { + "diff_generated": -48.57529830932617, + "epoch": 0.23979261179520414, + "grad_norm": 643.2099825309971, + "learning_rate": 6.393088552915767e-07, + "logits/chosen": -2.4291679859161377, + "logits/rejected": -2.5446364879608154, + "logps/chosen": -24.468910217285156, + "logps/rejected": -132.59165954589844, + "loss": 26.4268, + "losses_ref": -0.0013817059807479382, + "ref_logps/chosen": -92.81462097167969, + "ref_logps/rejected": -84.016357421875, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 68.34571838378906, + "rewards/margins": 116.92100524902344, + "rewards/rejected": -48.57529830932617, + "step": 740, + "u": -2.1154351234436035, + "weight": 0.08132173866033554 + }, + { + "diff_generated": -50.2636604309082, + "epoch": 0.24303305249513935, + "grad_norm": 660.445744253361, + "learning_rate": 6.479481641468682e-07, + "logits/chosen": -2.467881679534912, + "logits/rejected": -2.5269060134887695, + "logps/chosen": -25.483837127685547, + "logps/rejected": -133.77403259277344, + "loss": 26.7616, + "losses_ref": -0.00010903090151259676, + "ref_logps/chosen": -95.33479309082031, + "ref_logps/rejected": -83.5103759765625, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 69.8509521484375, + "rewards/margins": 120.1146011352539, + "rewards/rejected": -50.2636604309082, + "step": 750, + "u": -2.144279956817627, + "weight": 0.06875289976596832 + }, + { + "diff_generated": -51.882415771484375, + "epoch": 0.24627349319507452, + "grad_norm": 596.6517283622172, + "learning_rate": 6.565874730021598e-07, + "logits/chosen": -2.5085771083831787, + "logits/rejected": -2.6453890800476074, + "logps/chosen": -26.5816593170166, + "logps/rejected": -143.75991821289062, + "loss": 26.4018, + "losses_ref": -3.152846602461068e-06, + "ref_logps/chosen": -94.84214782714844, + "ref_logps/rejected": -91.87751007080078, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 68.26048278808594, + "rewards/margins": 120.14290618896484, + "rewards/rejected": -51.882415771484375, + "step": 760, + "u": -2.259411573410034, + "weight": 0.018750067800283432 + }, + { + "diff_generated": -49.163570404052734, + "epoch": 0.24951393389500973, + "grad_norm": 661.0098738714644, + "learning_rate": 6.652267818574514e-07, + "logits/chosen": -2.517294406890869, + "logits/rejected": -2.5745046138763428, + "logps/chosen": -26.86139488220215, + "logps/rejected": -135.12640380859375, + "loss": 26.2463, + "losses_ref": -0.0008471701294183731, + "ref_logps/chosen": -94.9187240600586, + "ref_logps/rejected": -85.96281433105469, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 68.05733489990234, + "rewards/margins": 117.22090148925781, + "rewards/rejected": -49.163570404052734, + "step": 770, + "u": -2.115487813949585, + "weight": 0.08126357942819595 + }, + { + "diff_generated": -51.59791946411133, + "epoch": 0.25275437459494493, + "grad_norm": 587.3515590292881, + "learning_rate": 6.738660907127429e-07, + "logits/chosen": -2.516849994659424, + "logits/rejected": -2.5866785049438477, + "logps/chosen": -29.505550384521484, + "logps/rejected": -143.68307495117188, + "loss": 26.9824, + "losses_ref": -2.568489253462758e-05, + "ref_logps/chosen": -98.34056091308594, + "ref_logps/rejected": -92.08514404296875, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 68.83500671386719, + "rewards/margins": 120.43292236328125, + "rewards/rejected": -51.59791946411133, + "step": 780, + "u": -2.2162375450134277, + "weight": 0.037500619888305664 + }, + { + "diff_generated": -47.421485900878906, + "epoch": 0.2559948152948801, + "grad_norm": 665.3356518222674, + "learning_rate": 6.825053995680345e-07, + "logits/chosen": -2.457528591156006, + "logits/rejected": -2.594669818878174, + "logps/chosen": -25.825942993164062, + "logps/rejected": -132.35867309570312, + "loss": 26.2666, + "losses_ref": -0.0013907465618103743, + "ref_logps/chosen": -91.23823547363281, + "ref_logps/rejected": -84.93717193603516, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 65.41229248046875, + "rewards/margins": 112.8337631225586, + "rewards/rejected": -47.421485900878906, + "step": 790, + "u": -2.1298203468322754, + "weight": 0.07507820427417755 + }, + { + "diff_generated": -52.05354690551758, + "epoch": 0.2592352559948153, + "grad_norm": 643.0444700640635, + "learning_rate": 6.911447084233261e-07, + "logits/chosen": -2.484684705734253, + "logits/rejected": -2.5525033473968506, + "logps/chosen": -24.99363899230957, + "logps/rejected": -136.7119140625, + "loss": 25.4338, + "losses_ref": -0.022313769906759262, + "ref_logps/chosen": -96.20983123779297, + "ref_logps/rejected": -84.65837097167969, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 71.21620178222656, + "rewards/margins": 123.2697525024414, + "rewards/rejected": -52.05354690551758, + "step": 800, + "u": -2.2015719413757324, + "weight": 0.044049136340618134 + }, + { + "diff_generated": -47.964622497558594, + "epoch": 0.26247569669475046, + "grad_norm": 610.0379646255323, + "learning_rate": 6.997840172786177e-07, + "logits/chosen": -2.5204567909240723, + "logits/rejected": -2.5491814613342285, + "logps/chosen": -27.75726890563965, + "logps/rejected": -129.86639404296875, + "loss": 25.074, + "losses_ref": -0.04612641781568527, + "ref_logps/chosen": -98.92430114746094, + "ref_logps/rejected": -81.90177917480469, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 71.16703796386719, + "rewards/margins": 119.13166809082031, + "rewards/rejected": -47.964622497558594, + "step": 810, + "u": -2.2013754844665527, + "weight": 0.04425793141126633 + }, + { + "diff_generated": -51.2788200378418, + "epoch": 0.2657161373946857, + "grad_norm": 574.9023035648446, + "learning_rate": 7.084233261339092e-07, + "logits/chosen": -2.4982612133026123, + "logits/rejected": -2.6119399070739746, + "logps/chosen": -25.619009017944336, + "logps/rejected": -138.2942657470703, + "loss": 25.7192, + "losses_ref": -0.010108882561326027, + "ref_logps/chosen": -92.62557220458984, + "ref_logps/rejected": -87.01544189453125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 67.00656127929688, + "rewards/margins": 118.2853775024414, + "rewards/rejected": -51.2788200378418, + "step": 820, + "u": -2.158543109893799, + "weight": 0.06264402717351913 + }, + { + "diff_generated": -49.65804672241211, + "epoch": 0.26895657809462087, + "grad_norm": 611.8921644073595, + "learning_rate": 7.170626349892008e-07, + "logits/chosen": -2.5054807662963867, + "logits/rejected": -2.5855443477630615, + "logps/chosen": -25.27883529663086, + "logps/rejected": -134.4387969970703, + "loss": 26.0776, + "losses_ref": -0.002082222606986761, + "ref_logps/chosen": -90.44572448730469, + "ref_logps/rejected": -84.78074645996094, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 65.16688537597656, + "rewards/margins": 114.82493591308594, + "rewards/rejected": -49.65804672241211, + "step": 830, + "u": -2.1297812461853027, + "weight": 0.07512130588293076 + }, + { + "diff_generated": -48.48400115966797, + "epoch": 0.27219701879455604, + "grad_norm": 661.4384956363074, + "learning_rate": 7.257019438444924e-07, + "logits/chosen": -2.5549542903900146, + "logits/rejected": -2.538541793823242, + "logps/chosen": -27.421245574951172, + "logps/rejected": -135.65159606933594, + "loss": 25.6821, + "losses_ref": -6.91217292114743e-06, + "ref_logps/chosen": -99.6714096069336, + "ref_logps/rejected": -87.1675796508789, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 72.25016784667969, + "rewards/margins": 120.73416900634766, + "rewards/rejected": -48.48400115966797, + "step": 840, + "u": -2.187455654144287, + "weight": 0.0500001385807991 + }, + { + "diff_generated": -47.902313232421875, + "epoch": 0.2754374594944913, + "grad_norm": 624.0056138723679, + "learning_rate": 7.343412526997839e-07, + "logits/chosen": -2.4939754009246826, + "logits/rejected": -2.5117056369781494, + "logps/chosen": -26.297977447509766, + "logps/rejected": -128.50059509277344, + "loss": 26.1491, + "losses_ref": -2.7239013888902264e-06, + "ref_logps/chosen": -94.59888458251953, + "ref_logps/rejected": -80.5982894897461, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 68.30091094970703, + "rewards/margins": 116.2032241821289, + "rewards/rejected": -47.902313232421875, + "step": 850, + "u": -2.101109027862549, + "weight": 0.08750005066394806 + }, + { + "diff_generated": -50.83550262451172, + "epoch": 0.27867790019442645, + "grad_norm": 651.6845882583268, + "learning_rate": 7.429805615550755e-07, + "logits/chosen": -2.523118257522583, + "logits/rejected": -2.5627999305725098, + "logps/chosen": -27.77315902709961, + "logps/rejected": -137.4436492919922, + "loss": 25.9522, + "losses_ref": -0.0023150129709392786, + "ref_logps/chosen": -96.98990631103516, + "ref_logps/rejected": -86.60813903808594, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 69.21675109863281, + "rewards/margins": 120.05224609375, + "rewards/rejected": -50.83550262451172, + "step": 860, + "u": -2.1441612243652344, + "weight": 0.06888330727815628 + }, + { + "diff_generated": -53.04962921142578, + "epoch": 0.28191834089436163, + "grad_norm": 647.3557833104493, + "learning_rate": 7.516198704103671e-07, + "logits/chosen": -2.4396255016326904, + "logits/rejected": -2.6068167686462402, + "logps/chosen": -20.75543212890625, + "logps/rejected": -139.11436462402344, + "loss": 24.3917, + "losses_ref": -0.0001939669018611312, + "ref_logps/chosen": -87.64575958251953, + "ref_logps/rejected": -86.06474304199219, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 66.89032745361328, + "rewards/margins": 119.93994140625, + "rewards/rejected": -53.04962921142578, + "step": 870, + "u": -2.173062562942505, + "weight": 0.05625234916806221 + }, + { + "diff_generated": -50.16646194458008, + "epoch": 0.2851587815942968, + "grad_norm": 652.868899692234, + "learning_rate": 7.602591792656587e-07, + "logits/chosen": -2.536747694015503, + "logits/rejected": -2.653494358062744, + "logps/chosen": -28.167068481445312, + "logps/rejected": -136.5677490234375, + "loss": 25.6688, + "losses_ref": -0.0002300078485859558, + "ref_logps/chosen": -95.93633270263672, + "ref_logps/rejected": -86.40128326416016, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 67.7692642211914, + "rewards/margins": 117.93571472167969, + "rewards/rejected": -50.16646194458008, + "step": 880, + "u": -2.216236114501953, + "weight": 0.03750232607126236 + }, + { + "diff_generated": -48.77215576171875, + "epoch": 0.28839922229423204, + "grad_norm": 634.4173185749601, + "learning_rate": 7.688984881209502e-07, + "logits/chosen": -2.5125479698181152, + "logits/rejected": -2.6143321990966797, + "logps/chosen": -25.836898803710938, + "logps/rejected": -135.23353576660156, + "loss": 26.4335, + "losses_ref": -0.0017788056284189224, + "ref_logps/chosen": -93.00483703613281, + "ref_logps/rejected": -86.46137237548828, + "rewards/accuracies": 0.9375, + "rewards/chosen": 67.16793060302734, + "rewards/margins": 115.9400863647461, + "rewards/rejected": -48.77215576171875, + "step": 890, + "u": -2.1585824489593506, + "weight": 0.0626002699136734 + }, + { + "diff_generated": -51.363059997558594, + "epoch": 0.2916396629941672, + "grad_norm": 651.5449682597222, + "learning_rate": 7.775377969762419e-07, + "logits/chosen": -2.512857437133789, + "logits/rejected": -2.6043753623962402, + "logps/chosen": -22.82807731628418, + "logps/rejected": -137.58694458007812, + "loss": 24.7741, + "losses_ref": -0.010770822875201702, + "ref_logps/chosen": -92.31722259521484, + "ref_logps/rejected": -86.22389221191406, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 69.48914337158203, + "rewards/margins": 120.85221099853516, + "rewards/rejected": -51.363059997558594, + "step": 900, + "u": -2.2015607357025146, + "weight": 0.04406408965587616 + }, + { + "diff_generated": -49.646968841552734, + "epoch": 0.2948801036941024, + "grad_norm": 651.8929270726834, + "learning_rate": 7.861771058315335e-07, + "logits/chosen": -2.547717571258545, + "logits/rejected": -2.6330621242523193, + "logps/chosen": -24.217119216918945, + "logps/rejected": -138.98812866210938, + "loss": 26.0227, + "losses_ref": -0.06351267546415329, + "ref_logps/chosen": -96.43073272705078, + "ref_logps/rejected": -89.34115600585938, + "rewards/accuracies": 0.96875, + "rewards/chosen": 72.21361541748047, + "rewards/margins": 121.86058044433594, + "rewards/rejected": -49.646968841552734, + "step": 910, + "u": -2.2299375534057617, + "weight": 0.03198622539639473 + }, + { + "diff_generated": -49.06929397583008, + "epoch": 0.29812054439403757, + "grad_norm": 645.3313956415323, + "learning_rate": 7.94816414686825e-07, + "logits/chosen": -2.4820826053619385, + "logits/rejected": -2.5499496459960938, + "logps/chosen": -23.020343780517578, + "logps/rejected": -135.4531707763672, + "loss": 25.0257, + "losses_ref": -0.0012926750350743532, + "ref_logps/chosen": -91.547607421875, + "ref_logps/rejected": -86.38389587402344, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 68.52725982666016, + "rewards/margins": 117.5965576171875, + "rewards/rejected": -49.06929397583008, + "step": 920, + "u": -2.201815128326416, + "weight": 0.04378523305058479 + }, + { + "diff_generated": -52.943565368652344, + "epoch": 0.3013609850939728, + "grad_norm": 589.5035558054351, + "learning_rate": 7.999995450631473e-07, + "logits/chosen": -2.514936923980713, + "logits/rejected": -2.6288318634033203, + "logps/chosen": -23.644351959228516, + "logps/rejected": -141.34170532226562, + "loss": 25.621, + "losses_ref": -2.8535801902762614e-05, + "ref_logps/chosen": -91.30097961425781, + "ref_logps/rejected": -88.39814758300781, + "rewards/accuracies": 0.9375, + "rewards/chosen": 67.6566390991211, + "rewards/margins": 120.6001968383789, + "rewards/rejected": -52.943565368652344, + "step": 930, + "u": -2.1586732864379883, + "weight": 0.06250031292438507 + }, + { + "diff_generated": -51.099365234375, + "epoch": 0.304601425793908, + "grad_norm": 562.7275091649057, + "learning_rate": 7.999944270354383e-07, + "logits/chosen": -2.4732613563537598, + "logits/rejected": -2.6294915676116943, + "logps/chosen": -26.57818031311035, + "logps/rejected": -140.29534912109375, + "loss": 25.235, + "losses_ref": -1.997053686864092e-06, + "ref_logps/chosen": -91.357666015625, + "ref_logps/rejected": -89.19598388671875, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 64.77949523925781, + "rewards/margins": 115.87886047363281, + "rewards/rejected": -51.099365234375, + "step": 940, + "u": -2.2018468379974365, + "weight": 0.04375007376074791 + }, + { + "diff_generated": -50.940223693847656, + "epoch": 0.30784186649384315, + "grad_norm": 607.7827036467936, + "learning_rate": 7.99983622381959e-07, + "logits/chosen": -2.507047176361084, + "logits/rejected": -2.5627152919769287, + "logps/chosen": -26.599227905273438, + "logps/rejected": -135.17007446289062, + "loss": 25.5488, + "losses_ref": -0.22947004437446594, + "ref_logps/chosen": -97.92918395996094, + "ref_logps/rejected": -84.22985076904297, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 71.32997131347656, + "rewards/margins": 122.27018737792969, + "rewards/rejected": -50.940223693847656, + "step": 950, + "u": -2.18208646774292, + "weight": 0.054003261029720306 + }, + { + "diff_generated": -51.61498260498047, + "epoch": 0.31108230719377833, + "grad_norm": 620.0873356568507, + "learning_rate": 7.999671312563164e-07, + "logits/chosen": -2.521172046661377, + "logits/rejected": -2.52459716796875, + "logps/chosen": -25.872112274169922, + "logps/rejected": -133.9535675048828, + "loss": 24.7471, + "losses_ref": -0.00038078351644799113, + "ref_logps/chosen": -96.575927734375, + "ref_logps/rejected": -82.33859252929688, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 70.70381164550781, + "rewards/margins": 122.31880187988281, + "rewards/rejected": -51.61498260498047, + "step": 960, + "u": -2.1730563640594482, + "weight": 0.056259334087371826 + }, + { + "diff_generated": -53.22172164916992, + "epoch": 0.31432274789371356, + "grad_norm": 584.0586409072972, + "learning_rate": 7.999449538929611e-07, + "logits/chosen": -2.471055507659912, + "logits/rejected": -2.5570130348205566, + "logps/chosen": -24.730777740478516, + "logps/rejected": -133.05520629882812, + "loss": 25.3145, + "losses_ref": -6.452653178712353e-05, + "ref_logps/chosen": -92.06562805175781, + "ref_logps/rejected": -79.83350372314453, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 67.3348617553711, + "rewards/margins": 120.55656433105469, + "rewards/rejected": -53.22172164916992, + "step": 970, + "u": -2.20184588432312, + "weight": 0.04375113174319267 + }, + { + "diff_generated": -53.71428298950195, + "epoch": 0.31756318859364874, + "grad_norm": 596.051340234887, + "learning_rate": 7.99917090607183e-07, + "logits/chosen": -2.4926838874816895, + "logits/rejected": -2.6254682540893555, + "logps/chosen": -22.642475128173828, + "logps/rejected": -141.4960479736328, + "loss": 24.9684, + "losses_ref": -0.0023799485061317682, + "ref_logps/chosen": -92.01348876953125, + "ref_logps/rejected": -87.7817611694336, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 69.37101745605469, + "rewards/margins": 123.08528900146484, + "rewards/rejected": -53.71428298950195, + "step": 980, + "u": -2.1873419284820557, + "weight": 0.0501260869204998 + }, + { + "diff_generated": -51.06912612915039, + "epoch": 0.3208036292935839, + "grad_norm": 650.4725137934786, + "learning_rate": 7.998835417951081e-07, + "logits/chosen": -2.5319457054138184, + "logits/rejected": -2.589599132537842, + "logps/chosen": -25.422199249267578, + "logps/rejected": -135.44515991210938, + "loss": 25.2832, + "losses_ref": -0.000997675582766533, + "ref_logps/chosen": -95.29524230957031, + "ref_logps/rejected": -84.37602996826172, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 69.87303924560547, + "rewards/margins": 120.9421615600586, + "rewards/rejected": -51.06912612915039, + "step": 990, + "u": -2.216195821762085, + "weight": 0.03754685819149017 + }, + { + "diff_generated": -53.8853759765625, + "epoch": 0.32404406999351915, + "grad_norm": 713.1371118025122, + "learning_rate": 7.998443079336919e-07, + "logits/chosen": -2.4941048622131348, + "logits/rejected": -2.6063921451568604, + "logps/chosen": -25.767486572265625, + "logps/rejected": -147.36996459960938, + "loss": 24.9082, + "losses_ref": -5.925068307988113e-06, + "ref_logps/chosen": -97.91233825683594, + "ref_logps/rejected": -93.4845962524414, + "rewards/accuracies": 0.96875, + "rewards/chosen": 72.14485168457031, + "rewards/margins": 126.03022766113281, + "rewards/rejected": -53.8853759765625, + "step": 1000, + "u": -2.2306292057037354, + "weight": 0.031250081956386566 + }, + { + "diff_generated": -51.92223358154297, + "epoch": 0.3272845106934543, + "grad_norm": 611.4370402546909, + "learning_rate": 7.997993895807128e-07, + "logits/chosen": -2.554468870162964, + "logits/rejected": -2.6043498516082764, + "logps/chosen": -25.103681564331055, + "logps/rejected": -139.8118896484375, + "loss": 25.5352, + "losses_ref": -0.001320630544796586, + "ref_logps/chosen": -96.08708190917969, + "ref_logps/rejected": -87.88965606689453, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 70.98339080810547, + "rewards/margins": 122.90562438964844, + "rewards/rejected": -51.92223358154297, + "step": 1010, + "u": -2.201784133911133, + "weight": 0.043819475919008255 + }, + { + "diff_generated": -50.5982666015625, + "epoch": 0.3305249513933895, + "grad_norm": 584.2638283241826, + "learning_rate": 7.997487873747646e-07, + "logits/chosen": -2.5191614627838135, + "logits/rejected": -2.585754871368408, + "logps/chosen": -22.96677017211914, + "logps/rejected": -134.28956604003906, + "loss": 23.536, + "losses_ref": -3.434952304814942e-05, + "ref_logps/chosen": -93.22602081298828, + "ref_logps/rejected": -83.69129943847656, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 70.25923919677734, + "rewards/margins": 120.8575210571289, + "rewards/rejected": -50.5982666015625, + "step": 1020, + "u": -2.1442813873291016, + "weight": 0.06875105202198029 + }, + { + "diff_generated": -51.261444091796875, + "epoch": 0.3337653920933247, + "grad_norm": 643.4131696639599, + "learning_rate": 7.996925020352465e-07, + "logits/chosen": -2.512216091156006, + "logits/rejected": -2.521505117416382, + "logps/chosen": -27.69634437561035, + "logps/rejected": -137.0533905029297, + "loss": 26.2529, + "losses_ref": -0.0018226455431431532, + "ref_logps/chosen": -99.18524169921875, + "ref_logps/rejected": -85.79194641113281, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 71.48890686035156, + "rewards/margins": 122.7503433227539, + "rewards/rejected": -51.261444091796875, + "step": 1030, + "u": -2.201768398284912, + "weight": 0.043836988508701324 + }, + { + "diff_generated": -50.166053771972656, + "epoch": 0.3370058327932599, + "grad_norm": 546.0749685730806, + "learning_rate": 7.99630534362354e-07, + "logits/chosen": -2.47261118888855, + "logits/rejected": -2.5434730052948, + "logps/chosen": -22.700708389282227, + "logps/rejected": -134.80189514160156, + "loss": 25.3072, + "losses_ref": -0.0005364461103454232, + "ref_logps/chosen": -89.48124694824219, + "ref_logps/rejected": -84.63584899902344, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 66.7805404663086, + "rewards/margins": 116.94660949707031, + "rewards/rejected": -50.166053771972656, + "step": 1040, + "u": -2.144258499145508, + "weight": 0.06877659261226654 + }, + { + "diff_generated": -50.989837646484375, + "epoch": 0.3402462734931951, + "grad_norm": 573.2441692505033, + "learning_rate": 7.995628852370667e-07, + "logits/chosen": -2.4530699253082275, + "logits/rejected": -2.552277088165283, + "logps/chosen": -24.12959861755371, + "logps/rejected": -137.18719482421875, + "loss": 25.43, + "losses_ref": -0.0008495537331327796, + "ref_logps/chosen": -91.74011993408203, + "ref_logps/rejected": -86.19735717773438, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 67.61051940917969, + "rewards/margins": 118.600341796875, + "rewards/rejected": -50.989837646484375, + "step": 1050, + "u": -2.1730258464813232, + "weight": 0.056292999535799026 + }, + { + "diff_generated": -53.81852340698242, + "epoch": 0.34348671419313026, + "grad_norm": 551.1043513632601, + "learning_rate": 7.994895556211363e-07, + "logits/chosen": -2.466102123260498, + "logits/rejected": -2.607362985610962, + "logps/chosen": -24.560415267944336, + "logps/rejected": -141.72219848632812, + "loss": 24.2731, + "losses_ref": -0.00028797605773434043, + "ref_logps/chosen": -91.91780853271484, + "ref_logps/rejected": -87.90367126464844, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 67.35738372802734, + "rewards/margins": 121.17591857910156, + "rewards/rejected": -53.81852340698242, + "step": 1060, + "u": -2.2018346786499023, + "weight": 0.043763622641563416 + }, + { + "diff_generated": -53.04597091674805, + "epoch": 0.34672715489306544, + "grad_norm": 571.4159159668553, + "learning_rate": 7.994105465570722e-07, + "logits/chosen": -2.479231357574463, + "logits/rejected": -2.534417152404785, + "logps/chosen": -27.557628631591797, + "logps/rejected": -137.225341796875, + "loss": 24.8815, + "losses_ref": -0.00040217855712398887, + "ref_logps/chosen": -98.73329162597656, + "ref_logps/rejected": -84.17935943603516, + "rewards/accuracies": 0.9375, + "rewards/chosen": 71.1756591796875, + "rewards/margins": 124.22164154052734, + "rewards/rejected": -53.04597091674805, + "step": 1070, + "u": -2.1586556434631348, + "weight": 0.06251987814903259 + }, + { + "diff_generated": -52.54377365112305, + "epoch": 0.34996759559300067, + "grad_norm": 642.7462552016408, + "learning_rate": 7.993258591681279e-07, + "logits/chosen": -2.4565343856811523, + "logits/rejected": -2.5162532329559326, + "logps/chosen": -25.28310775756836, + "logps/rejected": -133.00509643554688, + "loss": 25.5631, + "losses_ref": -1.070001758307626e-06, + "ref_logps/chosen": -91.83539581298828, + "ref_logps/rejected": -80.46131896972656, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 66.55229187011719, + "rewards/margins": 119.0960693359375, + "rewards/rejected": -52.54377365112305, + "step": 1080, + "u": -2.115499973297119, + "weight": 0.08125002682209015 + }, + { + "diff_generated": -54.84162139892578, + "epoch": 0.35320803629293585, + "grad_norm": 557.97475074345, + "learning_rate": 7.992354946582836e-07, + "logits/chosen": -2.4957919120788574, + "logits/rejected": -2.59224271774292, + "logps/chosen": -23.85027313232422, + "logps/rejected": -143.63851928710938, + "loss": 24.5671, + "losses_ref": -0.07038359344005585, + "ref_logps/chosen": -94.2538070678711, + "ref_logps/rejected": -88.79689025878906, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 70.40353393554688, + "rewards/margins": 125.24515533447266, + "rewards/rejected": -54.84162139892578, + "step": 1090, + "u": -2.201259136199951, + "weight": 0.044373251497745514 + }, + { + "diff_generated": -50.38102722167969, + "epoch": 0.356448476992871, + "grad_norm": 626.4601477446425, + "learning_rate": 7.991394543122304e-07, + "logits/chosen": -2.4803991317749023, + "logits/rejected": -2.565451145172119, + "logps/chosen": -25.01320457458496, + "logps/rejected": -133.71278381347656, + "loss": 24.8838, + "losses_ref": -0.00086111732525751, + "ref_logps/chosen": -92.66462707519531, + "ref_logps/rejected": -83.33175659179688, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 67.65141296386719, + "rewards/margins": 118.0324478149414, + "rewards/rejected": -50.38102722167969, + "step": 1100, + "u": -2.173022747039795, + "weight": 0.05629643052816391 + }, + { + "diff_generated": -51.571044921875, + "epoch": 0.3596889176928062, + "grad_norm": 560.6761655202565, + "learning_rate": 7.990377394953507e-07, + "logits/chosen": -2.4450554847717285, + "logits/rejected": -2.594871997833252, + "logps/chosen": -24.059139251708984, + "logps/rejected": -140.58322143554688, + "loss": 24.425, + "losses_ref": -0.0007773134857416153, + "ref_logps/chosen": -93.8017578125, + "ref_logps/rejected": -89.01216888427734, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 69.74263000488281, + "rewards/margins": 121.31367492675781, + "rewards/rejected": -51.571044921875, + "step": 1110, + "u": -2.1874217987060547, + "weight": 0.050037842243909836 + }, + { + "diff_generated": -52.709999084472656, + "epoch": 0.36292935839274143, + "grad_norm": 522.8230341997952, + "learning_rate": 7.989303516537001e-07, + "logits/chosen": -2.5109031200408936, + "logits/rejected": -2.602411985397339, + "logps/chosen": -20.902240753173828, + "logps/rejected": -136.27841186523438, + "loss": 24.8449, + "losses_ref": -0.003221045481041074, + "ref_logps/chosen": -90.93617248535156, + "ref_logps/rejected": -83.56842041015625, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 70.033935546875, + "rewards/margins": 122.74393463134766, + "rewards/rejected": -52.709999084472656, + "step": 1120, + "u": -2.187288999557495, + "weight": 0.05018278956413269 + }, + { + "diff_generated": -54.4443359375, + "epoch": 0.3661697990926766, + "grad_norm": 608.4703573539997, + "learning_rate": 7.98817292313986e-07, + "logits/chosen": -2.5404746532440186, + "logits/rejected": -2.6264424324035645, + "logps/chosen": -28.353923797607422, + "logps/rejected": -143.04530334472656, + "loss": 24.3711, + "losses_ref": -0.0017772326245903969, + "ref_logps/chosen": -102.36439514160156, + "ref_logps/rejected": -88.60096740722656, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 74.01045227050781, + "rewards/margins": 128.4547882080078, + "rewards/rejected": -54.4443359375, + "step": 1130, + "u": -2.2161457538604736, + "weight": 0.037601880729198456 + }, + { + "diff_generated": -49.8851432800293, + "epoch": 0.3694102397926118, + "grad_norm": 556.5135304257494, + "learning_rate": 7.986985630835463e-07, + "logits/chosen": -2.4946882724761963, + "logits/rejected": -2.5596494674682617, + "logps/chosen": -24.316936492919922, + "logps/rejected": -135.36636352539062, + "loss": 24.4909, + "losses_ref": -0.013844695873558521, + "ref_logps/chosen": -96.05143737792969, + "ref_logps/rejected": -85.48120880126953, + "rewards/accuracies": 0.96875, + "rewards/chosen": 71.73450469970703, + "rewards/margins": 121.61964416503906, + "rewards/rejected": -49.8851432800293, + "step": 1140, + "u": -2.229886293411255, + "weight": 0.0320565402507782 + }, + { + "diff_generated": -49.407188415527344, + "epoch": 0.37265068049254696, + "grad_norm": 580.0555758431312, + "learning_rate": 7.985741656503261e-07, + "logits/chosen": -2.518845558166504, + "logits/rejected": -2.57997465133667, + "logps/chosen": -28.659387588500977, + "logps/rejected": -131.66476440429688, + "loss": 25.0707, + "losses_ref": -0.003389369696378708, + "ref_logps/chosen": -98.24806213378906, + "ref_logps/rejected": -82.25758361816406, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 69.58866882324219, + "rewards/margins": 118.995849609375, + "rewards/rejected": -49.407188415527344, + "step": 1150, + "u": -2.172929525375366, + "weight": 0.05639916658401489 + }, + { + "diff_generated": -52.84662628173828, + "epoch": 0.3758911211924822, + "grad_norm": 569.5674422468707, + "learning_rate": 7.984441017828543e-07, + "logits/chosen": -2.4930014610290527, + "logits/rejected": -2.6044936180114746, + "logps/chosen": -25.53784942626953, + "logps/rejected": -139.98287963867188, + "loss": 24.346, + "losses_ref": -4.5965853132656775e-06, + "ref_logps/chosen": -96.56849670410156, + "ref_logps/rejected": -87.13624572753906, + "rewards/accuracies": 0.96875, + "rewards/chosen": 71.0306396484375, + "rewards/margins": 123.87727355957031, + "rewards/rejected": -52.84662628173828, + "step": 1160, + "u": -2.2306294441223145, + "weight": 0.03125005215406418 + }, + { + "diff_generated": -50.60841369628906, + "epoch": 0.37913156189241737, + "grad_norm": 562.3476842737259, + "learning_rate": 7.983083733302178e-07, + "logits/chosen": -2.538132905960083, + "logits/rejected": -2.570681095123291, + "logps/chosen": -25.372121810913086, + "logps/rejected": -138.02139282226562, + "loss": 24.5143, + "losses_ref": -5.09760866407305e-06, + "ref_logps/chosen": -94.68878936767578, + "ref_logps/rejected": -87.41297912597656, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 69.31666564941406, + "rewards/margins": 119.9250717163086, + "rewards/rejected": -50.60841369628906, + "step": 1170, + "u": -2.187455654144287, + "weight": 0.050000082701444626 + }, + { + "diff_generated": -50.25326919555664, + "epoch": 0.38237200259235254, + "grad_norm": 560.2383575313584, + "learning_rate": 7.98166982222036e-07, + "logits/chosen": -2.5210089683532715, + "logits/rejected": -2.5609211921691895, + "logps/chosen": -25.46693992614746, + "logps/rejected": -132.6431427001953, + "loss": 24.6057, + "losses_ref": -0.005505814682692289, + "ref_logps/chosen": -97.08512115478516, + "ref_logps/rejected": -82.38987731933594, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 71.61817932128906, + "rewards/margins": 121.8714599609375, + "rewards/rejected": -50.25326919555664, + "step": 1180, + "u": -2.2015438079833984, + "weight": 0.04408115893602371 + }, + { + "diff_generated": -51.55100631713867, + "epoch": 0.3856124432922878, + "grad_norm": 551.3974517108097, + "learning_rate": 7.980199304684328e-07, + "logits/chosen": -2.483508825302124, + "logits/rejected": -2.510715961456299, + "logps/chosen": -25.69317054748535, + "logps/rejected": -139.46047973632812, + "loss": 24.5328, + "losses_ref": -0.0007960908114910126, + "ref_logps/chosen": -98.38687896728516, + "ref_logps/rejected": -87.90947723388672, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 72.69371032714844, + "rewards/margins": 124.2447280883789, + "rewards/rejected": -51.55100631713867, + "step": 1190, + "u": -2.2162063121795654, + "weight": 0.03753543645143509 + }, + { + "diff_generated": -51.10063171386719, + "epoch": 0.38885288399222295, + "grad_norm": 559.0601762716767, + "learning_rate": 7.978672201600077e-07, + "logits/chosen": -2.4614675045013428, + "logits/rejected": -2.575929641723633, + "logps/chosen": -23.63718605041504, + "logps/rejected": -137.8214111328125, + "loss": 23.8937, + "losses_ref": -0.0016802713507786393, + "ref_logps/chosen": -92.49464416503906, + "ref_logps/rejected": -86.72079467773438, + "rewards/accuracies": 0.9375, + "rewards/chosen": 68.85746002197266, + "rewards/margins": 119.9581069946289, + "rewards/rejected": -51.10063171386719, + "step": 1200, + "u": -2.158592700958252, + "weight": 0.06258918344974518 + }, + { + "diff_generated": -53.33112716674805, + "epoch": 0.39209332469215813, + "grad_norm": 558.976664007009, + "learning_rate": 7.97708853467807e-07, + "logits/chosen": -2.5114684104919434, + "logits/rejected": -2.608743906021118, + "logps/chosen": -22.37912940979004, + "logps/rejected": -136.87342834472656, + "loss": 23.9141, + "losses_ref": -0.0029803109355270863, + "ref_logps/chosen": -94.94758605957031, + "ref_logps/rejected": -83.54229736328125, + "rewards/accuracies": 0.96875, + "rewards/chosen": 72.56846618652344, + "rewards/margins": 125.89958190917969, + "rewards/rejected": -53.33112716674805, + "step": 1210, + "u": -2.230485439300537, + "weight": 0.03140822798013687 + }, + { + "diff_generated": -52.54514694213867, + "epoch": 0.3953337653920933, + "grad_norm": 570.3696399396063, + "learning_rate": 7.975448326432927e-07, + "logits/chosen": -2.5000858306884766, + "logits/rejected": -2.616079330444336, + "logps/chosen": -24.4619197845459, + "logps/rejected": -139.69168090820312, + "loss": 24.1186, + "losses_ref": -0.0010114926844835281, + "ref_logps/chosen": -92.59854125976562, + "ref_logps/rejected": -87.14653015136719, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 68.1366195678711, + "rewards/margins": 120.68177795410156, + "rewards/rejected": -52.54514694213867, + "step": 1220, + "u": -2.201807737350464, + "weight": 0.04379352182149887 + }, + { + "diff_generated": -54.53679275512695, + "epoch": 0.39857420609202854, + "grad_norm": 543.429809670915, + "learning_rate": 7.973751600183094e-07, + "logits/chosen": -2.518784761428833, + "logits/rejected": -2.5801219940185547, + "logps/chosen": -25.79349708557129, + "logps/rejected": -144.06625366210938, + "loss": 25.213, + "losses_ref": -0.00040732818888500333, + "ref_logps/chosen": -98.10910034179688, + "ref_logps/rejected": -89.52947235107422, + "rewards/accuracies": 0.96875, + "rewards/chosen": 72.31558227539062, + "rewards/margins": 126.8523941040039, + "rewards/rejected": -54.53679275512695, + "step": 1230, + "u": -2.2306201457977295, + "weight": 0.03126021847128868 + }, + { + "diff_generated": -52.76995086669922, + "epoch": 0.4018146467919637, + "grad_norm": 559.798929661523, + "learning_rate": 7.971998380050529e-07, + "logits/chosen": -2.4895637035369873, + "logits/rejected": -2.5688529014587402, + "logps/chosen": -25.33633041381836, + "logps/rejected": -135.40036010742188, + "loss": 24.6792, + "losses_ref": -0.23069611191749573, + "ref_logps/chosen": -97.98286437988281, + "ref_logps/rejected": -82.63040924072266, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 72.64652252197266, + "rewards/margins": 125.4164810180664, + "rewards/rejected": -52.76995086669922, + "step": 1240, + "u": -2.251986265182495, + "weight": 0.023596609011292458 + }, + { + "diff_generated": -53.05475997924805, + "epoch": 0.4050550874918989, + "grad_norm": 493.893056663407, + "learning_rate": 7.970188690960343e-07, + "logits/chosen": -2.4412055015563965, + "logits/rejected": -2.5813724994659424, + "logps/chosen": -20.939809799194336, + "logps/rejected": -137.92222595214844, + "loss": 23.4929, + "losses_ref": -0.0011973511427640915, + "ref_logps/chosen": -89.072265625, + "ref_logps/rejected": -84.86746978759766, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 68.13246154785156, + "rewards/margins": 121.1872329711914, + "rewards/rejected": -53.05475997924805, + "step": 1250, + "u": -2.1730077266693115, + "weight": 0.056313030421733856 + }, + { + "diff_generated": -53.86749267578125, + "epoch": 0.40829552819183407, + "grad_norm": 571.1919107683036, + "learning_rate": 7.968322558640458e-07, + "logits/chosen": -2.4561567306518555, + "logits/rejected": -2.5805623531341553, + "logps/chosen": -24.173444747924805, + "logps/rejected": -137.20947265625, + "loss": 24.7757, + "losses_ref": -0.0017804211238399148, + "ref_logps/chosen": -94.57585144042969, + "ref_logps/rejected": -83.34197235107422, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 70.40240478515625, + "rewards/margins": 124.26991271972656, + "rewards/rejected": -53.86749267578125, + "step": 1260, + "u": -2.1729800701141357, + "weight": 0.05634336546063423 + }, + { + "diff_generated": -51.295204162597656, + "epoch": 0.4115359688917693, + "grad_norm": 586.1857358227938, + "learning_rate": 7.966400009621233e-07, + "logits/chosen": -2.4896740913391113, + "logits/rejected": -2.57775616645813, + "logps/chosen": -24.672901153564453, + "logps/rejected": -135.76600646972656, + "loss": 25.0857, + "losses_ref": -0.005821887403726578, + "ref_logps/chosen": -92.84335327148438, + "ref_logps/rejected": -84.47081756591797, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 68.17045593261719, + "rewards/margins": 119.46565246582031, + "rewards/rejected": -51.295204162597656, + "step": 1270, + "u": -2.115187168121338, + "weight": 0.0815921351313591 + }, + { + "diff_generated": -50.49216842651367, + "epoch": 0.4147764095917045, + "grad_norm": 524.773221147273, + "learning_rate": 7.964421071235092e-07, + "logits/chosen": -2.4497692584991455, + "logits/rejected": -2.5709147453308105, + "logps/chosen": -23.245464324951172, + "logps/rejected": -127.8901596069336, + "loss": 23.9401, + "losses_ref": -0.0034059532918035984, + "ref_logps/chosen": -87.4626235961914, + "ref_logps/rejected": -77.39798736572266, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 64.21714782714844, + "rewards/margins": 114.70931243896484, + "rewards/rejected": -50.49216842651367, + "step": 1280, + "u": -2.115327835083008, + "weight": 0.08143889158964157 + }, + { + "diff_generated": -50.94807052612305, + "epoch": 0.41801685029163965, + "grad_norm": 552.3493908807122, + "learning_rate": 7.962385771616133e-07, + "logits/chosen": -2.479870557785034, + "logits/rejected": -2.5021462440490723, + "logps/chosen": -24.55561637878418, + "logps/rejected": -131.24034118652344, + "loss": 23.9531, + "losses_ref": -0.003040406620129943, + "ref_logps/chosen": -93.30787658691406, + "ref_logps/rejected": -80.29228210449219, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 68.75225067138672, + "rewards/margins": 119.70033264160156, + "rewards/rejected": -50.94807052612305, + "step": 1290, + "u": -2.1297543048858643, + "weight": 0.07515055686235428 + }, + { + "diff_generated": -53.61127471923828, + "epoch": 0.42125729099157483, + "grad_norm": 527.6859831301322, + "learning_rate": 7.960294139699724e-07, + "logits/chosen": -2.4919817447662354, + "logits/rejected": -2.580754041671753, + "logps/chosen": -22.840330123901367, + "logps/rejected": -142.553955078125, + "loss": 23.351, + "losses_ref": -1.0085510382396023e-07, + "ref_logps/chosen": -95.0149917602539, + "ref_logps/rejected": -88.94267272949219, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 72.17464447021484, + "rewards/margins": 125.78592681884766, + "rewards/rejected": -53.61127471923828, + "step": 1300, + "u": -2.173064708709717, + "weight": 0.05624999850988388 + }, + { + "diff_generated": -55.222808837890625, + "epoch": 0.42449773169151006, + "grad_norm": 575.0788238036812, + "learning_rate": 7.958146205222102e-07, + "logits/chosen": -2.4579484462738037, + "logits/rejected": -2.54203724861145, + "logps/chosen": -22.296905517578125, + "logps/rejected": -139.87576293945312, + "loss": 23.8415, + "losses_ref": -4.046513822686393e-06, + "ref_logps/chosen": -97.4652328491211, + "ref_logps/rejected": -84.65293884277344, + "rewards/accuracies": 0.96875, + "rewards/chosen": 75.16834259033203, + "rewards/margins": 130.39114379882812, + "rewards/rejected": -55.222808837890625, + "step": 1310, + "u": -2.2306294441223145, + "weight": 0.031250160187482834 + }, + { + "diff_generated": -52.997215270996094, + "epoch": 0.42773817239144524, + "grad_norm": 540.076528075968, + "learning_rate": 7.955941998719939e-07, + "logits/chosen": -2.4463772773742676, + "logits/rejected": -2.5361287593841553, + "logps/chosen": -24.38715171813965, + "logps/rejected": -138.02978515625, + "loss": 23.5507, + "losses_ref": -1.9104633963706874e-07, + "ref_logps/chosen": -91.82412719726562, + "ref_logps/rejected": -85.0325698852539, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 67.43697357177734, + "rewards/margins": 120.4342041015625, + "rewards/rejected": -52.997215270996094, + "step": 1320, + "u": -2.115499973297119, + "weight": 0.08124999701976776 + }, + { + "diff_generated": -54.24946212768555, + "epoch": 0.4309786130913804, + "grad_norm": 532.374570559775, + "learning_rate": 7.953681551529918e-07, + "logits/chosen": -2.430579662322998, + "logits/rejected": -2.5255911350250244, + "logps/chosen": -21.84467124938965, + "logps/rejected": -137.99459838867188, + "loss": 23.4925, + "losses_ref": -0.004371698014438152, + "ref_logps/chosen": -93.0247802734375, + "ref_logps/rejected": -83.7451400756836, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 71.18009948730469, + "rewards/margins": 125.4295654296875, + "rewards/rejected": -54.24946212768555, + "step": 1330, + "u": -2.172853708267212, + "weight": 0.05648232623934746 + }, + { + "diff_generated": -56.7379264831543, + "epoch": 0.43421905379131565, + "grad_norm": 629.1562646223169, + "learning_rate": 7.951364895788277e-07, + "logits/chosen": -2.4883508682250977, + "logits/rejected": -2.5553619861602783, + "logps/chosen": -22.71754264831543, + "logps/rejected": -143.79302978515625, + "loss": 23.6339, + "losses_ref": -1.7104119365285442e-07, + "ref_logps/chosen": -95.89179992675781, + "ref_logps/rejected": -87.05510711669922, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 73.17425537109375, + "rewards/margins": 129.9121551513672, + "rewards/rejected": -56.7379264831543, + "step": 1340, + "u": -2.2018468379974365, + "weight": 0.04375000298023224 + }, + { + "diff_generated": -53.1761360168457, + "epoch": 0.4374594944912508, + "grad_norm": 549.2730111417429, + "learning_rate": 7.948992064430363e-07, + "logits/chosen": -2.479661464691162, + "logits/rejected": -2.592576503753662, + "logps/chosen": -24.881542205810547, + "logps/rejected": -139.13638305664062, + "loss": 24.3426, + "losses_ref": -2.509763135094545e-06, + "ref_logps/chosen": -96.41204833984375, + "ref_logps/rejected": -85.96025085449219, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 71.53050994873047, + "rewards/margins": 124.70664978027344, + "rewards/rejected": -53.1761360168457, + "step": 1350, + "u": -2.2018468379974365, + "weight": 0.04375007376074791 + }, + { + "diff_generated": -53.69463348388672, + "epoch": 0.440699935191186, + "grad_norm": 578.1317387676165, + "learning_rate": 7.946563091190154e-07, + "logits/chosen": -2.4729490280151367, + "logits/rejected": -2.563425302505493, + "logps/chosen": -24.247787475585938, + "logps/rejected": -136.15133666992188, + "loss": 24.7494, + "losses_ref": -0.0016694276127964258, + "ref_logps/chosen": -94.1891860961914, + "ref_logps/rejected": -82.45668029785156, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 69.94139099121094, + "rewards/margins": 123.63603210449219, + "rewards/rejected": -53.69463348388672, + "step": 1360, + "u": -2.187392473220825, + "weight": 0.050070326775312424 + }, + { + "diff_generated": -50.28107452392578, + "epoch": 0.4439403758911212, + "grad_norm": 545.079547424566, + "learning_rate": 7.944078010599788e-07, + "logits/chosen": -2.498121976852417, + "logits/rejected": -2.4967360496520996, + "logps/chosen": -24.85556411743164, + "logps/rejected": -133.72714233398438, + "loss": 23.8484, + "losses_ref": -9.963375305233058e-06, + "ref_logps/chosen": -96.14152526855469, + "ref_logps/rejected": -83.4460678100586, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 71.28595733642578, + "rewards/margins": 121.56703186035156, + "rewards/rejected": -50.28107452392578, + "step": 1370, + "u": -2.115499973297119, + "weight": 0.08125033229589462 + }, + { + "diff_generated": -52.63268280029297, + "epoch": 0.4471808165910564, + "grad_norm": 584.977752142354, + "learning_rate": 7.941536857989063e-07, + "logits/chosen": -2.4137930870056152, + "logits/rejected": -2.4826889038085938, + "logps/chosen": -24.653759002685547, + "logps/rejected": -139.8968048095703, + "loss": 23.9151, + "losses_ref": -0.000810875091701746, + "ref_logps/chosen": -94.25761413574219, + "ref_logps/rejected": -87.26411437988281, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 69.6038589477539, + "rewards/margins": 122.23653411865234, + "rewards/rejected": -52.63268280029297, + "step": 1380, + "u": -2.1442465782165527, + "weight": 0.06878943741321564 + }, + { + "diff_generated": -52.39778518676758, + "epoch": 0.4504212572909916, + "grad_norm": 522.095104660753, + "learning_rate": 7.938939669484943e-07, + "logits/chosen": -2.4579412937164307, + "logits/rejected": -2.556067943572998, + "logps/chosen": -21.02359390258789, + "logps/rejected": -136.83511352539062, + "loss": 23.2373, + "losses_ref": -7.89561599958688e-05, + "ref_logps/chosen": -92.44002532958984, + "ref_logps/rejected": -84.43733215332031, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 71.41643524169922, + "rewards/margins": 123.814208984375, + "rewards/rejected": -52.39778518676758, + "step": 1390, + "u": -2.20184588432312, + "weight": 0.0437513068318367 + }, + { + "diff_generated": -52.960174560546875, + "epoch": 0.45366169799092676, + "grad_norm": 527.8687486131116, + "learning_rate": 7.936286482011041e-07, + "logits/chosen": -2.4333748817443848, + "logits/rejected": -2.498981475830078, + "logps/chosen": -24.400089263916016, + "logps/rejected": -139.44273376464844, + "loss": 24.7374, + "losses_ref": -0.00069514597998932, + "ref_logps/chosen": -94.76936340332031, + "ref_logps/rejected": -86.4825439453125, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 70.36927795410156, + "rewards/margins": 123.32945251464844, + "rewards/rejected": -52.960174560546875, + "step": 1400, + "u": -2.1874213218688965, + "weight": 0.0500384084880352 + }, + { + "diff_generated": -59.92344284057617, + "epoch": 0.45690213869086194, + "grad_norm": 534.2599238039342, + "learning_rate": 7.933577333287091e-07, + "logits/chosen": -2.417055130004883, + "logits/rejected": -2.5821313858032227, + "logps/chosen": -23.159229278564453, + "logps/rejected": -147.50450134277344, + "loss": 22.6167, + "losses_ref": -0.0012575514847412705, + "ref_logps/chosen": -91.5752944946289, + "ref_logps/rejected": -87.58106231689453, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 68.41606140136719, + "rewards/margins": 128.33950805664062, + "rewards/rejected": -59.92344284057617, + "step": 1410, + "u": -2.2017815113067627, + "weight": 0.04382243752479553 + }, + { + "diff_generated": -53.30607986450195, + "epoch": 0.46014257939079717, + "grad_norm": 522.7862086723107, + "learning_rate": 7.930812261828421e-07, + "logits/chosen": -2.4632554054260254, + "logits/rejected": -2.5481324195861816, + "logps/chosen": -27.21381187438965, + "logps/rejected": -136.55772399902344, + "loss": 24.9904, + "losses_ref": -0.0023922298569232225, + "ref_logps/chosen": -95.427001953125, + "ref_logps/rejected": -83.25163269042969, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 68.21318817138672, + "rewards/margins": 121.5192642211914, + "rewards/rejected": -53.30607986450195, + "step": 1420, + "u": -2.2017314434051514, + "weight": 0.043877117335796356 + }, + { + "diff_generated": -54.06325149536133, + "epoch": 0.46338302009073234, + "grad_norm": 521.0229100480946, + "learning_rate": 7.92799130694539e-07, + "logits/chosen": -2.4891176223754883, + "logits/rejected": -2.549175500869751, + "logps/chosen": -23.772846221923828, + "logps/rejected": -140.67127990722656, + "loss": 22.8429, + "losses_ref": -0.00034526773379184306, + "ref_logps/chosen": -95.93737030029297, + "ref_logps/rejected": -86.60801696777344, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 72.16453552246094, + "rewards/margins": 126.227783203125, + "rewards/rejected": -54.06325149536133, + "step": 1430, + "u": -2.14426589012146, + "weight": 0.06876814365386963 + }, + { + "diff_generated": -54.022361755371094, + "epoch": 0.4666234607906675, + "grad_norm": 538.1163426543451, + "learning_rate": 7.925114508742848e-07, + "logits/chosen": -2.4775466918945312, + "logits/rejected": -2.5832314491271973, + "logps/chosen": -22.278833389282227, + "logps/rejected": -137.95533752441406, + "loss": 23.6777, + "losses_ref": -1.4719394414441922e-07, + "ref_logps/chosen": -90.06036376953125, + "ref_logps/rejected": -83.9329605102539, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 67.78153228759766, + "rewards/margins": 121.80389404296875, + "rewards/rejected": -54.022361755371094, + "step": 1440, + "u": -2.187455654144287, + "weight": 0.05000000074505806 + }, + { + "diff_generated": -58.577171325683594, + "epoch": 0.4698639014906027, + "grad_norm": 512.2109426836644, + "learning_rate": 7.92218190811955e-07, + "logits/chosen": -2.4623990058898926, + "logits/rejected": -2.6098854541778564, + "logps/chosen": -23.492599487304688, + "logps/rejected": -145.59201049804688, + "loss": 22.9684, + "losses_ref": -1.1896883734152652e-05, + "ref_logps/chosen": -94.02626037597656, + "ref_logps/rejected": -87.01484680175781, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 70.5336685180664, + "rewards/margins": 129.11083984375, + "rewards/rejected": -58.577171325683594, + "step": 1450, + "u": -2.2018468379974365, + "weight": 0.04375017434358597 + }, + { + "diff_generated": -55.12044143676758, + "epoch": 0.47310434219053793, + "grad_norm": 517.9787531892808, + "learning_rate": 7.919193546767581e-07, + "logits/chosen": -2.4582018852233887, + "logits/rejected": -2.53832745552063, + "logps/chosen": -23.89086151123047, + "logps/rejected": -141.64981079101562, + "loss": 23.5758, + "losses_ref": -0.0004338372382335365, + "ref_logps/chosen": -93.01166534423828, + "ref_logps/rejected": -86.52937316894531, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 69.12080383300781, + "rewards/margins": 124.24124908447266, + "rewards/rejected": -55.12044143676758, + "step": 1460, + "u": -2.201826333999634, + "weight": 0.04377306252717972 + }, + { + "diff_generated": -54.81425857543945, + "epoch": 0.4763447828904731, + "grad_norm": 489.74655295683687, + "learning_rate": 7.916149467171768e-07, + "logits/chosen": -2.467390537261963, + "logits/rejected": -2.532069444656372, + "logps/chosen": -20.180316925048828, + "logps/rejected": -134.23587036132812, + "loss": 23.0804, + "losses_ref": -8.911225449992344e-06, + "ref_logps/chosen": -90.47590637207031, + "ref_logps/rejected": -79.42161560058594, + "rewards/accuracies": 0.9375, + "rewards/chosen": 70.29559326171875, + "rewards/margins": 125.1098403930664, + "rewards/rejected": -54.81425857543945, + "step": 1470, + "u": -2.1586732864379883, + "weight": 0.0625002458691597 + }, + { + "diff_generated": -52.9602165222168, + "epoch": 0.4795852235904083, + "grad_norm": 495.5308876754751, + "learning_rate": 7.913049712609066e-07, + "logits/chosen": -2.448991298675537, + "logits/rejected": -2.5381381511688232, + "logps/chosen": -22.26654052734375, + "logps/rejected": -135.81704711914062, + "loss": 22.7017, + "losses_ref": -3.1697170470579294e-06, + "ref_logps/chosen": -91.15074157714844, + "ref_logps/rejected": -82.85684204101562, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 68.88420104980469, + "rewards/margins": 121.84442138671875, + "rewards/rejected": -52.9602165222168, + "step": 1480, + "u": -2.115499973297119, + "weight": 0.08125009387731552 + }, + { + "diff_generated": -53.670166015625, + "epoch": 0.48282566429034346, + "grad_norm": 517.8427068314436, + "learning_rate": 7.909894327147949e-07, + "logits/chosen": -2.478231430053711, + "logits/rejected": -2.5495798587799072, + "logps/chosen": -23.509714126586914, + "logps/rejected": -138.92987060546875, + "loss": 23.4401, + "losses_ref": -0.0008373827440664172, + "ref_logps/chosen": -95.60778045654297, + "ref_logps/rejected": -85.25968933105469, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 72.09806823730469, + "rewards/margins": 125.76823425292969, + "rewards/rejected": -53.670166015625, + "step": 1490, + "u": -2.216200351715088, + "weight": 0.03754196688532829 + }, + { + "diff_generated": -58.10505294799805, + "epoch": 0.4860661049902787, + "grad_norm": 546.7827706515662, + "learning_rate": 7.906683355647783e-07, + "logits/chosen": -2.4726247787475586, + "logits/rejected": -2.5774738788604736, + "logps/chosen": -22.412656784057617, + "logps/rejected": -148.88279724121094, + "loss": 23.0294, + "losses_ref": -0.0006750643369741738, + "ref_logps/chosen": -94.51060485839844, + "ref_logps/rejected": -90.77774810791016, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 72.09796142578125, + "rewards/margins": 130.2030029296875, + "rewards/rejected": -58.10505294799805, + "step": 1500, + "u": -2.2162046432495117, + "weight": 0.03753728047013283 + }, + { + "diff_generated": -57.015953063964844, + "epoch": 0.48930654569021387, + "grad_norm": 553.5451201077923, + "learning_rate": 7.903416843758187e-07, + "logits/chosen": -2.5150179862976074, + "logits/rejected": -2.61221981048584, + "logps/chosen": -22.41795539855957, + "logps/rejected": -141.7072296142578, + "loss": 23.4806, + "losses_ref": -0.003707682015374303, + "ref_logps/chosen": -93.69750213623047, + "ref_logps/rejected": -84.69126892089844, + "rewards/accuracies": 0.9375, + "rewards/chosen": 71.279541015625, + "rewards/margins": 128.29550170898438, + "rewards/rejected": -57.015953063964844, + "step": 1510, + "u": -2.158477306365967, + "weight": 0.06271643191576004 + }, + { + "diff_generated": -55.60862350463867, + "epoch": 0.49254698639014904, + "grad_norm": 523.2448385167842, + "learning_rate": 7.900094837918385e-07, + "logits/chosen": -2.4987995624542236, + "logits/rejected": -2.5742671489715576, + "logps/chosen": -26.006755828857422, + "logps/rejected": -144.40855407714844, + "loss": 23.9932, + "losses_ref": -0.0009523486951366067, + "ref_logps/chosen": -97.47042083740234, + "ref_logps/rejected": -88.79991912841797, + "rewards/accuracies": 0.96875, + "rewards/chosen": 71.46366119384766, + "rewards/margins": 127.0722885131836, + "rewards/rejected": -55.60862350463867, + "step": 1520, + "u": -2.230584144592285, + "weight": 0.031299836933612823 + }, + { + "diff_generated": -57.468544006347656, + "epoch": 0.4957874270900843, + "grad_norm": 542.5668513892613, + "learning_rate": 7.896717385356545e-07, + "logits/chosen": -2.487705707550049, + "logits/rejected": -2.624055862426758, + "logps/chosen": -22.60787582397461, + "logps/rejected": -146.48764038085938, + "loss": 22.4425, + "losses_ref": -4.9768182179832365e-06, + "ref_logps/chosen": -95.36245727539062, + "ref_logps/rejected": -89.01911163330078, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 72.75457763671875, + "rewards/margins": 130.22311401367188, + "rewards/rejected": -57.468544006347656, + "step": 1530, + "u": -2.2450203895568848, + "weight": 0.025000065565109253 + }, + { + "diff_generated": -55.307334899902344, + "epoch": 0.49902786779001945, + "grad_norm": 580.1655904703864, + "learning_rate": 7.893284534089109e-07, + "logits/chosen": -2.4579875469207764, + "logits/rejected": -2.552023410797119, + "logps/chosen": -22.494773864746094, + "logps/rejected": -138.22250366210938, + "loss": 23.8534, + "losses_ref": -0.00047390550025738776, + "ref_logps/chosen": -94.70335388183594, + "ref_logps/rejected": -82.91517639160156, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 72.20856475830078, + "rewards/margins": 127.51590728759766, + "rewards/rejected": -55.307334899902344, + "step": 1540, + "u": -2.2162322998046875, + "weight": 0.037506647408008575 + }, + { + "diff_generated": -53.2509651184082, + "epoch": 0.5022683084899546, + "grad_norm": 505.97810942841124, + "learning_rate": 7.889796332920106e-07, + "logits/chosen": -2.429352283477783, + "logits/rejected": -2.5697340965270996, + "logps/chosen": -21.56386947631836, + "logps/rejected": -135.63404846191406, + "loss": 22.5935, + "losses_ref": -0.002656723605468869, + "ref_logps/chosen": -88.36921691894531, + "ref_logps/rejected": -82.38307189941406, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 66.80535125732422, + "rewards/margins": 120.05632019042969, + "rewards/rejected": -53.2509651184082, + "step": 1550, + "u": -2.1297876834869385, + "weight": 0.0751144140958786 + }, + { + "diff_generated": -55.338844299316406, + "epoch": 0.5055087491898899, + "grad_norm": 517.7480424570723, + "learning_rate": 7.886252831440465e-07, + "logits/chosen": -2.472877025604248, + "logits/rejected": -2.583745241165161, + "logps/chosen": -24.70361328125, + "logps/rejected": -147.2733917236328, + "loss": 23.1055, + "losses_ref": -0.0028544296510517597, + "ref_logps/chosen": -95.76852416992188, + "ref_logps/rejected": -91.93456268310547, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 71.0649185180664, + "rewards/margins": 126.40376281738281, + "rewards/rejected": -55.338844299316406, + "step": 1560, + "u": -2.2448930740356445, + "weight": 0.025140201672911644 + }, + { + "diff_generated": -56.882781982421875, + "epoch": 0.508749189889825, + "grad_norm": 507.17514765947175, + "learning_rate": 7.882654080027304e-07, + "logits/chosen": -2.4699952602386475, + "logits/rejected": -2.575896978378296, + "logps/chosen": -22.763708114624023, + "logps/rejected": -146.9736328125, + "loss": 23.5348, + "losses_ref": -0.0004549544246401638, + "ref_logps/chosen": -96.1058578491211, + "ref_logps/rejected": -90.09082794189453, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 73.34214782714844, + "rewards/margins": 130.22494506835938, + "rewards/rejected": -56.882781982421875, + "step": 1570, + "u": -2.2162158489227295, + "weight": 0.037524718791246414 + }, + { + "diff_generated": -52.004127502441406, + "epoch": 0.5119896305897602, + "grad_norm": 464.5625476061178, + "learning_rate": 7.879000129843218e-07, + "logits/chosen": -2.5297818183898926, + "logits/rejected": -2.567648410797119, + "logps/chosen": -26.26664161682129, + "logps/rejected": -136.30093383789062, + "loss": 23.1308, + "losses_ref": -0.0055158380419015884, + "ref_logps/chosen": -96.4405517578125, + "ref_logps/rejected": -84.29681396484375, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 70.17391204833984, + "rewards/margins": 122.17803955078125, + "rewards/rejected": -52.004127502441406, + "step": 1580, + "u": -2.1297948360443115, + "weight": 0.07510620355606079 + }, + { + "diff_generated": -55.481468200683594, + "epoch": 0.5152300712896954, + "grad_norm": 530.5031033607447, + "learning_rate": 7.87529103283555e-07, + "logits/chosen": -2.529362201690674, + "logits/rejected": -2.5975449085235596, + "logps/chosen": -24.188579559326172, + "logps/rejected": -146.2343292236328, + "loss": 23.4568, + "losses_ref": -1.0383139397163177e-06, + "ref_logps/chosen": -96.58631896972656, + "ref_logps/rejected": -90.75286102294922, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 72.39774322509766, + "rewards/margins": 127.87921142578125, + "rewards/rejected": -55.481468200683594, + "step": 1590, + "u": -2.173064708709717, + "weight": 0.05625002458691597 + }, + { + "diff_generated": -54.71356964111328, + "epoch": 0.5184705119896306, + "grad_norm": 487.43724963171377, + "learning_rate": 7.871526841735649e-07, + "logits/chosen": -2.499135971069336, + "logits/rejected": -2.5506656169891357, + "logps/chosen": -22.029584884643555, + "logps/rejected": -143.09744262695312, + "loss": 23.0352, + "losses_ref": -9.371944543090649e-06, + "ref_logps/chosen": -93.57704162597656, + "ref_logps/rejected": -88.3838882446289, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 71.54745483398438, + "rewards/margins": 126.26102447509766, + "rewards/rejected": -54.71356964111328, + "step": 1600, + "u": -2.1730644702911377, + "weight": 0.056250352412462234 + }, + { + "diff_generated": -58.845428466796875, + "epoch": 0.5217109526895658, + "grad_norm": 465.2252457978142, + "learning_rate": 7.867707610058127e-07, + "logits/chosen": -2.5014264583587646, + "logits/rejected": -2.617633581161499, + "logps/chosen": -22.12125587463379, + "logps/rejected": -146.1781463623047, + "loss": 24.4162, + "losses_ref": -1.3118931008193613e-07, + "ref_logps/chosen": -98.06317901611328, + "ref_logps/rejected": -87.33271789550781, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 75.94192504882812, + "rewards/margins": 134.78736877441406, + "rewards/rejected": -58.845428466796875, + "step": 1610, + "u": -2.2018468379974365, + "weight": 0.04374999925494194 + }, + { + "diff_generated": -57.821632385253906, + "epoch": 0.5249513933895009, + "grad_norm": 530.9806499111314, + "learning_rate": 7.863833392100093e-07, + "logits/chosen": -2.4269180297851562, + "logits/rejected": -2.5671417713165283, + "logps/chosen": -20.6740665435791, + "logps/rejected": -143.11346435546875, + "loss": 23.1303, + "losses_ref": -0.0014479614328593016, + "ref_logps/chosen": -90.92362213134766, + "ref_logps/rejected": -85.29182434082031, + "rewards/accuracies": 0.96875, + "rewards/chosen": 70.24955749511719, + "rewards/margins": 128.07119750976562, + "rewards/rejected": -57.821632385253906, + "step": 1620, + "u": -2.2305593490600586, + "weight": 0.03132731840014458 + }, + { + "diff_generated": -56.96986770629883, + "epoch": 0.5281918340894362, + "grad_norm": 527.1711533731245, + "learning_rate": 7.859904242940385e-07, + "logits/chosen": -2.485358476638794, + "logits/rejected": -2.5643229484558105, + "logps/chosen": -22.638986587524414, + "logps/rejected": -146.45889282226562, + "loss": 23.265, + "losses_ref": -0.0023109859321266413, + "ref_logps/chosen": -95.79054260253906, + "ref_logps/rejected": -89.48902893066406, + "rewards/accuracies": 0.96875, + "rewards/chosen": 73.15155792236328, + "rewards/margins": 130.1214141845703, + "rewards/rejected": -56.96986770629883, + "step": 1630, + "u": -2.2305521965026855, + "weight": 0.03133513033390045 + }, + { + "diff_generated": -55.67280197143555, + "epoch": 0.5314322747893714, + "grad_norm": 631.6742557579007, + "learning_rate": 7.855920218438783e-07, + "logits/chosen": -2.473820447921753, + "logits/rejected": -2.5379204750061035, + "logps/chosen": -22.74087905883789, + "logps/rejected": -139.18263244628906, + "loss": 23.9599, + "losses_ref": -3.2874831958906725e-05, + "ref_logps/chosen": -94.07108306884766, + "ref_logps/rejected": -83.50982666015625, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 71.33020782470703, + "rewards/margins": 127.00301361083984, + "rewards/rejected": -55.67280197143555, + "step": 1640, + "u": -2.187455654144287, + "weight": 0.05000032112002373 + }, + { + "diff_generated": -56.30420684814453, + "epoch": 0.5346727154893065, + "grad_norm": 609.874994767621, + "learning_rate": 7.851881375235216e-07, + "logits/chosen": -2.5344715118408203, + "logits/rejected": -2.588593006134033, + "logps/chosen": -21.436311721801758, + "logps/rejected": -141.29171752929688, + "loss": 22.9227, + "losses_ref": -2.7268544045000453e-07, + "ref_logps/chosen": -95.1426010131836, + "ref_logps/rejected": -84.98751068115234, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 73.70628356933594, + "rewards/margins": 130.010498046875, + "rewards/rejected": -56.30420684814453, + "step": 1650, + "u": -2.115499973297119, + "weight": 0.08125000447034836 + }, + { + "diff_generated": -55.4068603515625, + "epoch": 0.5379131561892417, + "grad_norm": 576.0051849898757, + "learning_rate": 7.847787770748959e-07, + "logits/chosen": -2.5346810817718506, + "logits/rejected": -2.5801634788513184, + "logps/chosen": -24.148042678833008, + "logps/rejected": -148.05355834960938, + "loss": 24.3066, + "losses_ref": -0.002318193670362234, + "ref_logps/chosen": -97.6961441040039, + "ref_logps/rejected": -92.64669036865234, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 73.548095703125, + "rewards/margins": 128.95497131347656, + "rewards/rejected": -55.4068603515625, + "step": 1660, + "u": -2.216139554977417, + "weight": 0.03760867565870285 + }, + { + "diff_generated": -55.42523193359375, + "epoch": 0.541153596889177, + "grad_norm": 480.4127839107038, + "learning_rate": 7.843639463177815e-07, + "logits/chosen": -2.4996581077575684, + "logits/rejected": -2.627084732055664, + "logps/chosen": -23.054407119750977, + "logps/rejected": -146.1646270751953, + "loss": 22.0121, + "losses_ref": -3.36450739268912e-07, + "ref_logps/chosen": -93.96330261230469, + "ref_logps/rejected": -90.73939514160156, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 70.90888977050781, + "rewards/margins": 126.33412170410156, + "rewards/rejected": -55.42523193359375, + "step": 1670, + "u": -2.2018468379974365, + "weight": 0.043750010430812836 + }, + { + "diff_generated": -56.191612243652344, + "epoch": 0.5443940375891121, + "grad_norm": 498.8808720156564, + "learning_rate": 7.839436511497288e-07, + "logits/chosen": -2.4894304275512695, + "logits/rejected": -2.5800139904022217, + "logps/chosen": -23.490821838378906, + "logps/rejected": -147.164794921875, + "loss": 22.1627, + "losses_ref": -0.003980209585279226, + "ref_logps/chosen": -95.75300598144531, + "ref_logps/rejected": -90.97319030761719, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 72.26219940185547, + "rewards/margins": 128.4538116455078, + "rewards/rejected": -56.191612243652344, + "step": 1680, + "u": -2.1730291843414307, + "weight": 0.05628935620188713 + }, + { + "diff_generated": -55.59052276611328, + "epoch": 0.5476344782890473, + "grad_norm": 526.1634443234373, + "learning_rate": 7.835178975459744e-07, + "logits/chosen": -2.4564805030822754, + "logits/rejected": -2.5387251377105713, + "logps/chosen": -21.98865509033203, + "logps/rejected": -137.17153930664062, + "loss": 22.9433, + "losses_ref": -0.001532155554741621, + "ref_logps/chosen": -90.76319122314453, + "ref_logps/rejected": -81.58100891113281, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 68.7745361328125, + "rewards/margins": 124.36505126953125, + "rewards/rejected": -55.59052276611328, + "step": 1690, + "u": -2.2017760276794434, + "weight": 0.04382842034101486 + }, + { + "diff_generated": -59.38103103637695, + "epoch": 0.5508749189889826, + "grad_norm": 524.3064296692643, + "learning_rate": 7.83086691559356e-07, + "logits/chosen": -2.50773286819458, + "logits/rejected": -2.5514986515045166, + "logps/chosen": -21.255802154541016, + "logps/rejected": -146.8485565185547, + "loss": 23.0709, + "losses_ref": -0.001003900310024619, + "ref_logps/chosen": -97.35025024414062, + "ref_logps/rejected": -87.467529296875, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 76.09444427490234, + "rewards/margins": 135.47547912597656, + "rewards/rejected": -59.38103103637695, + "step": 1700, + "u": -2.187408924102783, + "weight": 0.05005186051130295 + }, + { + "diff_generated": -55.77043914794922, + "epoch": 0.5541153596889177, + "grad_norm": 484.2893979735214, + "learning_rate": 7.826500393202268e-07, + "logits/chosen": -2.465657949447632, + "logits/rejected": -2.5137181282043457, + "logps/chosen": -24.55307388305664, + "logps/rejected": -139.7406005859375, + "loss": 23.3985, + "losses_ref": -0.0014938053209334612, + "ref_logps/chosen": -96.75263977050781, + "ref_logps/rejected": -83.97016143798828, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 72.19956970214844, + "rewards/margins": 127.97001647949219, + "rewards/rejected": -55.77043914794922, + "step": 1710, + "u": -2.1874420642852783, + "weight": 0.050015270709991455 + }, + { + "diff_generated": -55.82611083984375, + "epoch": 0.5573558003888529, + "grad_norm": 525.7360101180998, + "learning_rate": 7.82207947036368e-07, + "logits/chosen": -2.4367599487304688, + "logits/rejected": -2.5369248390197754, + "logps/chosen": -21.981922149658203, + "logps/rejected": -137.39761352539062, + "loss": 22.7757, + "losses_ref": -0.0003991415142081678, + "ref_logps/chosen": -91.92091369628906, + "ref_logps/rejected": -81.5715103149414, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 69.93899536132812, + "rewards/margins": 125.76509857177734, + "rewards/rejected": -55.82611083984375, + "step": 1720, + "u": -2.173046588897705, + "weight": 0.05627021938562393 + }, + { + "diff_generated": -54.84632110595703, + "epoch": 0.560596241088788, + "grad_norm": 505.5876111269165, + "learning_rate": 7.817604209929007e-07, + "logits/chosen": -2.483346462249756, + "logits/rejected": -2.494868755340576, + "logps/chosen": -25.0811824798584, + "logps/rejected": -136.0178680419922, + "loss": 23.4849, + "losses_ref": -0.005923398770391941, + "ref_logps/chosen": -98.15687561035156, + "ref_logps/rejected": -81.17155456542969, + "rewards/accuracies": 0.9375, + "rewards/chosen": 73.07569885253906, + "rewards/margins": 127.9220199584961, + "rewards/rejected": -54.84632110595703, + "step": 1730, + "u": -2.1583805084228516, + "weight": 0.06282185018062592 + }, + { + "diff_generated": -59.43408203125, + "epoch": 0.5638366817887233, + "grad_norm": 477.8168424857454, + "learning_rate": 7.813074675521962e-07, + "logits/chosen": -2.5375099182128906, + "logits/rejected": -2.572326898574829, + "logps/chosen": -24.597471237182617, + "logps/rejected": -143.44293212890625, + "loss": 23.7144, + "losses_ref": -0.002471204148605466, + "ref_logps/chosen": -99.7923355102539, + "ref_logps/rejected": -84.00885772705078, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 75.19486999511719, + "rewards/margins": 134.6289520263672, + "rewards/rejected": -59.43408203125, + "step": 1740, + "u": -2.2448902130126953, + "weight": 0.025143280625343323 + }, + { + "diff_generated": -56.804656982421875, + "epoch": 0.5670771224886585, + "grad_norm": 494.3841984274702, + "learning_rate": 7.80849093153786e-07, + "logits/chosen": -2.491807699203491, + "logits/rejected": -2.5932672023773193, + "logps/chosen": -21.136425018310547, + "logps/rejected": -142.30722045898438, + "loss": 22.1587, + "losses_ref": -0.00046972898417152464, + "ref_logps/chosen": -92.85779571533203, + "ref_logps/rejected": -85.50257110595703, + "rewards/accuracies": 0.9375, + "rewards/chosen": 71.72136688232422, + "rewards/margins": 128.52603149414062, + "rewards/rejected": -56.804656982421875, + "step": 1750, + "u": -2.1586501598358154, + "weight": 0.06252589076757431 + }, + { + "diff_generated": -56.39727783203125, + "epoch": 0.5703175631885936, + "grad_norm": 460.22585311602836, + "learning_rate": 7.803853043142702e-07, + "logits/chosen": -2.4713480472564697, + "logits/rejected": -2.581171989440918, + "logps/chosen": -24.284358978271484, + "logps/rejected": -141.79885864257812, + "loss": 21.9982, + "losses_ref": -9.926590109898825e-07, + "ref_logps/chosen": -95.4493637084961, + "ref_logps/rejected": -85.4015884399414, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 71.16500854492188, + "rewards/margins": 127.56229400634766, + "rewards/rejected": -56.39727783203125, + "step": 1760, + "u": -2.173064708709717, + "weight": 0.056250035762786865 + }, + { + "diff_generated": -57.44085693359375, + "epoch": 0.5735580038885288, + "grad_norm": 498.84117678721265, + "learning_rate": 7.799161076272245e-07, + "logits/chosen": -2.4647529125213623, + "logits/rejected": -2.568406581878662, + "logps/chosen": -22.84539031982422, + "logps/rejected": -136.6644744873047, + "loss": 22.619, + "losses_ref": -6.464334546762984e-06, + "ref_logps/chosen": -93.20040130615234, + "ref_logps/rejected": -79.22361755371094, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 70.35499572753906, + "rewards/margins": 127.79586029052734, + "rewards/rejected": -57.44085693359375, + "step": 1770, + "u": -2.187455654144287, + "weight": 0.0500001423060894 + }, + { + "diff_generated": -57.65531539916992, + "epoch": 0.5767984445884641, + "grad_norm": 567.0598517544732, + "learning_rate": 7.794415097631066e-07, + "logits/chosen": -2.485158681869507, + "logits/rejected": -2.5438666343688965, + "logps/chosen": -22.188308715820312, + "logps/rejected": -140.32151794433594, + "loss": 23.3885, + "losses_ref": -0.000717981078196317, + "ref_logps/chosen": -94.6068344116211, + "ref_logps/rejected": -82.66620635986328, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 72.41851806640625, + "rewards/margins": 130.07383728027344, + "rewards/rejected": -57.65531539916992, + "step": 1780, + "u": -2.2450015544891357, + "weight": 0.025021100416779518 + }, + { + "diff_generated": -58.910499572753906, + "epoch": 0.5800388852883992, + "grad_norm": 502.2065972530842, + "learning_rate": 7.789615174691619e-07, + "logits/chosen": -2.433727741241455, + "logits/rejected": -2.560873508453369, + "logps/chosen": -24.428359985351562, + "logps/rejected": -149.1986541748047, + "loss": 23.0511, + "losses_ref": -5.546243073695223e-07, + "ref_logps/chosen": -95.4670181274414, + "ref_logps/rejected": -90.28814697265625, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 71.03865814208984, + "rewards/margins": 129.9491424560547, + "rewards/rejected": -58.910499572753906, + "step": 1790, + "u": -2.187455654144287, + "weight": 0.050000011920928955 + }, + { + "diff_generated": -58.59474563598633, + "epoch": 0.5832793259883344, + "grad_norm": 514.5693084653902, + "learning_rate": 7.784761375693268e-07, + "logits/chosen": -2.416273832321167, + "logits/rejected": -2.5081026554107666, + "logps/chosen": -23.346494674682617, + "logps/rejected": -149.67361450195312, + "loss": 23.1464, + "losses_ref": -1.1515285223140381e-05, + "ref_logps/chosen": -92.47997283935547, + "ref_logps/rejected": -91.0788803100586, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 69.13347625732422, + "rewards/margins": 127.72822570800781, + "rewards/rejected": -58.59474563598633, + "step": 1800, + "u": -2.1730644702911377, + "weight": 0.05625023692846298 + }, + { + "diff_generated": -55.998878479003906, + "epoch": 0.5865197666882696, + "grad_norm": 519.4944457021509, + "learning_rate": 7.779853769641319e-07, + "logits/chosen": -2.4430603981018066, + "logits/rejected": -2.5393834114074707, + "logps/chosen": -24.854917526245117, + "logps/rejected": -139.3914794921875, + "loss": 22.3479, + "losses_ref": -0.00019149412401020527, + "ref_logps/chosen": -95.69673156738281, + "ref_logps/rejected": -83.3926010131836, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 70.84181213378906, + "rewards/margins": 126.84068298339844, + "rewards/rejected": -55.998878479003906, + "step": 1810, + "u": -2.2018439769744873, + "weight": 0.04375326260924339 + }, + { + "diff_generated": -58.254791259765625, + "epoch": 0.5897602073882048, + "grad_norm": 512.5708535156688, + "learning_rate": 7.774892426306042e-07, + "logits/chosen": -2.466561794281006, + "logits/rejected": -2.5939698219299316, + "logps/chosen": -21.761241912841797, + "logps/rejected": -147.48875427246094, + "loss": 22.3906, + "losses_ref": -0.002497596899047494, + "ref_logps/chosen": -91.84721374511719, + "ref_logps/rejected": -89.23395538330078, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 70.08597564697266, + "rewards/margins": 128.34075927734375, + "rewards/rejected": -58.254791259765625, + "step": 1820, + "u": -2.1298465728759766, + "weight": 0.07504962384700775 + }, + { + "diff_generated": -57.40629959106445, + "epoch": 0.59300064808814, + "grad_norm": 455.3768485779574, + "learning_rate": 7.769877416221678e-07, + "logits/chosen": -2.468407392501831, + "logits/rejected": -2.515033006668091, + "logps/chosen": -25.566970825195312, + "logps/rejected": -142.10153198242188, + "loss": 24.0047, + "losses_ref": -6.456654091380187e-07, + "ref_logps/chosen": -97.21444702148438, + "ref_logps/rejected": -84.69524383544922, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 71.64747619628906, + "rewards/margins": 129.05377197265625, + "rewards/rejected": -57.40629959106445, + "step": 1830, + "u": -2.187455654144287, + "weight": 0.05000002309679985 + }, + { + "diff_generated": -55.58686065673828, + "epoch": 0.5962410887880751, + "grad_norm": 484.9968123238012, + "learning_rate": 7.764808810685433e-07, + "logits/chosen": -2.431522846221924, + "logits/rejected": -2.5557100772857666, + "logps/chosen": -19.275615692138672, + "logps/rejected": -136.2754364013672, + "loss": 22.7215, + "losses_ref": -0.0005879181553609669, + "ref_logps/chosen": -88.68095397949219, + "ref_logps/rejected": -80.68856048583984, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 69.40534210205078, + "rewards/margins": 124.99220275878906, + "rewards/rejected": -55.58686065673828, + "step": 1840, + "u": -2.1298632621765137, + "weight": 0.0750310942530632 + }, + { + "diff_generated": -56.19511795043945, + "epoch": 0.5994815294880104, + "grad_norm": 498.45416111428847, + "learning_rate": 7.759686681756468e-07, + "logits/chosen": -2.4816057682037354, + "logits/rejected": -2.5456433296203613, + "logps/chosen": -22.139755249023438, + "logps/rejected": -142.64230346679688, + "loss": 22.4336, + "losses_ref": -0.006184516940265894, + "ref_logps/chosen": -93.66046142578125, + "ref_logps/rejected": -86.44719696044922, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 71.52071380615234, + "rewards/margins": 127.7158203125, + "rewards/rejected": -56.19511795043945, + "step": 1850, + "u": -2.1727375984191895, + "weight": 0.056604884564876556 + }, + { + "diff_generated": -58.84028244018555, + "epoch": 0.6027219701879456, + "grad_norm": 458.8710847819926, + "learning_rate": 7.754511102254876e-07, + "logits/chosen": -2.4350783824920654, + "logits/rejected": -2.54716420173645, + "logps/chosen": -21.77615737915039, + "logps/rejected": -140.6431884765625, + "loss": 23.2702, + "losses_ref": -5.444636826723581e-06, + "ref_logps/chosen": -90.06400299072266, + "ref_logps/rejected": -81.80291748046875, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 68.28785705566406, + "rewards/margins": 127.12812805175781, + "rewards/rejected": -58.84028244018555, + "step": 1860, + "u": -2.2018465995788574, + "weight": 0.04375021532177925 + }, + { + "diff_generated": -54.66754913330078, + "epoch": 0.6059624108878807, + "grad_norm": 487.4657979052523, + "learning_rate": 7.74928214576064e-07, + "logits/chosen": -2.4859158992767334, + "logits/rejected": -2.5099780559539795, + "logps/chosen": -23.468854904174805, + "logps/rejected": -139.61276245117188, + "loss": 23.0208, + "losses_ref": -2.9951024771435186e-06, + "ref_logps/chosen": -98.22453308105469, + "ref_logps/rejected": -84.94522094726562, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 74.75566864013672, + "rewards/margins": 129.42323303222656, + "rewards/rejected": -54.66754913330078, + "step": 1870, + "u": -2.216238021850586, + "weight": 0.037500061094760895 + }, + { + "diff_generated": -60.11989212036133, + "epoch": 0.609202851587816, + "grad_norm": 472.01258007651603, + "learning_rate": 7.743999886612591e-07, + "logits/chosen": -2.461061954498291, + "logits/rejected": -2.5660219192504883, + "logps/chosen": -22.852632522583008, + "logps/rejected": -152.16908264160156, + "loss": 21.9321, + "losses_ref": -5.277171112538781e-06, + "ref_logps/chosen": -96.39404296875, + "ref_logps/rejected": -92.0491943359375, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 73.5414047241211, + "rewards/margins": 133.66128540039062, + "rewards/rejected": -60.11989212036133, + "step": 1880, + "u": -2.2738025188446045, + "weight": 0.012500083073973656 + }, + { + "diff_generated": -61.25305938720703, + "epoch": 0.6124432922877512, + "grad_norm": 497.17719794133916, + "learning_rate": 7.738664399907355e-07, + "logits/chosen": -2.4737637042999268, + "logits/rejected": -2.5895867347717285, + "logps/chosen": -22.45340919494629, + "logps/rejected": -151.78509521484375, + "loss": 21.5031, + "losses_ref": -0.002105607185512781, + "ref_logps/chosen": -94.99655151367188, + "ref_logps/rejected": -90.53204345703125, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 72.54315185546875, + "rewards/margins": 133.79620361328125, + "rewards/rejected": -61.25305938720703, + "step": 1890, + "u": -2.2593088150024414, + "weight": 0.018863562494516373 + }, + { + "diff_generated": -55.67549514770508, + "epoch": 0.6156837329876863, + "grad_norm": 473.5608592373336, + "learning_rate": 7.733275761498278e-07, + "logits/chosen": -2.494147777557373, + "logits/rejected": -2.535625457763672, + "logps/chosen": -24.380495071411133, + "logps/rejected": -140.7521514892578, + "loss": 22.8896, + "losses_ref": -0.00022286793682724237, + "ref_logps/chosen": -94.96155548095703, + "ref_logps/rejected": -85.07664489746094, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 70.5810546875, + "rewards/margins": 126.25655364990234, + "rewards/rejected": -55.67549514770508, + "step": 1900, + "u": -2.17305850982666, + "weight": 0.056256867945194244 + }, + { + "diff_generated": -57.19377517700195, + "epoch": 0.6189241736876215, + "grad_norm": 511.7405798643593, + "learning_rate": 7.727834047994353e-07, + "logits/chosen": -2.463592529296875, + "logits/rejected": -2.5578956604003906, + "logps/chosen": -26.32914161682129, + "logps/rejected": -145.5801239013672, + "loss": 22.4974, + "losses_ref": -2.7549589503905736e-05, + "ref_logps/chosen": -98.46235656738281, + "ref_logps/rejected": -88.3863525390625, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 72.13322448730469, + "rewards/margins": 129.32699584960938, + "rewards/rejected": -57.19377517700195, + "step": 1910, + "u": -2.1874547004699707, + "weight": 0.05000131204724312 + }, + { + "diff_generated": -57.825355529785156, + "epoch": 0.6221646143875567, + "grad_norm": 505.19909052460184, + "learning_rate": 7.722339336759129e-07, + "logits/chosen": -2.391700267791748, + "logits/rejected": -2.5454742908477783, + "logps/chosen": -23.258031845092773, + "logps/rejected": -143.39334106445312, + "loss": 23.1168, + "losses_ref": -6.362137355608866e-05, + "ref_logps/chosen": -91.22623443603516, + "ref_logps/rejected": -85.56798553466797, + "rewards/accuracies": 0.9375, + "rewards/chosen": 67.96820068359375, + "rewards/margins": 125.79354095458984, + "rewards/rejected": -57.825355529785156, + "step": 1920, + "u": -2.158673048019409, + "weight": 0.0625004917383194 + }, + { + "diff_generated": -54.9102668762207, + "epoch": 0.6254050550874919, + "grad_norm": 529.4901064435244, + "learning_rate": 7.71679170590961e-07, + "logits/chosen": -2.515338897705078, + "logits/rejected": -2.560856580734253, + "logps/chosen": -24.146343231201172, + "logps/rejected": -137.59536743164062, + "loss": 21.9412, + "losses_ref": -0.0008923925342969596, + "ref_logps/chosen": -96.57925415039062, + "ref_logps/rejected": -82.68508911132812, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 72.43292236328125, + "rewards/margins": 127.34318542480469, + "rewards/rejected": -54.9102668762207, + "step": 1930, + "u": -2.18741512298584, + "weight": 0.050045304000377655 + }, + { + "diff_generated": -59.83012771606445, + "epoch": 0.6286454957874271, + "grad_norm": 438.39465837647845, + "learning_rate": 7.711191234315146e-07, + "logits/chosen": -2.4719552993774414, + "logits/rejected": -2.5544826984405518, + "logps/chosen": -23.674427032470703, + "logps/rejected": -149.69705200195312, + "loss": 22.4189, + "losses_ref": -1.7031243260134943e-06, + "ref_logps/chosen": -98.4489974975586, + "ref_logps/rejected": -89.86692810058594, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 74.77456665039062, + "rewards/margins": 134.60470581054688, + "rewards/rejected": -59.83012771606445, + "step": 1940, + "u": -2.2450203895568848, + "weight": 0.025000065565109253 + }, + { + "diff_generated": -58.504981994628906, + "epoch": 0.6318859364873622, + "grad_norm": 487.23718047122196, + "learning_rate": 7.705538001596312e-07, + "logits/chosen": -2.477814197540283, + "logits/rejected": -2.5949177742004395, + "logps/chosen": -20.551475524902344, + "logps/rejected": -148.85836791992188, + "loss": 22.8294, + "losses_ref": -5.330602625974734e-09, + "ref_logps/chosen": -92.30406188964844, + "ref_logps/rejected": -90.35337829589844, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 71.75257873535156, + "rewards/margins": 130.25755310058594, + "rewards/rejected": -58.504981994628906, + "step": 1950, + "u": -2.187455654144287, + "weight": 0.05000000074505806 + }, + { + "diff_generated": -57.42897415161133, + "epoch": 0.6351263771872975, + "grad_norm": 488.39965331393387, + "learning_rate": 7.699832088123774e-07, + "logits/chosen": -2.504335880279541, + "logits/rejected": -2.5275423526763916, + "logps/chosen": -24.176315307617188, + "logps/rejected": -143.69924926757812, + "loss": 23.0429, + "losses_ref": -3.060336439375533e-06, + "ref_logps/chosen": -98.70487213134766, + "ref_logps/rejected": -86.2702865600586, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 74.52854919433594, + "rewards/margins": 131.95753479003906, + "rewards/rejected": -57.42897415161133, + "step": 1960, + "u": -2.2018468379974365, + "weight": 0.04375012591481209 + }, + { + "diff_generated": -58.23870849609375, + "epoch": 0.6383668178872327, + "grad_norm": 504.1265747258012, + "learning_rate": 7.694073575017151e-07, + "logits/chosen": -2.3937363624572754, + "logits/rejected": -2.508058547973633, + "logps/chosen": -20.843114852905273, + "logps/rejected": -140.5963134765625, + "loss": 22.1691, + "losses_ref": -1.2568195870699128e-06, + "ref_logps/chosen": -89.77731323242188, + "ref_logps/rejected": -82.35760498046875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 68.93418884277344, + "rewards/margins": 127.17289733886719, + "rewards/rejected": -58.23870849609375, + "step": 1970, + "u": -2.1586735248565674, + "weight": 0.06250002235174179 + }, + { + "diff_generated": -56.48950958251953, + "epoch": 0.6416072585871678, + "grad_norm": 518.2740490404789, + "learning_rate": 7.688262544143854e-07, + "logits/chosen": -2.467092752456665, + "logits/rejected": -2.529083728790283, + "logps/chosen": -22.24001121520996, + "logps/rejected": -140.2478485107422, + "loss": 22.4542, + "losses_ref": -8.965320375864394e-06, + "ref_logps/chosen": -93.88130187988281, + "ref_logps/rejected": -83.75831604003906, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 71.64128112792969, + "rewards/margins": 128.1307830810547, + "rewards/rejected": -56.48950958251953, + "step": 1980, + "u": -2.1298911571502686, + "weight": 0.07500015199184418 + }, + { + "diff_generated": -55.2055778503418, + "epoch": 0.6448476992871031, + "grad_norm": 469.91031715979483, + "learning_rate": 7.682399078117928e-07, + "logits/chosen": -2.4817662239074707, + "logits/rejected": -2.5173556804656982, + "logps/chosen": -21.232852935791016, + "logps/rejected": -142.70101928710938, + "loss": 22.9948, + "losses_ref": -0.0010377921862527728, + "ref_logps/chosen": -96.75160217285156, + "ref_logps/rejected": -87.49544525146484, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 75.51875305175781, + "rewards/margins": 130.72433471679688, + "rewards/rejected": -55.2055778503418, + "step": 1990, + "u": -2.1442325115203857, + "weight": 0.06880507618188858 + }, + { + "diff_generated": -54.693626403808594, + "epoch": 0.6480881399870383, + "grad_norm": 502.7417960438683, + "learning_rate": 7.67648326029888e-07, + "logits/chosen": -2.483794927597046, + "logits/rejected": -2.523793935775757, + "logps/chosen": -24.50412368774414, + "logps/rejected": -141.32821655273438, + "loss": 23.343, + "losses_ref": -0.000803236966021359, + "ref_logps/chosen": -97.73753356933594, + "ref_logps/rejected": -86.63460540771484, + "rewards/accuracies": 0.9375, + "rewards/chosen": 73.23341369628906, + "rewards/margins": 127.92704010009766, + "rewards/rejected": -54.693626403808594, + "step": 2000, + "u": -2.1586384773254395, + "weight": 0.06253884732723236 + }, + { + "diff_generated": -55.728248596191406, + "epoch": 0.6513285806869734, + "grad_norm": 495.9694491130362, + "learning_rate": 7.670515174790485e-07, + "logits/chosen": -2.4611029624938965, + "logits/rejected": -2.5048117637634277, + "logps/chosen": -23.760311126708984, + "logps/rejected": -142.3408203125, + "loss": 23.0292, + "losses_ref": -5.852278377460607e-07, + "ref_logps/chosen": -95.93404388427734, + "ref_logps/rejected": -86.61258697509766, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 72.17372131347656, + "rewards/margins": 127.9019775390625, + "rewards/rejected": -55.728248596191406, + "step": 2010, + "u": -2.216238260269165, + "weight": 0.037500008940696716 + }, + { + "diff_generated": -60.43064498901367, + "epoch": 0.6545690213869086, + "grad_norm": 493.2141019606241, + "learning_rate": 7.664494906439598e-07, + "logits/chosen": -2.462008237838745, + "logits/rejected": -2.5223259925842285, + "logps/chosen": -21.097309112548828, + "logps/rejected": -148.689697265625, + "loss": 21.5231, + "losses_ref": -2.144928146208258e-08, + "ref_logps/chosen": -96.27225494384766, + "ref_logps/rejected": -88.25904846191406, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 75.17494201660156, + "rewards/margins": 135.60560607910156, + "rewards/rejected": -60.43064498901367, + "step": 2020, + "u": -2.2018468379974365, + "weight": 0.04374999925494194 + }, + { + "diff_generated": -56.947662353515625, + "epoch": 0.6578094620868438, + "grad_norm": 447.1663290171413, + "learning_rate": 7.658422540834943e-07, + "logits/chosen": -2.4873642921447754, + "logits/rejected": -2.5300660133361816, + "logps/chosen": -25.486000061035156, + "logps/rejected": -149.83065795898438, + "loss": 23.6932, + "losses_ref": -0.0004086032568011433, + "ref_logps/chosen": -99.67965698242188, + "ref_logps/rejected": -92.88298797607422, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 74.19366455078125, + "rewards/margins": 131.14132690429688, + "rewards/rejected": -56.947662353515625, + "step": 2030, + "u": -2.245002269744873, + "weight": 0.025020133703947067 + }, + { + "diff_generated": -55.35199737548828, + "epoch": 0.661049902786779, + "grad_norm": 469.0213842529389, + "learning_rate": 7.6522981643059e-07, + "logits/chosen": -2.463914155960083, + "logits/rejected": -2.5366599559783936, + "logps/chosen": -24.274606704711914, + "logps/rejected": -139.7829132080078, + "loss": 21.7409, + "losses_ref": -0.012434705160558224, + "ref_logps/chosen": -97.09610748291016, + "ref_logps/rejected": -84.43091583251953, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 72.82149505615234, + "rewards/margins": 128.17349243164062, + "rewards/rejected": -55.35199737548828, + "step": 2040, + "u": -2.2155699729919434, + "weight": 0.03820699453353882 + }, + { + "diff_generated": -59.47046661376953, + "epoch": 0.6642903434867142, + "grad_norm": 503.1826607295146, + "learning_rate": 7.646121863921278e-07, + "logits/chosen": -2.4561409950256348, + "logits/rejected": -2.490854263305664, + "logps/chosen": -24.944446563720703, + "logps/rejected": -144.05796813964844, + "loss": 22.7095, + "losses_ref": -1.2438914382073563e-05, + "ref_logps/chosen": -102.60357666015625, + "ref_logps/rejected": -84.58749389648438, + "rewards/accuracies": 0.96875, + "rewards/chosen": 77.65913391113281, + "rewards/margins": 137.12960815429688, + "rewards/rejected": -59.47046661376953, + "step": 2050, + "u": -2.2306289672851562, + "weight": 0.03125036507844925 + }, + { + "diff_generated": -57.042747497558594, + "epoch": 0.6675307841866494, + "grad_norm": 453.3027299027881, + "learning_rate": 7.639893727488069e-07, + "logits/chosen": -2.423924207687378, + "logits/rejected": -2.5609469413757324, + "logps/chosen": -21.400257110595703, + "logps/rejected": -143.56591796875, + "loss": 21.7158, + "losses_ref": -0.0006009475910104811, + "ref_logps/chosen": -92.40319061279297, + "ref_logps/rejected": -86.52315521240234, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 71.00294494628906, + "rewards/margins": 128.0457000732422, + "rewards/rejected": -57.042747497558594, + "step": 2060, + "u": -2.2162106037139893, + "weight": 0.037530649453401566 + }, + { + "diff_generated": -54.49127197265625, + "epoch": 0.6707712248865846, + "grad_norm": 516.3445862422591, + "learning_rate": 7.633613843550212e-07, + "logits/chosen": -2.4732346534729004, + "logits/rejected": -2.527963161468506, + "logps/chosen": -23.60536766052246, + "logps/rejected": -136.6197509765625, + "loss": 23.2891, + "losses_ref": -0.0018613319844007492, + "ref_logps/chosen": -97.85231018066406, + "ref_logps/rejected": -82.12848663330078, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 74.24693298339844, + "rewards/margins": 128.73818969726562, + "rewards/rejected": -54.49127197265625, + "step": 2070, + "u": -2.173004627227783, + "weight": 0.05631663277745247 + }, + { + "diff_generated": -56.165184020996094, + "epoch": 0.6740116655865198, + "grad_norm": 503.7437146207077, + "learning_rate": 7.627282301387325e-07, + "logits/chosen": -2.381239652633667, + "logits/rejected": -2.4903016090393066, + "logps/chosen": -21.5819091796875, + "logps/rejected": -140.04544067382812, + "loss": 22.1412, + "losses_ref": -5.365263405110454e-06, + "ref_logps/chosen": -88.96678161621094, + "ref_logps/rejected": -83.88023376464844, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 67.38487243652344, + "rewards/margins": 123.550048828125, + "rewards/rejected": -56.165184020996094, + "step": 2080, + "u": -2.144282341003418, + "weight": 0.06875016540288925 + }, + { + "diff_generated": -57.1841926574707, + "epoch": 0.6772521062864549, + "grad_norm": 454.9895346490162, + "learning_rate": 7.620899191013438e-07, + "logits/chosen": -2.4103646278381348, + "logits/rejected": -2.522381544113159, + "logps/chosen": -24.676916122436523, + "logps/rejected": -148.68307495117188, + "loss": 22.9282, + "losses_ref": -0.006165254861116409, + "ref_logps/chosen": -94.65473937988281, + "ref_logps/rejected": -91.49888610839844, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 69.97782897949219, + "rewards/margins": 127.16202545166016, + "rewards/rejected": -57.1841926574707, + "step": 2090, + "u": -2.172731876373291, + "weight": 0.05661041662096977 + }, + { + "diff_generated": -55.309730529785156, + "epoch": 0.6804925469863902, + "grad_norm": 489.7994554032864, + "learning_rate": 7.614464603175717e-07, + "logits/chosen": -2.497722625732422, + "logits/rejected": -2.48801326751709, + "logps/chosen": -23.2744083404541, + "logps/rejected": -137.96255493164062, + "loss": 21.5154, + "losses_ref": -0.0013496755855157971, + "ref_logps/chosen": -100.1006088256836, + "ref_logps/rejected": -82.65283966064453, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 76.8261947631836, + "rewards/margins": 132.1359405517578, + "rewards/rejected": -55.309730529785156, + "step": 2100, + "u": -2.1730010509490967, + "weight": 0.056320447474718094 + }, + { + "diff_generated": -56.59611892700195, + "epoch": 0.6837329876863253, + "grad_norm": 467.7258584343898, + "learning_rate": 7.607978629353167e-07, + "logits/chosen": -2.4443843364715576, + "logits/rejected": -2.535287380218506, + "logps/chosen": -22.963520050048828, + "logps/rejected": -145.03662109375, + "loss": 22.243, + "losses_ref": -0.0002075113879982382, + "ref_logps/chosen": -93.62332916259766, + "ref_logps/rejected": -88.44050598144531, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 70.65980529785156, + "rewards/margins": 127.25592041015625, + "rewards/rejected": -56.59611892700195, + "step": 2110, + "u": -2.1874475479125977, + "weight": 0.05000927299261093 + }, + { + "diff_generated": -59.0588493347168, + "epoch": 0.6869734283862605, + "grad_norm": 506.1015617033496, + "learning_rate": 7.60144136175534e-07, + "logits/chosen": -2.455575942993164, + "logits/rejected": -2.53769588470459, + "logps/chosen": -19.758596420288086, + "logps/rejected": -143.96475219726562, + "loss": 21.7778, + "losses_ref": -0.002314900513738394, + "ref_logps/chosen": -94.31592559814453, + "ref_logps/rejected": -84.90589904785156, + "rewards/accuracies": 0.9375, + "rewards/chosen": 74.55732727050781, + "rewards/margins": 133.61618041992188, + "rewards/rejected": -59.0588493347168, + "step": 2120, + "u": -2.1585612297058105, + "weight": 0.06262405216693878 + }, + { + "diff_generated": -57.1196174621582, + "epoch": 0.6902138690861958, + "grad_norm": 534.2687251405287, + "learning_rate": 7.594852893321015e-07, + "logits/chosen": -2.422362804412842, + "logits/rejected": -2.5526645183563232, + "logps/chosen": -21.735645294189453, + "logps/rejected": -144.26742553710938, + "loss": 22.0867, + "losses_ref": -0.0022257170639932156, + "ref_logps/chosen": -91.74186706542969, + "ref_logps/rejected": -87.14781188964844, + "rewards/accuracies": 0.9375, + "rewards/chosen": 70.00621032714844, + "rewards/margins": 127.1258316040039, + "rewards/rejected": -57.1196174621582, + "step": 2130, + "u": -2.158565044403076, + "weight": 0.06261952221393585 + }, + { + "diff_generated": -54.54099655151367, + "epoch": 0.6934543097861309, + "grad_norm": 457.63891961234555, + "learning_rate": 7.588213317716883e-07, + "logits/chosen": -2.361682415008545, + "logits/rejected": -2.5019543170928955, + "logps/chosen": -19.993026733398438, + "logps/rejected": -136.1216278076172, + "loss": 22.3299, + "losses_ref": -1.1567156121827793e-07, + "ref_logps/chosen": -89.22071838378906, + "ref_logps/rejected": -81.58064270019531, + "rewards/accuracies": 0.9375, + "rewards/chosen": 69.22769165039062, + "rewards/margins": 123.7686767578125, + "rewards/rejected": -54.54099655151367, + "step": 2140, + "u": -2.1586735248565674, + "weight": 0.0625 + }, + { + "diff_generated": -54.2440299987793, + "epoch": 0.6966947504860661, + "grad_norm": 477.00173461765286, + "learning_rate": 7.581522729336214e-07, + "logits/chosen": -2.382094144821167, + "logits/rejected": -2.4460394382476807, + "logps/chosen": -21.491865158081055, + "logps/rejected": -132.05075073242188, + "loss": 21.9708, + "losses_ref": -0.0005560126155614853, + "ref_logps/chosen": -94.92799377441406, + "ref_logps/rejected": -77.8067398071289, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 73.43612670898438, + "rewards/margins": 127.68016052246094, + "rewards/rejected": -54.2440299987793, + "step": 2150, + "u": -2.144258975982666, + "weight": 0.06877604871988297 + }, + { + "diff_generated": -56.922821044921875, + "epoch": 0.6999351911860013, + "grad_norm": 460.42405683815605, + "learning_rate": 7.574781223297513e-07, + "logits/chosen": -2.4442458152770996, + "logits/rejected": -2.4855704307556152, + "logps/chosen": -23.61258316040039, + "logps/rejected": -139.59652709960938, + "loss": 21.2033, + "losses_ref": -0.013810291886329651, + "ref_logps/chosen": -97.78433990478516, + "ref_logps/rejected": -82.6737060546875, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 74.1717529296875, + "rewards/margins": 131.09457397460938, + "rewards/rejected": -56.922821044921875, + "step": 2160, + "u": -2.172224998474121, + "weight": 0.05712286755442619 + }, + { + "diff_generated": -55.76105880737305, + "epoch": 0.7031756318859365, + "grad_norm": 466.2392560237832, + "learning_rate": 7.567988895443173e-07, + "logits/chosen": -2.4225571155548096, + "logits/rejected": -2.4542760848999023, + "logps/chosen": -20.48651885986328, + "logps/rejected": -140.50071716308594, + "loss": 22.1093, + "losses_ref": -0.00023919029626995325, + "ref_logps/chosen": -93.83261108398438, + "ref_logps/rejected": -84.73966217041016, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 73.34608459472656, + "rewards/margins": 129.10714721679688, + "rewards/rejected": -55.76105880737305, + "step": 2170, + "u": -2.129882335662842, + "weight": 0.07500983029603958 + }, + { + "diff_generated": -55.14765548706055, + "epoch": 0.7064160725858717, + "grad_norm": 458.56557339059754, + "learning_rate": 7.561145842338102e-07, + "logits/chosen": -2.4281909465789795, + "logits/rejected": -2.4837698936462402, + "logps/chosen": -22.63579750061035, + "logps/rejected": -139.19200134277344, + "loss": 21.7774, + "losses_ref": -0.0003904419136233628, + "ref_logps/chosen": -94.46659088134766, + "ref_logps/rejected": -84.04434967041016, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 71.8307876586914, + "rewards/margins": 126.97843933105469, + "rewards/rejected": -55.14765548706055, + "step": 2180, + "u": -2.2018356323242188, + "weight": 0.04376264289021492 + }, + { + "diff_generated": -57.23699188232422, + "epoch": 0.7096565132858069, + "grad_norm": 511.57694347026296, + "learning_rate": 7.554252161268365e-07, + "logits/chosen": -2.412655830383301, + "logits/rejected": -2.511845588684082, + "logps/chosen": -23.021530151367188, + "logps/rejected": -145.2180633544922, + "loss": 21.7257, + "losses_ref": -0.0007701918366365135, + "ref_logps/chosen": -94.85514068603516, + "ref_logps/rejected": -87.98106384277344, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 71.8336181640625, + "rewards/margins": 129.0706024169922, + "rewards/rejected": -57.23699188232422, + "step": 2190, + "u": -2.1730284690856934, + "weight": 0.0562903955578804 + }, + { + "diff_generated": -56.55291748046875, + "epoch": 0.712896953985742, + "grad_norm": 490.6510002260033, + "learning_rate": 7.547307950239785e-07, + "logits/chosen": -2.474010944366455, + "logits/rejected": -2.541577100753784, + "logps/chosen": -22.41421890258789, + "logps/rejected": -139.7757110595703, + "loss": 22.3575, + "losses_ref": -0.00034674102789722383, + "ref_logps/chosen": -98.89764404296875, + "ref_logps/rejected": -83.22279357910156, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 76.48342895507812, + "rewards/margins": 133.03634643554688, + "rewards/rejected": -56.55291748046875, + "step": 2200, + "u": -2.201831340789795, + "weight": 0.04376723989844322 + }, + { + "diff_generated": -53.38134002685547, + "epoch": 0.7161373946856773, + "grad_norm": 490.56252903024705, + "learning_rate": 7.540313307976563e-07, + "logits/chosen": -2.436366081237793, + "logits/rejected": -2.5205202102661133, + "logps/chosen": -21.785724639892578, + "logps/rejected": -134.70553588867188, + "loss": 23.25, + "losses_ref": -1.0933801604551263e-05, + "ref_logps/chosen": -91.46631622314453, + "ref_logps/rejected": -81.3241958618164, + "rewards/accuracies": 0.90625, + "rewards/chosen": 69.68058776855469, + "rewards/margins": 123.06193542480469, + "rewards/rejected": -53.38134002685547, + "step": 2210, + "u": -2.0867176055908203, + "weight": 0.09375043213367462 + }, + { + "diff_generated": -55.9515380859375, + "epoch": 0.7193778353856124, + "grad_norm": 428.5675957865356, + "learning_rate": 7.533268333919865e-07, + "logits/chosen": -2.4399867057800293, + "logits/rejected": -2.5394577980041504, + "logps/chosen": -23.00450897216797, + "logps/rejected": -143.8893585205078, + "loss": 21.8257, + "losses_ref": -6.2254525801108684e-06, + "ref_logps/chosen": -96.14192962646484, + "ref_logps/rejected": -87.93781280517578, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 73.13742065429688, + "rewards/margins": 129.08895874023438, + "rewards/rejected": -55.9515380859375, + "step": 2220, + "u": -2.216237783432007, + "weight": 0.03750023618340492 + }, + { + "diff_generated": -57.02019119262695, + "epoch": 0.7226182760855476, + "grad_norm": 465.81723520228275, + "learning_rate": 7.526173128226416e-07, + "logits/chosen": -2.415139675140381, + "logits/rejected": -2.530526638031006, + "logps/chosen": -23.722383499145508, + "logps/rejected": -143.73326110839844, + "loss": 21.2618, + "losses_ref": -4.169198655290529e-05, + "ref_logps/chosen": -93.94865417480469, + "ref_logps/rejected": -86.71307373046875, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 70.22628021240234, + "rewards/margins": 127.24647521972656, + "rewards/rejected": -57.02019119262695, + "step": 2230, + "u": -2.201845407485962, + "weight": 0.0437517985701561 + }, + { + "diff_generated": -56.1454963684082, + "epoch": 0.7258587167854829, + "grad_norm": 456.4544393860723, + "learning_rate": 7.519027791767069e-07, + "logits/chosen": -2.416743040084839, + "logits/rejected": -2.474727153778076, + "logps/chosen": -25.03844451904297, + "logps/rejected": -146.59579467773438, + "loss": 22.6618, + "losses_ref": -2.143383608199656e-05, + "ref_logps/chosen": -98.1845703125, + "ref_logps/rejected": -90.4503173828125, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 73.14612579345703, + "rewards/margins": 129.2916259765625, + "rewards/rejected": -56.1454963684082, + "step": 2240, + "u": -2.2018463611602783, + "weight": 0.043750692158937454 + }, + { + "diff_generated": -53.48323440551758, + "epoch": 0.729099157485418, + "grad_norm": 516.2210753632854, + "learning_rate": 7.511832426125375e-07, + "logits/chosen": -2.4550464153289795, + "logits/rejected": -2.49265193939209, + "logps/chosen": -22.875, + "logps/rejected": -140.51768493652344, + "loss": 22.1306, + "losses_ref": -1.8146038200939074e-05, + "ref_logps/chosen": -96.42729187011719, + "ref_logps/rejected": -87.03443908691406, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 73.55229187011719, + "rewards/margins": 127.03553771972656, + "rewards/rejected": -53.48323440551758, + "step": 2250, + "u": -2.1298909187316895, + "weight": 0.07500042766332626 + }, + { + "diff_generated": -52.945457458496094, + "epoch": 0.7323395981853532, + "grad_norm": 509.54662632563736, + "learning_rate": 7.504587133596141e-07, + "logits/chosen": -2.5048627853393555, + "logits/rejected": -2.5559840202331543, + "logps/chosen": -21.633480072021484, + "logps/rejected": -138.461669921875, + "loss": 21.5234, + "losses_ref": -4.3154597051398014e-07, + "ref_logps/chosen": -93.41227722167969, + "ref_logps/rejected": -85.51620483398438, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 71.77879333496094, + "rewards/margins": 124.72425842285156, + "rewards/rejected": -52.945457458496094, + "step": 2260, + "u": -2.1298911571502686, + "weight": 0.07500001788139343 + }, + { + "diff_generated": -59.143280029296875, + "epoch": 0.7355800388852884, + "grad_norm": 497.5814124852356, + "learning_rate": 7.497292017183965e-07, + "logits/chosen": -2.500357151031494, + "logits/rejected": -2.5774011611938477, + "logps/chosen": -22.00308609008789, + "logps/rejected": -143.08343505859375, + "loss": 22.602, + "losses_ref": -0.0005357967456802726, + "ref_logps/chosen": -97.12187194824219, + "ref_logps/rejected": -83.9401626586914, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 75.11878204345703, + "rewards/margins": 134.26205444335938, + "rewards/rejected": -59.143280029296875, + "step": 2270, + "u": -2.259385108947754, + "weight": 0.01877937652170658 + }, + { + "diff_generated": -57.037635803222656, + "epoch": 0.7388204795852236, + "grad_norm": 476.0990938511596, + "learning_rate": 7.489947180601791e-07, + "logits/chosen": -2.4256510734558105, + "logits/rejected": -2.494957208633423, + "logps/chosen": -21.480037689208984, + "logps/rejected": -143.7701873779297, + "loss": 20.9277, + "losses_ref": -0.0008092170464806259, + "ref_logps/chosen": -92.27450561523438, + "ref_logps/rejected": -86.73255157470703, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 70.79447937011719, + "rewards/margins": 127.83211517333984, + "rewards/rejected": -57.037635803222656, + "step": 2280, + "u": -2.173030376434326, + "weight": 0.05628802627325058 + }, + { + "diff_generated": -58.950225830078125, + "epoch": 0.7420609202851588, + "grad_norm": 507.13162525453004, + "learning_rate": 7.482552728269412e-07, + "logits/chosen": -2.4813027381896973, + "logits/rejected": -2.561781644821167, + "logps/chosen": -22.769901275634766, + "logps/rejected": -143.92344665527344, + "loss": 21.6814, + "losses_ref": -0.0013163576368242502, + "ref_logps/chosen": -96.60799407958984, + "ref_logps/rejected": -84.97322845458984, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 73.83808898925781, + "rewards/margins": 132.78831481933594, + "rewards/rejected": -58.950225830078125, + "step": 2290, + "u": -2.1873927116394043, + "weight": 0.05006963014602661 + }, + { + "diff_generated": -56.21698760986328, + "epoch": 0.7453013609850939, + "grad_norm": 480.2308129690591, + "learning_rate": 7.475108765312001e-07, + "logits/chosen": -2.4525883197784424, + "logits/rejected": -2.490872383117676, + "logps/chosen": -22.308719635009766, + "logps/rejected": -140.81539916992188, + "loss": 22.2788, + "losses_ref": -2.739727165135264e-07, + "ref_logps/chosen": -95.74879455566406, + "ref_logps/rejected": -84.59840393066406, + "rewards/accuracies": 0.9375, + "rewards/chosen": 73.44007110595703, + "rewards/margins": 129.6570587158203, + "rewards/rejected": -56.21698760986328, + "step": 2300, + "u": -2.1586735248565674, + "weight": 0.0625000074505806 + }, + { + "diff_generated": -56.06154251098633, + "epoch": 0.7485418016850292, + "grad_norm": 486.9295523674511, + "learning_rate": 7.467615397558613e-07, + "logits/chosen": -2.464097738265991, + "logits/rejected": -2.570708990097046, + "logps/chosen": -22.407894134521484, + "logps/rejected": -144.00759887695312, + "loss": 22.8778, + "losses_ref": -0.0002869610325433314, + "ref_logps/chosen": -93.29719543457031, + "ref_logps/rejected": -87.9460678100586, + "rewards/accuracies": 0.9375, + "rewards/chosen": 70.88929748535156, + "rewards/margins": 126.95084381103516, + "rewards/rejected": -56.06154251098633, + "step": 2310, + "u": -2.158660888671875, + "weight": 0.06251401454210281 + }, + { + "diff_generated": -55.63084030151367, + "epoch": 0.7517822423849644, + "grad_norm": 479.68801719361085, + "learning_rate": 7.460072731540676e-07, + "logits/chosen": -2.4591574668884277, + "logits/rejected": -2.5600147247314453, + "logps/chosen": -20.084136962890625, + "logps/rejected": -141.91134643554688, + "loss": 21.2137, + "losses_ref": -1.929281461343635e-05, + "ref_logps/chosen": -92.06275177001953, + "ref_logps/rejected": -86.28050231933594, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 71.97862243652344, + "rewards/margins": 127.60945892333984, + "rewards/rejected": -55.63084030151367, + "step": 2320, + "u": -2.187455415725708, + "weight": 0.05000050738453865 + }, + { + "diff_generated": -56.46466064453125, + "epoch": 0.7550226830848995, + "grad_norm": 476.99868631752895, + "learning_rate": 7.452480874490483e-07, + "logits/chosen": -2.4715018272399902, + "logits/rejected": -2.563992738723755, + "logps/chosen": -20.5650691986084, + "logps/rejected": -143.99801635742188, + "loss": 21.5986, + "losses_ref": -0.0008421779493801296, + "ref_logps/chosen": -94.32906341552734, + "ref_logps/rejected": -87.53335571289062, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 73.76399993896484, + "rewards/margins": 130.22866821289062, + "rewards/rejected": -56.46466064453125, + "step": 2330, + "u": -2.173027753829956, + "weight": 0.05629073455929756 + }, + { + "diff_generated": -58.54656219482422, + "epoch": 0.7582631237848347, + "grad_norm": 477.7399059877632, + "learning_rate": 7.44483993433966e-07, + "logits/chosen": -2.4514431953430176, + "logits/rejected": -2.5479302406311035, + "logps/chosen": -18.108726501464844, + "logps/rejected": -143.81797790527344, + "loss": 21.7014, + "losses_ref": -0.0013811999233439565, + "ref_logps/chosen": -89.15837097167969, + "ref_logps/rejected": -85.27140808105469, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 71.04964447021484, + "rewards/margins": 129.59622192382812, + "rewards/rejected": -58.54656219482422, + "step": 2340, + "u": -2.1729941368103027, + "weight": 0.05632782727479935 + }, + { + "diff_generated": -56.10137939453125, + "epoch": 0.76150356448477, + "grad_norm": 511.48486644109664, + "learning_rate": 7.437150019717641e-07, + "logits/chosen": -2.430873394012451, + "logits/rejected": -2.5499558448791504, + "logps/chosen": -19.955095291137695, + "logps/rejected": -138.7136993408203, + "loss": 22.672, + "losses_ref": -6.416399992303923e-05, + "ref_logps/chosen": -91.30572509765625, + "ref_logps/rejected": -82.6123046875, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 71.35063171386719, + "rewards/margins": 127.4520034790039, + "rewards/rejected": -56.10137939453125, + "step": 2350, + "u": -2.129889965057373, + "weight": 0.07500138133764267 + }, + { + "diff_generated": -57.626991271972656, + "epoch": 0.7647440051847051, + "grad_norm": 481.0871517222036, + "learning_rate": 7.429411239950116e-07, + "logits/chosen": -2.485440254211426, + "logits/rejected": -2.6047866344451904, + "logps/chosen": -23.08542251586914, + "logps/rejected": -150.21994018554688, + "loss": 22.1324, + "losses_ref": -0.0004304622416384518, + "ref_logps/chosen": -95.6878662109375, + "ref_logps/rejected": -92.59294128417969, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 72.6024398803711, + "rewards/margins": 130.2294158935547, + "rewards/rejected": -57.626991271972656, + "step": 2360, + "u": -2.1874358654022217, + "weight": 0.05002208799123764 + }, + { + "diff_generated": -55.8709716796875, + "epoch": 0.7679844458846403, + "grad_norm": 457.02298028837765, + "learning_rate": 7.421623705057477e-07, + "logits/chosen": -2.509002208709717, + "logits/rejected": -2.521097183227539, + "logps/chosen": -18.914812088012695, + "logps/rejected": -140.98692321777344, + "loss": 21.34, + "losses_ref": -0.0011177074629813433, + "ref_logps/chosen": -94.02415466308594, + "ref_logps/rejected": -85.115966796875, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 75.10933685302734, + "rewards/margins": 130.98031616210938, + "rewards/rejected": -55.8709716796875, + "step": 2370, + "u": -2.1298372745513916, + "weight": 0.07505981624126434 + }, + { + "diff_generated": -53.20804977416992, + "epoch": 0.7712248865845756, + "grad_norm": 503.14428502568614, + "learning_rate": 7.413787525753261e-07, + "logits/chosen": -2.441809892654419, + "logits/rejected": -2.5438599586486816, + "logps/chosen": -21.622966766357422, + "logps/rejected": -135.86770629882812, + "loss": 22.7855, + "losses_ref": -0.0025614311452955008, + "ref_logps/chosen": -90.37882232666016, + "ref_logps/rejected": -82.65966796875, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 68.75585174560547, + "rewards/margins": 121.96390533447266, + "rewards/rejected": -53.20804977416992, + "step": 2380, + "u": -2.1010196208953857, + "weight": 0.08759871870279312 + }, + { + "diff_generated": -54.82086181640625, + "epoch": 0.7744653272845107, + "grad_norm": 459.212222917981, + "learning_rate": 7.405902813442564e-07, + "logits/chosen": -2.4724698066711426, + "logits/rejected": -2.5220937728881836, + "logps/chosen": -19.182483673095703, + "logps/rejected": -139.30343627929688, + "loss": 20.8541, + "losses_ref": -0.002965776016935706, + "ref_logps/chosen": -94.22468566894531, + "ref_logps/rejected": -84.48258209228516, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 75.04219055175781, + "rewards/margins": 129.86306762695312, + "rewards/rejected": -54.82086181640625, + "step": 2390, + "u": -2.2017018795013428, + "weight": 0.043910298496484756 + }, + { + "diff_generated": -57.6229133605957, + "epoch": 0.7777057679844459, + "grad_norm": 435.42049220403754, + "learning_rate": 7.39796968022047e-07, + "logits/chosen": -2.416891574859619, + "logits/rejected": -2.5111327171325684, + "logps/chosen": -19.746204376220703, + "logps/rejected": -140.6691131591797, + "loss": 20.8716, + "losses_ref": -0.004521545954048634, + "ref_logps/chosen": -90.78548431396484, + "ref_logps/rejected": -83.04621124267578, + "rewards/accuracies": 0.96875, + "rewards/chosen": 71.03929138183594, + "rewards/margins": 128.6621856689453, + "rewards/rejected": -57.6229133605957, + "step": 2400, + "u": -2.230410099029541, + "weight": 0.031490933150053024 + }, + { + "diff_generated": -57.4922981262207, + "epoch": 0.780946208684381, + "grad_norm": 481.804369528505, + "learning_rate": 7.389988238870451e-07, + "logits/chosen": -2.460920810699463, + "logits/rejected": -2.4654626846313477, + "logps/chosen": -23.994354248046875, + "logps/rejected": -145.25741577148438, + "loss": 21.7064, + "losses_ref": -0.0070587992668151855, + "ref_logps/chosen": -102.72373962402344, + "ref_logps/rejected": -87.76512145996094, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 78.72938537597656, + "rewards/margins": 136.22169494628906, + "rewards/rejected": -57.4922981262207, + "step": 2410, + "u": -2.215846300125122, + "weight": 0.03792215883731842 + }, + { + "diff_generated": -59.77937698364258, + "epoch": 0.7841866493843163, + "grad_norm": 490.4641157632023, + "learning_rate": 7.381958602862763e-07, + "logits/chosen": -2.464838743209839, + "logits/rejected": -2.5294761657714844, + "logps/chosen": -22.61722183227539, + "logps/rejected": -148.07510375976562, + "loss": 21.6444, + "losses_ref": -0.001273915870115161, + "ref_logps/chosen": -98.81327056884766, + "ref_logps/rejected": -88.29571533203125, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 76.19606018066406, + "rewards/margins": 135.97543334960938, + "rewards/rejected": -59.77937698364258, + "step": 2420, + "u": -2.216182231903076, + "weight": 0.03756193816661835 + }, + { + "diff_generated": -59.4302978515625, + "epoch": 0.7874270900842515, + "grad_norm": 516.6666797651599, + "learning_rate": 7.373880886352832e-07, + "logits/chosen": -2.513746500015259, + "logits/rejected": -2.561347484588623, + "logps/chosen": -24.95885467529297, + "logps/rejected": -146.04820251464844, + "loss": 21.7484, + "losses_ref": -2.387703261774732e-06, + "ref_logps/chosen": -101.25908660888672, + "ref_logps/rejected": -86.61790466308594, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 76.30021667480469, + "rewards/margins": 135.7305145263672, + "rewards/rejected": -59.4302978515625, + "step": 2430, + "u": -2.216238021850586, + "weight": 0.03750010207295418 + }, + { + "diff_generated": -60.237892150878906, + "epoch": 0.7906675307841866, + "grad_norm": 481.8729871679236, + "learning_rate": 7.365755204179637e-07, + "logits/chosen": -2.3747448921203613, + "logits/rejected": -2.5454201698303223, + "logps/chosen": -21.511890411376953, + "logps/rejected": -148.35452270507812, + "loss": 21.8352, + "losses_ref": -0.00043111745617352426, + "ref_logps/chosen": -91.5963134765625, + "ref_logps/rejected": -88.11661529541016, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 70.08443450927734, + "rewards/margins": 130.3223114013672, + "rewards/rejected": -60.237892150878906, + "step": 2440, + "u": -2.201833724975586, + "weight": 0.04376457259058952 + }, + { + "diff_generated": -57.28288650512695, + "epoch": 0.7939079714841218, + "grad_norm": 488.4952904683894, + "learning_rate": 7.357581671864073e-07, + "logits/chosen": -2.4144272804260254, + "logits/rejected": -2.5472493171691895, + "logps/chosen": -21.880416870117188, + "logps/rejected": -145.05154418945312, + "loss": 22.3205, + "losses_ref": -0.001727291732095182, + "ref_logps/chosen": -95.05560302734375, + "ref_logps/rejected": -87.76866149902344, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 73.17517852783203, + "rewards/margins": 130.45806884765625, + "rewards/rejected": -57.28288650512695, + "step": 2450, + "u": -2.1730313301086426, + "weight": 0.05628693103790283 + }, + { + "diff_generated": -58.206886291503906, + "epoch": 0.7971484121840571, + "grad_norm": 439.44428631457066, + "learning_rate": 7.349360405607303e-07, + "logits/chosen": -2.394195079803467, + "logits/rejected": -2.5290329456329346, + "logps/chosen": -18.200389862060547, + "logps/rejected": -139.7240753173828, + "loss": 21.1708, + "losses_ref": -1.836994329096342e-06, + "ref_logps/chosen": -87.65174865722656, + "ref_logps/rejected": -81.51720428466797, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 69.45135498046875, + "rewards/margins": 127.6582260131836, + "rewards/rejected": -58.206886291503906, + "step": 2460, + "u": -2.101109027862549, + "weight": 0.08750007301568985 + }, + { + "diff_generated": -55.12480545043945, + "epoch": 0.8003888528839922, + "grad_norm": 435.480304925401, + "learning_rate": 7.341091522289122e-07, + "logits/chosen": -2.5137763023376465, + "logits/rejected": -2.5602359771728516, + "logps/chosen": -20.748157501220703, + "logps/rejected": -138.9665985107422, + "loss": 21.1365, + "losses_ref": -0.001150214346125722, + "ref_logps/chosen": -95.16680908203125, + "ref_logps/rejected": -83.841796875, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 74.41864776611328, + "rewards/margins": 129.54345703125, + "rewards/rejected": -55.12480545043945, + "step": 2470, + "u": -2.2017948627471924, + "weight": 0.043808113783597946 + }, + { + "diff_generated": -59.74702072143555, + "epoch": 0.8036292935839274, + "grad_norm": 455.6827583126531, + "learning_rate": 7.332775139466278e-07, + "logits/chosen": -2.542436122894287, + "logits/rejected": -2.6427390575408936, + "logps/chosen": -20.466482162475586, + "logps/rejected": -150.67933654785156, + "loss": 21.9027, + "losses_ref": -0.001079038018360734, + "ref_logps/chosen": -96.73770904541016, + "ref_logps/rejected": -90.93230438232422, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 76.27122497558594, + "rewards/margins": 136.01824951171875, + "rewards/rejected": -59.74702072143555, + "step": 2480, + "u": -2.2593626976013184, + "weight": 0.018804144114255905 + }, + { + "diff_generated": -55.41155242919922, + "epoch": 0.8068697342838627, + "grad_norm": 458.2025692080539, + "learning_rate": 7.324411375370809e-07, + "logits/chosen": -2.4522483348846436, + "logits/rejected": -2.5456674098968506, + "logps/chosen": -21.490245819091797, + "logps/rejected": -140.60546875, + "loss": 22.3088, + "losses_ref": -2.5578192435204983e-05, + "ref_logps/chosen": -91.77385711669922, + "ref_logps/rejected": -85.19390869140625, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 70.28361511230469, + "rewards/margins": 125.6951675415039, + "rewards/rejected": -55.41155242919922, + "step": 2490, + "u": -2.1298909187316895, + "weight": 0.07500037550926208 + }, + { + "diff_generated": -56.444732666015625, + "epoch": 0.8101101749837978, + "grad_norm": 493.372644152006, + "learning_rate": 7.316000348908365e-07, + "logits/chosen": -2.485949993133545, + "logits/rejected": -2.576566457748413, + "logps/chosen": -22.575986862182617, + "logps/rejected": -140.38726806640625, + "loss": 21.7191, + "losses_ref": -0.0002884681161958724, + "ref_logps/chosen": -92.9170150756836, + "ref_logps/rejected": -83.9425277709961, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 70.34103393554688, + "rewards/margins": 126.78575134277344, + "rewards/rejected": -56.444732666015625, + "step": 2500, + "u": -2.129878520965576, + "weight": 0.0750143826007843 + }, + { + "diff_generated": -57.7220573425293, + "epoch": 0.813350615683733, + "grad_norm": 507.80135367519387, + "learning_rate": 7.307542179656511e-07, + "logits/chosen": -2.4682345390319824, + "logits/rejected": -2.5501224994659424, + "logps/chosen": -21.272693634033203, + "logps/rejected": -144.73831176757812, + "loss": 21.4688, + "losses_ref": -0.0015492306556552649, + "ref_logps/chosen": -94.3641357421875, + "ref_logps/rejected": -87.01626586914062, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 73.09144592285156, + "rewards/margins": 130.81350708007812, + "rewards/rejected": -57.7220573425293, + "step": 2510, + "u": -2.129817485809326, + "weight": 0.0750814825296402 + }, + { + "diff_generated": -54.088233947753906, + "epoch": 0.8165910563836681, + "grad_norm": 478.4146285598937, + "learning_rate": 7.29903698786303e-07, + "logits/chosen": -2.4700889587402344, + "logits/rejected": -2.4862189292907715, + "logps/chosen": -22.107303619384766, + "logps/rejected": -133.4788055419922, + "loss": 21.8233, + "losses_ref": -0.0017220573499798775, + "ref_logps/chosen": -95.75920104980469, + "ref_logps/rejected": -79.39057159423828, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 73.65189361572266, + "rewards/margins": 127.74012756347656, + "rewards/rejected": -54.088233947753906, + "step": 2520, + "u": -2.1873762607574463, + "weight": 0.05008823797106743 + }, + { + "diff_generated": -53.54823684692383, + "epoch": 0.8198314970836034, + "grad_norm": 476.36245565519215, + "learning_rate": 7.290484894444214e-07, + "logits/chosen": -2.410266160964966, + "logits/rejected": -2.5156915187835693, + "logps/chosen": -19.08709716796875, + "logps/rejected": -136.63551330566406, + "loss": 20.7257, + "losses_ref": -9.177963875117712e-06, + "ref_logps/chosen": -86.3339614868164, + "ref_logps/rejected": -83.0872802734375, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 67.24686431884766, + "rewards/margins": 120.79510498046875, + "rewards/rejected": -53.54823684692383, + "step": 2530, + "u": -2.043544054031372, + "weight": 0.11250035464763641 + }, + { + "diff_generated": -55.499168395996094, + "epoch": 0.8230719377835386, + "grad_norm": 506.77751262247625, + "learning_rate": 7.281886020983144e-07, + "logits/chosen": -2.460317850112915, + "logits/rejected": -2.4854583740234375, + "logps/chosen": -23.15581703186035, + "logps/rejected": -135.55416870117188, + "loss": 20.8908, + "losses_ref": -0.00029835925670340657, + "ref_logps/chosen": -97.4378433227539, + "ref_logps/rejected": -80.05500793457031, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 74.28202056884766, + "rewards/margins": 129.7811737060547, + "rewards/rejected": -55.499168395996094, + "step": 2540, + "u": -2.216226577758789, + "weight": 0.037513017654418945 + }, + { + "diff_generated": -55.99641036987305, + "epoch": 0.8263123784834737, + "grad_norm": 513.340136041323, + "learning_rate": 7.273240489727963e-07, + "logits/chosen": -2.4329352378845215, + "logits/rejected": -2.4697928428649902, + "logps/chosen": -22.266624450683594, + "logps/rejected": -135.0956268310547, + "loss": 21.4897, + "losses_ref": -0.0011090862099081278, + "ref_logps/chosen": -95.71326446533203, + "ref_logps/rejected": -79.0992202758789, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 73.44662475585938, + "rewards/margins": 129.4430389404297, + "rewards/rejected": -55.99641036987305, + "step": 2550, + "u": -2.144231081008911, + "weight": 0.0688069611787796 + }, + { + "diff_generated": -56.29785919189453, + "epoch": 0.829552819183409, + "grad_norm": 454.9273628876045, + "learning_rate": 7.264548423590133e-07, + "logits/chosen": -2.4539313316345215, + "logits/rejected": -2.537295341491699, + "logps/chosen": -21.008182525634766, + "logps/rejected": -139.8561248779297, + "loss": 21.3602, + "losses_ref": -4.210594488540664e-05, + "ref_logps/chosen": -93.7306900024414, + "ref_logps/rejected": -83.55828857421875, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 72.7225112915039, + "rewards/margins": 129.02035522460938, + "rewards/rejected": -56.29785919189453, + "step": 2560, + "u": -2.1730635166168213, + "weight": 0.056251466274261475 + }, + { + "diff_generated": -53.686622619628906, + "epoch": 0.8327932598833442, + "grad_norm": 489.15136824729063, + "learning_rate": 7.255809946142695e-07, + "logits/chosen": -2.439706325531006, + "logits/rejected": -2.485032320022583, + "logps/chosen": -22.65837287902832, + "logps/rejected": -138.0975799560547, + "loss": 22.6904, + "losses_ref": -0.00020490979659371078, + "ref_logps/chosen": -95.66709899902344, + "ref_logps/rejected": -84.41097259521484, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 73.00873565673828, + "rewards/margins": 126.6953353881836, + "rewards/rejected": -53.686622619628906, + "step": 2570, + "u": -2.173058271408081, + "weight": 0.056257009506225586 + }, + { + "diff_generated": -58.009979248046875, + "epoch": 0.8360337005832793, + "grad_norm": 490.3665858873195, + "learning_rate": 7.247025181618508e-07, + "logits/chosen": -2.4753196239471436, + "logits/rejected": -2.531113862991333, + "logps/chosen": -22.63921356201172, + "logps/rejected": -146.94308471679688, + "loss": 21.4989, + "losses_ref": -7.747672498226166e-05, + "ref_logps/chosen": -96.47139739990234, + "ref_logps/rejected": -88.93311309814453, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 73.83218383789062, + "rewards/margins": 131.8421630859375, + "rewards/rejected": -58.009979248046875, + "step": 2580, + "u": -2.2162346839904785, + "weight": 0.0375036746263504 + }, + { + "diff_generated": -56.34900665283203, + "epoch": 0.8392741412832145, + "grad_norm": 515.5358271111543, + "learning_rate": 7.238194254908483e-07, + "logits/chosen": -2.4178130626678467, + "logits/rejected": -2.4952597618103027, + "logps/chosen": -21.742961883544922, + "logps/rejected": -139.27444458007812, + "loss": 23.3113, + "losses_ref": -1.4187762644723989e-05, + "ref_logps/chosen": -93.23844909667969, + "ref_logps/rejected": -82.92540740966797, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 71.49549102783203, + "rewards/margins": 127.84449768066406, + "rewards/rejected": -56.34900665283203, + "step": 2590, + "u": -2.1730639934539795, + "weight": 0.056250639259815216 + }, + { + "diff_generated": -56.2597541809082, + "epoch": 0.8425145819831497, + "grad_norm": 433.46724920592413, + "learning_rate": 7.229317291559807e-07, + "logits/chosen": -2.451038360595703, + "logits/rejected": -2.5517618656158447, + "logps/chosen": -22.46918487548828, + "logps/rejected": -142.5192108154297, + "loss": 21.5436, + "losses_ref": -0.026391511783003807, + "ref_logps/chosen": -93.1564712524414, + "ref_logps/rejected": -86.25945281982422, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 70.68728637695312, + "rewards/margins": 126.94703674316406, + "rewards/rejected": -56.2597541809082, + "step": 2600, + "u": -2.2014243602752686, + "weight": 0.044207897037267685 + }, + { + "diff_generated": -55.4771842956543, + "epoch": 0.8457550226830849, + "grad_norm": 483.12958162627, + "learning_rate": 7.22039441777416e-07, + "logits/chosen": -2.425280809402466, + "logits/rejected": -2.537978410720825, + "logps/chosen": -20.165245056152344, + "logps/rejected": -140.473388671875, + "loss": 22.1426, + "losses_ref": -0.012793747708201408, + "ref_logps/chosen": -91.35377502441406, + "ref_logps/rejected": -84.99620056152344, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 71.18852233886719, + "rewards/margins": 126.66572570800781, + "rewards/rejected": -55.4771842956543, + "step": 2610, + "u": -2.143904447555542, + "weight": 0.0691661387681961 + }, + { + "diff_generated": -58.96485137939453, + "epoch": 0.8489954633830201, + "grad_norm": 462.2183901664622, + "learning_rate": 7.21142576040592e-07, + "logits/chosen": -2.4881858825683594, + "logits/rejected": -2.5900299549102783, + "logps/chosen": -23.751340866088867, + "logps/rejected": -145.04563903808594, + "loss": 21.3726, + "losses_ref": -7.155739467634703e-07, + "ref_logps/chosen": -99.04013061523438, + "ref_logps/rejected": -86.08078002929688, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 75.28878784179688, + "rewards/margins": 134.25363159179688, + "rewards/rejected": -58.96485137939453, + "step": 2620, + "u": -2.216238260269165, + "weight": 0.03750001639127731 + }, + { + "diff_generated": -61.1406364440918, + "epoch": 0.8522359040829552, + "grad_norm": 464.8238044329431, + "learning_rate": 7.202411446960357e-07, + "logits/chosen": -2.4685490131378174, + "logits/rejected": -2.5211329460144043, + "logps/chosen": -23.971097946166992, + "logps/rejected": -151.80685424804688, + "loss": 21.7248, + "losses_ref": -0.0010434570722281933, + "ref_logps/chosen": -99.79468536376953, + "ref_logps/rejected": -90.6662368774414, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 75.82359313964844, + "rewards/margins": 136.96421813964844, + "rewards/rejected": -61.1406364440918, + "step": 2630, + "u": -2.259364604949951, + "weight": 0.018802126869559288 + }, + { + "diff_generated": -60.53166961669922, + "epoch": 0.8554763447828905, + "grad_norm": 458.67165860521243, + "learning_rate": 7.193351605591825e-07, + "logits/chosen": -2.470578193664551, + "logits/rejected": -2.582828998565674, + "logps/chosen": -20.103727340698242, + "logps/rejected": -145.3340606689453, + "loss": 20.5432, + "losses_ref": -0.0011830010917037725, + "ref_logps/chosen": -90.42179870605469, + "ref_logps/rejected": -84.8023681640625, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 70.31806945800781, + "rewards/margins": 130.84974670410156, + "rewards/rejected": -60.53166961669922, + "step": 2640, + "u": -2.1730074882507324, + "weight": 0.05631326511502266 + }, + { + "diff_generated": -58.205894470214844, + "epoch": 0.8587167854828257, + "grad_norm": 495.1440294686424, + "learning_rate": 7.184246365101939e-07, + "logits/chosen": -2.5101726055145264, + "logits/rejected": -2.5068435668945312, + "logps/chosen": -23.52204704284668, + "logps/rejected": -146.87620544433594, + "loss": 22.7303, + "losses_ref": -6.494811714219395e-06, + "ref_logps/chosen": -101.6772689819336, + "ref_logps/rejected": -88.6703109741211, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 78.15522766113281, + "rewards/margins": 136.36111450195312, + "rewards/rejected": -58.205894470214844, + "step": 2650, + "u": -2.216237783432007, + "weight": 0.037500280886888504 + }, + { + "diff_generated": -61.2960205078125, + "epoch": 0.8619572261827608, + "grad_norm": 467.0051884894631, + "learning_rate": 7.175095854937739e-07, + "logits/chosen": -2.463174819946289, + "logits/rejected": -2.5186073780059814, + "logps/chosen": -21.171960830688477, + "logps/rejected": -148.35946655273438, + "loss": 22.1944, + "losses_ref": -1.2179619091057248e-07, + "ref_logps/chosen": -98.50288391113281, + "ref_logps/rejected": -87.06343841552734, + "rewards/accuracies": 0.96875, + "rewards/chosen": 77.33091735839844, + "rewards/margins": 138.62693786621094, + "rewards/rejected": -61.2960205078125, + "step": 2660, + "u": -2.2306292057037354, + "weight": 0.03125 + }, + { + "diff_generated": -57.64461135864258, + "epoch": 0.8651976668826961, + "grad_norm": 474.22303186662896, + "learning_rate": 7.165900205189853e-07, + "logits/chosen": -2.464472532272339, + "logits/rejected": -2.574596643447876, + "logps/chosen": -20.4440860748291, + "logps/rejected": -145.9555206298828, + "loss": 20.8537, + "losses_ref": -3.338614718018107e-08, + "ref_logps/chosen": -92.37683868408203, + "ref_logps/rejected": -88.3109130859375, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 71.93275451660156, + "rewards/margins": 129.57736206054688, + "rewards/rejected": -57.64461135864258, + "step": 2670, + "u": -2.173064708709717, + "weight": 0.05624999850988388 + }, + { + "diff_generated": -59.3572998046875, + "epoch": 0.8684381075826313, + "grad_norm": 441.31149418382375, + "learning_rate": 7.156659546590653e-07, + "logits/chosen": -2.404878616333008, + "logits/rejected": -2.4967124462127686, + "logps/chosen": -19.269577026367188, + "logps/rejected": -144.28993225097656, + "loss": 21.3004, + "losses_ref": -6.453227001657069e-07, + "ref_logps/chosen": -93.02782440185547, + "ref_logps/rejected": -84.93263244628906, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 73.75823974609375, + "rewards/margins": 133.1155548095703, + "rewards/rejected": -59.3572998046875, + "step": 2680, + "u": -2.2018470764160156, + "weight": 0.043750010430812836 + }, + { + "diff_generated": -55.28821563720703, + "epoch": 0.8716785482825664, + "grad_norm": 463.8652831186927, + "learning_rate": 7.147374010512385e-07, + "logits/chosen": -2.3714890480041504, + "logits/rejected": -2.4094130992889404, + "logps/chosen": -19.746137619018555, + "logps/rejected": -135.70211791992188, + "loss": 20.9851, + "losses_ref": -0.0011874515330418944, + "ref_logps/chosen": -89.21805572509766, + "ref_logps/rejected": -80.41390228271484, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 69.47191619873047, + "rewards/margins": 124.7601318359375, + "rewards/rejected": -55.28821563720703, + "step": 2690, + "u": -2.1010565757751465, + "weight": 0.08755816519260406 + }, + { + "diff_generated": -58.68421173095703, + "epoch": 0.8749189889825016, + "grad_norm": 461.6775648644702, + "learning_rate": 7.13804372896531e-07, + "logits/chosen": -2.3763599395751953, + "logits/rejected": -2.5035808086395264, + "logps/chosen": -20.23203468322754, + "logps/rejected": -143.46829223632812, + "loss": 21.6863, + "losses_ref": -0.0015272090677171946, + "ref_logps/chosen": -90.75230407714844, + "ref_logps/rejected": -84.78406524658203, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 70.52027893066406, + "rewards/margins": 129.20448303222656, + "rewards/rejected": -58.68421173095703, + "step": 2700, + "u": -2.172990322113037, + "weight": 0.05633222311735153 + }, + { + "diff_generated": -58.122169494628906, + "epoch": 0.8781594296824368, + "grad_norm": 453.822172883697, + "learning_rate": 7.128668834595827e-07, + "logits/chosen": -2.4922752380371094, + "logits/rejected": -2.5195541381835938, + "logps/chosen": -23.63937759399414, + "logps/rejected": -143.8534393310547, + "loss": 20.6594, + "losses_ref": -0.004180192481726408, + "ref_logps/chosen": -97.52043151855469, + "ref_logps/rejected": -85.73128509521484, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 73.88105773925781, + "rewards/margins": 132.00320434570312, + "rewards/rejected": -58.122169494628906, + "step": 2710, + "u": -2.172847270965576, + "weight": 0.05648912116885185 + }, + { + "diff_generated": -56.95354461669922, + "epoch": 0.881399870382372, + "grad_norm": 492.4355541276597, + "learning_rate": 7.119249460684583e-07, + "logits/chosen": -2.406707286834717, + "logits/rejected": -2.4368181228637695, + "logps/chosen": -23.046157836914062, + "logps/rejected": -138.7194366455078, + "loss": 22.6015, + "losses_ref": -8.472305125906132e-06, + "ref_logps/chosen": -96.1999740600586, + "ref_logps/rejected": -81.76588439941406, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 73.15381622314453, + "rewards/margins": 130.10736083984375, + "rewards/rejected": -56.95354461669922, + "step": 2720, + "u": -2.216238021850586, + "weight": 0.03750025853514671 + }, + { + "diff_generated": -57.65205001831055, + "epoch": 0.8846403110823072, + "grad_norm": 439.5878754716779, + "learning_rate": 7.109785741144577e-07, + "logits/chosen": -2.37678861618042, + "logits/rejected": -2.513676881790161, + "logps/chosen": -22.001630783081055, + "logps/rejected": -144.2527313232422, + "loss": 21.2921, + "losses_ref": -7.83679115556879e-06, + "ref_logps/chosen": -90.35001373291016, + "ref_logps/rejected": -86.60069274902344, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 68.34837341308594, + "rewards/margins": 126.00044250488281, + "rewards/rejected": -57.65205001831055, + "step": 2730, + "u": -2.2018468379974365, + "weight": 0.04375031590461731 + }, + { + "diff_generated": -58.04121017456055, + "epoch": 0.8878807517822424, + "grad_norm": 434.4572641426545, + "learning_rate": 7.100277810519264e-07, + "logits/chosen": -2.459519863128662, + "logits/rejected": -2.5082592964172363, + "logps/chosen": -21.73441505432129, + "logps/rejected": -146.6686248779297, + "loss": 21.1753, + "losses_ref": -2.30889941121859e-06, + "ref_logps/chosen": -97.97511291503906, + "ref_logps/rejected": -88.6274185180664, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 76.2406997680664, + "rewards/margins": 134.28189086914062, + "rewards/rejected": -58.04121017456055, + "step": 2740, + "u": -2.2018468379974365, + "weight": 0.04375004023313522 + }, + { + "diff_generated": -55.9808235168457, + "epoch": 0.8911211924821776, + "grad_norm": 437.56983692274633, + "learning_rate": 7.090725803980633e-07, + "logits/chosen": -2.398206949234009, + "logits/rejected": -2.5035288333892822, + "logps/chosen": -20.635684967041016, + "logps/rejected": -139.47872924804688, + "loss": 21.8344, + "losses_ref": -2.7282279916107655e-05, + "ref_logps/chosen": -90.36199951171875, + "ref_logps/rejected": -83.49790954589844, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 69.726318359375, + "rewards/margins": 125.70713806152344, + "rewards/rejected": -55.9808235168457, + "step": 2750, + "u": -2.1442816257476807, + "weight": 0.06875090301036835 + }, + { + "diff_generated": -58.962196350097656, + "epoch": 0.8943616331821128, + "grad_norm": 453.82844426997264, + "learning_rate": 7.081129857327297e-07, + "logits/chosen": -2.4433138370513916, + "logits/rejected": -2.53584623336792, + "logps/chosen": -20.66114616394043, + "logps/rejected": -144.21824645996094, + "loss": 22.4838, + "losses_ref": -0.00010804003977682441, + "ref_logps/chosen": -96.87528991699219, + "ref_logps/rejected": -85.25605010986328, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 76.21414184570312, + "rewards/margins": 135.17633056640625, + "rewards/rejected": -58.962196350097656, + "step": 2760, + "u": -2.1874513626098633, + "weight": 0.05000500753521919 + }, + { + "diff_generated": -56.82305908203125, + "epoch": 0.8976020738820479, + "grad_norm": 492.44022233311415, + "learning_rate": 7.071490106982547e-07, + "logits/chosen": -2.4296534061431885, + "logits/rejected": -2.48490571975708, + "logps/chosen": -22.398880004882812, + "logps/rejected": -142.8795928955078, + "loss": 22.0816, + "losses_ref": -5.544167152038426e-07, + "ref_logps/chosen": -96.34947967529297, + "ref_logps/rejected": -86.05652618408203, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 73.95059967041016, + "rewards/margins": 130.77365112304688, + "rewards/rejected": -56.82305908203125, + "step": 2770, + "u": -2.1298911571502686, + "weight": 0.07500001043081284 + }, + { + "diff_generated": -60.1945915222168, + "epoch": 0.9008425145819832, + "grad_norm": 501.50198429090375, + "learning_rate": 7.061806689992424e-07, + "logits/chosen": -2.401369333267212, + "logits/rejected": -2.4693374633789062, + "logps/chosen": -21.959651947021484, + "logps/rejected": -145.50927734375, + "loss": 20.9386, + "losses_ref": -1.3287276487972122e-05, + "ref_logps/chosen": -94.9290771484375, + "ref_logps/rejected": -85.31468200683594, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 72.96942138671875, + "rewards/margins": 133.16403198242188, + "rewards/rejected": -60.1945915222168, + "step": 2780, + "u": -2.1730642318725586, + "weight": 0.056250639259815216 + }, + { + "diff_generated": -61.94187545776367, + "epoch": 0.9040829552819183, + "grad_norm": 474.9848841951121, + "learning_rate": 7.052079744023769e-07, + "logits/chosen": -2.554199695587158, + "logits/rejected": -2.5834202766418457, + "logps/chosen": -25.187421798706055, + "logps/rejected": -149.35482788085938, + "loss": 21.6333, + "losses_ref": -6.639597268076614e-05, + "ref_logps/chosen": -104.10478210449219, + "ref_logps/rejected": -87.41294860839844, + "rewards/accuracies": 0.96875, + "rewards/chosen": 78.9173583984375, + "rewards/margins": 140.85923767089844, + "rewards/rejected": -61.94187545776367, + "step": 2790, + "u": -2.2306275367736816, + "weight": 0.03125188127160072 + }, + { + "diff_generated": -59.27500534057617, + "epoch": 0.9073233959818535, + "grad_norm": 479.4450117078786, + "learning_rate": 7.042309407362264e-07, + "logits/chosen": -2.416618824005127, + "logits/rejected": -2.5326590538024902, + "logps/chosen": -19.899805068969727, + "logps/rejected": -146.05563354492188, + "loss": 21.8238, + "losses_ref": -0.0020984322763979435, + "ref_logps/chosen": -95.49771118164062, + "ref_logps/rejected": -86.78062438964844, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 75.59790802001953, + "rewards/margins": 134.87290954589844, + "rewards/rejected": -59.27500534057617, + "step": 2800, + "u": -2.2017922401428223, + "weight": 0.04381098598241806 + }, + { + "diff_generated": -58.58484649658203, + "epoch": 0.9105638366817888, + "grad_norm": 464.62151462477925, + "learning_rate": 7.032495818910462e-07, + "logits/chosen": -2.4766998291015625, + "logits/rejected": -2.5218327045440674, + "logps/chosen": -19.523643493652344, + "logps/rejected": -144.54153442382812, + "loss": 21.0012, + "losses_ref": -1.2667987903114408e-06, + "ref_logps/chosen": -92.96281433105469, + "ref_logps/rejected": -85.9566879272461, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 73.43919372558594, + "rewards/margins": 132.02401733398438, + "rewards/rejected": -58.58484649658203, + "step": 2810, + "u": -2.1298911571502686, + "weight": 0.07500003278255463 + }, + { + "diff_generated": -59.304412841796875, + "epoch": 0.9138042773817239, + "grad_norm": 481.66422343005866, + "learning_rate": 7.022639118185819e-07, + "logits/chosen": -2.4660236835479736, + "logits/rejected": -2.4848055839538574, + "logps/chosen": -22.82510757446289, + "logps/rejected": -146.7274932861328, + "loss": 21.1632, + "losses_ref": -0.00026902236277237535, + "ref_logps/chosen": -100.3491439819336, + "ref_logps/rejected": -87.4230728149414, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 77.52403259277344, + "rewards/margins": 136.82846069335938, + "rewards/rejected": -59.304412841796875, + "step": 2820, + "u": -2.187443494796753, + "weight": 0.05001381039619446 + }, + { + "diff_generated": -61.32807540893555, + "epoch": 0.9170447180816591, + "grad_norm": 447.16863172300367, + "learning_rate": 7.012739445318712e-07, + "logits/chosen": -2.4933276176452637, + "logits/rejected": -2.5432355403900146, + "logps/chosen": -22.36863899230957, + "logps/rejected": -148.42919921875, + "loss": 21.3078, + "losses_ref": -0.0005821407539770007, + "ref_logps/chosen": -98.70513916015625, + "ref_logps/rejected": -87.10112762451172, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 76.33650207519531, + "rewards/margins": 137.66458129882812, + "rewards/rejected": -61.32807540893555, + "step": 2830, + "u": -2.2162117958068848, + "weight": 0.03752908110618591 + }, + { + "diff_generated": -57.458465576171875, + "epoch": 0.9202851587815943, + "grad_norm": 468.00606440020357, + "learning_rate": 7.002796941050435e-07, + "logits/chosen": -2.468254327774048, + "logits/rejected": -2.5315186977386475, + "logps/chosen": -20.7177677154541, + "logps/rejected": -147.50428771972656, + "loss": 22.576, + "losses_ref": -0.0009493259713053703, + "ref_logps/chosen": -93.41374206542969, + "ref_logps/rejected": -90.04581451416016, + "rewards/accuracies": 0.9375, + "rewards/chosen": 72.69596862792969, + "rewards/margins": 130.15443420410156, + "rewards/rejected": -57.458465576171875, + "step": 2840, + "u": -2.1586289405822754, + "weight": 0.0625494197010994 + }, + { + "diff_generated": -60.067726135253906, + "epoch": 0.9235255994815295, + "grad_norm": 487.44319674950174, + "learning_rate": 6.992811746731213e-07, + "logits/chosen": -2.475463390350342, + "logits/rejected": -2.5366272926330566, + "logps/chosen": -23.523868560791016, + "logps/rejected": -147.427490234375, + "loss": 22.1732, + "losses_ref": -3.6845955037279055e-05, + "ref_logps/chosen": -98.42115783691406, + "ref_logps/rejected": -87.35975646972656, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 74.89728546142578, + "rewards/margins": 134.9650115966797, + "rewards/rejected": -60.067726135253906, + "step": 2850, + "u": -2.201845645904541, + "weight": 0.04375119134783745 + }, + { + "diff_generated": -61.67189407348633, + "epoch": 0.9267660401814647, + "grad_norm": 454.104891943021, + "learning_rate": 6.98278400431818e-07, + "logits/chosen": -2.506474733352661, + "logits/rejected": -2.596954345703125, + "logps/chosen": -23.808429718017578, + "logps/rejected": -153.27609252929688, + "loss": 22.1171, + "losses_ref": -0.0010014523286372423, + "ref_logps/chosen": -99.9022445678711, + "ref_logps/rejected": -91.60420989990234, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 76.09381866455078, + "rewards/margins": 137.76571655273438, + "rewards/rejected": -61.67189407348633, + "step": 2860, + "u": -2.2449753284454346, + "weight": 0.025050124153494835 + }, + { + "diff_generated": -61.183860778808594, + "epoch": 0.9300064808813999, + "grad_norm": 430.2836400141675, + "learning_rate": 6.972713856373369e-07, + "logits/chosen": -2.499459981918335, + "logits/rejected": -2.5934641361236572, + "logps/chosen": -21.586721420288086, + "logps/rejected": -149.8661651611328, + "loss": 21.4534, + "losses_ref": -3.025634896403062e-06, + "ref_logps/chosen": -95.02459716796875, + "ref_logps/rejected": -88.68229675292969, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 73.43787384033203, + "rewards/margins": 134.62173461914062, + "rewards/rejected": -61.183860778808594, + "step": 2870, + "u": -2.2450203895568848, + "weight": 0.025000056251883507 + }, + { + "diff_generated": -61.021156311035156, + "epoch": 0.933246921581335, + "grad_norm": 435.91054480995274, + "learning_rate": 6.962601446061681e-07, + "logits/chosen": -2.4959959983825684, + "logits/rejected": -2.494209051132202, + "logps/chosen": -21.066835403442383, + "logps/rejected": -142.05026245117188, + "loss": 21.0805, + "losses_ref": -0.010031198151409626, + "ref_logps/chosen": -96.79750061035156, + "ref_logps/rejected": -81.02911376953125, + "rewards/accuracies": 0.96875, + "rewards/chosen": 75.73066711425781, + "rewards/margins": 136.75180053710938, + "rewards/rejected": -61.021156311035156, + "step": 2880, + "u": -2.230487823486328, + "weight": 0.03140627592802048 + }, + { + "diff_generated": -61.153526306152344, + "epoch": 0.9364873622812703, + "grad_norm": 467.78890629943817, + "learning_rate": 6.952446917148853e-07, + "logits/chosen": -2.469348669052124, + "logits/rejected": -2.5688066482543945, + "logps/chosen": -21.54801368713379, + "logps/rejected": -149.8988494873047, + "loss": 21.4278, + "losses_ref": -0.005234680138528347, + "ref_logps/chosen": -99.34354400634766, + "ref_logps/rejected": -88.74533081054688, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 77.79553985595703, + "rewards/margins": 138.94906616210938, + "rewards/rejected": -61.153526306152344, + "step": 2890, + "u": -2.244741201400757, + "weight": 0.025303319096565247 + }, + { + "diff_generated": -62.5263671875, + "epoch": 0.9397278029812054, + "grad_norm": 468.65833498464394, + "learning_rate": 6.94225041399941e-07, + "logits/chosen": -2.473790407180786, + "logits/rejected": -2.6006760597229004, + "logps/chosen": -20.871822357177734, + "logps/rejected": -153.6435546875, + "loss": 20.2554, + "losses_ref": -0.0003484871704131365, + "ref_logps/chosen": -96.58824157714844, + "ref_logps/rejected": -91.11719512939453, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 75.71641540527344, + "rewards/margins": 138.2427978515625, + "rewards/rejected": -62.5263671875, + "step": 2900, + "u": -2.1730494499206543, + "weight": 0.056267015635967255 + }, + { + "diff_generated": -59.31611251831055, + "epoch": 0.9429682436811406, + "grad_norm": 418.6116303978115, + "learning_rate": 6.932012081574615e-07, + "logits/chosen": -2.4939768314361572, + "logits/rejected": -2.547677516937256, + "logps/chosen": -21.716999053955078, + "logps/rejected": -146.97817993164062, + "loss": 21.6353, + "losses_ref": -4.112573606107617e-07, + "ref_logps/chosen": -93.69564056396484, + "ref_logps/rejected": -87.66205596923828, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 71.97864532470703, + "rewards/margins": 131.2947540283203, + "rewards/rejected": -59.31611251831055, + "step": 2910, + "u": -2.2018468379974365, + "weight": 0.043750010430812836 + }, + { + "diff_generated": -57.04352951049805, + "epoch": 0.9462086843810759, + "grad_norm": 429.2864831942657, + "learning_rate": 6.921732065430411e-07, + "logits/chosen": -2.433727741241455, + "logits/rejected": -2.556497573852539, + "logps/chosen": -18.58115005493164, + "logps/rejected": -142.70858764648438, + "loss": 20.421, + "losses_ref": -8.253007877101481e-07, + "ref_logps/chosen": -90.64105987548828, + "ref_logps/rejected": -85.66506958007812, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 72.0599136352539, + "rewards/margins": 129.1034393310547, + "rewards/rejected": -57.04352951049805, + "step": 2920, + "u": -2.1298911571502686, + "weight": 0.07500003278255463 + }, + { + "diff_generated": -56.113372802734375, + "epoch": 0.949449125081011, + "grad_norm": 448.6438982921234, + "learning_rate": 6.911410511715343e-07, + "logits/chosen": -2.455838441848755, + "logits/rejected": -2.5022530555725098, + "logps/chosen": -21.27231216430664, + "logps/rejected": -138.38426208496094, + "loss": 20.6897, + "losses_ref": -5.750478521804325e-05, + "ref_logps/chosen": -93.20436096191406, + "ref_logps/rejected": -82.27088165283203, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 71.93205261230469, + "rewards/margins": 128.04544067382812, + "rewards/rejected": -56.113372802734375, + "step": 2930, + "u": -2.1298890113830566, + "weight": 0.07500265538692474 + }, + { + "diff_generated": -58.04365158081055, + "epoch": 0.9526895657809462, + "grad_norm": 490.29397554218724, + "learning_rate": 6.901047567168491e-07, + "logits/chosen": -2.5180070400238037, + "logits/rejected": -2.5559816360473633, + "logps/chosen": -21.59964370727539, + "logps/rejected": -142.6434783935547, + "loss": 21.3082, + "losses_ref": -0.0004614538047462702, + "ref_logps/chosen": -95.13563537597656, + "ref_logps/rejected": -84.59981536865234, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 73.53599548339844, + "rewards/margins": 131.5796356201172, + "rewards/rejected": -58.04365158081055, + "step": 2940, + "u": -2.1298718452453613, + "weight": 0.0750214010477066 + }, + { + "diff_generated": -61.00908279418945, + "epoch": 0.9559300064808814, + "grad_norm": 487.9778393083608, + "learning_rate": 6.890643379117374e-07, + "logits/chosen": -2.5026142597198486, + "logits/rejected": -2.5429794788360596, + "logps/chosen": -21.640348434448242, + "logps/rejected": -151.98902893066406, + "loss": 20.7525, + "losses_ref": -0.00010877321619773284, + "ref_logps/chosen": -100.30354309082031, + "ref_logps/rejected": -90.97994232177734, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 78.6631851196289, + "rewards/margins": 139.67227172851562, + "rewards/rejected": -61.00908279418945, + "step": 2950, + "u": -2.1730599403381348, + "weight": 0.05625521019101143 + }, + { + "diff_generated": -60.049903869628906, + "epoch": 0.9591704471808166, + "grad_norm": 475.56335002534007, + "learning_rate": 6.880198095475866e-07, + "logits/chosen": -2.5132415294647217, + "logits/rejected": -2.5256381034851074, + "logps/chosen": -25.432048797607422, + "logps/rejected": -147.38023376464844, + "loss": 21.5488, + "losses_ref": -3.1123508961172774e-06, + "ref_logps/chosen": -105.21751403808594, + "ref_logps/rejected": -87.3303451538086, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 79.78547668457031, + "rewards/margins": 139.83535766601562, + "rewards/rejected": -60.049903869628906, + "step": 2960, + "u": -2.216238021850586, + "weight": 0.037500131875276566 + }, + { + "diff_generated": -54.8358039855957, + "epoch": 0.9624108878807518, + "grad_norm": 486.02730600965197, + "learning_rate": 6.86971186474208e-07, + "logits/chosen": -2.4802966117858887, + "logits/rejected": -2.5337626934051514, + "logps/chosen": -21.163768768310547, + "logps/rejected": -135.5006561279297, + "loss": 22.2722, + "losses_ref": -0.000891751900780946, + "ref_logps/chosen": -92.32938385009766, + "ref_logps/rejected": -80.66484832763672, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 71.16561889648438, + "rewards/margins": 126.00142669677734, + "rewards/rejected": -54.8358039855957, + "step": 2970, + "u": -2.1730198860168457, + "weight": 0.05629971623420715 + }, + { + "diff_generated": -59.64556884765625, + "epoch": 0.9656513285806869, + "grad_norm": 488.1934193333916, + "learning_rate": 6.859184835996271e-07, + "logits/chosen": -2.4701828956604004, + "logits/rejected": -2.586439371109009, + "logps/chosen": -19.97806739807129, + "logps/rejected": -148.86654663085938, + "loss": 21.1577, + "losses_ref": -2.3398897610604763e-06, + "ref_logps/chosen": -94.80010223388672, + "ref_logps/rejected": -89.22097778320312, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 74.82203674316406, + "rewards/margins": 134.46759033203125, + "rewards/rejected": -59.64556884765625, + "step": 2980, + "u": -2.2018468379974365, + "weight": 0.04375002905726433 + }, + { + "diff_generated": -61.30529022216797, + "epoch": 0.9688917692806222, + "grad_norm": 459.6357752841041, + "learning_rate": 6.848617158898704e-07, + "logits/chosen": -2.45888614654541, + "logits/rejected": -2.5506579875946045, + "logps/chosen": -18.167362213134766, + "logps/rejected": -150.75900268554688, + "loss": 20.1378, + "losses_ref": -0.0018867189064621925, + "ref_logps/chosen": -90.96756744384766, + "ref_logps/rejected": -89.4537124633789, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 72.8001937866211, + "rewards/margins": 134.10548400878906, + "rewards/rejected": -61.30529022216797, + "step": 2990, + "u": -2.2018001079559326, + "weight": 0.04380171000957489 + }, + { + "diff_generated": -61.82233428955078, + "epoch": 0.9721322099805574, + "grad_norm": 465.8121795887825, + "learning_rate": 6.838008983687538e-07, + "logits/chosen": -2.4415152072906494, + "logits/rejected": -2.540574789047241, + "logps/chosen": -19.56886863708496, + "logps/rejected": -156.75039672851562, + "loss": 20.7824, + "losses_ref": -0.0007329249056056142, + "ref_logps/chosen": -94.28569030761719, + "ref_logps/rejected": -94.92806243896484, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 74.71682739257812, + "rewards/margins": 136.53915405273438, + "rewards/rejected": -61.82233428955078, + "step": 3000, + "u": -2.2162041664123535, + "weight": 0.03753752261400223 + }, + { + "diff_generated": -61.789337158203125, + "epoch": 0.9753726506804925, + "grad_norm": 482.63406855650675, + "learning_rate": 6.827360461176675e-07, + "logits/chosen": -2.463770627975464, + "logits/rejected": -2.5621752738952637, + "logps/chosen": -22.280914306640625, + "logps/rejected": -148.77462768554688, + "loss": 22.1914, + "losses_ref": -7.3342125688213855e-06, + "ref_logps/chosen": -93.60955047607422, + "ref_logps/rejected": -86.98530578613281, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 71.32862854003906, + "rewards/margins": 133.11798095703125, + "rewards/rejected": -61.789337158203125, + "step": 3010, + "u": -2.187455654144287, + "weight": 0.05000026151537895 + }, + { + "diff_generated": -58.69233322143555, + "epoch": 0.9786130913804277, + "grad_norm": 465.3136564602363, + "learning_rate": 6.816671742753636e-07, + "logits/chosen": -2.453428030014038, + "logits/rejected": -2.5174663066864014, + "logps/chosen": -22.349462509155273, + "logps/rejected": -144.7392120361328, + "loss": 20.2201, + "losses_ref": -0.0022303853183984756, + "ref_logps/chosen": -93.95580291748047, + "ref_logps/rejected": -86.046875, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 71.60633087158203, + "rewards/margins": 130.29867553710938, + "rewards/rejected": -58.69233322143555, + "step": 3020, + "u": -2.1297779083251953, + "weight": 0.07512475550174713 + }, + { + "diff_generated": -58.20878982543945, + "epoch": 0.981853532080363, + "grad_norm": 452.30032153383627, + "learning_rate": 6.80594298037739e-07, + "logits/chosen": -2.4573230743408203, + "logits/rejected": -2.522925615310669, + "logps/chosen": -20.73061752319336, + "logps/rejected": -147.1402130126953, + "loss": 21.9122, + "losses_ref": -0.0012742785038426518, + "ref_logps/chosen": -94.61383056640625, + "ref_logps/rejected": -88.93141174316406, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 73.88321685791016, + "rewards/margins": 132.09201049804688, + "rewards/rejected": -58.20878982543945, + "step": 3030, + "u": -2.187399387359619, + "weight": 0.05006258562207222 + }, + { + "diff_generated": -59.879798889160156, + "epoch": 0.9850939727802981, + "grad_norm": 449.0923495215593, + "learning_rate": 6.795174326576201e-07, + "logits/chosen": -2.517982006072998, + "logits/rejected": -2.576937437057495, + "logps/chosen": -21.717044830322266, + "logps/rejected": -146.72311401367188, + "loss": 20.9081, + "losses_ref": -1.8586888472782448e-06, + "ref_logps/chosen": -97.45092010498047, + "ref_logps/rejected": -86.84330749511719, + "rewards/accuracies": 0.96875, + "rewards/chosen": 75.73387145996094, + "rewards/margins": 135.61367797851562, + "rewards/rejected": -59.879798889160156, + "step": 3040, + "u": -2.2306292057037354, + "weight": 0.03125004842877388 + }, + { + "diff_generated": -58.180763244628906, + "epoch": 0.9883344134802333, + "grad_norm": 464.7999063633373, + "learning_rate": 6.784365934445467e-07, + "logits/chosen": -2.4162662029266357, + "logits/rejected": -2.5505805015563965, + "logps/chosen": -19.611560821533203, + "logps/rejected": -144.66844177246094, + "loss": 21.0366, + "losses_ref": -0.0008930475451052189, + "ref_logps/chosen": -89.82169342041016, + "ref_logps/rejected": -86.48768615722656, + "rewards/accuracies": 0.9375, + "rewards/chosen": 70.21012878417969, + "rewards/margins": 128.39089965820312, + "rewards/rejected": -58.180763244628906, + "step": 3050, + "u": -2.1586310863494873, + "weight": 0.06254696100950241 + }, + { + "diff_generated": -58.33638381958008, + "epoch": 0.9915748541801686, + "grad_norm": 472.00855214714693, + "learning_rate": 6.77351795764553e-07, + "logits/chosen": -2.5259509086608887, + "logits/rejected": -2.5823984146118164, + "logps/chosen": -20.90003776550293, + "logps/rejected": -148.71282958984375, + "loss": 21.1491, + "losses_ref": -0.0017345917876809835, + "ref_logps/chosen": -100.18885803222656, + "ref_logps/rejected": -90.37644958496094, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 79.288818359375, + "rewards/margins": 137.6251983642578, + "rewards/rejected": -58.33638381958008, + "step": 3060, + "u": -2.2161552906036377, + "weight": 0.037591852247714996 + }, + { + "diff_generated": -56.86510467529297, + "epoch": 0.9948152948801037, + "grad_norm": 462.1652750726531, + "learning_rate": 6.7626305503995e-07, + "logits/chosen": -2.4363324642181396, + "logits/rejected": -2.5213849544525146, + "logps/chosen": -21.539684295654297, + "logps/rejected": -139.07391357421875, + "loss": 20.7354, + "losses_ref": -0.0002831167366821319, + "ref_logps/chosen": -93.53211975097656, + "ref_logps/rejected": -82.20880889892578, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 71.99244689941406, + "rewards/margins": 128.8575439453125, + "rewards/rejected": -56.86510467529297, + "step": 3070, + "u": -2.2018346786499023, + "weight": 0.04376371577382088 + }, + { + "diff_generated": -62.3128662109375, + "epoch": 0.9980557355800389, + "grad_norm": 459.85247368799446, + "learning_rate": 6.75170386749106e-07, + "logits/chosen": -2.4534130096435547, + "logits/rejected": -2.5522007942199707, + "logps/chosen": -22.608396530151367, + "logps/rejected": -157.0138397216797, + "loss": 20.5769, + "losses_ref": -0.00012439176498446614, + "ref_logps/chosen": -98.05714416503906, + "ref_logps/rejected": -94.70097351074219, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 75.4487533569336, + "rewards/margins": 137.76162719726562, + "rewards/rejected": -62.3128662109375, + "step": 3080, + "u": -2.28818941116333, + "weight": 0.0062549663707613945 + }, + { + "diff_generated": -59.61762619018555, + "epoch": 1.0012961762799741, + "grad_norm": 451.9635942553253, + "learning_rate": 6.740738064262265e-07, + "logits/chosen": -2.4870359897613525, + "logits/rejected": -2.5867018699645996, + "logps/chosen": -19.341670989990234, + "logps/rejected": -147.53176879882812, + "loss": 19.2801, + "losses_ref": -0.0015698724891990423, + "ref_logps/chosen": -93.33935546875, + "ref_logps/rejected": -87.9141616821289, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 73.99769592285156, + "rewards/margins": 133.61532592773438, + "rewards/rejected": -59.61762619018555, + "step": 3090, + "u": -3.090423345565796, + "weight": 0.04383974149823189 + }, + { + "diff_generated": -66.58012390136719, + "epoch": 1.0045366169799093, + "grad_norm": 462.3154034742135, + "learning_rate": 6.729733296611336e-07, + "logits/chosen": -2.510164260864258, + "logits/rejected": -2.58900785446167, + "logps/chosen": -16.40743064880371, + "logps/rejected": -154.39218139648438, + "loss": 17.2317, + "losses_ref": -0.00252619874663651, + "ref_logps/chosen": -95.60277557373047, + "ref_logps/rejected": -87.81204223632812, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 79.19535064697266, + "rewards/margins": 145.77548217773438, + "rewards/rejected": -66.58012390136719, + "step": 3100, + "u": -4.446268558502197, + "weight": 0.025122780352830887 + }, + { + "diff_generated": -66.8081283569336, + "epoch": 1.0077770576798444, + "grad_norm": 495.00120559126685, + "learning_rate": 6.718689720990442e-07, + "logits/chosen": -2.4792261123657227, + "logits/rejected": -2.572613000869751, + "logps/chosen": -16.952396392822266, + "logps/rejected": -153.31338500976562, + "loss": 17.2435, + "losses_ref": -0.08517072349786758, + "ref_logps/chosen": -94.31925964355469, + "ref_logps/rejected": -86.5052490234375, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 77.36685943603516, + "rewards/margins": 144.17498779296875, + "rewards/rejected": -66.8081283569336, + "step": 3110, + "u": -4.296696662902832, + "weight": 0.05492968484759331 + }, + { + "diff_generated": -63.595611572265625, + "epoch": 1.0110174983797797, + "grad_norm": 467.1313773559849, + "learning_rate": 6.707607494403471e-07, + "logits/chosen": -2.4678194522857666, + "logits/rejected": -2.552511692047119, + "logps/chosen": -16.294748306274414, + "logps/rejected": -148.17742919921875, + "loss": 16.8622, + "losses_ref": -0.003516948549076915, + "ref_logps/chosen": -91.89102935791016, + "ref_logps/rejected": -84.58182525634766, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 75.59628295898438, + "rewards/margins": 139.19190979003906, + "rewards/rejected": -63.595611572265625, + "step": 3120, + "u": -4.301178455352783, + "weight": 0.056418467313051224 + }, + { + "diff_generated": -65.88175201416016, + "epoch": 1.0142579390797148, + "grad_norm": 475.5010872008353, + "learning_rate": 6.696486774403812e-07, + "logits/chosen": -2.4392142295837402, + "logits/rejected": -2.5472519397735596, + "logps/chosen": -18.009849548339844, + "logps/rejected": -154.6840362548828, + "loss": 17.8148, + "losses_ref": -1.152172558249731e-06, + "ref_logps/chosen": -92.8963394165039, + "ref_logps/rejected": -88.80229187011719, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 74.88648986816406, + "rewards/margins": 140.76824951171875, + "rewards/rejected": -65.88175201416016, + "step": 3130, + "u": -4.3748979568481445, + "weight": 0.050000034272670746 + }, + { + "diff_generated": -67.03091430664062, + "epoch": 1.01749837977965, + "grad_norm": 460.5138890938086, + "learning_rate": 6.685327719092096e-07, + "logits/chosen": -2.4010062217712402, + "logits/rejected": -2.5667436122894287, + "logps/chosen": -14.499166488647461, + "logps/rejected": -149.5535430908203, + "loss": 17.7814, + "losses_ref": -9.857653640210629e-05, + "ref_logps/chosen": -88.37496948242188, + "ref_logps/rejected": -82.52261352539062, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 73.87580871582031, + "rewards/margins": 140.90672302246094, + "rewards/rejected": -67.03091430664062, + "step": 3140, + "u": -4.2142720222473145, + "weight": 0.07500406354665756 + }, + { + "diff_generated": -65.28272247314453, + "epoch": 1.0207388204795853, + "grad_norm": 421.26837650221717, + "learning_rate": 6.674130487113962e-07, + "logits/chosen": -2.5053372383117676, + "logits/rejected": -2.55894136428833, + "logps/chosen": -18.61953353881836, + "logps/rejected": -150.85842895507812, + "loss": 18.2352, + "losses_ref": -3.157412109544566e-08, + "ref_logps/chosen": -98.04133605957031, + "ref_logps/rejected": -85.5757064819336, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 79.42179870605469, + "rewards/margins": 144.70452880859375, + "rewards/rejected": -65.28272247314453, + "step": 3150, + "u": -4.365433692932129, + "weight": 0.04374999925494194 + }, + { + "diff_generated": -68.14897155761719, + "epoch": 1.0239792611795204, + "grad_norm": 436.8557600214886, + "learning_rate": 6.662895237657799e-07, + "logits/chosen": -2.5241332054138184, + "logits/rejected": -2.5666394233703613, + "logps/chosen": -17.1142520904541, + "logps/rejected": -153.32432556152344, + "loss": 17.3762, + "losses_ref": -8.085754416242708e-06, + "ref_logps/chosen": -98.8514404296875, + "ref_logps/rejected": -85.17535400390625, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 81.73719787597656, + "rewards/margins": 149.88616943359375, + "rewards/rejected": -68.14897155761719, + "step": 3160, + "u": -4.447952747344971, + "weight": 0.025000324472784996 + }, + { + "diff_generated": -64.65580749511719, + "epoch": 1.0272197018794555, + "grad_norm": 460.11094844811134, + "learning_rate": 6.651622130452481e-07, + "logits/chosen": -2.46612548828125, + "logits/rejected": -2.5433506965637207, + "logps/chosen": -19.915821075439453, + "logps/rejected": -151.89566040039062, + "loss": 17.3588, + "losses_ref": -0.00018419846310280263, + "ref_logps/chosen": -96.14291381835938, + "ref_logps/rejected": -87.23985290527344, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 76.22709655761719, + "rewards/margins": 140.88290405273438, + "rewards/rejected": -64.65580749511719, + "step": 3170, + "u": -4.275555610656738, + "weight": 0.06875447183847427 + }, + { + "diff_generated": -68.25298309326172, + "epoch": 1.030460142579391, + "grad_norm": 434.93450671648856, + "learning_rate": 6.640311325765096e-07, + "logits/chosen": -2.4406819343566895, + "logits/rejected": -2.571199417114258, + "logps/chosen": -16.93312644958496, + "logps/rejected": -160.1395721435547, + "loss": 17.5433, + "losses_ref": -0.0010875340085476637, + "ref_logps/chosen": -92.92141723632812, + "ref_logps/rejected": -91.88658142089844, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 75.98828887939453, + "rewards/margins": 144.24127197265625, + "rewards/rejected": -68.25298309326172, + "step": 3180, + "u": -4.345793724060059, + "weight": 0.050050728023052216 + }, + { + "diff_generated": -66.62785339355469, + "epoch": 1.033700583279326, + "grad_norm": 510.4120915296864, + "learning_rate": 6.628962984398663e-07, + "logits/chosen": -2.485319137573242, + "logits/rejected": -2.5952136516571045, + "logps/chosen": -17.191240310668945, + "logps/rejected": -155.60009765625, + "loss": 17.5711, + "losses_ref": -0.0012958078878000379, + "ref_logps/chosen": -96.52293395996094, + "ref_logps/rejected": -88.97222900390625, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 79.33170318603516, + "rewards/margins": 145.9595489501953, + "rewards/rejected": -66.62785339355469, + "step": 3190, + "u": -4.343929290771484, + "weight": 0.04381078481674194 + }, + { + "diff_generated": -67.12268829345703, + "epoch": 1.0369410239792611, + "grad_norm": 452.0149194344355, + "learning_rate": 6.617577267689863e-07, + "logits/chosen": -2.4636361598968506, + "logits/rejected": -2.5697529315948486, + "logps/chosen": -16.585233688354492, + "logps/rejected": -153.8497314453125, + "loss": 17.8756, + "losses_ref": -0.0017653731629252434, + "ref_logps/chosen": -94.59716033935547, + "ref_logps/rejected": -86.72703552246094, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 78.01192474365234, + "rewards/margins": 145.13461303710938, + "rewards/rejected": -67.12268829345703, + "step": 3200, + "u": -4.3374176025390625, + "weight": 0.05008355900645256 + }, + { + "diff_generated": -66.08069610595703, + "epoch": 1.0401814646791965, + "grad_norm": 444.5697359069676, + "learning_rate": 6.606154337506721e-07, + "logits/chosen": -2.514535665512085, + "logits/rejected": -2.5685744285583496, + "logps/chosen": -19.98543357849121, + "logps/rejected": -150.83689880371094, + "loss": 17.259, + "losses_ref": -0.0015171390259638429, + "ref_logps/chosen": -99.03950500488281, + "ref_logps/rejected": -84.75621032714844, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 79.05406951904297, + "rewards/margins": 145.13479614257812, + "rewards/rejected": -66.08069610595703, + "step": 3210, + "u": -4.347279071807861, + "weight": 0.050075214356184006 + }, + { + "diff_generated": -61.97309112548828, + "epoch": 1.0434219053791316, + "grad_norm": 453.53740449674814, + "learning_rate": 6.594694356246325e-07, + "logits/chosen": -2.5020461082458496, + "logits/rejected": -2.5105185508728027, + "logps/chosen": -18.61556625366211, + "logps/rejected": -142.10214233398438, + "loss": 17.6667, + "losses_ref": -6.612971503727749e-08, + "ref_logps/chosen": -96.94075012207031, + "ref_logps/rejected": -80.12906646728516, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 78.32518005371094, + "rewards/margins": 140.2982635498047, + "rewards/rejected": -61.97309112548828, + "step": 3220, + "u": -4.212699890136719, + "weight": 0.08124999701976776 + }, + { + "diff_generated": -65.51012420654297, + "epoch": 1.0466623460790667, + "grad_norm": 448.9919035981769, + "learning_rate": 6.583197486832506e-07, + "logits/chosen": -2.495256185531616, + "logits/rejected": -2.533613920211792, + "logps/chosen": -17.16736602783203, + "logps/rejected": -151.90597534179688, + "loss": 18.1376, + "losses_ref": -1.783437937774579e-06, + "ref_logps/chosen": -94.41279602050781, + "ref_logps/rejected": -86.39586639404297, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 77.24542999267578, + "rewards/margins": 142.7555694580078, + "rewards/rejected": -65.51012420654297, + "step": 3230, + "u": -4.212017059326172, + "weight": 0.07500006258487701 + }, + { + "diff_generated": -65.96939086914062, + "epoch": 1.0499027867790018, + "grad_norm": 457.2713751712491, + "learning_rate": 6.571663892713527e-07, + "logits/chosen": -2.4962501525878906, + "logits/rejected": -2.5798799991607666, + "logps/chosen": -17.670930862426758, + "logps/rejected": -152.85986328125, + "loss": 16.7632, + "losses_ref": -0.006728614680469036, + "ref_logps/chosen": -95.16192626953125, + "ref_logps/rejected": -86.8904800415039, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 77.49100494384766, + "rewards/margins": 143.46038818359375, + "rewards/rejected": -65.96939086914062, + "step": 3240, + "u": -4.393555164337158, + "weight": 0.03781301528215408 + }, + { + "diff_generated": -63.8961181640625, + "epoch": 1.0531432274789372, + "grad_norm": 471.8849246342818, + "learning_rate": 6.560093737859755e-07, + "logits/chosen": -2.5056347846984863, + "logits/rejected": -2.4781854152679443, + "logps/chosen": -17.58412742614746, + "logps/rejected": -144.17770385742188, + "loss": 17.5115, + "losses_ref": -7.131105803637183e-07, + "ref_logps/chosen": -95.46975708007812, + "ref_logps/rejected": -80.28160095214844, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 77.8856201171875, + "rewards/margins": 141.78172302246094, + "rewards/rejected": -63.8961181640625, + "step": 3250, + "u": -4.220991134643555, + "weight": 0.07500002533197403 + }, + { + "diff_generated": -66.7835464477539, + "epoch": 1.0563836681788723, + "grad_norm": 457.9417057538034, + "learning_rate": 6.548487186761334e-07, + "logits/chosen": -2.4845380783081055, + "logits/rejected": -2.5910003185272217, + "logps/chosen": -17.014305114746094, + "logps/rejected": -154.10397338867188, + "loss": 17.4821, + "losses_ref": -0.0043944017961621284, + "ref_logps/chosen": -92.6029052734375, + "ref_logps/rejected": -87.3204345703125, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 75.5886001586914, + "rewards/margins": 142.3721466064453, + "rewards/rejected": -66.7835464477539, + "step": 3260, + "u": -4.323208332061768, + "weight": 0.0439579114317894 + }, + { + "diff_generated": -65.97068786621094, + "epoch": 1.0596241088788074, + "grad_norm": 473.9528443346443, + "learning_rate": 6.536844404425845e-07, + "logits/chosen": -2.4793522357940674, + "logits/rejected": -2.586184501647949, + "logps/chosen": -16.8640193939209, + "logps/rejected": -154.61196899414062, + "loss": 17.15, + "losses_ref": -0.03145980462431908, + "ref_logps/chosen": -95.58818054199219, + "ref_logps/rejected": -88.64127349853516, + "rewards/accuracies": 0.9375, + "rewards/chosen": 78.72415924072266, + "rewards/margins": 144.69485473632812, + "rewards/rejected": -65.97068786621094, + "step": 3270, + "u": -4.271958351135254, + "weight": 0.06429260969161987 + }, + { + "diff_generated": -65.097412109375, + "epoch": 1.0628645495787428, + "grad_norm": 452.4811006320355, + "learning_rate": 6.525165556375959e-07, + "logits/chosen": -2.444775342941284, + "logits/rejected": -2.578066349029541, + "logps/chosen": -15.83275032043457, + "logps/rejected": -146.58653259277344, + "loss": 17.2568, + "losses_ref": -0.0008164413156919181, + "ref_logps/chosen": -90.00611877441406, + "ref_logps/rejected": -81.4891357421875, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 74.17335510253906, + "rewards/margins": 139.270751953125, + "rewards/rejected": -65.097412109375, + "step": 3280, + "u": -4.3508782386779785, + "weight": 0.05003942921757698 + }, + { + "diff_generated": -67.97452545166016, + "epoch": 1.0661049902786779, + "grad_norm": 481.39553728386596, + "learning_rate": 6.513450808647086e-07, + "logits/chosen": -2.437958002090454, + "logits/rejected": -2.5370841026306152, + "logps/chosen": -18.49962615966797, + "logps/rejected": -152.44979858398438, + "loss": 18.0844, + "losses_ref": -1.7934650031747879e-06, + "ref_logps/chosen": -96.98509979248047, + "ref_logps/rejected": -84.47525787353516, + "rewards/accuracies": 0.96875, + "rewards/chosen": 78.48546600341797, + "rewards/margins": 146.45999145507812, + "rewards/rejected": -67.97452545166016, + "step": 3290, + "u": -4.41939115524292, + "weight": 0.03125005215406418 + }, + { + "diff_generated": -66.6233139038086, + "epoch": 1.069345430978613, + "grad_norm": 465.37946702473073, + "learning_rate": 6.501700327785011e-07, + "logits/chosen": -2.5347819328308105, + "logits/rejected": -2.573274612426758, + "logps/chosen": -16.581130981445312, + "logps/rejected": -150.71554565429688, + "loss": 17.4926, + "losses_ref": -9.045367733051535e-06, + "ref_logps/chosen": -93.96571350097656, + "ref_logps/rejected": -84.09223175048828, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 77.38458251953125, + "rewards/margins": 144.0078887939453, + "rewards/rejected": -66.6233139038086, + "step": 3300, + "u": -4.263393878936768, + "weight": 0.06875050067901611 + }, + { + "diff_generated": -63.082496643066406, + "epoch": 1.0725858716785484, + "grad_norm": 471.1217208714754, + "learning_rate": 6.489914280843528e-07, + "logits/chosen": -2.508230686187744, + "logits/rejected": -2.532465696334839, + "logps/chosen": -18.42227554321289, + "logps/rejected": -138.54238891601562, + "loss": 17.4918, + "losses_ref": -3.896912858181167e-06, + "ref_logps/chosen": -93.45298767089844, + "ref_logps/rejected": -75.45990753173828, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 75.03073120117188, + "rewards/margins": 138.11322021484375, + "rewards/rejected": -63.082496643066406, + "step": 3310, + "u": -4.203088283538818, + "weight": 0.07500005513429642 + }, + { + "diff_generated": -66.8916015625, + "epoch": 1.0758263123784835, + "grad_norm": 472.5438097483653, + "learning_rate": 6.478092835382071e-07, + "logits/chosen": -2.4746553897857666, + "logits/rejected": -2.5401337146759033, + "logps/chosen": -18.509883880615234, + "logps/rejected": -152.7678680419922, + "loss": 17.1164, + "losses_ref": -0.00354144093580544, + "ref_logps/chosen": -96.56227111816406, + "ref_logps/rejected": -85.87627410888672, + "rewards/accuracies": 0.96875, + "rewards/chosen": 78.0523910522461, + "rewards/margins": 144.94398498535156, + "rewards/rejected": -66.8916015625, + "step": 3320, + "u": -4.42160701751709, + "weight": 0.031427718698978424 + }, + { + "diff_generated": -69.9482421875, + "epoch": 1.0790667530784186, + "grad_norm": 467.0081761853318, + "learning_rate": 6.466236159463319e-07, + "logits/chosen": -2.4707579612731934, + "logits/rejected": -2.56321382522583, + "logps/chosen": -16.863197326660156, + "logps/rejected": -162.45408630371094, + "loss": 17.6638, + "losses_ref": -1.793040610209573e-05, + "ref_logps/chosen": -95.34489440917969, + "ref_logps/rejected": -92.5058364868164, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 78.481689453125, + "rewards/margins": 148.42991638183594, + "rewards/rejected": -69.9482421875, + "step": 3330, + "u": -4.3579301834106445, + "weight": 0.04375045746564865 + }, + { + "diff_generated": -71.49409484863281, + "epoch": 1.082307193778354, + "grad_norm": 502.1179103976336, + "learning_rate": 6.45434442165082e-07, + "logits/chosen": -2.4856925010681152, + "logits/rejected": -2.5980522632598877, + "logps/chosen": -17.530838012695312, + "logps/rejected": -163.8715362548828, + "loss": 17.9225, + "losses_ref": -3.988393018516945e-06, + "ref_logps/chosen": -96.364990234375, + "ref_logps/rejected": -92.37743377685547, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 78.83414459228516, + "rewards/margins": 150.3282470703125, + "rewards/rejected": -71.49409484863281, + "step": 3340, + "u": -4.450523376464844, + "weight": 0.018750127404928207 + }, + { + "diff_generated": -68.51228332519531, + "epoch": 1.085547634478289, + "grad_norm": 445.6767835939639, + "learning_rate": 6.442417791006585e-07, + "logits/chosen": -2.4890549182891846, + "logits/rejected": -2.5770602226257324, + "logps/chosen": -17.04773712158203, + "logps/rejected": -154.55792236328125, + "loss": 17.6168, + "losses_ref": -5.988636075926479e-07, + "ref_logps/chosen": -95.89872741699219, + "ref_logps/rejected": -86.04563903808594, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 78.85099792480469, + "rewards/margins": 147.36328125, + "rewards/rejected": -68.51228332519531, + "step": 3350, + "u": -4.234282493591309, + "weight": 0.05625001713633537 + }, + { + "diff_generated": -66.1912612915039, + "epoch": 1.0887880751782242, + "grad_norm": 492.68807977978, + "learning_rate": 6.43045643708869e-07, + "logits/chosen": -2.4714395999908447, + "logits/rejected": -2.5174362659454346, + "logps/chosen": -17.673574447631836, + "logps/rejected": -149.83248901367188, + "loss": 18.1353, + "losses_ref": -0.0020296932198107243, + "ref_logps/chosen": -98.73755645751953, + "ref_logps/rejected": -83.6412353515625, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 81.06398010253906, + "rewards/margins": 147.2552490234375, + "rewards/rejected": -66.1912612915039, + "step": 3360, + "u": -4.327150821685791, + "weight": 0.050094105303287506 + }, + { + "diff_generated": -66.18561553955078, + "epoch": 1.0920285158781595, + "grad_norm": 508.9405686088096, + "learning_rate": 6.418460529948861e-07, + "logits/chosen": -2.453836441040039, + "logits/rejected": -2.5827078819274902, + "logps/chosen": -15.104484558105469, + "logps/rejected": -148.48648071289062, + "loss": 17.6481, + "losses_ref": -0.0013367208885028958, + "ref_logps/chosen": -89.31407928466797, + "ref_logps/rejected": -82.3008804321289, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 74.2095947265625, + "rewards/margins": 140.39520263671875, + "rewards/rejected": -66.18561553955078, + "step": 3370, + "u": -4.214970588684082, + "weight": 0.08131183683872223 + }, + { + "diff_generated": -67.63780212402344, + "epoch": 1.0952689565780946, + "grad_norm": 494.30171232629164, + "learning_rate": 6.406430240130064e-07, + "logits/chosen": -2.4436795711517334, + "logits/rejected": -2.5577890872955322, + "logps/chosen": -17.331829071044922, + "logps/rejected": -153.96365356445312, + "loss": 17.1103, + "losses_ref": -0.0005108517943881452, + "ref_logps/chosen": -94.46105194091797, + "ref_logps/rejected": -86.32584381103516, + "rewards/accuracies": 0.9375, + "rewards/chosen": 77.12922668457031, + "rewards/margins": 144.76699829101562, + "rewards/rejected": -67.63780212402344, + "step": 3380, + "u": -4.249946117401123, + "weight": 0.06252266466617584 + }, + { + "diff_generated": -64.21388244628906, + "epoch": 1.0985093972780298, + "grad_norm": 436.4445367228094, + "learning_rate": 6.39436573866407e-07, + "logits/chosen": -2.469306468963623, + "logits/rejected": -2.567539691925049, + "logps/chosen": -18.55048942565918, + "logps/rejected": -152.05569458007812, + "loss": 17.6939, + "losses_ref": -1.3752096492680721e-05, + "ref_logps/chosen": -96.04664611816406, + "ref_logps/rejected": -87.84181213378906, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 77.49615478515625, + "rewards/margins": 141.7100067138672, + "rewards/rejected": -64.21388244628906, + "step": 3390, + "u": -4.355813980102539, + "weight": 0.050000060349702835 + }, + { + "diff_generated": -64.73839569091797, + "epoch": 1.101749837977965, + "grad_norm": 495.27880536840814, + "learning_rate": 6.38226719706903e-07, + "logits/chosen": -2.4457192420959473, + "logits/rejected": -2.5553908348083496, + "logps/chosen": -16.09178924560547, + "logps/rejected": -146.14370727539062, + "loss": 17.7363, + "losses_ref": -0.00144859217107296, + "ref_logps/chosen": -90.36268615722656, + "ref_logps/rejected": -81.4052963256836, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 74.27088928222656, + "rewards/margins": 139.00930786132812, + "rewards/rejected": -64.73839569091797, + "step": 3400, + "u": -4.222038745880127, + "weight": 0.07507114857435226 + }, + { + "diff_generated": -65.90394592285156, + "epoch": 1.1049902786779002, + "grad_norm": 466.87894214692636, + "learning_rate": 6.370134787347039e-07, + "logits/chosen": -2.473989963531494, + "logits/rejected": -2.566941738128662, + "logps/chosen": -17.068649291992188, + "logps/rejected": -156.55332946777344, + "loss": 17.2998, + "losses_ref": -1.9342860468896106e-06, + "ref_logps/chosen": -94.74530029296875, + "ref_logps/rejected": -90.6493911743164, + "rewards/accuracies": 0.9375, + "rewards/chosen": 77.67665100097656, + "rewards/margins": 143.5806121826172, + "rewards/rejected": -65.90394592285156, + "step": 3410, + "u": -4.272525787353516, + "weight": 0.0625000149011612 + }, + { + "diff_generated": -64.97685241699219, + "epoch": 1.1082307193778353, + "grad_norm": 481.9965436826669, + "learning_rate": 6.357968681981683e-07, + "logits/chosen": -2.4253671169281006, + "logits/rejected": -2.472318649291992, + "logps/chosen": -18.774478912353516, + "logps/rejected": -151.53453063964844, + "loss": 17.6054, + "losses_ref": -1.1018643419902219e-07, + "ref_logps/chosen": -98.19063568115234, + "ref_logps/rejected": -86.55766296386719, + "rewards/accuracies": 0.9375, + "rewards/chosen": 79.41615295410156, + "rewards/margins": 144.39300537109375, + "rewards/rejected": -64.97685241699219, + "step": 3420, + "u": -4.315249443054199, + "weight": 0.0625 + }, + { + "diff_generated": -66.34733581542969, + "epoch": 1.1114711600777705, + "grad_norm": 463.2751033596651, + "learning_rate": 6.345769053935595e-07, + "logits/chosen": -2.4380462169647217, + "logits/rejected": -2.569019317626953, + "logps/chosen": -14.185934066772461, + "logps/rejected": -151.55096435546875, + "loss": 16.9791, + "losses_ref": -1.4939736502128653e-05, + "ref_logps/chosen": -87.73268127441406, + "ref_logps/rejected": -85.2036361694336, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 73.54673767089844, + "rewards/margins": 139.89407348632812, + "rewards/rejected": -66.34733581542969, + "step": 3430, + "u": -4.263554573059082, + "weight": 0.0687505379319191 + }, + { + "diff_generated": -70.10304260253906, + "epoch": 1.1147116007777058, + "grad_norm": 477.96388369546867, + "learning_rate": 6.333536076647985e-07, + "logits/chosen": -2.3879170417785645, + "logits/rejected": -2.544254779815674, + "logps/chosen": -16.81411361694336, + "logps/rejected": -159.60748291015625, + "loss": 17.5568, + "losses_ref": -0.010113712400197983, + "ref_logps/chosen": -93.77284240722656, + "ref_logps/rejected": -89.50444793701172, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 76.95872497558594, + "rewards/margins": 147.06175231933594, + "rewards/rejected": -70.10304260253906, + "step": 3440, + "u": -4.284361362457275, + "weight": 0.050551921129226685 + }, + { + "diff_generated": -68.41407012939453, + "epoch": 1.117952041477641, + "grad_norm": 474.8415955834364, + "learning_rate": 6.321269924032188e-07, + "logits/chosen": -2.439448356628418, + "logits/rejected": -2.4982120990753174, + "logps/chosen": -20.122116088867188, + "logps/rejected": -155.7066650390625, + "loss": 18.1174, + "losses_ref": -0.0050384835340082645, + "ref_logps/chosen": -100.1120376586914, + "ref_logps/rejected": -87.29259490966797, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 79.98992156982422, + "rewards/margins": 148.4039764404297, + "rewards/rejected": -68.41407012939453, + "step": 3450, + "u": -4.3607940673828125, + "weight": 0.044004492461681366 + }, + { + "diff_generated": -68.7255859375, + "epoch": 1.121192482177576, + "grad_norm": 440.84411122537495, + "learning_rate": 6.308970770473184e-07, + "logits/chosen": -2.405679702758789, + "logits/rejected": -2.4746363162994385, + "logps/chosen": -17.939823150634766, + "logps/rejected": -156.30960083007812, + "loss": 17.7598, + "losses_ref": -0.013999903574585915, + "ref_logps/chosen": -96.26406860351562, + "ref_logps/rejected": -87.58399963378906, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 78.32423400878906, + "rewards/margins": 147.04983520507812, + "rewards/rejected": -68.7255859375, + "step": 3460, + "u": -4.44952392578125, + "weight": 0.025478944182395935 + }, + { + "diff_generated": -66.78877258300781, + "epoch": 1.1244329228775114, + "grad_norm": 492.26077070106766, + "learning_rate": 6.296638790825117e-07, + "logits/chosen": -2.4781577587127686, + "logits/rejected": -2.537199020385742, + "logps/chosen": -17.374792098999023, + "logps/rejected": -152.56198120117188, + "loss": 17.4689, + "losses_ref": -0.000983425066806376, + "ref_logps/chosen": -96.94143676757812, + "ref_logps/rejected": -85.77323150634766, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 79.56664276123047, + "rewards/margins": 146.3554229736328, + "rewards/rejected": -66.78877258300781, + "step": 3470, + "u": -4.366348743438721, + "weight": 0.04379170760512352 + }, + { + "diff_generated": -65.74295806884766, + "epoch": 1.1276733635774465, + "grad_norm": 450.3889903703623, + "learning_rate": 6.284274160408812e-07, + "logits/chosen": -2.4432475566864014, + "logits/rejected": -2.5553131103515625, + "logps/chosen": -15.783769607543945, + "logps/rejected": -149.1510467529297, + "loss": 17.0366, + "losses_ref": -2.2255520889302716e-05, + "ref_logps/chosen": -92.00151062011719, + "ref_logps/rejected": -83.40808868408203, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 76.2177505493164, + "rewards/margins": 141.960693359375, + "rewards/rejected": -65.74295806884766, + "step": 3480, + "u": -4.216153621673584, + "weight": 0.0812506452202797 + }, + { + "diff_generated": -66.05118560791016, + "epoch": 1.1309138042773816, + "grad_norm": 506.9247184796788, + "learning_rate": 6.271877055009284e-07, + "logits/chosen": -2.4507365226745605, + "logits/rejected": -2.552597761154175, + "logps/chosen": -17.666889190673828, + "logps/rejected": -152.59658813476562, + "loss": 17.9093, + "losses_ref": -0.0013337829150259495, + "ref_logps/chosen": -96.6623764038086, + "ref_logps/rejected": -86.54540252685547, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 78.9954833984375, + "rewards/margins": 145.04666137695312, + "rewards/rejected": -66.05118560791016, + "step": 3490, + "u": -4.291287422180176, + "weight": 0.05006307363510132 + }, + { + "diff_generated": -70.07727813720703, + "epoch": 1.134154244977317, + "grad_norm": 471.32853184759824, + "learning_rate": 6.259447650873236e-07, + "logits/chosen": -2.509068012237549, + "logits/rejected": -2.5998129844665527, + "logps/chosen": -15.926129341125488, + "logps/rejected": -161.7591552734375, + "loss": 18.0347, + "losses_ref": -0.003403474111109972, + "ref_logps/chosen": -94.38692474365234, + "ref_logps/rejected": -91.68186950683594, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 78.4607925415039, + "rewards/margins": 148.5380859375, + "rewards/rejected": -70.07727813720703, + "step": 3500, + "u": -4.349414825439453, + "weight": 0.050165869295597076 + }, + { + "diff_generated": -66.88079833984375, + "epoch": 1.137394685677252, + "grad_norm": 440.6501931497652, + "learning_rate": 6.246986124706555e-07, + "logits/chosen": -2.4343461990356445, + "logits/rejected": -2.565577983856201, + "logps/chosen": -18.252582550048828, + "logps/rejected": -157.1911163330078, + "loss": 17.6839, + "losses_ref": -0.004433914087712765, + "ref_logps/chosen": -93.1754379272461, + "ref_logps/rejected": -90.31031799316406, + "rewards/accuracies": 0.9375, + "rewards/chosen": 74.9228515625, + "rewards/margins": 141.80364990234375, + "rewards/rejected": -66.88079833984375, + "step": 3510, + "u": -4.272037506103516, + "weight": 0.06272298842668533 + }, + { + "diff_generated": -66.67464447021484, + "epoch": 1.1406351263771872, + "grad_norm": 454.5961208898324, + "learning_rate": 6.234492653671797e-07, + "logits/chosen": -2.4900689125061035, + "logits/rejected": -2.5623250007629395, + "logps/chosen": -18.45693588256836, + "logps/rejected": -154.75906372070312, + "loss": 17.7162, + "losses_ref": -0.0011696848087012768, + "ref_logps/chosen": -98.3475341796875, + "ref_logps/rejected": -88.08442687988281, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 79.8906021118164, + "rewards/margins": 146.56524658203125, + "rewards/rejected": -66.67464447021484, + "step": 3520, + "u": -4.374854564666748, + "weight": 0.05005534738302231 + }, + { + "diff_generated": -65.45890045166016, + "epoch": 1.1438755670771226, + "grad_norm": 523.89030306214, + "learning_rate": 6.221967415385675e-07, + "logits/chosen": -2.4773688316345215, + "logits/rejected": -2.516385555267334, + "logps/chosen": -17.959131240844727, + "logps/rejected": -148.49928283691406, + "loss": 18.0136, + "losses_ref": -4.428060947248014e-06, + "ref_logps/chosen": -99.86962890625, + "ref_logps/rejected": -83.0403823852539, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 81.91049194335938, + "rewards/margins": 147.36940002441406, + "rewards/rejected": -65.45890045166016, + "step": 3530, + "u": -4.355672836303711, + "weight": 0.050000131130218506 + }, + { + "diff_generated": -66.77120971679688, + "epoch": 1.1471160077770577, + "grad_norm": 470.31431857349844, + "learning_rate": 6.209410587916524e-07, + "logits/chosen": -2.426239252090454, + "logits/rejected": -2.4639995098114014, + "logps/chosen": -19.04548454284668, + "logps/rejected": -149.74981689453125, + "loss": 17.8432, + "losses_ref": -1.6486468439325108e-06, + "ref_logps/chosen": -100.50316619873047, + "ref_logps/rejected": -82.97860717773438, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 81.45768737792969, + "rewards/margins": 148.22891235351562, + "rewards/rejected": -66.77120971679688, + "step": 3540, + "u": -4.378117084503174, + "weight": 0.03750004991889 + }, + { + "diff_generated": -66.69547271728516, + "epoch": 1.1503564484769928, + "grad_norm": 487.2198330207783, + "learning_rate": 6.196822349781781e-07, + "logits/chosen": -2.4488656520843506, + "logits/rejected": -2.503800868988037, + "logps/chosen": -18.600788116455078, + "logps/rejected": -150.7244873046875, + "loss": 17.6787, + "losses_ref": -0.0007633547065779567, + "ref_logps/chosen": -97.8669662475586, + "ref_logps/rejected": -84.02903747558594, + "rewards/accuracies": 0.96875, + "rewards/chosen": 79.26618957519531, + "rewards/margins": 145.961669921875, + "rewards/rejected": -66.69547271728516, + "step": 3550, + "u": -4.396124362945557, + "weight": 0.031284209340810776 + }, + { + "diff_generated": -64.43789672851562, + "epoch": 1.1535968891769282, + "grad_norm": 443.73160688764654, + "learning_rate": 6.184202879945437e-07, + "logits/chosen": -2.4165685176849365, + "logits/rejected": -2.4975438117980957, + "logps/chosen": -18.127599716186523, + "logps/rejected": -142.0409393310547, + "loss": 18.1951, + "losses_ref": -8.06269440545293e-07, + "ref_logps/chosen": -92.44525146484375, + "ref_logps/rejected": -77.60304260253906, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 74.3176498413086, + "rewards/margins": 138.75555419921875, + "rewards/rejected": -64.43789672851562, + "step": 3560, + "u": -4.217528343200684, + "weight": 0.07500002533197403 + }, + { + "diff_generated": -66.31949615478516, + "epoch": 1.1568373298768633, + "grad_norm": 396.4063188323412, + "learning_rate": 6.171552357815497e-07, + "logits/chosen": -2.4018328189849854, + "logits/rejected": -2.540160894393921, + "logps/chosen": -17.0816650390625, + "logps/rejected": -151.96615600585938, + "loss": 17.6642, + "losses_ref": -0.0004584209527820349, + "ref_logps/chosen": -93.61339569091797, + "ref_logps/rejected": -85.64667510986328, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 76.53173828125, + "rewards/margins": 142.85122680664062, + "rewards/rejected": -66.31949615478516, + "step": 3570, + "u": -4.435829162597656, + "weight": 0.025020593777298927 + }, + { + "diff_generated": -67.34017944335938, + "epoch": 1.1600777705767984, + "grad_norm": 500.8741646869675, + "learning_rate": 6.15887096324143e-07, + "logits/chosen": -2.467637538909912, + "logits/rejected": -2.5334534645080566, + "logps/chosen": -18.958505630493164, + "logps/rejected": -153.88809204101562, + "loss": 17.3967, + "losses_ref": -9.873149792838376e-06, + "ref_logps/chosen": -97.52275848388672, + "ref_logps/rejected": -86.54790496826172, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 78.56425476074219, + "rewards/margins": 145.90443420410156, + "rewards/rejected": -67.34017944335938, + "step": 3580, + "u": -4.305108547210693, + "weight": 0.05625037103891373 + }, + { + "diff_generated": -66.20716857910156, + "epoch": 1.1633182112767337, + "grad_norm": 449.6767901200747, + "learning_rate": 6.14615887651161e-07, + "logits/chosen": -2.4450440406799316, + "logits/rejected": -2.5749616622924805, + "logps/chosen": -16.25723648071289, + "logps/rejected": -153.3737335205078, + "loss": 17.0221, + "losses_ref": -0.0018811358604580164, + "ref_logps/chosen": -88.58432006835938, + "ref_logps/rejected": -87.16657257080078, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 72.32708740234375, + "rewards/margins": 138.5342559814453, + "rewards/rejected": -66.20716857910156, + "step": 3590, + "u": -4.233583450317383, + "weight": 0.06883620470762253 + }, + { + "diff_generated": -66.19894409179688, + "epoch": 1.1665586519766689, + "grad_norm": 479.7835646590672, + "learning_rate": 6.133416278350756e-07, + "logits/chosen": -2.4543826580047607, + "logits/rejected": -2.5439774990081787, + "logps/chosen": -17.245370864868164, + "logps/rejected": -149.97860717773438, + "loss": 17.5297, + "losses_ref": -0.003648832906037569, + "ref_logps/chosen": -93.31538391113281, + "ref_logps/rejected": -83.7796401977539, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 76.07001495361328, + "rewards/margins": 142.26895141601562, + "rewards/rejected": -66.19894409179688, + "step": 3600, + "u": -4.308576583862305, + "weight": 0.05642462521791458 + }, + { + "diff_generated": -68.85673522949219, + "epoch": 1.169799092676604, + "grad_norm": 493.566813054623, + "learning_rate": 6.120643349917359e-07, + "logits/chosen": -2.470064640045166, + "logits/rejected": -2.525467872619629, + "logps/chosen": -17.012256622314453, + "logps/rejected": -155.98880004882812, + "loss": 17.6349, + "losses_ref": -0.0008439187076874077, + "ref_logps/chosen": -100.3587417602539, + "ref_logps/rejected": -87.13206481933594, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 83.34648132324219, + "rewards/margins": 152.20321655273438, + "rewards/rejected": -68.85673522949219, + "step": 3610, + "u": -4.288359642028809, + "weight": 0.05004129558801651 + }, + { + "diff_generated": -62.7186164855957, + "epoch": 1.173039533376539, + "grad_norm": 480.34216176357796, + "learning_rate": 6.107840272801108e-07, + "logits/chosen": -2.456737756729126, + "logits/rejected": -2.5191168785095215, + "logps/chosen": -18.4807186126709, + "logps/rejected": -145.6920166015625, + "loss": 17.6207, + "losses_ref": -0.005524917971342802, + "ref_logps/chosen": -92.15921783447266, + "ref_logps/rejected": -82.97339630126953, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 73.6784896850586, + "rewards/margins": 136.39712524414062, + "rewards/rejected": -62.7186164855957, + "step": 3620, + "u": -4.388144493103027, + "weight": 0.044021543115377426 + }, + { + "diff_generated": -64.40690612792969, + "epoch": 1.1762799740764744, + "grad_norm": 462.5062963786796, + "learning_rate": 6.095007229020311e-07, + "logits/chosen": -2.4350109100341797, + "logits/rejected": -2.576110363006592, + "logps/chosen": -15.553683280944824, + "logps/rejected": -151.23849487304688, + "loss": 17.6157, + "losses_ref": -0.00013673387002199888, + "ref_logps/chosen": -91.82894134521484, + "ref_logps/rejected": -86.83157348632812, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 76.27525329589844, + "rewards/margins": 140.68215942382812, + "rewards/rejected": -64.40690612792969, + "step": 3630, + "u": -4.306022644042969, + "weight": 0.05625535175204277 + }, + { + "diff_generated": -64.05938720703125, + "epoch": 1.1795204147764096, + "grad_norm": 453.34527622779916, + "learning_rate": 6.082144401019304e-07, + "logits/chosen": -2.467184066772461, + "logits/rejected": -2.519963026046753, + "logps/chosen": -17.702754974365234, + "logps/rejected": -146.12852478027344, + "loss": 17.4214, + "losses_ref": -1.905074532260187e-05, + "ref_logps/chosen": -93.05728912353516, + "ref_logps/rejected": -82.06913757324219, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 75.35453796386719, + "rewards/margins": 139.41392517089844, + "rewards/rejected": -64.05938720703125, + "step": 3640, + "u": -4.3346781730651855, + "weight": 0.050000596791505814 + }, + { + "diff_generated": -64.80006408691406, + "epoch": 1.1827608554763447, + "grad_norm": 497.3921847635673, + "learning_rate": 6.069251971665857e-07, + "logits/chosen": -2.4072885513305664, + "logits/rejected": -2.5418806076049805, + "logps/chosen": -17.536151885986328, + "logps/rejected": -149.25770568847656, + "loss": 17.8686, + "losses_ref": -0.023113643750548363, + "ref_logps/chosen": -90.7900161743164, + "ref_logps/rejected": -84.45764923095703, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 73.25386047363281, + "rewards/margins": 138.05393981933594, + "rewards/rejected": -64.80006408691406, + "step": 3650, + "u": -4.255960464477539, + "weight": 0.06997169554233551 + }, + { + "diff_generated": -70.54817199707031, + "epoch": 1.18600129617628, + "grad_norm": 501.99646571703795, + "learning_rate": 6.056330124248576e-07, + "logits/chosen": -2.420248508453369, + "logits/rejected": -2.5905723571777344, + "logps/chosen": -15.543874740600586, + "logps/rejected": -158.19857788085938, + "loss": 17.1204, + "losses_ref": -9.355277143185958e-06, + "ref_logps/chosen": -91.93193054199219, + "ref_logps/rejected": -87.65039825439453, + "rewards/accuracies": 0.96875, + "rewards/chosen": 76.38804626464844, + "rewards/margins": 146.9362335205078, + "rewards/rejected": -70.54817199707031, + "step": 3660, + "u": -4.446316719055176, + "weight": 0.031250424683094025 + }, + { + "diff_generated": -69.34424591064453, + "epoch": 1.1892417368762151, + "grad_norm": 411.3228445717369, + "learning_rate": 6.043379042474297e-07, + "logits/chosen": -2.437598705291748, + "logits/rejected": -2.5694053173065186, + "logps/chosen": -18.93130874633789, + "logps/rejected": -155.77711486816406, + "loss": 17.2569, + "losses_ref": -0.0005264817154966295, + "ref_logps/chosen": -97.73585510253906, + "ref_logps/rejected": -86.43287658691406, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 78.80455017089844, + "rewards/margins": 148.14878845214844, + "rewards/rejected": -69.34424591064453, + "step": 3670, + "u": -4.360543251037598, + "weight": 0.05002452805638313 + }, + { + "diff_generated": -68.2588882446289, + "epoch": 1.1924821775761503, + "grad_norm": 432.07562523916755, + "learning_rate": 6.030398910465475e-07, + "logits/chosen": -2.4059746265411377, + "logits/rejected": -2.509774684906006, + "logps/chosen": -17.03068733215332, + "logps/rejected": -153.54539489746094, + "loss": 18.2568, + "losses_ref": -0.0018109262455254793, + "ref_logps/chosen": -93.44799041748047, + "ref_logps/rejected": -85.28651428222656, + "rewards/accuracies": 0.9375, + "rewards/chosen": 76.41730499267578, + "rewards/margins": 144.67617797851562, + "rewards/rejected": -68.2588882446289, + "step": 3680, + "u": -4.272830486297607, + "weight": 0.06258610635995865 + }, + { + "diff_generated": -66.36592864990234, + "epoch": 1.1957226182760856, + "grad_norm": 449.0600224169998, + "learning_rate": 6.017389912757561e-07, + "logits/chosen": -2.4729011058807373, + "logits/rejected": -2.5933284759521484, + "logps/chosen": -15.932981491088867, + "logps/rejected": -154.79925537109375, + "loss": 17.0165, + "losses_ref": -0.0008205400081351399, + "ref_logps/chosen": -90.65199279785156, + "ref_logps/rejected": -88.43331146240234, + "rewards/accuracies": 0.9375, + "rewards/chosen": 74.7190170288086, + "rewards/margins": 141.0849609375, + "rewards/rejected": -66.36592864990234, + "step": 3690, + "u": -4.300013542175293, + "weight": 0.06253810971975327 + }, + { + "diff_generated": -62.134925842285156, + "epoch": 1.1989630589760207, + "grad_norm": 487.2705623202895, + "learning_rate": 6.004352234296389e-07, + "logits/chosen": -2.444080114364624, + "logits/rejected": -2.518881320953369, + "logps/chosen": -19.006927490234375, + "logps/rejected": -147.88319396972656, + "loss": 18.3132, + "losses_ref": -0.011056670919060707, + "ref_logps/chosen": -100.0880126953125, + "ref_logps/rejected": -85.74827575683594, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 81.08108520507812, + "rewards/margins": 143.2160186767578, + "rewards/rejected": -62.134925842285156, + "step": 3700, + "u": -4.219624996185303, + "weight": 0.06931637227535248 + }, + { + "diff_generated": -62.08357620239258, + "epoch": 1.2022034996759559, + "grad_norm": 464.6541566589534, + "learning_rate": 5.991286060435536e-07, + "logits/chosen": -2.4166674613952637, + "logits/rejected": -2.476719856262207, + "logps/chosen": -18.68705940246582, + "logps/rejected": -142.98526000976562, + "loss": 18.034, + "losses_ref": -9.735246749187354e-06, + "ref_logps/chosen": -95.77069091796875, + "ref_logps/rejected": -80.90168762207031, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 77.08363342285156, + "rewards/margins": 139.16720581054688, + "rewards/rejected": -62.08357620239258, + "step": 3710, + "u": -4.208899021148682, + "weight": 0.08125011622905731 + }, + { + "diff_generated": -65.4081802368164, + "epoch": 1.2054439403758912, + "grad_norm": 467.91529233329715, + "learning_rate": 5.978191576933692e-07, + "logits/chosen": -2.435298204421997, + "logits/rejected": -2.5294342041015625, + "logps/chosen": -16.750965118408203, + "logps/rejected": -149.4308319091797, + "loss": 17.6858, + "losses_ref": -0.0006217855261638761, + "ref_logps/chosen": -94.02302551269531, + "ref_logps/rejected": -84.02266693115234, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 77.27205657958984, + "rewards/margins": 142.6802215576172, + "rewards/rejected": -65.4081802368164, + "step": 3720, + "u": -4.320083141326904, + "weight": 0.056277960538864136 + }, + { + "diff_generated": -70.64672088623047, + "epoch": 1.2086843810758263, + "grad_norm": 439.429553373978, + "learning_rate": 5.965068969952017e-07, + "logits/chosen": -2.457728862762451, + "logits/rejected": -2.5720419883728027, + "logps/chosen": -17.41990852355957, + "logps/rejected": -163.0725555419922, + "loss": 16.9155, + "losses_ref": -0.00014942415873520076, + "ref_logps/chosen": -95.34916687011719, + "ref_logps/rejected": -92.42584991455078, + "rewards/accuracies": 0.96875, + "rewards/chosen": 77.92925262451172, + "rewards/margins": 148.5759735107422, + "rewards/rejected": -70.64672088623047, + "step": 3730, + "u": -4.437350273132324, + "weight": 0.031256116926670074 + }, + { + "diff_generated": -67.57591247558594, + "epoch": 1.2119248217757614, + "grad_norm": 417.0455210166249, + "learning_rate": 5.951918426051502e-07, + "logits/chosen": -2.49534273147583, + "logits/rejected": -2.5804288387298584, + "logps/chosen": -13.765565872192383, + "logps/rejected": -151.92489624023438, + "loss": 16.8638, + "losses_ref": -0.0001484433450968936, + "ref_logps/chosen": -92.38279724121094, + "ref_logps/rejected": -84.34899139404297, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 78.61722564697266, + "rewards/margins": 146.19314575195312, + "rewards/rejected": -67.57591247558594, + "step": 3740, + "u": -4.319943428039551, + "weight": 0.050006695091724396 + }, + { + "diff_generated": -67.62321472167969, + "epoch": 1.2151652624756968, + "grad_norm": 430.78735226706414, + "learning_rate": 5.938740132190306e-07, + "logits/chosen": -2.4281935691833496, + "logits/rejected": -2.52376127243042, + "logps/chosen": -16.50156021118164, + "logps/rejected": -155.07244873046875, + "loss": 18.3025, + "losses_ref": -2.4736641535128e-07, + "ref_logps/chosen": -95.54866027832031, + "ref_logps/rejected": -87.44921875, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 79.04710388183594, + "rewards/margins": 146.67031860351562, + "rewards/rejected": -67.62321472167969, + "step": 3750, + "u": -4.36559534072876, + "weight": 0.04375000670552254 + }, + { + "diff_generated": -70.21691131591797, + "epoch": 1.218405703175632, + "grad_norm": 444.5620289021675, + "learning_rate": 5.9255342757211e-07, + "logits/chosen": -2.4592199325561523, + "logits/rejected": -2.579988956451416, + "logps/chosen": -17.252155303955078, + "logps/rejected": -153.64694213867188, + "loss": 17.0841, + "losses_ref": -0.0016604771371930838, + "ref_logps/chosen": -95.3075942993164, + "ref_logps/rejected": -83.43003845214844, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 78.05543518066406, + "rewards/margins": 148.27235412597656, + "rewards/rejected": -70.21691131591797, + "step": 3760, + "u": -4.428204536437988, + "weight": 0.02507650852203369 + }, + { + "diff_generated": -62.5455436706543, + "epoch": 1.221646143875567, + "grad_norm": 521.0068796064838, + "learning_rate": 5.91230104438841e-07, + "logits/chosen": -2.474541187286377, + "logits/rejected": -2.4790170192718506, + "logps/chosen": -18.36305809020996, + "logps/rejected": -143.1754608154297, + "loss": 17.3262, + "losses_ref": -0.014308147132396698, + "ref_logps/chosen": -95.47447967529297, + "ref_logps/rejected": -80.62992858886719, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 77.11141967773438, + "rewards/margins": 139.65696716308594, + "rewards/rejected": -62.5455436706543, + "step": 3770, + "u": -4.210149765014648, + "weight": 0.06888743489980698 + }, + { + "diff_generated": -63.37689208984375, + "epoch": 1.2248865845755024, + "grad_norm": 439.35926696012797, + "learning_rate": 5.899040626325945e-07, + "logits/chosen": -2.484140634536743, + "logits/rejected": -2.5540719032287598, + "logps/chosen": -16.743610382080078, + "logps/rejected": -144.51959228515625, + "loss": 17.3574, + "losses_ref": -0.00037624576361849904, + "ref_logps/chosen": -93.28424835205078, + "ref_logps/rejected": -81.1427001953125, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 76.54063415527344, + "rewards/margins": 139.9175262451172, + "rewards/rejected": -63.37689208984375, + "step": 3780, + "u": -4.222517490386963, + "weight": 0.06875824928283691 + }, + { + "diff_generated": -65.10592651367188, + "epoch": 1.2281270252754375, + "grad_norm": 507.5671400011117, + "learning_rate": 5.885753210053917e-07, + "logits/chosen": -2.4867701530456543, + "logits/rejected": -2.567906141281128, + "logps/chosen": -17.901391983032227, + "logps/rejected": -152.5151824951172, + "loss": 17.8247, + "losses_ref": -0.00267465366050601, + "ref_logps/chosen": -95.4359359741211, + "ref_logps/rejected": -87.4092788696289, + "rewards/accuracies": 0.9375, + "rewards/chosen": 77.5345458984375, + "rewards/margins": 142.64047241210938, + "rewards/rejected": -65.10592651367188, + "step": 3790, + "u": -4.243159294128418, + "weight": 0.06263072788715363 + }, + { + "diff_generated": -63.96403884887695, + "epoch": 1.2313674659753726, + "grad_norm": 480.8556932216797, + "learning_rate": 5.872438984476368e-07, + "logits/chosen": -2.471346616744995, + "logits/rejected": -2.505013942718506, + "logps/chosen": -18.858333587646484, + "logps/rejected": -141.34747314453125, + "loss": 17.6185, + "losses_ref": -0.001003618584945798, + "ref_logps/chosen": -95.43574523925781, + "ref_logps/rejected": -77.38343811035156, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 76.57740783691406, + "rewards/margins": 140.5414581298828, + "rewards/rejected": -63.96403884887695, + "step": 3800, + "u": -4.403631210327148, + "weight": 0.04379910230636597 + }, + { + "diff_generated": -68.29475402832031, + "epoch": 1.2346079066753077, + "grad_norm": 463.0034451448924, + "learning_rate": 5.859098138878482e-07, + "logits/chosen": -2.465116024017334, + "logits/rejected": -2.5396251678466797, + "logps/chosen": -19.610532760620117, + "logps/rejected": -157.35377502441406, + "loss": 18.0952, + "losses_ref": -0.000537110201548785, + "ref_logps/chosen": -99.69366455078125, + "ref_logps/rejected": -89.05900573730469, + "rewards/accuracies": 0.96875, + "rewards/chosen": 80.0831298828125, + "rewards/margins": 148.3778839111328, + "rewards/rejected": -68.29475402832031, + "step": 3810, + "u": -4.445765495300293, + "weight": 0.031274110078811646 + }, + { + "diff_generated": -66.76065063476562, + "epoch": 1.237848347375243, + "grad_norm": 485.8388138489112, + "learning_rate": 5.845730862923889e-07, + "logits/chosen": -2.4166407585144043, + "logits/rejected": -2.4928579330444336, + "logps/chosen": -18.420665740966797, + "logps/rejected": -153.03782653808594, + "loss": 17.9809, + "losses_ref": -0.012328693643212318, + "ref_logps/chosen": -94.0389633178711, + "ref_logps/rejected": -86.27717590332031, + "rewards/accuracies": 0.9375, + "rewards/chosen": 75.61830139160156, + "rewards/margins": 142.3789520263672, + "rewards/rejected": -66.76065063476562, + "step": 3820, + "u": -4.259571552276611, + "weight": 0.06311958283185959 + }, + { + "diff_generated": -66.66990661621094, + "epoch": 1.2410887880751782, + "grad_norm": 474.973404226213, + "learning_rate": 5.83233734665198e-07, + "logits/chosen": -2.437342405319214, + "logits/rejected": -2.4982120990753174, + "logps/chosen": -17.382713317871094, + "logps/rejected": -150.70150756835938, + "loss": 17.059, + "losses_ref": -0.00036550246295519173, + "ref_logps/chosen": -97.52508544921875, + "ref_logps/rejected": -84.03162384033203, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 80.14237976074219, + "rewards/margins": 146.81228637695312, + "rewards/rejected": -66.66990661621094, + "step": 3830, + "u": -4.3889641761779785, + "weight": 0.043765999376773834 + }, + { + "diff_generated": -68.80332946777344, + "epoch": 1.2443292287751135, + "grad_norm": 423.7381461477668, + "learning_rate": 5.818917780475196e-07, + "logits/chosen": -2.475179672241211, + "logits/rejected": -2.5922000408172607, + "logps/chosen": -20.405536651611328, + "logps/rejected": -158.35231018066406, + "loss": 17.7137, + "losses_ref": -2.940079468771728e-08, + "ref_logps/chosen": -102.04740905761719, + "ref_logps/rejected": -89.54898834228516, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 81.6418685913086, + "rewards/margins": 150.4451904296875, + "rewards/rejected": -68.80332946777344, + "step": 3840, + "u": -4.486301898956299, + "weight": 0.01875000074505806 + }, + { + "diff_generated": -66.65308380126953, + "epoch": 1.2475696694750487, + "grad_norm": 474.8560758696638, + "learning_rate": 5.805472355176318e-07, + "logits/chosen": -2.4920127391815186, + "logits/rejected": -2.560584306716919, + "logps/chosen": -17.58675765991211, + "logps/rejected": -155.38302612304688, + "loss": 17.4797, + "losses_ref": -8.679247684995062e-07, + "ref_logps/chosen": -97.6273422241211, + "ref_logps/rejected": -88.72994995117188, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 80.04058074951172, + "rewards/margins": 146.6936798095703, + "rewards/rejected": -66.65308380126953, + "step": 3850, + "u": -4.330027103424072, + "weight": 0.05000001937150955 + }, + { + "diff_generated": -62.756385803222656, + "epoch": 1.2508101101749838, + "grad_norm": 430.1279253062814, + "learning_rate": 5.792001261905767e-07, + "logits/chosen": -2.454555034637451, + "logits/rejected": -2.5446088314056396, + "logps/chosen": -17.413721084594727, + "logps/rejected": -142.61538696289062, + "loss": 16.6907, + "losses_ref": -3.525937700032955e-06, + "ref_logps/chosen": -92.18282318115234, + "ref_logps/rejected": -79.85899353027344, + "rewards/accuracies": 0.90625, + "rewards/chosen": 74.76910400390625, + "rewards/margins": 137.52548217773438, + "rewards/rejected": -62.756385803222656, + "step": 3860, + "u": -4.043120384216309, + "weight": 0.09375004470348358 + }, + { + "diff_generated": -64.377685546875, + "epoch": 1.254050550874919, + "grad_norm": 477.44499672360115, + "learning_rate": 5.778504692178876e-07, + "logits/chosen": -2.4434051513671875, + "logits/rejected": -2.6090359687805176, + "logps/chosen": -16.34039306640625, + "logps/rejected": -143.81460571289062, + "loss": 17.0992, + "losses_ref": -5.5604403314646333e-05, + "ref_logps/chosen": -90.14739227294922, + "ref_logps/rejected": -79.43690490722656, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 73.80699157714844, + "rewards/margins": 138.18466186523438, + "rewards/rejected": -64.377685546875, + "step": 3870, + "u": -4.2456865310668945, + "weight": 0.0750010758638382 + }, + { + "diff_generated": -68.07923889160156, + "epoch": 1.2572909915748542, + "grad_norm": 403.44420896919473, + "learning_rate": 5.76498283787317e-07, + "logits/chosen": -2.462009906768799, + "logits/rejected": -2.5193183422088623, + "logps/chosen": -17.027873992919922, + "logps/rejected": -157.0982666015625, + "loss": 17.0411, + "losses_ref": -7.056218720435936e-08, + "ref_logps/chosen": -96.52754211425781, + "ref_logps/rejected": -89.01904296875, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 79.49966430664062, + "rewards/margins": 147.5789031982422, + "rewards/rejected": -68.07923889160156, + "step": 3880, + "u": -4.3342485427856445, + "weight": 0.05000000074505806 + }, + { + "diff_generated": -65.75061798095703, + "epoch": 1.2605314322747894, + "grad_norm": 430.6980995406202, + "learning_rate": 5.751435891225643e-07, + "logits/chosen": -2.420855760574341, + "logits/rejected": -2.534722089767456, + "logps/chosen": -15.174592971801758, + "logps/rejected": -149.7338409423828, + "loss": 16.6038, + "losses_ref": -0.0031283546704798937, + "ref_logps/chosen": -89.49068450927734, + "ref_logps/rejected": -83.98322296142578, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 74.31610107421875, + "rewards/margins": 140.06671142578125, + "rewards/rejected": -65.75061798095703, + "step": 3890, + "u": -4.326550006866455, + "weight": 0.056396253407001495 + }, + { + "diff_generated": -64.78160095214844, + "epoch": 1.2637718729747245, + "grad_norm": 454.27027889030245, + "learning_rate": 5.737864044830015e-07, + "logits/chosen": -2.4483044147491455, + "logits/rejected": -2.5388526916503906, + "logps/chosen": -18.663970947265625, + "logps/rejected": -147.86756896972656, + "loss": 17.4371, + "losses_ref": -0.00033568666549399495, + "ref_logps/chosen": -96.4852294921875, + "ref_logps/rejected": -83.08597564697266, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 77.82125091552734, + "rewards/margins": 142.6028594970703, + "rewards/rejected": -64.78160095214844, + "step": 3900, + "u": -4.257702827453613, + "weight": 0.0750146210193634 + }, + { + "diff_generated": -65.7984390258789, + "epoch": 1.2670123136746598, + "grad_norm": 451.96926077815203, + "learning_rate": 5.724267491634006e-07, + "logits/chosen": -2.4495253562927246, + "logits/rejected": -2.538529396057129, + "logps/chosen": -17.021711349487305, + "logps/rejected": -151.54751586914062, + "loss": 17.0695, + "losses_ref": -0.0006901304004713893, + "ref_logps/chosen": -95.57925415039062, + "ref_logps/rejected": -85.74907684326172, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 78.55754089355469, + "rewards/margins": 144.35597229003906, + "rewards/rejected": -65.7984390258789, + "step": 3910, + "u": -4.390527725219727, + "weight": 0.0437803752720356 + }, + { + "diff_generated": -64.09269714355469, + "epoch": 1.270252754374595, + "grad_norm": 412.65807296826927, + "learning_rate": 5.710646424936581e-07, + "logits/chosen": -2.475703716278076, + "logits/rejected": -2.5469605922698975, + "logps/chosen": -18.97299575805664, + "logps/rejected": -147.84817504882812, + "loss": 17.749, + "losses_ref": -0.0020144921727478504, + "ref_logps/chosen": -98.8651123046875, + "ref_logps/rejected": -83.7554702758789, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 79.89212036132812, + "rewards/margins": 143.98483276367188, + "rewards/rejected": -64.09269714355469, + "step": 3920, + "u": -4.362476825714111, + "weight": 0.04384419694542885 + }, + { + "diff_generated": -66.27864074707031, + "epoch": 1.27349319507453, + "grad_norm": 459.39132209179314, + "learning_rate": 5.697001038385212e-07, + "logits/chosen": -2.4267189502716064, + "logits/rejected": -2.5123836994171143, + "logps/chosen": -18.38796615600586, + "logps/rejected": -155.38739013671875, + "loss": 17.6722, + "losses_ref": -0.0027240305207669735, + "ref_logps/chosen": -98.43446350097656, + "ref_logps/rejected": -89.10877227783203, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 80.04649353027344, + "rewards/margins": 146.32513427734375, + "rewards/rejected": -66.27864074707031, + "step": 3930, + "u": -4.358931064605713, + "weight": 0.04388491064310074 + }, + { + "diff_generated": -66.59132385253906, + "epoch": 1.2767336357744652, + "grad_norm": 477.7976292344514, + "learning_rate": 5.683331525973118e-07, + "logits/chosen": -2.4096784591674805, + "logits/rejected": -2.52586030960083, + "logps/chosen": -18.428979873657227, + "logps/rejected": -151.68357849121094, + "loss": 17.9629, + "losses_ref": -1.724712461736999e-08, + "ref_logps/chosen": -94.28060913085938, + "ref_logps/rejected": -85.09223937988281, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 75.85163116455078, + "rewards/margins": 142.4429473876953, + "rewards/rejected": -66.59132385253906, + "step": 3940, + "u": -4.3122944831848145, + "weight": 0.05624999850988388 + }, + { + "diff_generated": -65.66859436035156, + "epoch": 1.2799740764744005, + "grad_norm": 483.81987528369837, + "learning_rate": 5.66963808203651e-07, + "logits/chosen": -2.4494543075561523, + "logits/rejected": -2.5477046966552734, + "logps/chosen": -18.167545318603516, + "logps/rejected": -149.3108673095703, + "loss": 17.3193, + "losses_ref": -0.00034220569068565965, + "ref_logps/chosen": -96.05061340332031, + "ref_logps/rejected": -83.64225006103516, + "rewards/accuracies": 0.96875, + "rewards/chosen": 77.88307189941406, + "rewards/margins": 143.55166625976562, + "rewards/rejected": -65.66859436035156, + "step": 3950, + "u": -4.404166221618652, + "weight": 0.03126030042767525 + }, + { + "diff_generated": -65.86676788330078, + "epoch": 1.2832145171743357, + "grad_norm": 464.5151411476406, + "learning_rate": 5.65592090125183e-07, + "logits/chosen": -2.4210243225097656, + "logits/rejected": -2.5444581508636475, + "logps/chosen": -13.709085464477539, + "logps/rejected": -151.24037170410156, + "loss": 16.9163, + "losses_ref": -1.5870946299401112e-05, + "ref_logps/chosen": -90.91244506835938, + "ref_logps/rejected": -85.37360382080078, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 77.20335388183594, + "rewards/margins": 143.07009887695312, + "rewards/rejected": -65.86676788330078, + "step": 3960, + "u": -4.187623977661133, + "weight": 0.07500036060810089 + }, + { + "diff_generated": -67.22178649902344, + "epoch": 1.286454957874271, + "grad_norm": 458.0925132442424, + "learning_rate": 5.642180178632977e-07, + "logits/chosen": -2.4346470832824707, + "logits/rejected": -2.551971912384033, + "logps/chosen": -16.795867919921875, + "logps/rejected": -153.08494567871094, + "loss": 17.6376, + "losses_ref": -0.0018662631046026945, + "ref_logps/chosen": -93.50836181640625, + "ref_logps/rejected": -85.8631591796875, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 76.7125015258789, + "rewards/margins": 143.93429565429688, + "rewards/rejected": -67.22178649902344, + "step": 3970, + "u": -4.368497848510742, + "weight": 0.04383764788508415 + }, + { + "diff_generated": -69.42669677734375, + "epoch": 1.2896953985742061, + "grad_norm": 451.47536908412064, + "learning_rate": 5.628416109528542e-07, + "logits/chosen": -2.418494939804077, + "logits/rejected": -2.5364253520965576, + "logps/chosen": -16.748294830322266, + "logps/rejected": -157.46620178222656, + "loss": 18.0641, + "losses_ref": -0.0028826945926994085, + "ref_logps/chosen": -93.15105438232422, + "ref_logps/rejected": -88.03948974609375, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 76.40276336669922, + "rewards/margins": 145.8294677734375, + "rewards/rejected": -69.42669677734375, + "step": 3980, + "u": -4.361595153808594, + "weight": 0.05014491081237793 + }, + { + "diff_generated": -64.70690155029297, + "epoch": 1.2929358392741412, + "grad_norm": 454.28630354692615, + "learning_rate": 5.614628889619029e-07, + "logits/chosen": -2.3802146911621094, + "logits/rejected": -2.545664072036743, + "logps/chosen": -16.525279998779297, + "logps/rejected": -149.8128204345703, + "loss": 17.8829, + "losses_ref": -0.005721858702600002, + "ref_logps/chosen": -89.77972412109375, + "ref_logps/rejected": -85.10591125488281, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 73.25444793701172, + "rewards/margins": 137.9613494873047, + "rewards/rejected": -64.70690155029297, + "step": 3990, + "u": -4.33136510848999, + "weight": 0.05031445622444153 + }, + { + "diff_generated": -64.35713195800781, + "epoch": 1.2961762799740764, + "grad_norm": 468.82409945638295, + "learning_rate": 5.600818714914065e-07, + "logits/chosen": -2.444451093673706, + "logits/rejected": -2.533601760864258, + "logps/chosen": -19.608978271484375, + "logps/rejected": -150.1212158203125, + "loss": 17.86, + "losses_ref": -0.0031665258575230837, + "ref_logps/chosen": -97.11079406738281, + "ref_logps/rejected": -85.76406860351562, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 77.5018081665039, + "rewards/margins": 141.8589324951172, + "rewards/rejected": -64.35713195800781, + "step": 4000, + "u": -4.308414936065674, + "weight": 0.056388258934020996 + }, + { + "diff_generated": -69.15150451660156, + "epoch": 1.2994167206740117, + "grad_norm": 478.1275719327832, + "learning_rate": 5.586985781749625e-07, + "logits/chosen": -2.475505828857422, + "logits/rejected": -2.670222043991089, + "logps/chosen": -17.083032608032227, + "logps/rejected": -163.42282104492188, + "loss": 17.2895, + "losses_ref": -0.0024460928980261087, + "ref_logps/chosen": -93.89830780029297, + "ref_logps/rejected": -94.27131652832031, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 76.81526184082031, + "rewards/margins": 145.96676635742188, + "rewards/rejected": -69.15150451660156, + "step": 4010, + "u": -4.460943698883057, + "weight": 0.025116896256804466 + }, + { + "diff_generated": -65.5688247680664, + "epoch": 1.3026571613739468, + "grad_norm": 438.468504539203, + "learning_rate": 5.573130286785237e-07, + "logits/chosen": -2.5178561210632324, + "logits/rejected": -2.5199027061462402, + "logps/chosen": -17.68651008605957, + "logps/rejected": -150.85919189453125, + "loss": 17.2796, + "losses_ref": -0.0003696681815199554, + "ref_logps/chosen": -98.16356658935547, + "ref_logps/rejected": -85.29036712646484, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 80.47705078125, + "rewards/margins": 146.04586791992188, + "rewards/rejected": -65.5688247680664, + "step": 4020, + "u": -4.210076332092285, + "weight": 0.07501642405986786 + }, + { + "diff_generated": -65.79573059082031, + "epoch": 1.3058976020738822, + "grad_norm": 477.750017756696, + "learning_rate": 5.559252427001178e-07, + "logits/chosen": -2.386543035507202, + "logits/rejected": -2.5044496059417725, + "logps/chosen": -18.277788162231445, + "logps/rejected": -144.05825805664062, + "loss": 17.1326, + "losses_ref": -0.00564918015152216, + "ref_logps/chosen": -93.29234313964844, + "ref_logps/rejected": -78.26251983642578, + "rewards/accuracies": 0.9375, + "rewards/chosen": 75.01456451416016, + "rewards/margins": 140.81028747558594, + "rewards/rejected": -65.79573059082031, + "step": 4030, + "u": -4.24915885925293, + "weight": 0.06275015324354172 + }, + { + "diff_generated": -67.20874786376953, + "epoch": 1.3091380427738173, + "grad_norm": 464.78850180166785, + "learning_rate": 5.545352399695687e-07, + "logits/chosen": -2.4252359867095947, + "logits/rejected": -2.564512252807617, + "logps/chosen": -17.03969955444336, + "logps/rejected": -149.99732971191406, + "loss": 17.5304, + "losses_ref": -0.006088468246161938, + "ref_logps/chosen": -92.34000396728516, + "ref_logps/rejected": -82.78856658935547, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 75.30028533935547, + "rewards/margins": 142.509033203125, + "rewards/rejected": -67.20874786376953, + "step": 4040, + "u": -4.393402099609375, + "weight": 0.037814319133758545 + }, + { + "diff_generated": -64.21138000488281, + "epoch": 1.3123784834737524, + "grad_norm": 460.87277273602257, + "learning_rate": 5.531430402482153e-07, + "logits/chosen": -2.4368481636047363, + "logits/rejected": -2.5155985355377197, + "logps/chosen": -17.370777130126953, + "logps/rejected": -152.25576782226562, + "loss": 17.227, + "losses_ref": -0.002279623644426465, + "ref_logps/chosen": -94.12760162353516, + "ref_logps/rejected": -88.04439544677734, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 76.75682067871094, + "rewards/margins": 140.9682159423828, + "rewards/rejected": -64.21138000488281, + "step": 4050, + "u": -4.247066497802734, + "weight": 0.06885669380426407 + }, + { + "diff_generated": -68.6576919555664, + "epoch": 1.3156189241736875, + "grad_norm": 416.17907967388055, + "learning_rate": 5.517486633286299e-07, + "logits/chosen": -2.417910099029541, + "logits/rejected": -2.5245842933654785, + "logps/chosen": -17.445659637451172, + "logps/rejected": -155.00851440429688, + "loss": 17.7637, + "losses_ref": -4.589330160342797e-08, + "ref_logps/chosen": -92.51380920410156, + "ref_logps/rejected": -86.35084533691406, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 75.06814575195312, + "rewards/margins": 143.725830078125, + "rewards/rejected": -68.6576919555664, + "step": 4060, + "u": -4.233851432800293, + "weight": 0.06875000149011612 + }, + { + "diff_generated": -67.85105895996094, + "epoch": 1.3188593648736229, + "grad_norm": 447.42658631506015, + "learning_rate": 5.503521290343384e-07, + "logits/chosen": -2.4626007080078125, + "logits/rejected": -2.5869998931884766, + "logps/chosen": -18.858882904052734, + "logps/rejected": -162.36654663085938, + "loss": 17.8787, + "losses_ref": -8.136340511555318e-06, + "ref_logps/chosen": -96.63755798339844, + "ref_logps/rejected": -94.51548767089844, + "rewards/accuracies": 0.96875, + "rewards/chosen": 77.77867126464844, + "rewards/margins": 145.62974548339844, + "rewards/rejected": -67.85105895996094, + "step": 4070, + "u": -4.423346519470215, + "weight": 0.03125032037496567 + }, + { + "diff_generated": -65.6728286743164, + "epoch": 1.322099805573558, + "grad_norm": 497.9260171796494, + "learning_rate": 5.489534572195373e-07, + "logits/chosen": -2.3953137397766113, + "logits/rejected": -2.590769052505493, + "logps/chosen": -15.14660930633545, + "logps/rejected": -146.5998077392578, + "loss": 17.2821, + "losses_ref": -0.00026578555116429925, + "ref_logps/chosen": -85.99039459228516, + "ref_logps/rejected": -80.92698669433594, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 70.84378051757812, + "rewards/margins": 136.5166015625, + "rewards/rejected": -65.6728286743164, + "step": 4080, + "u": -4.190157890319824, + "weight": 0.08126143366098404 + }, + { + "diff_generated": -71.92523193359375, + "epoch": 1.3253402462734931, + "grad_norm": 485.0867769878626, + "learning_rate": 5.47552667768811e-07, + "logits/chosen": -2.394679546356201, + "logits/rejected": -2.573765277862549, + "logps/chosen": -14.7946195602417, + "logps/rejected": -162.44032287597656, + "loss": 17.2681, + "losses_ref": -1.239477427361635e-07, + "ref_logps/chosen": -90.20631408691406, + "ref_logps/rejected": -90.51509094238281, + "rewards/accuracies": 0.96875, + "rewards/chosen": 75.41169738769531, + "rewards/margins": 147.3369140625, + "rewards/rejected": -71.92523193359375, + "step": 4090, + "u": -4.3911848068237305, + "weight": 0.0312500037252903 + }, + { + "diff_generated": -70.5416030883789, + "epoch": 1.3285806869734285, + "grad_norm": 424.5869132146945, + "learning_rate": 5.46149780596851e-07, + "logits/chosen": -2.434891939163208, + "logits/rejected": -2.5823049545288086, + "logps/chosen": -16.643686294555664, + "logps/rejected": -159.75894165039062, + "loss": 17.6156, + "losses_ref": -0.004576454870402813, + "ref_logps/chosen": -96.3367691040039, + "ref_logps/rejected": -89.21734619140625, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 79.6930923461914, + "rewards/margins": 150.2346649169922, + "rewards/rejected": -70.5416030883789, + "step": 4100, + "u": -4.441787242889404, + "weight": 0.02523742988705635 + }, + { + "diff_generated": -67.7085189819336, + "epoch": 1.3318211276733636, + "grad_norm": 508.6914339973369, + "learning_rate": 5.447448156481708e-07, + "logits/chosen": -2.4687657356262207, + "logits/rejected": -2.5582194328308105, + "logps/chosen": -15.509539604187012, + "logps/rejected": -157.65174865722656, + "loss": 17.1468, + "losses_ref": -0.01627708598971367, + "ref_logps/chosen": -95.32611083984375, + "ref_logps/rejected": -89.94322204589844, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 79.81657409667969, + "rewards/margins": 147.5251007080078, + "rewards/rejected": -67.7085189819336, + "step": 4110, + "u": -4.349822044372559, + "weight": 0.04460912570357323 + }, + { + "diff_generated": -68.07727813720703, + "epoch": 1.3350615683732987, + "grad_norm": 469.9815058204587, + "learning_rate": 5.433377928968234e-07, + "logits/chosen": -2.470069408416748, + "logits/rejected": -2.5536696910858154, + "logps/chosen": -18.169631958007812, + "logps/rejected": -154.72691345214844, + "loss": 17.7251, + "losses_ref": -0.0014369834680110216, + "ref_logps/chosen": -98.80369567871094, + "ref_logps/rejected": -86.6496353149414, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 80.63407135009766, + "rewards/margins": 148.71133422851562, + "rewards/rejected": -68.07727813720703, + "step": 4120, + "u": -4.4524922370910645, + "weight": 0.025073552504181862 + }, + { + "diff_generated": -65.11859893798828, + "epoch": 1.3383020090732338, + "grad_norm": 424.64693099799103, + "learning_rate": 5.41928732346117e-07, + "logits/chosen": -2.4283761978149414, + "logits/rejected": -2.5155179500579834, + "logps/chosen": -18.113378524780273, + "logps/rejected": -148.42088317871094, + "loss": 17.4648, + "losses_ref": -0.004977349191904068, + "ref_logps/chosen": -95.88996887207031, + "ref_logps/rejected": -83.30229187011719, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 77.7765884399414, + "rewards/margins": 142.8951873779297, + "rewards/rejected": -65.11859893798828, + "step": 4130, + "u": -4.399342060089111, + "weight": 0.037747155874967575 + }, + { + "diff_generated": -66.83191680908203, + "epoch": 1.3415424497731692, + "grad_norm": 465.3070412715531, + "learning_rate": 5.405176540283311e-07, + "logits/chosen": -2.426403522491455, + "logits/rejected": -2.517180919647217, + "logps/chosen": -17.60582733154297, + "logps/rejected": -153.71560668945312, + "loss": 17.6287, + "losses_ref": -0.0006925761117599905, + "ref_logps/chosen": -96.36148071289062, + "ref_logps/rejected": -86.88368225097656, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 78.75565338134766, + "rewards/margins": 145.5875701904297, + "rewards/rejected": -66.83191680908203, + "step": 4140, + "u": -4.210391998291016, + "weight": 0.0750294178724289 + }, + { + "diff_generated": -70.90372467041016, + "epoch": 1.3447828904731043, + "grad_norm": 439.07340709547856, + "learning_rate": 5.391045780044308e-07, + "logits/chosen": -2.4661173820495605, + "logits/rejected": -2.607022523880005, + "logps/chosen": -18.112369537353516, + "logps/rejected": -164.56256103515625, + "loss": 17.2512, + "losses_ref": -2.79961994920086e-07, + "ref_logps/chosen": -96.12417602539062, + "ref_logps/rejected": -93.6588363647461, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 78.01180267333984, + "rewards/margins": 148.91552734375, + "rewards/rejected": -70.90372467041016, + "step": 4150, + "u": -4.338640213012695, + "weight": 0.043750010430812836 + }, + { + "diff_generated": -63.74640655517578, + "epoch": 1.3480233311730396, + "grad_norm": 488.68903225387913, + "learning_rate": 5.376895243637823e-07, + "logits/chosen": -2.431675672531128, + "logits/rejected": -2.4988338947296143, + "logps/chosen": -18.850200653076172, + "logps/rejected": -146.23924255371094, + "loss": 17.7837, + "losses_ref": -0.005620983429253101, + "ref_logps/chosen": -94.14060974121094, + "ref_logps/rejected": -82.49284362792969, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 75.29041290283203, + "rewards/margins": 139.0368194580078, + "rewards/rejected": -63.74640655517578, + "step": 4160, + "u": -4.236712455749512, + "weight": 0.07528746873140335 + }, + { + "diff_generated": -71.92745208740234, + "epoch": 1.3512637718729748, + "grad_norm": 476.3299931949927, + "learning_rate": 5.362725132238672e-07, + "logits/chosen": -2.426936149597168, + "logits/rejected": -2.638857364654541, + "logps/chosen": -16.467594146728516, + "logps/rejected": -165.2958221435547, + "loss": 17.3957, + "losses_ref": -0.004750962369143963, + "ref_logps/chosen": -91.93846130371094, + "ref_logps/rejected": -93.3683853149414, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 75.47087097167969, + "rewards/margins": 147.3983154296875, + "rewards/rejected": -71.92745208740234, + "step": 4170, + "u": -4.38946008682251, + "weight": 0.03774666786193848 + }, + { + "diff_generated": -68.27117919921875, + "epoch": 1.3545042125729099, + "grad_norm": 467.16826897509446, + "learning_rate": 5.348535647299964e-07, + "logits/chosen": -2.4064178466796875, + "logits/rejected": -2.5552902221679688, + "logps/chosen": -17.2877140045166, + "logps/rejected": -161.4751434326172, + "loss": 17.1639, + "losses_ref": -1.1598888249864103e-07, + "ref_logps/chosen": -94.16259765625, + "ref_logps/rejected": -93.2039566040039, + "rewards/accuracies": 0.96875, + "rewards/chosen": 76.87488555908203, + "rewards/margins": 145.1460723876953, + "rewards/rejected": -68.27117919921875, + "step": 4180, + "u": -4.43471097946167, + "weight": 0.03125 + }, + { + "diff_generated": -67.98145294189453, + "epoch": 1.357744653272845, + "grad_norm": 439.1505245186258, + "learning_rate": 5.334326990550234e-07, + "logits/chosen": -2.4439923763275146, + "logits/rejected": -2.5619823932647705, + "logps/chosen": -16.554126739501953, + "logps/rejected": -155.3748016357422, + "loss": 17.0193, + "losses_ref": -7.341781838476891e-06, + "ref_logps/chosen": -95.81574249267578, + "ref_logps/rejected": -87.39334869384766, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 79.26161193847656, + "rewards/margins": 147.24307250976562, + "rewards/rejected": -67.98145294189453, + "step": 4190, + "u": -4.322827339172363, + "weight": 0.05000028759241104 + }, + { + "diff_generated": -69.60054016113281, + "epoch": 1.3609850939727803, + "grad_norm": 450.7519374284902, + "learning_rate": 5.320099363990584e-07, + "logits/chosen": -2.4455199241638184, + "logits/rejected": -2.4962477684020996, + "logps/chosen": -17.381702423095703, + "logps/rejected": -153.2888641357422, + "loss": 16.4503, + "losses_ref": -0.0011503873392939568, + "ref_logps/chosen": -98.63499450683594, + "ref_logps/rejected": -83.68831634521484, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 81.25328063964844, + "rewards/margins": 150.8538360595703, + "rewards/rejected": -69.60054016113281, + "step": 4200, + "u": -4.400361061096191, + "weight": 0.037554167211055756 + }, + { + "diff_generated": -67.48835754394531, + "epoch": 1.3642255346727155, + "grad_norm": 421.7107776772206, + "learning_rate": 5.305852969891799e-07, + "logits/chosen": -2.4636857509613037, + "logits/rejected": -2.502215623855591, + "logps/chosen": -17.966630935668945, + "logps/rejected": -146.44480895996094, + "loss": 16.9674, + "losses_ref": -2.9401869383605117e-08, + "ref_logps/chosen": -96.73162841796875, + "ref_logps/rejected": -78.95645141601562, + "rewards/accuracies": 0.9375, + "rewards/chosen": 78.76499938964844, + "rewards/margins": 146.2533721923828, + "rewards/rejected": -67.48835754394531, + "step": 4210, + "u": -4.2420148849487305, + "weight": 0.0625 + }, + { + "diff_generated": -63.33342742919922, + "epoch": 1.3674659753726508, + "grad_norm": 419.00702060951977, + "learning_rate": 5.29158801079148e-07, + "logits/chosen": -2.385185480117798, + "logits/rejected": -2.465200901031494, + "logps/chosen": -17.246349334716797, + "logps/rejected": -141.04661560058594, + "loss": 17.4708, + "losses_ref": -0.0014706698711961508, + "ref_logps/chosen": -95.12071228027344, + "ref_logps/rejected": -77.71318054199219, + "rewards/accuracies": 0.90625, + "rewards/chosen": 77.87437438964844, + "rewards/margins": 141.2078094482422, + "rewards/rejected": -63.33342742919922, + "step": 4220, + "u": -4.143843650817871, + "weight": 0.09382256120443344 + }, + { + "diff_generated": -66.72733306884766, + "epoch": 1.370706416072586, + "grad_norm": 448.4120302430566, + "learning_rate": 5.277304689491165e-07, + "logits/chosen": -2.4468398094177246, + "logits/rejected": -2.5708279609680176, + "logps/chosen": -17.90654754638672, + "logps/rejected": -146.7433624267578, + "loss": 17.4143, + "losses_ref": -0.0013314300449565053, + "ref_logps/chosen": -93.74774932861328, + "ref_logps/rejected": -80.01602172851562, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 75.84119415283203, + "rewards/margins": 142.5685272216797, + "rewards/rejected": -66.72733306884766, + "step": 4230, + "u": -4.346431732177734, + "weight": 0.03756508231163025 + }, + { + "diff_generated": -67.25651550292969, + "epoch": 1.373946856772521, + "grad_norm": 470.0274356799925, + "learning_rate": 5.26300320905344e-07, + "logits/chosen": -2.4239392280578613, + "logits/rejected": -2.5337131023406982, + "logps/chosen": -16.854785919189453, + "logps/rejected": -150.7886505126953, + "loss": 17.592, + "losses_ref": -2.157007656933274e-06, + "ref_logps/chosen": -96.3592300415039, + "ref_logps/rejected": -83.5321273803711, + "rewards/accuracies": 0.96875, + "rewards/chosen": 79.50444030761719, + "rewards/margins": 146.76095581054688, + "rewards/rejected": -67.25651550292969, + "step": 4240, + "u": -4.427393913269043, + "weight": 0.03125005215406418 + }, + { + "diff_generated": -68.52471923828125, + "epoch": 1.3771872974724562, + "grad_norm": 466.15491091372587, + "learning_rate": 5.248683772799054e-07, + "logits/chosen": -2.4210152626037598, + "logits/rejected": -2.4874701499938965, + "logps/chosen": -19.08351707458496, + "logps/rejected": -152.42335510253906, + "loss": 17.0809, + "losses_ref": -0.0020189809147268534, + "ref_logps/chosen": -100.47675323486328, + "ref_logps/rejected": -83.89862060546875, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 81.39323425292969, + "rewards/margins": 149.91795349121094, + "rewards/rejected": -68.52471923828125, + "step": 4250, + "u": -4.339241027832031, + "weight": 0.04385364428162575 + }, + { + "diff_generated": -68.41958618164062, + "epoch": 1.3804277381723915, + "grad_norm": 443.81081877289006, + "learning_rate": 5.234346584304033e-07, + "logits/chosen": -2.414257049560547, + "logits/rejected": -2.509781837463379, + "logps/chosen": -16.89341163635254, + "logps/rejected": -152.24014282226562, + "loss": 17.5102, + "losses_ref": -0.001452519092708826, + "ref_logps/chosen": -92.01890563964844, + "ref_logps/rejected": -83.82056427001953, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 75.12548828125, + "rewards/margins": 143.54507446289062, + "rewards/rejected": -68.41958618164062, + "step": 4260, + "u": -4.205715179443359, + "weight": 0.0688219666481018 + }, + { + "diff_generated": -69.97354125976562, + "epoch": 1.3836681788723266, + "grad_norm": 442.71038008181597, + "learning_rate": 5.21999184739678e-07, + "logits/chosen": -2.4220805168151855, + "logits/rejected": -2.494659662246704, + "logps/chosen": -19.814462661743164, + "logps/rejected": -158.04541015625, + "loss": 17.7849, + "losses_ref": -2.526703042349254e-07, + "ref_logps/chosen": -98.62994384765625, + "ref_logps/rejected": -88.07185363769531, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 78.81547546386719, + "rewards/margins": 148.7890167236328, + "rewards/rejected": -69.97354125976562, + "step": 4270, + "u": -4.459265232086182, + "weight": 0.025000005960464478 + }, + { + "diff_generated": -67.09244537353516, + "epoch": 1.3869086195722617, + "grad_norm": 506.04582752692124, + "learning_rate": 5.205619766155182e-07, + "logits/chosen": -2.443906307220459, + "logits/rejected": -2.5445313453674316, + "logps/chosen": -17.672672271728516, + "logps/rejected": -150.82272338867188, + "loss": 17.4618, + "losses_ref": -0.008290953002870083, + "ref_logps/chosen": -93.44664001464844, + "ref_logps/rejected": -83.73028564453125, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 75.77397155761719, + "rewards/margins": 142.8664093017578, + "rewards/rejected": -67.09244537353516, + "step": 4280, + "u": -4.226373672485352, + "weight": 0.06915529072284698 + }, + { + "diff_generated": -67.34722900390625, + "epoch": 1.390149060272197, + "grad_norm": 445.24258122184585, + "learning_rate": 5.191230544903702e-07, + "logits/chosen": -2.4065587520599365, + "logits/rejected": -2.4920034408569336, + "logps/chosen": -15.778546333312988, + "logps/rejected": -151.96900939941406, + "loss": 17.0353, + "losses_ref": -0.0008197773131541908, + "ref_logps/chosen": -88.67896270751953, + "ref_logps/rejected": -84.62178039550781, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 72.90040588378906, + "rewards/margins": 140.24761962890625, + "rewards/rejected": -67.34722900390625, + "step": 4290, + "u": -4.341623306274414, + "weight": 0.05003521963953972 + }, + { + "diff_generated": -66.68577575683594, + "epoch": 1.3933895009721322, + "grad_norm": 511.6361436706194, + "learning_rate": 5.176824388210483e-07, + "logits/chosen": -2.4076170921325684, + "logits/rejected": -2.5193543434143066, + "logps/chosen": -17.607181549072266, + "logps/rejected": -152.07278442382812, + "loss": 17.4805, + "losses_ref": -0.00011498709500301629, + "ref_logps/chosen": -92.15415954589844, + "ref_logps/rejected": -85.38701629638672, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 74.54698181152344, + "rewards/margins": 141.23277282714844, + "rewards/rejected": -66.68577575683594, + "step": 4300, + "u": -4.220432281494141, + "weight": 0.0750034898519516 + }, + { + "diff_generated": -66.54644775390625, + "epoch": 1.3966299416720673, + "grad_norm": 468.88529082210414, + "learning_rate": 5.162401500884432e-07, + "logits/chosen": -2.427182674407959, + "logits/rejected": -2.504462957382202, + "logps/chosen": -17.371049880981445, + "logps/rejected": -148.7251739501953, + "loss": 17.4356, + "losses_ref": -0.015387284569442272, + "ref_logps/chosen": -95.58000946044922, + "ref_logps/rejected": -82.17872619628906, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 78.2089614868164, + "rewards/margins": 144.75540161132812, + "rewards/rejected": -66.54644775390625, + "step": 4310, + "u": -4.298781394958496, + "weight": 0.057079873979091644 + }, + { + "diff_generated": -66.85215759277344, + "epoch": 1.3998703823720027, + "grad_norm": 469.3847642908731, + "learning_rate": 5.147962087972314e-07, + "logits/chosen": -2.413745403289795, + "logits/rejected": -2.462054491043091, + "logps/chosen": -18.344039916992188, + "logps/rejected": -151.5030975341797, + "loss": 17.8151, + "losses_ref": -9.710121048556175e-06, + "ref_logps/chosen": -97.64744567871094, + "ref_logps/rejected": -84.65092468261719, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 79.30342102050781, + "rewards/margins": 146.1555633544922, + "rewards/rejected": -66.85215759277344, + "step": 4320, + "u": -4.29888916015625, + "weight": 0.05625038221478462 + }, + { + "diff_generated": -70.12738037109375, + "epoch": 1.4031108230719378, + "grad_norm": 468.64992596686636, + "learning_rate": 5.133506354755833e-07, + "logits/chosen": -2.435763359069824, + "logits/rejected": -2.5422844886779785, + "logps/chosen": -15.104168891906738, + "logps/rejected": -155.98138427734375, + "loss": 16.8697, + "losses_ref": -0.0039013822097331285, + "ref_logps/chosen": -91.3575439453125, + "ref_logps/rejected": -85.85398864746094, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 76.25337982177734, + "rewards/margins": 146.38075256347656, + "rewards/rejected": -70.12738037109375, + "step": 4330, + "u": -4.263181686401367, + "weight": 0.05643879249691963 + }, + { + "diff_generated": -67.4532470703125, + "epoch": 1.406351263771873, + "grad_norm": 457.9270962811255, + "learning_rate": 5.119034506748713e-07, + "logits/chosen": -2.3538706302642822, + "logits/rejected": -2.440309762954712, + "logps/chosen": -16.107725143432617, + "logps/rejected": -151.02162170410156, + "loss": 17.015, + "losses_ref": -0.0006191584980115294, + "ref_logps/chosen": -91.54263305664062, + "ref_logps/rejected": -83.56836700439453, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 75.4349136352539, + "rewards/margins": 142.88815307617188, + "rewards/rejected": -67.4532470703125, + "step": 4340, + "u": -4.243154048919678, + "weight": 0.06877802312374115 + }, + { + "diff_generated": -66.83910369873047, + "epoch": 1.4095917044718083, + "grad_norm": 464.18351139838603, + "learning_rate": 5.104546749693781e-07, + "logits/chosen": -2.4061524868011475, + "logits/rejected": -2.5223965644836426, + "logps/chosen": -18.637516021728516, + "logps/rejected": -149.72984313964844, + "loss": 17.3378, + "losses_ref": -0.001131789991632104, + "ref_logps/chosen": -96.01673889160156, + "ref_logps/rejected": -82.8907470703125, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 77.37923431396484, + "rewards/margins": 144.2183380126953, + "rewards/rejected": -66.83910369873047, + "step": 4350, + "u": -4.342132568359375, + "weight": 0.04380253329873085 + }, + { + "diff_generated": -68.60645294189453, + "epoch": 1.4128321451717434, + "grad_norm": 457.0813142753334, + "learning_rate": 5.09004328956004e-07, + "logits/chosen": -2.4368767738342285, + "logits/rejected": -2.5215744972229004, + "logps/chosen": -17.591197967529297, + "logps/rejected": -153.7823028564453, + "loss": 17.2265, + "losses_ref": -1.2529037576314295e-06, + "ref_logps/chosen": -95.30029296875, + "ref_logps/rejected": -85.17585754394531, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 77.70909118652344, + "rewards/margins": 146.3155517578125, + "rewards/rejected": -68.60645294189453, + "step": 4360, + "u": -4.3323163986206055, + "weight": 0.05625004693865776 + }, + { + "diff_generated": -66.69379425048828, + "epoch": 1.4160725858716785, + "grad_norm": 411.08045984959665, + "learning_rate": 5.075524332539736e-07, + "logits/chosen": -2.398942470550537, + "logits/rejected": -2.455166816711426, + "logps/chosen": -16.874691009521484, + "logps/rejected": -152.13819885253906, + "loss": 17.3265, + "losses_ref": -1.1959253242821433e-05, + "ref_logps/chosen": -95.65223693847656, + "ref_logps/rejected": -85.44440460205078, + "rewards/accuracies": 0.96875, + "rewards/chosen": 78.77754211425781, + "rewards/margins": 145.47132873535156, + "rewards/rejected": -66.69379425048828, + "step": 4370, + "u": -4.406135559082031, + "weight": 0.03125051409006119 + }, + { + "diff_generated": -69.11666107177734, + "epoch": 1.4193130265716136, + "grad_norm": 461.40170637984556, + "learning_rate": 5.060990085045432e-07, + "logits/chosen": -2.4089365005493164, + "logits/rejected": -2.5167927742004395, + "logps/chosen": -17.30145263671875, + "logps/rejected": -155.79261779785156, + "loss": 17.6613, + "losses_ref": -1.8280853453234158e-07, + "ref_logps/chosen": -93.44625091552734, + "ref_logps/rejected": -86.67595672607422, + "rewards/accuracies": 0.96875, + "rewards/chosen": 76.1447982788086, + "rewards/margins": 145.26145935058594, + "rewards/rejected": -69.11666107177734, + "step": 4380, + "u": -4.437478542327881, + "weight": 0.0312500037252903 + }, + { + "diff_generated": -68.13085174560547, + "epoch": 1.422553467271549, + "grad_norm": 466.4529799309482, + "learning_rate": 5.046440753707077e-07, + "logits/chosen": -2.481142520904541, + "logits/rejected": -2.5197412967681885, + "logps/chosen": -15.788442611694336, + "logps/rejected": -153.73379516601562, + "loss": 17.2616, + "losses_ref": -0.0003627826808951795, + "ref_logps/chosen": -96.31988525390625, + "ref_logps/rejected": -85.60295104980469, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 80.53144836425781, + "rewards/margins": 148.6623077392578, + "rewards/rejected": -68.13085174560547, + "step": 4390, + "u": -4.33340311050415, + "weight": 0.05626552179455757 + }, + { + "diff_generated": -66.99527740478516, + "epoch": 1.425793907971484, + "grad_norm": 473.0960654748571, + "learning_rate": 5.031876545369054e-07, + "logits/chosen": -2.461520195007324, + "logits/rejected": -2.5205090045928955, + "logps/chosen": -17.834148406982422, + "logps/rejected": -149.48910522460938, + "loss": 17.8125, + "losses_ref": -0.00037082930793985724, + "ref_logps/chosen": -95.21051025390625, + "ref_logps/rejected": -82.49385070800781, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 77.37635803222656, + "rewards/margins": 144.37164306640625, + "rewards/rejected": -66.99527740478516, + "step": 4400, + "u": -4.373237133026123, + "weight": 0.05001600459218025 + }, + { + "diff_generated": -69.36138153076172, + "epoch": 1.4290343486714194, + "grad_norm": 456.7809003773492, + "learning_rate": 5.017297667087257e-07, + "logits/chosen": -2.4498043060302734, + "logits/rejected": -2.535263776779175, + "logps/chosen": -18.219280242919922, + "logps/rejected": -152.06491088867188, + "loss": 17.463, + "losses_ref": -0.0010456106392666698, + "ref_logps/chosen": -96.76631927490234, + "ref_logps/rejected": -82.70353698730469, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 78.54704284667969, + "rewards/margins": 147.90841674804688, + "rewards/rejected": -69.36138153076172, + "step": 4410, + "u": -4.371595859527588, + "weight": 0.037546005100011826 + }, + { + "diff_generated": -69.4170150756836, + "epoch": 1.4322747893713546, + "grad_norm": 494.26461007460443, + "learning_rate": 5.002704326126135e-07, + "logits/chosen": -2.489748954772949, + "logits/rejected": -2.5593252182006836, + "logps/chosen": -18.945158004760742, + "logps/rejected": -156.34237670898438, + "loss": 17.6249, + "losses_ref": -0.0002525493036955595, + "ref_logps/chosen": -98.93243408203125, + "ref_logps/rejected": -86.92535400390625, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 79.9872817993164, + "rewards/margins": 149.404296875, + "rewards/rejected": -69.4170150756836, + "step": 4420, + "u": -4.320488929748535, + "weight": 0.056261636316776276 + }, + { + "diff_generated": -70.99942779541016, + "epoch": 1.4355152300712897, + "grad_norm": 501.8354939197857, + "learning_rate": 4.988096729955751e-07, + "logits/chosen": -2.479881763458252, + "logits/rejected": -2.568162202835083, + "logps/chosen": -16.15049171447754, + "logps/rejected": -156.91697692871094, + "loss": 17.3995, + "losses_ref": -0.00015837197133805603, + "ref_logps/chosen": -97.31709289550781, + "ref_logps/rejected": -85.91755676269531, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 81.1666030883789, + "rewards/margins": 152.16603088378906, + "rewards/rejected": -70.99942779541016, + "step": 4430, + "u": -4.261697292327881, + "weight": 0.06875661015510559 + }, + { + "diff_generated": -66.48624420166016, + "epoch": 1.4387556707712248, + "grad_norm": 489.7294121744171, + "learning_rate": 4.97347508624883e-07, + "logits/chosen": -2.4843106269836426, + "logits/rejected": -2.5230603218078613, + "logps/chosen": -16.20532989501953, + "logps/rejected": -149.9473114013672, + "loss": 16.8241, + "losses_ref": -0.0029774392023682594, + "ref_logps/chosen": -94.02255249023438, + "ref_logps/rejected": -83.46109008789062, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 77.81721496582031, + "rewards/margins": 144.30345153808594, + "rewards/rejected": -66.48624420166016, + "step": 4440, + "u": -4.226017951965332, + "weight": 0.08138980716466904 + }, + { + "diff_generated": -69.31263732910156, + "epoch": 1.4419961114711601, + "grad_norm": 434.07506473374804, + "learning_rate": 4.958839602877809e-07, + "logits/chosen": -2.417771816253662, + "logits/rejected": -2.535672664642334, + "logps/chosen": -17.75906753540039, + "logps/rejected": -155.42984008789062, + "loss": 17.376, + "losses_ref": -2.4924074750742875e-05, + "ref_logps/chosen": -93.65541076660156, + "ref_logps/rejected": -86.11720275878906, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 75.8963394165039, + "rewards/margins": 145.20896911621094, + "rewards/rejected": -69.31263732910156, + "step": 4450, + "u": -4.432461261749268, + "weight": 0.03750060871243477 + }, + { + "diff_generated": -67.58025360107422, + "epoch": 1.4452365521710953, + "grad_norm": 433.5516392365605, + "learning_rate": 4.944190487911878e-07, + "logits/chosen": -2.4273107051849365, + "logits/rejected": -2.530043125152588, + "logps/chosen": -16.647306442260742, + "logps/rejected": -154.08511352539062, + "loss": 17.6515, + "losses_ref": -0.002214368199929595, + "ref_logps/chosen": -94.8519058227539, + "ref_logps/rejected": -86.5048599243164, + "rewards/accuracies": 0.9375, + "rewards/chosen": 78.20460510253906, + "rewards/margins": 145.78488159179688, + "rewards/rejected": -67.58025360107422, + "step": 4460, + "u": -4.280516624450684, + "weight": 0.06258859485387802 + }, + { + "diff_generated": -65.32229614257812, + "epoch": 1.4484769928710304, + "grad_norm": 446.3728833764338, + "learning_rate": 4.929527949614025e-07, + "logits/chosen": -2.4531006813049316, + "logits/rejected": -2.4834418296813965, + "logps/chosen": -18.64037322998047, + "logps/rejected": -147.5410614013672, + "loss": 17.427, + "losses_ref": -9.17950728762662e-06, + "ref_logps/chosen": -100.15422058105469, + "ref_logps/rejected": -82.21876525878906, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 81.51384735107422, + "rewards/margins": 146.83615112304688, + "rewards/rejected": -65.32229614257812, + "step": 4470, + "u": -4.405593395233154, + "weight": 0.037500280886888504 + }, + { + "diff_generated": -67.51651000976562, + "epoch": 1.4517174335709657, + "grad_norm": 476.64036777111323, + "learning_rate": 4.914852196438077e-07, + "logits/chosen": -2.4044480323791504, + "logits/rejected": -2.51806640625, + "logps/chosen": -17.459749221801758, + "logps/rejected": -153.8185272216797, + "loss": 17.796, + "losses_ref": -0.00025501454365439713, + "ref_logps/chosen": -93.2803955078125, + "ref_logps/rejected": -86.3020248413086, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 75.8206558227539, + "rewards/margins": 143.337158203125, + "rewards/rejected": -67.51651000976562, + "step": 4480, + "u": -4.34688663482666, + "weight": 0.043761108070611954 + }, + { + "diff_generated": -71.80397033691406, + "epoch": 1.4549578742709008, + "grad_norm": 495.62260448956147, + "learning_rate": 4.900163437025727e-07, + "logits/chosen": -2.413210153579712, + "logits/rejected": -2.5546677112579346, + "logps/chosen": -17.38454818725586, + "logps/rejected": -161.24551391601562, + "loss": 17.9293, + "losses_ref": -2.827008529493469e-07, + "ref_logps/chosen": -92.62767028808594, + "ref_logps/rejected": -89.44153594970703, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 75.24312591552734, + "rewards/margins": 147.047119140625, + "rewards/rejected": -71.80397033691406, + "step": 4490, + "u": -4.375165939331055, + "weight": 0.04375000298023224 + }, + { + "diff_generated": -68.02531433105469, + "epoch": 1.458198314970836, + "grad_norm": 504.03590321188335, + "learning_rate": 4.885461880203582e-07, + "logits/chosen": -2.4162282943725586, + "logits/rejected": -2.5359253883361816, + "logps/chosen": -17.028575897216797, + "logps/rejected": -152.71405029296875, + "loss": 16.9765, + "losses_ref": -0.0028386306948959827, + "ref_logps/chosen": -91.15119171142578, + "ref_logps/rejected": -84.688720703125, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 74.12261199951172, + "rewards/margins": 142.14793395996094, + "rewards/rejected": -68.02531433105469, + "step": 4500, + "u": -4.2423601150512695, + "weight": 0.056392062455415726 + }, + { + "diff_generated": -71.33853149414062, + "epoch": 1.4614387556707713, + "grad_norm": 409.8261741998233, + "learning_rate": 4.870747734980186e-07, + "logits/chosen": -2.4482059478759766, + "logits/rejected": -2.5609679222106934, + "logps/chosen": -16.207927703857422, + "logps/rejected": -163.0063018798828, + "loss": 17.6178, + "losses_ref": -0.003997477702796459, + "ref_logps/chosen": -96.40123748779297, + "ref_logps/rejected": -91.66776275634766, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 80.19331359863281, + "rewards/margins": 151.53182983398438, + "rewards/rejected": -71.33853149414062, + "step": 4510, + "u": -4.368565082550049, + "weight": 0.043938539922237396 + }, + { + "diff_generated": -63.6971321105957, + "epoch": 1.4646791963707064, + "grad_norm": 452.7392547705994, + "learning_rate": 4.856021210543043e-07, + "logits/chosen": -2.370863437652588, + "logits/rejected": -2.4753031730651855, + "logps/chosen": -17.3338565826416, + "logps/rejected": -141.4090118408203, + "loss": 17.7314, + "losses_ref": -8.973429430625401e-07, + "ref_logps/chosen": -86.48564147949219, + "ref_logps/rejected": -77.71188354492188, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": 69.15177917480469, + "rewards/margins": 132.8489227294922, + "rewards/rejected": -63.6971321105957, + "step": 4520, + "u": -4.074034214019775, + "weight": 0.10625002533197403 + }, + { + "diff_generated": -69.9952163696289, + "epoch": 1.4679196370706415, + "grad_norm": 470.16356446008257, + "learning_rate": 4.841282516255653e-07, + "logits/chosen": -2.497943878173828, + "logits/rejected": -2.5594377517700195, + "logps/chosen": -18.48931312561035, + "logps/rejected": -158.583251953125, + "loss": 17.7948, + "losses_ref": -0.000756235618609935, + "ref_logps/chosen": -101.09172821044922, + "ref_logps/rejected": -88.58805847167969, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 82.6024169921875, + "rewards/margins": 152.59762573242188, + "rewards/rejected": -69.9952163696289, + "step": 4530, + "u": -4.4657979011535645, + "weight": 0.02503257431089878 + }, + { + "diff_generated": -70.44725036621094, + "epoch": 1.471160077770577, + "grad_norm": 446.5388305269726, + "learning_rate": 4.826531861654537e-07, + "logits/chosen": -2.418222665786743, + "logits/rejected": -2.4776690006256104, + "logps/chosen": -18.502029418945312, + "logps/rejected": -159.91282653808594, + "loss": 17.1606, + "losses_ref": -0.0012745390413329005, + "ref_logps/chosen": -98.4915542602539, + "ref_logps/rejected": -89.46559143066406, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 79.98951721191406, + "rewards/margins": 150.436767578125, + "rewards/rejected": -70.44725036621094, + "step": 4540, + "u": -4.396437168121338, + "weight": 0.03755884990096092 + }, + { + "diff_generated": -72.79644012451172, + "epoch": 1.474400518470512, + "grad_norm": 472.8502607929755, + "learning_rate": 4.811769456446243e-07, + "logits/chosen": -2.4596428871154785, + "logits/rejected": -2.5735411643981934, + "logps/chosen": -16.844120025634766, + "logps/rejected": -159.51223754882812, + "loss": 17.0855, + "losses_ref": -3.752495558728697e-07, + "ref_logps/chosen": -95.08149719238281, + "ref_logps/rejected": -86.71580505371094, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 78.23738098144531, + "rewards/margins": 151.0338134765625, + "rewards/rejected": -72.79644012451172, + "step": 4550, + "u": -4.390454292297363, + "weight": 0.03750000521540642 + }, + { + "diff_generated": -68.4251937866211, + "epoch": 1.4776409591704471, + "grad_norm": 501.1855634488351, + "learning_rate": 4.796995510504384e-07, + "logits/chosen": -2.402315378189087, + "logits/rejected": -2.5745930671691895, + "logps/chosen": -15.994440078735352, + "logps/rejected": -153.82589721679688, + "loss": 17.8596, + "losses_ref": -6.664349712082185e-07, + "ref_logps/chosen": -89.04400634765625, + "ref_logps/rejected": -85.40069580078125, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 73.04957580566406, + "rewards/margins": 141.47476196289062, + "rewards/rejected": -68.4251937866211, + "step": 4560, + "u": -4.18105936050415, + "weight": 0.07500000298023224 + }, + { + "diff_generated": -67.21964263916016, + "epoch": 1.4808813998703823, + "grad_norm": 453.5729849326357, + "learning_rate": 4.782210233866637e-07, + "logits/chosen": -2.4193265438079834, + "logits/rejected": -2.5252389907836914, + "logps/chosen": -16.418968200683594, + "logps/rejected": -156.79345703125, + "loss": 16.8052, + "losses_ref": -2.263903979837778e-06, + "ref_logps/chosen": -92.21155548095703, + "ref_logps/rejected": -89.57381439208984, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 75.7925796508789, + "rewards/margins": 143.01222229003906, + "rewards/rejected": -67.21964263916016, + "step": 4570, + "u": -4.222241401672363, + "weight": 0.06875006854534149 + }, + { + "diff_generated": -69.27766418457031, + "epoch": 1.4841218405703176, + "grad_norm": 419.96379946452873, + "learning_rate": 4.76741383673177e-07, + "logits/chosen": -2.446869373321533, + "logits/rejected": -2.5299861431121826, + "logps/chosen": -16.789928436279297, + "logps/rejected": -155.53073120117188, + "loss": 17.2807, + "losses_ref": -0.0027264286763966084, + "ref_logps/chosen": -94.69251251220703, + "ref_logps/rejected": -86.25306701660156, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 77.902587890625, + "rewards/margins": 147.1802520751953, + "rewards/rejected": -69.27766418457031, + "step": 4580, + "u": -4.301576614379883, + "weight": 0.05639181658625603 + }, + { + "diff_generated": -65.70954895019531, + "epoch": 1.4873622812702527, + "grad_norm": 501.90105626271105, + "learning_rate": 4.752606529456648e-07, + "logits/chosen": -2.410231113433838, + "logits/rejected": -2.5358357429504395, + "logps/chosen": -15.482803344726562, + "logps/rejected": -148.21693420410156, + "loss": 17.3371, + "losses_ref": -0.0004646036250051111, + "ref_logps/chosen": -90.462646484375, + "ref_logps/rejected": -82.50740051269531, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 74.97984313964844, + "rewards/margins": 140.68939208984375, + "rewards/rejected": -65.70954895019531, + "step": 4590, + "u": -4.142441272735596, + "weight": 0.0875222310423851 + }, + { + "diff_generated": -72.26192474365234, + "epoch": 1.490602721970188, + "grad_norm": 442.7557677460933, + "learning_rate": 4.7377885225532396e-07, + "logits/chosen": -2.459202289581299, + "logits/rejected": -2.5650484561920166, + "logps/chosen": -15.90203857421875, + "logps/rejected": -164.494873046875, + "loss": 16.9939, + "losses_ref": -3.4513675473135663e-06, + "ref_logps/chosen": -95.97832489013672, + "ref_logps/rejected": -92.23296356201172, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 80.07628631591797, + "rewards/margins": 152.3382110595703, + "rewards/rejected": -72.26192474365234, + "step": 4600, + "u": -4.404824256896973, + "weight": 0.037500057369470596 + }, + { + "diff_generated": -67.0380630493164, + "epoch": 1.4938431626701232, + "grad_norm": 452.2425383294523, + "learning_rate": 4.722960026685633e-07, + "logits/chosen": -2.4154162406921387, + "logits/rejected": -2.517221689224243, + "logps/chosen": -16.425350189208984, + "logps/rejected": -149.91822814941406, + "loss": 16.4547, + "losses_ref": -0.008105043321847916, + "ref_logps/chosen": -91.67552185058594, + "ref_logps/rejected": -82.88018035888672, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 75.25016021728516, + "rewards/margins": 142.28822326660156, + "rewards/rejected": -67.0380630493164, + "step": 4610, + "u": -4.350281238555908, + "weight": 0.05040092393755913 + }, + { + "diff_generated": -67.85414123535156, + "epoch": 1.4970836033700583, + "grad_norm": 441.68144740480705, + "learning_rate": 4.7081212526670267e-07, + "logits/chosen": -2.3970372676849365, + "logits/rejected": -2.45599102973938, + "logps/chosen": -19.708385467529297, + "logps/rejected": -152.5522918701172, + "loss": 17.0668, + "losses_ref": -0.0015395600348711014, + "ref_logps/chosen": -96.54747009277344, + "ref_logps/rejected": -84.69816589355469, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 76.83908081054688, + "rewards/margins": 144.69322204589844, + "rewards/rejected": -67.85414123535156, + "step": 4620, + "u": -4.298979759216309, + "weight": 0.05007138103246689 + }, + { + "diff_generated": -67.05535888671875, + "epoch": 1.5003240440699934, + "grad_norm": 453.4773167246375, + "learning_rate": 4.693272411456753e-07, + "logits/chosen": -2.4733786582946777, + "logits/rejected": -2.5276851654052734, + "logps/chosen": -17.27235984802246, + "logps/rejected": -150.65411376953125, + "loss": 17.2596, + "losses_ref": -0.0004150184686295688, + "ref_logps/chosen": -94.43132019042969, + "ref_logps/rejected": -83.59877014160156, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 77.15895080566406, + "rewards/margins": 144.21432495117188, + "rewards/rejected": -67.05535888671875, + "step": 4630, + "u": -4.372154712677002, + "weight": 0.05000603199005127 + }, + { + "diff_generated": -70.12632751464844, + "epoch": 1.5035644847699285, + "grad_norm": 467.84998763747177, + "learning_rate": 4.6784137141572566e-07, + "logits/chosen": -2.4407970905303955, + "logits/rejected": -2.5051474571228027, + "logps/chosen": -16.63579750061035, + "logps/rejected": -156.5682373046875, + "loss": 16.9536, + "losses_ref": -2.0801840037165675e-06, + "ref_logps/chosen": -96.26951599121094, + "ref_logps/rejected": -86.44189453125, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 79.63372039794922, + "rewards/margins": 149.7600555419922, + "rewards/rejected": -70.12632751464844, + "step": 4640, + "u": -4.290956497192383, + "weight": 0.056250084191560745 + }, + { + "diff_generated": -64.9892807006836, + "epoch": 1.5068049254698639, + "grad_norm": 484.2847531059146, + "learning_rate": 4.6635453720111096e-07, + "logits/chosen": -2.423842430114746, + "logits/rejected": -2.5103225708007812, + "logps/chosen": -17.545272827148438, + "logps/rejected": -148.31085205078125, + "loss": 16.5725, + "losses_ref": -1.28770659557631e-06, + "ref_logps/chosen": -92.8660659790039, + "ref_logps/rejected": -83.32157897949219, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 75.32079315185547, + "rewards/margins": 140.31008911132812, + "rewards/rejected": -64.9892807006836, + "step": 4650, + "u": -4.208277702331543, + "weight": 0.07500003278255463 + }, + { + "diff_generated": -71.25359344482422, + "epoch": 1.5100453661697992, + "grad_norm": 489.3099876004456, + "learning_rate": 4.6486675963980014e-07, + "logits/chosen": -2.4613852500915527, + "logits/rejected": -2.613790512084961, + "logps/chosen": -17.93228530883789, + "logps/rejected": -161.30078125, + "loss": 17.3612, + "losses_ref": -0.007633232977241278, + "ref_logps/chosen": -93.60543823242188, + "ref_logps/rejected": -90.04718780517578, + "rewards/accuracies": 0.96875, + "rewards/chosen": 75.67315673828125, + "rewards/margins": 146.92674255371094, + "rewards/rejected": -71.25359344482422, + "step": 4660, + "u": -4.4339728355407715, + "weight": 0.031660519540309906 + }, + { + "diff_generated": -65.96153259277344, + "epoch": 1.5132858068697344, + "grad_norm": 442.07501282365024, + "learning_rate": 4.633780598831733e-07, + "logits/chosen": -2.4787216186523438, + "logits/rejected": -2.5391018390655518, + "logps/chosen": -19.442729949951172, + "logps/rejected": -153.48989868164062, + "loss": 16.5713, + "losses_ref": -8.795756798463117e-07, + "ref_logps/chosen": -98.21834564208984, + "ref_logps/rejected": -87.52836608886719, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 78.77561950683594, + "rewards/margins": 144.73715209960938, + "rewards/rejected": -65.96153259277344, + "step": 4670, + "u": -4.303662300109863, + "weight": 0.05625002458691597 + }, + { + "diff_generated": -69.84078979492188, + "epoch": 1.5165262475696695, + "grad_norm": 494.0586391820382, + "learning_rate": 4.6188845909572143e-07, + "logits/chosen": -2.4449756145477295, + "logits/rejected": -2.5386128425598145, + "logps/chosen": -15.847773551940918, + "logps/rejected": -155.44656372070312, + "loss": 17.7909, + "losses_ref": -0.003316181479021907, + "ref_logps/chosen": -92.31333923339844, + "ref_logps/rejected": -85.60575866699219, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 76.46556854248047, + "rewards/margins": 146.30636596679688, + "rewards/rejected": -69.84078979492188, + "step": 4680, + "u": -4.302974224090576, + "weight": 0.05641231685876846 + }, + { + "diff_generated": -69.53398895263672, + "epoch": 1.5197666882696046, + "grad_norm": 452.3262545659709, + "learning_rate": 4.603979784547451e-07, + "logits/chosen": -2.3991618156433105, + "logits/rejected": -2.524679183959961, + "logps/chosen": -17.85516357421875, + "logps/rejected": -156.85577392578125, + "loss": 17.0628, + "losses_ref": -0.00722300773486495, + "ref_logps/chosen": -93.98219299316406, + "ref_logps/rejected": -87.32179260253906, + "rewards/accuracies": 0.9375, + "rewards/chosen": 76.12702178955078, + "rewards/margins": 145.6610107421875, + "rewards/rejected": -69.53398895263672, + "step": 4690, + "u": -4.316760063171387, + "weight": 0.0628783255815506 + }, + { + "diff_generated": -70.70417022705078, + "epoch": 1.5230071289695397, + "grad_norm": 483.5096682337427, + "learning_rate": 4.5890663915005364e-07, + "logits/chosen": -2.4551730155944824, + "logits/rejected": -2.5739877223968506, + "logps/chosen": -15.682329177856445, + "logps/rejected": -162.4754180908203, + "loss": 17.2193, + "losses_ref": -0.0009247121633961797, + "ref_logps/chosen": -94.73026275634766, + "ref_logps/rejected": -91.77125549316406, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 79.04792785644531, + "rewards/margins": 149.75210571289062, + "rewards/rejected": -70.70417022705078, + "step": 4700, + "u": -4.416566848754883, + "weight": 0.03754302114248276 + }, + { + "diff_generated": -68.02668762207031, + "epoch": 1.526247569669475, + "grad_norm": 474.65671883190913, + "learning_rate": 4.574144623836637e-07, + "logits/chosen": -2.4455883502960205, + "logits/rejected": -2.5542654991149902, + "logps/chosen": -16.18857192993164, + "logps/rejected": -154.2319793701172, + "loss": 17.6006, + "losses_ref": -0.0005734398728236556, + "ref_logps/chosen": -95.07334899902344, + "ref_logps/rejected": -86.20530700683594, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 78.88477325439453, + "rewards/margins": 146.91143798828125, + "rewards/rejected": -68.02668762207031, + "step": 4710, + "u": -4.256333827972412, + "weight": 0.06877660006284714 + }, + { + "diff_generated": -66.6358413696289, + "epoch": 1.5294880103694104, + "grad_norm": 397.73122217993006, + "learning_rate": 4.5592146936949785e-07, + "logits/chosen": -2.432037353515625, + "logits/rejected": -2.531618118286133, + "logps/chosen": -18.91817855834961, + "logps/rejected": -152.69790649414062, + "loss": 16.7594, + "losses_ref": -0.0002670374815352261, + "ref_logps/chosen": -95.0885009765625, + "ref_logps/rejected": -86.06204223632812, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 76.17031860351562, + "rewards/margins": 142.80618286132812, + "rewards/rejected": -66.6358413696289, + "step": 4720, + "u": -4.290534496307373, + "weight": 0.050011225044727325 + }, + { + "diff_generated": -68.60977935791016, + "epoch": 1.5327284510693455, + "grad_norm": 445.9665979961661, + "learning_rate": 4.544276813330835e-07, + "logits/chosen": -2.460371971130371, + "logits/rejected": -2.5344886779785156, + "logps/chosen": -16.79252052307129, + "logps/rejected": -152.0134735107422, + "loss": 17.1902, + "losses_ref": -3.2375027103626053e-07, + "ref_logps/chosen": -98.1429214477539, + "ref_logps/rejected": -83.40367889404297, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 81.35038757324219, + "rewards/margins": 149.96017456054688, + "rewards/rejected": -68.60977935791016, + "step": 4730, + "u": -4.352096080780029, + "weight": 0.050000011920928955 + }, + { + "diff_generated": -72.2760009765625, + "epoch": 1.5359688917692806, + "grad_norm": 476.0648268098202, + "learning_rate": 4.529331195112501e-07, + "logits/chosen": -2.405479907989502, + "logits/rejected": -2.544811725616455, + "logps/chosen": -17.25438117980957, + "logps/rejected": -162.0577850341797, + "loss": 17.2953, + "losses_ref": -0.0028177141211926937, + "ref_logps/chosen": -97.26611328125, + "ref_logps/rejected": -89.78179931640625, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 80.01172637939453, + "rewards/margins": 152.28775024414062, + "rewards/rejected": -72.2760009765625, + "step": 4740, + "u": -4.424232482910156, + "weight": 0.025139007717370987 + }, + { + "diff_generated": -68.67704772949219, + "epoch": 1.5392093324692158, + "grad_norm": 425.2681629255773, + "learning_rate": 4.5143780515182833e-07, + "logits/chosen": -2.4343087673187256, + "logits/rejected": -2.5154240131378174, + "logps/chosen": -20.168615341186523, + "logps/rejected": -156.32225036621094, + "loss": 17.2894, + "losses_ref": -3.488729589662398e-07, + "ref_logps/chosen": -99.758544921875, + "ref_logps/rejected": -87.64521789550781, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 79.58992767333984, + "rewards/margins": 148.26698303222656, + "rewards/rejected": -68.67704772949219, + "step": 4750, + "u": -4.499643325805664, + "weight": 0.018750010058283806 + }, + { + "diff_generated": -69.74950408935547, + "epoch": 1.5424497731691509, + "grad_norm": 480.75981177075647, + "learning_rate": 4.499417595133471e-07, + "logits/chosen": -2.3829503059387207, + "logits/rejected": -2.4904589653015137, + "logps/chosen": -17.013784408569336, + "logps/rejected": -155.1744842529297, + "loss": 17.4305, + "losses_ref": -0.03174503520131111, + "ref_logps/chosen": -93.46163177490234, + "ref_logps/rejected": -85.42498779296875, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 76.44784545898438, + "rewards/margins": 146.19735717773438, + "rewards/rejected": -69.74950408935547, + "step": 4760, + "u": -4.38844633102417, + "weight": 0.03943491727113724 + }, + { + "diff_generated": -71.02115631103516, + "epoch": 1.5456902138690862, + "grad_norm": 473.23312074904203, + "learning_rate": 4.4844500386473207e-07, + "logits/chosen": -2.444065570831299, + "logits/rejected": -2.5454695224761963, + "logps/chosen": -17.960790634155273, + "logps/rejected": -159.2787628173828, + "loss": 17.4406, + "losses_ref": -0.0001555870840093121, + "ref_logps/chosen": -98.4795913696289, + "ref_logps/rejected": -88.25760650634766, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 80.518798828125, + "rewards/margins": 151.53994750976562, + "rewards/rejected": -71.02115631103516, + "step": 4770, + "u": -4.4323649406433105, + "weight": 0.03750746697187424 + }, + { + "diff_generated": -69.52899932861328, + "epoch": 1.5489306545690213, + "grad_norm": 492.0938269097183, + "learning_rate": 4.4694755948500276e-07, + "logits/chosen": -2.413266658782959, + "logits/rejected": -2.5660948753356934, + "logps/chosen": -14.34319019317627, + "logps/rejected": -155.9700164794922, + "loss": 16.9127, + "losses_ref": -0.0016228422755375504, + "ref_logps/chosen": -89.77197265625, + "ref_logps/rejected": -86.44102478027344, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 75.42878723144531, + "rewards/margins": 144.95779418945312, + "rewards/rejected": -69.52899932861328, + "step": 4780, + "u": -4.278973579406738, + "weight": 0.056326042860746384 + }, + { + "diff_generated": -77.00670623779297, + "epoch": 1.5521710952689567, + "grad_norm": 455.8030015792514, + "learning_rate": 4.4544944766297037e-07, + "logits/chosen": -2.4555513858795166, + "logits/rejected": -2.6273884773254395, + "logps/chosen": -15.951858520507812, + "logps/rejected": -173.6452178955078, + "loss": 16.4869, + "losses_ref": -1.6469462238433152e-08, + "ref_logps/chosen": -96.30865478515625, + "ref_logps/rejected": -96.63851928710938, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 80.3567886352539, + "rewards/margins": 157.36349487304688, + "rewards/rejected": -77.00670623779297, + "step": 4790, + "u": -4.393777370452881, + "weight": 0.01875000074505806 + }, + { + "diff_generated": -64.95992279052734, + "epoch": 1.5554115359688918, + "grad_norm": 502.4897379317211, + "learning_rate": 4.439506896969348e-07, + "logits/chosen": -2.3634815216064453, + "logits/rejected": -2.428330659866333, + "logps/chosen": -16.078983306884766, + "logps/rejected": -142.49655151367188, + "loss": 16.9718, + "losses_ref": -1.6241930467231214e-08, + "ref_logps/chosen": -88.50505065917969, + "ref_logps/rejected": -77.53661346435547, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 72.42605590820312, + "rewards/margins": 137.385986328125, + "rewards/rejected": -64.95992279052734, + "step": 4800, + "u": -4.174968719482422, + "weight": 0.08749999850988388 + }, + { + "diff_generated": -68.2002944946289, + "epoch": 1.558651976668827, + "grad_norm": 503.3697031130378, + "learning_rate": 4.4245130689438206e-07, + "logits/chosen": -2.3789191246032715, + "logits/rejected": -2.4425642490386963, + "logps/chosen": -17.81094741821289, + "logps/rejected": -151.67152404785156, + "loss": 17.6768, + "losses_ref": -1.3876584489480592e-05, + "ref_logps/chosen": -93.52096557617188, + "ref_logps/rejected": -83.47123718261719, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 75.71002197265625, + "rewards/margins": 143.91030883789062, + "rewards/rejected": -68.2002944946289, + "step": 4810, + "u": -4.195111274719238, + "weight": 0.07500021904706955 + }, + { + "diff_generated": -71.60388946533203, + "epoch": 1.561892417368762, + "grad_norm": 464.6552704484477, + "learning_rate": 4.4095132057168145e-07, + "logits/chosen": -2.4288578033447266, + "logits/rejected": -2.493143320083618, + "logps/chosen": -19.020919799804688, + "logps/rejected": -158.80801391601562, + "loss": 16.9986, + "losses_ref": -4.041445208713412e-06, + "ref_logps/chosen": -98.4945068359375, + "ref_logps/rejected": -87.20411682128906, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 79.47359466552734, + "rewards/margins": 151.07748413085938, + "rewards/rejected": -71.60388946533203, + "step": 4820, + "u": -4.346208572387695, + "weight": 0.04375015199184418 + }, + { + "diff_generated": -70.94068908691406, + "epoch": 1.5651328580686974, + "grad_norm": 456.0521866765002, + "learning_rate": 4.3945075205378215e-07, + "logits/chosen": -2.3616397380828857, + "logits/rejected": -2.521225929260254, + "logps/chosen": -16.200428009033203, + "logps/rejected": -158.74551391601562, + "loss": 17.2717, + "losses_ref": -0.00018199995975010097, + "ref_logps/chosen": -92.65226745605469, + "ref_logps/rejected": -87.80482482910156, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 76.45184326171875, + "rewards/margins": 147.39254760742188, + "rewards/rejected": -70.94068908691406, + "step": 4830, + "u": -4.3289103507995605, + "weight": 0.04375418275594711 + }, + { + "diff_generated": -68.45477294921875, + "epoch": 1.5683732987686325, + "grad_norm": 463.7507709258817, + "learning_rate": 4.379496226739104e-07, + "logits/chosen": -2.4417545795440674, + "logits/rejected": -2.5383334159851074, + "logps/chosen": -16.365196228027344, + "logps/rejected": -152.93263244628906, + "loss": 17.3574, + "losses_ref": -2.3878867523308145e-06, + "ref_logps/chosen": -93.81847381591797, + "ref_logps/rejected": -84.47784423828125, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 77.45327758789062, + "rewards/margins": 145.90805053710938, + "rewards/rejected": -68.45477294921875, + "step": 4840, + "u": -4.2860236167907715, + "weight": 0.05625007301568985 + }, + { + "diff_generated": -68.72309875488281, + "epoch": 1.5716137394685679, + "grad_norm": 507.16325174099217, + "learning_rate": 4.364479537732663e-07, + "logits/chosen": -2.437304735183716, + "logits/rejected": -2.5395822525024414, + "logps/chosen": -17.980350494384766, + "logps/rejected": -155.20404052734375, + "loss": 17.6529, + "losses_ref": -0.0010237336391583085, + "ref_logps/chosen": -94.89065551757812, + "ref_logps/rejected": -86.48092651367188, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 76.91030883789062, + "rewards/margins": 145.63339233398438, + "rewards/rejected": -68.72309875488281, + "step": 4850, + "u": -4.335413932800293, + "weight": 0.04379686713218689 + }, + { + "diff_generated": -73.38167572021484, + "epoch": 1.574854180168503, + "grad_norm": 442.10680424593454, + "learning_rate": 4.349457667007197e-07, + "logits/chosen": -2.4412600994110107, + "logits/rejected": -2.541186809539795, + "logps/chosen": -18.689762115478516, + "logps/rejected": -166.74819946289062, + "loss": 16.7162, + "losses_ref": -0.01046350784599781, + "ref_logps/chosen": -99.40778350830078, + "ref_logps/rejected": -93.36651611328125, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 80.718017578125, + "rewards/margins": 154.09970092773438, + "rewards/rejected": -73.38167572021484, + "step": 4860, + "u": -4.530239105224609, + "weight": 0.012633567675948143 + }, + { + "diff_generated": -69.97039031982422, + "epoch": 1.578094620868438, + "grad_norm": 457.81659951396927, + "learning_rate": 4.334430828125074e-07, + "logits/chosen": -2.435004472732544, + "logits/rejected": -2.5484673976898193, + "logps/chosen": -17.670394897460938, + "logps/rejected": -155.90631103515625, + "loss": 16.8956, + "losses_ref": -2.0294830704870037e-08, + "ref_logps/chosen": -96.66577911376953, + "ref_logps/rejected": -85.93592834472656, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 78.9953842163086, + "rewards/margins": 148.9657745361328, + "rewards/rejected": -69.97039031982422, + "step": 4870, + "u": -4.273224353790283, + "weight": 0.06875000149011612 + }, + { + "diff_generated": -68.66413116455078, + "epoch": 1.5813350615683732, + "grad_norm": 438.34378748807904, + "learning_rate": 4.319399234719297e-07, + "logits/chosen": -2.381214141845703, + "logits/rejected": -2.532839298248291, + "logps/chosen": -14.926599502563477, + "logps/rejected": -151.05838012695312, + "loss": 16.2461, + "losses_ref": -1.6406092484544388e-08, + "ref_logps/chosen": -87.36531066894531, + "ref_logps/rejected": -82.39424896240234, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 72.43870544433594, + "rewards/margins": 141.1028289794922, + "rewards/rejected": -68.66413116455078, + "step": 4880, + "u": -4.148575782775879, + "weight": 0.08124999701976776 + }, + { + "diff_generated": -73.14501190185547, + "epoch": 1.5845755022683083, + "grad_norm": 484.5661681960606, + "learning_rate": 4.3043631004904563e-07, + "logits/chosen": -2.4165432453155518, + "logits/rejected": -2.50289249420166, + "logps/chosen": -14.882474899291992, + "logps/rejected": -160.80938720703125, + "loss": 16.5932, + "losses_ref": -0.0002339294005651027, + "ref_logps/chosen": -92.9581527709961, + "ref_logps/rejected": -87.66438293457031, + "rewards/accuracies": 0.96875, + "rewards/chosen": 78.07566833496094, + "rewards/margins": 151.22068786621094, + "rewards/rejected": -73.14501190185547, + "step": 4890, + "u": -4.442835807800293, + "weight": 0.03126036375761032 + }, + { + "diff_generated": -68.52119445800781, + "epoch": 1.5878159429682437, + "grad_norm": 423.6852586030226, + "learning_rate": 4.2893226392037024e-07, + "logits/chosen": -2.4660491943359375, + "logits/rejected": -2.5341620445251465, + "logps/chosen": -18.97785758972168, + "logps/rejected": -153.42442321777344, + "loss": 17.3246, + "losses_ref": -0.0012148026144132018, + "ref_logps/chosen": -101.24699401855469, + "ref_logps/rejected": -84.9032211303711, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 82.2691421508789, + "rewards/margins": 150.7903289794922, + "rewards/rejected": -68.52119445800781, + "step": 4900, + "u": -4.488637447357178, + "weight": 0.01880759373307228 + }, + { + "diff_generated": -71.99812316894531, + "epoch": 1.591056383668179, + "grad_norm": 449.46646678982796, + "learning_rate": 4.2742780646857015e-07, + "logits/chosen": -2.453869104385376, + "logits/rejected": -2.5750644207000732, + "logps/chosen": -15.577032089233398, + "logps/rejected": -162.82864379882812, + "loss": 16.5234, + "losses_ref": -0.004288672003895044, + "ref_logps/chosen": -96.67981719970703, + "ref_logps/rejected": -90.83052062988281, + "rewards/accuracies": 0.96875, + "rewards/chosen": 81.10277557373047, + "rewards/margins": 153.10089111328125, + "rewards/rejected": -71.99812316894531, + "step": 4910, + "u": -4.421360015869141, + "weight": 0.0314519926905632 + }, + { + "diff_generated": -69.86315155029297, + "epoch": 1.5942968243681142, + "grad_norm": 454.5353855349628, + "learning_rate": 4.2592295908215953e-07, + "logits/chosen": -2.421281337738037, + "logits/rejected": -2.534956693649292, + "logps/chosen": -18.646427154541016, + "logps/rejected": -156.1486358642578, + "loss": 17.8942, + "losses_ref": -7.0457475409568815e-09, + "ref_logps/chosen": -95.86431121826172, + "ref_logps/rejected": -86.28546905517578, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 77.21788024902344, + "rewards/margins": 147.08102416992188, + "rewards/rejected": -69.86315155029297, + "step": 4920, + "u": -4.3402299880981445, + "weight": 0.05000000074505806 + }, + { + "diff_generated": -71.47618103027344, + "epoch": 1.5975372650680493, + "grad_norm": 445.8051593638038, + "learning_rate": 4.2441774315519645e-07, + "logits/chosen": -2.460808277130127, + "logits/rejected": -2.5700113773345947, + "logps/chosen": -16.961727142333984, + "logps/rejected": -158.3717498779297, + "loss": 16.2322, + "losses_ref": -3.7529636642830155e-07, + "ref_logps/chosen": -94.57036590576172, + "ref_logps/rejected": -86.89557647705078, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 77.60865783691406, + "rewards/margins": 149.08482360839844, + "rewards/rejected": -71.47618103027344, + "step": 4930, + "u": -4.4106645584106445, + "weight": 0.03750000149011612 + }, + { + "diff_generated": -70.75782775878906, + "epoch": 1.6007777057679844, + "grad_norm": 420.1674727332997, + "learning_rate": 4.229121800869781e-07, + "logits/chosen": -2.4677226543426514, + "logits/rejected": -2.551055908203125, + "logps/chosen": -15.742657661437988, + "logps/rejected": -157.78079223632812, + "loss": 17.1654, + "losses_ref": -1.1926164233955205e-06, + "ref_logps/chosen": -97.43973541259766, + "ref_logps/rejected": -87.02295684814453, + "rewards/accuracies": 0.96875, + "rewards/chosen": 81.69708251953125, + "rewards/margins": 152.4549102783203, + "rewards/rejected": -70.75782775878906, + "step": 4940, + "u": -4.393712997436523, + "weight": 0.03125004097819328 + }, + { + "diff_generated": -70.05594635009766, + "epoch": 1.6040181464679195, + "grad_norm": 471.1934574344933, + "learning_rate": 4.2140629128173703e-07, + "logits/chosen": -2.5027830600738525, + "logits/rejected": -2.576869487762451, + "logps/chosen": -15.281623840332031, + "logps/rejected": -154.04367065429688, + "loss": 17.2283, + "losses_ref": -4.5796954850629845e-07, + "ref_logps/chosen": -91.05293273925781, + "ref_logps/rejected": -83.98772430419922, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 75.77131652832031, + "rewards/margins": 145.82723999023438, + "rewards/rejected": -70.05594635009766, + "step": 4950, + "u": -4.314549922943115, + "weight": 0.050000011920928955 + }, + { + "diff_generated": -72.00413513183594, + "epoch": 1.6072585871678549, + "grad_norm": 512.6807740122287, + "learning_rate": 4.199000981483368e-07, + "logits/chosen": -2.4834907054901123, + "logits/rejected": -2.572270154953003, + "logps/chosen": -19.872325897216797, + "logps/rejected": -158.2327117919922, + "loss": 17.4412, + "losses_ref": -7.864770850574132e-06, + "ref_logps/chosen": -100.17959594726562, + "ref_logps/rejected": -86.22859191894531, + "rewards/accuracies": 0.96875, + "rewards/chosen": 80.30726623535156, + "rewards/margins": 152.31138610839844, + "rewards/rejected": -72.00413513183594, + "step": 4960, + "u": -4.382212162017822, + "weight": 0.031250111758708954 + }, + { + "diff_generated": -70.52757263183594, + "epoch": 1.61049902786779, + "grad_norm": 424.9704894505284, + "learning_rate": 4.183936220999676e-07, + "logits/chosen": -2.4406819343566895, + "logits/rejected": -2.502345085144043, + "logps/chosen": -17.687253952026367, + "logps/rejected": -157.0961151123047, + "loss": 17.5785, + "losses_ref": -6.711905007250607e-05, + "ref_logps/chosen": -98.40314483642578, + "ref_logps/rejected": -86.56853485107422, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 80.71589660644531, + "rewards/margins": 151.2434844970703, + "rewards/rejected": -70.52757263183594, + "step": 4970, + "u": -4.28690242767334, + "weight": 0.05000241845846176 + }, + { + "diff_generated": -70.8532943725586, + "epoch": 1.6137394685677253, + "grad_norm": 423.2785756936077, + "learning_rate": 4.168868845538414e-07, + "logits/chosen": -2.443761110305786, + "logits/rejected": -2.5265536308288574, + "logps/chosen": -16.052059173583984, + "logps/rejected": -158.07748413085938, + "loss": 17.0892, + "losses_ref": -8.346935942427081e-07, + "ref_logps/chosen": -97.72169494628906, + "ref_logps/rejected": -87.22419738769531, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 81.66963958740234, + "rewards/margins": 152.52293395996094, + "rewards/rejected": -70.8532943725586, + "step": 4980, + "u": -4.407050132751465, + "weight": 0.03750001639127731 + }, + { + "diff_generated": -64.80369567871094, + "epoch": 1.6169799092676604, + "grad_norm": 491.3871139681972, + "learning_rate": 4.15379906930888e-07, + "logits/chosen": -2.397629499435425, + "logits/rejected": -2.4859225749969482, + "logps/chosen": -15.061877250671387, + "logps/rejected": -142.0963897705078, + "loss": 16.7449, + "losses_ref": -4.988124601368327e-06, + "ref_logps/chosen": -89.53046417236328, + "ref_logps/rejected": -77.29270935058594, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 74.46858978271484, + "rewards/margins": 139.2722930908203, + "rewards/rejected": -64.80369567871094, + "step": 4990, + "u": -4.022293567657471, + "weight": 0.11250004917383194 + }, + { + "diff_generated": -68.20692443847656, + "epoch": 1.6202203499675956, + "grad_norm": 470.2407142627594, + "learning_rate": 4.1387271065545074e-07, + "logits/chosen": -2.4599132537841797, + "logits/rejected": -2.4912075996398926, + "logps/chosen": -18.08317756652832, + "logps/rejected": -149.76382446289062, + "loss": 18.0198, + "losses_ref": -0.0042698136530816555, + "ref_logps/chosen": -99.14116668701172, + "ref_logps/rejected": -81.5569076538086, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 81.05799865722656, + "rewards/margins": 149.26492309570312, + "rewards/rejected": -68.20692443847656, + "step": 5000, + "u": -4.370108604431152, + "weight": 0.03771400451660156 + }, + { + "diff_generated": -71.28777313232422, + "epoch": 1.6234607906675307, + "grad_norm": 426.3288299899426, + "learning_rate": 4.123653171549807e-07, + "logits/chosen": -2.466158390045166, + "logits/rejected": -2.5273663997650146, + "logps/chosen": -16.16830825805664, + "logps/rejected": -155.9675750732422, + "loss": 17.4407, + "losses_ref": -3.1566725056109135e-07, + "ref_logps/chosen": -96.88363647460938, + "ref_logps/rejected": -84.67980194091797, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 80.71533203125, + "rewards/margins": 152.0031280517578, + "rewards/rejected": -71.28777313232422, + "step": 5010, + "u": -4.45050573348999, + "weight": 0.025000005960464478 + }, + { + "diff_generated": -70.68708038330078, + "epoch": 1.626701231367466, + "grad_norm": 457.7143642688739, + "learning_rate": 4.108577478597335e-07, + "logits/chosen": -2.3896279335021973, + "logits/rejected": -2.5536961555480957, + "logps/chosen": -18.48324203491211, + "logps/rejected": -156.32644653320312, + "loss": 17.4408, + "losses_ref": -0.0021601675543934107, + "ref_logps/chosen": -92.8128662109375, + "ref_logps/rejected": -85.63936614990234, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 74.32962036132812, + "rewards/margins": 145.01670837402344, + "rewards/rejected": -70.68708038330078, + "step": 5020, + "u": -4.347861289978027, + "weight": 0.04385722056031227 + }, + { + "diff_generated": -65.55160522460938, + "epoch": 1.6299416720674011, + "grad_norm": 476.173100293017, + "learning_rate": 4.093500242024637e-07, + "logits/chosen": -2.522347927093506, + "logits/rejected": -2.502315044403076, + "logps/chosen": -17.50808334350586, + "logps/rejected": -147.79000854492188, + "loss": 17.3038, + "losses_ref": -0.0009514664998278022, + "ref_logps/chosen": -97.16732025146484, + "ref_logps/rejected": -82.23841857910156, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 79.65924072265625, + "rewards/margins": 145.21084594726562, + "rewards/rejected": -65.55160522460938, + "step": 5030, + "u": -4.321379661560059, + "weight": 0.05629762262105942 + }, + { + "diff_generated": -67.8985366821289, + "epoch": 1.6331821127673365, + "grad_norm": 481.04827543949136, + "learning_rate": 4.0784216761812044e-07, + "logits/chosen": -2.4526283740997314, + "logits/rejected": -2.4655888080596924, + "logps/chosen": -16.992067337036133, + "logps/rejected": -149.39520263671875, + "loss": 17.1386, + "losses_ref": -0.0008671922842040658, + "ref_logps/chosen": -96.89338684082031, + "ref_logps/rejected": -81.49665832519531, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 79.90131378173828, + "rewards/margins": 147.7998504638672, + "rewards/rejected": -67.8985366821289, + "step": 5040, + "u": -4.304007530212402, + "weight": 0.056291352957487106 + }, + { + "diff_generated": -70.22386169433594, + "epoch": 1.6364225534672716, + "grad_norm": 502.71013582975803, + "learning_rate": 4.063341995435427e-07, + "logits/chosen": -2.421682834625244, + "logits/rejected": -2.5176949501037598, + "logps/chosen": -15.038667678833008, + "logps/rejected": -155.9605255126953, + "loss": 16.6153, + "losses_ref": -8.21582180066116e-09, + "ref_logps/chosen": -90.36064147949219, + "ref_logps/rejected": -85.73666381835938, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 75.32197570800781, + "rewards/margins": 145.5458221435547, + "rewards/rejected": -70.22386169433594, + "step": 5050, + "u": -4.340941429138184, + "weight": 0.04374999925494194 + }, + { + "diff_generated": -67.02069854736328, + "epoch": 1.6396629941672067, + "grad_norm": 464.9741379478892, + "learning_rate": 4.048261414171544e-07, + "logits/chosen": -2.474191665649414, + "logits/rejected": -2.5061898231506348, + "logps/chosen": -16.692886352539062, + "logps/rejected": -148.44381713867188, + "loss": 17.3252, + "losses_ref": -0.00026513769989833236, + "ref_logps/chosen": -95.00669860839844, + "ref_logps/rejected": -81.42312622070312, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 78.31381225585938, + "rewards/margins": 145.33450317382812, + "rewards/rejected": -67.02069854736328, + "step": 5060, + "u": -4.157334804534912, + "weight": 0.08751135319471359 + }, + { + "diff_generated": -68.2977523803711, + "epoch": 1.6429034348671419, + "grad_norm": 441.3001977518817, + "learning_rate": 4.0331801467865967e-07, + "logits/chosen": -2.472256898880005, + "logits/rejected": -2.558577299118042, + "logps/chosen": -16.66042137145996, + "logps/rejected": -149.11135864257812, + "loss": 17.3723, + "losses_ref": -5.643848635372706e-05, + "ref_logps/chosen": -93.59695434570312, + "ref_logps/rejected": -80.81361389160156, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 76.93653869628906, + "rewards/margins": 145.2342987060547, + "rewards/rejected": -68.2977523803711, + "step": 5070, + "u": -4.28251314163208, + "weight": 0.05000165104866028 + }, + { + "diff_generated": -71.2527847290039, + "epoch": 1.646143875567077, + "grad_norm": 463.35015092911704, + "learning_rate": 4.0180984076873833e-07, + "logits/chosen": -2.3988022804260254, + "logits/rejected": -2.5074386596679688, + "logps/chosen": -17.2117977142334, + "logps/rejected": -158.65309143066406, + "loss": 17.5038, + "losses_ref": -0.0013229569885879755, + "ref_logps/chosen": -94.10649871826172, + "ref_logps/rejected": -87.40029907226562, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 76.89469909667969, + "rewards/margins": 148.14749145507812, + "rewards/rejected": -71.2527847290039, + "step": 5080, + "u": -4.235500812530518, + "weight": 0.06881176680326462 + }, + { + "diff_generated": -73.563232421875, + "epoch": 1.6493843162670123, + "grad_norm": 485.7493691528965, + "learning_rate": 4.003016411287407e-07, + "logits/chosen": -2.4517648220062256, + "logits/rejected": -2.606292486190796, + "logps/chosen": -16.88274383544922, + "logps/rejected": -167.32257080078125, + "loss": 17.0329, + "losses_ref": -7.952481610118411e-06, + "ref_logps/chosen": -96.70402526855469, + "ref_logps/rejected": -93.75936126708984, + "rewards/accuracies": 0.96875, + "rewards/chosen": 79.82127380371094, + "rewards/margins": 153.38449096679688, + "rewards/rejected": -73.563232421875, + "step": 5090, + "u": -4.407740116119385, + "weight": 0.03125026077032089 + }, + { + "diff_generated": -70.79591369628906, + "epoch": 1.6526247569669477, + "grad_norm": 432.09735279072305, + "learning_rate": 3.9879343720038276e-07, + "logits/chosen": -2.442922830581665, + "logits/rejected": -2.5303492546081543, + "logps/chosen": -16.87449073791504, + "logps/rejected": -161.27662658691406, + "loss": 17.6718, + "losses_ref": -7.192376187958871e-07, + "ref_logps/chosen": -95.96610260009766, + "ref_logps/rejected": -90.48070526123047, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 79.09161376953125, + "rewards/margins": 149.8875274658203, + "rewards/rejected": -70.79591369628906, + "step": 5100, + "u": -4.367244720458984, + "weight": 0.04375002533197403 + }, + { + "diff_generated": -71.23027038574219, + "epoch": 1.6558651976668828, + "grad_norm": 431.98988316072837, + "learning_rate": 3.972852504254415e-07, + "logits/chosen": -2.3836681842803955, + "logits/rejected": -2.535038471221924, + "logps/chosen": -16.863073348999023, + "logps/rejected": -156.7201690673828, + "loss": 16.4406, + "losses_ref": -6.091965474297467e-07, + "ref_logps/chosen": -91.42034149169922, + "ref_logps/rejected": -85.48990631103516, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 74.5572738647461, + "rewards/margins": 145.78753662109375, + "rewards/rejected": -71.23027038574219, + "step": 5110, + "u": -4.2270708084106445, + "weight": 0.06875001639127731 + }, + { + "diff_generated": -71.12030792236328, + "epoch": 1.659105638366818, + "grad_norm": 460.07962332170194, + "learning_rate": 3.9577710224545033e-07, + "logits/chosen": -2.4391770362854004, + "logits/rejected": -2.569916248321533, + "logps/chosen": -18.38995361328125, + "logps/rejected": -157.92996215820312, + "loss": 17.1801, + "losses_ref": -0.004063854459673166, + "ref_logps/chosen": -96.03662872314453, + "ref_logps/rejected": -86.80965423583984, + "rewards/accuracies": 0.96875, + "rewards/chosen": 77.64668273925781, + "rewards/margins": 148.76698303222656, + "rewards/rejected": -71.12030792236328, + "step": 5120, + "u": -4.407456398010254, + "weight": 0.031440265476703644 + }, + { + "diff_generated": -69.2224349975586, + "epoch": 1.662346079066753, + "grad_norm": 447.29819080459765, + "learning_rate": 3.9426901410139346e-07, + "logits/chosen": -2.468127727508545, + "logits/rejected": -2.4933931827545166, + "logps/chosen": -19.3382625579834, + "logps/rejected": -158.279052734375, + "loss": 17.0487, + "losses_ref": -0.004990004934370518, + "ref_logps/chosen": -102.7806625366211, + "ref_logps/rejected": -89.0566177368164, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 83.44239807128906, + "rewards/margins": 152.66485595703125, + "rewards/rejected": -69.2224349975586, + "step": 5130, + "u": -4.480766296386719, + "weight": 0.012752932496368885 + }, + { + "diff_generated": -73.63088989257812, + "epoch": 1.6655865197666881, + "grad_norm": 470.50878956666423, + "learning_rate": 3.9276100743340217e-07, + "logits/chosen": -2.5297019481658936, + "logits/rejected": -2.5885841846466064, + "logps/chosen": -17.93415641784668, + "logps/rejected": -163.42138671875, + "loss": 17.1159, + "losses_ref": -0.0012631936697289348, + "ref_logps/chosen": -101.6262435913086, + "ref_logps/rejected": -89.7905044555664, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 83.69209289550781, + "rewards/margins": 157.32296752929688, + "rewards/rejected": -73.63088989257812, + "step": 5140, + "u": -4.47725772857666, + "weight": 0.012562265619635582 + }, + { + "diff_generated": -71.59645080566406, + "epoch": 1.6688269604666235, + "grad_norm": 440.79440036964496, + "learning_rate": 3.9125310368044877e-07, + "logits/chosen": -2.407548189163208, + "logits/rejected": -2.506441354751587, + "logps/chosen": -16.486774444580078, + "logps/rejected": -157.2587127685547, + "loss": 16.7643, + "losses_ref": -0.0005869531887583435, + "ref_logps/chosen": -94.74775695800781, + "ref_logps/rejected": -85.6622543334961, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 78.26097869873047, + "rewards/margins": 149.85743713378906, + "rewards/rejected": -71.59645080566406, + "step": 5150, + "u": -4.3845953941345215, + "weight": 0.03752673789858818 + }, + { + "diff_generated": -69.29833984375, + "epoch": 1.6720674011665586, + "grad_norm": 492.35900604525534, + "learning_rate": 3.8974532428004305e-07, + "logits/chosen": -2.4077913761138916, + "logits/rejected": -2.4718546867370605, + "logps/chosen": -17.270605087280273, + "logps/rejected": -155.9961395263672, + "loss": 17.1053, + "losses_ref": -0.00144083215855062, + "ref_logps/chosen": -98.63170623779297, + "ref_logps/rejected": -86.69779205322266, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 81.36109924316406, + "rewards/margins": 150.65945434570312, + "rewards/rejected": -69.29833984375, + "step": 5160, + "u": -4.301863193511963, + "weight": 0.056316327303647995 + }, + { + "diff_generated": -67.48606872558594, + "epoch": 1.675307841866494, + "grad_norm": 431.33522497944324, + "learning_rate": 3.8823769066792643e-07, + "logits/chosen": -2.4253952503204346, + "logits/rejected": -2.5132360458374023, + "logps/chosen": -17.439252853393555, + "logps/rejected": -148.45159912109375, + "loss": 17.2726, + "losses_ref": -0.00045064339064992964, + "ref_logps/chosen": -91.80262756347656, + "ref_logps/rejected": -80.96551513671875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 74.36337280273438, + "rewards/margins": 141.8494415283203, + "rewards/rejected": -67.48606872558594, + "step": 5170, + "u": -4.298040866851807, + "weight": 0.062519371509552 + }, + { + "diff_generated": -75.08834838867188, + "epoch": 1.678548282566429, + "grad_norm": 473.1830697348528, + "learning_rate": 3.867302242777681e-07, + "logits/chosen": -2.4858357906341553, + "logits/rejected": -2.6244637966156006, + "logps/chosen": -16.95624542236328, + "logps/rejected": -168.19943237304688, + "loss": 17.1289, + "losses_ref": -1.8262624745801759e-09, + "ref_logps/chosen": -98.0115966796875, + "ref_logps/rejected": -93.11107635498047, + "rewards/accuracies": 0.96875, + "rewards/chosen": 81.05535125732422, + "rewards/margins": 156.14369201660156, + "rewards/rejected": -75.08834838867188, + "step": 5180, + "u": -4.44937801361084, + "weight": 0.03125 + }, + { + "diff_generated": -69.98115539550781, + "epoch": 1.6817887232663642, + "grad_norm": 457.77313240740835, + "learning_rate": 3.852229465408597e-07, + "logits/chosen": -2.409611225128174, + "logits/rejected": -2.5834403038024902, + "logps/chosen": -17.508363723754883, + "logps/rejected": -156.28579711914062, + "loss": 17.3377, + "losses_ref": -4.212985368212685e-05, + "ref_logps/chosen": -92.58866882324219, + "ref_logps/rejected": -86.30463409423828, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 75.08030700683594, + "rewards/margins": 145.0614776611328, + "rewards/rejected": -69.98115539550781, + "step": 5190, + "u": -4.236456871032715, + "weight": 0.07500100880861282 + }, + { + "diff_generated": -69.71676635742188, + "epoch": 1.6850291639662993, + "grad_norm": 460.02618445520545, + "learning_rate": 3.8371587888581067e-07, + "logits/chosen": -2.4338431358337402, + "logits/rejected": -2.5173604488372803, + "logps/chosen": -17.64065933227539, + "logps/rejected": -157.76220703125, + "loss": 17.0008, + "losses_ref": -0.00027629570104181767, + "ref_logps/chosen": -99.22086334228516, + "ref_logps/rejected": -88.0454330444336, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 81.58020782470703, + "rewards/margins": 151.29696655273438, + "rewards/rejected": -69.71676635742188, + "step": 5200, + "u": -4.450808525085449, + "weight": 0.025009319186210632 + }, + { + "diff_generated": -69.77263641357422, + "epoch": 1.6882696046662347, + "grad_norm": 468.22422746219297, + "learning_rate": 3.822090427382442e-07, + "logits/chosen": -2.45259428024292, + "logits/rejected": -2.4920616149902344, + "logps/chosen": -17.049236297607422, + "logps/rejected": -156.7518310546875, + "loss": 16.9809, + "losses_ref": -0.18010127544403076, + "ref_logps/chosen": -93.86769104003906, + "ref_logps/rejected": -86.97917175292969, + "rewards/accuracies": 0.9375, + "rewards/chosen": 76.81845092773438, + "rewards/margins": 146.59109497070312, + "rewards/rejected": -69.77263641357422, + "step": 5210, + "u": -4.296868324279785, + "weight": 0.06403098255395889 + }, + { + "diff_generated": -69.26919555664062, + "epoch": 1.6915100453661698, + "grad_norm": 454.43662699303434, + "learning_rate": 3.807024595204916e-07, + "logits/chosen": -2.4401068687438965, + "logits/rejected": -2.4879443645477295, + "logps/chosen": -16.294349670410156, + "logps/rejected": -153.8958740234375, + "loss": 17.3555, + "losses_ref": -1.8101005707649165e-06, + "ref_logps/chosen": -94.20816802978516, + "ref_logps/rejected": -84.62667846679688, + "rewards/accuracies": 0.96875, + "rewards/chosen": 77.91381072998047, + "rewards/margins": 147.18301391601562, + "rewards/rejected": -69.26919555664062, + "step": 5220, + "u": -4.433684825897217, + "weight": 0.03125007078051567 + }, + { + "diff_generated": -71.1961669921875, + "epoch": 1.6947504860661051, + "grad_norm": 470.61477092459324, + "learning_rate": 3.7919615065128905e-07, + "logits/chosen": -2.5041403770446777, + "logits/rejected": -2.558800458908081, + "logps/chosen": -17.792478561401367, + "logps/rejected": -157.79530334472656, + "loss": 17.7305, + "losses_ref": -0.06184719130396843, + "ref_logps/chosen": -100.58880615234375, + "ref_logps/rejected": -86.59913635253906, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 82.79631805419922, + "rewards/margins": 153.99249267578125, + "rewards/rejected": -71.1961669921875, + "step": 5230, + "u": -4.348549842834473, + "weight": 0.044755056500434875 + }, + { + "diff_generated": -65.75114440917969, + "epoch": 1.6979909267660402, + "grad_norm": 453.03870378109104, + "learning_rate": 3.7769013754547155e-07, + "logits/chosen": -2.4477427005767822, + "logits/rejected": -2.5033042430877686, + "logps/chosen": -17.491987228393555, + "logps/rejected": -150.37059020996094, + "loss": 16.7332, + "losses_ref": -0.00048043514834716916, + "ref_logps/chosen": -97.56319427490234, + "ref_logps/rejected": -84.61946105957031, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 80.07121276855469, + "rewards/margins": 145.82237243652344, + "rewards/rejected": -65.75114440917969, + "step": 5240, + "u": -4.236319541931152, + "weight": 0.07501258701086044 + }, + { + "diff_generated": -69.01460266113281, + "epoch": 1.7012313674659754, + "grad_norm": 464.04588651328675, + "learning_rate": 3.761844416136701e-07, + "logits/chosen": -2.454002618789673, + "logits/rejected": -2.554508686065674, + "logps/chosen": -17.00436782836914, + "logps/rejected": -152.9644317626953, + "loss": 16.5869, + "losses_ref": -0.0029619138222187757, + "ref_logps/chosen": -94.6144790649414, + "ref_logps/rejected": -83.9498291015625, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 77.610107421875, + "rewards/margins": 146.62472534179688, + "rewards/rejected": -69.01460266113281, + "step": 5250, + "u": -4.372044086456299, + "weight": 0.043896518647670746 + }, + { + "diff_generated": -68.21833801269531, + "epoch": 1.7044718081659105, + "grad_norm": 417.2026837693391, + "learning_rate": 3.746790842620059e-07, + "logits/chosen": -2.42793869972229, + "logits/rejected": -2.5127129554748535, + "logps/chosen": -15.581718444824219, + "logps/rejected": -150.99046325683594, + "loss": 17.0069, + "losses_ref": -0.0004940610378980637, + "ref_logps/chosen": -89.71794128417969, + "ref_logps/rejected": -82.77213287353516, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 74.13622283935547, + "rewards/margins": 142.35458374023438, + "rewards/rejected": -68.21833801269531, + "step": 5260, + "u": -4.313251495361328, + "weight": 0.05625907704234123 + }, + { + "diff_generated": -68.33058166503906, + "epoch": 1.7077122488658456, + "grad_norm": 457.1306327953638, + "learning_rate": 3.731740868917872e-07, + "logits/chosen": -2.3772644996643066, + "logits/rejected": -2.4986536502838135, + "logps/chosen": -17.534826278686523, + "logps/rejected": -153.14242553710938, + "loss": 17.5256, + "losses_ref": -2.465140980234537e-08, + "ref_logps/chosen": -91.63734436035156, + "ref_logps/rejected": -84.81184387207031, + "rewards/accuracies": 0.9375, + "rewards/chosen": 74.10250854492188, + "rewards/margins": 142.43309020996094, + "rewards/rejected": -68.33058166503906, + "step": 5270, + "u": -4.248973846435547, + "weight": 0.0625 + }, + { + "diff_generated": -70.68391418457031, + "epoch": 1.710952689565781, + "grad_norm": 493.4429615429504, + "learning_rate": 3.716694708992039e-07, + "logits/chosen": -2.459246873855591, + "logits/rejected": -2.5305166244506836, + "logps/chosen": -17.377029418945312, + "logps/rejected": -158.16543579101562, + "loss": 17.2543, + "losses_ref": -0.00031063079950399697, + "ref_logps/chosen": -97.60765075683594, + "ref_logps/rejected": -87.48152160644531, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 80.23062896728516, + "rewards/margins": 150.91453552246094, + "rewards/rejected": -70.68391418457031, + "step": 5280, + "u": -4.373373031616211, + "weight": 0.03751382231712341 + }, + { + "diff_generated": -72.90049743652344, + "epoch": 1.7141931302657163, + "grad_norm": 487.7074986433388, + "learning_rate": 3.701652576750242e-07, + "logits/chosen": -2.4398880004882812, + "logits/rejected": -2.562415838241577, + "logps/chosen": -16.305221557617188, + "logps/rejected": -161.3070831298828, + "loss": 17.379, + "losses_ref": -2.912343006755691e-07, + "ref_logps/chosen": -93.390625, + "ref_logps/rejected": -88.40657043457031, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 77.08540344238281, + "rewards/margins": 149.9859161376953, + "rewards/rejected": -72.90049743652344, + "step": 5290, + "u": -4.360350608825684, + "weight": 0.050000011920928955 + }, + { + "diff_generated": -70.36927795410156, + "epoch": 1.7174335709656514, + "grad_norm": 480.9713570803477, + "learning_rate": 3.686614686042906e-07, + "logits/chosen": -2.4388208389282227, + "logits/rejected": -2.545295238494873, + "logps/chosen": -16.006649017333984, + "logps/rejected": -155.7036590576172, + "loss": 17.2565, + "losses_ref": -0.000997414463199675, + "ref_logps/chosen": -96.17381286621094, + "ref_logps/rejected": -85.33439636230469, + "rewards/accuracies": 0.96875, + "rewards/chosen": 80.16717529296875, + "rewards/margins": 150.53643798828125, + "rewards/rejected": -70.36927795410156, + "step": 5300, + "u": -4.416836261749268, + "weight": 0.031297024339437485 + }, + { + "diff_generated": -72.0317611694336, + "epoch": 1.7206740116655865, + "grad_norm": 438.97836563138696, + "learning_rate": 3.6715812506601493e-07, + "logits/chosen": -2.4332687854766846, + "logits/rejected": -2.4583961963653564, + "logps/chosen": -17.069910049438477, + "logps/rejected": -159.6905059814453, + "loss": 16.7191, + "losses_ref": -0.00018645053205545992, + "ref_logps/chosen": -100.94898986816406, + "ref_logps/rejected": -87.65870666503906, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 83.87908172607422, + "rewards/margins": 155.91085815429688, + "rewards/rejected": -72.0317611694336, + "step": 5310, + "u": -4.3903889656066895, + "weight": 0.03750806301832199 + }, + { + "diff_generated": -67.04124450683594, + "epoch": 1.7239144523655217, + "grad_norm": 475.4394366078291, + "learning_rate": 3.6565524843287526e-07, + "logits/chosen": -2.452981472015381, + "logits/rejected": -2.5339856147766113, + "logps/chosen": -15.451299667358398, + "logps/rejected": -148.46969604492188, + "loss": 16.5968, + "losses_ref": -1.6524964507880213e-07, + "ref_logps/chosen": -92.11998748779297, + "ref_logps/rejected": -81.42845916748047, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 76.66868591308594, + "rewards/margins": 143.70993041992188, + "rewards/rejected": -67.04124450683594, + "step": 5320, + "u": -4.24575138092041, + "weight": 0.07500000298023224 + }, + { + "diff_generated": -71.4760971069336, + "epoch": 1.7271548930654568, + "grad_norm": 440.991290221704, + "learning_rate": 3.641528600709115e-07, + "logits/chosen": -2.4461655616760254, + "logits/rejected": -2.5163216590881348, + "logps/chosen": -17.627792358398438, + "logps/rejected": -157.37338256835938, + "loss": 16.9069, + "losses_ref": -0.0048348382115364075, + "ref_logps/chosen": -96.23738098144531, + "ref_logps/rejected": -85.89727783203125, + "rewards/accuracies": 0.9375, + "rewards/chosen": 78.6095962524414, + "rewards/margins": 150.08567810058594, + "rewards/rejected": -71.4760971069336, + "step": 5330, + "u": -4.264947891235352, + "weight": 0.06274246424436569 + }, + { + "diff_generated": -65.23731231689453, + "epoch": 1.7303953337653921, + "grad_norm": 460.9102888832523, + "learning_rate": 3.6265098133922277e-07, + "logits/chosen": -2.477548360824585, + "logits/rejected": -2.515925168991089, + "logps/chosen": -15.422185897827148, + "logps/rejected": -144.16061401367188, + "loss": 16.4254, + "losses_ref": -0.008267196826636791, + "ref_logps/chosen": -90.59266662597656, + "ref_logps/rejected": -78.92329406738281, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 75.17047119140625, + "rewards/margins": 140.40777587890625, + "rewards/rejected": -65.23731231689453, + "step": 5340, + "u": -4.144270896911621, + "weight": 0.10040758550167084 + }, + { + "diff_generated": -70.13384246826172, + "epoch": 1.7336357744653272, + "grad_norm": 436.0160106234523, + "learning_rate": 3.611496335896617e-07, + "logits/chosen": -2.443701982498169, + "logits/rejected": -2.561774730682373, + "logps/chosen": -15.86700439453125, + "logps/rejected": -157.41140747070312, + "loss": 17.0001, + "losses_ref": -1.2482225429266691e-05, + "ref_logps/chosen": -93.91472625732422, + "ref_logps/rejected": -87.27755737304688, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 78.0477294921875, + "rewards/margins": 148.1815643310547, + "rewards/rejected": -70.13384246826172, + "step": 5350, + "u": -4.3748579025268555, + "weight": 0.0500003807246685 + }, + { + "diff_generated": -70.87732696533203, + "epoch": 1.7368762151652626, + "grad_norm": 459.22489703370104, + "learning_rate": 3.59648838166533e-07, + "logits/chosen": -2.4528727531433105, + "logits/rejected": -2.5678493976593018, + "logps/chosen": -17.44341278076172, + "logps/rejected": -158.6827392578125, + "loss": 17.2521, + "losses_ref": -5.317440923136019e-07, + "ref_logps/chosen": -95.00345611572266, + "ref_logps/rejected": -87.80540466308594, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 77.56005096435547, + "rewards/margins": 148.43736267089844, + "rewards/rejected": -70.87732696533203, + "step": 5360, + "u": -4.344834804534912, + "weight": 0.04375001788139343 + }, + { + "diff_generated": -66.20857238769531, + "epoch": 1.7401166558651977, + "grad_norm": 492.27639823140424, + "learning_rate": 3.5814861640628864e-07, + "logits/chosen": -2.3951048851013184, + "logits/rejected": -2.4831387996673584, + "logps/chosen": -18.14093017578125, + "logps/rejected": -149.3103790283203, + "loss": 17.3597, + "losses_ref": -7.648421274097927e-07, + "ref_logps/chosen": -91.5876693725586, + "ref_logps/rejected": -83.10181427001953, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 73.44673919677734, + "rewards/margins": 139.65530395507812, + "rewards/rejected": -66.20857238769531, + "step": 5370, + "u": -4.144860744476318, + "weight": 0.08750001341104507 + }, + { + "diff_generated": -71.12236022949219, + "epoch": 1.7433570965651328, + "grad_norm": 415.92824723503844, + "learning_rate": 3.5664898963722526e-07, + "logits/chosen": -2.3865838050842285, + "logits/rejected": -2.513049364089966, + "logps/chosen": -17.46709632873535, + "logps/rejected": -156.9357452392578, + "loss": 16.5102, + "losses_ref": -1.2130412230249021e-08, + "ref_logps/chosen": -93.4074478149414, + "ref_logps/rejected": -85.81340026855469, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 75.94035339355469, + "rewards/margins": 147.06271362304688, + "rewards/rejected": -71.12236022949219, + "step": 5380, + "u": -4.3460259437561035, + "weight": 0.05000000074505806 + }, + { + "diff_generated": -67.88063049316406, + "epoch": 1.746597537265068, + "grad_norm": 447.0975280429844, + "learning_rate": 3.5514997917918016e-07, + "logits/chosen": -2.412515640258789, + "logits/rejected": -2.53787899017334, + "logps/chosen": -14.890464782714844, + "logps/rejected": -149.828125, + "loss": 15.9397, + "losses_ref": -2.935139242765672e-08, + "ref_logps/chosen": -91.47549438476562, + "ref_logps/rejected": -81.94749450683594, + "rewards/accuracies": 0.9375, + "rewards/chosen": 76.58503723144531, + "rewards/margins": 144.46566772460938, + "rewards/rejected": -67.88063049316406, + "step": 5390, + "u": -4.2765278816223145, + "weight": 0.0625 + }, + { + "diff_generated": -72.81483459472656, + "epoch": 1.7498379779650033, + "grad_norm": 429.0403541723934, + "learning_rate": 3.536516063432293e-07, + "logits/chosen": -2.4231066703796387, + "logits/rejected": -2.5467000007629395, + "logps/chosen": -16.429943084716797, + "logps/rejected": -159.30337524414062, + "loss": 16.9469, + "losses_ref": -0.0024499078281223774, + "ref_logps/chosen": -93.90434265136719, + "ref_logps/rejected": -86.48854064941406, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 77.47440338134766, + "rewards/margins": 150.28924560546875, + "rewards/rejected": -72.81483459472656, + "step": 5400, + "u": -4.391019344329834, + "weight": 0.03762707859277725 + }, + { + "diff_generated": -71.17898559570312, + "epoch": 1.7530784186649384, + "grad_norm": 426.8218423325292, + "learning_rate": 3.5215389243138326e-07, + "logits/chosen": -2.41701340675354, + "logits/rejected": -2.485673189163208, + "logps/chosen": -19.6455135345459, + "logps/rejected": -159.58499145507812, + "loss": 16.824, + "losses_ref": -0.0006879680440761149, + "ref_logps/chosen": -102.19815063476562, + "ref_logps/rejected": -88.40601348876953, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 82.55262756347656, + "rewards/margins": 153.7316131591797, + "rewards/rejected": -71.17898559570312, + "step": 5410, + "u": -4.41085958480835, + "weight": 0.0375228226184845 + }, + { + "diff_generated": -74.72703552246094, + "epoch": 1.7563188593648738, + "grad_norm": 483.34852960013836, + "learning_rate": 3.50656858736285e-07, + "logits/chosen": -2.439666271209717, + "logits/rejected": -2.4910497665405273, + "logps/chosen": -17.57253646850586, + "logps/rejected": -167.9637451171875, + "loss": 16.7944, + "losses_ref": -0.00021042392472736537, + "ref_logps/chosen": -101.20845031738281, + "ref_logps/rejected": -93.2367172241211, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 83.63592529296875, + "rewards/margins": 158.3629608154297, + "rewards/rejected": -74.72703552246094, + "step": 5420, + "u": -4.3953704833984375, + "weight": 0.02500920556485653 + }, + { + "diff_generated": -69.79595947265625, + "epoch": 1.7595593000648089, + "grad_norm": 442.3153908664761, + "learning_rate": 3.491605265409073e-07, + "logits/chosen": -2.4521777629852295, + "logits/rejected": -2.4844508171081543, + "logps/chosen": -19.611408233642578, + "logps/rejected": -159.6874237060547, + "loss": 16.9916, + "losses_ref": -0.0011415036860853434, + "ref_logps/chosen": -99.37894439697266, + "ref_logps/rejected": -89.89147186279297, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 79.76752471923828, + "rewards/margins": 149.5634765625, + "rewards/rejected": -69.79595947265625, + "step": 5430, + "u": -4.332057952880859, + "weight": 0.05630398914217949 + }, + { + "diff_generated": -70.03108978271484, + "epoch": 1.762799740764744, + "grad_norm": 499.24694029734786, + "learning_rate": 3.4766491711824916e-07, + "logits/chosen": -2.3980777263641357, + "logits/rejected": -2.501035213470459, + "logps/chosen": -17.213281631469727, + "logps/rejected": -157.7755584716797, + "loss": 17.2019, + "losses_ref": -0.04993446543812752, + "ref_logps/chosen": -93.63558197021484, + "ref_logps/rejected": -87.74449157714844, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 76.42230224609375, + "rewards/margins": 146.45339965820312, + "rewards/rejected": -70.03108978271484, + "step": 5440, + "u": -4.326253414154053, + "weight": 0.05067465454339981 + }, + { + "diff_generated": -71.91776275634766, + "epoch": 1.7660401814646791, + "grad_norm": 449.3354623063938, + "learning_rate": 3.4617005173103497e-07, + "logits/chosen": -2.4744386672973633, + "logits/rejected": -2.538677930831909, + "logps/chosen": -17.06021499633789, + "logps/rejected": -158.7646942138672, + "loss": 16.6296, + "losses_ref": -0.0002994223905261606, + "ref_logps/chosen": -99.29143524169922, + "ref_logps/rejected": -86.84693908691406, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 82.2312240600586, + "rewards/margins": 154.1489715576172, + "rewards/rejected": -71.91776275634766, + "step": 5450, + "u": -4.445778846740723, + "weight": 0.025012975558638573 + }, + { + "diff_generated": -69.34452819824219, + "epoch": 1.7692806221646142, + "grad_norm": 420.9992020837075, + "learning_rate": 3.4467595163141056e-07, + "logits/chosen": -2.4221789836883545, + "logits/rejected": -2.531097888946533, + "logps/chosen": -16.535367965698242, + "logps/rejected": -154.95436096191406, + "loss": 16.9989, + "losses_ref": -8.049047755775973e-05, + "ref_logps/chosen": -93.34659576416016, + "ref_logps/rejected": -85.6098403930664, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 76.81124114990234, + "rewards/margins": 146.15576171875, + "rewards/rejected": -69.34452819824219, + "step": 5460, + "u": -4.314999580383301, + "weight": 0.056253306567668915 + }, + { + "diff_generated": -68.81937408447266, + "epoch": 1.7725210628645496, + "grad_norm": 460.36253791839636, + "learning_rate": 3.4318263806064244e-07, + "logits/chosen": -2.4252026081085205, + "logits/rejected": -2.4815948009490967, + "logps/chosen": -17.621421813964844, + "logps/rejected": -155.63185119628906, + "loss": 17.1712, + "losses_ref": -9.521203173790127e-05, + "ref_logps/chosen": -96.93167114257812, + "ref_logps/rejected": -86.81249237060547, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 79.31024932861328, + "rewards/margins": 148.12960815429688, + "rewards/rejected": -68.81937408447266, + "step": 5470, + "u": -4.36357307434082, + "weight": 0.04375128075480461 + }, + { + "diff_generated": -69.7218246459961, + "epoch": 1.775761503564485, + "grad_norm": 436.30995540960345, + "learning_rate": 3.4169013224881475e-07, + "logits/chosen": -2.473205089569092, + "logits/rejected": -2.5455079078674316, + "logps/chosen": -17.088150024414062, + "logps/rejected": -156.84713745117188, + "loss": 16.8389, + "losses_ref": -0.0009352788329124451, + "ref_logps/chosen": -96.74797058105469, + "ref_logps/rejected": -87.12532043457031, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 79.65982055664062, + "rewards/margins": 149.3816680908203, + "rewards/rejected": -69.7218246459961, + "step": 5480, + "u": -4.265133380889893, + "weight": 0.06879539787769318 + }, + { + "diff_generated": -65.35900115966797, + "epoch": 1.77900194426442, + "grad_norm": 480.0695222800333, + "learning_rate": 3.4019845541452844e-07, + "logits/chosen": -2.4091029167175293, + "logits/rejected": -2.4511494636535645, + "logps/chosen": -16.346660614013672, + "logps/rejected": -145.93875122070312, + "loss": 17.1195, + "losses_ref": -0.00016973615856841207, + "ref_logps/chosen": -91.6805191040039, + "ref_logps/rejected": -80.57975769042969, + "rewards/accuracies": 0.9375, + "rewards/chosen": 75.3338623046875, + "rewards/margins": 140.69284057617188, + "rewards/rejected": -65.35900115966797, + "step": 5490, + "u": -4.253424167633057, + "weight": 0.06250713765621185 + }, + { + "diff_generated": -68.89006805419922, + "epoch": 1.7822423849643552, + "grad_norm": 457.98814975416997, + "learning_rate": 3.387076287645985e-07, + "logits/chosen": -2.422269105911255, + "logits/rejected": -2.521350383758545, + "logps/chosen": -16.43721580505371, + "logps/rejected": -155.05738830566406, + "loss": 16.6865, + "losses_ref": -0.012262609787285328, + "ref_logps/chosen": -94.85432434082031, + "ref_logps/rejected": -86.16730499267578, + "rewards/accuracies": 0.9375, + "rewards/chosen": 78.41709899902344, + "rewards/margins": 147.3071746826172, + "rewards/rejected": -68.89006805419922, + "step": 5500, + "u": -4.225312232971191, + "weight": 0.06295279413461685 + }, + { + "diff_generated": -69.23516845703125, + "epoch": 1.7854828256642903, + "grad_norm": 413.9341227063301, + "learning_rate": 3.372176734937536e-07, + "logits/chosen": -2.376582145690918, + "logits/rejected": -2.510887622833252, + "logps/chosen": -15.711641311645508, + "logps/rejected": -158.05885314941406, + "loss": 16.1692, + "losses_ref": -3.3730197174008936e-05, + "ref_logps/chosen": -94.01397705078125, + "ref_logps/rejected": -88.82369232177734, + "rewards/accuracies": 0.9375, + "rewards/chosen": 78.30233001708984, + "rewards/margins": 147.53750610351562, + "rewards/rejected": -69.23516845703125, + "step": 5510, + "u": -4.285754203796387, + "weight": 0.06250113993883133 + }, + { + "diff_generated": -67.3664321899414, + "epoch": 1.7887232663642254, + "grad_norm": 467.5054402064834, + "learning_rate": 3.3572861078433376e-07, + "logits/chosen": -2.4425301551818848, + "logits/rejected": -2.4949724674224854, + "logps/chosen": -15.836771965026855, + "logps/rejected": -149.77706909179688, + "loss": 16.8381, + "losses_ref": -0.0006229934515431523, + "ref_logps/chosen": -92.31486511230469, + "ref_logps/rejected": -82.41062927246094, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 76.47808837890625, + "rewards/margins": 143.84451293945312, + "rewards/rejected": -67.3664321899414, + "step": 5520, + "u": -4.226956367492676, + "weight": 0.06877995282411575 + }, + { + "diff_generated": -68.57489013671875, + "epoch": 1.7919637070641607, + "grad_norm": 435.08875035945834, + "learning_rate": 3.3424046180599e-07, + "logits/chosen": -2.438937187194824, + "logits/rejected": -2.516026258468628, + "logps/chosen": -15.992448806762695, + "logps/rejected": -149.87245178222656, + "loss": 16.5741, + "losses_ref": -1.454621241236964e-07, + "ref_logps/chosen": -91.31656646728516, + "ref_logps/rejected": -81.29755401611328, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 75.32411193847656, + "rewards/margins": 143.8990020751953, + "rewards/rejected": -68.57489013671875, + "step": 5530, + "u": -4.198179244995117, + "weight": 0.08124999701976776 + }, + { + "diff_generated": -68.36991882324219, + "epoch": 1.7952041477640959, + "grad_norm": 470.5825818556089, + "learning_rate": 3.3275324771538273e-07, + "logits/chosen": -2.397613525390625, + "logits/rejected": -2.4743857383728027, + "logps/chosen": -16.997846603393555, + "logps/rejected": -155.87808227539062, + "loss": 16.447, + "losses_ref": -1.585126119607594e-05, + "ref_logps/chosen": -92.73638916015625, + "ref_logps/rejected": -87.50816345214844, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 75.7385482788086, + "rewards/margins": 144.1084442138672, + "rewards/rejected": -68.36991882324219, + "step": 5540, + "u": -4.300636291503906, + "weight": 0.056250762194395065 + }, + { + "diff_generated": -67.50434112548828, + "epoch": 1.7984445884640312, + "grad_norm": 434.58597111300475, + "learning_rate": 3.312669896558816e-07, + "logits/chosen": -2.470421314239502, + "logits/rejected": -2.513103485107422, + "logps/chosen": -16.30929183959961, + "logps/rejected": -154.2463836669922, + "loss": 16.3786, + "losses_ref": -0.0007042810902930796, + "ref_logps/chosen": -96.81012725830078, + "ref_logps/rejected": -86.74208068847656, + "rewards/accuracies": 0.9375, + "rewards/chosen": 80.5008316040039, + "rewards/margins": 148.0051727294922, + "rewards/rejected": -67.50434112548828, + "step": 5550, + "u": -4.243407249450684, + "weight": 0.06253115087747574 + }, + { + "diff_generated": -69.06336212158203, + "epoch": 1.8016850291639663, + "grad_norm": 499.72614956659856, + "learning_rate": 3.2978170875726454e-07, + "logits/chosen": -2.4539406299591064, + "logits/rejected": -2.540009021759033, + "logps/chosen": -14.689851760864258, + "logps/rejected": -154.6982879638672, + "loss": 16.1144, + "losses_ref": -8.424254183125868e-09, + "ref_logps/chosen": -92.51824188232422, + "ref_logps/rejected": -85.6349105834961, + "rewards/accuracies": 0.9375, + "rewards/chosen": 77.8283920288086, + "rewards/margins": 146.89175415039062, + "rewards/rejected": -69.06336212158203, + "step": 5560, + "u": -4.2520318031311035, + "weight": 0.0625 + }, + { + "diff_generated": -69.83843994140625, + "epoch": 1.8049254698639015, + "grad_norm": 460.6886443876183, + "learning_rate": 3.2829742613541704e-07, + "logits/chosen": -2.3935115337371826, + "logits/rejected": -2.545502185821533, + "logps/chosen": -17.24357795715332, + "logps/rejected": -160.28634643554688, + "loss": 16.6144, + "losses_ref": -0.0011852236930280924, + "ref_logps/chosen": -94.0816650390625, + "ref_logps/rejected": -90.4478988647461, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 76.83808898925781, + "rewards/margins": 146.67652893066406, + "rewards/rejected": -69.83843994140625, + "step": 5570, + "u": -4.173181533813477, + "weight": 0.0750543624162674 + }, + { + "diff_generated": -68.1365966796875, + "epoch": 1.8081659105638366, + "grad_norm": 457.9151207111868, + "learning_rate": 3.26814162892033e-07, + "logits/chosen": -2.4605672359466553, + "logits/rejected": -2.5549590587615967, + "logps/chosen": -17.256122589111328, + "logps/rejected": -154.7444610595703, + "loss": 16.5853, + "losses_ref": -6.146209670987446e-06, + "ref_logps/chosen": -98.32067108154297, + "ref_logps/rejected": -86.60784912109375, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 81.06455993652344, + "rewards/margins": 149.20115661621094, + "rewards/rejected": -68.1365966796875, + "step": 5580, + "u": -4.302131652832031, + "weight": 0.05625029653310776 + }, + { + "diff_generated": -68.90168762207031, + "epoch": 1.811406351263772, + "grad_norm": 402.7850425533113, + "learning_rate": 3.2533194011431346e-07, + "logits/chosen": -2.439361572265625, + "logits/rejected": -2.5364279747009277, + "logps/chosen": -15.55030632019043, + "logps/rejected": -153.10989379882812, + "loss": 16.0603, + "losses_ref": -2.9312252181057374e-08, + "ref_logps/chosen": -92.38101196289062, + "ref_logps/rejected": -84.20820617675781, + "rewards/accuracies": 0.9375, + "rewards/chosen": 76.8307113647461, + "rewards/margins": 145.73239135742188, + "rewards/rejected": -68.90168762207031, + "step": 5590, + "u": -4.271599292755127, + "weight": 0.0625 + }, + { + "diff_generated": -72.8712387084961, + "epoch": 1.814646791963707, + "grad_norm": 448.7536562243639, + "learning_rate": 3.2385077887466766e-07, + "logits/chosen": -2.4683947563171387, + "logits/rejected": -2.58646821975708, + "logps/chosen": -16.766773223876953, + "logps/rejected": -164.80836486816406, + "loss": 16.4598, + "losses_ref": -0.0009516210993751884, + "ref_logps/chosen": -97.44658660888672, + "ref_logps/rejected": -91.93711853027344, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 80.67982482910156, + "rewards/margins": 153.55105590820312, + "rewards/rejected": -72.8712387084961, + "step": 5600, + "u": -4.3830766677856445, + "weight": 0.03754193335771561 + }, + { + "diff_generated": -69.22303771972656, + "epoch": 1.8178872326636424, + "grad_norm": 486.62192104812317, + "learning_rate": 3.223707002304131e-07, + "logits/chosen": -2.3914060592651367, + "logits/rejected": -2.5072906017303467, + "logps/chosen": -17.703372955322266, + "logps/rejected": -157.489501953125, + "loss": 17.4487, + "losses_ref": -0.000594664248637855, + "ref_logps/chosen": -90.99412536621094, + "ref_logps/rejected": -88.26644897460938, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 73.29075622558594, + "rewards/margins": 142.5137939453125, + "rewards/rejected": -69.22303771972656, + "step": 5610, + "u": -4.2054243087768555, + "weight": 0.07502665370702744 + }, + { + "diff_generated": -70.07087707519531, + "epoch": 1.8211276733635775, + "grad_norm": 458.51573979209115, + "learning_rate": 3.208917252234765e-07, + "logits/chosen": -2.414137601852417, + "logits/rejected": -2.5393738746643066, + "logps/chosen": -14.719442367553711, + "logps/rejected": -154.69467163085938, + "loss": 16.6353, + "losses_ref": -0.0017374107846990228, + "ref_logps/chosen": -91.6000747680664, + "ref_logps/rejected": -84.62378692626953, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 76.8806381225586, + "rewards/margins": 146.95150756835938, + "rewards/rejected": -70.07087707519531, + "step": 5620, + "u": -4.319277286529541, + "weight": 0.05008368566632271 + }, + { + "diff_generated": -67.59291076660156, + "epoch": 1.8243681140635126, + "grad_norm": 460.39703583819204, + "learning_rate": 3.1941387488009396e-07, + "logits/chosen": -2.426354169845581, + "logits/rejected": -2.5217723846435547, + "logps/chosen": -16.977571487426758, + "logps/rejected": -152.21470642089844, + "loss": 16.6692, + "losses_ref": -0.0006723630940541625, + "ref_logps/chosen": -95.56573486328125, + "ref_logps/rejected": -84.6218032836914, + "rewards/accuracies": 0.9375, + "rewards/chosen": 78.58818054199219, + "rewards/margins": 146.1810760498047, + "rewards/rejected": -67.59291076660156, + "step": 5630, + "u": -4.250881195068359, + "weight": 0.06253048777580261 + }, + { + "diff_generated": -69.4136962890625, + "epoch": 1.8276085547634477, + "grad_norm": 446.7409918562372, + "learning_rate": 3.179371702105132e-07, + "logits/chosen": -2.4898746013641357, + "logits/rejected": -2.5813000202178955, + "logps/chosen": -18.666357040405273, + "logps/rejected": -157.55398559570312, + "loss": 17.0778, + "losses_ref": -0.0025139835197478533, + "ref_logps/chosen": -100.08354187011719, + "ref_logps/rejected": -88.14030456542969, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 81.41717529296875, + "rewards/margins": 150.83087158203125, + "rewards/rejected": -69.4136962890625, + "step": 5640, + "u": -4.346514701843262, + "weight": 0.037541624158620834 + }, + { + "diff_generated": -68.67171478271484, + "epoch": 1.8308489954633829, + "grad_norm": 451.5965545928365, + "learning_rate": 3.164616322086936e-07, + "logits/chosen": -2.4457664489746094, + "logits/rejected": -2.5104620456695557, + "logps/chosen": -16.123414993286133, + "logps/rejected": -154.59475708007812, + "loss": 17.171, + "losses_ref": -7.124496903543331e-08, + "ref_logps/chosen": -95.08893585205078, + "ref_logps/rejected": -85.92303466796875, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 78.96553039550781, + "rewards/margins": 147.63723754882812, + "rewards/rejected": -68.67171478271484, + "step": 5650, + "u": -4.360419273376465, + "weight": 0.05000000074505806 + }, + { + "diff_generated": -75.24528503417969, + "epoch": 1.8340894361633182, + "grad_norm": 423.36652708511275, + "learning_rate": 3.1498728185200845e-07, + "logits/chosen": -2.484219551086426, + "logits/rejected": -2.571659564971924, + "logps/chosen": -18.681427001953125, + "logps/rejected": -167.66644287109375, + "loss": 16.7819, + "losses_ref": -0.0029082954861223698, + "ref_logps/chosen": -99.9460678100586, + "ref_logps/rejected": -92.42115020751953, + "rewards/accuracies": 0.96875, + "rewards/chosen": 81.2646484375, + "rewards/margins": 156.50994873046875, + "rewards/rejected": -75.24528503417969, + "step": 5660, + "u": -4.3646111488342285, + "weight": 0.03129839152097702 + }, + { + "diff_generated": -68.78709411621094, + "epoch": 1.8373298768632536, + "grad_norm": 454.5740812362012, + "learning_rate": 3.1351414010094683e-07, + "logits/chosen": -2.405937671661377, + "logits/rejected": -2.524935007095337, + "logps/chosen": -18.181316375732422, + "logps/rejected": -155.01504516601562, + "loss": 16.65, + "losses_ref": -6.4683889355876545e-09, + "ref_logps/chosen": -93.27643585205078, + "ref_logps/rejected": -86.22793579101562, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 75.09513092041016, + "rewards/margins": 143.88223266601562, + "rewards/rejected": -68.78709411621094, + "step": 5670, + "u": -4.331615447998047, + "weight": 0.04374999925494194 + }, + { + "diff_generated": -69.10374450683594, + "epoch": 1.8405703175631887, + "grad_norm": 434.643897722096, + "learning_rate": 3.120422278988149e-07, + "logits/chosen": -2.4068522453308105, + "logits/rejected": -2.507352590560913, + "logps/chosen": -17.3289852142334, + "logps/rejected": -155.24156188964844, + "loss": 17.0959, + "losses_ref": -5.715831503039226e-05, + "ref_logps/chosen": -95.69300842285156, + "ref_logps/rejected": -86.13782501220703, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 78.36402893066406, + "rewards/margins": 147.4677734375, + "rewards/rejected": -69.10374450683594, + "step": 5680, + "u": -4.363373756408691, + "weight": 0.04375145584344864 + }, + { + "diff_generated": -68.53547668457031, + "epoch": 1.8438107582631238, + "grad_norm": 459.7871284739823, + "learning_rate": 3.10571566171439e-07, + "logits/chosen": -2.435319185256958, + "logits/rejected": -2.5634605884552, + "logps/chosen": -17.82388687133789, + "logps/rejected": -155.66110229492188, + "loss": 17.2522, + "losses_ref": -0.0006606754614040256, + "ref_logps/chosen": -94.27436828613281, + "ref_logps/rejected": -87.12561798095703, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 76.45048522949219, + "rewards/margins": 144.98594665527344, + "rewards/rejected": -68.53547668457031, + "step": 5690, + "u": -4.3797197341918945, + "weight": 0.03752906247973442 + }, + { + "diff_generated": -68.64985656738281, + "epoch": 1.847051198963059, + "grad_norm": 427.58675584780747, + "learning_rate": 3.0910217582686756e-07, + "logits/chosen": -2.4144062995910645, + "logits/rejected": -2.5840725898742676, + "logps/chosen": -16.03522491455078, + "logps/rejected": -154.49697875976562, + "loss": 17.4557, + "losses_ref": -0.0017615113174542785, + "ref_logps/chosen": -88.2295150756836, + "ref_logps/rejected": -85.84712219238281, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 72.19429016113281, + "rewards/margins": 140.84414672851562, + "rewards/rejected": -68.64985656738281, + "step": 5700, + "u": -4.2154765129089355, + "weight": 0.06880569458007812 + }, + { + "diff_generated": -67.02249145507812, + "epoch": 1.850291639662994, + "grad_norm": 412.16964669146034, + "learning_rate": 3.0763407775507426e-07, + "logits/chosen": -2.4929986000061035, + "logits/rejected": -2.591066360473633, + "logps/chosen": -16.952259063720703, + "logps/rejected": -154.13525390625, + "loss": 16.801, + "losses_ref": -0.0003933070693165064, + "ref_logps/chosen": -93.57677459716797, + "ref_logps/rejected": -87.11276245117188, + "rewards/accuracies": 0.90625, + "rewards/chosen": 76.62451171875, + "rewards/margins": 143.64700317382812, + "rewards/rejected": -67.02249145507812, + "step": 5710, + "u": -4.159280300140381, + "weight": 0.09376726299524307 + }, + { + "diff_generated": -69.30915832519531, + "epoch": 1.8535320803629294, + "grad_norm": 427.6719015690686, + "learning_rate": 3.0616729282766037e-07, + "logits/chosen": -2.395036220550537, + "logits/rejected": -2.5346271991729736, + "logps/chosen": -15.799379348754883, + "logps/rejected": -152.8411865234375, + "loss": 17.1469, + "losses_ref": -8.902359738272025e-09, + "ref_logps/chosen": -90.04239654541016, + "ref_logps/rejected": -83.53202819824219, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 74.24299621582031, + "rewards/margins": 143.55215454101562, + "rewards/rejected": -69.30915832519531, + "step": 5720, + "u": -4.275652885437012, + "weight": 0.06875000149011612 + }, + { + "diff_generated": -66.54766845703125, + "epoch": 1.8567725210628645, + "grad_norm": 482.6223148458128, + "learning_rate": 3.047018418975593e-07, + "logits/chosen": -2.4510700702667236, + "logits/rejected": -2.4502556324005127, + "logps/chosen": -18.8032283782959, + "logps/rejected": -148.94503784179688, + "loss": 17.357, + "losses_ref": -0.0006291717290878296, + "ref_logps/chosen": -98.66024780273438, + "ref_logps/rejected": -82.39737701416016, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 79.85701751708984, + "rewards/margins": 146.40469360351562, + "rewards/rejected": -66.54766845703125, + "step": 5730, + "u": -4.319241523742676, + "weight": 0.05003003031015396 + }, + { + "diff_generated": -70.15180969238281, + "epoch": 1.8600129617627998, + "grad_norm": 436.60195450201667, + "learning_rate": 3.032377457987385e-07, + "logits/chosen": -2.4268975257873535, + "logits/rejected": -2.542205810546875, + "logps/chosen": -17.047893524169922, + "logps/rejected": -156.6510772705078, + "loss": 16.1966, + "losses_ref": -4.815445208805613e-06, + "ref_logps/chosen": -95.20332336425781, + "ref_logps/rejected": -86.49925231933594, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 78.15543365478516, + "rewards/margins": 148.30723571777344, + "rewards/rejected": -70.15180969238281, + "step": 5740, + "u": -4.337141990661621, + "weight": 0.0500001423060894 + }, + { + "diff_generated": -72.12767791748047, + "epoch": 1.863253402462735, + "grad_norm": 487.1011768414387, + "learning_rate": 3.017750253459048e-07, + "logits/chosen": -2.4596199989318848, + "logits/rejected": -2.529163360595703, + "logps/chosen": -18.66078758239746, + "logps/rejected": -162.290283203125, + "loss": 17.5329, + "losses_ref": -6.803600172133883e-07, + "ref_logps/chosen": -98.63432312011719, + "ref_logps/rejected": -90.16262817382812, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 79.9735336303711, + "rewards/margins": 152.10122680664062, + "rewards/rejected": -72.12767791748047, + "step": 5750, + "u": -4.4022650718688965, + "weight": 0.03750002756714821 + }, + { + "diff_generated": -72.43559265136719, + "epoch": 1.86649384316267, + "grad_norm": 437.2306151875033, + "learning_rate": 3.003137013342071e-07, + "logits/chosen": -2.48848819732666, + "logits/rejected": -2.634258508682251, + "logps/chosen": -15.372105598449707, + "logps/rejected": -163.10440063476562, + "loss": 16.7388, + "losses_ref": -0.0001623667194508016, + "ref_logps/chosen": -92.11798095703125, + "ref_logps/rejected": -90.66879272460938, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 76.7458724975586, + "rewards/margins": 149.1814727783203, + "rewards/rejected": -72.43559265136719, + "step": 5760, + "u": -4.388466835021973, + "weight": 0.04375699535012245 + }, + { + "diff_generated": -72.01509857177734, + "epoch": 1.8697342838626052, + "grad_norm": 456.3011398041365, + "learning_rate": 2.9885379453894224e-07, + "logits/chosen": -2.477651596069336, + "logits/rejected": -2.6130967140197754, + "logps/chosen": -15.400982856750488, + "logps/rejected": -157.45724487304688, + "loss": 16.5494, + "losses_ref": -0.0009191132267005742, + "ref_logps/chosen": -94.54651641845703, + "ref_logps/rejected": -85.44215393066406, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 79.1455307006836, + "rewards/margins": 151.16061401367188, + "rewards/rejected": -72.01509857177734, + "step": 5770, + "u": -4.335265159606934, + "weight": 0.05004185438156128 + }, + { + "diff_generated": -71.83040618896484, + "epoch": 1.8729747245625405, + "grad_norm": 453.99773682282233, + "learning_rate": 2.9739532571525806e-07, + "logits/chosen": -2.494704008102417, + "logits/rejected": -2.625767946243286, + "logps/chosen": -15.995404243469238, + "logps/rejected": -157.23162841796875, + "loss": 17.0838, + "losses_ref": -0.0009462740272283554, + "ref_logps/chosen": -95.97868347167969, + "ref_logps/rejected": -85.40122985839844, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 79.98329162597656, + "rewards/margins": 151.8136749267578, + "rewards/rejected": -71.83040618896484, + "step": 5780, + "u": -4.389662265777588, + "weight": 0.03754409775137901 + }, + { + "diff_generated": -68.6485366821289, + "epoch": 1.8762151652624757, + "grad_norm": 475.71899486903936, + "learning_rate": 2.959383155978596e-07, + "logits/chosen": -2.435675621032715, + "logits/rejected": -2.5268807411193848, + "logps/chosen": -15.956690788269043, + "logps/rejected": -156.03738403320312, + "loss": 17.3861, + "losses_ref": -3.328962702653371e-05, + "ref_logps/chosen": -95.65718078613281, + "ref_logps/rejected": -87.38885498046875, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 79.70049285888672, + "rewards/margins": 148.34902954101562, + "rewards/rejected": -68.6485366821289, + "step": 5790, + "u": -4.327332496643066, + "weight": 0.05000089854001999 + }, + { + "diff_generated": -67.33222198486328, + "epoch": 1.879455605962411, + "grad_norm": 458.83112990203347, + "learning_rate": 2.9448278490071373e-07, + "logits/chosen": -2.4544126987457275, + "logits/rejected": -2.554018020629883, + "logps/chosen": -16.464946746826172, + "logps/rejected": -151.57635498046875, + "loss": 17.5263, + "losses_ref": -1.2650698977267893e-07, + "ref_logps/chosen": -93.53096771240234, + "ref_logps/rejected": -84.24412536621094, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 77.06602478027344, + "rewards/margins": 144.39825439453125, + "rewards/rejected": -67.33222198486328, + "step": 5800, + "u": -4.192667484283447, + "weight": 0.07500000298023224 + }, + { + "diff_generated": -73.76513671875, + "epoch": 1.8826960466623461, + "grad_norm": 451.9329289107445, + "learning_rate": 2.930287543167544e-07, + "logits/chosen": -2.510129451751709, + "logits/rejected": -2.558194875717163, + "logps/chosen": -17.454477310180664, + "logps/rejected": -160.72348022460938, + "loss": 16.6293, + "losses_ref": -0.00039221724728122354, + "ref_logps/chosen": -104.30326080322266, + "ref_logps/rejected": -86.95834350585938, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 86.84878540039062, + "rewards/margins": 160.61392211914062, + "rewards/rejected": -73.76513671875, + "step": 5810, + "u": -4.528841018676758, + "weight": 0.006267140153795481 + }, + { + "diff_generated": -67.52911376953125, + "epoch": 1.8859364873622813, + "grad_norm": 454.44370150744254, + "learning_rate": 2.9157624451758944e-07, + "logits/chosen": -2.428351640701294, + "logits/rejected": -2.445155382156372, + "logps/chosen": -17.483226776123047, + "logps/rejected": -150.34759521484375, + "loss": 16.8459, + "losses_ref": -5.854690954265607e-08, + "ref_logps/chosen": -97.53578186035156, + "ref_logps/rejected": -82.81848907470703, + "rewards/accuracies": 0.9375, + "rewards/chosen": 80.05255889892578, + "rewards/margins": 147.5816650390625, + "rewards/rejected": -67.52911376953125, + "step": 5820, + "u": -4.267212867736816, + "weight": 0.0625 + }, + { + "diff_generated": -70.92760467529297, + "epoch": 1.8891769280622164, + "grad_norm": 451.70117899869916, + "learning_rate": 2.901252761532055e-07, + "logits/chosen": -2.491132974624634, + "logits/rejected": -2.526857376098633, + "logps/chosen": -17.131113052368164, + "logps/rejected": -158.18505859375, + "loss": 16.4416, + "losses_ref": -7.195662874437403e-06, + "ref_logps/chosen": -100.11669921875, + "ref_logps/rejected": -87.25745391845703, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 82.985595703125, + "rewards/margins": 153.91317749023438, + "rewards/rejected": -70.92760467529297, + "step": 5830, + "u": -4.349549293518066, + "weight": 0.03750026598572731 + }, + { + "diff_generated": -68.52245330810547, + "epoch": 1.8924173687621515, + "grad_norm": 524.9177651666605, + "learning_rate": 2.8867586985167523e-07, + "logits/chosen": -2.51749324798584, + "logits/rejected": -2.5768675804138184, + "logps/chosen": -16.798006057739258, + "logps/rejected": -156.85104370117188, + "loss": 16.435, + "losses_ref": -4.86377757624723e-05, + "ref_logps/chosen": -96.13957977294922, + "ref_logps/rejected": -88.32856750488281, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 79.3415756225586, + "rewards/margins": 147.86402893066406, + "rewards/rejected": -68.52245330810547, + "step": 5840, + "u": -4.243464469909668, + "weight": 0.0687512457370758 + }, + { + "diff_generated": -69.49964141845703, + "epoch": 1.8956578094620868, + "grad_norm": 443.47953975122937, + "learning_rate": 2.8722804621886364e-07, + "logits/chosen": -2.4797089099884033, + "logits/rejected": -2.560629367828369, + "logps/chosen": -17.303192138671875, + "logps/rejected": -153.7431182861328, + "loss": 17.1681, + "losses_ref": -0.00043771107448264956, + "ref_logps/chosen": -92.99197387695312, + "ref_logps/rejected": -84.24346923828125, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 75.68878936767578, + "rewards/margins": 145.1884307861328, + "rewards/rejected": -69.49964141845703, + "step": 5850, + "u": -4.391529560089111, + "weight": 0.04376835376024246 + }, + { + "diff_generated": -71.12813568115234, + "epoch": 1.8988982501620222, + "grad_norm": 469.21594692643225, + "learning_rate": 2.857818258381358e-07, + "logits/chosen": -2.4311540126800537, + "logits/rejected": -2.511909246444702, + "logps/chosen": -17.4440975189209, + "logps/rejected": -162.2183380126953, + "loss": 16.9258, + "losses_ref": -0.0005043140263296664, + "ref_logps/chosen": -99.31665802001953, + "ref_logps/rejected": -91.09019470214844, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 81.87255859375, + "rewards/margins": 153.00070190429688, + "rewards/rejected": -71.12813568115234, + "step": 5860, + "u": -4.432432651519775, + "weight": 0.037522491067647934 + }, + { + "diff_generated": -69.65772247314453, + "epoch": 1.9021386908619573, + "grad_norm": 448.62853805445513, + "learning_rate": 2.8433722927006314e-07, + "logits/chosen": -2.4755847454071045, + "logits/rejected": -2.5879673957824707, + "logps/chosen": -18.797056198120117, + "logps/rejected": -156.71347045898438, + "loss": 16.861, + "losses_ref": -9.900189979816787e-06, + "ref_logps/chosen": -96.04248046875, + "ref_logps/rejected": -87.05574798583984, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 77.24542236328125, + "rewards/margins": 146.9031219482422, + "rewards/rejected": -69.65772247314453, + "step": 5870, + "u": -4.378717422485352, + "weight": 0.037500329315662384 + }, + { + "diff_generated": -69.74181365966797, + "epoch": 1.9053791315618924, + "grad_norm": 445.2510424039169, + "learning_rate": 2.82894277052132e-07, + "logits/chosen": -2.4473252296447754, + "logits/rejected": -2.5861172676086426, + "logps/chosen": -16.19223976135254, + "logps/rejected": -154.62640380859375, + "loss": 16.6272, + "losses_ref": -1.7233291146112606e-05, + "ref_logps/chosen": -93.07598114013672, + "ref_logps/rejected": -84.88458251953125, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 76.88375091552734, + "rewards/margins": 146.62557983398438, + "rewards/rejected": -69.74181365966797, + "step": 5880, + "u": -4.307101249694824, + "weight": 0.05625048279762268 + }, + { + "diff_generated": -72.17448425292969, + "epoch": 1.9086195722618275, + "grad_norm": 480.57871183808413, + "learning_rate": 2.814529896984514e-07, + "logits/chosen": -2.4194865226745605, + "logits/rejected": -2.496985673904419, + "logps/chosen": -17.08938980102539, + "logps/rejected": -162.45860290527344, + "loss": 16.8849, + "losses_ref": -0.0012498985743150115, + "ref_logps/chosen": -94.307861328125, + "ref_logps/rejected": -90.28411865234375, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 77.2184829711914, + "rewards/margins": 149.39297485351562, + "rewards/rejected": -72.17448425292969, + "step": 5890, + "u": -4.246086597442627, + "weight": 0.06881099194288254 + }, + { + "diff_generated": -67.37945556640625, + "epoch": 1.9118600129617627, + "grad_norm": 471.3713556679269, + "learning_rate": 2.8001338769946126e-07, + "logits/chosen": -2.438253879547119, + "logits/rejected": -2.4733853340148926, + "logps/chosen": -16.475162506103516, + "logps/rejected": -146.17007446289062, + "loss": 16.9889, + "losses_ref": -8.270395483123139e-05, + "ref_logps/chosen": -95.65721893310547, + "ref_logps/rejected": -78.79060363769531, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 79.18206024169922, + "rewards/margins": 146.56150817871094, + "rewards/rejected": -67.37945556640625, + "step": 5900, + "u": -4.228701591491699, + "weight": 0.06875334680080414 + }, + { + "diff_generated": -69.47562408447266, + "epoch": 1.915100453661698, + "grad_norm": 423.27160932440387, + "learning_rate": 2.7857549152164153e-07, + "logits/chosen": -2.474318027496338, + "logits/rejected": -2.5566134452819824, + "logps/chosen": -14.384271621704102, + "logps/rejected": -156.02700805664062, + "loss": 16.5497, + "losses_ref": -0.0011477151419967413, + "ref_logps/chosen": -91.45785522460938, + "ref_logps/rejected": -86.55137634277344, + "rewards/accuracies": 0.9375, + "rewards/chosen": 77.07359313964844, + "rewards/margins": 146.54920959472656, + "rewards/rejected": -69.47562408447266, + "step": 5910, + "u": -4.288008213043213, + "weight": 0.06255216151475906 + }, + { + "diff_generated": -73.12073516845703, + "epoch": 1.9183408943616331, + "grad_norm": 450.40903172345037, + "learning_rate": 2.7713932160722043e-07, + "logits/chosen": -2.4164717197418213, + "logits/rejected": -2.566805124282837, + "logps/chosen": -17.061847686767578, + "logps/rejected": -162.56674194335938, + "loss": 16.6963, + "losses_ref": -0.0018255922477692366, + "ref_logps/chosen": -95.25648498535156, + "ref_logps/rejected": -89.44600677490234, + "rewards/accuracies": 0.96875, + "rewards/chosen": 78.19463348388672, + "rewards/margins": 151.31536865234375, + "rewards/rejected": -73.12073516845703, + "step": 5920, + "u": -4.383121967315674, + "weight": 0.03133777529001236 + }, + { + "diff_generated": -72.05683135986328, + "epoch": 1.9215813350615685, + "grad_norm": 447.0661139454466, + "learning_rate": 2.757048983738847e-07, + "logits/chosen": -2.460322856903076, + "logits/rejected": -2.580812931060791, + "logps/chosen": -16.62095832824707, + "logps/rejected": -163.3804931640625, + "loss": 16.1758, + "losses_ref": -0.001195084652863443, + "ref_logps/chosen": -94.19022369384766, + "ref_logps/rejected": -91.32366180419922, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 77.56925964355469, + "rewards/margins": 149.6260986328125, + "rewards/rejected": -72.05683135986328, + "step": 5930, + "u": -4.401998519897461, + "weight": 0.03755543380975723 + }, + { + "diff_generated": -69.83851623535156, + "epoch": 1.9248217757615036, + "grad_norm": 437.08750385278915, + "learning_rate": 2.742722422144885e-07, + "logits/chosen": -2.4585840702056885, + "logits/rejected": -2.5811221599578857, + "logps/chosen": -17.641761779785156, + "logps/rejected": -161.27040100097656, + "loss": 16.6886, + "losses_ref": -0.00032297830330207944, + "ref_logps/chosen": -96.47654724121094, + "ref_logps/rejected": -91.43187713623047, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 78.83478546142578, + "rewards/margins": 148.6732940673828, + "rewards/rejected": -69.83851623535156, + "step": 5940, + "u": -4.332736015319824, + "weight": 0.05626402050256729 + }, + { + "diff_generated": -69.04411315917969, + "epoch": 1.9280622164614387, + "grad_norm": 460.6095520647696, + "learning_rate": 2.7284137349676466e-07, + "logits/chosen": -2.393308162689209, + "logits/rejected": -2.4955241680145264, + "logps/chosen": -15.419522285461426, + "logps/rejected": -156.3408660888672, + "loss": 16.5459, + "losses_ref": -2.796388720582854e-08, + "ref_logps/chosen": -90.29603576660156, + "ref_logps/rejected": -87.29673767089844, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 74.87651062011719, + "rewards/margins": 143.92063903808594, + "rewards/rejected": -69.04411315917969, + "step": 5950, + "u": -4.220925807952881, + "weight": 0.07500000298023224 + }, + { + "diff_generated": -70.74433898925781, + "epoch": 1.9313026571613738, + "grad_norm": 455.37578432642556, + "learning_rate": 2.7141231256303343e-07, + "logits/chosen": -2.4291133880615234, + "logits/rejected": -2.5476202964782715, + "logps/chosen": -19.036624908447266, + "logps/rejected": -159.58172607421875, + "loss": 17.6372, + "losses_ref": -1.3405813660938293e-05, + "ref_logps/chosen": -98.10065460205078, + "ref_logps/rejected": -88.83736419677734, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 79.06402587890625, + "rewards/margins": 149.80838012695312, + "rewards/rejected": -70.74433898925781, + "step": 5960, + "u": -4.429462432861328, + "weight": 0.03750030696392059 + }, + { + "diff_generated": -68.28518676757812, + "epoch": 1.9345430978613092, + "grad_norm": 460.5678292263605, + "learning_rate": 2.69985079729915e-07, + "logits/chosen": -2.4392125606536865, + "logits/rejected": -2.5044326782226562, + "logps/chosen": -17.118776321411133, + "logps/rejected": -148.4984588623047, + "loss": 17.0562, + "losses_ref": -0.0002496158122085035, + "ref_logps/chosen": -98.86695861816406, + "ref_logps/rejected": -80.21326446533203, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 81.74818420410156, + "rewards/margins": 150.0333709716797, + "rewards/rejected": -68.28518676757812, + "step": 5970, + "u": -4.321126461029053, + "weight": 0.05626310035586357 + }, + { + "diff_generated": -70.55073547363281, + "epoch": 1.9377835385612443, + "grad_norm": 492.5859735668745, + "learning_rate": 2.6855969528803945e-07, + "logits/chosen": -2.444809675216675, + "logits/rejected": -2.5491554737091064, + "logps/chosen": -18.09766387939453, + "logps/rejected": -157.51620483398438, + "loss": 17.5105, + "losses_ref": -1.5437821275554597e-05, + "ref_logps/chosen": -97.93743896484375, + "ref_logps/rejected": -86.9654769897461, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 79.83978271484375, + "rewards/margins": 150.3905029296875, + "rewards/rejected": -70.55073547363281, + "step": 5980, + "u": -4.346704959869385, + "weight": 0.043750226497650146 + }, + { + "diff_generated": -66.5762939453125, + "epoch": 1.9410239792611796, + "grad_norm": 431.79970088562357, + "learning_rate": 2.6713617950175903e-07, + "logits/chosen": -2.4027295112609863, + "logits/rejected": -2.5291686058044434, + "logps/chosen": -15.071945190429688, + "logps/rejected": -148.65245056152344, + "loss": 16.5865, + "losses_ref": -1.4883593166814535e-06, + "ref_logps/chosen": -91.81842803955078, + "ref_logps/rejected": -82.0761489868164, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 76.74647521972656, + "rewards/margins": 143.32278442382812, + "rewards/rejected": -66.5762939453125, + "step": 5990, + "u": -4.325742721557617, + "weight": 0.05000003054738045 + }, + { + "diff_generated": -67.45310974121094, + "epoch": 1.9442644199611148, + "grad_norm": 466.72615095363875, + "learning_rate": 2.657145526088593e-07, + "logits/chosen": -2.3987174034118652, + "logits/rejected": -2.532212018966675, + "logps/chosen": -16.78557586669922, + "logps/rejected": -147.83082580566406, + "loss": 16.9198, + "losses_ref": -2.534356724481768e-07, + "ref_logps/chosen": -89.77131652832031, + "ref_logps/rejected": -80.37770080566406, + "rewards/accuracies": 0.90625, + "rewards/chosen": 72.98574829101562, + "rewards/margins": 140.43885803222656, + "rewards/rejected": -67.45310974121094, + "step": 6000, + "u": -4.156952857971191, + "weight": 0.0937500074505806 + }, + { + "diff_generated": -69.40829467773438, + "epoch": 1.9475048606610499, + "grad_norm": 468.2504707444501, + "learning_rate": 2.6429483482027243e-07, + "logits/chosen": -2.43640398979187, + "logits/rejected": -2.5646090507507324, + "logps/chosen": -16.859195709228516, + "logps/rejected": -153.06393432617188, + "loss": 17.001, + "losses_ref": -0.0001398011518176645, + "ref_logps/chosen": -93.74928283691406, + "ref_logps/rejected": -83.6556396484375, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 76.89009094238281, + "rewards/margins": 146.2984161376953, + "rewards/rejected": -69.40829467773438, + "step": 6010, + "u": -4.381274223327637, + "weight": 0.04375586658716202 + }, + { + "diff_generated": -66.53333282470703, + "epoch": 1.950745301360985, + "grad_norm": 453.98594442615945, + "learning_rate": 2.628770463197889e-07, + "logits/chosen": -2.504582643508911, + "logits/rejected": -2.5561158657073975, + "logps/chosen": -17.2025146484375, + "logps/rejected": -151.67718505859375, + "loss": 16.9012, + "losses_ref": -2.9776818337268196e-05, + "ref_logps/chosen": -98.95398712158203, + "ref_logps/rejected": -85.14385223388672, + "rewards/accuracies": 0.9375, + "rewards/chosen": 81.75148010253906, + "rewards/margins": 148.28482055664062, + "rewards/rejected": -66.53333282470703, + "step": 6020, + "u": -4.286922931671143, + "weight": 0.06250147521495819 + }, + { + "diff_generated": -67.9385986328125, + "epoch": 1.9539857420609201, + "grad_norm": 437.9430910005116, + "learning_rate": 2.6146120726377103e-07, + "logits/chosen": -2.3654942512512207, + "logits/rejected": -2.5123817920684814, + "logps/chosen": -14.911382675170898, + "logps/rejected": -147.7805633544922, + "loss": 17.0955, + "losses_ref": -0.0007130379672162235, + "ref_logps/chosen": -89.0401611328125, + "ref_logps/rejected": -79.84195709228516, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 74.12876892089844, + "rewards/margins": 142.0673828125, + "rewards/rejected": -67.9385986328125, + "step": 6030, + "u": -4.1883392333984375, + "weight": 0.0875338613986969 + }, + { + "diff_generated": -64.7969741821289, + "epoch": 1.9572261827608555, + "grad_norm": 448.64790944231504, + "learning_rate": 2.600473377808667e-07, + "logits/chosen": -2.430640697479248, + "logits/rejected": -2.480102062225342, + "logps/chosen": -16.628345489501953, + "logps/rejected": -141.89987182617188, + "loss": 16.4593, + "losses_ref": -8.310528937727213e-05, + "ref_logps/chosen": -89.28855895996094, + "ref_logps/rejected": -77.10289764404297, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 72.66020965576172, + "rewards/margins": 137.45718383789062, + "rewards/rejected": -64.7969741821289, + "step": 6040, + "u": -4.257237434387207, + "weight": 0.06875047832727432 + }, + { + "diff_generated": -67.9603500366211, + "epoch": 1.9604666234607908, + "grad_norm": 452.8056559318795, + "learning_rate": 2.5863545797172226e-07, + "logits/chosen": -2.4394993782043457, + "logits/rejected": -2.533747673034668, + "logps/chosen": -17.315380096435547, + "logps/rejected": -150.82473754882812, + "loss": 17.2681, + "losses_ref": -0.001021057483740151, + "ref_logps/chosen": -93.55432891845703, + "ref_logps/rejected": -82.8643798828125, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 76.23894500732422, + "rewards/margins": 144.1992950439453, + "rewards/rejected": -67.9603500366211, + "step": 6050, + "u": -4.3375749588012695, + "weight": 0.05630000680685043 + }, + { + "diff_generated": -67.12407684326172, + "epoch": 1.963707064160726, + "grad_norm": 450.7826790115826, + "learning_rate": 2.5722558790869786e-07, + "logits/chosen": -2.413055181503296, + "logits/rejected": -2.4825711250305176, + "logps/chosen": -17.044614791870117, + "logps/rejected": -145.07261657714844, + "loss": 16.397, + "losses_ref": -2.6650173822417855e-05, + "ref_logps/chosen": -91.9409408569336, + "ref_logps/rejected": -77.94853210449219, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 74.89633178710938, + "rewards/margins": 142.02041625976562, + "rewards/rejected": -67.12407684326172, + "step": 6060, + "u": -4.174244403839111, + "weight": 0.08125053346157074 + }, + { + "diff_generated": -72.6106185913086, + "epoch": 1.966947504860661, + "grad_norm": 448.19207789067633, + "learning_rate": 2.558177476355812e-07, + "logits/chosen": -2.4541189670562744, + "logits/rejected": -2.589430332183838, + "logps/chosen": -19.113964080810547, + "logps/rejected": -162.339111328125, + "loss": 16.7823, + "losses_ref": -0.0004977741627953947, + "ref_logps/chosen": -98.77516174316406, + "ref_logps/rejected": -89.72850036621094, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 79.66120147705078, + "rewards/margins": 152.27182006835938, + "rewards/rejected": -72.6106185913086, + "step": 6070, + "u": -4.469677448272705, + "weight": 0.02501002512872219 + }, + { + "diff_generated": -72.3133316040039, + "epoch": 1.9701879455605962, + "grad_norm": 457.84983535438994, + "learning_rate": 2.544119571673031e-07, + "logits/chosen": -2.467571496963501, + "logits/rejected": -2.5975966453552246, + "logps/chosen": -16.998483657836914, + "logps/rejected": -161.43502807617188, + "loss": 16.7225, + "losses_ref": -0.015463406220078468, + "ref_logps/chosen": -95.86445617675781, + "ref_logps/rejected": -89.1217041015625, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 78.86597442626953, + "rewards/margins": 151.17929077148438, + "rewards/rejected": -72.3133316040039, + "step": 6080, + "u": -4.396916389465332, + "weight": 0.03814239054918289 + }, + { + "diff_generated": -72.62419128417969, + "epoch": 1.9734283862605313, + "grad_norm": 508.9547507073546, + "learning_rate": 2.5300823648965267e-07, + "logits/chosen": -2.4152259826660156, + "logits/rejected": -2.534400463104248, + "logps/chosen": -15.61021614074707, + "logps/rejected": -160.32122802734375, + "loss": 16.9425, + "losses_ref": -1.0055341590486933e-06, + "ref_logps/chosen": -92.42676544189453, + "ref_logps/rejected": -87.69705200195312, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 76.8165512084961, + "rewards/margins": 149.44073486328125, + "rewards/rejected": -72.62419128417969, + "step": 6090, + "u": -4.382531642913818, + "weight": 0.04375002905726433 + }, + { + "diff_generated": -72.01034545898438, + "epoch": 1.9766688269604666, + "grad_norm": 439.69088541526725, + "learning_rate": 2.516066055589937e-07, + "logits/chosen": -2.447578191757202, + "logits/rejected": -2.578108549118042, + "logps/chosen": -15.061620712280273, + "logps/rejected": -158.85385131835938, + "loss": 16.9858, + "losses_ref": -7.055670039335382e-07, + "ref_logps/chosen": -92.02629852294922, + "ref_logps/rejected": -86.84352111816406, + "rewards/accuracies": 0.9375, + "rewards/chosen": 76.96467590332031, + "rewards/margins": 148.9750213623047, + "rewards/rejected": -72.01034545898438, + "step": 6100, + "u": -4.283817291259766, + "weight": 0.0625000149011612 + }, + { + "diff_generated": -71.61358642578125, + "epoch": 1.9799092676604018, + "grad_norm": 412.81350587394274, + "learning_rate": 2.502070843019799e-07, + "logits/chosen": -2.438821315765381, + "logits/rejected": -2.559591770172119, + "logps/chosen": -17.955440521240234, + "logps/rejected": -159.21438598632812, + "loss": 16.1884, + "losses_ref": -0.00022827927023172379, + "ref_logps/chosen": -97.82897186279297, + "ref_logps/rejected": -87.60079193115234, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 79.87352752685547, + "rewards/margins": 151.48712158203125, + "rewards/rejected": -71.61358642578125, + "step": 6110, + "u": -4.4112467765808105, + "weight": 0.03751049190759659 + }, + { + "diff_generated": -68.79540252685547, + "epoch": 1.983149708360337, + "grad_norm": 438.77721573717224, + "learning_rate": 2.4880969261527294e-07, + "logits/chosen": -2.4548709392547607, + "logits/rejected": -2.555267095565796, + "logps/chosen": -16.763917922973633, + "logps/rejected": -156.84829711914062, + "loss": 16.7081, + "losses_ref": -9.546678484184667e-06, + "ref_logps/chosen": -96.22209167480469, + "ref_logps/rejected": -88.05290222167969, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 79.45818328857422, + "rewards/margins": 148.2535858154297, + "rewards/rejected": -68.79540252685547, + "step": 6120, + "u": -4.3454976081848145, + "weight": 0.03750038146972656 + }, + { + "diff_generated": -70.16226196289062, + "epoch": 1.9863901490602722, + "grad_norm": 472.14024979167294, + "learning_rate": 2.4741445036525814e-07, + "logits/chosen": -2.4340896606445312, + "logits/rejected": -2.510011911392212, + "logps/chosen": -14.988530158996582, + "logps/rejected": -155.96693420410156, + "loss": 16.1331, + "losses_ref": -0.0012874031672254205, + "ref_logps/chosen": -92.43404388427734, + "ref_logps/rejected": -85.80467224121094, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 77.44551086425781, + "rewards/margins": 147.60777282714844, + "rewards/rejected": -70.16226196289062, + "step": 6130, + "u": -4.145638942718506, + "weight": 0.08755604922771454 + }, + { + "diff_generated": -66.4461669921875, + "epoch": 1.9896305897602073, + "grad_norm": 460.0733260209511, + "learning_rate": 2.460213773877635e-07, + "logits/chosen": -2.3939826488494873, + "logits/rejected": -2.493049144744873, + "logps/chosen": -15.80755615234375, + "logps/rejected": -144.96444702148438, + "loss": 16.6138, + "losses_ref": -0.0007387199439108372, + "ref_logps/chosen": -90.4625473022461, + "ref_logps/rejected": -78.51826477050781, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 74.65499114990234, + "rewards/margins": 141.10116577148438, + "rewards/rejected": -66.4461669921875, + "step": 6140, + "u": -4.115630149841309, + "weight": 0.10003437101840973 + }, + { + "diff_generated": -73.65410614013672, + "epoch": 1.9928710304601425, + "grad_norm": 505.0343263835002, + "learning_rate": 2.4463049348777666e-07, + "logits/chosen": -2.4246139526367188, + "logits/rejected": -2.5277953147888184, + "logps/chosen": -15.673370361328125, + "logps/rejected": -161.61569213867188, + "loss": 16.5994, + "losses_ref": -1.5297347388809612e-08, + "ref_logps/chosen": -96.08084869384766, + "ref_logps/rejected": -87.96159362792969, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 80.407470703125, + "rewards/margins": 154.0615692138672, + "rewards/rejected": -73.65410614013672, + "step": 6150, + "u": -4.396246910095215, + "weight": 0.03750000149011612 + }, + { + "diff_generated": -68.90767669677734, + "epoch": 1.9961114711600778, + "grad_norm": 453.67577081346997, + "learning_rate": 2.4324181843916364e-07, + "logits/chosen": -2.441559076309204, + "logits/rejected": -2.5246987342834473, + "logps/chosen": -19.356121063232422, + "logps/rejected": -152.5162353515625, + "loss": 16.7865, + "losses_ref": -1.9142080986966903e-07, + "ref_logps/chosen": -94.289306640625, + "ref_logps/rejected": -83.60856628417969, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 74.93318176269531, + "rewards/margins": 143.8408660888672, + "rewards/rejected": -68.90767669677734, + "step": 6160, + "u": -4.186587333679199, + "weight": 0.07500000298023224 + }, + { + "diff_generated": -74.30818176269531, + "epoch": 1.999351911860013, + "grad_norm": 440.2516626198898, + "learning_rate": 2.4185537198438777e-07, + "logits/chosen": -2.484731674194336, + "logits/rejected": -2.582129955291748, + "logps/chosen": -17.606393814086914, + "logps/rejected": -162.97836303710938, + "loss": 16.7489, + "losses_ref": -0.0005036066868342459, + "ref_logps/chosen": -99.83641052246094, + "ref_logps/rejected": -88.67017364501953, + "rewards/accuracies": 0.96875, + "rewards/chosen": 82.23001861572266, + "rewards/margins": 156.5382080078125, + "rewards/rejected": -74.30818176269531, + "step": 6170, + "u": -4.434797763824463, + "weight": 0.03127221763134003 + }, + { + "diff_generated": -73.82777404785156, + "epoch": 2.0025923525599483, + "grad_norm": 465.6875766799557, + "learning_rate": 2.40471173834229e-07, + "logits/chosen": -2.477844476699829, + "logits/rejected": -2.580146074295044, + "logps/chosen": -14.714004516601562, + "logps/rejected": -164.83810424804688, + "loss": 14.2976, + "losses_ref": -0.014145202934741974, + "ref_logps/chosen": -98.5716781616211, + "ref_logps/rejected": -91.01033782958984, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 83.85767364501953, + "rewards/margins": 157.68545532226562, + "rewards/rejected": -73.82777404785156, + "step": 6180, + "u": -6.301093578338623, + "weight": 0.019366895779967308 + }, + { + "diff_generated": -76.43583679199219, + "epoch": 2.0058327932598834, + "grad_norm": 442.8113423529574, + "learning_rate": 2.3908924366750385e-07, + "logits/chosen": -2.419363498687744, + "logits/rejected": -2.518618583679199, + "logps/chosen": -13.03242301940918, + "logps/rejected": -163.94589233398438, + "loss": 13.0716, + "losses_ref": -2.3213447093439754e-06, + "ref_logps/chosen": -93.88200378417969, + "ref_logps/rejected": -87.51005554199219, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 80.84957885742188, + "rewards/margins": 157.28541564941406, + "rewards/rejected": -76.43583679199219, + "step": 6190, + "u": -6.368585109710693, + "weight": 0.07500007748603821 + }, + { + "diff_generated": -82.67878723144531, + "epoch": 2.0090732339598185, + "grad_norm": 494.5893161029665, + "learning_rate": 2.3770960113078505e-07, + "logits/chosen": -2.412153720855713, + "logits/rejected": -2.5851235389709473, + "logps/chosen": -12.750777244567871, + "logps/rejected": -172.10757446289062, + "loss": 13.332, + "losses_ref": -0.00504049938172102, + "ref_logps/chosen": -93.04595184326172, + "ref_logps/rejected": -89.42879486083984, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 80.29518127441406, + "rewards/margins": 162.97393798828125, + "rewards/rejected": -82.67878723144531, + "step": 6200, + "u": -6.533880710601807, + "weight": 0.050222884863615036 + }, + { + "diff_generated": -81.47315216064453, + "epoch": 2.0123136746597536, + "grad_norm": 444.69402811189104, + "learning_rate": 2.3633226583812304e-07, + "logits/chosen": -2.3955163955688477, + "logits/rejected": -2.5131001472473145, + "logps/chosen": -13.523590087890625, + "logps/rejected": -170.2469482421875, + "loss": 12.7886, + "losses_ref": -0.018442081287503242, + "ref_logps/chosen": -95.49665832519531, + "ref_logps/rejected": -88.77379608154297, + "rewards/accuracies": 0.96875, + "rewards/chosen": 81.97306823730469, + "rewards/margins": 163.4462127685547, + "rewards/rejected": -81.47315216064453, + "step": 6210, + "u": -6.657259464263916, + "weight": 0.0321735255420208 + }, + { + "diff_generated": -77.23645782470703, + "epoch": 2.0155541153596888, + "grad_norm": 438.333881371591, + "learning_rate": 2.3495725737076642e-07, + "logits/chosen": -2.4523422718048096, + "logits/rejected": -2.5315611362457275, + "logps/chosen": -13.320953369140625, + "logps/rejected": -159.6463623046875, + "loss": 13.2749, + "losses_ref": -0.0011979702394455671, + "ref_logps/chosen": -97.73123931884766, + "ref_logps/rejected": -82.40990447998047, + "rewards/accuracies": 0.9375, + "rewards/chosen": 84.41028594970703, + "rewards/margins": 161.64674377441406, + "rewards/rejected": -77.23645782470703, + "step": 6220, + "u": -6.433352470397949, + "weight": 0.06255219876766205 + }, + { + "diff_generated": -77.82986450195312, + "epoch": 2.0187945560596243, + "grad_norm": 421.5016639924878, + "learning_rate": 2.3358459527688432e-07, + "logits/chosen": -2.430393695831299, + "logits/rejected": -2.5355336666107178, + "logps/chosen": -13.974141120910645, + "logps/rejected": -165.40695190429688, + "loss": 13.4308, + "losses_ref": -0.007256612181663513, + "ref_logps/chosen": -99.11470794677734, + "ref_logps/rejected": -87.57707214355469, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 85.14057159423828, + "rewards/margins": 162.97042846679688, + "rewards/rejected": -77.82986450195312, + "step": 6230, + "u": -6.607992649078369, + "weight": 0.0378216877579689 + }, + { + "diff_generated": -81.22264099121094, + "epoch": 2.0220349967595594, + "grad_norm": 485.103031244035, + "learning_rate": 2.3221429907128734e-07, + "logits/chosen": -2.4243063926696777, + "logits/rejected": -2.553650140762329, + "logps/chosen": -13.33491039276123, + "logps/rejected": -173.98171997070312, + "loss": 13.053, + "losses_ref": -1.8116115825250745e-05, + "ref_logps/chosen": -93.6789779663086, + "ref_logps/rejected": -92.75906372070312, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 80.34407043457031, + "rewards/margins": 161.5666961669922, + "rewards/rejected": -81.22264099121094, + "step": 6240, + "u": -6.483554840087891, + "weight": 0.050000619143247604 + }, + { + "diff_generated": -79.91209411621094, + "epoch": 2.0252754374594946, + "grad_norm": 509.1077077436576, + "learning_rate": 2.3084638823515136e-07, + "logits/chosen": -2.411858558654785, + "logits/rejected": -2.514808177947998, + "logps/chosen": -12.484736442565918, + "logps/rejected": -165.52288818359375, + "loss": 12.819, + "losses_ref": -0.00534836994484067, + "ref_logps/chosen": -96.26007843017578, + "ref_logps/rejected": -85.61079406738281, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 83.77534484863281, + "rewards/margins": 163.6874542236328, + "rewards/rejected": -79.91209411621094, + "step": 6250, + "u": -6.536829948425293, + "weight": 0.05019962787628174 + }, + { + "diff_generated": -78.60842895507812, + "epoch": 2.0285158781594297, + "grad_norm": 497.35899680965207, + "learning_rate": 2.2948088221573986e-07, + "logits/chosen": -2.4031450748443604, + "logits/rejected": -2.511592149734497, + "logps/chosen": -15.3632230758667, + "logps/rejected": -166.2181396484375, + "loss": 13.2077, + "losses_ref": -3.695975010487018e-07, + "ref_logps/chosen": -100.11011505126953, + "ref_logps/rejected": -87.60970306396484, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 84.74689483642578, + "rewards/margins": 163.35531616210938, + "rewards/rejected": -78.60842895507812, + "step": 6260, + "u": -6.557508945465088, + "weight": 0.037500012665987015 + }, + { + "diff_generated": -77.8721923828125, + "epoch": 2.031756318859365, + "grad_norm": 471.59980342396824, + "learning_rate": 2.2811780042612753e-07, + "logits/chosen": -2.382750988006592, + "logits/rejected": -2.492981433868408, + "logps/chosen": -13.257951736450195, + "logps/rejected": -166.88327026367188, + "loss": 12.9873, + "losses_ref": -3.5366301176509296e-07, + "ref_logps/chosen": -95.4902114868164, + "ref_logps/rejected": -89.01109313964844, + "rewards/accuracies": 0.9375, + "rewards/chosen": 82.23226165771484, + "rewards/margins": 160.1044464111328, + "rewards/rejected": -77.8721923828125, + "step": 6270, + "u": -6.430145263671875, + "weight": 0.0625000074505806 + }, + { + "diff_generated": -78.22283935546875, + "epoch": 2.0349967595593, + "grad_norm": 483.27826658503307, + "learning_rate": 2.267571622449246e-07, + "logits/chosen": -2.4012675285339355, + "logits/rejected": -2.4772655963897705, + "logps/chosen": -13.27515983581543, + "logps/rejected": -161.35049438476562, + "loss": 13.185, + "losses_ref": -0.02035510167479515, + "ref_logps/chosen": -92.0546875, + "ref_logps/rejected": -83.12764739990234, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 78.77952575683594, + "rewards/margins": 157.0023651123047, + "rewards/rejected": -78.22283935546875, + "step": 6280, + "u": -6.3433518409729, + "weight": 0.0760183110833168 + }, + { + "diff_generated": -77.20220184326172, + "epoch": 2.038237200259235, + "grad_norm": 449.763223317915, + "learning_rate": 2.2539898701600082e-07, + "logits/chosen": -2.3824188709259033, + "logits/rejected": -2.4665560722351074, + "logps/chosen": -13.088310241699219, + "logps/rejected": -159.70404052734375, + "loss": 12.8279, + "losses_ref": -1.1242273956213467e-07, + "ref_logps/chosen": -95.3175277709961, + "ref_logps/rejected": -82.50181579589844, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 82.22923278808594, + "rewards/margins": 159.43142700195312, + "rewards/rejected": -77.20220184326172, + "step": 6290, + "u": -6.379042625427246, + "weight": 0.06875000149011612 + }, + { + "diff_generated": -76.66944885253906, + "epoch": 2.0414776409591706, + "grad_norm": 477.9389997620248, + "learning_rate": 2.2404329404821086e-07, + "logits/chosen": -2.3812286853790283, + "logits/rejected": -2.4753291606903076, + "logps/chosen": -14.383306503295898, + "logps/rejected": -158.17892456054688, + "loss": 13.2235, + "losses_ref": -1.3196420695749111e-05, + "ref_logps/chosen": -93.89494323730469, + "ref_logps/rejected": -81.50947570800781, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 79.51163482666016, + "rewards/margins": 156.18109130859375, + "rewards/rejected": -76.66944885253906, + "step": 6300, + "u": -6.47409725189209, + "weight": 0.05000042915344238 + }, + { + "diff_generated": -81.24650573730469, + "epoch": 2.0447180816591057, + "grad_norm": 513.9727361727379, + "learning_rate": 2.2269010261511974e-07, + "logits/chosen": -2.4193193912506104, + "logits/rejected": -2.4820616245269775, + "logps/chosen": -13.977895736694336, + "logps/rejected": -167.22427368164062, + "loss": 13.3155, + "losses_ref": -0.0016800116281956434, + "ref_logps/chosen": -100.33441162109375, + "ref_logps/rejected": -85.97776794433594, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 86.35652160644531, + "rewards/margins": 167.60301208496094, + "rewards/rejected": -81.24650573730469, + "step": 6310, + "u": -6.495428562164307, + "weight": 0.05632109194993973 + }, + { + "diff_generated": -78.64732360839844, + "epoch": 2.047958522359041, + "grad_norm": 509.7672668654707, + "learning_rate": 2.2133943195472874e-07, + "logits/chosen": -2.389063596725464, + "logits/rejected": -2.543116807937622, + "logps/chosen": -12.62861156463623, + "logps/rejected": -160.91986083984375, + "loss": 12.6887, + "losses_ref": -4.11522727006286e-08, + "ref_logps/chosen": -92.83280944824219, + "ref_logps/rejected": -82.27253723144531, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 80.2042007446289, + "rewards/margins": 158.8515167236328, + "rewards/rejected": -78.64732360839844, + "step": 6320, + "u": -6.228217124938965, + "weight": 0.08124999701976776 + }, + { + "diff_generated": -80.81334686279297, + "epoch": 2.051198963058976, + "grad_norm": 516.1515524619925, + "learning_rate": 2.1999130126920158e-07, + "logits/chosen": -2.4193766117095947, + "logits/rejected": -2.548755168914795, + "logps/chosen": -13.09967041015625, + "logps/rejected": -170.51239013671875, + "loss": 12.9505, + "losses_ref": -0.0040515633299946785, + "ref_logps/chosen": -97.01029205322266, + "ref_logps/rejected": -89.69903564453125, + "rewards/accuracies": 0.96875, + "rewards/chosen": 83.91061401367188, + "rewards/margins": 164.72396850585938, + "rewards/rejected": -80.81334686279297, + "step": 6330, + "u": -6.651572227478027, + "weight": 0.03141610696911812 + }, + { + "diff_generated": -79.29988098144531, + "epoch": 2.054439403758911, + "grad_norm": 512.5347774712834, + "learning_rate": 2.1864572972459228e-07, + "logits/chosen": -2.3943607807159424, + "logits/rejected": -2.567241668701172, + "logps/chosen": -11.203665733337402, + "logps/rejected": -165.3535919189453, + "loss": 13.1086, + "losses_ref": -0.0017887745052576065, + "ref_logps/chosen": -88.66815185546875, + "ref_logps/rejected": -86.05369567871094, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 77.46449279785156, + "rewards/margins": 156.76437377929688, + "rewards/rejected": -79.29988098144531, + "step": 6340, + "u": -6.356213092803955, + "weight": 0.06882587820291519 + }, + { + "diff_generated": -79.52970886230469, + "epoch": 2.057679844458846, + "grad_norm": 472.6827846426297, + "learning_rate": 2.1730273645057173e-07, + "logits/chosen": -2.3555686473846436, + "logits/rejected": -2.450321674346924, + "logps/chosen": -13.726099014282227, + "logps/rejected": -167.40455627441406, + "loss": 12.6711, + "losses_ref": -0.00632984284311533, + "ref_logps/chosen": -96.25859069824219, + "ref_logps/rejected": -87.87483978271484, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 82.53250122070312, + "rewards/margins": 162.0622100830078, + "rewards/rejected": -79.52970886230469, + "step": 6350, + "u": -6.746617317199707, + "weight": 0.019036132842302322 + }, + { + "diff_generated": -82.3400650024414, + "epoch": 2.060920285158782, + "grad_norm": 521.7664594366094, + "learning_rate": 2.1596234054015654e-07, + "logits/chosen": -2.384187936782837, + "logits/rejected": -2.5147311687469482, + "logps/chosen": -13.163564682006836, + "logps/rejected": -171.75308227539062, + "loss": 13.3316, + "losses_ref": -0.001959248911589384, + "ref_logps/chosen": -94.92860412597656, + "ref_logps/rejected": -89.41303253173828, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 81.76502990722656, + "rewards/margins": 164.1051025390625, + "rewards/rejected": -82.3400650024414, + "step": 6360, + "u": -6.575686454772949, + "weight": 0.037587970495224 + }, + { + "diff_generated": -79.38497161865234, + "epoch": 2.064160725858717, + "grad_norm": 470.4498756243637, + "learning_rate": 2.1462456104943692e-07, + "logits/chosen": -2.356865406036377, + "logits/rejected": -2.474856376647949, + "logps/chosen": -12.403861045837402, + "logps/rejected": -163.8896942138672, + "loss": 12.8589, + "losses_ref": -1.5257610357366502e-05, + "ref_logps/chosen": -93.32125091552734, + "ref_logps/rejected": -84.50472259521484, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 80.91740417480469, + "rewards/margins": 160.3023681640625, + "rewards/rejected": -79.38497161865234, + "step": 6370, + "u": -6.5001540184021, + "weight": 0.056250590831041336 + }, + { + "diff_generated": -77.94718170166016, + "epoch": 2.067401166558652, + "grad_norm": 490.2535163314096, + "learning_rate": 2.132894169973063e-07, + "logits/chosen": -2.41898250579834, + "logits/rejected": -2.504578113555908, + "logps/chosen": -13.155477523803711, + "logps/rejected": -160.1114959716797, + "loss": 13.4251, + "losses_ref": -0.004206720273941755, + "ref_logps/chosen": -95.85162353515625, + "ref_logps/rejected": -82.16432189941406, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 82.69615173339844, + "rewards/margins": 160.64334106445312, + "rewards/rejected": -77.94718170166016, + "step": 6380, + "u": -6.4894819259643555, + "weight": 0.056429021060466766 + }, + { + "diff_generated": -80.44488525390625, + "epoch": 2.070641607258587, + "grad_norm": 489.00207122247065, + "learning_rate": 2.1195692736519013e-07, + "logits/chosen": -2.4086267948150635, + "logits/rejected": -2.506894588470459, + "logps/chosen": -13.574335098266602, + "logps/rejected": -168.43133544921875, + "loss": 13.2942, + "losses_ref": -0.040814243257045746, + "ref_logps/chosen": -95.96758270263672, + "ref_logps/rejected": -87.98646545410156, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 82.39324951171875, + "rewards/margins": 162.83810424804688, + "rewards/rejected": -80.44488525390625, + "step": 6390, + "u": -6.472268581390381, + "weight": 0.039497148245573044 + }, + { + "diff_generated": -80.75009155273438, + "epoch": 2.0738820479585223, + "grad_norm": 435.56787766493755, + "learning_rate": 2.1062711109677757e-07, + "logits/chosen": -2.4283664226531982, + "logits/rejected": -2.535419464111328, + "logps/chosen": -13.241666793823242, + "logps/rejected": -166.3533477783203, + "loss": 12.7151, + "losses_ref": -0.002088053384795785, + "ref_logps/chosen": -94.4981689453125, + "ref_logps/rejected": -85.60326385498047, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 81.25650787353516, + "rewards/margins": 162.006591796875, + "rewards/rejected": -80.75009155273438, + "step": 6400, + "u": -6.565550327301025, + "weight": 0.0438404306769371 + }, + { + "diff_generated": -80.47855377197266, + "epoch": 2.0771224886584574, + "grad_norm": 480.20915565188733, + "learning_rate": 2.0929998709775068e-07, + "logits/chosen": -2.414332389831543, + "logits/rejected": -2.4279377460479736, + "logps/chosen": -13.154367446899414, + "logps/rejected": -161.90716552734375, + "loss": 12.9361, + "losses_ref": -0.0005014360649511218, + "ref_logps/chosen": -97.96044921875, + "ref_logps/rejected": -81.42860412597656, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 84.80607604980469, + "rewards/margins": 165.28463745117188, + "rewards/rejected": -80.47855377197266, + "step": 6410, + "u": -6.4327850341796875, + "weight": 0.06876204907894135 + }, + { + "diff_generated": -81.22803497314453, + "epoch": 2.080362929358393, + "grad_norm": 515.953440725428, + "learning_rate": 2.0797557423551574e-07, + "logits/chosen": -2.416334629058838, + "logits/rejected": -2.521374225616455, + "logps/chosen": -13.194360733032227, + "logps/rejected": -172.79388427734375, + "loss": 13.1244, + "losses_ref": -0.004450161475688219, + "ref_logps/chosen": -100.98467254638672, + "ref_logps/rejected": -91.56584167480469, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 87.7903060913086, + "rewards/margins": 169.01834106445312, + "rewards/rejected": -81.22803497314453, + "step": 6420, + "u": -6.475274085998535, + "weight": 0.05643799155950546 + }, + { + "diff_generated": -82.28623962402344, + "epoch": 2.083603370058328, + "grad_norm": 444.8669315098703, + "learning_rate": 2.066538913389361e-07, + "logits/chosen": -2.400272846221924, + "logits/rejected": -2.5217082500457764, + "logps/chosen": -13.127087593078613, + "logps/rejected": -171.65762329101562, + "loss": 13.2154, + "losses_ref": -5.08952371092164e-06, + "ref_logps/chosen": -97.62368774414062, + "ref_logps/rejected": -89.37138366699219, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 84.49659729003906, + "rewards/margins": 166.7828369140625, + "rewards/rejected": -82.28623962402344, + "step": 6430, + "u": -6.559577941894531, + "weight": 0.03750015050172806 + }, + { + "diff_generated": -80.60981750488281, + "epoch": 2.086843810758263, + "grad_norm": 472.61054469292606, + "learning_rate": 2.053349571980635e-07, + "logits/chosen": -2.444110155105591, + "logits/rejected": -2.498958110809326, + "logps/chosen": -13.0723876953125, + "logps/rejected": -164.4912567138672, + "loss": 12.6799, + "losses_ref": -0.001947386539541185, + "ref_logps/chosen": -98.06462097167969, + "ref_logps/rejected": -83.88143920898438, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 84.99222564697266, + "rewards/margins": 165.60203552246094, + "rewards/rejected": -80.60981750488281, + "step": 6440, + "u": -6.712790489196777, + "weight": 0.025087004527449608 + }, + { + "diff_generated": -78.48230743408203, + "epoch": 2.0900842514581983, + "grad_norm": 441.39652718067543, + "learning_rate": 2.0401879056387155e-07, + "logits/chosen": -2.354447841644287, + "logits/rejected": -2.46140718460083, + "logps/chosen": -11.920354843139648, + "logps/rejected": -160.78182983398438, + "loss": 13.2252, + "losses_ref": -0.00024359929375350475, + "ref_logps/chosen": -93.72767639160156, + "ref_logps/rejected": -82.29952239990234, + "rewards/accuracies": 0.9375, + "rewards/chosen": 81.80731964111328, + "rewards/margins": 160.28964233398438, + "rewards/rejected": -78.48230743408203, + "step": 6450, + "u": -6.464148044586182, + "weight": 0.06251000612974167 + }, + { + "diff_generated": -77.48588562011719, + "epoch": 2.0933246921581334, + "grad_norm": 520.8812809619677, + "learning_rate": 2.0270541014798864e-07, + "logits/chosen": -2.358785390853882, + "logits/rejected": -2.443070888519287, + "logps/chosen": -12.780475616455078, + "logps/rejected": -161.23361206054688, + "loss": 13.0773, + "losses_ref": -0.0012289454462006688, + "ref_logps/chosen": -94.05496215820312, + "ref_logps/rejected": -83.74772644042969, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 81.27449035644531, + "rewards/margins": 158.7603759765625, + "rewards/rejected": -77.48588562011719, + "step": 6460, + "u": -6.318059921264648, + "weight": 0.08130304515361786 + }, + { + "diff_generated": -79.57652282714844, + "epoch": 2.0965651328580686, + "grad_norm": 469.0737272469611, + "learning_rate": 2.0139483462243225e-07, + "logits/chosen": -2.327782392501831, + "logits/rejected": -2.4817862510681152, + "logps/chosen": -11.793447494506836, + "logps/rejected": -166.00112915039062, + "loss": 12.8989, + "losses_ref": -3.7005463582318043e-08, + "ref_logps/chosen": -91.22651672363281, + "ref_logps/rejected": -86.42463684082031, + "rewards/accuracies": 0.9375, + "rewards/chosen": 79.43307495117188, + "rewards/margins": 159.0095977783203, + "rewards/rejected": -79.57652282714844, + "step": 6470, + "u": -6.46950626373291, + "weight": 0.0625 + }, + { + "diff_generated": -84.00804138183594, + "epoch": 2.0998055735580037, + "grad_norm": 478.75599326278547, + "learning_rate": 2.00087082619343e-07, + "logits/chosen": -2.361438274383545, + "logits/rejected": -2.494079351425171, + "logps/chosen": -12.477500915527344, + "logps/rejected": -171.92276000976562, + "loss": 12.8422, + "losses_ref": -0.02715255320072174, + "ref_logps/chosen": -95.34806060791016, + "ref_logps/rejected": -87.91471862792969, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 82.87055969238281, + "rewards/margins": 166.87860107421875, + "rewards/rejected": -84.00804138183594, + "step": 6480, + "u": -6.616391181945801, + "weight": 0.038793593645095825 + }, + { + "diff_generated": -83.79143524169922, + "epoch": 2.1030460142579392, + "grad_norm": 481.8031524288851, + "learning_rate": 1.9878217273072116e-07, + "logits/chosen": -2.3388664722442627, + "logits/rejected": -2.511909246444702, + "logps/chosen": -11.842902183532715, + "logps/rejected": -170.57864379882812, + "loss": 12.9571, + "losses_ref": -0.0015412219800055027, + "ref_logps/chosen": -88.08900451660156, + "ref_logps/rejected": -86.78721618652344, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 76.24609375, + "rewards/margins": 160.03753662109375, + "rewards/rejected": -83.79143524169922, + "step": 6490, + "u": -6.510128021240234, + "weight": 0.05006871372461319 + }, + { + "diff_generated": -83.82054901123047, + "epoch": 2.1062864549578744, + "grad_norm": 484.8975833780512, + "learning_rate": 1.974801235081602e-07, + "logits/chosen": -2.3324244022369385, + "logits/rejected": -2.4709558486938477, + "logps/chosen": -12.802592277526855, + "logps/rejected": -167.81314086914062, + "loss": 13.0662, + "losses_ref": -0.0014063044218346477, + "ref_logps/chosen": -90.68067932128906, + "ref_logps/rejected": -83.99259948730469, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 77.87808990478516, + "rewards/margins": 161.69863891601562, + "rewards/rejected": -83.82054901123047, + "step": 6500, + "u": -6.287791728973389, + "weight": 0.08131426572799683 + }, + { + "diff_generated": -80.39985656738281, + "epoch": 2.1095268956578095, + "grad_norm": 519.4806520608191, + "learning_rate": 1.9618095346258485e-07, + "logits/chosen": -2.369863986968994, + "logits/rejected": -2.460906505584717, + "logps/chosen": -12.442307472229004, + "logps/rejected": -161.36331176757812, + "loss": 13.0987, + "losses_ref": -1.0886478776228614e-06, + "ref_logps/chosen": -96.80855560302734, + "ref_logps/rejected": -80.96344757080078, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 84.36624908447266, + "rewards/margins": 164.76611328125, + "rewards/rejected": -80.39985656738281, + "step": 6510, + "u": -6.362314224243164, + "weight": 0.07500002533197403 + }, + { + "diff_generated": -85.63273620605469, + "epoch": 2.1127673363577446, + "grad_norm": 529.5689988573159, + "learning_rate": 1.948846810639871e-07, + "logits/chosen": -2.4165825843811035, + "logits/rejected": -2.5419535636901855, + "logps/chosen": -14.049939155578613, + "logps/rejected": -174.29861450195312, + "loss": 13.6717, + "losses_ref": -0.0036067564506083727, + "ref_logps/chosen": -99.15531921386719, + "ref_logps/rejected": -88.66587829589844, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 85.10539245605469, + "rewards/margins": 170.73812866210938, + "rewards/rejected": -85.63273620605469, + "step": 6520, + "u": -6.568295478820801, + "weight": 0.043914470821619034 + }, + { + "diff_generated": -84.3892822265625, + "epoch": 2.1160077770576797, + "grad_norm": 495.94061382304113, + "learning_rate": 1.9359132474116374e-07, + "logits/chosen": -2.4003686904907227, + "logits/rejected": -2.512028217315674, + "logps/chosen": -13.148529052734375, + "logps/rejected": -171.59902954101562, + "loss": 13.2403, + "losses_ref": -4.4650082031694183e-07, + "ref_logps/chosen": -96.95204162597656, + "ref_logps/rejected": -87.20973205566406, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 83.80351257324219, + "rewards/margins": 168.1927947998047, + "rewards/rejected": -84.3892822265625, + "step": 6530, + "u": -6.600459098815918, + "weight": 0.037500012665987015 + }, + { + "diff_generated": -83.33629608154297, + "epoch": 2.119248217757615, + "grad_norm": 502.5286805379201, + "learning_rate": 1.923009028814545e-07, + "logits/chosen": -2.3959062099456787, + "logits/rejected": -2.577664852142334, + "logps/chosen": -13.528100967407227, + "logps/rejected": -170.61611938476562, + "loss": 13.1637, + "losses_ref": -1.0654134712240193e-05, + "ref_logps/chosen": -93.49635314941406, + "ref_logps/rejected": -87.2798080444336, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 79.96824645996094, + "rewards/margins": 163.30455017089844, + "rewards/rejected": -83.33629608154297, + "step": 6540, + "u": -6.5688958168029785, + "weight": 0.04375031217932701 + }, + { + "diff_generated": -80.15025329589844, + "epoch": 2.1224886584575504, + "grad_norm": 510.76521545503005, + "learning_rate": 1.910134338304804e-07, + "logits/chosen": -2.3848767280578613, + "logits/rejected": -2.5160529613494873, + "logps/chosen": -13.322404861450195, + "logps/rejected": -164.24024963378906, + "loss": 12.8367, + "losses_ref": -0.000348121888237074, + "ref_logps/chosen": -96.18370056152344, + "ref_logps/rejected": -84.08997344970703, + "rewards/accuracies": 0.9375, + "rewards/chosen": 82.86128997802734, + "rewards/margins": 163.0115509033203, + "rewards/rejected": -80.15025329589844, + "step": 6550, + "u": -6.42775821685791, + "weight": 0.06251399964094162 + }, + { + "diff_generated": -84.13975524902344, + "epoch": 2.1257290991574855, + "grad_norm": 496.24500877500515, + "learning_rate": 1.897289358918834e-07, + "logits/chosen": -2.368081569671631, + "logits/rejected": -2.479598045349121, + "logps/chosen": -12.546297073364258, + "logps/rejected": -171.93328857421875, + "loss": 12.9928, + "losses_ref": -0.0011568386107683182, + "ref_logps/chosen": -97.453857421875, + "ref_logps/rejected": -87.79353332519531, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 84.90755462646484, + "rewards/margins": 169.0473175048828, + "rewards/rejected": -84.13975524902344, + "step": 6560, + "u": -6.587471008300781, + "weight": 0.03754870221018791 + }, + { + "diff_generated": -78.54136657714844, + "epoch": 2.1289695398574207, + "grad_norm": 533.887128493409, + "learning_rate": 1.8844742732706508e-07, + "logits/chosen": -2.360599994659424, + "logits/rejected": -2.4836483001708984, + "logps/chosen": -12.47716236114502, + "logps/rejected": -158.98513793945312, + "loss": 12.3918, + "losses_ref": -6.709530862281099e-05, + "ref_logps/chosen": -89.12882995605469, + "ref_logps/rejected": -80.44377136230469, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 76.65166473388672, + "rewards/margins": 155.1930389404297, + "rewards/rejected": -78.54136657714844, + "step": 6570, + "u": -6.196860313415527, + "weight": 0.10000260174274445 + }, + { + "diff_generated": -83.926025390625, + "epoch": 2.1322099805573558, + "grad_norm": 499.9450985494422, + "learning_rate": 1.8716892635492906e-07, + "logits/chosen": -2.401566982269287, + "logits/rejected": -2.545031785964966, + "logps/chosen": -12.370294570922852, + "logps/rejected": -174.70640563964844, + "loss": 12.6282, + "losses_ref": -3.838214013285324e-08, + "ref_logps/chosen": -96.84266662597656, + "ref_logps/rejected": -90.7803726196289, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 84.47237396240234, + "rewards/margins": 168.3983917236328, + "rewards/rejected": -83.926025390625, + "step": 6580, + "u": -6.578399658203125, + "weight": 0.03750000149011612 + }, + { + "diff_generated": -79.67466735839844, + "epoch": 2.135450421257291, + "grad_norm": 470.58095351712876, + "learning_rate": 1.8589345115161948e-07, + "logits/chosen": -2.4138946533203125, + "logits/rejected": -2.506059169769287, + "logps/chosen": -12.929830551147461, + "logps/rejected": -166.64724731445312, + "loss": 12.697, + "losses_ref": -9.55009671343987e-09, + "ref_logps/chosen": -96.51829528808594, + "ref_logps/rejected": -86.97258758544922, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 83.58845520019531, + "rewards/margins": 163.26312255859375, + "rewards/rejected": -79.67466735839844, + "step": 6590, + "u": -6.518712520599365, + "weight": 0.05624999850988388 + }, + { + "diff_generated": -84.69886016845703, + "epoch": 2.138690861957226, + "grad_norm": 460.42093157266436, + "learning_rate": 1.846210198502646e-07, + "logits/chosen": -2.4073596000671387, + "logits/rejected": -2.526094436645508, + "logps/chosen": -12.00372314453125, + "logps/rejected": -172.64437866210938, + "loss": 12.7592, + "losses_ref": -0.04379742592573166, + "ref_logps/chosen": -93.78791809082031, + "ref_logps/rejected": -87.94551086425781, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 81.78419494628906, + "rewards/margins": 166.48306274414062, + "rewards/rejected": -84.69886016845703, + "step": 6600, + "u": -6.497406005859375, + "weight": 0.05853024125099182 + }, + { + "diff_generated": -86.61919403076172, + "epoch": 2.141931302657161, + "grad_norm": 484.96256944654857, + "learning_rate": 1.8335165054071795e-07, + "logits/chosen": -2.371428966522217, + "logits/rejected": -2.5984182357788086, + "logps/chosen": -11.764432907104492, + "logps/rejected": -177.4099884033203, + "loss": 12.6912, + "losses_ref": -0.005594468675553799, + "ref_logps/chosen": -90.90178680419922, + "ref_logps/rejected": -90.79080963134766, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 79.13734436035156, + "rewards/margins": 165.75656127929688, + "rewards/rejected": -86.61919403076172, + "step": 6610, + "u": -6.5773138999938965, + "weight": 0.04402199015021324 + }, + { + "diff_generated": -83.58558654785156, + "epoch": 2.1451717433570967, + "grad_norm": 526.7006537364417, + "learning_rate": 1.8208536126930173e-07, + "logits/chosen": -2.4086477756500244, + "logits/rejected": -2.5347962379455566, + "logps/chosen": -13.11962890625, + "logps/rejected": -171.22561645507812, + "loss": 13.1348, + "losses_ref": -0.0004633056523744017, + "ref_logps/chosen": -95.87711334228516, + "ref_logps/rejected": -87.64002990722656, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 82.75748443603516, + "rewards/margins": 166.3430633544922, + "rewards/rejected": -83.58558654785156, + "step": 6620, + "u": -6.69122838973999, + "weight": 0.02501877211034298 + }, + { + "diff_generated": -81.69627380371094, + "epoch": 2.148412184057032, + "grad_norm": 500.3112282939001, + "learning_rate": 1.8082217003854933e-07, + "logits/chosen": -2.4026248455047607, + "logits/rejected": -2.5078542232513428, + "logps/chosen": -12.539911270141602, + "logps/rejected": -167.52072143554688, + "loss": 12.9024, + "losses_ref": -0.002466453704982996, + "ref_logps/chosen": -94.2463150024414, + "ref_logps/rejected": -85.824462890625, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 81.70640563964844, + "rewards/margins": 163.40267944335938, + "rewards/rejected": -81.69627380371094, + "step": 6630, + "u": -6.409876346588135, + "weight": 0.0688544362783432 + }, + { + "diff_generated": -80.04955291748047, + "epoch": 2.151652624756967, + "grad_norm": 497.75656122376796, + "learning_rate": 1.7956209480695087e-07, + "logits/chosen": -2.389005661010742, + "logits/rejected": -2.5154757499694824, + "logps/chosen": -13.171731948852539, + "logps/rejected": -161.18557739257812, + "loss": 12.8453, + "losses_ref": -0.007567479275166988, + "ref_logps/chosen": -93.38511657714844, + "ref_logps/rejected": -81.13603210449219, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 80.21339416503906, + "rewards/margins": 160.262939453125, + "rewards/rejected": -80.04955291748047, + "step": 6640, + "u": -6.542353630065918, + "weight": 0.03783651068806648 + }, + { + "diff_generated": -80.24171447753906, + "epoch": 2.154893065456902, + "grad_norm": 512.22661502845, + "learning_rate": 1.7830515348869664e-07, + "logits/chosen": -2.3576130867004395, + "logits/rejected": -2.528032064437866, + "logps/chosen": -12.79431438446045, + "logps/rejected": -163.2600555419922, + "loss": 13.1941, + "losses_ref": -7.352201691901428e-07, + "ref_logps/chosen": -88.53156280517578, + "ref_logps/rejected": -83.01834106445312, + "rewards/accuracies": 0.9375, + "rewards/chosen": 75.73725128173828, + "rewards/margins": 155.97897338867188, + "rewards/rejected": -80.24171447753906, + "step": 6650, + "u": -6.4445905685424805, + "weight": 0.06250002980232239 + }, + { + "diff_generated": -78.14437103271484, + "epoch": 2.158133506156837, + "grad_norm": 486.94817034851, + "learning_rate": 1.770513639534225e-07, + "logits/chosen": -2.3804211616516113, + "logits/rejected": -2.4835352897644043, + "logps/chosen": -12.82800006866455, + "logps/rejected": -158.52310180664062, + "loss": 12.9297, + "losses_ref": -1.1419600014050957e-05, + "ref_logps/chosen": -90.69715881347656, + "ref_logps/rejected": -80.37873840332031, + "rewards/accuracies": 0.90625, + "rewards/chosen": 77.86915588378906, + "rewards/margins": 156.01351928710938, + "rewards/rejected": -78.14437103271484, + "step": 6660, + "u": -6.209801197052002, + "weight": 0.09375027567148209 + }, + { + "diff_generated": -79.85459899902344, + "epoch": 2.1613739468567728, + "grad_norm": 504.888228932194, + "learning_rate": 1.7580074402595698e-07, + "logits/chosen": -2.3688759803771973, + "logits/rejected": -2.5014474391937256, + "logps/chosen": -11.610100746154785, + "logps/rejected": -164.87625122070312, + "loss": 12.3143, + "losses_ref": -4.962249704476562e-07, + "ref_logps/chosen": -90.61629486083984, + "ref_logps/rejected": -85.02165985107422, + "rewards/accuracies": 0.90625, + "rewards/chosen": 79.0062026977539, + "rewards/margins": 158.86080932617188, + "rewards/rejected": -79.85459899902344, + "step": 6670, + "u": -6.213131904602051, + "weight": 0.0937500149011612 + }, + { + "diff_generated": -84.20277404785156, + "epoch": 2.164614387556708, + "grad_norm": 459.58600259916545, + "learning_rate": 1.7455331148606618e-07, + "logits/chosen": -2.350407123565674, + "logits/rejected": -2.4642868041992188, + "logps/chosen": -12.381486892700195, + "logps/rejected": -170.39273071289062, + "loss": 12.3502, + "losses_ref": -0.001807486405596137, + "ref_logps/chosen": -92.08818817138672, + "ref_logps/rejected": -86.18995666503906, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 79.70670318603516, + "rewards/margins": 163.9094696044922, + "rewards/rejected": -84.20277404785156, + "step": 6680, + "u": -6.574119567871094, + "weight": 0.03757709637284279 + }, + { + "diff_generated": -83.0318374633789, + "epoch": 2.167854828256643, + "grad_norm": 494.0762548465317, + "learning_rate": 1.7330908406820237e-07, + "logits/chosen": -2.378242254257202, + "logits/rejected": -2.481672763824463, + "logps/chosen": -11.903432846069336, + "logps/rejected": -163.31051635742188, + "loss": 12.7796, + "losses_ref": -5.902011657177297e-10, + "ref_logps/chosen": -93.77474975585938, + "ref_logps/rejected": -80.27867126464844, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 81.87132263183594, + "rewards/margins": 164.90316772460938, + "rewards/rejected": -83.0318374633789, + "step": 6690, + "u": -6.509033203125, + "weight": 0.05000000074505806 + }, + { + "diff_generated": -81.65437316894531, + "epoch": 2.171095268956578, + "grad_norm": 502.7757106064382, + "learning_rate": 1.7206807946125123e-07, + "logits/chosen": -2.394428014755249, + "logits/rejected": -2.4850785732269287, + "logps/chosen": -13.18397045135498, + "logps/rejected": -170.88265991210938, + "loss": 13.5294, + "losses_ref": -0.00011690105020534247, + "ref_logps/chosen": -98.64112854003906, + "ref_logps/rejected": -89.22828674316406, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 85.45716857910156, + "rewards/margins": 167.11154174804688, + "rewards/rejected": -81.65437316894531, + "step": 6700, + "u": -6.533175468444824, + "weight": 0.05000308156013489 + }, + { + "diff_generated": -84.60969543457031, + "epoch": 2.1743357096565132, + "grad_norm": 493.7980281327751, + "learning_rate": 1.7083031530828072e-07, + "logits/chosen": -2.3933663368225098, + "logits/rejected": -2.4629433155059814, + "logps/chosen": -13.83466911315918, + "logps/rejected": -172.16818237304688, + "loss": 12.7764, + "losses_ref": -0.021861081942915916, + "ref_logps/chosen": -104.56904602050781, + "ref_logps/rejected": -87.55848693847656, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 90.734375, + "rewards/margins": 175.34405517578125, + "rewards/rejected": -84.60969543457031, + "step": 6710, + "u": -6.686763763427734, + "weight": 0.01909906603395939 + }, + { + "diff_generated": -83.56124114990234, + "epoch": 2.1775761503564484, + "grad_norm": 463.96518711732523, + "learning_rate": 1.6959580920628937e-07, + "logits/chosen": -2.364957332611084, + "logits/rejected": -2.4778125286102295, + "logps/chosen": -13.045331001281738, + "logps/rejected": -167.7306365966797, + "loss": 12.9641, + "losses_ref": -0.0005527561879716814, + "ref_logps/chosen": -95.67125701904297, + "ref_logps/rejected": -84.16940307617188, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 82.62592315673828, + "rewards/margins": 166.18714904785156, + "rewards/rejected": -83.56124114990234, + "step": 6720, + "u": -6.551631927490234, + "weight": 0.04377732053399086 + }, + { + "diff_generated": -78.09506225585938, + "epoch": 2.1808165910563835, + "grad_norm": 534.711775566111, + "learning_rate": 1.6836457870595783e-07, + "logits/chosen": -2.340938091278076, + "logits/rejected": -2.4108238220214844, + "logps/chosen": -12.503395080566406, + "logps/rejected": -156.5635986328125, + "loss": 12.6524, + "losses_ref": -0.006767577491700649, + "ref_logps/chosen": -91.59136962890625, + "ref_logps/rejected": -78.46855163574219, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 79.08796691894531, + "rewards/margins": 157.1830291748047, + "rewards/rejected": -78.09506225585938, + "step": 6730, + "u": -6.41765832901001, + "weight": 0.06905090808868408 + }, + { + "diff_generated": -83.96517181396484, + "epoch": 2.184057031756319, + "grad_norm": 531.0780111374985, + "learning_rate": 1.6713664131139723e-07, + "logits/chosen": -2.363882303237915, + "logits/rejected": -2.435429096221924, + "logps/chosen": -12.827771186828613, + "logps/rejected": -169.38075256347656, + "loss": 13.0853, + "losses_ref": -0.0035223353188484907, + "ref_logps/chosen": -98.59832763671875, + "ref_logps/rejected": -85.41558837890625, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 85.77055358886719, + "rewards/margins": 169.73573303222656, + "rewards/rejected": -83.96517181396484, + "step": 6740, + "u": -6.626309394836426, + "weight": 0.03765714913606644 + }, + { + "diff_generated": -83.90516662597656, + "epoch": 2.187297472456254, + "grad_norm": 570.9212004915166, + "learning_rate": 1.659120144799019e-07, + "logits/chosen": -2.4047927856445312, + "logits/rejected": -2.524055242538452, + "logps/chosen": -13.002700805664062, + "logps/rejected": -169.1326446533203, + "loss": 12.9903, + "losses_ref": -0.008439160883426666, + "ref_logps/chosen": -94.8218994140625, + "ref_logps/rejected": -85.22748565673828, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 81.81919860839844, + "rewards/margins": 165.72434997558594, + "rewards/rejected": -83.90516662597656, + "step": 6750, + "u": -6.468390464782715, + "weight": 0.056635864078998566 + }, + { + "diff_generated": -81.97956848144531, + "epoch": 2.1905379131561893, + "grad_norm": 479.3194514385537, + "learning_rate": 1.6469071562170114e-07, + "logits/chosen": -2.4284636974334717, + "logits/rejected": -2.5151870250701904, + "logps/chosen": -12.341389656066895, + "logps/rejected": -165.90318298339844, + "loss": 12.7468, + "losses_ref": -3.743227505736968e-09, + "ref_logps/chosen": -93.52154541015625, + "ref_logps/rejected": -83.92363739013672, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 81.1801528930664, + "rewards/margins": 163.15969848632812, + "rewards/rejected": -81.97956848144531, + "step": 6760, + "u": -6.347648620605469, + "weight": 0.06875000149011612 + }, + { + "diff_generated": -83.8197250366211, + "epoch": 2.1937783538561244, + "grad_norm": 503.7998354414286, + "learning_rate": 1.6347276209971024e-07, + "logits/chosen": -2.3805127143859863, + "logits/rejected": -2.5530269145965576, + "logps/chosen": -11.702221870422363, + "logps/rejected": -168.94638061523438, + "loss": 12.4645, + "losses_ref": -0.0016042323550209403, + "ref_logps/chosen": -90.09027099609375, + "ref_logps/rejected": -85.12665557861328, + "rewards/accuracies": 0.9375, + "rewards/chosen": 78.38804626464844, + "rewards/margins": 162.20777893066406, + "rewards/rejected": -83.8197250366211, + "step": 6770, + "u": -6.439352989196777, + "weight": 0.06256560236215591 + }, + { + "diff_generated": -80.71089935302734, + "epoch": 2.1970187945560595, + "grad_norm": 490.02924840371116, + "learning_rate": 1.6225817122928534e-07, + "logits/chosen": -2.3680498600006104, + "logits/rejected": -2.483076333999634, + "logps/chosen": -11.827688217163086, + "logps/rejected": -166.25616455078125, + "loss": 12.9312, + "losses_ref": -0.0018793217604979873, + "ref_logps/chosen": -91.48072814941406, + "ref_logps/rejected": -85.54527282714844, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 79.65303802490234, + "rewards/margins": 160.3639373779297, + "rewards/rejected": -80.71089935302734, + "step": 6780, + "u": -6.448507785797119, + "weight": 0.056329526007175446 + }, + { + "diff_generated": -82.11483001708984, + "epoch": 2.2002592352559946, + "grad_norm": 484.9852882132878, + "learning_rate": 1.6104696027797635e-07, + "logits/chosen": -2.302804470062256, + "logits/rejected": -2.517000675201416, + "logps/chosen": -11.84687328338623, + "logps/rejected": -165.51992797851562, + "loss": 12.6721, + "losses_ref": -1.3320375558123487e-07, + "ref_logps/chosen": -86.8465805053711, + "ref_logps/rejected": -83.40510559082031, + "rewards/accuracies": 0.9375, + "rewards/chosen": 74.99971771240234, + "rewards/margins": 157.1145477294922, + "rewards/rejected": -82.11483001708984, + "step": 6790, + "u": -6.432467460632324, + "weight": 0.0625 + }, + { + "diff_generated": -85.60330200195312, + "epoch": 2.20349967595593, + "grad_norm": 500.89423272331715, + "learning_rate": 1.5983914646528193e-07, + "logits/chosen": -2.3903326988220215, + "logits/rejected": -2.4877140522003174, + "logps/chosen": -13.389312744140625, + "logps/rejected": -171.62155151367188, + "loss": 12.7111, + "losses_ref": -1.1886468200827949e-05, + "ref_logps/chosen": -93.59146118164062, + "ref_logps/rejected": -86.01826477050781, + "rewards/accuracies": 0.9375, + "rewards/chosen": 80.20214080810547, + "rewards/margins": 165.80545043945312, + "rewards/rejected": -85.60330200195312, + "step": 6800, + "u": -6.437894344329834, + "weight": 0.06250022351741791 + }, + { + "diff_generated": -84.71478271484375, + "epoch": 2.2067401166558653, + "grad_norm": 473.07032489741954, + "learning_rate": 1.5863474696240365e-07, + "logits/chosen": -2.424415111541748, + "logits/rejected": -2.5038774013519287, + "logps/chosen": -13.811508178710938, + "logps/rejected": -169.12124633789062, + "loss": 13.1273, + "losses_ref": -0.000734654429834336, + "ref_logps/chosen": -96.21209716796875, + "ref_logps/rejected": -84.40647888183594, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 82.40059661865234, + "rewards/margins": 167.11537170410156, + "rewards/rejected": -84.71478271484375, + "step": 6810, + "u": -6.517383575439453, + "weight": 0.05002971366047859 + }, + { + "diff_generated": -81.48091125488281, + "epoch": 2.2099805573558005, + "grad_norm": 542.4283769774358, + "learning_rate": 1.5743377889200388e-07, + "logits/chosen": -2.375166654586792, + "logits/rejected": -2.5270323753356934, + "logps/chosen": -13.916537284851074, + "logps/rejected": -167.28854370117188, + "loss": 13.3413, + "losses_ref": -0.003760767402127385, + "ref_logps/chosen": -94.32147216796875, + "ref_logps/rejected": -85.80763244628906, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 80.40492248535156, + "rewards/margins": 161.88583374023438, + "rewards/rejected": -81.48091125488281, + "step": 6820, + "u": -6.543093204498291, + "weight": 0.05016561597585678 + }, + { + "diff_generated": -82.93003845214844, + "epoch": 2.2132209980557356, + "grad_norm": 511.37752565432453, + "learning_rate": 1.5623625932795994e-07, + "logits/chosen": -2.41325044631958, + "logits/rejected": -2.519089698791504, + "logps/chosen": -13.878247261047363, + "logps/rejected": -170.31137084960938, + "loss": 13.2477, + "losses_ref": -0.01453393418341875, + "ref_logps/chosen": -97.43132781982422, + "ref_logps/rejected": -87.38133239746094, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 83.55308532714844, + "rewards/margins": 166.48312377929688, + "rewards/rejected": -82.93003845214844, + "step": 6830, + "u": -6.694279670715332, + "weight": 0.02572527527809143 + }, + { + "diff_generated": -83.49250793457031, + "epoch": 2.2164614387556707, + "grad_norm": 526.8628217578116, + "learning_rate": 1.5504220529512324e-07, + "logits/chosen": -2.3933825492858887, + "logits/rejected": -2.5277512073516846, + "logps/chosen": -12.323593139648438, + "logps/rejected": -166.7972869873047, + "loss": 13.3676, + "losses_ref": -0.06453749537467957, + "ref_logps/chosen": -97.86778259277344, + "ref_logps/rejected": -83.30477905273438, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 85.544189453125, + "rewards/margins": 169.03668212890625, + "rewards/rejected": -83.49250793457031, + "step": 6840, + "u": -6.69271183013916, + "weight": 0.02855154313147068 + }, + { + "diff_generated": -81.74871063232422, + "epoch": 2.219701879455606, + "grad_norm": 501.04138572115914, + "learning_rate": 1.5385163376907636e-07, + "logits/chosen": -2.4174468517303467, + "logits/rejected": -2.477114200592041, + "logps/chosen": -13.157957077026367, + "logps/rejected": -164.51651000976562, + "loss": 12.6108, + "losses_ref": -0.0037777810357511044, + "ref_logps/chosen": -97.16854858398438, + "ref_logps/rejected": -82.76777648925781, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 84.0105972290039, + "rewards/margins": 165.75930786132812, + "rewards/rejected": -81.74871063232422, + "step": 6850, + "u": -6.4757208824157715, + "weight": 0.056429821997880936 + }, + { + "diff_generated": -78.98966979980469, + "epoch": 2.222942320155541, + "grad_norm": 504.75515042657014, + "learning_rate": 1.526645616758921e-07, + "logits/chosen": -2.3435864448547363, + "logits/rejected": -2.469954252243042, + "logps/chosen": -12.959577560424805, + "logps/rejected": -161.3673553466797, + "loss": 13.1083, + "losses_ref": -0.0011534191435202956, + "ref_logps/chosen": -90.16709899902344, + "ref_logps/rejected": -82.37767791748047, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 77.20752716064453, + "rewards/margins": 156.19717407226562, + "rewards/rejected": -78.98966979980469, + "step": 6860, + "u": -6.452353000640869, + "weight": 0.056289829313755035 + }, + { + "diff_generated": -86.8541030883789, + "epoch": 2.2261827608554765, + "grad_norm": 486.5098764500833, + "learning_rate": 1.5148100589189205e-07, + "logits/chosen": -2.4341390132904053, + "logits/rejected": -2.5227787494659424, + "logps/chosen": -13.576004028320312, + "logps/rejected": -172.723876953125, + "loss": 13.1079, + "losses_ref": -0.0012087022187188268, + "ref_logps/chosen": -97.78997039794922, + "ref_logps/rejected": -85.8697738647461, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 84.2139663696289, + "rewards/margins": 171.06808471679688, + "rewards/rejected": -86.8541030883789, + "step": 6870, + "u": -6.57666540145874, + "weight": 0.0375499427318573 + }, + { + "diff_generated": -87.36380004882812, + "epoch": 2.2294232015554116, + "grad_norm": 499.7333864772283, + "learning_rate": 1.5030098324340808e-07, + "logits/chosen": -2.41763973236084, + "logits/rejected": -2.5282013416290283, + "logps/chosen": -12.894952774047852, + "logps/rejected": -174.74172973632812, + "loss": 12.5028, + "losses_ref": -0.002254816237837076, + "ref_logps/chosen": -100.2215347290039, + "ref_logps/rejected": -87.3779296875, + "rewards/accuracies": 0.96875, + "rewards/chosen": 87.32657623291016, + "rewards/margins": 174.6903839111328, + "rewards/rejected": -87.36380004882812, + "step": 6880, + "u": -6.649659156799316, + "weight": 0.03135109692811966 + }, + { + "diff_generated": -85.03706359863281, + "epoch": 2.2326636422553467, + "grad_norm": 503.3857547944509, + "learning_rate": 1.491245105065419e-07, + "logits/chosen": -2.418447732925415, + "logits/rejected": -2.5524816513061523, + "logps/chosen": -12.30914306640625, + "logps/rejected": -173.20089721679688, + "loss": 12.9675, + "losses_ref": -0.005227755755186081, + "ref_logps/chosen": -95.12874603271484, + "ref_logps/rejected": -88.1638412475586, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 82.81959533691406, + "rewards/margins": 167.85665893554688, + "rewards/rejected": -85.03706359863281, + "step": 6890, + "u": -6.624598026275635, + "weight": 0.037730418145656586 + }, + { + "diff_generated": -86.80964660644531, + "epoch": 2.235904082955282, + "grad_norm": 518.1303914744179, + "learning_rate": 1.4795160440692672e-07, + "logits/chosen": -2.43729305267334, + "logits/rejected": -2.527352809906006, + "logps/chosen": -13.276708602905273, + "logps/rejected": -173.63804626464844, + "loss": 12.9862, + "losses_ref": -0.0007822831976227462, + "ref_logps/chosen": -99.302978515625, + "ref_logps/rejected": -86.82841491699219, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 86.02627563476562, + "rewards/margins": 172.83590698242188, + "rewards/rejected": -86.80964660644531, + "step": 6900, + "u": -6.6816582679748535, + "weight": 0.025032568722963333 + }, + { + "diff_generated": -84.83689880371094, + "epoch": 2.239144523655217, + "grad_norm": 491.41988866598723, + "learning_rate": 1.467822816194904e-07, + "logits/chosen": -2.4127767086029053, + "logits/rejected": -2.5224814414978027, + "logps/chosen": -12.989709854125977, + "logps/rejected": -169.14642333984375, + "loss": 12.7208, + "losses_ref": -0.005537012591958046, + "ref_logps/chosen": -96.77596282958984, + "ref_logps/rejected": -84.30952453613281, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 83.78624725341797, + "rewards/margins": 168.62315368652344, + "rewards/rejected": -84.83689880371094, + "step": 6910, + "u": -6.521003723144531, + "weight": 0.050242386758327484 + }, + { + "diff_generated": -85.5491943359375, + "epoch": 2.242384964355152, + "grad_norm": 550.1375066039961, + "learning_rate": 1.4561655876821694e-07, + "logits/chosen": -2.3261666297912598, + "logits/rejected": -2.512333631515503, + "logps/chosen": -12.838945388793945, + "logps/rejected": -173.94326782226562, + "loss": 12.9348, + "losses_ref": -0.0017390226712450385, + "ref_logps/chosen": -91.61780548095703, + "ref_logps/rejected": -88.39408111572266, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 78.77885437011719, + "rewards/margins": 164.3280487060547, + "rewards/rejected": -85.5491943359375, + "step": 6920, + "u": -6.529515266418457, + "weight": 0.05007760971784592 + }, + { + "diff_generated": -84.65425109863281, + "epoch": 2.2456254050550877, + "grad_norm": 524.7811803424514, + "learning_rate": 1.4445445242591138e-07, + "logits/chosen": -2.37631893157959, + "logits/rejected": -2.5157320499420166, + "logps/chosen": -12.7593412399292, + "logps/rejected": -170.8683319091797, + "loss": 13.0834, + "losses_ref": -2.6054085111582026e-08, + "ref_logps/chosen": -96.13145446777344, + "ref_logps/rejected": -86.21408081054688, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 83.37211608886719, + "rewards/margins": 168.0263671875, + "rewards/rejected": -84.65425109863281, + "step": 6930, + "u": -6.5352582931518555, + "weight": 0.05000000074505806 + }, + { + "diff_generated": -83.07829284667969, + "epoch": 2.248865845755023, + "grad_norm": 502.47717459419925, + "learning_rate": 1.4329597911396362e-07, + "logits/chosen": -2.4220199584960938, + "logits/rejected": -2.498976230621338, + "logps/chosen": -13.39250373840332, + "logps/rejected": -166.42909240722656, + "loss": 12.9699, + "losses_ref": -0.00041079233051277697, + "ref_logps/chosen": -100.69302368164062, + "ref_logps/rejected": -83.35079193115234, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 87.30052185058594, + "rewards/margins": 170.37881469726562, + "rewards/rejected": -83.07829284667969, + "step": 6940, + "u": -6.401181697845459, + "weight": 0.06876643002033234 + }, + { + "diff_generated": -84.72123718261719, + "epoch": 2.252106286454958, + "grad_norm": 516.1839062498832, + "learning_rate": 1.421411553021137e-07, + "logits/chosen": -2.440908193588257, + "logits/rejected": -2.558413028717041, + "logps/chosen": -14.195680618286133, + "logps/rejected": -170.11984252929688, + "loss": 12.8176, + "losses_ref": -1.3914039698192937e-07, + "ref_logps/chosen": -101.77497863769531, + "ref_logps/rejected": -85.39862060546875, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 87.57929992675781, + "rewards/margins": 172.30055236816406, + "rewards/rejected": -84.72123718261719, + "step": 6950, + "u": -6.594904899597168, + "weight": 0.03750000149011612 + }, + { + "diff_generated": -84.58776092529297, + "epoch": 2.255346727154893, + "grad_norm": 531.8777877793126, + "learning_rate": 1.4098999740821716e-07, + "logits/chosen": -2.356252908706665, + "logits/rejected": -2.452484369277954, + "logps/chosen": -11.869619369506836, + "logps/rejected": -170.38638305664062, + "loss": 12.8437, + "losses_ref": -8.181816156138666e-06, + "ref_logps/chosen": -94.5312728881836, + "ref_logps/rejected": -85.7986068725586, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 82.66166687011719, + "rewards/margins": 167.24942016601562, + "rewards/rejected": -84.58776092529297, + "step": 6960, + "u": -6.414976596832275, + "weight": 0.0687502771615982 + }, + { + "diff_generated": -84.35314178466797, + "epoch": 2.258587167854828, + "grad_norm": 509.9867754579174, + "learning_rate": 1.3984252179801277e-07, + "logits/chosen": -2.3887689113616943, + "logits/rejected": -2.5168299674987793, + "logps/chosen": -13.763440132141113, + "logps/rejected": -173.8634033203125, + "loss": 12.9378, + "losses_ref": -2.446452924687037e-07, + "ref_logps/chosen": -94.25096130371094, + "ref_logps/rejected": -89.51025390625, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 80.4875259399414, + "rewards/margins": 164.84066772460938, + "rewards/rejected": -84.35314178466797, + "step": 6970, + "u": -6.471514701843262, + "weight": 0.056250013411045074 + }, + { + "diff_generated": -81.13739013671875, + "epoch": 2.2618276085547633, + "grad_norm": 485.95806985386764, + "learning_rate": 1.3869874478488846e-07, + "logits/chosen": -2.355803966522217, + "logits/rejected": -2.518582582473755, + "logps/chosen": -12.210909843444824, + "logps/rejected": -163.37709045410156, + "loss": 12.4143, + "losses_ref": -4.4140466570752324e-07, + "ref_logps/chosen": -90.22859954833984, + "ref_logps/rejected": -82.23968505859375, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 78.01768493652344, + "rewards/margins": 159.1550750732422, + "rewards/rejected": -81.13739013671875, + "step": 6980, + "u": -6.590836524963379, + "weight": 0.043750010430812836 + }, + { + "diff_generated": -80.5860824584961, + "epoch": 2.2650680492546984, + "grad_norm": 510.0246999329795, + "learning_rate": 1.3755868262965047e-07, + "logits/chosen": -2.4310669898986816, + "logits/rejected": -2.4750888347625732, + "logps/chosen": -12.444555282592773, + "logps/rejected": -160.20654296875, + "loss": 12.9388, + "losses_ref": -0.011145448312163353, + "ref_logps/chosen": -96.44710540771484, + "ref_logps/rejected": -79.6204605102539, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 84.00254821777344, + "rewards/margins": 164.588623046875, + "rewards/rejected": -80.5860824584961, + "step": 6990, + "u": -6.405091285705566, + "weight": 0.06928651034832001 + }, + { + "diff_generated": -82.3926010131836, + "epoch": 2.268308489954634, + "grad_norm": 481.8003241790466, + "learning_rate": 1.3642235154029172e-07, + "logits/chosen": -2.41209077835083, + "logits/rejected": -2.4497084617614746, + "logps/chosen": -14.089653015136719, + "logps/rejected": -163.5379638671875, + "loss": 12.7251, + "losses_ref": -1.1824274537275414e-07, + "ref_logps/chosen": -100.86749267578125, + "ref_logps/rejected": -81.14535522460938, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 86.77783966064453, + "rewards/margins": 169.1704559326172, + "rewards/rejected": -82.3926010131836, + "step": 7000, + "u": -6.477536201477051, + "weight": 0.05624999850988388 + }, + { + "diff_generated": -83.03773498535156, + "epoch": 2.271548930654569, + "grad_norm": 498.2740157382577, + "learning_rate": 1.352897676717614e-07, + "logits/chosen": -2.4076733589172363, + "logits/rejected": -2.476492404937744, + "logps/chosen": -13.60411548614502, + "logps/rejected": -168.27487182617188, + "loss": 13.318, + "losses_ref": -1.2477959899115376e-05, + "ref_logps/chosen": -99.06245422363281, + "ref_logps/rejected": -85.23712158203125, + "rewards/accuracies": 0.96875, + "rewards/chosen": 85.45832824707031, + "rewards/margins": 168.49607849121094, + "rewards/rejected": -83.03773498535156, + "step": 7010, + "u": -6.662878513336182, + "weight": 0.031250398606061935 + }, + { + "diff_generated": -82.75672149658203, + "epoch": 2.274789371354504, + "grad_norm": 469.74278121718174, + "learning_rate": 1.341609471257354e-07, + "logits/chosen": -2.348039388656616, + "logits/rejected": -2.5116515159606934, + "logps/chosen": -12.133288383483887, + "logps/rejected": -166.936767578125, + "loss": 12.7259, + "losses_ref": -1.324751337961061e-06, + "ref_logps/chosen": -92.88429260253906, + "ref_logps/rejected": -84.18003845214844, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 80.75099182128906, + "rewards/margins": 163.50772094726562, + "rewards/rejected": -82.75672149658203, + "step": 7020, + "u": -6.534178733825684, + "weight": 0.043750010430812836 + }, + { + "diff_generated": -85.21153259277344, + "epoch": 2.2780298120544393, + "grad_norm": 460.40707525635656, + "learning_rate": 1.3303590595038735e-07, + "logits/chosen": -2.3632588386535645, + "logits/rejected": -2.5475070476531982, + "logps/chosen": -13.21232795715332, + "logps/rejected": -170.18663024902344, + "loss": 12.9074, + "losses_ref": -5.1966587477636494e-08, + "ref_logps/chosen": -93.5289306640625, + "ref_logps/rejected": -84.97509765625, + "rewards/accuracies": 0.96875, + "rewards/chosen": 80.31660461425781, + "rewards/margins": 165.5281524658203, + "rewards/rejected": -85.21153259277344, + "step": 7030, + "u": -6.636476993560791, + "weight": 0.03125 + }, + { + "diff_generated": -79.51531219482422, + "epoch": 2.2812702527543745, + "grad_norm": 498.87865447951674, + "learning_rate": 1.3191466014016049e-07, + "logits/chosen": -2.384385347366333, + "logits/rejected": -2.4455251693725586, + "logps/chosen": -11.143257141113281, + "logps/rejected": -163.07403564453125, + "loss": 12.4045, + "losses_ref": -6.7265777943248395e-06, + "ref_logps/chosen": -95.297607421875, + "ref_logps/rejected": -83.55873107910156, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 84.15434265136719, + "rewards/margins": 163.66966247558594, + "rewards/rejected": -79.51531219482422, + "step": 7040, + "u": -6.4666948318481445, + "weight": 0.05625021457672119 + }, + { + "diff_generated": -81.31959533691406, + "epoch": 2.28451069345431, + "grad_norm": 476.81821446420355, + "learning_rate": 1.3079722563553994e-07, + "logits/chosen": -2.4070966243743896, + "logits/rejected": -2.495264768600464, + "logps/chosen": -12.539602279663086, + "logps/rejected": -164.7382354736328, + "loss": 12.565, + "losses_ref": -0.006604082882404327, + "ref_logps/chosen": -93.5062255859375, + "ref_logps/rejected": -83.41862487792969, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 80.96662139892578, + "rewards/margins": 162.28622436523438, + "rewards/rejected": -81.31959533691406, + "step": 7050, + "u": -6.380990028381348, + "weight": 0.06906723976135254 + }, + { + "diff_generated": -84.39396667480469, + "epoch": 2.287751134154245, + "grad_norm": 499.95043530982576, + "learning_rate": 1.2968361832282705e-07, + "logits/chosen": -2.3694443702697754, + "logits/rejected": -2.499861717224121, + "logps/chosen": -12.672450065612793, + "logps/rejected": -170.8816680908203, + "loss": 12.9321, + "losses_ref": -2.2672212551810844e-08, + "ref_logps/chosen": -96.10008239746094, + "ref_logps/rejected": -86.48768615722656, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 83.42762756347656, + "rewards/margins": 167.8216094970703, + "rewards/rejected": -84.39396667480469, + "step": 7060, + "u": -6.547879695892334, + "weight": 0.05000000074505806 + }, + { + "diff_generated": -85.82933044433594, + "epoch": 2.2909915748541803, + "grad_norm": 498.6979087032496, + "learning_rate": 1.2857385403391226e-07, + "logits/chosen": -2.3565831184387207, + "logits/rejected": -2.492098093032837, + "logps/chosen": -13.709028244018555, + "logps/rejected": -174.83709716796875, + "loss": 13.3289, + "losses_ref": -0.007923029363155365, + "ref_logps/chosen": -96.8045883178711, + "ref_logps/rejected": -89.00776672363281, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 83.09556579589844, + "rewards/margins": 168.9248809814453, + "rewards/rejected": -85.82933044433594, + "step": 7070, + "u": -6.711148262023926, + "weight": 0.02536194957792759 + }, + { + "diff_generated": -81.84160614013672, + "epoch": 2.2942320155541154, + "grad_norm": 533.2120579610759, + "learning_rate": 1.274679485460509e-07, + "logits/chosen": -2.3976547718048096, + "logits/rejected": -2.453458309173584, + "logps/chosen": -13.225214004516602, + "logps/rejected": -165.89492797851562, + "loss": 12.7832, + "losses_ref": -3.6161025036562933e-06, + "ref_logps/chosen": -97.51805114746094, + "ref_logps/rejected": -84.05333709716797, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 84.29283905029297, + "rewards/margins": 166.13442993164062, + "rewards/rejected": -81.84160614013672, + "step": 7080, + "u": -6.306405544281006, + "weight": 0.0812501460313797 + }, + { + "diff_generated": -82.47283935546875, + "epoch": 2.2974724562540505, + "grad_norm": 472.59279168597607, + "learning_rate": 1.2636591758163868e-07, + "logits/chosen": -2.3648295402526855, + "logits/rejected": -2.542153835296631, + "logps/chosen": -12.415548324584961, + "logps/rejected": -167.7449188232422, + "loss": 12.7337, + "losses_ref": -0.0115945003926754, + "ref_logps/chosen": -92.28016662597656, + "ref_logps/rejected": -85.27208709716797, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 79.8646240234375, + "rewards/margins": 162.33746337890625, + "rewards/rejected": -82.47283935546875, + "step": 7090, + "u": -6.5920257568359375, + "weight": 0.03803582862019539 + }, + { + "diff_generated": -81.75079345703125, + "epoch": 2.3007128969539856, + "grad_norm": 505.3085147920675, + "learning_rate": 1.2526777680798813e-07, + "logits/chosen": -2.3778653144836426, + "logits/rejected": -2.5342462062835693, + "logps/chosen": -12.207781791687012, + "logps/rejected": -170.48773193359375, + "loss": 12.6916, + "losses_ref": -0.0039069210179150105, + "ref_logps/chosen": -91.88865661621094, + "ref_logps/rejected": -88.7369384765625, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 79.68087005615234, + "rewards/margins": 161.43167114257812, + "rewards/rejected": -81.75079345703125, + "step": 7100, + "u": -6.36777925491333, + "weight": 0.06892150640487671 + }, + { + "diff_generated": -85.78407287597656, + "epoch": 2.3039533376539207, + "grad_norm": 491.3990275948269, + "learning_rate": 1.241735418371057e-07, + "logits/chosen": -2.3440051078796387, + "logits/rejected": -2.523491144180298, + "logps/chosen": -12.945859909057617, + "logps/rejected": -174.6090850830078, + "loss": 12.7095, + "losses_ref": -0.0023649369832128286, + "ref_logps/chosen": -91.75056457519531, + "ref_logps/rejected": -88.82500457763672, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 78.80470275878906, + "rewards/margins": 164.58877563476562, + "rewards/rejected": -85.78407287597656, + "step": 7110, + "u": -6.561973571777344, + "weight": 0.03760456293821335 + }, + { + "diff_generated": -83.95856475830078, + "epoch": 2.3071937783538563, + "grad_norm": 533.5355848679819, + "learning_rate": 1.2308322822547027e-07, + "logits/chosen": -2.362577199935913, + "logits/rejected": -2.4960556030273438, + "logps/chosen": -12.309553146362305, + "logps/rejected": -167.9497833251953, + "loss": 12.4265, + "losses_ref": -5.8301971250784845e-08, + "ref_logps/chosen": -95.14002990722656, + "ref_logps/rejected": -83.99121856689453, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 82.83045959472656, + "rewards/margins": 166.78904724121094, + "rewards/rejected": -83.95856475830078, + "step": 7120, + "u": -6.475826263427734, + "weight": 0.05624999850988388 + }, + { + "diff_generated": -83.14198303222656, + "epoch": 2.3104342190537914, + "grad_norm": 499.945946875676, + "learning_rate": 1.2199685147381148e-07, + "logits/chosen": -2.4278907775878906, + "logits/rejected": -2.5294861793518066, + "logps/chosen": -13.416807174682617, + "logps/rejected": -169.05868530273438, + "loss": 12.9315, + "losses_ref": -0.00023093321942724288, + "ref_logps/chosen": -95.8389892578125, + "ref_logps/rejected": -85.91670227050781, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 82.42218780517578, + "rewards/margins": 165.56417846679688, + "rewards/rejected": -83.14198303222656, + "step": 7130, + "u": -6.518418788909912, + "weight": 0.050009287893772125 + }, + { + "diff_generated": -82.58621215820312, + "epoch": 2.3136746597537265, + "grad_norm": 500.1642698273991, + "learning_rate": 1.2091442702688933e-07, + "logits/chosen": -2.383411169052124, + "logits/rejected": -2.5127146244049072, + "logps/chosen": -12.741984367370605, + "logps/rejected": -169.25953674316406, + "loss": 13.1358, + "losses_ref": -0.0027023288421332836, + "ref_logps/chosen": -92.45108795166016, + "ref_logps/rejected": -86.67330932617188, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 79.70909118652344, + "rewards/margins": 162.29531860351562, + "rewards/rejected": -82.58621215820312, + "step": 7140, + "u": -6.45116662979126, + "weight": 0.05637402459979057 + }, + { + "diff_generated": -79.54682922363281, + "epoch": 2.3169151004536617, + "grad_norm": 527.4886173738923, + "learning_rate": 1.198359702732755e-07, + "logits/chosen": -2.3902125358581543, + "logits/rejected": -2.4888460636138916, + "logps/chosen": -13.180867195129395, + "logps/rejected": -162.79171752929688, + "loss": 12.8312, + "losses_ref": -0.0008114447700791061, + "ref_logps/chosen": -95.47314453125, + "ref_logps/rejected": -83.24490356445312, + "rewards/accuracies": 0.9375, + "rewards/chosen": 82.29228210449219, + "rewards/margins": 161.839111328125, + "rewards/rejected": -79.54682922363281, + "step": 7150, + "u": -6.421548366546631, + "weight": 0.06253501027822495 + }, + { + "diff_generated": -83.003662109375, + "epoch": 2.320155541153597, + "grad_norm": 504.246990999001, + "learning_rate": 1.1876149654513321e-07, + "logits/chosen": -2.3551647663116455, + "logits/rejected": -2.5079431533813477, + "logps/chosen": -13.358386039733887, + "logps/rejected": -171.64193725585938, + "loss": 13.2005, + "losses_ref": -0.00260176626034081, + "ref_logps/chosen": -94.94227600097656, + "ref_logps/rejected": -88.63829803466797, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 81.58389282226562, + "rewards/margins": 164.58755493164062, + "rewards/rejected": -83.003662109375, + "step": 7160, + "u": -6.538697719573975, + "weight": 0.05011763423681259 + }, + { + "diff_generated": -83.80158996582031, + "epoch": 2.323395981853532, + "grad_norm": 476.0883198202353, + "learning_rate": 1.1769102111800036e-07, + "logits/chosen": -2.3814778327941895, + "logits/rejected": -2.5466561317443848, + "logps/chosen": -13.39539909362793, + "logps/rejected": -170.6013946533203, + "loss": 12.7603, + "losses_ref": -0.007953451946377754, + "ref_logps/chosen": -96.72367858886719, + "ref_logps/rejected": -86.7998046875, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 83.3282699584961, + "rewards/margins": 167.12985229492188, + "rewards/rejected": -83.80158996582031, + "step": 7170, + "u": -6.736584663391113, + "weight": 0.019122375175356865 + }, + { + "diff_generated": -80.74140930175781, + "epoch": 2.3266364225534675, + "grad_norm": 503.6890165150702, + "learning_rate": 1.166245592105719e-07, + "logits/chosen": -2.3559749126434326, + "logits/rejected": -2.4508726596832275, + "logps/chosen": -13.094064712524414, + "logps/rejected": -160.50881958007812, + "loss": 12.853, + "losses_ref": -3.550645999439439e-07, + "ref_logps/chosen": -96.27635955810547, + "ref_logps/rejected": -79.76742553710938, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 83.18229675292969, + "rewards/margins": 163.9237060546875, + "rewards/rejected": -80.74140930175781, + "step": 7180, + "u": -6.578315734863281, + "weight": 0.04375000670552254 + }, + { + "diff_generated": -83.57976531982422, + "epoch": 2.3298768632534026, + "grad_norm": 506.1377150659979, + "learning_rate": 1.1556212598448349e-07, + "logits/chosen": -2.3926408290863037, + "logits/rejected": -2.502164840698242, + "logps/chosen": -14.994264602661133, + "logps/rejected": -171.16343688964844, + "loss": 13.0739, + "losses_ref": -4.4104973540015635e-07, + "ref_logps/chosen": -98.24559020996094, + "ref_logps/rejected": -87.58365631103516, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 83.2513198852539, + "rewards/margins": 166.83108520507812, + "rewards/rejected": -83.57976531982422, + "step": 7190, + "u": -6.625774383544922, + "weight": 0.037500012665987015 + }, + { + "diff_generated": -87.09251403808594, + "epoch": 2.3331173039533377, + "grad_norm": 532.4466559165295, + "learning_rate": 1.1450373654409591e-07, + "logits/chosen": -2.3863420486450195, + "logits/rejected": -2.56119966506958, + "logps/chosen": -12.581506729125977, + "logps/rejected": -175.84518432617188, + "loss": 13.0843, + "losses_ref": -0.003380722599104047, + "ref_logps/chosen": -93.87139892578125, + "ref_logps/rejected": -88.75267028808594, + "rewards/accuracies": 0.9375, + "rewards/chosen": 81.28990173339844, + "rewards/margins": 168.3824005126953, + "rewards/rejected": -87.09251403808594, + "step": 7200, + "u": -6.45444393157959, + "weight": 0.06264631450176239 + }, + { + "diff_generated": -79.31378936767578, + "epoch": 2.336357744653273, + "grad_norm": 492.9742860836095, + "learning_rate": 1.1344940593628063e-07, + "logits/chosen": -2.340789318084717, + "logits/rejected": -2.4311439990997314, + "logps/chosen": -12.333671569824219, + "logps/rejected": -159.327880859375, + "loss": 12.8581, + "losses_ref": -0.008099230006337166, + "ref_logps/chosen": -94.08828735351562, + "ref_logps/rejected": -80.01409149169922, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 81.7546157836914, + "rewards/margins": 161.0684051513672, + "rewards/rejected": -79.31378936767578, + "step": 7210, + "u": -6.362817764282227, + "weight": 0.07535295188426971 + }, + { + "diff_generated": -82.0297622680664, + "epoch": 2.339598185353208, + "grad_norm": 499.32682808368605, + "learning_rate": 1.1239914915020512e-07, + "logits/chosen": -2.3604369163513184, + "logits/rejected": -2.5125110149383545, + "logps/chosen": -12.218565940856934, + "logps/rejected": -167.6261749267578, + "loss": 13.0646, + "losses_ref": -0.0029511586762964725, + "ref_logps/chosen": -91.50608825683594, + "ref_logps/rejected": -85.59638977050781, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 79.28752136230469, + "rewards/margins": 161.31729125976562, + "rewards/rejected": -82.0297622680664, + "step": 7220, + "u": -6.343608379364014, + "weight": 0.08137090504169464 + }, + { + "diff_generated": -83.73043823242188, + "epoch": 2.342838626053143, + "grad_norm": 490.66335871155917, + "learning_rate": 1.1135298111712122e-07, + "logits/chosen": -2.356489896774292, + "logits/rejected": -2.4991321563720703, + "logps/chosen": -12.847094535827637, + "logps/rejected": -168.40830993652344, + "loss": 12.8496, + "losses_ref": -1.1878989347735569e-08, + "ref_logps/chosen": -94.01008605957031, + "ref_logps/rejected": -84.67787170410156, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 81.1629867553711, + "rewards/margins": 164.89340209960938, + "rewards/rejected": -83.73043823242188, + "step": 7230, + "u": -6.560157775878906, + "weight": 0.05000000074505806 + }, + { + "diff_generated": -81.16157531738281, + "epoch": 2.346079066753078, + "grad_norm": 494.03668382681167, + "learning_rate": 1.1031091671015094e-07, + "logits/chosen": -2.342160940170288, + "logits/rejected": -2.462944507598877, + "logps/chosen": -11.229791641235352, + "logps/rejected": -164.2108917236328, + "loss": 12.7249, + "losses_ref": -0.007652191910892725, + "ref_logps/chosen": -91.34001922607422, + "ref_logps/rejected": -83.0493392944336, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 80.1102294921875, + "rewards/margins": 161.27178955078125, + "rewards/rejected": -81.16157531738281, + "step": 7240, + "u": -6.273160934448242, + "weight": 0.08783890306949615 + }, + { + "diff_generated": -86.33053588867188, + "epoch": 2.3493195074530138, + "grad_norm": 519.6081817427561, + "learning_rate": 1.0927297074407662e-07, + "logits/chosen": -2.3760571479797363, + "logits/rejected": -2.5106639862060547, + "logps/chosen": -12.724100112915039, + "logps/rejected": -176.9182891845703, + "loss": 12.4812, + "losses_ref": -1.339400341748842e-06, + "ref_logps/chosen": -94.36858367919922, + "ref_logps/rejected": -90.58776092529297, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 81.64447784423828, + "rewards/margins": 167.97500610351562, + "rewards/rejected": -86.33053588867188, + "step": 7250, + "u": -6.500415802001953, + "weight": 0.050000034272670746 + }, + { + "diff_generated": -85.05168151855469, + "epoch": 2.352559948152949, + "grad_norm": 506.91432397616205, + "learning_rate": 1.0823915797512952e-07, + "logits/chosen": -2.3924827575683594, + "logits/rejected": -2.5272819995880127, + "logps/chosen": -11.652626991271973, + "logps/rejected": -168.64675903320312, + "loss": 12.5285, + "losses_ref": -0.001545862527564168, + "ref_logps/chosen": -94.11674499511719, + "ref_logps/rejected": -83.59507751464844, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 82.46412658691406, + "rewards/margins": 167.5157928466797, + "rewards/rejected": -85.05168151855469, + "step": 7260, + "u": -6.477601051330566, + "weight": 0.05631608888506889 + }, + { + "diff_generated": -86.03199768066406, + "epoch": 2.355800388852884, + "grad_norm": 487.9596910661287, + "learning_rate": 1.0720949310078032e-07, + "logits/chosen": -2.362509250640869, + "logits/rejected": -2.5025877952575684, + "logps/chosen": -12.802087783813477, + "logps/rejected": -173.5316925048828, + "loss": 12.7406, + "losses_ref": -5.450115025951163e-08, + "ref_logps/chosen": -96.65104675292969, + "ref_logps/rejected": -87.49970245361328, + "rewards/accuracies": 0.96875, + "rewards/chosen": 83.84896087646484, + "rewards/margins": 169.88095092773438, + "rewards/rejected": -86.03199768066406, + "step": 7270, + "u": -6.663240909576416, + "weight": 0.03125 + }, + { + "diff_generated": -85.66532897949219, + "epoch": 2.359040829552819, + "grad_norm": 529.4654595969122, + "learning_rate": 1.0618399075952993e-07, + "logits/chosen": -2.411695957183838, + "logits/rejected": -2.5174262523651123, + "logps/chosen": -12.513715744018555, + "logps/rejected": -174.34246826171875, + "loss": 12.7701, + "losses_ref": -0.001065053860656917, + "ref_logps/chosen": -97.73978424072266, + "ref_logps/rejected": -88.6771469116211, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 85.22607421875, + "rewards/margins": 170.89138793945312, + "rewards/rejected": -85.66532897949219, + "step": 7280, + "u": -6.444096565246582, + "weight": 0.056295327842235565 + }, + { + "diff_generated": -83.40331268310547, + "epoch": 2.3622812702527543, + "grad_norm": 486.96056947331795, + "learning_rate": 1.0516266553070159e-07, + "logits/chosen": -2.3514552116394043, + "logits/rejected": -2.4933013916015625, + "logps/chosen": -13.503857612609863, + "logps/rejected": -168.85653686523438, + "loss": 12.5846, + "losses_ref": -4.49640893407377e-08, + "ref_logps/chosen": -94.25113677978516, + "ref_logps/rejected": -85.4532241821289, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 80.74726867675781, + "rewards/margins": 164.1505889892578, + "rewards/rejected": -83.40331268310547, + "step": 7290, + "u": -6.5332183837890625, + "weight": 0.05000000074505806 + }, + { + "diff_generated": -85.90556335449219, + "epoch": 2.3655217109526894, + "grad_norm": 512.973851912908, + "learning_rate": 1.041455319342336e-07, + "logits/chosen": -2.3956139087677, + "logits/rejected": -2.5182456970214844, + "logps/chosen": -12.487277030944824, + "logps/rejected": -172.7784423828125, + "loss": 12.8994, + "losses_ref": -4.39100585936103e-06, + "ref_logps/chosen": -101.12513732910156, + "ref_logps/rejected": -86.8729019165039, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 88.6378402709961, + "rewards/margins": 174.5434112548828, + "rewards/rejected": -85.90556335449219, + "step": 7300, + "u": -6.607975959777832, + "weight": 0.03750017657876015 + }, + { + "diff_generated": -84.39129638671875, + "epoch": 2.368762151652625, + "grad_norm": 500.970037385123, + "learning_rate": 1.0313260443047247e-07, + "logits/chosen": -2.295703887939453, + "logits/rejected": -2.526883125305176, + "logps/chosen": -11.327601432800293, + "logps/rejected": -168.99575805664062, + "loss": 13.0578, + "losses_ref": -0.004974209703505039, + "ref_logps/chosen": -88.77967834472656, + "ref_logps/rejected": -84.60444641113281, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 77.45207214355469, + "rewards/margins": 161.8433837890625, + "rewards/rejected": -84.39129638671875, + "step": 7310, + "u": -6.404440879821777, + "weight": 0.06897146999835968 + }, + { + "diff_generated": -85.76993560791016, + "epoch": 2.37200259235256, + "grad_norm": 489.33374283528923, + "learning_rate": 1.0212389741996834e-07, + "logits/chosen": -2.4081289768218994, + "logits/rejected": -2.583893299102783, + "logps/chosen": -12.362791061401367, + "logps/rejected": -174.359130859375, + "loss": 12.7088, + "losses_ref": -0.003301215823739767, + "ref_logps/chosen": -94.69625091552734, + "ref_logps/rejected": -88.58919525146484, + "rewards/accuracies": 0.9375, + "rewards/chosen": 82.33345794677734, + "rewards/margins": 168.1033935546875, + "rewards/rejected": -85.76993560791016, + "step": 7320, + "u": -6.429391384124756, + "weight": 0.06264026463031769 + }, + { + "diff_generated": -87.53488159179688, + "epoch": 2.375243033052495, + "grad_norm": 465.1530214753127, + "learning_rate": 1.0111942524326891e-07, + "logits/chosen": -2.370417356491089, + "logits/rejected": -2.5388333797454834, + "logps/chosen": -12.295100212097168, + "logps/rejected": -178.72235107421875, + "loss": 12.6474, + "losses_ref": -1.535337673885806e-06, + "ref_logps/chosen": -93.35027313232422, + "ref_logps/rejected": -91.18746185302734, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 81.05517578125, + "rewards/margins": 168.59005737304688, + "rewards/rejected": -87.53488159179688, + "step": 7330, + "u": -6.5652756690979, + "weight": 0.04375005513429642 + }, + { + "diff_generated": -85.3342514038086, + "epoch": 2.3784834737524303, + "grad_norm": 496.91622123501764, + "learning_rate": 1.0011920218071664e-07, + "logits/chosen": -2.452415704727173, + "logits/rejected": -2.5386643409729004, + "logps/chosen": -12.925765991210938, + "logps/rejected": -175.0770721435547, + "loss": 12.6224, + "losses_ref": -8.268243618658744e-06, + "ref_logps/chosen": -95.21087646484375, + "ref_logps/rejected": -89.7428207397461, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 82.28511810302734, + "rewards/margins": 167.61936950683594, + "rewards/rejected": -85.3342514038086, + "step": 7340, + "u": -6.49071741104126, + "weight": 0.0562501959502697 + }, + { + "diff_generated": -90.63404846191406, + "epoch": 2.3817239144523654, + "grad_norm": 488.04327837680876, + "learning_rate": 9.912324245224524e-08, + "logits/chosen": -2.4498038291931152, + "logits/rejected": -2.5991556644439697, + "logps/chosen": -13.388803482055664, + "logps/rejected": -182.3604278564453, + "loss": 12.4083, + "losses_ref": -0.00018008516053669155, + "ref_logps/chosen": -98.12663269042969, + "ref_logps/rejected": -91.72637939453125, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 84.73783111572266, + "rewards/margins": 175.3718719482422, + "rewards/rejected": -90.63404846191406, + "step": 7350, + "u": -6.733044624328613, + "weight": 0.012507098726928234 + }, + { + "diff_generated": -81.8043441772461, + "epoch": 2.3849643551523005, + "grad_norm": 522.1737155431391, + "learning_rate": 9.813156021717763e-08, + "logits/chosen": -2.3892436027526855, + "logits/rejected": -2.4666178226470947, + "logps/chosen": -12.565677642822266, + "logps/rejected": -163.40260314941406, + "loss": 12.871, + "losses_ref": -0.00020811586000490934, + "ref_logps/chosen": -95.48179626464844, + "ref_logps/rejected": -81.59825134277344, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 82.91612243652344, + "rewards/margins": 164.72047424316406, + "rewards/rejected": -81.8043441772461, + "step": 7360, + "u": -6.4513349533081055, + "weight": 0.056258104741573334 + }, + { + "diff_generated": -86.375, + "epoch": 2.3882047958522357, + "grad_norm": 490.9721252761537, + "learning_rate": 9.714416957402468e-08, + "logits/chosen": -2.383329391479492, + "logits/rejected": -2.558297634124756, + "logps/chosen": -12.554519653320312, + "logps/rejected": -174.5645751953125, + "loss": 12.7989, + "losses_ref": -0.00037836996489204466, + "ref_logps/chosen": -92.61976623535156, + "ref_logps/rejected": -88.18955993652344, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 80.06523895263672, + "rewards/margins": 166.44024658203125, + "rewards/rejected": -86.375, + "step": 7370, + "u": -6.5053887367248535, + "weight": 0.05626590922474861 + }, + { + "diff_generated": -89.63764953613281, + "epoch": 2.3914452365521712, + "grad_norm": 498.76182579058303, + "learning_rate": 9.616108456028462e-08, + "logits/chosen": -2.383967399597168, + "logits/rejected": -2.5193114280700684, + "logps/chosen": -12.145334243774414, + "logps/rejected": -180.26341247558594, + "loss": 12.7477, + "losses_ref": -1.3903306808060734e-06, + "ref_logps/chosen": -91.63188171386719, + "ref_logps/rejected": -90.6257553100586, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 79.48655700683594, + "rewards/margins": 169.1241912841797, + "rewards/rejected": -89.63764953613281, + "step": 7380, + "u": -6.723959445953369, + "weight": 0.02500004507601261 + }, + { + "diff_generated": -86.11238098144531, + "epoch": 2.3946856772521063, + "grad_norm": 494.50793530286535, + "learning_rate": 9.518231915224371e-08, + "logits/chosen": -2.386399030685425, + "logits/rejected": -2.483867645263672, + "logps/chosen": -12.247283935546875, + "logps/rejected": -176.57333374023438, + "loss": 12.4872, + "losses_ref": -0.0006079341983422637, + "ref_logps/chosen": -97.18465423583984, + "ref_logps/rejected": -90.4609375, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 84.93736267089844, + "rewards/margins": 171.0497589111328, + "rewards/rejected": -86.11238098144531, + "step": 7390, + "u": -6.583271026611328, + "weight": 0.04377469792962074 + }, + { + "diff_generated": -87.58899688720703, + "epoch": 2.3979261179520415, + "grad_norm": 482.55961854403847, + "learning_rate": 9.4207887264777e-08, + "logits/chosen": -2.32859468460083, + "logits/rejected": -2.561094284057617, + "logps/chosen": -11.497884750366211, + "logps/rejected": -175.28054809570312, + "loss": 12.848, + "losses_ref": -0.0022828192450106144, + "ref_logps/chosen": -88.45655822753906, + "ref_logps/rejected": -87.69156646728516, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 76.95866394042969, + "rewards/margins": 164.5476531982422, + "rewards/rejected": -87.58899688720703, + "step": 7400, + "u": -6.567407131195068, + "weight": 0.04385367035865784 + }, + { + "diff_generated": -86.6302261352539, + "epoch": 2.4011665586519766, + "grad_norm": 472.6398812524397, + "learning_rate": 9.323780275115156e-08, + "logits/chosen": -2.3910207748413086, + "logits/rejected": -2.5345606803894043, + "logps/chosen": -12.565729141235352, + "logps/rejected": -172.0085906982422, + "loss": 12.8935, + "losses_ref": -0.003717700717970729, + "ref_logps/chosen": -96.87049102783203, + "ref_logps/rejected": -85.37837982177734, + "rewards/accuracies": 0.96875, + "rewards/chosen": 84.30476379394531, + "rewards/margins": 170.9349822998047, + "rewards/rejected": -86.6302261352539, + "step": 7410, + "u": -6.673760414123535, + "weight": 0.03141096979379654 + }, + { + "diff_generated": -81.17831420898438, + "epoch": 2.4044069993519117, + "grad_norm": 460.87154964636005, + "learning_rate": 9.22720794028283e-08, + "logits/chosen": -2.4026007652282715, + "logits/rejected": -2.4874207973480225, + "logps/chosen": -12.965787887573242, + "logps/rejected": -165.58517456054688, + "loss": 13.0799, + "losses_ref": -1.249319012686101e-07, + "ref_logps/chosen": -96.84405517578125, + "ref_logps/rejected": -84.4068603515625, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 83.87825775146484, + "rewards/margins": 165.0565643310547, + "rewards/rejected": -81.17831420898438, + "step": 7420, + "u": -6.408229827880859, + "weight": 0.06875000149011612 + }, + { + "diff_generated": -83.51194763183594, + "epoch": 2.4076474400518473, + "grad_norm": 744.9085311548929, + "learning_rate": 9.13107309492668e-08, + "logits/chosen": -2.3378539085388184, + "logits/rejected": -2.490166187286377, + "logps/chosen": -12.576040267944336, + "logps/rejected": -166.04745483398438, + "loss": 12.4652, + "losses_ref": -0.0005707393283955753, + "ref_logps/chosen": -94.02613830566406, + "ref_logps/rejected": -82.5354995727539, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 81.45008850097656, + "rewards/margins": 164.9620361328125, + "rewards/rejected": -83.51194763183594, + "step": 7430, + "u": -6.59079122543335, + "weight": 0.04377346485853195 + }, + { + "diff_generated": -80.52021789550781, + "epoch": 2.4108878807517824, + "grad_norm": 510.8672346234607, + "learning_rate": 9.035377105772966e-08, + "logits/chosen": -2.3841404914855957, + "logits/rejected": -2.4778783321380615, + "logps/chosen": -12.856233596801758, + "logps/rejected": -162.56240844726562, + "loss": 13.0052, + "losses_ref": -0.0017416516784578562, + "ref_logps/chosen": -93.31913757324219, + "ref_logps/rejected": -82.04218292236328, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 80.46290588378906, + "rewards/margins": 160.98312377929688, + "rewards/rejected": -80.52021789550781, + "step": 7440, + "u": -6.387154579162598, + "weight": 0.06882445514202118 + }, + { + "diff_generated": -85.48727416992188, + "epoch": 2.4141283214517175, + "grad_norm": 515.1576877546976, + "learning_rate": 8.940121333308849e-08, + "logits/chosen": -2.3297321796417236, + "logits/rejected": -2.5619239807128906, + "logps/chosen": -11.624930381774902, + "logps/rejected": -170.84661865234375, + "loss": 12.8739, + "losses_ref": -1.706665557321685e-06, + "ref_logps/chosen": -85.62763977050781, + "ref_logps/rejected": -85.35934448242188, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 74.00270080566406, + "rewards/margins": 159.48997497558594, + "rewards/rejected": -85.48727416992188, + "step": 7450, + "u": -6.44665002822876, + "weight": 0.05625002458691597 + }, + { + "diff_generated": -87.2186279296875, + "epoch": 2.4173687621516526, + "grad_norm": 464.9770061734808, + "learning_rate": 8.845307131762991e-08, + "logits/chosen": -2.39561128616333, + "logits/rejected": -2.5099291801452637, + "logps/chosen": -13.754659652709961, + "logps/rejected": -172.62655639648438, + "loss": 12.8239, + "losses_ref": -0.0005621786694973707, + "ref_logps/chosen": -99.22911071777344, + "ref_logps/rejected": -85.40792083740234, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 85.47445678710938, + "rewards/margins": 172.69308471679688, + "rewards/rejected": -87.2186279296875, + "step": 7460, + "u": -6.46007776260376, + "weight": 0.05627412348985672 + }, + { + "diff_generated": -87.04798126220703, + "epoch": 2.4206092028515878, + "grad_norm": 480.5447502967754, + "learning_rate": 8.750935849086424e-08, + "logits/chosen": -2.4041030406951904, + "logits/rejected": -2.5212578773498535, + "logps/chosen": -13.613739013671875, + "logps/rejected": -176.61572265625, + "loss": 12.7459, + "losses_ref": -0.005251473747193813, + "ref_logps/chosen": -102.5396957397461, + "ref_logps/rejected": -89.56773376464844, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 88.92596435546875, + "rewards/margins": 175.97393798828125, + "rewards/rejected": -87.04798126220703, + "step": 7470, + "u": -6.651014804840088, + "weight": 0.025226224213838577 + }, + { + "diff_generated": -86.72041320800781, + "epoch": 2.423849643551523, + "grad_norm": 522.3173239436194, + "learning_rate": 8.657008826933223e-08, + "logits/chosen": -2.357637643814087, + "logits/rejected": -2.539961814880371, + "logps/chosen": -12.631352424621582, + "logps/rejected": -175.08822631835938, + "loss": 13.1405, + "losses_ref": -3.868344933266599e-08, + "ref_logps/chosen": -95.20268249511719, + "ref_logps/rejected": -88.36781311035156, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 82.57133483886719, + "rewards/margins": 169.29177856445312, + "rewards/rejected": -86.72041320800781, + "step": 7480, + "u": -6.551732540130615, + "weight": 0.04374999925494194 + }, + { + "diff_generated": -86.01445770263672, + "epoch": 2.427090084251458, + "grad_norm": 466.80659358478727, + "learning_rate": 8.563527400641559e-08, + "logits/chosen": -2.383613109588623, + "logits/rejected": -2.5712387561798096, + "logps/chosen": -12.490646362304688, + "logps/rejected": -173.9608917236328, + "loss": 12.3431, + "losses_ref": -0.00016282778233289719, + "ref_logps/chosen": -90.7940673828125, + "ref_logps/rejected": -87.94645690917969, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 78.30342102050781, + "rewards/margins": 164.31788635253906, + "rewards/rejected": -86.01445770263672, + "step": 7490, + "u": -6.375924587249756, + "weight": 0.07500634342432022 + }, + { + "diff_generated": -81.46131896972656, + "epoch": 2.4303305249513936, + "grad_norm": 490.9905956290775, + "learning_rate": 8.470492899214696e-08, + "logits/chosen": -2.35518217086792, + "logits/rejected": -2.424717426300049, + "logps/chosen": -12.561359405517578, + "logps/rejected": -160.87351989746094, + "loss": 12.4984, + "losses_ref": -0.001344609772786498, + "ref_logps/chosen": -91.66764068603516, + "ref_logps/rejected": -79.4122085571289, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 79.10627746582031, + "rewards/margins": 160.56759643554688, + "rewards/rejected": -81.46131896972656, + "step": 7500, + "u": -6.370742321014404, + "weight": 0.0688091292977333 + }, + { + "diff_generated": -87.88899993896484, + "epoch": 2.4335709656513287, + "grad_norm": 496.72652571908117, + "learning_rate": 8.377906645302015e-08, + "logits/chosen": -2.3465425968170166, + "logits/rejected": -2.5052857398986816, + "logps/chosen": -13.459360122680664, + "logps/rejected": -179.36007690429688, + "loss": 12.85, + "losses_ref": -0.0046943118795752525, + "ref_logps/chosen": -94.39802551269531, + "ref_logps/rejected": -91.47105407714844, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 80.93867492675781, + "rewards/margins": 168.82766723632812, + "rewards/rejected": -87.88899993896484, + "step": 7510, + "u": -6.632806301116943, + "weight": 0.037701454013586044 + }, + { + "diff_generated": -81.4451675415039, + "epoch": 2.436811406351264, + "grad_norm": 478.270473192927, + "learning_rate": 8.28576995518031e-08, + "logits/chosen": -2.3640151023864746, + "logits/rejected": -2.4249727725982666, + "logps/chosen": -13.7388277053833, + "logps/rejected": -164.45909118652344, + "loss": 12.9501, + "losses_ref": -0.006367249879986048, + "ref_logps/chosen": -98.01344299316406, + "ref_logps/rejected": -83.01393127441406, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 84.27461242675781, + "rewards/margins": 165.71978759765625, + "rewards/rejected": -81.4451675415039, + "step": 7520, + "u": -6.509137153625488, + "weight": 0.05030224844813347 + }, + { + "diff_generated": -88.09165954589844, + "epoch": 2.440051847051199, + "grad_norm": 530.0567812682875, + "learning_rate": 8.194084138735023e-08, + "logits/chosen": -2.3818490505218506, + "logits/rejected": -2.541490077972412, + "logps/chosen": -13.17347526550293, + "logps/rejected": -179.87777709960938, + "loss": 12.7952, + "losses_ref": -0.0015509051736444235, + "ref_logps/chosen": -92.35968780517578, + "ref_logps/rejected": -91.78611755371094, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 79.18620300292969, + "rewards/margins": 167.27786254882812, + "rewards/rejected": -88.09165954589844, + "step": 7530, + "u": -6.595727443695068, + "weight": 0.043817587196826935 + }, + { + "diff_generated": -82.6982650756836, + "epoch": 2.443292287751134, + "grad_norm": 482.2876230032835, + "learning_rate": 8.102850499441638e-08, + "logits/chosen": -2.3446247577667236, + "logits/rejected": -2.476062774658203, + "logps/chosen": -12.425708770751953, + "logps/rejected": -167.6982421875, + "loss": 12.9033, + "losses_ref": -5.4635460600138686e-08, + "ref_logps/chosen": -91.31380462646484, + "ref_logps/rejected": -84.99995422363281, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": 78.88810729980469, + "rewards/margins": 161.58636474609375, + "rewards/rejected": -82.6982650756836, + "step": 7540, + "u": -6.135350227355957, + "weight": 0.10625000298023224 + }, + { + "diff_generated": -87.83369445800781, + "epoch": 2.446532728451069, + "grad_norm": 472.3359347546331, + "learning_rate": 8.012070334347103e-08, + "logits/chosen": -2.394005537033081, + "logits/rejected": -2.520233154296875, + "logps/chosen": -13.785612106323242, + "logps/rejected": -177.16799926757812, + "loss": 12.4521, + "losses_ref": -0.00033257578616030514, + "ref_logps/chosen": -99.60476684570312, + "ref_logps/rejected": -89.33430480957031, + "rewards/accuracies": 0.96875, + "rewards/chosen": 85.81916046142578, + "rewards/margins": 173.65286254882812, + "rewards/rejected": -87.83369445800781, + "step": 7550, + "u": -6.665897369384766, + "weight": 0.03126341477036476 + }, + { + "diff_generated": -83.32557678222656, + "epoch": 2.4497731691510047, + "grad_norm": 483.4723322508994, + "learning_rate": 7.921744934051515e-08, + "logits/chosen": -2.3564887046813965, + "logits/rejected": -2.497473955154419, + "logps/chosen": -12.408846855163574, + "logps/rejected": -169.51048278808594, + "loss": 12.3062, + "losses_ref": -0.001365851378068328, + "ref_logps/chosen": -90.70404052734375, + "ref_logps/rejected": -86.18492126464844, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 78.29520416259766, + "rewards/margins": 161.62078857421875, + "rewards/rejected": -83.32557678222656, + "step": 7560, + "u": -6.274251937866211, + "weight": 0.08130748569965363 + }, + { + "diff_generated": -83.88063049316406, + "epoch": 2.45301360985094, + "grad_norm": 497.2983036407309, + "learning_rate": 7.831875582689598e-08, + "logits/chosen": -2.3528811931610107, + "logits/rejected": -2.4794909954071045, + "logps/chosen": -12.237689018249512, + "logps/rejected": -165.22021484375, + "loss": 12.5008, + "losses_ref": -2.523396176457027e-07, + "ref_logps/chosen": -91.89834594726562, + "ref_logps/rejected": -81.33958435058594, + "rewards/accuracies": 0.9375, + "rewards/chosen": 79.66065216064453, + "rewards/margins": 163.54129028320312, + "rewards/rejected": -83.88063049316406, + "step": 7570, + "u": -6.427340507507324, + "weight": 0.0625000074505806 + }, + { + "diff_generated": -84.7837905883789, + "epoch": 2.456254050550875, + "grad_norm": 534.7371450262729, + "learning_rate": 7.742463557912593e-08, + "logits/chosen": -2.3579623699188232, + "logits/rejected": -2.508383274078369, + "logps/chosen": -12.433358192443848, + "logps/rejected": -170.16738891601562, + "loss": 12.6894, + "losses_ref": -0.0018981487955898046, + "ref_logps/chosen": -93.244140625, + "ref_logps/rejected": -85.38359069824219, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 80.81077575683594, + "rewards/margins": 165.59457397460938, + "rewards/rejected": -84.7837905883789, + "step": 7580, + "u": -6.416357517242432, + "weight": 0.06882932037115097 + }, + { + "diff_generated": -90.3713607788086, + "epoch": 2.45949449125081, + "grad_norm": 524.2400804507265, + "learning_rate": 7.65351013087002e-08, + "logits/chosen": -2.3619818687438965, + "logits/rejected": -2.5806076526641846, + "logps/chosen": -11.463775634765625, + "logps/rejected": -184.11569213867188, + "loss": 12.7985, + "losses_ref": -0.004068558104336262, + "ref_logps/chosen": -91.43190002441406, + "ref_logps/rejected": -93.74433135986328, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 79.9681167602539, + "rewards/margins": 170.33946228027344, + "rewards/rejected": -90.3713607788086, + "step": 7590, + "u": -6.602597713470459, + "weight": 0.04393560811877251 + }, + { + "diff_generated": -85.97703552246094, + "epoch": 2.462734931950745, + "grad_norm": 525.1137645178529, + "learning_rate": 7.565016566191631e-08, + "logits/chosen": -2.306933641433716, + "logits/rejected": -2.4763407707214355, + "logps/chosen": -12.672266006469727, + "logps/rejected": -170.2589874267578, + "loss": 12.5481, + "losses_ref": -2.8921974148943264e-07, + "ref_logps/chosen": -92.51892852783203, + "ref_logps/rejected": -84.2819595336914, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 79.8466567993164, + "rewards/margins": 165.82369995117188, + "rewards/rejected": -85.97703552246094, + "step": 7600, + "u": -6.497170925140381, + "weight": 0.050000011920928955 + }, + { + "diff_generated": -84.66465759277344, + "epoch": 2.4659753726506803, + "grad_norm": 538.5819875031116, + "learning_rate": 7.47698412196939e-08, + "logits/chosen": -2.407266139984131, + "logits/rejected": -2.4851815700531006, + "logps/chosen": -12.466886520385742, + "logps/rejected": -174.0008087158203, + "loss": 12.9953, + "losses_ref": -0.03617560863494873, + "ref_logps/chosen": -98.9618911743164, + "ref_logps/rejected": -89.3361587524414, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 86.49501037597656, + "rewards/margins": 171.15965270996094, + "rewards/rejected": -84.66465759277344, + "step": 7610, + "u": -6.4572882652282715, + "weight": 0.056824516505002975 + }, + { + "diff_generated": -86.6190185546875, + "epoch": 2.4692158133506155, + "grad_norm": 486.0634635356683, + "learning_rate": 7.389414049739682e-08, + "logits/chosen": -2.4103035926818848, + "logits/rejected": -2.5272722244262695, + "logps/chosen": -12.675764083862305, + "logps/rejected": -172.9994659423828, + "loss": 12.7994, + "losses_ref": -0.003231314243748784, + "ref_logps/chosen": -97.57255554199219, + "ref_logps/rejected": -86.38043212890625, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 84.89678955078125, + "rewards/margins": 171.5157928466797, + "rewards/rejected": -86.6190185546875, + "step": 7620, + "u": -6.525698661804199, + "weight": 0.05013857036828995 + }, + { + "diff_generated": -87.88928985595703, + "epoch": 2.472456254050551, + "grad_norm": 478.67004558614457, + "learning_rate": 7.302307594465422e-08, + "logits/chosen": -2.385324716567993, + "logits/rejected": -2.594041109085083, + "logps/chosen": -13.049673080444336, + "logps/rejected": -181.9893798828125, + "loss": 12.4213, + "losses_ref": -0.00019054643053095788, + "ref_logps/chosen": -95.00083923339844, + "ref_logps/rejected": -94.10009765625, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 81.95115661621094, + "rewards/margins": 169.84043884277344, + "rewards/rejected": -87.88928985595703, + "step": 7630, + "u": -6.503349304199219, + "weight": 0.04375634342432022 + }, + { + "diff_generated": -85.76202392578125, + "epoch": 2.475696694750486, + "grad_norm": 515.7209612263493, + "learning_rate": 7.215665994518367e-08, + "logits/chosen": -2.3745484352111816, + "logits/rejected": -2.5339818000793457, + "logps/chosen": -12.694661140441895, + "logps/rejected": -171.424560546875, + "loss": 12.8026, + "losses_ref": -2.924050477304263e-07, + "ref_logps/chosen": -91.8033218383789, + "ref_logps/rejected": -85.66253662109375, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 79.1086654663086, + "rewards/margins": 164.8706817626953, + "rewards/rejected": -85.76202392578125, + "step": 7640, + "u": -6.478395938873291, + "weight": 0.056250013411045074 + }, + { + "diff_generated": -88.4301528930664, + "epoch": 2.4789371354504213, + "grad_norm": 555.0381925773094, + "learning_rate": 7.129490481661605e-08, + "logits/chosen": -2.4156832695007324, + "logits/rejected": -2.5395307540893555, + "logps/chosen": -14.228047370910645, + "logps/rejected": -182.26687622070312, + "loss": 12.8529, + "losses_ref": -0.03741047531366348, + "ref_logps/chosen": -104.72440338134766, + "ref_logps/rejected": -93.83674621582031, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 90.49635314941406, + "rewards/margins": 178.92648315429688, + "rewards/rejected": -88.4301528930664, + "step": 7650, + "u": -6.741459846496582, + "weight": 0.014490276575088501 + }, + { + "diff_generated": -80.32636260986328, + "epoch": 2.4821775761503564, + "grad_norm": 500.4918228868725, + "learning_rate": 7.043782281031911e-08, + "logits/chosen": -2.3989498615264893, + "logits/rejected": -2.4414381980895996, + "logps/chosen": -14.554104804992676, + "logps/rejected": -162.0161895751953, + "loss": 12.6795, + "losses_ref": -0.002818151144310832, + "ref_logps/chosen": -97.82319641113281, + "ref_logps/rejected": -81.6898193359375, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 83.26908874511719, + "rewards/margins": 163.59544372558594, + "rewards/rejected": -80.32636260986328, + "step": 7660, + "u": -6.331325054168701, + "weight": 0.07511264085769653 + }, + { + "diff_generated": -86.31947326660156, + "epoch": 2.4854180168502915, + "grad_norm": 505.39487820480815, + "learning_rate": 6.958542611122422e-08, + "logits/chosen": -2.4010703563690186, + "logits/rejected": -2.5161900520324707, + "logps/chosen": -12.543961524963379, + "logps/rejected": -170.29434204101562, + "loss": 12.7568, + "losses_ref": -1.1059737126117852e-08, + "ref_logps/chosen": -97.9937973022461, + "ref_logps/rejected": -83.97486877441406, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 85.4498291015625, + "rewards/margins": 171.76930236816406, + "rewards/rejected": -86.31947326660156, + "step": 7670, + "u": -6.538404941558838, + "weight": 0.04374999925494194 + }, + { + "diff_generated": -82.28599548339844, + "epoch": 2.488658457550227, + "grad_norm": 515.332660650725, + "learning_rate": 6.873772683765283e-08, + "logits/chosen": -2.3219847679138184, + "logits/rejected": -2.484175205230713, + "logps/chosen": -12.61694622039795, + "logps/rejected": -168.28042602539062, + "loss": 12.2396, + "losses_ref": -0.0011883302358910441, + "ref_logps/chosen": -91.09086608886719, + "ref_logps/rejected": -85.99443054199219, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 78.47391510009766, + "rewards/margins": 160.75991821289062, + "rewards/rejected": -82.28599548339844, + "step": 7680, + "u": -6.3603620529174805, + "weight": 0.07505214214324951 + }, + { + "diff_generated": -86.8857650756836, + "epoch": 2.491898898250162, + "grad_norm": 520.0914704230654, + "learning_rate": 6.789473704114428e-08, + "logits/chosen": -2.389352321624756, + "logits/rejected": -2.4951353073120117, + "logps/chosen": -12.93322467803955, + "logps/rejected": -173.37252807617188, + "loss": 12.8485, + "losses_ref": -4.2353548224127735e-07, + "ref_logps/chosen": -99.44371032714844, + "ref_logps/rejected": -86.48677062988281, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 86.51048278808594, + "rewards/margins": 173.39625549316406, + "rewards/rejected": -86.8857650756836, + "step": 7690, + "u": -6.696573734283447, + "weight": 0.018750013783574104 + }, + { + "diff_generated": -83.52281188964844, + "epoch": 2.4951393389500973, + "grad_norm": 485.94800288731153, + "learning_rate": 6.7056468706284e-08, + "logits/chosen": -2.368849277496338, + "logits/rejected": -2.4442880153656006, + "logps/chosen": -13.004542350769043, + "logps/rejected": -167.32431030273438, + "loss": 12.8778, + "losses_ref": -4.529524488816605e-08, + "ref_logps/chosen": -97.18318939208984, + "ref_logps/rejected": -83.801513671875, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 84.17866516113281, + "rewards/margins": 167.70147705078125, + "rewards/rejected": -83.52281188964844, + "step": 7700, + "u": -6.416772365570068, + "weight": 0.05624999850988388 + }, + { + "diff_generated": -81.74546813964844, + "epoch": 2.4983797796500324, + "grad_norm": 475.0479170919597, + "learning_rate": 6.622293375053422e-08, + "logits/chosen": -2.3324759006500244, + "logits/rejected": -2.4128990173339844, + "logps/chosen": -12.462357521057129, + "logps/rejected": -165.7411651611328, + "loss": 13.0175, + "losses_ref": -0.0005353426095098257, + "ref_logps/chosen": -95.83375549316406, + "ref_logps/rejected": -83.99568176269531, + "rewards/accuracies": 0.9375, + "rewards/chosen": 83.37139892578125, + "rewards/margins": 165.11688232421875, + "rewards/rejected": -81.74546813964844, + "step": 7710, + "u": -6.465706825256348, + "weight": 0.0625208243727684 + }, + { + "diff_generated": -84.4853515625, + "epoch": 2.5016202203499676, + "grad_norm": 518.0026426198627, + "learning_rate": 6.539414402406316e-08, + "logits/chosen": -2.3503990173339844, + "logits/rejected": -2.4843220710754395, + "logps/chosen": -12.903669357299805, + "logps/rejected": -174.85250854492188, + "loss": 12.8774, + "losses_ref": -1.42912256251293e-06, + "ref_logps/chosen": -98.93122863769531, + "ref_logps/rejected": -90.3671646118164, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 86.0275650024414, + "rewards/margins": 170.512939453125, + "rewards/rejected": -84.4853515625, + "step": 7720, + "u": -6.619194984436035, + "weight": 0.0375000536441803 + }, + { + "diff_generated": -80.78546905517578, + "epoch": 2.5048606610499027, + "grad_norm": 515.8740681402575, + "learning_rate": 6.457011130957747e-08, + "logits/chosen": -2.3500072956085205, + "logits/rejected": -2.442248821258545, + "logps/chosen": -13.2356538772583, + "logps/rejected": -162.047119140625, + "loss": 12.8804, + "losses_ref": -3.20266941855607e-08, + "ref_logps/chosen": -98.41993713378906, + "ref_logps/rejected": -81.26166534423828, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 85.18428039550781, + "rewards/margins": 165.96974182128906, + "rewards/rejected": -80.78546905517578, + "step": 7730, + "u": -6.437910556793213, + "weight": 0.05624999850988388 + }, + { + "diff_generated": -85.4696273803711, + "epoch": 2.508101101749838, + "grad_norm": 546.6845540934409, + "learning_rate": 6.37508473221549e-08, + "logits/chosen": -2.37815260887146, + "logits/rejected": -2.4845166206359863, + "logps/chosen": -13.139554977416992, + "logps/rejected": -173.57278442382812, + "loss": 12.9879, + "losses_ref": -0.00040427473140880466, + "ref_logps/chosen": -96.54368591308594, + "ref_logps/rejected": -88.1031494140625, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 83.40412902832031, + "rewards/margins": 168.87376403808594, + "rewards/rejected": -85.4696273803711, + "step": 7740, + "u": -6.579657077789307, + "weight": 0.043766215443611145 + }, + { + "diff_generated": -89.28426361083984, + "epoch": 2.511341542449773, + "grad_norm": 494.946916076293, + "learning_rate": 6.293636370907665e-08, + "logits/chosen": -2.3897414207458496, + "logits/rejected": -2.5804595947265625, + "logps/chosen": -12.588485717773438, + "logps/rejected": -178.6708984375, + "loss": 12.7232, + "losses_ref": -0.012705594301223755, + "ref_logps/chosen": -94.24117279052734, + "ref_logps/rejected": -89.38664245605469, + "rewards/accuracies": 0.96875, + "rewards/chosen": 81.6526870727539, + "rewards/margins": 170.9369659423828, + "rewards/rejected": -89.28426361083984, + "step": 7750, + "u": -6.6324920654296875, + "weight": 0.03186144679784775 + }, + { + "diff_generated": -89.33751678466797, + "epoch": 2.5145819831497085, + "grad_norm": 512.3183896963463, + "learning_rate": 6.212667204966293e-08, + "logits/chosen": -2.418869972229004, + "logits/rejected": -2.5200929641723633, + "logps/chosen": -13.113365173339844, + "logps/rejected": -179.0424346923828, + "loss": 13.1742, + "losses_ref": -2.3532444881624315e-09, + "ref_logps/chosen": -98.5973129272461, + "ref_logps/rejected": -89.70491790771484, + "rewards/accuracies": 0.96875, + "rewards/chosen": 85.48394012451172, + "rewards/margins": 174.8214569091797, + "rewards/rejected": -89.33751678466797, + "step": 7760, + "u": -6.676608085632324, + "weight": 0.03125 + }, + { + "diff_generated": -85.3768310546875, + "epoch": 2.5178224238496436, + "grad_norm": 488.4091704506323, + "learning_rate": 6.132178385510772e-08, + "logits/chosen": -2.3752994537353516, + "logits/rejected": -2.5198378562927246, + "logps/chosen": -12.729066848754883, + "logps/rejected": -173.28065490722656, + "loss": 12.7355, + "losses_ref": -0.0031602573581039906, + "ref_logps/chosen": -92.53146362304688, + "ref_logps/rejected": -87.90381622314453, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 79.80239868164062, + "rewards/margins": 165.1792449951172, + "rewards/rejected": -85.3768310546875, + "step": 7770, + "u": -6.568255424499512, + "weight": 0.04388529807329178 + }, + { + "diff_generated": -83.88563537597656, + "epoch": 2.5210628645495787, + "grad_norm": 474.70341248308773, + "learning_rate": 6.052171056831547e-08, + "logits/chosen": -2.3717455863952637, + "logits/rejected": -2.528341770172119, + "logps/chosen": -11.977242469787598, + "logps/rejected": -165.62997436523438, + "loss": 12.6246, + "losses_ref": -0.0009697287459857762, + "ref_logps/chosen": -90.6422348022461, + "ref_logps/rejected": -81.74433135986328, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 78.66499328613281, + "rewards/margins": 162.55062866210938, + "rewards/rejected": -83.88563537597656, + "step": 7780, + "u": -6.627276420593262, + "weight": 0.03754133731126785 + }, + { + "diff_generated": -86.89268493652344, + "epoch": 2.524303305249514, + "grad_norm": 521.5084899838258, + "learning_rate": 5.972646356373779e-08, + "logits/chosen": -2.3941633701324463, + "logits/rejected": -2.4962596893310547, + "logps/chosen": -14.027194023132324, + "logps/rejected": -172.46983337402344, + "loss": 12.6922, + "losses_ref": -1.3707585821975954e-07, + "ref_logps/chosen": -97.36051940917969, + "ref_logps/rejected": -85.57716369628906, + "rewards/accuracies": 0.96875, + "rewards/chosen": 83.33332824707031, + "rewards/margins": 170.2259979248047, + "rewards/rejected": -86.89268493652344, + "step": 7790, + "u": -6.6635541915893555, + "weight": 0.0312500037252903 + }, + { + "diff_generated": -85.21521759033203, + "epoch": 2.527543745949449, + "grad_norm": 511.26271392949695, + "learning_rate": 5.893605414721277e-08, + "logits/chosen": -2.403787136077881, + "logits/rejected": -2.5516161918640137, + "logps/chosen": -11.525293350219727, + "logps/rejected": -168.976318359375, + "loss": 12.3588, + "losses_ref": -0.008900230750441551, + "ref_logps/chosen": -94.09223937988281, + "ref_logps/rejected": -83.76110076904297, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 82.56694793701172, + "rewards/margins": 167.78216552734375, + "rewards/rejected": -85.21521759033203, + "step": 7800, + "u": -6.569947242736816, + "weight": 0.044144630432128906 + }, + { + "diff_generated": -88.24825286865234, + "epoch": 2.5307841866493845, + "grad_norm": 513.7535587783697, + "learning_rate": 5.815049355580317e-08, + "logits/chosen": -2.3834657669067383, + "logits/rejected": -2.5250654220581055, + "logps/chosen": -12.82642650604248, + "logps/rejected": -176.42503356933594, + "loss": 12.4961, + "losses_ref": -1.1369086116985727e-08, + "ref_logps/chosen": -96.07859802246094, + "ref_logps/rejected": -88.1767807006836, + "rewards/accuracies": 0.96875, + "rewards/chosen": 83.25218200683594, + "rewards/margins": 171.50042724609375, + "rewards/rejected": -88.24825286865234, + "step": 7810, + "u": -6.661282539367676, + "weight": 0.03125 + }, + { + "diff_generated": -85.78338623046875, + "epoch": 2.5340246273493197, + "grad_norm": 527.7786162945669, + "learning_rate": 5.736979295763742e-08, + "logits/chosen": -2.367727041244507, + "logits/rejected": -2.4320693016052246, + "logps/chosen": -13.660482406616211, + "logps/rejected": -169.5161895751953, + "loss": 12.9545, + "losses_ref": -2.9643251764355227e-05, + "ref_logps/chosen": -101.06855773925781, + "ref_logps/rejected": -83.73277282714844, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 87.4080810546875, + "rewards/margins": 173.1914825439453, + "rewards/rejected": -85.78338623046875, + "step": 7820, + "u": -6.695784568786621, + "weight": 0.02500077709555626 + }, + { + "diff_generated": -87.93540954589844, + "epoch": 2.537265068049255, + "grad_norm": 506.13584198160663, + "learning_rate": 5.659396345175049e-08, + "logits/chosen": -2.348079204559326, + "logits/rejected": -2.476435422897339, + "logps/chosen": -13.36473274230957, + "logps/rejected": -174.88262939453125, + "loss": 12.3834, + "losses_ref": -0.0071576847694814205, + "ref_logps/chosen": -100.14743041992188, + "ref_logps/rejected": -86.94721984863281, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 86.78269958496094, + "rewards/margins": 174.71810913085938, + "rewards/rejected": -87.93540954589844, + "step": 7830, + "u": -6.6284918785095215, + "weight": 0.037833355367183685 + }, + { + "diff_generated": -84.89559173583984, + "epoch": 2.54050550874919, + "grad_norm": 488.6686843515456, + "learning_rate": 5.5823016067926234e-08, + "logits/chosen": -2.34673810005188, + "logits/rejected": -2.4491865634918213, + "logps/chosen": -13.33026123046875, + "logps/rejected": -171.094970703125, + "loss": 12.5142, + "losses_ref": -0.003317506518214941, + "ref_logps/chosen": -96.6379165649414, + "ref_logps/rejected": -86.1993637084961, + "rewards/accuracies": 0.9375, + "rewards/chosen": 83.30766296386719, + "rewards/margins": 168.20326232910156, + "rewards/rejected": -84.89559173583984, + "step": 7840, + "u": -6.420805931091309, + "weight": 0.06264188140630722 + }, + { + "diff_generated": -81.24967956542969, + "epoch": 2.543745949449125, + "grad_norm": 491.2639204565971, + "learning_rate": 5.5056961766540444e-08, + "logits/chosen": -2.3522627353668213, + "logits/rejected": -2.464498519897461, + "logps/chosen": -12.752184867858887, + "logps/rejected": -168.3269500732422, + "loss": 12.7157, + "losses_ref": -5.747070304096269e-07, + "ref_logps/chosen": -92.69832611083984, + "ref_logps/rejected": -87.07726287841797, + "rewards/accuracies": 0.90625, + "rewards/chosen": 79.94613647460938, + "rewards/margins": 161.19583129882812, + "rewards/rejected": -81.24967956542969, + "step": 7850, + "u": -6.237468719482422, + "weight": 0.0937500074505806 + }, + { + "diff_generated": -86.9503173828125, + "epoch": 2.54698639014906, + "grad_norm": 528.2205381646952, + "learning_rate": 5.429581143840525e-08, + "logits/chosen": -2.3642375469207764, + "logits/rejected": -2.516458511352539, + "logps/chosen": -12.689916610717773, + "logps/rejected": -173.31358337402344, + "loss": 13.1316, + "losses_ref": -0.002915473422035575, + "ref_logps/chosen": -98.03343200683594, + "ref_logps/rejected": -86.36326599121094, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 85.34352111816406, + "rewards/margins": 172.29385375976562, + "rewards/rejected": -86.9503173828125, + "step": 7860, + "u": -6.518470764160156, + "weight": 0.043878473341464996 + }, + { + "diff_generated": -82.84322357177734, + "epoch": 2.5502268308489953, + "grad_norm": 498.4432213297694, + "learning_rate": 5.3539575904614176e-08, + "logits/chosen": -2.381110429763794, + "logits/rejected": -2.499281167984009, + "logps/chosen": -13.256353378295898, + "logps/rejected": -167.81980895996094, + "loss": 12.5832, + "losses_ref": -0.00034237594809383154, + "ref_logps/chosen": -93.40634155273438, + "ref_logps/rejected": -84.97659301757812, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 80.14997863769531, + "rewards/margins": 162.99319458007812, + "rewards/rejected": -82.84322357177734, + "step": 7870, + "u": -6.393845081329346, + "weight": 0.06876394152641296 + }, + { + "diff_generated": -86.20030212402344, + "epoch": 2.5534672715489304, + "grad_norm": 468.91919847693663, + "learning_rate": 5.278826591638794e-08, + "logits/chosen": -2.384467124938965, + "logits/rejected": -2.534027338027954, + "logps/chosen": -13.1602783203125, + "logps/rejected": -177.34799194335938, + "loss": 13.1184, + "losses_ref": -1.0411190487502608e-07, + "ref_logps/chosen": -95.07594299316406, + "ref_logps/rejected": -91.1476821899414, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 81.91566467285156, + "rewards/margins": 168.115966796875, + "rewards/rejected": -86.20030212402344, + "step": 7880, + "u": -6.67303466796875, + "weight": 0.02500000223517418 + }, + { + "diff_generated": -87.34493255615234, + "epoch": 2.556707712248866, + "grad_norm": 541.1356078527222, + "learning_rate": 5.204189215492252e-08, + "logits/chosen": -2.356952667236328, + "logits/rejected": -2.517338275909424, + "logps/chosen": -12.496864318847656, + "logps/rejected": -175.65017700195312, + "loss": 12.5768, + "losses_ref": -2.529996265820955e-07, + "ref_logps/chosen": -97.91763305664062, + "ref_logps/rejected": -88.30525970458984, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 85.42076110839844, + "rewards/margins": 172.7657012939453, + "rewards/rejected": -87.34493255615234, + "step": 7890, + "u": -6.555611610412598, + "weight": 0.05000000447034836 + }, + { + "diff_generated": -85.5751724243164, + "epoch": 2.559948152948801, + "grad_norm": 467.6387257104504, + "learning_rate": 5.1300465231236145e-08, + "logits/chosen": -2.372973918914795, + "logits/rejected": -2.455167293548584, + "logps/chosen": -13.153573989868164, + "logps/rejected": -166.5150146484375, + "loss": 12.7604, + "losses_ref": -1.840089396409894e-07, + "ref_logps/chosen": -98.97048950195312, + "ref_logps/rejected": -80.93985748291016, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 85.8169174194336, + "rewards/margins": 171.39208984375, + "rewards/rejected": -85.5751724243164, + "step": 7900, + "u": -6.509287357330322, + "weight": 0.05000000447034836 + }, + { + "diff_generated": -87.47126770019531, + "epoch": 2.563188593648736, + "grad_norm": 480.3490266312158, + "learning_rate": 5.056399568601946e-08, + "logits/chosen": -2.405813694000244, + "logits/rejected": -2.511786937713623, + "logps/chosen": -13.059216499328613, + "logps/rejected": -175.0909881591797, + "loss": 13.0647, + "losses_ref": -0.006940539926290512, + "ref_logps/chosen": -95.51531982421875, + "ref_logps/rejected": -87.61970520019531, + "rewards/accuracies": 0.9375, + "rewards/chosen": 82.45610046386719, + "rewards/margins": 169.9273681640625, + "rewards/rejected": -87.47126770019531, + "step": 7910, + "u": -6.41863489151001, + "weight": 0.06280623376369476 + }, + { + "diff_generated": -85.1175308227539, + "epoch": 2.5664290343486713, + "grad_norm": 503.66718179721994, + "learning_rate": 4.983249398948502e-08, + "logits/chosen": -2.4177050590515137, + "logits/rejected": -2.4998488426208496, + "logps/chosen": -13.102018356323242, + "logps/rejected": -174.5506134033203, + "loss": 12.8097, + "losses_ref": -3.150193515466526e-05, + "ref_logps/chosen": -99.92887115478516, + "ref_logps/rejected": -89.43309020996094, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 86.82683563232422, + "rewards/margins": 171.94436645507812, + "rewards/rejected": -85.1175308227539, + "step": 7920, + "u": -6.611052513122559, + "weight": 0.03750133514404297 + }, + { + "diff_generated": -84.71986389160156, + "epoch": 2.569669475048607, + "grad_norm": 505.03372562716527, + "learning_rate": 4.910597054121877e-08, + "logits/chosen": -2.362746000289917, + "logits/rejected": -2.449460744857788, + "logps/chosen": -14.314231872558594, + "logps/rejected": -166.46458435058594, + "loss": 12.7078, + "losses_ref": -0.0003298623487353325, + "ref_logps/chosen": -99.32063293457031, + "ref_logps/rejected": -81.74470520019531, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 85.00639343261719, + "rewards/margins": 169.7262725830078, + "rewards/rejected": -84.71986389160156, + "step": 7930, + "u": -6.521559238433838, + "weight": 0.04376343637704849 + }, + { + "diff_generated": -87.66574096679688, + "epoch": 2.572909915748542, + "grad_norm": 512.5633770227915, + "learning_rate": 4.838443567003194e-08, + "logits/chosen": -2.3789749145507812, + "logits/rejected": -2.5361878871917725, + "logps/chosen": -11.975103378295898, + "logps/rejected": -174.80824279785156, + "loss": 12.4216, + "losses_ref": -0.009310315363109112, + "ref_logps/chosen": -92.89860534667969, + "ref_logps/rejected": -87.14249420166016, + "rewards/accuracies": 0.96875, + "rewards/chosen": 80.92350769042969, + "rewards/margins": 168.58924865722656, + "rewards/rejected": -87.66574096679688, + "step": 7940, + "u": -6.625628471374512, + "weight": 0.03165370970964432 + }, + { + "diff_generated": -86.88675689697266, + "epoch": 2.576150356448477, + "grad_norm": 533.2632256964176, + "learning_rate": 4.766789963381459e-08, + "logits/chosen": -2.383544921875, + "logits/rejected": -2.5006332397460938, + "logps/chosen": -13.377037048339844, + "logps/rejected": -173.28250122070312, + "loss": 12.9989, + "losses_ref": -0.008450334891676903, + "ref_logps/chosen": -96.12030792236328, + "ref_logps/rejected": -86.395751953125, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 82.74327850341797, + "rewards/margins": 169.63002014160156, + "rewards/rejected": -86.88675689697266, + "step": 7950, + "u": -6.491290092468262, + "weight": 0.056649159640073776 + }, + { + "diff_generated": -87.16014099121094, + "epoch": 2.5793907971484122, + "grad_norm": 499.656898104997, + "learning_rate": 4.695637261938912e-08, + "logits/chosen": -2.391625165939331, + "logits/rejected": -2.4923512935638428, + "logps/chosen": -12.244363784790039, + "logps/rejected": -173.76107788085938, + "loss": 12.6396, + "losses_ref": -6.377808290380926e-07, + "ref_logps/chosen": -97.15655517578125, + "ref_logps/rejected": -86.6009292602539, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 84.91219329833984, + "rewards/margins": 172.0723419189453, + "rewards/rejected": -87.16014099121094, + "step": 7960, + "u": -6.561807155609131, + "weight": 0.03750001639127731 + }, + { + "diff_generated": -85.76423645019531, + "epoch": 2.5826312378483474, + "grad_norm": 465.2509841706746, + "learning_rate": 4.624986474236623e-08, + "logits/chosen": -2.414842128753662, + "logits/rejected": -2.5293426513671875, + "logps/chosen": -12.228536605834961, + "logps/rejected": -172.13262939453125, + "loss": 12.4326, + "losses_ref": -4.4096346130118036e-08, + "ref_logps/chosen": -96.31724548339844, + "ref_logps/rejected": -86.36839294433594, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 84.08869934082031, + "rewards/margins": 169.85293579101562, + "rewards/rejected": -85.76423645019531, + "step": 7970, + "u": -6.528074741363525, + "weight": 0.05000000074505806 + }, + { + "diff_generated": -83.78959655761719, + "epoch": 2.5858716785482825, + "grad_norm": 523.6643190508312, + "learning_rate": 4.554838604700073e-08, + "logits/chosen": -2.3502275943756104, + "logits/rejected": -2.461812973022461, + "logps/chosen": -12.568353652954102, + "logps/rejected": -164.447265625, + "loss": 12.6323, + "losses_ref": -0.006550629623234272, + "ref_logps/chosen": -90.35621643066406, + "ref_logps/rejected": -80.65765380859375, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 77.78785705566406, + "rewards/margins": 161.57745361328125, + "rewards/rejected": -83.78959655761719, + "step": 7980, + "u": -6.409112453460693, + "weight": 0.06904677301645279 + }, + { + "diff_generated": -86.20465850830078, + "epoch": 2.5891121192482176, + "grad_norm": 526.2645841769199, + "learning_rate": 4.4851946506048445e-08, + "logits/chosen": -2.3885281085968018, + "logits/rejected": -2.506099224090576, + "logps/chosen": -12.333206176757812, + "logps/rejected": -172.03591918945312, + "loss": 12.9873, + "losses_ref": -3.100065537608998e-09, + "ref_logps/chosen": -94.87696838378906, + "ref_logps/rejected": -85.83128356933594, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 82.54376220703125, + "rewards/margins": 168.7484130859375, + "rewards/rejected": -86.20465850830078, + "step": 7990, + "u": -6.473536014556885, + "weight": 0.05624999850988388 + }, + { + "diff_generated": -83.8978042602539, + "epoch": 2.5923525599481527, + "grad_norm": 493.1932292731906, + "learning_rate": 4.4160556020625026e-08, + "logits/chosen": -2.3730297088623047, + "logits/rejected": -2.524240016937256, + "logps/chosen": -13.095507621765137, + "logps/rejected": -168.27194213867188, + "loss": 12.7239, + "losses_ref": -8.18051262285735e-07, + "ref_logps/chosen": -91.02601623535156, + "ref_logps/rejected": -84.37416076660156, + "rewards/accuracies": 0.9375, + "rewards/chosen": 77.93051147460938, + "rewards/margins": 161.8282928466797, + "rewards/rejected": -83.8978042602539, + "step": 8000, + "u": -6.421624660491943, + "weight": 0.06250002235174179 + }, + { + "diff_generated": -86.31498718261719, + "epoch": 2.5955930006480883, + "grad_norm": 501.71546954800465, + "learning_rate": 4.347422442006476e-08, + "logits/chosen": -2.3795294761657715, + "logits/rejected": -2.50423264503479, + "logps/chosen": -12.979804992675781, + "logps/rejected": -172.81736755371094, + "loss": 12.5756, + "losses_ref": -0.00048283609794452786, + "ref_logps/chosen": -98.77983856201172, + "ref_logps/rejected": -86.50237274169922, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 85.80003356933594, + "rewards/margins": 172.11502075195312, + "rewards/rejected": -86.31498718261719, + "step": 8010, + "u": -6.6053948402404785, + "weight": 0.04377000406384468 + }, + { + "diff_generated": -86.91321563720703, + "epoch": 2.5988334413480234, + "grad_norm": 534.5271316801942, + "learning_rate": 4.2792961461781064e-08, + "logits/chosen": -2.4245381355285645, + "logits/rejected": -2.550638437271118, + "logps/chosen": -13.415209770202637, + "logps/rejected": -171.35308837890625, + "loss": 13.083, + "losses_ref": -0.005519128870218992, + "ref_logps/chosen": -96.28224182128906, + "ref_logps/rejected": -84.43988037109375, + "rewards/accuracies": 0.96875, + "rewards/chosen": 82.86702728271484, + "rewards/margins": 169.78024291992188, + "rewards/rejected": -86.91321563720703, + "step": 8020, + "u": -6.6761980056762695, + "weight": 0.03151529282331467 + }, + { + "diff_generated": -84.75545501708984, + "epoch": 2.6020738820479585, + "grad_norm": 483.275655835092, + "learning_rate": 4.211677683112751e-08, + "logits/chosen": -2.3873836994171143, + "logits/rejected": -2.5185983180999756, + "logps/chosen": -12.050555229187012, + "logps/rejected": -171.59332275390625, + "loss": 12.2757, + "losses_ref": -2.054375727311708e-07, + "ref_logps/chosen": -95.49308013916016, + "ref_logps/rejected": -86.83785247802734, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 83.44252014160156, + "rewards/margins": 168.19798278808594, + "rewards/rejected": -84.75545501708984, + "step": 8030, + "u": -6.334198474884033, + "weight": 0.07500000298023224 + }, + { + "diff_generated": -89.45109558105469, + "epoch": 2.6053143227478937, + "grad_norm": 486.574861960957, + "learning_rate": 4.1445680141260594e-08, + "logits/chosen": -2.4202070236206055, + "logits/rejected": -2.5206031799316406, + "logps/chosen": -14.027897834777832, + "logps/rejected": -174.24551391601562, + "loss": 13.1134, + "losses_ref": -0.0034157063346356153, + "ref_logps/chosen": -102.09026336669922, + "ref_logps/rejected": -84.7944107055664, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 88.06236267089844, + "rewards/margins": 177.51345825195312, + "rewards/rejected": -89.45109558105469, + "step": 8040, + "u": -6.611212253570557, + "weight": 0.03765515238046646 + }, + { + "diff_generated": -88.83689880371094, + "epoch": 2.6085547634478288, + "grad_norm": 477.13792511325187, + "learning_rate": 4.077968093300237e-08, + "logits/chosen": -2.3851420879364014, + "logits/rejected": -2.5064923763275146, + "logps/chosen": -12.40340805053711, + "logps/rejected": -178.78726196289062, + "loss": 12.5005, + "losses_ref": -0.0012303909752517939, + "ref_logps/chosen": -96.12643432617188, + "ref_logps/rejected": -89.95036315917969, + "rewards/accuracies": 0.96875, + "rewards/chosen": 83.72303009033203, + "rewards/margins": 172.55990600585938, + "rewards/rejected": -88.83689880371094, + "step": 8050, + "u": -6.618790626525879, + "weight": 0.03129594027996063 + }, + { + "diff_generated": -89.02252197265625, + "epoch": 2.6117952041477643, + "grad_norm": 491.7406817234213, + "learning_rate": 4.011878867470542e-08, + "logits/chosen": -2.388709306716919, + "logits/rejected": -2.545560359954834, + "logps/chosen": -13.7767333984375, + "logps/rejected": -177.96762084960938, + "loss": 12.8765, + "losses_ref": -0.0017306599766016006, + "ref_logps/chosen": -97.41840362548828, + "ref_logps/rejected": -88.94508361816406, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 83.64167022705078, + "rewards/margins": 172.66419982910156, + "rewards/rejected": -89.02252197265625, + "step": 8060, + "u": -6.748841285705566, + "weight": 0.01882075145840645 + }, + { + "diff_generated": -86.64888000488281, + "epoch": 2.6150356448476995, + "grad_norm": 509.8738582350685, + "learning_rate": 3.9463012762118144e-08, + "logits/chosen": -2.321654796600342, + "logits/rejected": -2.5281715393066406, + "logps/chosen": -11.479809761047363, + "logps/rejected": -175.80943298339844, + "loss": 12.4087, + "losses_ref": -0.024613162502646446, + "ref_logps/chosen": -89.32393646240234, + "ref_logps/rejected": -89.16055297851562, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 77.84413146972656, + "rewards/margins": 164.49301147460938, + "rewards/rejected": -86.64888000488281, + "step": 8070, + "u": -6.471219539642334, + "weight": 0.0574127733707428 + }, + { + "diff_generated": -86.16117858886719, + "epoch": 2.6182760855476346, + "grad_norm": 497.2157398866359, + "learning_rate": 3.8812362518250816e-08, + "logits/chosen": -2.401134967803955, + "logits/rejected": -2.519824743270874, + "logps/chosen": -13.755337715148926, + "logps/rejected": -171.7306365966797, + "loss": 13.0926, + "losses_ref": -3.134549686478749e-08, + "ref_logps/chosen": -96.5144271850586, + "ref_logps/rejected": -85.56947326660156, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 82.75908660888672, + "rewards/margins": 168.92025756835938, + "rewards/rejected": -86.16117858886719, + "step": 8080, + "u": -6.461573600769043, + "weight": 0.05624999850988388 + }, + { + "diff_generated": -86.84434509277344, + "epoch": 2.6215165262475697, + "grad_norm": 450.8214177458869, + "learning_rate": 3.816684719324352e-08, + "logits/chosen": -2.345268487930298, + "logits/rejected": -2.538722515106201, + "logps/chosen": -11.970441818237305, + "logps/rejected": -173.00057983398438, + "loss": 12.524, + "losses_ref": -0.003985968884080648, + "ref_logps/chosen": -89.08036804199219, + "ref_logps/rejected": -86.15625, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 77.10990905761719, + "rewards/margins": 163.9542694091797, + "rewards/rejected": -86.84434509277344, + "step": 8090, + "u": -6.583459377288818, + "weight": 0.03768478333950043 + }, + { + "diff_generated": -85.60336303710938, + "epoch": 2.624756966947505, + "grad_norm": 495.38888541572555, + "learning_rate": 3.7526475964234286e-08, + "logits/chosen": -2.3677401542663574, + "logits/rejected": -2.5159714221954346, + "logps/chosen": -12.289840698242188, + "logps/rejected": -170.49014282226562, + "loss": 12.9352, + "losses_ref": -9.287772329003019e-09, + "ref_logps/chosen": -95.17460632324219, + "ref_logps/rejected": -84.88678741455078, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 82.884765625, + "rewards/margins": 168.48812866210938, + "rewards/rejected": -85.60336303710938, + "step": 8100, + "u": -6.533503532409668, + "weight": 0.05000000074505806 + }, + { + "diff_generated": -83.99730682373047, + "epoch": 2.62799740764744, + "grad_norm": 495.7291956689171, + "learning_rate": 3.689125793522874e-08, + "logits/chosen": -2.3391013145446777, + "logits/rejected": -2.4577574729919434, + "logps/chosen": -11.927130699157715, + "logps/rejected": -171.8058319091797, + "loss": 12.5268, + "losses_ref": -0.003856272902339697, + "ref_logps/chosen": -90.72027587890625, + "ref_logps/rejected": -87.80851745605469, + "rewards/accuracies": 0.9375, + "rewards/chosen": 78.79315185546875, + "rewards/margins": 162.79046630859375, + "rewards/rejected": -83.99730682373047, + "step": 8110, + "u": -6.4522294998168945, + "weight": 0.06267724931240082 + }, + { + "diff_generated": -83.66921997070312, + "epoch": 2.631237848347375, + "grad_norm": 494.1546333660402, + "learning_rate": 3.6261202136970814e-08, + "logits/chosen": -2.3662924766540527, + "logits/rejected": -2.4921998977661133, + "logps/chosen": -12.41911792755127, + "logps/rejected": -171.39114379882812, + "loss": 12.6946, + "losses_ref": -2.692143823423976e-07, + "ref_logps/chosen": -96.14432525634766, + "ref_logps/rejected": -87.72193908691406, + "rewards/accuracies": 0.9375, + "rewards/chosen": 83.72520446777344, + "rewards/margins": 167.39442443847656, + "rewards/rejected": -83.66921997070312, + "step": 8120, + "u": -6.4582319259643555, + "weight": 0.0625000074505806 + }, + { + "diff_generated": -86.19597625732422, + "epoch": 2.63447828904731, + "grad_norm": 482.8019019904665, + "learning_rate": 3.563631752681422e-08, + "logits/chosen": -2.3443899154663086, + "logits/rejected": -2.5313496589660645, + "logps/chosen": -12.188249588012695, + "logps/rejected": -174.0465850830078, + "loss": 12.8193, + "losses_ref": -6.370446499204263e-07, + "ref_logps/chosen": -91.61663818359375, + "ref_logps/rejected": -87.85062408447266, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 79.42839050292969, + "rewards/margins": 165.62435913085938, + "rewards/rejected": -86.19597625732422, + "step": 8130, + "u": -6.591284275054932, + "weight": 0.04375001788139343 + }, + { + "diff_generated": -87.0280990600586, + "epoch": 2.6377187297472457, + "grad_norm": 507.050776481255, + "learning_rate": 3.501661298859489e-08, + "logits/chosen": -2.3417413234710693, + "logits/rejected": -2.473163366317749, + "logps/chosen": -13.762643814086914, + "logps/rejected": -178.0719451904297, + "loss": 12.8385, + "losses_ref": -1.76808725882438e-07, + "ref_logps/chosen": -95.24227905273438, + "ref_logps/rejected": -91.04386138916016, + "rewards/accuracies": 0.96875, + "rewards/chosen": 81.47962188720703, + "rewards/margins": 168.50772094726562, + "rewards/rejected": -87.0280990600586, + "step": 8140, + "u": -6.665112495422363, + "weight": 0.0312500074505806 + }, + { + "diff_generated": -84.09680938720703, + "epoch": 2.640959170447181, + "grad_norm": 535.1578726596663, + "learning_rate": 3.4402097332505074e-08, + "logits/chosen": -2.3699049949645996, + "logits/rejected": -2.505519151687622, + "logps/chosen": -12.561290740966797, + "logps/rejected": -168.15115356445312, + "loss": 12.7527, + "losses_ref": -1.7613732694599094e-08, + "ref_logps/chosen": -91.27629089355469, + "ref_logps/rejected": -84.05433654785156, + "rewards/accuracies": 0.9375, + "rewards/chosen": 78.71501159667969, + "rewards/margins": 162.81182861328125, + "rewards/rejected": -84.09680938720703, + "step": 8150, + "u": -6.432689666748047, + "weight": 0.0625 + }, + { + "diff_generated": -80.46368408203125, + "epoch": 2.644199611147116, + "grad_norm": 495.8366130046274, + "learning_rate": 3.379277929496798e-08, + "logits/chosen": -2.337191581726074, + "logits/rejected": -2.4601244926452637, + "logps/chosen": -12.33616828918457, + "logps/rejected": -167.09994506835938, + "loss": 12.7435, + "losses_ref": -8.165208242871813e-08, + "ref_logps/chosen": -93.17217254638672, + "ref_logps/rejected": -86.6362533569336, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 80.83600616455078, + "rewards/margins": 161.2996826171875, + "rewards/rejected": -80.46368408203125, + "step": 8160, + "u": -6.322528839111328, + "weight": 0.07500000298023224 + }, + { + "diff_generated": -81.28134155273438, + "epoch": 2.647440051847051, + "grad_norm": 527.2054934900391, + "learning_rate": 3.3188667538513435e-08, + "logits/chosen": -2.3057284355163574, + "logits/rejected": -2.496896743774414, + "logps/chosen": -12.039349555969238, + "logps/rejected": -169.8492889404297, + "loss": 12.5938, + "losses_ref": -0.005264888517558575, + "ref_logps/chosen": -88.80217742919922, + "ref_logps/rejected": -88.56796264648438, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 76.76283264160156, + "rewards/margins": 158.04417419433594, + "rewards/rejected": -81.28134155273438, + "step": 8170, + "u": -6.3711838722229, + "weight": 0.07514555752277374 + }, + { + "diff_generated": -85.64323425292969, + "epoch": 2.6506804925469862, + "grad_norm": 500.54881737349564, + "learning_rate": 3.258977065165478e-08, + "logits/chosen": -2.3974757194519043, + "logits/rejected": -2.4821276664733887, + "logps/chosen": -12.726496696472168, + "logps/rejected": -169.305908203125, + "loss": 12.981, + "losses_ref": -2.1205362799037175e-08, + "ref_logps/chosen": -96.9805679321289, + "ref_logps/rejected": -83.66265106201172, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 84.25407409667969, + "rewards/margins": 169.89730834960938, + "rewards/rejected": -85.64323425292969, + "step": 8180, + "u": -6.555540561676025, + "weight": 0.05000000074505806 + }, + { + "diff_generated": -89.43001556396484, + "epoch": 2.653920933246922, + "grad_norm": 505.4959091442462, + "learning_rate": 3.1996097148766897e-08, + "logits/chosen": -2.3234241008758545, + "logits/rejected": -2.5218329429626465, + "logps/chosen": -12.170676231384277, + "logps/rejected": -179.04788208007812, + "loss": 12.4663, + "losses_ref": -1.6466621673316695e-05, + "ref_logps/chosen": -91.13664245605469, + "ref_logps/rejected": -89.61786651611328, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 78.9659652709961, + "rewards/margins": 168.39596557617188, + "rewards/rejected": -89.43001556396484, + "step": 8190, + "u": -6.49074649810791, + "weight": 0.056250572204589844 + }, + { + "diff_generated": -84.8772964477539, + "epoch": 2.657161373946857, + "grad_norm": 486.00921903786366, + "learning_rate": 3.1407655469964754e-08, + "logits/chosen": -2.4197030067443848, + "logits/rejected": -2.5114760398864746, + "logps/chosen": -12.12280559539795, + "logps/rejected": -176.52188110351562, + "loss": 12.4027, + "losses_ref": -0.0027509736828505993, + "ref_logps/chosen": -94.04446411132812, + "ref_logps/rejected": -91.64457702636719, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 81.92166137695312, + "rewards/margins": 166.7989501953125, + "rewards/rejected": -84.8772964477539, + "step": 8200, + "u": -6.483721733093262, + "weight": 0.0501210018992424 + }, + { + "diff_generated": -87.22736358642578, + "epoch": 2.660401814646792, + "grad_norm": 494.58958979856357, + "learning_rate": 3.0824453980984234e-08, + "logits/chosen": -2.3733162879943848, + "logits/rejected": -2.483097553253174, + "logps/chosen": -12.148710250854492, + "logps/rejected": -173.47787475585938, + "loss": 13.1226, + "losses_ref": -0.012098370119929314, + "ref_logps/chosen": -92.38720703125, + "ref_logps/rejected": -86.25053405761719, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 80.23848724365234, + "rewards/margins": 167.46585083007812, + "rewards/rejected": -87.22736358642578, + "step": 8210, + "u": -6.378817558288574, + "weight": 0.06930071115493774 + }, + { + "diff_generated": -82.57842254638672, + "epoch": 2.663642255346727, + "grad_norm": 521.9251566733633, + "learning_rate": 3.0246500973062184e-08, + "logits/chosen": -2.374394416809082, + "logits/rejected": -2.4588265419006348, + "logps/chosen": -13.365102767944336, + "logps/rejected": -162.84796142578125, + "loss": 12.6945, + "losses_ref": -0.025809219107031822, + "ref_logps/chosen": -92.61255645751953, + "ref_logps/rejected": -80.26952362060547, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 79.24744415283203, + "rewards/margins": 161.8258819580078, + "rewards/rejected": -82.57842254638672, + "step": 8220, + "u": -6.362309455871582, + "weight": 0.06996141374111176 + }, + { + "diff_generated": -84.21598052978516, + "epoch": 2.6668826960466623, + "grad_norm": 478.60892978321334, + "learning_rate": 2.9673804662819324e-08, + "logits/chosen": -2.367405414581299, + "logits/rejected": -2.4610178470611572, + "logps/chosen": -12.569252967834473, + "logps/rejected": -165.32691955566406, + "loss": 12.6299, + "losses_ref": -0.0011514907237142324, + "ref_logps/chosen": -92.96192932128906, + "ref_logps/rejected": -81.11091613769531, + "rewards/accuracies": 0.9375, + "rewards/chosen": 80.39268493652344, + "rewards/margins": 164.60867309570312, + "rewards/rejected": -84.21598052978516, + "step": 8230, + "u": -6.457944393157959, + "weight": 0.06254855543375015 + }, + { + "diff_generated": -87.5735092163086, + "epoch": 2.6701231367465974, + "grad_norm": 508.32581470095147, + "learning_rate": 2.9106373192143087e-08, + "logits/chosen": -2.3813018798828125, + "logits/rejected": -2.548152208328247, + "logps/chosen": -11.875589370727539, + "logps/rejected": -176.88278198242188, + "loss": 12.6674, + "losses_ref": -0.0047495425678789616, + "ref_logps/chosen": -90.63737487792969, + "ref_logps/rejected": -89.30928039550781, + "rewards/accuracies": 0.9375, + "rewards/chosen": 78.76178741455078, + "rewards/margins": 166.33529663085938, + "rewards/rejected": -87.5735092163086, + "step": 8240, + "u": -6.413567543029785, + "weight": 0.06272226572036743 + }, + { + "diff_generated": -86.81454467773438, + "epoch": 2.6733635774465325, + "grad_norm": 501.0978421599488, + "learning_rate": 2.854421462807193e-08, + "logits/chosen": -2.36572527885437, + "logits/rejected": -2.4993362426757812, + "logps/chosen": -11.884244918823242, + "logps/rejected": -171.13729858398438, + "loss": 12.3468, + "losses_ref": -7.660739242965064e-07, + "ref_logps/chosen": -97.11248016357422, + "ref_logps/rejected": -84.32276153564453, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 85.22823333740234, + "rewards/margins": 172.04278564453125, + "rewards/rejected": -86.81454467773438, + "step": 8250, + "u": -6.548535346984863, + "weight": 0.04375002905726433 + }, + { + "diff_generated": -81.70663452148438, + "epoch": 2.6766040181464676, + "grad_norm": 492.62343461174584, + "learning_rate": 2.798733696268063e-08, + "logits/chosen": -2.344013214111328, + "logits/rejected": -2.4782357215881348, + "logps/chosen": -13.149152755737305, + "logps/rejected": -164.31423950195312, + "loss": 13.1615, + "losses_ref": -1.994596203758192e-07, + "ref_logps/chosen": -93.44435119628906, + "ref_logps/rejected": -82.60762786865234, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 80.29519653320312, + "rewards/margins": 162.0018310546875, + "rewards/rejected": -81.70663452148438, + "step": 8260, + "u": -6.470333099365234, + "weight": 0.05625000596046448 + }, + { + "diff_generated": -89.9212646484375, + "epoch": 2.679844458846403, + "grad_norm": 477.27097344952216, + "learning_rate": 2.7435748112966694e-08, + "logits/chosen": -2.3386342525482178, + "logits/rejected": -2.525294065475464, + "logps/chosen": -11.611748695373535, + "logps/rejected": -183.88877868652344, + "loss": 12.5768, + "losses_ref": -0.001180317485705018, + "ref_logps/chosen": -99.32749938964844, + "ref_logps/rejected": -93.96751403808594, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 87.71575164794922, + "rewards/margins": 177.6370086669922, + "rewards/rejected": -89.9212646484375, + "step": 8270, + "u": -6.4835076332092285, + "weight": 0.05005160719156265 + }, + { + "diff_generated": -79.57349395751953, + "epoch": 2.6830848995463383, + "grad_norm": 528.2810782020115, + "learning_rate": 2.6889455920737903e-08, + "logits/chosen": -2.3760812282562256, + "logits/rejected": -2.441226005554199, + "logps/chosen": -14.722673416137695, + "logps/rejected": -158.25631713867188, + "loss": 12.8496, + "losses_ref": -0.0030767028219997883, + "ref_logps/chosen": -94.90017700195312, + "ref_logps/rejected": -78.68284606933594, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 80.17750549316406, + "rewards/margins": 159.75100708007812, + "rewards/rejected": -79.57349395751953, + "step": 8280, + "u": -6.374342918395996, + "weight": 0.07513058185577393 + }, + { + "diff_generated": -89.38903045654297, + "epoch": 2.6863253402462735, + "grad_norm": 508.05682186752637, + "learning_rate": 2.6348468152500357e-08, + "logits/chosen": -2.3738772869110107, + "logits/rejected": -2.524718761444092, + "logps/chosen": -11.598087310791016, + "logps/rejected": -176.02505493164062, + "loss": 12.7554, + "losses_ref": -0.002667112974449992, + "ref_logps/chosen": -90.2225570678711, + "ref_logps/rejected": -86.63602447509766, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 78.62446594238281, + "rewards/margins": 168.01351928710938, + "rewards/rejected": -89.38903045654297, + "step": 8290, + "u": -6.515158653259277, + "weight": 0.05636848136782646 + }, + { + "diff_generated": -88.80402374267578, + "epoch": 2.6895657809462086, + "grad_norm": 482.00697128423803, + "learning_rate": 2.5812792499348935e-08, + "logits/chosen": -2.3881940841674805, + "logits/rejected": -2.4838156700134277, + "logps/chosen": -13.3690185546875, + "logps/rejected": -177.01930236816406, + "loss": 12.4884, + "losses_ref": -9.388824764755554e-06, + "ref_logps/chosen": -98.63578796386719, + "ref_logps/rejected": -88.21526336669922, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 85.26676940917969, + "rewards/margins": 174.07078552246094, + "rewards/rejected": -88.80402374267578, + "step": 8300, + "u": -6.709897518157959, + "weight": 0.025000352412462234 + }, + { + "diff_generated": -88.19657897949219, + "epoch": 2.692806221646144, + "grad_norm": 462.05724084912237, + "learning_rate": 2.5282436576857046e-08, + "logits/chosen": -2.3803226947784424, + "logits/rejected": -2.5157971382141113, + "logps/chosen": -11.993725776672363, + "logps/rejected": -177.28646850585938, + "loss": 12.5235, + "losses_ref": -0.006812377832829952, + "ref_logps/chosen": -95.5112075805664, + "ref_logps/rejected": -89.08988952636719, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 83.5174789428711, + "rewards/margins": 171.71405029296875, + "rewards/rejected": -88.19657897949219, + "step": 8310, + "u": -6.486662864685059, + "weight": 0.05032174661755562 + }, + { + "diff_generated": -84.83506774902344, + "epoch": 2.6960466623460793, + "grad_norm": 511.6195205388548, + "learning_rate": 2.4757407924968878e-08, + "logits/chosen": -2.3571653366088867, + "logits/rejected": -2.5362257957458496, + "logps/chosen": -11.663923263549805, + "logps/rejected": -170.24285888671875, + "loss": 12.5592, + "losses_ref": -0.012650948949158192, + "ref_logps/chosen": -91.58747863769531, + "ref_logps/rejected": -85.40780639648438, + "rewards/accuracies": 0.9375, + "rewards/chosen": 79.92354583740234, + "rewards/margins": 164.7586212158203, + "rewards/rejected": -84.83506774902344, + "step": 8320, + "u": -6.4202470779418945, + "weight": 0.06304998695850372 + }, + { + "diff_generated": -87.63829040527344, + "epoch": 2.6992871030460144, + "grad_norm": 491.1195406215226, + "learning_rate": 2.4237714007892117e-08, + "logits/chosen": -2.4236512184143066, + "logits/rejected": -2.51503324508667, + "logps/chosen": -13.503862380981445, + "logps/rejected": -179.43336486816406, + "loss": 12.9538, + "losses_ref": -1.1350125006526213e-10, + "ref_logps/chosen": -102.19181823730469, + "ref_logps/rejected": -91.7950668334961, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 88.68795013427734, + "rewards/margins": 176.32623291015625, + "rewards/rejected": -87.63829040527344, + "step": 8330, + "u": -6.6181640625, + "weight": 0.03750000149011612 + }, + { + "diff_generated": -84.88908386230469, + "epoch": 2.7025275437459495, + "grad_norm": 509.93968497196573, + "learning_rate": 2.372336221399176e-08, + "logits/chosen": -2.3671317100524902, + "logits/rejected": -2.4888668060302734, + "logps/chosen": -12.767683982849121, + "logps/rejected": -175.57974243164062, + "loss": 12.3047, + "losses_ref": -0.005530247930437326, + "ref_logps/chosen": -94.7016830444336, + "ref_logps/rejected": -90.6906509399414, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 81.93400573730469, + "rewards/margins": 166.82308959960938, + "rewards/rejected": -84.88908386230469, + "step": 8340, + "u": -6.289109706878662, + "weight": 0.08149810135364532 + }, + { + "diff_generated": -84.93101501464844, + "epoch": 2.7057679844458846, + "grad_norm": 460.3913834206574, + "learning_rate": 2.3214359855685095e-08, + "logits/chosen": -2.3634300231933594, + "logits/rejected": -2.460094451904297, + "logps/chosen": -12.92186164855957, + "logps/rejected": -169.9513397216797, + "loss": 12.3929, + "losses_ref": -0.002780128736048937, + "ref_logps/chosen": -97.1541748046875, + "ref_logps/rejected": -85.02031707763672, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 84.2323226928711, + "rewards/margins": 169.163330078125, + "rewards/rejected": -84.93101501464844, + "step": 8350, + "u": -6.59981632232666, + "weight": 0.0376209020614624 + }, + { + "diff_generated": -84.09104919433594, + "epoch": 2.7090084251458197, + "grad_norm": 522.8097767684436, + "learning_rate": 2.271071416933772e-08, + "logits/chosen": -2.390049934387207, + "logits/rejected": -2.5383729934692383, + "logps/chosen": -11.891609191894531, + "logps/rejected": -169.39395141601562, + "loss": 12.5033, + "losses_ref": -3.2407893741037697e-05, + "ref_logps/chosen": -91.81161499023438, + "ref_logps/rejected": -85.30288696289062, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 79.92000579833984, + "rewards/margins": 164.01104736328125, + "rewards/rejected": -84.09104919433594, + "step": 8360, + "u": -6.485341548919678, + "weight": 0.05000120401382446 + }, + { + "diff_generated": -83.08517456054688, + "epoch": 2.712248865845755, + "grad_norm": 503.23447806756167, + "learning_rate": 2.2212432315160855e-08, + "logits/chosen": -2.3702919483184814, + "logits/rejected": -2.48654842376709, + "logps/chosen": -12.602476119995117, + "logps/rejected": -167.88804626464844, + "loss": 13.0166, + "losses_ref": -0.01596415974199772, + "ref_logps/chosen": -92.63908386230469, + "ref_logps/rejected": -84.80287170410156, + "rewards/accuracies": 0.9375, + "rewards/chosen": 80.0365982055664, + "rewards/margins": 163.1217803955078, + "rewards/rejected": -83.08517456054688, + "step": 8370, + "u": -6.448731422424316, + "weight": 0.06323273479938507 + }, + { + "diff_generated": -89.09854888916016, + "epoch": 2.71548930654569, + "grad_norm": 492.9086390613034, + "learning_rate": 2.171952137710904e-08, + "logits/chosen": -2.425790309906006, + "logits/rejected": -2.5577409267425537, + "logps/chosen": -11.534905433654785, + "logps/rejected": -177.09619140625, + "loss": 12.5847, + "losses_ref": -2.4892568006862348e-08, + "ref_logps/chosen": -96.31239318847656, + "ref_logps/rejected": -87.9976577758789, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 84.7774887084961, + "rewards/margins": 173.8760528564453, + "rewards/rejected": -89.09854888916016, + "step": 8380, + "u": -6.6341071128845215, + "weight": 0.03750000149011612 + }, + { + "diff_generated": -83.87593841552734, + "epoch": 2.7187297472456255, + "grad_norm": 471.84603527073546, + "learning_rate": 2.1231988362780327e-08, + "logits/chosen": -2.3621182441711426, + "logits/rejected": -2.477931261062622, + "logps/chosen": -12.505033493041992, + "logps/rejected": -169.10244750976562, + "loss": 12.6486, + "losses_ref": -1.6721272686481825e-06, + "ref_logps/chosen": -95.0557861328125, + "ref_logps/rejected": -85.22648620605469, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 82.5507583618164, + "rewards/margins": 166.4267120361328, + "rewards/rejected": -83.87593841552734, + "step": 8390, + "u": -6.376302242279053, + "weight": 0.0687500461935997 + }, + { + "diff_generated": -90.33910369873047, + "epoch": 2.7219701879455607, + "grad_norm": 512.9739846227034, + "learning_rate": 2.0749840203315584e-08, + "logits/chosen": -2.392246723175049, + "logits/rejected": -2.558375597000122, + "logps/chosen": -13.82586669921875, + "logps/rejected": -179.183349609375, + "loss": 12.564, + "losses_ref": -0.0003666019765660167, + "ref_logps/chosen": -95.07868957519531, + "ref_logps/rejected": -88.84425354003906, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 81.25282287597656, + "rewards/margins": 171.5919189453125, + "rewards/rejected": -90.33910369873047, + "step": 8400, + "u": -6.5747785568237305, + "weight": 0.043763164430856705 + }, + { + "diff_generated": -85.40348815917969, + "epoch": 2.725210628645496, + "grad_norm": 460.2332420013002, + "learning_rate": 2.0273083753300724e-08, + "logits/chosen": -2.3995018005371094, + "logits/rejected": -2.485994338989258, + "logps/chosen": -13.077387809753418, + "logps/rejected": -169.70018005371094, + "loss": 12.8605, + "losses_ref": -0.000844582449644804, + "ref_logps/chosen": -100.71969604492188, + "ref_logps/rejected": -84.29668426513672, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 87.6423110961914, + "rewards/margins": 173.04580688476562, + "rewards/rejected": -85.40348815917969, + "step": 8410, + "u": -6.576523780822754, + "weight": 0.04378564655780792 + }, + { + "diff_generated": -87.17457580566406, + "epoch": 2.728451069345431, + "grad_norm": 538.4549395583889, + "learning_rate": 1.980172579066899e-08, + "logits/chosen": -2.4052786827087402, + "logits/rejected": -2.537781238555908, + "logps/chosen": -13.450469970703125, + "logps/rejected": -174.35183715820312, + "loss": 12.7838, + "losses_ref": -3.527696662786184e-06, + "ref_logps/chosen": -98.7660903930664, + "ref_logps/rejected": -87.17725372314453, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 85.31561279296875, + "rewards/margins": 172.49020385742188, + "rewards/rejected": -87.17457580566406, + "step": 8420, + "u": -6.604406833648682, + "weight": 0.037500061094760895 + }, + { + "diff_generated": -91.29328918457031, + "epoch": 2.731691510045366, + "grad_norm": 535.2424483734745, + "learning_rate": 1.9335773016604608e-08, + "logits/chosen": -2.394975185394287, + "logits/rejected": -2.547994375228882, + "logps/chosen": -13.524676322937012, + "logps/rejected": -183.4735107421875, + "loss": 12.9211, + "losses_ref": -6.3783551773610725e-09, + "ref_logps/chosen": -98.69517517089844, + "ref_logps/rejected": -92.18020629882812, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 85.17050170898438, + "rewards/margins": 176.46377563476562, + "rewards/rejected": -91.29328918457031, + "step": 8430, + "u": -6.727007865905762, + "weight": 0.02500000037252903 + }, + { + "diff_generated": -82.27983093261719, + "epoch": 2.7349319507453016, + "grad_norm": 526.8174489152042, + "learning_rate": 1.887523205544741e-08, + "logits/chosen": -2.3671412467956543, + "logits/rejected": -2.4857192039489746, + "logps/chosen": -13.052943229675293, + "logps/rejected": -163.49301147460938, + "loss": 12.8229, + "losses_ref": -0.0011958193499594927, + "ref_logps/chosen": -92.99552154541016, + "ref_logps/rejected": -81.21318054199219, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 79.94257354736328, + "rewards/margins": 162.222412109375, + "rewards/rejected": -82.27983093261719, + "step": 8440, + "u": -6.532777309417725, + "weight": 0.05005018785595894 + }, + { + "diff_generated": -86.48809814453125, + "epoch": 2.7381723914452367, + "grad_norm": 478.5720153145638, + "learning_rate": 1.8420109454598997e-08, + "logits/chosen": -2.4062106609344482, + "logits/rejected": -2.5305914878845215, + "logps/chosen": -11.691521644592285, + "logps/rejected": -170.15487670898438, + "loss": 12.7834, + "losses_ref": -0.0031040345784276724, + "ref_logps/chosen": -92.42660522460938, + "ref_logps/rejected": -83.66678619384766, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 80.7350845336914, + "rewards/margins": 167.22317504882812, + "rewards/rejected": -86.48809814453125, + "step": 8450, + "u": -6.624154567718506, + "weight": 0.037630610167980194 + }, + { + "diff_generated": -86.4380874633789, + "epoch": 2.741412832145172, + "grad_norm": 525.8424870433707, + "learning_rate": 1.797041168442921e-08, + "logits/chosen": -2.4024062156677246, + "logits/rejected": -2.5008559226989746, + "logps/chosen": -12.73488712310791, + "logps/rejected": -169.62216186523438, + "loss": 12.8644, + "losses_ref": -0.0007443568902090192, + "ref_logps/chosen": -97.05931091308594, + "ref_logps/rejected": -83.18407440185547, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 84.32442474365234, + "rewards/margins": 170.76251220703125, + "rewards/rejected": -86.4380874633789, + "step": 8460, + "u": -6.547275543212891, + "weight": 0.04378039762377739 + }, + { + "diff_generated": -87.27616882324219, + "epoch": 2.744653272845107, + "grad_norm": 497.21888671107916, + "learning_rate": 1.7526145138184377e-08, + "logits/chosen": -2.4237253665924072, + "logits/rejected": -2.5312697887420654, + "logps/chosen": -12.734219551086426, + "logps/rejected": -172.1459503173828, + "loss": 13.1801, + "losses_ref": -1.1529791343889428e-08, + "ref_logps/chosen": -97.7723617553711, + "ref_logps/rejected": -84.86976623535156, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 85.03813171386719, + "rewards/margins": 172.3143310546875, + "rewards/rejected": -87.27616882324219, + "step": 8470, + "u": -6.614732265472412, + "weight": 0.03750000149011612 + }, + { + "diff_generated": -84.35192108154297, + "epoch": 2.747893713545042, + "grad_norm": 527.620403277501, + "learning_rate": 1.708731613189669e-08, + "logits/chosen": -2.4409520626068115, + "logits/rejected": -2.5005288124084473, + "logps/chosen": -14.184789657592773, + "logps/rejected": -166.6622314453125, + "loss": 12.6912, + "losses_ref": -0.018728725612163544, + "ref_logps/chosen": -101.62376403808594, + "ref_logps/rejected": -82.31028747558594, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 87.43898010253906, + "rewards/margins": 171.79090881347656, + "rewards/rejected": -84.35192108154297, + "step": 8480, + "u": -6.550576210021973, + "weight": 0.04464550316333771 + }, + { + "diff_generated": -85.64671325683594, + "epoch": 2.751134154244977, + "grad_norm": 511.1466343453559, + "learning_rate": 1.6653930904293677e-08, + "logits/chosen": -2.3939261436462402, + "logits/rejected": -2.4896676540374756, + "logps/chosen": -13.360217094421387, + "logps/rejected": -173.86691284179688, + "loss": 12.6242, + "losses_ref": -0.004592637997120619, + "ref_logps/chosen": -101.32906341552734, + "ref_logps/rejected": -88.22019958496094, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 87.96885681152344, + "rewards/margins": 173.61557006835938, + "rewards/rejected": -85.64671325683594, + "step": 8490, + "u": -6.602023124694824, + "weight": 0.03769877552986145 + }, + { + "diff_generated": -85.69892883300781, + "epoch": 2.7543745949449123, + "grad_norm": 504.5576680791223, + "learning_rate": 1.6225995616710297e-08, + "logits/chosen": -2.4041481018066406, + "logits/rejected": -2.4688780307769775, + "logps/chosen": -13.450655937194824, + "logps/rejected": -172.2867889404297, + "loss": 12.4601, + "losses_ref": -6.851646503491793e-06, + "ref_logps/chosen": -98.85618591308594, + "ref_logps/rejected": -86.58786010742188, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 85.4055404663086, + "rewards/margins": 171.10446166992188, + "rewards/rejected": -85.69892883300781, + "step": 8500, + "u": -6.673095703125, + "weight": 0.02500019408762455 + }, + { + "diff_generated": -88.74124145507812, + "epoch": 2.7576150356448474, + "grad_norm": 551.6726809002059, + "learning_rate": 1.58035163530009e-08, + "logits/chosen": -2.4708077907562256, + "logits/rejected": -2.537278652191162, + "logps/chosen": -13.46081256866455, + "logps/rejected": -178.23049926757812, + "loss": 12.9081, + "losses_ref": -0.0019827443175017834, + "ref_logps/chosen": -100.0168685913086, + "ref_logps/rejected": -89.4892578125, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 86.5560531616211, + "rewards/margins": 175.2973175048828, + "rewards/rejected": -88.74124145507812, + "step": 8510, + "u": -6.623356819152832, + "weight": 0.037584926933050156 + }, + { + "diff_generated": -87.86695861816406, + "epoch": 2.760855476344783, + "grad_norm": 504.9036060217802, + "learning_rate": 1.538649911945291e-08, + "logits/chosen": -2.3583149909973145, + "logits/rejected": -2.548267126083374, + "logps/chosen": -13.576777458190918, + "logps/rejected": -178.18577575683594, + "loss": 13.2624, + "losses_ref": -0.0017562673892825842, + "ref_logps/chosen": -95.54901123046875, + "ref_logps/rejected": -90.31883239746094, + "rewards/accuracies": 0.96875, + "rewards/chosen": 81.97222137451172, + "rewards/margins": 169.8391876220703, + "rewards/rejected": -87.86695861816406, + "step": 8520, + "u": -6.665831089019775, + "weight": 0.031327299773693085 + }, + { + "diff_generated": -80.264892578125, + "epoch": 2.764095917044718, + "grad_norm": 497.41903616203007, + "learning_rate": 1.497494984470107e-08, + "logits/chosen": -2.3654308319091797, + "logits/rejected": -2.46647310256958, + "logps/chosen": -14.232625961303711, + "logps/rejected": -164.5033416748047, + "loss": 12.9144, + "losses_ref": -0.020502448081970215, + "ref_logps/chosen": -99.23295593261719, + "ref_logps/rejected": -84.23846435546875, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 85.00032806396484, + "rewards/margins": 165.26519775390625, + "rewards/rejected": -80.264892578125, + "step": 8530, + "u": -6.4730224609375, + "weight": 0.057284872978925705 + }, + { + "diff_generated": -81.96501159667969, + "epoch": 2.7673363577446533, + "grad_norm": 499.74748943064225, + "learning_rate": 1.4568874379643936e-08, + "logits/chosen": -2.392131805419922, + "logits/rejected": -2.49577260017395, + "logps/chosen": -11.832303047180176, + "logps/rejected": -164.77926635742188, + "loss": 12.4482, + "losses_ref": -0.0021569118835031986, + "ref_logps/chosen": -90.89392852783203, + "ref_logps/rejected": -82.8142318725586, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 79.0616226196289, + "rewards/margins": 161.02664184570312, + "rewards/rejected": -81.96501159667969, + "step": 8540, + "u": -6.1016950607299805, + "weight": 0.11259114742279053 + }, + { + "diff_generated": -88.95906066894531, + "epoch": 2.7705767984445884, + "grad_norm": 536.4161948876989, + "learning_rate": 1.4168278497359798e-08, + "logits/chosen": -2.42805814743042, + "logits/rejected": -2.557976245880127, + "logps/chosen": -12.606058120727539, + "logps/rejected": -181.2037811279297, + "loss": 12.5847, + "losses_ref": -1.0212413137367093e-08, + "ref_logps/chosen": -96.47504425048828, + "ref_logps/rejected": -92.24471282958984, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 83.86898040771484, + "rewards/margins": 172.8280487060547, + "rewards/rejected": -88.95906066894531, + "step": 8550, + "u": -6.537459373474121, + "weight": 0.05000000074505806 + }, + { + "diff_generated": -90.14323425292969, + "epoch": 2.7738172391445235, + "grad_norm": 469.26948291565867, + "learning_rate": 1.3773167893025161e-08, + "logits/chosen": -2.397914409637451, + "logits/rejected": -2.5138306617736816, + "logps/chosen": -13.796974182128906, + "logps/rejected": -180.79031372070312, + "loss": 12.9604, + "losses_ref": -0.0008936094818636775, + "ref_logps/chosen": -98.47550201416016, + "ref_logps/rejected": -90.6470718383789, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 84.67852020263672, + "rewards/margins": 174.82176208496094, + "rewards/rejected": -90.14323425292969, + "step": 8560, + "u": -6.502872467041016, + "weight": 0.05628952383995056 + }, + { + "diff_generated": -84.3171615600586, + "epoch": 2.777057679844459, + "grad_norm": 481.68081385509606, + "learning_rate": 1.3383548183833715e-08, + "logits/chosen": -2.3971917629241943, + "logits/rejected": -2.4937527179718018, + "logps/chosen": -13.414143562316895, + "logps/rejected": -165.9224853515625, + "loss": 12.8354, + "losses_ref": -4.630480361811351e-06, + "ref_logps/chosen": -96.98786926269531, + "ref_logps/rejected": -81.6053237915039, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 83.57373046875, + "rewards/margins": 167.89089965820312, + "rewards/rejected": -84.3171615600586, + "step": 8570, + "u": -6.320916175842285, + "weight": 0.06875015795230865 + }, + { + "diff_generated": -82.9990234375, + "epoch": 2.780298120544394, + "grad_norm": 511.05202783179914, + "learning_rate": 1.2999424908916346e-08, + "logits/chosen": -2.3500263690948486, + "logits/rejected": -2.445317029953003, + "logps/chosen": -13.810076713562012, + "logps/rejected": -169.82754516601562, + "loss": 12.5559, + "losses_ref": -0.010504474863409996, + "ref_logps/chosen": -97.06626892089844, + "ref_logps/rejected": -86.8285140991211, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 83.2562026977539, + "rewards/margins": 166.25521850585938, + "rewards/rejected": -82.9990234375, + "step": 8580, + "u": -6.568633079528809, + "weight": 0.044232361018657684 + }, + { + "diff_generated": -83.91486358642578, + "epoch": 2.7835385612443293, + "grad_norm": 505.11501311692376, + "learning_rate": 1.2620803529262357e-08, + "logits/chosen": -2.3712546825408936, + "logits/rejected": -2.4964489936828613, + "logps/chosen": -11.956393241882324, + "logps/rejected": -168.12840270996094, + "loss": 12.4465, + "losses_ref": -0.00022628402803093195, + "ref_logps/chosen": -96.67184448242188, + "ref_logps/rejected": -84.2135238647461, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 84.71544647216797, + "rewards/margins": 168.6303253173828, + "rewards/rejected": -83.91486358642578, + "step": 8590, + "u": -6.529415130615234, + "weight": 0.05000894516706467 + }, + { + "diff_generated": -86.57169342041016, + "epoch": 2.7867790019442644, + "grad_norm": 506.88211246823346, + "learning_rate": 1.2247689427642027e-08, + "logits/chosen": -2.4125804901123047, + "logits/rejected": -2.5085551738739014, + "logps/chosen": -13.256278991699219, + "logps/rejected": -175.28707885742188, + "loss": 12.6541, + "losses_ref": -7.255699152608486e-09, + "ref_logps/chosen": -97.22776794433594, + "ref_logps/rejected": -88.71540069580078, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 83.97148895263672, + "rewards/margins": 170.5431671142578, + "rewards/rejected": -86.57169342041016, + "step": 8600, + "u": -6.509591102600098, + "weight": 0.05000000074505806 + }, + { + "diff_generated": -84.92244720458984, + "epoch": 2.7900194426441995, + "grad_norm": 518.6409227763658, + "learning_rate": 1.1880087908529945e-08, + "logits/chosen": -2.3713769912719727, + "logits/rejected": -2.474369764328003, + "logps/chosen": -13.483648300170898, + "logps/rejected": -166.7283172607422, + "loss": 12.8296, + "losses_ref": -0.0012584489304572344, + "ref_logps/chosen": -96.45650482177734, + "ref_logps/rejected": -81.80587768554688, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 82.97285461425781, + "rewards/margins": 167.8953094482422, + "rewards/rejected": -84.92244720458984, + "step": 8610, + "u": -6.539379119873047, + "weight": 0.05004796385765076 + }, + { + "diff_generated": -84.60955810546875, + "epoch": 2.7932598833441347, + "grad_norm": 459.4485713200273, + "learning_rate": 1.1518004198029529e-08, + "logits/chosen": -2.409595012664795, + "logits/rejected": -2.5200963020324707, + "logps/chosen": -13.681007385253906, + "logps/rejected": -168.84112548828125, + "loss": 12.6365, + "losses_ref": -3.974982405452465e-07, + "ref_logps/chosen": -99.23111724853516, + "ref_logps/rejected": -84.23157501220703, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 85.55010986328125, + "rewards/margins": 170.15965270996094, + "rewards/rejected": -84.60955810546875, + "step": 8620, + "u": -6.522212982177734, + "weight": 0.050000011920928955 + }, + { + "diff_generated": -86.31803131103516, + "epoch": 2.79650032404407, + "grad_norm": 511.1749130208627, + "learning_rate": 1.1161443443798946e-08, + "logits/chosen": -2.377769947052002, + "logits/rejected": -2.535102367401123, + "logps/chosen": -12.180910110473633, + "logps/rejected": -173.79360961914062, + "loss": 12.752, + "losses_ref": -3.1645927265344653e-06, + "ref_logps/chosen": -94.26895141601562, + "ref_logps/rejected": -87.47557830810547, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 82.0880355834961, + "rewards/margins": 168.4060516357422, + "rewards/rejected": -86.31803131103516, + "step": 8630, + "u": -6.404803276062012, + "weight": 0.06875012069940567 + }, + { + "diff_generated": -83.41902160644531, + "epoch": 2.7997407647440054, + "grad_norm": 602.2522933359077, + "learning_rate": 1.0810410714977747e-08, + "logits/chosen": -2.3440022468566895, + "logits/rejected": -2.4584457874298096, + "logps/chosen": -13.622339248657227, + "logps/rejected": -166.0839385986328, + "loss": 12.9616, + "losses_ref": -0.0007582043763250113, + "ref_logps/chosen": -92.6719970703125, + "ref_logps/rejected": -82.6649169921875, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 79.04966735839844, + "rewards/margins": 162.46865844726562, + "rewards/rejected": -83.41902160644531, + "step": 8640, + "u": -6.447157859802246, + "weight": 0.05628180503845215 + }, + { + "diff_generated": -91.5981674194336, + "epoch": 2.8029812054439405, + "grad_norm": 489.3615918457275, + "learning_rate": 1.0464911002114885e-08, + "logits/chosen": -2.392082691192627, + "logits/rejected": -2.563253402709961, + "logps/chosen": -13.450553894042969, + "logps/rejected": -183.0391845703125, + "loss": 12.6733, + "losses_ref": -0.002970527159050107, + "ref_logps/chosen": -96.15373229980469, + "ref_logps/rejected": -91.44102478027344, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 82.70317840576172, + "rewards/margins": 174.3013458251953, + "rewards/rejected": -91.5981674194336, + "step": 8650, + "u": -6.729278564453125, + "weight": 0.01888248324394226 + }, + { + "diff_generated": -88.04591369628906, + "epoch": 2.8062216461438756, + "grad_norm": 463.19974618327154, + "learning_rate": 1.0124949217097656e-08, + "logits/chosen": -2.4216623306274414, + "logits/rejected": -2.558814287185669, + "logps/chosen": -11.761835098266602, + "logps/rejected": -177.6961669921875, + "loss": 12.6788, + "losses_ref": -1.860668089648243e-05, + "ref_logps/chosen": -95.4864730834961, + "ref_logps/rejected": -89.6502456665039, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 83.72464752197266, + "rewards/margins": 171.77053833007812, + "rewards/rejected": -88.04591369628906, + "step": 8660, + "u": -6.536529541015625, + "weight": 0.04375075548887253 + }, + { + "diff_generated": -86.40773010253906, + "epoch": 2.8094620868438107, + "grad_norm": 472.62108278894453, + "learning_rate": 9.790530193082114e-09, + "logits/chosen": -2.3973047733306885, + "logits/rejected": -2.4718689918518066, + "logps/chosen": -14.47937297821045, + "logps/rejected": -171.40640258789062, + "loss": 13.1518, + "losses_ref": -0.0019137548515573144, + "ref_logps/chosen": -98.88545227050781, + "ref_logps/rejected": -84.99864196777344, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 84.40608215332031, + "rewards/margins": 170.81381225585938, + "rewards/rejected": -86.40773010253906, + "step": 8670, + "u": -6.6227264404296875, + "weight": 0.037581831216812134 + }, + { + "diff_generated": -85.9549789428711, + "epoch": 2.812702527543746, + "grad_norm": 538.9199023880284, + "learning_rate": 9.461658684423968e-09, + "logits/chosen": -2.3717586994171143, + "logits/rejected": -2.513598680496216, + "logps/chosen": -14.2130126953125, + "logps/rejected": -169.3182373046875, + "loss": 13.0126, + "losses_ref": -6.566205229319166e-06, + "ref_logps/chosen": -94.47737121582031, + "ref_logps/rejected": -83.3632583618164, + "rewards/accuracies": 0.9375, + "rewards/chosen": 80.26435852050781, + "rewards/margins": 166.21934509277344, + "rewards/rejected": -85.9549789428711, + "step": 8680, + "u": -6.473133087158203, + "weight": 0.06250016391277313 + }, + { + "diff_generated": -89.59869384765625, + "epoch": 2.8159429682436814, + "grad_norm": 518.7525170558285, + "learning_rate": 9.138339366611526e-09, + "logits/chosen": -2.4366939067840576, + "logits/rejected": -2.547464609146118, + "logps/chosen": -12.505029678344727, + "logps/rejected": -180.10690307617188, + "loss": 12.5597, + "losses_ref": -1.1943488686938508e-07, + "ref_logps/chosen": -98.70265197753906, + "ref_logps/rejected": -90.5082015991211, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 86.19761657714844, + "rewards/margins": 175.79629516601562, + "rewards/rejected": -89.59869384765625, + "step": 8690, + "u": -6.688715934753418, + "weight": 0.02500000223517418 + }, + { + "diff_generated": -85.91513061523438, + "epoch": 2.8191834089436165, + "grad_norm": 491.41658674176796, + "learning_rate": 8.82057683619859e-09, + "logits/chosen": -2.3363218307495117, + "logits/rejected": -2.519660711288452, + "logps/chosen": -10.693092346191406, + "logps/rejected": -171.95101928710938, + "loss": 12.0402, + "losses_ref": -0.0021391697227954865, + "ref_logps/chosen": -86.15867614746094, + "ref_logps/rejected": -86.03590393066406, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 75.465576171875, + "rewards/margins": 161.38070678710938, + "rewards/rejected": -85.91513061523438, + "step": 8700, + "u": -6.5091071128845215, + "weight": 0.05634554475545883 + }, + { + "diff_generated": -85.58265686035156, + "epoch": 2.8224238496435516, + "grad_norm": 486.4172883940246, + "learning_rate": 8.508375610739626e-09, + "logits/chosen": -2.4158897399902344, + "logits/rejected": -2.500046730041504, + "logps/chosen": -13.407247543334961, + "logps/rejected": -168.00059509277344, + "loss": 12.6608, + "losses_ref": -4.46725152869476e-06, + "ref_logps/chosen": -95.40724182128906, + "ref_logps/rejected": -82.41795349121094, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 82.0, + "rewards/margins": 167.58267211914062, + "rewards/rejected": -85.58265686035156, + "step": 8710, + "u": -6.590667724609375, + "weight": 0.04375016316771507 + }, + { + "diff_generated": -87.30168151855469, + "epoch": 2.8256642903434868, + "grad_norm": 530.6312982735535, + "learning_rate": 8.201740128725365e-09, + "logits/chosen": -2.369098663330078, + "logits/rejected": -2.536067008972168, + "logps/chosen": -12.3034029006958, + "logps/rejected": -173.2048797607422, + "loss": 12.2594, + "losses_ref": -1.9929467853785354e-08, + "ref_logps/chosen": -92.73594665527344, + "ref_logps/rejected": -85.90321350097656, + "rewards/accuracies": 0.9375, + "rewards/chosen": 80.43254089355469, + "rewards/margins": 167.73423767089844, + "rewards/rejected": -87.30168151855469, + "step": 8720, + "u": -6.42525577545166, + "weight": 0.0625 + }, + { + "diff_generated": -87.40478515625, + "epoch": 2.828904731043422, + "grad_norm": 471.09383996051514, + "learning_rate": 7.900674749519564e-09, + "logits/chosen": -2.412445545196533, + "logits/rejected": -2.513763904571533, + "logps/chosen": -13.007713317871094, + "logps/rejected": -177.88394165039062, + "loss": 12.564, + "losses_ref": -1.7794054230080292e-08, + "ref_logps/chosen": -94.12556457519531, + "ref_logps/rejected": -90.47914123535156, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 81.11785125732422, + "rewards/margins": 168.5226593017578, + "rewards/rejected": -87.40478515625, + "step": 8730, + "u": -6.489443302154541, + "weight": 0.05624999850988388 + }, + { + "diff_generated": -83.98377990722656, + "epoch": 2.832145171743357, + "grad_norm": 501.12454027761083, + "learning_rate": 7.605183753297283e-09, + "logits/chosen": -2.4327521324157715, + "logits/rejected": -2.488560438156128, + "logps/chosen": -13.543904304504395, + "logps/rejected": -171.91537475585938, + "loss": 12.5131, + "losses_ref": -5.7025822570722084e-06, + "ref_logps/chosen": -99.78609466552734, + "ref_logps/rejected": -87.93158721923828, + "rewards/accuracies": 0.9375, + "rewards/chosen": 86.2421875, + "rewards/margins": 170.22598266601562, + "rewards/rejected": -83.98377990722656, + "step": 8740, + "u": -6.419000148773193, + "weight": 0.06250022351741791 + }, + { + "diff_generated": -83.87056732177734, + "epoch": 2.835385612443292, + "grad_norm": 485.1619344748365, + "learning_rate": 7.315271340983731e-09, + "logits/chosen": -2.407285451889038, + "logits/rejected": -2.5274343490600586, + "logps/chosen": -11.9943265914917, + "logps/rejected": -169.26734924316406, + "loss": 12.3454, + "losses_ref": -0.0011153435334563255, + "ref_logps/chosen": -95.7374267578125, + "ref_logps/rejected": -85.39678192138672, + "rewards/accuracies": 0.9375, + "rewards/chosen": 83.74310302734375, + "rewards/margins": 167.61367797851562, + "rewards/rejected": -83.87056732177734, + "step": 8750, + "u": -6.429316520690918, + "weight": 0.06254558265209198 + }, + { + "diff_generated": -86.22040557861328, + "epoch": 2.8386260531432272, + "grad_norm": 470.0720536433388, + "learning_rate": 7.030941634194932e-09, + "logits/chosen": -2.4146881103515625, + "logits/rejected": -2.539377450942993, + "logps/chosen": -13.119758605957031, + "logps/rejected": -174.09194946289062, + "loss": 13.0131, + "losses_ref": -8.802903721516486e-06, + "ref_logps/chosen": -94.57557678222656, + "ref_logps/rejected": -87.87154388427734, + "rewards/accuracies": 0.9375, + "rewards/chosen": 81.45582580566406, + "rewards/margins": 167.6762237548828, + "rewards/rejected": -86.22040557861328, + "step": 8760, + "u": -6.445687770843506, + "weight": 0.06250028312206268 + }, + { + "diff_generated": -87.8705062866211, + "epoch": 2.841866493843163, + "grad_norm": 486.0851192565662, + "learning_rate": 6.752198675178711e-09, + "logits/chosen": -2.417034387588501, + "logits/rejected": -2.5238921642303467, + "logps/chosen": -11.984928131103516, + "logps/rejected": -175.46932983398438, + "loss": 12.2577, + "losses_ref": -1.0461581601006742e-09, + "ref_logps/chosen": -95.73543548583984, + "ref_logps/rejected": -87.59882354736328, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 83.7505111694336, + "rewards/margins": 171.6210174560547, + "rewards/rejected": -87.8705062866211, + "step": 8770, + "u": -6.54312801361084, + "weight": 0.05000000074505806 + }, + { + "diff_generated": -82.89167785644531, + "epoch": 2.845106934543098, + "grad_norm": 457.19363309019275, + "learning_rate": 6.479046426757584e-09, + "logits/chosen": -2.3589627742767334, + "logits/rejected": -2.44968581199646, + "logps/chosen": -12.40269660949707, + "logps/rejected": -163.45108032226562, + "loss": 12.3775, + "losses_ref": -0.009188800118863583, + "ref_logps/chosen": -95.39823913574219, + "ref_logps/rejected": -80.55940246582031, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 82.99554443359375, + "rewards/margins": 165.88722229003906, + "rewards/rejected": -82.89167785644531, + "step": 8780, + "u": -6.339357852935791, + "weight": 0.06919096410274506 + }, + { + "diff_generated": -86.87873840332031, + "epoch": 2.848347375243033, + "grad_norm": 503.57331972834254, + "learning_rate": 6.211488772272133e-09, + "logits/chosen": -2.3619651794433594, + "logits/rejected": -2.5722765922546387, + "logps/chosen": -11.627964973449707, + "logps/rejected": -178.09188842773438, + "loss": 12.2835, + "losses_ref": -3.3028396018153217e-08, + "ref_logps/chosen": -89.77183532714844, + "ref_logps/rejected": -91.21315002441406, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 78.14387512207031, + "rewards/margins": 165.0226287841797, + "rewards/rejected": -86.87873840332031, + "step": 8790, + "u": -6.495707035064697, + "weight": 0.05624999850988388 + }, + { + "diff_generated": -88.65650939941406, + "epoch": 2.851587815942968, + "grad_norm": 484.89134879727567, + "learning_rate": 5.9495295155260305e-09, + "logits/chosen": -2.408686399459839, + "logits/rejected": -2.5539755821228027, + "logps/chosen": -13.315896987915039, + "logps/rejected": -174.79092407226562, + "loss": 12.9074, + "losses_ref": -2.7582482076127235e-08, + "ref_logps/chosen": -98.5407485961914, + "ref_logps/rejected": -86.13442993164062, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 85.22486877441406, + "rewards/margins": 173.88134765625, + "rewards/rejected": -88.65650939941406, + "step": 8800, + "u": -6.520726203918457, + "weight": 0.05000000074505806 + }, + { + "diff_generated": -84.28498840332031, + "epoch": 2.8548282566429033, + "grad_norm": 507.28233972164253, + "learning_rate": 5.69317238073177e-09, + "logits/chosen": -2.3808228969573975, + "logits/rejected": -2.4702324867248535, + "logps/chosen": -12.662498474121094, + "logps/rejected": -165.72178649902344, + "loss": 12.5595, + "losses_ref": -0.0004648033936973661, + "ref_logps/chosen": -94.13968658447266, + "ref_logps/rejected": -81.43679809570312, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 81.47718811035156, + "rewards/margins": 165.76219177246094, + "rewards/rejected": -84.28498840332031, + "step": 8810, + "u": -6.3492021560668945, + "weight": 0.06876911222934723 + }, + { + "diff_generated": -82.69854736328125, + "epoch": 2.858068697342839, + "grad_norm": 489.5156510442447, + "learning_rate": 5.442421012457909e-09, + "logits/chosen": -2.3480446338653564, + "logits/rejected": -2.4562230110168457, + "logps/chosen": -11.1311616897583, + "logps/rejected": -165.80661010742188, + "loss": 12.5271, + "losses_ref": -0.0005079759284853935, + "ref_logps/chosen": -89.9560546875, + "ref_logps/rejected": -83.10804748535156, + "rewards/accuracies": 0.90625, + "rewards/chosen": 78.82488250732422, + "rewards/margins": 161.52342224121094, + "rewards/rejected": -82.69854736328125, + "step": 8820, + "u": -6.202415943145752, + "weight": 0.09377063810825348 + }, + { + "diff_generated": -83.34148406982422, + "epoch": 2.861309138042774, + "grad_norm": 510.3538193736458, + "learning_rate": 5.197278975577069e-09, + "logits/chosen": -2.339171886444092, + "logits/rejected": -2.503340721130371, + "logps/chosen": -13.35303020477295, + "logps/rejected": -167.5297393798828, + "loss": 12.7168, + "losses_ref": -0.0007630180334672332, + "ref_logps/chosen": -93.05812072753906, + "ref_logps/rejected": -84.18824768066406, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 79.70509338378906, + "rewards/margins": 163.0465850830078, + "rewards/rejected": -83.34148406982422, + "step": 8830, + "u": -6.637404441833496, + "weight": 0.0375315360724926 + }, + { + "diff_generated": -88.45011901855469, + "epoch": 2.864549578742709, + "grad_norm": 519.6341574740505, + "learning_rate": 4.957749755215346e-09, + "logits/chosen": -2.3442203998565674, + "logits/rejected": -2.563141345977783, + "logps/chosen": -11.600044250488281, + "logps/rejected": -178.43556213378906, + "loss": 12.97, + "losses_ref": -2.4012560828623464e-08, + "ref_logps/chosen": -90.67213439941406, + "ref_logps/rejected": -89.98545837402344, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 79.07207489013672, + "rewards/margins": 167.52218627929688, + "rewards/rejected": -88.45011901855469, + "step": 8840, + "u": -6.600878715515137, + "weight": 0.03750000149011612 + }, + { + "diff_generated": -82.25421905517578, + "epoch": 2.8677900194426442, + "grad_norm": 508.07055814248287, + "learning_rate": 4.723836756702848e-09, + "logits/chosen": -2.353315830230713, + "logits/rejected": -2.471273183822632, + "logps/chosen": -11.62907600402832, + "logps/rejected": -161.6550750732422, + "loss": 12.5092, + "losses_ref": -1.6753695035731653e-06, + "ref_logps/chosen": -93.2217788696289, + "ref_logps/rejected": -79.40087127685547, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 81.59269714355469, + "rewards/margins": 163.84690856933594, + "rewards/rejected": -82.25421905517578, + "step": 8850, + "u": -6.413300514221191, + "weight": 0.0687500387430191 + }, + { + "diff_generated": -83.08013916015625, + "epoch": 2.8710304601425793, + "grad_norm": 543.348378628187, + "learning_rate": 4.495543305524974e-09, + "logits/chosen": -2.3752541542053223, + "logits/rejected": -2.5042357444763184, + "logps/chosen": -12.29096794128418, + "logps/rejected": -165.60269165039062, + "loss": 12.9747, + "losses_ref": -5.240957534624613e-07, + "ref_logps/chosen": -93.07635498046875, + "ref_logps/rejected": -82.5225601196289, + "rewards/accuracies": 0.9375, + "rewards/chosen": 80.78538513183594, + "rewards/margins": 163.8655242919922, + "rewards/rejected": -83.08013916015625, + "step": 8860, + "u": -6.475934028625488, + "weight": 0.0625000149011612 + }, + { + "diff_generated": -86.92012023925781, + "epoch": 2.8742709008425145, + "grad_norm": 506.52607107402076, + "learning_rate": 4.2728726472756934e-09, + "logits/chosen": -2.382050037384033, + "logits/rejected": -2.5092759132385254, + "logps/chosen": -14.272692680358887, + "logps/rejected": -175.8685760498047, + "loss": 12.8992, + "losses_ref": -0.01005796529352665, + "ref_logps/chosen": -99.31636047363281, + "ref_logps/rejected": -88.94844818115234, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 85.0436782836914, + "rewards/margins": 171.9637908935547, + "rewards/rejected": -86.92012023925781, + "step": 8870, + "u": -6.565062522888184, + "weight": 0.03794359415769577 + }, + { + "diff_generated": -86.80320739746094, + "epoch": 2.8775113415424496, + "grad_norm": 500.519894631932, + "learning_rate": 4.055827947610746e-09, + "logits/chosen": -2.383944272994995, + "logits/rejected": -2.494957208633423, + "logps/chosen": -13.14458179473877, + "logps/rejected": -174.47988891601562, + "loss": 12.9073, + "losses_ref": -0.002788522047922015, + "ref_logps/chosen": -99.15926361083984, + "ref_logps/rejected": -87.67668151855469, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 86.01467895507812, + "rewards/margins": 172.81790161132812, + "rewards/rejected": -86.80320739746094, + "step": 8880, + "u": -6.616991996765137, + "weight": 0.03761402145028114 + }, + { + "diff_generated": -81.07831573486328, + "epoch": 2.8807517822423847, + "grad_norm": 518.0233566543808, + "learning_rate": 3.844412292203092e-09, + "logits/chosen": -2.3635849952697754, + "logits/rejected": -2.442833662033081, + "logps/chosen": -12.299894332885742, + "logps/rejected": -162.95193481445312, + "loss": 12.2416, + "losses_ref": -0.0006439397693611681, + "ref_logps/chosen": -94.99177551269531, + "ref_logps/rejected": -81.8736343383789, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 82.69188690185547, + "rewards/margins": 163.7701873779297, + "rewards/rejected": -81.07831573486328, + "step": 8890, + "u": -6.533448219299316, + "weight": 0.050026316195726395 + }, + { + "diff_generated": -85.1846694946289, + "epoch": 2.8839922229423203, + "grad_norm": 498.04616266719586, + "learning_rate": 3.638628686698908e-09, + "logits/chosen": -2.3565611839294434, + "logits/rejected": -2.4600491523742676, + "logps/chosen": -12.45705509185791, + "logps/rejected": -177.6852264404297, + "loss": 12.9739, + "losses_ref": -0.041970349848270416, + "ref_logps/chosen": -97.85175323486328, + "ref_logps/rejected": -92.50055694580078, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 85.39469909667969, + "rewards/margins": 170.57937622070312, + "rewards/rejected": -85.1846694946289, + "step": 8900, + "u": -6.387704372406006, + "weight": 0.07687592506408691 + }, + { + "diff_generated": -85.67487335205078, + "epoch": 2.8872326636422554, + "grad_norm": 526.337540490445, + "learning_rate": 3.438480056674864e-09, + "logits/chosen": -2.3919167518615723, + "logits/rejected": -2.477691173553467, + "logps/chosen": -12.685729026794434, + "logps/rejected": -175.30699157714844, + "loss": 12.8342, + "losses_ref": -0.0003273399197496474, + "ref_logps/chosen": -101.33018493652344, + "ref_logps/rejected": -89.63212585449219, + "rewards/accuracies": 0.9375, + "rewards/chosen": 88.64444732666016, + "rewards/margins": 174.31930541992188, + "rewards/rejected": -85.67487335205078, + "step": 8910, + "u": -6.431562900543213, + "weight": 0.06251437962055206 + }, + { + "diff_generated": -89.34169006347656, + "epoch": 2.8904731043421905, + "grad_norm": 526.3476451399375, + "learning_rate": 3.243969247596423e-09, + "logits/chosen": -2.37211275100708, + "logits/rejected": -2.491511821746826, + "logps/chosen": -12.31456470489502, + "logps/rejected": -179.0964813232422, + "loss": 13.1078, + "losses_ref": -0.0004912428557872772, + "ref_logps/chosen": -96.56272888183594, + "ref_logps/rejected": -89.7547836303711, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 84.24817657470703, + "rewards/margins": 173.58987426757812, + "rewards/rejected": -89.34169006347656, + "step": 8920, + "u": -6.722291469573975, + "weight": 0.025020426139235497 + }, + { + "diff_generated": -84.24349212646484, + "epoch": 2.8937135450421256, + "grad_norm": 527.0982706366657, + "learning_rate": 3.0550990247776522e-09, + "logits/chosen": -2.3859753608703613, + "logits/rejected": -2.5176186561584473, + "logps/chosen": -11.915032386779785, + "logps/rejected": -167.55758666992188, + "loss": 12.8392, + "losses_ref": -3.0404958550889205e-08, + "ref_logps/chosen": -94.97882080078125, + "ref_logps/rejected": -83.31407165527344, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 83.06379699707031, + "rewards/margins": 167.30728149414062, + "rewards/rejected": -84.24349212646484, + "step": 8930, + "u": -6.539845943450928, + "weight": 0.05000000074505806 + }, + { + "diff_generated": -85.3631362915039, + "epoch": 2.8969539857420608, + "grad_norm": 490.6648318727779, + "learning_rate": 2.871872073341608e-09, + "logits/chosen": -2.400597095489502, + "logits/rejected": -2.5779776573181152, + "logps/chosen": -12.452852249145508, + "logps/rejected": -173.8939208984375, + "loss": 12.8271, + "losses_ref": -0.000623942818492651, + "ref_logps/chosen": -94.02854919433594, + "ref_logps/rejected": -88.53079223632812, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 81.57572174072266, + "rewards/margins": 166.9388427734375, + "rewards/rejected": -85.3631362915039, + "step": 8940, + "u": -6.538206577301025, + "weight": 0.05002504587173462 + }, + { + "diff_generated": -87.08778381347656, + "epoch": 2.9001944264419963, + "grad_norm": 520.9679196841599, + "learning_rate": 2.694290998182325e-09, + "logits/chosen": -2.403104305267334, + "logits/rejected": -2.558917999267578, + "logps/chosen": -12.943342208862305, + "logps/rejected": -174.18057250976562, + "loss": 12.9416, + "losses_ref": -0.0016796886920928955, + "ref_logps/chosen": -98.12152099609375, + "ref_logps/rejected": -87.09278106689453, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 85.17818450927734, + "rewards/margins": 172.26596069335938, + "rewards/rejected": -87.08778381347656, + "step": 8950, + "u": -6.579586029052734, + "weight": 0.04382295534014702 + }, + { + "diff_generated": -86.29190826416016, + "epoch": 2.9034348671419314, + "grad_norm": 529.2791272283124, + "learning_rate": 2.52235832392782e-09, + "logits/chosen": -2.38016414642334, + "logits/rejected": -2.5228476524353027, + "logps/chosen": -12.305742263793945, + "logps/rejected": -176.33604431152344, + "loss": 13.1021, + "losses_ref": -7.620108954142779e-05, + "ref_logps/chosen": -95.00885009765625, + "ref_logps/rejected": -90.04413604736328, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 82.70310974121094, + "rewards/margins": 168.99502563476562, + "rewards/rejected": -86.29190826416016, + "step": 8960, + "u": -6.599948883056641, + "weight": 0.037502940744161606 + }, + { + "diff_generated": -87.52955627441406, + "epoch": 2.9066753078418666, + "grad_norm": 469.7649862031913, + "learning_rate": 2.35607649490408e-09, + "logits/chosen": -2.4092063903808594, + "logits/rejected": -2.477895736694336, + "logps/chosen": -13.770861625671387, + "logps/rejected": -174.51356506347656, + "loss": 13.0702, + "losses_ref": -4.5696660322391836e-07, + "ref_logps/chosen": -104.1186752319336, + "ref_logps/rejected": -86.9840087890625, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 90.34780883789062, + "rewards/margins": 177.8773651123047, + "rewards/rejected": -87.52955627441406, + "step": 8970, + "u": -6.810555934906006, + "weight": 0.012500005774199963 + }, + { + "diff_generated": -83.99754333496094, + "epoch": 2.9099157485418017, + "grad_norm": 566.3807216829343, + "learning_rate": 2.1954478751003313e-09, + "logits/chosen": -2.3780055046081543, + "logits/rejected": -2.497126340866089, + "logps/chosen": -11.279232025146484, + "logps/rejected": -165.64120483398438, + "loss": 12.4931, + "losses_ref": -0.006799762137234211, + "ref_logps/chosen": -94.69685363769531, + "ref_logps/rejected": -81.6436538696289, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 83.41761016845703, + "rewards/margins": 167.41517639160156, + "rewards/rejected": -83.99754333496094, + "step": 8980, + "u": -6.490334987640381, + "weight": 0.05654176324605942 + }, + { + "diff_generated": -84.52422332763672, + "epoch": 2.913156189241737, + "grad_norm": 472.8819765444885, + "learning_rate": 2.040474748135512e-09, + "logits/chosen": -2.3666622638702393, + "logits/rejected": -2.467114210128784, + "logps/chosen": -12.454826354980469, + "logps/rejected": -170.0270538330078, + "loss": 12.9191, + "losses_ref": -0.0022382144816219807, + "ref_logps/chosen": -97.16545104980469, + "ref_logps/rejected": -85.50281524658203, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 84.71062469482422, + "rewards/margins": 169.23487854003906, + "rewards/rejected": -84.52422332763672, + "step": 8990, + "u": -6.418060302734375, + "weight": 0.06884618103504181 + }, + { + "diff_generated": -85.54656982421875, + "epoch": 2.916396629941672, + "grad_norm": 488.54506638633603, + "learning_rate": 1.8911593172258544e-09, + "logits/chosen": -2.3644773960113525, + "logits/rejected": -2.4817874431610107, + "logps/chosen": -12.495416641235352, + "logps/rejected": -169.99362182617188, + "loss": 12.6552, + "losses_ref": -1.4663429581673881e-08, + "ref_logps/chosen": -94.41749572753906, + "ref_logps/rejected": -84.44705200195312, + "rewards/accuracies": 0.9375, + "rewards/chosen": 81.92208099365234, + "rewards/margins": 167.46865844726562, + "rewards/rejected": -85.54656982421875, + "step": 9000, + "u": -6.4284987449646, + "weight": 0.0625 + }, + { + "diff_generated": -87.22413635253906, + "epoch": 2.919637070641607, + "grad_norm": 529.4796688062927, + "learning_rate": 1.7475037051532638e-09, + "logits/chosen": -2.4001526832580566, + "logits/rejected": -2.506021499633789, + "logps/chosen": -13.3471097946167, + "logps/rejected": -172.06570434570312, + "loss": 12.8322, + "losses_ref": -0.0021448889747262, + "ref_logps/chosen": -95.56883239746094, + "ref_logps/rejected": -84.8415756225586, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 82.22171783447266, + "rewards/margins": 169.4458465576172, + "rewards/rejected": -87.22413635253906, + "step": 9010, + "u": -6.472896575927734, + "weight": 0.05634375661611557 + }, + { + "diff_generated": -89.62074279785156, + "epoch": 2.9228775113415426, + "grad_norm": 491.15091859051904, + "learning_rate": 1.609509954235566e-09, + "logits/chosen": -2.3617372512817383, + "logits/rejected": -2.571871280670166, + "logps/chosen": -12.765652656555176, + "logps/rejected": -181.67649841308594, + "loss": 12.5575, + "losses_ref": -0.004562483634799719, + "ref_logps/chosen": -94.67340087890625, + "ref_logps/rejected": -92.05574035644531, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 81.90774536132812, + "rewards/margins": 171.5284881591797, + "rewards/rejected": -89.62074279785156, + "step": 9020, + "u": -6.612214088439941, + "weight": 0.03769397363066673 + }, + { + "diff_generated": -87.36817932128906, + "epoch": 2.9261179520414777, + "grad_norm": 529.3732264024782, + "learning_rate": 1.4771800262970203e-09, + "logits/chosen": -2.3470075130462646, + "logits/rejected": -2.5225512981414795, + "logps/chosen": -13.415380477905273, + "logps/rejected": -176.37176513671875, + "loss": 12.5222, + "losses_ref": -1.5120046725769498e-07, + "ref_logps/chosen": -93.91580963134766, + "ref_logps/rejected": -89.00359344482422, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 80.50043487548828, + "rewards/margins": 167.86862182617188, + "rewards/rejected": -87.36817932128906, + "step": 9030, + "u": -6.55454158782959, + "weight": 0.04375000298023224 + }, + { + "diff_generated": -86.6895980834961, + "epoch": 2.929358392741413, + "grad_norm": 483.88931269328356, + "learning_rate": 1.3505158026408724e-09, + "logits/chosen": -2.367731809616089, + "logits/rejected": -2.493482828140259, + "logps/chosen": -14.568082809448242, + "logps/rejected": -176.5614013671875, + "loss": 12.352, + "losses_ref": -0.02299603261053562, + "ref_logps/chosen": -97.7130126953125, + "ref_logps/rejected": -89.87179565429688, + "rewards/accuracies": 0.96875, + "rewards/chosen": 83.1449203491211, + "rewards/margins": 169.8345184326172, + "rewards/rejected": -86.6895980834961, + "step": 9040, + "u": -6.6297607421875, + "weight": 0.032338447868824005 + }, + { + "diff_generated": -87.48365783691406, + "epoch": 2.932598833441348, + "grad_norm": 509.056014485162, + "learning_rate": 1.2295190840223125e-09, + "logits/chosen": -2.392026901245117, + "logits/rejected": -2.5566885471343994, + "logps/chosen": -12.913421630859375, + "logps/rejected": -175.8246612548828, + "loss": 12.3104, + "losses_ref": -0.003945712000131607, + "ref_logps/chosen": -98.818603515625, + "ref_logps/rejected": -88.34100341796875, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 85.9051742553711, + "rewards/margins": 173.38882446289062, + "rewards/rejected": -87.48365783691406, + "step": 9050, + "u": -6.52199649810791, + "weight": 0.05018042400479317 + }, + { + "diff_generated": -86.2895736694336, + "epoch": 2.935839274141283, + "grad_norm": 522.5397264350086, + "learning_rate": 1.1141915906228928e-09, + "logits/chosen": -2.38533353805542, + "logits/rejected": -2.4990665912628174, + "logps/chosen": -12.127532958984375, + "logps/rejected": -172.2735595703125, + "loss": 12.553, + "losses_ref": -2.1370703962020343e-07, + "ref_logps/chosen": -95.51380920410156, + "ref_logps/rejected": -85.9839859008789, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 83.38627624511719, + "rewards/margins": 169.67587280273438, + "rewards/rejected": -86.2895736694336, + "step": 9060, + "u": -6.544399261474609, + "weight": 0.04375000670552254 + }, + { + "diff_generated": -83.47368621826172, + "epoch": 2.9390797148412187, + "grad_norm": 563.9845256802446, + "learning_rate": 1.0045349620262379e-09, + "logits/chosen": -2.3863961696624756, + "logits/rejected": -2.5087244510650635, + "logps/chosen": -12.521696090698242, + "logps/rejected": -165.8682861328125, + "loss": 12.6952, + "losses_ref": -0.00032701349118724465, + "ref_logps/chosen": -96.12919616699219, + "ref_logps/rejected": -82.39459991455078, + "rewards/accuracies": 0.9375, + "rewards/chosen": 83.60749816894531, + "rewards/margins": 167.08120727539062, + "rewards/rejected": -83.47368621826172, + "step": 9070, + "u": -6.404754638671875, + "weight": 0.06251270323991776 + }, + { + "diff_generated": -84.08647155761719, + "epoch": 2.942320155541154, + "grad_norm": 497.23346440289765, + "learning_rate": 9.005507571945958e-10, + "logits/chosen": -2.402247905731201, + "logits/rejected": -2.4763851165771484, + "logps/chosen": -13.246496200561523, + "logps/rejected": -165.32626342773438, + "loss": 12.3413, + "losses_ref": -2.0963292968190217e-07, + "ref_logps/chosen": -95.0720443725586, + "ref_logps/rejected": -81.23980712890625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 81.8255615234375, + "rewards/margins": 165.9120330810547, + "rewards/rejected": -84.08647155761719, + "step": 9080, + "u": -6.462907314300537, + "weight": 0.0625000074505806 + }, + { + "diff_generated": -85.76387023925781, + "epoch": 2.945560596241089, + "grad_norm": 492.894108034187, + "learning_rate": 8.022404544466788e-10, + "logits/chosen": -2.3979382514953613, + "logits/rejected": -2.5150272846221924, + "logps/chosen": -12.623950004577637, + "logps/rejected": -169.61444091796875, + "loss": 12.4399, + "losses_ref": -0.004426004830747843, + "ref_logps/chosen": -95.57967376708984, + "ref_logps/rejected": -83.8505630493164, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 82.95572662353516, + "rewards/margins": 168.71958923339844, + "rewards/rejected": -85.76387023925781, + "step": 9090, + "u": -6.603558540344238, + "weight": 0.04393995180726051 + }, + { + "diff_generated": -82.2959976196289, + "epoch": 2.948801036941024, + "grad_norm": 537.1206136986993, + "learning_rate": 7.096054514367455e-10, + "logits/chosen": -2.3304896354675293, + "logits/rejected": -2.522718906402588, + "logps/chosen": -12.409939765930176, + "logps/rejected": -164.3667755126953, + "loss": 12.5513, + "losses_ref": -0.0012399861589074135, + "ref_logps/chosen": -86.47990417480469, + "ref_logps/rejected": -82.07076263427734, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 74.06996154785156, + "rewards/margins": 156.36595153808594, + "rewards/rejected": -82.2959976196289, + "step": 9100, + "u": -6.275343894958496, + "weight": 0.08755262196063995 + }, + { + "diff_generated": -90.0444107055664, + "epoch": 2.952041477640959, + "grad_norm": 506.99486602906086, + "learning_rate": 6.226470651346182e-10, + "logits/chosen": -2.366097927093506, + "logits/rejected": -2.557908535003662, + "logps/chosen": -12.234922409057617, + "logps/rejected": -182.9877471923828, + "loss": 12.5142, + "losses_ref": -1.0447491760601224e-08, + "ref_logps/chosen": -92.40087890625, + "ref_logps/rejected": -92.94332885742188, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 80.16595458984375, + "rewards/margins": 170.21035766601562, + "rewards/rejected": -90.0444107055664, + "step": 9110, + "u": -6.548572540283203, + "weight": 0.05000000074505806 + }, + { + "diff_generated": -89.26964569091797, + "epoch": 2.9552819183408943, + "grad_norm": 501.46049075234794, + "learning_rate": 5.413665318070304e-10, + "logits/chosen": -2.3693573474884033, + "logits/rejected": -2.546677350997925, + "logps/chosen": -13.509109497070312, + "logps/rejected": -181.3638458251953, + "loss": 13.2782, + "losses_ref": -7.902246466073848e-07, + "ref_logps/chosen": -96.25882720947266, + "ref_logps/rejected": -92.09419250488281, + "rewards/accuracies": 0.96875, + "rewards/chosen": 82.74971771240234, + "rewards/margins": 172.0193634033203, + "rewards/rejected": -89.26964569091797, + "step": 9120, + "u": -6.638380527496338, + "weight": 0.03125002235174179 + }, + { + "diff_generated": -89.65789794921875, + "epoch": 2.9585223590408294, + "grad_norm": 535.7757933629651, + "learning_rate": 4.657650069999963e-10, + "logits/chosen": -2.3974645137786865, + "logits/rejected": -2.5350089073181152, + "logps/chosen": -12.196057319641113, + "logps/rejected": -177.3319854736328, + "loss": 12.9091, + "losses_ref": -0.014481325633823872, + "ref_logps/chosen": -94.6359634399414, + "ref_logps/rejected": -87.67405700683594, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 82.43990325927734, + "rewards/margins": 172.09780883789062, + "rewards/rejected": -89.65789794921875, + "step": 9130, + "u": -6.52349853515625, + "weight": 0.050685059279203415 + }, + { + "diff_generated": -84.9540023803711, + "epoch": 2.9617627997407645, + "grad_norm": 522.8122611896282, + "learning_rate": 3.95843565522469e-10, + "logits/chosen": -2.385798215866089, + "logits/rejected": -2.4658164978027344, + "logps/chosen": -11.826509475708008, + "logps/rejected": -171.10751342773438, + "loss": 12.475, + "losses_ref": -0.0056012957356870174, + "ref_logps/chosen": -97.4726333618164, + "ref_logps/rejected": -86.15352630615234, + "rewards/accuracies": 0.96875, + "rewards/chosen": 85.6461181640625, + "rewards/margins": 170.6001434326172, + "rewards/rejected": -84.9540023803711, + "step": 9140, + "u": -6.638216495513916, + "weight": 0.031489625573158264 + }, + { + "diff_generated": -83.58999633789062, + "epoch": 2.9650032404407, + "grad_norm": 461.03397205632257, + "learning_rate": 3.3160320143097444e-10, + "logits/chosen": -2.415177583694458, + "logits/rejected": -2.542485237121582, + "logps/chosen": -12.850624084472656, + "logps/rejected": -167.17276000976562, + "loss": 12.8863, + "losses_ref": -4.105303560208995e-06, + "ref_logps/chosen": -92.3864974975586, + "ref_logps/rejected": -83.58274841308594, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 79.53587341308594, + "rewards/margins": 163.1258544921875, + "rewards/rejected": -83.58999633789062, + "step": 9150, + "u": -6.271819591522217, + "weight": 0.08750005066394806 + }, + { + "diff_generated": -83.55833435058594, + "epoch": 2.968243681140635, + "grad_norm": 526.4209216536375, + "learning_rate": 2.7304482801548957e-10, + "logits/chosen": -2.3909294605255127, + "logits/rejected": -2.435844898223877, + "logps/chosen": -12.864395141601562, + "logps/rejected": -165.61740112304688, + "loss": 12.5398, + "losses_ref": -1.5140069535846123e-06, + "ref_logps/chosen": -95.42951965332031, + "ref_logps/rejected": -82.05905151367188, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 82.56513214111328, + "rewards/margins": 166.12344360351562, + "rewards/rejected": -83.55833435058594, + "step": 9160, + "u": -6.4903082847595215, + "weight": 0.05625002458691597 + }, + { + "diff_generated": -86.75840759277344, + "epoch": 2.9714841218405703, + "grad_norm": 515.474364819348, + "learning_rate": 2.201692777865194e-10, + "logits/chosen": -2.3582444190979004, + "logits/rejected": -2.4979777336120605, + "logps/chosen": -11.988899230957031, + "logps/rejected": -175.92791748046875, + "loss": 12.4569, + "losses_ref": -0.0018433972727507353, + "ref_logps/chosen": -95.80802917480469, + "ref_logps/rejected": -89.16950988769531, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 83.81912994384766, + "rewards/margins": 170.57754516601562, + "rewards/rejected": -86.75840759277344, + "step": 9170, + "u": -6.5215277671813965, + "weight": 0.05007495731115341 + }, + { + "diff_generated": -84.22948455810547, + "epoch": 2.9747245625405054, + "grad_norm": 491.027791113081, + "learning_rate": 1.729773024631953e-10, + "logits/chosen": -2.3653883934020996, + "logits/rejected": -2.491422653198242, + "logps/chosen": -12.891985893249512, + "logps/rejected": -170.76918029785156, + "loss": 13.2191, + "losses_ref": -0.00504010496661067, + "ref_logps/chosen": -97.28108215332031, + "ref_logps/rejected": -86.5396728515625, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 84.38908386230469, + "rewards/margins": 168.6185760498047, + "rewards/rejected": -84.22948455810547, + "step": 9180, + "u": -6.577776908874512, + "weight": 0.04397150129079819 + }, + { + "diff_generated": -82.56714630126953, + "epoch": 2.9779650032404406, + "grad_norm": 524.5628445911263, + "learning_rate": 1.3146957296261696e-10, + "logits/chosen": -2.296567916870117, + "logits/rejected": -2.5240516662597656, + "logps/chosen": -11.981468200683594, + "logps/rejected": -168.29286193847656, + "loss": 12.8342, + "losses_ref": -1.685024031417015e-08, + "ref_logps/chosen": -87.14879608154297, + "ref_logps/rejected": -85.72572326660156, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 75.16732025146484, + "rewards/margins": 157.7344512939453, + "rewards/rejected": -82.56714630126953, + "step": 9190, + "u": -6.213091850280762, + "weight": 0.08749999850988388 + }, + { + "diff_generated": -90.73418426513672, + "epoch": 2.981205443940376, + "grad_norm": 496.7884393889243, + "learning_rate": 9.564667939030435e-11, + "logits/chosen": -2.4040732383728027, + "logits/rejected": -2.5331547260284424, + "logps/chosen": -12.428566932678223, + "logps/rejected": -181.51348876953125, + "loss": 12.9255, + "losses_ref": -0.0026974931824952364, + "ref_logps/chosen": -95.91474914550781, + "ref_logps/rejected": -90.779296875, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 83.4861831665039, + "rewards/margins": 174.22036743164062, + "rewards/rejected": -90.73418426513672, + "step": 9200, + "u": -6.768782615661621, + "weight": 0.018865812569856644 + }, + { + "diff_generated": -87.49602508544922, + "epoch": 2.9844458846403112, + "grad_norm": 486.7750746787758, + "learning_rate": 6.550913103189337e-11, + "logits/chosen": -2.3700268268585205, + "logits/rejected": -2.5084478855133057, + "logps/chosen": -11.562259674072266, + "logps/rejected": -178.05300903320312, + "loss": 12.3706, + "losses_ref": -0.000528900243807584, + "ref_logps/chosen": -93.57711029052734, + "ref_logps/rejected": -90.5569839477539, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 82.01486206054688, + "rewards/margins": 169.51089477539062, + "rewards/rejected": -87.49602508544922, + "step": 9210, + "u": -6.715609550476074, + "weight": 0.018771730363368988 + }, + { + "diff_generated": -80.7569808959961, + "epoch": 2.9876863253402464, + "grad_norm": 527.7795938159417, + "learning_rate": 4.1057356345675085e-11, + "logits/chosen": -2.3647098541259766, + "logits/rejected": -2.3917651176452637, + "logps/chosen": -14.38011646270752, + "logps/rejected": -163.92794799804688, + "loss": 13.0986, + "losses_ref": -0.0022724694572389126, + "ref_logps/chosen": -99.43622589111328, + "ref_logps/rejected": -83.17097473144531, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 85.05610656738281, + "rewards/margins": 165.81307983398438, + "rewards/rejected": -80.7569808959961, + "step": 9220, + "u": -6.606814384460449, + "weight": 0.03759920597076416 + }, + { + "diff_generated": -90.11248779296875, + "epoch": 2.9909267660401815, + "grad_norm": 502.63482025081817, + "learning_rate": 2.229170295673377e-11, + "logits/chosen": -2.401397228240967, + "logits/rejected": -2.52555513381958, + "logps/chosen": -12.586407661437988, + "logps/rejected": -177.9874725341797, + "loss": 12.5562, + "losses_ref": -4.895515992586752e-09, + "ref_logps/chosen": -97.63871002197266, + "ref_logps/rejected": -87.87500762939453, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 85.05230712890625, + "rewards/margins": 175.164794921875, + "rewards/rejected": -90.11248779296875, + "step": 9230, + "u": -6.748744964599609, + "weight": 0.01875000074505806 + }, + { + "diff_generated": -86.4819564819336, + "epoch": 2.9941672067401166, + "grad_norm": 497.2515717059789, + "learning_rate": 9.212437651973103e-12, + "logits/chosen": -2.4102160930633545, + "logits/rejected": -2.5013763904571533, + "logps/chosen": -13.198896408081055, + "logps/rejected": -174.96900939941406, + "loss": 12.526, + "losses_ref": -0.0031384092289954424, + "ref_logps/chosen": -96.73193359375, + "ref_logps/rejected": -88.48704528808594, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 83.53303527832031, + "rewards/margins": 170.01498413085938, + "rewards/rejected": -86.4819564819336, + "step": 9240, + "u": -6.405519008636475, + "weight": 0.06888642907142639 + }, + { + "diff_generated": -87.38936614990234, + "epoch": 2.9974076474400517, + "grad_norm": 471.9558730646237, + "learning_rate": 1.819746376119369e-12, + "logits/chosen": -2.3839592933654785, + "logits/rejected": -2.4822239875793457, + "logps/chosen": -14.210899353027344, + "logps/rejected": -170.20504760742188, + "loss": 13.092, + "losses_ref": -0.0017271274700760841, + "ref_logps/chosen": -96.49150848388672, + "ref_logps/rejected": -82.81568145751953, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 82.28060913085938, + "rewards/margins": 169.66998291015625, + "rewards/rejected": -87.38936614990234, + "step": 9250, + "u": -6.5622968673706055, + "weight": 0.050075747072696686 + } + ], + "logging_steps": 10, + "max_steps": 9258, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}