{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 9258, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "diff_generated": -5.324533939361572, "epoch": 0.0003240440699935191, "grad_norm": 25.298450180570082, "learning_rate": 8.639308855291577e-10, "logits/chosen": -2.6053388118743896, "logits/rejected": -2.4319162368774414, "logps/chosen": -116.55142974853516, "logps/rejected": -89.49524688720703, "logps_avg/chosen": -0.5783171057701111, "logps_avg/rejected": -0.5324533581733704, "loss": 0.5351, "losses_ref": -0.028132084757089615, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1, "u": -1.679854393005371, "weight": 0.16303405165672302 }, { "diff_generated": -4.9921698570251465, "epoch": 0.0032404406999351912, "grad_norm": 23.66608439819008, "learning_rate": 8.639308855291576e-09, "logits/chosen": -2.4976794719696045, "logits/rejected": -2.571298599243164, "logps/chosen": -92.15830993652344, "logps/rejected": -91.23859405517578, "logps_avg/chosen": -0.5637891888618469, "logps_avg/rejected": -0.4992169737815857, "loss": 0.533, "losses_ref": -0.0346137136220932, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 10, "u": -1.6916941404342651, "weight": 0.16587796807289124 }, { "diff_generated": -4.709494590759277, "epoch": 0.0064808813998703824, "grad_norm": 21.640052012860544, "learning_rate": 1.727861771058315e-08, "logits/chosen": -2.5320584774017334, "logits/rejected": -2.588595390319824, "logps/chosen": -100.08524322509766, "logps/rejected": -85.40359497070312, "logps_avg/chosen": -0.5972418189048767, "logps_avg/rejected": -0.4709494709968567, "loss": 0.5334, "losses_ref": -0.03922479599714279, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 20, "u": -1.6618385314941406, "weight": 0.1921975165605545 }, { "diff_generated": -5.0379767417907715, "epoch": 0.009721322099805573, "grad_norm": 22.995127298370598, "learning_rate": 2.591792656587473e-08, "logits/chosen": -2.5383121967315674, "logits/rejected": -2.569267988204956, "logps/chosen": -100.78271484375, "logps/rejected": -87.62537384033203, "logps_avg/chosen": -0.5688080191612244, "logps_avg/rejected": -0.5037976503372192, "loss": 0.5337, "losses_ref": -0.03752085939049721, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 30, "u": -1.685185194015503, "weight": 0.17561769485473633 }, { "diff_generated": -4.766201019287109, "epoch": 0.012961762799740765, "grad_norm": 24.543446757562368, "learning_rate": 3.45572354211663e-08, "logits/chosen": -2.5598511695861816, "logits/rejected": -2.614499568939209, "logps/chosen": -96.36283874511719, "logps/rejected": -88.67526245117188, "logps_avg/chosen": -0.5644618272781372, "logps_avg/rejected": -0.47662001848220825, "loss": 0.5162, "losses_ref": -0.04021410271525383, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 40, "u": -1.6799871921539307, "weight": 0.18456074595451355 }, { "diff_generated": -4.541996479034424, "epoch": 0.016202203499675955, "grad_norm": 18.98878051668679, "learning_rate": 4.319654427645788e-08, "logits/chosen": -2.51167631149292, "logits/rejected": -2.5764548778533936, "logps/chosen": -89.32698059082031, "logps/rejected": -83.97340393066406, "logps_avg/chosen": -0.5328198075294495, "logps_avg/rejected": -0.4541996121406555, "loss": 0.496, "losses_ref": -0.040191732347011566, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 50, "u": -1.6649795770645142, "weight": 0.19056639075279236 }, { "diff_generated": -4.699794292449951, "epoch": 0.019442644199611146, "grad_norm": 20.824924101981164, "learning_rate": 5.183585313174946e-08, "logits/chosen": -2.5374608039855957, "logits/rejected": -2.598020553588867, "logps/chosen": -79.26811218261719, "logps/rejected": -82.68241882324219, "logps_avg/chosen": -0.4793620705604553, "logps_avg/rejected": -0.469979465007782, "loss": 0.4413, "losses_ref": -0.041936349123716354, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 60, "u": -1.6706863641738892, "weight": 0.19364500045776367 }, { "diff_generated": -5.023270606994629, "epoch": 0.02268308489954634, "grad_norm": 15.085833079737961, "learning_rate": 6.047516198704104e-08, "logits/chosen": -2.494488477706909, "logits/rejected": -2.553434371948242, "logps/chosen": -71.15379333496094, "logps/rejected": -87.61297607421875, "logps_avg/chosen": -0.4256020188331604, "logps_avg/rejected": -0.5023270845413208, "loss": 0.38, "losses_ref": -0.03602486103773117, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 70, "u": -1.6736557483673096, "weight": 0.1774376928806305 }, { "diff_generated": -5.608443737030029, "epoch": 0.02592352559948153, "grad_norm": 10.17433229960028, "learning_rate": 6.91144708423326e-08, "logits/chosen": -2.466387987136841, "logits/rejected": -2.5406899452209473, "logps/chosen": -56.756500244140625, "logps/rejected": -99.27467346191406, "logps_avg/chosen": -0.32987886667251587, "logps_avg/rejected": -0.560844361782074, "loss": 0.3069, "losses_ref": -0.028670093044638634, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 80, "u": -1.7049707174301147, "weight": 0.14695551991462708 }, { "diff_generated": -7.005269527435303, "epoch": 0.02916396629941672, "grad_norm": 6.782996553692984, "learning_rate": 7.775377969762419e-08, "logits/chosen": -2.508361577987671, "logits/rejected": -2.4981703758239746, "logps/chosen": -52.216339111328125, "logps/rejected": -107.29414367675781, "logps_avg/chosen": -0.29603785276412964, "logps_avg/rejected": -0.700527012348175, "loss": 0.2762, "losses_ref": -0.01938403770327568, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 90, "u": -1.7649612426757812, "weight": 0.09771289676427841 }, { "diff_generated": -7.859269618988037, "epoch": 0.03240440699935191, "grad_norm": 5.915285953748342, "learning_rate": 8.639308855291576e-08, "logits/chosen": -2.5046210289001465, "logits/rejected": -2.5096004009246826, "logps/chosen": -53.19324493408203, "logps/rejected": -132.89813232421875, "logps_avg/chosen": -0.28428006172180176, "logps_avg/rejected": -0.7859269976615906, "loss": 0.2653, "losses_ref": -0.00893635768443346, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 100, "u": -1.7851336002349854, "weight": 0.06954724341630936 }, { "diff_generated": -8.105627059936523, "epoch": 0.0356448476992871, "grad_norm": 5.485745202835744, "learning_rate": 9.503239740820734e-08, "logits/chosen": -2.486294984817505, "logits/rejected": -2.5347485542297363, "logps/chosen": -46.411155700683594, "logps/rejected": -135.87271118164062, "logps_avg/chosen": -0.26198580861091614, "logps_avg/rejected": -0.8105627298355103, "loss": 0.2395, "losses_ref": -0.011290923692286015, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 110, "u": -1.7970489263534546, "weight": 0.06864650547504425 }, { "diff_generated": -9.39500617980957, "epoch": 0.03888528839922229, "grad_norm": 5.858206689278202, "learning_rate": 1.0367170626349892e-07, "logits/chosen": -2.473548412322998, "logits/rejected": -2.5916199684143066, "logps/chosen": -40.70339584350586, "logps/rejected": -163.2863006591797, "logps_avg/chosen": -0.24593877792358398, "logps_avg/rejected": -0.939500629901886, "loss": 0.2436, "losses_ref": -0.00593178765848279, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 120, "u": -1.8373692035675049, "weight": 0.03868420049548149 }, { "diff_generated": -8.719804763793945, "epoch": 0.04212572909915749, "grad_norm": 4.8636295363498165, "learning_rate": 1.1231101511879049e-07, "logits/chosen": -2.42881441116333, "logits/rejected": -2.526031255722046, "logps/chosen": -39.71488952636719, "logps/rejected": -155.34878540039062, "logps_avg/chosen": -0.2515028715133667, "logps_avg/rejected": -0.8719803690910339, "loss": 0.2291, "losses_ref": -0.007679730653762817, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 130, "u": -1.757032036781311, "weight": 0.08497841656208038 }, { "diff_generated": -10.024818420410156, "epoch": 0.04536616979909268, "grad_norm": 5.129995804872467, "learning_rate": 1.2095032397408208e-07, "logits/chosen": -2.4535014629364014, "logits/rejected": -2.5594677925109863, "logps/chosen": -39.28099060058594, "logps/rejected": -180.23167419433594, "logps_avg/chosen": -0.24077431857585907, "logps_avg/rejected": -1.0024818181991577, "loss": 0.2251, "losses_ref": -0.004803563468158245, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 140, "u": -1.8035099506378174, "weight": 0.055381983518600464 }, { "diff_generated": -10.327981948852539, "epoch": 0.04860661049902787, "grad_norm": 7.665590400359612, "learning_rate": 1.2958963282937366e-07, "logits/chosen": -2.503351926803589, "logits/rejected": -2.5048727989196777, "logps/chosen": -40.956787109375, "logps/rejected": -175.9888153076172, "logps_avg/chosen": -0.22418427467346191, "logps_avg/rejected": -1.032798171043396, "loss": 0.2247, "losses_ref": -0.004349695052951574, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 150, "u": -1.769809365272522, "weight": 0.07237504422664642 }, { "diff_generated": -11.050540924072266, "epoch": 0.05184705119896306, "grad_norm": 5.2218156254610975, "learning_rate": 1.382289416846652e-07, "logits/chosen": -2.4762940406799316, "logits/rejected": -2.5197434425354004, "logps/chosen": -41.60301971435547, "logps/rejected": -190.45809936523438, "logps_avg/chosen": -0.2423749417066574, "logps_avg/rejected": -1.1050540208816528, "loss": 0.2202, "losses_ref": -0.0036694530863314867, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 160, "u": -1.819371223449707, "weight": 0.045095235109329224 }, { "diff_generated": -10.912653923034668, "epoch": 0.05508749189889825, "grad_norm": 4.72316488652966, "learning_rate": 1.468682505399568e-07, "logits/chosen": -2.494835376739502, "logits/rejected": -2.5751283168792725, "logps/chosen": -34.90558624267578, "logps/rejected": -183.92214965820312, "logps_avg/chosen": -0.21363107860088348, "logps_avg/rejected": -1.0912654399871826, "loss": 0.2089, "losses_ref": -0.004652983509004116, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 170, "u": -1.8284003734588623, "weight": 0.041924357414245605 }, { "diff_generated": -10.835649490356445, "epoch": 0.05832793259883344, "grad_norm": 5.4755997415237365, "learning_rate": 1.5550755939524837e-07, "logits/chosen": -2.4843239784240723, "logits/rejected": -2.5287718772888184, "logps/chosen": -37.10668182373047, "logps/rejected": -182.80319213867188, "logps_avg/chosen": -0.2310374677181244, "logps_avg/rejected": -1.0835647583007812, "loss": 0.2062, "losses_ref": -0.005131029523909092, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 180, "u": -1.7793972492218018, "weight": 0.06860009580850601 }, { "diff_generated": -12.018121719360352, "epoch": 0.06156837329876863, "grad_norm": 4.985706566011136, "learning_rate": 1.6414686825053995e-07, "logits/chosen": -2.4900550842285156, "logits/rejected": -2.4969873428344727, "logps/chosen": -38.894493103027344, "logps/rejected": -195.72805786132812, "logps_avg/chosen": -0.2240675985813141, "logps_avg/rejected": -1.2018121480941772, "loss": 0.2041, "losses_ref": -0.003515923861414194, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 190, "u": -1.852745771408081, "weight": 0.028062384575605392 }, { "diff_generated": -12.754137992858887, "epoch": 0.06480881399870382, "grad_norm": 5.067587022106536, "learning_rate": 1.7278617710583153e-07, "logits/chosen": -2.4320194721221924, "logits/rejected": -2.496293544769287, "logps/chosen": -33.140167236328125, "logps/rejected": -204.68572998046875, "logps_avg/chosen": -0.1981068104505539, "logps_avg/rejected": -1.2754138708114624, "loss": 0.1953, "losses_ref": -0.0022536544129252434, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 200, "u": -1.749868392944336, "weight": 0.08045514672994614 }, { "diff_generated": -12.116033554077148, "epoch": 0.06804925469863901, "grad_norm": 5.601872457233822, "learning_rate": 1.814254859611231e-07, "logits/chosen": -2.475778102874756, "logits/rejected": -2.53865122795105, "logps/chosen": -35.73398208618164, "logps/rejected": -209.81747436523438, "logps_avg/chosen": -0.21023687720298767, "logps_avg/rejected": -1.2116032838821411, "loss": 0.2003, "losses_ref": -0.0035675906110554934, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 210, "u": -1.8303534984588623, "weight": 0.039604417979717255 }, { "diff_generated": -11.962373733520508, "epoch": 0.0712896953985742, "grad_norm": 4.880544384760887, "learning_rate": 1.900647948164147e-07, "logits/chosen": -2.4443392753601074, "logits/rejected": -2.4997096061706543, "logps/chosen": -35.777610778808594, "logps/rejected": -202.77987670898438, "logps_avg/chosen": -0.2033710926771164, "logps_avg/rejected": -1.1962374448776245, "loss": 0.1934, "losses_ref": -0.003498962614685297, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 220, "u": -1.7824407815933228, "weight": 0.06514163315296173 }, { "diff_generated": -12.74167251586914, "epoch": 0.07453013609850939, "grad_norm": 5.644389790526717, "learning_rate": 1.9870410367170624e-07, "logits/chosen": -2.4751639366149902, "logits/rejected": -2.472346067428589, "logps/chosen": -33.588478088378906, "logps/rejected": -194.97332763671875, "logps_avg/chosen": -0.19568376243114471, "logps_avg/rejected": -1.2741672992706299, "loss": 0.1957, "losses_ref": -0.0029640875291079283, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 230, "u": -1.7599050998687744, "weight": 0.07632104307413101 }, { "diff_generated": -12.876994132995605, "epoch": 0.07777057679844458, "grad_norm": 5.417843349387695, "learning_rate": 2.0734341252699785e-07, "logits/chosen": -2.4684810638427734, "logits/rejected": -2.5136160850524902, "logps/chosen": -31.660619735717773, "logps/rejected": -216.2432403564453, "logps_avg/chosen": -0.18595094978809357, "logps_avg/rejected": -1.287699580192566, "loss": 0.1874, "losses_ref": -0.0023983852006495, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 240, "u": -1.7855768203735352, "weight": 0.06173267960548401 }, { "diff_generated": -12.893110275268555, "epoch": 0.08101101749837979, "grad_norm": 5.038605999289401, "learning_rate": 2.159827213822894e-07, "logits/chosen": -2.455719470977783, "logits/rejected": -2.5414233207702637, "logps/chosen": -33.99782180786133, "logps/rejected": -238.786376953125, "logps_avg/chosen": -0.19342893362045288, "logps_avg/rejected": -1.2893109321594238, "loss": 0.1938, "losses_ref": -0.002995225368067622, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 250, "u": -1.7953672409057617, "weight": 0.05768171697854996 }, { "diff_generated": -13.075129508972168, "epoch": 0.08425145819831498, "grad_norm": 5.0467419773950715, "learning_rate": 2.2462203023758098e-07, "logits/chosen": -2.458019256591797, "logits/rejected": -2.485776662826538, "logps/chosen": -35.66144561767578, "logps/rejected": -226.5882568359375, "logps_avg/chosen": -0.1955607533454895, "logps_avg/rejected": -1.3075129985809326, "loss": 0.1902, "losses_ref": -0.0024760509841144085, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 260, "u": -1.7950090169906616, "weight": 0.05749331787228584 }, { "diff_generated": -12.619610786437988, "epoch": 0.08749189889825017, "grad_norm": 5.0082322883287365, "learning_rate": 2.3326133909287256e-07, "logits/chosen": -2.4652304649353027, "logits/rejected": -2.509176731109619, "logps/chosen": -32.687461853027344, "logps/rejected": -214.37661743164062, "logps_avg/chosen": -0.17882901430130005, "logps_avg/rejected": -1.2619612216949463, "loss": 0.1866, "losses_ref": -0.0019212514162063599, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 270, "u": -1.7393690347671509, "weight": 0.0853450745344162 }, { "diff_generated": -12.650653839111328, "epoch": 0.09073233959818536, "grad_norm": 4.891295319023462, "learning_rate": 2.4190064794816416e-07, "logits/chosen": -2.44873309135437, "logits/rejected": -2.5190067291259766, "logps/chosen": -32.08829879760742, "logps/rejected": -221.0590362548828, "logps_avg/chosen": -0.18478551506996155, "logps_avg/rejected": -1.2650654315948486, "loss": 0.1855, "losses_ref": -0.004490494728088379, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 280, "u": -1.7792739868164062, "weight": 0.06853736937046051 }, { "diff_generated": -12.423052787780762, "epoch": 0.09397278029812055, "grad_norm": 4.797689682089731, "learning_rate": 2.505399568034557e-07, "logits/chosen": -2.4705300331115723, "logits/rejected": -2.496605634689331, "logps/chosen": -34.917354583740234, "logps/rejected": -211.85708618164062, "logps_avg/chosen": -0.19044120609760284, "logps_avg/rejected": -1.2423055171966553, "loss": 0.1835, "losses_ref": -0.00314778508618474, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 290, "u": -1.7591886520385742, "weight": 0.07695788890123367 }, { "diff_generated": -14.825152397155762, "epoch": 0.09721322099805574, "grad_norm": 5.0295975035207405, "learning_rate": 2.591792656587473e-07, "logits/chosen": -2.4914021492004395, "logits/rejected": -2.534492254257202, "logps/chosen": -30.0008544921875, "logps/rejected": -238.60293579101562, "logps_avg/chosen": -0.17152948677539825, "logps_avg/rejected": -1.4825150966644287, "loss": 0.1796, "losses_ref": -0.0015082244062796235, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 300, "u": -1.8350900411605835, "weight": 0.03436414152383804 }, { "diff_generated": -13.583699226379395, "epoch": 0.10045366169799093, "grad_norm": 4.75263679666864, "learning_rate": 2.6781857451403887e-07, "logits/chosen": -2.4600813388824463, "logits/rejected": -2.5308713912963867, "logps/chosen": -29.974111557006836, "logps/rejected": -225.1046905517578, "logps_avg/chosen": -0.18076516687870026, "logps_avg/rejected": -1.3583698272705078, "loss": 0.1744, "losses_ref": -0.0025918360333889723, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 310, "u": -1.808953881263733, "weight": 0.04961549490690231 }, { "diff_generated": -13.35509967803955, "epoch": 0.10369410239792612, "grad_norm": 4.556274279970173, "learning_rate": 2.764578833693304e-07, "logits/chosen": -2.432319402694702, "logits/rejected": -2.4622726440429688, "logps/chosen": -30.676654815673828, "logps/rejected": -216.61288452148438, "logps_avg/chosen": -0.17878147959709167, "logps_avg/rejected": -1.3355098962783813, "loss": 0.1776, "losses_ref": -0.0024136919528245926, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 320, "u": -1.737860918045044, "weight": 0.08699294179677963 }, { "diff_generated": -13.178678512573242, "epoch": 0.10693454309786131, "grad_norm": 5.291823716686084, "learning_rate": 2.8509719222462203e-07, "logits/chosen": -2.499701976776123, "logits/rejected": -2.5902438163757324, "logps/chosen": -31.878047943115234, "logps/rejected": -240.4405517578125, "logps_avg/chosen": -0.18391458690166473, "logps_avg/rejected": -1.3178678750991821, "loss": 0.1795, "losses_ref": -0.0012684818357229233, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 330, "u": -1.776533842086792, "weight": 0.06484408676624298 }, { "diff_generated": -13.998690605163574, "epoch": 0.1101749837977965, "grad_norm": 5.383787138213607, "learning_rate": 2.937365010799136e-07, "logits/chosen": -2.4777579307556152, "logits/rejected": -2.5280511379241943, "logps/chosen": -30.474166870117188, "logps/rejected": -237.3151092529297, "logps_avg/chosen": -0.1744353324174881, "logps_avg/rejected": -1.3998689651489258, "loss": 0.1794, "losses_ref": -0.00262268865481019, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 340, "u": -1.8082706928253174, "weight": 0.05032556504011154 }, { "diff_generated": -14.047930717468262, "epoch": 0.11341542449773169, "grad_norm": 4.581918938488034, "learning_rate": 3.023758099352052e-07, "logits/chosen": -2.4632320404052734, "logits/rejected": -2.5006260871887207, "logps/chosen": -32.7548828125, "logps/rejected": -238.8568572998047, "logps_avg/chosen": -0.20024879276752472, "logps_avg/rejected": -1.4047930240631104, "loss": 0.1811, "losses_ref": -0.0019572232849895954, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 350, "u": -1.8337892293930054, "weight": 0.035822127014398575 }, { "diff_generated": -13.68774700164795, "epoch": 0.11665586519766688, "grad_norm": 4.738270576170094, "learning_rate": 3.1101511879049674e-07, "logits/chosen": -2.4731106758117676, "logits/rejected": -2.5093135833740234, "logps/chosen": -31.073184967041016, "logps/rejected": -229.44540405273438, "logps_avg/chosen": -0.17675338685512543, "logps_avg/rejected": -1.3687747716903687, "loss": 0.1774, "losses_ref": -0.0017537868116050959, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 360, "u": -1.7635164260864258, "weight": 0.07238699495792389 }, { "diff_generated": -14.045831680297852, "epoch": 0.11989630589760207, "grad_norm": 4.620735691114173, "learning_rate": 3.1965442764578835e-07, "logits/chosen": -2.553758382797241, "logits/rejected": -2.5590033531188965, "logps/chosen": -29.680404663085938, "logps/rejected": -251.1548309326172, "logps_avg/chosen": -0.16406632959842682, "logps_avg/rejected": -1.404583215713501, "loss": 0.1748, "losses_ref": -0.002590332878753543, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 370, "u": -1.8194854259490967, "weight": 0.04447915405035019 }, { "diff_generated": -13.711982727050781, "epoch": 0.12313674659753726, "grad_norm": 4.494188858637888, "learning_rate": 3.282937365010799e-07, "logits/chosen": -2.5184168815612793, "logits/rejected": -2.5638442039489746, "logps/chosen": -30.885112762451172, "logps/rejected": -246.29946899414062, "logps_avg/chosen": -0.1680624783039093, "logps_avg/rejected": -1.371198296546936, "loss": 0.1742, "losses_ref": -0.0014965020818635821, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 380, "u": -1.776185393333435, "weight": 0.06525006145238876 }, { "diff_generated": -13.470802307128906, "epoch": 0.12637718729747247, "grad_norm": 4.740378132166772, "learning_rate": 3.3693304535637145e-07, "logits/chosen": -2.5411252975463867, "logits/rejected": -2.614982843399048, "logps/chosen": -33.133872985839844, "logps/rejected": -245.2790985107422, "logps_avg/chosen": -0.18118393421173096, "logps_avg/rejected": -1.3470804691314697, "loss": 0.1766, "losses_ref": -0.002783800009638071, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 390, "u": -1.8315505981445312, "weight": 0.03817793354392052 }, { "diff_generated": -14.816889762878418, "epoch": 0.12961762799740764, "grad_norm": 5.062656449082662, "learning_rate": 3.4557235421166306e-07, "logits/chosen": -2.4957797527313232, "logits/rejected": -2.543741226196289, "logps/chosen": -30.80033302307129, "logps/rejected": -253.0391845703125, "logps_avg/chosen": -0.17897175252437592, "logps_avg/rejected": -1.4816890954971313, "loss": 0.1667, "losses_ref": -0.0012838852126151323, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 400, "u": -1.8120357990264893, "weight": 0.04616154357790947 }, { "diff_generated": -14.791712760925293, "epoch": 0.13285806869734285, "grad_norm": 4.643159185649741, "learning_rate": 3.542116630669546e-07, "logits/chosen": -2.4794423580169678, "logits/rejected": -2.5461440086364746, "logps/chosen": -27.8972110748291, "logps/rejected": -251.6249237060547, "logps_avg/chosen": -0.17388319969177246, "logps_avg/rejected": -1.4791711568832397, "loss": 0.1672, "losses_ref": -0.0017990957712754607, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 410, "u": -1.8340179920196533, "weight": 0.03552815318107605 }, { "diff_generated": -14.0972261428833, "epoch": 0.13609850939727802, "grad_norm": 4.3280080110575305, "learning_rate": 3.628509719222462e-07, "logits/chosen": -2.5374386310577393, "logits/rejected": -2.5984349250793457, "logps/chosen": -29.984893798828125, "logps/rejected": -250.20181274414062, "logps_avg/chosen": -0.16934213042259216, "logps_avg/rejected": -1.4097226858139038, "loss": 0.1697, "losses_ref": -0.0023155449889600277, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 420, "u": -1.8452469110488892, "weight": 0.030071932822465897 }, { "diff_generated": -13.562525749206543, "epoch": 0.13933895009721323, "grad_norm": 4.280500048816285, "learning_rate": 3.7149028077753777e-07, "logits/chosen": -2.4581573009490967, "logits/rejected": -2.5427894592285156, "logps/chosen": -26.301082611083984, "logps/rejected": -238.8366241455078, "logps_avg/chosen": -0.15402349829673767, "logps_avg/rejected": -1.3562524318695068, "loss": 0.1651, "losses_ref": -0.0015942498575896025, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 430, "u": -1.6928117275238037, "weight": 0.10941553115844727 }, { "diff_generated": -13.584875106811523, "epoch": 0.1425793907971484, "grad_norm": 4.924019126408653, "learning_rate": 3.801295896328294e-07, "logits/chosen": -2.499994993209839, "logits/rejected": -2.533939838409424, "logps/chosen": -29.343231201171875, "logps/rejected": -238.6305389404297, "logps_avg/chosen": -0.1552811563014984, "logps_avg/rejected": -1.3584874868392944, "loss": 0.1696, "losses_ref": -0.0027523760218173265, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 440, "u": -1.760610818862915, "weight": 0.0755021795630455 }, { "diff_generated": -13.918545722961426, "epoch": 0.1458198314970836, "grad_norm": 4.446751093583341, "learning_rate": 3.887688984881209e-07, "logits/chosen": -2.516287326812744, "logits/rejected": -2.5299954414367676, "logps/chosen": -27.761011123657227, "logps/rejected": -243.63455200195312, "logps_avg/chosen": -0.15204386413097382, "logps_avg/rejected": -1.3918545246124268, "loss": 0.1654, "losses_ref": -0.003121785121038556, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 450, "u": -1.736435890197754, "weight": 0.08867697417736053 }, { "diff_generated": -14.489161491394043, "epoch": 0.14906027219701878, "grad_norm": 4.1975650434326415, "learning_rate": 3.974082073434125e-07, "logits/chosen": -2.5312724113464355, "logits/rejected": -2.553266763687134, "logps/chosen": -30.491928100585938, "logps/rejected": -250.10696411132812, "logps_avg/chosen": -0.16546538472175598, "logps_avg/rejected": -1.4489161968231201, "loss": 0.1621, "losses_ref": -0.0017298974562436342, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 460, "u": -1.7870547771453857, "weight": 0.06000211834907532 }, { "diff_generated": -14.744906425476074, "epoch": 0.152300712896954, "grad_norm": 7.566673616414248, "learning_rate": 4.060475161987041e-07, "logits/chosen": -2.5269410610198975, "logits/rejected": -2.5724127292633057, "logps/chosen": -31.23870277404785, "logps/rejected": -262.3318176269531, "logps_avg/chosen": -0.16487276554107666, "logps_avg/rejected": -1.4744906425476074, "loss": 0.1635, "losses_ref": -0.001866829232312739, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 470, "u": -1.8341680765151978, "weight": 0.03543657064437866 }, { "diff_generated": -15.317721366882324, "epoch": 0.15554115359688916, "grad_norm": 4.387634572440091, "learning_rate": 4.146868250539957e-07, "logits/chosen": -2.5223655700683594, "logits/rejected": -2.5226664543151855, "logps/chosen": -27.46135902404785, "logps/rejected": -246.641357421875, "logps_avg/chosen": -0.1489681452512741, "logps_avg/rejected": -1.5317721366882324, "loss": 0.1642, "losses_ref": -0.002479640068486333, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 480, "u": -1.8073724508285522, "weight": 0.05108867958188057 }, { "diff_generated": -13.857152938842773, "epoch": 0.15878159429682437, "grad_norm": 4.240910416137423, "learning_rate": 4.2332613390928724e-07, "logits/chosen": -2.5168867111206055, "logits/rejected": -2.518345355987549, "logps/chosen": -30.311405181884766, "logps/rejected": -233.2909698486328, "logps_avg/chosen": -0.1701345443725586, "logps_avg/rejected": -1.3857154846191406, "loss": 0.1675, "losses_ref": -0.0022363795433193445, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 490, "u": -1.7861015796661377, "weight": 0.06115210801362991 }, { "diff_generated": -14.959016799926758, "epoch": 0.16202203499675957, "grad_norm": 4.4420016624530465, "learning_rate": 4.319654427645788e-07, "logits/chosen": -2.51141095161438, "logits/rejected": -2.5964505672454834, "logps/chosen": -26.861099243164062, "logps/rejected": -251.33291625976562, "logps_avg/chosen": -0.15614867210388184, "logps_avg/rejected": -1.4959017038345337, "loss": 0.1657, "losses_ref": -0.0017895328346639872, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 500, "u": -1.763580560684204, "weight": 0.07234685122966766 }, { "diff_generated": -13.89812183380127, "epoch": 0.16526247569669475, "grad_norm": 4.139783101709871, "learning_rate": 4.406047516198704e-07, "logits/chosen": -2.451756000518799, "logits/rejected": -2.5747926235198975, "logps/chosen": -25.249866485595703, "logps/rejected": -244.6548309326172, "logps_avg/chosen": -0.15236565470695496, "logps_avg/rejected": -1.3898121118545532, "loss": 0.1592, "losses_ref": -0.0019647441804409027, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 510, "u": -1.7391914129257202, "weight": 0.08555871993303299 }, { "diff_generated": -15.070231437683105, "epoch": 0.16850291639662995, "grad_norm": 4.448321299018289, "learning_rate": 4.4924406047516195e-07, "logits/chosen": -2.5451064109802246, "logits/rejected": -2.5574862957000732, "logps/chosen": -30.759517669677734, "logps/rejected": -274.0057067871094, "logps_avg/chosen": -0.1640317738056183, "logps_avg/rejected": -1.5070230960845947, "loss": 0.158, "losses_ref": -0.001814872375689447, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 520, "u": -1.8107799291610718, "weight": 0.047605521976947784 }, { "diff_generated": -14.907147407531738, "epoch": 0.17174335709656513, "grad_norm": 4.548145949240779, "learning_rate": 4.5788336933045356e-07, "logits/chosen": -2.5269455909729004, "logits/rejected": -2.5774500370025635, "logps/chosen": -26.586517333984375, "logps/rejected": -250.545654296875, "logps_avg/chosen": -0.15523529052734375, "logps_avg/rejected": -1.4907147884368896, "loss": 0.1545, "losses_ref": -0.0010867482051253319, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 530, "u": -1.7412116527557373, "weight": 0.08329650014638901 }, { "diff_generated": -14.34411334991455, "epoch": 0.17498379779650033, "grad_norm": 4.565857951910505, "learning_rate": 4.665226781857451e-07, "logits/chosen": -2.5096616744995117, "logits/rejected": -2.6065526008605957, "logps/chosen": -28.218563079833984, "logps/rejected": -267.3673095703125, "logps_avg/chosen": -0.1656300127506256, "logps_avg/rejected": -1.4344114065170288, "loss": 0.1562, "losses_ref": -0.001910742954351008, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 540, "u": -1.7859923839569092, "weight": 0.06093848496675491 }, { "diff_generated": -15.374615669250488, "epoch": 0.1782242384964355, "grad_norm": 5.0178902060092945, "learning_rate": 4.751619870410367e-07, "logits/chosen": -2.494371175765991, "logits/rejected": -2.471282958984375, "logps/chosen": -28.187641143798828, "logps/rejected": -259.7303771972656, "logps_avg/chosen": -0.15739896893501282, "logps_avg/rejected": -1.537461519241333, "loss": 0.158, "losses_ref": -0.0012739974772557616, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 550, "u": -1.7645008563995361, "weight": 0.07127384841442108 }, { "diff_generated": -15.536798477172852, "epoch": 0.18146467919637072, "grad_norm": 4.020211574052711, "learning_rate": 4.838012958963283e-07, "logits/chosen": -2.4912216663360596, "logits/rejected": -2.5614213943481445, "logps/chosen": -24.883838653564453, "logps/rejected": -277.3970642089844, "logps_avg/chosen": -0.15538007020950317, "logps_avg/rejected": -1.5536797046661377, "loss": 0.16, "losses_ref": -0.001234889728948474, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 560, "u": -1.835831642150879, "weight": 0.03358057886362076 }, { "diff_generated": -15.803949356079102, "epoch": 0.1847051198963059, "grad_norm": 4.00381538300631, "learning_rate": 4.924406047516198e-07, "logits/chosen": -2.478405475616455, "logits/rejected": -2.5514659881591797, "logps/chosen": -27.245723724365234, "logps/rejected": -280.4998779296875, "logps_avg/chosen": -0.15291285514831543, "logps_avg/rejected": -1.580394983291626, "loss": 0.1566, "losses_ref": -0.0008858289802446961, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 570, "u": -1.835996389389038, "weight": 0.03329852223396301 }, { "diff_generated": -15.373080253601074, "epoch": 0.1879455605962411, "grad_norm": 4.317052696494499, "learning_rate": 5.010799136069114e-07, "logits/chosen": -2.544964075088501, "logits/rejected": -2.5514461994171143, "logps/chosen": -29.379268646240234, "logps/rejected": -260.7983093261719, "logps_avg/chosen": -0.15511760115623474, "logps_avg/rejected": -1.5373082160949707, "loss": 0.1563, "losses_ref": -0.0011349378619343042, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 580, "u": -1.7878364324569702, "weight": 0.05907173082232475 }, { "diff_generated": -16.546499252319336, "epoch": 0.19118600129617627, "grad_norm": 3.9727178680185586, "learning_rate": 5.097192224622029e-07, "logits/chosen": -2.5150680541992188, "logits/rejected": -2.523024082183838, "logps/chosen": -26.727558135986328, "logps/rejected": -257.0491943359375, "logps_avg/chosen": -0.15087701380252838, "logps_avg/rejected": -1.654650092124939, "loss": 0.157, "losses_ref": -0.0016078378539532423, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 590, "u": -1.8469066619873047, "weight": 0.02822394296526909 }, { "diff_generated": -16.31281280517578, "epoch": 0.19442644199611148, "grad_norm": 4.257100975350278, "learning_rate": 5.183585313174946e-07, "logits/chosen": -2.529265880584717, "logits/rejected": -2.5448126792907715, "logps/chosen": -32.634605407714844, "logps/rejected": -280.5689392089844, "logps_avg/chosen": -0.17440596222877502, "logps_avg/rejected": -1.6312812566757202, "loss": 0.1589, "losses_ref": -0.0015461514703929424, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 600, "u": -1.7755861282348633, "weight": 0.06589512526988983 }, { "diff_generated": -16.440349578857422, "epoch": 0.19766688269604665, "grad_norm": 4.685778188867323, "learning_rate": 5.269978401727861e-07, "logits/chosen": -2.5129942893981934, "logits/rejected": -2.5518910884857178, "logps/chosen": -29.388286590576172, "logps/rejected": -282.8636474609375, "logps_avg/chosen": -0.15934798121452332, "logps_avg/rejected": -1.6440349817276, "loss": 0.1551, "losses_ref": -0.0010432195849716663, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 610, "u": -1.8006556034088135, "weight": 0.05187270790338516 }, { "diff_generated": -14.875322341918945, "epoch": 0.20090732339598186, "grad_norm": 3.9541371244197405, "learning_rate": 5.356371490280777e-07, "logits/chosen": -2.521015167236328, "logits/rejected": -2.575629711151123, "logps/chosen": -28.5093936920166, "logps/rejected": -282.8163146972656, "logps_avg/chosen": -0.16011330485343933, "logps_avg/rejected": -1.487532138824463, "loss": 0.1515, "losses_ref": -0.001480486593209207, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 620, "u": -1.775630235671997, "weight": 0.06583119183778763 }, { "diff_generated": -15.518178939819336, "epoch": 0.20414776409591703, "grad_norm": 4.521151306660312, "learning_rate": 5.442764578833693e-07, "logits/chosen": -2.5314955711364746, "logits/rejected": -2.546788454055786, "logps/chosen": -30.999698638916016, "logps/rejected": -282.10113525390625, "logps_avg/chosen": -0.16733181476593018, "logps_avg/rejected": -1.5518181324005127, "loss": 0.1547, "losses_ref": -0.002137316856533289, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 630, "u": -1.8101974725723267, "weight": 0.048265378922224045 }, { "diff_generated": -14.888731002807617, "epoch": 0.20738820479585224, "grad_norm": 4.3267193602618645, "learning_rate": 5.529157667386608e-07, "logits/chosen": -2.525298595428467, "logits/rejected": -2.5428686141967773, "logps/chosen": -26.18801498413086, "logps/rejected": -257.1575622558594, "logps_avg/chosen": -0.14402839541435242, "logps_avg/rejected": -1.4888732433319092, "loss": 0.1503, "losses_ref": -0.0013474032748490572, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 640, "u": -1.752764344215393, "weight": 0.07741077244281769 }, { "diff_generated": -16.706762313842773, "epoch": 0.21062864549578741, "grad_norm": 3.9874178439606176, "learning_rate": 5.615550755939525e-07, "logits/chosen": -2.4880218505859375, "logits/rejected": -2.5559372901916504, "logps/chosen": -27.271615982055664, "logps/rejected": -281.1944885253906, "logps_avg/chosen": -0.15678571164608002, "logps_avg/rejected": -1.6706759929656982, "loss": 0.159, "losses_ref": -0.0017356419702991843, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 650, "u": -1.811038613319397, "weight": 0.04730648174881935 }, { "diff_generated": -15.391021728515625, "epoch": 0.21386908619572262, "grad_norm": 4.036784512123039, "learning_rate": 5.701943844492441e-07, "logits/chosen": -2.529348611831665, "logits/rejected": -2.511889934539795, "logps/chosen": -29.90597915649414, "logps/rejected": -270.4825134277344, "logps_avg/chosen": -0.15700222551822662, "logps_avg/rejected": -1.5391019582748413, "loss": 0.1551, "losses_ref": -0.0016960095381364226, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 660, "u": -1.775199294090271, "weight": 0.06631585210561752 }, { "diff_generated": -17.663175582885742, "epoch": 0.21710952689565782, "grad_norm": 4.358399155338858, "learning_rate": 5.788336933045357e-07, "logits/chosen": -2.5374488830566406, "logits/rejected": -2.5241658687591553, "logps/chosen": -30.06842613220215, "logps/rejected": -302.4938659667969, "logps_avg/chosen": -0.16507597267627716, "logps_avg/rejected": -1.7663177251815796, "loss": 0.1526, "losses_ref": -0.0008090495830401778, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 670, "u": -1.8485357761383057, "weight": 0.02635175548493862 }, { "diff_generated": -16.56492805480957, "epoch": 0.220349967595593, "grad_norm": 3.8599372759018835, "learning_rate": 5.874730021598272e-07, "logits/chosen": -2.5052294731140137, "logits/rejected": -2.5328030586242676, "logps/chosen": -25.340312957763672, "logps/rejected": -298.6836853027344, "logps_avg/chosen": -0.1403449922800064, "logps_avg/rejected": -1.656492829322815, "loss": 0.1514, "losses_ref": -0.0011725362855941057, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 680, "u": -1.8475481271743774, "weight": 0.027439361438155174 }, { "diff_generated": -16.202449798583984, "epoch": 0.2235904082955282, "grad_norm": 5.707656978098627, "learning_rate": 5.961123110151188e-07, "logits/chosen": -2.5147368907928467, "logits/rejected": -2.5265755653381348, "logps/chosen": -27.811309814453125, "logps/rejected": -282.5999755859375, "logps_avg/chosen": -0.16103322803974152, "logps_avg/rejected": -1.6202447414398193, "loss": 0.1572, "losses_ref": -0.0010156487114727497, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 690, "u": -1.8123470544815063, "weight": 0.045763980597257614 }, { "diff_generated": -15.581779479980469, "epoch": 0.22683084899546338, "grad_norm": 4.543595624237527, "learning_rate": 6.047516198704104e-07, "logits/chosen": -2.5117552280426025, "logits/rejected": -2.5222716331481934, "logps/chosen": -24.74635887145996, "logps/rejected": -259.0545959472656, "logps_avg/chosen": -0.1400236338376999, "logps_avg/rejected": -1.5581779479980469, "loss": 0.1523, "losses_ref": -0.0016117949271574616, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 700, "u": -1.7632642984390259, "weight": 0.07257858663797379 }, { "diff_generated": -17.33824348449707, "epoch": 0.23007128969539858, "grad_norm": 3.756311999295835, "learning_rate": 6.133909287257019e-07, "logits/chosen": -2.4569334983825684, "logits/rejected": -2.4747729301452637, "logps/chosen": -25.68191909790039, "logps/rejected": -288.65692138671875, "logps_avg/chosen": -0.14832261204719543, "logps_avg/rejected": -1.7338241338729858, "loss": 0.153, "losses_ref": -0.0006162269273772836, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 710, "u": -1.7656484842300415, "weight": 0.06995699554681778 }, { "diff_generated": -17.465576171875, "epoch": 0.23331173039533376, "grad_norm": 3.9925886017873653, "learning_rate": 6.220302375809935e-07, "logits/chosen": -2.522162914276123, "logits/rejected": -2.556715726852417, "logps/chosen": -25.977420806884766, "logps/rejected": -301.238037109375, "logps_avg/chosen": -0.14257541298866272, "logps_avg/rejected": -1.746557593345642, "loss": 0.1509, "losses_ref": -0.0010114299366250634, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 720, "u": -1.8123047351837158, "weight": 0.045841820538043976 }, { "diff_generated": -16.138324737548828, "epoch": 0.23655217109526896, "grad_norm": 4.368943205860947, "learning_rate": 6.306695464362851e-07, "logits/chosen": -2.495678663253784, "logits/rejected": -2.5630602836608887, "logps/chosen": -26.281463623046875, "logps/rejected": -291.1576843261719, "logps_avg/chosen": -0.15310141444206238, "logps_avg/rejected": -1.6138322353363037, "loss": 0.1493, "losses_ref": -0.0015504444018006325, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 730, "u": -1.7755588293075562, "weight": 0.06590535491704941 }, { "diff_generated": -16.800281524658203, "epoch": 0.23979261179520414, "grad_norm": 4.140277380523364, "learning_rate": 6.393088552915767e-07, "logits/chosen": -2.417356014251709, "logits/rejected": -2.488996982574463, "logps/chosen": -25.04463005065918, "logps/rejected": -295.3211669921875, "logps_avg/chosen": -0.14725720882415771, "logps_avg/rejected": -1.680027723312378, "loss": 0.15, "losses_ref": -0.0007039078627713025, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 740, "u": -1.742006540298462, "weight": 0.08238840103149414 }, { "diff_generated": -16.212671279907227, "epoch": 0.24303305249513935, "grad_norm": 3.7113691272292164, "learning_rate": 6.479481641468682e-07, "logits/chosen": -2.4623329639434814, "logits/rejected": -2.4828693866729736, "logps/chosen": -26.126338958740234, "logps/rejected": -287.18853759765625, "logps_avg/chosen": -0.14278826117515564, "logps_avg/rejected": -1.6212670803070068, "loss": 0.1514, "losses_ref": -0.001221969723701477, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 750, "u": -1.7647031545639038, "weight": 0.07106192409992218 }, { "diff_generated": -17.23666000366211, "epoch": 0.24627349319507452, "grad_norm": 3.626914226331303, "learning_rate": 6.565874730021598e-07, "logits/chosen": -2.4999337196350098, "logits/rejected": -2.5974574089050293, "logps/chosen": -26.908031463623047, "logps/rejected": -307.95330810546875, "logps_avg/chosen": -0.1553696095943451, "logps_avg/rejected": -1.7236659526824951, "loss": 0.1528, "losses_ref": -0.0013714140513911843, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 760, "u": -1.859244704246521, "weight": 0.021420713514089584 }, { "diff_generated": -16.649688720703125, "epoch": 0.24951393389500973, "grad_norm": 3.6449586231357523, "learning_rate": 6.652267818574514e-07, "logits/chosen": -2.5069055557250977, "logits/rejected": -2.5311920642852783, "logps/chosen": -27.44228172302246, "logps/rejected": -280.15423583984375, "logps_avg/chosen": -0.1512620747089386, "logps_avg/rejected": -1.6649688482284546, "loss": 0.1469, "losses_ref": -0.0011087359162047505, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 770, "u": -1.740947961807251, "weight": 0.08358202874660492 }, { "diff_generated": -17.594615936279297, "epoch": 0.25275437459494493, "grad_norm": 3.591067576204669, "learning_rate": 6.738660907127429e-07, "logits/chosen": -2.5164694786071777, "logits/rejected": -2.544618606567383, "logps/chosen": -29.995891571044922, "logps/rejected": -302.5132141113281, "logps_avg/chosen": -0.16520099341869354, "logps_avg/rejected": -1.7594616413116455, "loss": 0.1514, "losses_ref": -0.0011694171698763967, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 780, "u": -1.8233131170272827, "weight": 0.04046357423067093 }, { "diff_generated": -15.048116683959961, "epoch": 0.2559948152948801, "grad_norm": 4.007762938469329, "learning_rate": 6.825053995680345e-07, "logits/chosen": -2.4432899951934814, "logits/rejected": -2.543459415435791, "logps/chosen": -26.264307022094727, "logps/rejected": -274.15142822265625, "logps_avg/chosen": -0.15596263110637665, "logps_avg/rejected": -1.5048116445541382, "loss": 0.152, "losses_ref": -0.0016327224439010024, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 790, "u": -1.751920461654663, "weight": 0.07835827022790909 }, { "diff_generated": -16.8079891204834, "epoch": 0.2592352559948153, "grad_norm": 3.803322718876922, "learning_rate": 6.911447084233261e-07, "logits/chosen": -2.4625935554504395, "logits/rejected": -2.491464376449585, "logps/chosen": -25.487167358398438, "logps/rejected": -294.3965759277344, "logps_avg/chosen": -0.14450162649154663, "logps_avg/rejected": -1.680798888206482, "loss": 0.1441, "losses_ref": -0.0011468358570709825, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 800, "u": -1.8119513988494873, "weight": 0.046235114336013794 }, { "diff_generated": -16.325275421142578, "epoch": 0.26247569669475046, "grad_norm": 3.8441167643161624, "learning_rate": 6.997840172786177e-07, "logits/chosen": -2.5127015113830566, "logits/rejected": -2.5084917545318604, "logps/chosen": -28.14730453491211, "logps/rejected": -269.518310546875, "logps_avg/chosen": -0.15695425868034363, "logps_avg/rejected": -1.6325273513793945, "loss": 0.1459, "losses_ref": -0.0017818144988268614, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 810, "u": -1.8110424280166626, "weight": 0.047296833246946335 }, { "diff_generated": -16.837024688720703, "epoch": 0.2657161373946857, "grad_norm": 3.8403571035756245, "learning_rate": 7.084233261339092e-07, "logits/chosen": -2.4789652824401855, "logits/rejected": -2.552279472351074, "logps/chosen": -25.903278350830078, "logps/rejected": -297.843994140625, "logps_avg/chosen": -0.15136049687862396, "logps_avg/rejected": -1.6837022304534912, "loss": 0.1484, "losses_ref": -0.0007834344869479537, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 820, "u": -1.7774156332015991, "weight": 0.06382577121257782 }, { "diff_generated": -16.415281295776367, "epoch": 0.26895657809462087, "grad_norm": 3.8122228541342973, "learning_rate": 7.170626349892008e-07, "logits/chosen": -2.4843931198120117, "logits/rejected": -2.530792713165283, "logps/chosen": -25.79958724975586, "logps/rejected": -278.10589599609375, "logps_avg/chosen": -0.14485926926136017, "logps_avg/rejected": -1.6415281295776367, "loss": 0.1513, "losses_ref": -0.0013059931807219982, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 830, "u": -1.7524468898773193, "weight": 0.07773178815841675 }, { "diff_generated": -17.35388946533203, "epoch": 0.27219701879455604, "grad_norm": 5.721419042997992, "learning_rate": 7.257019438444924e-07, "logits/chosen": -2.543403148651123, "logits/rejected": -2.4924099445343018, "logps/chosen": -28.061603546142578, "logps/rejected": -278.6396484375, "logps_avg/chosen": -0.14713650941848755, "logps_avg/rejected": -1.735388994216919, "loss": 0.147, "losses_ref": -0.0010668208124116063, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 840, "u": -1.8004766702651978, "weight": 0.05206400901079178 }, { "diff_generated": -16.650218963623047, "epoch": 0.2754374594944913, "grad_norm": 3.4977682328654813, "learning_rate": 7.343412526997839e-07, "logits/chosen": -2.469409227371216, "logits/rejected": -2.451508045196533, "logps/chosen": -26.62298583984375, "logps/rejected": -277.678466796875, "logps_avg/chosen": -0.14454862475395203, "logps_avg/rejected": -1.6650216579437256, "loss": 0.1465, "losses_ref": -0.0009407054749317467, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 850, "u": -1.729368805885315, "weight": 0.08950553089380264 }, { "diff_generated": -16.65591049194336, "epoch": 0.27867790019442645, "grad_norm": 3.7789630141654427, "learning_rate": 7.429805615550755e-07, "logits/chosen": -2.5125536918640137, "logits/rejected": -2.518594264984131, "logps/chosen": -28.266979217529297, "logps/rejected": -279.36236572265625, "logps_avg/chosen": -0.14472495019435883, "logps_avg/rejected": -1.6655908823013306, "loss": 0.1457, "losses_ref": -0.0014739853795617819, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 860, "u": -1.7628333568572998, "weight": 0.07282988727092743 }, { "diff_generated": -17.289453506469727, "epoch": 0.28191834089436163, "grad_norm": 3.816298176813016, "learning_rate": 7.516198704103671e-07, "logits/chosen": -2.4170525074005127, "logits/rejected": -2.5361487865448, "logps/chosen": -21.06148338317871, "logps/rejected": -310.81689453125, "logps_avg/chosen": -0.13158485293388367, "logps_avg/rejected": -1.7289453744888306, "loss": 0.1437, "losses_ref": -0.0011063070269301534, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 870, "u": -1.788311243057251, "weight": 0.0586506649851799 }, { "diff_generated": -17.18234634399414, "epoch": 0.2851587815942968, "grad_norm": 3.5053935087270074, "learning_rate": 7.602591792656587e-07, "logits/chosen": -2.512596845626831, "logits/rejected": -2.5777673721313477, "logps/chosen": -28.489471435546875, "logps/rejected": -299.99554443359375, "logps_avg/chosen": -0.16591385006904602, "logps_avg/rejected": -1.7182344198226929, "loss": 0.145, "losses_ref": -0.0012544682249426842, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 880, "u": -1.8237812519073486, "weight": 0.04003932327032089 }, { "diff_generated": -16.68752670288086, "epoch": 0.28839922229423204, "grad_norm": 3.6606878088736043, "learning_rate": 7.688984881209502e-07, "logits/chosen": -2.4937093257904053, "logits/rejected": -2.5517916679382324, "logps/chosen": -26.318889617919922, "logps/rejected": -286.62115478515625, "logps_avg/chosen": -0.1460975706577301, "logps_avg/rejected": -1.668752908706665, "loss": 0.1507, "losses_ref": -0.001337700174190104, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 890, "u": -1.7759170532226562, "weight": 0.06547953933477402 }, { "diff_generated": -17.303630828857422, "epoch": 0.2916396629941672, "grad_norm": 3.96462786512709, "learning_rate": 7.775377969762419e-07, "logits/chosen": -2.5112993717193604, "logits/rejected": -2.552793502807617, "logps/chosen": -23.081388473510742, "logps/rejected": -312.2502136230469, "logps_avg/chosen": -0.13418585062026978, "logps_avg/rejected": -1.730363130569458, "loss": 0.1435, "losses_ref": -0.0010935317259281874, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 900, "u": -1.8119033575057983, "weight": 0.046246398240327835 }, { "diff_generated": -17.516305923461914, "epoch": 0.2948801036941024, "grad_norm": 3.960436884466544, "learning_rate": 7.861771058315335e-07, "logits/chosen": -2.536668300628662, "logits/rejected": -2.579866409301758, "logps/chosen": -24.779300689697266, "logps/rejected": -301.9021911621094, "logps_avg/chosen": -0.1520080268383026, "logps_avg/rejected": -1.7516307830810547, "loss": 0.1499, "losses_ref": -0.0007299171993508935, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 910, "u": -1.836708426475525, "weight": 0.03255882114171982 }, { "diff_generated": -18.22040557861328, "epoch": 0.29812054439403757, "grad_norm": 4.216145687800187, "learning_rate": 7.94816414686825e-07, "logits/chosen": -2.4581117630004883, "logits/rejected": -2.4852912425994873, "logps/chosen": -23.488094329833984, "logps/rejected": -296.0298156738281, "logps_avg/chosen": -0.1450100541114807, "logps_avg/rejected": -1.8220407962799072, "loss": 0.1444, "losses_ref": -0.0011233676923438907, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 920, "u": -1.8120393753051758, "weight": 0.046129390597343445 }, { "diff_generated": -16.802902221679688, "epoch": 0.3013609850939728, "grad_norm": 3.540189522204402, "learning_rate": 7.999995450631473e-07, "logits/chosen": -2.5070629119873047, "logits/rejected": -2.578346014022827, "logps/chosen": -24.117534637451172, "logps/rejected": -302.5053405761719, "logps_avg/chosen": -0.13478049635887146, "logps_avg/rejected": -1.6802902221679688, "loss": 0.1461, "losses_ref": -0.000596770434640348, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 930, "u": -1.7776219844818115, "weight": 0.06357844918966293 }, { "diff_generated": -17.449804306030273, "epoch": 0.304601425793908, "grad_norm": 3.4216904204057315, "learning_rate": 7.999944270354383e-07, "logits/chosen": -2.4589948654174805, "logits/rejected": -2.5639543533325195, "logps/chosen": -26.90707778930664, "logps/rejected": -313.41455078125, "logps_avg/chosen": -0.16656050086021423, "logps_avg/rejected": -1.7449802160263062, "loss": 0.1507, "losses_ref": -0.000916670891456306, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 940, "u": -1.8126401901245117, "weight": 0.04546584561467171 }, { "diff_generated": -16.95920181274414, "epoch": 0.30784186649384315, "grad_norm": 3.5840189261030604, "learning_rate": 7.99983622381959e-07, "logits/chosen": -2.49526047706604, "logits/rejected": -2.5164170265197754, "logps/chosen": -27.150075912475586, "logps/rejected": -277.107666015625, "logps_avg/chosen": -0.15076836943626404, "logps_avg/rejected": -1.6959202289581299, "loss": 0.1482, "losses_ref": -0.0008078349055722356, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 950, "u": -1.8009493350982666, "weight": 0.05151613801717758 }, { "diff_generated": -19.485225677490234, "epoch": 0.31108230719377833, "grad_norm": 3.884047901709329, "learning_rate": 7.999671312563164e-07, "logits/chosen": -2.5171866416931152, "logits/rejected": -2.4757168292999268, "logps/chosen": -26.446279525756836, "logps/rejected": -301.2461242675781, "logps_avg/chosen": -0.15032193064689636, "logps_avg/rejected": -1.9485225677490234, "loss": 0.1438, "losses_ref": -0.0010997498175129294, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 960, "u": -1.7880767583847046, "weight": 0.0588577575981617 }, { "diff_generated": -18.37019920349121, "epoch": 0.31432274789371356, "grad_norm": 3.5647427602940236, "learning_rate": 7.999449538929611e-07, "logits/chosen": -2.4516000747680664, "logits/rejected": -2.4881412982940674, "logps/chosen": -25.292774200439453, "logps/rejected": -301.6772766113281, "logps_avg/chosen": -0.1464090645313263, "logps_avg/rejected": -1.8370201587677002, "loss": 0.1489, "losses_ref": -0.0010070034768432379, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 970, "u": -1.8125054836273193, "weight": 0.04562786594033241 }, { "diff_generated": -17.277942657470703, "epoch": 0.31756318859364874, "grad_norm": 3.9531898301627604, "learning_rate": 7.99917090607183e-07, "logits/chosen": -2.4834158420562744, "logits/rejected": -2.5573599338531494, "logps/chosen": -23.000879287719727, "logps/rejected": -308.1183776855469, "logps_avg/chosen": -0.14088958501815796, "logps_avg/rejected": -1.7277939319610596, "loss": 0.1458, "losses_ref": -0.0007574810297228396, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 980, "u": -1.8011051416397095, "weight": 0.05134943872690201 }, { "diff_generated": -17.71529197692871, "epoch": 0.3208036292935839, "grad_norm": 4.066482069190587, "learning_rate": 7.998835417951081e-07, "logits/chosen": -2.5377254486083984, "logits/rejected": -2.5539002418518066, "logps/chosen": -25.564077377319336, "logps/rejected": -285.80694580078125, "logps_avg/chosen": -0.14639118313789368, "logps_avg/rejected": -1.771528959274292, "loss": 0.1446, "losses_ref": -0.0009276577038690448, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 990, "u": -1.8244966268539429, "weight": 0.03922320902347565 }, { "diff_generated": -16.78716468811035, "epoch": 0.32404406999351915, "grad_norm": 3.8501332564370996, "learning_rate": 7.998443079336919e-07, "logits/chosen": -2.495265483856201, "logits/rejected": -2.565699815750122, "logps/chosen": -26.15287208557129, "logps/rejected": -311.22125244140625, "logps_avg/chosen": -0.14763453602790833, "logps_avg/rejected": -1.6787166595458984, "loss": 0.1436, "losses_ref": -0.0007319619762711227, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1000, "u": -1.8367958068847656, "weight": 0.03246457502245903 }, { "diff_generated": -18.131811141967773, "epoch": 0.3272845106934543, "grad_norm": 3.4334006893079447, "learning_rate": 7.997993895807128e-07, "logits/chosen": -2.5410752296447754, "logits/rejected": -2.5453710556030273, "logps/chosen": -25.54119300842285, "logps/rejected": -319.1593017578125, "logps_avg/chosen": -0.14520543813705444, "logps_avg/rejected": -1.813180923461914, "loss": 0.1437, "losses_ref": -0.0006047133356332779, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1010, "u": -1.8132398128509521, "weight": 0.0447794608771801 }, { "diff_generated": -16.725383758544922, "epoch": 0.3305249513933895, "grad_norm": 3.6574381218888523, "learning_rate": 7.997487873747646e-07, "logits/chosen": -2.5111083984375, "logits/rejected": -2.532496929168701, "logps/chosen": -23.3675537109375, "logps/rejected": -300.4358215332031, "logps_avg/chosen": -0.12948718667030334, "logps_avg/rejected": -1.67253839969635, "loss": 0.1365, "losses_ref": -0.0006616250611841679, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1020, "u": -1.765761375427246, "weight": 0.06983973830938339 }, { "diff_generated": -18.22255516052246, "epoch": 0.3337653920933247, "grad_norm": 3.731162582661096, "learning_rate": 7.996925020352465e-07, "logits/chosen": -2.4977619647979736, "logits/rejected": -2.462693452835083, "logps/chosen": -28.452739715576172, "logps/rejected": -295.1959533691406, "logps_avg/chosen": -0.14971373975276947, "logps_avg/rejected": -1.8222553730010986, "loss": 0.1509, "losses_ref": -0.001343491836450994, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1030, "u": -1.8117315769195557, "weight": 0.04650767147541046 }, { "diff_generated": -18.08208465576172, "epoch": 0.3370058327932599, "grad_norm": 3.387416001248169, "learning_rate": 7.99630534362354e-07, "logits/chosen": -2.4722683429718018, "logits/rejected": -2.4949660301208496, "logps/chosen": -23.3449649810791, "logps/rejected": -305.1861877441406, "logps_avg/chosen": -0.1336612105369568, "logps_avg/rejected": -1.8082084655761719, "loss": 0.1417, "losses_ref": -0.0007842335617169738, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1040, "u": -1.765332579612732, "weight": 0.07031672447919846 }, { "diff_generated": -17.331754684448242, "epoch": 0.3402462734931951, "grad_norm": 3.2793739525196455, "learning_rate": 7.995628852370667e-07, "logits/chosen": -2.4455838203430176, "logits/rejected": -2.495821475982666, "logps/chosen": -24.51308250427246, "logps/rejected": -306.106689453125, "logps_avg/chosen": -0.14820663630962372, "logps_avg/rejected": -1.733175277709961, "loss": 0.1452, "losses_ref": -0.0010632300982251763, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1050, "u": -1.7886251211166382, "weight": 0.058313362300395966 }, { "diff_generated": -17.65144157409668, "epoch": 0.34348671419313026, "grad_norm": 3.414314107092028, "learning_rate": 7.994895556211363e-07, "logits/chosen": -2.4573745727539062, "logits/rejected": -2.5492000579833984, "logps/chosen": -25.012226104736328, "logps/rejected": -307.28863525390625, "logps_avg/chosen": -0.15316030383110046, "logps_avg/rejected": -1.7651441097259521, "loss": 0.1426, "losses_ref": -0.0007830454269424081, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1060, "u": -1.8129937648773193, "weight": 0.045065782964229584 }, { "diff_generated": -18.741573333740234, "epoch": 0.34672715489306544, "grad_norm": 3.5362740373683845, "learning_rate": 7.994105465570722e-07, "logits/chosen": -2.4750843048095703, "logits/rejected": -2.4775915145874023, "logps/chosen": -28.20367431640625, "logps/rejected": -321.14019775390625, "logps_avg/chosen": -0.15645088255405426, "logps_avg/rejected": -1.8741573095321655, "loss": 0.1445, "losses_ref": -0.0004584606795106083, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1070, "u": -1.7779619693756104, "weight": 0.06318999081850052 }, { "diff_generated": -17.927629470825195, "epoch": 0.34996759559300067, "grad_norm": 3.5224573664899963, "learning_rate": 7.993258591681279e-07, "logits/chosen": -2.446620464324951, "logits/rejected": -2.461449384689331, "logps/chosen": -25.738372802734375, "logps/rejected": -298.7737121582031, "logps_avg/chosen": -0.14429491758346558, "logps_avg/rejected": -1.7927627563476562, "loss": 0.1449, "losses_ref": -0.00048292643623426557, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1080, "u": -1.7422895431518555, "weight": 0.0820549726486206 }, { "diff_generated": -17.745529174804688, "epoch": 0.35320803629293585, "grad_norm": 3.4810028090401506, "learning_rate": 7.992354946582836e-07, "logits/chosen": -2.5051581859588623, "logits/rejected": -2.5519306659698486, "logps/chosen": -24.41130256652832, "logps/rejected": -324.4436340332031, "logps_avg/chosen": -0.13996274769306183, "logps_avg/rejected": -1.7745529413223267, "loss": 0.142, "losses_ref": -0.00065957399783656, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1090, "u": -1.8132047653198242, "weight": 0.04482080414891243 }, { "diff_generated": -17.48514747619629, "epoch": 0.356448476992871, "grad_norm": 3.776538208700194, "learning_rate": 7.991394543122304e-07, "logits/chosen": -2.4793810844421387, "logits/rejected": -2.5294454097747803, "logps/chosen": -25.155223846435547, "logps/rejected": -291.52569580078125, "logps_avg/chosen": -0.14746752381324768, "logps_avg/rejected": -1.7485147714614868, "loss": 0.1442, "losses_ref": -0.001555544906295836, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1100, "u": -1.7873111963272095, "weight": 0.05974424630403519 }, { "diff_generated": -15.942682266235352, "epoch": 0.3596889176928062, "grad_norm": 3.3175723086642335, "learning_rate": 7.990377394953507e-07, "logits/chosen": -2.450894832611084, "logits/rejected": -2.564906597137451, "logps/chosen": -24.385358810424805, "logps/rejected": -290.43048095703125, "logps_avg/chosen": -0.14321152865886688, "logps_avg/rejected": -1.5942682027816772, "loss": 0.1431, "losses_ref": -0.0007358678849413991, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1110, "u": -1.8011974096298218, "weight": 0.051243700087070465 }, { "diff_generated": -17.29189682006836, "epoch": 0.36292935839274143, "grad_norm": 3.205495384000928, "learning_rate": 7.989303516537001e-07, "logits/chosen": -2.5006320476531982, "logits/rejected": -2.5535852909088135, "logps/chosen": -21.054277420043945, "logps/rejected": -285.4083557128906, "logps_avg/chosen": -0.13172145187854767, "logps_avg/rejected": -1.7291895151138306, "loss": 0.1407, "losses_ref": -0.0006592486170120537, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1120, "u": -1.8012969493865967, "weight": 0.05112830922007561 }, { "diff_generated": -16.982044219970703, "epoch": 0.3661697990926766, "grad_norm": 3.625175942353783, "learning_rate": 7.98817292313986e-07, "logits/chosen": -2.5410523414611816, "logits/rejected": -2.588095188140869, "logps/chosen": -28.69525146484375, "logps/rejected": -307.0025329589844, "logps_avg/chosen": -0.16247490048408508, "logps_avg/rejected": -1.6982042789459229, "loss": 0.1429, "losses_ref": -0.0008386773988604546, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1130, "u": -1.8246562480926514, "weight": 0.039037130773067474 }, { "diff_generated": -17.986581802368164, "epoch": 0.3694102397926118, "grad_norm": 3.2460056654881613, "learning_rate": 7.986985630835463e-07, "logits/chosen": -2.4848456382751465, "logits/rejected": -2.5159919261932373, "logps/chosen": -24.781518936157227, "logps/rejected": -280.80987548828125, "logps_avg/chosen": -0.15118327736854553, "logps_avg/rejected": -1.798658013343811, "loss": 0.1426, "losses_ref": -0.001054365886375308, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1140, "u": -1.8358449935913086, "weight": 0.03351539373397827 }, { "diff_generated": -16.1344051361084, "epoch": 0.37265068049254696, "grad_norm": 3.3260113618842433, "learning_rate": 7.985741656503261e-07, "logits/chosen": -2.5159730911254883, "logits/rejected": -2.542235851287842, "logps/chosen": -29.317920684814453, "logps/rejected": -286.7655334472656, "logps_avg/chosen": -0.16192345321178436, "logps_avg/rejected": -1.6134405136108398, "loss": 0.1435, "losses_ref": -0.001453616307117045, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1150, "u": -1.7875741720199585, "weight": 0.05947249010205269 }, { "diff_generated": -17.043901443481445, "epoch": 0.3758911211924822, "grad_norm": 3.120515366538399, "learning_rate": 7.984441017828543e-07, "logits/chosen": -2.4753224849700928, "logits/rejected": -2.545196056365967, "logps/chosen": -26.04819679260254, "logps/rejected": -308.024169921875, "logps_avg/chosen": -0.15330848097801208, "logps_avg/rejected": -1.7043901681900024, "loss": 0.1391, "losses_ref": -0.0013215601211413741, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1160, "u": -1.8354936838150024, "weight": 0.0339578315615654 }, { "diff_generated": -17.90146255493164, "epoch": 0.37913156189241737, "grad_norm": 3.4954463643326585, "learning_rate": 7.983083733302178e-07, "logits/chosen": -2.534641742706299, "logits/rejected": -2.529932975769043, "logps/chosen": -25.712291717529297, "logps/rejected": -310.0232849121094, "logps_avg/chosen": -0.13911107182502747, "logps_avg/rejected": -1.7901462316513062, "loss": 0.1423, "losses_ref": -0.0007827662047930062, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1170, "u": -1.8010227680206299, "weight": 0.05144274979829788 }, { "diff_generated": -17.42496681213379, "epoch": 0.38237200259235254, "grad_norm": 3.5827637114687736, "learning_rate": 7.98166982222036e-07, "logits/chosen": -2.508788585662842, "logits/rejected": -2.5179848670959473, "logps/chosen": -25.84026527404785, "logps/rejected": -284.7757873535156, "logps_avg/chosen": -0.1503192037343979, "logps_avg/rejected": -1.7424967288970947, "loss": 0.1409, "losses_ref": -0.0010919750202447176, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1180, "u": -1.8121187686920166, "weight": 0.04601747542619705 }, { "diff_generated": -18.662456512451172, "epoch": 0.3856124432922878, "grad_norm": 3.171676752032205, "learning_rate": 7.980199304684328e-07, "logits/chosen": -2.4683659076690674, "logits/rejected": -2.454379081726074, "logps/chosen": -26.418987274169922, "logps/rejected": -315.4735412597656, "logps_avg/chosen": -0.1484156847000122, "logps_avg/rejected": -1.8662456274032593, "loss": 0.1405, "losses_ref": -0.0005341099458746612, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1190, "u": -1.8251888751983643, "weight": 0.03842061385512352 }, { "diff_generated": -18.877710342407227, "epoch": 0.38885288399222295, "grad_norm": 3.353939901355228, "learning_rate": 7.978672201600077e-07, "logits/chosen": -2.4355032444000244, "logits/rejected": -2.4991493225097656, "logps/chosen": -24.02352523803711, "logps/rejected": -321.2890319824219, "logps_avg/chosen": -0.15375861525535583, "logps_avg/rejected": -1.8877712488174438, "loss": 0.1373, "losses_ref": -0.0003830655477941036, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1200, "u": -1.7779605388641357, "weight": 0.0631849393248558 }, { "diff_generated": -18.390823364257812, "epoch": 0.39209332469215813, "grad_norm": 3.150342260956621, "learning_rate": 7.97708853467807e-07, "logits/chosen": -2.4914722442626953, "logits/rejected": -2.5442190170288086, "logps/chosen": -22.80034828186035, "logps/rejected": -316.0704650878906, "logps_avg/chosen": -0.1339089572429657, "logps_avg/rejected": -1.8390823602676392, "loss": 0.1378, "losses_ref": -0.00068041862687096, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1210, "u": -1.8366985321044922, "weight": 0.03256749361753464 }, { "diff_generated": -17.826963424682617, "epoch": 0.3953337653920933, "grad_norm": 3.4799528088131115, "learning_rate": 7.975448326432927e-07, "logits/chosen": -2.4840798377990723, "logits/rejected": -2.551450252532959, "logps/chosen": -24.82818031311035, "logps/rejected": -318.79144287109375, "logps_avg/chosen": -0.14494793117046356, "logps_avg/rejected": -1.7826963663101196, "loss": 0.1428, "losses_ref": -0.0009499592706561089, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1220, "u": -1.8125343322753906, "weight": 0.04559002444148064 }, { "diff_generated": -17.58133888244629, "epoch": 0.39857420609202854, "grad_norm": 3.4018737399228827, "learning_rate": 7.973751600183094e-07, "logits/chosen": -2.5015528202056885, "logits/rejected": -2.5259804725646973, "logps/chosen": -26.3048038482666, "logps/rejected": -304.74456787109375, "logps_avg/chosen": -0.14304566383361816, "logps_avg/rejected": -1.758133888244629, "loss": 0.1436, "losses_ref": -0.0007128279539756477, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1230, "u": -1.836792230606079, "weight": 0.032467663288116455 }, { "diff_generated": -19.12925148010254, "epoch": 0.4018146467919637, "grad_norm": 3.352873304767975, "learning_rate": 7.971998380050529e-07, "logits/chosen": -2.4705023765563965, "logits/rejected": -2.506176710128784, "logps/chosen": -25.72548484802246, "logps/rejected": -313.7716979980469, "logps_avg/chosen": -0.15574082732200623, "logps_avg/rejected": -1.9129250049591064, "loss": 0.1445, "losses_ref": -0.000700434495229274, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1240, "u": -1.8604224920272827, "weight": 0.02005375549197197 }, { "diff_generated": -18.428089141845703, "epoch": 0.4050550874918989, "grad_norm": 3.3160451492197414, "learning_rate": 7.970188690960343e-07, "logits/chosen": -2.4253203868865967, "logits/rejected": -2.5191216468811035, "logps/chosen": -21.341529846191406, "logps/rejected": -321.898193359375, "logps_avg/chosen": -0.1284700334072113, "logps_avg/rejected": -1.8428089618682861, "loss": 0.1369, "losses_ref": -0.0007181521505117416, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1250, "u": -1.7890920639038086, "weight": 0.057752568274736404 }, { "diff_generated": -18.595012664794922, "epoch": 0.40829552819183407, "grad_norm": 3.3685643424162057, "learning_rate": 7.968322558640458e-07, "logits/chosen": -2.4410624504089355, "logits/rejected": -2.517035961151123, "logps/chosen": -24.76461410522461, "logps/rejected": -324.0437927246094, "logps_avg/chosen": -0.14681437611579895, "logps_avg/rejected": -1.8595014810562134, "loss": 0.1438, "losses_ref": -0.0008349610725417733, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1260, "u": -1.7889350652694702, "weight": 0.05794642120599747 }, { "diff_generated": -18.137805938720703, "epoch": 0.4115359688917693, "grad_norm": 3.1562946170434243, "learning_rate": 7.966400009621233e-07, "logits/chosen": -2.465362071990967, "logits/rejected": -2.499647855758667, "logps/chosen": -25.29610252380371, "logps/rejected": -318.85333251953125, "logps_avg/chosen": -0.1431424915790558, "logps_avg/rejected": -1.8137805461883545, "loss": 0.1422, "losses_ref": -0.0006325670401565731, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1270, "u": -1.7420135736465454, "weight": 0.08237271010875702 }, { "diff_generated": -17.22241973876953, "epoch": 0.4147764095917045, "grad_norm": 3.2772344941264353, "learning_rate": 7.964421071235092e-07, "logits/chosen": -2.4533419609069824, "logits/rejected": -2.53004789352417, "logps/chosen": -23.705324172973633, "logps/rejected": -298.66033935546875, "logps_avg/chosen": -0.14016704261302948, "logps_avg/rejected": -1.722241997718811, "loss": 0.1385, "losses_ref": -0.0007769926451146603, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1280, "u": -1.7418180704116821, "weight": 0.08260531723499298 }, { "diff_generated": -18.119998931884766, "epoch": 0.41801685029163965, "grad_norm": 3.1222000038882425, "learning_rate": 7.962385771616133e-07, "logits/chosen": -2.484968662261963, "logits/rejected": -2.4719197750091553, "logps/chosen": -25.072969436645508, "logps/rejected": -285.41241455078125, "logps_avg/chosen": -0.14321108162403107, "logps_avg/rejected": -1.8119999170303345, "loss": 0.1349, "losses_ref": -0.0002476568624842912, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1290, "u": -1.7545337677001953, "weight": 0.07535454630851746 }, { "diff_generated": -17.06702995300293, "epoch": 0.42125729099157483, "grad_norm": 3.343738027534707, "learning_rate": 7.960294139699724e-07, "logits/chosen": -2.4875121116638184, "logits/rejected": -2.5455143451690674, "logps/chosen": -23.458911895751953, "logps/rejected": -308.88604736328125, "logps_avg/chosen": -0.13438265025615692, "logps_avg/rejected": -1.7067029476165771, "loss": 0.1389, "losses_ref": -0.0009179472108371556, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1300, "u": -1.7887804508209229, "weight": 0.058129895478487015 }, { "diff_generated": -19.01449966430664, "epoch": 0.42449773169151006, "grad_norm": 3.390022462334716, "learning_rate": 7.958146205222102e-07, "logits/chosen": -2.442352533340454, "logits/rejected": -2.488405227661133, "logps/chosen": -22.647235870361328, "logps/rejected": -317.5475769042969, "logps_avg/chosen": -0.1311555802822113, "logps_avg/rejected": -1.9014499187469482, "loss": 0.136, "losses_ref": -0.0005218187579885125, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1310, "u": -1.8371353149414062, "weight": 0.03206893056631088 }, { "diff_generated": -16.678707122802734, "epoch": 0.42773817239144524, "grad_norm": 3.4549953797681585, "learning_rate": 7.955941998719939e-07, "logits/chosen": -2.438683032989502, "logits/rejected": -2.4860570430755615, "logps/chosen": -24.95685386657715, "logps/rejected": -300.39422607421875, "logps_avg/chosen": -0.13486911356449127, "logps_avg/rejected": -1.6678707599639893, "loss": 0.1363, "losses_ref": -0.0008987674373202026, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1320, "u": -1.7414432764053345, "weight": 0.08302642405033112 }, { "diff_generated": -17.49447250366211, "epoch": 0.4309786130913804, "grad_norm": 3.2061398818826845, "learning_rate": 7.953681551529918e-07, "logits/chosen": -2.4157721996307373, "logits/rejected": -2.4662671089172363, "logps/chosen": -22.527088165283203, "logps/rejected": -311.30169677734375, "logps_avg/chosen": -0.13368523120880127, "logps_avg/rejected": -1.749447226524353, "loss": 0.1362, "losses_ref": -0.000775449734646827, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1330, "u": -1.7892115116119385, "weight": 0.057642944157123566 }, { "diff_generated": -17.374311447143555, "epoch": 0.43421905379131565, "grad_norm": 3.9259248576570536, "learning_rate": 7.951364895788277e-07, "logits/chosen": -2.468964099884033, "logits/rejected": -2.4919381141662598, "logps/chosen": -23.396032333374023, "logps/rejected": -300.768310546875, "logps_avg/chosen": -0.12941637635231018, "logps_avg/rejected": -1.737431287765503, "loss": 0.1357, "losses_ref": -0.0006515913410112262, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1340, "u": -1.813141107559204, "weight": 0.04489173740148544 }, { "diff_generated": -17.841054916381836, "epoch": 0.4374594944912508, "grad_norm": 3.042669878051496, "learning_rate": 7.948992064430363e-07, "logits/chosen": -2.462354898452759, "logits/rejected": -2.524773120880127, "logps/chosen": -25.153942108154297, "logps/rejected": -328.6318664550781, "logps_avg/chosen": -0.14619532227516174, "logps_avg/rejected": -1.784105658531189, "loss": 0.138, "losses_ref": -0.0007250936469063163, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1350, "u": -1.8129888772964478, "weight": 0.04506770148873329 }, { "diff_generated": -17.569915771484375, "epoch": 0.440699935191186, "grad_norm": 3.2656238746919963, "learning_rate": 7.946563091190154e-07, "logits/chosen": -2.4542012214660645, "logits/rejected": -2.5044617652893066, "logps/chosen": -24.700918197631836, "logps/rejected": -302.47125244140625, "logps_avg/chosen": -0.14593173563480377, "logps_avg/rejected": -1.7569917440414429, "loss": 0.139, "losses_ref": -0.001583050936460495, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1360, "u": -1.7991024255752563, "weight": 0.053593169897794724 }, { "diff_generated": -17.115385055541992, "epoch": 0.4439403758911212, "grad_norm": 3.789637622440191, "learning_rate": 7.944078010599788e-07, "logits/chosen": -2.496446132659912, "logits/rejected": -2.4480183124542236, "logps/chosen": -25.29878044128418, "logps/rejected": -303.20843505859375, "logps_avg/chosen": -0.1379314810037613, "logps_avg/rejected": -1.711538553237915, "loss": 0.1356, "losses_ref": -0.00036644996725954115, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1370, "u": -1.7425174713134766, "weight": 0.0817914828658104 }, { "diff_generated": -17.19664192199707, "epoch": 0.4471808165910564, "grad_norm": 3.184134531621745, "learning_rate": 7.941536857989063e-07, "logits/chosen": -2.421329975128174, "logits/rejected": -2.449415683746338, "logps/chosen": -25.185787200927734, "logps/rejected": -303.1695556640625, "logps_avg/chosen": -0.1363663375377655, "logps_avg/rejected": -1.719664216041565, "loss": 0.1363, "losses_ref": -0.0008979662088677287, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1380, "u": -1.7649774551391602, "weight": 0.07069804519414902 }, { "diff_generated": -17.848804473876953, "epoch": 0.4504212572909916, "grad_norm": 3.22439705826391, "learning_rate": 7.938939669484943e-07, "logits/chosen": -2.469378709793091, "logits/rejected": -2.52380108833313, "logps/chosen": -21.16245460510254, "logps/rejected": -316.0628662109375, "logps_avg/chosen": -0.12351296842098236, "logps_avg/rejected": -1.7848806381225586, "loss": 0.1336, "losses_ref": -0.0010566998971626163, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1390, "u": -1.8121494054794312, "weight": 0.04600784555077553 }, { "diff_generated": -18.572917938232422, "epoch": 0.45366169799092676, "grad_norm": 3.175962641830383, "learning_rate": 7.936286482011041e-07, "logits/chosen": -2.426278829574585, "logits/rejected": -2.4540011882781982, "logps/chosen": -24.904693603515625, "logps/rejected": -297.0711364746094, "logps_avg/chosen": -0.14129196107387543, "logps_avg/rejected": -1.8572919368743896, "loss": 0.1423, "losses_ref": -0.0006665909895673394, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1400, "u": -1.8010095357894897, "weight": 0.05142979696393013 }, { "diff_generated": -18.499448776245117, "epoch": 0.45690213869086194, "grad_norm": 3.23552476927222, "learning_rate": 7.933577333287091e-07, "logits/chosen": -2.4180374145507812, "logits/rejected": -2.5311505794525146, "logps/chosen": -23.67328643798828, "logps/rejected": -336.3960876464844, "logps_avg/chosen": -0.13547472655773163, "logps_avg/rejected": -1.8499447107315063, "loss": 0.1332, "losses_ref": -0.00015220060595311224, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1410, "u": -1.813956618309021, "weight": 0.04394307732582092 }, { "diff_generated": -18.732492446899414, "epoch": 0.46014257939079717, "grad_norm": 3.0412880571488885, "learning_rate": 7.930812261828421e-07, "logits/chosen": -2.4600558280944824, "logits/rejected": -2.493492603302002, "logps/chosen": -27.773700714111328, "logps/rejected": -328.41448974609375, "logps_avg/chosen": -0.16207179427146912, "logps_avg/rejected": -1.8732492923736572, "loss": 0.1445, "losses_ref": -0.0006029005744494498, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1420, "u": -1.81326425075531, "weight": 0.04474567994475365 }, { "diff_generated": -17.517717361450195, "epoch": 0.46338302009073234, "grad_norm": 3.1714790753997812, "learning_rate": 7.92799130694539e-07, "logits/chosen": -2.4764244556427, "logits/rejected": -2.491389751434326, "logps/chosen": -24.16140365600586, "logps/rejected": -310.711181640625, "logps_avg/chosen": -0.13903963565826416, "logps_avg/rejected": -1.7517716884613037, "loss": 0.1337, "losses_ref": -0.000567762996070087, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1430, "u": -1.7658287286758423, "weight": 0.06975488364696503 }, { "diff_generated": -17.2349853515625, "epoch": 0.4666234607906675, "grad_norm": 3.0821823873883964, "learning_rate": 7.925114508742848e-07, "logits/chosen": -2.455271005630493, "logits/rejected": -2.5200881958007812, "logps/chosen": -22.790693283081055, "logps/rejected": -302.3728942871094, "logps_avg/chosen": -0.1398683339357376, "logps_avg/rejected": -1.7234985828399658, "loss": 0.1372, "losses_ref": -0.0007977086352184415, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1440, "u": -1.8008880615234375, "weight": 0.05159289389848709 }, { "diff_generated": -18.428184509277344, "epoch": 0.4698639014906027, "grad_norm": 3.3487169217807415, "learning_rate": 7.92218190811955e-07, "logits/chosen": -2.4557693004608154, "logits/rejected": -2.555664300918579, "logps/chosen": -23.864511489868164, "logps/rejected": -331.5283203125, "logps_avg/chosen": -0.13912031054496765, "logps_avg/rejected": -1.8428184986114502, "loss": 0.1353, "losses_ref": -0.0007266084430739284, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1450, "u": -1.8129682540893555, "weight": 0.04508272558450699 }, { "diff_generated": -19.4451961517334, "epoch": 0.47310434219053793, "grad_norm": 2.9957250282590704, "learning_rate": 7.919193546767581e-07, "logits/chosen": -2.44518780708313, "logits/rejected": -2.4765005111694336, "logps/chosen": -24.367023468017578, "logps/rejected": -325.8844299316406, "logps_avg/chosen": -0.1385246217250824, "logps_avg/rejected": -1.9445196390151978, "loss": 0.1356, "losses_ref": -0.0005143691087141633, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1460, "u": -1.8133465051651, "weight": 0.04465331509709358 }, { "diff_generated": -17.70899200439453, "epoch": 0.4763447828904731, "grad_norm": 3.110747927329135, "learning_rate": 7.916149467171768e-07, "logits/chosen": -2.455876588821411, "logits/rejected": -2.483983039855957, "logps/chosen": -20.568796157836914, "logps/rejected": -288.34600830078125, "logps_avg/chosen": -0.12485536187887192, "logps_avg/rejected": -1.7708991765975952, "loss": 0.1332, "losses_ref": -0.0005488159949891269, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1470, "u": -1.7777249813079834, "weight": 0.0634617805480957 }, { "diff_generated": -17.289661407470703, "epoch": 0.4795852235904083, "grad_norm": 3.157389076417406, "learning_rate": 7.913049712609066e-07, "logits/chosen": -2.433224678039551, "logits/rejected": -2.474834442138672, "logps/chosen": -22.74091148376465, "logps/rejected": -307.838623046875, "logps_avg/chosen": -0.1298968493938446, "logps_avg/rejected": -1.7289661169052124, "loss": 0.1361, "losses_ref": -0.0003924695774912834, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1480, "u": -1.7424627542495728, "weight": 0.081854909658432 }, { "diff_generated": -18.811738967895508, "epoch": 0.48282566429034346, "grad_norm": 3.2532023130455094, "learning_rate": 7.909894327147949e-07, "logits/chosen": -2.4604969024658203, "logits/rejected": -2.4856350421905518, "logps/chosen": -23.749942779541016, "logps/rejected": -318.9234619140625, "logps_avg/chosen": -0.13616855442523956, "logps_avg/rejected": -1.8811737298965454, "loss": 0.1353, "losses_ref": -0.0009188092080876231, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1490, "u": -1.824424386024475, "weight": 0.039294369518756866 }, { "diff_generated": -19.15116310119629, "epoch": 0.4860661049902787, "grad_norm": 3.155040368233549, "learning_rate": 7.906683355647783e-07, "logits/chosen": -2.451903820037842, "logits/rejected": -2.511476516723633, "logps/chosen": -22.985902786254883, "logps/rejected": -335.32269287109375, "logps_avg/chosen": -0.13014516234397888, "logps_avg/rejected": -1.915116548538208, "loss": 0.1323, "losses_ref": -0.0009978034067898989, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1500, "u": -1.8238977193832397, "weight": 0.03983033448457718 }, { "diff_generated": -18.532636642456055, "epoch": 0.48930654569021387, "grad_norm": 3.2032888148117427, "learning_rate": 7.903416843758187e-07, "logits/chosen": -2.495983839035034, "logits/rejected": -2.544659376144409, "logps/chosen": -22.922616958618164, "logps/rejected": -329.5682067871094, "logps_avg/chosen": -0.13078387081623077, "logps_avg/rejected": -1.8532636165618896, "loss": 0.1352, "losses_ref": -0.0004203950520604849, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1510, "u": -1.7779722213745117, "weight": 0.06317600607872009 }, { "diff_generated": -18.766742706298828, "epoch": 0.49254698639014904, "grad_norm": 3.1952501859878306, "learning_rate": 7.900094837918385e-07, "logits/chosen": -2.4817230701446533, "logits/rejected": -2.5090298652648926, "logps/chosen": -26.534265518188477, "logps/rejected": -344.45526123046875, "logps_avg/chosen": -0.14062072336673737, "logps_avg/rejected": -1.8766740560531616, "loss": 0.1385, "losses_ref": -0.0007466255337931216, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1520, "u": -1.8367220163345337, "weight": 0.03255104273557663 }, { "diff_generated": -18.747167587280273, "epoch": 0.4957874270900843, "grad_norm": 3.178512660094918, "learning_rate": 7.896717385356545e-07, "logits/chosen": -2.4801974296569824, "logits/rejected": -2.5688040256500244, "logps/chosen": -22.872400283813477, "logps/rejected": -333.326904296875, "logps_avg/chosen": -0.1437094360589981, "logps_avg/rejected": -1.8747165203094482, "loss": 0.134, "losses_ref": -0.0007747443160042167, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1530, "u": -1.8481905460357666, "weight": 0.02670850232243538 }, { "diff_generated": -19.107723236083984, "epoch": 0.49902786779001945, "grad_norm": 3.326305584609048, "learning_rate": 7.893284534089109e-07, "logits/chosen": -2.452545404434204, "logits/rejected": -2.5040736198425293, "logps/chosen": -22.993648529052734, "logps/rejected": -326.62591552734375, "logps_avg/chosen": -0.1290530264377594, "logps_avg/rejected": -1.9107720851898193, "loss": 0.1381, "losses_ref": -0.0006093319389037788, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1540, "u": -1.8250234127044678, "weight": 0.0386078879237175 }, { "diff_generated": -17.362712860107422, "epoch": 0.5022683084899546, "grad_norm": 3.424929998445367, "learning_rate": 7.889796332920106e-07, "logits/chosen": -2.432164430618286, "logits/rejected": -2.529879570007324, "logps/chosen": -21.980777740478516, "logps/rejected": -312.9666442871094, "logps_avg/chosen": -0.13036580383777618, "logps_avg/rejected": -1.7362712621688843, "loss": 0.1311, "losses_ref": -0.00039959652349352837, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1550, "u": -1.7542804479599, "weight": 0.07564841955900192 }, { "diff_generated": -18.691469192504883, "epoch": 0.5055087491898899, "grad_norm": 3.2649242690203035, "learning_rate": 7.886252831440465e-07, "logits/chosen": -2.4698634147644043, "logits/rejected": -2.5349888801574707, "logps/chosen": -25.230175018310547, "logps/rejected": -343.2588195800781, "logps_avg/chosen": -0.1457994431257248, "logps_avg/rejected": -1.8691469430923462, "loss": 0.1348, "losses_ref": -0.0006545605137944221, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1560, "u": -1.848717451095581, "weight": 0.026136714965105057 }, { "diff_generated": -18.0201358795166, "epoch": 0.508749189889825, "grad_norm": 3.0505128913232067, "learning_rate": 7.882654080027304e-07, "logits/chosen": -2.4563963413238525, "logits/rejected": -2.526946544647217, "logps/chosen": -23.22417640686035, "logps/rejected": -325.34759521484375, "logps_avg/chosen": -0.13335327804088593, "logps_avg/rejected": -1.802013635635376, "loss": 0.1378, "losses_ref": -0.0006427440093830228, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1570, "u": -1.8248546123504639, "weight": 0.03879604488611221 }, { "diff_generated": -15.900899887084961, "epoch": 0.5119896305897602, "grad_norm": 2.813559786691504, "learning_rate": 7.879000129843218e-07, "logits/chosen": -2.519257068634033, "logits/rejected": -2.5333638191223145, "logps/chosen": -26.80712890625, "logps/rejected": -287.0588073730469, "logps_avg/chosen": -0.1401996612548828, "logps_avg/rejected": -1.590090036392212, "loss": 0.1328, "losses_ref": -0.000904018641449511, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1580, "u": -1.7532364130020142, "weight": 0.07683442533016205 }, { "diff_generated": -15.259477615356445, "epoch": 0.5152300712896954, "grad_norm": 2.863048008897734, "learning_rate": 7.87529103283555e-07, "logits/chosen": -2.5320329666137695, "logits/rejected": -2.584820508956909, "logps/chosen": -24.788921356201172, "logps/rejected": -275.0646057128906, "logps_avg/chosen": -0.12735766172409058, "logps_avg/rejected": -1.5259478092193604, "loss": 0.1367, "losses_ref": -0.0005550708156079054, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1590, "u": -1.7896106243133545, "weight": 0.05717957019805908 }, { "diff_generated": -17.436412811279297, "epoch": 0.5184705119896306, "grad_norm": 2.7173964853628267, "learning_rate": 7.871526841735649e-07, "logits/chosen": -2.4900612831115723, "logits/rejected": -2.5238966941833496, "logps/chosen": -22.51504135131836, "logps/rejected": -283.78741455078125, "logps_avg/chosen": -0.12943613529205322, "logps_avg/rejected": -1.7436414957046509, "loss": 0.1335, "losses_ref": -0.0005736102466471493, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1600, "u": -1.789592981338501, "weight": 0.05720081925392151 }, { "diff_generated": -16.706300735473633, "epoch": 0.5217109526895658, "grad_norm": 3.163777546228782, "learning_rate": 7.867707610058127e-07, "logits/chosen": -2.497654438018799, "logits/rejected": -2.5916731357574463, "logps/chosen": -22.602352142333984, "logps/rejected": -291.9427490234375, "logps_avg/chosen": -0.13002575933933258, "logps_avg/rejected": -1.670629858970642, "loss": 0.1392, "losses_ref": -0.000581948203034699, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1610, "u": -1.8132070302963257, "weight": 0.04481140524148941 }, { "diff_generated": -17.194438934326172, "epoch": 0.5249513933895009, "grad_norm": 3.166949089273434, "learning_rate": 7.863833392100093e-07, "logits/chosen": -2.4334254264831543, "logits/rejected": -2.5457873344421387, "logps/chosen": -21.274616241455078, "logps/rejected": -282.6556091308594, "logps_avg/chosen": -0.13356930017471313, "logps_avg/rejected": -1.719443917274475, "loss": 0.1323, "losses_ref": -0.0006519744638353586, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1620, "u": -1.8368451595306396, "weight": 0.032400231808423996 }, { "diff_generated": -17.77353286743164, "epoch": 0.5281918340894362, "grad_norm": 2.9581354223187604, "learning_rate": 7.859904242940385e-07, "logits/chosen": -2.489903450012207, "logits/rejected": -2.5374815464019775, "logps/chosen": -22.93409538269043, "logps/rejected": -314.50799560546875, "logps_avg/chosen": -0.1305612027645111, "logps_avg/rejected": -1.777353048324585, "loss": 0.1336, "losses_ref": -0.0008023073896765709, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1630, "u": -1.836499810218811, "weight": 0.032795753329992294 }, { "diff_generated": -16.64279556274414, "epoch": 0.5314322747893714, "grad_norm": 3.3278103760783795, "learning_rate": 7.855920218438783e-07, "logits/chosen": -2.453221321105957, "logits/rejected": -2.4900641441345215, "logps/chosen": -23.192838668823242, "logps/rejected": -283.46697998046875, "logps_avg/chosen": -0.13836851716041565, "logps_avg/rejected": -1.664279580116272, "loss": 0.1397, "losses_ref": -0.0008003627881407738, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1640, "u": -1.8007957935333252, "weight": 0.05168105289340019 }, { "diff_generated": -15.557705879211426, "epoch": 0.5346727154893065, "grad_norm": 2.7979372317645166, "learning_rate": 7.851881375235216e-07, "logits/chosen": -2.5243003368377686, "logits/rejected": -2.556443691253662, "logps/chosen": -21.914457321166992, "logps/rejected": -285.3017272949219, "logps_avg/chosen": -0.11913974583148956, "logps_avg/rejected": -1.5557703971862793, "loss": 0.1304, "losses_ref": -0.000646257889457047, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1650, "u": -1.7420024871826172, "weight": 0.08238764107227325 }, { "diff_generated": -18.791248321533203, "epoch": 0.5379131561892417, "grad_norm": 3.743434576482437, "learning_rate": 7.847787770748959e-07, "logits/chosen": -2.527073860168457, "logits/rejected": -2.5413413047790527, "logps/chosen": -24.805885314941406, "logps/rejected": -315.00653076171875, "logps_avg/chosen": -0.136772021651268, "logps_avg/rejected": -1.8791248798370361, "loss": 0.1384, "losses_ref": -0.0006690368754789233, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1660, "u": -1.8249105215072632, "weight": 0.038742512464523315 }, { "diff_generated": -17.38671112060547, "epoch": 0.541153596889177, "grad_norm": 3.482304529621643, "learning_rate": 7.843639463177815e-07, "logits/chosen": -2.4997076988220215, "logits/rejected": -2.5827596187591553, "logps/chosen": -23.33755874633789, "logps/rejected": -318.40997314453125, "logps_avg/chosen": -0.13309702277183533, "logps_avg/rejected": -1.738671064376831, "loss": 0.1293, "losses_ref": -0.0007650243933312595, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1670, "u": -1.8129169940948486, "weight": 0.045146699994802475 }, { "diff_generated": -16.854154586791992, "epoch": 0.5443940375891121, "grad_norm": 3.0307492113807837, "learning_rate": 7.839436511497288e-07, "logits/chosen": -2.4866318702697754, "logits/rejected": -2.540236473083496, "logps/chosen": -23.997835159301758, "logps/rejected": -310.1125793457031, "logps_avg/chosen": -0.1269882470369339, "logps_avg/rejected": -1.685415506362915, "loss": 0.1286, "losses_ref": -0.0004096725897397846, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1680, "u": -1.789876937866211, "weight": 0.05687083676457405 }, { "diff_generated": -18.999799728393555, "epoch": 0.5476344782890473, "grad_norm": 3.1436110896708285, "learning_rate": 7.835178975459744e-07, "logits/chosen": -2.471205949783325, "logits/rejected": -2.503660202026367, "logps/chosen": -22.559701919555664, "logps/rejected": -324.4423522949219, "logps_avg/chosen": -0.13547468185424805, "logps_avg/rejected": -1.8999799489974976, "loss": 0.1324, "losses_ref": -0.00037550865090452135, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1690, "u": -1.813604712486267, "weight": 0.04435449838638306 }, { "diff_generated": -18.0362548828125, "epoch": 0.5508749189889826, "grad_norm": 3.094785074613179, "learning_rate": 7.83086691559356e-07, "logits/chosen": -2.518862724304199, "logits/rejected": -2.5251712799072266, "logps/chosen": -21.84812355041504, "logps/rejected": -308.73828125, "logps_avg/chosen": -0.12069737911224365, "logps_avg/rejected": -1.803625464439392, "loss": 0.1306, "losses_ref": -0.0006218409398570657, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1700, "u": -1.8013778924942017, "weight": 0.05103649944067001 }, { "diff_generated": -17.041568756103516, "epoch": 0.5541153596889177, "grad_norm": 3.0017656954128835, "learning_rate": 7.826500393202268e-07, "logits/chosen": -2.4499335289001465, "logits/rejected": -2.464740514755249, "logps/chosen": -25.1323184967041, "logps/rejected": -290.95654296875, "logps_avg/chosen": -0.13508830964565277, "logps_avg/rejected": -1.7041568756103516, "loss": 0.136, "losses_ref": -0.0009132762206718326, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1710, "u": -1.8007068634033203, "weight": 0.051802944391965866 }, { "diff_generated": -16.525867462158203, "epoch": 0.5573558003888529, "grad_norm": 2.9335307587728936, "learning_rate": 7.82207947036368e-07, "logits/chosen": -2.420897960662842, "logits/rejected": -2.4787497520446777, "logps/chosen": -22.71407127380371, "logps/rejected": -290.0786437988281, "logps_avg/chosen": -0.13314175605773926, "logps_avg/rejected": -1.652586579322815, "loss": 0.1329, "losses_ref": -0.0007877512834966183, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1720, "u": -1.7889564037322998, "weight": 0.05788535624742508 }, { "diff_generated": -18.824562072753906, "epoch": 0.560596241088788, "grad_norm": 3.099890996831872, "learning_rate": 7.817604209929007e-07, "logits/chosen": -2.461977243423462, "logits/rejected": -2.4332797527313232, "logps/chosen": -25.484416961669922, "logps/rejected": -305.5503845214844, "logps_avg/chosen": -0.1392674595117569, "logps_avg/rejected": -1.8824561834335327, "loss": 0.135, "losses_ref": -0.0006474562687799335, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1730, "u": -1.7774829864501953, "weight": 0.06373722851276398 }, { "diff_generated": -18.078306198120117, "epoch": 0.5638366817887233, "grad_norm": 2.859866263751752, "learning_rate": 7.813074675521962e-07, "logits/chosen": -2.511709213256836, "logits/rejected": -2.506988763809204, "logps/chosen": -25.205087661743164, "logps/rejected": -306.67950439453125, "logps_avg/chosen": -0.14072729647159576, "logps_avg/rejected": -1.807830810546875, "loss": 0.1319, "losses_ref": -0.0010248484322801232, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1740, "u": -1.8477213382720947, "weight": 0.027229273691773415 }, { "diff_generated": -18.262256622314453, "epoch": 0.5670771224886585, "grad_norm": 3.0360024902130034, "learning_rate": 7.80849093153786e-07, "logits/chosen": -2.4646198749542236, "logits/rejected": -2.5201735496520996, "logps/chosen": -21.531408309936523, "logps/rejected": -317.57916259765625, "logps_avg/chosen": -0.12418844550848007, "logps_avg/rejected": -1.8262255191802979, "loss": 0.1289, "losses_ref": -0.0006071639945730567, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1750, "u": -1.777687668800354, "weight": 0.06350871175527573 }, { "diff_generated": -17.833709716796875, "epoch": 0.5703175631885936, "grad_norm": 2.9028000810936545, "learning_rate": 7.803853043142702e-07, "logits/chosen": -2.4486467838287354, "logits/rejected": -2.5134153366088867, "logps/chosen": -24.652416229248047, "logps/rejected": -309.2928161621094, "logps_avg/chosen": -0.1339518278837204, "logps_avg/rejected": -1.7833709716796875, "loss": 0.1291, "losses_ref": -0.00018930871738120914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1760, "u": -1.7902014255523682, "weight": 0.05649164319038391 }, { "diff_generated": -17.00735855102539, "epoch": 0.5735580038885288, "grad_norm": 3.29055949352831, "learning_rate": 7.799161076272245e-07, "logits/chosen": -2.4540839195251465, "logits/rejected": -2.514845371246338, "logps/chosen": -23.15195655822754, "logps/rejected": -304.8998107910156, "logps_avg/chosen": -0.13627921044826508, "logps_avg/rejected": -1.7007356882095337, "loss": 0.1367, "losses_ref": -0.0010348598007112741, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1770, "u": -1.8003708124160767, "weight": 0.05215916782617569 }, { "diff_generated": -18.713973999023438, "epoch": 0.5767984445884641, "grad_norm": 3.46189518615115, "learning_rate": 7.794415097631066e-07, "logits/chosen": -2.4763245582580566, "logits/rejected": -2.493891477584839, "logps/chosen": -22.45502471923828, "logps/rejected": -320.9071350097656, "logps_avg/chosen": -0.1303728073835373, "logps_avg/rejected": -1.871397614479065, "loss": 0.1357, "losses_ref": -0.0007566340500488877, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1780, "u": -1.848497748374939, "weight": 0.02639012411236763 }, { "diff_generated": -17.699176788330078, "epoch": 0.5800388852883992, "grad_norm": 2.931900487175917, "learning_rate": 7.789615174691619e-07, "logits/chosen": -2.4306411743164062, "logits/rejected": -2.51458477973938, "logps/chosen": -24.722259521484375, "logps/rejected": -323.888916015625, "logps_avg/chosen": -0.13683000206947327, "logps_avg/rejected": -1.7699177265167236, "loss": 0.1305, "losses_ref": -0.00117635412607342, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1790, "u": -1.8001972436904907, "weight": 0.052384234964847565 }, { "diff_generated": -18.133087158203125, "epoch": 0.5832793259883344, "grad_norm": 2.835865807866508, "learning_rate": 7.784761375693268e-07, "logits/chosen": -2.3992950916290283, "logits/rejected": -2.4438071250915527, "logps/chosen": -23.660818099975586, "logps/rejected": -328.71295166015625, "logps_avg/chosen": -0.13329359889030457, "logps_avg/rejected": -1.8133087158203125, "loss": 0.1297, "losses_ref": -0.0003332248597871512, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1800, "u": -1.7899528741836548, "weight": 0.05678124353289604 }, { "diff_generated": -18.213998794555664, "epoch": 0.5865197666882696, "grad_norm": 3.173336533470699, "learning_rate": 7.779853769641319e-07, "logits/chosen": -2.4195868968963623, "logits/rejected": -2.473097801208496, "logps/chosen": -25.410669326782227, "logps/rejected": -328.18853759765625, "logps_avg/chosen": -0.15266582369804382, "logps_avg/rejected": -1.8214000463485718, "loss": 0.1307, "losses_ref": -0.0005910733598284423, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1810, "u": -1.8131601810455322, "weight": 0.04486403614282608 }, { "diff_generated": -18.346664428710938, "epoch": 0.5897602073882048, "grad_norm": 3.120402178443489, "learning_rate": 7.774892426306042e-07, "logits/chosen": -2.4505624771118164, "logits/rejected": -2.5352671146392822, "logps/chosen": -22.137514114379883, "logps/rejected": -330.32745361328125, "logps_avg/chosen": -0.12556853890419006, "logps_avg/rejected": -1.8346662521362305, "loss": 0.1315, "losses_ref": -0.0006458786083385348, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1820, "u": -1.753527283668518, "weight": 0.07647226750850677 }, { "diff_generated": -18.87307357788086, "epoch": 0.59300064808814, "grad_norm": 2.939036330764369, "learning_rate": 7.769877416221678e-07, "logits/chosen": -2.459193229675293, "logits/rejected": -2.4645588397979736, "logps/chosen": -26.066543579101562, "logps/rejected": -318.68603515625, "logps_avg/chosen": -0.14139781892299652, "logps_avg/rejected": -1.8873074054718018, "loss": 0.1379, "losses_ref": -0.0006232672603800893, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1830, "u": -1.801300048828125, "weight": 0.051123809069395065 }, { "diff_generated": -18.721759796142578, "epoch": 0.5962410887880751, "grad_norm": 2.8573207260724844, "learning_rate": 7.764808810685433e-07, "logits/chosen": -2.428995370864868, "logits/rejected": -2.5126430988311768, "logps/chosen": -19.536548614501953, "logps/rejected": -310.163818359375, "logps_avg/chosen": -0.12359301000833511, "logps_avg/rejected": -1.8721758127212524, "loss": 0.133, "losses_ref": -0.000460333249066025, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1840, "u": -1.754206657409668, "weight": 0.07573723793029785 }, { "diff_generated": -18.366769790649414, "epoch": 0.5994815294880104, "grad_norm": 3.0942383224635437, "learning_rate": 7.759686681756468e-07, "logits/chosen": -2.457427740097046, "logits/rejected": -2.482417106628418, "logps/chosen": -22.40389060974121, "logps/rejected": -315.10528564453125, "logps_avg/chosen": -0.12646666169166565, "logps_avg/rejected": -1.836676836013794, "loss": 0.1295, "losses_ref": -0.0004205040750093758, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1850, "u": -1.7898060083389282, "weight": 0.056951187551021576 }, { "diff_generated": -20.095348358154297, "epoch": 0.6027219701879456, "grad_norm": 2.7487787654908575, "learning_rate": 7.754511102254876e-07, "logits/chosen": -2.4285895824432373, "logits/rejected": -2.4910686016082764, "logps/chosen": -21.96750259399414, "logps/rejected": -335.4107360839844, "logps_avg/chosen": -0.13040336966514587, "logps_avg/rejected": -2.009535074234009, "loss": 0.1313, "losses_ref": -0.0006380341947078705, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1860, "u": -1.8131011724472046, "weight": 0.04493032768368721 }, { "diff_generated": -19.728809356689453, "epoch": 0.6059624108878807, "grad_norm": 2.7799453498174085, "learning_rate": 7.74928214576064e-07, "logits/chosen": -2.4481120109558105, "logits/rejected": -2.4329230785369873, "logps/chosen": -23.952800750732422, "logps/rejected": -323.2978210449219, "logps_avg/chosen": -0.13100168108940125, "logps_avg/rejected": -1.9728807210922241, "loss": 0.131, "losses_ref": -0.0006491635576821864, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1870, "u": -1.825012445449829, "weight": 0.038625117391347885 }, { "diff_generated": -20.258499145507812, "epoch": 0.609202851587816, "grad_norm": 3.132382816862127, "learning_rate": 7.743999886612591e-07, "logits/chosen": -2.4395930767059326, "logits/rejected": -2.496419668197632, "logps/chosen": -23.156782150268555, "logps/rejected": -342.83392333984375, "logps_avg/chosen": -0.13323050737380981, "logps_avg/rejected": -2.0258498191833496, "loss": 0.1333, "losses_ref": -0.0004270991194061935, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1880, "u": -1.8727830648422241, "weight": 0.013226887211203575 }, { "diff_generated": -19.925138473510742, "epoch": 0.6124432922877512, "grad_norm": 2.8991126228539743, "learning_rate": 7.738664399907355e-07, "logits/chosen": -2.4587321281433105, "logits/rejected": -2.522761583328247, "logps/chosen": -22.69471549987793, "logps/rejected": -340.39208984375, "logps_avg/chosen": -0.14121794700622559, "logps_avg/rejected": -1.9925140142440796, "loss": 0.1298, "losses_ref": -0.000661244208458811, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1890, "u": -1.8604755401611328, "weight": 0.019995156675577164 }, { "diff_generated": -18.109577178955078, "epoch": 0.6156837329876863, "grad_norm": 2.8760242471354327, "learning_rate": 7.733275761498278e-07, "logits/chosen": -2.472839593887329, "logits/rejected": -2.471781015396118, "logps/chosen": -24.88907241821289, "logps/rejected": -314.57965087890625, "logps_avg/chosen": -0.1315891295671463, "logps_avg/rejected": -1.8109575510025024, "loss": 0.1284, "losses_ref": -0.0005121930735185742, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1900, "u": -1.7897313833236694, "weight": 0.057041965425014496 }, { "diff_generated": -17.49472427368164, "epoch": 0.6189241736876215, "grad_norm": 2.823280138116411, "learning_rate": 7.727834047994353e-07, "logits/chosen": -2.4597010612487793, "logits/rejected": -2.508735179901123, "logps/chosen": -26.73147201538086, "logps/rejected": -324.4564514160156, "logps_avg/chosen": -0.14764484763145447, "logps_avg/rejected": -1.7494723796844482, "loss": 0.1299, "losses_ref": -0.0005785429384559393, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1910, "u": -1.801418662071228, "weight": 0.05098617821931839 }, { "diff_generated": -18.79656982421875, "epoch": 0.6221646143875567, "grad_norm": 3.051226672879621, "learning_rate": 7.722339336759129e-07, "logits/chosen": -2.3774640560150146, "logits/rejected": -2.4868431091308594, "logps/chosen": -23.647579193115234, "logps/rejected": -331.06463623046875, "logps_avg/chosen": -0.14270934462547302, "logps_avg/rejected": -1.8796570301055908, "loss": 0.135, "losses_ref": -0.0005043046548962593, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1920, "u": -1.7778398990631104, "weight": 0.06332896649837494 }, { "diff_generated": -17.094919204711914, "epoch": 0.6254050550874919, "grad_norm": 3.2172445082389216, "learning_rate": 7.71679170590961e-07, "logits/chosen": -2.49719500541687, "logits/rejected": -2.503236770629883, "logps/chosen": -24.753795623779297, "logps/rejected": -301.22210693359375, "logps_avg/chosen": -0.13670535385608673, "logps_avg/rejected": -1.7094919681549072, "loss": 0.1308, "losses_ref": -0.0005722006899304688, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1930, "u": -1.801491141319275, "weight": 0.05090496689081192 }, { "diff_generated": -17.40401840209961, "epoch": 0.6286454957874271, "grad_norm": 2.7455202356055373, "learning_rate": 7.711191234315146e-07, "logits/chosen": -2.4483723640441895, "logits/rejected": -2.4890902042388916, "logps/chosen": -23.994434356689453, "logps/rejected": -324.2485046386719, "logps_avg/chosen": -0.1330624669790268, "logps_avg/rejected": -1.7404018640518188, "loss": 0.129, "losses_ref": -0.0003373560612089932, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1940, "u": -1.849252462387085, "weight": 0.025515040382742882 }, { "diff_generated": -18.809823989868164, "epoch": 0.6318859364873622, "grad_norm": 2.9120368373047643, "learning_rate": 7.705538001596312e-07, "logits/chosen": -2.468559741973877, "logits/rejected": -2.5485262870788574, "logps/chosen": -20.931529998779297, "logps/rejected": -319.64813232421875, "logps_avg/chosen": -0.12360795587301254, "logps_avg/rejected": -1.8809821605682373, "loss": 0.1338, "losses_ref": -0.0002663102059159428, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1950, "u": -1.8019386529922485, "weight": 0.05038148909807205 }, { "diff_generated": -18.93647003173828, "epoch": 0.6351263771872975, "grad_norm": 2.8712568410008026, "learning_rate": 7.699832088123774e-07, "logits/chosen": -2.486135482788086, "logits/rejected": -2.466231107711792, "logps/chosen": -24.849327087402344, "logps/rejected": -333.1363220214844, "logps_avg/chosen": -0.12802192568778992, "logps_avg/rejected": -1.8936468362808228, "loss": 0.1318, "losses_ref": -0.0005878577358089387, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1960, "u": -1.8132880926132202, "weight": 0.04472342133522034 }, { "diff_generated": -18.771728515625, "epoch": 0.6383668178872327, "grad_norm": 3.715166163883544, "learning_rate": 7.694073575017151e-07, "logits/chosen": -2.3794102668762207, "logits/rejected": -2.4397597312927246, "logps/chosen": -20.941625595092773, "logps/rejected": -332.5712585449219, "logps_avg/chosen": -0.11972401291131973, "logps_avg/rejected": -1.877172827720642, "loss": 0.1286, "losses_ref": -0.00048074816004373133, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1970, "u": -1.777799367904663, "weight": 0.0633687973022461 }, { "diff_generated": -18.586057662963867, "epoch": 0.6416072585871678, "grad_norm": 2.985248147364007, "learning_rate": 7.688262544143854e-07, "logits/chosen": -2.4497241973876953, "logits/rejected": -2.462395429611206, "logps/chosen": -22.778026580810547, "logps/rejected": -336.16632080078125, "logps_avg/chosen": -0.12197883427143097, "logps_avg/rejected": -1.8586056232452393, "loss": 0.127, "losses_ref": -0.000526120129507035, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1980, "u": -1.753989815711975, "weight": 0.07598290592432022 }, { "diff_generated": -19.0280704498291, "epoch": 0.6448476992871031, "grad_norm": 2.7181558462233046, "learning_rate": 7.682399078117928e-07, "logits/chosen": -2.453800678253174, "logits/rejected": -2.4369149208068848, "logps/chosen": -21.635766983032227, "logps/rejected": -329.36199951171875, "logps_avg/chosen": -0.12099570035934448, "logps_avg/rejected": -1.9028072357177734, "loss": 0.1318, "losses_ref": -0.0004379908205009997, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1990, "u": -1.7659953832626343, "weight": 0.06955628842115402 }, { "diff_generated": -18.570884704589844, "epoch": 0.6480881399870383, "grad_norm": 2.9614913162809473, "learning_rate": 7.67648326029888e-07, "logits/chosen": -2.4518074989318848, "logits/rejected": -2.4435629844665527, "logps/chosen": -24.682865142822266, "logps/rejected": -324.78253173828125, "logps_avg/chosen": -0.13669325411319733, "logps_avg/rejected": -1.8570884466171265, "loss": 0.133, "losses_ref": -0.0006112282280810177, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2000, "u": -1.777618408203125, "weight": 0.06358519196510315 }, { "diff_generated": -19.15464210510254, "epoch": 0.6513285806869734, "grad_norm": 2.807859572737755, "learning_rate": 7.670515174790485e-07, "logits/chosen": -2.437833547592163, "logits/rejected": -2.437678813934326, "logps/chosen": -24.248523712158203, "logps/rejected": -325.72210693359375, "logps_avg/chosen": -0.14109835028648376, "logps_avg/rejected": -1.915464162826538, "loss": 0.1357, "losses_ref": -0.0006648440612480044, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2010, "u": -1.824631929397583, "weight": 0.039017364382743835 }, { "diff_generated": -18.24823760986328, "epoch": 0.6545690213869086, "grad_norm": 3.1332940725212928, "learning_rate": 7.664494906439598e-07, "logits/chosen": -2.4478213787078857, "logits/rejected": -2.462120532989502, "logps/chosen": -21.301809310913086, "logps/rejected": -335.829345703125, "logps_avg/chosen": -0.1297096312046051, "logps_avg/rejected": -1.8248237371444702, "loss": 0.1251, "losses_ref": -0.00029017007909715176, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2020, "u": -1.8137531280517578, "weight": 0.04418119788169861 }, { "diff_generated": -19.48830795288086, "epoch": 0.6578094620868438, "grad_norm": 2.883202474548215, "learning_rate": 7.658422540834943e-07, "logits/chosen": -2.4664320945739746, "logits/rejected": -2.4568514823913574, "logps/chosen": -25.807851791381836, "logps/rejected": -343.0586853027344, "logps_avg/chosen": -0.13989664614200592, "logps_avg/rejected": -1.9488309621810913, "loss": 0.1335, "losses_ref": -0.0007401621551252902, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2030, "u": -1.8485580682754517, "weight": 0.02631906047463417 }, { "diff_generated": -19.335189819335938, "epoch": 0.661049902786779, "grad_norm": 3.184220687164625, "learning_rate": 7.6522981643059e-07, "logits/chosen": -2.4369640350341797, "logits/rejected": -2.463068723678589, "logps/chosen": -24.900054931640625, "logps/rejected": -317.1351013183594, "logps_avg/chosen": -0.14506976306438446, "logps_avg/rejected": -1.9335190057754517, "loss": 0.1311, "losses_ref": -0.00047019642079249024, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2040, "u": -1.8249590396881104, "weight": 0.03864910453557968 }, { "diff_generated": -19.70545768737793, "epoch": 0.6642903434867142, "grad_norm": 2.8651056705390197, "learning_rate": 7.646121863921278e-07, "logits/chosen": -2.4327144622802734, "logits/rejected": -2.4198977947235107, "logps/chosen": -25.717498779296875, "logps/rejected": -342.68389892578125, "logps_avg/chosen": -0.13026151061058044, "logps_avg/rejected": -1.9705455303192139, "loss": 0.1296, "losses_ref": -0.0005326059181243181, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2050, "u": -1.837100625038147, "weight": 0.03210743889212608 }, { "diff_generated": -19.015865325927734, "epoch": 0.6675307841866494, "grad_norm": 2.765621585891966, "learning_rate": 7.639893727488069e-07, "logits/chosen": -2.4133496284484863, "logits/rejected": -2.49336576461792, "logps/chosen": -21.6951961517334, "logps/rejected": -355.7476806640625, "logps_avg/chosen": -0.1258120834827423, "logps_avg/rejected": -1.9015867710113525, "loss": 0.1245, "losses_ref": -0.000237293541431427, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2060, "u": -1.825690507888794, "weight": 0.0378374308347702 }, { "diff_generated": -18.98021697998047, "epoch": 0.6707712248865846, "grad_norm": 3.073794356382388, "learning_rate": 7.633613843550212e-07, "logits/chosen": -2.467679977416992, "logits/rejected": -2.475241184234619, "logps/chosen": -24.288105010986328, "logps/rejected": -328.1109924316406, "logps_avg/chosen": -0.13157445192337036, "logps_avg/rejected": -1.8980216979980469, "loss": 0.1325, "losses_ref": -0.0005475075449794531, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2070, "u": -1.7896312475204468, "weight": 0.05715782567858696 }, { "diff_generated": -20.291763305664062, "epoch": 0.6740116655865198, "grad_norm": 3.0389243347898502, "learning_rate": 7.627282301387325e-07, "logits/chosen": -2.365746021270752, "logits/rejected": -2.4230730533599854, "logps/chosen": -21.869232177734375, "logps/rejected": -336.66961669921875, "logps_avg/chosen": -0.12987583875656128, "logps_avg/rejected": -2.0291762351989746, "loss": 0.1282, "losses_ref": -0.0003781206323765218, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2080, "u": -1.7662067413330078, "weight": 0.06932022422552109 }, { "diff_generated": -17.792755126953125, "epoch": 0.6772521062864549, "grad_norm": 2.698865927398295, "learning_rate": 7.620899191013438e-07, "logits/chosen": -2.3889107704162598, "logits/rejected": -2.4569685459136963, "logps/chosen": -24.946392059326172, "logps/rejected": -329.18304443359375, "logps_avg/chosen": -0.14580973982810974, "logps_avg/rejected": -1.77927565574646, "loss": 0.1331, "losses_ref": -0.0004922214429825544, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2090, "u": -1.7897088527679443, "weight": 0.05706654116511345 }, { "diff_generated": -19.587724685668945, "epoch": 0.6804925469863902, "grad_norm": 2.749917366534381, "learning_rate": 7.614464603175717e-07, "logits/chosen": -2.4647059440612793, "logits/rejected": -2.4155337810516357, "logps/chosen": -23.76377296447754, "logps/rejected": -333.32037353515625, "logps_avg/chosen": -0.12452936172485352, "logps_avg/rejected": -1.9587726593017578, "loss": 0.1239, "losses_ref": -0.000524552131537348, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2100, "u": -1.7895400524139404, "weight": 0.05724791809916496 }, { "diff_generated": -20.409175872802734, "epoch": 0.6837329876863253, "grad_norm": 2.629710299155267, "learning_rate": 7.607978629353167e-07, "logits/chosen": -2.4125261306762695, "logits/rejected": -2.4562489986419678, "logps/chosen": -23.55038070678711, "logps/rejected": -352.37255859375, "logps_avg/chosen": -0.1393522024154663, "logps_avg/rejected": -2.0409178733825684, "loss": 0.1292, "losses_ref": -0.0005357457557693124, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2110, "u": -1.801489233970642, "weight": 0.05090557411313057 }, { "diff_generated": -19.790348052978516, "epoch": 0.6869734283862605, "grad_norm": 2.990669632149249, "learning_rate": 7.60144136175534e-07, "logits/chosen": -2.43841814994812, "logits/rejected": -2.4627225399017334, "logps/chosen": -20.112375259399414, "logps/rejected": -362.16241455078125, "logps_avg/chosen": -0.11792151629924774, "logps_avg/rejected": -1.9790347814559937, "loss": 0.1258, "losses_ref": -0.00014056343934498727, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2120, "u": -1.7783987522125244, "weight": 0.06267792731523514 }, { "diff_generated": -18.127880096435547, "epoch": 0.6902138690861958, "grad_norm": 3.047337082334964, "learning_rate": 7.594852893321015e-07, "logits/chosen": -2.392712354660034, "logits/rejected": -2.476658821105957, "logps/chosen": -22.044334411621094, "logps/rejected": -330.33990478515625, "logps_avg/chosen": -0.1308022439479828, "logps_avg/rejected": -1.8127880096435547, "loss": 0.1266, "losses_ref": -0.0004497022891882807, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2130, "u": -1.7779629230499268, "weight": 0.06318801641464233 }, { "diff_generated": -18.819965362548828, "epoch": 0.6934543097861309, "grad_norm": 2.737166621848905, "learning_rate": 7.588213317716883e-07, "logits/chosen": -2.330166816711426, "logits/rejected": -2.4178690910339355, "logps/chosen": -20.24831199645996, "logps/rejected": -323.7247314453125, "logps_avg/chosen": -0.1308317482471466, "logps_avg/rejected": -1.881996750831604, "loss": 0.1315, "losses_ref": -0.00039175135316327214, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2140, "u": -1.778019905090332, "weight": 0.06312072277069092 }, { "diff_generated": -18.78085708618164, "epoch": 0.6966947504860661, "grad_norm": 3.04134332336856, "learning_rate": 7.581522729336214e-07, "logits/chosen": -2.349966049194336, "logits/rejected": -2.368582248687744, "logps/chosen": -21.596080780029297, "logps/rejected": -319.7588195800781, "logps_avg/chosen": -0.12146018445491791, "logps_avg/rejected": -1.878085732460022, "loss": 0.1288, "losses_ref": -0.0005237095756456256, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2150, "u": -1.7657558917999268, "weight": 0.06982530653476715 }, { "diff_generated": -19.86001968383789, "epoch": 0.6999351911860013, "grad_norm": 2.719146372081679, "learning_rate": 7.574781223297513e-07, "logits/chosen": -2.427701950073242, "logits/rejected": -2.4226479530334473, "logps/chosen": -24.214614868164062, "logps/rejected": -320.67120361328125, "logps_avg/chosen": -0.13417170941829681, "logps_avg/rejected": -1.9860022068023682, "loss": 0.1242, "losses_ref": -0.0006660787621513009, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2160, "u": -1.7893577814102173, "weight": 0.05747256428003311 }, { "diff_generated": -17.657316207885742, "epoch": 0.7031756318859365, "grad_norm": 2.841932411038946, "learning_rate": 7.567988895443173e-07, "logits/chosen": -2.3967366218566895, "logits/rejected": -2.3901848793029785, "logps/chosen": -20.9189510345459, "logps/rejected": -293.32379150390625, "logps_avg/chosen": -0.11422063410282135, "logps_avg/rejected": -1.7657315731048584, "loss": 0.1289, "losses_ref": -0.0006414534873329103, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2170, "u": -1.753784418106079, "weight": 0.07621309906244278 }, { "diff_generated": -19.626449584960938, "epoch": 0.7064160725858717, "grad_norm": 2.761946332942343, "learning_rate": 7.561145842338102e-07, "logits/chosen": -2.402575969696045, "logits/rejected": -2.4128315448760986, "logps/chosen": -22.947673797607422, "logps/rejected": -326.78875732421875, "logps_avg/chosen": -0.129831463098526, "logps_avg/rejected": -1.9626449346542358, "loss": 0.126, "losses_ref": -0.0004920439096167684, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2180, "u": -1.8133924007415771, "weight": 0.04459898918867111 }, { "diff_generated": -18.372360229492188, "epoch": 0.7096565132858069, "grad_norm": 3.0603236768477347, "learning_rate": 7.554252161268365e-07, "logits/chosen": -2.3838937282562256, "logits/rejected": -2.439671277999878, "logps/chosen": -23.422100067138672, "logps/rejected": -328.13372802734375, "logps_avg/chosen": -0.1293845921754837, "logps_avg/rejected": -1.8372361660003662, "loss": 0.1268, "losses_ref": -0.0006820982089266181, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2190, "u": -1.7893273830413818, "weight": 0.05750828981399536 }, { "diff_generated": -18.612842559814453, "epoch": 0.712896953985742, "grad_norm": 2.8177511342534567, "learning_rate": 7.547307950239785e-07, "logits/chosen": -2.439180850982666, "logits/rejected": -2.455467700958252, "logps/chosen": -22.881404876708984, "logps/rejected": -339.25054931640625, "logps_avg/chosen": -0.1290069967508316, "logps_avg/rejected": -1.8612842559814453, "loss": 0.1294, "losses_ref": -0.00032429193379357457, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2200, "u": -1.8136647939682007, "weight": 0.04428309202194214 }, { "diff_generated": -17.87989044189453, "epoch": 0.7161373946856773, "grad_norm": 3.1016688974304736, "learning_rate": 7.540313307976563e-07, "logits/chosen": -2.411724805831909, "logits/rejected": -2.445690631866455, "logps/chosen": -22.11930274963379, "logps/rejected": -321.18231201171875, "logps_avg/chosen": -0.12484880536794662, "logps_avg/rejected": -1.7879889011383057, "loss": 0.1318, "losses_ref": -0.00042836330248974264, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2210, "u": -1.7186676263809204, "weight": 0.09444756805896759 }, { "diff_generated": -18.738765716552734, "epoch": 0.7193778353856124, "grad_norm": 2.6819135530863374, "learning_rate": 7.533268333919865e-07, "logits/chosen": -2.409003496170044, "logits/rejected": -2.454528331756592, "logps/chosen": -23.465421676635742, "logps/rejected": -350.64312744140625, "logps_avg/chosen": -0.12644416093826294, "logps_avg/rejected": -1.873876929283142, "loss": 0.1264, "losses_ref": -0.0005657867877744138, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2220, "u": -1.8251807689666748, "weight": 0.038431938737630844 }, { "diff_generated": -18.670055389404297, "epoch": 0.7226182760855476, "grad_norm": 2.689524904058088, "learning_rate": 7.526173128226416e-07, "logits/chosen": -2.3890652656555176, "logits/rejected": -2.4550390243530273, "logps/chosen": -24.16277313232422, "logps/rejected": -330.9000549316406, "logps_avg/chosen": -0.13710932433605194, "logps_avg/rejected": -1.8670055866241455, "loss": 0.1259, "losses_ref": -0.00020371482241898775, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2230, "u": -1.8138744831085205, "weight": 0.04403935372829437 }, { "diff_generated": -19.508420944213867, "epoch": 0.7258587167854829, "grad_norm": 2.691809977523224, "learning_rate": 7.519027791767069e-07, "logits/chosen": -2.386385679244995, "logits/rejected": -2.3834471702575684, "logps/chosen": -25.55209732055664, "logps/rejected": -366.0524597167969, "logps_avg/chosen": -0.13459105789661407, "logps_avg/rejected": -1.9508421421051025, "loss": 0.1292, "losses_ref": -0.0003441698499955237, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2240, "u": -1.8136498928070068, "weight": 0.04430101439356804 }, { "diff_generated": -19.35506820678711, "epoch": 0.729099157485418, "grad_norm": 2.949240428236712, "learning_rate": 7.511832426125375e-07, "logits/chosen": -2.434502601623535, "logits/rejected": -2.4232802391052246, "logps/chosen": -23.403701782226562, "logps/rejected": -336.49774169921875, "logps_avg/chosen": -0.13677889108657837, "logps_avg/rejected": -1.935506820678711, "loss": 0.1267, "losses_ref": -0.0003745288122445345, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2250, "u": -1.7543065547943115, "weight": 0.07561810314655304 }, { "diff_generated": -18.26801300048828, "epoch": 0.7323395981853532, "grad_norm": 2.9652966972491486, "learning_rate": 7.504587133596141e-07, "logits/chosen": -2.4914777278900146, "logits/rejected": -2.4946160316467285, "logps/chosen": -22.081167221069336, "logps/rejected": -318.71551513671875, "logps_avg/chosen": -0.12174037843942642, "logps_avg/rejected": -1.8268013000488281, "loss": 0.1266, "losses_ref": -0.0004123027320019901, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2260, "u": -1.7543058395385742, "weight": 0.0756215900182724 }, { "diff_generated": -19.877885818481445, "epoch": 0.7355800388852884, "grad_norm": 2.7188422014185103, "learning_rate": 7.497292017183965e-07, "logits/chosen": -2.4941792488098145, "logits/rejected": -2.523242235183716, "logps/chosen": -22.369342803955078, "logps/rejected": -337.4557189941406, "logps_avg/chosen": -0.1296032965183258, "logps_avg/rejected": -1.987788438796997, "loss": 0.1296, "losses_ref": -0.000514883198775351, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2270, "u": -1.8608055114746094, "weight": 0.019619230180978775 }, { "diff_generated": -18.854595184326172, "epoch": 0.7388204795852236, "grad_norm": 2.608393200125136, "learning_rate": 7.489947180601791e-07, "logits/chosen": -2.4092555046081543, "logits/rejected": -2.431570529937744, "logps/chosen": -21.66644287109375, "logps/rejected": -321.78411865234375, "logps_avg/chosen": -0.12630699574947357, "logps_avg/rejected": -1.885459542274475, "loss": 0.1221, "losses_ref": -0.0007705268217250705, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2280, "u": -1.7890784740447998, "weight": 0.05778322368860245 }, { "diff_generated": -18.43848991394043, "epoch": 0.7420609202851588, "grad_norm": 2.858225633134005, "learning_rate": 7.482552728269412e-07, "logits/chosen": -2.4584872722625732, "logits/rejected": -2.4965240955352783, "logps/chosen": -23.285232543945312, "logps/rejected": -313.27606201171875, "logps_avg/chosen": -0.12950658798217773, "logps_avg/rejected": -1.8438488245010376, "loss": 0.1243, "losses_ref": -0.0004050957504659891, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2290, "u": -1.8017374277114868, "weight": 0.050617121160030365 }, { "diff_generated": -19.22142219543457, "epoch": 0.7453013609850939, "grad_norm": 2.7175854602679155, "learning_rate": 7.475108765312001e-07, "logits/chosen": -2.429678201675415, "logits/rejected": -2.4203219413757324, "logps/chosen": -22.69501304626465, "logps/rejected": -315.50128173828125, "logps_avg/chosen": -0.12515577673912048, "logps_avg/rejected": -1.9221423864364624, "loss": 0.1272, "losses_ref": -0.0004372203256934881, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2300, "u": -1.777909517288208, "weight": 0.06324687600135803 }, { "diff_generated": -18.608448028564453, "epoch": 0.7485418016850292, "grad_norm": 2.9184389772376202, "learning_rate": 7.467615397558613e-07, "logits/chosen": -2.4402008056640625, "logits/rejected": -2.495391368865967, "logps/chosen": -22.847639083862305, "logps/rejected": -340.8104248046875, "logps_avg/chosen": -0.13191382586956024, "logps_avg/rejected": -1.8608448505401611, "loss": 0.1296, "losses_ref": -0.0003498257137835026, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2310, "u": -1.7780853509902954, "weight": 0.0630442276597023 }, { "diff_generated": -18.730741500854492, "epoch": 0.7517822423849644, "grad_norm": 2.6569163215979152, "learning_rate": 7.460072731540676e-07, "logits/chosen": -2.4269540309906006, "logits/rejected": -2.4742822647094727, "logps/chosen": -20.393117904663086, "logps/rejected": -338.66778564453125, "logps_avg/chosen": -0.11943888664245605, "logps_avg/rejected": -1.8730741739273071, "loss": 0.1238, "losses_ref": -0.0005994164384901524, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2320, "u": -1.8012882471084595, "weight": 0.051135510206222534 }, { "diff_generated": -19.9935359954834, "epoch": 0.7550226830848995, "grad_norm": 2.83798592756184, "learning_rate": 7.452480874490483e-07, "logits/chosen": -2.4465527534484863, "logits/rejected": -2.4860479831695557, "logps/chosen": -21.021358489990234, "logps/rejected": -345.9947814941406, "logps_avg/chosen": -0.12317808717489243, "logps_avg/rejected": -1.9993536472320557, "loss": 0.1268, "losses_ref": -0.0003948205558117479, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2330, "u": -1.7897589206695557, "weight": 0.05699906870722771 }, { "diff_generated": -18.668241500854492, "epoch": 0.7582631237848347, "grad_norm": 2.6863736286433286, "learning_rate": 7.44483993433966e-07, "logits/chosen": -2.42374587059021, "logits/rejected": -2.472597122192383, "logps/chosen": -18.5694522857666, "logps/rejected": -326.936767578125, "logps_avg/chosen": -0.11334365606307983, "logps_avg/rejected": -1.8668243885040283, "loss": 0.1272, "losses_ref": -0.0002126133331330493, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2340, "u": -1.7901561260223389, "weight": 0.056544482707977295 }, { "diff_generated": -18.23128890991211, "epoch": 0.76150356448477, "grad_norm": 2.904875126249053, "learning_rate": 7.437150019717641e-07, "logits/chosen": -2.4157471656799316, "logits/rejected": -2.490884304046631, "logps/chosen": -20.360605239868164, "logps/rejected": -321.0450439453125, "logps_avg/chosen": -0.11621763557195663, "logps_avg/rejected": -1.8231290578842163, "loss": 0.1288, "losses_ref": -0.00025394646218046546, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2350, "u": -1.7545111179351807, "weight": 0.07538101077079773 }, { "diff_generated": -18.002696990966797, "epoch": 0.7647440051847051, "grad_norm": 3.9274823374656465, "learning_rate": 7.429411239950116e-07, "logits/chosen": -2.4598190784454346, "logits/rejected": -2.530359983444214, "logps/chosen": -23.51648712158203, "logps/rejected": -334.88507080078125, "logps_avg/chosen": -0.13798591494560242, "logps_avg/rejected": -1.8002697229385376, "loss": 0.1314, "losses_ref": -0.00022468708630185574, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2360, "u": -1.802004098892212, "weight": 0.05030521750450134 }, { "diff_generated": -18.326831817626953, "epoch": 0.7679844458846403, "grad_norm": 2.53469457896195, "learning_rate": 7.421623705057477e-07, "logits/chosen": -2.4758832454681396, "logits/rejected": -2.449039936065674, "logps/chosen": -19.222278594970703, "logps/rejected": -310.03961181640625, "logps_avg/chosen": -0.1095147579908371, "logps_avg/rejected": -1.8326833248138428, "loss": 0.1218, "losses_ref": -0.0004961226368322968, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2370, "u": -1.7539732456207275, "weight": 0.07599438726902008 }, { "diff_generated": -16.816598892211914, "epoch": 0.7712248865845756, "grad_norm": 2.984273026869485, "learning_rate": 7.413787525753261e-07, "logits/chosen": -2.4309310913085938, "logits/rejected": -2.494706630706787, "logps/chosen": -22.003509521484375, "logps/rejected": -303.1298828125, "logps_avg/chosen": -0.13067765533924103, "logps_avg/rejected": -1.6816600561141968, "loss": 0.13, "losses_ref": -0.00040191778680309653, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2380, "u": -1.7305409908294678, "weight": 0.08817703276872635 }, { "diff_generated": -18.891063690185547, "epoch": 0.7744653272845107, "grad_norm": 2.80526700675378, "learning_rate": 7.405902813442564e-07, "logits/chosen": -2.4479176998138428, "logits/rejected": -2.4607460498809814, "logps/chosen": -19.49251365661621, "logps/rejected": -311.84820556640625, "logps_avg/chosen": -0.11860658973455429, "logps_avg/rejected": -1.8891067504882812, "loss": 0.1223, "losses_ref": -0.00031381563167087734, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2390, "u": -1.813751220703125, "weight": 0.04418398439884186 }, { "diff_generated": -19.603668212890625, "epoch": 0.7777057679844459, "grad_norm": 2.880688772648272, "learning_rate": 7.39796968022047e-07, "logits/chosen": -2.4030654430389404, "logits/rejected": -2.4530603885650635, "logps/chosen": -20.101438522338867, "logps/rejected": -326.55462646484375, "logps_avg/chosen": -0.12463720887899399, "logps_avg/rejected": -1.9603666067123413, "loss": 0.1222, "losses_ref": -0.0004716304247267544, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2400, "u": -1.8371467590332031, "weight": 0.03205372765660286 }, { "diff_generated": -18.70380210876465, "epoch": 0.780946208684381, "grad_norm": 2.9044256233773917, "learning_rate": 7.389988238870451e-07, "logits/chosen": -2.42336106300354, "logits/rejected": -2.3942418098449707, "logps/chosen": -24.53568458557129, "logps/rejected": -307.4025573730469, "logps_avg/chosen": -0.13119980692863464, "logps_avg/rejected": -1.870380163192749, "loss": 0.1279, "losses_ref": -0.0005647986545227468, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2410, "u": -1.8251574039459229, "weight": 0.038459379225969315 }, { "diff_generated": -19.824970245361328, "epoch": 0.7841866493843163, "grad_norm": 2.7147095449979846, "learning_rate": 7.381958602862763e-07, "logits/chosen": -2.430551528930664, "logits/rejected": -2.4522652626037598, "logps/chosen": -23.182209014892578, "logps/rejected": -343.49114990234375, "logps_avg/chosen": -0.124916672706604, "logps_avg/rejected": -1.982496976852417, "loss": 0.1243, "losses_ref": -0.00035436113830655813, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2420, "u": -1.8254915475845337, "weight": 0.03806937485933304 }, { "diff_generated": -18.005935668945312, "epoch": 0.7874270900842515, "grad_norm": 2.8190879408890854, "learning_rate": 7.373880886352832e-07, "logits/chosen": -2.4851255416870117, "logits/rejected": -2.4959912300109863, "logps/chosen": -25.28909683227539, "logps/rejected": -329.841064453125, "logps_avg/chosen": -0.13179995119571686, "logps_avg/rejected": -1.8005939722061157, "loss": 0.1253, "losses_ref": -0.0004605629947036505, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2430, "u": -1.8253390789031982, "weight": 0.03824792057275772 }, { "diff_generated": -18.01476287841797, "epoch": 0.7906675307841866, "grad_norm": 2.673066237519979, "learning_rate": 7.365755204179637e-07, "logits/chosen": -2.3529014587402344, "logits/rejected": -2.483888626098633, "logps/chosen": -21.870464324951172, "logps/rejected": -335.36883544921875, "logps_avg/chosen": -0.12962986528873444, "logps_avg/rejected": -1.8014764785766602, "loss": 0.1267, "losses_ref": -0.0006634207093156874, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2440, "u": -1.8130687475204468, "weight": 0.04497240111231804 }, { "diff_generated": -18.597171783447266, "epoch": 0.7939079714841218, "grad_norm": 2.966769139844808, "learning_rate": 7.357581671864073e-07, "logits/chosen": -2.393500566482544, "logits/rejected": -2.484731912612915, "logps/chosen": -22.13661766052246, "logps/rejected": -328.3583068847656, "logps_avg/chosen": -0.13498273491859436, "logps_avg/rejected": -1.8597170114517212, "loss": 0.1287, "losses_ref": -0.000405018130550161, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2450, "u": -1.7896159887313843, "weight": 0.057148706167936325 }, { "diff_generated": -17.847610473632812, "epoch": 0.7971484121840571, "grad_norm": 2.6948564560211197, "learning_rate": 7.349360405607303e-07, "logits/chosen": -2.3624050617218018, "logits/rejected": -2.451986789703369, "logps/chosen": -18.373266220092773, "logps/rejected": -325.03704833984375, "logps_avg/chosen": -0.10835101455450058, "logps_avg/rejected": -1.7847610712051392, "loss": 0.1251, "losses_ref": -0.0003813363437075168, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2460, "u": -1.7306047677993774, "weight": 0.08810532838106155 }, { "diff_generated": -19.39354705810547, "epoch": 0.8003888528839922, "grad_norm": 2.570981381506959, "learning_rate": 7.341091522289122e-07, "logits/chosen": -2.489992618560791, "logits/rejected": -2.4882442951202393, "logps/chosen": -21.162927627563477, "logps/rejected": -330.13720703125, "logps_avg/chosen": -0.11893127858638763, "logps_avg/rejected": -1.939354658126831, "loss": 0.1232, "losses_ref": -0.0003837384865619242, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2470, "u": -1.8136303424835205, "weight": 0.04432538151741028 }, { "diff_generated": -19.998611450195312, "epoch": 0.8036292935839274, "grad_norm": 2.8313005386612504, "learning_rate": 7.332775139466278e-07, "logits/chosen": -2.513796806335449, "logits/rejected": -2.5728158950805664, "logps/chosen": -20.988895416259766, "logps/rejected": -350.8438720703125, "logps_avg/chosen": -0.12285809218883514, "logps_avg/rejected": -1.9998611211776733, "loss": 0.1277, "losses_ref": -0.0004251801874488592, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2480, "u": -1.8609235286712646, "weight": 0.019479917362332344 }, { "diff_generated": -17.742862701416016, "epoch": 0.8068697342838627, "grad_norm": 2.7325143797161178, "learning_rate": 7.324411375370809e-07, "logits/chosen": -2.4243478775024414, "logits/rejected": -2.478469133377075, "logps/chosen": -21.79288101196289, "logps/rejected": -317.43975830078125, "logps_avg/chosen": -0.12675701081752777, "logps_avg/rejected": -1.7742862701416016, "loss": 0.1256, "losses_ref": -0.0003512470575515181, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2490, "u": -1.7543703317642212, "weight": 0.07554493844509125 }, { "diff_generated": -17.564847946166992, "epoch": 0.8101101749837978, "grad_norm": 2.7498341190763944, "learning_rate": 7.316000348908365e-07, "logits/chosen": -2.465071439743042, "logits/rejected": -2.5179238319396973, "logps/chosen": -23.108013153076172, "logps/rejected": -323.61297607421875, "logps_avg/chosen": -0.1270761638879776, "logps_avg/rejected": -1.7564847469329834, "loss": 0.1262, "losses_ref": -0.00033712232834659517, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2500, "u": -1.754339575767517, "weight": 0.07557834684848785 }, { "diff_generated": -17.836788177490234, "epoch": 0.813350615683733, "grad_norm": 2.8566271067083804, "learning_rate": 7.307542179656511e-07, "logits/chosen": -2.4499549865722656, "logits/rejected": -2.4926228523254395, "logps/chosen": -21.77320098876953, "logps/rejected": -338.5235290527344, "logps_avg/chosen": -0.11738236248493195, "logps_avg/rejected": -1.7836787700653076, "loss": 0.1248, "losses_ref": -0.0005360871437005699, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2510, "u": -1.7539886236190796, "weight": 0.07598712295293808 }, { "diff_generated": -19.29046058654785, "epoch": 0.8165910563836681, "grad_norm": 2.7446895235493387, "learning_rate": 7.29903698786303e-07, "logits/chosen": -2.4418554306030273, "logits/rejected": -2.4206745624542236, "logps/chosen": -22.280296325683594, "logps/rejected": -300.85528564453125, "logps_avg/chosen": -0.12339627742767334, "logps_avg/rejected": -1.9290460348129272, "loss": 0.1248, "losses_ref": -0.0006306341965682805, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2520, "u": -1.8012568950653076, "weight": 0.05117018148303032 }, { "diff_generated": -17.447559356689453, "epoch": 0.8198314970836034, "grad_norm": 2.7651369105110253, "learning_rate": 7.290484894444214e-07, "logits/chosen": -2.3910071849823, "logits/rejected": -2.4577226638793945, "logps/chosen": -19.478683471679688, "logps/rejected": -303.2452087402344, "logps_avg/chosen": -0.11861952394247055, "logps_avg/rejected": -1.7447561025619507, "loss": 0.1221, "losses_ref": -0.0005060589173808694, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2530, "u": -1.6829626560211182, "weight": 0.1133555918931961 }, { "diff_generated": -18.10799789428711, "epoch": 0.8230719377835386, "grad_norm": 3.053125479055488, "learning_rate": 7.281886020983144e-07, "logits/chosen": -2.4256653785705566, "logits/rejected": -2.4212918281555176, "logps/chosen": -23.599895477294922, "logps/rejected": -292.9998474121094, "logps_avg/chosen": -0.13201817870140076, "logps_avg/rejected": -1.8107995986938477, "loss": 0.1242, "losses_ref": -0.00020123887225054204, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2540, "u": -1.8257560729980469, "weight": 0.037760891020298004 }, { "diff_generated": -17.104455947875977, "epoch": 0.8263123784834737, "grad_norm": 3.1444280521819663, "learning_rate": 7.273240489727963e-07, "logits/chosen": -2.398249864578247, "logits/rejected": -2.408512830734253, "logps/chosen": -22.64269256591797, "logps/rejected": -290.8938293457031, "logps_avg/chosen": -0.11623907089233398, "logps_avg/rejected": -1.7104456424713135, "loss": 0.122, "losses_ref": -0.00043374235974624753, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2550, "u": -1.7661218643188477, "weight": 0.06941927224397659 }, { "diff_generated": -17.132604598999023, "epoch": 0.829552819183409, "grad_norm": 2.8401955072126595, "learning_rate": 7.264548423590133e-07, "logits/chosen": -2.4102237224578857, "logits/rejected": -2.4627585411071777, "logps/chosen": -21.5040340423584, "logps/rejected": -293.8085632324219, "logps_avg/chosen": -0.12476116418838501, "logps_avg/rejected": -1.7132604122161865, "loss": 0.1234, "losses_ref": -0.00025950567214749753, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2560, "u": -1.7901074886322021, "weight": 0.05660170316696167 }, { "diff_generated": -18.14133071899414, "epoch": 0.8327932598833442, "grad_norm": 3.1965970439975666, "learning_rate": 7.255809946142695e-07, "logits/chosen": -2.3983757495880127, "logits/rejected": -2.409802198410034, "logps/chosen": -23.022171020507812, "logps/rejected": -306.9036865234375, "logps_avg/chosen": -0.12413071095943451, "logps_avg/rejected": -1.8141329288482666, "loss": 0.1305, "losses_ref": -0.0003770699549932033, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2570, "u": -1.7898855209350586, "weight": 0.05686057358980179 }, { "diff_generated": -18.652851104736328, "epoch": 0.8360337005832793, "grad_norm": 2.8716772820827066, "learning_rate": 7.247025181618508e-07, "logits/chosen": -2.4277095794677734, "logits/rejected": -2.4492273330688477, "logps/chosen": -22.899845123291016, "logps/rejected": -318.9675598144531, "logps_avg/chosen": -0.12769187986850739, "logps_avg/rejected": -1.8652846813201904, "loss": 0.1238, "losses_ref": -0.0003064598422497511, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2580, "u": -1.82554030418396, "weight": 0.03801097720861435 }, { "diff_generated": -17.709041595458984, "epoch": 0.8392741412832145, "grad_norm": 2.9226486099172098, "learning_rate": 7.238194254908483e-07, "logits/chosen": -2.386096477508545, "logits/rejected": -2.435162305831909, "logps/chosen": -21.952648162841797, "logps/rejected": -303.5704345703125, "logps_avg/chosen": -0.1209806352853775, "logps_avg/rejected": -1.770904302597046, "loss": 0.1311, "losses_ref": -0.0005370815051719546, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2590, "u": -1.7895606756210327, "weight": 0.05723338574171066 }, { "diff_generated": -18.281713485717773, "epoch": 0.8425145819831497, "grad_norm": 2.639473802718026, "learning_rate": 7.229317291559807e-07, "logits/chosen": -2.424675464630127, "logits/rejected": -2.4955246448516846, "logps/chosen": -22.622589111328125, "logps/rejected": -296.0838623046875, "logps_avg/chosen": -0.13772353529930115, "logps_avg/rejected": -1.8281714916229248, "loss": 0.1263, "losses_ref": -0.0005213414551690221, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2600, "u": -1.8131945133209229, "weight": 0.04481424018740654 }, { "diff_generated": -18.075288772583008, "epoch": 0.8457550226830849, "grad_norm": 3.2797998071561225, "learning_rate": 7.22039441777416e-07, "logits/chosen": -2.3893349170684814, "logits/rejected": -2.470026731491089, "logps/chosen": -20.48826026916504, "logps/rejected": -301.62359619140625, "logps_avg/chosen": -0.12221121788024902, "logps_avg/rejected": -1.8075288534164429, "loss": 0.1282, "losses_ref": -0.0005865787388756871, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2610, "u": -1.76569402217865, "weight": 0.06990896910429001 }, { "diff_generated": -19.311792373657227, "epoch": 0.8489954633830201, "grad_norm": 2.7049674814121976, "learning_rate": 7.21142576040592e-07, "logits/chosen": -2.4634876251220703, "logits/rejected": -2.534529447555542, "logps/chosen": -24.17624282836914, "logps/rejected": -318.659423828125, "logps_avg/chosen": -0.14250484108924866, "logps_avg/rejected": -1.931179404258728, "loss": 0.1243, "losses_ref": -0.0004970087902620435, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2620, "u": -1.8252204656600952, "weight": 0.038382790982723236 }, { "diff_generated": -19.849037170410156, "epoch": 0.8522359040829552, "grad_norm": 3.065765529226203, "learning_rate": 7.202411446960357e-07, "logits/chosen": -2.4388208389282227, "logits/rejected": -2.462719440460205, "logps/chosen": -24.322383880615234, "logps/rejected": -326.2467041015625, "logps_avg/chosen": -0.1309942901134491, "logps_avg/rejected": -1.9849036931991577, "loss": 0.1268, "losses_ref": -0.0006085868226364255, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2630, "u": -1.8603699207305908, "weight": 0.020091459155082703 }, { "diff_generated": -18.355745315551758, "epoch": 0.8554763447828905, "grad_norm": 2.619018012456487, "learning_rate": 7.193351605591825e-07, "logits/chosen": -2.4444096088409424, "logits/rejected": -2.524109363555908, "logps/chosen": -20.568607330322266, "logps/rejected": -314.8489074707031, "logps_avg/chosen": -0.1212010383605957, "logps_avg/rejected": -1.8355745077133179, "loss": 0.1198, "losses_ref": -0.0005308730178512633, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2640, "u": -1.789407730102539, "weight": 0.0573933906853199 }, { "diff_generated": -17.939889907836914, "epoch": 0.8587167854828257, "grad_norm": 2.815675633641462, "learning_rate": 7.184246365101939e-07, "logits/chosen": -2.48952054977417, "logits/rejected": -2.46121883392334, "logps/chosen": -23.912261962890625, "logps/rejected": -312.788330078125, "logps_avg/chosen": -0.12124598026275635, "logps_avg/rejected": -1.793988585472107, "loss": 0.1254, "losses_ref": -0.0005175786791369319, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2650, "u": -1.825222373008728, "weight": 0.03838255628943443 }, { "diff_generated": -18.960969924926758, "epoch": 0.8619572261827608, "grad_norm": 2.7869375126473614, "learning_rate": 7.175095854937739e-07, "logits/chosen": -2.4368441104888916, "logits/rejected": -2.4551730155944824, "logps/chosen": -21.652315139770508, "logps/rejected": -331.0331115722656, "logps_avg/chosen": -0.12064101547002792, "logps_avg/rejected": -1.8960968255996704, "loss": 0.1261, "losses_ref": -0.00044710320071317255, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2660, "u": -1.837181806564331, "weight": 0.0320126935839653 }, { "diff_generated": -17.435054779052734, "epoch": 0.8651976668826961, "grad_norm": 2.557409320653384, "learning_rate": 7.165900205189853e-07, "logits/chosen": -2.4384474754333496, "logits/rejected": -2.5136446952819824, "logps/chosen": -20.79629135131836, "logps/rejected": -318.3617248535156, "logps_avg/chosen": -0.1220201700925827, "logps_avg/rejected": -1.7435054779052734, "loss": 0.1207, "losses_ref": -0.00030411581974476576, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2670, "u": -1.7900310754776, "weight": 0.05669097229838371 }, { "diff_generated": -18.737735748291016, "epoch": 0.8684381075826313, "grad_norm": 2.56682824122742, "learning_rate": 7.156659546590653e-07, "logits/chosen": -2.3800575733184814, "logits/rejected": -2.4281136989593506, "logps/chosen": -19.449628829956055, "logps/rejected": -336.49591064453125, "logps_avg/chosen": -0.11861036717891693, "logps_avg/rejected": -1.8737735748291016, "loss": 0.1257, "losses_ref": -0.00045010895701125264, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2680, "u": -1.813449501991272, "weight": 0.044532887637615204 }, { "diff_generated": -20.041152954101562, "epoch": 0.8716785482825664, "grad_norm": 2.555750310681515, "learning_rate": 7.147374010512385e-07, "logits/chosen": -2.340986490249634, "logits/rejected": -2.3440699577331543, "logps/chosen": -20.139862060546875, "logps/rejected": -306.624755859375, "logps_avg/chosen": -0.11705954372882843, "logps_avg/rejected": -2.004115104675293, "loss": 0.1216, "losses_ref": -0.0003802308929152787, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2690, "u": -1.7304770946502686, "weight": 0.08824525773525238 }, { "diff_generated": -18.48080825805664, "epoch": 0.8749189889825016, "grad_norm": 2.5824846669144725, "learning_rate": 7.13804372896531e-07, "logits/chosen": -2.3439040184020996, "logits/rejected": -2.4303998947143555, "logps/chosen": -20.847734451293945, "logps/rejected": -316.1014404296875, "logps_avg/chosen": -0.12687210738658905, "logps_avg/rejected": -1.8480808734893799, "loss": 0.1246, "losses_ref": -0.0005675092106685042, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2700, "u": -1.789607048034668, "weight": 0.057185959070920944 }, { "diff_generated": -17.809646606445312, "epoch": 0.8781594296824368, "grad_norm": 2.81662394572902, "learning_rate": 7.128668834595827e-07, "logits/chosen": -2.453523635864258, "logits/rejected": -2.4558768272399902, "logps/chosen": -24.155872344970703, "logps/rejected": -307.4092102050781, "logps_avg/chosen": -0.1282106637954712, "logps_avg/rejected": -1.7809646129608154, "loss": 0.1209, "losses_ref": -0.000484933378174901, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2710, "u": -1.7897233963012695, "weight": 0.05704890564084053 }, { "diff_generated": -19.045629501342773, "epoch": 0.881399870382372, "grad_norm": 2.7453290952914093, "learning_rate": 7.119249460684583e-07, "logits/chosen": -2.3899433612823486, "logits/rejected": -2.3870387077331543, "logps/chosen": -23.397602081298828, "logps/rejected": -324.876708984375, "logps_avg/chosen": -0.12680859863758087, "logps_avg/rejected": -1.9045629501342773, "loss": 0.1248, "losses_ref": -0.00038239354034885764, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2720, "u": -1.8254715204238892, "weight": 0.03809309005737305 }, { "diff_generated": -17.462833404541016, "epoch": 0.8846403110823072, "grad_norm": 2.5421040372025097, "learning_rate": 7.109785741144577e-07, "logits/chosen": -2.344905376434326, "logits/rejected": -2.4533557891845703, "logps/chosen": -22.297576904296875, "logps/rejected": -321.774169921875, "logps_avg/chosen": -0.13205796480178833, "logps_avg/rejected": -1.7462832927703857, "loss": 0.1248, "losses_ref": -0.00043138963519595563, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2730, "u": -1.8135309219360352, "weight": 0.04443947225809097 }, { "diff_generated": -18.100156784057617, "epoch": 0.8878807517822424, "grad_norm": 2.8009501005613204, "learning_rate": 7.100277810519264e-07, "logits/chosen": -2.437243938446045, "logits/rejected": -2.4590842723846436, "logps/chosen": -22.2004337310791, "logps/rejected": -316.9300842285156, "logps_avg/chosen": -0.1315070539712906, "logps_avg/rejected": -1.8100156784057617, "loss": 0.1249, "losses_ref": -0.0005317996838130057, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2740, "u": -1.8133379220962524, "weight": 0.04465905949473381 }, { "diff_generated": -17.46011734008789, "epoch": 0.8911211924821776, "grad_norm": 2.689152478337482, "learning_rate": 7.090725803980633e-07, "logits/chosen": -2.3765041828155518, "logits/rejected": -2.4582343101501465, "logps/chosen": -21.084001541137695, "logps/rejected": -305.22955322265625, "logps_avg/chosen": -0.12233324348926544, "logps_avg/rejected": -1.7460119724273682, "loss": 0.1289, "losses_ref": -0.00030663347570225596, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2750, "u": -1.7663196325302124, "weight": 0.06918799877166748 }, { "diff_generated": -17.585119247436523, "epoch": 0.8943616331821128, "grad_norm": 2.752813540127061, "learning_rate": 7.081129857327297e-07, "logits/chosen": -2.4216065406799316, "logits/rejected": -2.4935240745544434, "logps/chosen": -20.991987228393555, "logps/rejected": -300.592041015625, "logps_avg/chosen": -0.12113398313522339, "logps_avg/rejected": -1.7585121393203735, "loss": 0.128, "losses_ref": -0.00042823137482628226, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2760, "u": -1.8016542196273804, "weight": 0.05071256309747696 }, { "diff_generated": -17.064542770385742, "epoch": 0.8976020738820479, "grad_norm": 3.0441665586298736, "learning_rate": 7.071490106982547e-07, "logits/chosen": -2.417194128036499, "logits/rejected": -2.45021915435791, "logps/chosen": -22.914186477661133, "logps/rejected": -302.40167236328125, "logps_avg/chosen": -0.12843842804431915, "logps_avg/rejected": -1.7064542770385742, "loss": 0.1269, "losses_ref": -0.00028399689472280443, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2770, "u": -1.7544708251953125, "weight": 0.07542793452739716 }, { "diff_generated": -18.65851402282715, "epoch": 0.9008425145819832, "grad_norm": 2.7869155962688974, "learning_rate": 7.061806689992424e-07, "logits/chosen": -2.3915419578552246, "logits/rejected": -2.4333293437957764, "logps/chosen": -22.446435928344727, "logps/rejected": -309.77508544921875, "logps_avg/chosen": -0.12451604753732681, "logps_avg/rejected": -1.865851640701294, "loss": 0.1234, "losses_ref": -0.000297236634651199, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2780, "u": -1.7900089025497437, "weight": 0.05671622231602669 }, { "diff_generated": -18.07571792602539, "epoch": 0.9040829552819183, "grad_norm": 2.6626958452708966, "learning_rate": 7.052079744023769e-07, "logits/chosen": -2.5439717769622803, "logits/rejected": -2.5482470989227295, "logps/chosen": -25.701541900634766, "logps/rejected": -313.8611755371094, "logps_avg/chosen": -0.13316160440444946, "logps_avg/rejected": -1.8075717687606812, "loss": 0.1212, "losses_ref": -0.0006367530440911651, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2790, "u": -1.8368667364120483, "weight": 0.032375458627939224 }, { "diff_generated": -18.336414337158203, "epoch": 0.9073233959818535, "grad_norm": 2.6099270748234957, "learning_rate": 7.042309407362264e-07, "logits/chosen": -2.410386562347412, "logits/rejected": -2.498459577560425, "logps/chosen": -20.105188369750977, "logps/rejected": -320.20513916015625, "logps_avg/chosen": -0.12039705365896225, "logps_avg/rejected": -1.833641767501831, "loss": 0.1228, "losses_ref": -0.0007827078807167709, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2800, "u": -1.8128219842910767, "weight": 0.0452541820704937 }, { "diff_generated": -18.090600967407227, "epoch": 0.9105638366817888, "grad_norm": 2.6260436552631856, "learning_rate": 7.032495818910462e-07, "logits/chosen": -2.471694231033325, "logits/rejected": -2.494658946990967, "logps/chosen": -19.806758880615234, "logps/rejected": -302.5240478515625, "logps_avg/chosen": -0.1169753298163414, "logps_avg/rejected": -1.8090598583221436, "loss": 0.122, "losses_ref": -0.00042266439413651824, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2810, "u": -1.7542486190795898, "weight": 0.0756872147321701 }, { "diff_generated": -18.19870376586914, "epoch": 0.9138042773817239, "grad_norm": 2.78178387937294, "learning_rate": 7.022639118185819e-07, "logits/chosen": -2.462240219116211, "logits/rejected": -2.4601945877075195, "logps/chosen": -23.47665786743164, "logps/rejected": -315.44085693359375, "logps_avg/chosen": -0.12503954768180847, "logps_avg/rejected": -1.8198707103729248, "loss": 0.1208, "losses_ref": -0.00048798826173879206, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2820, "u": -1.8015903234481812, "weight": 0.05078895017504692 }, { "diff_generated": -18.751773834228516, "epoch": 0.9170447180816591, "grad_norm": 2.5155286172970377, "learning_rate": 7.012739445318712e-07, "logits/chosen": -2.4825854301452637, "logits/rejected": -2.507294178009033, "logps/chosen": -22.81148910522461, "logps/rejected": -320.12750244140625, "logps_avg/chosen": -0.1307867467403412, "logps_avg/rejected": -1.8751773834228516, "loss": 0.1206, "losses_ref": -0.00032793093123473227, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2830, "u": -1.8255561590194702, "weight": 0.037995077669620514 }, { "diff_generated": -18.128347396850586, "epoch": 0.9202851587815943, "grad_norm": 2.809119024607745, "learning_rate": 7.002796941050435e-07, "logits/chosen": -2.4425601959228516, "logits/rejected": -2.4837136268615723, "logps/chosen": -21.08615493774414, "logps/rejected": -314.3614807128906, "logps_avg/chosen": -0.12382993847131729, "logps_avg/rejected": -1.8128345012664795, "loss": 0.1296, "losses_ref": -0.00028563165687955916, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2840, "u": -1.7782018184661865, "weight": 0.06290866434574127 }, { "diff_generated": -20.078081130981445, "epoch": 0.9235255994815295, "grad_norm": 2.6148467990309787, "learning_rate": 6.992811746731213e-07, "logits/chosen": -2.469362735748291, "logits/rejected": -2.504396438598633, "logps/chosen": -23.93619728088379, "logps/rejected": -332.0574035644531, "logps_avg/chosen": -0.1358002871274948, "logps_avg/rejected": -2.007808208465576, "loss": 0.1255, "losses_ref": -0.0002602954918984324, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2850, "u": -1.813787817955017, "weight": 0.044140513986349106 }, { "diff_generated": -19.35173225402832, "epoch": 0.9267660401814647, "grad_norm": 2.82164225700258, "learning_rate": 6.98278400431818e-07, "logits/chosen": -2.491528034210205, "logits/rejected": -2.5512471199035645, "logps/chosen": -24.169204711914062, "logps/rejected": -357.38128662109375, "logps_avg/chosen": -0.13326986134052277, "logps_avg/rejected": -1.9351732730865479, "loss": 0.1258, "losses_ref": -0.00046310765901580453, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2860, "u": -1.8490760326385498, "weight": 0.025722075253725052 }, { "diff_generated": -19.551773071289062, "epoch": 0.9300064808813999, "grad_norm": 2.6044844599935377, "learning_rate": 6.972713856373369e-07, "logits/chosen": -2.474952220916748, "logits/rejected": -2.5388407707214355, "logps/chosen": -21.96600341796875, "logps/rejected": -343.4984436035156, "logps_avg/chosen": -0.12933237850666046, "logps_avg/rejected": -1.9551775455474854, "loss": 0.1232, "losses_ref": -0.00046262479736469686, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2870, "u": -1.849073052406311, "weight": 0.025724787265062332 }, { "diff_generated": -19.89356803894043, "epoch": 0.933246921581335, "grad_norm": 2.580690966488642, "learning_rate": 6.962601446061681e-07, "logits/chosen": -2.4569146633148193, "logits/rejected": -2.4288723468780518, "logps/chosen": -21.41404914855957, "logps/rejected": -319.9272766113281, "logps_avg/chosen": -0.11869911849498749, "logps_avg/rejected": -1.9893567562103271, "loss": 0.12, "losses_ref": -0.0005856683710590005, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2880, "u": -1.8369076251983643, "weight": 0.03232860192656517 }, { "diff_generated": -19.329910278320312, "epoch": 0.9364873622812703, "grad_norm": 2.614494401402133, "learning_rate": 6.952446917148853e-07, "logits/chosen": -2.4458394050598145, "logits/rejected": -2.514772891998291, "logps/chosen": -22.14116668701172, "logps/rejected": -359.1854553222656, "logps_avg/chosen": -0.1248125210404396, "logps_avg/rejected": -1.9329910278320312, "loss": 0.1226, "losses_ref": -0.00042284480878151953, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2890, "u": -1.8491137027740479, "weight": 0.025677403435111046 }, { "diff_generated": -18.59128189086914, "epoch": 0.9397278029812054, "grad_norm": 2.837551798748602, "learning_rate": 6.94225041399941e-07, "logits/chosen": -2.4578464031219482, "logits/rejected": -2.5545461177825928, "logps/chosen": -21.286333084106445, "logps/rejected": -350.63970947265625, "logps_avg/chosen": -0.12239722162485123, "logps_avg/rejected": -1.8591279983520508, "loss": 0.1168, "losses_ref": -0.0003485087654553354, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2900, "u": -1.789947271347046, "weight": 0.05678866058588028 }, { "diff_generated": -19.49415397644043, "epoch": 0.9429682436811406, "grad_norm": 2.5579438992340373, "learning_rate": 6.932012081574615e-07, "logits/chosen": -2.4774105548858643, "logits/rejected": -2.502908229827881, "logps/chosen": -21.979785919189453, "logps/rejected": -329.8486022949219, "logps_avg/chosen": -0.12885500490665436, "logps_avg/rejected": -1.9494152069091797, "loss": 0.1237, "losses_ref": -0.00031258963281288743, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2910, "u": -1.8137260675430298, "weight": 0.04421340674161911 }, { "diff_generated": -18.232284545898438, "epoch": 0.9462086843810759, "grad_norm": 2.6238007530926613, "learning_rate": 6.921732065430411e-07, "logits/chosen": -2.4125030040740967, "logits/rejected": -2.5023012161254883, "logps/chosen": -18.845687866210938, "logps/rejected": -336.7342834472656, "logps_avg/chosen": -0.11462052166461945, "logps_avg/rejected": -1.8232284784317017, "loss": 0.1219, "losses_ref": -0.00029631194774992764, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2920, "u": -1.7544281482696533, "weight": 0.07547671347856522 }, { "diff_generated": -19.532318115234375, "epoch": 0.949449125081011, "grad_norm": 2.5409270526015506, "learning_rate": 6.911410511715343e-07, "logits/chosen": -2.4430928230285645, "logits/rejected": -2.4564061164855957, "logps/chosen": -21.696651458740234, "logps/rejected": -330.7839050292969, "logps_avg/chosen": -0.11758589744567871, "logps_avg/rejected": -1.9532318115234375, "loss": 0.1206, "losses_ref": -0.00047477110638283193, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2930, "u": -1.7541587352752686, "weight": 0.07578897476196289 }, { "diff_generated": -19.280086517333984, "epoch": 0.9526895657809462, "grad_norm": 2.8239915570390406, "learning_rate": 6.901047567168491e-07, "logits/chosen": -2.4925901889801025, "logits/rejected": -2.501408338546753, "logps/chosen": -21.877737045288086, "logps/rejected": -320.40167236328125, "logps_avg/chosen": -0.12425835430622101, "logps_avg/rejected": -1.9280086755752563, "loss": 0.1235, "losses_ref": -0.0002563064044807106, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2940, "u": -1.7545255422592163, "weight": 0.07536445558071136 }, { "diff_generated": -19.411279678344727, "epoch": 0.9559300064808814, "grad_norm": 2.665040621540858, "learning_rate": 6.890643379117374e-07, "logits/chosen": -2.46891188621521, "logits/rejected": -2.480846405029297, "logps/chosen": -22.25421142578125, "logps/rejected": -343.4288330078125, "logps_avg/chosen": -0.12037277221679688, "logps_avg/rejected": -1.9411280155181885, "loss": 0.1205, "losses_ref": -0.000287555914837867, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2950, "u": -1.790021538734436, "weight": 0.05670164152979851 }, { "diff_generated": -20.14754867553711, "epoch": 0.9591704471808166, "grad_norm": 2.859031779876307, "learning_rate": 6.880198095475866e-07, "logits/chosen": -2.4958252906799316, "logits/rejected": -2.4784576892852783, "logps/chosen": -26.015636444091797, "logps/rejected": -353.650390625, "logps_avg/chosen": -0.13471297919750214, "logps_avg/rejected": -2.0147547721862793, "loss": 0.1223, "losses_ref": -0.0003499361628200859, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2960, "u": -1.825506567955017, "weight": 0.038051966577768326 }, { "diff_generated": -18.965621948242188, "epoch": 0.9624108878807518, "grad_norm": 2.8676870600784765, "learning_rate": 6.86971186474208e-07, "logits/chosen": -2.4640886783599854, "logits/rejected": -2.4847826957702637, "logps/chosen": -21.747859954833984, "logps/rejected": -330.61016845703125, "logps_avg/chosen": -0.11688639968633652, "logps_avg/rejected": -1.8965622186660767, "loss": 0.1266, "losses_ref": -0.0004715279792435467, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2970, "u": -1.7897437810897827, "weight": 0.057025156915187836 }, { "diff_generated": -19.038909912109375, "epoch": 0.9656513285806869, "grad_norm": 2.6947451628009302, "learning_rate": 6.859184835996271e-07, "logits/chosen": -2.4535346031188965, "logits/rejected": -2.5355563163757324, "logps/chosen": -20.323822021484375, "logps/rejected": -343.13238525390625, "logps_avg/chosen": -0.1202494278550148, "logps_avg/rejected": -1.9038912057876587, "loss": 0.1223, "losses_ref": -0.0005739832413382828, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2980, "u": -1.813147783279419, "weight": 0.04487653821706772 }, { "diff_generated": -22.221275329589844, "epoch": 0.9688917692806222, "grad_norm": 2.7201209987510726, "learning_rate": 6.848617158898704e-07, "logits/chosen": -2.4459598064422607, "logits/rejected": -2.4944732189178467, "logps/chosen": -18.379690170288086, "logps/rejected": -399.9496765136719, "logps_avg/chosen": -0.1081472784280777, "logps_avg/rejected": -2.2221274375915527, "loss": 0.1167, "losses_ref": -0.00019583315588533878, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2990, "u": -1.8138822317123413, "weight": 0.04402980953454971 }, { "diff_generated": -19.042484283447266, "epoch": 0.9721322099805574, "grad_norm": 2.887559738472061, "learning_rate": 6.838008983687538e-07, "logits/chosen": -2.420860767364502, "logits/rejected": -2.479161500930786, "logps/chosen": -20.06986427307129, "logps/rejected": -370.847412109375, "logps_avg/chosen": -0.11698007583618164, "logps_avg/rejected": -1.9042482376098633, "loss": 0.1213, "losses_ref": -0.0003117799642495811, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3000, "u": -1.825399398803711, "weight": 0.03816502168774605 }, { "diff_generated": -18.925434112548828, "epoch": 0.9753726506804925, "grad_norm": 2.685837856043556, "learning_rate": 6.827360461176675e-07, "logits/chosen": -2.4443156719207764, "logits/rejected": -2.495382785797119, "logps/chosen": -22.687870025634766, "logps/rejected": -352.8595886230469, "logps_avg/chosen": -0.12558838725090027, "logps_avg/rejected": -1.8925431966781616, "loss": 0.126, "losses_ref": -0.00045695697190240026, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3010, "u": -1.8014158010482788, "weight": 0.05096786096692085 }, { "diff_generated": -20.009876251220703, "epoch": 0.9786130913804277, "grad_norm": 2.559954495098906, "learning_rate": 6.816671742753636e-07, "logits/chosen": -2.4376111030578613, "logits/rejected": -2.4606010913848877, "logps/chosen": -22.668743133544922, "logps/rejected": -343.001953125, "logps_avg/chosen": -0.13240326941013336, "logps_avg/rejected": -2.0009875297546387, "loss": 0.1196, "losses_ref": -0.00020331182167865336, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3020, "u": -1.7545816898345947, "weight": 0.07529846578836441 }, { "diff_generated": -20.478519439697266, "epoch": 0.981853532080363, "grad_norm": 2.7208132567620624, "learning_rate": 6.80594298037739e-07, "logits/chosen": -2.4369359016418457, "logits/rejected": -2.4600374698638916, "logps/chosen": -20.999563217163086, "logps/rejected": -346.7327880859375, "logps_avg/chosen": -0.12476013600826263, "logps_avg/rejected": -2.047852039337158, "loss": 0.1243, "losses_ref": -0.00038672907976433635, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3030, "u": -1.8017244338989258, "weight": 0.05063174292445183 }, { "diff_generated": -20.338376998901367, "epoch": 0.9850939727802981, "grad_norm": 2.5661099093643, "learning_rate": 6.795174326576201e-07, "logits/chosen": -2.5011813640594482, "logits/rejected": -2.517927646636963, "logps/chosen": -22.143497467041016, "logps/rejected": -354.784423828125, "logps_avg/chosen": -0.12776055932044983, "logps_avg/rejected": -2.0338377952575684, "loss": 0.122, "losses_ref": -0.00024083026801235974, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3040, "u": -1.8375473022460938, "weight": 0.031587857753038406 }, { "diff_generated": -20.159313201904297, "epoch": 0.9883344134802333, "grad_norm": 2.5998402846143156, "learning_rate": 6.784365934445467e-07, "logits/chosen": -2.397493839263916, "logits/rejected": -2.485623598098755, "logps/chosen": -20.126605987548828, "logps/rejected": -366.147216796875, "logps_avg/chosen": -0.11706575006246567, "logps_avg/rejected": -2.0159313678741455, "loss": 0.1218, "losses_ref": -0.0002532999496906996, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3050, "u": -1.7782090902328491, "weight": 0.06289789080619812 }, { "diff_generated": -20.2012882232666, "epoch": 0.9915748541801686, "grad_norm": 2.932890060620594, "learning_rate": 6.77351795764553e-07, "logits/chosen": -2.498687505722046, "logits/rejected": -2.518296003341675, "logps/chosen": -21.37833595275879, "logps/rejected": -342.5185852050781, "logps_avg/chosen": -0.11720434576272964, "logps_avg/rejected": -2.0201287269592285, "loss": 0.1245, "losses_ref": -0.0004471830034162849, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3060, "u": -1.8253414630889893, "weight": 0.03824517875909805 }, { "diff_generated": -21.252620697021484, "epoch": 0.9948152948801037, "grad_norm": 2.6432494995363283, "learning_rate": 6.7626305503995e-07, "logits/chosen": -2.4260194301605225, "logits/rejected": -2.465452194213867, "logps/chosen": -21.87104034423828, "logps/rejected": -357.416015625, "logps_avg/chosen": -0.12750808894634247, "logps_avg/rejected": -2.1252620220184326, "loss": 0.1202, "losses_ref": -0.00040028925286605954, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3070, "u": -1.8135614395141602, "weight": 0.044404350221157074 }, { "diff_generated": -20.740575790405273, "epoch": 0.9980557355800389, "grad_norm": 2.695896336401798, "learning_rate": 6.75170386749106e-07, "logits/chosen": -2.4324469566345215, "logits/rejected": -2.488874912261963, "logps/chosen": -23.11557388305664, "logps/rejected": -379.96612548828125, "logps_avg/chosen": -0.12699179351329803, "logps_avg/rejected": -2.0740573406219482, "loss": 0.1191, "losses_ref": -0.0006312219775281847, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3080, "u": -1.884240746498108, "weight": 0.007436756044626236 }, { "diff_generated": -19.466602325439453, "epoch": 1.0012961762799741, "grad_norm": 2.7553187871328, "learning_rate": 6.740738064262265e-07, "logits/chosen": -2.4823808670043945, "logits/rejected": -2.5409321784973145, "logps/chosen": -19.86790657043457, "logps/rejected": -359.1689453125, "logps_avg/chosen": -0.11252466589212418, "logps_avg/rejected": -1.946660041809082, "loss": 0.1121, "losses_ref": -0.0014723313506692648, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3090, "u": -2.5461132526397705, "weight": 0.0461733303964138 }, { "diff_generated": -20.70392417907715, "epoch": 1.0045366169799093, "grad_norm": 2.6373592590329578, "learning_rate": 6.729733296611336e-07, "logits/chosen": -2.4947752952575684, "logits/rejected": -2.5320487022399902, "logps/chosen": -17.02425765991211, "logps/rejected": -360.64306640625, "logps_avg/chosen": -0.09909389168024063, "logps_avg/rejected": -2.070392370223999, "loss": 0.0994, "losses_ref": -0.0014184715691953897, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3100, "u": -3.6951279640197754, "weight": 0.027127737179398537 }, { "diff_generated": -22.544784545898438, "epoch": 1.0077770576798444, "grad_norm": 2.612035515483608, "learning_rate": 6.718689720990442e-07, "logits/chosen": -2.457213878631592, "logits/rejected": -2.4981653690338135, "logps/chosen": -17.546300888061523, "logps/rejected": -378.4374694824219, "logps_avg/chosen": -0.10462252795696259, "logps_avg/rejected": -2.2544784545898438, "loss": 0.1007, "losses_ref": -0.001810407848097384, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3110, "u": -3.5930373668670654, "weight": 0.05361621454358101 }, { "diff_generated": -22.213926315307617, "epoch": 1.0110174983797797, "grad_norm": 2.6028609758829067, "learning_rate": 6.707607494403471e-07, "logits/chosen": -2.4518966674804688, "logits/rejected": -2.48047137260437, "logps/chosen": -16.816543579101562, "logps/rejected": -366.20770263671875, "logps_avg/chosen": -0.09637521207332611, "logps_avg/rejected": -2.221392869949341, "loss": 0.1, "losses_ref": -0.0008464438142254949, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3120, "u": -3.5740058422088623, "weight": 0.05738803744316101 }, { "diff_generated": -23.24093246459961, "epoch": 1.0142579390797148, "grad_norm": 2.7720360808935234, "learning_rate": 6.696486774403812e-07, "logits/chosen": -2.4274613857269287, "logits/rejected": -2.472154140472412, "logps/chosen": -18.64761734008789, "logps/rejected": -397.8034973144531, "logps_avg/chosen": -0.10990948975086212, "logps_avg/rejected": -2.3240933418273926, "loss": 0.1029, "losses_ref": -0.0008102835854515433, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3130, "u": -3.6011669635772705, "weight": 0.05116075277328491 }, { "diff_generated": -21.840497970581055, "epoch": 1.01749837977965, "grad_norm": 2.8145720544967787, "learning_rate": 6.685327719092096e-07, "logits/chosen": -2.383756399154663, "logits/rejected": -2.4836862087249756, "logps/chosen": -14.916833877563477, "logps/rejected": -383.84881591796875, "logps_avg/chosen": -0.09246564656496048, "logps_avg/rejected": -2.184049606323242, "loss": 0.1024, "losses_ref": -0.0007215240621007979, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3140, "u": -3.503340482711792, "weight": 0.07616675645112991 }, { "diff_generated": -22.751646041870117, "epoch": 1.0207388204795853, "grad_norm": 2.537710280076668, "learning_rate": 6.674130487113962e-07, "logits/chosen": -2.4844613075256348, "logits/rejected": -2.4696061611175537, "logps/chosen": -19.461294174194336, "logps/rejected": -396.66644287109375, "logps_avg/chosen": -0.1096779853105545, "logps_avg/rejected": -2.275164842605591, "loss": 0.1064, "losses_ref": -0.0012650018325075507, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3150, "u": -3.6209092140197754, "weight": 0.04566306993365288 }, { "diff_generated": -23.81186866760254, "epoch": 1.0239792611795204, "grad_norm": 2.6188946979483423, "learning_rate": 6.662895237657799e-07, "logits/chosen": -2.495620012283325, "logits/rejected": -2.4823803901672363, "logps/chosen": -17.750776290893555, "logps/rejected": -385.9418640136719, "logps_avg/chosen": -0.10011390596628189, "logps_avg/rejected": -2.3811867237091064, "loss": 0.1028, "losses_ref": -0.0011827899143099785, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3160, "u": -3.694988250732422, "weight": 0.026975449174642563 }, { "diff_generated": -20.148897171020508, "epoch": 1.0272197018794555, "grad_norm": 2.5582619440911447, "learning_rate": 6.651622130452481e-07, "logits/chosen": -2.43251371383667, "logits/rejected": -2.451153516769409, "logps/chosen": -20.973651885986328, "logps/rejected": -354.74237060546875, "logps_avg/chosen": -0.10991770029067993, "logps_avg/rejected": -2.0148894786834717, "loss": 0.1007, "losses_ref": -0.0011846128618344665, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3170, "u": -3.530184268951416, "weight": 0.07041925191879272 }, { "diff_generated": -23.65302848815918, "epoch": 1.030460142579391, "grad_norm": 2.547304600356209, "learning_rate": 6.640311325765096e-07, "logits/chosen": -2.4048352241516113, "logits/rejected": -2.462200880050659, "logps/chosen": -17.424785614013672, "logps/rejected": -404.6559143066406, "logps_avg/chosen": -0.10332882404327393, "logps_avg/rejected": -2.365302801132202, "loss": 0.103, "losses_ref": -0.00218791700899601, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3180, "u": -3.5986220836639404, "weight": 0.053771477192640305 }, { "diff_generated": -23.062740325927734, "epoch": 1.033700583279326, "grad_norm": 2.911656332092707, "learning_rate": 6.628962984398663e-07, "logits/chosen": -2.444711208343506, "logits/rejected": -2.468648910522461, "logps/chosen": -17.87255859375, "logps/rejected": -410.46258544921875, "logps_avg/chosen": -0.10444997251033783, "logps_avg/rejected": -2.306273937225342, "loss": 0.102, "losses_ref": -0.002171388128772378, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3190, "u": -3.617579698562622, "weight": 0.047922343015670776 }, { "diff_generated": -23.605072021484375, "epoch": 1.0369410239792611, "grad_norm": 3.169648553209789, "learning_rate": 6.617577267689863e-07, "logits/chosen": -2.4097084999084473, "logits/rejected": -2.4340286254882812, "logps/chosen": -17.338579177856445, "logps/rejected": -428.2044372558594, "logps_avg/chosen": -0.10172301530838013, "logps_avg/rejected": -2.360507011413574, "loss": 0.1027, "losses_ref": -0.0015846488531678915, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3200, "u": -3.598299741744995, "weight": 0.05309338495135307 }, { "diff_generated": -22.836376190185547, "epoch": 1.0401814646791965, "grad_norm": 2.6924560678811087, "learning_rate": 6.606154337506721e-07, "logits/chosen": -2.45587420463562, "logits/rejected": -2.441114902496338, "logps/chosen": -21.14226722717285, "logps/rejected": -389.21380615234375, "logps_avg/chosen": -0.10923053324222565, "logps_avg/rejected": -2.283637523651123, "loss": 0.1031, "losses_ref": -0.0016331791412085295, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3210, "u": -3.598700761795044, "weight": 0.05264229327440262 }, { "diff_generated": -23.247394561767578, "epoch": 1.0434219053791316, "grad_norm": 2.482087757645902, "learning_rate": 6.594694356246325e-07, "logits/chosen": -2.4460015296936035, "logits/rejected": -2.3826279640197754, "logps/chosen": -19.749439239501953, "logps/rejected": -404.3794250488281, "logps_avg/chosen": -0.10343378782272339, "logps_avg/rejected": -2.324739456176758, "loss": 0.1008, "losses_ref": -0.0012167459353804588, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3220, "u": -3.483008623123169, "weight": 0.08297277987003326 }, { "diff_generated": -23.389196395874023, "epoch": 1.0466623460790667, "grad_norm": 2.6984683560319453, "learning_rate": 6.583197486832506e-07, "logits/chosen": -2.434844493865967, "logits/rejected": -2.393265724182129, "logps/chosen": -17.93399429321289, "logps/rejected": -400.63348388671875, "logps_avg/chosen": -0.09716083109378815, "logps_avg/rejected": -2.3389194011688232, "loss": 0.1032, "losses_ref": -0.001094849780201912, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3230, "u": -3.4998157024383545, "weight": 0.07667265087366104 }, { "diff_generated": -24.4296932220459, "epoch": 1.0499027867790018, "grad_norm": 3.0013555479217744, "learning_rate": 6.571663892713527e-07, "logits/chosen": -2.432621479034424, "logits/rejected": -2.430283784866333, "logps/chosen": -18.3193359375, "logps/rejected": -403.04559326171875, "logps_avg/chosen": -0.10598043352365494, "logps_avg/rejected": -2.442969560623169, "loss": 0.0992, "losses_ref": -0.0009507110225968063, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3240, "u": -3.6481566429138184, "weight": 0.038829002529382706 }, { "diff_generated": -23.248714447021484, "epoch": 1.0531432274789372, "grad_norm": 2.4764423351374614, "learning_rate": 6.560093737859755e-07, "logits/chosen": -2.4444186687469482, "logits/rejected": -2.3516175746917725, "logps/chosen": -18.378459930419922, "logps/rejected": -367.12188720703125, "logps_avg/chosen": -0.1013893112540245, "logps_avg/rejected": -2.32487154006958, "loss": 0.101, "losses_ref": -0.000590079347603023, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3250, "u": -3.4993865489959717, "weight": 0.07574503868818283 }, { "diff_generated": -23.602413177490234, "epoch": 1.0563836681788723, "grad_norm": 2.6191646456197426, "learning_rate": 6.548487186761334e-07, "logits/chosen": -2.430243968963623, "logits/rejected": -2.46002459526062, "logps/chosen": -17.586816787719727, "logps/rejected": -410.814453125, "logps_avg/chosen": -0.10161665827035904, "logps_avg/rejected": -2.360241413116455, "loss": 0.1031, "losses_ref": -0.0013376142596825957, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3260, "u": -3.6159908771514893, "weight": 0.04571908712387085 }, { "diff_generated": -24.22919273376465, "epoch": 1.0596241088788074, "grad_norm": 2.5647852276867207, "learning_rate": 6.536844404425845e-07, "logits/chosen": -2.4303784370422363, "logits/rejected": -2.4501612186431885, "logps/chosen": -17.670944213867188, "logps/rejected": -422.8164978027344, "logps_avg/chosen": -0.09900084882974625, "logps_avg/rejected": -2.4229190349578857, "loss": 0.0999, "losses_ref": -0.0014200543519109488, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3270, "u": -3.547405958175659, "weight": 0.06507633626461029 }, { "diff_generated": -24.190811157226562, "epoch": 1.0628645495787428, "grad_norm": 2.6807521594131627, "learning_rate": 6.525165556375959e-07, "logits/chosen": -2.3963959217071533, "logits/rejected": -2.445218324661255, "logps/chosen": -16.344669342041016, "logps/rejected": -413.4169006347656, "logps_avg/chosen": -0.09862269461154938, "logps_avg/rejected": -2.419081211090088, "loss": 0.1016, "losses_ref": -0.001218282151967287, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3280, "u": -3.5981674194335938, "weight": 0.051731787621974945 }, { "diff_generated": -25.94841957092285, "epoch": 1.0661049902786779, "grad_norm": 2.564309940524817, "learning_rate": 6.513450808647086e-07, "logits/chosen": -2.3707902431488037, "logits/rejected": -2.3697311878204346, "logps/chosen": -19.391185760498047, "logps/rejected": -457.5367736816406, "logps_avg/chosen": -0.10474257171154022, "logps_avg/rejected": -2.5948421955108643, "loss": 0.1038, "losses_ref": -0.0006827990291640162, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3290, "u": -3.6699092388153076, "weight": 0.032251156866550446 }, { "diff_generated": -24.062957763671875, "epoch": 1.069345430978613, "grad_norm": 2.460241055536533, "learning_rate": 6.501700327785011e-07, "logits/chosen": -2.46614408493042, "logits/rejected": -2.426342487335205, "logps/chosen": -17.56804656982422, "logps/rejected": -433.91009521484375, "logps_avg/chosen": -0.09171821922063828, "logps_avg/rejected": -2.4062960147857666, "loss": 0.1013, "losses_ref": -0.0011504015419632196, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3300, "u": -3.5259463787078857, "weight": 0.07042489945888519 }, { "diff_generated": -24.045822143554688, "epoch": 1.0725858716785484, "grad_norm": 2.7323088636588135, "learning_rate": 6.489914280843528e-07, "logits/chosen": -2.449624538421631, "logits/rejected": -2.3936076164245605, "logps/chosen": -19.183971405029297, "logps/rejected": -411.39794921875, "logps_avg/chosen": -0.10804645717144012, "logps_avg/rejected": -2.4045822620391846, "loss": 0.1026, "losses_ref": -0.0014338415348902345, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3310, "u": -3.5029220581054688, "weight": 0.07727103680372238 }, { "diff_generated": -24.791278839111328, "epoch": 1.0758263123784835, "grad_norm": 2.6200584841102064, "learning_rate": 6.478092835382071e-07, "logits/chosen": -2.4237277507781982, "logits/rejected": -2.404797315597534, "logps/chosen": -19.5428409576416, "logps/rejected": -424.99224853515625, "logps_avg/chosen": -0.10355500131845474, "logps_avg/rejected": -2.479128122329712, "loss": 0.1005, "losses_ref": -0.0005835418705828488, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3320, "u": -3.6703476905822754, "weight": 0.03202248364686966 }, { "diff_generated": -24.558528900146484, "epoch": 1.0790667530784186, "grad_norm": 2.407433322048499, "learning_rate": 6.466236159463319e-07, "logits/chosen": -2.4131906032562256, "logits/rejected": -2.4160373210906982, "logps/chosen": -17.557640075683594, "logps/rejected": -444.14642333984375, "logps_avg/chosen": -0.0994739979505539, "logps_avg/rejected": -2.45585298538208, "loss": 0.1003, "losses_ref": -0.0005983190494589508, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3330, "u": -3.6192123889923096, "weight": 0.04453558474779129 }, { "diff_generated": -26.1569881439209, "epoch": 1.082307193778354, "grad_norm": 2.6781150132399545, "learning_rate": 6.45434442165082e-07, "logits/chosen": -2.433701753616333, "logits/rejected": -2.446474075317383, "logps/chosen": -18.218320846557617, "logps/rejected": -465.1985778808594, "logps_avg/chosen": -0.10705505311489105, "logps_avg/rejected": -2.6156985759735107, "loss": 0.1041, "losses_ref": -0.00022716677631251514, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3340, "u": -3.717374086380005, "weight": 0.019019024446606636 }, { "diff_generated": -23.62293815612793, "epoch": 1.085547634478289, "grad_norm": 2.7613908310267683, "learning_rate": 6.442417791006585e-07, "logits/chosen": -2.438436985015869, "logits/rejected": -2.4464330673217773, "logps/chosen": -17.62456512451172, "logps/rejected": -413.90838623046875, "logps_avg/chosen": -0.09845836460590363, "logps_avg/rejected": -2.3622939586639404, "loss": 0.1031, "losses_ref": -0.0006310438038781285, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3350, "u": -3.567746639251709, "weight": 0.05706679821014404 }, { "diff_generated": -25.488025665283203, "epoch": 1.0887880751782242, "grad_norm": 2.9149257650274767, "learning_rate": 6.43045643708869e-07, "logits/chosen": -2.42579984664917, "logits/rejected": -2.3910865783691406, "logps/chosen": -18.47863006591797, "logps/rejected": -431.6898498535156, "logps_avg/chosen": -0.09963358938694, "logps_avg/rejected": -2.5488028526306152, "loss": 0.1054, "losses_ref": -0.0014607172925025225, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3360, "u": -3.5925323963165283, "weight": 0.052489422261714935 }, { "diff_generated": -24.272735595703125, "epoch": 1.0920285158781595, "grad_norm": 2.784142824615618, "learning_rate": 6.418460529948861e-07, "logits/chosen": -2.409794330596924, "logits/rejected": -2.445279598236084, "logps/chosen": -15.560731887817383, "logps/rejected": -420.5341796875, "logps_avg/chosen": -0.09469757974147797, "logps_avg/rejected": -2.4272732734680176, "loss": 0.1028, "losses_ref": -0.0006133883143775165, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3370, "u": -3.4843497276306152, "weight": 0.08204265683889389 }, { "diff_generated": -24.337230682373047, "epoch": 1.0952689565780946, "grad_norm": 2.862517085263023, "learning_rate": 6.406430240130064e-07, "logits/chosen": -2.401892900466919, "logits/rejected": -2.421947956085205, "logps/chosen": -18.046703338623047, "logps/rejected": -440.37841796875, "logps_avg/chosen": -0.10726320743560791, "logps_avg/rejected": -2.433722972869873, "loss": 0.1027, "losses_ref": -0.0007345007034018636, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3380, "u": -3.5500590801239014, "weight": 0.06357249617576599 }, { "diff_generated": -24.39935302734375, "epoch": 1.0985093972780298, "grad_norm": 2.4164264455871005, "learning_rate": 6.39436573866407e-07, "logits/chosen": -2.433347702026367, "logits/rejected": -2.4411110877990723, "logps/chosen": -19.208223342895508, "logps/rejected": -429.08551025390625, "logps_avg/chosen": -0.11225831508636475, "logps_avg/rejected": -2.4399352073669434, "loss": 0.1042, "losses_ref": -0.0008622838067822158, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3390, "u": -3.5989012718200684, "weight": 0.05120759457349777 }, { "diff_generated": -24.014942169189453, "epoch": 1.101749837977965, "grad_norm": 2.845834142850375, "learning_rate": 6.38226719706903e-07, "logits/chosen": -2.405179500579834, "logits/rejected": -2.427467107772827, "logps/chosen": -17.116865158081055, "logps/rejected": -419.35650634765625, "logps_avg/chosen": -0.09478393942117691, "logps_avg/rejected": -2.401494026184082, "loss": 0.1038, "losses_ref": -0.0004464842495508492, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3400, "u": -3.5061697959899902, "weight": 0.07553113251924515 }, { "diff_generated": -23.331575393676758, "epoch": 1.1049902786779002, "grad_norm": 2.561300325422103, "learning_rate": 6.370134787347039e-07, "logits/chosen": -2.4273059368133545, "logits/rejected": -2.436058521270752, "logps/chosen": -17.860082626342773, "logps/rejected": -430.041748046875, "logps_avg/chosen": -0.09594331681728363, "logps_avg/rejected": -2.333157539367676, "loss": 0.0992, "losses_ref": -0.0009888919303193688, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3410, "u": -3.5523743629455566, "weight": 0.06385111808776855 }, { "diff_generated": -23.73761558532715, "epoch": 1.1082307193778353, "grad_norm": 2.7184926559084523, "learning_rate": 6.357968681981683e-07, "logits/chosen": -2.386045217514038, "logits/rejected": -2.3527228832244873, "logps/chosen": -19.80392074584961, "logps/rejected": -416.7049255371094, "logps_avg/chosen": -0.1074841246008873, "logps_avg/rejected": -2.3737616539001465, "loss": 0.1027, "losses_ref": -0.0009749646415002644, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3420, "u": -3.549647808074951, "weight": 0.06383351981639862 }, { "diff_generated": -23.26014518737793, "epoch": 1.1114711600777705, "grad_norm": 2.550522174520947, "learning_rate": 6.345769053935595e-07, "logits/chosen": -2.39861798286438, "logits/rejected": -2.437791109085083, "logps/chosen": -14.868988037109375, "logps/rejected": -423.1236877441406, "logps_avg/chosen": -0.08718402683734894, "logps_avg/rejected": -2.326014757156372, "loss": 0.0988, "losses_ref": -0.0009409437188878655, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3430, "u": -3.527477264404297, "weight": 0.07046804577112198 }, { "diff_generated": -26.601919174194336, "epoch": 1.1147116007777058, "grad_norm": 2.840256866062134, "learning_rate": 6.333536076647985e-07, "logits/chosen": -2.350879192352295, "logits/rejected": -2.3992514610290527, "logps/chosen": -17.582849502563477, "logps/rejected": -481.417236328125, "logps_avg/chosen": -0.10916352272033691, "logps_avg/rejected": -2.6601920127868652, "loss": 0.1021, "losses_ref": -0.0011694144923239946, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3440, "u": -3.5961785316467285, "weight": 0.051804203540086746 }, { "diff_generated": -24.66084098815918, "epoch": 1.117952041477641, "grad_norm": 3.0778330431175824, "learning_rate": 6.321269924032188e-07, "logits/chosen": -2.388843059539795, "logits/rejected": -2.358248472213745, "logps/chosen": -21.2260684967041, "logps/rejected": -451.9033203125, "logps_avg/chosen": -0.11745420843362808, "logps_avg/rejected": -2.4660842418670654, "loss": 0.1055, "losses_ref": -0.0006822725990787148, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3450, "u": -3.6240851879119873, "weight": 0.04461303725838661 }, { "diff_generated": -24.651397705078125, "epoch": 1.121192482177576, "grad_norm": 4.340111621801555, "learning_rate": 6.308970770473184e-07, "logits/chosen": -2.3606228828430176, "logits/rejected": -2.352263927459717, "logps/chosen": -18.791608810424805, "logps/rejected": -407.0849609375, "logps_avg/chosen": -0.10350509732961655, "logps_avg/rejected": -2.465139865875244, "loss": 0.1039, "losses_ref": -0.0009811132913455367, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3460, "u": -3.693281650543213, "weight": 0.026510203257203102 }, { "diff_generated": -24.678524017333984, "epoch": 1.1244329228775114, "grad_norm": 2.7093578852340716, "learning_rate": 6.296638790825117e-07, "logits/chosen": -2.418677806854248, "logits/rejected": -2.3865280151367188, "logps/chosen": -18.040279388427734, "logps/rejected": -437.81591796875, "logps_avg/chosen": -0.10123646259307861, "logps_avg/rejected": -2.467852830886841, "loss": 0.1022, "losses_ref": -0.0017416279297322035, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3470, "u": -3.615842342376709, "weight": 0.04664891958236694 }, { "diff_generated": -25.569656372070312, "epoch": 1.1276733635774465, "grad_norm": 2.908672299139381, "learning_rate": 6.284274160408812e-07, "logits/chosen": -2.40313720703125, "logits/rejected": -2.4013543128967285, "logps/chosen": -16.56866455078125, "logps/rejected": -483.9769592285156, "logps_avg/chosen": -0.09553743898868561, "logps_avg/rejected": -2.5569653511047363, "loss": 0.1003, "losses_ref": -0.0011046285508200526, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3480, "u": -3.4805283546447754, "weight": 0.08319718390703201 }, { "diff_generated": -23.765193939208984, "epoch": 1.1309138042773816, "grad_norm": 2.96216221046641, "learning_rate": 6.271877055009284e-07, "logits/chosen": -2.4011592864990234, "logits/rejected": -2.4146828651428223, "logps/chosen": -18.33474349975586, "logps/rejected": -423.14990234375, "logps_avg/chosen": -0.10465750843286514, "logps_avg/rejected": -2.3765194416046143, "loss": 0.1043, "losses_ref": -0.0013917352771386504, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3490, "u": -3.595651626586914, "weight": 0.052030790597200394 }, { "diff_generated": -25.037397384643555, "epoch": 1.134154244977317, "grad_norm": 2.630923168673329, "learning_rate": 6.259447650873236e-07, "logits/chosen": -2.4729576110839844, "logits/rejected": -2.468740224838257, "logps/chosen": -16.419998168945312, "logps/rejected": -451.05816650390625, "logps_avg/chosen": -0.09578843414783478, "logps_avg/rejected": -2.503739833831787, "loss": 0.105, "losses_ref": -0.0009075348498299718, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3500, "u": -3.5995700359344482, "weight": 0.05147087574005127 }, { "diff_generated": -23.72597885131836, "epoch": 1.137394685677252, "grad_norm": 2.6716976448304885, "learning_rate": 6.246986124706555e-07, "logits/chosen": -2.3940160274505615, "logits/rejected": -2.427333116531372, "logps/chosen": -19.011890411376953, "logps/rejected": -453.88067626953125, "logps_avg/chosen": -0.10743583738803864, "logps_avg/rejected": -2.372598171234131, "loss": 0.1028, "losses_ref": -0.0011909648310393095, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3510, "u": -3.5484108924865723, "weight": 0.06427115201950073 }, { "diff_generated": -24.411705017089844, "epoch": 1.1406351263771872, "grad_norm": 2.6436120624467687, "learning_rate": 6.234492653671797e-07, "logits/chosen": -2.4433653354644775, "logits/rejected": -2.4274849891662598, "logps/chosen": -19.20479393005371, "logps/rejected": -426.07843017578125, "logps_avg/chosen": -0.10876087099313736, "logps_avg/rejected": -2.4411709308624268, "loss": 0.1029, "losses_ref": -0.0012901790905743837, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3520, "u": -3.600743055343628, "weight": 0.05254561826586723 }, { "diff_generated": -23.339073181152344, "epoch": 1.1438755670771226, "grad_norm": 3.032839087746657, "learning_rate": 6.221967415385675e-07, "logits/chosen": -2.451753616333008, "logits/rejected": -2.4012863636016846, "logps/chosen": -18.745807647705078, "logps/rejected": -415.9112243652344, "logps_avg/chosen": -0.10285329818725586, "logps_avg/rejected": -2.333907127380371, "loss": 0.106, "losses_ref": -0.0013694807421416044, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3530, "u": -3.5984578132629395, "weight": 0.05219249799847603 }, { "diff_generated": -24.094181060791016, "epoch": 1.1471160077770577, "grad_norm": 2.739604298054967, "learning_rate": 6.209410587916524e-07, "logits/chosen": -2.3793702125549316, "logits/rejected": -2.333580255508423, "logps/chosen": -19.96246910095215, "logps/rejected": -401.35626220703125, "logps_avg/chosen": -0.1089554876089096, "logps_avg/rejected": -2.4094183444976807, "loss": 0.1037, "losses_ref": -0.0021559642627835274, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3540, "u": -3.6416687965393066, "weight": 0.04153052344918251 }, { "diff_generated": -23.563488006591797, "epoch": 1.1503564484769928, "grad_norm": 2.8511259194261727, "learning_rate": 6.196822349781781e-07, "logits/chosen": -2.404599189758301, "logits/rejected": -2.3727211952209473, "logps/chosen": -19.58984375, "logps/rejected": -400.2362365722656, "logps_avg/chosen": -0.10796210914850235, "logps_avg/rejected": -2.3563485145568848, "loss": 0.105, "losses_ref": -0.000914513599127531, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3550, "u": -3.6716854572296143, "weight": 0.032539792358875275 }, { "diff_generated": -25.360666275024414, "epoch": 1.1535968891769282, "grad_norm": 2.4611237466061575, "learning_rate": 6.184202879945437e-07, "logits/chosen": -2.3852033615112305, "logits/rejected": -2.3722081184387207, "logps/chosen": -18.83804702758789, "logps/rejected": -429.45111083984375, "logps_avg/chosen": -0.10900785773992538, "logps_avg/rejected": -2.5360665321350098, "loss": 0.1049, "losses_ref": -0.0009189220145344734, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3560, "u": -3.4985721111297607, "weight": 0.0763770267367363 }, { "diff_generated": -23.74805450439453, "epoch": 1.1568373298768633, "grad_norm": 2.539078034873699, "learning_rate": 6.171552357815497e-07, "logits/chosen": -2.3657662868499756, "logits/rejected": -2.409804105758667, "logps/chosen": -17.804948806762695, "logps/rejected": -431.1031188964844, "logps_avg/chosen": -0.10777918249368668, "logps_avg/rejected": -2.374805450439453, "loss": 0.1025, "losses_ref": -0.0020144102163612843, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3570, "u": -3.688842296600342, "weight": 0.028050964698195457 }, { "diff_generated": -23.931705474853516, "epoch": 1.1600777705767984, "grad_norm": 2.764114064987785, "learning_rate": 6.15887096324143e-07, "logits/chosen": -2.4409403800964355, "logits/rejected": -2.4246106147766113, "logps/chosen": -19.8535213470459, "logps/rejected": -418.84710693359375, "logps_avg/chosen": -0.10594918578863144, "logps_avg/rejected": -2.3931703567504883, "loss": 0.1025, "losses_ref": -0.0015651138965040445, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3580, "u": -3.5757503509521484, "weight": 0.05872585251927376 }, { "diff_generated": -24.774555206298828, "epoch": 1.1633182112767337, "grad_norm": 2.49708903723434, "learning_rate": 6.14615887651161e-07, "logits/chosen": -2.4068663120269775, "logits/rejected": -2.4403574466705322, "logps/chosen": -16.835186004638672, "logps/rejected": -434.1429138183594, "logps_avg/chosen": -0.09970332682132721, "logps_avg/rejected": -2.4774553775787354, "loss": 0.1003, "losses_ref": -0.0007696760585531592, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3590, "u": -3.5268020629882812, "weight": 0.06986421346664429 }, { "diff_generated": -24.963159561157227, "epoch": 1.1665586519766689, "grad_norm": 2.73474011135887, "learning_rate": 6.133416278350756e-07, "logits/chosen": -2.4077696800231934, "logits/rejected": -2.391592502593994, "logps/chosen": -17.72994613647461, "logps/rejected": -439.46258544921875, "logps_avg/chosen": -0.1045701652765274, "logps_avg/rejected": -2.4963157176971436, "loss": 0.1018, "losses_ref": -0.0007292412337847054, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3600, "u": -3.5760302543640137, "weight": 0.05725475028157234 }, { "diff_generated": -23.916915893554688, "epoch": 1.169799092676604, "grad_norm": 2.704649464362536, "learning_rate": 6.120643349917359e-07, "logits/chosen": -2.429378032684326, "logits/rejected": -2.3929200172424316, "logps/chosen": -17.818084716796875, "logps/rejected": -440.4522399902344, "logps_avg/chosen": -0.09617959707975388, "logps_avg/rejected": -2.3916916847229004, "loss": 0.1017, "losses_ref": -0.0010255600791424513, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3610, "u": -3.59245228767395, "weight": 0.05164354294538498 }, { "diff_generated": -25.387792587280273, "epoch": 1.173039533376539, "grad_norm": 2.6381580124499404, "learning_rate": 6.107840272801108e-07, "logits/chosen": -2.4290518760681152, "logits/rejected": -2.3904924392700195, "logps/chosen": -19.25713539123535, "logps/rejected": -442.5533752441406, "logps_avg/chosen": -0.10800061374902725, "logps_avg/rejected": -2.5387792587280273, "loss": 0.1041, "losses_ref": -0.0016010403633117676, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3620, "u": -3.624995708465576, "weight": 0.04630355164408684 }, { "diff_generated": -24.498355865478516, "epoch": 1.1762799740764744, "grad_norm": 2.6553993227399504, "learning_rate": 6.095007229020311e-07, "logits/chosen": -2.40181040763855, "logits/rejected": -2.434415817260742, "logps/chosen": -16.159542083740234, "logps/rejected": -464.08544921875, "logps_avg/chosen": -0.09530286490917206, "logps_avg/rejected": -2.4498355388641357, "loss": 0.103, "losses_ref": -0.0005648103542625904, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3630, "u": -3.5751399993896484, "weight": 0.0569901242852211 }, { "diff_generated": -25.366138458251953, "epoch": 1.1795204147764096, "grad_norm": 2.6255957192728454, "learning_rate": 6.082144401019304e-07, "logits/chosen": -2.42928409576416, "logits/rejected": -2.3760671615600586, "logps/chosen": -18.25739097595215, "logps/rejected": -442.1548767089844, "logps_avg/chosen": -0.10382506996393204, "logps_avg/rejected": -2.536613941192627, "loss": 0.101, "losses_ref": -0.0005856143543496728, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3640, "u": -3.5992000102996826, "weight": 0.05081767961382866 }, { "diff_generated": -24.045963287353516, "epoch": 1.1827608554763447, "grad_norm": 2.662881729483114, "learning_rate": 6.069251971665857e-07, "logits/chosen": -2.36564302444458, "logits/rejected": -2.3937220573425293, "logps/chosen": -18.25554847717285, "logps/rejected": -465.24969482421875, "logps_avg/chosen": -0.10365450382232666, "logps_avg/rejected": -2.4045963287353516, "loss": 0.1026, "losses_ref": -0.0008571479702368379, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3650, "u": -3.525778293609619, "weight": 0.06990309059619904 }, { "diff_generated": -25.264225006103516, "epoch": 1.18600129617628, "grad_norm": 2.7684940174192825, "learning_rate": 6.056330124248576e-07, "logits/chosen": -2.3799309730529785, "logits/rejected": -2.4361531734466553, "logps/chosen": -16.044706344604492, "logps/rejected": -469.60760498046875, "logps_avg/chosen": -0.09844937920570374, "logps_avg/rejected": -2.5264222621917725, "loss": 0.102, "losses_ref": -0.0006588260876014829, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3660, "u": -3.670767307281494, "weight": 0.032099511474370956 }, { "diff_generated": -23.373096466064453, "epoch": 1.1892417368762151, "grad_norm": 2.5773576512927594, "learning_rate": 6.043379042474297e-07, "logits/chosen": -2.3959336280822754, "logits/rejected": -2.4232144355773926, "logps/chosen": -19.539710998535156, "logps/rejected": -443.3605041503906, "logps_avg/chosen": -0.1098468154668808, "logps_avg/rejected": -2.3373095989227295, "loss": 0.1012, "losses_ref": -0.001127001247368753, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3670, "u": -3.599931240081787, "weight": 0.05171620100736618 }, { "diff_generated": -24.989749908447266, "epoch": 1.1924821775761503, "grad_norm": 2.6554402069227088, "learning_rate": 6.030398910465475e-07, "logits/chosen": -2.358093023300171, "logits/rejected": -2.3458518981933594, "logps/chosen": -17.702842712402344, "logps/rejected": -471.4501953125, "logps_avg/chosen": -0.10465174913406372, "logps_avg/rejected": -2.4989750385284424, "loss": 0.1054, "losses_ref": -0.0005576363764703274, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3680, "u": -3.5466511249542236, "weight": 0.06327757984399796 }, { "diff_generated": -25.277515411376953, "epoch": 1.1957226182760856, "grad_norm": 2.6168702606831933, "learning_rate": 6.017389912757561e-07, "logits/chosen": -2.4333736896514893, "logits/rejected": -2.441929578781128, "logps/chosen": -16.576610565185547, "logps/rejected": -475.36920166015625, "logps_avg/chosen": -0.09709908068180084, "logps_avg/rejected": -2.5277514457702637, "loss": 0.1004, "losses_ref": -0.0004923694650642574, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3690, "u": -3.552541732788086, "weight": 0.06316892802715302 }, { "diff_generated": -24.737037658691406, "epoch": 1.1989630589760207, "grad_norm": 2.5817750174964824, "learning_rate": 6.004352234296389e-07, "logits/chosen": -2.3979382514953613, "logits/rejected": -2.369443416595459, "logps/chosen": -19.797863006591797, "logps/rejected": -437.35369873046875, "logps_avg/chosen": -0.10848965495824814, "logps_avg/rejected": -2.4737040996551514, "loss": 0.107, "losses_ref": -0.0006861963192932308, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3700, "u": -3.524837017059326, "weight": 0.06967984139919281 }, { "diff_generated": -24.86382484436035, "epoch": 1.2022034996759559, "grad_norm": 2.517128501919303, "learning_rate": 5.991286060435536e-07, "logits/chosen": -2.3625564575195312, "logits/rejected": -2.312837600708008, "logps/chosen": -19.742130279541016, "logps/rejected": -429.26092529296875, "logps_avg/chosen": -0.10398830473423004, "logps_avg/rejected": -2.486382484436035, "loss": 0.104, "losses_ref": -0.0011359945638105273, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3710, "u": -3.4778130054473877, "weight": 0.08374304324388504 }, { "diff_generated": -24.32589340209961, "epoch": 1.2054439403758912, "grad_norm": 2.7384186952104748, "learning_rate": 5.978191576933692e-07, "logits/chosen": -2.3719379901885986, "logits/rejected": -2.3585941791534424, "logps/chosen": -17.426773071289062, "logps/rejected": -442.98077392578125, "logps_avg/chosen": -0.09669273346662521, "logps_avg/rejected": -2.432589054107666, "loss": 0.1024, "losses_ref": -0.000999335665255785, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3720, "u": -3.5731139183044434, "weight": 0.057646702975034714 }, { "diff_generated": -25.280864715576172, "epoch": 1.2086843810758263, "grad_norm": 2.5733058201722634, "learning_rate": 5.965068969952017e-07, "logits/chosen": -2.408519983291626, "logits/rejected": -2.411043405532837, "logps/chosen": -18.149112701416016, "logps/rejected": -454.8963317871094, "logps_avg/chosen": -0.1049225777387619, "logps_avg/rejected": -2.5280864238739014, "loss": 0.1002, "losses_ref": -0.0010424638167023659, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3730, "u": -3.6665706634521484, "weight": 0.032865602523088455 }, { "diff_generated": -26.70073890686035, "epoch": 1.2119248217757614, "grad_norm": 2.601747780328003, "learning_rate": 5.951918426051502e-07, "logits/chosen": -2.4358088970184326, "logits/rejected": -2.401461124420166, "logps/chosen": -14.340431213378906, "logps/rejected": -465.8653259277344, "logps_avg/chosen": -0.08685021847486496, "logps_avg/rejected": -2.670073986053467, "loss": 0.1012, "losses_ref": -0.0008388949790969491, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3740, "u": -3.5967020988464355, "weight": 0.051382988691329956 }, { "diff_generated": -25.41323471069336, "epoch": 1.2151652624756968, "grad_norm": 2.5745745278867975, "learning_rate": 5.938740132190306e-07, "logits/chosen": -2.381037712097168, "logits/rejected": -2.3758397102355957, "logps/chosen": -17.276548385620117, "logps/rejected": -461.35028076171875, "logps_avg/chosen": -0.09561358392238617, "logps_avg/rejected": -2.541323661804199, "loss": 0.1053, "losses_ref": -0.0006424171733669937, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3750, "u": -3.623504161834717, "weight": 0.04461907222867012 }, { "diff_generated": -26.197582244873047, "epoch": 1.218405703175632, "grad_norm": 2.4934283986348493, "learning_rate": 5.9255342757211e-07, "logits/chosen": -2.405440092086792, "logits/rejected": -2.415987968444824, "logps/chosen": -17.704856872558594, "logps/rejected": -454.7588806152344, "logps_avg/chosen": -0.1022186279296875, "logps_avg/rejected": -2.619758129119873, "loss": 0.1008, "losses_ref": -0.0006152585265226662, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3760, "u": -3.693714141845703, "weight": 0.02580828033387661 }, { "diff_generated": -26.245763778686523, "epoch": 1.221646143875567, "grad_norm": 2.8453632401755886, "learning_rate": 5.91230104438841e-07, "logits/chosen": -2.414271593093872, "logits/rejected": -2.3170933723449707, "logps/chosen": -19.21923065185547, "logps/rejected": -435.68133544921875, "logps_avg/chosen": -0.10188715159893036, "logps_avg/rejected": -2.6245763301849365, "loss": 0.1015, "losses_ref": -0.0007117214845493436, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3770, "u": -3.5293922424316406, "weight": 0.0698995441198349 }, { "diff_generated": -24.69413185119629, "epoch": 1.2248865845755024, "grad_norm": 2.531053854092759, "learning_rate": 5.899040626325945e-07, "logits/chosen": -2.432494640350342, "logits/rejected": -2.3904690742492676, "logps/chosen": -17.50114631652832, "logps/rejected": -440.77154541015625, "logps_avg/chosen": -0.1005210131406784, "logps_avg/rejected": -2.4694130420684814, "loss": 0.103, "losses_ref": -0.0005105865420773625, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3780, "u": -3.5267860889434814, "weight": 0.06939880549907684 }, { "diff_generated": -25.48592758178711, "epoch": 1.2281270252754375, "grad_norm": 2.768212237607484, "learning_rate": 5.885753210053917e-07, "logits/chosen": -2.4312968254089355, "logits/rejected": -2.398667335510254, "logps/chosen": -18.666072845458984, "logps/rejected": -453.98492431640625, "logps_avg/chosen": -0.1044863611459732, "logps_avg/rejected": -2.5485928058624268, "loss": 0.106, "losses_ref": -0.00037497709854505956, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3790, "u": -3.553412675857544, "weight": 0.06297777593135834 }, { "diff_generated": -28.831745147705078, "epoch": 1.2313674659753726, "grad_norm": 2.788790523416712, "learning_rate": 5.872438984476368e-07, "logits/chosen": -2.42059588432312, "logits/rejected": -2.3469738960266113, "logps/chosen": -19.486249923706055, "logps/rejected": -455.1859436035156, "logps_avg/chosen": -0.11247433722019196, "logps_avg/rejected": -2.883174419403076, "loss": 0.1045, "losses_ref": -0.001660289941355586, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3800, "u": -3.6212081909179688, "weight": 0.046906400471925735 }, { "diff_generated": -24.617816925048828, "epoch": 1.2346079066753077, "grad_norm": 2.454521001860937, "learning_rate": 5.859098138878482e-07, "logits/chosen": -2.4189045429229736, "logits/rejected": -2.3941142559051514, "logps/chosen": -20.42704200744629, "logps/rejected": -439.0167541503906, "logps_avg/chosen": -0.11315326392650604, "logps_avg/rejected": -2.4617817401885986, "loss": 0.1036, "losses_ref": -0.0006618654588237405, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3810, "u": -3.6706244945526123, "weight": 0.0321231447160244 }, { "diff_generated": -25.22861671447754, "epoch": 1.237848347375243, "grad_norm": 2.678018217467591, "learning_rate": 5.845730862923889e-07, "logits/chosen": -2.370227098464966, "logits/rejected": -2.3375167846679688, "logps/chosen": -19.023059844970703, "logps/rejected": -453.9886169433594, "logps_avg/chosen": -0.10912897437810898, "logps_avg/rejected": -2.5228614807128906, "loss": 0.1036, "losses_ref": -0.0008484205463901162, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3820, "u": -3.552034854888916, "weight": 0.06384466588497162 }, { "diff_generated": -26.32217788696289, "epoch": 1.2410887880751782, "grad_norm": 2.722738395369602, "learning_rate": 5.83233734665198e-07, "logits/chosen": -2.3956446647644043, "logits/rejected": -2.3501551151275635, "logps/chosen": -17.891752243041992, "logps/rejected": -444.76263427734375, "logps_avg/chosen": -0.10333029925823212, "logps_avg/rejected": -2.6322176456451416, "loss": 0.1015, "losses_ref": -0.0008470058673992753, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3830, "u": -3.6235289573669434, "weight": 0.04495188593864441 }, { "diff_generated": -26.583011627197266, "epoch": 1.2443292287751135, "grad_norm": 2.3441085033146565, "learning_rate": 5.818917780475196e-07, "logits/chosen": -2.4250235557556152, "logits/rejected": -2.425710678100586, "logps/chosen": -21.300302505493164, "logps/rejected": -467.77203369140625, "logps_avg/chosen": -0.11703227460384369, "logps_avg/rejected": -2.6583011150360107, "loss": 0.1033, "losses_ref": -0.0011255014687776566, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3840, "u": -3.7144665718078613, "weight": 0.020351290702819824 }, { "diff_generated": -24.3111629486084, "epoch": 1.2475696694750487, "grad_norm": 2.7823770770996576, "learning_rate": 5.805472355176318e-07, "logits/chosen": -2.446390390396118, "logits/rejected": -2.4117252826690674, "logps/chosen": -18.355083465576172, "logps/rejected": -452.235107421875, "logps_avg/chosen": -0.10057584196329117, "logps_avg/rejected": -2.4311161041259766, "loss": 0.1024, "losses_ref": -0.00039589227526448667, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3850, "u": -3.5967726707458496, "weight": 0.05048195645213127 }, { "diff_generated": -23.1942081451416, "epoch": 1.2508101101749838, "grad_norm": 2.5046015246562234, "learning_rate": 5.792001261905767e-07, "logits/chosen": -2.410804271697998, "logits/rejected": -2.4035861492156982, "logps/chosen": -17.919591903686523, "logps/rejected": -415.97052001953125, "logps_avg/chosen": -0.10415836423635483, "logps_avg/rejected": -2.319420576095581, "loss": 0.0987, "losses_ref": -0.0004912324948236346, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3860, "u": -3.4360504150390625, "weight": 0.0943656861782074 }, { "diff_generated": -23.43748664855957, "epoch": 1.254050550874919, "grad_norm": 2.667129273327817, "learning_rate": 5.778504692178876e-07, "logits/chosen": -2.3880181312561035, "logits/rejected": -2.438152551651001, "logps/chosen": -16.907007217407227, "logps/rejected": -435.5437927246094, "logps_avg/chosen": -0.09878456592559814, "logps_avg/rejected": -2.3437483310699463, "loss": 0.1008, "losses_ref": -0.0016443884233012795, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3870, "u": -3.505042314529419, "weight": 0.07795653492212296 }, { "diff_generated": -25.2240047454834, "epoch": 1.2572909915748542, "grad_norm": 2.5542328005408907, "learning_rate": 5.76498283787317e-07, "logits/chosen": -2.403738498687744, "logits/rejected": -2.368741273880005, "logps/chosen": -17.75848960876465, "logps/rejected": -440.37982177734375, "logps_avg/chosen": -0.09870745241641998, "logps_avg/rejected": -2.522400379180908, "loss": 0.1013, "losses_ref": -0.0002985192695632577, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3880, "u": -3.5994458198547363, "weight": 0.050349898636341095 }, { "diff_generated": -24.899044036865234, "epoch": 1.2605314322747894, "grad_norm": 2.7016093842722824, "learning_rate": 5.751435891225643e-07, "logits/chosen": -2.355231761932373, "logits/rejected": -2.37161922454834, "logps/chosen": -15.857172012329102, "logps/rejected": -432.68292236328125, "logps_avg/chosen": -0.09380076825618744, "logps_avg/rejected": -2.4899046421051025, "loss": 0.1008, "losses_ref": -0.000942692335229367, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3890, "u": -3.574742078781128, "weight": 0.057598210871219635 }, { "diff_generated": -25.746257781982422, "epoch": 1.2637718729747245, "grad_norm": 2.590886361364249, "learning_rate": 5.737864044830015e-07, "logits/chosen": -2.3851318359375, "logits/rejected": -2.355273962020874, "logps/chosen": -19.687891006469727, "logps/rejected": -454.10455322265625, "logps_avg/chosen": -0.1099676862359047, "logps_avg/rejected": -2.5746259689331055, "loss": 0.1035, "losses_ref": -0.0006579064065590501, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3900, "u": -3.5053353309631348, "weight": 0.07585734874010086 }, { "diff_generated": -24.64337158203125, "epoch": 1.2670123136746598, "grad_norm": 2.7276886008169754, "learning_rate": 5.724267491634006e-07, "logits/chosen": -2.3683695793151855, "logits/rejected": -2.3457908630371094, "logps/chosen": -17.896350860595703, "logps/rejected": -466.97198486328125, "logps_avg/chosen": -0.09884298592805862, "logps_avg/rejected": -2.464337110519409, "loss": 0.1017, "losses_ref": -0.0007082788506522775, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3910, "u": -3.6250851154327393, "weight": 0.044809214770793915 }, { "diff_generated": -25.45005989074707, "epoch": 1.270252754374595, "grad_norm": 2.3372799665605837, "learning_rate": 5.710646424936581e-07, "logits/chosen": -2.4046618938446045, "logits/rejected": -2.378971576690674, "logps/chosen": -19.94418716430664, "logps/rejected": -448.55194091796875, "logps_avg/chosen": -0.1066654697060585, "logps_avg/rejected": -2.545006036758423, "loss": 0.1037, "losses_ref": -0.0008164637838490307, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3920, "u": -3.6216578483581543, "weight": 0.04495164006948471 }, { "diff_generated": -27.618610382080078, "epoch": 1.27349319507453, "grad_norm": 2.7112289690219784, "learning_rate": 5.697001038385212e-07, "logits/chosen": -2.361335277557373, "logits/rejected": -2.328921318054199, "logps/chosen": -19.196826934814453, "logps/rejected": -480.14007568359375, "logps_avg/chosen": -0.10564608871936798, "logps_avg/rejected": -2.7618608474731445, "loss": 0.1036, "losses_ref": -0.000225507072173059, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3930, "u": -3.621340274810791, "weight": 0.04401098191738129 }, { "diff_generated": -24.551250457763672, "epoch": 1.2767336357744652, "grad_norm": 2.8313354420297157, "learning_rate": 5.683331525973118e-07, "logits/chosen": -2.3341686725616455, "logits/rejected": -2.3362679481506348, "logps/chosen": -19.423187255859375, "logps/rejected": -442.8011169433594, "logps_avg/chosen": -0.10974836349487305, "logps_avg/rejected": -2.455125093460083, "loss": 0.1058, "losses_ref": -0.000763989461120218, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3940, "u": -3.5773825645446777, "weight": 0.05737464502453804 }, { "diff_generated": -26.735946655273438, "epoch": 1.2799740764744005, "grad_norm": 2.794923295177927, "learning_rate": 5.66963808203651e-07, "logits/chosen": -2.391003131866455, "logits/rejected": -2.3569178581237793, "logps/chosen": -18.766311645507812, "logps/rejected": -493.0743103027344, "logps_avg/chosen": -0.11267992109060287, "logps_avg/rejected": -2.6735944747924805, "loss": 0.1037, "losses_ref": -0.0009607706451788545, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3950, "u": -3.669952392578125, "weight": 0.032738275825977325 }, { "diff_generated": -25.84634017944336, "epoch": 1.2832145171743357, "grad_norm": 2.8018447875503116, "learning_rate": 5.65592090125183e-07, "logits/chosen": -2.3674049377441406, "logits/rejected": -2.370068073272705, "logps/chosen": -13.854530334472656, "logps/rejected": -458.9341735839844, "logps_avg/chosen": -0.08643799275159836, "logps_avg/rejected": -2.584634304046631, "loss": 0.0986, "losses_ref": -0.000706795952282846, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3960, "u": -3.499607801437378, "weight": 0.07594309747219086 }, { "diff_generated": -24.824703216552734, "epoch": 1.286454957874271, "grad_norm": 2.668284886295544, "learning_rate": 5.642180178632977e-07, "logits/chosen": -2.3868956565856934, "logits/rejected": -2.383216381072998, "logps/chosen": -17.2413330078125, "logps/rejected": -467.44232177734375, "logps_avg/chosen": -0.1045045480132103, "logps_avg/rejected": -2.482470750808716, "loss": 0.102, "losses_ref": -0.0006459239521063864, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3970, "u": -3.624206066131592, "weight": 0.04473030939698219 }, { "diff_generated": -27.599124908447266, "epoch": 1.2896953985742061, "grad_norm": 2.7179931685781096, "learning_rate": 5.628416109528542e-07, "logits/chosen": -2.3622069358825684, "logits/rejected": -2.3574976921081543, "logps/chosen": -17.356678009033203, "logps/rejected": -505.03668212890625, "logps_avg/chosen": -0.09987537562847137, "logps_avg/rejected": -2.7599120140075684, "loss": 0.1058, "losses_ref": -0.0009466443443670869, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3980, "u": -3.5990073680877686, "weight": 0.051428310573101044 }, { "diff_generated": -26.108234405517578, "epoch": 1.2929358392741412, "grad_norm": 2.4210253725599857, "learning_rate": 5.614628889619029e-07, "logits/chosen": -2.3183486461639404, "logits/rejected": -2.35087513923645, "logps/chosen": -17.084163665771484, "logps/rejected": -474.3221740722656, "logps_avg/chosen": -0.10216245800256729, "logps_avg/rejected": -2.610823154449463, "loss": 0.1032, "losses_ref": -0.0010031044948846102, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3990, "u": -3.596558094024658, "weight": 0.05139036104083061 }, { "diff_generated": -24.630847930908203, "epoch": 1.2961762799740764, "grad_norm": 2.858796440896099, "learning_rate": 5.600818714914065e-07, "logits/chosen": -2.377168893814087, "logits/rejected": -2.3589062690734863, "logps/chosen": -20.87276840209961, "logps/rejected": -453.269287109375, "logps_avg/chosen": -0.1113833412528038, "logps_avg/rejected": -2.4630846977233887, "loss": 0.107, "losses_ref": -0.0004416326410137117, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4000, "u": -3.577017307281494, "weight": 0.05680033564567566 }, { "diff_generated": -27.36174964904785, "epoch": 1.2994167206740117, "grad_norm": 2.741777502121723, "learning_rate": 5.586985781749625e-07, "logits/chosen": -2.400063991546631, "logits/rejected": -2.4467995166778564, "logps/chosen": -17.76604461669922, "logps/rejected": -507.6504821777344, "logps_avg/chosen": -0.10631255805492401, "logps_avg/rejected": -2.736175060272217, "loss": 0.1008, "losses_ref": -0.0009669626015238464, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4010, "u": -3.6947712898254395, "weight": 0.026510408148169518 }, { "diff_generated": -24.537931442260742, "epoch": 1.3026571613739468, "grad_norm": 2.4445506760186744, "learning_rate": 5.573130286785237e-07, "logits/chosen": -2.429718017578125, "logits/rejected": -2.3343007564544678, "logps/chosen": -18.747915267944336, "logps/rejected": -424.55841064453125, "logps_avg/chosen": -0.097903311252594, "logps_avg/rejected": -2.4537932872772217, "loss": 0.1011, "losses_ref": -0.00035963323898613453, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4020, "u": -3.5008933544158936, "weight": 0.0754479244351387 }, { "diff_generated": -25.826641082763672, "epoch": 1.3058976020738822, "grad_norm": 2.7129845467722435, "learning_rate": 5.559252427001178e-07, "logits/chosen": -2.3235254287719727, "logits/rejected": -2.312380075454712, "logps/chosen": -18.90648651123047, "logps/rejected": -452.123046875, "logps_avg/chosen": -0.1057731956243515, "logps_avg/rejected": -2.5826640129089355, "loss": 0.1013, "losses_ref": -0.0019954966846853495, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4030, "u": -3.547513961791992, "weight": 0.06588520109653473 }, { "diff_generated": -27.287891387939453, "epoch": 1.3091380427738173, "grad_norm": 2.6551302568919963, "learning_rate": 5.545352399695687e-07, "logits/chosen": -2.360625982284546, "logits/rejected": -2.3697447776794434, "logps/chosen": -17.551830291748047, "logps/rejected": -478.44818115234375, "logps_avg/chosen": -0.10685201734304428, "logps_avg/rejected": -2.7287890911102295, "loss": 0.1034, "losses_ref": -0.0005914900102652609, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4040, "u": -3.6493136882781982, "weight": 0.0383087582886219 }, { "diff_generated": -26.916757583618164, "epoch": 1.3123784834737524, "grad_norm": 2.463094256206414, "learning_rate": 5.531430402482153e-07, "logits/chosen": -2.3738372325897217, "logits/rejected": -2.3218994140625, "logps/chosen": -18.182100296020508, "logps/rejected": -479.9534606933594, "logps_avg/chosen": -0.10409387201070786, "logps_avg/rejected": -2.6916756629943848, "loss": 0.101, "losses_ref": -0.0010314477840438485, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4050, "u": -3.5255794525146484, "weight": 0.07031405717134476 }, { "diff_generated": -25.09537124633789, "epoch": 1.3156189241736875, "grad_norm": 2.394567509090008, "learning_rate": 5.517486633286299e-07, "logits/chosen": -2.3621907234191895, "logits/rejected": -2.3411543369293213, "logps/chosen": -18.44365119934082, "logps/rejected": -482.3141174316406, "logps_avg/chosen": -0.09819479286670685, "logps_avg/rejected": -2.5095372200012207, "loss": 0.101, "losses_ref": -0.0005044209538027644, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4060, "u": -3.524848222732544, "weight": 0.06937181949615479 }, { "diff_generated": -27.66803550720215, "epoch": 1.3188593648736229, "grad_norm": 2.4050570067723416, "learning_rate": 5.503521290343384e-07, "logits/chosen": -2.400895595550537, "logits/rejected": -2.377678155899048, "logps/chosen": -19.440929412841797, "logps/rejected": -528.3328857421875, "logps_avg/chosen": -0.10858882963657379, "logps_avg/rejected": -2.7668039798736572, "loss": 0.1032, "losses_ref": -0.0008151186630129814, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4070, "u": -3.669245958328247, "weight": 0.03257065638899803 }, { "diff_generated": -26.13507652282715, "epoch": 1.322099805573558, "grad_norm": 2.756233667724102, "learning_rate": 5.489534572195373e-07, "logits/chosen": -2.3385581970214844, "logits/rejected": -2.392242193222046, "logps/chosen": -15.599695205688477, "logps/rejected": -485.8387756347656, "logps_avg/chosen": -0.09932243078947067, "logps_avg/rejected": -2.6135077476501465, "loss": 0.1014, "losses_ref": -0.0006077963043935597, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4080, "u": -3.4798247814178467, "weight": 0.08210951834917068 }, { "diff_generated": -27.595714569091797, "epoch": 1.3253402462734931, "grad_norm": 2.6932022364658033, "learning_rate": 5.47552667768811e-07, "logits/chosen": -2.347517490386963, "logits/rejected": -2.3942277431488037, "logps/chosen": -15.13756275177002, "logps/rejected": -492.9637145996094, "logps_avg/chosen": -0.09007562696933746, "logps_avg/rejected": -2.7595715522766113, "loss": 0.1035, "losses_ref": -0.0006332875927910209, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4090, "u": -3.6677443981170654, "weight": 0.0321536622941494 }, { "diff_generated": -27.55939292907715, "epoch": 1.3285806869734285, "grad_norm": 2.4145917904418317, "learning_rate": 5.46149780596851e-07, "logits/chosen": -2.388737201690674, "logits/rejected": -2.415613889694214, "logps/chosen": -17.241397857666016, "logps/rejected": -492.285888671875, "logps_avg/chosen": -0.1004859209060669, "logps_avg/rejected": -2.75593900680542, "loss": 0.1026, "losses_ref": -0.0004385868087410927, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4100, "u": -3.6940712928771973, "weight": 0.025556892156600952 }, { "diff_generated": -28.32535743713379, "epoch": 1.3318211276733636, "grad_norm": 2.7341334751798336, "learning_rate": 5.447448156481708e-07, "logits/chosen": -2.4162185192108154, "logits/rejected": -2.3701248168945312, "logps/chosen": -16.101211547851562, "logps/rejected": -506.6935119628906, "logps_avg/chosen": -0.09412642568349838, "logps_avg/rejected": -2.832535743713379, "loss": 0.1017, "losses_ref": -0.000632374722044915, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4110, "u": -3.6228606700897217, "weight": 0.044598862528800964 }, { "diff_generated": -26.807104110717773, "epoch": 1.3350615683732987, "grad_norm": 2.568298597173405, "learning_rate": 5.433377928968234e-07, "logits/chosen": -2.41631817817688, "logits/rejected": -2.374326705932617, "logps/chosen": -19.040271759033203, "logps/rejected": -466.0682678222656, "logps_avg/chosen": -0.10614614188671112, "logps_avg/rejected": -2.6807103157043457, "loss": 0.1031, "losses_ref": -0.0004517110646702349, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4120, "u": -3.6942248344421387, "weight": 0.025584762915968895 }, { "diff_generated": -26.3829345703125, "epoch": 1.3383020090732338, "grad_norm": 2.5147286408642318, "learning_rate": 5.41928732346117e-07, "logits/chosen": -2.3779680728912354, "logits/rejected": -2.329960346221924, "logps/chosen": -18.907283782958984, "logps/rejected": -471.4302673339844, "logps_avg/chosen": -0.10900212824344635, "logps_avg/rejected": -2.638293504714966, "loss": 0.1025, "losses_ref": -0.0008699931204319, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4130, "u": -3.645224094390869, "weight": 0.038796357810497284 }, { "diff_generated": -26.643444061279297, "epoch": 1.3415424497731692, "grad_norm": 2.3922661548115394, "learning_rate": 5.405176540283311e-07, "logits/chosen": -2.365628480911255, "logits/rejected": -2.3224751949310303, "logps/chosen": -18.312501907348633, "logps/rejected": -505.20098876953125, "logps_avg/chosen": -0.09830234944820404, "logps_avg/rejected": -2.6643447875976562, "loss": 0.1025, "losses_ref": -0.0002646732027642429, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4140, "u": -3.504613161087036, "weight": 0.07531232386827469 }, { "diff_generated": -26.2756290435791, "epoch": 1.3447828904731043, "grad_norm": 2.5777576798673714, "learning_rate": 5.391045780044308e-07, "logits/chosen": -2.4077603816986084, "logits/rejected": -2.394535779953003, "logps/chosen": -18.91120719909668, "logps/rejected": -510.40521240234375, "logps_avg/chosen": -0.10508742183446884, "logps_avg/rejected": -2.6275627613067627, "loss": 0.1018, "losses_ref": -0.00047807503142394125, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4150, "u": -3.6172034740448, "weight": 0.044402316212654114 }, { "diff_generated": -28.211669921875, "epoch": 1.3480233311730396, "grad_norm": 2.594955434688894, "learning_rate": 5.376895243637823e-07, "logits/chosen": -2.370952844619751, "logits/rejected": -2.290727376937866, "logps/chosen": -19.83003807067871, "logps/rejected": -510.25714111328125, "logps_avg/chosen": -0.10845856368541718, "logps_avg/rejected": -2.8211669921875, "loss": 0.1042, "losses_ref": -0.0005390375154092908, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4160, "u": -3.504507541656494, "weight": 0.07578588277101517 }, { "diff_generated": -28.59592056274414, "epoch": 1.3512637718729748, "grad_norm": 2.6192143853640877, "learning_rate": 5.362725132238672e-07, "logits/chosen": -2.3691391944885254, "logits/rejected": -2.4109010696411133, "logps/chosen": -17.159189224243164, "logps/rejected": -558.8998413085938, "logps_avg/chosen": -0.10241004079580307, "logps_avg/rejected": -2.859591484069824, "loss": 0.104, "losses_ref": -0.0003753933997359127, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4170, "u": -3.6483771800994873, "weight": 0.0379684753715992 }, { "diff_generated": -26.94635581970215, "epoch": 1.3545042125729099, "grad_norm": 3.987238518571662, "learning_rate": 5.348535647299964e-07, "logits/chosen": -2.3350539207458496, "logits/rejected": -2.329770565032959, "logps/chosen": -17.921201705932617, "logps/rejected": -511.4327697753906, "logps_avg/chosen": -0.1062774658203125, "logps_avg/rejected": -2.6946353912353516, "loss": 0.103, "losses_ref": -0.0009642991935834289, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4180, "u": -3.670365571975708, "weight": 0.032805364578962326 }, { "diff_generated": -27.179916381835938, "epoch": 1.357744653272845, "grad_norm": 2.520799402886161, "learning_rate": 5.334326990550234e-07, "logits/chosen": -2.3830137252807617, "logits/rejected": -2.3622913360595703, "logps/chosen": -17.147418975830078, "logps/rejected": -505.0753479003906, "logps_avg/chosen": -0.09717821329832077, "logps_avg/rejected": -2.717991828918457, "loss": 0.0988, "losses_ref": -0.0002480837283656001, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4190, "u": -3.6003196239471436, "weight": 0.05031207948923111 }, { "diff_generated": -27.000635147094727, "epoch": 1.3609850939727803, "grad_norm": 2.5495151878301865, "learning_rate": 5.320099363990584e-07, "logits/chosen": -2.390007495880127, "logits/rejected": -2.3277785778045654, "logps/chosen": -18.247713088989258, "logps/rejected": -478.716796875, "logps_avg/chosen": -0.10145987570285797, "logps_avg/rejected": -2.700063467025757, "loss": 0.0962, "losses_ref": -0.0004951705923303962, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4200, "u": -3.649203062057495, "weight": 0.03820186108350754 }, { "diff_generated": -27.969797134399414, "epoch": 1.3642255346727155, "grad_norm": 2.4410988251552315, "learning_rate": 5.305852969891799e-07, "logits/chosen": -2.4071946144104004, "logits/rejected": -2.3240163326263428, "logps/chosen": -18.725177764892578, "logps/rejected": -465.49053955078125, "logps_avg/chosen": -0.10521489381790161, "logps_avg/rejected": -2.796980142593384, "loss": 0.0991, "losses_ref": -0.0006577539024874568, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4210, "u": -3.5468764305114746, "weight": 0.0634467676281929 }, { "diff_generated": -25.992996215820312, "epoch": 1.3674659753726508, "grad_norm": 2.4988856155190424, "learning_rate": 5.29158801079148e-07, "logits/chosen": -2.3208134174346924, "logits/rejected": -2.2926430702209473, "logps/chosen": -17.967212677001953, "logps/rejected": -439.5184020996094, "logps_avg/chosen": -0.0993885025382042, "logps_avg/rejected": -2.599299192428589, "loss": 0.1009, "losses_ref": -0.0003469690855126828, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4220, "u": -3.42718505859375, "weight": 0.0941712036728859 }, { "diff_generated": -26.823211669921875, "epoch": 1.370706416072586, "grad_norm": 2.750645300373666, "learning_rate": 5.277304689491165e-07, "logits/chosen": -2.3862364292144775, "logits/rejected": -2.3794219493865967, "logps/chosen": -18.44593620300293, "logps/rejected": -465.5531311035156, "logps_avg/chosen": -0.10832039266824722, "logps_avg/rejected": -2.682321071624756, "loss": 0.1032, "losses_ref": -0.0004914809833280742, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4230, "u": -3.6443495750427246, "weight": 0.03814055770635605 }, { "diff_generated": -28.399890899658203, "epoch": 1.373946856772521, "grad_norm": 2.4662347731325136, "learning_rate": 5.26300320905344e-07, "logits/chosen": -2.3697032928466797, "logits/rejected": -2.345850706100464, "logps/chosen": -17.219181060791016, "logps/rejected": -479.1373596191406, "logps_avg/chosen": -0.10566030442714691, "logps_avg/rejected": -2.839988946914673, "loss": 0.1018, "losses_ref": -0.00043061329051852226, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4240, "u": -3.6740641593933105, "weight": 0.031836919486522675 }, { "diff_generated": -25.946186065673828, "epoch": 1.3771872974724562, "grad_norm": 2.5775217864622917, "learning_rate": 5.248683772799054e-07, "logits/chosen": -2.359295129776001, "logits/rejected": -2.305798053741455, "logps/chosen": -20.096813201904297, "logps/rejected": -470.4903259277344, "logps_avg/chosen": -0.10649679601192474, "logps_avg/rejected": -2.594618320465088, "loss": 0.1004, "losses_ref": -0.00032928684959188104, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4250, "u": -3.618279218673706, "weight": 0.04415220022201538 }, { "diff_generated": -25.108627319335938, "epoch": 1.3804277381723915, "grad_norm": 2.5376430727776853, "learning_rate": 5.234346584304033e-07, "logits/chosen": -2.3466343879699707, "logits/rejected": -2.3309569358825684, "logps/chosen": -17.8268985748291, "logps/rejected": -446.1979064941406, "logps_avg/chosen": -0.09819166362285614, "logps_avg/rejected": -2.510862350463867, "loss": 0.1023, "losses_ref": -0.0008442547405138612, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4260, "u": -3.5203120708465576, "weight": 0.07002700120210648 }, { "diff_generated": -25.804737091064453, "epoch": 1.3836681788723266, "grad_norm": 2.6515230074033322, "learning_rate": 5.21999184739678e-07, "logits/chosen": -2.3517873287200928, "logits/rejected": -2.3346188068389893, "logps/chosen": -20.65091323852539, "logps/rejected": -440.77716064453125, "logps_avg/chosen": -0.11302739381790161, "logps_avg/rejected": -2.5804734230041504, "loss": 0.1037, "losses_ref": -0.0011006726417690516, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4270, "u": -3.6943564414978027, "weight": 0.02656673453748226 }, { "diff_generated": -24.770761489868164, "epoch": 1.3869086195722617, "grad_norm": 2.672763114594347, "learning_rate": 5.205619766155182e-07, "logits/chosen": -2.378600597381592, "logits/rejected": -2.378143310546875, "logps/chosen": -18.211240768432617, "logps/rejected": -422.57904052734375, "logps_avg/chosen": -0.1082327589392662, "logps_avg/rejected": -2.477076292037964, "loss": 0.1045, "losses_ref": -0.0007958011701703072, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4280, "u": -3.5233681201934814, "weight": 0.069813571870327 }, { "diff_generated": -26.24502182006836, "epoch": 1.390149060272197, "grad_norm": 2.6044796369955248, "learning_rate": 5.191230544903702e-07, "logits/chosen": -2.34401798248291, "logits/rejected": -2.33591890335083, "logps/chosen": -16.507028579711914, "logps/rejected": -415.982421875, "logps_avg/chosen": -0.09873421490192413, "logps_avg/rejected": -2.624502420425415, "loss": 0.0999, "losses_ref": -0.001061144983395934, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4290, "u": -3.596564531326294, "weight": 0.051502663642168045 }, { "diff_generated": -24.274839401245117, "epoch": 1.3933895009721322, "grad_norm": 2.8559559028148884, "learning_rate": 5.176824388210483e-07, "logits/chosen": -2.342895030975342, "logits/rejected": -2.345672130584717, "logps/chosen": -18.14669418334961, "logps/rejected": -449.62115478515625, "logps_avg/chosen": -0.10630394518375397, "logps_avg/rejected": -2.4274840354919434, "loss": 0.1019, "losses_ref": -0.0009579318575561047, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4300, "u": -3.5042850971221924, "weight": 0.07662221789360046 }, { "diff_generated": -25.419696807861328, "epoch": 1.3966299416720673, "grad_norm": 2.5561510029103247, "learning_rate": 5.162401500884432e-07, "logits/chosen": -2.3567914962768555, "logits/rejected": -2.342838764190674, "logps/chosen": -18.089414596557617, "logps/rejected": -439.552490234375, "logps_avg/chosen": -0.10499085485935211, "logps_avg/rejected": -2.5419695377349854, "loss": 0.1006, "losses_ref": -0.0009210168500430882, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4310, "u": -3.5712103843688965, "weight": 0.05744823068380356 }, { "diff_generated": -24.52872657775879, "epoch": 1.3998703823720027, "grad_norm": 2.486952067870786, "learning_rate": 5.147962087972314e-07, "logits/chosen": -2.3481554985046387, "logits/rejected": -2.308380603790283, "logps/chosen": -19.289180755615234, "logps/rejected": -414.15338134765625, "logps_avg/chosen": -0.1033637523651123, "logps_avg/rejected": -2.4528725147247314, "loss": 0.1008, "losses_ref": -0.0008901024120859802, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4320, "u": -3.5753231048583984, "weight": 0.05747845023870468 }, { "diff_generated": -25.5739688873291, "epoch": 1.4031108230719378, "grad_norm": 2.6831985223645582, "learning_rate": 5.133506354755833e-07, "logits/chosen": -2.365851640701294, "logits/rejected": -2.36531138420105, "logps/chosen": -15.5805025100708, "logps/rejected": -452.583740234375, "logps_avg/chosen": -0.09160830080509186, "logps_avg/rejected": -2.557396650314331, "loss": 0.1001, "losses_ref": -0.00025015632854774594, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4330, "u": -3.5732452869415283, "weight": 0.056541360914707184 }, { "diff_generated": -25.01066017150879, "epoch": 1.406351263771873, "grad_norm": 2.6088447308224008, "learning_rate": 5.119034506748713e-07, "logits/chosen": -2.2929553985595703, "logits/rejected": -2.281355142593384, "logps/chosen": -16.640697479248047, "logps/rejected": -421.694091796875, "logps_avg/chosen": -0.10242215543985367, "logps_avg/rejected": -2.501065969467163, "loss": 0.0998, "losses_ref": -0.001229285029694438, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4340, "u": -3.5283074378967285, "weight": 0.07093143463134766 }, { "diff_generated": -25.70849609375, "epoch": 1.4095917044718083, "grad_norm": 2.641049340662074, "learning_rate": 5.104546749693781e-07, "logits/chosen": -2.3369216918945312, "logits/rejected": -2.337437391281128, "logps/chosen": -19.4946231842041, "logps/rejected": -474.26947021484375, "logps_avg/chosen": -0.10807840526103973, "logps_avg/rejected": -2.570849895477295, "loss": 0.1, "losses_ref": -0.0015168010722845793, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4350, "u": -3.618104934692383, "weight": 0.04644922539591789 }, { "diff_generated": -24.85693359375, "epoch": 1.4128321451717434, "grad_norm": 2.5766541848668107, "learning_rate": 5.09004328956004e-07, "logits/chosen": -2.3693432807922363, "logits/rejected": -2.3518214225769043, "logps/chosen": -18.382400512695312, "logps/rejected": -447.8953552246094, "logps_avg/chosen": -0.09862877428531647, "logps_avg/rejected": -2.4856934547424316, "loss": 0.0992, "losses_ref": -0.00043182895751670003, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4360, "u": -3.5792031288146973, "weight": 0.056764155626297 }, { "diff_generated": -27.943578720092773, "epoch": 1.4160725858716785, "grad_norm": 2.4124447556956183, "learning_rate": 5.075524332539736e-07, "logits/chosen": -2.3231358528137207, "logits/rejected": -2.263293504714966, "logps/chosen": -17.485557556152344, "logps/rejected": -466.57318115234375, "logps_avg/chosen": -0.10093804448843002, "logps_avg/rejected": -2.7943577766418457, "loss": 0.1014, "losses_ref": -0.0006617440958507359, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4370, "u": -3.670496702194214, "weight": 0.03214912861585617 }, { "diff_generated": -26.72478675842285, "epoch": 1.4193130265716136, "grad_norm": 2.6571784933053384, "learning_rate": 5.060990085045432e-07, "logits/chosen": -2.333707332611084, "logits/rejected": -2.3314852714538574, "logps/chosen": -17.644012451171875, "logps/rejected": -456.2490234375, "logps_avg/chosen": -0.1047801747918129, "logps_avg/rejected": -2.6724789142608643, "loss": 0.103, "losses_ref": -0.0011025893036276102, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4380, "u": -3.668222427368164, "weight": 0.032760731875896454 }, { "diff_generated": -25.63039207458496, "epoch": 1.422553467271549, "grad_norm": 2.528344636368985, "learning_rate": 5.046440753707077e-07, "logits/chosen": -2.4096553325653076, "logits/rejected": -2.361562728881836, "logps/chosen": -16.573205947875977, "logps/rejected": -436.4747009277344, "logps_avg/chosen": -0.08926139771938324, "logps_avg/rejected": -2.5630393028259277, "loss": 0.1012, "losses_ref": -0.000798130058683455, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4390, "u": -3.575213670730591, "weight": 0.05734226852655411 }, { "diff_generated": -25.784008026123047, "epoch": 1.425793907971484, "grad_norm": 2.449284288470074, "learning_rate": 5.031876545369054e-07, "logits/chosen": -2.3840537071228027, "logits/rejected": -2.3444948196411133, "logps/chosen": -18.734729766845703, "logps/rejected": -443.230712890625, "logps_avg/chosen": -0.10571378469467163, "logps_avg/rejected": -2.5784008502960205, "loss": 0.1035, "losses_ref": -0.0009342956473119557, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4400, "u": -3.6010608673095703, "weight": 0.051356326788663864 }, { "diff_generated": -24.286157608032227, "epoch": 1.4290343486714194, "grad_norm": 2.78291150567027, "learning_rate": 5.017297667087257e-07, "logits/chosen": -2.376237392425537, "logits/rejected": -2.360668897628784, "logps/chosen": -19.048891067504883, "logps/rejected": -441.42730712890625, "logps_avg/chosen": -0.10474354028701782, "logps_avg/rejected": -2.4286153316497803, "loss": 0.1025, "losses_ref": -0.001150609226897359, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4410, "u": -3.6455307006835938, "weight": 0.03921959549188614 }, { "diff_generated": -25.352760314941406, "epoch": 1.4322747893713546, "grad_norm": 3.0038648383145907, "learning_rate": 5.002704326126135e-07, "logits/chosen": -2.4247918128967285, "logits/rejected": -2.381690502166748, "logps/chosen": -19.50627326965332, "logps/rejected": -454.374267578125, "logps_avg/chosen": -0.10728516429662704, "logps_avg/rejected": -2.535275936126709, "loss": 0.1041, "losses_ref": -0.0011060578981414437, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4420, "u": -3.572949171066284, "weight": 0.05791778117418289 }, { "diff_generated": -23.273242950439453, "epoch": 1.4355152300712897, "grad_norm": 2.8747090374081736, "learning_rate": 4.988096729955751e-07, "logits/chosen": -2.4148807525634766, "logits/rejected": -2.405785083770752, "logps/chosen": -16.78266143798828, "logps/rejected": -423.6563415527344, "logps_avg/chosen": -0.09588910639286041, "logps_avg/rejected": -2.327324390411377, "loss": 0.1025, "losses_ref": -0.000644886982627213, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4430, "u": -3.52955961227417, "weight": 0.06961119174957275 }, { "diff_generated": -27.36212158203125, "epoch": 1.4387556707712248, "grad_norm": 2.5997624366455425, "learning_rate": 4.97347508624883e-07, "logits/chosen": -2.425786256790161, "logits/rejected": -2.3510758876800537, "logps/chosen": -16.762413024902344, "logps/rejected": -475.9835510253906, "logps_avg/chosen": -0.09029469639062881, "logps_avg/rejected": -2.7362122535705566, "loss": 0.0975, "losses_ref": -0.0004997922806069255, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4440, "u": -3.481595993041992, "weight": 0.08191341161727905 }, { "diff_generated": -26.46923828125, "epoch": 1.4419961114711601, "grad_norm": 2.4993217583552427, "learning_rate": 4.958839602877809e-07, "logits/chosen": -2.3639867305755615, "logits/rejected": -2.367396116256714, "logps/chosen": -18.619909286499023, "logps/rejected": -472.052734375, "logps_avg/chosen": -0.11092057079076767, "logps_avg/rejected": -2.646923542022705, "loss": 0.1025, "losses_ref": -0.000601834908593446, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4450, "u": -3.649272918701172, "weight": 0.03825830668210983 }, { "diff_generated": -26.878698348999023, "epoch": 1.4452365521710953, "grad_norm": 2.55277677854579, "learning_rate": 4.944190487911878e-07, "logits/chosen": -2.3663926124572754, "logits/rejected": -2.350630760192871, "logps/chosen": -17.180818557739258, "logps/rejected": -497.48370361328125, "logps_avg/chosen": -0.09979522228240967, "logps_avg/rejected": -2.6878700256347656, "loss": 0.1019, "losses_ref": -0.0002570479118730873, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4460, "u": -3.5531105995178223, "weight": 0.06281425058841705 }, { "diff_generated": -27.2506103515625, "epoch": 1.4484769928710304, "grad_norm": 2.534264307758894, "learning_rate": 4.929527949614025e-07, "logits/chosen": -2.384092330932617, "logits/rejected": -2.297410249710083, "logps/chosen": -19.391942977905273, "logps/rejected": -457.280517578125, "logps_avg/chosen": -0.11064749956130981, "logps_avg/rejected": -2.7250614166259766, "loss": 0.1026, "losses_ref": -0.0014421206433326006, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4470, "u": -3.645411729812622, "weight": 0.03970836102962494 }, { "diff_generated": -25.837894439697266, "epoch": 1.4517174335709657, "grad_norm": 2.5956162633184574, "learning_rate": 4.914852196438077e-07, "logits/chosen": -2.332778215408325, "logits/rejected": -2.330631971359253, "logps/chosen": -17.86171531677246, "logps/rejected": -464.32635498046875, "logps_avg/chosen": -0.10497362911701202, "logps_avg/rejected": -2.583789348602295, "loss": 0.1014, "losses_ref": -0.0013370258966460824, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4480, "u": -3.6205012798309326, "weight": 0.04628719761967659 }, { "diff_generated": -26.748376846313477, "epoch": 1.4549578742709008, "grad_norm": 2.6545092363012954, "learning_rate": 4.900163437025727e-07, "logits/chosen": -2.3440780639648438, "logits/rejected": -2.3486673831939697, "logps/chosen": -17.950197219848633, "logps/rejected": -524.4556884765625, "logps_avg/chosen": -0.10626323521137238, "logps_avg/rejected": -2.674837589263916, "loss": 0.1049, "losses_ref": -0.0006623517838306725, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4490, "u": -3.6240296363830566, "weight": 0.04461196810007095 }, { "diff_generated": -27.1101016998291, "epoch": 1.458198314970836, "grad_norm": 2.75990280679827, "learning_rate": 4.885461880203582e-07, "logits/chosen": -2.3479933738708496, "logits/rejected": -2.3495919704437256, "logps/chosen": -17.832767486572266, "logps/rejected": -487.9822692871094, "logps_avg/chosen": -0.10218171775341034, "logps_avg/rejected": -2.711009979248047, "loss": 0.1004, "losses_ref": -0.0007707371842116117, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4500, "u": -3.5700430870056152, "weight": 0.05738046020269394 }, { "diff_generated": -25.453678131103516, "epoch": 1.4614387556707713, "grad_norm": 2.4597411297031435, "learning_rate": 4.870747734980186e-07, "logits/chosen": -2.3789215087890625, "logits/rejected": -2.377337694168091, "logps/chosen": -16.965253829956055, "logps/rejected": -483.447265625, "logps_avg/chosen": -0.09473135322332382, "logps_avg/rejected": -2.545367956161499, "loss": 0.1016, "losses_ref": -0.00035776724689640105, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4510, "u": -3.625748872756958, "weight": 0.0441717728972435 }, { "diff_generated": -26.31353187561035, "epoch": 1.4646791963707064, "grad_norm": 2.708017657370048, "learning_rate": 4.856021210543043e-07, "logits/chosen": -2.306912660598755, "logits/rejected": -2.295328140258789, "logps/chosen": -17.95050811767578, "logps/rejected": -469.47222900390625, "logps_avg/chosen": -0.10379139333963394, "logps_avg/rejected": -2.6313533782958984, "loss": 0.1018, "losses_ref": -0.0002655963471625, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4520, "u": -3.3879246711730957, "weight": 0.10659865289926529 }, { "diff_generated": -27.170995712280273, "epoch": 1.4679196370706415, "grad_norm": 2.5677323619140227, "learning_rate": 4.841282516255653e-07, "logits/chosen": -2.42441987991333, "logits/rejected": -2.3640689849853516, "logps/chosen": -19.41526222229004, "logps/rejected": -499.07623291015625, "logps_avg/chosen": -0.10374051332473755, "logps_avg/rejected": -2.717099666595459, "loss": 0.1033, "losses_ref": -0.0010943252127617598, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4530, "u": -3.689488649368286, "weight": 0.026894647628068924 }, { "diff_generated": -27.157360076904297, "epoch": 1.471160077770577, "grad_norm": 2.7424484198068413, "learning_rate": 4.826531861654537e-07, "logits/chosen": -2.3322556018829346, "logits/rejected": -2.292630910873413, "logps/chosen": -19.540157318115234, "logps/rejected": -459.62542724609375, "logps_avg/chosen": -0.10732056945562363, "logps_avg/rejected": -2.715735912322998, "loss": 0.1019, "losses_ref": -0.001661944785155356, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4540, "u": -3.644774913787842, "weight": 0.0400865264236927 }, { "diff_generated": -26.436452865600586, "epoch": 1.474400518470512, "grad_norm": 2.55585976008871, "learning_rate": 4.811769456446243e-07, "logits/chosen": -2.3906633853912354, "logits/rejected": -2.3819384574890137, "logps/chosen": -17.60432243347168, "logps/rejected": -491.45013427734375, "logps_avg/chosen": -0.09582848846912384, "logps_avg/rejected": -2.6436455249786377, "loss": 0.1, "losses_ref": -0.0004539464716799557, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4550, "u": -3.646775007247925, "weight": 0.03811759501695633 }, { "diff_generated": -25.415651321411133, "epoch": 1.4776409591704471, "grad_norm": 2.64179745579947, "learning_rate": 4.796995510504384e-07, "logits/chosen": -2.331956386566162, "logits/rejected": -2.3741514682769775, "logps/chosen": -16.415557861328125, "logps/rejected": -496.5760803222656, "logps_avg/chosen": -0.09389199316501617, "logps_avg/rejected": -2.541565418243408, "loss": 0.1042, "losses_ref": -0.0003689295845106244, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4560, "u": -3.502955198287964, "weight": 0.07547652721405029 }, { "diff_generated": -25.140544891357422, "epoch": 1.4808813998703823, "grad_norm": 2.627265550508191, "learning_rate": 4.782210233866637e-07, "logits/chosen": -2.3608133792877197, "logits/rejected": -2.3634140491485596, "logps/chosen": -17.05031967163086, "logps/rejected": -443.302978515625, "logps_avg/chosen": -0.0978686511516571, "logps_avg/rejected": -2.514054536819458, "loss": 0.0987, "losses_ref": -0.0007118280627764761, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4570, "u": -3.5265719890594482, "weight": 0.0697406679391861 }, { "diff_generated": -27.18475914001465, "epoch": 1.4841218405703176, "grad_norm": 2.7690753607986145, "learning_rate": 4.76741383673177e-07, "logits/chosen": -2.402216911315918, "logits/rejected": -2.374706745147705, "logps/chosen": -17.55205726623535, "logps/rejected": -485.1881408691406, "logps_avg/chosen": -0.10320468991994858, "logps_avg/rejected": -2.7184762954711914, "loss": 0.1031, "losses_ref": -0.0003614898887462914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4580, "u": -3.5736613273620605, "weight": 0.05670735239982605 }, { "diff_generated": -27.21875, "epoch": 1.4873622812702527, "grad_norm": 3.0064926638398144, "learning_rate": 4.752606529456648e-07, "logits/chosen": -2.3634047508239746, "logits/rejected": -2.363259792327881, "logps/chosen": -15.848172187805176, "logps/rejected": -457.4466857910156, "logps_avg/chosen": -0.09651388227939606, "logps_avg/rejected": -2.721874952316284, "loss": 0.102, "losses_ref": -0.0007729289936833084, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4590, "u": -3.450721025466919, "weight": 0.08863823860883713 }, { "diff_generated": -26.816186904907227, "epoch": 1.490602721970188, "grad_norm": 2.536867119938274, "learning_rate": 4.7377885225532396e-07, "logits/chosen": -2.4007298946380615, "logits/rejected": -2.3915462493896484, "logps/chosen": -16.55607795715332, "logps/rejected": -489.6505432128906, "logps_avg/chosen": -0.09947662055492401, "logps_avg/rejected": -2.6816184520721436, "loss": 0.1005, "losses_ref": -0.00034102320205420256, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4600, "u": -3.64630389213562, "weight": 0.037940699607133865 }, { "diff_generated": -26.589834213256836, "epoch": 1.4938431626701232, "grad_norm": 2.5753359062753005, "learning_rate": 4.722960026685633e-07, "logits/chosen": -2.346069812774658, "logits/rejected": -2.3274240493774414, "logps/chosen": -16.945472717285156, "logps/rejected": -475.73468017578125, "logps_avg/chosen": -0.0991564393043518, "logps_avg/rejected": -2.6589832305908203, "loss": 0.0997, "losses_ref": -0.0006340649561025202, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4610, "u": -3.5988471508026123, "weight": 0.050868142396211624 }, { "diff_generated": -27.050867080688477, "epoch": 1.4970836033700583, "grad_norm": 2.6721082376929246, "learning_rate": 4.7081212526670267e-07, "logits/chosen": -2.3203649520874023, "logits/rejected": -2.274970054626465, "logps/chosen": -20.65582275390625, "logps/rejected": -442.66790771484375, "logps_avg/chosen": -0.1143292635679245, "logps_avg/rejected": -2.7050864696502686, "loss": 0.1019, "losses_ref": -0.0003582318313419819, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4620, "u": -3.595930814743042, "weight": 0.05042849853634834 }, { "diff_generated": -28.128681182861328, "epoch": 1.5003240440699934, "grad_norm": 3.0838171392134517, "learning_rate": 4.693272411456753e-07, "logits/chosen": -2.4106602668762207, "logits/rejected": -2.3465323448181152, "logps/chosen": -18.15391731262207, "logps/rejected": -472.01849365234375, "logps_avg/chosen": -0.09831465780735016, "logps_avg/rejected": -2.8128678798675537, "loss": 0.1016, "losses_ref": -0.0006209076964296401, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4630, "u": -3.6010029315948486, "weight": 0.050917524844408035 }, { "diff_generated": -25.924785614013672, "epoch": 1.5035644847699285, "grad_norm": 2.777284153501298, "learning_rate": 4.6784137141572566e-07, "logits/chosen": -2.376234292984009, "logits/rejected": -2.3401947021484375, "logps/chosen": -17.53774070739746, "logps/rejected": -451.0185546875, "logps_avg/chosen": -0.09319780766963959, "logps_avg/rejected": -2.5924789905548096, "loss": 0.099, "losses_ref": -0.0008334851590916514, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4640, "u": -3.57263445854187, "weight": 0.05742005258798599 }, { "diff_generated": -26.946765899658203, "epoch": 1.5068049254698639, "grad_norm": 2.8874402391178284, "learning_rate": 4.6635453720111096e-07, "logits/chosen": -2.3638510704040527, "logits/rejected": -2.33133864402771, "logps/chosen": -17.957733154296875, "logps/rejected": -478.43609619140625, "logps_avg/chosen": -0.10447684675455093, "logps_avg/rejected": -2.6946768760681152, "loss": 0.0975, "losses_ref": -0.0003875306574627757, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4650, "u": -3.502568006515503, "weight": 0.07550157606601715 }, { "diff_generated": -24.884929656982422, "epoch": 1.5100453661697992, "grad_norm": 2.6861817231220577, "learning_rate": 4.6486675963980014e-07, "logits/chosen": -2.404689073562622, "logits/rejected": -2.452115535736084, "logps/chosen": -18.500164031982422, "logps/rejected": -459.638916015625, "logps_avg/chosen": -0.10741446167230606, "logps_avg/rejected": -2.4884932041168213, "loss": 0.101, "losses_ref": -0.001469132723286748, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4660, "u": -3.669245481491089, "weight": 0.033659983426332474 }, { "diff_generated": -27.58524513244629, "epoch": 1.5132858068697344, "grad_norm": 2.6204230175484136, "learning_rate": 4.633780598831733e-07, "logits/chosen": -2.416443347930908, "logits/rejected": -2.349015712738037, "logps/chosen": -20.454132080078125, "logps/rejected": -504.7102966308594, "logps_avg/chosen": -0.11305411159992218, "logps_avg/rejected": -2.7585246562957764, "loss": 0.0985, "losses_ref": -0.0005273165879771113, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4670, "u": -3.5723938941955566, "weight": 0.056988675147295 }, { "diff_generated": -28.445322036743164, "epoch": 1.5165262475696695, "grad_norm": 2.768496657784131, "learning_rate": 4.6188845909572143e-07, "logits/chosen": -2.3770623207092285, "logits/rejected": -2.3327393531799316, "logps/chosen": -16.653573989868164, "logps/rejected": -527.7097778320312, "logps_avg/chosen": -0.09490346163511276, "logps_avg/rejected": -2.8445322513580322, "loss": 0.102, "losses_ref": -0.00033215107396245003, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4680, "u": -3.5773749351501465, "weight": 0.056677620857954025 }, { "diff_generated": -26.633886337280273, "epoch": 1.5197666882696046, "grad_norm": 2.4800042440884926, "learning_rate": 4.603979784547451e-07, "logits/chosen": -2.334624767303467, "logits/rejected": -2.3243727684020996, "logps/chosen": -18.37417984008789, "logps/rejected": -509.787353515625, "logps_avg/chosen": -0.10755424201488495, "logps_avg/rejected": -2.663388729095459, "loss": 0.1024, "losses_ref": -0.0005498843383975327, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4690, "u": -3.5532336235046387, "weight": 0.06324771791696548 }, { "diff_generated": -29.298620223999023, "epoch": 1.5230071289695397, "grad_norm": 2.706514444080492, "learning_rate": 4.5890663915005364e-07, "logits/chosen": -2.3914692401885986, "logits/rejected": -2.373861789703369, "logps/chosen": -16.200611114501953, "logps/rejected": -524.3837890625, "logps_avg/chosen": -0.09766872227191925, "logps_avg/rejected": -2.9298622608184814, "loss": 0.1018, "losses_ref": -0.0002807065029628575, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4700, "u": -3.6486129760742188, "weight": 0.037851959466934204 }, { "diff_generated": -26.626922607421875, "epoch": 1.526247569669475, "grad_norm": 2.7443458990985814, "learning_rate": 4.574144623836637e-07, "logits/chosen": -2.3963630199432373, "logits/rejected": -2.3771467208862305, "logps/chosen": -16.957136154174805, "logps/rejected": -482.7359313964844, "logps_avg/chosen": -0.09588057547807693, "logps_avg/rejected": -2.6626925468444824, "loss": 0.1036, "losses_ref": -0.0004056665929965675, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4710, "u": -3.524570941925049, "weight": 0.06927379965782166 }, { "diff_generated": -28.789709091186523, "epoch": 1.5294880103694104, "grad_norm": 2.419049898861885, "learning_rate": 4.5592146936949785e-07, "logits/chosen": -2.3638224601745605, "logits/rejected": -2.3208470344543457, "logps/chosen": -20.11721420288086, "logps/rejected": -505.06396484375, "logps_avg/chosen": -0.10543084144592285, "logps_avg/rejected": -2.8789710998535156, "loss": 0.0983, "losses_ref": -0.0007130955345928669, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4720, "u": -3.5926082134246826, "weight": 0.050989795476198196 }, { "diff_generated": -26.855182647705078, "epoch": 1.5327284510693455, "grad_norm": 2.673613793203479, "learning_rate": 4.544276813330835e-07, "logits/chosen": -2.381993055343628, "logits/rejected": -2.3274385929107666, "logps/chosen": -17.30748176574707, "logps/rejected": -472.8213806152344, "logps_avg/chosen": -0.09686298668384552, "logps_avg/rejected": -2.6855180263519287, "loss": 0.1028, "losses_ref": -0.0003440210421103984, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4730, "u": -3.602659225463867, "weight": 0.050429295748472214 }, { "diff_generated": -26.854650497436523, "epoch": 1.5359688917692806, "grad_norm": 2.6334580009309163, "learning_rate": 4.529331195112501e-07, "logits/chosen": -2.3390755653381348, "logits/rejected": -2.3454232215881348, "logps/chosen": -17.794170379638672, "logps/rejected": -506.09326171875, "logps_avg/chosen": -0.10273711383342743, "logps_avg/rejected": -2.6854653358459473, "loss": 0.1006, "losses_ref": -0.0007093682652339339, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4740, "u": -3.6918129920959473, "weight": 0.02626415155827999 }, { "diff_generated": -28.0623779296875, "epoch": 1.5392093324692158, "grad_norm": 2.4259277405372384, "learning_rate": 4.5143780515182833e-07, "logits/chosen": -2.377840757369995, "logits/rejected": -2.3175711631774902, "logps/chosen": -20.87468910217285, "logps/rejected": -508.7308654785156, "logps_avg/chosen": -0.11975955963134766, "logps_avg/rejected": -2.8062376976013184, "loss": 0.1011, "losses_ref": -0.0005875771166756749, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4750, "u": -3.7144553661346436, "weight": 0.01955435611307621 }, { "diff_generated": -26.727203369140625, "epoch": 1.5424497731691509, "grad_norm": 2.6889369988231215, "learning_rate": 4.499417595133471e-07, "logits/chosen": -2.3149609565734863, "logits/rejected": -2.2860519886016846, "logps/chosen": -17.685163497924805, "logps/rejected": -484.66571044921875, "logps_avg/chosen": -0.10537519305944443, "logps_avg/rejected": -2.672720432281494, "loss": 0.1016, "losses_ref": -0.0006560144247487187, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4760, "u": -3.641984224319458, "weight": 0.03838873654603958 }, { "diff_generated": -27.540283203125, "epoch": 1.5456902138690862, "grad_norm": 2.6264864150084257, "learning_rate": 4.4844500386473207e-07, "logits/chosen": -2.373394012451172, "logits/rejected": -2.3297057151794434, "logps/chosen": -19.133085250854492, "logps/rejected": -499.7777404785156, "logps_avg/chosen": -0.10392173379659653, "logps_avg/rejected": -2.754028558731079, "loss": 0.1024, "losses_ref": -0.0007838995079509914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4770, "u": -3.648332118988037, "weight": 0.038741402328014374 }, { "diff_generated": -27.40139389038086, "epoch": 1.5489306545690213, "grad_norm": 2.8154896252136044, "learning_rate": 4.4694755948500276e-07, "logits/chosen": -2.3506534099578857, "logits/rejected": -2.353341579437256, "logps/chosen": -14.5455322265625, "logps/rejected": -509.96673583984375, "logps_avg/chosen": -0.09225939214229584, "logps_avg/rejected": -2.7401394844055176, "loss": 0.0999, "losses_ref": -0.0001783154293661937, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4780, "u": -3.5768604278564453, "weight": 0.056461431086063385 }, { "diff_generated": -26.885112762451172, "epoch": 1.5521710952689567, "grad_norm": 2.5855058383683995, "learning_rate": 4.4544944766297037e-07, "logits/chosen": -2.395085573196411, "logits/rejected": -2.425724506378174, "logps/chosen": -16.19194793701172, "logps/rejected": -509.50238037109375, "logps_avg/chosen": -0.09882526099681854, "logps_avg/rejected": -2.688511371612549, "loss": 0.0985, "losses_ref": -0.0004003371577709913, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4790, "u": -3.7154109477996826, "weight": 0.019246309995651245 }, { "diff_generated": -26.92526626586914, "epoch": 1.5554115359688918, "grad_norm": 2.646503604424476, "learning_rate": 4.439506896969348e-07, "logits/chosen": -2.295779228210449, "logits/rejected": -2.2422423362731934, "logps/chosen": -16.581674575805664, "logps/rejected": -458.66778564453125, "logps_avg/chosen": -0.09982822835445404, "logps_avg/rejected": -2.692526340484619, "loss": 0.0987, "losses_ref": -0.0005184352630749345, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4800, "u": -3.4579899311065674, "weight": 0.08816748857498169 }, { "diff_generated": -27.13262939453125, "epoch": 1.558651976668827, "grad_norm": 2.857413374977336, "learning_rate": 4.4245130689438206e-07, "logits/chosen": -2.313910961151123, "logits/rejected": -2.255359172821045, "logps/chosen": -18.43231773376465, "logps/rejected": -488.8936462402344, "logps_avg/chosen": -0.1034407764673233, "logps_avg/rejected": -2.7132630348205566, "loss": 0.1024, "losses_ref": -0.0005251936381682754, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4810, "u": -3.5007331371307373, "weight": 0.07584112137556076 }, { "diff_generated": -27.614675521850586, "epoch": 1.561892417368762, "grad_norm": 2.7006985101228276, "learning_rate": 4.4095132057168145e-07, "logits/chosen": -2.3693792819976807, "logits/rejected": -2.308382511138916, "logps/chosen": -19.743377685546875, "logps/rejected": -480.5498962402344, "logps_avg/chosen": -0.10749039798974991, "logps_avg/rejected": -2.7614681720733643, "loss": 0.0994, "losses_ref": -0.0005830880254507065, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4820, "u": -3.6227993965148926, "weight": 0.04462616890668869 }, { "diff_generated": -27.29495620727539, "epoch": 1.5651328580686974, "grad_norm": 2.7541811438721946, "learning_rate": 4.3945075205378215e-07, "logits/chosen": -2.308394193649292, "logits/rejected": -2.331428050994873, "logps/chosen": -16.660200119018555, "logps/rejected": -496.21783447265625, "logps_avg/chosen": -0.10463666915893555, "logps_avg/rejected": -2.7294955253601074, "loss": 0.103, "losses_ref": -0.0001776438148226589, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4830, "u": -3.6182701587677, "weight": 0.043968915939331055 }, { "diff_generated": -26.718585968017578, "epoch": 1.5683732987686325, "grad_norm": 2.619996271538521, "learning_rate": 4.379496226739104e-07, "logits/chosen": -2.3933868408203125, "logits/rejected": -2.354672431945801, "logps/chosen": -17.114768981933594, "logps/rejected": -501.43194580078125, "logps_avg/chosen": -0.09793446958065033, "logps_avg/rejected": -2.671858310699463, "loss": 0.1008, "losses_ref": -0.0003559653414413333, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4840, "u": -3.5752758979797363, "weight": 0.056691162288188934 }, { "diff_generated": -25.645092010498047, "epoch": 1.5716137394685679, "grad_norm": 2.7091730491201695, "learning_rate": 4.364479537732663e-07, "logits/chosen": -2.3813581466674805, "logits/rejected": -2.3487915992736816, "logps/chosen": -18.565942764282227, "logps/rejected": -461.3694763183594, "logps_avg/chosen": -0.10648205131292343, "logps_avg/rejected": -2.564509391784668, "loss": 0.1029, "losses_ref": -0.00037918920861557126, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4850, "u": -3.614135265350342, "weight": 0.04421050101518631 }, { "diff_generated": -26.901065826416016, "epoch": 1.574854180168503, "grad_norm": 2.637242307433007, "learning_rate": 4.349457667007197e-07, "logits/chosen": -2.3863844871520996, "logits/rejected": -2.3578927516937256, "logps/chosen": -19.903520584106445, "logps/rejected": -504.9957580566406, "logps_avg/chosen": -0.10746286809444427, "logps_avg/rejected": -2.6901066303253174, "loss": 0.0998, "losses_ref": -0.0007895805174484849, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4860, "u": -3.7391810417175293, "weight": 0.01379163283854723 }, { "diff_generated": -28.473886489868164, "epoch": 1.578094620868438, "grad_norm": 2.533652732727638, "learning_rate": 4.334430828125074e-07, "logits/chosen": -2.3725333213806152, "logits/rejected": -2.340877056121826, "logps/chosen": -18.550180435180664, "logps/rejected": -516.2704467773438, "logps_avg/chosen": -0.10123230516910553, "logps_avg/rejected": -2.847388744354248, "loss": 0.0984, "losses_ref": -0.00029261037707328796, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4870, "u": -3.530827283859253, "weight": 0.06912322342395782 }, { "diff_generated": -25.443593978881836, "epoch": 1.5813350615683732, "grad_norm": 2.616320595845464, "learning_rate": 4.319399234719297e-07, "logits/chosen": -2.3242273330688477, "logits/rejected": -2.3418664932250977, "logps/chosen": -15.317204475402832, "logps/rejected": -483.1455993652344, "logps_avg/chosen": -0.09046544134616852, "logps_avg/rejected": -2.5443596839904785, "loss": 0.0966, "losses_ref": -0.00041991929174400866, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4880, "u": -3.4812328815460205, "weight": 0.08183874189853668 }, { "diff_generated": -28.15468406677246, "epoch": 1.5845755022683083, "grad_norm": 2.8396134964728246, "learning_rate": 4.3043631004904563e-07, "logits/chosen": -2.3555305004119873, "logits/rejected": -2.3145227432250977, "logps/chosen": -15.45680046081543, "logps/rejected": -495.0728454589844, "logps_avg/chosen": -0.08831733465194702, "logps_avg/rejected": -2.8154685497283936, "loss": 0.0997, "losses_ref": -0.00037405334296636283, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4890, "u": -3.670712947845459, "weight": 0.031772710382938385 }, { "diff_generated": -27.696598052978516, "epoch": 1.5878159429682437, "grad_norm": 2.581755430760835, "learning_rate": 4.2893226392037024e-07, "logits/chosen": -2.396336555480957, "logits/rejected": -2.33992075920105, "logps/chosen": -19.942581176757812, "logps/rejected": -473.76385498046875, "logps_avg/chosen": -0.10936687886714935, "logps_avg/rejected": -2.7696597576141357, "loss": 0.1013, "losses_ref": -0.00029218022245913744, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4900, "u": -3.7187163829803467, "weight": 0.01911265216767788 }, { "diff_generated": -26.277267456054688, "epoch": 1.591056383668179, "grad_norm": 2.6896412858160237, "learning_rate": 4.2742780646857015e-07, "logits/chosen": -2.382976770401001, "logits/rejected": -2.3719167709350586, "logps/chosen": -16.212665557861328, "logps/rejected": -494.34515380859375, "logps_avg/chosen": -0.09449265152215958, "logps_avg/rejected": -2.6277265548706055, "loss": 0.0998, "losses_ref": -0.00017206800112035125, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4910, "u": -3.6703362464904785, "weight": 0.03144761547446251 }, { "diff_generated": -25.981069564819336, "epoch": 1.5942968243681142, "grad_norm": 2.6831933594658275, "learning_rate": 4.2592295908215953e-07, "logits/chosen": -2.353114128112793, "logits/rejected": -2.336235284805298, "logps/chosen": -19.42054557800293, "logps/rejected": -469.2264709472656, "logps_avg/chosen": -0.1102244108915329, "logps_avg/rejected": -2.598106861114502, "loss": 0.1045, "losses_ref": -0.0005944965523667634, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4920, "u": -3.5998847484588623, "weight": 0.05080182105302811 }, { "diff_generated": -28.747379302978516, "epoch": 1.5975372650680493, "grad_norm": 2.625533076941181, "learning_rate": 4.2441774315519645e-07, "logits/chosen": -2.389402389526367, "logits/rejected": -2.3536899089813232, "logps/chosen": -17.513187408447266, "logps/rejected": -512.0242919921875, "logps_avg/chosen": -0.10442149639129639, "logps_avg/rejected": -2.8747377395629883, "loss": 0.0989, "losses_ref": -0.0005745574599131942, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4930, "u": -3.644648313522339, "weight": 0.038289912045001984 }, { "diff_generated": -27.752788543701172, "epoch": 1.6007777057679844, "grad_norm": 2.492184484923244, "learning_rate": 4.229121800869781e-07, "logits/chosen": -2.3942487239837646, "logits/rejected": -2.342210292816162, "logps/chosen": -16.346881866455078, "logps/rejected": -507.65704345703125, "logps_avg/chosen": -0.09216523170471191, "logps_avg/rejected": -2.7752792835235596, "loss": 0.1007, "losses_ref": -0.0008915389771573246, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4940, "u": -3.6660046577453613, "weight": 0.032541196793317795 }, { "diff_generated": -27.836124420166016, "epoch": 1.6040181464679195, "grad_norm": 2.6417464893100777, "learning_rate": 4.2140629128173703e-07, "logits/chosen": -2.4343490600585938, "logits/rejected": -2.3680715560913086, "logps/chosen": -15.843406677246094, "logps/rejected": -476.01019287109375, "logps_avg/chosen": -0.0944889634847641, "logps_avg/rejected": -2.7836124897003174, "loss": 0.1008, "losses_ref": -0.0009327017469331622, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4950, "u": -3.599611282348633, "weight": 0.05153653770685196 }, { "diff_generated": -26.701419830322266, "epoch": 1.6072585871678549, "grad_norm": 2.6680702408063492, "learning_rate": 4.199000981483368e-07, "logits/chosen": -2.406346559524536, "logits/rejected": -2.365767002105713, "logps/chosen": -21.22658348083496, "logps/rejected": -499.58746337890625, "logps_avg/chosen": -0.11067505925893784, "logps_avg/rejected": -2.6701416969299316, "loss": 0.1017, "losses_ref": -0.001109774224460125, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4960, "u": -3.6634132862091064, "weight": 0.032880425453186035 }, { "diff_generated": -29.07406234741211, "epoch": 1.61049902786779, "grad_norm": 2.2941148581571915, "learning_rate": 4.183936220999676e-07, "logits/chosen": -2.3704891204833984, "logits/rejected": -2.2919983863830566, "logps/chosen": -18.74502182006836, "logps/rejected": -533.35595703125, "logps_avg/chosen": -0.10289647430181503, "logps_avg/rejected": -2.9074063301086426, "loss": 0.101, "losses_ref": -0.0003367816680110991, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4970, "u": -3.599052906036377, "weight": 0.05049259588122368 }, { "diff_generated": -27.42887306213379, "epoch": 1.6137394685677253, "grad_norm": 2.6430433694036717, "learning_rate": 4.168868845538414e-07, "logits/chosen": -2.381389617919922, "logits/rejected": -2.3356552124023438, "logps/chosen": -16.56113052368164, "logps/rejected": -488.47802734375, "logps_avg/chosen": -0.09255286306142807, "logps_avg/rejected": -2.742887020111084, "loss": 0.0985, "losses_ref": -0.0003578144242055714, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4980, "u": -3.6470627784729004, "weight": 0.037930212914943695 }, { "diff_generated": -25.372648239135742, "epoch": 1.6169799092676604, "grad_norm": 2.5942333804563944, "learning_rate": 4.15379906930888e-07, "logits/chosen": -2.3385634422302246, "logits/rejected": -2.297896385192871, "logps/chosen": -16.051712036132812, "logps/rejected": -463.76702880859375, "logps_avg/chosen": -0.08722599595785141, "logps_avg/rejected": -2.537264823913574, "loss": 0.0976, "losses_ref": -0.0003934988344553858, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4990, "u": -3.364407777786255, "weight": 0.11301133781671524 }, { "diff_generated": -25.59117889404297, "epoch": 1.6202203499675956, "grad_norm": 2.6715693330169716, "learning_rate": 4.1387271065545074e-07, "logits/chosen": -2.409432888031006, "logits/rejected": -2.3450863361358643, "logps/chosen": -18.909839630126953, "logps/rejected": -421.7919921875, "logps_avg/chosen": -0.10501371324062347, "logps_avg/rejected": -2.5591180324554443, "loss": 0.1023, "losses_ref": -0.001767330220900476, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5000, "u": -3.644735336303711, "weight": 0.04043537750840187 }, { "diff_generated": -29.76255226135254, "epoch": 1.6234607906675307, "grad_norm": 2.588133194181167, "learning_rate": 4.123653171549807e-07, "logits/chosen": -2.4070725440979004, "logits/rejected": -2.345527172088623, "logps/chosen": -16.648374557495117, "logps/rejected": -498.03363037109375, "logps_avg/chosen": -0.09451456367969513, "logps_avg/rejected": -2.976254940032959, "loss": 0.101, "losses_ref": -0.0006677792407572269, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5010, "u": -3.6961212158203125, "weight": 0.025899719446897507 }, { "diff_generated": -28.065227508544922, "epoch": 1.626701231367466, "grad_norm": 2.5261516177295644, "learning_rate": 4.108577478597335e-07, "logits/chosen": -2.3303914070129395, "logits/rejected": -2.349775791168213, "logps/chosen": -19.008838653564453, "logps/rejected": -515.6997680664062, "logps_avg/chosen": -0.1155603900551796, "logps_avg/rejected": -2.806522846221924, "loss": 0.1029, "losses_ref": -0.0006258910289034247, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5020, "u": -3.6222190856933594, "weight": 0.044567983597517014 }, { "diff_generated": -26.53865623474121, "epoch": 1.6299416720674011, "grad_norm": 2.84698151865833, "learning_rate": 4.093500242024637e-07, "logits/chosen": -2.45613169670105, "logits/rejected": -2.3126373291015625, "logps/chosen": -18.254779815673828, "logps/rejected": -486.0533142089844, "logps_avg/chosen": -0.0972185954451561, "logps_avg/rejected": -2.6538655757904053, "loss": 0.1012, "losses_ref": -0.0005458712694235146, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5030, "u": -3.574019193649292, "weight": 0.05697847157716751 }, { "diff_generated": -28.153987884521484, "epoch": 1.6331821127673365, "grad_norm": 2.520512810715581, "learning_rate": 4.0784216761812044e-07, "logits/chosen": -2.3809313774108887, "logits/rejected": -2.272681474685669, "logps/chosen": -17.655887603759766, "logps/rejected": -482.1053771972656, "logps_avg/chosen": -0.09897877275943756, "logps_avg/rejected": -2.815398931503296, "loss": 0.0995, "losses_ref": -0.0003890444932039827, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5040, "u": -3.5757968425750732, "weight": 0.05678637698292732 }, { "diff_generated": -27.308612823486328, "epoch": 1.6364225534672716, "grad_norm": 2.6515109042801455, "learning_rate": 4.063341995435427e-07, "logits/chosen": -2.3556971549987793, "logits/rejected": -2.326655149459839, "logps/chosen": -15.506830215454102, "logps/rejected": -467.6351623535156, "logps_avg/chosen": -0.09428389370441437, "logps_avg/rejected": -2.730861186981201, "loss": 0.0975, "losses_ref": -0.0002866701106540859, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5050, "u": -3.6228299140930176, "weight": 0.044095564633607864 }, { "diff_generated": -28.53106689453125, "epoch": 1.6396629941672067, "grad_norm": 2.9572529071312377, "learning_rate": 4.048261414171544e-07, "logits/chosen": -2.403313159942627, "logits/rejected": -2.2974865436553955, "logps/chosen": -17.453428268432617, "logps/rejected": -519.951171875, "logps_avg/chosen": -0.09405800700187683, "logps_avg/rejected": -2.85310697555542, "loss": 0.1006, "losses_ref": -0.0006021251319907606, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5060, "u": -3.4560837745666504, "weight": 0.08834328502416611 }, { "diff_generated": -26.335346221923828, "epoch": 1.6429034348671419, "grad_norm": 2.5939033940015097, "learning_rate": 4.0331801467865967e-07, "logits/chosen": -2.3971405029296875, "logits/rejected": -2.3532299995422363, "logps/chosen": -17.34989356994629, "logps/rejected": -462.61346435546875, "logps_avg/chosen": -0.09877597540616989, "logps_avg/rejected": -2.6335346698760986, "loss": 0.1011, "losses_ref": -0.0008893858757801354, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5070, "u": -3.5909934043884277, "weight": 0.05130688473582268 }, { "diff_generated": -26.316930770874023, "epoch": 1.646143875567077, "grad_norm": 2.522885112902731, "learning_rate": 4.0180984076873833e-07, "logits/chosen": -2.3249759674072266, "logits/rejected": -2.3163132667541504, "logps/chosen": -17.908002853393555, "logps/rejected": -466.9813537597656, "logps_avg/chosen": -0.09990663826465607, "logps_avg/rejected": -2.631693124771118, "loss": 0.1022, "losses_ref": -0.0005635431734845042, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5080, "u": -3.525578260421753, "weight": 0.06955597549676895 }, { "diff_generated": -28.45648193359375, "epoch": 1.6493843162670123, "grad_norm": 2.7553435256115106, "learning_rate": 4.003016411287407e-07, "logits/chosen": -2.3902664184570312, "logits/rejected": -2.4012134075164795, "logps/chosen": -17.522396087646484, "logps/rejected": -532.9021606445312, "logps_avg/chosen": -0.09973875433206558, "logps_avg/rejected": -2.8456482887268066, "loss": 0.101, "losses_ref": -0.00033324985997751355, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5090, "u": -3.667588472366333, "weight": 0.03167085349559784 }, { "diff_generated": -27.200836181640625, "epoch": 1.6526247569669477, "grad_norm": 2.494211247820919, "learning_rate": 3.9879343720038276e-07, "logits/chosen": -2.3775768280029297, "logits/rejected": -2.329467296600342, "logps/chosen": -17.555004119873047, "logps/rejected": -490.5977478027344, "logps_avg/chosen": -0.09881128370761871, "logps_avg/rejected": -2.720083713531494, "loss": 0.1012, "losses_ref": -0.0001987409486901015, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5100, "u": -3.624324321746826, "weight": 0.04399067163467407 }, { "diff_generated": -27.48602294921875, "epoch": 1.6558651976668828, "grad_norm": 2.444013794792136, "learning_rate": 3.972852504254415e-07, "logits/chosen": -2.327697277069092, "logits/rejected": -2.3307526111602783, "logps/chosen": -17.439884185791016, "logps/rejected": -510.11419677734375, "logps_avg/chosen": -0.09989776462316513, "logps_avg/rejected": -2.7486021518707275, "loss": 0.0982, "losses_ref": -0.0003228532150387764, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5110, "u": -3.526263475418091, "weight": 0.06914493441581726 }, { "diff_generated": -28.441503524780273, "epoch": 1.659105638366818, "grad_norm": 2.689868582483409, "learning_rate": 3.9577710224545033e-07, "logits/chosen": -2.378460645675659, "logits/rejected": -2.34100341796875, "logps/chosen": -19.18258285522461, "logps/rejected": -546.9826049804688, "logps_avg/chosen": -0.1077943816781044, "logps_avg/rejected": -2.8441507816314697, "loss": 0.1027, "losses_ref": -0.000511771475430578, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5120, "u": -3.6669769287109375, "weight": 0.03194146603345871 }, { "diff_generated": -28.624170303344727, "epoch": 1.662346079066753, "grad_norm": 2.4570949091067664, "learning_rate": 3.9426901410139346e-07, "logits/chosen": -2.4005510807037354, "logits/rejected": -2.2961833477020264, "logps/chosen": -20.022676467895508, "logps/rejected": -509.7611389160156, "logps_avg/chosen": -0.10960058122873306, "logps_avg/rejected": -2.8624167442321777, "loss": 0.0979, "losses_ref": -0.000778104062192142, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5130, "u": -3.73748779296875, "weight": 0.013706320896744728 }, { "diff_generated": -27.759662628173828, "epoch": 1.6655865197666881, "grad_norm": 2.5771408853614264, "learning_rate": 3.9276100743340217e-07, "logits/chosen": -2.474146604537964, "logits/rejected": -2.404331684112549, "logps/chosen": -18.846654891967773, "logps/rejected": -517.0947265625, "logps_avg/chosen": -0.10051999241113663, "logps_avg/rejected": -2.775965929031372, "loss": 0.1, "losses_ref": -0.00044742418685927987, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5140, "u": -3.7430527210235596, "weight": 0.013072418980300426 }, { "diff_generated": -29.991321563720703, "epoch": 1.6688269604666235, "grad_norm": 2.5266094791464906, "learning_rate": 3.9125310368044877e-07, "logits/chosen": -2.351802349090576, "logits/rejected": -2.3035707473754883, "logps/chosen": -17.116596221923828, "logps/rejected": -536.9442138671875, "logps_avg/chosen": -0.09962300956249237, "logps_avg/rejected": -2.9991321563720703, "loss": 0.0987, "losses_ref": -0.0001824962382670492, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5150, "u": -3.649186372756958, "weight": 0.03772037476301193 }, { "diff_generated": -29.356170654296875, "epoch": 1.6720674011665586, "grad_norm": 2.70751200307167, "learning_rate": 3.8974532428004305e-07, "logits/chosen": -2.336583137512207, "logits/rejected": -2.2621965408325195, "logps/chosen": -18.180213928222656, "logps/rejected": -547.4981689453125, "logps_avg/chosen": -0.09748770296573639, "logps_avg/rejected": -2.935617208480835, "loss": 0.0995, "losses_ref": -0.0003386014432180673, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5160, "u": -3.5745251178741455, "weight": 0.05664762109518051 }, { "diff_generated": -27.39190673828125, "epoch": 1.675307841866494, "grad_norm": 2.456643971816381, "learning_rate": 3.8823769066792643e-07, "logits/chosen": -2.3570380210876465, "logits/rejected": -2.2994766235351562, "logps/chosen": -18.147729873657227, "logps/rejected": -480.56640625, "logps_avg/chosen": -0.10367520898580551, "logps_avg/rejected": -2.7391905784606934, "loss": 0.1007, "losses_ref": -0.0006972316186875105, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5170, "u": -3.551652193069458, "weight": 0.06359126418828964 }, { "diff_generated": -26.773178100585938, "epoch": 1.678548282566429, "grad_norm": 2.6000506565970296, "learning_rate": 3.867302242777681e-07, "logits/chosen": -2.4094491004943848, "logits/rejected": -2.415447235107422, "logps/chosen": -17.810815811157227, "logps/rejected": -493.0499572753906, "logps_avg/chosen": -0.10398928821086884, "logps_avg/rejected": -2.6773180961608887, "loss": 0.101, "losses_ref": -0.00047280610306188464, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5180, "u": -3.673081874847412, "weight": 0.03182986378669739 }, { "diff_generated": -26.299346923828125, "epoch": 1.6817887232663642, "grad_norm": 2.6802640950141465, "learning_rate": 3.852229465408597e-07, "logits/chosen": -2.344144344329834, "logits/rejected": -2.3785035610198975, "logps/chosen": -17.98956871032715, "logps/rejected": -492.83538818359375, "logps_avg/chosen": -0.10476674139499664, "logps_avg/rejected": -2.629934787750244, "loss": 0.1018, "losses_ref": -0.0006202187505550683, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5190, "u": -3.5049166679382324, "weight": 0.07590137422084808 }, { "diff_generated": -28.270172119140625, "epoch": 1.6850291639662993, "grad_norm": 2.588015465650208, "learning_rate": 3.8371587888581067e-07, "logits/chosen": -2.3521931171417236, "logits/rejected": -2.301591157913208, "logps/chosen": -18.41990852355957, "logps/rejected": -510.8887634277344, "logps_avg/chosen": -0.09935440123081207, "logps_avg/rejected": -2.827017307281494, "loss": 0.0989, "losses_ref": -0.00043917092261835933, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5200, "u": -3.6940014362335205, "weight": 0.02555156871676445 }, { "diff_generated": -28.505340576171875, "epoch": 1.6882696046662347, "grad_norm": 2.80389923443133, "learning_rate": 3.822090427382442e-07, "logits/chosen": -2.382777690887451, "logits/rejected": -2.2807302474975586, "logps/chosen": -17.81949806213379, "logps/rejected": -509.1504821777344, "logps_avg/chosen": -0.09310451149940491, "logps_avg/rejected": -2.850533962249756, "loss": 0.0999, "losses_ref": -0.0003349473117850721, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5210, "u": -3.5480377674102783, "weight": 0.06295160949230194 }, { "diff_generated": -28.742563247680664, "epoch": 1.6915100453661698, "grad_norm": 2.769761655194945, "learning_rate": 3.807024595204916e-07, "logits/chosen": -2.367471933364868, "logits/rejected": -2.283769130706787, "logps/chosen": -17.1124267578125, "logps/rejected": -494.98858642578125, "logps_avg/chosen": -0.09259551763534546, "logps_avg/rejected": -2.874256134033203, "loss": 0.1013, "losses_ref": -7.784694025758654e-05, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5220, "u": -3.6688854694366455, "weight": 0.031324684619903564 }, { "diff_generated": -28.937397003173828, "epoch": 1.6947504860661051, "grad_norm": 2.512445899650775, "learning_rate": 3.7919615065128905e-07, "logits/chosen": -2.437547206878662, "logits/rejected": -2.3424153327941895, "logps/chosen": -18.564697265625, "logps/rejected": -510.82269287109375, "logps_avg/chosen": -0.1011086255311966, "logps_avg/rejected": -2.8937392234802246, "loss": 0.104, "losses_ref": -0.0008404834079556167, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5230, "u": -3.6209702491760254, "weight": 0.045037899166345596 }, { "diff_generated": -26.764385223388672, "epoch": 1.6979909267660402, "grad_norm": 2.4946620977284, "learning_rate": 3.7769013754547155e-07, "logits/chosen": -2.3728580474853516, "logits/rejected": -2.299055814743042, "logps/chosen": -18.191150665283203, "logps/rejected": -498.5304260253906, "logps_avg/chosen": -0.09499356895685196, "logps_avg/rejected": -2.676438808441162, "loss": 0.0986, "losses_ref": -0.0007090292638167739, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5240, "u": -3.5050902366638184, "weight": 0.07596190273761749 }, { "diff_generated": -26.970651626586914, "epoch": 1.7012313674659754, "grad_norm": 2.4838710224921887, "learning_rate": 3.761844416136701e-07, "logits/chosen": -2.3771610260009766, "logits/rejected": -2.3464341163635254, "logps/chosen": -17.874975204467773, "logps/rejected": -479.57244873046875, "logps_avg/chosen": -0.10110364854335785, "logps_avg/rejected": -2.6970648765563965, "loss": 0.0975, "losses_ref": -0.0009680521907284856, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5250, "u": -3.6237990856170654, "weight": 0.045234501361846924 }, { "diff_generated": -25.719568252563477, "epoch": 1.7044718081659105, "grad_norm": 2.4165842463216545, "learning_rate": 3.746790842620059e-07, "logits/chosen": -2.356290817260742, "logits/rejected": -2.313058614730835, "logps/chosen": -16.402015686035156, "logps/rejected": -457.26116943359375, "logps_avg/chosen": -0.09279557317495346, "logps_avg/rejected": -2.5719568729400635, "loss": 0.1017, "losses_ref": -0.0006630142452195287, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5260, "u": -3.576869249343872, "weight": 0.05718719959259033 }, { "diff_generated": -26.614282608032227, "epoch": 1.7077122488658456, "grad_norm": 2.5094338378018812, "learning_rate": 3.731740868917872e-07, "logits/chosen": -2.309357166290283, "logits/rejected": -2.2900514602661133, "logps/chosen": -18.272537231445312, "logps/rejected": -487.6533203125, "logps_avg/chosen": -0.10436661541461945, "logps_avg/rejected": -2.6614279747009277, "loss": 0.1026, "losses_ref": -0.0007275083917193115, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5270, "u": -3.551506519317627, "weight": 0.06346292048692703 }, { "diff_generated": -28.740280151367188, "epoch": 1.710952689565781, "grad_norm": 2.7448608243155945, "learning_rate": 3.716694708992039e-07, "logits/chosen": -2.3863444328308105, "logits/rejected": -2.3174705505371094, "logps/chosen": -18.374553680419922, "logps/rejected": -497.02398681640625, "logps_avg/chosen": -0.0997755229473114, "logps_avg/rejected": -2.874027967453003, "loss": 0.1001, "losses_ref": -0.0012814865913242102, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5280, "u": -3.6460769176483154, "weight": 0.040156055241823196 }, { "diff_generated": -28.114795684814453, "epoch": 1.7141931302657163, "grad_norm": 2.7845599728937867, "learning_rate": 3.701652576750242e-07, "logits/chosen": -2.3712282180786133, "logits/rejected": -2.336479663848877, "logps/chosen": -16.633949279785156, "logps/rejected": -495.52093505859375, "logps_avg/chosen": -0.10134591907262802, "logps_avg/rejected": -2.8114798069000244, "loss": 0.1021, "losses_ref": -0.00033907522447407246, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5290, "u": -3.6014404296875, "weight": 0.05043425410985947 }, { "diff_generated": -27.336872100830078, "epoch": 1.7174335709656514, "grad_norm": 2.6502131059024645, "learning_rate": 3.686614686042906e-07, "logits/chosen": -2.3674819469451904, "logits/rejected": -2.3362417221069336, "logps/chosen": -16.538209915161133, "logps/rejected": -468.26104736328125, "logps_avg/chosen": -0.0999147891998291, "logps_avg/rejected": -2.733687400817871, "loss": 0.1, "losses_ref": -0.00046807853505015373, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5300, "u": -3.6691298484802246, "weight": 0.031879961490631104 }, { "diff_generated": -28.801959991455078, "epoch": 1.7206740116655865, "grad_norm": 2.6100803813734093, "learning_rate": 3.6715812506601493e-07, "logits/chosen": -2.367793560028076, "logits/rejected": -2.2648661136627197, "logps/chosen": -18.021644592285156, "logps/rejected": -510.4642028808594, "logps_avg/chosen": -0.09823264181613922, "logps_avg/rejected": -2.8801960945129395, "loss": 0.0977, "losses_ref": -0.0003415598184801638, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5310, "u": -3.646897554397583, "weight": 0.03793282434344292 }, { "diff_generated": -29.1380615234375, "epoch": 1.7239144523655217, "grad_norm": 2.7772687087152974, "learning_rate": 3.6565524843287526e-07, "logits/chosen": -2.382816791534424, "logits/rejected": -2.3172459602355957, "logps/chosen": -15.904184341430664, "logps/rejected": -503.6893615722656, "logps_avg/chosen": -0.09637071192264557, "logps_avg/rejected": -2.913806438446045, "loss": 0.0989, "losses_ref": -0.00024001784913707525, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5320, "u": -3.5073089599609375, "weight": 0.07528218626976013 }, { "diff_generated": -26.45065689086914, "epoch": 1.7271548930654568, "grad_norm": 2.7264672239761234, "learning_rate": 3.641528600709115e-07, "logits/chosen": -2.3781332969665527, "logits/rejected": -2.3246679306030273, "logps/chosen": -18.484371185302734, "logps/rejected": -476.2315979003906, "logps_avg/chosen": -0.10251567512750626, "logps_avg/rejected": -2.6450653076171875, "loss": 0.0996, "losses_ref": -0.0004218421527184546, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5330, "u": -3.5488994121551514, "weight": 0.06301628053188324 }, { "diff_generated": -26.43978500366211, "epoch": 1.7303953337653921, "grad_norm": 3.075617478340941, "learning_rate": 3.6265098133922277e-07, "logits/chosen": -2.417259931564331, "logits/rejected": -2.3201329708099365, "logps/chosen": -15.935078620910645, "logps/rejected": -465.5621032714844, "logps_avg/chosen": -0.0904788076877594, "logps_avg/rejected": -2.6439785957336426, "loss": 0.0966, "losses_ref": -0.0006274757906794548, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5340, "u": -3.411005735397339, "weight": 0.10097329318523407 }, { "diff_generated": -27.395992279052734, "epoch": 1.7336357744653272, "grad_norm": 2.5422219006330207, "learning_rate": 3.611496335896617e-07, "logits/chosen": -2.378610610961914, "logits/rejected": -2.3410072326660156, "logps/chosen": -16.637414932250977, "logps/rejected": -533.4635009765625, "logps_avg/chosen": -0.095014788210392, "logps_avg/rejected": -2.7395992279052734, "loss": 0.0987, "losses_ref": -0.0004987435531802475, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5350, "u": -3.6030936241149902, "weight": 0.05063430219888687 }, { "diff_generated": -30.47920799255371, "epoch": 1.7368762151652626, "grad_norm": 2.678820876041137, "learning_rate": 3.59648838166533e-07, "logits/chosen": -2.388047456741333, "logits/rejected": -2.3407864570617676, "logps/chosen": -18.07908821105957, "logps/rejected": -551.9603271484375, "logps_avg/chosen": -0.10280290991067886, "logps_avg/rejected": -3.0479209423065186, "loss": 0.1011, "losses_ref": -0.00017604381719138473, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5360, "u": -3.6247005462646484, "weight": 0.0439545139670372 }, { "diff_generated": -28.07529640197754, "epoch": 1.7401166558651977, "grad_norm": 2.6355285338892913, "learning_rate": 3.5814861640628864e-07, "logits/chosen": -2.3373141288757324, "logits/rejected": -2.271507501602173, "logps/chosen": -19.13628578186035, "logps/rejected": -518.8572998046875, "logps_avg/chosen": -0.1072794646024704, "logps_avg/rejected": -2.807529926300049, "loss": 0.1, "losses_ref": -0.0006126166554167867, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5370, "u": -3.452075242996216, "weight": 0.08837278187274933 }, { "diff_generated": -27.70608901977539, "epoch": 1.7433570965651328, "grad_norm": 2.5643449664389344, "learning_rate": 3.5664898963722526e-07, "logits/chosen": -2.325183629989624, "logits/rejected": -2.30161190032959, "logps/chosen": -18.24054718017578, "logps/rejected": -512.8580322265625, "logps_avg/chosen": -0.10216137021780014, "logps_avg/rejected": -2.770608901977539, "loss": 0.0994, "losses_ref": -0.0006244811811484396, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5380, "u": -3.600287675857544, "weight": 0.05084812641143799 }, { "diff_generated": -26.136837005615234, "epoch": 1.746597537265068, "grad_norm": 2.768633325019539, "learning_rate": 3.5514997917918016e-07, "logits/chosen": -2.353252649307251, "logits/rejected": -2.330204486846924, "logps/chosen": -15.285616874694824, "logps/rejected": -486.61785888671875, "logps_avg/chosen": -0.09202093631029129, "logps_avg/rejected": -2.6136839389801025, "loss": 0.0982, "losses_ref": -0.0004441851342562586, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5390, "u": -3.549830675125122, "weight": 0.06307470798492432 }, { "diff_generated": -27.409582138061523, "epoch": 1.7498379779650033, "grad_norm": 2.514774319959278, "learning_rate": 3.536516063432293e-07, "logits/chosen": -2.353785991668701, "logits/rejected": -2.3390090465545654, "logps/chosen": -16.88019561767578, "logps/rejected": -487.05462646484375, "logps_avg/chosen": -0.09791740030050278, "logps_avg/rejected": -2.7409584522247314, "loss": 0.0978, "losses_ref": -0.0005682742339558899, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5400, "u": -3.6471595764160156, "weight": 0.038246989250183105 }, { "diff_generated": -25.251800537109375, "epoch": 1.7530784186649384, "grad_norm": 2.4130679264226096, "learning_rate": 3.5215389243138326e-07, "logits/chosen": -2.342167854309082, "logits/rejected": -2.2756638526916504, "logps/chosen": -20.983112335205078, "logps/rejected": -471.53961181640625, "logps_avg/chosen": -0.11012951284646988, "logps_avg/rejected": -2.525179862976074, "loss": 0.0972, "losses_ref": -0.0012510241940617561, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5410, "u": -3.6464996337890625, "weight": 0.039689868688583374 }, { "diff_generated": -28.358959197998047, "epoch": 1.7563188593648738, "grad_norm": 2.6050020331663473, "learning_rate": 3.50656858736285e-07, "logits/chosen": -2.373044490814209, "logits/rejected": -2.2874221801757812, "logps/chosen": -18.268918991088867, "logps/rejected": -506.542724609375, "logps_avg/chosen": -0.09958215057849884, "logps_avg/rejected": -2.8358960151672363, "loss": 0.1001, "losses_ref": -0.0005597332492470741, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5420, "u": -3.6918044090270996, "weight": 0.025833910331130028 }, { "diff_generated": -25.419437408447266, "epoch": 1.7595593000648089, "grad_norm": 2.5888083689908723, "learning_rate": 3.491605265409073e-07, "logits/chosen": -2.3774361610412598, "logits/rejected": -2.284318447113037, "logps/chosen": -20.87076187133789, "logps/rejected": -468.8209533691406, "logps_avg/chosen": -0.10591413825750351, "logps_avg/rejected": -2.5419440269470215, "loss": 0.098, "losses_ref": -0.00048125721514225006, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5430, "u": -3.5768661499023438, "weight": 0.056924331933259964 }, { "diff_generated": -26.61014175415039, "epoch": 1.762799740764744, "grad_norm": 2.7238365394521344, "learning_rate": 3.4766491711824916e-07, "logits/chosen": -2.3245720863342285, "logits/rejected": -2.2953591346740723, "logps/chosen": -17.809200286865234, "logps/rejected": -473.62841796875, "logps_avg/chosen": -0.10384336858987808, "logps_avg/rejected": -2.6610138416290283, "loss": 0.1013, "losses_ref": -0.0007365869241766632, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5440, "u": -3.5971107482910156, "weight": 0.051128558814525604 }, { "diff_generated": -27.968036651611328, "epoch": 1.7660401814646791, "grad_norm": 2.6465858443300543, "learning_rate": 3.4617005173103497e-07, "logits/chosen": -2.404662847518921, "logits/rejected": -2.319225311279297, "logps/chosen": -17.643014907836914, "logps/rejected": -494.31884765625, "logps_avg/chosen": -0.0980905294418335, "logps_avg/rejected": -2.7968032360076904, "loss": 0.0965, "losses_ref": -0.000339856487698853, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5450, "u": -3.6960651874542236, "weight": 0.02541317604482174 }, { "diff_generated": -27.50223731994629, "epoch": 1.7692806221646142, "grad_norm": 2.439340308045177, "learning_rate": 3.4467595163141056e-07, "logits/chosen": -2.356353282928467, "logits/rejected": -2.3252711296081543, "logps/chosen": -17.11294937133789, "logps/rejected": -481.44329833984375, "logps_avg/chosen": -0.09781317412853241, "logps_avg/rejected": -2.7502236366271973, "loss": 0.0985, "losses_ref": -0.00023348219110630453, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5460, "u": -3.5757458209991455, "weight": 0.0565178282558918 }, { "diff_generated": -28.132930755615234, "epoch": 1.7725210628645496, "grad_norm": 2.57485074500873, "learning_rate": 3.4318263806064244e-07, "logits/chosen": -2.3583288192749023, "logits/rejected": -2.2734169960021973, "logps/chosen": -18.482982635498047, "logps/rejected": -491.0975646972656, "logps_avg/chosen": -0.09854892641305923, "logps_avg/rejected": -2.813292980194092, "loss": 0.1018, "losses_ref": -0.0007904424564912915, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5470, "u": -3.6228435039520264, "weight": 0.04524999111890793 }, { "diff_generated": -28.949413299560547, "epoch": 1.775761503564485, "grad_norm": 2.59795459877875, "learning_rate": 3.4169013224881475e-07, "logits/chosen": -2.405365467071533, "logits/rejected": -2.324328660964966, "logps/chosen": -17.76644515991211, "logps/rejected": -519.6321411132812, "logps_avg/chosen": -0.09652705490589142, "logps_avg/rejected": -2.8949413299560547, "loss": 0.0988, "losses_ref": -0.00027256523026153445, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5480, "u": -3.5291507244110107, "weight": 0.0690760463476181 }, { "diff_generated": -27.986907958984375, "epoch": 1.77900194426442, "grad_norm": 2.6145142118362203, "learning_rate": 3.4019845541452844e-07, "logits/chosen": -2.3335089683532715, "logits/rejected": -2.2344508171081543, "logps/chosen": -16.983856201171875, "logps/rejected": -477.7383728027344, "logps_avg/chosen": -0.09876149892807007, "logps_avg/rejected": -2.7986905574798584, "loss": 0.0998, "losses_ref": -0.00013874072465114295, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5490, "u": -3.554018020629883, "weight": 0.06264077126979828 }, { "diff_generated": -28.678543090820312, "epoch": 1.7822423849643552, "grad_norm": 2.608081553570393, "learning_rate": 3.387076287645985e-07, "logits/chosen": -2.353555917739868, "logits/rejected": -2.283008098602295, "logps/chosen": -17.25589942932129, "logps/rejected": -534.0850830078125, "logps_avg/chosen": -0.09464980661869049, "logps_avg/rejected": -2.867854595184326, "loss": 0.0967, "losses_ref": -0.00037761160638183355, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5500, "u": -3.550675868988037, "weight": 0.06296338140964508 }, { "diff_generated": -27.80132484436035, "epoch": 1.7854828256642903, "grad_norm": 2.402528371744228, "learning_rate": 3.372176734937536e-07, "logits/chosen": -2.3043529987335205, "logits/rejected": -2.2722156047821045, "logps/chosen": -16.18600082397461, "logps/rejected": -531.1876831054688, "logps_avg/chosen": -0.09347482025623322, "logps_avg/rejected": -2.78013277053833, "loss": 0.0965, "losses_ref": -0.001090071047656238, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5510, "u": -3.5538277626037598, "weight": 0.06445705145597458 }, { "diff_generated": -28.665653228759766, "epoch": 1.7887232663642254, "grad_norm": 2.5830371919480215, "learning_rate": 3.3572861078433376e-07, "logits/chosen": -2.359614849090576, "logits/rejected": -2.265977621078491, "logps/chosen": -16.606525421142578, "logps/rejected": -502.6050720214844, "logps_avg/chosen": -0.09027236700057983, "logps_avg/rejected": -2.866565227508545, "loss": 0.098, "losses_ref": -0.00043819882557727396, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5520, "u": -3.527933120727539, "weight": 0.06936118751764297 }, { "diff_generated": -26.51803970336914, "epoch": 1.7919637070641607, "grad_norm": 2.5432017679630983, "learning_rate": 3.3424046180599e-07, "logits/chosen": -2.3557043075561523, "logits/rejected": -2.2849130630493164, "logps/chosen": -16.62330436706543, "logps/rejected": -487.97198486328125, "logps_avg/chosen": -0.09761399030685425, "logps_avg/rejected": -2.651803493499756, "loss": 0.0957, "losses_ref": -0.0005190398078411818, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5530, "u": -3.477447986602783, "weight": 0.08194705098867416 }, { "diff_generated": -28.85833168029785, "epoch": 1.7952041477640959, "grad_norm": 2.684314115645967, "learning_rate": 3.3275324771538273e-07, "logits/chosen": -2.320209264755249, "logits/rejected": -2.2477760314941406, "logps/chosen": -17.556983947753906, "logps/rejected": -518.7302856445312, "logps_avg/chosen": -0.09914490580558777, "logps_avg/rejected": -2.885833263397217, "loss": 0.0976, "losses_ref": -0.00039971404476091266, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5540, "u": -3.5764288902282715, "weight": 0.056789614260196686 }, { "diff_generated": -28.340740203857422, "epoch": 1.7984445884640312, "grad_norm": 2.5980842410538054, "learning_rate": 3.312669896558816e-07, "logits/chosen": -2.3921077251434326, "logits/rejected": -2.2863588333129883, "logps/chosen": -16.911273956298828, "logps/rejected": -481.89947509765625, "logps_avg/chosen": -0.09584518522024155, "logps_avg/rejected": -2.834073781967163, "loss": 0.0971, "losses_ref": -0.0004901793436147273, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5550, "u": -3.549412965774536, "weight": 0.06320427358150482 }, { "diff_generated": -26.314910888671875, "epoch": 1.8016850291639663, "grad_norm": 2.4257047801717784, "learning_rate": 3.2978170875726454e-07, "logits/chosen": -2.371443271636963, "logits/rejected": -2.3299648761749268, "logps/chosen": -15.302289009094238, "logps/rejected": -467.28704833984375, "logps_avg/chosen": -0.08800629526376724, "logps_avg/rejected": -2.631491184234619, "loss": 0.0957, "losses_ref": -0.000557105871848762, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5560, "u": -3.551307201385498, "weight": 0.06321394443511963 }, { "diff_generated": -26.27902603149414, "epoch": 1.8049254698639015, "grad_norm": 2.6868694467477146, "learning_rate": 3.2829742613541704e-07, "logits/chosen": -2.3159596920013428, "logits/rejected": -2.309455633163452, "logps/chosen": -17.738338470458984, "logps/rejected": -523.1011352539062, "logps_avg/chosen": -0.10448671877384186, "logps_avg/rejected": -2.6279029846191406, "loss": 0.0988, "losses_ref": -0.0001589446037542075, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5570, "u": -3.5064120292663574, "weight": 0.07517381012439728 }, { "diff_generated": -29.1507568359375, "epoch": 1.8081659105638366, "grad_norm": 2.6650715531360296, "learning_rate": 3.26814162892033e-07, "logits/chosen": -2.3827013969421387, "logits/rejected": -2.309659719467163, "logps/chosen": -17.93830108642578, "logps/rejected": -545.5929565429688, "logps_avg/chosen": -0.1010148674249649, "logps_avg/rejected": -2.9150757789611816, "loss": 0.0962, "losses_ref": -0.0002275872539030388, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5580, "u": -3.5754737854003906, "weight": 0.056514762341976166 }, { "diff_generated": -26.319509506225586, "epoch": 1.811406351263772, "grad_norm": 2.507425280586255, "learning_rate": 3.2533194011431346e-07, "logits/chosen": -2.357861280441284, "logits/rejected": -2.3017077445983887, "logps/chosen": -16.048376083374023, "logps/rejected": -488.1123962402344, "logps_avg/chosen": -0.09536460041999817, "logps_avg/rejected": -2.631950855255127, "loss": 0.096, "losses_ref": -0.00048077874816954136, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5590, "u": -3.5518956184387207, "weight": 0.06308998167514801 }, { "diff_generated": -29.063892364501953, "epoch": 1.814646791963707, "grad_norm": 2.504515850196088, "learning_rate": 3.2385077887466766e-07, "logits/chosen": -2.395941734313965, "logits/rejected": -2.3485054969787598, "logps/chosen": -17.349185943603516, "logps/rejected": -541.281494140625, "logps_avg/chosen": -0.10105355829000473, "logps_avg/rejected": -2.9063892364501953, "loss": 0.0962, "losses_ref": -0.0004301825538277626, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5600, "u": -3.644620418548584, "weight": 0.03806838393211365 }, { "diff_generated": -27.6377010345459, "epoch": 1.8178872326636424, "grad_norm": 2.4776668300868097, "learning_rate": 3.223707002304131e-07, "logits/chosen": -2.3200387954711914, "logits/rejected": -2.286799192428589, "logps/chosen": -18.38614273071289, "logps/rejected": -511.392822265625, "logps_avg/chosen": -0.10474991798400879, "logps_avg/rejected": -2.7637698650360107, "loss": 0.0991, "losses_ref": -0.00030589301604777575, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5610, "u": -3.505202531814575, "weight": 0.07536722719669342 }, { "diff_generated": -27.71368408203125, "epoch": 1.8211276733635775, "grad_norm": 2.588751701695597, "learning_rate": 3.208917252234765e-07, "logits/chosen": -2.3411502838134766, "logits/rejected": -2.304018259048462, "logps/chosen": -15.185452461242676, "logps/rejected": -518.5407104492188, "logps_avg/chosen": -0.09042085707187653, "logps_avg/rejected": -2.7713687419891357, "loss": 0.0965, "losses_ref": -0.00028060507611371577, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5620, "u": -3.595822811126709, "weight": 0.050334203988313675 }, { "diff_generated": -32.019447326660156, "epoch": 1.8243681140635126, "grad_norm": 2.5938523274013296, "learning_rate": 3.1941387488009396e-07, "logits/chosen": -2.346749782562256, "logits/rejected": -2.261305332183838, "logps/chosen": -17.656230926513672, "logps/rejected": -577.9088134765625, "logps_avg/chosen": -0.10112802684307098, "logps_avg/rejected": -3.201944351196289, "loss": 0.0979, "losses_ref": -0.0008206478087231517, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5630, "u": -3.5457470417022705, "weight": 0.06364034861326218 }, { "diff_generated": -28.698589324951172, "epoch": 1.8276085547634477, "grad_norm": 2.4916285253469663, "learning_rate": 3.179371702105132e-07, "logits/chosen": -2.4086880683898926, "logits/rejected": -2.3404719829559326, "logps/chosen": -19.253461837768555, "logps/rejected": -524.8831176757812, "logps_avg/chosen": -0.10744975507259369, "logps_avg/rejected": -2.869859218597412, "loss": 0.0989, "losses_ref": -0.00035506210406310856, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5640, "u": -3.6465015411376953, "weight": 0.037953395396471024 }, { "diff_generated": -29.409387588500977, "epoch": 1.8308489954633829, "grad_norm": 2.509263833905603, "learning_rate": 3.164616322086936e-07, "logits/chosen": -2.3691985607147217, "logits/rejected": -2.285266876220703, "logps/chosen": -16.659765243530273, "logps/rejected": -507.52618408203125, "logps_avg/chosen": -0.09539072215557098, "logps_avg/rejected": -2.940938949584961, "loss": 0.0985, "losses_ref": -0.0007100877119228244, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5650, "u": -3.6012961864471436, "weight": 0.051039986312389374 }, { "diff_generated": -27.477447509765625, "epoch": 1.8340894361633182, "grad_norm": 2.571417507210119, "learning_rate": 3.1498728185200845e-07, "logits/chosen": -2.403351068496704, "logits/rejected": -2.3440890312194824, "logps/chosen": -19.3876953125, "logps/rejected": -519.7465209960938, "logps_avg/chosen": -0.10591503232717514, "logps_avg/rejected": -2.7477450370788574, "loss": 0.0995, "losses_ref": -0.0003261718084104359, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5660, "u": -3.6660637855529785, "weight": 0.03169737011194229 }, { "diff_generated": -25.8249454498291, "epoch": 1.8373298768632536, "grad_norm": 2.5069798884516152, "learning_rate": 3.1351414010094683e-07, "logits/chosen": -2.328765392303467, "logits/rejected": -2.3035292625427246, "logps/chosen": -18.959909439086914, "logps/rejected": -489.73211669921875, "logps_avg/chosen": -0.1038951650261879, "logps_avg/rejected": -2.5824942588806152, "loss": 0.099, "losses_ref": -0.000567997747566551, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5670, "u": -3.6191246509552, "weight": 0.044458914548158646 }, { "diff_generated": -29.907428741455078, "epoch": 1.8405703175631887, "grad_norm": 2.501875729145946, "learning_rate": 3.120422278988149e-07, "logits/chosen": -2.348712205886841, "logits/rejected": -2.2924861907958984, "logps/chosen": -17.87131118774414, "logps/rejected": -526.6512451171875, "logps_avg/chosen": -0.10166116058826447, "logps_avg/rejected": -2.9907429218292236, "loss": 0.0987, "losses_ref": -0.00026783597422763705, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5680, "u": -3.624166488647461, "weight": 0.04405301436781883 }, { "diff_generated": -27.886672973632812, "epoch": 1.8438107582631238, "grad_norm": 2.566034051408787, "learning_rate": 3.10571566171439e-07, "logits/chosen": -2.371107578277588, "logits/rejected": -2.3411777019500732, "logps/chosen": -18.45060157775879, "logps/rejected": -516.3961181640625, "logps_avg/chosen": -0.10506061464548111, "logps_avg/rejected": -2.7886672019958496, "loss": 0.1, "losses_ref": -0.00029390607960522175, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5690, "u": -3.6481080055236816, "weight": 0.03786667063832283 }, { "diff_generated": -27.870288848876953, "epoch": 1.847051198963059, "grad_norm": 2.809915486187473, "learning_rate": 3.0910217582686756e-07, "logits/chosen": -2.3481717109680176, "logits/rejected": -2.354294776916504, "logps/chosen": -16.52988052368164, "logps/rejected": -515.648193359375, "logps_avg/chosen": -0.09819358587265015, "logps_avg/rejected": -2.7870285511016846, "loss": 0.1008, "losses_ref": -0.0013325114268809557, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5700, "u": -3.5237045288085938, "weight": 0.07092602550983429 }, { "diff_generated": -25.086254119873047, "epoch": 1.850291639662994, "grad_norm": 2.4862422113018505, "learning_rate": 3.0763407775507426e-07, "logits/chosen": -2.4297471046447754, "logits/rejected": -2.395003080368042, "logps/chosen": -17.715723037719727, "logps/rejected": -481.25506591796875, "logps_avg/chosen": -0.09285169094800949, "logps_avg/rejected": -2.5086257457733154, "loss": 0.0971, "losses_ref": -0.00029824889497831464, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5710, "u": -3.4351601600646973, "weight": 0.09410148113965988 }, { "diff_generated": -25.64459800720215, "epoch": 1.8535320803629294, "grad_norm": 2.6048627692959703, "learning_rate": 3.0616729282766037e-07, "logits/chosen": -2.341219425201416, "logits/rejected": -2.346372365951538, "logps/chosen": -16.424030303955078, "logps/rejected": -451.0950622558594, "logps_avg/chosen": -0.09924010932445526, "logps_avg/rejected": -2.564460039138794, "loss": 0.1003, "losses_ref": -0.0005212887190282345, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5720, "u": -3.530282497406006, "weight": 0.06945204734802246 }, { "diff_generated": -27.70074462890625, "epoch": 1.8567725210628645, "grad_norm": 2.6690629016870737, "learning_rate": 3.047018418975593e-07, "logits/chosen": -2.3930373191833496, "logits/rejected": -2.262077808380127, "logps/chosen": -19.89908218383789, "logps/rejected": -474.302734375, "logps_avg/chosen": -0.10280998051166534, "logps_avg/rejected": -2.7700743675231934, "loss": 0.1009, "losses_ref": -0.0002895224606618285, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5730, "u": -3.597874402999878, "weight": 0.050351161509752274 }, { "diff_generated": -27.63687515258789, "epoch": 1.8600129617627998, "grad_norm": 2.6096737058446564, "learning_rate": 3.032377457987385e-07, "logits/chosen": -2.367377758026123, "logits/rejected": -2.329639196395874, "logps/chosen": -17.626415252685547, "logps/rejected": -510.41253662109375, "logps_avg/chosen": -0.10201933234930038, "logps_avg/rejected": -2.7636873722076416, "loss": 0.0959, "losses_ref": -0.0012955269776284695, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5740, "u": -3.5941002368927, "weight": 0.05208515003323555 }, { "diff_generated": -26.940898895263672, "epoch": 1.863253402462735, "grad_norm": 2.788615366488837, "learning_rate": 3.017750253459048e-07, "logits/chosen": -2.3940701484680176, "logits/rejected": -2.3186657428741455, "logps/chosen": -19.841453552246094, "logps/rejected": -506.88275146484375, "logps_avg/chosen": -0.10610809177160263, "logps_avg/rejected": -2.6940901279449463, "loss": 0.1009, "losses_ref": -0.0002789639984257519, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5750, "u": -3.6475138664245605, "weight": 0.03784211724996567 }, { "diff_generated": -27.238052368164062, "epoch": 1.86649384316267, "grad_norm": 2.503504965592765, "learning_rate": 3.003137013342071e-07, "logits/chosen": -2.430534601211548, "logits/rejected": -2.417689323425293, "logps/chosen": -15.796445846557617, "logps/rejected": -508.17730712890625, "logps_avg/chosen": -0.09056727588176727, "logps_avg/rejected": -2.7238051891326904, "loss": 0.0965, "losses_ref": -0.0003995650331489742, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5760, "u": -3.62666392326355, "weight": 0.044256873428821564 }, { "diff_generated": -28.615772247314453, "epoch": 1.8697342838626052, "grad_norm": 2.614017479610386, "learning_rate": 2.9885379453894224e-07, "logits/chosen": -2.4150617122650146, "logits/rejected": -2.37510347366333, "logps/chosen": -16.090747833251953, "logps/rejected": -553.6571044921875, "logps_avg/chosen": -0.09207513183355331, "logps_avg/rejected": -2.861577272415161, "loss": 0.0974, "losses_ref": -0.00022796406119596213, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5770, "u": -3.597252607345581, "weight": 0.05026254802942276 }, { "diff_generated": -27.86861228942871, "epoch": 1.8729747245625405, "grad_norm": 2.6691451138289835, "learning_rate": 2.9739532571525806e-07, "logits/chosen": -2.4372811317443848, "logits/rejected": -2.4184963703155518, "logps/chosen": -16.568439483642578, "logps/rejected": -506.2725524902344, "logps_avg/chosen": -0.09399370849132538, "logps_avg/rejected": -2.7868611812591553, "loss": 0.1029, "losses_ref": -0.0002700109616853297, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5780, "u": -3.6414883136749268, "weight": 0.03782133013010025 }, { "diff_generated": -29.793752670288086, "epoch": 1.8762151652624757, "grad_norm": 2.436557894850836, "learning_rate": 2.959383155978596e-07, "logits/chosen": -2.3705596923828125, "logits/rejected": -2.3041160106658936, "logps/chosen": -16.600704193115234, "logps/rejected": -536.4827880859375, "logps_avg/chosen": -0.09566892683506012, "logps_avg/rejected": -2.9793753623962402, "loss": 0.0985, "losses_ref": -0.00015063644968904555, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5790, "u": -3.5993595123291016, "weight": 0.0501655749976635 }, { "diff_generated": -26.078826904296875, "epoch": 1.879455605962411, "grad_norm": 2.557904930799931, "learning_rate": 2.9448278490071373e-07, "logits/chosen": -2.3876595497131348, "logits/rejected": -2.3382132053375244, "logps/chosen": -16.884082794189453, "logps/rejected": -469.434326171875, "logps_avg/chosen": -0.09592945873737335, "logps_avg/rejected": -2.6078827381134033, "loss": 0.1009, "losses_ref": -0.0003953514969907701, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5800, "u": -3.5042953491210938, "weight": 0.0754755288362503 }, { "diff_generated": -27.1308650970459, "epoch": 1.8826960466623461, "grad_norm": 2.410795608150016, "learning_rate": 2.930287543167544e-07, "logits/chosen": -2.4403939247131348, "logits/rejected": -2.338735342025757, "logps/chosen": -18.41120719909668, "logps/rejected": -498.0870056152344, "logps_avg/chosen": -0.0972508043050766, "logps_avg/rejected": -2.7130866050720215, "loss": 0.0976, "losses_ref": -0.0004312940873205662, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5810, "u": -3.7621655464172363, "weight": 0.006767953280359507 }, { "diff_generated": -30.3148193359375, "epoch": 1.8859364873622813, "grad_norm": 2.53770678947922, "learning_rate": 2.9157624451758944e-07, "logits/chosen": -2.3555285930633545, "logits/rejected": -2.2266926765441895, "logps/chosen": -18.102619171142578, "logps/rejected": -496.95428466796875, "logps_avg/chosen": -0.10375206172466278, "logps_avg/rejected": -3.031481981277466, "loss": 0.0996, "losses_ref": -0.0006220188806764781, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5820, "u": -3.550969362258911, "weight": 0.06340692937374115 }, { "diff_generated": -29.127822875976562, "epoch": 1.8891769280622164, "grad_norm": 2.5479669591981704, "learning_rate": 2.901252761532055e-07, "logits/chosen": -2.4188151359558105, "logits/rejected": -2.309403896331787, "logps/chosen": -17.828176498413086, "logps/rejected": -494.87353515625, "logps_avg/chosen": -0.0992671474814415, "logps_avg/rejected": -2.9127821922302246, "loss": 0.0965, "losses_ref": -0.0007391165709123015, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5830, "u": -3.646193742752075, "weight": 0.038570336997509 }, { "diff_generated": -27.222660064697266, "epoch": 1.8924173687621515, "grad_norm": 2.7100293576269094, "learning_rate": 2.8867586985167523e-07, "logits/chosen": -2.4463648796081543, "logits/rejected": -2.3553476333618164, "logps/chosen": -17.450244903564453, "logps/rejected": -484.1527404785156, "logps_avg/chosen": -0.0959417000412941, "logps_avg/rejected": -2.72226619720459, "loss": 0.0961, "losses_ref": -0.00026212536613456905, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5840, "u": -3.525402784347534, "weight": 0.06906180083751678 }, { "diff_generated": -29.353763580322266, "epoch": 1.8956578094620868, "grad_norm": 2.5836151874470787, "learning_rate": 2.8722804621886364e-07, "logits/chosen": -2.404148817062378, "logits/rejected": -2.3284482955932617, "logps/chosen": -18.062631607055664, "logps/rejected": -501.10595703125, "logps_avg/chosen": -0.09879143536090851, "logps_avg/rejected": -2.9353766441345215, "loss": 0.1012, "losses_ref": -0.0004886888200417161, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5850, "u": -3.6259491443634033, "weight": 0.04446321353316307 }, { "diff_generated": -28.762012481689453, "epoch": 1.8988982501620222, "grad_norm": 2.5550213842894003, "learning_rate": 2.857818258381358e-07, "logits/chosen": -2.3610546588897705, "logits/rejected": -2.2954795360565186, "logps/chosen": -18.180984497070312, "logps/rejected": -513.5320434570312, "logps_avg/chosen": -0.10295577347278595, "logps_avg/rejected": -2.8762011528015137, "loss": 0.101, "losses_ref": -0.00014926650328561664, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5860, "u": -3.6503257751464844, "weight": 0.03767084330320358 }, { "diff_generated": -29.47927474975586, "epoch": 1.9021386908619573, "grad_norm": 2.4124126862845583, "learning_rate": 2.8433722927006314e-07, "logits/chosen": -2.413966178894043, "logits/rejected": -2.357156991958618, "logps/chosen": -19.497705459594727, "logps/rejected": -542.2984619140625, "logps_avg/chosen": -0.11012458801269531, "logps_avg/rejected": -2.947927474975586, "loss": 0.1002, "losses_ref": -0.0004483603988774121, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5870, "u": -3.6454529762268066, "weight": 0.03807947039604187 }, { "diff_generated": -28.179845809936523, "epoch": 1.9053791315618924, "grad_norm": 2.6754838770049605, "learning_rate": 2.82894277052132e-07, "logits/chosen": -2.382157564163208, "logits/rejected": -2.352442502975464, "logps/chosen": -16.77843475341797, "logps/rejected": -531.6185913085938, "logps_avg/chosen": -0.09153401851654053, "logps_avg/rejected": -2.8179848194122314, "loss": 0.0968, "losses_ref": -0.0003780314582400024, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5880, "u": -3.5757038593292236, "weight": 0.05669553950428963 }, { "diff_generated": -28.038406372070312, "epoch": 1.9086195722618275, "grad_norm": 2.618243243916407, "learning_rate": 2.814529896984514e-07, "logits/chosen": -2.344517469406128, "logits/rejected": -2.2761929035186768, "logps/chosen": -18.029022216796875, "logps/rejected": -504.73651123046875, "logps_avg/chosen": -0.09274528920650482, "logps_avg/rejected": -2.8038411140441895, "loss": 0.0987, "losses_ref": -0.000521031382959336, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5890, "u": -3.5262343883514404, "weight": 0.06953036785125732 }, { "diff_generated": -26.303661346435547, "epoch": 1.9118600129617627, "grad_norm": 2.780793568277981, "learning_rate": 2.8001338769946126e-07, "logits/chosen": -2.3630564212799072, "logits/rejected": -2.257866621017456, "logps/chosen": -17.455060958862305, "logps/rejected": -465.14990234375, "logps_avg/chosen": -0.09545596688985825, "logps_avg/rejected": -2.630366563796997, "loss": 0.1005, "losses_ref": -0.0006079275044612586, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5900, "u": -3.531212329864502, "weight": 0.06961339712142944 }, { "diff_generated": -28.460521697998047, "epoch": 1.915100453661698, "grad_norm": 2.420976109120161, "learning_rate": 2.7857549152164153e-07, "logits/chosen": -2.4027175903320312, "logits/rejected": -2.3249917030334473, "logps/chosen": -14.792211532592773, "logps/rejected": -504.2300720214844, "logps_avg/chosen": -0.08837342262268066, "logps_avg/rejected": -2.846052646636963, "loss": 0.0971, "losses_ref": -0.0005176137783564627, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5910, "u": -3.5528805255889893, "weight": 0.06331194937229156 }, { "diff_generated": -28.62518310546875, "epoch": 1.9183408943616331, "grad_norm": 2.471182929487115, "learning_rate": 2.7713932160722043e-07, "logits/chosen": -2.344817876815796, "logits/rejected": -2.3178930282592773, "logps/chosen": -17.537582397460938, "logps/rejected": -524.8299560546875, "logps_avg/chosen": -0.10338269174098969, "logps_avg/rejected": -2.862518787384033, "loss": 0.0984, "losses_ref": -0.0008483555866405368, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5920, "u": -3.669346332550049, "weight": 0.032698508352041245 }, { "diff_generated": -30.585668563842773, "epoch": 1.9215813350615685, "grad_norm": 2.4739113244660285, "learning_rate": 2.757048983738847e-07, "logits/chosen": -2.399620771408081, "logits/rejected": -2.3443222045898438, "logps/chosen": -17.10749626159668, "logps/rejected": -557.9705810546875, "logps_avg/chosen": -0.09897866100072861, "logps_avg/rejected": -3.0585668087005615, "loss": 0.0957, "losses_ref": -0.0004411758854985237, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5930, "u": -3.6455721855163574, "weight": 0.03806128352880478 }, { "diff_generated": -26.73529624938965, "epoch": 1.9248217757615036, "grad_norm": 2.570894429245623, "learning_rate": 2.742722422144885e-07, "logits/chosen": -2.394821882247925, "logits/rejected": -2.3548684120178223, "logps/chosen": -18.255279541015625, "logps/rejected": -517.0158081054688, "logps_avg/chosen": -0.10357420146465302, "logps_avg/rejected": -2.673529624938965, "loss": 0.0971, "losses_ref": -0.0006127547239884734, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5940, "u": -3.576965808868408, "weight": 0.057065822184085846 }, { "diff_generated": -28.054443359375, "epoch": 1.9280622164614387, "grad_norm": 2.5841664834314546, "learning_rate": 2.7284137349676466e-07, "logits/chosen": -2.325408697128296, "logits/rejected": -2.256453275680542, "logps/chosen": -16.044095993041992, "logps/rejected": -539.1276245117188, "logps_avg/chosen": -0.09099126607179642, "logps_avg/rejected": -2.8054440021514893, "loss": 0.0979, "losses_ref": -0.0003426524926908314, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5950, "u": -3.50434947013855, "weight": 0.07543531805276871 }, { "diff_generated": -28.748315811157227, "epoch": 1.9313026571613738, "grad_norm": 2.596492156896544, "learning_rate": 2.7141231256303343e-07, "logits/chosen": -2.355212688446045, "logits/rejected": -2.3038063049316406, "logps/chosen": -19.689380645751953, "logps/rejected": -532.0252685546875, "logps_avg/chosen": -0.11005760729312897, "logps_avg/rejected": -2.8748316764831543, "loss": 0.102, "losses_ref": -0.0007306236075237393, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5960, "u": -3.6479029655456543, "weight": 0.03859782963991165 }, { "diff_generated": -28.284626007080078, "epoch": 1.9345430978613092, "grad_norm": 2.5146143330713686, "learning_rate": 2.69985079729915e-07, "logits/chosen": -2.369210720062256, "logits/rejected": -2.283693790435791, "logps/chosen": -17.795032501220703, "logps/rejected": -495.90472412109375, "logps_avg/chosen": -0.09974905848503113, "logps_avg/rejected": -2.828463077545166, "loss": 0.0998, "losses_ref": -0.0007459482294507325, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5970, "u": -3.5771775245666504, "weight": 0.05748031288385391 }, { "diff_generated": -29.488983154296875, "epoch": 1.9377835385612443, "grad_norm": 2.8211061840508513, "learning_rate": 2.6855969528803945e-07, "logits/chosen": -2.379024028778076, "logits/rejected": -2.3205015659332275, "logps/chosen": -18.999738693237305, "logps/rejected": -525.9266967773438, "logps_avg/chosen": -0.09963471442461014, "logps_avg/rejected": -2.9488985538482666, "loss": 0.1008, "losses_ref": -0.0004714926762972027, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5980, "u": -3.623253583908081, "weight": 0.04440300166606903 }, { "diff_generated": -28.684600830078125, "epoch": 1.9410239792611796, "grad_norm": 2.4773623060647436, "learning_rate": 2.6713617950175903e-07, "logits/chosen": -2.339857578277588, "logits/rejected": -2.2977325916290283, "logps/chosen": -15.516563415527344, "logps/rejected": -526.1066284179688, "logps_avg/chosen": -0.0898626446723938, "logps_avg/rejected": -2.868460178375244, "loss": 0.097, "losses_ref": -0.0008933226345106959, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5990, "u": -3.5968101024627686, "weight": 0.051392458379268646 }, { "diff_generated": -25.407421112060547, "epoch": 1.9442644199611148, "grad_norm": 2.562426557717335, "learning_rate": 2.657145526088593e-07, "logits/chosen": -2.334357500076294, "logits/rejected": -2.325047016143799, "logps/chosen": -17.33317756652832, "logps/rejected": -470.50018310546875, "logps_avg/chosen": -0.09964416176080704, "logps_avg/rejected": -2.5407423973083496, "loss": 0.0967, "losses_ref": -0.00018075959815178066, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6000, "u": -3.4317283630371094, "weight": 0.0939575582742691 }, { "diff_generated": -30.041431427001953, "epoch": 1.9475048606610499, "grad_norm": 2.4632218445410867, "learning_rate": 2.6429483482027243e-07, "logits/chosen": -2.3687427043914795, "logits/rejected": -2.3227016925811768, "logps/chosen": -17.55387306213379, "logps/rejected": -546.512451171875, "logps_avg/chosen": -0.10221491754055023, "logps_avg/rejected": -3.004142999649048, "loss": 0.0989, "losses_ref": -0.00030045019229874015, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6010, "u": -3.62287974357605, "weight": 0.0441289022564888 }, { "diff_generated": -29.048913955688477, "epoch": 1.950745301360985, "grad_norm": 2.551806434254762, "learning_rate": 2.628770463197889e-07, "logits/chosen": -2.429622173309326, "logits/rejected": -2.3124306201934814, "logps/chosen": -17.99391746520996, "logps/rejected": -529.9110717773438, "logps_avg/chosen": -0.09834562987089157, "logps_avg/rejected": -2.904891014099121, "loss": 0.0982, "losses_ref": -0.0004878188483417034, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6020, "u": -3.5515480041503906, "weight": 0.0631481185555458 }, { "diff_generated": -27.4799747467041, "epoch": 1.9539857420609201, "grad_norm": 2.3886479332440596, "learning_rate": 2.6146120726377103e-07, "logits/chosen": -2.291303873062134, "logits/rejected": -2.2781028747558594, "logps/chosen": -15.211416244506836, "logps/rejected": -499.1487731933594, "logps_avg/chosen": -0.09400470554828644, "logps_avg/rejected": -2.747997522354126, "loss": 0.0972, "losses_ref": -0.00037772621726617217, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6030, "u": -3.458329677581787, "weight": 0.08798156678676605 }, { "diff_generated": -30.40256118774414, "epoch": 1.9572261827608555, "grad_norm": 2.6555707656206167, "learning_rate": 2.600473377808667e-07, "logits/chosen": -2.3612496852874756, "logits/rejected": -2.2357964515686035, "logps/chosen": -17.25994110107422, "logps/rejected": -531.9276123046875, "logps_avg/chosen": -0.1040547713637352, "logps_avg/rejected": -3.0402560234069824, "loss": 0.0965, "losses_ref": -0.0002499911352060735, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6040, "u": -3.5274498462677, "weight": 0.0690465196967125 }, { "diff_generated": -31.004383087158203, "epoch": 1.9604666234607908, "grad_norm": 2.400620684294024, "learning_rate": 2.5863545797172226e-07, "logits/chosen": -2.3689889907836914, "logits/rejected": -2.2837939262390137, "logps/chosen": -18.03851890563965, "logps/rejected": -553.7256469726562, "logps_avg/chosen": -0.10278002917766571, "logps_avg/rejected": -3.100438117980957, "loss": 0.0994, "losses_ref": -0.0005681588081642985, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6050, "u": -3.5772476196289062, "weight": 0.057164423167705536 }, { "diff_generated": -28.485675811767578, "epoch": 1.963707064160726, "grad_norm": 2.4873118516999924, "learning_rate": 2.5722558790869786e-07, "logits/chosen": -2.335010528564453, "logits/rejected": -2.248538017272949, "logps/chosen": -18.01328468322754, "logps/rejected": -488.8133239746094, "logps_avg/chosen": -0.09517219662666321, "logps_avg/rejected": -2.848567485809326, "loss": 0.0946, "losses_ref": -0.00048435464850626886, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6060, "u": -3.477914333343506, "weight": 0.08194047212600708 }, { "diff_generated": -30.61606216430664, "epoch": 1.966947504860661, "grad_norm": 2.7205134584585933, "learning_rate": 2.558177476355812e-07, "logits/chosen": -2.380079507827759, "logits/rejected": -2.3212497234344482, "logps/chosen": -19.776500701904297, "logps/rejected": -590.427978515625, "logps_avg/chosen": -0.1100943312048912, "logps_avg/rejected": -3.0616061687469482, "loss": 0.0968, "losses_ref": -0.00031202996615320444, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6070, "u": -3.69507098197937, "weight": 0.025381267070770264 }, { "diff_generated": -27.755123138427734, "epoch": 1.9701879455605962, "grad_norm": 2.558741965970259, "learning_rate": 2.544119571673031e-07, "logits/chosen": -2.401541233062744, "logits/rejected": -2.357109785079956, "logps/chosen": -17.667585372924805, "logps/rejected": -539.28515625, "logps_avg/chosen": -0.09603724628686905, "logps_avg/rejected": -2.7755126953125, "loss": 0.0962, "losses_ref": -0.00046056945575401187, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6080, "u": -3.648192882537842, "weight": 0.03804687783122063 }, { "diff_generated": -28.202579498291016, "epoch": 1.9734283862605313, "grad_norm": 2.5918846208989015, "learning_rate": 2.5300823648965267e-07, "logits/chosen": -2.3541946411132812, "logits/rejected": -2.293105363845825, "logps/chosen": -16.276870727539062, "logps/rejected": -532.5204467773438, "logps_avg/chosen": -0.0957450419664383, "logps_avg/rejected": -2.8202579021453857, "loss": 0.0986, "losses_ref": -0.0005978021072223783, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6090, "u": -3.6244893074035645, "weight": 0.04452654346823692 }, { "diff_generated": -27.33587074279785, "epoch": 1.9766688269604666, "grad_norm": 2.5232128616627265, "learning_rate": 2.516066055589937e-07, "logits/chosen": -2.3826522827148438, "logits/rejected": -2.3479719161987305, "logps/chosen": -15.652276992797852, "logps/rejected": -512.0379028320312, "logps_avg/chosen": -0.09057996422052383, "logps_avg/rejected": -2.7335870265960693, "loss": 0.0972, "losses_ref": -0.00041287043131887913, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6100, "u": -3.5540382862091064, "weight": 0.0630248486995697 }, { "diff_generated": -29.23862075805664, "epoch": 1.9799092676604018, "grad_norm": 2.5030703804011942, "learning_rate": 2.502070843019799e-07, "logits/chosen": -2.3702540397644043, "logits/rejected": -2.3024659156799316, "logps/chosen": -18.456167221069336, "logps/rejected": -559.3988037109375, "logps_avg/chosen": -0.10194142907857895, "logps_avg/rejected": -2.9238619804382324, "loss": 0.0962, "losses_ref": -0.00033079044078476727, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6110, "u": -3.645400285720825, "weight": 0.03791101649403572 }, { "diff_generated": -29.87836265563965, "epoch": 1.983149708360337, "grad_norm": 2.4895773624046775, "learning_rate": 2.4880969261527294e-07, "logits/chosen": -2.3824191093444824, "logits/rejected": -2.306915044784546, "logps/chosen": -17.128381729125977, "logps/rejected": -556.4266357421875, "logps_avg/chosen": -0.10241828113794327, "logps_avg/rejected": -2.9878363609313965, "loss": 0.098, "losses_ref": -0.0001866271486505866, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6120, "u": -3.646498918533325, "weight": 0.03771054744720459 }, { "diff_generated": -27.77744483947754, "epoch": 1.9863901490602722, "grad_norm": 2.5199031859632006, "learning_rate": 2.4741445036525814e-07, "logits/chosen": -2.3641562461853027, "logits/rejected": -2.2675235271453857, "logps/chosen": -15.651080131530762, "logps/rejected": -518.16455078125, "logps_avg/chosen": -0.08747779577970505, "logps_avg/rejected": -2.7777445316314697, "loss": 0.0958, "losses_ref": -0.00010507024126127362, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6130, "u": -3.459134340286255, "weight": 0.08761118352413177 }, { "diff_generated": -26.81686782836914, "epoch": 1.9896305897602073, "grad_norm": 2.64875778356808, "learning_rate": 2.460213773877635e-07, "logits/chosen": -2.3199706077575684, "logits/rejected": -2.2571868896484375, "logps/chosen": -16.348854064941406, "logps/rejected": -492.2103576660156, "logps_avg/chosen": -0.09274972975254059, "logps_avg/rejected": -2.6816866397857666, "loss": 0.0976, "losses_ref": -0.0004981858073733747, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6140, "u": -3.4069740772247314, "weight": 0.10068309307098389 }, { "diff_generated": -29.773651123046875, "epoch": 1.9928710304601425, "grad_norm": 2.7060223991553105, "learning_rate": 2.4463049348777666e-07, "logits/chosen": -2.354555130004883, "logits/rejected": -2.282043933868408, "logps/chosen": -16.243932723999023, "logps/rejected": -546.9548950195312, "logps_avg/chosen": -0.0912802442908287, "logps_avg/rejected": -2.977365016937256, "loss": 0.0966, "losses_ref": -0.0002333047305000946, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6150, "u": -3.646449327468872, "weight": 0.0378076434135437 }, { "diff_generated": -27.507165908813477, "epoch": 1.9961114711600778, "grad_norm": 2.655346958804383, "learning_rate": 2.4324181843916364e-07, "logits/chosen": -2.380687952041626, "logits/rejected": -2.311513662338257, "logps/chosen": -20.26241111755371, "logps/rejected": -494.68878173828125, "logps_avg/chosen": -0.10843686014413834, "logps_avg/rejected": -2.7507166862487793, "loss": 0.0975, "losses_ref": -0.0007293138187378645, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6160, "u": -3.4976515769958496, "weight": 0.07613696157932281 }, { "diff_generated": -28.97373390197754, "epoch": 1.999351911860013, "grad_norm": 2.574664793901018, "learning_rate": 2.4185537198438777e-07, "logits/chosen": -2.4231228828430176, "logits/rejected": -2.3481459617614746, "logps/chosen": -18.296947479248047, "logps/rejected": -531.1096801757812, "logps_avg/chosen": -0.10054130852222443, "logps_avg/rejected": -2.8973731994628906, "loss": 0.096, "losses_ref": -0.0007058627670630813, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6170, "u": -3.6683974266052246, "weight": 0.032234203070402145 }, { "diff_generated": -30.53542709350586, "epoch": 2.0025923525599483, "grad_norm": 2.5901636980927556, "learning_rate": 2.40471173834229e-07, "logits/chosen": -2.397902488708496, "logits/rejected": -2.3140671253204346, "logps/chosen": -15.657752990722656, "logps/rejected": -562.3306884765625, "logps_avg/chosen": -0.08636067062616348, "logps_avg/rejected": -3.0535435676574707, "loss": 0.0829, "losses_ref": -0.0028026457875967026, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6180, "u": -5.210684776306152, "weight": 0.02268834412097931 }, { "diff_generated": -29.695453643798828, "epoch": 2.0058327932598834, "grad_norm": 2.571727590975468, "learning_rate": 2.3908924366750385e-07, "logits/chosen": -2.336836338043213, "logits/rejected": -2.2530758380889893, "logps/chosen": -13.952987670898438, "logps/rejected": -537.3453369140625, "logps_avg/chosen": -0.08065802603960037, "logps_avg/rejected": -2.969545364379883, "loss": 0.079, "losses_ref": -0.0006570112309418619, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6190, "u": -5.256657600402832, "weight": 0.07569855451583862 }, { "diff_generated": -31.76747703552246, "epoch": 2.0090732339598185, "grad_norm": 2.9585220072637743, "learning_rate": 2.3770960113078505e-07, "logits/chosen": -2.3309884071350098, "logits/rejected": -2.2648239135742188, "logps/chosen": -13.648565292358398, "logps/rejected": -609.6402587890625, "logps_avg/chosen": -0.07571685314178467, "logps_avg/rejected": -3.1767475605010986, "loss": 0.0789, "losses_ref": -0.0007576612988486886, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6200, "u": -5.403063774108887, "weight": 0.05089284107089043 }, { "diff_generated": -29.608722686767578, "epoch": 2.0123136746597536, "grad_norm": 2.567397987321688, "learning_rate": 2.3633226583812304e-07, "logits/chosen": -2.3135323524475098, "logits/rejected": -2.234470844268799, "logps/chosen": -14.518350601196289, "logps/rejected": -518.7788696289062, "logps_avg/chosen": -0.08325570076704025, "logps_avg/rejected": -2.960872173309326, "loss": 0.0773, "losses_ref": -0.0017503865528851748, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6210, "u": -5.509909629821777, "weight": 0.033318065106868744 }, { "diff_generated": -30.771066665649414, "epoch": 2.0155541153596888, "grad_norm": 2.4664748917173296, "learning_rate": 2.3495725737076642e-07, "logits/chosen": -2.3625543117523193, "logits/rejected": -2.2188992500305176, "logps/chosen": -14.710256576538086, "logps/rejected": -569.1174926757812, "logps_avg/chosen": -0.07622543722391129, "logps_avg/rejected": -3.077106475830078, "loss": 0.0782, "losses_ref": -0.0008851033635437489, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6220, "u": -5.327384948730469, "weight": 0.06347521394491196 }, { "diff_generated": -29.374542236328125, "epoch": 2.0187945560596243, "grad_norm": 2.5104772335097487, "learning_rate": 2.3358459527688432e-07, "logits/chosen": -2.334970235824585, "logits/rejected": -2.214024066925049, "logps/chosen": -14.989812850952148, "logps/rejected": -542.8701171875, "logps_avg/chosen": -0.08332401514053345, "logps_avg/rejected": -2.9374542236328125, "loss": 0.0797, "losses_ref": -0.0009830545168370008, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6230, "u": -5.472678184509277, "weight": 0.03868403658270836 }, { "diff_generated": -31.6467227935791, "epoch": 2.0220349967595594, "grad_norm": 2.592959712448732, "learning_rate": 2.3221429907128734e-07, "logits/chosen": -2.3337042331695557, "logits/rejected": -2.2146174907684326, "logps/chosen": -14.46679973602295, "logps/rejected": -598.3749389648438, "logps_avg/chosen": -0.08240757882595062, "logps_avg/rejected": -3.1646721363067627, "loss": 0.0776, "losses_ref": -0.0014417509082704782, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6240, "u": -5.394257545471191, "weight": 0.051792167127132416 }, { "diff_generated": -31.339035034179688, "epoch": 2.0252754374594946, "grad_norm": 2.70418171358979, "learning_rate": 2.3084638823515136e-07, "logits/chosen": -2.305694103240967, "logits/rejected": -2.170424699783325, "logps/chosen": -13.748693466186523, "logps/rejected": -589.9194946289062, "logps_avg/chosen": -0.07391176372766495, "logps_avg/rejected": -3.1339030265808105, "loss": 0.0768, "losses_ref": -0.0010343308094888926, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6250, "u": -5.400518417358398, "weight": 0.0511666014790535 }, { "diff_generated": -32.47716522216797, "epoch": 2.0285158781594297, "grad_norm": 2.7810488017581885, "learning_rate": 2.2948088221573986e-07, "logits/chosen": -2.3060505390167236, "logits/rejected": -2.139094352722168, "logps/chosen": -17.142841339111328, "logps/rejected": -604.8924560546875, "logps_avg/chosen": -0.09249739348888397, "logps_avg/rejected": -3.2477169036865234, "loss": 0.0789, "losses_ref": -0.0014247202780097723, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6260, "u": -5.4678754806518555, "weight": 0.03933858126401901 }, { "diff_generated": -32.207603454589844, "epoch": 2.031756318859365, "grad_norm": 2.6916719515805796, "learning_rate": 2.2811780042612753e-07, "logits/chosen": -2.2889606952667236, "logits/rejected": -2.150083065032959, "logps/chosen": -14.05456256866455, "logps/rejected": -600.9343872070312, "logps_avg/chosen": -0.07899312674999237, "logps_avg/rejected": -3.2207603454589844, "loss": 0.0787, "losses_ref": -0.0009017119882628322, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6270, "u": -5.3306355476379395, "weight": 0.06367681920528412 }, { "diff_generated": -32.79245376586914, "epoch": 2.0349967595593, "grad_norm": 2.7245432916785584, "learning_rate": 2.267571622449246e-07, "logits/chosen": -2.302232265472412, "logits/rejected": -2.132932662963867, "logps/chosen": -14.357002258300781, "logps/rejected": -575.6806030273438, "logps_avg/chosen": -0.0796835646033287, "logps_avg/rejected": -3.279245376586914, "loss": 0.0793, "losses_ref": -0.00011451655154814944, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6280, "u": -5.25557279586792, "weight": 0.07510305941104889 }, { "diff_generated": -31.563732147216797, "epoch": 2.038237200259235, "grad_norm": 2.6092658044234533, "learning_rate": 2.2539898701600082e-07, "logits/chosen": -2.2864041328430176, "logits/rejected": -2.1257951259613037, "logps/chosen": -14.23120403289795, "logps/rejected": -566.61083984375, "logps_avg/chosen": -0.07742153108119965, "logps_avg/rejected": -3.1563732624053955, "loss": 0.0769, "losses_ref": -0.0016743981977924705, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6290, "u": -5.29136323928833, "weight": 0.0709114819765091 }, { "diff_generated": -33.81085968017578, "epoch": 2.0414776409591706, "grad_norm": 2.707014491753683, "learning_rate": 2.2404329404821086e-07, "logits/chosen": -2.279618740081787, "logits/rejected": -2.113089084625244, "logps/chosen": -15.665722846984863, "logps/rejected": -586.9880981445312, "logps_avg/chosen": -0.08939939737319946, "logps_avg/rejected": -3.3810858726501465, "loss": 0.08, "losses_ref": -0.00024769414449110627, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6300, "u": -5.396568298339844, "weight": 0.05024506896734238 }, { "diff_generated": -30.7529296875, "epoch": 2.0447180816591057, "grad_norm": 2.9484328369759427, "learning_rate": 2.2269010261511974e-07, "logits/chosen": -2.31408429145813, "logits/rejected": -2.150662899017334, "logps/chosen": -16.079818725585938, "logps/rejected": -563.0985107421875, "logps_avg/chosen": -0.07982766628265381, "logps_avg/rejected": -3.0752930641174316, "loss": 0.0779, "losses_ref": -0.0020618215203285217, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6310, "u": -5.365875720977783, "weight": 0.05885094404220581 }, { "diff_generated": -31.242420196533203, "epoch": 2.047958522359041, "grad_norm": 2.8357529765475378, "learning_rate": 2.2133943195472874e-07, "logits/chosen": -2.3051440715789795, "logits/rejected": -2.2008893489837646, "logps/chosen": -13.617289543151855, "logps/rejected": -574.6190185546875, "logps_avg/chosen": -0.07658557593822479, "logps_avg/rejected": -3.124242067337036, "loss": 0.076, "losses_ref": -0.0007927521946839988, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6320, "u": -5.217470169067383, "weight": 0.08214583247900009 }, { "diff_generated": -33.446495056152344, "epoch": 2.051198963058976, "grad_norm": 2.792880239658355, "learning_rate": 2.1999130126920158e-07, "logits/chosen": -2.334336519241333, "logits/rejected": -2.1936802864074707, "logps/chosen": -13.958539962768555, "logps/rejected": -610.8575439453125, "logps_avg/chosen": -0.08001341670751572, "logps_avg/rejected": -3.3446497917175293, "loss": 0.0772, "losses_ref": -0.0007364677148871124, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6330, "u": -5.504420757293701, "weight": 0.03204908221960068 }, { "diff_generated": -33.31673049926758, "epoch": 2.054439403758911, "grad_norm": 2.765456961949322, "learning_rate": 2.1864572972459228e-07, "logits/chosen": -2.305427074432373, "logits/rejected": -2.1972203254699707, "logps/chosen": -11.824773788452148, "logps/rejected": -606.9254150390625, "logps_avg/chosen": -0.06999178230762482, "logps_avg/rejected": -3.3316726684570312, "loss": 0.0784, "losses_ref": -0.00028468476375564933, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6340, "u": -5.2944512367248535, "weight": 0.06903310865163803 }, { "diff_generated": -33.31203842163086, "epoch": 2.057679844458846, "grad_norm": 2.7025491580315513, "learning_rate": 2.1730273645057173e-07, "logits/chosen": -2.2650606632232666, "logits/rejected": -2.115473508834839, "logps/chosen": -15.001439094543457, "logps/rejected": -572.0594482421875, "logps_avg/chosen": -0.08333877474069595, "logps_avg/rejected": -3.3312039375305176, "loss": 0.0755, "losses_ref": -0.0007068266859278083, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6350, "u": -5.579641819000244, "weight": 0.019526129588484764 }, { "diff_generated": -31.222393035888672, "epoch": 2.060920285158782, "grad_norm": 2.8444818657075537, "learning_rate": 2.1596234054015654e-07, "logits/chosen": -2.2948861122131348, "logits/rejected": -2.157444715499878, "logps/chosen": -14.057197570800781, "logps/rejected": -567.6610107421875, "logps_avg/chosen": -0.08014727383852005, "logps_avg/rejected": -3.122239589691162, "loss": 0.0787, "losses_ref": -0.000226367570576258, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6360, "u": -5.466059684753418, "weight": 0.03770763427019119 }, { "diff_generated": -32.59468078613281, "epoch": 2.064160725858717, "grad_norm": 2.7306308341749213, "learning_rate": 2.1462456104943692e-07, "logits/chosen": -2.2609167098999023, "logits/rejected": -2.0981216430664062, "logps/chosen": -13.265954971313477, "logps/rejected": -601.2452392578125, "logps_avg/chosen": -0.07434628903865814, "logps_avg/rejected": -3.259467601776123, "loss": 0.0775, "losses_ref": -0.00033033458748832345, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6370, "u": -5.3675761222839355, "weight": 0.05661793425679207 }, { "diff_generated": -31.082448959350586, "epoch": 2.067401166558652, "grad_norm": 2.7488718393815987, "learning_rate": 2.132894169973063e-07, "logits/chosen": -2.330197811126709, "logits/rejected": -2.1672775745391846, "logps/chosen": -14.261502265930176, "logps/rejected": -547.7105712890625, "logps_avg/chosen": -0.07912299036979675, "logps_avg/rejected": -3.1082448959350586, "loss": 0.0793, "losses_ref": -0.00029022307717241347, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6380, "u": -5.365514278411865, "weight": 0.05652255937457085 }, { "diff_generated": -32.763587951660156, "epoch": 2.070641607258587, "grad_norm": 2.7081410301841835, "learning_rate": 2.1195692736519013e-07, "logits/chosen": -2.313753128051758, "logits/rejected": -2.0885937213897705, "logps/chosen": -14.403570175170898, "logps/rejected": -634.0228271484375, "logps_avg/chosen": -0.08032946288585663, "logps_avg/rejected": -3.2763583660125732, "loss": 0.0798, "losses_ref": -0.0003196417819708586, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6390, "u": -5.469879150390625, "weight": 0.037800583988428116 }, { "diff_generated": -32.15003967285156, "epoch": 2.0738820479585223, "grad_norm": 2.4964061534649042, "learning_rate": 2.1062711109677757e-07, "logits/chosen": -2.3299503326416016, "logits/rejected": -2.1573243141174316, "logps/chosen": -14.105905532836914, "logps/rejected": -579.3314208984375, "logps_avg/chosen": -0.0796867087483406, "logps_avg/rejected": -3.2150039672851562, "loss": 0.0768, "losses_ref": -0.00031392709934152663, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6400, "u": -5.433914661407471, "weight": 0.04409373179078102 }, { "diff_generated": -32.85166931152344, "epoch": 2.0771224886584574, "grad_norm": 2.6173716641305895, "learning_rate": 2.0929998709775068e-07, "logits/chosen": -2.301513433456421, "logits/rejected": -2.072667121887207, "logps/chosen": -14.194091796875, "logps/rejected": -551.7338256835938, "logps_avg/chosen": -0.07612624764442444, "logps_avg/rejected": -3.2851669788360596, "loss": 0.0764, "losses_ref": -0.0004419487959239632, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6410, "u": -5.298669815063477, "weight": 0.06921719759702682 }, { "diff_generated": -33.96429443359375, "epoch": 2.080362929358393, "grad_norm": 2.798558040379388, "learning_rate": 2.0797557423551574e-07, "logits/chosen": -2.304396152496338, "logits/rejected": -2.1104958057403564, "logps/chosen": -14.470054626464844, "logps/rejected": -656.9310302734375, "logps_avg/chosen": -0.07742521166801453, "logps_avg/rejected": -3.3964295387268066, "loss": 0.0779, "losses_ref": -0.0004889139672741294, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6420, "u": -5.361597537994385, "weight": 0.056789349764585495 }, { "diff_generated": -32.24937057495117, "epoch": 2.083603370058328, "grad_norm": 2.6907956388834, "learning_rate": 2.066538913389361e-07, "logits/chosen": -2.2897346019744873, "logits/rejected": -2.14884614944458, "logps/chosen": -14.133687019348145, "logps/rejected": -573.21923828125, "logps_avg/chosen": -0.07917702943086624, "logps_avg/rejected": -3.2249374389648438, "loss": 0.0789, "losses_ref": -0.0005835826741531491, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6430, "u": -5.470171928405762, "weight": 0.038123439997434616 }, { "diff_generated": -33.30463790893555, "epoch": 2.086843810758263, "grad_norm": 3.1474384018069403, "learning_rate": 2.053349571980635e-07, "logits/chosen": -2.3388473987579346, "logits/rejected": -2.123534679412842, "logps/chosen": -13.677331924438477, "logps/rejected": -579.9287719726562, "logps_avg/chosen": -0.07830803096294403, "logps_avg/rejected": -3.3304641246795654, "loss": 0.0768, "losses_ref": -0.00024560894235037267, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6440, "u": -5.5442609786987305, "weight": 0.025242527946829796 }, { "diff_generated": -35.16442108154297, "epoch": 2.0900842514581983, "grad_norm": 2.5503238617943627, "learning_rate": 2.0401879056387155e-07, "logits/chosen": -2.2443032264709473, "logits/rejected": -2.050220489501953, "logps/chosen": -13.342799186706543, "logps/rejected": -631.4598388671875, "logps_avg/chosen": -0.07115190476179123, "logps_avg/rejected": -3.516442060470581, "loss": 0.0781, "losses_ref": -0.0018637517932802439, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6450, "u": -5.328794002532959, "weight": 0.06499633193016052 }, { "diff_generated": -31.10691261291504, "epoch": 2.0933246921581334, "grad_norm": 2.955648628374549, "learning_rate": 2.0270541014798864e-07, "logits/chosen": -2.252042531967163, "logits/rejected": -2.037557601928711, "logps/chosen": -13.579818725585938, "logps/rejected": -583.5672607421875, "logps_avg/chosen": -0.07687920331954956, "logps_avg/rejected": -3.1106910705566406, "loss": 0.0786, "losses_ref": -0.0009676915360614657, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6460, "u": -5.223965167999268, "weight": 0.08236159384250641 }, { "diff_generated": -34.064735412597656, "epoch": 2.0965651328580686, "grad_norm": 2.6542132953205413, "learning_rate": 2.0139483462243225e-07, "logits/chosen": -2.227311372756958, "logits/rejected": -2.0686700344085693, "logps/chosen": -12.253379821777344, "logps/rejected": -615.8561401367188, "logps_avg/chosen": -0.07306130230426788, "logps_avg/rejected": -3.4064738750457764, "loss": 0.079, "losses_ref": -0.0022037180606275797, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6470, "u": -5.328154563903809, "weight": 0.06612833589315414 }, { "diff_generated": -33.95252990722656, "epoch": 2.0998055735580037, "grad_norm": 2.9236702346226657, "learning_rate": 2.00087082619343e-07, "logits/chosen": -2.2646069526672363, "logits/rejected": -2.100008487701416, "logps/chosen": -13.059414863586426, "logps/rejected": -631.9442138671875, "logps_avg/chosen": -0.08072508871555328, "logps_avg/rejected": -3.3952529430389404, "loss": 0.0782, "losses_ref": -0.0003588471154216677, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6480, "u": -5.472105979919434, "weight": 0.03786256164312363 }, { "diff_generated": -32.90629196166992, "epoch": 2.1030460142579392, "grad_norm": 2.673428354153091, "learning_rate": 1.9878217273072116e-07, "logits/chosen": -2.2418177127838135, "logits/rejected": -2.12835431098938, "logps/chosen": -12.385138511657715, "logps/rejected": -594.1697387695312, "logps_avg/chosen": -0.07691224664449692, "logps_avg/rejected": -3.2906289100646973, "loss": 0.0781, "losses_ref": -0.0013034009607508779, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6490, "u": -5.400871753692627, "weight": 0.05187705159187317 }, { "diff_generated": -30.338708877563477, "epoch": 2.1062864549578744, "grad_norm": 2.824418049212177, "learning_rate": 1.974801235081602e-07, "logits/chosen": -2.232614517211914, "logits/rejected": -2.101982831954956, "logps/chosen": -13.843696594238281, "logps/rejected": -548.92724609375, "logps_avg/chosen": -0.08000337332487106, "logps_avg/rejected": -3.0338706970214844, "loss": 0.0799, "losses_ref": -0.00043142749927937984, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6500, "u": -5.224959373474121, "weight": 0.08169388771057129 }, { "diff_generated": -31.706974029541016, "epoch": 2.1095268956578095, "grad_norm": 2.9654927434221228, "learning_rate": 1.9618095346258485e-07, "logits/chosen": -2.2640607357025146, "logits/rejected": -2.08363938331604, "logps/chosen": -13.451878547668457, "logps/rejected": -562.90966796875, "logps_avg/chosen": -0.07569324225187302, "logps_avg/rejected": -3.1706976890563965, "loss": 0.079, "losses_ref": -0.0006684382678940892, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6510, "u": -5.258929252624512, "weight": 0.07574840635061264 }, { "diff_generated": -30.676361083984375, "epoch": 2.1127673363577446, "grad_norm": 2.8414460475953054, "learning_rate": 1.948846810639871e-07, "logits/chosen": -2.3207340240478516, "logits/rejected": -2.166395664215088, "logps/chosen": -15.973104476928711, "logps/rejected": -601.5938720703125, "logps_avg/chosen": -0.08310873061418533, "logps_avg/rejected": -3.067636251449585, "loss": 0.0812, "losses_ref": -0.0016424820059910417, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6520, "u": -5.4342474937438965, "weight": 0.04602918028831482 }, { "diff_generated": -33.77393341064453, "epoch": 2.1160077770576797, "grad_norm": 2.633707010619091, "learning_rate": 1.9359132474116374e-07, "logits/chosen": -2.316953420639038, "logits/rejected": -2.1647286415100098, "logps/chosen": -14.174545288085938, "logps/rejected": -579.6941528320312, "logps_avg/chosen": -0.07961226999759674, "logps_avg/rejected": -3.3773937225341797, "loss": 0.079, "losses_ref": -0.001250475412234664, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6530, "u": -5.469130039215088, "weight": 0.039084821939468384 }, { "diff_generated": -31.735015869140625, "epoch": 2.119248217757615, "grad_norm": 2.719005475319987, "learning_rate": 1.923009028814545e-07, "logits/chosen": -2.298225164413452, "logits/rejected": -2.190268039703369, "logps/chosen": -14.34514331817627, "logps/rejected": -585.7277221679688, "logps_avg/chosen": -0.08426286280155182, "logps_avg/rejected": -3.173501491546631, "loss": 0.0797, "losses_ref": -0.0006787871243432164, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6540, "u": -5.432929515838623, "weight": 0.044521529227495193 }, { "diff_generated": -32.375831604003906, "epoch": 2.1224886584575504, "grad_norm": 2.7209691896498986, "learning_rate": 1.910134338304804e-07, "logits/chosen": -2.2743377685546875, "logits/rejected": -2.1104390621185303, "logps/chosen": -14.415522575378418, "logps/rejected": -597.8012084960938, "logps_avg/chosen": -0.08089585602283478, "logps_avg/rejected": -3.237583637237549, "loss": 0.0771, "losses_ref": -0.001353858271613717, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6550, "u": -5.330113410949707, "weight": 0.06419380009174347 }, { "diff_generated": -32.30270767211914, "epoch": 2.1257290991574855, "grad_norm": 2.7649372131038845, "learning_rate": 1.897289358918834e-07, "logits/chosen": -2.2590532302856445, "logits/rejected": -2.0955405235290527, "logps/chosen": -13.316218376159668, "logps/rejected": -587.7503662109375, "logps_avg/chosen": -0.07635252177715302, "logps_avg/rejected": -3.230271100997925, "loss": 0.0771, "losses_ref": -0.0015015669632703066, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6560, "u": -5.469440460205078, "weight": 0.039542146027088165 }, { "diff_generated": -30.48178482055664, "epoch": 2.1289695398574207, "grad_norm": 3.028113436370566, "learning_rate": 1.8844742732706508e-07, "logits/chosen": -2.26723051071167, "logits/rejected": -2.130545139312744, "logps/chosen": -13.461906433105469, "logps/rejected": -548.2975463867188, "logps_avg/chosen": -0.07495727390050888, "logps_avg/rejected": -3.0481784343719482, "loss": 0.0752, "losses_ref": -0.0005471274489536881, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6570, "u": -5.119858741760254, "weight": 0.10057337582111359 }, { "diff_generated": -33.4208869934082, "epoch": 2.1322099805573558, "grad_norm": 2.774153777490152, "learning_rate": 1.8716892635492906e-07, "logits/chosen": -2.309311628341675, "logits/rejected": -2.1527256965637207, "logps/chosen": -13.22758674621582, "logps/rejected": -617.5025634765625, "logps_avg/chosen": -0.07590781897306442, "logps_avg/rejected": -3.342088222503662, "loss": 0.075, "losses_ref": -0.0011381434742361307, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6580, "u": -5.4658942222595215, "weight": 0.0390189066529274 }, { "diff_generated": -32.68239974975586, "epoch": 2.135450421257291, "grad_norm": 2.9090982333609197, "learning_rate": 1.8589345115161948e-07, "logits/chosen": -2.3101859092712402, "logits/rejected": -2.113839626312256, "logps/chosen": -14.117950439453125, "logps/rejected": -594.4866333007812, "logps_avg/chosen": -0.07787175476551056, "logps_avg/rejected": -3.268240451812744, "loss": 0.0768, "losses_ref": -0.00029373442521318793, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6590, "u": -5.369537830352783, "weight": 0.05655078962445259 }, { "diff_generated": -30.384124755859375, "epoch": 2.138690861957226, "grad_norm": 2.5312553909221207, "learning_rate": 1.846210198502646e-07, "logits/chosen": -2.303065776824951, "logits/rejected": -2.1762404441833496, "logps/chosen": -12.848559379577637, "logps/rejected": -539.0001220703125, "logps_avg/chosen": -0.0744701474905014, "logps_avg/rejected": -3.038412570953369, "loss": 0.0765, "losses_ref": -0.0014434943441301584, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6600, "u": -5.364091873168945, "weight": 0.05824242904782295 }, { "diff_generated": -31.471532821655273, "epoch": 2.141931302657161, "grad_norm": 2.7702712125169553, "learning_rate": 1.8335165054071795e-07, "logits/chosen": -2.2743592262268066, "logits/rejected": -2.2176575660705566, "logps/chosen": -12.365696907043457, "logps/rejected": -598.3367309570312, "logps_avg/chosen": -0.07541408389806747, "logps_avg/rejected": -3.14715313911438, "loss": 0.0776, "losses_ref": -0.0008119974518194795, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6610, "u": -5.437493801116943, "weight": 0.044645097106695175 }, { "diff_generated": -35.88068389892578, "epoch": 2.1451717433570967, "grad_norm": 2.8827133227774215, "learning_rate": 1.8208536126930173e-07, "logits/chosen": -2.31424617767334, "logits/rejected": -2.1438095569610596, "logps/chosen": -14.181668281555176, "logps/rejected": -659.5066528320312, "logps_avg/chosen": -0.08195123821496964, "logps_avg/rejected": -3.5880680084228516, "loss": 0.08, "losses_ref": -0.00035342806950211525, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6620, "u": -5.541591644287109, "weight": 0.02537021040916443 }, { "diff_generated": -32.14451599121094, "epoch": 2.148412184057032, "grad_norm": 2.6764558944119945, "learning_rate": 1.8082217003854933e-07, "logits/chosen": -2.3081860542297363, "logits/rejected": -2.1434569358825684, "logps/chosen": -13.4950590133667, "logps/rejected": -587.8795776367188, "logps_avg/chosen": -0.0746789425611496, "logps_avg/rejected": -3.214451551437378, "loss": 0.078, "losses_ref": -0.001070145284757018, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6630, "u": -5.292096138000488, "weight": 0.07006745040416718 }, { "diff_generated": -34.20409393310547, "epoch": 2.151652624756967, "grad_norm": 2.7582898602691706, "learning_rate": 1.7956209480695087e-07, "logits/chosen": -2.2990448474884033, "logits/rejected": -2.1080923080444336, "logps/chosen": -13.895889282226562, "logps/rejected": -613.466552734375, "logps_avg/chosen": -0.08254117518663406, "logps_avg/rejected": -3.4204094409942627, "loss": 0.0778, "losses_ref": -0.000966493331361562, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6640, "u": -5.470082759857178, "weight": 0.038777273148298264 }, { "diff_generated": -29.56864356994629, "epoch": 2.154893065456902, "grad_norm": 2.933211681518209, "learning_rate": 1.7830515348869664e-07, "logits/chosen": -2.2532479763031006, "logits/rejected": -2.14927339553833, "logps/chosen": -13.479177474975586, "logps/rejected": -545.1463012695312, "logps_avg/chosen": -0.07876866310834885, "logps_avg/rejected": -2.956864356994629, "loss": 0.0791, "losses_ref": -0.0007795925484970212, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6650, "u": -5.326550006866455, "weight": 0.06341485679149628 }, { "diff_generated": -30.022064208984375, "epoch": 2.158133506156837, "grad_norm": 2.5585770502772993, "learning_rate": 1.770513639534225e-07, "logits/chosen": -2.2722833156585693, "logits/rejected": -2.1155576705932617, "logps/chosen": -13.819868087768555, "logps/rejected": -539.40966796875, "logps_avg/chosen": -0.075398750603199, "logps_avg/rejected": -3.002206325531006, "loss": 0.0774, "losses_ref": -0.0006787824677303433, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6660, "u": -5.14738655090332, "weight": 0.09448657929897308 }, { "diff_generated": -31.8614559173584, "epoch": 2.1613739468567728, "grad_norm": 2.840762893548993, "learning_rate": 1.7580074402595698e-07, "logits/chosen": -2.2734367847442627, "logits/rejected": -2.13392972946167, "logps/chosen": -12.674986839294434, "logps/rejected": -590.9808349609375, "logps_avg/chosen": -0.06906630098819733, "logps_avg/rejected": -3.186145305633545, "loss": 0.0756, "losses_ref": -0.0005319962510839105, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6670, "u": -5.151522159576416, "weight": 0.09430833160877228 }, { "diff_generated": -34.026451110839844, "epoch": 2.164614387556708, "grad_norm": 2.7539821387470984, "learning_rate": 1.7455331148606618e-07, "logits/chosen": -2.2525477409362793, "logits/rejected": -2.076921224594116, "logps/chosen": -13.29094123840332, "logps/rejected": -592.8287353515625, "logps_avg/chosen": -0.07590679079294205, "logps_avg/rejected": -3.4026455879211426, "loss": 0.0746, "losses_ref": -0.00040179031202569604, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6680, "u": -5.470437526702881, "weight": 0.03791480511426926 }, { "diff_generated": -30.604236602783203, "epoch": 2.167854828256643, "grad_norm": 2.6701230239114015, "learning_rate": 1.7330908406820237e-07, "logits/chosen": -2.276629686355591, "logits/rejected": -2.1089656352996826, "logps/chosen": -12.869527816772461, "logps/rejected": -558.6658325195312, "logps_avg/chosen": -0.07457348704338074, "logps_avg/rejected": -3.0604236125946045, "loss": 0.0775, "losses_ref": -0.00016242492711171508, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6690, "u": -5.397185325622559, "weight": 0.05015290901064873 }, { "diff_generated": -31.642303466796875, "epoch": 2.171095268956578, "grad_norm": 2.7561051877511447, "learning_rate": 1.7206807946125123e-07, "logits/chosen": -2.289897918701172, "logits/rejected": -2.0907204151153564, "logps/chosen": -14.516186714172363, "logps/rejected": -578.0810546875, "logps_avg/chosen": -0.07717464864253998, "logps_avg/rejected": -3.1642303466796875, "loss": 0.0782, "losses_ref": -0.0008535755914635956, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6700, "u": -5.4019575119018555, "weight": 0.05095864459872246 }, { "diff_generated": -32.917266845703125, "epoch": 2.1743357096565132, "grad_norm": 2.7672422197678537, "learning_rate": 1.7083031530828072e-07, "logits/chosen": -2.2950797080993652, "logits/rejected": -2.0778796672821045, "logps/chosen": -15.119161605834961, "logps/rejected": -578.9310913085938, "logps_avg/chosen": -0.08268582075834274, "logps_avg/rejected": -3.291726589202881, "loss": 0.0775, "losses_ref": -0.0006864489405415952, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6710, "u": -5.572969436645508, "weight": 0.01962578110396862 }, { "diff_generated": -33.86672592163086, "epoch": 2.1775761503564484, "grad_norm": 2.7065540236942005, "learning_rate": 1.6959580920628937e-07, "logits/chosen": -2.2735941410064697, "logits/rejected": -2.1080164909362793, "logps/chosen": -13.885465621948242, "logps/rejected": -576.6862182617188, "logps_avg/chosen": -0.08365106582641602, "logps_avg/rejected": -3.386672258377075, "loss": 0.0789, "losses_ref": -0.0004785112105309963, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6720, "u": -5.434020042419434, "weight": 0.04426304250955582 }, { "diff_generated": -32.36909866333008, "epoch": 2.1808165910563835, "grad_norm": 2.8643963575772133, "learning_rate": 1.6836457870595783e-07, "logits/chosen": -2.240960121154785, "logits/rejected": -2.0428242683410645, "logps/chosen": -13.639554977416992, "logps/rejected": -571.4022216796875, "logps_avg/chosen": -0.07686988264322281, "logps_avg/rejected": -3.236909866333008, "loss": 0.0766, "losses_ref": -0.0023015381302684546, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6730, "u": -5.292568683624268, "weight": 0.07232372462749481 }, { "diff_generated": -34.247276306152344, "epoch": 2.184057031756319, "grad_norm": 2.896275016681993, "learning_rate": 1.6713664131139723e-07, "logits/chosen": -2.252135753631592, "logits/rejected": -2.0358219146728516, "logps/chosen": -13.957839965820312, "logps/rejected": -590.4846801757812, "logps_avg/chosen": -0.07644806802272797, "logps_avg/rejected": -3.42472767829895, "loss": 0.0786, "losses_ref": -0.00032314620329998434, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6740, "u": -5.4735212326049805, "weight": 0.03785574808716774 }, { "diff_generated": -30.658884048461914, "epoch": 2.187297472456254, "grad_norm": 3.0652940579384653, "learning_rate": 1.659120144799019e-07, "logits/chosen": -2.3066296577453613, "logits/rejected": -2.1544699668884277, "logps/chosen": -13.968292236328125, "logps/rejected": -547.006591796875, "logps_avg/chosen": -0.08002481609582901, "logps_avg/rejected": -3.0658886432647705, "loss": 0.078, "losses_ref": -0.00048600314767099917, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6750, "u": -5.3647356033325195, "weight": 0.0567636601626873 }, { "diff_generated": -31.262588500976562, "epoch": 2.1905379131561893, "grad_norm": 2.728896191443824, "learning_rate": 1.6469071562170114e-07, "logits/chosen": -2.3213558197021484, "logits/rejected": -2.1392080783843994, "logps/chosen": -13.566169738769531, "logps/rejected": -572.1866455078125, "logps_avg/chosen": -0.07140545547008514, "logps_avg/rejected": -3.1262588500976562, "loss": 0.0769, "losses_ref": -0.0004521248338278383, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6760, "u": -5.292697429656982, "weight": 0.06922182440757751 }, { "diff_generated": -31.948001861572266, "epoch": 2.1937783538561244, "grad_norm": 2.7531998569337452, "learning_rate": 1.6347276209971024e-07, "logits/chosen": -2.285494804382324, "logits/rejected": -2.1591503620147705, "logps/chosen": -12.3690185546875, "logps/rejected": -594.0634765625, "logps_avg/chosen": -0.07330231368541718, "logps_avg/rejected": -3.19480037689209, "loss": 0.0763, "losses_ref": -0.0008386679110117257, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6770, "u": -5.331079959869385, "weight": 0.06344042718410492 }, { "diff_generated": -33.420738220214844, "epoch": 2.1970187945560595, "grad_norm": 2.9338438350002782, "learning_rate": 1.6225817122928534e-07, "logits/chosen": -2.2678208351135254, "logits/rejected": -2.0875954627990723, "logps/chosen": -12.525259017944336, "logps/rejected": -606.2054443359375, "logps_avg/chosen": -0.07486884295940399, "logps_avg/rejected": -3.342073917388916, "loss": 0.077, "losses_ref": -0.000290944502921775, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6780, "u": -5.362326145172119, "weight": 0.05652598291635513 }, { "diff_generated": -32.1254997253418, "epoch": 2.2002592352559946, "grad_norm": 2.675527870675733, "learning_rate": 1.6104696027797635e-07, "logits/chosen": -2.206605911254883, "logits/rejected": -2.1036322116851807, "logps/chosen": -12.653177261352539, "logps/rejected": -598.8486938476562, "logps_avg/chosen": -0.07566668093204498, "logps_avg/rejected": -3.2125496864318848, "loss": 0.076, "losses_ref": -0.00034772203071042895, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6790, "u": -5.33026123046875, "weight": 0.06287422776222229 }, { "diff_generated": -31.460948944091797, "epoch": 2.20349967595593, "grad_norm": 2.816219731944842, "learning_rate": 1.5983914646528193e-07, "logits/chosen": -2.2932732105255127, "logits/rejected": -2.10170578956604, "logps/chosen": -14.501579284667969, "logps/rejected": -575.257568359375, "logps_avg/chosen": -0.08044446259737015, "logps_avg/rejected": -3.146095037460327, "loss": 0.077, "losses_ref": -0.0013265017187222838, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6800, "u": -5.328404903411865, "weight": 0.06457807123661041 }, { "diff_generated": -31.099594116210938, "epoch": 2.2067401166558653, "grad_norm": 2.6429539812751006, "learning_rate": 1.5863474696240365e-07, "logits/chosen": -2.329437255859375, "logits/rejected": -2.1275200843811035, "logps/chosen": -14.690841674804688, "logps/rejected": -565.6914672851562, "logps_avg/chosen": -0.07997091859579086, "logps_avg/rejected": -3.109959125518799, "loss": 0.0778, "losses_ref": -0.003032374195754528, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6810, "u": -5.390047550201416, "weight": 0.056095026433467865 }, { "diff_generated": -31.575679779052734, "epoch": 2.2099805573558005, "grad_norm": 2.812628607977736, "learning_rate": 1.5743377889200388e-07, "logits/chosen": -2.274228572845459, "logits/rejected": -2.1092209815979004, "logps/chosen": -14.860143661499023, "logps/rejected": -590.8905639648438, "logps_avg/chosen": -0.08572975546121597, "logps_avg/rejected": -3.1575682163238525, "loss": 0.0802, "losses_ref": -0.0005182913737371564, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6820, "u": -5.398507118225098, "weight": 0.05052924156188965 }, { "diff_generated": -32.276546478271484, "epoch": 2.2132209980557356, "grad_norm": 2.7600270080887004, "learning_rate": 1.5623625932795994e-07, "logits/chosen": -2.312800645828247, "logits/rejected": -2.0970990657806396, "logps/chosen": -14.993402481079102, "logps/rejected": -598.1676025390625, "logps_avg/chosen": -0.08526170253753662, "logps_avg/rejected": -3.2276546955108643, "loss": 0.0805, "losses_ref": -0.0005118830013088882, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6830, "u": -5.5376434326171875, "weight": 0.025591537356376648 }, { "diff_generated": -32.459495544433594, "epoch": 2.2164614387556707, "grad_norm": 3.2774846208784205, "learning_rate": 1.5504220529512324e-07, "logits/chosen": -2.2891764640808105, "logits/rejected": -2.115154981613159, "logps/chosen": -13.242490768432617, "logps/rejected": -590.5769653320312, "logps_avg/chosen": -0.07609592378139496, "logps_avg/rejected": -3.2459495067596436, "loss": 0.0796, "losses_ref": -0.0008921163389459252, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6840, "u": -5.542930603027344, "weight": 0.02612874284386635 }, { "diff_generated": -33.76580047607422, "epoch": 2.219701879455606, "grad_norm": 2.747927789084302, "learning_rate": 1.5385163376907636e-07, "logits/chosen": -2.3085949420928955, "logits/rejected": -2.064383029937744, "logps/chosen": -14.200657844543457, "logps/rejected": -614.0256958007812, "logps_avg/chosen": -0.08223428577184677, "logps_avg/rejected": -3.376579761505127, "loss": 0.0762, "losses_ref": -0.0007276682299561799, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6850, "u": -5.363381862640381, "weight": 0.05715851113200188 }, { "diff_generated": -32.7970085144043, "epoch": 2.222942320155541, "grad_norm": 2.788225867383144, "learning_rate": 1.526645616758921e-07, "logits/chosen": -2.235722064971924, "logits/rejected": -2.0513463020324707, "logps/chosen": -13.610944747924805, "logps/rejected": -576.8745727539062, "logps_avg/chosen": -0.08273743093013763, "logps_avg/rejected": -3.279700517654419, "loss": 0.0777, "losses_ref": -0.0009066167986020446, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6860, "u": -5.363965034484863, "weight": 0.05731652304530144 }, { "diff_generated": -33.81819152832031, "epoch": 2.2261827608554765, "grad_norm": 2.8476275963836217, "learning_rate": 1.5148100589189205e-07, "logits/chosen": -2.322516918182373, "logits/rejected": -2.0998401641845703, "logps/chosen": -15.207572937011719, "logps/rejected": -619.1622924804688, "logps_avg/chosen": -0.07979702204465866, "logps_avg/rejected": -3.381819248199463, "loss": 0.0791, "losses_ref": -0.00029921572422608733, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6870, "u": -5.4724297523498535, "weight": 0.03779328987002373 }, { "diff_generated": -34.35366439819336, "epoch": 2.2294232015554116, "grad_norm": 2.843435625809698, "learning_rate": 1.5030098324340808e-07, "logits/chosen": -2.3158698081970215, "logits/rejected": -2.1082563400268555, "logps/chosen": -13.8373384475708, "logps/rejected": -618.6031494140625, "logps_avg/chosen": -0.07845751941204071, "logps_avg/rejected": -3.43536639213562, "loss": 0.0762, "losses_ref": -0.00038599842810072005, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6880, "u": -5.506179332733154, "weight": 0.03169042617082596 }, { "diff_generated": -34.03045654296875, "epoch": 2.2326636422553467, "grad_norm": 2.8953107407337026, "learning_rate": 1.491245105065419e-07, "logits/chosen": -2.3265156745910645, "logits/rejected": -2.1272525787353516, "logps/chosen": -13.283559799194336, "logps/rejected": -629.44775390625, "logps_avg/chosen": -0.07462642341852188, "logps_avg/rejected": -3.403046131134033, "loss": 0.0784, "losses_ref": -0.0001504389219917357, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6890, "u": -5.471314430236816, "weight": 0.03763309493660927 }, { "diff_generated": -30.7342586517334, "epoch": 2.235904082955282, "grad_norm": 2.9007177421575587, "learning_rate": 1.4795160440692672e-07, "logits/chosen": -2.345430374145508, "logits/rejected": -2.146914005279541, "logps/chosen": -14.193719863891602, "logps/rejected": -549.6990356445312, "logps_avg/chosen": -0.08124233782291412, "logps_avg/rejected": -3.0734260082244873, "loss": 0.0782, "losses_ref": -0.0012097887229174376, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6900, "u": -5.5394673347473145, "weight": 0.02644205465912819 }, { "diff_generated": -32.822933197021484, "epoch": 2.239144523655217, "grad_norm": 2.8833451708893, "learning_rate": 1.467822816194904e-07, "logits/chosen": -2.3125786781311035, "logits/rejected": -2.111271381378174, "logps/chosen": -13.782983779907227, "logps/rejected": -604.3697509765625, "logps_avg/chosen": -0.07867380231618881, "logps_avg/rejected": -3.2822933197021484, "loss": 0.0761, "losses_ref": -0.0003412619116716087, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6910, "u": -5.401597499847412, "weight": 0.05035555362701416 }, { "diff_generated": -31.626598358154297, "epoch": 2.242384964355152, "grad_norm": 2.814659419342255, "learning_rate": 1.4561655876821694e-07, "logits/chosen": -2.2330501079559326, "logits/rejected": -2.104743480682373, "logps/chosen": -13.798090934753418, "logps/rejected": -591.9242553710938, "logps_avg/chosen": -0.07953254878520966, "logps_avg/rejected": -3.1626598834991455, "loss": 0.0789, "losses_ref": -0.00016892193525563926, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6920, "u": -5.404656410217285, "weight": 0.05016558617353439 }, { "diff_generated": -32.88931655883789, "epoch": 2.2456254050550877, "grad_norm": 2.896216572470763, "learning_rate": 1.4445445242591138e-07, "logits/chosen": -2.2789807319641113, "logits/rejected": -2.1155471801757812, "logps/chosen": -13.4810209274292, "logps/rejected": -598.04150390625, "logps_avg/chosen": -0.0773623138666153, "logps_avg/rejected": -3.2889316082000732, "loss": 0.0785, "losses_ref": -0.0006119104218669236, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6930, "u": -5.398166656494141, "weight": 0.05070475861430168 }, { "diff_generated": -32.10064697265625, "epoch": 2.248865845755023, "grad_norm": 2.743282441118848, "learning_rate": 1.4329597911396362e-07, "logits/chosen": -2.32409930229187, "logits/rejected": -2.1036975383758545, "logps/chosen": -15.0281400680542, "logps/rejected": -578.3131103515625, "logps_avg/chosen": -0.07574650645256042, "logps_avg/rejected": -3.210064649581909, "loss": 0.0765, "losses_ref": -0.00012708675058092922, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6940, "u": -5.294000148773193, "weight": 0.06886385381221771 }, { "diff_generated": -33.120174407958984, "epoch": 2.252106286454958, "grad_norm": 2.8897561216334315, "learning_rate": 1.421411553021137e-07, "logits/chosen": -2.3341989517211914, "logits/rejected": -2.1409738063812256, "logps/chosen": -15.546142578125, "logps/rejected": -595.2667236328125, "logps_avg/chosen": -0.08349694311618805, "logps_avg/rejected": -3.3120174407958984, "loss": 0.0773, "losses_ref": -0.0012247232953086495, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6950, "u": -5.466020584106445, "weight": 0.039082638919353485 }, { "diff_generated": -30.71538734436035, "epoch": 2.255346727154893, "grad_norm": 2.859122418072979, "learning_rate": 1.4098999740821716e-07, "logits/chosen": -2.2431702613830566, "logits/rejected": -2.063222646713257, "logps/chosen": -13.041366577148438, "logps/rejected": -560.9396362304688, "logps_avg/chosen": -0.07067658007144928, "logps_avg/rejected": -3.0715386867523193, "loss": 0.0769, "losses_ref": -0.0007479506894014776, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6960, "u": -5.294276237487793, "weight": 0.06961636990308762 }, { "diff_generated": -32.428977966308594, "epoch": 2.258587167854828, "grad_norm": 2.877870001625762, "learning_rate": 1.3984252179801277e-07, "logits/chosen": -2.2853071689605713, "logits/rejected": -2.106396198272705, "logps/chosen": -14.881681442260742, "logps/rejected": -623.0894775390625, "logps_avg/chosen": -0.08217870444059372, "logps_avg/rejected": -3.2428977489471436, "loss": 0.0791, "losses_ref": -0.00023555834195576608, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6970, "u": -5.364887237548828, "weight": 0.05648752301931381 }, { "diff_generated": -32.93585968017578, "epoch": 2.2618276085547633, "grad_norm": 2.717148340358031, "learning_rate": 1.3869874478488846e-07, "logits/chosen": -2.26002836227417, "logits/rejected": -2.120847225189209, "logps/chosen": -12.688130378723145, "logps/rejected": -582.8048706054688, "logps_avg/chosen": -0.07913817465305328, "logps_avg/rejected": -3.293586015701294, "loss": 0.0759, "losses_ref": -0.0017868172144517303, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6980, "u": -5.436811923980713, "weight": 0.04643816873431206 }, { "diff_generated": -33.98379135131836, "epoch": 2.2650680492546984, "grad_norm": 2.9089997065506754, "learning_rate": 1.3755868262965047e-07, "logits/chosen": -2.333571434020996, "logits/rejected": -2.081078052520752, "logps/chosen": -13.215835571289062, "logps/rejected": -569.4586181640625, "logps_avg/chosen": -0.07337544113397598, "logps_avg/rejected": -3.39837908744812, "loss": 0.0772, "losses_ref": -0.0013759381836280227, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6990, "u": -5.292944431304932, "weight": 0.07080823183059692 }, { "diff_generated": -35.02412033081055, "epoch": 2.268308489954634, "grad_norm": 2.8189637995994863, "learning_rate": 1.3642235154029172e-07, "logits/chosen": -2.306802272796631, "logits/rejected": -2.036937713623047, "logps/chosen": -15.109029769897461, "logps/rejected": -626.7437744140625, "logps_avg/chosen": -0.07939799129962921, "logps_avg/rejected": -3.5024120807647705, "loss": 0.0762, "losses_ref": -0.0005451668985188007, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7000, "u": -5.360957145690918, "weight": 0.05685426667332649 }, { "diff_generated": -34.771018981933594, "epoch": 2.271548930654569, "grad_norm": 2.8519990889263025, "learning_rate": 1.352897676717614e-07, "logits/chosen": -2.3075461387634277, "logits/rejected": -2.081054210662842, "logps/chosen": -14.556310653686523, "logps/rejected": -605.6522216796875, "logps_avg/chosen": -0.08247552812099457, "logps_avg/rejected": -3.4771018028259277, "loss": 0.0812, "losses_ref": -0.0002231432154076174, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7010, "u": -5.5091047286987305, "weight": 0.03146742656826973 }, { "diff_generated": -35.00230026245117, "epoch": 2.274789371354504, "grad_norm": 2.8569966796193853, "learning_rate": 1.341609471257354e-07, "logits/chosen": -2.2501139640808105, "logits/rejected": -2.0715503692626953, "logps/chosen": -12.842994689941406, "logps/rejected": -641.0640869140625, "logps_avg/chosen": -0.07570213079452515, "logps_avg/rejected": -3.500230312347412, "loss": 0.0768, "losses_ref": -6.805827433709055e-05, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7020, "u": -5.432784557342529, "weight": 0.04380672797560692 }, { "diff_generated": -32.28007125854492, "epoch": 2.2780298120544393, "grad_norm": 2.807385880123519, "learning_rate": 1.3303590595038735e-07, "logits/chosen": -2.2671284675598145, "logits/rejected": -2.1051955223083496, "logps/chosen": -14.239480972290039, "logps/rejected": -614.7636108398438, "logps_avg/chosen": -0.08152355253696442, "logps_avg/rejected": -3.2280075550079346, "loss": 0.0781, "losses_ref": -0.0008757902542129159, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7030, "u": -5.508584976196289, "weight": 0.03236890211701393 }, { "diff_generated": -31.199777603149414, "epoch": 2.2812702527543745, "grad_norm": 2.8868259547417314, "learning_rate": 1.3191466014016049e-07, "logits/chosen": -2.2917065620422363, "logits/rejected": -2.060508966445923, "logps/chosen": -11.897598266601562, "logps/rejected": -556.2362060546875, "logps_avg/chosen": -0.06840696185827255, "logps_avg/rejected": -3.1199779510498047, "loss": 0.0747, "losses_ref": -0.00041570625035092235, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7040, "u": -5.368067741394043, "weight": 0.05673006922006607 }, { "diff_generated": -32.5605354309082, "epoch": 2.28451069345431, "grad_norm": 2.817689113037822, "learning_rate": 1.3079722563553994e-07, "logits/chosen": -2.31117582321167, "logits/rejected": -2.0909972190856934, "logps/chosen": -13.309144973754883, "logps/rejected": -568.3492431640625, "logps_avg/chosen": -0.07407195121049881, "logps_avg/rejected": -3.2560532093048096, "loss": 0.0761, "losses_ref": -0.0011109040351584554, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7050, "u": -5.291151523590088, "weight": 0.07028138637542725 }, { "diff_generated": -32.47726058959961, "epoch": 2.287751134154245, "grad_norm": 2.7693781817642926, "learning_rate": 1.2968361832282705e-07, "logits/chosen": -2.275883674621582, "logits/rejected": -2.0943799018859863, "logps/chosen": -13.563992500305176, "logps/rejected": -600.7876586914062, "logps_avg/chosen": -0.07618103921413422, "logps_avg/rejected": -3.2477259635925293, "loss": 0.0779, "losses_ref": -0.0003883329627569765, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7060, "u": -5.403430461883545, "weight": 0.05040976405143738 }, { "diff_generated": -34.68756866455078, "epoch": 2.2909915748541803, "grad_norm": 2.861574037971138, "learning_rate": 1.2857385403391226e-07, "logits/chosen": -2.2599575519561768, "logits/rejected": -2.075666904449463, "logps/chosen": -14.708311080932617, "logps/rejected": -625.8695678710938, "logps_avg/chosen": -0.08552752435207367, "logps_avg/rejected": -3.4687564373016357, "loss": 0.0793, "losses_ref": -0.000556853658054024, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7070, "u": -5.541098594665527, "weight": 0.025598809123039246 }, { "diff_generated": -33.260467529296875, "epoch": 2.2942320155541154, "grad_norm": 3.047435358439903, "learning_rate": 1.274679485460509e-07, "logits/chosen": -2.295015811920166, "logits/rejected": -2.0713677406311035, "logps/chosen": -14.201037406921387, "logps/rejected": -591.999755859375, "logps_avg/chosen": -0.07558928430080414, "logps_avg/rejected": -3.32604718208313, "loss": 0.0772, "losses_ref": -0.0007015225710347295, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7080, "u": -5.222864627838135, "weight": 0.0821448564529419 }, { "diff_generated": -34.55098342895508, "epoch": 2.2974724562540505, "grad_norm": 2.849633449181555, "learning_rate": 1.2636591758163868e-07, "logits/chosen": -2.266608715057373, "logits/rejected": -2.1019034385681152, "logps/chosen": -13.151138305664062, "logps/rejected": -632.1087036132812, "logps_avg/chosen": -0.08013930916786194, "logps_avg/rejected": -3.4550983905792236, "loss": 0.0773, "losses_ref": -0.00018653420556802303, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7090, "u": -5.4691925048828125, "weight": 0.037679560482501984 }, { "diff_generated": -33.21096420288086, "epoch": 2.3007128969539856, "grad_norm": 2.948310681297529, "learning_rate": 1.2526777680798813e-07, "logits/chosen": -2.2835330963134766, "logits/rejected": -2.1416866779327393, "logps/chosen": -12.66423225402832, "logps/rejected": -602.2303466796875, "logps_avg/chosen": -0.07687665522098541, "logps_avg/rejected": -3.321096420288086, "loss": 0.077, "losses_ref": -0.0004062841762788594, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7100, "u": -5.293711185455322, "weight": 0.06918938457965851 }, { "diff_generated": -34.442100524902344, "epoch": 2.3039533376539207, "grad_norm": 2.6467761062799173, "learning_rate": 1.241735418371057e-07, "logits/chosen": -2.2501444816589355, "logits/rejected": -2.081749200820923, "logps/chosen": -13.935762405395508, "logps/rejected": -641.1041870117188, "logps_avg/chosen": -0.08066975325345993, "logps_avg/rejected": -3.4442100524902344, "loss": 0.0777, "losses_ref": -0.0010616803774610162, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7110, "u": -5.468209266662598, "weight": 0.03915365785360336 }, { "diff_generated": -34.00629425048828, "epoch": 2.3071937783538563, "grad_norm": 3.0293111653961007, "learning_rate": 1.2308322822547027e-07, "logits/chosen": -2.2660059928894043, "logits/rejected": -2.0726606845855713, "logps/chosen": -12.898577690124512, "logps/rejected": -650.5553588867188, "logps_avg/chosen": -0.07671193778514862, "logps_avg/rejected": -3.4006295204162598, "loss": 0.0753, "losses_ref": -0.0006363748689182103, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7120, "u": -5.365374565124512, "weight": 0.05698163062334061 }, { "diff_generated": -34.85182189941406, "epoch": 2.3104342190537914, "grad_norm": 2.7605565687874845, "learning_rate": 1.2199685147381148e-07, "logits/chosen": -2.3294003009796143, "logits/rejected": -2.102637767791748, "logps/chosen": -14.504648208618164, "logps/rejected": -635.5806884765625, "logps_avg/chosen": -0.07878074049949646, "logps_avg/rejected": -3.485182285308838, "loss": 0.0776, "losses_ref": -0.0002665507490746677, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7130, "u": -5.397416114807129, "weight": 0.05026886612176895 }, { "diff_generated": -34.99089431762695, "epoch": 2.3136746597537265, "grad_norm": 2.8144242097826297, "learning_rate": 1.2091442702688933e-07, "logits/chosen": -2.2866711616516113, "logits/rejected": -2.0564143657684326, "logps/chosen": -13.967157363891602, "logps/rejected": -695.5399169921875, "logps_avg/chosen": -0.07526645809412003, "logps_avg/rejected": -3.499089002609253, "loss": 0.0783, "losses_ref": -0.000559016945771873, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7140, "u": -5.363223075866699, "weight": 0.05697736144065857 }, { "diff_generated": -32.008792877197266, "epoch": 2.3169151004536617, "grad_norm": 2.835996775285066, "learning_rate": 1.198359702732755e-07, "logits/chosen": -2.296750545501709, "logits/rejected": -2.0789742469787598, "logps/chosen": -14.332858085632324, "logps/rejected": -583.1030883789062, "logps_avg/chosen": -0.07787901908159256, "logps_avg/rejected": -3.2008793354034424, "loss": 0.0776, "losses_ref": -0.0011311148991808295, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7150, "u": -5.330229759216309, "weight": 0.06403504312038422 }, { "diff_generated": -31.642242431640625, "epoch": 2.320155541153597, "grad_norm": 2.763604684901349, "learning_rate": 1.1876149654513321e-07, "logits/chosen": -2.2709290981292725, "logits/rejected": -2.121748208999634, "logps/chosen": -14.095861434936523, "logps/rejected": -578.5704345703125, "logps_avg/chosen": -0.08579359948635101, "logps_avg/rejected": -3.164224147796631, "loss": 0.0786, "losses_ref": -0.0008059808169491589, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7160, "u": -5.40159797668457, "weight": 0.050987452268600464 }, { "diff_generated": -34.49913787841797, "epoch": 2.323395981853532, "grad_norm": 2.7881090281670287, "learning_rate": 1.1769102111800036e-07, "logits/chosen": -2.2897281646728516, "logits/rejected": -2.1038079261779785, "logps/chosen": -14.366230964660645, "logps/rejected": -645.638671875, "logps_avg/chosen": -0.08264970034360886, "logps_avg/rejected": -3.449913740158081, "loss": 0.0783, "losses_ref": -0.00026110856560990214, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7170, "u": -5.575833797454834, "weight": 0.018995631486177444 }, { "diff_generated": -35.24726104736328, "epoch": 2.3266364225534675, "grad_norm": 2.7809487105597297, "learning_rate": 1.166245592105719e-07, "logits/chosen": -2.2621166706085205, "logits/rejected": -2.042609453201294, "logps/chosen": -14.316975593566895, "logps/rejected": -612.24560546875, "logps_avg/chosen": -0.07764624059200287, "logps_avg/rejected": -3.524726152420044, "loss": 0.0782, "losses_ref": -0.0006773438071832061, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7180, "u": -5.435733795166016, "weight": 0.04453923553228378 }, { "diff_generated": -33.23479461669922, "epoch": 2.3298768632534026, "grad_norm": 2.9081941910794766, "learning_rate": 1.1556212598448349e-07, "logits/chosen": -2.294276714324951, "logits/rejected": -2.085162401199341, "logps/chosen": -16.098182678222656, "logps/rejected": -591.9588012695312, "logps_avg/chosen": -0.08865799009799957, "logps_avg/rejected": -3.3234798908233643, "loss": 0.0773, "losses_ref": -0.0010387629736214876, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7190, "u": -5.470877647399902, "weight": 0.03885335102677345 }, { "diff_generated": -32.696388244628906, "epoch": 2.3331173039533377, "grad_norm": 2.8852609735392485, "learning_rate": 1.1450373654409591e-07, "logits/chosen": -2.289748191833496, "logits/rejected": -2.143613338470459, "logps/chosen": -13.386698722839355, "logps/rejected": -610.17041015625, "logps_avg/chosen": -0.07896269857883453, "logps_avg/rejected": -3.269639253616333, "loss": 0.0801, "losses_ref": -0.00018862645083572716, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7200, "u": -5.330229759216309, "weight": 0.0626850426197052 }, { "diff_generated": -32.930179595947266, "epoch": 2.336357744653273, "grad_norm": 2.938215867459973, "learning_rate": 1.1344940593628063e-07, "logits/chosen": -2.24363112449646, "logits/rejected": -2.0247769355773926, "logps/chosen": -13.017687797546387, "logps/rejected": -571.5632934570312, "logps_avg/chosen": -0.07627496123313904, "logps_avg/rejected": -3.293017864227295, "loss": 0.0784, "losses_ref": -0.00047731236554682255, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7210, "u": -5.259675979614258, "weight": 0.07552912831306458 }, { "diff_generated": -30.685317993164062, "epoch": 2.339598185353208, "grad_norm": 2.7653327775749963, "learning_rate": 1.1239914915020512e-07, "logits/chosen": -2.26278018951416, "logits/rejected": -2.109288215637207, "logps/chosen": -13.016032218933105, "logps/rejected": -577.0557861328125, "logps_avg/chosen": -0.07486730068922043, "logps_avg/rejected": -3.0685317516326904, "loss": 0.0783, "losses_ref": -0.0003721018729265779, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7220, "u": -5.224618911743164, "weight": 0.08162766695022583 }, { "diff_generated": -34.90629196166992, "epoch": 2.342838626053143, "grad_norm": 2.825449442856524, "learning_rate": 1.1135298111712122e-07, "logits/chosen": -2.26003098487854, "logits/rejected": -2.0450892448425293, "logps/chosen": -13.561320304870605, "logps/rejected": -630.9426879882812, "logps_avg/chosen": -0.07841043919324875, "logps_avg/rejected": -3.490629196166992, "loss": 0.0767, "losses_ref": -0.0005720141343772411, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7230, "u": -5.402764797210693, "weight": 0.05068415403366089 }, { "diff_generated": -31.86376953125, "epoch": 2.346079066753078, "grad_norm": 2.8668736907248458, "learning_rate": 1.1031091671015094e-07, "logits/chosen": -2.245361804962158, "logits/rejected": -2.041193962097168, "logps/chosen": -12.147971153259277, "logps/rejected": -587.3404541015625, "logps_avg/chosen": -0.07041692733764648, "logps_avg/rejected": -3.1863768100738525, "loss": 0.076, "losses_ref": -0.002164191100746393, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7240, "u": -5.182686805725098, "weight": 0.09167562425136566 }, { "diff_generated": -33.82771301269531, "epoch": 2.3493195074530138, "grad_norm": 2.9888161658925148, "learning_rate": 1.0927297074407662e-07, "logits/chosen": -2.2794957160949707, "logits/rejected": -2.085437297821045, "logps/chosen": -13.637370109558105, "logps/rejected": -622.2664794921875, "logps_avg/chosen": -0.0804823487997055, "logps_avg/rejected": -3.3827712535858154, "loss": 0.0764, "losses_ref": -0.00035343001945875585, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7250, "u": -5.403036594390869, "weight": 0.050387442111968994 }, { "diff_generated": -32.11924362182617, "epoch": 2.352559948152949, "grad_norm": 2.789941220163072, "learning_rate": 1.0823915797512952e-07, "logits/chosen": -2.293781280517578, "logits/rejected": -2.1002774238586426, "logps/chosen": -12.634278297424316, "logps/rejected": -586.3161010742188, "logps_avg/chosen": -0.07193853706121445, "logps_avg/rejected": -3.2119243144989014, "loss": 0.0769, "losses_ref": -0.0006824486772529781, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7260, "u": -5.363787651062012, "weight": 0.05698443576693535 }, { "diff_generated": -34.903221130371094, "epoch": 2.355800388852884, "grad_norm": 3.1192025526659153, "learning_rate": 1.0720949310078032e-07, "logits/chosen": -2.262953281402588, "logits/rejected": -2.07498836517334, "logps/chosen": -13.899110794067383, "logps/rejected": -624.4613037109375, "logps_avg/chosen": -0.080237478017807, "logps_avg/rejected": -3.4903221130371094, "loss": 0.078, "losses_ref": -0.00040980131598189473, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7270, "u": -5.509830951690674, "weight": 0.03166963905096054 }, { "diff_generated": -30.5557804107666, "epoch": 2.359040829552819, "grad_norm": 2.760514648523447, "learning_rate": 1.0618399075952993e-07, "logits/chosen": -2.3034751415252686, "logits/rejected": -2.089022159576416, "logps/chosen": -13.662466049194336, "logps/rejected": -608.3770141601562, "logps_avg/chosen": -0.07312439382076263, "logps_avg/rejected": -3.0555779933929443, "loss": 0.0769, "losses_ref": -0.0008744834922254086, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7280, "u": -5.36469030380249, "weight": 0.05743665248155594 }, { "diff_generated": -33.99993133544922, "epoch": 2.3622812702527543, "grad_norm": 3.2206744990322393, "learning_rate": 1.0516266553070159e-07, "logits/chosen": -2.250345230102539, "logits/rejected": -2.046128749847412, "logps/chosen": -14.4988431930542, "logps/rejected": -647.0737915039062, "logps_avg/chosen": -0.08133145421743393, "logps_avg/rejected": -3.3999931812286377, "loss": 0.0765, "losses_ref": -0.0007096336339600384, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7290, "u": -5.402334690093994, "weight": 0.05076984316110611 }, { "diff_generated": -34.168174743652344, "epoch": 2.3655217109526894, "grad_norm": 2.819305164659092, "learning_rate": 1.041455319342336e-07, "logits/chosen": -2.2885193824768066, "logits/rejected": -2.079343318939209, "logps/chosen": -13.748265266418457, "logps/rejected": -630.8150024414062, "logps_avg/chosen": -0.07698606699705124, "logps_avg/rejected": -3.4168171882629395, "loss": 0.0788, "losses_ref": -0.00023140080156736076, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7300, "u": -5.473116874694824, "weight": 0.03773132711648941 }, { "diff_generated": -30.992996215820312, "epoch": 2.368762151652625, "grad_norm": 2.8064681132146423, "learning_rate": 1.0313260443047247e-07, "logits/chosen": -2.202005386352539, "logits/rejected": -2.107422351837158, "logps/chosen": -11.852638244628906, "logps/rejected": -606.5282592773438, "logps_avg/chosen": -0.07290005683898926, "logps_avg/rejected": -3.099299907684326, "loss": 0.0791, "losses_ref": -0.0019258193206042051, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7310, "u": -5.29049015045166, "weight": 0.0717729777097702 }, { "diff_generated": -31.909597396850586, "epoch": 2.37200259235256, "grad_norm": 2.8230981127930814, "learning_rate": 1.0212389741996834e-07, "logits/chosen": -2.3181042671203613, "logits/rejected": -2.1704022884368896, "logps/chosen": -13.069847106933594, "logps/rejected": -612.2677612304688, "logps_avg/chosen": -0.07597730308771133, "logps_avg/rejected": -3.190959930419922, "loss": 0.0756, "losses_ref": -0.0003046352358069271, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7320, "u": -5.331242561340332, "weight": 0.06280872970819473 }, { "diff_generated": -34.66915512084961, "epoch": 2.375243033052495, "grad_norm": 2.7740004009290598, "learning_rate": 1.0111942524326891e-07, "logits/chosen": -2.2749009132385254, "logits/rejected": -2.11348557472229, "logps/chosen": -12.951632499694824, "logps/rejected": -642.3538818359375, "logps_avg/chosen": -0.07797206938266754, "logps_avg/rejected": -3.4669156074523926, "loss": 0.0777, "losses_ref": -0.00030273018637672067, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7330, "u": -5.437063694000244, "weight": 0.04405757784843445 }, { "diff_generated": -35.043922424316406, "epoch": 2.3784834737524303, "grad_norm": 2.817685977122686, "learning_rate": 1.0011920218071664e-07, "logits/chosen": -2.3587987422943115, "logits/rejected": -2.1169161796569824, "logps/chosen": -13.914976119995117, "logps/rejected": -663.571044921875, "logps_avg/chosen": -0.07692556828260422, "logps_avg/rejected": -3.5043931007385254, "loss": 0.0759, "losses_ref": -0.0008672567782923579, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7340, "u": -5.3598151206970215, "weight": 0.05720771476626396 }, { "diff_generated": -33.76276397705078, "epoch": 2.3817239144523654, "grad_norm": 2.9478435470621456, "learning_rate": 9.912324245224524e-08, "logits/chosen": -2.351743221282959, "logits/rejected": -2.1735739707946777, "logps/chosen": -14.532655715942383, "logps/rejected": -635.3209838867188, "logps_avg/chosen": -0.07836208492517471, "logps_avg/rejected": -3.3762760162353516, "loss": 0.075, "losses_ref": -0.0017885919660329819, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7350, "u": -5.608242988586426, "weight": 0.01502218097448349 }, { "diff_generated": -33.28066635131836, "epoch": 2.3849643551523005, "grad_norm": 3.1743935272367123, "learning_rate": 9.813156021717763e-08, "logits/chosen": -2.2940306663513184, "logits/rejected": -2.065108060836792, "logps/chosen": -13.723846435546875, "logps/rejected": -577.5977783203125, "logps_avg/chosen": -0.07425703853368759, "logps_avg/rejected": -3.3280670642852783, "loss": 0.0764, "losses_ref": -0.00109779997728765, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7360, "u": -5.363638401031494, "weight": 0.05759688466787338 }, { "diff_generated": -34.619773864746094, "epoch": 2.3882047958522357, "grad_norm": 2.7146783287290597, "learning_rate": 9.714416957402468e-08, "logits/chosen": -2.283629894256592, "logits/rejected": -2.141249895095825, "logps/chosen": -13.449475288391113, "logps/rejected": -633.1432495117188, "logps_avg/chosen": -0.07728450745344162, "logps_avg/rejected": -3.461977481842041, "loss": 0.078, "losses_ref": -0.00025040132459253073, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7370, "u": -5.369337558746338, "weight": 0.056508757174015045 }, { "diff_generated": -34.74108123779297, "epoch": 2.3914452365521712, "grad_norm": 2.760996835929895, "learning_rate": 9.616108456028462e-08, "logits/chosen": -2.283205032348633, "logits/rejected": -2.0816237926483154, "logps/chosen": -12.97753620147705, "logps/rejected": -624.5291137695312, "logps_avg/chosen": -0.07539506256580353, "logps_avg/rejected": -3.4741077423095703, "loss": 0.0772, "losses_ref": -0.00020238272554706782, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7380, "u": -5.54359245300293, "weight": 0.025206467136740685 }, { "diff_generated": -34.526756286621094, "epoch": 2.3946856772521063, "grad_norm": 2.8408676129849475, "learning_rate": 9.518231915224371e-08, "logits/chosen": -2.2826147079467773, "logits/rejected": -2.0404651165008545, "logps/chosen": -13.243906021118164, "logps/rejected": -653.4534912109375, "logps_avg/chosen": -0.07447250932455063, "logps_avg/rejected": -3.4526753425598145, "loss": 0.0762, "losses_ref": -0.0010482899378985167, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7390, "u": -5.43655252456665, "weight": 0.04528099671006203 }, { "diff_generated": -32.078582763671875, "epoch": 2.3979261179520415, "grad_norm": 2.88027935851885, "learning_rate": 9.4207887264777e-08, "logits/chosen": -2.2226669788360596, "logits/rejected": -2.1108269691467285, "logps/chosen": -12.079364776611328, "logps/rejected": -617.87255859375, "logps_avg/chosen": -0.07426749914884567, "logps_avg/rejected": -3.2078583240509033, "loss": 0.0777, "losses_ref": -0.0010231093037873507, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7400, "u": -5.437130928039551, "weight": 0.04528171569108963 }, { "diff_generated": -36.370052337646484, "epoch": 2.4011665586519766, "grad_norm": 2.884539046607622, "learning_rate": 9.323780275115156e-08, "logits/chosen": -2.2833988666534424, "logits/rejected": -2.0623676776885986, "logps/chosen": -13.492776870727539, "logps/rejected": -685.26025390625, "logps_avg/chosen": -0.07685331255197525, "logps_avg/rejected": -3.63700532913208, "loss": 0.0771, "losses_ref": -0.0007987999124452472, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7410, "u": -5.509256839752197, "weight": 0.03232438117265701 }, { "diff_generated": -32.873191833496094, "epoch": 2.4044069993519117, "grad_norm": 2.6715246297813766, "learning_rate": 9.22720794028283e-08, "logits/chosen": -2.301788091659546, "logits/rejected": -2.076624631881714, "logps/chosen": -13.993014335632324, "logps/rejected": -576.7996215820312, "logps_avg/chosen": -0.0792023241519928, "logps_avg/rejected": -3.287318706512451, "loss": 0.0782, "losses_ref": -0.0004467566031962633, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7420, "u": -5.295356273651123, "weight": 0.06927172839641571 }, { "diff_generated": -34.844940185546875, "epoch": 2.4076474400518473, "grad_norm": 2.6567574359573825, "learning_rate": 9.13107309492668e-08, "logits/chosen": -2.234447717666626, "logits/rejected": -2.0357723236083984, "logps/chosen": -13.651655197143555, "logps/rejected": -662.6868286132812, "logps_avg/chosen": -0.07822132855653763, "logps_avg/rejected": -3.4844939708709717, "loss": 0.0774, "losses_ref": -0.0003171044809278101, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7430, "u": -5.438276290893555, "weight": 0.044068168848752975 }, { "diff_generated": -33.605289459228516, "epoch": 2.4108878807517824, "grad_norm": 2.8362513207746827, "learning_rate": 9.035377105772966e-08, "logits/chosen": -2.272564172744751, "logits/rejected": -2.041059970855713, "logps/chosen": -13.95252799987793, "logps/rejected": -624.0958251953125, "logps_avg/chosen": -0.0766502246260643, "logps_avg/rejected": -3.3605289459228516, "loss": 0.0787, "losses_ref": -0.000506135169416666, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7440, "u": -5.293671607971191, "weight": 0.06933457404375076 }, { "diff_generated": -32.606605529785156, "epoch": 2.4141283214517175, "grad_norm": 2.9002538813054826, "learning_rate": 8.940121333308849e-08, "logits/chosen": -2.2347395420074463, "logits/rejected": -2.111093044281006, "logps/chosen": -11.986194610595703, "logps/rejected": -595.4306030273438, "logps_avg/chosen": -0.07371608167886734, "logps_avg/rejected": -3.2606606483459473, "loss": 0.0777, "losses_ref": -0.0014970863703638315, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7450, "u": -5.362734794616699, "weight": 0.058197181671857834 }, { "diff_generated": -35.42874526977539, "epoch": 2.4173687621516526, "grad_norm": 2.7272928463051858, "learning_rate": 8.845307131762991e-08, "logits/chosen": -2.294037342071533, "logits/rejected": -2.0677173137664795, "logps/chosen": -14.819440841674805, "logps/rejected": -634.6419677734375, "logps_avg/chosen": -0.07748381793498993, "logps_avg/rejected": -3.5428740978240967, "loss": 0.0771, "losses_ref": -0.0001443286018911749, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7460, "u": -5.366196155548096, "weight": 0.05638740211725235 }, { "diff_generated": -33.44629669189453, "epoch": 2.4206092028515878, "grad_norm": 2.740936329712308, "learning_rate": 8.750935849086424e-08, "logits/chosen": -2.296396255493164, "logits/rejected": -2.058582305908203, "logps/chosen": -14.692959785461426, "logps/rejected": -647.8707275390625, "logps_avg/chosen": -0.07983608543872833, "logps_avg/rejected": -3.3446297645568848, "loss": 0.0768, "losses_ref": -0.00016290844359900802, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7470, "u": -5.538523197174072, "weight": 0.025174766778945923 }, { "diff_generated": -34.08763122558594, "epoch": 2.423849643551523, "grad_norm": 2.874307206616938, "learning_rate": 8.657008826933223e-08, "logits/chosen": -2.247481107711792, "logits/rejected": -2.082742691040039, "logps/chosen": -13.472023010253906, "logps/rejected": -634.2883911132812, "logps_avg/chosen": -0.07857387512922287, "logps_avg/rejected": -3.4087631702423096, "loss": 0.0793, "losses_ref": -0.000744626100640744, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7480, "u": -5.435318470001221, "weight": 0.044631458818912506 }, { "diff_generated": -32.04920196533203, "epoch": 2.427090084251458, "grad_norm": 2.6646272712366614, "learning_rate": 8.563527400641559e-08, "logits/chosen": -2.277095317840576, "logits/rejected": -2.1428980827331543, "logps/chosen": -13.724957466125488, "logps/rejected": -603.6646728515625, "logps_avg/chosen": -0.07841077446937561, "logps_avg/rejected": -3.2049202919006348, "loss": 0.0755, "losses_ref": -0.00028366531478241086, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7490, "u": -5.261726379394531, "weight": 0.0752992182970047 }, { "diff_generated": -35.079063415527344, "epoch": 2.4303305249513936, "grad_norm": 2.8130066664130466, "learning_rate": 8.470492899214696e-08, "logits/chosen": -2.2499706745147705, "logits/rejected": -2.0175976753234863, "logps/chosen": -13.57239818572998, "logps/rejected": -608.1451416015625, "logps_avg/chosen": -0.07596118748188019, "logps_avg/rejected": -3.507906675338745, "loss": 0.0776, "losses_ref": -0.0003359224647283554, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7500, "u": -5.288158893585205, "weight": 0.06913691014051437 }, { "diff_generated": -33.463157653808594, "epoch": 2.4335709656513287, "grad_norm": 2.8587796142319215, "learning_rate": 8.377906645302015e-08, "logits/chosen": -2.242366313934326, "logits/rejected": -2.0637431144714355, "logps/chosen": -14.448846817016602, "logps/rejected": -635.8395385742188, "logps_avg/chosen": -0.0818089097738266, "logps_avg/rejected": -3.346315860748291, "loss": 0.0777, "losses_ref": -0.00039851194014772773, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7510, "u": -5.469162940979004, "weight": 0.03789610415697098 }, { "diff_generated": -36.01557540893555, "epoch": 2.436811406351264, "grad_norm": 2.704493399261753, "learning_rate": 8.28576995518031e-08, "logits/chosen": -2.257784128189087, "logits/rejected": -1.9793716669082642, "logps/chosen": -15.093899726867676, "logps/rejected": -609.7103881835938, "logps_avg/chosen": -0.08029602468013763, "logps_avg/rejected": -3.601557493209839, "loss": 0.0775, "losses_ref": -0.0009452321683056653, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7520, "u": -5.399094104766846, "weight": 0.051128365099430084 }, { "diff_generated": -32.34758758544922, "epoch": 2.440051847051199, "grad_norm": 2.994888257339022, "learning_rate": 8.194084138735023e-08, "logits/chosen": -2.2800023555755615, "logits/rejected": -2.082465648651123, "logps/chosen": -14.151315689086914, "logps/rejected": -611.8934326171875, "logps_avg/chosen": -0.08025664836168289, "logps_avg/rejected": -3.2347590923309326, "loss": 0.0776, "losses_ref": -0.00029332557460293174, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7530, "u": -5.439518451690674, "weight": 0.044044096022844315 }, { "diff_generated": -33.730262756347656, "epoch": 2.443292287751134, "grad_norm": 2.709483158923101, "learning_rate": 8.102850499441638e-08, "logits/chosen": -2.238938331604004, "logits/rejected": -2.0246846675872803, "logps/chosen": -13.48927116394043, "logps/rejected": -658.44140625, "logps_avg/chosen": -0.07260926812887192, "logps_avg/rejected": -3.3730266094207764, "loss": 0.0768, "losses_ref": -0.00044730809167958796, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7540, "u": -5.0764479637146, "weight": 0.10680226981639862 }, { "diff_generated": -35.54068374633789, "epoch": 2.446532728451069, "grad_norm": 2.7716548897069804, "learning_rate": 8.012070334347103e-08, "logits/chosen": -2.2876362800598145, "logits/rejected": -2.0490283966064453, "logps/chosen": -15.265823364257812, "logps/rejected": -662.9925537109375, "logps_avg/chosen": -0.08061473816633224, "logps_avg/rejected": -3.5540683269500732, "loss": 0.0763, "losses_ref": -0.00046191777801141143, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7550, "u": -5.507439613342285, "weight": 0.03176378458738327 }, { "diff_generated": -33.28374481201172, "epoch": 2.4497731691510047, "grad_norm": 2.6459787187816888, "learning_rate": 7.921744934051515e-08, "logits/chosen": -2.255577802658081, "logits/rejected": -2.0689337253570557, "logps/chosen": -12.737321853637695, "logps/rejected": -604.4900512695312, "logps_avg/chosen": -0.07835245132446289, "logps_avg/rejected": -3.3283743858337402, "loss": 0.0753, "losses_ref": -0.0007992651080712676, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7560, "u": -5.214974880218506, "weight": 0.08231941610574722 }, { "diff_generated": -32.80999755859375, "epoch": 2.45301360985094, "grad_norm": 2.7422000300867486, "learning_rate": 7.831875582689598e-08, "logits/chosen": -2.2509467601776123, "logits/rejected": -2.058760404586792, "logps/chosen": -13.229718208312988, "logps/rejected": -591.0678100585938, "logps_avg/chosen": -0.07577583193778992, "logps_avg/rejected": -3.2809996604919434, "loss": 0.0758, "losses_ref": -0.0003204692038707435, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7570, "u": -5.329041957855225, "weight": 0.06283210217952728 }, { "diff_generated": -32.483097076416016, "epoch": 2.456254050550875, "grad_norm": 2.8835914157486426, "learning_rate": 7.742463557912593e-08, "logits/chosen": -2.2545275688171387, "logits/rejected": -2.090634822845459, "logps/chosen": -13.476033210754395, "logps/rejected": -588.4400024414062, "logps_avg/chosen": -0.07627163827419281, "logps_avg/rejected": -3.248309373855591, "loss": 0.0771, "losses_ref": -0.0012753000482916832, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7580, "u": -5.296724796295166, "weight": 0.0706983357667923 }, { "diff_generated": -35.58943557739258, "epoch": 2.45949449125081, "grad_norm": 2.695827748470073, "learning_rate": 7.65351013087002e-08, "logits/chosen": -2.267869472503662, "logits/rejected": -2.1123318672180176, "logps/chosen": -11.977426528930664, "logps/rejected": -692.4275512695312, "logps_avg/chosen": -0.0733996331691742, "logps_avg/rejected": -3.558943271636963, "loss": 0.0782, "losses_ref": -0.00019766921468544751, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7590, "u": -5.437938690185547, "weight": 0.04395180940628052 }, { "diff_generated": -34.45747756958008, "epoch": 2.462734931950745, "grad_norm": 3.04541194251132, "learning_rate": 7.565016566191631e-08, "logits/chosen": -2.2000534534454346, "logits/rejected": -2.032714366912842, "logps/chosen": -13.62053108215332, "logps/rejected": -630.7037353515625, "logps_avg/chosen": -0.07926841825246811, "logps_avg/rejected": -3.4457478523254395, "loss": 0.0771, "losses_ref": -0.0006812643841840327, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7600, "u": -5.399075984954834, "weight": 0.05084569379687309 }, { "diff_generated": -33.85066604614258, "epoch": 2.4659753726506803, "grad_norm": 2.861072627997925, "learning_rate": 7.47698412196939e-08, "logits/chosen": -2.2991385459899902, "logits/rejected": -2.0473597049713135, "logps/chosen": -13.630948066711426, "logps/rejected": -632.0382080078125, "logps_avg/chosen": -0.07360848784446716, "logps_avg/rejected": -3.385066270828247, "loss": 0.0781, "losses_ref": -0.001396728097461164, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7610, "u": -5.36118745803833, "weight": 0.05879664421081543 }, { "diff_generated": -34.452186584472656, "epoch": 2.4692158133506155, "grad_norm": 2.6760339852260264, "learning_rate": 7.389414049739682e-08, "logits/chosen": -2.298701763153076, "logits/rejected": -2.094611167907715, "logps/chosen": -13.719430923461914, "logps/rejected": -635.7684326171875, "logps_avg/chosen": -0.07470625638961792, "logps_avg/rejected": -3.445218563079834, "loss": 0.0759, "losses_ref": -0.0010666692396625876, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7620, "u": -5.395898342132568, "weight": 0.051368117332458496 }, { "diff_generated": -33.900569915771484, "epoch": 2.472456254050551, "grad_norm": 2.7214995018689887, "learning_rate": 7.302307594465422e-08, "logits/chosen": -2.2754058837890625, "logits/rejected": -2.1271181106567383, "logps/chosen": -13.97209644317627, "logps/rejected": -676.8109130859375, "logps_avg/chosen": -0.07918776571750641, "logps_avg/rejected": -3.39005708694458, "loss": 0.0756, "losses_ref": -0.00014776972238905728, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7630, "u": -5.429097652435303, "weight": 0.04388615861535072 }, { "diff_generated": -35.28455352783203, "epoch": 2.475696694750486, "grad_norm": 3.172336067859759, "learning_rate": 7.215665994518367e-08, "logits/chosen": -2.266859769821167, "logits/rejected": -2.0783934593200684, "logps/chosen": -13.267038345336914, "logps/rejected": -638.2308349609375, "logps_avg/chosen": -0.0793921947479248, "logps_avg/rejected": -3.5284552574157715, "loss": 0.0777, "losses_ref": -0.00048756637261249125, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7640, "u": -5.361421585083008, "weight": 0.05682788044214249 }, { "diff_generated": -34.95798873901367, "epoch": 2.4789371354504213, "grad_norm": 2.958338712244271, "learning_rate": 7.129490481661605e-08, "logits/chosen": -2.29850172996521, "logits/rejected": -2.0712332725524902, "logps/chosen": -15.60583209991455, "logps/rejected": -665.9384155273438, "logps_avg/chosen": -0.08422177284955978, "logps_avg/rejected": -3.4957988262176514, "loss": 0.076, "losses_ref": -0.0006580519257113338, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7650, "u": -5.613326072692871, "weight": 0.013244752772152424 }, { "diff_generated": -32.899208068847656, "epoch": 2.4821775761503564, "grad_norm": 3.544889417217034, "learning_rate": 7.043782281031911e-08, "logits/chosen": -2.2815823554992676, "logits/rejected": -1.97531259059906, "logps/chosen": -16.198482513427734, "logps/rejected": -604.340576171875, "logps_avg/chosen": -0.08431107550859451, "logps_avg/rejected": -3.2899203300476074, "loss": 0.0768, "losses_ref": -0.0005749968695454299, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7660, "u": -5.256075859069824, "weight": 0.07563985884189606 }, { "diff_generated": -35.965213775634766, "epoch": 2.4854180168502915, "grad_norm": 2.803402804866771, "learning_rate": 6.958542611122422e-08, "logits/chosen": -2.2891929149627686, "logits/rejected": -2.0426852703094482, "logps/chosen": -13.56506061553955, "logps/rejected": -666.633544921875, "logps_avg/chosen": -0.07456330955028534, "logps_avg/rejected": -3.5965213775634766, "loss": 0.0776, "losses_ref": -0.0004795569402631372, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7670, "u": -5.431191921234131, "weight": 0.04434206336736679 }, { "diff_generated": -32.99568176269531, "epoch": 2.488658457550227, "grad_norm": 2.657620426284608, "learning_rate": 6.873772683765283e-08, "logits/chosen": -2.212829351425171, "logits/rejected": -2.029841899871826, "logps/chosen": -13.612431526184082, "logps/rejected": -614.3855590820312, "logps_avg/chosen": -0.07634242624044418, "logps_avg/rejected": -3.2995681762695312, "loss": 0.0735, "losses_ref": -0.0002592132077552378, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7680, "u": -5.2603278160095215, "weight": 0.07525938004255295 }, { "diff_generated": -35.51023483276367, "epoch": 2.491898898250162, "grad_norm": 2.8426490408551204, "learning_rate": 6.789473704114428e-08, "logits/chosen": -2.2818164825439453, "logits/rejected": -2.0289533138275146, "logps/chosen": -13.806233406066895, "logps/rejected": -657.150390625, "logps_avg/chosen": -0.07651212066411972, "logps_avg/rejected": -3.551023483276367, "loss": 0.0784, "losses_ref": -0.0007175356731750071, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7690, "u": -5.575205326080322, "weight": 0.01952420175075531 }, { "diff_generated": -33.23609161376953, "epoch": 2.4951393389500973, "grad_norm": 2.8013204477737217, "learning_rate": 6.7056468706284e-08, "logits/chosen": -2.257474184036255, "logits/rejected": -2.029222011566162, "logps/chosen": -14.227697372436523, "logps/rejected": -584.1106567382812, "logps_avg/chosen": -0.07747099548578262, "logps_avg/rejected": -3.3236095905303955, "loss": 0.0783, "losses_ref": -0.0004787585639860481, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7700, "u": -5.363088607788086, "weight": 0.05679760128259659 }, { "diff_generated": -35.77177047729492, "epoch": 2.4983797796500324, "grad_norm": 2.6951188107579145, "learning_rate": 6.622293375053422e-08, "logits/chosen": -2.2117533683776855, "logits/rejected": -1.9806289672851562, "logps/chosen": -14.14848804473877, "logps/rejected": -601.2380981445312, "logps_avg/chosen": -0.07564245164394379, "logps_avg/rejected": -3.5771775245666504, "loss": 0.078, "losses_ref": -0.0010219484101980925, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7710, "u": -5.331390380859375, "weight": 0.06371685117483139 }, { "diff_generated": -33.446556091308594, "epoch": 2.5016202203499676, "grad_norm": 3.003938973826779, "learning_rate": 6.539414402406316e-08, "logits/chosen": -2.236816167831421, "logits/rejected": -2.0297279357910156, "logps/chosen": -13.686360359191895, "logps/rejected": -624.5142822265625, "logps_avg/chosen": -0.07835282385349274, "logps_avg/rejected": -3.3446555137634277, "loss": 0.0774, "losses_ref": -0.00030313601018860936, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7720, "u": -5.471747398376465, "weight": 0.037795376032590866 }, { "diff_generated": -32.8238410949707, "epoch": 2.5048606610499027, "grad_norm": 2.867173383688279, "learning_rate": 6.457011130957747e-08, "logits/chosen": -2.2398643493652344, "logits/rejected": -1.9696300029754639, "logps/chosen": -14.389925956726074, "logps/rejected": -597.840087890625, "logps_avg/chosen": -0.07920937240123749, "logps_avg/rejected": -3.282384157180786, "loss": 0.0773, "losses_ref": -0.0005577055853791535, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7730, "u": -5.359132289886475, "weight": 0.05690532177686691 }, { "diff_generated": -36.69569396972656, "epoch": 2.508101101749838, "grad_norm": 2.839278217911586, "learning_rate": 6.37508473221549e-08, "logits/chosen": -2.269911289215088, "logits/rejected": -2.018099308013916, "logps/chosen": -14.185853958129883, "logps/rejected": -667.17041015625, "logps_avg/chosen": -0.08158121258020401, "logps_avg/rejected": -3.669569492340088, "loss": 0.078, "losses_ref": -0.0003814251977019012, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7740, "u": -5.434433460235596, "weight": 0.04420917108654976 }, { "diff_generated": -36.33928298950195, "epoch": 2.511341542449773, "grad_norm": 2.8251290016911526, "learning_rate": 6.293636370907665e-08, "logits/chosen": -2.288761615753174, "logits/rejected": -2.0692386627197266, "logps/chosen": -13.266860961914062, "logps/rejected": -708.5743408203125, "logps_avg/chosen": -0.07769857347011566, "logps_avg/rejected": -3.6339282989501953, "loss": 0.0775, "losses_ref": -0.0007504144450649619, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7750, "u": -5.508566379547119, "weight": 0.03215345740318298 }, { "diff_generated": -33.397117614746094, "epoch": 2.5145819831497085, "grad_norm": 2.830751045089104, "learning_rate": 6.212667204966293e-08, "logits/chosen": -2.3138840198516846, "logits/rejected": -2.0809566974639893, "logps/chosen": -13.976930618286133, "logps/rejected": -618.265869140625, "logps_avg/chosen": -0.07728879898786545, "logps_avg/rejected": -3.339712142944336, "loss": 0.0783, "losses_ref": -0.00014159231795929372, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7760, "u": -5.511630058288574, "weight": 0.031377941370010376 }, { "diff_generated": -34.5837516784668, "epoch": 2.5178224238496436, "grad_norm": 2.836185839666556, "learning_rate": 6.132178385510772e-08, "logits/chosen": -2.270347833633423, "logits/rejected": -2.0604121685028076, "logps/chosen": -13.438015937805176, "logps/rejected": -615.5458984375, "logps_avg/chosen": -0.07801367342472076, "logps_avg/rejected": -3.4583754539489746, "loss": 0.0758, "losses_ref": -0.0010575618362054229, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7770, "u": -5.43215274810791, "weight": 0.045181840658187866 }, { "diff_generated": -34.33323287963867, "epoch": 2.5210628645495787, "grad_norm": 2.7319029632555822, "learning_rate": 6.052171056831547e-08, "logits/chosen": -2.2712090015411377, "logits/rejected": -2.0803134441375732, "logps/chosen": -12.562702178955078, "logps/rejected": -632.27490234375, "logps_avg/chosen": -0.07550046592950821, "logps_avg/rejected": -3.433323383331299, "loss": 0.075, "losses_ref": -0.0002697540621738881, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7780, "u": -5.4767985343933105, "weight": 0.0377730131149292 }, { "diff_generated": -35.092098236083984, "epoch": 2.524303305249514, "grad_norm": 2.770684012831786, "learning_rate": 5.972646356373779e-08, "logits/chosen": -2.2754201889038086, "logits/rejected": -2.0463852882385254, "logps/chosen": -15.457331657409668, "logps/rejected": -633.0758056640625, "logps_avg/chosen": -0.0822768360376358, "logps_avg/rejected": -3.5092101097106934, "loss": 0.0754, "losses_ref": -0.0001960611407412216, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7790, "u": -5.5086774826049805, "weight": 0.031437747180461884 }, { "diff_generated": -34.240562438964844, "epoch": 2.527543745949449, "grad_norm": 2.796408548522239, "learning_rate": 5.893605414721277e-08, "logits/chosen": -2.299086093902588, "logits/rejected": -2.0886199474334717, "logps/chosen": -12.549464225769043, "logps/rejected": -640.3765869140625, "logps_avg/chosen": -0.07397869974374771, "logps_avg/rejected": -3.4240562915802, "loss": 0.0758, "losses_ref": -0.00014421154628507793, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7800, "u": -5.437371253967285, "weight": 0.04388119652867317 }, { "diff_generated": -34.39710235595703, "epoch": 2.5307841866493845, "grad_norm": 2.9301569992729046, "learning_rate": 5.815049355580317e-08, "logits/chosen": -2.2772209644317627, "logits/rejected": -2.0563406944274902, "logps/chosen": -14.136589050292969, "logps/rejected": -647.86181640625, "logps_avg/chosen": -0.07758349925279617, "logps_avg/rejected": -3.4397106170654297, "loss": 0.076, "losses_ref": -0.00013244636647868901, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7810, "u": -5.507662296295166, "weight": 0.03137155622243881 }, { "diff_generated": -37.23072814941406, "epoch": 2.5340246273493197, "grad_norm": 2.9462919405130785, "learning_rate": 5.736979295763742e-08, "logits/chosen": -2.2507474422454834, "logits/rejected": -1.970627784729004, "logps/chosen": -15.260380744934082, "logps/rejected": -679.1442260742188, "logps_avg/chosen": -0.0788363367319107, "logps_avg/rejected": -3.7230727672576904, "loss": 0.0782, "losses_ref": -0.0007362683536484838, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7820, "u": -5.543445587158203, "weight": 0.025849919766187668 }, { "diff_generated": -33.65333557128906, "epoch": 2.537265068049255, "grad_norm": 2.9489102885078937, "learning_rate": 5.659396345175049e-08, "logits/chosen": -2.241163730621338, "logits/rejected": -2.024839401245117, "logps/chosen": -14.190361022949219, "logps/rejected": -610.3209228515625, "logps_avg/chosen": -0.08231045305728912, "logps_avg/rejected": -3.3653335571289062, "loss": 0.0751, "losses_ref": -0.0008589104982092977, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7830, "u": -5.471013069152832, "weight": 0.03844798356294632 }, { "diff_generated": -33.5227165222168, "epoch": 2.54050550874919, "grad_norm": 2.729628595505013, "learning_rate": 5.5823016067926234e-08, "logits/chosen": -2.2366445064544678, "logits/rejected": -2.0078482627868652, "logps/chosen": -14.653826713562012, "logps/rejected": -614.4781494140625, "logps_avg/chosen": -0.07879281044006348, "logps_avg/rejected": -3.352271556854248, "loss": 0.0763, "losses_ref": -0.0001931044098455459, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7840, "u": -5.330508708953857, "weight": 0.06269881129264832 }, { "diff_generated": -32.063846588134766, "epoch": 2.543745949449125, "grad_norm": 2.89176776237173, "learning_rate": 5.5056961766540444e-08, "logits/chosen": -2.2384262084960938, "logits/rejected": -2.0224967002868652, "logps/chosen": -13.754913330078125, "logps/rejected": -595.7368774414062, "logps_avg/chosen": -0.07618410885334015, "logps_avg/rejected": -3.2063846588134766, "loss": 0.0765, "losses_ref": -0.0006604836671613157, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7850, "u": -5.15533971786499, "weight": 0.09443579614162445 }, { "diff_generated": -35.68360900878906, "epoch": 2.54698639014906, "grad_norm": 2.823496183396589, "learning_rate": 5.429581143840525e-08, "logits/chosen": -2.244398355484009, "logits/rejected": -2.027529001235962, "logps/chosen": -13.530553817749023, "logps/rejected": -645.8378295898438, "logps_avg/chosen": -0.07708597183227539, "logps_avg/rejected": -3.5683608055114746, "loss": 0.079, "losses_ref": -0.0003415651444811374, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7860, "u": -5.435323238372803, "weight": 0.044158075004816055 }, { "diff_generated": -35.86713790893555, "epoch": 2.5502268308489953, "grad_norm": 2.91644868184901, "learning_rate": 5.3539575904614176e-08, "logits/chosen": -2.2696292400360107, "logits/rejected": -2.0367541313171387, "logps/chosen": -14.176198959350586, "logps/rejected": -651.2911987304688, "logps_avg/chosen": -0.08043045550584793, "logps_avg/rejected": -3.586714267730713, "loss": 0.0761, "losses_ref": -0.0014636798296123743, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7870, "u": -5.29245662689209, "weight": 0.07079814374446869 }, { "diff_generated": -35.674190521240234, "epoch": 2.5534672715489304, "grad_norm": 2.5833971813648806, "learning_rate": 5.278826591638794e-08, "logits/chosen": -2.2695140838623047, "logits/rejected": -2.061769962310791, "logps/chosen": -14.17786979675293, "logps/rejected": -668.9481201171875, "logps_avg/chosen": -0.08042646944522858, "logps_avg/rejected": -3.5674190521240234, "loss": 0.0773, "losses_ref": -0.0002508517063688487, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7880, "u": -5.541116237640381, "weight": 0.02528039552271366 }, { "diff_generated": -35.04924774169922, "epoch": 2.556707712248866, "grad_norm": 2.7802142146267546, "learning_rate": 5.204189215492252e-08, "logits/chosen": -2.2482147216796875, "logits/rejected": -2.0366978645324707, "logps/chosen": -13.320378303527832, "logps/rejected": -670.7872314453125, "logps_avg/chosen": -0.07657703012228012, "logps_avg/rejected": -3.50492525100708, "loss": 0.0772, "losses_ref": -0.0008224450284615159, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7890, "u": -5.401477336883545, "weight": 0.05123826116323471 }, { "diff_generated": -34.19906997680664, "epoch": 2.559948152948801, "grad_norm": 2.864889372141515, "learning_rate": 5.1300465231236145e-08, "logits/chosen": -2.263850450515747, "logits/rejected": -2.032125949859619, "logps/chosen": -14.5372314453125, "logps/rejected": -612.401611328125, "logps_avg/chosen": -0.075019970536232, "logps_avg/rejected": -3.4199066162109375, "loss": 0.0782, "losses_ref": -0.0018288299906998873, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7900, "u": -5.394659996032715, "weight": 0.05333171412348747 }, { "diff_generated": -33.32030487060547, "epoch": 2.563188593648736, "grad_norm": 2.6476779853936376, "learning_rate": 5.056399568601946e-08, "logits/chosen": -2.297189235687256, "logits/rejected": -2.0654282569885254, "logps/chosen": -13.988309860229492, "logps/rejected": -588.9055786132812, "logps_avg/chosen": -0.07717391848564148, "logps_avg/rejected": -3.3320305347442627, "loss": 0.0785, "losses_ref": -0.0003493439289741218, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7910, "u": -5.32882022857666, "weight": 0.06289394199848175 }, { "diff_generated": -36.19215393066406, "epoch": 2.5664290343486713, "grad_norm": 2.9522400865572314, "learning_rate": 4.983249398948502e-08, "logits/chosen": -2.314213752746582, "logits/rejected": -2.0248019695281982, "logps/chosen": -14.250593185424805, "logps/rejected": -701.8851318359375, "logps_avg/chosen": -0.07895634323358536, "logps_avg/rejected": -3.619215488433838, "loss": 0.0781, "losses_ref": -0.00045448317541740835, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7920, "u": -5.472224235534668, "weight": 0.0380551740527153 }, { "diff_generated": -34.47525405883789, "epoch": 2.569669475048607, "grad_norm": 2.809750356358677, "learning_rate": 4.910597054121877e-08, "logits/chosen": -2.257275104522705, "logits/rejected": -2.013333797454834, "logps/chosen": -15.664281845092773, "logps/rejected": -614.7738037109375, "logps_avg/chosen": -0.08708717674016953, "logps_avg/rejected": -3.4475257396698, "loss": 0.0772, "losses_ref": -0.00023688049986958504, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7930, "u": -5.436405658721924, "weight": 0.04399479180574417 }, { "diff_generated": -35.79109573364258, "epoch": 2.572909915748542, "grad_norm": 2.965292107484825, "learning_rate": 4.838443567003194e-08, "logits/chosen": -2.2718281745910645, "logits/rejected": -2.0739755630493164, "logps/chosen": -12.573837280273438, "logps/rejected": -654.598388671875, "logps_avg/chosen": -0.07561159133911133, "logps_avg/rejected": -3.5791091918945312, "loss": 0.0752, "losses_ref": -0.0008258657762780786, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7940, "u": -5.506420612335205, "weight": 0.03219860419631004 }, { "diff_generated": -33.60948181152344, "epoch": 2.576150356448477, "grad_norm": 2.9055064732309073, "learning_rate": 4.766789963381459e-08, "logits/chosen": -2.279853105545044, "logits/rejected": -2.0685627460479736, "logps/chosen": -14.392298698425293, "logps/rejected": -609.1856689453125, "logps_avg/chosen": -0.07995419949293137, "logps_avg/rejected": -3.360948085784912, "loss": 0.0789, "losses_ref": -0.0005912907072342932, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7950, "u": -5.365494251251221, "weight": 0.05691393464803696 }, { "diff_generated": -34.88692855834961, "epoch": 2.5793907971484122, "grad_norm": 2.7422363528394085, "learning_rate": 4.695637261938912e-08, "logits/chosen": -2.280608654022217, "logits/rejected": -2.0431153774261475, "logps/chosen": -13.370687484741211, "logps/rejected": -620.4929809570312, "logps_avg/chosen": -0.07715877145528793, "logps_avg/rejected": -3.48869252204895, "loss": 0.0759, "losses_ref": -0.00044079654617235065, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7960, "u": -5.467726230621338, "weight": 0.03798266500234604 }, { "diff_generated": -33.26984405517578, "epoch": 2.5826312378483474, "grad_norm": 2.616784269180188, "learning_rate": 4.624986474236623e-08, "logits/chosen": -2.30104398727417, "logits/rejected": -2.053588390350342, "logps/chosen": -13.474719047546387, "logps/rejected": -634.5220947265625, "logps_avg/chosen": -0.07315264642238617, "logps_avg/rejected": -3.3269848823547363, "loss": 0.0757, "losses_ref": -0.0005079508991912007, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7970, "u": -5.402222633361816, "weight": 0.05061950162053108 }, { "diff_generated": -34.360939025878906, "epoch": 2.5858716785482825, "grad_norm": 2.778302410578972, "learning_rate": 4.554838604700073e-08, "logits/chosen": -2.2400848865509033, "logits/rejected": -2.007932186126709, "logps/chosen": -13.436427116394043, "logps/rejected": -611.0859985351562, "logps_avg/chosen": -0.07698436081409454, "logps_avg/rejected": -3.43609356880188, "loss": 0.0771, "losses_ref": -0.000243752496317029, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7980, "u": -5.294255256652832, "weight": 0.06898865848779678 }, { "diff_generated": -35.74687576293945, "epoch": 2.5891121192482176, "grad_norm": 3.077691021737918, "learning_rate": 4.4851946506048445e-08, "logits/chosen": -2.279158592224121, "logits/rejected": -2.046879529953003, "logps/chosen": -12.938554763793945, "logps/rejected": -642.0476684570312, "logps_avg/chosen": -0.07505569607019424, "logps_avg/rejected": -3.5746874809265137, "loss": 0.0774, "losses_ref": -0.0007500805077143013, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7990, "u": -5.365180492401123, "weight": 0.05708513781428337 }, { "diff_generated": -33.259788513183594, "epoch": 2.5923525599481527, "grad_norm": 2.6148008147179413, "learning_rate": 4.4160556020625026e-08, "logits/chosen": -2.260484218597412, "logits/rejected": -2.0400381088256836, "logps/chosen": -14.224832534790039, "logps/rejected": -642.1659545898438, "logps_avg/chosen": -0.07914839684963226, "logps_avg/rejected": -3.325979232788086, "loss": 0.0761, "losses_ref": -0.0006360385450534523, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8000, "u": -5.328615665435791, "weight": 0.06318946927785873 }, { "diff_generated": -35.72733688354492, "epoch": 2.5955930006480883, "grad_norm": 2.7878707857614335, "learning_rate": 4.347422442006476e-08, "logits/chosen": -2.2746644020080566, "logits/rejected": -2.0587804317474365, "logps/chosen": -14.194560050964355, "logps/rejected": -652.4595947265625, "logps_avg/chosen": -0.0780181735754013, "logps_avg/rejected": -3.5727341175079346, "loss": 0.0751, "losses_ref": -0.0010231004562228918, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8010, "u": -5.437048435211182, "weight": 0.04501374438405037 }, { "diff_generated": -35.51588821411133, "epoch": 2.5988334413480234, "grad_norm": 2.7540198565819782, "learning_rate": 4.2792961461781064e-08, "logits/chosen": -2.321173667907715, "logits/rejected": -2.0663743019104004, "logps/chosen": -14.140558242797852, "logps/rejected": -648.8035888671875, "logps_avg/chosen": -0.07980917394161224, "logps_avg/rejected": -3.551588773727417, "loss": 0.0773, "losses_ref": -0.0003094382118433714, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8020, "u": -5.509263515472412, "weight": 0.0315590463578701 }, { "diff_generated": -33.125953674316406, "epoch": 2.6020738820479585, "grad_norm": 2.755074755828332, "learning_rate": 4.211677683112751e-08, "logits/chosen": -2.28379487991333, "logits/rejected": -2.045283794403076, "logps/chosen": -12.904928207397461, "logps/rejected": -624.020263671875, "logps_avg/chosen": -0.07249008119106293, "logps_avg/rejected": -3.3125953674316406, "loss": 0.0759, "losses_ref": -0.00020831017172895372, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8030, "u": -5.257404804229736, "weight": 0.07521463930606842 }, { "diff_generated": -35.30225372314453, "epoch": 2.6053143227478937, "grad_norm": 2.6717290920800196, "learning_rate": 4.1445680141260594e-08, "logits/chosen": -2.317934036254883, "logits/rejected": -2.0665221214294434, "logps/chosen": -15.472373962402344, "logps/rejected": -655.91748046875, "logps_avg/chosen": -0.08271267265081406, "logps_avg/rejected": -3.5302252769470215, "loss": 0.0786, "losses_ref": -8.515668741893023e-05, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8040, "u": -5.471126079559326, "weight": 0.037577468901872635 }, { "diff_generated": -34.73823547363281, "epoch": 2.6085547634478288, "grad_norm": 2.72154979767351, "learning_rate": 4.077968093300237e-08, "logits/chosen": -2.2775912284851074, "logits/rejected": -2.0561890602111816, "logps/chosen": -13.31470775604248, "logps/rejected": -645.74072265625, "logps_avg/chosen": -0.0767243281006813, "logps_avg/rejected": -3.4738240242004395, "loss": 0.0758, "losses_ref": -0.0005114816012792289, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8050, "u": -5.501688003540039, "weight": 0.03185393661260605 }, { "diff_generated": -37.13385009765625, "epoch": 2.6117952041477643, "grad_norm": 2.855539364129418, "learning_rate": 4.011878867470542e-08, "logits/chosen": -2.282379627227783, "logits/rejected": -2.0570123195648193, "logps/chosen": -15.232747077941895, "logps/rejected": -693.3724365234375, "logps_avg/chosen": -0.08379258215427399, "logps_avg/rejected": -3.7133851051330566, "loss": 0.0785, "losses_ref": -0.00034684973070397973, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8060, "u": -5.579532623291016, "weight": 0.01912674866616726 }, { "diff_generated": -34.14933395385742, "epoch": 2.6150356448476995, "grad_norm": 2.6408453666827367, "learning_rate": 3.9463012762118144e-08, "logits/chosen": -2.21820068359375, "logits/rejected": -2.0539538860321045, "logps/chosen": -11.912099838256836, "logps/rejected": -666.9979248046875, "logps_avg/chosen": -0.07595182210206985, "logps_avg/rejected": -3.414933443069458, "loss": 0.0756, "losses_ref": -0.0010541939409449697, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8070, "u": -5.364609241485596, "weight": 0.05779505521059036 }, { "diff_generated": -34.31537628173828, "epoch": 2.6182760855476346, "grad_norm": 2.8379084234493486, "learning_rate": 3.8812362518250816e-08, "logits/chosen": -2.2986514568328857, "logits/rejected": -2.073610305786133, "logps/chosen": -14.934292793273926, "logps/rejected": -623.4342651367188, "logps_avg/chosen": -0.0827370285987854, "logps_avg/rejected": -3.431537628173828, "loss": 0.0801, "losses_ref": -0.00031489311368204653, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8080, "u": -5.367594242095947, "weight": 0.05658316612243652 }, { "diff_generated": -35.86017608642578, "epoch": 2.6215165262475697, "grad_norm": 2.6788056351776177, "learning_rate": 3.816684719324352e-08, "logits/chosen": -2.241401195526123, "logits/rejected": -2.0460305213928223, "logps/chosen": -12.66572380065918, "logps/rejected": -684.3659057617188, "logps_avg/chosen": -0.07562129199504852, "logps_avg/rejected": -3.586017608642578, "loss": 0.0771, "losses_ref": -0.0008628388168290257, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8090, "u": -5.4722185134887695, "weight": 0.03859802335500717 }, { "diff_generated": -34.25069046020508, "epoch": 2.624756966947505, "grad_norm": 2.7244222042198363, "learning_rate": 3.7526475964234286e-08, "logits/chosen": -2.25890851020813, "logits/rejected": -2.0612587928771973, "logps/chosen": -13.041460990905762, "logps/rejected": -637.5123901367188, "logps_avg/chosen": -0.07453655451536179, "logps_avg/rejected": -3.4250690937042236, "loss": 0.0771, "losses_ref": -0.0009745571878738701, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8100, "u": -5.402337551116943, "weight": 0.0512116476893425 }, { "diff_generated": -34.15746307373047, "epoch": 2.62799740764744, "grad_norm": 2.7350343370428125, "learning_rate": 3.689125793522874e-08, "logits/chosen": -2.233297109603882, "logits/rejected": -1.999696969985962, "logps/chosen": -12.732789993286133, "logps/rejected": -624.6839599609375, "logps_avg/chosen": -0.07528980076313019, "logps_avg/rejected": -3.4157466888427734, "loss": 0.077, "losses_ref": -7.013262074906379e-05, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8110, "u": -5.3310370445251465, "weight": 0.06256024539470673 }, { "diff_generated": -34.039451599121094, "epoch": 2.631237848347375, "grad_norm": 2.891218805017069, "learning_rate": 3.6261202136970814e-08, "logits/chosen": -2.2619783878326416, "logits/rejected": -2.040761709213257, "logps/chosen": -13.35377025604248, "logps/rejected": -598.27880859375, "logps_avg/chosen": -0.07534444332122803, "logps_avg/rejected": -3.403945207595825, "loss": 0.0762, "losses_ref": -0.0006816794048063457, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8120, "u": -5.329941749572754, "weight": 0.06335194408893585 }, { "diff_generated": -32.30815887451172, "epoch": 2.63447828904731, "grad_norm": 3.2827797507216117, "learning_rate": 3.563631752681422e-08, "logits/chosen": -2.2446212768554688, "logits/rejected": -2.0790886878967285, "logps/chosen": -13.016275405883789, "logps/rejected": -612.4979248046875, "logps_avg/chosen": -0.07507045567035675, "logps_avg/rejected": -3.2308154106140137, "loss": 0.0772, "losses_ref": -0.0002835232298821211, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8130, "u": -5.437108993530273, "weight": 0.04403982684016228 }, { "diff_generated": -35.24883270263672, "epoch": 2.6377187297472457, "grad_norm": 2.659663241127319, "learning_rate": 3.501661298859489e-08, "logits/chosen": -2.2246174812316895, "logits/rejected": -2.0028109550476074, "logps/chosen": -15.065336227416992, "logps/rejected": -649.087646484375, "logps_avg/chosen": -0.08108867704868317, "logps_avg/rejected": -3.524883270263672, "loss": 0.0776, "losses_ref": -0.0013233883073553443, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8140, "u": -5.504677772521973, "weight": 0.03295652195811272 }, { "diff_generated": -32.72626495361328, "epoch": 2.640959170447181, "grad_norm": 2.8495814105659174, "learning_rate": 3.4402097332505074e-08, "logits/chosen": -2.2631309032440186, "logits/rejected": -2.032370090484619, "logps/chosen": -13.472801208496094, "logps/rejected": -625.4802856445312, "logps_avg/chosen": -0.07656830549240112, "logps_avg/rejected": -3.2726263999938965, "loss": 0.0771, "losses_ref": -0.0003127239178866148, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8150, "u": -5.327264785766602, "weight": 0.06283261626958847 }, { "diff_generated": -34.16876983642578, "epoch": 2.644199611147116, "grad_norm": 2.784594617604692, "learning_rate": 3.379277929496798e-08, "logits/chosen": -2.2317280769348145, "logits/rejected": -2.0046327114105225, "logps/chosen": -13.157957077026367, "logps/rejected": -641.9828491210938, "logps_avg/chosen": -0.07431378960609436, "logps_avg/rejected": -3.416877269744873, "loss": 0.0771, "losses_ref": -0.000572945165913552, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8160, "u": -5.249545574188232, "weight": 0.07572882622480392 }, { "diff_generated": -32.216583251953125, "epoch": 2.647440051847051, "grad_norm": 3.033818763876657, "learning_rate": 3.3188667538513435e-08, "logits/chosen": -2.201606273651123, "logits/rejected": -2.018627643585205, "logps/chosen": -12.753862380981445, "logps/rejected": -629.6954345703125, "logps_avg/chosen": -0.07567889988422394, "logps_avg/rejected": -3.221658229827881, "loss": 0.0762, "losses_ref": -0.0009025133331306279, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8170, "u": -5.260069847106934, "weight": 0.07612423598766327 }, { "diff_generated": -35.8400993347168, "epoch": 2.6506804925469862, "grad_norm": 2.712181633748497, "learning_rate": 3.258977065165478e-08, "logits/chosen": -2.292752742767334, "logits/rejected": -2.0390093326568604, "logps/chosen": -13.461270332336426, "logps/rejected": -626.172119140625, "logps_avg/chosen": -0.0749615877866745, "logps_avg/rejected": -3.584010362625122, "loss": 0.0772, "losses_ref": -0.000735091685783118, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8180, "u": -5.398656368255615, "weight": 0.05097651481628418 }, { "diff_generated": -34.66440963745117, "epoch": 2.653920933246922, "grad_norm": 2.907996168398012, "learning_rate": 3.1996097148766897e-08, "logits/chosen": -2.2223732471466064, "logits/rejected": -2.05733060836792, "logps/chosen": -12.831570625305176, "logps/rejected": -675.23779296875, "logps_avg/chosen": -0.07762549072504044, "logps_avg/rejected": -3.4664406776428223, "loss": 0.0757, "losses_ref": -0.0008757191826589406, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8190, "u": -5.366507530212402, "weight": 0.05740770697593689 }, { "diff_generated": -37.49953842163086, "epoch": 2.657161373946857, "grad_norm": 2.923785207024441, "learning_rate": 3.1407655469964754e-08, "logits/chosen": -2.308901071548462, "logits/rejected": -2.0445334911346436, "logps/chosen": -13.068899154663086, "logps/rejected": -683.8781127929688, "logps_avg/chosen": -0.0730833113193512, "logps_avg/rejected": -3.7499542236328125, "loss": 0.0749, "losses_ref": -0.0002511995262466371, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8200, "u": -5.397935390472412, "weight": 0.05026369169354439 }, { "diff_generated": -32.94683074951172, "epoch": 2.660401814646792, "grad_norm": 2.7834058244584887, "learning_rate": 3.0824453980984234e-08, "logits/chosen": -2.2628774642944336, "logits/rejected": -2.0234005451202393, "logps/chosen": -13.097677230834961, "logps/rejected": -620.1732177734375, "logps_avg/chosen": -0.07187594473361969, "logps_avg/rejected": -3.2946829795837402, "loss": 0.0785, "losses_ref": -0.0001284389873035252, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8210, "u": -5.295039176940918, "weight": 0.0688696950674057 }, { "diff_generated": -32.9188117980957, "epoch": 2.663642255346727, "grad_norm": 2.6893460228311223, "learning_rate": 3.0246500973062184e-08, "logits/chosen": -2.2631959915161133, "logits/rejected": -1.998734474182129, "logps/chosen": -14.383687019348145, "logps/rejected": -600.3707275390625, "logps_avg/chosen": -0.07979197800159454, "logps_avg/rejected": -3.2918810844421387, "loss": 0.077, "losses_ref": -0.001167912851087749, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8220, "u": -5.286744117736816, "weight": 0.07036517560482025 }, { "diff_generated": -35.13602066040039, "epoch": 2.6668826960466623, "grad_norm": 2.7774076969393944, "learning_rate": 2.9673804662819324e-08, "logits/chosen": -2.256629467010498, "logits/rejected": -1.9869369268417358, "logps/chosen": -13.804142951965332, "logps/rejected": -632.3235473632812, "logps_avg/chosen": -0.07562381774187088, "logps_avg/rejected": -3.513601779937744, "loss": 0.0774, "losses_ref": -0.000869055453222245, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8230, "u": -5.329628944396973, "weight": 0.06361083686351776 }, { "diff_generated": -33.075477600097656, "epoch": 2.6701231367465974, "grad_norm": 2.7856281925799227, "learning_rate": 2.9106373192143087e-08, "logits/chosen": -2.2716612815856934, "logits/rejected": -2.0574049949645996, "logps/chosen": -12.818155288696289, "logps/rejected": -628.197998046875, "logps_avg/chosen": -0.07139711827039719, "logps_avg/rejected": -3.3075473308563232, "loss": 0.0758, "losses_ref": -0.00030460403650067747, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8240, "u": -5.325705528259277, "weight": 0.06283704191446304 }, { "diff_generated": -35.3209342956543, "epoch": 2.6733635774465325, "grad_norm": 2.905241637475343, "learning_rate": 2.854421462807193e-08, "logits/chosen": -2.2535653114318848, "logits/rejected": -2.0166335105895996, "logps/chosen": -13.077981948852539, "logps/rejected": -663.8329467773438, "logps_avg/chosen": -0.0728570744395256, "logps_avg/rejected": -3.5320937633514404, "loss": 0.074, "losses_ref": -8.225092460634187e-05, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8250, "u": -5.434994697570801, "weight": 0.04382305219769478 }, { "diff_generated": -33.154258728027344, "epoch": 2.6766040181464676, "grad_norm": 2.9496787017841335, "learning_rate": 2.798733696268063e-08, "logits/chosen": -2.2341184616088867, "logits/rejected": -1.998202919960022, "logps/chosen": -14.134020805358887, "logps/rejected": -610.1220092773438, "logps_avg/chosen": -0.07861624658107758, "logps_avg/rejected": -3.315425395965576, "loss": 0.0796, "losses_ref": -0.00030415848596021533, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8260, "u": -5.36326789855957, "weight": 0.056549690663814545 }, { "diff_generated": -34.65415573120117, "epoch": 2.679844458846403, "grad_norm": 2.862086996557468, "learning_rate": 2.7435748112966694e-08, "logits/chosen": -2.2305984497070312, "logits/rejected": -2.041346311569214, "logps/chosen": -12.520244598388672, "logps/rejected": -678.1482543945312, "logps_avg/chosen": -0.07185360789299011, "logps_avg/rejected": -3.4654159545898438, "loss": 0.0765, "losses_ref": -7.135640771593899e-05, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8270, "u": -5.397140979766846, "weight": 0.05006079748272896 }, { "diff_generated": -35.461692810058594, "epoch": 2.6830848995463383, "grad_norm": 3.093628968139604, "learning_rate": 2.6889455920737903e-08, "logits/chosen": -2.2679262161254883, "logits/rejected": -1.9786310195922852, "logps/chosen": -16.094085693359375, "logps/rejected": -624.7288818359375, "logps_avg/chosen": -0.08841854333877563, "logps_avg/rejected": -3.5461692810058594, "loss": 0.0782, "losses_ref": -0.00011158763663843274, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8280, "u": -5.260346412658691, "weight": 0.07509959489107132 }, { "diff_generated": -33.50742721557617, "epoch": 2.6863253402462735, "grad_norm": 2.9685480466908, "learning_rate": 2.6348468152500357e-08, "logits/chosen": -2.2659404277801514, "logits/rejected": -2.062710762023926, "logps/chosen": -12.312782287597656, "logps/rejected": -627.5169677734375, "logps_avg/chosen": -0.07124846428632736, "logps_avg/rejected": -3.3507423400878906, "loss": 0.078, "losses_ref": -6.677229976048693e-05, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8290, "u": -5.362844944000244, "weight": 0.05630839988589287 }, { "diff_generated": -36.79378890991211, "epoch": 2.6895657809462086, "grad_norm": 2.809794362122494, "learning_rate": 2.5812792499348935e-08, "logits/chosen": -2.2740607261657715, "logits/rejected": -2.0205042362213135, "logps/chosen": -14.752975463867188, "logps/rejected": -658.4273681640625, "logps_avg/chosen": -0.07980714738368988, "logps_avg/rejected": -3.6793792247772217, "loss": 0.0758, "losses_ref": -4.300654472899623e-05, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8300, "u": -5.545115947723389, "weight": 0.02503451332449913 }, { "diff_generated": -34.12705612182617, "epoch": 2.692806221646144, "grad_norm": 2.580660517105373, "learning_rate": 2.5282436576857046e-08, "logits/chosen": -2.2692408561706543, "logits/rejected": -2.07869291305542, "logps/chosen": -13.142602920532227, "logps/rejected": -611.646728515625, "logps_avg/chosen": -0.07218606770038605, "logps_avg/rejected": -3.412705898284912, "loss": 0.0745, "losses_ref": -0.0006400069105438888, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8310, "u": -5.396452903747559, "weight": 0.050875671207904816 }, { "diff_generated": -34.58293151855469, "epoch": 2.6960466623460793, "grad_norm": 2.9235840639131943, "learning_rate": 2.4757407924968878e-08, "logits/chosen": -2.2564258575439453, "logits/rejected": -2.059000015258789, "logps/chosen": -12.19434642791748, "logps/rejected": -637.7523803710938, "logps_avg/chosen": -0.07454869151115417, "logps_avg/rejected": -3.4582931995391846, "loss": 0.0768, "losses_ref": -0.0002925902372226119, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8320, "u": -5.332849502563477, "weight": 0.06286852061748505 }, { "diff_generated": -35.07324981689453, "epoch": 2.6992871030460144, "grad_norm": 2.8504828315929247, "learning_rate": 2.4237714007892117e-08, "logits/chosen": -2.3100998401641846, "logits/rejected": -2.068859577178955, "logps/chosen": -15.198066711425781, "logps/rejected": -663.3590698242188, "logps_avg/chosen": -0.08022447675466537, "logps_avg/rejected": -3.5073249340057373, "loss": 0.0776, "losses_ref": -0.0007650894112884998, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8330, "u": -5.469299793243408, "weight": 0.03833349421620369 }, { "diff_generated": -33.52021408081055, "epoch": 2.7025275437459495, "grad_norm": 2.9592373704389416, "learning_rate": 2.372336221399176e-08, "logits/chosen": -2.2604236602783203, "logits/rejected": -2.0275778770446777, "logps/chosen": -13.657310485839844, "logps/rejected": -625.4544067382812, "logps_avg/chosen": -0.07668985426425934, "logps_avg/rejected": -3.3520214557647705, "loss": 0.075, "losses_ref": -0.0005120009882375598, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8340, "u": -5.224454402923584, "weight": 0.08183753490447998 }, { "diff_generated": -33.873512268066406, "epoch": 2.7057679844458846, "grad_norm": 2.71910932232536, "learning_rate": 2.3214359855685095e-08, "logits/chosen": -2.250415563583374, "logits/rejected": -2.01396107673645, "logps/chosen": -13.784021377563477, "logps/rejected": -591.624267578125, "logps_avg/chosen": -0.07690791040658951, "logps_avg/rejected": -3.3873507976531982, "loss": 0.0741, "losses_ref": -0.0011790532153099775, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8350, "u": -5.472784042358398, "weight": 0.03912579268217087 }, { "diff_generated": -36.73569869995117, "epoch": 2.7090084251458197, "grad_norm": 2.7945233636437603, "learning_rate": 2.271071416933772e-08, "logits/chosen": -2.282506227493286, "logits/rejected": -2.0563690662384033, "logps/chosen": -12.65495491027832, "logps/rejected": -665.0225830078125, "logps_avg/chosen": -0.07482697814702988, "logps_avg/rejected": -3.673570156097412, "loss": 0.0759, "losses_ref": -4.985265695722774e-05, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8360, "u": -5.4010467529296875, "weight": 0.05004154518246651 }, { "diff_generated": -33.571414947509766, "epoch": 2.712248865845755, "grad_norm": 2.9122007198187756, "learning_rate": 2.2212432315160855e-08, "logits/chosen": -2.2580132484436035, "logits/rejected": -2.0520682334899902, "logps/chosen": -13.503583908081055, "logps/rejected": -601.1464233398438, "logps_avg/chosen": -0.07640071213245392, "logps_avg/rejected": -3.3571410179138184, "loss": 0.0769, "losses_ref": -0.00020067494187969714, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8370, "u": -5.330541133880615, "weight": 0.06268791854381561 }, { "diff_generated": -33.323726654052734, "epoch": 2.71548930654569, "grad_norm": 2.8899911784312664, "learning_rate": 2.171952137710904e-08, "logits/chosen": -2.313049793243408, "logits/rejected": -2.0854876041412354, "logps/chosen": -12.245684623718262, "logps/rejected": -647.4600830078125, "logps_avg/chosen": -0.06888893991708755, "logps_avg/rejected": -3.3323721885681152, "loss": 0.0768, "losses_ref": -0.0003442336746957153, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8380, "u": -5.473523139953613, "weight": 0.037879910320043564 }, { "diff_generated": -34.752471923828125, "epoch": 2.7187297472456255, "grad_norm": 2.624287373775904, "learning_rate": 2.1231988362780327e-08, "logits/chosen": -2.252321720123291, "logits/rejected": -2.008906602859497, "logps/chosen": -13.585721969604492, "logps/rejected": -649.451171875, "logps_avg/chosen": -0.07445680350065231, "logps_avg/rejected": -3.475247621536255, "loss": 0.0766, "losses_ref": -0.0002541754802223295, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8390, "u": -5.295945644378662, "weight": 0.06904669106006622 }, { "diff_generated": -36.02851867675781, "epoch": 2.7219701879455607, "grad_norm": 2.916752708658845, "learning_rate": 2.0749840203315584e-08, "logits/chosen": -2.2832603454589844, "logits/rejected": -2.0546741485595703, "logps/chosen": -14.805493354797363, "logps/rejected": -696.9401245117188, "logps_avg/chosen": -0.08261923491954803, "logps_avg/rejected": -3.6028518676757812, "loss": 0.0751, "losses_ref": -0.00011947475286433473, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8400, "u": -5.43557071685791, "weight": 0.0438610278069973 }, { "diff_generated": -35.815834045410156, "epoch": 2.725210628645496, "grad_norm": 2.6124609937812986, "learning_rate": 2.0273083753300724e-08, "logits/chosen": -2.291124105453491, "logits/rejected": -1.9985876083374023, "logps/chosen": -14.099342346191406, "logps/rejected": -640.3782348632812, "logps_avg/chosen": -0.07880159467458725, "logps_avg/rejected": -3.5815834999084473, "loss": 0.0777, "losses_ref": -0.0009120380273088813, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8410, "u": -5.433448314666748, "weight": 0.04484781622886658 }, { "diff_generated": -33.46442794799805, "epoch": 2.728451069345431, "grad_norm": 3.0081522859711636, "learning_rate": 1.980172579066899e-08, "logits/chosen": -2.3004908561706543, "logits/rejected": -2.056856870651245, "logps/chosen": -14.426023483276367, "logps/rejected": -644.3485717773438, "logps_avg/chosen": -0.08110615611076355, "logps_avg/rejected": -3.346442699432373, "loss": 0.0767, "losses_ref": -0.0003365448210388422, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8420, "u": -5.472744464874268, "weight": 0.03788283094763756 }, { "diff_generated": -35.44267272949219, "epoch": 2.731691510045366, "grad_norm": 2.665139502680527, "learning_rate": 1.9335773016604608e-08, "logits/chosen": -2.2856247425079346, "logits/rejected": -2.055262804031372, "logps/chosen": -14.615339279174805, "logps/rejected": -697.371826171875, "logps_avg/chosen": -0.07996072620153427, "logps_avg/rejected": -3.544267177581787, "loss": 0.0777, "losses_ref": -0.000313569646095857, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8430, "u": -5.545459747314453, "weight": 0.025315571576356888 }, { "diff_generated": -35.521690368652344, "epoch": 2.7349319507453016, "grad_norm": 2.7734083827432023, "learning_rate": 1.887523205544741e-08, "logits/chosen": -2.2610440254211426, "logits/rejected": -2.050652027130127, "logps/chosen": -14.45335865020752, "logps/rejected": -622.8594970703125, "logps_avg/chosen": -0.07939153164625168, "logps_avg/rejected": -3.552168607711792, "loss": 0.0766, "losses_ref": -0.0003281990939285606, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8440, "u": -5.403284072875977, "weight": 0.05036498233675957 }, { "diff_generated": -36.24019241333008, "epoch": 2.7381723914452367, "grad_norm": 2.7395054198494893, "learning_rate": 1.8420109454598997e-08, "logits/chosen": -2.304332971572876, "logits/rejected": -2.057947874069214, "logps/chosen": -12.37016487121582, "logps/rejected": -682.6101684570312, "logps_avg/chosen": -0.07354007661342621, "logps_avg/rejected": -3.624018907546997, "loss": 0.0773, "losses_ref": -0.0003148287651129067, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8450, "u": -5.472707271575928, "weight": 0.0378398522734642 }, { "diff_generated": -32.481624603271484, "epoch": 2.741412832145172, "grad_norm": 2.8390132929113516, "learning_rate": 1.797041168442921e-08, "logits/chosen": -2.293044328689575, "logits/rejected": -2.065861701965332, "logps/chosen": -14.110382080078125, "logps/rejected": -585.2341918945312, "logps_avg/chosen": -0.0777052789926529, "logps_avg/rejected": -3.2481625080108643, "loss": 0.078, "losses_ref": -0.00031997732003219426, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8460, "u": -5.43752908706665, "weight": 0.04411545395851135 }, { "diff_generated": -34.74770736694336, "epoch": 2.744653272845107, "grad_norm": 2.7392703290260005, "learning_rate": 1.7526145138184377e-08, "logits/chosen": -2.314601182937622, "logits/rejected": -2.0743463039398193, "logps/chosen": -13.695550918579102, "logps/rejected": -634.08740234375, "logps_avg/chosen": -0.07747956365346909, "logps_avg/rejected": -3.474771022796631, "loss": 0.0782, "losses_ref": -0.0006029107025824487, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8470, "u": -5.472765922546387, "weight": 0.0382402203977108 }, { "diff_generated": -34.0060920715332, "epoch": 2.747893713545042, "grad_norm": 2.8765591727104396, "learning_rate": 1.708731613189669e-08, "logits/chosen": -2.3314247131347656, "logits/rejected": -2.0638935565948486, "logps/chosen": -15.631362915039062, "logps/rejected": -601.2354736328125, "logps_avg/chosen": -0.0813279002904892, "logps_avg/rejected": -3.400609254837036, "loss": 0.0773, "losses_ref": -0.00018041368457488716, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8480, "u": -5.435568332672119, "weight": 0.04391627386212349 }, { "diff_generated": -35.1057243347168, "epoch": 2.751134154244977, "grad_norm": 2.781336468432542, "learning_rate": 1.6653930904293677e-08, "logits/chosen": -2.2818095684051514, "logits/rejected": -2.0274269580841064, "logps/chosen": -14.73913860321045, "logps/rejected": -648.7092895507812, "logps_avg/chosen": -0.07969610393047333, "logps_avg/rejected": -3.5105724334716797, "loss": 0.0764, "losses_ref": -0.0015385873848572373, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8490, "u": -5.467836856842041, "weight": 0.03979206085205078 }, { "diff_generated": -38.000701904296875, "epoch": 2.7543745949449123, "grad_norm": 2.7782768367868798, "learning_rate": 1.6225995616710297e-08, "logits/chosen": -2.2996420860290527, "logits/rejected": -2.0055785179138184, "logps/chosen": -14.530261039733887, "logps/rejected": -654.1446533203125, "logps_avg/chosen": -0.07845546305179596, "logps_avg/rejected": -3.800069808959961, "loss": 0.0748, "losses_ref": -0.0009555866126902401, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8500, "u": -5.5420684814453125, "weight": 0.026282688602805138 }, { "diff_generated": -35.94993209838867, "epoch": 2.7576150356448474, "grad_norm": 2.7776443099222017, "learning_rate": 1.58035163530009e-08, "logits/chosen": -2.367295026779175, "logits/rejected": -2.0653717517852783, "logps/chosen": -15.043771743774414, "logps/rejected": -671.5582275390625, "logps_avg/chosen": -0.07488191872835159, "logps_avg/rejected": -3.5949931144714355, "loss": 0.0771, "losses_ref": -0.0011525630252435803, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8510, "u": -5.470739364624023, "weight": 0.03922674432396889 }, { "diff_generated": -34.60573959350586, "epoch": 2.760855476344783, "grad_norm": 2.773851757598069, "learning_rate": 1.538649911945291e-08, "logits/chosen": -2.2526590824127197, "logits/rejected": -2.052077531814575, "logps/chosen": -14.62193489074707, "logps/rejected": -670.3350830078125, "logps_avg/chosen": -0.0839783325791359, "logps_avg/rejected": -3.4605743885040283, "loss": 0.079, "losses_ref": -0.0002155410184059292, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8520, "u": -5.506975173950195, "weight": 0.03145802766084671 }, { "diff_generated": -34.427696228027344, "epoch": 2.764095917044718, "grad_norm": 2.9143017584146054, "learning_rate": 1.497494984470107e-08, "logits/chosen": -2.257265567779541, "logits/rejected": -2.0002946853637695, "logps/chosen": -15.755085945129395, "logps/rejected": -644.8309936523438, "logps_avg/chosen": -0.08270631730556488, "logps_avg/rejected": -3.4427692890167236, "loss": 0.0774, "losses_ref": -0.001095159212127328, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8530, "u": -5.358209609985352, "weight": 0.057795751839876175 }, { "diff_generated": -31.565780639648438, "epoch": 2.7673363577446533, "grad_norm": 2.716723351573739, "learning_rate": 1.4568874379643936e-08, "logits/chosen": -2.290512800216675, "logits/rejected": -2.0521178245544434, "logps/chosen": -12.882516860961914, "logps/rejected": -598.2491455078125, "logps_avg/chosen": -0.06945003569126129, "logps_avg/rejected": -3.1565780639648438, "loss": 0.075, "losses_ref": -0.0006087241927161813, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8540, "u": -5.04725456237793, "weight": 0.113297238945961 }, { "diff_generated": -33.27885055541992, "epoch": 2.7705767984445884, "grad_norm": 2.7208468194715025, "learning_rate": 1.4168278497359798e-08, "logits/chosen": -2.32669734954834, "logits/rejected": -2.113250255584717, "logps/chosen": -13.624165534973145, "logps/rejected": -610.7620849609375, "logps_avg/chosen": -0.07655289769172668, "logps_avg/rejected": -3.327885150909424, "loss": 0.0757, "losses_ref": -0.00013436308654490858, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8550, "u": -5.400485038757324, "weight": 0.05012362450361252 }, { "diff_generated": -34.342716217041016, "epoch": 2.7738172391445235, "grad_norm": 2.768052375106582, "learning_rate": 1.3773167893025161e-08, "logits/chosen": -2.2872402667999268, "logits/rejected": -2.050503730773926, "logps/chosen": -15.019485473632812, "logps/rejected": -664.238525390625, "logps_avg/chosen": -0.0805942639708519, "logps_avg/rejected": -3.4342715740203857, "loss": 0.0769, "losses_ref": -0.0003937376313842833, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8560, "u": -5.361881732940674, "weight": 0.056711576879024506 }, { "diff_generated": -33.52961730957031, "epoch": 2.777057679844459, "grad_norm": 2.779576150611529, "learning_rate": 1.3383548183833715e-08, "logits/chosen": -2.2941019535064697, "logits/rejected": -2.067523241043091, "logps/chosen": -14.290433883666992, "logps/rejected": -590.431884765625, "logps_avg/chosen": -0.08122588694095612, "logps_avg/rejected": -3.352961778640747, "loss": 0.0768, "losses_ref": -0.0005186675698496401, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8570, "u": -5.29172420501709, "weight": 0.06934425234794617 }, { "diff_generated": -36.81610107421875, "epoch": 2.780298120544394, "grad_norm": 2.7797296431792393, "learning_rate": 1.2999424908916346e-08, "logits/chosen": -2.2424275875091553, "logits/rejected": -1.9814599752426147, "logps/chosen": -14.798460006713867, "logps/rejected": -660.2183227539062, "logps_avg/chosen": -0.08434576541185379, "logps_avg/rejected": -3.681610107421875, "loss": 0.0764, "losses_ref": -0.00028714913059957325, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8580, "u": -5.438215732574463, "weight": 0.04404758661985397 }, { "diff_generated": -33.61157989501953, "epoch": 2.7835385612443293, "grad_norm": 2.783877857928745, "learning_rate": 1.2620803529262357e-08, "logits/chosen": -2.2669458389282227, "logits/rejected": -2.053138256072998, "logps/chosen": -12.835203170776367, "logps/rejected": -592.4732055664062, "logps_avg/chosen": -0.07494383305311203, "logps_avg/rejected": -3.3611583709716797, "loss": 0.0747, "losses_ref": -0.0006721061654388905, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8590, "u": -5.401367664337158, "weight": 0.05073683336377144 }, { "diff_generated": -33.477386474609375, "epoch": 2.7867790019442644, "grad_norm": 2.7995289130877468, "learning_rate": 1.2247689427642027e-08, "logits/chosen": -2.3007161617279053, "logits/rejected": -2.055169105529785, "logps/chosen": -14.55711555480957, "logps/rejected": -620.1129150390625, "logps_avg/chosen": -0.0780087560415268, "logps_avg/rejected": -3.347738742828369, "loss": 0.0764, "losses_ref": -0.0003297061484772712, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8600, "u": -5.402604103088379, "weight": 0.05038810521364212 }, { "diff_generated": -37.25822830200195, "epoch": 2.7900194426441995, "grad_norm": 2.8526707944449226, "learning_rate": 1.1880087908529945e-08, "logits/chosen": -2.26621675491333, "logits/rejected": -2.0047826766967773, "logps/chosen": -14.627006530761719, "logps/rejected": -665.5721435546875, "logps_avg/chosen": -0.08128681033849716, "logps_avg/rejected": -3.725823163986206, "loss": 0.0769, "losses_ref": -0.0007085074321366847, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8610, "u": -5.400392055511475, "weight": 0.0508403554558754 }, { "diff_generated": -34.120948791503906, "epoch": 2.7932598833441347, "grad_norm": 2.535138540278953, "learning_rate": 1.1518004198029529e-08, "logits/chosen": -2.300736665725708, "logits/rejected": -2.0538346767425537, "logps/chosen": -15.004858016967773, "logps/rejected": -642.0592041015625, "logps_avg/chosen": -0.0812058076262474, "logps_avg/rejected": -3.412094831466675, "loss": 0.0772, "losses_ref": -0.00012083786714356393, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8620, "u": -5.397956371307373, "weight": 0.05010969564318657 }, { "diff_generated": -32.81501770019531, "epoch": 2.79650032404407, "grad_norm": 2.877966483352646, "learning_rate": 1.1161443443798946e-08, "logits/chosen": -2.2787914276123047, "logits/rejected": -2.059709072113037, "logps/chosen": -13.227984428405762, "logps/rejected": -628.3048095703125, "logps_avg/chosen": -0.07205172628164291, "logps_avg/rejected": -3.281501293182373, "loss": 0.077, "losses_ref": -5.9588219301076606e-05, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8630, "u": -5.293186187744141, "weight": 0.06879880279302597 }, { "diff_generated": -36.92168426513672, "epoch": 2.7997407647440054, "grad_norm": 2.9731936844378373, "learning_rate": 1.0810410714977747e-08, "logits/chosen": -2.236611843109131, "logits/rejected": -1.9774243831634521, "logps/chosen": -15.089825630187988, "logps/rejected": -671.8287963867188, "logps_avg/chosen": -0.08256001025438309, "logps_avg/rejected": -3.692168712615967, "loss": 0.0774, "losses_ref": -0.00027638330357149243, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8640, "u": -5.357158184051514, "weight": 0.05651743337512016 }, { "diff_generated": -36.185157775878906, "epoch": 2.8029812054439405, "grad_norm": 3.0221447823428393, "learning_rate": 1.0464911002114885e-08, "logits/chosen": -2.2851452827453613, "logits/rejected": -2.084177255630493, "logps/chosen": -14.222898483276367, "logps/rejected": -670.3880004882812, "logps_avg/chosen": -0.08353248238563538, "logps_avg/rejected": -3.6185154914855957, "loss": 0.0767, "losses_ref": -0.0012774534989148378, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8650, "u": -5.578444004058838, "weight": 0.020543891936540604 }, { "diff_generated": -34.61139678955078, "epoch": 2.8062216461438756, "grad_norm": 2.7265514642437094, "learning_rate": 1.0124949217097656e-08, "logits/chosen": -2.317802667617798, "logits/rejected": -2.0749332904815674, "logps/chosen": -12.769566535949707, "logps/rejected": -647.1751708984375, "logps_avg/chosen": -0.07187042385339737, "logps_avg/rejected": -3.461139678955078, "loss": 0.0773, "losses_ref": -0.0005036048823967576, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8660, "u": -5.4337568283081055, "weight": 0.04427201300859451 }, { "diff_generated": -36.47317886352539, "epoch": 2.8094620868438107, "grad_norm": 2.8915693908709352, "learning_rate": 9.790530193082114e-09, "logits/chosen": -2.285475730895996, "logits/rejected": -2.0016865730285645, "logps/chosen": -15.852392196655273, "logps/rejected": -661.9172973632812, "logps_avg/chosen": -0.08655592799186707, "logps_avg/rejected": -3.647317409515381, "loss": 0.0802, "losses_ref": -0.0001104946932173334, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8670, "u": -5.475028038024902, "weight": 0.03760233893990517 }, { "diff_generated": -34.181583404541016, "epoch": 2.812702527543746, "grad_norm": 2.895358001000874, "learning_rate": 9.461658684423968e-09, "logits/chosen": -2.2664928436279297, "logits/rejected": -2.0277068614959717, "logps/chosen": -15.316492080688477, "logps/rejected": -675.1531982421875, "logps_avg/chosen": -0.08478286117315292, "logps_avg/rejected": -3.4181582927703857, "loss": 0.0785, "losses_ref": -0.00024127769574988633, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8680, "u": -5.331995964050293, "weight": 0.06275250762701035 }, { "diff_generated": -34.03679656982422, "epoch": 2.8159429682436814, "grad_norm": 2.86929217256149, "learning_rate": 9.138339366611526e-09, "logits/chosen": -2.335113763809204, "logits/rejected": -2.091174840927124, "logps/chosen": -13.370208740234375, "logps/rejected": -636.9915161132812, "logps_avg/chosen": -0.07555247843265533, "logps_avg/rejected": -3.403679609298706, "loss": 0.0765, "losses_ref": -0.0005011368775740266, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8690, "u": -5.543493747711182, "weight": 0.025575250387191772 }, { "diff_generated": -34.14396286010742, "epoch": 2.8191834089436165, "grad_norm": 2.7985689135675913, "learning_rate": 8.82057683619859e-09, "logits/chosen": -2.233975887298584, "logits/rejected": -2.043182611465454, "logps/chosen": -11.014206886291504, "logps/rejected": -620.0697631835938, "logps_avg/chosen": -0.07187635451555252, "logps_avg/rejected": -3.414396286010742, "loss": 0.0737, "losses_ref": -0.0006925543420948088, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8700, "u": -5.368803977966309, "weight": 0.05707864090800285 }, { "diff_generated": -35.2857666015625, "epoch": 2.8224238496435516, "grad_norm": 2.780204667008307, "learning_rate": 8.508375610739626e-09, "logits/chosen": -2.307744264602661, "logits/rejected": -2.0247180461883545, "logps/chosen": -14.565858840942383, "logps/rejected": -652.158935546875, "logps_avg/chosen": -0.07844166457653046, "logps_avg/rejected": -3.528576612472534, "loss": 0.0767, "losses_ref": -0.0003016654518432915, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8710, "u": -5.438412666320801, "weight": 0.04406419396400452 }, { "diff_generated": -35.18162536621094, "epoch": 2.8256642903434868, "grad_norm": 2.952787034975414, "learning_rate": 8.201740128725365e-09, "logits/chosen": -2.264735221862793, "logits/rejected": -2.0731773376464844, "logps/chosen": -12.935811996459961, "logps/rejected": -644.2762451171875, "logps_avg/chosen": -0.0769585520029068, "logps_avg/rejected": -3.518162488937378, "loss": 0.0749, "losses_ref": -0.0003010678628925234, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8720, "u": -5.331357479095459, "weight": 0.0628284215927124 }, { "diff_generated": -33.50090789794922, "epoch": 2.828904731043422, "grad_norm": 2.6378440705183106, "learning_rate": 7.900674749519564e-09, "logits/chosen": -2.302125930786133, "logits/rejected": -2.0807623863220215, "logps/chosen": -14.297933578491211, "logps/rejected": -624.4725341796875, "logps_avg/chosen": -0.07768501341342926, "logps_avg/rejected": -3.3500912189483643, "loss": 0.0767, "losses_ref": -0.00035457880585454404, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8730, "u": -5.366321563720703, "weight": 0.05664187669754028 }, { "diff_generated": -36.1597785949707, "epoch": 2.832145171743357, "grad_norm": 2.731065373320376, "learning_rate": 7.605183753297283e-09, "logits/chosen": -2.332219123840332, "logits/rejected": -2.050250768661499, "logps/chosen": -14.659116744995117, "logps/rejected": -635.1754150390625, "logps_avg/chosen": -0.07973220199346542, "logps_avg/rejected": -3.6159775257110596, "loss": 0.0765, "losses_ref": -9.081260213861242e-05, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8740, "u": -5.324737548828125, "weight": 0.06257982552051544 }, { "diff_generated": -32.69268798828125, "epoch": 2.835385612443292, "grad_norm": 2.732954337771039, "learning_rate": 7.315271340983731e-09, "logits/chosen": -2.301483154296875, "logits/rejected": -2.0711069107055664, "logps/chosen": -13.039764404296875, "logps/rejected": -624.9254150390625, "logps_avg/chosen": -0.07299993187189102, "logps_avg/rejected": -3.269268751144409, "loss": 0.0752, "losses_ref": -0.00042771859443746507, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8750, "u": -5.33248233795166, "weight": 0.0629906952381134 }, { "diff_generated": -36.09053421020508, "epoch": 2.8386260531432272, "grad_norm": 2.672121170797753, "learning_rate": 7.030941634194932e-09, "logits/chosen": -2.306715726852417, "logits/rejected": -2.0734031200408936, "logps/chosen": -14.156460762023926, "logps/rejected": -667.5546264648438, "logps_avg/chosen": -0.07908565551042557, "logps_avg/rejected": -3.609053373336792, "loss": 0.079, "losses_ref": -0.00014179803838487715, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8760, "u": -5.332102298736572, "weight": 0.0626293420791626 }, { "diff_generated": -35.369422912597656, "epoch": 2.841866493843163, "grad_norm": 3.2762929988972, "learning_rate": 6.752198675178711e-09, "logits/chosen": -2.310044765472412, "logits/rejected": -2.0577964782714844, "logps/chosen": -12.950109481811523, "logps/rejected": -658.7679443359375, "logps_avg/chosen": -0.07082493603229523, "logps_avg/rejected": -3.53694224357605, "loss": 0.0751, "losses_ref": -0.0007642454584129155, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8770, "u": -5.402093410491943, "weight": 0.05093027278780937 }, { "diff_generated": -33.57276153564453, "epoch": 2.845106934543098, "grad_norm": 2.9315528501466384, "learning_rate": 6.479046426757584e-09, "logits/chosen": -2.2507405281066895, "logits/rejected": -1.9945474863052368, "logps/chosen": -13.323570251464844, "logps/rejected": -611.6863403320312, "logps_avg/chosen": -0.07454784214496613, "logps_avg/rejected": -3.3572757244110107, "loss": 0.0749, "losses_ref": -0.00011777288455050439, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8780, "u": -5.293331623077393, "weight": 0.06885615736246109 }, { "diff_generated": -33.651031494140625, "epoch": 2.848347375243033, "grad_norm": 2.9451502753913834, "learning_rate": 6.211488772272133e-09, "logits/chosen": -2.257718324661255, "logits/rejected": -2.0689949989318848, "logps/chosen": -12.400961875915527, "logps/rejected": -665.5693969726562, "logps_avg/chosen": -0.07117728888988495, "logps_avg/rejected": -3.365103244781494, "loss": 0.0754, "losses_ref": -0.0011662624310702085, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8790, "u": -5.36535120010376, "weight": 0.05799577385187149 }, { "diff_generated": -32.80426788330078, "epoch": 2.851587815942968, "grad_norm": 2.7998741601476436, "learning_rate": 5.9495295155260305e-09, "logits/chosen": -2.3018977642059326, "logits/rejected": -2.0845401287078857, "logps/chosen": -14.197778701782227, "logps/rejected": -623.5587158203125, "logps_avg/chosen": -0.07896491140127182, "logps_avg/rejected": -3.2804272174835205, "loss": 0.0784, "losses_ref": -0.00036378385266289115, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8800, "u": -5.40054178237915, "weight": 0.050371408462524414 }, { "diff_generated": -35.56111526489258, "epoch": 2.8548282566429033, "grad_norm": 2.9920608913232547, "learning_rate": 5.69317238073177e-09, "logits/chosen": -2.272759199142456, "logits/rejected": -2.0058791637420654, "logps/chosen": -13.803054809570312, "logps/rejected": -639.3179931640625, "logps_avg/chosen": -0.07773126661777496, "logps_avg/rejected": -3.5561110973358154, "loss": 0.0749, "losses_ref": -0.00025532610015943646, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8810, "u": -5.288957118988037, "weight": 0.06902964413166046 }, { "diff_generated": -34.240325927734375, "epoch": 2.858068697342839, "grad_norm": 2.6821027741068324, "learning_rate": 5.442421012457909e-09, "logits/chosen": -2.240999698638916, "logits/rejected": -2.008993148803711, "logps/chosen": -12.136984825134277, "logps/rejected": -616.26416015625, "logps_avg/chosen": -0.07080712169408798, "logps_avg/rejected": -3.424032688140869, "loss": 0.0753, "losses_ref": -9.598202450433746e-05, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8820, "u": -5.154082775115967, "weight": 0.09383802115917206 }, { "diff_generated": -36.475528717041016, "epoch": 2.861309138042774, "grad_norm": 3.178350218749426, "learning_rate": 5.197278975577069e-09, "logits/chosen": -2.230213165283203, "logits/rejected": -1.9734901189804077, "logps/chosen": -13.831930160522461, "logps/rejected": -667.8572998046875, "logps_avg/chosen": -0.08242715895175934, "logps_avg/rejected": -3.647552967071533, "loss": 0.0771, "losses_ref": -0.0016425810754299164, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8830, "u": -5.473348617553711, "weight": 0.039764031767845154 }, { "diff_generated": -33.9207763671875, "epoch": 2.864549578742709, "grad_norm": 2.8690068694207493, "learning_rate": 4.957749755215346e-09, "logits/chosen": -2.2395389080047607, "logits/rejected": -2.0926663875579834, "logps/chosen": -12.328548431396484, "logps/rejected": -645.32275390625, "logps_avg/chosen": -0.07605170458555222, "logps_avg/rejected": -3.392077684402466, "loss": 0.0791, "losses_ref": -0.0004766159108839929, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8840, "u": -5.4698357582092285, "weight": 0.03803374990820885 }, { "diff_generated": -36.080318450927734, "epoch": 2.8677900194426442, "grad_norm": 2.9374770724521193, "learning_rate": 4.723836756702848e-09, "logits/chosen": -2.2504429817199707, "logits/rejected": -1.9984312057495117, "logps/chosen": -12.243127822875977, "logps/rejected": -637.8248291015625, "logps_avg/chosen": -0.07207809388637543, "logps_avg/rejected": -3.6080322265625, "loss": 0.0759, "losses_ref": -0.0004624236316885799, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8850, "u": -5.293095588684082, "weight": 0.0692666620016098 }, { "diff_generated": -34.3936653137207, "epoch": 2.8710304601425793, "grad_norm": 2.863835701676596, "learning_rate": 4.495543305524974e-09, "logits/chosen": -2.269010066986084, "logits/rejected": -2.0275280475616455, "logps/chosen": -13.334935188293457, "logps/rejected": -628.5553588867188, "logps_avg/chosen": -0.0751592367887497, "logps_avg/rejected": -3.439366102218628, "loss": 0.077, "losses_ref": -0.00043837359407916665, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8860, "u": -5.3313493728637695, "weight": 0.06295828521251678 }, { "diff_generated": -35.203651428222656, "epoch": 2.8742709008425145, "grad_norm": 2.8246905754348792, "learning_rate": 4.2728726472756934e-09, "logits/chosen": -2.278592109680176, "logits/rejected": -2.0227110385894775, "logps/chosen": -15.298222541809082, "logps/rejected": -655.2586669921875, "logps_avg/chosen": -0.08556310832500458, "logps_avg/rejected": -3.5203652381896973, "loss": 0.0781, "losses_ref": -0.0014250215608626604, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8870, "u": -5.466298580169678, "weight": 0.03932160139083862 }, { "diff_generated": -37.24098205566406, "epoch": 2.8775113415424496, "grad_norm": 2.9024973540127244, "learning_rate": 4.055827947610746e-09, "logits/chosen": -2.273810625076294, "logits/rejected": -2.0100204944610596, "logps/chosen": -14.425886154174805, "logps/rejected": -702.1931762695312, "logps_avg/chosen": -0.07655589282512665, "logps_avg/rejected": -3.7240982055664062, "loss": 0.0751, "losses_ref": -0.0006063595064915717, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8880, "u": -5.471449851989746, "weight": 0.03819319233298302 }, { "diff_generated": -37.44664764404297, "epoch": 2.8807517822423847, "grad_norm": 2.9010138738691547, "learning_rate": 3.844412292203092e-09, "logits/chosen": -2.254565954208374, "logits/rejected": -1.9562934637069702, "logps/chosen": -13.297273635864258, "logps/rejected": -671.3079223632812, "logps_avg/chosen": -0.07370129227638245, "logps_avg/rejected": -3.7446651458740234, "loss": 0.0759, "losses_ref": -0.00030630582477897406, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8890, "u": -5.402109622955322, "weight": 0.05031546205282211 }, { "diff_generated": -33.91117477416992, "epoch": 2.8839922229423203, "grad_norm": 2.78041629343256, "learning_rate": 3.638628686698908e-09, "logits/chosen": -2.2411606311798096, "logits/rejected": -1.9876295328140259, "logps/chosen": -14.107019424438477, "logps/rejected": -651.743408203125, "logps_avg/chosen": -0.07398734986782074, "logps_avg/rejected": -3.3911170959472656, "loss": 0.0766, "losses_ref": -0.0007171139004640281, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8900, "u": -5.2610578536987305, "weight": 0.07587677985429764 }, { "diff_generated": -34.82379913330078, "epoch": 2.8872326636422554, "grad_norm": 3.1121766646536337, "learning_rate": 3.438480056674864e-09, "logits/chosen": -2.277547597885132, "logits/rejected": -2.014165163040161, "logps/chosen": -13.730894088745117, "logps/rejected": -658.84423828125, "logps_avg/chosen": -0.07599518448114395, "logps_avg/rejected": -3.482379913330078, "loss": 0.0769, "losses_ref": -0.0007300475845113397, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8910, "u": -5.328692436218262, "weight": 0.06340476125478745 }, { "diff_generated": -35.029197692871094, "epoch": 2.8904731043421905, "grad_norm": 2.96319952309038, "learning_rate": 3.243969247596423e-09, "logits/chosen": -2.2526772022247314, "logits/rejected": -2.013214111328125, "logps/chosen": -13.636917114257812, "logps/rejected": -669.37939453125, "logps_avg/chosen": -0.07388485968112946, "logps_avg/rejected": -3.5029196739196777, "loss": 0.0791, "losses_ref": -0.00036009997711516917, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8920, "u": -5.543162822723389, "weight": 0.025403300300240517 }, { "diff_generated": -35.807281494140625, "epoch": 2.8937135450421256, "grad_norm": 2.831128371216867, "learning_rate": 3.0550990247776522e-09, "logits/chosen": -2.2789690494537354, "logits/rejected": -2.0398330688476562, "logps/chosen": -12.486102104187012, "logps/rejected": -649.1627197265625, "logps_avg/chosen": -0.07312561571598053, "logps_avg/rejected": -3.580728530883789, "loss": 0.0771, "losses_ref": -0.001107497839257121, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8930, "u": -5.4004716873168945, "weight": 0.05142833665013313 }, { "diff_generated": -35.70695495605469, "epoch": 2.8969539857420608, "grad_norm": 2.79925996934682, "learning_rate": 2.871872073341608e-09, "logits/chosen": -2.2894484996795654, "logits/rejected": -2.0516390800476074, "logps/chosen": -13.297342300415039, "logps/rejected": -691.9161376953125, "logps_avg/chosen": -0.07701648771762848, "logps_avg/rejected": -3.570695400238037, "loss": 0.076, "losses_ref": -0.0008863200200721622, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8940, "u": -5.4015045166015625, "weight": 0.05104244500398636 }, { "diff_generated": -33.015342712402344, "epoch": 2.9001944264419963, "grad_norm": 3.05899367637055, "learning_rate": 2.694290998182325e-09, "logits/chosen": -2.293231964111328, "logits/rejected": -2.0784249305725098, "logps/chosen": -14.234578132629395, "logps/rejected": -644.7971801757812, "logps_avg/chosen": -0.0775744691491127, "logps_avg/rejected": -3.3015339374542236, "loss": 0.0775, "losses_ref": -0.0009552057599648833, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8950, "u": -5.434551239013672, "weight": 0.04488217085599899 }, { "diff_generated": -34.278602600097656, "epoch": 2.9034348671419314, "grad_norm": 2.9237886474738604, "learning_rate": 2.52235832392782e-09, "logits/chosen": -2.2705326080322266, "logits/rejected": -2.050858736038208, "logps/chosen": -13.253606796264648, "logps/rejected": -638.6285400390625, "logps_avg/chosen": -0.07300833612680435, "logps_avg/rejected": -3.4278602600097656, "loss": 0.0777, "losses_ref": -0.0007817854057066143, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8960, "u": -5.468316555023193, "weight": 0.03847404569387436 }, { "diff_generated": -35.571598052978516, "epoch": 2.9066753078418666, "grad_norm": 2.689594232404768, "learning_rate": 2.35607649490408e-09, "logits/chosen": -2.3017730712890625, "logits/rejected": -2.005580186843872, "logps/chosen": -15.054954528808594, "logps/rejected": -660.2969970703125, "logps_avg/chosen": -0.07933951914310455, "logps_avg/rejected": -3.557159900665283, "loss": 0.0779, "losses_ref": -0.0001488261332269758, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8970, "u": -5.6155853271484375, "weight": 0.012644929811358452 }, { "diff_generated": -34.998565673828125, "epoch": 2.9099157485418017, "grad_norm": 3.1797481778973307, "learning_rate": 2.1954478751003313e-09, "logits/chosen": -2.2666759490966797, "logits/rejected": -2.0279688835144043, "logps/chosen": -11.937381744384766, "logps/rejected": -630.4774780273438, "logps_avg/chosen": -0.06900829821825027, "logps_avg/rejected": -3.499856472015381, "loss": 0.0746, "losses_ref": -0.0006766252918168902, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8980, "u": -5.366180896759033, "weight": 0.05714557692408562 }, { "diff_generated": -35.84283447265625, "epoch": 2.913156189241737, "grad_norm": 2.8220235271975103, "learning_rate": 2.040474748135512e-09, "logits/chosen": -2.2554118633270264, "logits/rejected": -2.0103278160095215, "logps/chosen": -13.508143424987793, "logps/rejected": -655.4435424804688, "logps_avg/chosen": -0.07491391152143478, "logps_avg/rejected": -3.5842833518981934, "loss": 0.0764, "losses_ref": -0.00030079128919169307, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 8990, "u": -5.29534387588501, "weight": 0.06904434412717819 }, { "diff_generated": -32.51457977294922, "epoch": 2.916396629941672, "grad_norm": 2.909222858975261, "learning_rate": 1.8911593172258544e-09, "logits/chosen": -2.2562718391418457, "logits/rejected": -2.0374486446380615, "logps/chosen": -13.960149765014648, "logps/rejected": -612.7658081054688, "logps_avg/chosen": -0.07424553483724594, "logps_avg/rejected": -3.2514584064483643, "loss": 0.0773, "losses_ref": -0.0002548511838540435, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9000, "u": -5.330750942230225, "weight": 0.06278066337108612 }, { "diff_generated": -33.9921760559082, "epoch": 2.919637070641607, "grad_norm": 2.8440369601590207, "learning_rate": 1.7475037051532638e-09, "logits/chosen": -2.2932300567626953, "logits/rejected": -2.03836727142334, "logps/chosen": -14.461469650268555, "logps/rejected": -643.2342529296875, "logps_avg/chosen": -0.0805402547121048, "logps_avg/rejected": -3.3992176055908203, "loss": 0.0777, "losses_ref": -0.00012567141675390303, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9010, "u": -5.36099910736084, "weight": 0.05636243149638176 }, { "diff_generated": -33.33674240112305, "epoch": 2.9228775113415426, "grad_norm": 2.7812330472973668, "learning_rate": 1.609509954235566e-09, "logits/chosen": -2.2596919536590576, "logits/rejected": -2.0829384326934814, "logps/chosen": -13.54491138458252, "logps/rejected": -645.8247680664062, "logps_avg/chosen": -0.0812302976846695, "logps_avg/rejected": -3.333674669265747, "loss": 0.0753, "losses_ref": -0.0008088911999948323, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9020, "u": -5.473405361175537, "weight": 0.03860088437795639 }, { "diff_generated": -34.202003479003906, "epoch": 2.9261179520414777, "grad_norm": 3.025771626101646, "learning_rate": 1.4771800262970203e-09, "logits/chosen": -2.244457960128784, "logits/rejected": -2.0527331829071045, "logps/chosen": -14.208236694335938, "logps/rejected": -643.656005859375, "logps_avg/chosen": -0.0844058021903038, "logps_avg/rejected": -3.4202003479003906, "loss": 0.0764, "losses_ref": -0.0002643067273311317, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9030, "u": -5.436182975769043, "weight": 0.044031400233507156 }, { "diff_generated": -34.946044921875, "epoch": 2.929358392741413, "grad_norm": 2.7504660685046667, "learning_rate": 1.3505158026408724e-09, "logits/chosen": -2.258826494216919, "logits/rejected": -2.033946990966797, "logps/chosen": -16.03264808654785, "logps/rejected": -628.7940673828125, "logps_avg/chosen": -0.08787710964679718, "logps_avg/rejected": -3.4946041107177734, "loss": 0.076, "losses_ref": -0.000992965535260737, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9040, "u": -5.508906364440918, "weight": 0.032481517642736435 }, { "diff_generated": -35.106163024902344, "epoch": 2.932598833441348, "grad_norm": 2.646894119734614, "learning_rate": 1.2295190840223125e-09, "logits/chosen": -2.2918362617492676, "logits/rejected": -2.0843429565429688, "logps/chosen": -13.711763381958008, "logps/rejected": -652.3299560546875, "logps_avg/chosen": -0.07778448611497879, "logps_avg/rejected": -3.5106163024902344, "loss": 0.074, "losses_ref": -0.0004062582738697529, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9050, "u": -5.400134086608887, "weight": 0.050418026745319366 }, { "diff_generated": -34.945518493652344, "epoch": 2.935839274141283, "grad_norm": 2.731671929569239, "learning_rate": 1.1141915906228928e-09, "logits/chosen": -2.275437116622925, "logits/rejected": -2.0286378860473633, "logps/chosen": -12.794441223144531, "logps/rejected": -651.5943603515625, "logps_avg/chosen": -0.07423492521047592, "logps_avg/rejected": -3.494551420211792, "loss": 0.0755, "losses_ref": -0.0004529617144726217, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9060, "u": -5.432457447052002, "weight": 0.04429139569401741 }, { "diff_generated": -33.463844299316406, "epoch": 2.9390797148412187, "grad_norm": 2.9375110396431, "learning_rate": 1.0045349620262379e-09, "logits/chosen": -2.2780842781066895, "logits/rejected": -2.049278974533081, "logps/chosen": -13.470865249633789, "logps/rejected": -626.6282958984375, "logps_avg/chosen": -0.07648901641368866, "logps_avg/rejected": -3.3463847637176514, "loss": 0.0752, "losses_ref": -0.00029730232199653983, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9070, "u": -5.330691337585449, "weight": 0.06280551105737686 }, { "diff_generated": -33.51942443847656, "epoch": 2.942320155541154, "grad_norm": 2.8372455817426117, "learning_rate": 9.005507571945958e-10, "logits/chosen": -2.2972397804260254, "logits/rejected": -1.9933170080184937, "logps/chosen": -14.175498962402344, "logps/rejected": -617.5062255859375, "logps_avg/chosen": -0.07958875596523285, "logps_avg/rejected": -3.351942539215088, "loss": 0.0744, "losses_ref": -0.00039287720574066043, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9080, "u": -5.3291778564453125, "weight": 0.0629565417766571 }, { "diff_generated": -35.39436340332031, "epoch": 2.945560596241089, "grad_norm": 2.7022321365970066, "learning_rate": 8.022404544466788e-10, "logits/chosen": -2.286496877670288, "logits/rejected": -2.0509543418884277, "logps/chosen": -13.762849807739258, "logps/rejected": -622.6043701171875, "logps_avg/chosen": -0.07843898236751556, "logps_avg/rejected": -3.5394368171691895, "loss": 0.0746, "losses_ref": -0.0006384230218827724, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9090, "u": -5.439492225646973, "weight": 0.044552553445100784 }, { "diff_generated": -32.19483184814453, "epoch": 2.948801036941024, "grad_norm": 3.274540869163379, "learning_rate": 7.096054514367455e-10, "logits/chosen": -2.22894024848938, "logits/rejected": -2.036980390548706, "logps/chosen": -12.699054718017578, "logps/rejected": -615.8016357421875, "logps_avg/chosen": -0.07660754024982452, "logps_avg/rejected": -3.2194831371307373, "loss": 0.075, "losses_ref": -0.000949250883422792, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9100, "u": -5.186242580413818, "weight": 0.08869956433773041 }, { "diff_generated": -33.05915451049805, "epoch": 2.952041477640959, "grad_norm": 2.8465823055989925, "learning_rate": 6.226470651346182e-10, "logits/chosen": -2.2585575580596924, "logits/rejected": -2.082470655441284, "logps/chosen": -13.074926376342773, "logps/rejected": -632.4801025390625, "logps_avg/chosen": -0.07661643624305725, "logps_avg/rejected": -3.3059158325195312, "loss": 0.0765, "losses_ref": -0.0003152258286718279, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9110, "u": -5.401144027709961, "weight": 0.05031600594520569 }, { "diff_generated": -34.64142608642578, "epoch": 2.9552819183408943, "grad_norm": 2.778297006969446, "learning_rate": 5.413665318070304e-10, "logits/chosen": -2.264369487762451, "logits/rejected": -2.0411882400512695, "logps/chosen": -14.223039627075195, "logps/rejected": -649.0701904296875, "logps_avg/chosen": -0.08272770047187805, "logps_avg/rejected": -3.464142322540283, "loss": 0.0791, "losses_ref": -0.0005291260313242674, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9120, "u": -5.506434917449951, "weight": 0.031941771507263184 }, { "diff_generated": -34.728755950927734, "epoch": 2.9585223590408294, "grad_norm": 3.030304877955613, "learning_rate": 4.657650069999963e-10, "logits/chosen": -2.2826015949249268, "logits/rejected": -2.050818920135498, "logps/chosen": -13.238696098327637, "logps/rejected": -654.2847900390625, "logps_avg/chosen": -0.07198430597782135, "logps_avg/rejected": -3.4728755950927734, "loss": 0.0777, "losses_ref": -9.715888882055879e-05, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9130, "u": -5.3992228507995605, "weight": 0.05008460208773613 }, { "diff_generated": -37.10472869873047, "epoch": 2.9617627997407645, "grad_norm": 2.8495368006170585, "learning_rate": 3.95843565522469e-10, "logits/chosen": -2.2772858142852783, "logits/rejected": -1.9869455099105835, "logps/chosen": -12.714567184448242, "logps/rejected": -666.079833984375, "logps_avg/chosen": -0.07313890010118484, "logps_avg/rejected": -3.710472583770752, "loss": 0.0753, "losses_ref": -0.0004528468125499785, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9140, "u": -5.504135608673096, "weight": 0.03178011626005173 }, { "diff_generated": -32.66667175292969, "epoch": 2.9650032404407, "grad_norm": 2.6945972204400293, "learning_rate": 3.3160320143097444e-10, "logits/chosen": -2.309319496154785, "logits/rejected": -2.0778040885925293, "logps/chosen": -13.646771430969238, "logps/rejected": -614.1995849609375, "logps_avg/chosen": -0.07696790993213654, "logps_avg/rejected": -3.266667127609253, "loss": 0.0764, "losses_ref": -0.001008645980618894, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9150, "u": -5.1822099685668945, "weight": 0.0887695699930191 }, { "diff_generated": -37.78270721435547, "epoch": 2.968243681140635, "grad_norm": 3.783364664902114, "learning_rate": 2.7304482801548957e-10, "logits/chosen": -2.2790446281433105, "logits/rejected": -1.9892492294311523, "logps/chosen": -13.996490478515625, "logps/rejected": -638.9437255859375, "logps_avg/chosen": -0.0773763507604599, "logps_avg/rejected": -3.778270721435547, "loss": 0.0761, "losses_ref": -0.0019606896676123142, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9160, "u": -5.3608598709106445, "weight": 0.059903018176555634 }, { "diff_generated": -34.349769592285156, "epoch": 2.9714841218405703, "grad_norm": 2.763967615865681, "learning_rate": 2.201692777865194e-10, "logits/chosen": -2.25236177444458, "logits/rejected": -2.01774263381958, "logps/chosen": -12.498361587524414, "logps/rejected": -638.0401000976562, "logps_avg/chosen": -0.07391633093357086, "logps_avg/rejected": -3.4349770545959473, "loss": 0.0761, "losses_ref": -0.0002926269080489874, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9170, "u": -5.401417255401611, "weight": 0.050305772572755814 }, { "diff_generated": -37.91081237792969, "epoch": 2.9747245625405054, "grad_norm": 2.9257328592725975, "learning_rate": 1.729773024631953e-10, "logits/chosen": -2.2642340660095215, "logits/rejected": -2.015472888946533, "logps/chosen": -13.496482849121094, "logps/rejected": -673.7174682617188, "logps_avg/chosen": -0.0796927958726883, "logps_avg/rejected": -3.7910804748535156, "loss": 0.079, "losses_ref": -0.000292365497443825, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9180, "u": -5.440535545349121, "weight": 0.04405521973967552 }, { "diff_generated": -32.530948638916016, "epoch": 2.9779650032404406, "grad_norm": 2.713846199857598, "learning_rate": 1.3146957296261696e-10, "logits/chosen": -2.1936991214752197, "logits/rejected": -2.0365915298461914, "logps/chosen": -12.709261894226074, "logps/rejected": -644.2265014648438, "logps_avg/chosen": -0.07670806348323822, "logps_avg/rejected": -3.2530949115753174, "loss": 0.0778, "losses_ref": -0.0005275515140965581, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9190, "u": -5.187443256378174, "weight": 0.08807355910539627 }, { "diff_generated": -35.345008850097656, "epoch": 2.981205443940376, "grad_norm": 2.6283452311065227, "learning_rate": 9.564667939030435e-11, "logits/chosen": -2.2979166507720947, "logits/rejected": -2.068493366241455, "logps/chosen": -13.378866195678711, "logps/rejected": -643.0728759765625, "logps_avg/chosen": -0.07449330389499664, "logps_avg/rejected": -3.53450083732605, "loss": 0.0766, "losses_ref": -0.0003306058351881802, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9200, "u": -5.58246374130249, "weight": 0.019121162593364716 }, { "diff_generated": -37.17274856567383, "epoch": 2.9844458846403112, "grad_norm": 2.83432801334109, "learning_rate": 6.550913103189337e-11, "logits/chosen": -2.266289234161377, "logits/rejected": -2.0171196460723877, "logps/chosen": -12.02930736541748, "logps/rejected": -709.2539672851562, "logps_avg/chosen": -0.07492565363645554, "logps_avg/rejected": -3.7172749042510986, "loss": 0.0745, "losses_ref": -0.0002527030010242015, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9210, "u": -5.570784568786621, "weight": 0.019006643444299698 }, { "diff_generated": -36.656272888183594, "epoch": 2.9876863253402464, "grad_norm": 2.9330329822293932, "learning_rate": 4.1057356345675085e-11, "logits/chosen": -2.255496025085449, "logits/rejected": -1.9425241947174072, "logps/chosen": -15.756494522094727, "logps/rejected": -619.2554931640625, "logps_avg/chosen": -0.08364422619342804, "logps_avg/rejected": -3.6656272411346436, "loss": 0.0785, "losses_ref": -0.0012684316607192159, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9220, "u": -5.468225955963135, "weight": 0.03942258656024933 }, { "diff_generated": -35.7361946105957, "epoch": 2.9909267660401815, "grad_norm": 2.6544514182140717, "learning_rate": 2.229170295673377e-11, "logits/chosen": -2.2972233295440674, "logits/rejected": -2.0817008018493652, "logps/chosen": -13.582697868347168, "logps/rejected": -631.3333740234375, "logps_avg/chosen": -0.07444195449352264, "logps_avg/rejected": -3.5736191272735596, "loss": 0.0744, "losses_ref": -9.415384556632489e-05, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9230, "u": -5.581204891204834, "weight": 0.018835904076695442 }, { "diff_generated": -32.17867660522461, "epoch": 2.9941672067401166, "grad_norm": 2.7794692518725967, "learning_rate": 9.212437651973103e-12, "logits/chosen": -2.300394058227539, "logits/rejected": -2.0665271282196045, "logps/chosen": -14.166768074035645, "logps/rejected": -594.6533813476562, "logps_avg/chosen": -0.07981442660093307, "logps_avg/rejected": -3.217867612838745, "loss": 0.0756, "losses_ref": -0.0003018935676664114, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9240, "u": -5.294437408447266, "weight": 0.06905417144298553 }, { "diff_generated": -33.743309020996094, "epoch": 2.9974076474400517, "grad_norm": 2.7191689143451625, "learning_rate": 1.819746376119369e-12, "logits/chosen": -2.2722482681274414, "logits/rejected": -2.033750057220459, "logps/chosen": -15.440897941589355, "logps/rejected": -607.9214477539062, "logps_avg/chosen": -0.08211788535118103, "logps_avg/rejected": -3.3743317127227783, "loss": 0.0794, "losses_ref": -0.0002549213822931051, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 9250, "u": -5.401520729064941, "weight": 0.05025525018572807 } ], "logging_steps": 10, "max_steps": 9258, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }