diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,19479 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 9258, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "diff_generated": -5.324533939361572, + "epoch": 0.0003240440699935191, + "grad_norm": 25.298450180570082, + "learning_rate": 8.639308855291577e-10, + "logits/chosen": -2.6053388118743896, + "logits/rejected": -2.4319162368774414, + "logps/chosen": -116.55142974853516, + "logps/rejected": -89.49524688720703, + "logps_avg/chosen": -0.5783171057701111, + "logps_avg/rejected": -0.5324533581733704, + "loss": 0.5351, + "losses_ref": -0.028132084757089615, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1, + "u": -1.679854393005371, + "weight": 0.16303405165672302 + }, + { + "diff_generated": -4.9921698570251465, + "epoch": 0.0032404406999351912, + "grad_norm": 23.66608439819008, + "learning_rate": 8.639308855291576e-09, + "logits/chosen": -2.4976794719696045, + "logits/rejected": -2.571298599243164, + "logps/chosen": -92.15830993652344, + "logps/rejected": -91.23859405517578, + "logps_avg/chosen": -0.5637891888618469, + "logps_avg/rejected": -0.4992169737815857, + "loss": 0.533, + "losses_ref": -0.0346137136220932, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 10, + "u": -1.6916941404342651, + "weight": 0.16587796807289124 + }, + { + "diff_generated": -4.709494590759277, + "epoch": 0.0064808813998703824, + "grad_norm": 21.640052012860544, + "learning_rate": 1.727861771058315e-08, + "logits/chosen": -2.5320584774017334, + "logits/rejected": -2.588595390319824, + "logps/chosen": -100.08524322509766, + "logps/rejected": -85.40359497070312, + "logps_avg/chosen": -0.5972418189048767, + "logps_avg/rejected": -0.4709494709968567, + "loss": 0.5334, + "losses_ref": -0.03922479599714279, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 20, + "u": -1.6618385314941406, + "weight": 0.1921975165605545 + }, + { + "diff_generated": -5.0379767417907715, + "epoch": 0.009721322099805573, + "grad_norm": 22.995127298370598, + "learning_rate": 2.591792656587473e-08, + "logits/chosen": -2.5383121967315674, + "logits/rejected": -2.569267988204956, + "logps/chosen": -100.78271484375, + "logps/rejected": -87.62537384033203, + "logps_avg/chosen": -0.5688080191612244, + "logps_avg/rejected": -0.5037976503372192, + "loss": 0.5337, + "losses_ref": -0.03752085939049721, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 30, + "u": -1.685185194015503, + "weight": 0.17561769485473633 + }, + { + "diff_generated": -4.766201019287109, + "epoch": 0.012961762799740765, + "grad_norm": 24.543446757562368, + "learning_rate": 3.45572354211663e-08, + "logits/chosen": -2.5598511695861816, + "logits/rejected": -2.614499568939209, + "logps/chosen": -96.36283874511719, + "logps/rejected": -88.67526245117188, + "logps_avg/chosen": -0.5644618272781372, + "logps_avg/rejected": -0.47662001848220825, + "loss": 0.5162, + "losses_ref": -0.04021410271525383, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 40, + "u": -1.6799871921539307, + "weight": 0.18456074595451355 + }, + { + "diff_generated": -4.541996479034424, + "epoch": 0.016202203499675955, + "grad_norm": 18.98878051668679, + "learning_rate": 4.319654427645788e-08, + "logits/chosen": -2.51167631149292, + "logits/rejected": -2.5764548778533936, + "logps/chosen": -89.32698059082031, + "logps/rejected": -83.97340393066406, + "logps_avg/chosen": -0.5328198075294495, + "logps_avg/rejected": -0.4541996121406555, + "loss": 0.496, + "losses_ref": -0.040191732347011566, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 50, + "u": -1.6649795770645142, + "weight": 0.19056639075279236 + }, + { + "diff_generated": -4.699794292449951, + "epoch": 0.019442644199611146, + "grad_norm": 20.824924101981164, + "learning_rate": 5.183585313174946e-08, + "logits/chosen": -2.5374608039855957, + "logits/rejected": -2.598020553588867, + "logps/chosen": -79.26811218261719, + "logps/rejected": -82.68241882324219, + "logps_avg/chosen": -0.4793620705604553, + "logps_avg/rejected": -0.469979465007782, + "loss": 0.4413, + "losses_ref": -0.041936349123716354, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 60, + "u": -1.6706863641738892, + "weight": 0.19364500045776367 + }, + { + "diff_generated": -5.023270606994629, + "epoch": 0.02268308489954634, + "grad_norm": 15.085833079737961, + "learning_rate": 6.047516198704104e-08, + "logits/chosen": -2.494488477706909, + "logits/rejected": -2.553434371948242, + "logps/chosen": -71.15379333496094, + "logps/rejected": -87.61297607421875, + "logps_avg/chosen": -0.4256020188331604, + "logps_avg/rejected": -0.5023270845413208, + "loss": 0.38, + "losses_ref": -0.03602486103773117, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 70, + "u": -1.6736557483673096, + "weight": 0.1774376928806305 + }, + { + "diff_generated": -5.608443737030029, + "epoch": 0.02592352559948153, + "grad_norm": 10.17433229960028, + "learning_rate": 6.91144708423326e-08, + "logits/chosen": -2.466387987136841, + "logits/rejected": -2.5406899452209473, + "logps/chosen": -56.756500244140625, + "logps/rejected": -99.27467346191406, + "logps_avg/chosen": -0.32987886667251587, + "logps_avg/rejected": -0.560844361782074, + "loss": 0.3069, + "losses_ref": -0.028670093044638634, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 80, + "u": -1.7049707174301147, + "weight": 0.14695551991462708 + }, + { + "diff_generated": -7.005269527435303, + "epoch": 0.02916396629941672, + "grad_norm": 6.782996553692984, + "learning_rate": 7.775377969762419e-08, + "logits/chosen": -2.508361577987671, + "logits/rejected": -2.4981703758239746, + "logps/chosen": -52.216339111328125, + "logps/rejected": -107.29414367675781, + "logps_avg/chosen": -0.29603785276412964, + "logps_avg/rejected": -0.700527012348175, + "loss": 0.2762, + "losses_ref": -0.01938403770327568, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 90, + "u": -1.7649612426757812, + "weight": 0.09771289676427841 + }, + { + "diff_generated": -7.859269618988037, + "epoch": 0.03240440699935191, + "grad_norm": 5.915285953748342, + "learning_rate": 8.639308855291576e-08, + "logits/chosen": -2.5046210289001465, + "logits/rejected": -2.5096004009246826, + "logps/chosen": -53.19324493408203, + "logps/rejected": -132.89813232421875, + "logps_avg/chosen": -0.28428006172180176, + "logps_avg/rejected": -0.7859269976615906, + "loss": 0.2653, + "losses_ref": -0.00893635768443346, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 100, + "u": -1.7851336002349854, + "weight": 0.06954724341630936 + }, + { + "diff_generated": -8.105627059936523, + "epoch": 0.0356448476992871, + "grad_norm": 5.485745202835744, + "learning_rate": 9.503239740820734e-08, + "logits/chosen": -2.486294984817505, + "logits/rejected": -2.5347485542297363, + "logps/chosen": -46.411155700683594, + "logps/rejected": -135.87271118164062, + "logps_avg/chosen": -0.26198580861091614, + "logps_avg/rejected": -0.8105627298355103, + "loss": 0.2395, + "losses_ref": -0.011290923692286015, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 110, + "u": -1.7970489263534546, + "weight": 0.06864650547504425 + }, + { + "diff_generated": -9.39500617980957, + "epoch": 0.03888528839922229, + "grad_norm": 5.858206689278202, + "learning_rate": 1.0367170626349892e-07, + "logits/chosen": -2.473548412322998, + "logits/rejected": -2.5916199684143066, + "logps/chosen": -40.70339584350586, + "logps/rejected": -163.2863006591797, + "logps_avg/chosen": -0.24593877792358398, + "logps_avg/rejected": -0.939500629901886, + "loss": 0.2436, + "losses_ref": -0.00593178765848279, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 120, + "u": -1.8373692035675049, + "weight": 0.03868420049548149 + }, + { + "diff_generated": -8.719804763793945, + "epoch": 0.04212572909915749, + "grad_norm": 4.8636295363498165, + "learning_rate": 1.1231101511879049e-07, + "logits/chosen": -2.42881441116333, + "logits/rejected": -2.526031255722046, + "logps/chosen": -39.71488952636719, + "logps/rejected": -155.34878540039062, + "logps_avg/chosen": -0.2515028715133667, + "logps_avg/rejected": -0.8719803690910339, + "loss": 0.2291, + "losses_ref": -0.007679730653762817, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 130, + "u": -1.757032036781311, + "weight": 0.08497841656208038 + }, + { + "diff_generated": -10.024818420410156, + "epoch": 0.04536616979909268, + "grad_norm": 5.129995804872467, + "learning_rate": 1.2095032397408208e-07, + "logits/chosen": -2.4535014629364014, + "logits/rejected": -2.5594677925109863, + "logps/chosen": -39.28099060058594, + "logps/rejected": -180.23167419433594, + "logps_avg/chosen": -0.24077431857585907, + "logps_avg/rejected": -1.0024818181991577, + "loss": 0.2251, + "losses_ref": -0.004803563468158245, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 140, + "u": -1.8035099506378174, + "weight": 0.055381983518600464 + }, + { + "diff_generated": -10.327981948852539, + "epoch": 0.04860661049902787, + "grad_norm": 7.665590400359612, + "learning_rate": 1.2958963282937366e-07, + "logits/chosen": -2.503351926803589, + "logits/rejected": -2.5048727989196777, + "logps/chosen": -40.956787109375, + "logps/rejected": -175.9888153076172, + "logps_avg/chosen": -0.22418427467346191, + "logps_avg/rejected": -1.032798171043396, + "loss": 0.2247, + "losses_ref": -0.004349695052951574, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 150, + "u": -1.769809365272522, + "weight": 0.07237504422664642 + }, + { + "diff_generated": -11.050540924072266, + "epoch": 0.05184705119896306, + "grad_norm": 5.2218156254610975, + "learning_rate": 1.382289416846652e-07, + "logits/chosen": -2.4762940406799316, + "logits/rejected": -2.5197434425354004, + "logps/chosen": -41.60301971435547, + "logps/rejected": -190.45809936523438, + "logps_avg/chosen": -0.2423749417066574, + "logps_avg/rejected": -1.1050540208816528, + "loss": 0.2202, + "losses_ref": -0.0036694530863314867, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 160, + "u": -1.819371223449707, + "weight": 0.045095235109329224 + }, + { + "diff_generated": -10.912653923034668, + "epoch": 0.05508749189889825, + "grad_norm": 4.72316488652966, + "learning_rate": 1.468682505399568e-07, + "logits/chosen": -2.494835376739502, + "logits/rejected": -2.5751283168792725, + "logps/chosen": -34.90558624267578, + "logps/rejected": -183.92214965820312, + "logps_avg/chosen": -0.21363107860088348, + "logps_avg/rejected": -1.0912654399871826, + "loss": 0.2089, + "losses_ref": -0.004652983509004116, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 170, + "u": -1.8284003734588623, + "weight": 0.041924357414245605 + }, + { + "diff_generated": -10.835649490356445, + "epoch": 0.05832793259883344, + "grad_norm": 5.4755997415237365, + "learning_rate": 1.5550755939524837e-07, + "logits/chosen": -2.4843239784240723, + "logits/rejected": -2.5287718772888184, + "logps/chosen": -37.10668182373047, + "logps/rejected": -182.80319213867188, + "logps_avg/chosen": -0.2310374677181244, + "logps_avg/rejected": -1.0835647583007812, + "loss": 0.2062, + "losses_ref": -0.005131029523909092, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 180, + "u": -1.7793972492218018, + "weight": 0.06860009580850601 + }, + { + "diff_generated": -12.018121719360352, + "epoch": 0.06156837329876863, + "grad_norm": 4.985706566011136, + "learning_rate": 1.6414686825053995e-07, + "logits/chosen": -2.4900550842285156, + "logits/rejected": -2.4969873428344727, + "logps/chosen": -38.894493103027344, + "logps/rejected": -195.72805786132812, + "logps_avg/chosen": -0.2240675985813141, + "logps_avg/rejected": -1.2018121480941772, + "loss": 0.2041, + "losses_ref": -0.003515923861414194, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 190, + "u": -1.852745771408081, + "weight": 0.028062384575605392 + }, + { + "diff_generated": -12.754137992858887, + "epoch": 0.06480881399870382, + "grad_norm": 5.067587022106536, + "learning_rate": 1.7278617710583153e-07, + "logits/chosen": -2.4320194721221924, + "logits/rejected": -2.496293544769287, + "logps/chosen": -33.140167236328125, + "logps/rejected": -204.68572998046875, + "logps_avg/chosen": -0.1981068104505539, + "logps_avg/rejected": -1.2754138708114624, + "loss": 0.1953, + "losses_ref": -0.0022536544129252434, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 200, + "u": -1.749868392944336, + "weight": 0.08045514672994614 + }, + { + "diff_generated": -12.116033554077148, + "epoch": 0.06804925469863901, + "grad_norm": 5.601872457233822, + "learning_rate": 1.814254859611231e-07, + "logits/chosen": -2.475778102874756, + "logits/rejected": -2.53865122795105, + "logps/chosen": -35.73398208618164, + "logps/rejected": -209.81747436523438, + "logps_avg/chosen": -0.21023687720298767, + "logps_avg/rejected": -1.2116032838821411, + "loss": 0.2003, + "losses_ref": -0.0035675906110554934, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 210, + "u": -1.8303534984588623, + "weight": 0.039604417979717255 + }, + { + "diff_generated": -11.962373733520508, + "epoch": 0.0712896953985742, + "grad_norm": 4.880544384760887, + "learning_rate": 1.900647948164147e-07, + "logits/chosen": -2.4443392753601074, + "logits/rejected": -2.4997096061706543, + "logps/chosen": -35.777610778808594, + "logps/rejected": -202.77987670898438, + "logps_avg/chosen": -0.2033710926771164, + "logps_avg/rejected": -1.1962374448776245, + "loss": 0.1934, + "losses_ref": -0.003498962614685297, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 220, + "u": -1.7824407815933228, + "weight": 0.06514163315296173 + }, + { + "diff_generated": -12.74167251586914, + "epoch": 0.07453013609850939, + "grad_norm": 5.644389790526717, + "learning_rate": 1.9870410367170624e-07, + "logits/chosen": -2.4751639366149902, + "logits/rejected": -2.472346067428589, + "logps/chosen": -33.588478088378906, + "logps/rejected": -194.97332763671875, + "logps_avg/chosen": -0.19568376243114471, + "logps_avg/rejected": -1.2741672992706299, + "loss": 0.1957, + "losses_ref": -0.0029640875291079283, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 230, + "u": -1.7599050998687744, + "weight": 0.07632104307413101 + }, + { + "diff_generated": -12.876994132995605, + "epoch": 0.07777057679844458, + "grad_norm": 5.417843349387695, + "learning_rate": 2.0734341252699785e-07, + "logits/chosen": -2.4684810638427734, + "logits/rejected": -2.5136160850524902, + "logps/chosen": -31.660619735717773, + "logps/rejected": -216.2432403564453, + "logps_avg/chosen": -0.18595094978809357, + "logps_avg/rejected": -1.287699580192566, + "loss": 0.1874, + "losses_ref": -0.0023983852006495, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 240, + "u": -1.7855768203735352, + "weight": 0.06173267960548401 + }, + { + "diff_generated": -12.893110275268555, + "epoch": 0.08101101749837979, + "grad_norm": 5.038605999289401, + "learning_rate": 2.159827213822894e-07, + "logits/chosen": -2.455719470977783, + "logits/rejected": -2.5414233207702637, + "logps/chosen": -33.99782180786133, + "logps/rejected": -238.786376953125, + "logps_avg/chosen": -0.19342893362045288, + "logps_avg/rejected": -1.2893109321594238, + "loss": 0.1938, + "losses_ref": -0.002995225368067622, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 250, + "u": -1.7953672409057617, + "weight": 0.05768171697854996 + }, + { + "diff_generated": -13.075129508972168, + "epoch": 0.08425145819831498, + "grad_norm": 5.0467419773950715, + "learning_rate": 2.2462203023758098e-07, + "logits/chosen": -2.458019256591797, + "logits/rejected": -2.485776662826538, + "logps/chosen": -35.66144561767578, + "logps/rejected": -226.5882568359375, + "logps_avg/chosen": -0.1955607533454895, + "logps_avg/rejected": -1.3075129985809326, + "loss": 0.1902, + "losses_ref": -0.0024760509841144085, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 260, + "u": -1.7950090169906616, + "weight": 0.05749331787228584 + }, + { + "diff_generated": -12.619610786437988, + "epoch": 0.08749189889825017, + "grad_norm": 5.0082322883287365, + "learning_rate": 2.3326133909287256e-07, + "logits/chosen": -2.4652304649353027, + "logits/rejected": -2.509176731109619, + "logps/chosen": -32.687461853027344, + "logps/rejected": -214.37661743164062, + "logps_avg/chosen": -0.17882901430130005, + "logps_avg/rejected": -1.2619612216949463, + "loss": 0.1866, + "losses_ref": -0.0019212514162063599, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 270, + "u": -1.7393690347671509, + "weight": 0.0853450745344162 + }, + { + "diff_generated": -12.650653839111328, + "epoch": 0.09073233959818536, + "grad_norm": 4.891295319023462, + "learning_rate": 2.4190064794816416e-07, + "logits/chosen": -2.44873309135437, + "logits/rejected": -2.5190067291259766, + "logps/chosen": -32.08829879760742, + "logps/rejected": -221.0590362548828, + "logps_avg/chosen": -0.18478551506996155, + "logps_avg/rejected": -1.2650654315948486, + "loss": 0.1855, + "losses_ref": -0.004490494728088379, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 280, + "u": -1.7792739868164062, + "weight": 0.06853736937046051 + }, + { + "diff_generated": -12.423052787780762, + "epoch": 0.09397278029812055, + "grad_norm": 4.797689682089731, + "learning_rate": 2.505399568034557e-07, + "logits/chosen": -2.4705300331115723, + "logits/rejected": -2.496605634689331, + "logps/chosen": -34.917354583740234, + "logps/rejected": -211.85708618164062, + "logps_avg/chosen": -0.19044120609760284, + "logps_avg/rejected": -1.2423055171966553, + "loss": 0.1835, + "losses_ref": -0.00314778508618474, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 290, + "u": -1.7591886520385742, + "weight": 0.07695788890123367 + }, + { + "diff_generated": -14.825152397155762, + "epoch": 0.09721322099805574, + "grad_norm": 5.0295975035207405, + "learning_rate": 2.591792656587473e-07, + "logits/chosen": -2.4914021492004395, + "logits/rejected": -2.534492254257202, + "logps/chosen": -30.0008544921875, + "logps/rejected": -238.60293579101562, + "logps_avg/chosen": -0.17152948677539825, + "logps_avg/rejected": -1.4825150966644287, + "loss": 0.1796, + "losses_ref": -0.0015082244062796235, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 300, + "u": -1.8350900411605835, + "weight": 0.03436414152383804 + }, + { + "diff_generated": -13.583699226379395, + "epoch": 0.10045366169799093, + "grad_norm": 4.75263679666864, + "learning_rate": 2.6781857451403887e-07, + "logits/chosen": -2.4600813388824463, + "logits/rejected": -2.5308713912963867, + "logps/chosen": -29.974111557006836, + "logps/rejected": -225.1046905517578, + "logps_avg/chosen": -0.18076516687870026, + "logps_avg/rejected": -1.3583698272705078, + "loss": 0.1744, + "losses_ref": -0.0025918360333889723, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 310, + "u": -1.808953881263733, + "weight": 0.04961549490690231 + }, + { + "diff_generated": -13.35509967803955, + "epoch": 0.10369410239792612, + "grad_norm": 4.556274279970173, + "learning_rate": 2.764578833693304e-07, + "logits/chosen": -2.432319402694702, + "logits/rejected": -2.4622726440429688, + "logps/chosen": -30.676654815673828, + "logps/rejected": -216.61288452148438, + "logps_avg/chosen": -0.17878147959709167, + "logps_avg/rejected": -1.3355098962783813, + "loss": 0.1776, + "losses_ref": -0.0024136919528245926, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 320, + "u": -1.737860918045044, + "weight": 0.08699294179677963 + }, + { + "diff_generated": -13.178678512573242, + "epoch": 0.10693454309786131, + "grad_norm": 5.291823716686084, + "learning_rate": 2.8509719222462203e-07, + "logits/chosen": -2.499701976776123, + "logits/rejected": -2.5902438163757324, + "logps/chosen": -31.878047943115234, + "logps/rejected": -240.4405517578125, + "logps_avg/chosen": -0.18391458690166473, + "logps_avg/rejected": -1.3178678750991821, + "loss": 0.1795, + "losses_ref": -0.0012684818357229233, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 330, + "u": -1.776533842086792, + "weight": 0.06484408676624298 + }, + { + "diff_generated": -13.998690605163574, + "epoch": 0.1101749837977965, + "grad_norm": 5.383787138213607, + "learning_rate": 2.937365010799136e-07, + "logits/chosen": -2.4777579307556152, + "logits/rejected": -2.5280511379241943, + "logps/chosen": -30.474166870117188, + "logps/rejected": -237.3151092529297, + "logps_avg/chosen": -0.1744353324174881, + "logps_avg/rejected": -1.3998689651489258, + "loss": 0.1794, + "losses_ref": -0.00262268865481019, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 340, + "u": -1.8082706928253174, + "weight": 0.05032556504011154 + }, + { + "diff_generated": -14.047930717468262, + "epoch": 0.11341542449773169, + "grad_norm": 4.581918938488034, + "learning_rate": 3.023758099352052e-07, + "logits/chosen": -2.4632320404052734, + "logits/rejected": -2.5006260871887207, + "logps/chosen": -32.7548828125, + "logps/rejected": -238.8568572998047, + "logps_avg/chosen": -0.20024879276752472, + "logps_avg/rejected": -1.4047930240631104, + "loss": 0.1811, + "losses_ref": -0.0019572232849895954, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 350, + "u": -1.8337892293930054, + "weight": 0.035822127014398575 + }, + { + "diff_generated": -13.68774700164795, + "epoch": 0.11665586519766688, + "grad_norm": 4.738270576170094, + "learning_rate": 3.1101511879049674e-07, + "logits/chosen": -2.4731106758117676, + "logits/rejected": -2.5093135833740234, + "logps/chosen": -31.073184967041016, + "logps/rejected": -229.44540405273438, + "logps_avg/chosen": -0.17675338685512543, + "logps_avg/rejected": -1.3687747716903687, + "loss": 0.1774, + "losses_ref": -0.0017537868116050959, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 360, + "u": -1.7635164260864258, + "weight": 0.07238699495792389 + }, + { + "diff_generated": -14.045831680297852, + "epoch": 0.11989630589760207, + "grad_norm": 4.620735691114173, + "learning_rate": 3.1965442764578835e-07, + "logits/chosen": -2.553758382797241, + "logits/rejected": -2.5590033531188965, + "logps/chosen": -29.680404663085938, + "logps/rejected": -251.1548309326172, + "logps_avg/chosen": -0.16406632959842682, + "logps_avg/rejected": -1.404583215713501, + "loss": 0.1748, + "losses_ref": -0.002590332878753543, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 370, + "u": -1.8194854259490967, + "weight": 0.04447915405035019 + }, + { + "diff_generated": -13.711982727050781, + "epoch": 0.12313674659753726, + "grad_norm": 4.494188858637888, + "learning_rate": 3.282937365010799e-07, + "logits/chosen": -2.5184168815612793, + "logits/rejected": -2.5638442039489746, + "logps/chosen": -30.885112762451172, + "logps/rejected": -246.29946899414062, + "logps_avg/chosen": -0.1680624783039093, + "logps_avg/rejected": -1.371198296546936, + "loss": 0.1742, + "losses_ref": -0.0014965020818635821, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 380, + "u": -1.776185393333435, + "weight": 0.06525006145238876 + }, + { + "diff_generated": -13.470802307128906, + "epoch": 0.12637718729747247, + "grad_norm": 4.740378132166772, + "learning_rate": 3.3693304535637145e-07, + "logits/chosen": -2.5411252975463867, + "logits/rejected": -2.614982843399048, + "logps/chosen": -33.133872985839844, + "logps/rejected": -245.2790985107422, + "logps_avg/chosen": -0.18118393421173096, + "logps_avg/rejected": -1.3470804691314697, + "loss": 0.1766, + "losses_ref": -0.002783800009638071, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 390, + "u": -1.8315505981445312, + "weight": 0.03817793354392052 + }, + { + "diff_generated": -14.816889762878418, + "epoch": 0.12961762799740764, + "grad_norm": 5.062656449082662, + "learning_rate": 3.4557235421166306e-07, + "logits/chosen": -2.4957797527313232, + "logits/rejected": -2.543741226196289, + "logps/chosen": -30.80033302307129, + "logps/rejected": -253.0391845703125, + "logps_avg/chosen": -0.17897175252437592, + "logps_avg/rejected": -1.4816890954971313, + "loss": 0.1667, + "losses_ref": -0.0012838852126151323, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 400, + "u": -1.8120357990264893, + "weight": 0.04616154357790947 + }, + { + "diff_generated": -14.791712760925293, + "epoch": 0.13285806869734285, + "grad_norm": 4.643159185649741, + "learning_rate": 3.542116630669546e-07, + "logits/chosen": -2.4794423580169678, + "logits/rejected": -2.5461440086364746, + "logps/chosen": -27.8972110748291, + "logps/rejected": -251.6249237060547, + "logps_avg/chosen": -0.17388319969177246, + "logps_avg/rejected": -1.4791711568832397, + "loss": 0.1672, + "losses_ref": -0.0017990957712754607, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 410, + "u": -1.8340179920196533, + "weight": 0.03552815318107605 + }, + { + "diff_generated": -14.0972261428833, + "epoch": 0.13609850939727802, + "grad_norm": 4.3280080110575305, + "learning_rate": 3.628509719222462e-07, + "logits/chosen": -2.5374386310577393, + "logits/rejected": -2.5984349250793457, + "logps/chosen": -29.984893798828125, + "logps/rejected": -250.20181274414062, + "logps_avg/chosen": -0.16934213042259216, + "logps_avg/rejected": -1.4097226858139038, + "loss": 0.1697, + "losses_ref": -0.0023155449889600277, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 420, + "u": -1.8452469110488892, + "weight": 0.030071932822465897 + }, + { + "diff_generated": -13.562525749206543, + "epoch": 0.13933895009721323, + "grad_norm": 4.280500048816285, + "learning_rate": 3.7149028077753777e-07, + "logits/chosen": -2.4581573009490967, + "logits/rejected": -2.5427894592285156, + "logps/chosen": -26.301082611083984, + "logps/rejected": -238.8366241455078, + "logps_avg/chosen": -0.15402349829673767, + "logps_avg/rejected": -1.3562524318695068, + "loss": 0.1651, + "losses_ref": -0.0015942498575896025, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 430, + "u": -1.6928117275238037, + "weight": 0.10941553115844727 + }, + { + "diff_generated": -13.584875106811523, + "epoch": 0.1425793907971484, + "grad_norm": 4.924019126408653, + "learning_rate": 3.801295896328294e-07, + "logits/chosen": -2.499994993209839, + "logits/rejected": -2.533939838409424, + "logps/chosen": -29.343231201171875, + "logps/rejected": -238.6305389404297, + "logps_avg/chosen": -0.1552811563014984, + "logps_avg/rejected": -1.3584874868392944, + "loss": 0.1696, + "losses_ref": -0.0027523760218173265, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 440, + "u": -1.760610818862915, + "weight": 0.0755021795630455 + }, + { + "diff_generated": -13.918545722961426, + "epoch": 0.1458198314970836, + "grad_norm": 4.446751093583341, + "learning_rate": 3.887688984881209e-07, + "logits/chosen": -2.516287326812744, + "logits/rejected": -2.5299954414367676, + "logps/chosen": -27.761011123657227, + "logps/rejected": -243.63455200195312, + "logps_avg/chosen": -0.15204386413097382, + "logps_avg/rejected": -1.3918545246124268, + "loss": 0.1654, + "losses_ref": -0.003121785121038556, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 450, + "u": -1.736435890197754, + "weight": 0.08867697417736053 + }, + { + "diff_generated": -14.489161491394043, + "epoch": 0.14906027219701878, + "grad_norm": 4.1975650434326415, + "learning_rate": 3.974082073434125e-07, + "logits/chosen": -2.5312724113464355, + "logits/rejected": -2.553266763687134, + "logps/chosen": -30.491928100585938, + "logps/rejected": -250.10696411132812, + "logps_avg/chosen": -0.16546538472175598, + "logps_avg/rejected": -1.4489161968231201, + "loss": 0.1621, + "losses_ref": -0.0017298974562436342, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 460, + "u": -1.7870547771453857, + "weight": 0.06000211834907532 + }, + { + "diff_generated": -14.744906425476074, + "epoch": 0.152300712896954, + "grad_norm": 7.566673616414248, + "learning_rate": 4.060475161987041e-07, + "logits/chosen": -2.5269410610198975, + "logits/rejected": -2.5724127292633057, + "logps/chosen": -31.23870277404785, + "logps/rejected": -262.3318176269531, + "logps_avg/chosen": -0.16487276554107666, + "logps_avg/rejected": -1.4744906425476074, + "loss": 0.1635, + "losses_ref": -0.001866829232312739, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 470, + "u": -1.8341680765151978, + "weight": 0.03543657064437866 + }, + { + "diff_generated": -15.317721366882324, + "epoch": 0.15554115359688916, + "grad_norm": 4.387634572440091, + "learning_rate": 4.146868250539957e-07, + "logits/chosen": -2.5223655700683594, + "logits/rejected": -2.5226664543151855, + "logps/chosen": -27.46135902404785, + "logps/rejected": -246.641357421875, + "logps_avg/chosen": -0.1489681452512741, + "logps_avg/rejected": -1.5317721366882324, + "loss": 0.1642, + "losses_ref": -0.002479640068486333, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 480, + "u": -1.8073724508285522, + "weight": 0.05108867958188057 + }, + { + "diff_generated": -13.857152938842773, + "epoch": 0.15878159429682437, + "grad_norm": 4.240910416137423, + "learning_rate": 4.2332613390928724e-07, + "logits/chosen": -2.5168867111206055, + "logits/rejected": -2.518345355987549, + "logps/chosen": -30.311405181884766, + "logps/rejected": -233.2909698486328, + "logps_avg/chosen": -0.1701345443725586, + "logps_avg/rejected": -1.3857154846191406, + "loss": 0.1675, + "losses_ref": -0.0022363795433193445, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 490, + "u": -1.7861015796661377, + "weight": 0.06115210801362991 + }, + { + "diff_generated": -14.959016799926758, + "epoch": 0.16202203499675957, + "grad_norm": 4.4420016624530465, + "learning_rate": 4.319654427645788e-07, + "logits/chosen": -2.51141095161438, + "logits/rejected": -2.5964505672454834, + "logps/chosen": -26.861099243164062, + "logps/rejected": -251.33291625976562, + "logps_avg/chosen": -0.15614867210388184, + "logps_avg/rejected": -1.4959017038345337, + "loss": 0.1657, + "losses_ref": -0.0017895328346639872, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 500, + "u": -1.763580560684204, + "weight": 0.07234685122966766 + }, + { + "diff_generated": -13.89812183380127, + "epoch": 0.16526247569669475, + "grad_norm": 4.139783101709871, + "learning_rate": 4.406047516198704e-07, + "logits/chosen": -2.451756000518799, + "logits/rejected": -2.5747926235198975, + "logps/chosen": -25.249866485595703, + "logps/rejected": -244.6548309326172, + "logps_avg/chosen": -0.15236565470695496, + "logps_avg/rejected": -1.3898121118545532, + "loss": 0.1592, + "losses_ref": -0.0019647441804409027, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 510, + "u": -1.7391914129257202, + "weight": 0.08555871993303299 + }, + { + "diff_generated": -15.070231437683105, + "epoch": 0.16850291639662995, + "grad_norm": 4.448321299018289, + "learning_rate": 4.4924406047516195e-07, + "logits/chosen": -2.5451064109802246, + "logits/rejected": -2.5574862957000732, + "logps/chosen": -30.759517669677734, + "logps/rejected": -274.0057067871094, + "logps_avg/chosen": -0.1640317738056183, + "logps_avg/rejected": -1.5070230960845947, + "loss": 0.158, + "losses_ref": -0.001814872375689447, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 520, + "u": -1.8107799291610718, + "weight": 0.047605521976947784 + }, + { + "diff_generated": -14.907147407531738, + "epoch": 0.17174335709656513, + "grad_norm": 4.548145949240779, + "learning_rate": 4.5788336933045356e-07, + "logits/chosen": -2.5269455909729004, + "logits/rejected": -2.5774500370025635, + "logps/chosen": -26.586517333984375, + "logps/rejected": -250.545654296875, + "logps_avg/chosen": -0.15523529052734375, + "logps_avg/rejected": -1.4907147884368896, + "loss": 0.1545, + "losses_ref": -0.0010867482051253319, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 530, + "u": -1.7412116527557373, + "weight": 0.08329650014638901 + }, + { + "diff_generated": -14.34411334991455, + "epoch": 0.17498379779650033, + "grad_norm": 4.565857951910505, + "learning_rate": 4.665226781857451e-07, + "logits/chosen": -2.5096616744995117, + "logits/rejected": -2.6065526008605957, + "logps/chosen": -28.218563079833984, + "logps/rejected": -267.3673095703125, + "logps_avg/chosen": -0.1656300127506256, + "logps_avg/rejected": -1.4344114065170288, + "loss": 0.1562, + "losses_ref": -0.001910742954351008, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 540, + "u": -1.7859923839569092, + "weight": 0.06093848496675491 + }, + { + "diff_generated": -15.374615669250488, + "epoch": 0.1782242384964355, + "grad_norm": 5.0178902060092945, + "learning_rate": 4.751619870410367e-07, + "logits/chosen": -2.494371175765991, + "logits/rejected": -2.471282958984375, + "logps/chosen": -28.187641143798828, + "logps/rejected": -259.7303771972656, + "logps_avg/chosen": -0.15739896893501282, + "logps_avg/rejected": -1.537461519241333, + "loss": 0.158, + "losses_ref": -0.0012739974772557616, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 550, + "u": -1.7645008563995361, + "weight": 0.07127384841442108 + }, + { + "diff_generated": -15.536798477172852, + "epoch": 0.18146467919637072, + "grad_norm": 4.020211574052711, + "learning_rate": 4.838012958963283e-07, + "logits/chosen": -2.4912216663360596, + "logits/rejected": -2.5614213943481445, + "logps/chosen": -24.883838653564453, + "logps/rejected": -277.3970642089844, + "logps_avg/chosen": -0.15538007020950317, + "logps_avg/rejected": -1.5536797046661377, + "loss": 0.16, + "losses_ref": -0.001234889728948474, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 560, + "u": -1.835831642150879, + "weight": 0.03358057886362076 + }, + { + "diff_generated": -15.803949356079102, + "epoch": 0.1847051198963059, + "grad_norm": 4.00381538300631, + "learning_rate": 4.924406047516198e-07, + "logits/chosen": -2.478405475616455, + "logits/rejected": -2.5514659881591797, + "logps/chosen": -27.245723724365234, + "logps/rejected": -280.4998779296875, + "logps_avg/chosen": -0.15291285514831543, + "logps_avg/rejected": -1.580394983291626, + "loss": 0.1566, + "losses_ref": -0.0008858289802446961, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 570, + "u": -1.835996389389038, + "weight": 0.03329852223396301 + }, + { + "diff_generated": -15.373080253601074, + "epoch": 0.1879455605962411, + "grad_norm": 4.317052696494499, + "learning_rate": 5.010799136069114e-07, + "logits/chosen": -2.544964075088501, + "logits/rejected": -2.5514461994171143, + "logps/chosen": -29.379268646240234, + "logps/rejected": -260.7983093261719, + "logps_avg/chosen": -0.15511760115623474, + "logps_avg/rejected": -1.5373082160949707, + "loss": 0.1563, + "losses_ref": -0.0011349378619343042, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 580, + "u": -1.7878364324569702, + "weight": 0.05907173082232475 + }, + { + "diff_generated": -16.546499252319336, + "epoch": 0.19118600129617627, + "grad_norm": 3.9727178680185586, + "learning_rate": 5.097192224622029e-07, + "logits/chosen": -2.5150680541992188, + "logits/rejected": -2.523024082183838, + "logps/chosen": -26.727558135986328, + "logps/rejected": -257.0491943359375, + "logps_avg/chosen": -0.15087701380252838, + "logps_avg/rejected": -1.654650092124939, + "loss": 0.157, + "losses_ref": -0.0016078378539532423, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 590, + "u": -1.8469066619873047, + "weight": 0.02822394296526909 + }, + { + "diff_generated": -16.31281280517578, + "epoch": 0.19442644199611148, + "grad_norm": 4.257100975350278, + "learning_rate": 5.183585313174946e-07, + "logits/chosen": -2.529265880584717, + "logits/rejected": -2.5448126792907715, + "logps/chosen": -32.634605407714844, + "logps/rejected": -280.5689392089844, + "logps_avg/chosen": -0.17440596222877502, + "logps_avg/rejected": -1.6312812566757202, + "loss": 0.1589, + "losses_ref": -0.0015461514703929424, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 600, + "u": -1.7755861282348633, + "weight": 0.06589512526988983 + }, + { + "diff_generated": -16.440349578857422, + "epoch": 0.19766688269604665, + "grad_norm": 4.685778188867323, + "learning_rate": 5.269978401727861e-07, + "logits/chosen": -2.5129942893981934, + "logits/rejected": -2.5518910884857178, + "logps/chosen": -29.388286590576172, + "logps/rejected": -282.8636474609375, + "logps_avg/chosen": -0.15934798121452332, + "logps_avg/rejected": -1.6440349817276, + "loss": 0.1551, + "losses_ref": -0.0010432195849716663, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 610, + "u": -1.8006556034088135, + "weight": 0.05187270790338516 + }, + { + "diff_generated": -14.875322341918945, + "epoch": 0.20090732339598186, + "grad_norm": 3.9541371244197405, + "learning_rate": 5.356371490280777e-07, + "logits/chosen": -2.521015167236328, + "logits/rejected": -2.575629711151123, + "logps/chosen": -28.5093936920166, + "logps/rejected": -282.8163146972656, + "logps_avg/chosen": -0.16011330485343933, + "logps_avg/rejected": -1.487532138824463, + "loss": 0.1515, + "losses_ref": -0.001480486593209207, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 620, + "u": -1.775630235671997, + "weight": 0.06583119183778763 + }, + { + "diff_generated": -15.518178939819336, + "epoch": 0.20414776409591703, + "grad_norm": 4.521151306660312, + "learning_rate": 5.442764578833693e-07, + "logits/chosen": -2.5314955711364746, + "logits/rejected": -2.546788454055786, + "logps/chosen": -30.999698638916016, + "logps/rejected": -282.10113525390625, + "logps_avg/chosen": -0.16733181476593018, + "logps_avg/rejected": -1.5518181324005127, + "loss": 0.1547, + "losses_ref": -0.002137316856533289, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 630, + "u": -1.8101974725723267, + "weight": 0.048265378922224045 + }, + { + "diff_generated": -14.888731002807617, + "epoch": 0.20738820479585224, + "grad_norm": 4.3267193602618645, + "learning_rate": 5.529157667386608e-07, + "logits/chosen": -2.525298595428467, + "logits/rejected": -2.5428686141967773, + "logps/chosen": -26.18801498413086, + "logps/rejected": -257.1575622558594, + "logps_avg/chosen": -0.14402839541435242, + "logps_avg/rejected": -1.4888732433319092, + "loss": 0.1503, + "losses_ref": -0.0013474032748490572, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 640, + "u": -1.752764344215393, + "weight": 0.07741077244281769 + }, + { + "diff_generated": -16.706762313842773, + "epoch": 0.21062864549578741, + "grad_norm": 3.9874178439606176, + "learning_rate": 5.615550755939525e-07, + "logits/chosen": -2.4880218505859375, + "logits/rejected": -2.5559372901916504, + "logps/chosen": -27.271615982055664, + "logps/rejected": -281.1944885253906, + "logps_avg/chosen": -0.15678571164608002, + "logps_avg/rejected": -1.6706759929656982, + "loss": 0.159, + "losses_ref": -0.0017356419702991843, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 650, + "u": -1.811038613319397, + "weight": 0.04730648174881935 + }, + { + "diff_generated": -15.391021728515625, + "epoch": 0.21386908619572262, + "grad_norm": 4.036784512123039, + "learning_rate": 5.701943844492441e-07, + "logits/chosen": -2.529348611831665, + "logits/rejected": -2.511889934539795, + "logps/chosen": -29.90597915649414, + "logps/rejected": -270.4825134277344, + "logps_avg/chosen": -0.15700222551822662, + "logps_avg/rejected": -1.5391019582748413, + "loss": 0.1551, + "losses_ref": -0.0016960095381364226, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 660, + "u": -1.775199294090271, + "weight": 0.06631585210561752 + }, + { + "diff_generated": -17.663175582885742, + "epoch": 0.21710952689565782, + "grad_norm": 4.358399155338858, + "learning_rate": 5.788336933045357e-07, + "logits/chosen": -2.5374488830566406, + "logits/rejected": -2.5241658687591553, + "logps/chosen": -30.06842613220215, + "logps/rejected": -302.4938659667969, + "logps_avg/chosen": -0.16507597267627716, + "logps_avg/rejected": -1.7663177251815796, + "loss": 0.1526, + "losses_ref": -0.0008090495830401778, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 670, + "u": -1.8485357761383057, + "weight": 0.02635175548493862 + }, + { + "diff_generated": -16.56492805480957, + "epoch": 0.220349967595593, + "grad_norm": 3.8599372759018835, + "learning_rate": 5.874730021598272e-07, + "logits/chosen": -2.5052294731140137, + "logits/rejected": -2.5328030586242676, + "logps/chosen": -25.340312957763672, + "logps/rejected": -298.6836853027344, + "logps_avg/chosen": -0.1403449922800064, + "logps_avg/rejected": -1.656492829322815, + "loss": 0.1514, + "losses_ref": -0.0011725362855941057, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 680, + "u": -1.8475481271743774, + "weight": 0.027439361438155174 + }, + { + "diff_generated": -16.202449798583984, + "epoch": 0.2235904082955282, + "grad_norm": 5.707656978098627, + "learning_rate": 5.961123110151188e-07, + "logits/chosen": -2.5147368907928467, + "logits/rejected": -2.5265755653381348, + "logps/chosen": -27.811309814453125, + "logps/rejected": -282.5999755859375, + "logps_avg/chosen": -0.16103322803974152, + "logps_avg/rejected": -1.6202447414398193, + "loss": 0.1572, + "losses_ref": -0.0010156487114727497, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 690, + "u": -1.8123470544815063, + "weight": 0.045763980597257614 + }, + { + "diff_generated": -15.581779479980469, + "epoch": 0.22683084899546338, + "grad_norm": 4.543595624237527, + "learning_rate": 6.047516198704104e-07, + "logits/chosen": -2.5117552280426025, + "logits/rejected": -2.5222716331481934, + "logps/chosen": -24.74635887145996, + "logps/rejected": -259.0545959472656, + "logps_avg/chosen": -0.1400236338376999, + "logps_avg/rejected": -1.5581779479980469, + "loss": 0.1523, + "losses_ref": -0.0016117949271574616, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 700, + "u": -1.7632642984390259, + "weight": 0.07257858663797379 + }, + { + "diff_generated": -17.33824348449707, + "epoch": 0.23007128969539858, + "grad_norm": 3.756311999295835, + "learning_rate": 6.133909287257019e-07, + "logits/chosen": -2.4569334983825684, + "logits/rejected": -2.4747729301452637, + "logps/chosen": -25.68191909790039, + "logps/rejected": -288.65692138671875, + "logps_avg/chosen": -0.14832261204719543, + "logps_avg/rejected": -1.7338241338729858, + "loss": 0.153, + "losses_ref": -0.0006162269273772836, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 710, + "u": -1.7656484842300415, + "weight": 0.06995699554681778 + }, + { + "diff_generated": -17.465576171875, + "epoch": 0.23331173039533376, + "grad_norm": 3.9925886017873653, + "learning_rate": 6.220302375809935e-07, + "logits/chosen": -2.522162914276123, + "logits/rejected": -2.556715726852417, + "logps/chosen": -25.977420806884766, + "logps/rejected": -301.238037109375, + "logps_avg/chosen": -0.14257541298866272, + "logps_avg/rejected": -1.746557593345642, + "loss": 0.1509, + "losses_ref": -0.0010114299366250634, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 720, + "u": -1.8123047351837158, + "weight": 0.045841820538043976 + }, + { + "diff_generated": -16.138324737548828, + "epoch": 0.23655217109526896, + "grad_norm": 4.368943205860947, + "learning_rate": 6.306695464362851e-07, + "logits/chosen": -2.495678663253784, + "logits/rejected": -2.5630602836608887, + "logps/chosen": -26.281463623046875, + "logps/rejected": -291.1576843261719, + "logps_avg/chosen": -0.15310141444206238, + "logps_avg/rejected": -1.6138322353363037, + "loss": 0.1493, + "losses_ref": -0.0015504444018006325, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 730, + "u": -1.7755588293075562, + "weight": 0.06590535491704941 + }, + { + "diff_generated": -16.800281524658203, + "epoch": 0.23979261179520414, + "grad_norm": 4.140277380523364, + "learning_rate": 6.393088552915767e-07, + "logits/chosen": -2.417356014251709, + "logits/rejected": -2.488996982574463, + "logps/chosen": -25.04463005065918, + "logps/rejected": -295.3211669921875, + "logps_avg/chosen": -0.14725720882415771, + "logps_avg/rejected": -1.680027723312378, + "loss": 0.15, + "losses_ref": -0.0007039078627713025, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 740, + "u": -1.742006540298462, + "weight": 0.08238840103149414 + }, + { + "diff_generated": -16.212671279907227, + "epoch": 0.24303305249513935, + "grad_norm": 3.7113691272292164, + "learning_rate": 6.479481641468682e-07, + "logits/chosen": -2.4623329639434814, + "logits/rejected": -2.4828693866729736, + "logps/chosen": -26.126338958740234, + "logps/rejected": -287.18853759765625, + "logps_avg/chosen": -0.14278826117515564, + "logps_avg/rejected": -1.6212670803070068, + "loss": 0.1514, + "losses_ref": -0.001221969723701477, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 750, + "u": -1.7647031545639038, + "weight": 0.07106192409992218 + }, + { + "diff_generated": -17.23666000366211, + "epoch": 0.24627349319507452, + "grad_norm": 3.626914226331303, + "learning_rate": 6.565874730021598e-07, + "logits/chosen": -2.4999337196350098, + "logits/rejected": -2.5974574089050293, + "logps/chosen": -26.908031463623047, + "logps/rejected": -307.95330810546875, + "logps_avg/chosen": -0.1553696095943451, + "logps_avg/rejected": -1.7236659526824951, + "loss": 0.1528, + "losses_ref": -0.0013714140513911843, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 760, + "u": -1.859244704246521, + "weight": 0.021420713514089584 + }, + { + "diff_generated": -16.649688720703125, + "epoch": 0.24951393389500973, + "grad_norm": 3.6449586231357523, + "learning_rate": 6.652267818574514e-07, + "logits/chosen": -2.5069055557250977, + "logits/rejected": -2.5311920642852783, + "logps/chosen": -27.44228172302246, + "logps/rejected": -280.15423583984375, + "logps_avg/chosen": -0.1512620747089386, + "logps_avg/rejected": -1.6649688482284546, + "loss": 0.1469, + "losses_ref": -0.0011087359162047505, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 770, + "u": -1.740947961807251, + "weight": 0.08358202874660492 + }, + { + "diff_generated": -17.594615936279297, + "epoch": 0.25275437459494493, + "grad_norm": 3.591067576204669, + "learning_rate": 6.738660907127429e-07, + "logits/chosen": -2.5164694786071777, + "logits/rejected": -2.544618606567383, + "logps/chosen": -29.995891571044922, + "logps/rejected": -302.5132141113281, + "logps_avg/chosen": -0.16520099341869354, + "logps_avg/rejected": -1.7594616413116455, + "loss": 0.1514, + "losses_ref": -0.0011694171698763967, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 780, + "u": -1.8233131170272827, + "weight": 0.04046357423067093 + }, + { + "diff_generated": -15.048116683959961, + "epoch": 0.2559948152948801, + "grad_norm": 4.007762938469329, + "learning_rate": 6.825053995680345e-07, + "logits/chosen": -2.4432899951934814, + "logits/rejected": -2.543459415435791, + "logps/chosen": -26.264307022094727, + "logps/rejected": -274.15142822265625, + "logps_avg/chosen": -0.15596263110637665, + "logps_avg/rejected": -1.5048116445541382, + "loss": 0.152, + "losses_ref": -0.0016327224439010024, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 790, + "u": -1.751920461654663, + "weight": 0.07835827022790909 + }, + { + "diff_generated": -16.8079891204834, + "epoch": 0.2592352559948153, + "grad_norm": 3.803322718876922, + "learning_rate": 6.911447084233261e-07, + "logits/chosen": -2.4625935554504395, + "logits/rejected": -2.491464376449585, + "logps/chosen": -25.487167358398438, + "logps/rejected": -294.3965759277344, + "logps_avg/chosen": -0.14450162649154663, + "logps_avg/rejected": -1.680798888206482, + "loss": 0.1441, + "losses_ref": -0.0011468358570709825, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 800, + "u": -1.8119513988494873, + "weight": 0.046235114336013794 + }, + { + "diff_generated": -16.325275421142578, + "epoch": 0.26247569669475046, + "grad_norm": 3.8441167643161624, + "learning_rate": 6.997840172786177e-07, + "logits/chosen": -2.5127015113830566, + "logits/rejected": -2.5084917545318604, + "logps/chosen": -28.14730453491211, + "logps/rejected": -269.518310546875, + "logps_avg/chosen": -0.15695425868034363, + "logps_avg/rejected": -1.6325273513793945, + "loss": 0.1459, + "losses_ref": -0.0017818144988268614, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 810, + "u": -1.8110424280166626, + "weight": 0.047296833246946335 + }, + { + "diff_generated": -16.837024688720703, + "epoch": 0.2657161373946857, + "grad_norm": 3.8403571035756245, + "learning_rate": 7.084233261339092e-07, + "logits/chosen": -2.4789652824401855, + "logits/rejected": -2.552279472351074, + "logps/chosen": -25.903278350830078, + "logps/rejected": -297.843994140625, + "logps_avg/chosen": -0.15136049687862396, + "logps_avg/rejected": -1.6837022304534912, + "loss": 0.1484, + "losses_ref": -0.0007834344869479537, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 820, + "u": -1.7774156332015991, + "weight": 0.06382577121257782 + }, + { + "diff_generated": -16.415281295776367, + "epoch": 0.26895657809462087, + "grad_norm": 3.8122228541342973, + "learning_rate": 7.170626349892008e-07, + "logits/chosen": -2.4843931198120117, + "logits/rejected": -2.530792713165283, + "logps/chosen": -25.79958724975586, + "logps/rejected": -278.10589599609375, + "logps_avg/chosen": -0.14485926926136017, + "logps_avg/rejected": -1.6415281295776367, + "loss": 0.1513, + "losses_ref": -0.0013059931807219982, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 830, + "u": -1.7524468898773193, + "weight": 0.07773178815841675 + }, + { + "diff_generated": -17.35388946533203, + "epoch": 0.27219701879455604, + "grad_norm": 5.721419042997992, + "learning_rate": 7.257019438444924e-07, + "logits/chosen": -2.543403148651123, + "logits/rejected": -2.4924099445343018, + "logps/chosen": -28.061603546142578, + "logps/rejected": -278.6396484375, + "logps_avg/chosen": -0.14713650941848755, + "logps_avg/rejected": -1.735388994216919, + "loss": 0.147, + "losses_ref": -0.0010668208124116063, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 840, + "u": -1.8004766702651978, + "weight": 0.05206400901079178 + }, + { + "diff_generated": -16.650218963623047, + "epoch": 0.2754374594944913, + "grad_norm": 3.4977682328654813, + "learning_rate": 7.343412526997839e-07, + "logits/chosen": -2.469409227371216, + "logits/rejected": -2.451508045196533, + "logps/chosen": -26.62298583984375, + "logps/rejected": -277.678466796875, + "logps_avg/chosen": -0.14454862475395203, + "logps_avg/rejected": -1.6650216579437256, + "loss": 0.1465, + "losses_ref": -0.0009407054749317467, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 850, + "u": -1.729368805885315, + "weight": 0.08950553089380264 + }, + { + "diff_generated": -16.65591049194336, + "epoch": 0.27867790019442645, + "grad_norm": 3.7789630141654427, + "learning_rate": 7.429805615550755e-07, + "logits/chosen": -2.5125536918640137, + "logits/rejected": -2.518594264984131, + "logps/chosen": -28.266979217529297, + "logps/rejected": -279.36236572265625, + "logps_avg/chosen": -0.14472495019435883, + "logps_avg/rejected": -1.6655908823013306, + "loss": 0.1457, + "losses_ref": -0.0014739853795617819, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 860, + "u": -1.7628333568572998, + "weight": 0.07282988727092743 + }, + { + "diff_generated": -17.289453506469727, + "epoch": 0.28191834089436163, + "grad_norm": 3.816298176813016, + "learning_rate": 7.516198704103671e-07, + "logits/chosen": -2.4170525074005127, + "logits/rejected": -2.5361487865448, + "logps/chosen": -21.06148338317871, + "logps/rejected": -310.81689453125, + "logps_avg/chosen": -0.13158485293388367, + "logps_avg/rejected": -1.7289453744888306, + "loss": 0.1437, + "losses_ref": -0.0011063070269301534, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 870, + "u": -1.788311243057251, + "weight": 0.0586506649851799 + }, + { + "diff_generated": -17.18234634399414, + "epoch": 0.2851587815942968, + "grad_norm": 3.5053935087270074, + "learning_rate": 7.602591792656587e-07, + "logits/chosen": -2.512596845626831, + "logits/rejected": -2.5777673721313477, + "logps/chosen": -28.489471435546875, + "logps/rejected": -299.99554443359375, + "logps_avg/chosen": -0.16591385006904602, + "logps_avg/rejected": -1.7182344198226929, + "loss": 0.145, + "losses_ref": -0.0012544682249426842, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 880, + "u": -1.8237812519073486, + "weight": 0.04003932327032089 + }, + { + "diff_generated": -16.68752670288086, + "epoch": 0.28839922229423204, + "grad_norm": 3.6606878088736043, + "learning_rate": 7.688984881209502e-07, + "logits/chosen": -2.4937093257904053, + "logits/rejected": -2.5517916679382324, + "logps/chosen": -26.318889617919922, + "logps/rejected": -286.62115478515625, + "logps_avg/chosen": -0.1460975706577301, + "logps_avg/rejected": -1.668752908706665, + "loss": 0.1507, + "losses_ref": -0.001337700174190104, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 890, + "u": -1.7759170532226562, + "weight": 0.06547953933477402 + }, + { + "diff_generated": -17.303630828857422, + "epoch": 0.2916396629941672, + "grad_norm": 3.96462786512709, + "learning_rate": 7.775377969762419e-07, + "logits/chosen": -2.5112993717193604, + "logits/rejected": -2.552793502807617, + "logps/chosen": -23.081388473510742, + "logps/rejected": -312.2502136230469, + "logps_avg/chosen": -0.13418585062026978, + "logps_avg/rejected": -1.730363130569458, + "loss": 0.1435, + "losses_ref": -0.0010935317259281874, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 900, + "u": -1.8119033575057983, + "weight": 0.046246398240327835 + }, + { + "diff_generated": -17.516305923461914, + "epoch": 0.2948801036941024, + "grad_norm": 3.960436884466544, + "learning_rate": 7.861771058315335e-07, + "logits/chosen": -2.536668300628662, + "logits/rejected": -2.579866409301758, + "logps/chosen": -24.779300689697266, + "logps/rejected": -301.9021911621094, + "logps_avg/chosen": -0.1520080268383026, + "logps_avg/rejected": -1.7516307830810547, + "loss": 0.1499, + "losses_ref": -0.0007299171993508935, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 910, + "u": -1.836708426475525, + "weight": 0.03255882114171982 + }, + { + "diff_generated": -18.22040557861328, + "epoch": 0.29812054439403757, + "grad_norm": 4.216145687800187, + "learning_rate": 7.94816414686825e-07, + "logits/chosen": -2.4581117630004883, + "logits/rejected": -2.4852912425994873, + "logps/chosen": -23.488094329833984, + "logps/rejected": -296.0298156738281, + "logps_avg/chosen": -0.1450100541114807, + "logps_avg/rejected": -1.8220407962799072, + "loss": 0.1444, + "losses_ref": -0.0011233676923438907, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 920, + "u": -1.8120393753051758, + "weight": 0.046129390597343445 + }, + { + "diff_generated": -16.802902221679688, + "epoch": 0.3013609850939728, + "grad_norm": 3.540189522204402, + "learning_rate": 7.999995450631473e-07, + "logits/chosen": -2.5070629119873047, + "logits/rejected": -2.578346014022827, + "logps/chosen": -24.117534637451172, + "logps/rejected": -302.5053405761719, + "logps_avg/chosen": -0.13478049635887146, + "logps_avg/rejected": -1.6802902221679688, + "loss": 0.1461, + "losses_ref": -0.000596770434640348, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 930, + "u": -1.7776219844818115, + "weight": 0.06357844918966293 + }, + { + "diff_generated": -17.449804306030273, + "epoch": 0.304601425793908, + "grad_norm": 3.4216904204057315, + "learning_rate": 7.999944270354383e-07, + "logits/chosen": -2.4589948654174805, + "logits/rejected": -2.5639543533325195, + "logps/chosen": -26.90707778930664, + "logps/rejected": -313.41455078125, + "logps_avg/chosen": -0.16656050086021423, + "logps_avg/rejected": -1.7449802160263062, + "loss": 0.1507, + "losses_ref": -0.000916670891456306, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 940, + "u": -1.8126401901245117, + "weight": 0.04546584561467171 + }, + { + "diff_generated": -16.95920181274414, + "epoch": 0.30784186649384315, + "grad_norm": 3.5840189261030604, + "learning_rate": 7.99983622381959e-07, + "logits/chosen": -2.49526047706604, + "logits/rejected": -2.5164170265197754, + "logps/chosen": -27.150075912475586, + "logps/rejected": -277.107666015625, + "logps_avg/chosen": -0.15076836943626404, + "logps_avg/rejected": -1.6959202289581299, + "loss": 0.1482, + "losses_ref": -0.0008078349055722356, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 950, + "u": -1.8009493350982666, + "weight": 0.05151613801717758 + }, + { + "diff_generated": -19.485225677490234, + "epoch": 0.31108230719377833, + "grad_norm": 3.884047901709329, + "learning_rate": 7.999671312563164e-07, + "logits/chosen": -2.5171866416931152, + "logits/rejected": -2.4757168292999268, + "logps/chosen": -26.446279525756836, + "logps/rejected": -301.2461242675781, + "logps_avg/chosen": -0.15032193064689636, + "logps_avg/rejected": -1.9485225677490234, + "loss": 0.1438, + "losses_ref": -0.0010997498175129294, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 960, + "u": -1.7880767583847046, + "weight": 0.0588577575981617 + }, + { + "diff_generated": -18.37019920349121, + "epoch": 0.31432274789371356, + "grad_norm": 3.5647427602940236, + "learning_rate": 7.999449538929611e-07, + "logits/chosen": -2.4516000747680664, + "logits/rejected": -2.4881412982940674, + "logps/chosen": -25.292774200439453, + "logps/rejected": -301.6772766113281, + "logps_avg/chosen": -0.1464090645313263, + "logps_avg/rejected": -1.8370201587677002, + "loss": 0.1489, + "losses_ref": -0.0010070034768432379, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 970, + "u": -1.8125054836273193, + "weight": 0.04562786594033241 + }, + { + "diff_generated": -17.277942657470703, + "epoch": 0.31756318859364874, + "grad_norm": 3.9531898301627604, + "learning_rate": 7.99917090607183e-07, + "logits/chosen": -2.4834158420562744, + "logits/rejected": -2.5573599338531494, + "logps/chosen": -23.000879287719727, + "logps/rejected": -308.1183776855469, + "logps_avg/chosen": -0.14088958501815796, + "logps_avg/rejected": -1.7277939319610596, + "loss": 0.1458, + "losses_ref": -0.0007574810297228396, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 980, + "u": -1.8011051416397095, + "weight": 0.05134943872690201 + }, + { + "diff_generated": -17.71529197692871, + "epoch": 0.3208036292935839, + "grad_norm": 4.066482069190587, + "learning_rate": 7.998835417951081e-07, + "logits/chosen": -2.5377254486083984, + "logits/rejected": -2.5539002418518066, + "logps/chosen": -25.564077377319336, + "logps/rejected": -285.80694580078125, + "logps_avg/chosen": -0.14639118313789368, + "logps_avg/rejected": -1.771528959274292, + "loss": 0.1446, + "losses_ref": -0.0009276577038690448, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 990, + "u": -1.8244966268539429, + "weight": 0.03922320902347565 + }, + { + "diff_generated": -16.78716468811035, + "epoch": 0.32404406999351915, + "grad_norm": 3.8501332564370996, + "learning_rate": 7.998443079336919e-07, + "logits/chosen": -2.495265483856201, + "logits/rejected": -2.565699815750122, + "logps/chosen": -26.15287208557129, + "logps/rejected": -311.22125244140625, + "logps_avg/chosen": -0.14763453602790833, + "logps_avg/rejected": -1.6787166595458984, + "loss": 0.1436, + "losses_ref": -0.0007319619762711227, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1000, + "u": -1.8367958068847656, + "weight": 0.03246457502245903 + }, + { + "diff_generated": -18.131811141967773, + "epoch": 0.3272845106934543, + "grad_norm": 3.4334006893079447, + "learning_rate": 7.997993895807128e-07, + "logits/chosen": -2.5410752296447754, + "logits/rejected": -2.5453710556030273, + "logps/chosen": -25.54119300842285, + "logps/rejected": -319.1593017578125, + "logps_avg/chosen": -0.14520543813705444, + "logps_avg/rejected": -1.813180923461914, + "loss": 0.1437, + "losses_ref": -0.0006047133356332779, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1010, + "u": -1.8132398128509521, + "weight": 0.0447794608771801 + }, + { + "diff_generated": -16.725383758544922, + "epoch": 0.3305249513933895, + "grad_norm": 3.6574381218888523, + "learning_rate": 7.997487873747646e-07, + "logits/chosen": -2.5111083984375, + "logits/rejected": -2.532496929168701, + "logps/chosen": -23.3675537109375, + "logps/rejected": -300.4358215332031, + "logps_avg/chosen": -0.12948718667030334, + "logps_avg/rejected": -1.67253839969635, + "loss": 0.1365, + "losses_ref": -0.0006616250611841679, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1020, + "u": -1.765761375427246, + "weight": 0.06983973830938339 + }, + { + "diff_generated": -18.22255516052246, + "epoch": 0.3337653920933247, + "grad_norm": 3.731162582661096, + "learning_rate": 7.996925020352465e-07, + "logits/chosen": -2.4977619647979736, + "logits/rejected": -2.462693452835083, + "logps/chosen": -28.452739715576172, + "logps/rejected": -295.1959533691406, + "logps_avg/chosen": -0.14971373975276947, + "logps_avg/rejected": -1.8222553730010986, + "loss": 0.1509, + "losses_ref": -0.001343491836450994, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1030, + "u": -1.8117315769195557, + "weight": 0.04650767147541046 + }, + { + "diff_generated": -18.08208465576172, + "epoch": 0.3370058327932599, + "grad_norm": 3.387416001248169, + "learning_rate": 7.99630534362354e-07, + "logits/chosen": -2.4722683429718018, + "logits/rejected": -2.4949660301208496, + "logps/chosen": -23.3449649810791, + "logps/rejected": -305.1861877441406, + "logps_avg/chosen": -0.1336612105369568, + "logps_avg/rejected": -1.8082084655761719, + "loss": 0.1417, + "losses_ref": -0.0007842335617169738, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1040, + "u": -1.765332579612732, + "weight": 0.07031672447919846 + }, + { + "diff_generated": -17.331754684448242, + "epoch": 0.3402462734931951, + "grad_norm": 3.2793739525196455, + "learning_rate": 7.995628852370667e-07, + "logits/chosen": -2.4455838203430176, + "logits/rejected": -2.495821475982666, + "logps/chosen": -24.51308250427246, + "logps/rejected": -306.106689453125, + "logps_avg/chosen": -0.14820663630962372, + "logps_avg/rejected": -1.733175277709961, + "loss": 0.1452, + "losses_ref": -0.0010632300982251763, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1050, + "u": -1.7886251211166382, + "weight": 0.058313362300395966 + }, + { + "diff_generated": -17.65144157409668, + "epoch": 0.34348671419313026, + "grad_norm": 3.414314107092028, + "learning_rate": 7.994895556211363e-07, + "logits/chosen": -2.4573745727539062, + "logits/rejected": -2.5492000579833984, + "logps/chosen": -25.012226104736328, + "logps/rejected": -307.28863525390625, + "logps_avg/chosen": -0.15316030383110046, + "logps_avg/rejected": -1.7651441097259521, + "loss": 0.1426, + "losses_ref": -0.0007830454269424081, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1060, + "u": -1.8129937648773193, + "weight": 0.045065782964229584 + }, + { + "diff_generated": -18.741573333740234, + "epoch": 0.34672715489306544, + "grad_norm": 3.5362740373683845, + "learning_rate": 7.994105465570722e-07, + "logits/chosen": -2.4750843048095703, + "logits/rejected": -2.4775915145874023, + "logps/chosen": -28.20367431640625, + "logps/rejected": -321.14019775390625, + "logps_avg/chosen": -0.15645088255405426, + "logps_avg/rejected": -1.8741573095321655, + "loss": 0.1445, + "losses_ref": -0.0004584606795106083, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1070, + "u": -1.7779619693756104, + "weight": 0.06318999081850052 + }, + { + "diff_generated": -17.927629470825195, + "epoch": 0.34996759559300067, + "grad_norm": 3.5224573664899963, + "learning_rate": 7.993258591681279e-07, + "logits/chosen": -2.446620464324951, + "logits/rejected": -2.461449384689331, + "logps/chosen": -25.738372802734375, + "logps/rejected": -298.7737121582031, + "logps_avg/chosen": -0.14429491758346558, + "logps_avg/rejected": -1.7927627563476562, + "loss": 0.1449, + "losses_ref": -0.00048292643623426557, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1080, + "u": -1.7422895431518555, + "weight": 0.0820549726486206 + }, + { + "diff_generated": -17.745529174804688, + "epoch": 0.35320803629293585, + "grad_norm": 3.4810028090401506, + "learning_rate": 7.992354946582836e-07, + "logits/chosen": -2.5051581859588623, + "logits/rejected": -2.5519306659698486, + "logps/chosen": -24.41130256652832, + "logps/rejected": -324.4436340332031, + "logps_avg/chosen": -0.13996274769306183, + "logps_avg/rejected": -1.7745529413223267, + "loss": 0.142, + "losses_ref": -0.00065957399783656, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1090, + "u": -1.8132047653198242, + "weight": 0.04482080414891243 + }, + { + "diff_generated": -17.48514747619629, + "epoch": 0.356448476992871, + "grad_norm": 3.776538208700194, + "learning_rate": 7.991394543122304e-07, + "logits/chosen": -2.4793810844421387, + "logits/rejected": -2.5294454097747803, + "logps/chosen": -25.155223846435547, + "logps/rejected": -291.52569580078125, + "logps_avg/chosen": -0.14746752381324768, + "logps_avg/rejected": -1.7485147714614868, + "loss": 0.1442, + "losses_ref": -0.001555544906295836, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1100, + "u": -1.7873111963272095, + "weight": 0.05974424630403519 + }, + { + "diff_generated": -15.942682266235352, + "epoch": 0.3596889176928062, + "grad_norm": 3.3175723086642335, + "learning_rate": 7.990377394953507e-07, + "logits/chosen": -2.450894832611084, + "logits/rejected": -2.564906597137451, + "logps/chosen": -24.385358810424805, + "logps/rejected": -290.43048095703125, + "logps_avg/chosen": -0.14321152865886688, + "logps_avg/rejected": -1.5942682027816772, + "loss": 0.1431, + "losses_ref": -0.0007358678849413991, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1110, + "u": -1.8011974096298218, + "weight": 0.051243700087070465 + }, + { + "diff_generated": -17.29189682006836, + "epoch": 0.36292935839274143, + "grad_norm": 3.205495384000928, + "learning_rate": 7.989303516537001e-07, + "logits/chosen": -2.5006320476531982, + "logits/rejected": -2.5535852909088135, + "logps/chosen": -21.054277420043945, + "logps/rejected": -285.4083557128906, + "logps_avg/chosen": -0.13172145187854767, + "logps_avg/rejected": -1.7291895151138306, + "loss": 0.1407, + "losses_ref": -0.0006592486170120537, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1120, + "u": -1.8012969493865967, + "weight": 0.05112830922007561 + }, + { + "diff_generated": -16.982044219970703, + "epoch": 0.3661697990926766, + "grad_norm": 3.625175942353783, + "learning_rate": 7.98817292313986e-07, + "logits/chosen": -2.5410523414611816, + "logits/rejected": -2.588095188140869, + "logps/chosen": -28.69525146484375, + "logps/rejected": -307.0025329589844, + "logps_avg/chosen": -0.16247490048408508, + "logps_avg/rejected": -1.6982042789459229, + "loss": 0.1429, + "losses_ref": -0.0008386773988604546, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1130, + "u": -1.8246562480926514, + "weight": 0.039037130773067474 + }, + { + "diff_generated": -17.986581802368164, + "epoch": 0.3694102397926118, + "grad_norm": 3.2460056654881613, + "learning_rate": 7.986985630835463e-07, + "logits/chosen": -2.4848456382751465, + "logits/rejected": -2.5159919261932373, + "logps/chosen": -24.781518936157227, + "logps/rejected": -280.80987548828125, + "logps_avg/chosen": -0.15118327736854553, + "logps_avg/rejected": -1.798658013343811, + "loss": 0.1426, + "losses_ref": -0.001054365886375308, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1140, + "u": -1.8358449935913086, + "weight": 0.03351539373397827 + }, + { + "diff_generated": -16.1344051361084, + "epoch": 0.37265068049254696, + "grad_norm": 3.3260113618842433, + "learning_rate": 7.985741656503261e-07, + "logits/chosen": -2.5159730911254883, + "logits/rejected": -2.542235851287842, + "logps/chosen": -29.317920684814453, + "logps/rejected": -286.7655334472656, + "logps_avg/chosen": -0.16192345321178436, + "logps_avg/rejected": -1.6134405136108398, + "loss": 0.1435, + "losses_ref": -0.001453616307117045, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1150, + "u": -1.7875741720199585, + "weight": 0.05947249010205269 + }, + { + "diff_generated": -17.043901443481445, + "epoch": 0.3758911211924822, + "grad_norm": 3.120515366538399, + "learning_rate": 7.984441017828543e-07, + "logits/chosen": -2.4753224849700928, + "logits/rejected": -2.545196056365967, + "logps/chosen": -26.04819679260254, + "logps/rejected": -308.024169921875, + "logps_avg/chosen": -0.15330848097801208, + "logps_avg/rejected": -1.7043901681900024, + "loss": 0.1391, + "losses_ref": -0.0013215601211413741, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1160, + "u": -1.8354936838150024, + "weight": 0.0339578315615654 + }, + { + "diff_generated": -17.90146255493164, + "epoch": 0.37913156189241737, + "grad_norm": 3.4954463643326585, + "learning_rate": 7.983083733302178e-07, + "logits/chosen": -2.534641742706299, + "logits/rejected": -2.529932975769043, + "logps/chosen": -25.712291717529297, + "logps/rejected": -310.0232849121094, + "logps_avg/chosen": -0.13911107182502747, + "logps_avg/rejected": -1.7901462316513062, + "loss": 0.1423, + "losses_ref": -0.0007827662047930062, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1170, + "u": -1.8010227680206299, + "weight": 0.05144274979829788 + }, + { + "diff_generated": -17.42496681213379, + "epoch": 0.38237200259235254, + "grad_norm": 3.5827637114687736, + "learning_rate": 7.98166982222036e-07, + "logits/chosen": -2.508788585662842, + "logits/rejected": -2.5179848670959473, + "logps/chosen": -25.84026527404785, + "logps/rejected": -284.7757873535156, + "logps_avg/chosen": -0.1503192037343979, + "logps_avg/rejected": -1.7424967288970947, + "loss": 0.1409, + "losses_ref": -0.0010919750202447176, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1180, + "u": -1.8121187686920166, + "weight": 0.04601747542619705 + }, + { + "diff_generated": -18.662456512451172, + "epoch": 0.3856124432922878, + "grad_norm": 3.171676752032205, + "learning_rate": 7.980199304684328e-07, + "logits/chosen": -2.4683659076690674, + "logits/rejected": -2.454379081726074, + "logps/chosen": -26.418987274169922, + "logps/rejected": -315.4735412597656, + "logps_avg/chosen": -0.1484156847000122, + "logps_avg/rejected": -1.8662456274032593, + "loss": 0.1405, + "losses_ref": -0.0005341099458746612, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1190, + "u": -1.8251888751983643, + "weight": 0.03842061385512352 + }, + { + "diff_generated": -18.877710342407227, + "epoch": 0.38885288399222295, + "grad_norm": 3.353939901355228, + "learning_rate": 7.978672201600077e-07, + "logits/chosen": -2.4355032444000244, + "logits/rejected": -2.4991493225097656, + "logps/chosen": -24.02352523803711, + "logps/rejected": -321.2890319824219, + "logps_avg/chosen": -0.15375861525535583, + "logps_avg/rejected": -1.8877712488174438, + "loss": 0.1373, + "losses_ref": -0.0003830655477941036, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1200, + "u": -1.7779605388641357, + "weight": 0.0631849393248558 + }, + { + "diff_generated": -18.390823364257812, + "epoch": 0.39209332469215813, + "grad_norm": 3.150342260956621, + "learning_rate": 7.97708853467807e-07, + "logits/chosen": -2.4914722442626953, + "logits/rejected": -2.5442190170288086, + "logps/chosen": -22.80034828186035, + "logps/rejected": -316.0704650878906, + "logps_avg/chosen": -0.1339089572429657, + "logps_avg/rejected": -1.8390823602676392, + "loss": 0.1378, + "losses_ref": -0.00068041862687096, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1210, + "u": -1.8366985321044922, + "weight": 0.03256749361753464 + }, + { + "diff_generated": -17.826963424682617, + "epoch": 0.3953337653920933, + "grad_norm": 3.4799528088131115, + "learning_rate": 7.975448326432927e-07, + "logits/chosen": -2.4840798377990723, + "logits/rejected": -2.551450252532959, + "logps/chosen": -24.82818031311035, + "logps/rejected": -318.79144287109375, + "logps_avg/chosen": -0.14494793117046356, + "logps_avg/rejected": -1.7826963663101196, + "loss": 0.1428, + "losses_ref": -0.0009499592706561089, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1220, + "u": -1.8125343322753906, + "weight": 0.04559002444148064 + }, + { + "diff_generated": -17.58133888244629, + "epoch": 0.39857420609202854, + "grad_norm": 3.4018737399228827, + "learning_rate": 7.973751600183094e-07, + "logits/chosen": -2.5015528202056885, + "logits/rejected": -2.5259804725646973, + "logps/chosen": -26.3048038482666, + "logps/rejected": -304.74456787109375, + "logps_avg/chosen": -0.14304566383361816, + "logps_avg/rejected": -1.758133888244629, + "loss": 0.1436, + "losses_ref": -0.0007128279539756477, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1230, + "u": -1.836792230606079, + "weight": 0.032467663288116455 + }, + { + "diff_generated": -19.12925148010254, + "epoch": 0.4018146467919637, + "grad_norm": 3.352873304767975, + "learning_rate": 7.971998380050529e-07, + "logits/chosen": -2.4705023765563965, + "logits/rejected": -2.506176710128784, + "logps/chosen": -25.72548484802246, + "logps/rejected": -313.7716979980469, + "logps_avg/chosen": -0.15574082732200623, + "logps_avg/rejected": -1.9129250049591064, + "loss": 0.1445, + "losses_ref": -0.000700434495229274, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1240, + "u": -1.8604224920272827, + "weight": 0.02005375549197197 + }, + { + "diff_generated": -18.428089141845703, + "epoch": 0.4050550874918989, + "grad_norm": 3.3160451492197414, + "learning_rate": 7.970188690960343e-07, + "logits/chosen": -2.4253203868865967, + "logits/rejected": -2.5191216468811035, + "logps/chosen": -21.341529846191406, + "logps/rejected": -321.898193359375, + "logps_avg/chosen": -0.1284700334072113, + "logps_avg/rejected": -1.8428089618682861, + "loss": 0.1369, + "losses_ref": -0.0007181521505117416, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1250, + "u": -1.7890920639038086, + "weight": 0.057752568274736404 + }, + { + "diff_generated": -18.595012664794922, + "epoch": 0.40829552819183407, + "grad_norm": 3.3685643424162057, + "learning_rate": 7.968322558640458e-07, + "logits/chosen": -2.4410624504089355, + "logits/rejected": -2.517035961151123, + "logps/chosen": -24.76461410522461, + "logps/rejected": -324.0437927246094, + "logps_avg/chosen": -0.14681437611579895, + "logps_avg/rejected": -1.8595014810562134, + "loss": 0.1438, + "losses_ref": -0.0008349610725417733, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1260, + "u": -1.7889350652694702, + "weight": 0.05794642120599747 + }, + { + "diff_generated": -18.137805938720703, + "epoch": 0.4115359688917693, + "grad_norm": 3.1562946170434243, + "learning_rate": 7.966400009621233e-07, + "logits/chosen": -2.465362071990967, + "logits/rejected": -2.499647855758667, + "logps/chosen": -25.29610252380371, + "logps/rejected": -318.85333251953125, + "logps_avg/chosen": -0.1431424915790558, + "logps_avg/rejected": -1.8137805461883545, + "loss": 0.1422, + "losses_ref": -0.0006325670401565731, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1270, + "u": -1.7420135736465454, + "weight": 0.08237271010875702 + }, + { + "diff_generated": -17.22241973876953, + "epoch": 0.4147764095917045, + "grad_norm": 3.2772344941264353, + "learning_rate": 7.964421071235092e-07, + "logits/chosen": -2.4533419609069824, + "logits/rejected": -2.53004789352417, + "logps/chosen": -23.705324172973633, + "logps/rejected": -298.66033935546875, + "logps_avg/chosen": -0.14016704261302948, + "logps_avg/rejected": -1.722241997718811, + "loss": 0.1385, + "losses_ref": -0.0007769926451146603, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1280, + "u": -1.7418180704116821, + "weight": 0.08260531723499298 + }, + { + "diff_generated": -18.119998931884766, + "epoch": 0.41801685029163965, + "grad_norm": 3.1222000038882425, + "learning_rate": 7.962385771616133e-07, + "logits/chosen": -2.484968662261963, + "logits/rejected": -2.4719197750091553, + "logps/chosen": -25.072969436645508, + "logps/rejected": -285.41241455078125, + "logps_avg/chosen": -0.14321108162403107, + "logps_avg/rejected": -1.8119999170303345, + "loss": 0.1349, + "losses_ref": -0.0002476568624842912, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1290, + "u": -1.7545337677001953, + "weight": 0.07535454630851746 + }, + { + "diff_generated": -17.06702995300293, + "epoch": 0.42125729099157483, + "grad_norm": 3.343738027534707, + "learning_rate": 7.960294139699724e-07, + "logits/chosen": -2.4875121116638184, + "logits/rejected": -2.5455143451690674, + "logps/chosen": -23.458911895751953, + "logps/rejected": -308.88604736328125, + "logps_avg/chosen": -0.13438265025615692, + "logps_avg/rejected": -1.7067029476165771, + "loss": 0.1389, + "losses_ref": -0.0009179472108371556, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1300, + "u": -1.7887804508209229, + "weight": 0.058129895478487015 + }, + { + "diff_generated": -19.01449966430664, + "epoch": 0.42449773169151006, + "grad_norm": 3.390022462334716, + "learning_rate": 7.958146205222102e-07, + "logits/chosen": -2.442352533340454, + "logits/rejected": -2.488405227661133, + "logps/chosen": -22.647235870361328, + "logps/rejected": -317.5475769042969, + "logps_avg/chosen": -0.1311555802822113, + "logps_avg/rejected": -1.9014499187469482, + "loss": 0.136, + "losses_ref": -0.0005218187579885125, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1310, + "u": -1.8371353149414062, + "weight": 0.03206893056631088 + }, + { + "diff_generated": -16.678707122802734, + "epoch": 0.42773817239144524, + "grad_norm": 3.4549953797681585, + "learning_rate": 7.955941998719939e-07, + "logits/chosen": -2.438683032989502, + "logits/rejected": -2.4860570430755615, + "logps/chosen": -24.95685386657715, + "logps/rejected": -300.39422607421875, + "logps_avg/chosen": -0.13486911356449127, + "logps_avg/rejected": -1.6678707599639893, + "loss": 0.1363, + "losses_ref": -0.0008987674373202026, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1320, + "u": -1.7414432764053345, + "weight": 0.08302642405033112 + }, + { + "diff_generated": -17.49447250366211, + "epoch": 0.4309786130913804, + "grad_norm": 3.2061398818826845, + "learning_rate": 7.953681551529918e-07, + "logits/chosen": -2.4157721996307373, + "logits/rejected": -2.4662671089172363, + "logps/chosen": -22.527088165283203, + "logps/rejected": -311.30169677734375, + "logps_avg/chosen": -0.13368523120880127, + "logps_avg/rejected": -1.749447226524353, + "loss": 0.1362, + "losses_ref": -0.000775449734646827, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1330, + "u": -1.7892115116119385, + "weight": 0.057642944157123566 + }, + { + "diff_generated": -17.374311447143555, + "epoch": 0.43421905379131565, + "grad_norm": 3.9259248576570536, + "learning_rate": 7.951364895788277e-07, + "logits/chosen": -2.468964099884033, + "logits/rejected": -2.4919381141662598, + "logps/chosen": -23.396032333374023, + "logps/rejected": -300.768310546875, + "logps_avg/chosen": -0.12941637635231018, + "logps_avg/rejected": -1.737431287765503, + "loss": 0.1357, + "losses_ref": -0.0006515913410112262, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1340, + "u": -1.813141107559204, + "weight": 0.04489173740148544 + }, + { + "diff_generated": -17.841054916381836, + "epoch": 0.4374594944912508, + "grad_norm": 3.042669878051496, + "learning_rate": 7.948992064430363e-07, + "logits/chosen": -2.462354898452759, + "logits/rejected": -2.524773120880127, + "logps/chosen": -25.153942108154297, + "logps/rejected": -328.6318664550781, + "logps_avg/chosen": -0.14619532227516174, + "logps_avg/rejected": -1.784105658531189, + "loss": 0.138, + "losses_ref": -0.0007250936469063163, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1350, + "u": -1.8129888772964478, + "weight": 0.04506770148873329 + }, + { + "diff_generated": -17.569915771484375, + "epoch": 0.440699935191186, + "grad_norm": 3.2656238746919963, + "learning_rate": 7.946563091190154e-07, + "logits/chosen": -2.4542012214660645, + "logits/rejected": -2.5044617652893066, + "logps/chosen": -24.700918197631836, + "logps/rejected": -302.47125244140625, + "logps_avg/chosen": -0.14593173563480377, + "logps_avg/rejected": -1.7569917440414429, + "loss": 0.139, + "losses_ref": -0.001583050936460495, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1360, + "u": -1.7991024255752563, + "weight": 0.053593169897794724 + }, + { + "diff_generated": -17.115385055541992, + "epoch": 0.4439403758911212, + "grad_norm": 3.789637622440191, + "learning_rate": 7.944078010599788e-07, + "logits/chosen": -2.496446132659912, + "logits/rejected": -2.4480183124542236, + "logps/chosen": -25.29878044128418, + "logps/rejected": -303.20843505859375, + "logps_avg/chosen": -0.1379314810037613, + "logps_avg/rejected": -1.711538553237915, + "loss": 0.1356, + "losses_ref": -0.00036644996725954115, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1370, + "u": -1.7425174713134766, + "weight": 0.0817914828658104 + }, + { + "diff_generated": -17.19664192199707, + "epoch": 0.4471808165910564, + "grad_norm": 3.184134531621745, + "learning_rate": 7.941536857989063e-07, + "logits/chosen": -2.421329975128174, + "logits/rejected": -2.449415683746338, + "logps/chosen": -25.185787200927734, + "logps/rejected": -303.1695556640625, + "logps_avg/chosen": -0.1363663375377655, + "logps_avg/rejected": -1.719664216041565, + "loss": 0.1363, + "losses_ref": -0.0008979662088677287, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1380, + "u": -1.7649774551391602, + "weight": 0.07069804519414902 + }, + { + "diff_generated": -17.848804473876953, + "epoch": 0.4504212572909916, + "grad_norm": 3.22439705826391, + "learning_rate": 7.938939669484943e-07, + "logits/chosen": -2.469378709793091, + "logits/rejected": -2.52380108833313, + "logps/chosen": -21.16245460510254, + "logps/rejected": -316.0628662109375, + "logps_avg/chosen": -0.12351296842098236, + "logps_avg/rejected": -1.7848806381225586, + "loss": 0.1336, + "losses_ref": -0.0010566998971626163, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1390, + "u": -1.8121494054794312, + "weight": 0.04600784555077553 + }, + { + "diff_generated": -18.572917938232422, + "epoch": 0.45366169799092676, + "grad_norm": 3.175962641830383, + "learning_rate": 7.936286482011041e-07, + "logits/chosen": -2.426278829574585, + "logits/rejected": -2.4540011882781982, + "logps/chosen": -24.904693603515625, + "logps/rejected": -297.0711364746094, + "logps_avg/chosen": -0.14129196107387543, + "logps_avg/rejected": -1.8572919368743896, + "loss": 0.1423, + "losses_ref": -0.0006665909895673394, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1400, + "u": -1.8010095357894897, + "weight": 0.05142979696393013 + }, + { + "diff_generated": -18.499448776245117, + "epoch": 0.45690213869086194, + "grad_norm": 3.23552476927222, + "learning_rate": 7.933577333287091e-07, + "logits/chosen": -2.4180374145507812, + "logits/rejected": -2.5311505794525146, + "logps/chosen": -23.67328643798828, + "logps/rejected": -336.3960876464844, + "logps_avg/chosen": -0.13547472655773163, + "logps_avg/rejected": -1.8499447107315063, + "loss": 0.1332, + "losses_ref": -0.00015220060595311224, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1410, + "u": -1.813956618309021, + "weight": 0.04394307732582092 + }, + { + "diff_generated": -18.732492446899414, + "epoch": 0.46014257939079717, + "grad_norm": 3.0412880571488885, + "learning_rate": 7.930812261828421e-07, + "logits/chosen": -2.4600558280944824, + "logits/rejected": -2.493492603302002, + "logps/chosen": -27.773700714111328, + "logps/rejected": -328.41448974609375, + "logps_avg/chosen": -0.16207179427146912, + "logps_avg/rejected": -1.8732492923736572, + "loss": 0.1445, + "losses_ref": -0.0006029005744494498, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1420, + "u": -1.81326425075531, + "weight": 0.04474567994475365 + }, + { + "diff_generated": -17.517717361450195, + "epoch": 0.46338302009073234, + "grad_norm": 3.1714790753997812, + "learning_rate": 7.92799130694539e-07, + "logits/chosen": -2.4764244556427, + "logits/rejected": -2.491389751434326, + "logps/chosen": -24.16140365600586, + "logps/rejected": -310.711181640625, + "logps_avg/chosen": -0.13903963565826416, + "logps_avg/rejected": -1.7517716884613037, + "loss": 0.1337, + "losses_ref": -0.000567762996070087, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1430, + "u": -1.7658287286758423, + "weight": 0.06975488364696503 + }, + { + "diff_generated": -17.2349853515625, + "epoch": 0.4666234607906675, + "grad_norm": 3.0821823873883964, + "learning_rate": 7.925114508742848e-07, + "logits/chosen": -2.455271005630493, + "logits/rejected": -2.5200881958007812, + "logps/chosen": -22.790693283081055, + "logps/rejected": -302.3728942871094, + "logps_avg/chosen": -0.1398683339357376, + "logps_avg/rejected": -1.7234985828399658, + "loss": 0.1372, + "losses_ref": -0.0007977086352184415, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1440, + "u": -1.8008880615234375, + "weight": 0.05159289389848709 + }, + { + "diff_generated": -18.428184509277344, + "epoch": 0.4698639014906027, + "grad_norm": 3.3487169217807415, + "learning_rate": 7.92218190811955e-07, + "logits/chosen": -2.4557693004608154, + "logits/rejected": -2.555664300918579, + "logps/chosen": -23.864511489868164, + "logps/rejected": -331.5283203125, + "logps_avg/chosen": -0.13912031054496765, + "logps_avg/rejected": -1.8428184986114502, + "loss": 0.1353, + "losses_ref": -0.0007266084430739284, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1450, + "u": -1.8129682540893555, + "weight": 0.04508272558450699 + }, + { + "diff_generated": -19.4451961517334, + "epoch": 0.47310434219053793, + "grad_norm": 2.9957250282590704, + "learning_rate": 7.919193546767581e-07, + "logits/chosen": -2.44518780708313, + "logits/rejected": -2.4765005111694336, + "logps/chosen": -24.367023468017578, + "logps/rejected": -325.8844299316406, + "logps_avg/chosen": -0.1385246217250824, + "logps_avg/rejected": -1.9445196390151978, + "loss": 0.1356, + "losses_ref": -0.0005143691087141633, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1460, + "u": -1.8133465051651, + "weight": 0.04465331509709358 + }, + { + "diff_generated": -17.70899200439453, + "epoch": 0.4763447828904731, + "grad_norm": 3.110747927329135, + "learning_rate": 7.916149467171768e-07, + "logits/chosen": -2.455876588821411, + "logits/rejected": -2.483983039855957, + "logps/chosen": -20.568796157836914, + "logps/rejected": -288.34600830078125, + "logps_avg/chosen": -0.12485536187887192, + "logps_avg/rejected": -1.7708991765975952, + "loss": 0.1332, + "losses_ref": -0.0005488159949891269, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1470, + "u": -1.7777249813079834, + "weight": 0.0634617805480957 + }, + { + "diff_generated": -17.289661407470703, + "epoch": 0.4795852235904083, + "grad_norm": 3.157389076417406, + "learning_rate": 7.913049712609066e-07, + "logits/chosen": -2.433224678039551, + "logits/rejected": -2.474834442138672, + "logps/chosen": -22.74091148376465, + "logps/rejected": -307.838623046875, + "logps_avg/chosen": -0.1298968493938446, + "logps_avg/rejected": -1.7289661169052124, + "loss": 0.1361, + "losses_ref": -0.0003924695774912834, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1480, + "u": -1.7424627542495728, + "weight": 0.081854909658432 + }, + { + "diff_generated": -18.811738967895508, + "epoch": 0.48282566429034346, + "grad_norm": 3.2532023130455094, + "learning_rate": 7.909894327147949e-07, + "logits/chosen": -2.4604969024658203, + "logits/rejected": -2.4856350421905518, + "logps/chosen": -23.749942779541016, + "logps/rejected": -318.9234619140625, + "logps_avg/chosen": -0.13616855442523956, + "logps_avg/rejected": -1.8811737298965454, + "loss": 0.1353, + "losses_ref": -0.0009188092080876231, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1490, + "u": -1.824424386024475, + "weight": 0.039294369518756866 + }, + { + "diff_generated": -19.15116310119629, + "epoch": 0.4860661049902787, + "grad_norm": 3.155040368233549, + "learning_rate": 7.906683355647783e-07, + "logits/chosen": -2.451903820037842, + "logits/rejected": -2.511476516723633, + "logps/chosen": -22.985902786254883, + "logps/rejected": -335.32269287109375, + "logps_avg/chosen": -0.13014516234397888, + "logps_avg/rejected": -1.915116548538208, + "loss": 0.1323, + "losses_ref": -0.0009978034067898989, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1500, + "u": -1.8238977193832397, + "weight": 0.03983033448457718 + }, + { + "diff_generated": -18.532636642456055, + "epoch": 0.48930654569021387, + "grad_norm": 3.2032888148117427, + "learning_rate": 7.903416843758187e-07, + "logits/chosen": -2.495983839035034, + "logits/rejected": -2.544659376144409, + "logps/chosen": -22.922616958618164, + "logps/rejected": -329.5682067871094, + "logps_avg/chosen": -0.13078387081623077, + "logps_avg/rejected": -1.8532636165618896, + "loss": 0.1352, + "losses_ref": -0.0004203950520604849, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1510, + "u": -1.7779722213745117, + "weight": 0.06317600607872009 + }, + { + "diff_generated": -18.766742706298828, + "epoch": 0.49254698639014904, + "grad_norm": 3.1952501859878306, + "learning_rate": 7.900094837918385e-07, + "logits/chosen": -2.4817230701446533, + "logits/rejected": -2.5090298652648926, + "logps/chosen": -26.534265518188477, + "logps/rejected": -344.45526123046875, + "logps_avg/chosen": -0.14062072336673737, + "logps_avg/rejected": -1.8766740560531616, + "loss": 0.1385, + "losses_ref": -0.0007466255337931216, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1520, + "u": -1.8367220163345337, + "weight": 0.03255104273557663 + }, + { + "diff_generated": -18.747167587280273, + "epoch": 0.4957874270900843, + "grad_norm": 3.178512660094918, + "learning_rate": 7.896717385356545e-07, + "logits/chosen": -2.4801974296569824, + "logits/rejected": -2.5688040256500244, + "logps/chosen": -22.872400283813477, + "logps/rejected": -333.326904296875, + "logps_avg/chosen": -0.1437094360589981, + "logps_avg/rejected": -1.8747165203094482, + "loss": 0.134, + "losses_ref": -0.0007747443160042167, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1530, + "u": -1.8481905460357666, + "weight": 0.02670850232243538 + }, + { + "diff_generated": -19.107723236083984, + "epoch": 0.49902786779001945, + "grad_norm": 3.326305584609048, + "learning_rate": 7.893284534089109e-07, + "logits/chosen": -2.452545404434204, + "logits/rejected": -2.5040736198425293, + "logps/chosen": -22.993648529052734, + "logps/rejected": -326.62591552734375, + "logps_avg/chosen": -0.1290530264377594, + "logps_avg/rejected": -1.9107720851898193, + "loss": 0.1381, + "losses_ref": -0.0006093319389037788, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1540, + "u": -1.8250234127044678, + "weight": 0.0386078879237175 + }, + { + "diff_generated": -17.362712860107422, + "epoch": 0.5022683084899546, + "grad_norm": 3.424929998445367, + "learning_rate": 7.889796332920106e-07, + "logits/chosen": -2.432164430618286, + "logits/rejected": -2.529879570007324, + "logps/chosen": -21.980777740478516, + "logps/rejected": -312.9666442871094, + "logps_avg/chosen": -0.13036580383777618, + "logps_avg/rejected": -1.7362712621688843, + "loss": 0.1311, + "losses_ref": -0.00039959652349352837, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1550, + "u": -1.7542804479599, + "weight": 0.07564841955900192 + }, + { + "diff_generated": -18.691469192504883, + "epoch": 0.5055087491898899, + "grad_norm": 3.2649242690203035, + "learning_rate": 7.886252831440465e-07, + "logits/chosen": -2.4698634147644043, + "logits/rejected": -2.5349888801574707, + "logps/chosen": -25.230175018310547, + "logps/rejected": -343.2588195800781, + "logps_avg/chosen": -0.1457994431257248, + "logps_avg/rejected": -1.8691469430923462, + "loss": 0.1348, + "losses_ref": -0.0006545605137944221, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1560, + "u": -1.848717451095581, + "weight": 0.026136714965105057 + }, + { + "diff_generated": -18.0201358795166, + "epoch": 0.508749189889825, + "grad_norm": 3.0505128913232067, + "learning_rate": 7.882654080027304e-07, + "logits/chosen": -2.4563963413238525, + "logits/rejected": -2.526946544647217, + "logps/chosen": -23.22417640686035, + "logps/rejected": -325.34759521484375, + "logps_avg/chosen": -0.13335327804088593, + "logps_avg/rejected": -1.802013635635376, + "loss": 0.1378, + "losses_ref": -0.0006427440093830228, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1570, + "u": -1.8248546123504639, + "weight": 0.03879604488611221 + }, + { + "diff_generated": -15.900899887084961, + "epoch": 0.5119896305897602, + "grad_norm": 2.813559786691504, + "learning_rate": 7.879000129843218e-07, + "logits/chosen": -2.519257068634033, + "logits/rejected": -2.5333638191223145, + "logps/chosen": -26.80712890625, + "logps/rejected": -287.0588073730469, + "logps_avg/chosen": -0.1401996612548828, + "logps_avg/rejected": -1.590090036392212, + "loss": 0.1328, + "losses_ref": -0.000904018641449511, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1580, + "u": -1.7532364130020142, + "weight": 0.07683442533016205 + }, + { + "diff_generated": -15.259477615356445, + "epoch": 0.5152300712896954, + "grad_norm": 2.863048008897734, + "learning_rate": 7.87529103283555e-07, + "logits/chosen": -2.5320329666137695, + "logits/rejected": -2.584820508956909, + "logps/chosen": -24.788921356201172, + "logps/rejected": -275.0646057128906, + "logps_avg/chosen": -0.12735766172409058, + "logps_avg/rejected": -1.5259478092193604, + "loss": 0.1367, + "losses_ref": -0.0005550708156079054, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1590, + "u": -1.7896106243133545, + "weight": 0.05717957019805908 + }, + { + "diff_generated": -17.436412811279297, + "epoch": 0.5184705119896306, + "grad_norm": 2.7173964853628267, + "learning_rate": 7.871526841735649e-07, + "logits/chosen": -2.4900612831115723, + "logits/rejected": -2.5238966941833496, + "logps/chosen": -22.51504135131836, + "logps/rejected": -283.78741455078125, + "logps_avg/chosen": -0.12943613529205322, + "logps_avg/rejected": -1.7436414957046509, + "loss": 0.1335, + "losses_ref": -0.0005736102466471493, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1600, + "u": -1.789592981338501, + "weight": 0.05720081925392151 + }, + { + "diff_generated": -16.706300735473633, + "epoch": 0.5217109526895658, + "grad_norm": 3.163777546228782, + "learning_rate": 7.867707610058127e-07, + "logits/chosen": -2.497654438018799, + "logits/rejected": -2.5916731357574463, + "logps/chosen": -22.602352142333984, + "logps/rejected": -291.9427490234375, + "logps_avg/chosen": -0.13002575933933258, + "logps_avg/rejected": -1.670629858970642, + "loss": 0.1392, + "losses_ref": -0.000581948203034699, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1610, + "u": -1.8132070302963257, + "weight": 0.04481140524148941 + }, + { + "diff_generated": -17.194438934326172, + "epoch": 0.5249513933895009, + "grad_norm": 3.166949089273434, + "learning_rate": 7.863833392100093e-07, + "logits/chosen": -2.4334254264831543, + "logits/rejected": -2.5457873344421387, + "logps/chosen": -21.274616241455078, + "logps/rejected": -282.6556091308594, + "logps_avg/chosen": -0.13356930017471313, + "logps_avg/rejected": -1.719443917274475, + "loss": 0.1323, + "losses_ref": -0.0006519744638353586, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1620, + "u": -1.8368451595306396, + "weight": 0.032400231808423996 + }, + { + "diff_generated": -17.77353286743164, + "epoch": 0.5281918340894362, + "grad_norm": 2.9581354223187604, + "learning_rate": 7.859904242940385e-07, + "logits/chosen": -2.489903450012207, + "logits/rejected": -2.5374815464019775, + "logps/chosen": -22.93409538269043, + "logps/rejected": -314.50799560546875, + "logps_avg/chosen": -0.1305612027645111, + "logps_avg/rejected": -1.777353048324585, + "loss": 0.1336, + "losses_ref": -0.0008023073896765709, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1630, + "u": -1.836499810218811, + "weight": 0.032795753329992294 + }, + { + "diff_generated": -16.64279556274414, + "epoch": 0.5314322747893714, + "grad_norm": 3.3278103760783795, + "learning_rate": 7.855920218438783e-07, + "logits/chosen": -2.453221321105957, + "logits/rejected": -2.4900641441345215, + "logps/chosen": -23.192838668823242, + "logps/rejected": -283.46697998046875, + "logps_avg/chosen": -0.13836851716041565, + "logps_avg/rejected": -1.664279580116272, + "loss": 0.1397, + "losses_ref": -0.0008003627881407738, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1640, + "u": -1.8007957935333252, + "weight": 0.05168105289340019 + }, + { + "diff_generated": -15.557705879211426, + "epoch": 0.5346727154893065, + "grad_norm": 2.7979372317645166, + "learning_rate": 7.851881375235216e-07, + "logits/chosen": -2.5243003368377686, + "logits/rejected": -2.556443691253662, + "logps/chosen": -21.914457321166992, + "logps/rejected": -285.3017272949219, + "logps_avg/chosen": -0.11913974583148956, + "logps_avg/rejected": -1.5557703971862793, + "loss": 0.1304, + "losses_ref": -0.000646257889457047, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1650, + "u": -1.7420024871826172, + "weight": 0.08238764107227325 + }, + { + "diff_generated": -18.791248321533203, + "epoch": 0.5379131561892417, + "grad_norm": 3.743434576482437, + "learning_rate": 7.847787770748959e-07, + "logits/chosen": -2.527073860168457, + "logits/rejected": -2.5413413047790527, + "logps/chosen": -24.805885314941406, + "logps/rejected": -315.00653076171875, + "logps_avg/chosen": -0.136772021651268, + "logps_avg/rejected": -1.8791248798370361, + "loss": 0.1384, + "losses_ref": -0.0006690368754789233, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1660, + "u": -1.8249105215072632, + "weight": 0.038742512464523315 + }, + { + "diff_generated": -17.38671112060547, + "epoch": 0.541153596889177, + "grad_norm": 3.482304529621643, + "learning_rate": 7.843639463177815e-07, + "logits/chosen": -2.4997076988220215, + "logits/rejected": -2.5827596187591553, + "logps/chosen": -23.33755874633789, + "logps/rejected": -318.40997314453125, + "logps_avg/chosen": -0.13309702277183533, + "logps_avg/rejected": -1.738671064376831, + "loss": 0.1293, + "losses_ref": -0.0007650243933312595, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1670, + "u": -1.8129169940948486, + "weight": 0.045146699994802475 + }, + { + "diff_generated": -16.854154586791992, + "epoch": 0.5443940375891121, + "grad_norm": 3.0307492113807837, + "learning_rate": 7.839436511497288e-07, + "logits/chosen": -2.4866318702697754, + "logits/rejected": -2.540236473083496, + "logps/chosen": -23.997835159301758, + "logps/rejected": -310.1125793457031, + "logps_avg/chosen": -0.1269882470369339, + "logps_avg/rejected": -1.685415506362915, + "loss": 0.1286, + "losses_ref": -0.0004096725897397846, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1680, + "u": -1.789876937866211, + "weight": 0.05687083676457405 + }, + { + "diff_generated": -18.999799728393555, + "epoch": 0.5476344782890473, + "grad_norm": 3.1436110896708285, + "learning_rate": 7.835178975459744e-07, + "logits/chosen": -2.471205949783325, + "logits/rejected": -2.503660202026367, + "logps/chosen": -22.559701919555664, + "logps/rejected": -324.4423522949219, + "logps_avg/chosen": -0.13547468185424805, + "logps_avg/rejected": -1.8999799489974976, + "loss": 0.1324, + "losses_ref": -0.00037550865090452135, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1690, + "u": -1.813604712486267, + "weight": 0.04435449838638306 + }, + { + "diff_generated": -18.0362548828125, + "epoch": 0.5508749189889826, + "grad_norm": 3.094785074613179, + "learning_rate": 7.83086691559356e-07, + "logits/chosen": -2.518862724304199, + "logits/rejected": -2.5251712799072266, + "logps/chosen": -21.84812355041504, + "logps/rejected": -308.73828125, + "logps_avg/chosen": -0.12069737911224365, + "logps_avg/rejected": -1.803625464439392, + "loss": 0.1306, + "losses_ref": -0.0006218409398570657, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1700, + "u": -1.8013778924942017, + "weight": 0.05103649944067001 + }, + { + "diff_generated": -17.041568756103516, + "epoch": 0.5541153596889177, + "grad_norm": 3.0017656954128835, + "learning_rate": 7.826500393202268e-07, + "logits/chosen": -2.4499335289001465, + "logits/rejected": -2.464740514755249, + "logps/chosen": -25.1323184967041, + "logps/rejected": -290.95654296875, + "logps_avg/chosen": -0.13508830964565277, + "logps_avg/rejected": -1.7041568756103516, + "loss": 0.136, + "losses_ref": -0.0009132762206718326, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1710, + "u": -1.8007068634033203, + "weight": 0.051802944391965866 + }, + { + "diff_generated": -16.525867462158203, + "epoch": 0.5573558003888529, + "grad_norm": 2.9335307587728936, + "learning_rate": 7.82207947036368e-07, + "logits/chosen": -2.420897960662842, + "logits/rejected": -2.4787497520446777, + "logps/chosen": -22.71407127380371, + "logps/rejected": -290.0786437988281, + "logps_avg/chosen": -0.13314175605773926, + "logps_avg/rejected": -1.652586579322815, + "loss": 0.1329, + "losses_ref": -0.0007877512834966183, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1720, + "u": -1.7889564037322998, + "weight": 0.05788535624742508 + }, + { + "diff_generated": -18.824562072753906, + "epoch": 0.560596241088788, + "grad_norm": 3.099890996831872, + "learning_rate": 7.817604209929007e-07, + "logits/chosen": -2.461977243423462, + "logits/rejected": -2.4332797527313232, + "logps/chosen": -25.484416961669922, + "logps/rejected": -305.5503845214844, + "logps_avg/chosen": -0.1392674595117569, + "logps_avg/rejected": -1.8824561834335327, + "loss": 0.135, + "losses_ref": -0.0006474562687799335, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1730, + "u": -1.7774829864501953, + "weight": 0.06373722851276398 + }, + { + "diff_generated": -18.078306198120117, + "epoch": 0.5638366817887233, + "grad_norm": 2.859866263751752, + "learning_rate": 7.813074675521962e-07, + "logits/chosen": -2.511709213256836, + "logits/rejected": -2.506988763809204, + "logps/chosen": -25.205087661743164, + "logps/rejected": -306.67950439453125, + "logps_avg/chosen": -0.14072729647159576, + "logps_avg/rejected": -1.807830810546875, + "loss": 0.1319, + "losses_ref": -0.0010248484322801232, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1740, + "u": -1.8477213382720947, + "weight": 0.027229273691773415 + }, + { + "diff_generated": -18.262256622314453, + "epoch": 0.5670771224886585, + "grad_norm": 3.0360024902130034, + "learning_rate": 7.80849093153786e-07, + "logits/chosen": -2.4646198749542236, + "logits/rejected": -2.5201735496520996, + "logps/chosen": -21.531408309936523, + "logps/rejected": -317.57916259765625, + "logps_avg/chosen": -0.12418844550848007, + "logps_avg/rejected": -1.8262255191802979, + "loss": 0.1289, + "losses_ref": -0.0006071639945730567, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1750, + "u": -1.777687668800354, + "weight": 0.06350871175527573 + }, + { + "diff_generated": -17.833709716796875, + "epoch": 0.5703175631885936, + "grad_norm": 2.9028000810936545, + "learning_rate": 7.803853043142702e-07, + "logits/chosen": -2.4486467838287354, + "logits/rejected": -2.5134153366088867, + "logps/chosen": -24.652416229248047, + "logps/rejected": -309.2928161621094, + "logps_avg/chosen": -0.1339518278837204, + "logps_avg/rejected": -1.7833709716796875, + "loss": 0.1291, + "losses_ref": -0.00018930871738120914, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1760, + "u": -1.7902014255523682, + "weight": 0.05649164319038391 + }, + { + "diff_generated": -17.00735855102539, + "epoch": 0.5735580038885288, + "grad_norm": 3.29055949352831, + "learning_rate": 7.799161076272245e-07, + "logits/chosen": -2.4540839195251465, + "logits/rejected": -2.514845371246338, + "logps/chosen": -23.15195655822754, + "logps/rejected": -304.8998107910156, + "logps_avg/chosen": -0.13627921044826508, + "logps_avg/rejected": -1.7007356882095337, + "loss": 0.1367, + "losses_ref": -0.0010348598007112741, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1770, + "u": -1.8003708124160767, + "weight": 0.05215916782617569 + }, + { + "diff_generated": -18.713973999023438, + "epoch": 0.5767984445884641, + "grad_norm": 3.46189518615115, + "learning_rate": 7.794415097631066e-07, + "logits/chosen": -2.4763245582580566, + "logits/rejected": -2.493891477584839, + "logps/chosen": -22.45502471923828, + "logps/rejected": -320.9071350097656, + "logps_avg/chosen": -0.1303728073835373, + "logps_avg/rejected": -1.871397614479065, + "loss": 0.1357, + "losses_ref": -0.0007566340500488877, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1780, + "u": -1.848497748374939, + "weight": 0.02639012411236763 + }, + { + "diff_generated": -17.699176788330078, + "epoch": 0.5800388852883992, + "grad_norm": 2.931900487175917, + "learning_rate": 7.789615174691619e-07, + "logits/chosen": -2.4306411743164062, + "logits/rejected": -2.51458477973938, + "logps/chosen": -24.722259521484375, + "logps/rejected": -323.888916015625, + "logps_avg/chosen": -0.13683000206947327, + "logps_avg/rejected": -1.7699177265167236, + "loss": 0.1305, + "losses_ref": -0.00117635412607342, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1790, + "u": -1.8001972436904907, + "weight": 0.052384234964847565 + }, + { + "diff_generated": -18.133087158203125, + "epoch": 0.5832793259883344, + "grad_norm": 2.835865807866508, + "learning_rate": 7.784761375693268e-07, + "logits/chosen": -2.3992950916290283, + "logits/rejected": -2.4438071250915527, + "logps/chosen": -23.660818099975586, + "logps/rejected": -328.71295166015625, + "logps_avg/chosen": -0.13329359889030457, + "logps_avg/rejected": -1.8133087158203125, + "loss": 0.1297, + "losses_ref": -0.0003332248597871512, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1800, + "u": -1.7899528741836548, + "weight": 0.05678124353289604 + }, + { + "diff_generated": -18.213998794555664, + "epoch": 0.5865197666882696, + "grad_norm": 3.173336533470699, + "learning_rate": 7.779853769641319e-07, + "logits/chosen": -2.4195868968963623, + "logits/rejected": -2.473097801208496, + "logps/chosen": -25.410669326782227, + "logps/rejected": -328.18853759765625, + "logps_avg/chosen": -0.15266582369804382, + "logps_avg/rejected": -1.8214000463485718, + "loss": 0.1307, + "losses_ref": -0.0005910733598284423, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1810, + "u": -1.8131601810455322, + "weight": 0.04486403614282608 + }, + { + "diff_generated": -18.346664428710938, + "epoch": 0.5897602073882048, + "grad_norm": 3.120402178443489, + "learning_rate": 7.774892426306042e-07, + "logits/chosen": -2.4505624771118164, + "logits/rejected": -2.5352671146392822, + "logps/chosen": -22.137514114379883, + "logps/rejected": -330.32745361328125, + "logps_avg/chosen": -0.12556853890419006, + "logps_avg/rejected": -1.8346662521362305, + "loss": 0.1315, + "losses_ref": -0.0006458786083385348, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1820, + "u": -1.753527283668518, + "weight": 0.07647226750850677 + }, + { + "diff_generated": -18.87307357788086, + "epoch": 0.59300064808814, + "grad_norm": 2.939036330764369, + "learning_rate": 7.769877416221678e-07, + "logits/chosen": -2.459193229675293, + "logits/rejected": -2.4645588397979736, + "logps/chosen": -26.066543579101562, + "logps/rejected": -318.68603515625, + "logps_avg/chosen": -0.14139781892299652, + "logps_avg/rejected": -1.8873074054718018, + "loss": 0.1379, + "losses_ref": -0.0006232672603800893, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1830, + "u": -1.801300048828125, + "weight": 0.051123809069395065 + }, + { + "diff_generated": -18.721759796142578, + "epoch": 0.5962410887880751, + "grad_norm": 2.8573207260724844, + "learning_rate": 7.764808810685433e-07, + "logits/chosen": -2.428995370864868, + "logits/rejected": -2.5126430988311768, + "logps/chosen": -19.536548614501953, + "logps/rejected": -310.163818359375, + "logps_avg/chosen": -0.12359301000833511, + "logps_avg/rejected": -1.8721758127212524, + "loss": 0.133, + "losses_ref": -0.000460333249066025, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1840, + "u": -1.754206657409668, + "weight": 0.07573723793029785 + }, + { + "diff_generated": -18.366769790649414, + "epoch": 0.5994815294880104, + "grad_norm": 3.0942383224635437, + "learning_rate": 7.759686681756468e-07, + "logits/chosen": -2.457427740097046, + "logits/rejected": -2.482417106628418, + "logps/chosen": -22.40389060974121, + "logps/rejected": -315.10528564453125, + "logps_avg/chosen": -0.12646666169166565, + "logps_avg/rejected": -1.836676836013794, + "loss": 0.1295, + "losses_ref": -0.0004205040750093758, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1850, + "u": -1.7898060083389282, + "weight": 0.056951187551021576 + }, + { + "diff_generated": -20.095348358154297, + "epoch": 0.6027219701879456, + "grad_norm": 2.7487787654908575, + "learning_rate": 7.754511102254876e-07, + "logits/chosen": -2.4285895824432373, + "logits/rejected": -2.4910686016082764, + "logps/chosen": -21.96750259399414, + "logps/rejected": -335.4107360839844, + "logps_avg/chosen": -0.13040336966514587, + "logps_avg/rejected": -2.009535074234009, + "loss": 0.1313, + "losses_ref": -0.0006380341947078705, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1860, + "u": -1.8131011724472046, + "weight": 0.04493032768368721 + }, + { + "diff_generated": -19.728809356689453, + "epoch": 0.6059624108878807, + "grad_norm": 2.7799453498174085, + "learning_rate": 7.74928214576064e-07, + "logits/chosen": -2.4481120109558105, + "logits/rejected": -2.4329230785369873, + "logps/chosen": -23.952800750732422, + "logps/rejected": -323.2978210449219, + "logps_avg/chosen": -0.13100168108940125, + "logps_avg/rejected": -1.9728807210922241, + "loss": 0.131, + "losses_ref": -0.0006491635576821864, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1870, + "u": -1.825012445449829, + "weight": 0.038625117391347885 + }, + { + "diff_generated": -20.258499145507812, + "epoch": 0.609202851587816, + "grad_norm": 3.132382816862127, + "learning_rate": 7.743999886612591e-07, + "logits/chosen": -2.4395930767059326, + "logits/rejected": -2.496419668197632, + "logps/chosen": -23.156782150268555, + "logps/rejected": -342.83392333984375, + "logps_avg/chosen": -0.13323050737380981, + "logps_avg/rejected": -2.0258498191833496, + "loss": 0.1333, + "losses_ref": -0.0004270991194061935, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1880, + "u": -1.8727830648422241, + "weight": 0.013226887211203575 + }, + { + "diff_generated": -19.925138473510742, + "epoch": 0.6124432922877512, + "grad_norm": 2.8991126228539743, + "learning_rate": 7.738664399907355e-07, + "logits/chosen": -2.4587321281433105, + "logits/rejected": -2.522761583328247, + "logps/chosen": -22.69471549987793, + "logps/rejected": -340.39208984375, + "logps_avg/chosen": -0.14121794700622559, + "logps_avg/rejected": -1.9925140142440796, + "loss": 0.1298, + "losses_ref": -0.000661244208458811, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1890, + "u": -1.8604755401611328, + "weight": 0.019995156675577164 + }, + { + "diff_generated": -18.109577178955078, + "epoch": 0.6156837329876863, + "grad_norm": 2.8760242471354327, + "learning_rate": 7.733275761498278e-07, + "logits/chosen": -2.472839593887329, + "logits/rejected": -2.471781015396118, + "logps/chosen": -24.88907241821289, + "logps/rejected": -314.57965087890625, + "logps_avg/chosen": -0.1315891295671463, + "logps_avg/rejected": -1.8109575510025024, + "loss": 0.1284, + "losses_ref": -0.0005121930735185742, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1900, + "u": -1.7897313833236694, + "weight": 0.057041965425014496 + }, + { + "diff_generated": -17.49472427368164, + "epoch": 0.6189241736876215, + "grad_norm": 2.823280138116411, + "learning_rate": 7.727834047994353e-07, + "logits/chosen": -2.4597010612487793, + "logits/rejected": -2.508735179901123, + "logps/chosen": -26.73147201538086, + "logps/rejected": -324.4564514160156, + "logps_avg/chosen": -0.14764484763145447, + "logps_avg/rejected": -1.7494723796844482, + "loss": 0.1299, + "losses_ref": -0.0005785429384559393, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1910, + "u": -1.801418662071228, + "weight": 0.05098617821931839 + }, + { + "diff_generated": -18.79656982421875, + "epoch": 0.6221646143875567, + "grad_norm": 3.051226672879621, + "learning_rate": 7.722339336759129e-07, + "logits/chosen": -2.3774640560150146, + "logits/rejected": -2.4868431091308594, + "logps/chosen": -23.647579193115234, + "logps/rejected": -331.06463623046875, + "logps_avg/chosen": -0.14270934462547302, + "logps_avg/rejected": -1.8796570301055908, + "loss": 0.135, + "losses_ref": -0.0005043046548962593, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1920, + "u": -1.7778398990631104, + "weight": 0.06332896649837494 + }, + { + "diff_generated": -17.094919204711914, + "epoch": 0.6254050550874919, + "grad_norm": 3.2172445082389216, + "learning_rate": 7.71679170590961e-07, + "logits/chosen": -2.49719500541687, + "logits/rejected": -2.503236770629883, + "logps/chosen": -24.753795623779297, + "logps/rejected": -301.22210693359375, + "logps_avg/chosen": -0.13670535385608673, + "logps_avg/rejected": -1.7094919681549072, + "loss": 0.1308, + "losses_ref": -0.0005722006899304688, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1930, + "u": -1.801491141319275, + "weight": 0.05090496689081192 + }, + { + "diff_generated": -17.40401840209961, + "epoch": 0.6286454957874271, + "grad_norm": 2.7455202356055373, + "learning_rate": 7.711191234315146e-07, + "logits/chosen": -2.4483723640441895, + "logits/rejected": -2.4890902042388916, + "logps/chosen": -23.994434356689453, + "logps/rejected": -324.2485046386719, + "logps_avg/chosen": -0.1330624669790268, + "logps_avg/rejected": -1.7404018640518188, + "loss": 0.129, + "losses_ref": -0.0003373560612089932, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1940, + "u": -1.849252462387085, + "weight": 0.025515040382742882 + }, + { + "diff_generated": -18.809823989868164, + "epoch": 0.6318859364873622, + "grad_norm": 2.9120368373047643, + "learning_rate": 7.705538001596312e-07, + "logits/chosen": -2.468559741973877, + "logits/rejected": -2.5485262870788574, + "logps/chosen": -20.931529998779297, + "logps/rejected": -319.64813232421875, + "logps_avg/chosen": -0.12360795587301254, + "logps_avg/rejected": -1.8809821605682373, + "loss": 0.1338, + "losses_ref": -0.0002663102059159428, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1950, + "u": -1.8019386529922485, + "weight": 0.05038148909807205 + }, + { + "diff_generated": -18.93647003173828, + "epoch": 0.6351263771872975, + "grad_norm": 2.8712568410008026, + "learning_rate": 7.699832088123774e-07, + "logits/chosen": -2.486135482788086, + "logits/rejected": -2.466231107711792, + "logps/chosen": -24.849327087402344, + "logps/rejected": -333.1363220214844, + "logps_avg/chosen": -0.12802192568778992, + "logps_avg/rejected": -1.8936468362808228, + "loss": 0.1318, + "losses_ref": -0.0005878577358089387, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1960, + "u": -1.8132880926132202, + "weight": 0.04472342133522034 + }, + { + "diff_generated": -18.771728515625, + "epoch": 0.6383668178872327, + "grad_norm": 3.715166163883544, + "learning_rate": 7.694073575017151e-07, + "logits/chosen": -2.3794102668762207, + "logits/rejected": -2.4397597312927246, + "logps/chosen": -20.941625595092773, + "logps/rejected": -332.5712585449219, + "logps_avg/chosen": -0.11972401291131973, + "logps_avg/rejected": -1.877172827720642, + "loss": 0.1286, + "losses_ref": -0.00048074816004373133, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1970, + "u": -1.777799367904663, + "weight": 0.0633687973022461 + }, + { + "diff_generated": -18.586057662963867, + "epoch": 0.6416072585871678, + "grad_norm": 2.985248147364007, + "learning_rate": 7.688262544143854e-07, + "logits/chosen": -2.4497241973876953, + "logits/rejected": -2.462395429611206, + "logps/chosen": -22.778026580810547, + "logps/rejected": -336.16632080078125, + "logps_avg/chosen": -0.12197883427143097, + "logps_avg/rejected": -1.8586056232452393, + "loss": 0.127, + "losses_ref": -0.000526120129507035, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1980, + "u": -1.753989815711975, + "weight": 0.07598290592432022 + }, + { + "diff_generated": -19.0280704498291, + "epoch": 0.6448476992871031, + "grad_norm": 2.7181558462233046, + "learning_rate": 7.682399078117928e-07, + "logits/chosen": -2.453800678253174, + "logits/rejected": -2.4369149208068848, + "logps/chosen": -21.635766983032227, + "logps/rejected": -329.36199951171875, + "logps_avg/chosen": -0.12099570035934448, + "logps_avg/rejected": -1.9028072357177734, + "loss": 0.1318, + "losses_ref": -0.0004379908205009997, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1990, + "u": -1.7659953832626343, + "weight": 0.06955628842115402 + }, + { + "diff_generated": -18.570884704589844, + "epoch": 0.6480881399870383, + "grad_norm": 2.9614913162809473, + "learning_rate": 7.67648326029888e-07, + "logits/chosen": -2.4518074989318848, + "logits/rejected": -2.4435629844665527, + "logps/chosen": -24.682865142822266, + "logps/rejected": -324.78253173828125, + "logps_avg/chosen": -0.13669325411319733, + "logps_avg/rejected": -1.8570884466171265, + "loss": 0.133, + "losses_ref": -0.0006112282280810177, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2000, + "u": -1.777618408203125, + "weight": 0.06358519196510315 + }, + { + "diff_generated": -19.15464210510254, + "epoch": 0.6513285806869734, + "grad_norm": 2.807859572737755, + "learning_rate": 7.670515174790485e-07, + "logits/chosen": -2.437833547592163, + "logits/rejected": -2.437678813934326, + "logps/chosen": -24.248523712158203, + "logps/rejected": -325.72210693359375, + "logps_avg/chosen": -0.14109835028648376, + "logps_avg/rejected": -1.915464162826538, + "loss": 0.1357, + "losses_ref": -0.0006648440612480044, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2010, + "u": -1.824631929397583, + "weight": 0.039017364382743835 + }, + { + "diff_generated": -18.24823760986328, + "epoch": 0.6545690213869086, + "grad_norm": 3.1332940725212928, + "learning_rate": 7.664494906439598e-07, + "logits/chosen": -2.4478213787078857, + "logits/rejected": -2.462120532989502, + "logps/chosen": -21.301809310913086, + "logps/rejected": -335.829345703125, + "logps_avg/chosen": -0.1297096312046051, + "logps_avg/rejected": -1.8248237371444702, + "loss": 0.1251, + "losses_ref": -0.00029017007909715176, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2020, + "u": -1.8137531280517578, + "weight": 0.04418119788169861 + }, + { + "diff_generated": -19.48830795288086, + "epoch": 0.6578094620868438, + "grad_norm": 2.883202474548215, + "learning_rate": 7.658422540834943e-07, + "logits/chosen": -2.4664320945739746, + "logits/rejected": -2.4568514823913574, + "logps/chosen": -25.807851791381836, + "logps/rejected": -343.0586853027344, + "logps_avg/chosen": -0.13989664614200592, + "logps_avg/rejected": -1.9488309621810913, + "loss": 0.1335, + "losses_ref": -0.0007401621551252902, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2030, + "u": -1.8485580682754517, + "weight": 0.02631906047463417 + }, + { + "diff_generated": -19.335189819335938, + "epoch": 0.661049902786779, + "grad_norm": 3.184220687164625, + "learning_rate": 7.6522981643059e-07, + "logits/chosen": -2.4369640350341797, + "logits/rejected": -2.463068723678589, + "logps/chosen": -24.900054931640625, + "logps/rejected": -317.1351013183594, + "logps_avg/chosen": -0.14506976306438446, + "logps_avg/rejected": -1.9335190057754517, + "loss": 0.1311, + "losses_ref": -0.00047019642079249024, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2040, + "u": -1.8249590396881104, + "weight": 0.03864910453557968 + }, + { + "diff_generated": -19.70545768737793, + "epoch": 0.6642903434867142, + "grad_norm": 2.8651056705390197, + "learning_rate": 7.646121863921278e-07, + "logits/chosen": -2.4327144622802734, + "logits/rejected": -2.4198977947235107, + "logps/chosen": -25.717498779296875, + "logps/rejected": -342.68389892578125, + "logps_avg/chosen": -0.13026151061058044, + "logps_avg/rejected": -1.9705455303192139, + "loss": 0.1296, + "losses_ref": -0.0005326059181243181, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2050, + "u": -1.837100625038147, + "weight": 0.03210743889212608 + }, + { + "diff_generated": -19.015865325927734, + "epoch": 0.6675307841866494, + "grad_norm": 2.765621585891966, + "learning_rate": 7.639893727488069e-07, + "logits/chosen": -2.4133496284484863, + "logits/rejected": -2.49336576461792, + "logps/chosen": -21.6951961517334, + "logps/rejected": -355.7476806640625, + "logps_avg/chosen": -0.1258120834827423, + "logps_avg/rejected": -1.9015867710113525, + "loss": 0.1245, + "losses_ref": -0.000237293541431427, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2060, + "u": -1.825690507888794, + "weight": 0.0378374308347702 + }, + { + "diff_generated": -18.98021697998047, + "epoch": 0.6707712248865846, + "grad_norm": 3.073794356382388, + "learning_rate": 7.633613843550212e-07, + "logits/chosen": -2.467679977416992, + "logits/rejected": -2.475241184234619, + "logps/chosen": -24.288105010986328, + "logps/rejected": -328.1109924316406, + "logps_avg/chosen": -0.13157445192337036, + "logps_avg/rejected": -1.8980216979980469, + "loss": 0.1325, + "losses_ref": -0.0005475075449794531, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2070, + "u": -1.7896312475204468, + "weight": 0.05715782567858696 + }, + { + "diff_generated": -20.291763305664062, + "epoch": 0.6740116655865198, + "grad_norm": 3.0389243347898502, + "learning_rate": 7.627282301387325e-07, + "logits/chosen": -2.365746021270752, + "logits/rejected": -2.4230730533599854, + "logps/chosen": -21.869232177734375, + "logps/rejected": -336.66961669921875, + "logps_avg/chosen": -0.12987583875656128, + "logps_avg/rejected": -2.0291762351989746, + "loss": 0.1282, + "losses_ref": -0.0003781206323765218, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2080, + "u": -1.7662067413330078, + "weight": 0.06932022422552109 + }, + { + "diff_generated": -17.792755126953125, + "epoch": 0.6772521062864549, + "grad_norm": 2.698865927398295, + "learning_rate": 7.620899191013438e-07, + "logits/chosen": -2.3889107704162598, + "logits/rejected": -2.4569685459136963, + "logps/chosen": -24.946392059326172, + "logps/rejected": -329.18304443359375, + "logps_avg/chosen": -0.14580973982810974, + "logps_avg/rejected": -1.77927565574646, + "loss": 0.1331, + "losses_ref": -0.0004922214429825544, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2090, + "u": -1.7897088527679443, + "weight": 0.05706654116511345 + }, + { + "diff_generated": -19.587724685668945, + "epoch": 0.6804925469863902, + "grad_norm": 2.749917366534381, + "learning_rate": 7.614464603175717e-07, + "logits/chosen": -2.4647059440612793, + "logits/rejected": -2.4155337810516357, + "logps/chosen": -23.76377296447754, + "logps/rejected": -333.32037353515625, + "logps_avg/chosen": -0.12452936172485352, + "logps_avg/rejected": -1.9587726593017578, + "loss": 0.1239, + "losses_ref": -0.000524552131537348, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2100, + "u": -1.7895400524139404, + "weight": 0.05724791809916496 + }, + { + "diff_generated": -20.409175872802734, + "epoch": 0.6837329876863253, + "grad_norm": 2.629710299155267, + "learning_rate": 7.607978629353167e-07, + "logits/chosen": -2.4125261306762695, + "logits/rejected": -2.4562489986419678, + "logps/chosen": -23.55038070678711, + "logps/rejected": -352.37255859375, + "logps_avg/chosen": -0.1393522024154663, + "logps_avg/rejected": -2.0409178733825684, + "loss": 0.1292, + "losses_ref": -0.0005357457557693124, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2110, + "u": -1.801489233970642, + "weight": 0.05090557411313057 + }, + { + "diff_generated": -19.790348052978516, + "epoch": 0.6869734283862605, + "grad_norm": 2.990669632149249, + "learning_rate": 7.60144136175534e-07, + "logits/chosen": -2.43841814994812, + "logits/rejected": -2.4627225399017334, + "logps/chosen": -20.112375259399414, + "logps/rejected": -362.16241455078125, + "logps_avg/chosen": -0.11792151629924774, + "logps_avg/rejected": -1.9790347814559937, + "loss": 0.1258, + "losses_ref": -0.00014056343934498727, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2120, + "u": -1.7783987522125244, + "weight": 0.06267792731523514 + }, + { + "diff_generated": -18.127880096435547, + "epoch": 0.6902138690861958, + "grad_norm": 3.047337082334964, + "learning_rate": 7.594852893321015e-07, + "logits/chosen": -2.392712354660034, + "logits/rejected": -2.476658821105957, + "logps/chosen": -22.044334411621094, + "logps/rejected": -330.33990478515625, + "logps_avg/chosen": -0.1308022439479828, + "logps_avg/rejected": -1.8127880096435547, + "loss": 0.1266, + "losses_ref": -0.0004497022891882807, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2130, + "u": -1.7779629230499268, + "weight": 0.06318801641464233 + }, + { + "diff_generated": -18.819965362548828, + "epoch": 0.6934543097861309, + "grad_norm": 2.737166621848905, + "learning_rate": 7.588213317716883e-07, + "logits/chosen": -2.330166816711426, + "logits/rejected": -2.4178690910339355, + "logps/chosen": -20.24831199645996, + "logps/rejected": -323.7247314453125, + "logps_avg/chosen": -0.1308317482471466, + "logps_avg/rejected": -1.881996750831604, + "loss": 0.1315, + "losses_ref": -0.00039175135316327214, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2140, + "u": -1.778019905090332, + "weight": 0.06312072277069092 + }, + { + "diff_generated": -18.78085708618164, + "epoch": 0.6966947504860661, + "grad_norm": 3.04134332336856, + "learning_rate": 7.581522729336214e-07, + "logits/chosen": -2.349966049194336, + "logits/rejected": -2.368582248687744, + "logps/chosen": -21.596080780029297, + "logps/rejected": -319.7588195800781, + "logps_avg/chosen": -0.12146018445491791, + "logps_avg/rejected": -1.878085732460022, + "loss": 0.1288, + "losses_ref": -0.0005237095756456256, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2150, + "u": -1.7657558917999268, + "weight": 0.06982530653476715 + }, + { + "diff_generated": -19.86001968383789, + "epoch": 0.6999351911860013, + "grad_norm": 2.719146372081679, + "learning_rate": 7.574781223297513e-07, + "logits/chosen": -2.427701950073242, + "logits/rejected": -2.4226479530334473, + "logps/chosen": -24.214614868164062, + "logps/rejected": -320.67120361328125, + "logps_avg/chosen": -0.13417170941829681, + "logps_avg/rejected": -1.9860022068023682, + "loss": 0.1242, + "losses_ref": -0.0006660787621513009, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2160, + "u": -1.7893577814102173, + "weight": 0.05747256428003311 + }, + { + "diff_generated": -17.657316207885742, + "epoch": 0.7031756318859365, + "grad_norm": 2.841932411038946, + "learning_rate": 7.567988895443173e-07, + "logits/chosen": -2.3967366218566895, + "logits/rejected": -2.3901848793029785, + "logps/chosen": -20.9189510345459, + "logps/rejected": -293.32379150390625, + "logps_avg/chosen": -0.11422063410282135, + "logps_avg/rejected": -1.7657315731048584, + "loss": 0.1289, + "losses_ref": -0.0006414534873329103, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2170, + "u": -1.753784418106079, + "weight": 0.07621309906244278 + }, + { + "diff_generated": -19.626449584960938, + "epoch": 0.7064160725858717, + "grad_norm": 2.761946332942343, + "learning_rate": 7.561145842338102e-07, + "logits/chosen": -2.402575969696045, + "logits/rejected": -2.4128315448760986, + "logps/chosen": -22.947673797607422, + "logps/rejected": -326.78875732421875, + "logps_avg/chosen": -0.129831463098526, + "logps_avg/rejected": -1.9626449346542358, + "loss": 0.126, + "losses_ref": -0.0004920439096167684, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2180, + "u": -1.8133924007415771, + "weight": 0.04459898918867111 + }, + { + "diff_generated": -18.372360229492188, + "epoch": 0.7096565132858069, + "grad_norm": 3.0603236768477347, + "learning_rate": 7.554252161268365e-07, + "logits/chosen": -2.3838937282562256, + "logits/rejected": -2.439671277999878, + "logps/chosen": -23.422100067138672, + "logps/rejected": -328.13372802734375, + "logps_avg/chosen": -0.1293845921754837, + "logps_avg/rejected": -1.8372361660003662, + "loss": 0.1268, + "losses_ref": -0.0006820982089266181, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2190, + "u": -1.7893273830413818, + "weight": 0.05750828981399536 + }, + { + "diff_generated": -18.612842559814453, + "epoch": 0.712896953985742, + "grad_norm": 2.8177511342534567, + "learning_rate": 7.547307950239785e-07, + "logits/chosen": -2.439180850982666, + "logits/rejected": -2.455467700958252, + "logps/chosen": -22.881404876708984, + "logps/rejected": -339.25054931640625, + "logps_avg/chosen": -0.1290069967508316, + "logps_avg/rejected": -1.8612842559814453, + "loss": 0.1294, + "losses_ref": -0.00032429193379357457, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2200, + "u": -1.8136647939682007, + "weight": 0.04428309202194214 + }, + { + "diff_generated": -17.87989044189453, + "epoch": 0.7161373946856773, + "grad_norm": 3.1016688974304736, + "learning_rate": 7.540313307976563e-07, + "logits/chosen": -2.411724805831909, + "logits/rejected": -2.445690631866455, + "logps/chosen": -22.11930274963379, + "logps/rejected": -321.18231201171875, + "logps_avg/chosen": -0.12484880536794662, + "logps_avg/rejected": -1.7879889011383057, + "loss": 0.1318, + "losses_ref": -0.00042836330248974264, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2210, + "u": -1.7186676263809204, + "weight": 0.09444756805896759 + }, + { + "diff_generated": -18.738765716552734, + "epoch": 0.7193778353856124, + "grad_norm": 2.6819135530863374, + "learning_rate": 7.533268333919865e-07, + "logits/chosen": -2.409003496170044, + "logits/rejected": -2.454528331756592, + "logps/chosen": -23.465421676635742, + "logps/rejected": -350.64312744140625, + "logps_avg/chosen": -0.12644416093826294, + "logps_avg/rejected": -1.873876929283142, + "loss": 0.1264, + "losses_ref": -0.0005657867877744138, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2220, + "u": -1.8251807689666748, + "weight": 0.038431938737630844 + }, + { + "diff_generated": -18.670055389404297, + "epoch": 0.7226182760855476, + "grad_norm": 2.689524904058088, + "learning_rate": 7.526173128226416e-07, + "logits/chosen": -2.3890652656555176, + "logits/rejected": -2.4550390243530273, + "logps/chosen": -24.16277313232422, + "logps/rejected": -330.9000549316406, + "logps_avg/chosen": -0.13710932433605194, + "logps_avg/rejected": -1.8670055866241455, + "loss": 0.1259, + "losses_ref": -0.00020371482241898775, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2230, + "u": -1.8138744831085205, + "weight": 0.04403935372829437 + }, + { + "diff_generated": -19.508420944213867, + "epoch": 0.7258587167854829, + "grad_norm": 2.691809977523224, + "learning_rate": 7.519027791767069e-07, + "logits/chosen": -2.386385679244995, + "logits/rejected": -2.3834471702575684, + "logps/chosen": -25.55209732055664, + "logps/rejected": -366.0524597167969, + "logps_avg/chosen": -0.13459105789661407, + "logps_avg/rejected": -1.9508421421051025, + "loss": 0.1292, + "losses_ref": -0.0003441698499955237, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2240, + "u": -1.8136498928070068, + "weight": 0.04430101439356804 + }, + { + "diff_generated": -19.35506820678711, + "epoch": 0.729099157485418, + "grad_norm": 2.949240428236712, + "learning_rate": 7.511832426125375e-07, + "logits/chosen": -2.434502601623535, + "logits/rejected": -2.4232802391052246, + "logps/chosen": -23.403701782226562, + "logps/rejected": -336.49774169921875, + "logps_avg/chosen": -0.13677889108657837, + "logps_avg/rejected": -1.935506820678711, + "loss": 0.1267, + "losses_ref": -0.0003745288122445345, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2250, + "u": -1.7543065547943115, + "weight": 0.07561810314655304 + }, + { + "diff_generated": -18.26801300048828, + "epoch": 0.7323395981853532, + "grad_norm": 2.9652966972491486, + "learning_rate": 7.504587133596141e-07, + "logits/chosen": -2.4914777278900146, + "logits/rejected": -2.4946160316467285, + "logps/chosen": -22.081167221069336, + "logps/rejected": -318.71551513671875, + "logps_avg/chosen": -0.12174037843942642, + "logps_avg/rejected": -1.8268013000488281, + "loss": 0.1266, + "losses_ref": -0.0004123027320019901, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2260, + "u": -1.7543058395385742, + "weight": 0.0756215900182724 + }, + { + "diff_generated": -19.877885818481445, + "epoch": 0.7355800388852884, + "grad_norm": 2.7188422014185103, + "learning_rate": 7.497292017183965e-07, + "logits/chosen": -2.4941792488098145, + "logits/rejected": -2.523242235183716, + "logps/chosen": -22.369342803955078, + "logps/rejected": -337.4557189941406, + "logps_avg/chosen": -0.1296032965183258, + "logps_avg/rejected": -1.987788438796997, + "loss": 0.1296, + "losses_ref": -0.000514883198775351, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2270, + "u": -1.8608055114746094, + "weight": 0.019619230180978775 + }, + { + "diff_generated": -18.854595184326172, + "epoch": 0.7388204795852236, + "grad_norm": 2.608393200125136, + "learning_rate": 7.489947180601791e-07, + "logits/chosen": -2.4092555046081543, + "logits/rejected": -2.431570529937744, + "logps/chosen": -21.66644287109375, + "logps/rejected": -321.78411865234375, + "logps_avg/chosen": -0.12630699574947357, + "logps_avg/rejected": -1.885459542274475, + "loss": 0.1221, + "losses_ref": -0.0007705268217250705, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2280, + "u": -1.7890784740447998, + "weight": 0.05778322368860245 + }, + { + "diff_generated": -18.43848991394043, + "epoch": 0.7420609202851588, + "grad_norm": 2.858225633134005, + "learning_rate": 7.482552728269412e-07, + "logits/chosen": -2.4584872722625732, + "logits/rejected": -2.4965240955352783, + "logps/chosen": -23.285232543945312, + "logps/rejected": -313.27606201171875, + "logps_avg/chosen": -0.12950658798217773, + "logps_avg/rejected": -1.8438488245010376, + "loss": 0.1243, + "losses_ref": -0.0004050957504659891, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2290, + "u": -1.8017374277114868, + "weight": 0.050617121160030365 + }, + { + "diff_generated": -19.22142219543457, + "epoch": 0.7453013609850939, + "grad_norm": 2.7175854602679155, + "learning_rate": 7.475108765312001e-07, + "logits/chosen": -2.429678201675415, + "logits/rejected": -2.4203219413757324, + "logps/chosen": -22.69501304626465, + "logps/rejected": -315.50128173828125, + "logps_avg/chosen": -0.12515577673912048, + "logps_avg/rejected": -1.9221423864364624, + "loss": 0.1272, + "losses_ref": -0.0004372203256934881, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2300, + "u": -1.777909517288208, + "weight": 0.06324687600135803 + }, + { + "diff_generated": -18.608448028564453, + "epoch": 0.7485418016850292, + "grad_norm": 2.9184389772376202, + "learning_rate": 7.467615397558613e-07, + "logits/chosen": -2.4402008056640625, + "logits/rejected": -2.495391368865967, + "logps/chosen": -22.847639083862305, + "logps/rejected": -340.8104248046875, + "logps_avg/chosen": -0.13191382586956024, + "logps_avg/rejected": -1.8608448505401611, + "loss": 0.1296, + "losses_ref": -0.0003498257137835026, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2310, + "u": -1.7780853509902954, + "weight": 0.0630442276597023 + }, + { + "diff_generated": -18.730741500854492, + "epoch": 0.7517822423849644, + "grad_norm": 2.6569163215979152, + "learning_rate": 7.460072731540676e-07, + "logits/chosen": -2.4269540309906006, + "logits/rejected": -2.4742822647094727, + "logps/chosen": -20.393117904663086, + "logps/rejected": -338.66778564453125, + "logps_avg/chosen": -0.11943888664245605, + "logps_avg/rejected": -1.8730741739273071, + "loss": 0.1238, + "losses_ref": -0.0005994164384901524, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2320, + "u": -1.8012882471084595, + "weight": 0.051135510206222534 + }, + { + "diff_generated": -19.9935359954834, + "epoch": 0.7550226830848995, + "grad_norm": 2.83798592756184, + "learning_rate": 7.452480874490483e-07, + "logits/chosen": -2.4465527534484863, + "logits/rejected": -2.4860479831695557, + "logps/chosen": -21.021358489990234, + "logps/rejected": -345.9947814941406, + "logps_avg/chosen": -0.12317808717489243, + "logps_avg/rejected": -1.9993536472320557, + "loss": 0.1268, + "losses_ref": -0.0003948205558117479, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2330, + "u": -1.7897589206695557, + "weight": 0.05699906870722771 + }, + { + "diff_generated": -18.668241500854492, + "epoch": 0.7582631237848347, + "grad_norm": 2.6863736286433286, + "learning_rate": 7.44483993433966e-07, + "logits/chosen": -2.42374587059021, + "logits/rejected": -2.472597122192383, + "logps/chosen": -18.5694522857666, + "logps/rejected": -326.936767578125, + "logps_avg/chosen": -0.11334365606307983, + "logps_avg/rejected": -1.8668243885040283, + "loss": 0.1272, + "losses_ref": -0.0002126133331330493, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2340, + "u": -1.7901561260223389, + "weight": 0.056544482707977295 + }, + { + "diff_generated": -18.23128890991211, + "epoch": 0.76150356448477, + "grad_norm": 2.904875126249053, + "learning_rate": 7.437150019717641e-07, + "logits/chosen": -2.4157471656799316, + "logits/rejected": -2.490884304046631, + "logps/chosen": -20.360605239868164, + "logps/rejected": -321.0450439453125, + "logps_avg/chosen": -0.11621763557195663, + "logps_avg/rejected": -1.8231290578842163, + "loss": 0.1288, + "losses_ref": -0.00025394646218046546, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2350, + "u": -1.7545111179351807, + "weight": 0.07538101077079773 + }, + { + "diff_generated": -18.002696990966797, + "epoch": 0.7647440051847051, + "grad_norm": 3.9274823374656465, + "learning_rate": 7.429411239950116e-07, + "logits/chosen": -2.4598190784454346, + "logits/rejected": -2.530359983444214, + "logps/chosen": -23.51648712158203, + "logps/rejected": -334.88507080078125, + "logps_avg/chosen": -0.13798591494560242, + "logps_avg/rejected": -1.8002697229385376, + "loss": 0.1314, + "losses_ref": -0.00022468708630185574, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2360, + "u": -1.802004098892212, + "weight": 0.05030521750450134 + }, + { + "diff_generated": -18.326831817626953, + "epoch": 0.7679844458846403, + "grad_norm": 2.53469457896195, + "learning_rate": 7.421623705057477e-07, + "logits/chosen": -2.4758832454681396, + "logits/rejected": -2.449039936065674, + "logps/chosen": -19.222278594970703, + "logps/rejected": -310.03961181640625, + "logps_avg/chosen": -0.1095147579908371, + "logps_avg/rejected": -1.8326833248138428, + "loss": 0.1218, + "losses_ref": -0.0004961226368322968, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2370, + "u": -1.7539732456207275, + "weight": 0.07599438726902008 + }, + { + "diff_generated": -16.816598892211914, + "epoch": 0.7712248865845756, + "grad_norm": 2.984273026869485, + "learning_rate": 7.413787525753261e-07, + "logits/chosen": -2.4309310913085938, + "logits/rejected": -2.494706630706787, + "logps/chosen": -22.003509521484375, + "logps/rejected": -303.1298828125, + "logps_avg/chosen": -0.13067765533924103, + "logps_avg/rejected": -1.6816600561141968, + "loss": 0.13, + "losses_ref": -0.00040191778680309653, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2380, + "u": -1.7305409908294678, + "weight": 0.08817703276872635 + }, + { + "diff_generated": -18.891063690185547, + "epoch": 0.7744653272845107, + "grad_norm": 2.80526700675378, + "learning_rate": 7.405902813442564e-07, + "logits/chosen": -2.4479176998138428, + "logits/rejected": -2.4607460498809814, + "logps/chosen": -19.49251365661621, + "logps/rejected": -311.84820556640625, + "logps_avg/chosen": -0.11860658973455429, + "logps_avg/rejected": -1.8891067504882812, + "loss": 0.1223, + "losses_ref": -0.00031381563167087734, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2390, + "u": -1.813751220703125, + "weight": 0.04418398439884186 + }, + { + "diff_generated": -19.603668212890625, + "epoch": 0.7777057679844459, + "grad_norm": 2.880688772648272, + "learning_rate": 7.39796968022047e-07, + "logits/chosen": -2.4030654430389404, + "logits/rejected": -2.4530603885650635, + "logps/chosen": -20.101438522338867, + "logps/rejected": -326.55462646484375, + "logps_avg/chosen": -0.12463720887899399, + "logps_avg/rejected": -1.9603666067123413, + "loss": 0.1222, + "losses_ref": -0.0004716304247267544, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2400, + "u": -1.8371467590332031, + "weight": 0.03205372765660286 + }, + { + "diff_generated": -18.70380210876465, + "epoch": 0.780946208684381, + "grad_norm": 2.9044256233773917, + "learning_rate": 7.389988238870451e-07, + "logits/chosen": -2.42336106300354, + "logits/rejected": -2.3942418098449707, + "logps/chosen": -24.53568458557129, + "logps/rejected": -307.4025573730469, + "logps_avg/chosen": -0.13119980692863464, + "logps_avg/rejected": -1.870380163192749, + "loss": 0.1279, + "losses_ref": -0.0005647986545227468, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2410, + "u": -1.8251574039459229, + "weight": 0.038459379225969315 + }, + { + "diff_generated": -19.824970245361328, + "epoch": 0.7841866493843163, + "grad_norm": 2.7147095449979846, + "learning_rate": 7.381958602862763e-07, + "logits/chosen": -2.430551528930664, + "logits/rejected": -2.4522652626037598, + "logps/chosen": -23.182209014892578, + "logps/rejected": -343.49114990234375, + "logps_avg/chosen": -0.124916672706604, + "logps_avg/rejected": -1.982496976852417, + "loss": 0.1243, + "losses_ref": -0.00035436113830655813, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2420, + "u": -1.8254915475845337, + "weight": 0.03806937485933304 + }, + { + "diff_generated": -18.005935668945312, + "epoch": 0.7874270900842515, + "grad_norm": 2.8190879408890854, + "learning_rate": 7.373880886352832e-07, + "logits/chosen": -2.4851255416870117, + "logits/rejected": -2.4959912300109863, + "logps/chosen": -25.28909683227539, + "logps/rejected": -329.841064453125, + "logps_avg/chosen": -0.13179995119571686, + "logps_avg/rejected": -1.8005939722061157, + "loss": 0.1253, + "losses_ref": -0.0004605629947036505, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2430, + "u": -1.8253390789031982, + "weight": 0.03824792057275772 + }, + { + "diff_generated": -18.01476287841797, + "epoch": 0.7906675307841866, + "grad_norm": 2.673066237519979, + "learning_rate": 7.365755204179637e-07, + "logits/chosen": -2.3529014587402344, + "logits/rejected": -2.483888626098633, + "logps/chosen": -21.870464324951172, + "logps/rejected": -335.36883544921875, + "logps_avg/chosen": -0.12962986528873444, + "logps_avg/rejected": -1.8014764785766602, + "loss": 0.1267, + "losses_ref": -0.0006634207093156874, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2440, + "u": -1.8130687475204468, + "weight": 0.04497240111231804 + }, + { + "diff_generated": -18.597171783447266, + "epoch": 0.7939079714841218, + "grad_norm": 2.966769139844808, + "learning_rate": 7.357581671864073e-07, + "logits/chosen": -2.393500566482544, + "logits/rejected": -2.484731912612915, + "logps/chosen": -22.13661766052246, + "logps/rejected": -328.3583068847656, + "logps_avg/chosen": -0.13498273491859436, + "logps_avg/rejected": -1.8597170114517212, + "loss": 0.1287, + "losses_ref": -0.000405018130550161, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2450, + "u": -1.7896159887313843, + "weight": 0.057148706167936325 + }, + { + "diff_generated": -17.847610473632812, + "epoch": 0.7971484121840571, + "grad_norm": 2.6948564560211197, + "learning_rate": 7.349360405607303e-07, + "logits/chosen": -2.3624050617218018, + "logits/rejected": -2.451986789703369, + "logps/chosen": -18.373266220092773, + "logps/rejected": -325.03704833984375, + "logps_avg/chosen": -0.10835101455450058, + "logps_avg/rejected": -1.7847610712051392, + "loss": 0.1251, + "losses_ref": -0.0003813363437075168, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2460, + "u": -1.7306047677993774, + "weight": 0.08810532838106155 + }, + { + "diff_generated": -19.39354705810547, + "epoch": 0.8003888528839922, + "grad_norm": 2.570981381506959, + "learning_rate": 7.341091522289122e-07, + "logits/chosen": -2.489992618560791, + "logits/rejected": -2.4882442951202393, + "logps/chosen": -21.162927627563477, + "logps/rejected": -330.13720703125, + "logps_avg/chosen": -0.11893127858638763, + "logps_avg/rejected": -1.939354658126831, + "loss": 0.1232, + "losses_ref": -0.0003837384865619242, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2470, + "u": -1.8136303424835205, + "weight": 0.04432538151741028 + }, + { + "diff_generated": -19.998611450195312, + "epoch": 0.8036292935839274, + "grad_norm": 2.8313005386612504, + "learning_rate": 7.332775139466278e-07, + "logits/chosen": -2.513796806335449, + "logits/rejected": -2.5728158950805664, + "logps/chosen": -20.988895416259766, + "logps/rejected": -350.8438720703125, + "logps_avg/chosen": -0.12285809218883514, + "logps_avg/rejected": -1.9998611211776733, + "loss": 0.1277, + "losses_ref": -0.0004251801874488592, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2480, + "u": -1.8609235286712646, + "weight": 0.019479917362332344 + }, + { + "diff_generated": -17.742862701416016, + "epoch": 0.8068697342838627, + "grad_norm": 2.7325143797161178, + "learning_rate": 7.324411375370809e-07, + "logits/chosen": -2.4243478775024414, + "logits/rejected": -2.478469133377075, + "logps/chosen": -21.79288101196289, + "logps/rejected": -317.43975830078125, + "logps_avg/chosen": -0.12675701081752777, + "logps_avg/rejected": -1.7742862701416016, + "loss": 0.1256, + "losses_ref": -0.0003512470575515181, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2490, + "u": -1.7543703317642212, + "weight": 0.07554493844509125 + }, + { + "diff_generated": -17.564847946166992, + "epoch": 0.8101101749837978, + "grad_norm": 2.7498341190763944, + "learning_rate": 7.316000348908365e-07, + "logits/chosen": -2.465071439743042, + "logits/rejected": -2.5179238319396973, + "logps/chosen": -23.108013153076172, + "logps/rejected": -323.61297607421875, + "logps_avg/chosen": -0.1270761638879776, + "logps_avg/rejected": -1.7564847469329834, + "loss": 0.1262, + "losses_ref": -0.00033712232834659517, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2500, + "u": -1.754339575767517, + "weight": 0.07557834684848785 + }, + { + "diff_generated": -17.836788177490234, + "epoch": 0.813350615683733, + "grad_norm": 2.8566271067083804, + "learning_rate": 7.307542179656511e-07, + "logits/chosen": -2.4499549865722656, + "logits/rejected": -2.4926228523254395, + "logps/chosen": -21.77320098876953, + "logps/rejected": -338.5235290527344, + "logps_avg/chosen": -0.11738236248493195, + "logps_avg/rejected": -1.7836787700653076, + "loss": 0.1248, + "losses_ref": -0.0005360871437005699, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2510, + "u": -1.7539886236190796, + "weight": 0.07598712295293808 + }, + { + "diff_generated": -19.29046058654785, + "epoch": 0.8165910563836681, + "grad_norm": 2.7446895235493387, + "learning_rate": 7.29903698786303e-07, + "logits/chosen": -2.4418554306030273, + "logits/rejected": -2.4206745624542236, + "logps/chosen": -22.280296325683594, + "logps/rejected": -300.85528564453125, + "logps_avg/chosen": -0.12339627742767334, + "logps_avg/rejected": -1.9290460348129272, + "loss": 0.1248, + "losses_ref": -0.0006306341965682805, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2520, + "u": -1.8012568950653076, + "weight": 0.05117018148303032 + }, + { + "diff_generated": -17.447559356689453, + "epoch": 0.8198314970836034, + "grad_norm": 2.7651369105110253, + "learning_rate": 7.290484894444214e-07, + "logits/chosen": -2.3910071849823, + "logits/rejected": -2.4577226638793945, + "logps/chosen": -19.478683471679688, + "logps/rejected": -303.2452087402344, + "logps_avg/chosen": -0.11861952394247055, + "logps_avg/rejected": -1.7447561025619507, + "loss": 0.1221, + "losses_ref": -0.0005060589173808694, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2530, + "u": -1.6829626560211182, + "weight": 0.1133555918931961 + }, + { + "diff_generated": -18.10799789428711, + "epoch": 0.8230719377835386, + "grad_norm": 3.053125479055488, + "learning_rate": 7.281886020983144e-07, + "logits/chosen": -2.4256653785705566, + "logits/rejected": -2.4212918281555176, + "logps/chosen": -23.599895477294922, + "logps/rejected": -292.9998474121094, + "logps_avg/chosen": -0.13201817870140076, + "logps_avg/rejected": -1.8107995986938477, + "loss": 0.1242, + "losses_ref": -0.00020123887225054204, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2540, + "u": -1.8257560729980469, + "weight": 0.037760891020298004 + }, + { + "diff_generated": -17.104455947875977, + "epoch": 0.8263123784834737, + "grad_norm": 3.1444280521819663, + "learning_rate": 7.273240489727963e-07, + "logits/chosen": -2.398249864578247, + "logits/rejected": -2.408512830734253, + "logps/chosen": -22.64269256591797, + "logps/rejected": -290.8938293457031, + "logps_avg/chosen": -0.11623907089233398, + "logps_avg/rejected": -1.7104456424713135, + "loss": 0.122, + "losses_ref": -0.00043374235974624753, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2550, + "u": -1.7661218643188477, + "weight": 0.06941927224397659 + }, + { + "diff_generated": -17.132604598999023, + "epoch": 0.829552819183409, + "grad_norm": 2.8401955072126595, + "learning_rate": 7.264548423590133e-07, + "logits/chosen": -2.4102237224578857, + "logits/rejected": -2.4627585411071777, + "logps/chosen": -21.5040340423584, + "logps/rejected": -293.8085632324219, + "logps_avg/chosen": -0.12476116418838501, + "logps_avg/rejected": -1.7132604122161865, + "loss": 0.1234, + "losses_ref": -0.00025950567214749753, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2560, + "u": -1.7901074886322021, + "weight": 0.05660170316696167 + }, + { + "diff_generated": -18.14133071899414, + "epoch": 0.8327932598833442, + "grad_norm": 3.1965970439975666, + "learning_rate": 7.255809946142695e-07, + "logits/chosen": -2.3983757495880127, + "logits/rejected": -2.409802198410034, + "logps/chosen": -23.022171020507812, + "logps/rejected": -306.9036865234375, + "logps_avg/chosen": -0.12413071095943451, + "logps_avg/rejected": -1.8141329288482666, + "loss": 0.1305, + "losses_ref": -0.0003770699549932033, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2570, + "u": -1.7898855209350586, + "weight": 0.05686057358980179 + }, + { + "diff_generated": -18.652851104736328, + "epoch": 0.8360337005832793, + "grad_norm": 2.8716772820827066, + "learning_rate": 7.247025181618508e-07, + "logits/chosen": -2.4277095794677734, + "logits/rejected": -2.4492273330688477, + "logps/chosen": -22.899845123291016, + "logps/rejected": -318.9675598144531, + "logps_avg/chosen": -0.12769187986850739, + "logps_avg/rejected": -1.8652846813201904, + "loss": 0.1238, + "losses_ref": -0.0003064598422497511, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2580, + "u": -1.82554030418396, + "weight": 0.03801097720861435 + }, + { + "diff_generated": -17.709041595458984, + "epoch": 0.8392741412832145, + "grad_norm": 2.9226486099172098, + "learning_rate": 7.238194254908483e-07, + "logits/chosen": -2.386096477508545, + "logits/rejected": -2.435162305831909, + "logps/chosen": -21.952648162841797, + "logps/rejected": -303.5704345703125, + "logps_avg/chosen": -0.1209806352853775, + "logps_avg/rejected": -1.770904302597046, + "loss": 0.1311, + "losses_ref": -0.0005370815051719546, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2590, + "u": -1.7895606756210327, + "weight": 0.05723338574171066 + }, + { + "diff_generated": -18.281713485717773, + "epoch": 0.8425145819831497, + "grad_norm": 2.639473802718026, + "learning_rate": 7.229317291559807e-07, + "logits/chosen": -2.424675464630127, + "logits/rejected": -2.4955246448516846, + "logps/chosen": -22.622589111328125, + "logps/rejected": -296.0838623046875, + "logps_avg/chosen": -0.13772353529930115, + "logps_avg/rejected": -1.8281714916229248, + "loss": 0.1263, + "losses_ref": -0.0005213414551690221, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2600, + "u": -1.8131945133209229, + "weight": 0.04481424018740654 + }, + { + "diff_generated": -18.075288772583008, + "epoch": 0.8457550226830849, + "grad_norm": 3.2797998071561225, + "learning_rate": 7.22039441777416e-07, + "logits/chosen": -2.3893349170684814, + "logits/rejected": -2.470026731491089, + "logps/chosen": -20.48826026916504, + "logps/rejected": -301.62359619140625, + "logps_avg/chosen": -0.12221121788024902, + "logps_avg/rejected": -1.8075288534164429, + "loss": 0.1282, + "losses_ref": -0.0005865787388756871, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2610, + "u": -1.76569402217865, + "weight": 0.06990896910429001 + }, + { + "diff_generated": -19.311792373657227, + "epoch": 0.8489954633830201, + "grad_norm": 2.7049674814121976, + "learning_rate": 7.21142576040592e-07, + "logits/chosen": -2.4634876251220703, + "logits/rejected": -2.534529447555542, + "logps/chosen": -24.17624282836914, + "logps/rejected": -318.659423828125, + "logps_avg/chosen": -0.14250484108924866, + "logps_avg/rejected": -1.931179404258728, + "loss": 0.1243, + "losses_ref": -0.0004970087902620435, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2620, + "u": -1.8252204656600952, + "weight": 0.038382790982723236 + }, + { + "diff_generated": -19.849037170410156, + "epoch": 0.8522359040829552, + "grad_norm": 3.065765529226203, + "learning_rate": 7.202411446960357e-07, + "logits/chosen": -2.4388208389282227, + "logits/rejected": -2.462719440460205, + "logps/chosen": -24.322383880615234, + "logps/rejected": -326.2467041015625, + "logps_avg/chosen": -0.1309942901134491, + "logps_avg/rejected": -1.9849036931991577, + "loss": 0.1268, + "losses_ref": -0.0006085868226364255, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2630, + "u": -1.8603699207305908, + "weight": 0.020091459155082703 + }, + { + "diff_generated": -18.355745315551758, + "epoch": 0.8554763447828905, + "grad_norm": 2.619018012456487, + "learning_rate": 7.193351605591825e-07, + "logits/chosen": -2.4444096088409424, + "logits/rejected": -2.524109363555908, + "logps/chosen": -20.568607330322266, + "logps/rejected": -314.8489074707031, + "logps_avg/chosen": -0.1212010383605957, + "logps_avg/rejected": -1.8355745077133179, + "loss": 0.1198, + "losses_ref": -0.0005308730178512633, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2640, + "u": -1.789407730102539, + "weight": 0.0573933906853199 + }, + { + "diff_generated": -17.939889907836914, + "epoch": 0.8587167854828257, + "grad_norm": 2.815675633641462, + "learning_rate": 7.184246365101939e-07, + "logits/chosen": -2.48952054977417, + "logits/rejected": -2.46121883392334, + "logps/chosen": -23.912261962890625, + "logps/rejected": -312.788330078125, + "logps_avg/chosen": -0.12124598026275635, + "logps_avg/rejected": -1.793988585472107, + "loss": 0.1254, + "losses_ref": -0.0005175786791369319, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2650, + "u": -1.825222373008728, + "weight": 0.03838255628943443 + }, + { + "diff_generated": -18.960969924926758, + "epoch": 0.8619572261827608, + "grad_norm": 2.7869375126473614, + "learning_rate": 7.175095854937739e-07, + "logits/chosen": -2.4368441104888916, + "logits/rejected": -2.4551730155944824, + "logps/chosen": -21.652315139770508, + "logps/rejected": -331.0331115722656, + "logps_avg/chosen": -0.12064101547002792, + "logps_avg/rejected": -1.8960968255996704, + "loss": 0.1261, + "losses_ref": -0.00044710320071317255, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2660, + "u": -1.837181806564331, + "weight": 0.0320126935839653 + }, + { + "diff_generated": -17.435054779052734, + "epoch": 0.8651976668826961, + "grad_norm": 2.557409320653384, + "learning_rate": 7.165900205189853e-07, + "logits/chosen": -2.4384474754333496, + "logits/rejected": -2.5136446952819824, + "logps/chosen": -20.79629135131836, + "logps/rejected": -318.3617248535156, + "logps_avg/chosen": -0.1220201700925827, + "logps_avg/rejected": -1.7435054779052734, + "loss": 0.1207, + "losses_ref": -0.00030411581974476576, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2670, + "u": -1.7900310754776, + "weight": 0.05669097229838371 + }, + { + "diff_generated": -18.737735748291016, + "epoch": 0.8684381075826313, + "grad_norm": 2.56682824122742, + "learning_rate": 7.156659546590653e-07, + "logits/chosen": -2.3800575733184814, + "logits/rejected": -2.4281136989593506, + "logps/chosen": -19.449628829956055, + "logps/rejected": -336.49591064453125, + "logps_avg/chosen": -0.11861036717891693, + "logps_avg/rejected": -1.8737735748291016, + "loss": 0.1257, + "losses_ref": -0.00045010895701125264, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2680, + "u": -1.813449501991272, + "weight": 0.044532887637615204 + }, + { + "diff_generated": -20.041152954101562, + "epoch": 0.8716785482825664, + "grad_norm": 2.555750310681515, + "learning_rate": 7.147374010512385e-07, + "logits/chosen": -2.340986490249634, + "logits/rejected": -2.3440699577331543, + "logps/chosen": -20.139862060546875, + "logps/rejected": -306.624755859375, + "logps_avg/chosen": -0.11705954372882843, + "logps_avg/rejected": -2.004115104675293, + "loss": 0.1216, + "losses_ref": -0.0003802308929152787, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2690, + "u": -1.7304770946502686, + "weight": 0.08824525773525238 + }, + { + "diff_generated": -18.48080825805664, + "epoch": 0.8749189889825016, + "grad_norm": 2.5824846669144725, + "learning_rate": 7.13804372896531e-07, + "logits/chosen": -2.3439040184020996, + "logits/rejected": -2.4303998947143555, + "logps/chosen": -20.847734451293945, + "logps/rejected": -316.1014404296875, + "logps_avg/chosen": -0.12687210738658905, + "logps_avg/rejected": -1.8480808734893799, + "loss": 0.1246, + "losses_ref": -0.0005675092106685042, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2700, + "u": -1.789607048034668, + "weight": 0.057185959070920944 + }, + { + "diff_generated": -17.809646606445312, + "epoch": 0.8781594296824368, + "grad_norm": 2.81662394572902, + "learning_rate": 7.128668834595827e-07, + "logits/chosen": -2.453523635864258, + "logits/rejected": -2.4558768272399902, + "logps/chosen": -24.155872344970703, + "logps/rejected": -307.4092102050781, + "logps_avg/chosen": -0.1282106637954712, + "logps_avg/rejected": -1.7809646129608154, + "loss": 0.1209, + "losses_ref": -0.000484933378174901, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2710, + "u": -1.7897233963012695, + "weight": 0.05704890564084053 + }, + { + "diff_generated": -19.045629501342773, + "epoch": 0.881399870382372, + "grad_norm": 2.7453290952914093, + "learning_rate": 7.119249460684583e-07, + "logits/chosen": -2.3899433612823486, + "logits/rejected": -2.3870387077331543, + "logps/chosen": -23.397602081298828, + "logps/rejected": -324.876708984375, + "logps_avg/chosen": -0.12680859863758087, + "logps_avg/rejected": -1.9045629501342773, + "loss": 0.1248, + "losses_ref": -0.00038239354034885764, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2720, + "u": -1.8254715204238892, + "weight": 0.03809309005737305 + }, + { + "diff_generated": -17.462833404541016, + "epoch": 0.8846403110823072, + "grad_norm": 2.5421040372025097, + "learning_rate": 7.109785741144577e-07, + "logits/chosen": -2.344905376434326, + "logits/rejected": -2.4533557891845703, + "logps/chosen": -22.297576904296875, + "logps/rejected": -321.774169921875, + "logps_avg/chosen": -0.13205796480178833, + "logps_avg/rejected": -1.7462832927703857, + "loss": 0.1248, + "losses_ref": -0.00043138963519595563, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2730, + "u": -1.8135309219360352, + "weight": 0.04443947225809097 + }, + { + "diff_generated": -18.100156784057617, + "epoch": 0.8878807517822424, + "grad_norm": 2.8009501005613204, + "learning_rate": 7.100277810519264e-07, + "logits/chosen": -2.437243938446045, + "logits/rejected": -2.4590842723846436, + "logps/chosen": -22.2004337310791, + "logps/rejected": -316.9300842285156, + "logps_avg/chosen": -0.1315070539712906, + "logps_avg/rejected": -1.8100156784057617, + "loss": 0.1249, + "losses_ref": -0.0005317996838130057, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2740, + "u": -1.8133379220962524, + "weight": 0.04465905949473381 + }, + { + "diff_generated": -17.46011734008789, + "epoch": 0.8911211924821776, + "grad_norm": 2.689152478337482, + "learning_rate": 7.090725803980633e-07, + "logits/chosen": -2.3765041828155518, + "logits/rejected": -2.4582343101501465, + "logps/chosen": -21.084001541137695, + "logps/rejected": -305.22955322265625, + "logps_avg/chosen": -0.12233324348926544, + "logps_avg/rejected": -1.7460119724273682, + "loss": 0.1289, + "losses_ref": -0.00030663347570225596, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2750, + "u": -1.7663196325302124, + "weight": 0.06918799877166748 + }, + { + "diff_generated": -17.585119247436523, + "epoch": 0.8943616331821128, + "grad_norm": 2.752813540127061, + "learning_rate": 7.081129857327297e-07, + "logits/chosen": -2.4216065406799316, + "logits/rejected": -2.4935240745544434, + "logps/chosen": -20.991987228393555, + "logps/rejected": -300.592041015625, + "logps_avg/chosen": -0.12113398313522339, + "logps_avg/rejected": -1.7585121393203735, + "loss": 0.128, + "losses_ref": -0.00042823137482628226, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2760, + "u": -1.8016542196273804, + "weight": 0.05071256309747696 + }, + { + "diff_generated": -17.064542770385742, + "epoch": 0.8976020738820479, + "grad_norm": 3.0441665586298736, + "learning_rate": 7.071490106982547e-07, + "logits/chosen": -2.417194128036499, + "logits/rejected": -2.45021915435791, + "logps/chosen": -22.914186477661133, + "logps/rejected": -302.40167236328125, + "logps_avg/chosen": -0.12843842804431915, + "logps_avg/rejected": -1.7064542770385742, + "loss": 0.1269, + "losses_ref": -0.00028399689472280443, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2770, + "u": -1.7544708251953125, + "weight": 0.07542793452739716 + }, + { + "diff_generated": -18.65851402282715, + "epoch": 0.9008425145819832, + "grad_norm": 2.7869155962688974, + "learning_rate": 7.061806689992424e-07, + "logits/chosen": -2.3915419578552246, + "logits/rejected": -2.4333293437957764, + "logps/chosen": -22.446435928344727, + "logps/rejected": -309.77508544921875, + "logps_avg/chosen": -0.12451604753732681, + "logps_avg/rejected": -1.865851640701294, + "loss": 0.1234, + "losses_ref": -0.000297236634651199, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2780, + "u": -1.7900089025497437, + "weight": 0.05671622231602669 + }, + { + "diff_generated": -18.07571792602539, + "epoch": 0.9040829552819183, + "grad_norm": 2.6626958452708966, + "learning_rate": 7.052079744023769e-07, + "logits/chosen": -2.5439717769622803, + "logits/rejected": -2.5482470989227295, + "logps/chosen": -25.701541900634766, + "logps/rejected": -313.8611755371094, + "logps_avg/chosen": -0.13316160440444946, + "logps_avg/rejected": -1.8075717687606812, + "loss": 0.1212, + "losses_ref": -0.0006367530440911651, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2790, + "u": -1.8368667364120483, + "weight": 0.032375458627939224 + }, + { + "diff_generated": -18.336414337158203, + "epoch": 0.9073233959818535, + "grad_norm": 2.6099270748234957, + "learning_rate": 7.042309407362264e-07, + "logits/chosen": -2.410386562347412, + "logits/rejected": -2.498459577560425, + "logps/chosen": -20.105188369750977, + "logps/rejected": -320.20513916015625, + "logps_avg/chosen": -0.12039705365896225, + "logps_avg/rejected": -1.833641767501831, + "loss": 0.1228, + "losses_ref": -0.0007827078807167709, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2800, + "u": -1.8128219842910767, + "weight": 0.0452541820704937 + }, + { + "diff_generated": -18.090600967407227, + "epoch": 0.9105638366817888, + "grad_norm": 2.6260436552631856, + "learning_rate": 7.032495818910462e-07, + "logits/chosen": -2.471694231033325, + "logits/rejected": -2.494658946990967, + "logps/chosen": -19.806758880615234, + "logps/rejected": -302.5240478515625, + "logps_avg/chosen": -0.1169753298163414, + "logps_avg/rejected": -1.8090598583221436, + "loss": 0.122, + "losses_ref": -0.00042266439413651824, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2810, + "u": -1.7542486190795898, + "weight": 0.0756872147321701 + }, + { + "diff_generated": -18.19870376586914, + "epoch": 0.9138042773817239, + "grad_norm": 2.78178387937294, + "learning_rate": 7.022639118185819e-07, + "logits/chosen": -2.462240219116211, + "logits/rejected": -2.4601945877075195, + "logps/chosen": -23.47665786743164, + "logps/rejected": -315.44085693359375, + "logps_avg/chosen": -0.12503954768180847, + "logps_avg/rejected": -1.8198707103729248, + "loss": 0.1208, + "losses_ref": -0.00048798826173879206, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2820, + "u": -1.8015903234481812, + "weight": 0.05078895017504692 + }, + { + "diff_generated": -18.751773834228516, + "epoch": 0.9170447180816591, + "grad_norm": 2.5155286172970377, + "learning_rate": 7.012739445318712e-07, + "logits/chosen": -2.4825854301452637, + "logits/rejected": -2.507294178009033, + "logps/chosen": -22.81148910522461, + "logps/rejected": -320.12750244140625, + "logps_avg/chosen": -0.1307867467403412, + "logps_avg/rejected": -1.8751773834228516, + "loss": 0.1206, + "losses_ref": -0.00032793093123473227, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2830, + "u": -1.8255561590194702, + "weight": 0.037995077669620514 + }, + { + "diff_generated": -18.128347396850586, + "epoch": 0.9202851587815943, + "grad_norm": 2.809119024607745, + "learning_rate": 7.002796941050435e-07, + "logits/chosen": -2.4425601959228516, + "logits/rejected": -2.4837136268615723, + "logps/chosen": -21.08615493774414, + "logps/rejected": -314.3614807128906, + "logps_avg/chosen": -0.12382993847131729, + "logps_avg/rejected": -1.8128345012664795, + "loss": 0.1296, + "losses_ref": -0.00028563165687955916, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2840, + "u": -1.7782018184661865, + "weight": 0.06290866434574127 + }, + { + "diff_generated": -20.078081130981445, + "epoch": 0.9235255994815295, + "grad_norm": 2.6148467990309787, + "learning_rate": 6.992811746731213e-07, + "logits/chosen": -2.469362735748291, + "logits/rejected": -2.504396438598633, + "logps/chosen": -23.93619728088379, + "logps/rejected": -332.0574035644531, + "logps_avg/chosen": -0.1358002871274948, + "logps_avg/rejected": -2.007808208465576, + "loss": 0.1255, + "losses_ref": -0.0002602954918984324, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2850, + "u": -1.813787817955017, + "weight": 0.044140513986349106 + }, + { + "diff_generated": -19.35173225402832, + "epoch": 0.9267660401814647, + "grad_norm": 2.82164225700258, + "learning_rate": 6.98278400431818e-07, + "logits/chosen": -2.491528034210205, + "logits/rejected": -2.5512471199035645, + "logps/chosen": -24.169204711914062, + "logps/rejected": -357.38128662109375, + "logps_avg/chosen": -0.13326986134052277, + "logps_avg/rejected": -1.9351732730865479, + "loss": 0.1258, + "losses_ref": -0.00046310765901580453, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2860, + "u": -1.8490760326385498, + "weight": 0.025722075253725052 + }, + { + "diff_generated": -19.551773071289062, + "epoch": 0.9300064808813999, + "grad_norm": 2.6044844599935377, + "learning_rate": 6.972713856373369e-07, + "logits/chosen": -2.474952220916748, + "logits/rejected": -2.5388407707214355, + "logps/chosen": -21.96600341796875, + "logps/rejected": -343.4984436035156, + "logps_avg/chosen": -0.12933237850666046, + "logps_avg/rejected": -1.9551775455474854, + "loss": 0.1232, + "losses_ref": -0.00046262479736469686, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2870, + "u": -1.849073052406311, + "weight": 0.025724787265062332 + }, + { + "diff_generated": -19.89356803894043, + "epoch": 0.933246921581335, + "grad_norm": 2.580690966488642, + "learning_rate": 6.962601446061681e-07, + "logits/chosen": -2.4569146633148193, + "logits/rejected": -2.4288723468780518, + "logps/chosen": -21.41404914855957, + "logps/rejected": -319.9272766113281, + "logps_avg/chosen": -0.11869911849498749, + "logps_avg/rejected": -1.9893567562103271, + "loss": 0.12, + "losses_ref": -0.0005856683710590005, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2880, + "u": -1.8369076251983643, + "weight": 0.03232860192656517 + }, + { + "diff_generated": -19.329910278320312, + "epoch": 0.9364873622812703, + "grad_norm": 2.614494401402133, + "learning_rate": 6.952446917148853e-07, + "logits/chosen": -2.4458394050598145, + "logits/rejected": -2.514772891998291, + "logps/chosen": -22.14116668701172, + "logps/rejected": -359.1854553222656, + "logps_avg/chosen": -0.1248125210404396, + "logps_avg/rejected": -1.9329910278320312, + "loss": 0.1226, + "losses_ref": -0.00042284480878151953, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2890, + "u": -1.8491137027740479, + "weight": 0.025677403435111046 + }, + { + "diff_generated": -18.59128189086914, + "epoch": 0.9397278029812054, + "grad_norm": 2.837551798748602, + "learning_rate": 6.94225041399941e-07, + "logits/chosen": -2.4578464031219482, + "logits/rejected": -2.5545461177825928, + "logps/chosen": -21.286333084106445, + "logps/rejected": -350.63970947265625, + "logps_avg/chosen": -0.12239722162485123, + "logps_avg/rejected": -1.8591279983520508, + "loss": 0.1168, + "losses_ref": -0.0003485087654553354, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2900, + "u": -1.789947271347046, + "weight": 0.05678866058588028 + }, + { + "diff_generated": -19.49415397644043, + "epoch": 0.9429682436811406, + "grad_norm": 2.5579438992340373, + "learning_rate": 6.932012081574615e-07, + "logits/chosen": -2.4774105548858643, + "logits/rejected": -2.502908229827881, + "logps/chosen": -21.979785919189453, + "logps/rejected": -329.8486022949219, + "logps_avg/chosen": -0.12885500490665436, + "logps_avg/rejected": -1.9494152069091797, + "loss": 0.1237, + "losses_ref": -0.00031258963281288743, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2910, + "u": -1.8137260675430298, + "weight": 0.04421340674161911 + }, + { + "diff_generated": -18.232284545898438, + "epoch": 0.9462086843810759, + "grad_norm": 2.6238007530926613, + "learning_rate": 6.921732065430411e-07, + "logits/chosen": -2.4125030040740967, + "logits/rejected": -2.5023012161254883, + "logps/chosen": -18.845687866210938, + "logps/rejected": -336.7342834472656, + "logps_avg/chosen": -0.11462052166461945, + "logps_avg/rejected": -1.8232284784317017, + "loss": 0.1219, + "losses_ref": -0.00029631194774992764, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2920, + "u": -1.7544281482696533, + "weight": 0.07547671347856522 + }, + { + "diff_generated": -19.532318115234375, + "epoch": 0.949449125081011, + "grad_norm": 2.5409270526015506, + "learning_rate": 6.911410511715343e-07, + "logits/chosen": -2.4430928230285645, + "logits/rejected": -2.4564061164855957, + "logps/chosen": -21.696651458740234, + "logps/rejected": -330.7839050292969, + "logps_avg/chosen": -0.11758589744567871, + "logps_avg/rejected": -1.9532318115234375, + "loss": 0.1206, + "losses_ref": -0.00047477110638283193, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2930, + "u": -1.7541587352752686, + "weight": 0.07578897476196289 + }, + { + "diff_generated": -19.280086517333984, + "epoch": 0.9526895657809462, + "grad_norm": 2.8239915570390406, + "learning_rate": 6.901047567168491e-07, + "logits/chosen": -2.4925901889801025, + "logits/rejected": -2.501408338546753, + "logps/chosen": -21.877737045288086, + "logps/rejected": -320.40167236328125, + "logps_avg/chosen": -0.12425835430622101, + "logps_avg/rejected": -1.9280086755752563, + "loss": 0.1235, + "losses_ref": -0.0002563064044807106, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2940, + "u": -1.7545255422592163, + "weight": 0.07536445558071136 + }, + { + "diff_generated": -19.411279678344727, + "epoch": 0.9559300064808814, + "grad_norm": 2.665040621540858, + "learning_rate": 6.890643379117374e-07, + "logits/chosen": -2.46891188621521, + "logits/rejected": -2.480846405029297, + "logps/chosen": -22.25421142578125, + "logps/rejected": -343.4288330078125, + "logps_avg/chosen": -0.12037277221679688, + "logps_avg/rejected": -1.9411280155181885, + "loss": 0.1205, + "losses_ref": -0.000287555914837867, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2950, + "u": -1.790021538734436, + "weight": 0.05670164152979851 + }, + { + "diff_generated": -20.14754867553711, + "epoch": 0.9591704471808166, + "grad_norm": 2.859031779876307, + "learning_rate": 6.880198095475866e-07, + "logits/chosen": -2.4958252906799316, + "logits/rejected": -2.4784576892852783, + "logps/chosen": -26.015636444091797, + "logps/rejected": -353.650390625, + "logps_avg/chosen": -0.13471297919750214, + "logps_avg/rejected": -2.0147547721862793, + "loss": 0.1223, + "losses_ref": -0.0003499361628200859, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2960, + "u": -1.825506567955017, + "weight": 0.038051966577768326 + }, + { + "diff_generated": -18.965621948242188, + "epoch": 0.9624108878807518, + "grad_norm": 2.8676870600784765, + "learning_rate": 6.86971186474208e-07, + "logits/chosen": -2.4640886783599854, + "logits/rejected": -2.4847826957702637, + "logps/chosen": -21.747859954833984, + "logps/rejected": -330.61016845703125, + "logps_avg/chosen": -0.11688639968633652, + "logps_avg/rejected": -1.8965622186660767, + "loss": 0.1266, + "losses_ref": -0.0004715279792435467, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2970, + "u": -1.7897437810897827, + "weight": 0.057025156915187836 + }, + { + "diff_generated": -19.038909912109375, + "epoch": 0.9656513285806869, + "grad_norm": 2.6947451628009302, + "learning_rate": 6.859184835996271e-07, + "logits/chosen": -2.4535346031188965, + "logits/rejected": -2.5355563163757324, + "logps/chosen": -20.323822021484375, + "logps/rejected": -343.13238525390625, + "logps_avg/chosen": -0.1202494278550148, + "logps_avg/rejected": -1.9038912057876587, + "loss": 0.1223, + "losses_ref": -0.0005739832413382828, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2980, + "u": -1.813147783279419, + "weight": 0.04487653821706772 + }, + { + "diff_generated": -22.221275329589844, + "epoch": 0.9688917692806222, + "grad_norm": 2.7201209987510726, + "learning_rate": 6.848617158898704e-07, + "logits/chosen": -2.4459598064422607, + "logits/rejected": -2.4944732189178467, + "logps/chosen": -18.379690170288086, + "logps/rejected": -399.9496765136719, + "logps_avg/chosen": -0.1081472784280777, + "logps_avg/rejected": -2.2221274375915527, + "loss": 0.1167, + "losses_ref": -0.00019583315588533878, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2990, + "u": -1.8138822317123413, + "weight": 0.04402980953454971 + }, + { + "diff_generated": -19.042484283447266, + "epoch": 0.9721322099805574, + "grad_norm": 2.887559738472061, + "learning_rate": 6.838008983687538e-07, + "logits/chosen": -2.420860767364502, + "logits/rejected": -2.479161500930786, + "logps/chosen": -20.06986427307129, + "logps/rejected": -370.847412109375, + "logps_avg/chosen": -0.11698007583618164, + "logps_avg/rejected": -1.9042482376098633, + "loss": 0.1213, + "losses_ref": -0.0003117799642495811, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3000, + "u": -1.825399398803711, + "weight": 0.03816502168774605 + }, + { + "diff_generated": -18.925434112548828, + "epoch": 0.9753726506804925, + "grad_norm": 2.685837856043556, + "learning_rate": 6.827360461176675e-07, + "logits/chosen": -2.4443156719207764, + "logits/rejected": -2.495382785797119, + "logps/chosen": -22.687870025634766, + "logps/rejected": -352.8595886230469, + "logps_avg/chosen": -0.12558838725090027, + "logps_avg/rejected": -1.8925431966781616, + "loss": 0.126, + "losses_ref": -0.00045695697190240026, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3010, + "u": -1.8014158010482788, + "weight": 0.05096786096692085 + }, + { + "diff_generated": -20.009876251220703, + "epoch": 0.9786130913804277, + "grad_norm": 2.559954495098906, + "learning_rate": 6.816671742753636e-07, + "logits/chosen": -2.4376111030578613, + "logits/rejected": -2.4606010913848877, + "logps/chosen": -22.668743133544922, + "logps/rejected": -343.001953125, + "logps_avg/chosen": -0.13240326941013336, + "logps_avg/rejected": -2.0009875297546387, + "loss": 0.1196, + "losses_ref": -0.00020331182167865336, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3020, + "u": -1.7545816898345947, + "weight": 0.07529846578836441 + }, + { + "diff_generated": -20.478519439697266, + "epoch": 0.981853532080363, + "grad_norm": 2.7208132567620624, + "learning_rate": 6.80594298037739e-07, + "logits/chosen": -2.4369359016418457, + "logits/rejected": -2.4600374698638916, + "logps/chosen": -20.999563217163086, + "logps/rejected": -346.7327880859375, + "logps_avg/chosen": -0.12476013600826263, + "logps_avg/rejected": -2.047852039337158, + "loss": 0.1243, + "losses_ref": -0.00038672907976433635, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3030, + "u": -1.8017244338989258, + "weight": 0.05063174292445183 + }, + { + "diff_generated": -20.338376998901367, + "epoch": 0.9850939727802981, + "grad_norm": 2.5661099093643, + "learning_rate": 6.795174326576201e-07, + "logits/chosen": -2.5011813640594482, + "logits/rejected": -2.517927646636963, + "logps/chosen": -22.143497467041016, + "logps/rejected": -354.784423828125, + "logps_avg/chosen": -0.12776055932044983, + "logps_avg/rejected": -2.0338377952575684, + "loss": 0.122, + "losses_ref": -0.00024083026801235974, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3040, + "u": -1.8375473022460938, + "weight": 0.031587857753038406 + }, + { + "diff_generated": -20.159313201904297, + "epoch": 0.9883344134802333, + "grad_norm": 2.5998402846143156, + "learning_rate": 6.784365934445467e-07, + "logits/chosen": -2.397493839263916, + "logits/rejected": -2.485623598098755, + "logps/chosen": -20.126605987548828, + "logps/rejected": -366.147216796875, + "logps_avg/chosen": -0.11706575006246567, + "logps_avg/rejected": -2.0159313678741455, + "loss": 0.1218, + "losses_ref": -0.0002532999496906996, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3050, + "u": -1.7782090902328491, + "weight": 0.06289789080619812 + }, + { + "diff_generated": -20.2012882232666, + "epoch": 0.9915748541801686, + "grad_norm": 2.932890060620594, + "learning_rate": 6.77351795764553e-07, + "logits/chosen": -2.498687505722046, + "logits/rejected": -2.518296003341675, + "logps/chosen": -21.37833595275879, + "logps/rejected": -342.5185852050781, + "logps_avg/chosen": -0.11720434576272964, + "logps_avg/rejected": -2.0201287269592285, + "loss": 0.1245, + "losses_ref": -0.0004471830034162849, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3060, + "u": -1.8253414630889893, + "weight": 0.03824517875909805 + }, + { + "diff_generated": -21.252620697021484, + "epoch": 0.9948152948801037, + "grad_norm": 2.6432494995363283, + "learning_rate": 6.7626305503995e-07, + "logits/chosen": -2.4260194301605225, + "logits/rejected": -2.465452194213867, + "logps/chosen": -21.87104034423828, + "logps/rejected": -357.416015625, + "logps_avg/chosen": -0.12750808894634247, + "logps_avg/rejected": -2.1252620220184326, + "loss": 0.1202, + "losses_ref": -0.00040028925286605954, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3070, + "u": -1.8135614395141602, + "weight": 0.044404350221157074 + }, + { + "diff_generated": -20.740575790405273, + "epoch": 0.9980557355800389, + "grad_norm": 2.695896336401798, + "learning_rate": 6.75170386749106e-07, + "logits/chosen": -2.4324469566345215, + "logits/rejected": -2.488874912261963, + "logps/chosen": -23.11557388305664, + "logps/rejected": -379.96612548828125, + "logps_avg/chosen": -0.12699179351329803, + "logps_avg/rejected": -2.0740573406219482, + "loss": 0.1191, + "losses_ref": -0.0006312219775281847, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3080, + "u": -1.884240746498108, + "weight": 0.007436756044626236 + }, + { + "diff_generated": -19.466602325439453, + "epoch": 1.0012961762799741, + "grad_norm": 2.7553187871328, + "learning_rate": 6.740738064262265e-07, + "logits/chosen": -2.4823808670043945, + "logits/rejected": -2.5409321784973145, + "logps/chosen": -19.86790657043457, + "logps/rejected": -359.1689453125, + "logps_avg/chosen": -0.11252466589212418, + "logps_avg/rejected": -1.946660041809082, + "loss": 0.1121, + "losses_ref": -0.0014723313506692648, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3090, + "u": -2.5461132526397705, + "weight": 0.0461733303964138 + }, + { + "diff_generated": -20.70392417907715, + "epoch": 1.0045366169799093, + "grad_norm": 2.6373592590329578, + "learning_rate": 6.729733296611336e-07, + "logits/chosen": -2.4947752952575684, + "logits/rejected": -2.5320487022399902, + "logps/chosen": -17.02425765991211, + "logps/rejected": -360.64306640625, + "logps_avg/chosen": -0.09909389168024063, + "logps_avg/rejected": -2.070392370223999, + "loss": 0.0994, + "losses_ref": -0.0014184715691953897, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3100, + "u": -3.6951279640197754, + "weight": 0.027127737179398537 + }, + { + "diff_generated": -22.544784545898438, + "epoch": 1.0077770576798444, + "grad_norm": 2.612035515483608, + "learning_rate": 6.718689720990442e-07, + "logits/chosen": -2.457213878631592, + "logits/rejected": -2.4981653690338135, + "logps/chosen": -17.546300888061523, + "logps/rejected": -378.4374694824219, + "logps_avg/chosen": -0.10462252795696259, + "logps_avg/rejected": -2.2544784545898438, + "loss": 0.1007, + "losses_ref": -0.001810407848097384, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3110, + "u": -3.5930373668670654, + "weight": 0.05361621454358101 + }, + { + "diff_generated": -22.213926315307617, + "epoch": 1.0110174983797797, + "grad_norm": 2.6028609758829067, + "learning_rate": 6.707607494403471e-07, + "logits/chosen": -2.4518966674804688, + "logits/rejected": -2.48047137260437, + "logps/chosen": -16.816543579101562, + "logps/rejected": -366.20770263671875, + "logps_avg/chosen": -0.09637521207332611, + "logps_avg/rejected": -2.221392869949341, + "loss": 0.1, + "losses_ref": -0.0008464438142254949, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3120, + "u": -3.5740058422088623, + "weight": 0.05738803744316101 + }, + { + "diff_generated": -23.24093246459961, + "epoch": 1.0142579390797148, + "grad_norm": 2.7720360808935234, + "learning_rate": 6.696486774403812e-07, + "logits/chosen": -2.4274613857269287, + "logits/rejected": -2.472154140472412, + "logps/chosen": -18.64761734008789, + "logps/rejected": -397.8034973144531, + "logps_avg/chosen": -0.10990948975086212, + "logps_avg/rejected": -2.3240933418273926, + "loss": 0.1029, + "losses_ref": -0.0008102835854515433, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3130, + "u": -3.6011669635772705, + "weight": 0.05116075277328491 + }, + { + "diff_generated": -21.840497970581055, + "epoch": 1.01749837977965, + "grad_norm": 2.8145720544967787, + "learning_rate": 6.685327719092096e-07, + "logits/chosen": -2.383756399154663, + "logits/rejected": -2.4836862087249756, + "logps/chosen": -14.916833877563477, + "logps/rejected": -383.84881591796875, + "logps_avg/chosen": -0.09246564656496048, + "logps_avg/rejected": -2.184049606323242, + "loss": 0.1024, + "losses_ref": -0.0007215240621007979, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3140, + "u": -3.503340482711792, + "weight": 0.07616675645112991 + }, + { + "diff_generated": -22.751646041870117, + "epoch": 1.0207388204795853, + "grad_norm": 2.537710280076668, + "learning_rate": 6.674130487113962e-07, + "logits/chosen": -2.4844613075256348, + "logits/rejected": -2.4696061611175537, + "logps/chosen": -19.461294174194336, + "logps/rejected": -396.66644287109375, + "logps_avg/chosen": -0.1096779853105545, + "logps_avg/rejected": -2.275164842605591, + "loss": 0.1064, + "losses_ref": -0.0012650018325075507, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3150, + "u": -3.6209092140197754, + "weight": 0.04566306993365288 + }, + { + "diff_generated": -23.81186866760254, + "epoch": 1.0239792611795204, + "grad_norm": 2.6188946979483423, + "learning_rate": 6.662895237657799e-07, + "logits/chosen": -2.495620012283325, + "logits/rejected": -2.4823803901672363, + "logps/chosen": -17.750776290893555, + "logps/rejected": -385.9418640136719, + "logps_avg/chosen": -0.10011390596628189, + "logps_avg/rejected": -2.3811867237091064, + "loss": 0.1028, + "losses_ref": -0.0011827899143099785, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3160, + "u": -3.694988250732422, + "weight": 0.026975449174642563 + }, + { + "diff_generated": -20.148897171020508, + "epoch": 1.0272197018794555, + "grad_norm": 2.5582619440911447, + "learning_rate": 6.651622130452481e-07, + "logits/chosen": -2.43251371383667, + "logits/rejected": -2.451153516769409, + "logps/chosen": -20.973651885986328, + "logps/rejected": -354.74237060546875, + "logps_avg/chosen": -0.10991770029067993, + "logps_avg/rejected": -2.0148894786834717, + "loss": 0.1007, + "losses_ref": -0.0011846128618344665, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3170, + "u": -3.530184268951416, + "weight": 0.07041925191879272 + }, + { + "diff_generated": -23.65302848815918, + "epoch": 1.030460142579391, + "grad_norm": 2.547304600356209, + "learning_rate": 6.640311325765096e-07, + "logits/chosen": -2.4048352241516113, + "logits/rejected": -2.462200880050659, + "logps/chosen": -17.424785614013672, + "logps/rejected": -404.6559143066406, + "logps_avg/chosen": -0.10332882404327393, + "logps_avg/rejected": -2.365302801132202, + "loss": 0.103, + "losses_ref": -0.00218791700899601, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3180, + "u": -3.5986220836639404, + "weight": 0.053771477192640305 + }, + { + "diff_generated": -23.062740325927734, + "epoch": 1.033700583279326, + "grad_norm": 2.911656332092707, + "learning_rate": 6.628962984398663e-07, + "logits/chosen": -2.444711208343506, + "logits/rejected": -2.468648910522461, + "logps/chosen": -17.87255859375, + "logps/rejected": -410.46258544921875, + "logps_avg/chosen": -0.10444997251033783, + "logps_avg/rejected": -2.306273937225342, + "loss": 0.102, + "losses_ref": -0.002171388128772378, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3190, + "u": -3.617579698562622, + "weight": 0.047922343015670776 + }, + { + "diff_generated": -23.605072021484375, + "epoch": 1.0369410239792611, + "grad_norm": 3.169648553209789, + "learning_rate": 6.617577267689863e-07, + "logits/chosen": -2.4097084999084473, + "logits/rejected": -2.4340286254882812, + "logps/chosen": -17.338579177856445, + "logps/rejected": -428.2044372558594, + "logps_avg/chosen": -0.10172301530838013, + "logps_avg/rejected": -2.360507011413574, + "loss": 0.1027, + "losses_ref": -0.0015846488531678915, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3200, + "u": -3.598299741744995, + "weight": 0.05309338495135307 + }, + { + "diff_generated": -22.836376190185547, + "epoch": 1.0401814646791965, + "grad_norm": 2.6924560678811087, + "learning_rate": 6.606154337506721e-07, + "logits/chosen": -2.45587420463562, + "logits/rejected": -2.441114902496338, + "logps/chosen": -21.14226722717285, + "logps/rejected": -389.21380615234375, + "logps_avg/chosen": -0.10923053324222565, + "logps_avg/rejected": -2.283637523651123, + "loss": 0.1031, + "losses_ref": -0.0016331791412085295, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3210, + "u": -3.598700761795044, + "weight": 0.05264229327440262 + }, + { + "diff_generated": -23.247394561767578, + "epoch": 1.0434219053791316, + "grad_norm": 2.482087757645902, + "learning_rate": 6.594694356246325e-07, + "logits/chosen": -2.4460015296936035, + "logits/rejected": -2.3826279640197754, + "logps/chosen": -19.749439239501953, + "logps/rejected": -404.3794250488281, + "logps_avg/chosen": -0.10343378782272339, + "logps_avg/rejected": -2.324739456176758, + "loss": 0.1008, + "losses_ref": -0.0012167459353804588, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3220, + "u": -3.483008623123169, + "weight": 0.08297277987003326 + }, + { + "diff_generated": -23.389196395874023, + "epoch": 1.0466623460790667, + "grad_norm": 2.6984683560319453, + "learning_rate": 6.583197486832506e-07, + "logits/chosen": -2.434844493865967, + "logits/rejected": -2.393265724182129, + "logps/chosen": -17.93399429321289, + "logps/rejected": -400.63348388671875, + "logps_avg/chosen": -0.09716083109378815, + "logps_avg/rejected": -2.3389194011688232, + "loss": 0.1032, + "losses_ref": -0.001094849780201912, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3230, + "u": -3.4998157024383545, + "weight": 0.07667265087366104 + }, + { + "diff_generated": -24.4296932220459, + "epoch": 1.0499027867790018, + "grad_norm": 3.0013555479217744, + "learning_rate": 6.571663892713527e-07, + "logits/chosen": -2.432621479034424, + "logits/rejected": -2.430283784866333, + "logps/chosen": -18.3193359375, + "logps/rejected": -403.04559326171875, + "logps_avg/chosen": -0.10598043352365494, + "logps_avg/rejected": -2.442969560623169, + "loss": 0.0992, + "losses_ref": -0.0009507110225968063, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3240, + "u": -3.6481566429138184, + "weight": 0.038829002529382706 + }, + { + "diff_generated": -23.248714447021484, + "epoch": 1.0531432274789372, + "grad_norm": 2.4764423351374614, + "learning_rate": 6.560093737859755e-07, + "logits/chosen": -2.4444186687469482, + "logits/rejected": -2.3516175746917725, + "logps/chosen": -18.378459930419922, + "logps/rejected": -367.12188720703125, + "logps_avg/chosen": -0.1013893112540245, + "logps_avg/rejected": -2.32487154006958, + "loss": 0.101, + "losses_ref": -0.000590079347603023, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3250, + "u": -3.4993865489959717, + "weight": 0.07574503868818283 + }, + { + "diff_generated": -23.602413177490234, + "epoch": 1.0563836681788723, + "grad_norm": 2.6191646456197426, + "learning_rate": 6.548487186761334e-07, + "logits/chosen": -2.430243968963623, + "logits/rejected": -2.46002459526062, + "logps/chosen": -17.586816787719727, + "logps/rejected": -410.814453125, + "logps_avg/chosen": -0.10161665827035904, + "logps_avg/rejected": -2.360241413116455, + "loss": 0.1031, + "losses_ref": -0.0013376142596825957, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3260, + "u": -3.6159908771514893, + "weight": 0.04571908712387085 + }, + { + "diff_generated": -24.22919273376465, + "epoch": 1.0596241088788074, + "grad_norm": 2.5647852276867207, + "learning_rate": 6.536844404425845e-07, + "logits/chosen": -2.4303784370422363, + "logits/rejected": -2.4501612186431885, + "logps/chosen": -17.670944213867188, + "logps/rejected": -422.8164978027344, + "logps_avg/chosen": -0.09900084882974625, + "logps_avg/rejected": -2.4229190349578857, + "loss": 0.0999, + "losses_ref": -0.0014200543519109488, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3270, + "u": -3.547405958175659, + "weight": 0.06507633626461029 + }, + { + "diff_generated": -24.190811157226562, + "epoch": 1.0628645495787428, + "grad_norm": 2.6807521594131627, + "learning_rate": 6.525165556375959e-07, + "logits/chosen": -2.3963959217071533, + "logits/rejected": -2.445218324661255, + "logps/chosen": -16.344669342041016, + "logps/rejected": -413.4169006347656, + "logps_avg/chosen": -0.09862269461154938, + "logps_avg/rejected": -2.419081211090088, + "loss": 0.1016, + "losses_ref": -0.001218282151967287, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3280, + "u": -3.5981674194335938, + "weight": 0.051731787621974945 + }, + { + "diff_generated": -25.94841957092285, + "epoch": 1.0661049902786779, + "grad_norm": 2.564309940524817, + "learning_rate": 6.513450808647086e-07, + "logits/chosen": -2.3707902431488037, + "logits/rejected": -2.3697311878204346, + "logps/chosen": -19.391185760498047, + "logps/rejected": -457.5367736816406, + "logps_avg/chosen": -0.10474257171154022, + "logps_avg/rejected": -2.5948421955108643, + "loss": 0.1038, + "losses_ref": -0.0006827990291640162, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3290, + "u": -3.6699092388153076, + "weight": 0.032251156866550446 + }, + { + "diff_generated": -24.062957763671875, + "epoch": 1.069345430978613, + "grad_norm": 2.460241055536533, + "learning_rate": 6.501700327785011e-07, + "logits/chosen": -2.46614408493042, + "logits/rejected": -2.426342487335205, + "logps/chosen": -17.56804656982422, + "logps/rejected": -433.91009521484375, + "logps_avg/chosen": -0.09171821922063828, + "logps_avg/rejected": -2.4062960147857666, + "loss": 0.1013, + "losses_ref": -0.0011504015419632196, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3300, + "u": -3.5259463787078857, + "weight": 0.07042489945888519 + }, + { + "diff_generated": -24.045822143554688, + "epoch": 1.0725858716785484, + "grad_norm": 2.7323088636588135, + "learning_rate": 6.489914280843528e-07, + "logits/chosen": -2.449624538421631, + "logits/rejected": -2.3936076164245605, + "logps/chosen": -19.183971405029297, + "logps/rejected": -411.39794921875, + "logps_avg/chosen": -0.10804645717144012, + "logps_avg/rejected": -2.4045822620391846, + "loss": 0.1026, + "losses_ref": -0.0014338415348902345, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3310, + "u": -3.5029220581054688, + "weight": 0.07727103680372238 + }, + { + "diff_generated": -24.791278839111328, + "epoch": 1.0758263123784835, + "grad_norm": 2.6200584841102064, + "learning_rate": 6.478092835382071e-07, + "logits/chosen": -2.4237277507781982, + "logits/rejected": -2.404797315597534, + "logps/chosen": -19.5428409576416, + "logps/rejected": -424.99224853515625, + "logps_avg/chosen": -0.10355500131845474, + "logps_avg/rejected": -2.479128122329712, + "loss": 0.1005, + "losses_ref": -0.0005835418705828488, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3320, + "u": -3.6703476905822754, + "weight": 0.03202248364686966 + }, + { + "diff_generated": -24.558528900146484, + "epoch": 1.0790667530784186, + "grad_norm": 2.407433322048499, + "learning_rate": 6.466236159463319e-07, + "logits/chosen": -2.4131906032562256, + "logits/rejected": -2.4160373210906982, + "logps/chosen": -17.557640075683594, + "logps/rejected": -444.14642333984375, + "logps_avg/chosen": -0.0994739979505539, + "logps_avg/rejected": -2.45585298538208, + "loss": 0.1003, + "losses_ref": -0.0005983190494589508, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3330, + "u": -3.6192123889923096, + "weight": 0.04453558474779129 + }, + { + "diff_generated": -26.1569881439209, + "epoch": 1.082307193778354, + "grad_norm": 2.6781150132399545, + "learning_rate": 6.45434442165082e-07, + "logits/chosen": -2.433701753616333, + "logits/rejected": -2.446474075317383, + "logps/chosen": -18.218320846557617, + "logps/rejected": -465.1985778808594, + "logps_avg/chosen": -0.10705505311489105, + "logps_avg/rejected": -2.6156985759735107, + "loss": 0.1041, + "losses_ref": -0.00022716677631251514, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3340, + "u": -3.717374086380005, + "weight": 0.019019024446606636 + }, + { + "diff_generated": -23.62293815612793, + "epoch": 1.085547634478289, + "grad_norm": 2.7613908310267683, + "learning_rate": 6.442417791006585e-07, + "logits/chosen": -2.438436985015869, + "logits/rejected": -2.4464330673217773, + "logps/chosen": -17.62456512451172, + "logps/rejected": -413.90838623046875, + "logps_avg/chosen": -0.09845836460590363, + "logps_avg/rejected": -2.3622939586639404, + "loss": 0.1031, + "losses_ref": -0.0006310438038781285, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3350, + "u": -3.567746639251709, + "weight": 0.05706679821014404 + }, + { + "diff_generated": -25.488025665283203, + "epoch": 1.0887880751782242, + "grad_norm": 2.9149257650274767, + "learning_rate": 6.43045643708869e-07, + "logits/chosen": -2.42579984664917, + "logits/rejected": -2.3910865783691406, + "logps/chosen": -18.47863006591797, + "logps/rejected": -431.6898498535156, + "logps_avg/chosen": -0.09963358938694, + "logps_avg/rejected": -2.5488028526306152, + "loss": 0.1054, + "losses_ref": -0.0014607172925025225, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3360, + "u": -3.5925323963165283, + "weight": 0.052489422261714935 + }, + { + "diff_generated": -24.272735595703125, + "epoch": 1.0920285158781595, + "grad_norm": 2.784142824615618, + "learning_rate": 6.418460529948861e-07, + "logits/chosen": -2.409794330596924, + "logits/rejected": -2.445279598236084, + "logps/chosen": -15.560731887817383, + "logps/rejected": -420.5341796875, + "logps_avg/chosen": -0.09469757974147797, + "logps_avg/rejected": -2.4272732734680176, + "loss": 0.1028, + "losses_ref": -0.0006133883143775165, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3370, + "u": -3.4843497276306152, + "weight": 0.08204265683889389 + }, + { + "diff_generated": -24.337230682373047, + "epoch": 1.0952689565780946, + "grad_norm": 2.862517085263023, + "learning_rate": 6.406430240130064e-07, + "logits/chosen": -2.401892900466919, + "logits/rejected": -2.421947956085205, + "logps/chosen": -18.046703338623047, + "logps/rejected": -440.37841796875, + "logps_avg/chosen": -0.10726320743560791, + "logps_avg/rejected": -2.433722972869873, + "loss": 0.1027, + "losses_ref": -0.0007345007034018636, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3380, + "u": -3.5500590801239014, + "weight": 0.06357249617576599 + }, + { + "diff_generated": -24.39935302734375, + "epoch": 1.0985093972780298, + "grad_norm": 2.4164264455871005, + "learning_rate": 6.39436573866407e-07, + "logits/chosen": -2.433347702026367, + "logits/rejected": -2.4411110877990723, + "logps/chosen": -19.208223342895508, + "logps/rejected": -429.08551025390625, + "logps_avg/chosen": -0.11225831508636475, + "logps_avg/rejected": -2.4399352073669434, + "loss": 0.1042, + "losses_ref": -0.0008622838067822158, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3390, + "u": -3.5989012718200684, + "weight": 0.05120759457349777 + }, + { + "diff_generated": -24.014942169189453, + "epoch": 1.101749837977965, + "grad_norm": 2.845834142850375, + "learning_rate": 6.38226719706903e-07, + "logits/chosen": -2.405179500579834, + "logits/rejected": -2.427467107772827, + "logps/chosen": -17.116865158081055, + "logps/rejected": -419.35650634765625, + "logps_avg/chosen": -0.09478393942117691, + "logps_avg/rejected": -2.401494026184082, + "loss": 0.1038, + "losses_ref": -0.0004464842495508492, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3400, + "u": -3.5061697959899902, + "weight": 0.07553113251924515 + }, + { + "diff_generated": -23.331575393676758, + "epoch": 1.1049902786779002, + "grad_norm": 2.561300325422103, + "learning_rate": 6.370134787347039e-07, + "logits/chosen": -2.4273059368133545, + "logits/rejected": -2.436058521270752, + "logps/chosen": -17.860082626342773, + "logps/rejected": -430.041748046875, + "logps_avg/chosen": -0.09594331681728363, + "logps_avg/rejected": -2.333157539367676, + "loss": 0.0992, + "losses_ref": -0.0009888919303193688, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3410, + "u": -3.5523743629455566, + "weight": 0.06385111808776855 + }, + { + "diff_generated": -23.73761558532715, + "epoch": 1.1082307193778353, + "grad_norm": 2.7184926559084523, + "learning_rate": 6.357968681981683e-07, + "logits/chosen": -2.386045217514038, + "logits/rejected": -2.3527228832244873, + "logps/chosen": -19.80392074584961, + "logps/rejected": -416.7049255371094, + "logps_avg/chosen": -0.1074841246008873, + "logps_avg/rejected": -2.3737616539001465, + "loss": 0.1027, + "losses_ref": -0.0009749646415002644, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3420, + "u": -3.549647808074951, + "weight": 0.06383351981639862 + }, + { + "diff_generated": -23.26014518737793, + "epoch": 1.1114711600777705, + "grad_norm": 2.550522174520947, + "learning_rate": 6.345769053935595e-07, + "logits/chosen": -2.39861798286438, + "logits/rejected": -2.437791109085083, + "logps/chosen": -14.868988037109375, + "logps/rejected": -423.1236877441406, + "logps_avg/chosen": -0.08718402683734894, + "logps_avg/rejected": -2.326014757156372, + "loss": 0.0988, + "losses_ref": -0.0009409437188878655, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3430, + "u": -3.527477264404297, + "weight": 0.07046804577112198 + }, + { + "diff_generated": -26.601919174194336, + "epoch": 1.1147116007777058, + "grad_norm": 2.840256866062134, + "learning_rate": 6.333536076647985e-07, + "logits/chosen": -2.350879192352295, + "logits/rejected": -2.3992514610290527, + "logps/chosen": -17.582849502563477, + "logps/rejected": -481.417236328125, + "logps_avg/chosen": -0.10916352272033691, + "logps_avg/rejected": -2.6601920127868652, + "loss": 0.1021, + "losses_ref": -0.0011694144923239946, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3440, + "u": -3.5961785316467285, + "weight": 0.051804203540086746 + }, + { + "diff_generated": -24.66084098815918, + "epoch": 1.117952041477641, + "grad_norm": 3.0778330431175824, + "learning_rate": 6.321269924032188e-07, + "logits/chosen": -2.388843059539795, + "logits/rejected": -2.358248472213745, + "logps/chosen": -21.2260684967041, + "logps/rejected": -451.9033203125, + "logps_avg/chosen": -0.11745420843362808, + "logps_avg/rejected": -2.4660842418670654, + "loss": 0.1055, + "losses_ref": -0.0006822725990787148, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3450, + "u": -3.6240851879119873, + "weight": 0.04461303725838661 + }, + { + "diff_generated": -24.651397705078125, + "epoch": 1.121192482177576, + "grad_norm": 4.340111621801555, + "learning_rate": 6.308970770473184e-07, + "logits/chosen": -2.3606228828430176, + "logits/rejected": -2.352263927459717, + "logps/chosen": -18.791608810424805, + "logps/rejected": -407.0849609375, + "logps_avg/chosen": -0.10350509732961655, + "logps_avg/rejected": -2.465139865875244, + "loss": 0.1039, + "losses_ref": -0.0009811132913455367, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3460, + "u": -3.693281650543213, + "weight": 0.026510203257203102 + }, + { + "diff_generated": -24.678524017333984, + "epoch": 1.1244329228775114, + "grad_norm": 2.7093578852340716, + "learning_rate": 6.296638790825117e-07, + "logits/chosen": -2.418677806854248, + "logits/rejected": -2.3865280151367188, + "logps/chosen": -18.040279388427734, + "logps/rejected": -437.81591796875, + "logps_avg/chosen": -0.10123646259307861, + "logps_avg/rejected": -2.467852830886841, + "loss": 0.1022, + "losses_ref": -0.0017416279297322035, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3470, + "u": -3.615842342376709, + "weight": 0.04664891958236694 + }, + { + "diff_generated": -25.569656372070312, + "epoch": 1.1276733635774465, + "grad_norm": 2.908672299139381, + "learning_rate": 6.284274160408812e-07, + "logits/chosen": -2.40313720703125, + "logits/rejected": -2.4013543128967285, + "logps/chosen": -16.56866455078125, + "logps/rejected": -483.9769592285156, + "logps_avg/chosen": -0.09553743898868561, + "logps_avg/rejected": -2.5569653511047363, + "loss": 0.1003, + "losses_ref": -0.0011046285508200526, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3480, + "u": -3.4805283546447754, + "weight": 0.08319718390703201 + }, + { + "diff_generated": -23.765193939208984, + "epoch": 1.1309138042773816, + "grad_norm": 2.96216221046641, + "learning_rate": 6.271877055009284e-07, + "logits/chosen": -2.4011592864990234, + "logits/rejected": -2.4146828651428223, + "logps/chosen": -18.33474349975586, + "logps/rejected": -423.14990234375, + "logps_avg/chosen": -0.10465750843286514, + "logps_avg/rejected": -2.3765194416046143, + "loss": 0.1043, + "losses_ref": -0.0013917352771386504, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3490, + "u": -3.595651626586914, + "weight": 0.052030790597200394 + }, + { + "diff_generated": -25.037397384643555, + "epoch": 1.134154244977317, + "grad_norm": 2.630923168673329, + "learning_rate": 6.259447650873236e-07, + "logits/chosen": -2.4729576110839844, + "logits/rejected": -2.468740224838257, + "logps/chosen": -16.419998168945312, + "logps/rejected": -451.05816650390625, + "logps_avg/chosen": -0.09578843414783478, + "logps_avg/rejected": -2.503739833831787, + "loss": 0.105, + "losses_ref": -0.0009075348498299718, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3500, + "u": -3.5995700359344482, + "weight": 0.05147087574005127 + }, + { + "diff_generated": -23.72597885131836, + "epoch": 1.137394685677252, + "grad_norm": 2.6716976448304885, + "learning_rate": 6.246986124706555e-07, + "logits/chosen": -2.3940160274505615, + "logits/rejected": -2.427333116531372, + "logps/chosen": -19.011890411376953, + "logps/rejected": -453.88067626953125, + "logps_avg/chosen": -0.10743583738803864, + "logps_avg/rejected": -2.372598171234131, + "loss": 0.1028, + "losses_ref": -0.0011909648310393095, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3510, + "u": -3.5484108924865723, + "weight": 0.06427115201950073 + }, + { + "diff_generated": -24.411705017089844, + "epoch": 1.1406351263771872, + "grad_norm": 2.6436120624467687, + "learning_rate": 6.234492653671797e-07, + "logits/chosen": -2.4433653354644775, + "logits/rejected": -2.4274849891662598, + "logps/chosen": -19.20479393005371, + "logps/rejected": -426.07843017578125, + "logps_avg/chosen": -0.10876087099313736, + "logps_avg/rejected": -2.4411709308624268, + "loss": 0.1029, + "losses_ref": -0.0012901790905743837, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3520, + "u": -3.600743055343628, + "weight": 0.05254561826586723 + }, + { + "diff_generated": -23.339073181152344, + "epoch": 1.1438755670771226, + "grad_norm": 3.032839087746657, + "learning_rate": 6.221967415385675e-07, + "logits/chosen": -2.451753616333008, + "logits/rejected": -2.4012863636016846, + "logps/chosen": -18.745807647705078, + "logps/rejected": -415.9112243652344, + "logps_avg/chosen": -0.10285329818725586, + "logps_avg/rejected": -2.333907127380371, + "loss": 0.106, + "losses_ref": -0.0013694807421416044, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3530, + "u": -3.5984578132629395, + "weight": 0.05219249799847603 + }, + { + "diff_generated": -24.094181060791016, + "epoch": 1.1471160077770577, + "grad_norm": 2.739604298054967, + "learning_rate": 6.209410587916524e-07, + "logits/chosen": -2.3793702125549316, + "logits/rejected": -2.333580255508423, + "logps/chosen": -19.96246910095215, + "logps/rejected": -401.35626220703125, + "logps_avg/chosen": -0.1089554876089096, + "logps_avg/rejected": -2.4094183444976807, + "loss": 0.1037, + "losses_ref": -0.0021559642627835274, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3540, + "u": -3.6416687965393066, + "weight": 0.04153052344918251 + }, + { + "diff_generated": -23.563488006591797, + "epoch": 1.1503564484769928, + "grad_norm": 2.8511259194261727, + "learning_rate": 6.196822349781781e-07, + "logits/chosen": -2.404599189758301, + "logits/rejected": -2.3727211952209473, + "logps/chosen": -19.58984375, + "logps/rejected": -400.2362365722656, + "logps_avg/chosen": -0.10796210914850235, + "logps_avg/rejected": -2.3563485145568848, + "loss": 0.105, + "losses_ref": -0.000914513599127531, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3550, + "u": -3.6716854572296143, + "weight": 0.032539792358875275 + }, + { + "diff_generated": -25.360666275024414, + "epoch": 1.1535968891769282, + "grad_norm": 2.4611237466061575, + "learning_rate": 6.184202879945437e-07, + "logits/chosen": -2.3852033615112305, + "logits/rejected": -2.3722081184387207, + "logps/chosen": -18.83804702758789, + "logps/rejected": -429.45111083984375, + "logps_avg/chosen": -0.10900785773992538, + "logps_avg/rejected": -2.5360665321350098, + "loss": 0.1049, + "losses_ref": -0.0009189220145344734, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3560, + "u": -3.4985721111297607, + "weight": 0.0763770267367363 + }, + { + "diff_generated": -23.74805450439453, + "epoch": 1.1568373298768633, + "grad_norm": 2.539078034873699, + "learning_rate": 6.171552357815497e-07, + "logits/chosen": -2.3657662868499756, + "logits/rejected": -2.409804105758667, + "logps/chosen": -17.804948806762695, + "logps/rejected": -431.1031188964844, + "logps_avg/chosen": -0.10777918249368668, + "logps_avg/rejected": -2.374805450439453, + "loss": 0.1025, + "losses_ref": -0.0020144102163612843, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3570, + "u": -3.688842296600342, + "weight": 0.028050964698195457 + }, + { + "diff_generated": -23.931705474853516, + "epoch": 1.1600777705767984, + "grad_norm": 2.764114064987785, + "learning_rate": 6.15887096324143e-07, + "logits/chosen": -2.4409403800964355, + "logits/rejected": -2.4246106147766113, + "logps/chosen": -19.8535213470459, + "logps/rejected": -418.84710693359375, + "logps_avg/chosen": -0.10594918578863144, + "logps_avg/rejected": -2.3931703567504883, + "loss": 0.1025, + "losses_ref": -0.0015651138965040445, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3580, + "u": -3.5757503509521484, + "weight": 0.05872585251927376 + }, + { + "diff_generated": -24.774555206298828, + "epoch": 1.1633182112767337, + "grad_norm": 2.49708903723434, + "learning_rate": 6.14615887651161e-07, + "logits/chosen": -2.4068663120269775, + "logits/rejected": -2.4403574466705322, + "logps/chosen": -16.835186004638672, + "logps/rejected": -434.1429138183594, + "logps_avg/chosen": -0.09970332682132721, + "logps_avg/rejected": -2.4774553775787354, + "loss": 0.1003, + "losses_ref": -0.0007696760585531592, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3590, + "u": -3.5268020629882812, + "weight": 0.06986421346664429 + }, + { + "diff_generated": -24.963159561157227, + "epoch": 1.1665586519766689, + "grad_norm": 2.73474011135887, + "learning_rate": 6.133416278350756e-07, + "logits/chosen": -2.4077696800231934, + "logits/rejected": -2.391592502593994, + "logps/chosen": -17.72994613647461, + "logps/rejected": -439.46258544921875, + "logps_avg/chosen": -0.1045701652765274, + "logps_avg/rejected": -2.4963157176971436, + "loss": 0.1018, + "losses_ref": -0.0007292412337847054, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3600, + "u": -3.5760302543640137, + "weight": 0.05725475028157234 + }, + { + "diff_generated": -23.916915893554688, + "epoch": 1.169799092676604, + "grad_norm": 2.704649464362536, + "learning_rate": 6.120643349917359e-07, + "logits/chosen": -2.429378032684326, + "logits/rejected": -2.3929200172424316, + "logps/chosen": -17.818084716796875, + "logps/rejected": -440.4522399902344, + "logps_avg/chosen": -0.09617959707975388, + "logps_avg/rejected": -2.3916916847229004, + "loss": 0.1017, + "losses_ref": -0.0010255600791424513, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3610, + "u": -3.59245228767395, + "weight": 0.05164354294538498 + }, + { + "diff_generated": -25.387792587280273, + "epoch": 1.173039533376539, + "grad_norm": 2.6381580124499404, + "learning_rate": 6.107840272801108e-07, + "logits/chosen": -2.4290518760681152, + "logits/rejected": -2.3904924392700195, + "logps/chosen": -19.25713539123535, + "logps/rejected": -442.5533752441406, + "logps_avg/chosen": -0.10800061374902725, + "logps_avg/rejected": -2.5387792587280273, + "loss": 0.1041, + "losses_ref": -0.0016010403633117676, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3620, + "u": -3.624995708465576, + "weight": 0.04630355164408684 + }, + { + "diff_generated": -24.498355865478516, + "epoch": 1.1762799740764744, + "grad_norm": 2.6553993227399504, + "learning_rate": 6.095007229020311e-07, + "logits/chosen": -2.40181040763855, + "logits/rejected": -2.434415817260742, + "logps/chosen": -16.159542083740234, + "logps/rejected": -464.08544921875, + "logps_avg/chosen": -0.09530286490917206, + "logps_avg/rejected": -2.4498355388641357, + "loss": 0.103, + "losses_ref": -0.0005648103542625904, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3630, + "u": -3.5751399993896484, + "weight": 0.0569901242852211 + }, + { + "diff_generated": -25.366138458251953, + "epoch": 1.1795204147764096, + "grad_norm": 2.6255957192728454, + "learning_rate": 6.082144401019304e-07, + "logits/chosen": -2.42928409576416, + "logits/rejected": -2.3760671615600586, + "logps/chosen": -18.25739097595215, + "logps/rejected": -442.1548767089844, + "logps_avg/chosen": -0.10382506996393204, + "logps_avg/rejected": -2.536613941192627, + "loss": 0.101, + "losses_ref": -0.0005856143543496728, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3640, + "u": -3.5992000102996826, + "weight": 0.05081767961382866 + }, + { + "diff_generated": -24.045963287353516, + "epoch": 1.1827608554763447, + "grad_norm": 2.662881729483114, + "learning_rate": 6.069251971665857e-07, + "logits/chosen": -2.36564302444458, + "logits/rejected": -2.3937220573425293, + "logps/chosen": -18.25554847717285, + "logps/rejected": -465.24969482421875, + "logps_avg/chosen": -0.10365450382232666, + "logps_avg/rejected": -2.4045963287353516, + "loss": 0.1026, + "losses_ref": -0.0008571479702368379, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3650, + "u": -3.525778293609619, + "weight": 0.06990309059619904 + }, + { + "diff_generated": -25.264225006103516, + "epoch": 1.18600129617628, + "grad_norm": 2.7684940174192825, + "learning_rate": 6.056330124248576e-07, + "logits/chosen": -2.3799309730529785, + "logits/rejected": -2.4361531734466553, + "logps/chosen": -16.044706344604492, + "logps/rejected": -469.60760498046875, + "logps_avg/chosen": -0.09844937920570374, + "logps_avg/rejected": -2.5264222621917725, + "loss": 0.102, + "losses_ref": -0.0006588260876014829, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3660, + "u": -3.670767307281494, + "weight": 0.032099511474370956 + }, + { + "diff_generated": -23.373096466064453, + "epoch": 1.1892417368762151, + "grad_norm": 2.5773576512927594, + "learning_rate": 6.043379042474297e-07, + "logits/chosen": -2.3959336280822754, + "logits/rejected": -2.4232144355773926, + "logps/chosen": -19.539710998535156, + "logps/rejected": -443.3605041503906, + "logps_avg/chosen": -0.1098468154668808, + "logps_avg/rejected": -2.3373095989227295, + "loss": 0.1012, + "losses_ref": -0.001127001247368753, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3670, + "u": -3.599931240081787, + "weight": 0.05171620100736618 + }, + { + "diff_generated": -24.989749908447266, + "epoch": 1.1924821775761503, + "grad_norm": 2.6554402069227088, + "learning_rate": 6.030398910465475e-07, + "logits/chosen": -2.358093023300171, + "logits/rejected": -2.3458518981933594, + "logps/chosen": -17.702842712402344, + "logps/rejected": -471.4501953125, + "logps_avg/chosen": -0.10465174913406372, + "logps_avg/rejected": -2.4989750385284424, + "loss": 0.1054, + "losses_ref": -0.0005576363764703274, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3680, + "u": -3.5466511249542236, + "weight": 0.06327757984399796 + }, + { + "diff_generated": -25.277515411376953, + "epoch": 1.1957226182760856, + "grad_norm": 2.6168702606831933, + "learning_rate": 6.017389912757561e-07, + "logits/chosen": -2.4333736896514893, + "logits/rejected": -2.441929578781128, + "logps/chosen": -16.576610565185547, + "logps/rejected": -475.36920166015625, + "logps_avg/chosen": -0.09709908068180084, + "logps_avg/rejected": -2.5277514457702637, + "loss": 0.1004, + "losses_ref": -0.0004923694650642574, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3690, + "u": -3.552541732788086, + "weight": 0.06316892802715302 + }, + { + "diff_generated": -24.737037658691406, + "epoch": 1.1989630589760207, + "grad_norm": 2.5817750174964824, + "learning_rate": 6.004352234296389e-07, + "logits/chosen": -2.3979382514953613, + "logits/rejected": -2.369443416595459, + "logps/chosen": -19.797863006591797, + "logps/rejected": -437.35369873046875, + "logps_avg/chosen": -0.10848965495824814, + "logps_avg/rejected": -2.4737040996551514, + "loss": 0.107, + "losses_ref": -0.0006861963192932308, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3700, + "u": -3.524837017059326, + "weight": 0.06967984139919281 + }, + { + "diff_generated": -24.86382484436035, + "epoch": 1.2022034996759559, + "grad_norm": 2.517128501919303, + "learning_rate": 5.991286060435536e-07, + "logits/chosen": -2.3625564575195312, + "logits/rejected": -2.312837600708008, + "logps/chosen": -19.742130279541016, + "logps/rejected": -429.26092529296875, + "logps_avg/chosen": -0.10398830473423004, + "logps_avg/rejected": -2.486382484436035, + "loss": 0.104, + "losses_ref": -0.0011359945638105273, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3710, + "u": -3.4778130054473877, + "weight": 0.08374304324388504 + }, + { + "diff_generated": -24.32589340209961, + "epoch": 1.2054439403758912, + "grad_norm": 2.7384186952104748, + "learning_rate": 5.978191576933692e-07, + "logits/chosen": -2.3719379901885986, + "logits/rejected": -2.3585941791534424, + "logps/chosen": -17.426773071289062, + "logps/rejected": -442.98077392578125, + "logps_avg/chosen": -0.09669273346662521, + "logps_avg/rejected": -2.432589054107666, + "loss": 0.1024, + "losses_ref": -0.000999335665255785, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3720, + "u": -3.5731139183044434, + "weight": 0.057646702975034714 + }, + { + "diff_generated": -25.280864715576172, + "epoch": 1.2086843810758263, + "grad_norm": 2.5733058201722634, + "learning_rate": 5.965068969952017e-07, + "logits/chosen": -2.408519983291626, + "logits/rejected": -2.411043405532837, + "logps/chosen": -18.149112701416016, + "logps/rejected": -454.8963317871094, + "logps_avg/chosen": -0.1049225777387619, + "logps_avg/rejected": -2.5280864238739014, + "loss": 0.1002, + "losses_ref": -0.0010424638167023659, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3730, + "u": -3.6665706634521484, + "weight": 0.032865602523088455 + }, + { + "diff_generated": -26.70073890686035, + "epoch": 1.2119248217757614, + "grad_norm": 2.601747780328003, + "learning_rate": 5.951918426051502e-07, + "logits/chosen": -2.4358088970184326, + "logits/rejected": -2.401461124420166, + "logps/chosen": -14.340431213378906, + "logps/rejected": -465.8653259277344, + "logps_avg/chosen": -0.08685021847486496, + "logps_avg/rejected": -2.670073986053467, + "loss": 0.1012, + "losses_ref": -0.0008388949790969491, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3740, + "u": -3.5967020988464355, + "weight": 0.051382988691329956 + }, + { + "diff_generated": -25.41323471069336, + "epoch": 1.2151652624756968, + "grad_norm": 2.5745745278867975, + "learning_rate": 5.938740132190306e-07, + "logits/chosen": -2.381037712097168, + "logits/rejected": -2.3758397102355957, + "logps/chosen": -17.276548385620117, + "logps/rejected": -461.35028076171875, + "logps_avg/chosen": -0.09561358392238617, + "logps_avg/rejected": -2.541323661804199, + "loss": 0.1053, + "losses_ref": -0.0006424171733669937, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3750, + "u": -3.623504161834717, + "weight": 0.04461907222867012 + }, + { + "diff_generated": -26.197582244873047, + "epoch": 1.218405703175632, + "grad_norm": 2.4934283986348493, + "learning_rate": 5.9255342757211e-07, + "logits/chosen": -2.405440092086792, + "logits/rejected": -2.415987968444824, + "logps/chosen": -17.704856872558594, + "logps/rejected": -454.7588806152344, + "logps_avg/chosen": -0.1022186279296875, + "logps_avg/rejected": -2.619758129119873, + "loss": 0.1008, + "losses_ref": -0.0006152585265226662, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3760, + "u": -3.693714141845703, + "weight": 0.02580828033387661 + }, + { + "diff_generated": -26.245763778686523, + "epoch": 1.221646143875567, + "grad_norm": 2.8453632401755886, + "learning_rate": 5.91230104438841e-07, + "logits/chosen": -2.414271593093872, + "logits/rejected": -2.3170933723449707, + "logps/chosen": -19.21923065185547, + "logps/rejected": -435.68133544921875, + "logps_avg/chosen": -0.10188715159893036, + "logps_avg/rejected": -2.6245763301849365, + "loss": 0.1015, + "losses_ref": -0.0007117214845493436, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3770, + "u": -3.5293922424316406, + "weight": 0.0698995441198349 + }, + { + "diff_generated": -24.69413185119629, + "epoch": 1.2248865845755024, + "grad_norm": 2.531053854092759, + "learning_rate": 5.899040626325945e-07, + "logits/chosen": -2.432494640350342, + "logits/rejected": -2.3904690742492676, + "logps/chosen": -17.50114631652832, + "logps/rejected": -440.77154541015625, + "logps_avg/chosen": -0.1005210131406784, + "logps_avg/rejected": -2.4694130420684814, + "loss": 0.103, + "losses_ref": -0.0005105865420773625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3780, + "u": -3.5267860889434814, + "weight": 0.06939880549907684 + }, + { + "diff_generated": -25.48592758178711, + "epoch": 1.2281270252754375, + "grad_norm": 2.768212237607484, + "learning_rate": 5.885753210053917e-07, + "logits/chosen": -2.4312968254089355, + "logits/rejected": -2.398667335510254, + "logps/chosen": -18.666072845458984, + "logps/rejected": -453.98492431640625, + "logps_avg/chosen": -0.1044863611459732, + "logps_avg/rejected": -2.5485928058624268, + "loss": 0.106, + "losses_ref": -0.00037497709854505956, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3790, + "u": -3.553412675857544, + "weight": 0.06297777593135834 + }, + { + "diff_generated": -28.831745147705078, + "epoch": 1.2313674659753726, + "grad_norm": 2.788790523416712, + "learning_rate": 5.872438984476368e-07, + "logits/chosen": -2.42059588432312, + "logits/rejected": -2.3469738960266113, + "logps/chosen": -19.486249923706055, + "logps/rejected": -455.1859436035156, + "logps_avg/chosen": -0.11247433722019196, + "logps_avg/rejected": -2.883174419403076, + "loss": 0.1045, + "losses_ref": -0.001660289941355586, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3800, + "u": -3.6212081909179688, + "weight": 0.046906400471925735 + }, + { + "diff_generated": -24.617816925048828, + "epoch": 1.2346079066753077, + "grad_norm": 2.454521001860937, + "learning_rate": 5.859098138878482e-07, + "logits/chosen": -2.4189045429229736, + "logits/rejected": -2.3941142559051514, + "logps/chosen": -20.42704200744629, + "logps/rejected": -439.0167541503906, + "logps_avg/chosen": -0.11315326392650604, + "logps_avg/rejected": -2.4617817401885986, + "loss": 0.1036, + "losses_ref": -0.0006618654588237405, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3810, + "u": -3.6706244945526123, + "weight": 0.0321231447160244 + }, + { + "diff_generated": -25.22861671447754, + "epoch": 1.237848347375243, + "grad_norm": 2.678018217467591, + "learning_rate": 5.845730862923889e-07, + "logits/chosen": -2.370227098464966, + "logits/rejected": -2.3375167846679688, + "logps/chosen": -19.023059844970703, + "logps/rejected": -453.9886169433594, + "logps_avg/chosen": -0.10912897437810898, + "logps_avg/rejected": -2.5228614807128906, + "loss": 0.1036, + "losses_ref": -0.0008484205463901162, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3820, + "u": -3.552034854888916, + "weight": 0.06384466588497162 + }, + { + "diff_generated": -26.32217788696289, + "epoch": 1.2410887880751782, + "grad_norm": 2.722738395369602, + "learning_rate": 5.83233734665198e-07, + "logits/chosen": -2.3956446647644043, + "logits/rejected": -2.3501551151275635, + "logps/chosen": -17.891752243041992, + "logps/rejected": -444.76263427734375, + "logps_avg/chosen": -0.10333029925823212, + "logps_avg/rejected": -2.6322176456451416, + "loss": 0.1015, + "losses_ref": -0.0008470058673992753, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3830, + "u": -3.6235289573669434, + "weight": 0.04495188593864441 + }, + { + "diff_generated": -26.583011627197266, + "epoch": 1.2443292287751135, + "grad_norm": 2.3441085033146565, + "learning_rate": 5.818917780475196e-07, + "logits/chosen": -2.4250235557556152, + "logits/rejected": -2.425710678100586, + "logps/chosen": -21.300302505493164, + "logps/rejected": -467.77203369140625, + "logps_avg/chosen": -0.11703227460384369, + "logps_avg/rejected": -2.6583011150360107, + "loss": 0.1033, + "losses_ref": -0.0011255014687776566, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3840, + "u": -3.7144665718078613, + "weight": 0.020351290702819824 + }, + { + "diff_generated": -24.3111629486084, + "epoch": 1.2475696694750487, + "grad_norm": 2.7823770770996576, + "learning_rate": 5.805472355176318e-07, + "logits/chosen": -2.446390390396118, + "logits/rejected": -2.4117252826690674, + "logps/chosen": -18.355083465576172, + "logps/rejected": -452.235107421875, + "logps_avg/chosen": -0.10057584196329117, + "logps_avg/rejected": -2.4311161041259766, + "loss": 0.1024, + "losses_ref": -0.00039589227526448667, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3850, + "u": -3.5967726707458496, + "weight": 0.05048195645213127 + }, + { + "diff_generated": -23.1942081451416, + "epoch": 1.2508101101749838, + "grad_norm": 2.5046015246562234, + "learning_rate": 5.792001261905767e-07, + "logits/chosen": -2.410804271697998, + "logits/rejected": -2.4035861492156982, + "logps/chosen": -17.919591903686523, + "logps/rejected": -415.97052001953125, + "logps_avg/chosen": -0.10415836423635483, + "logps_avg/rejected": -2.319420576095581, + "loss": 0.0987, + "losses_ref": -0.0004912324948236346, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3860, + "u": -3.4360504150390625, + "weight": 0.0943656861782074 + }, + { + "diff_generated": -23.43748664855957, + "epoch": 1.254050550874919, + "grad_norm": 2.667129273327817, + "learning_rate": 5.778504692178876e-07, + "logits/chosen": -2.3880181312561035, + "logits/rejected": -2.438152551651001, + "logps/chosen": -16.907007217407227, + "logps/rejected": -435.5437927246094, + "logps_avg/chosen": -0.09878456592559814, + "logps_avg/rejected": -2.3437483310699463, + "loss": 0.1008, + "losses_ref": -0.0016443884233012795, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3870, + "u": -3.505042314529419, + "weight": 0.07795653492212296 + }, + { + "diff_generated": -25.2240047454834, + "epoch": 1.2572909915748542, + "grad_norm": 2.5542328005408907, + "learning_rate": 5.76498283787317e-07, + "logits/chosen": -2.403738498687744, + "logits/rejected": -2.368741273880005, + "logps/chosen": -17.75848960876465, + "logps/rejected": -440.37982177734375, + "logps_avg/chosen": -0.09870745241641998, + "logps_avg/rejected": -2.522400379180908, + "loss": 0.1013, + "losses_ref": -0.0002985192695632577, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3880, + "u": -3.5994458198547363, + "weight": 0.050349898636341095 + }, + { + "diff_generated": -24.899044036865234, + "epoch": 1.2605314322747894, + "grad_norm": 2.7016093842722824, + "learning_rate": 5.751435891225643e-07, + "logits/chosen": -2.355231761932373, + "logits/rejected": -2.37161922454834, + "logps/chosen": -15.857172012329102, + "logps/rejected": -432.68292236328125, + "logps_avg/chosen": -0.09380076825618744, + "logps_avg/rejected": -2.4899046421051025, + "loss": 0.1008, + "losses_ref": -0.000942692335229367, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3890, + "u": -3.574742078781128, + "weight": 0.057598210871219635 + }, + { + "diff_generated": -25.746257781982422, + "epoch": 1.2637718729747245, + "grad_norm": 2.590886361364249, + "learning_rate": 5.737864044830015e-07, + "logits/chosen": -2.3851318359375, + "logits/rejected": -2.355273962020874, + "logps/chosen": -19.687891006469727, + "logps/rejected": -454.10455322265625, + "logps_avg/chosen": -0.1099676862359047, + "logps_avg/rejected": -2.5746259689331055, + "loss": 0.1035, + "losses_ref": -0.0006579064065590501, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3900, + "u": -3.5053353309631348, + "weight": 0.07585734874010086 + }, + { + "diff_generated": -24.64337158203125, + "epoch": 1.2670123136746598, + "grad_norm": 2.7276886008169754, + "learning_rate": 5.724267491634006e-07, + "logits/chosen": -2.3683695793151855, + "logits/rejected": -2.3457908630371094, + "logps/chosen": -17.896350860595703, + "logps/rejected": -466.97198486328125, + "logps_avg/chosen": -0.09884298592805862, + "logps_avg/rejected": -2.464337110519409, + "loss": 0.1017, + "losses_ref": -0.0007082788506522775, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3910, + "u": -3.6250851154327393, + "weight": 0.044809214770793915 + }, + { + "diff_generated": -25.45005989074707, + "epoch": 1.270252754374595, + "grad_norm": 2.3372799665605837, + "learning_rate": 5.710646424936581e-07, + "logits/chosen": -2.4046618938446045, + "logits/rejected": -2.378971576690674, + "logps/chosen": -19.94418716430664, + "logps/rejected": -448.55194091796875, + "logps_avg/chosen": -0.1066654697060585, + "logps_avg/rejected": -2.545006036758423, + "loss": 0.1037, + "losses_ref": -0.0008164637838490307, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3920, + "u": -3.6216578483581543, + "weight": 0.04495164006948471 + }, + { + "diff_generated": -27.618610382080078, + "epoch": 1.27349319507453, + "grad_norm": 2.7112289690219784, + "learning_rate": 5.697001038385212e-07, + "logits/chosen": -2.361335277557373, + "logits/rejected": -2.328921318054199, + "logps/chosen": -19.196826934814453, + "logps/rejected": -480.14007568359375, + "logps_avg/chosen": -0.10564608871936798, + "logps_avg/rejected": -2.7618608474731445, + "loss": 0.1036, + "losses_ref": -0.000225507072173059, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3930, + "u": -3.621340274810791, + "weight": 0.04401098191738129 + }, + { + "diff_generated": -24.551250457763672, + "epoch": 1.2767336357744652, + "grad_norm": 2.8313354420297157, + "learning_rate": 5.683331525973118e-07, + "logits/chosen": -2.3341686725616455, + "logits/rejected": -2.3362679481506348, + "logps/chosen": -19.423187255859375, + "logps/rejected": -442.8011169433594, + "logps_avg/chosen": -0.10974836349487305, + "logps_avg/rejected": -2.455125093460083, + "loss": 0.1058, + "losses_ref": -0.000763989461120218, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3940, + "u": -3.5773825645446777, + "weight": 0.05737464502453804 + }, + { + "diff_generated": -26.735946655273438, + "epoch": 1.2799740764744005, + "grad_norm": 2.794923295177927, + "learning_rate": 5.66963808203651e-07, + "logits/chosen": -2.391003131866455, + "logits/rejected": -2.3569178581237793, + "logps/chosen": -18.766311645507812, + "logps/rejected": -493.0743103027344, + "logps_avg/chosen": -0.11267992109060287, + "logps_avg/rejected": -2.6735944747924805, + "loss": 0.1037, + "losses_ref": -0.0009607706451788545, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3950, + "u": -3.669952392578125, + "weight": 0.032738275825977325 + }, + { + "diff_generated": -25.84634017944336, + "epoch": 1.2832145171743357, + "grad_norm": 2.8018447875503116, + "learning_rate": 5.65592090125183e-07, + "logits/chosen": -2.3674049377441406, + "logits/rejected": -2.370068073272705, + "logps/chosen": -13.854530334472656, + "logps/rejected": -458.9341735839844, + "logps_avg/chosen": -0.08643799275159836, + "logps_avg/rejected": -2.584634304046631, + "loss": 0.0986, + "losses_ref": -0.000706795952282846, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3960, + "u": -3.499607801437378, + "weight": 0.07594309747219086 + }, + { + "diff_generated": -24.824703216552734, + "epoch": 1.286454957874271, + "grad_norm": 2.668284886295544, + "learning_rate": 5.642180178632977e-07, + "logits/chosen": -2.3868956565856934, + "logits/rejected": -2.383216381072998, + "logps/chosen": -17.2413330078125, + "logps/rejected": -467.44232177734375, + "logps_avg/chosen": -0.1045045480132103, + "logps_avg/rejected": -2.482470750808716, + "loss": 0.102, + "losses_ref": -0.0006459239521063864, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3970, + "u": -3.624206066131592, + "weight": 0.04473030939698219 + }, + { + "diff_generated": -27.599124908447266, + "epoch": 1.2896953985742061, + "grad_norm": 2.7179931685781096, + "learning_rate": 5.628416109528542e-07, + "logits/chosen": -2.3622069358825684, + "logits/rejected": -2.3574976921081543, + "logps/chosen": -17.356678009033203, + "logps/rejected": -505.03668212890625, + "logps_avg/chosen": -0.09987537562847137, + "logps_avg/rejected": -2.7599120140075684, + "loss": 0.1058, + "losses_ref": -0.0009466443443670869, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3980, + "u": -3.5990073680877686, + "weight": 0.051428310573101044 + }, + { + "diff_generated": -26.108234405517578, + "epoch": 1.2929358392741412, + "grad_norm": 2.4210253725599857, + "learning_rate": 5.614628889619029e-07, + "logits/chosen": -2.3183486461639404, + "logits/rejected": -2.35087513923645, + "logps/chosen": -17.084163665771484, + "logps/rejected": -474.3221740722656, + "logps_avg/chosen": -0.10216245800256729, + "logps_avg/rejected": -2.610823154449463, + "loss": 0.1032, + "losses_ref": -0.0010031044948846102, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 3990, + "u": -3.596558094024658, + "weight": 0.05139036104083061 + }, + { + "diff_generated": -24.630847930908203, + "epoch": 1.2961762799740764, + "grad_norm": 2.858796440896099, + "learning_rate": 5.600818714914065e-07, + "logits/chosen": -2.377168893814087, + "logits/rejected": -2.3589062690734863, + "logps/chosen": -20.87276840209961, + "logps/rejected": -453.269287109375, + "logps_avg/chosen": -0.1113833412528038, + "logps_avg/rejected": -2.4630846977233887, + "loss": 0.107, + "losses_ref": -0.0004416326410137117, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4000, + "u": -3.577017307281494, + "weight": 0.05680033564567566 + }, + { + "diff_generated": -27.36174964904785, + "epoch": 1.2994167206740117, + "grad_norm": 2.741777502121723, + "learning_rate": 5.586985781749625e-07, + "logits/chosen": -2.400063991546631, + "logits/rejected": -2.4467995166778564, + "logps/chosen": -17.76604461669922, + "logps/rejected": -507.6504821777344, + "logps_avg/chosen": -0.10631255805492401, + "logps_avg/rejected": -2.736175060272217, + "loss": 0.1008, + "losses_ref": -0.0009669626015238464, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4010, + "u": -3.6947712898254395, + "weight": 0.026510408148169518 + }, + { + "diff_generated": -24.537931442260742, + "epoch": 1.3026571613739468, + "grad_norm": 2.4445506760186744, + "learning_rate": 5.573130286785237e-07, + "logits/chosen": -2.429718017578125, + "logits/rejected": -2.3343007564544678, + "logps/chosen": -18.747915267944336, + "logps/rejected": -424.55841064453125, + "logps_avg/chosen": -0.097903311252594, + "logps_avg/rejected": -2.4537932872772217, + "loss": 0.1011, + "losses_ref": -0.00035963323898613453, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4020, + "u": -3.5008933544158936, + "weight": 0.0754479244351387 + }, + { + "diff_generated": -25.826641082763672, + "epoch": 1.3058976020738822, + "grad_norm": 2.7129845467722435, + "learning_rate": 5.559252427001178e-07, + "logits/chosen": -2.3235254287719727, + "logits/rejected": -2.312380075454712, + "logps/chosen": -18.90648651123047, + "logps/rejected": -452.123046875, + "logps_avg/chosen": -0.1057731956243515, + "logps_avg/rejected": -2.5826640129089355, + "loss": 0.1013, + "losses_ref": -0.0019954966846853495, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4030, + "u": -3.547513961791992, + "weight": 0.06588520109653473 + }, + { + "diff_generated": -27.287891387939453, + "epoch": 1.3091380427738173, + "grad_norm": 2.6551302568919963, + "learning_rate": 5.545352399695687e-07, + "logits/chosen": -2.360625982284546, + "logits/rejected": -2.3697447776794434, + "logps/chosen": -17.551830291748047, + "logps/rejected": -478.44818115234375, + "logps_avg/chosen": -0.10685201734304428, + "logps_avg/rejected": -2.7287890911102295, + "loss": 0.1034, + "losses_ref": -0.0005914900102652609, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4040, + "u": -3.6493136882781982, + "weight": 0.0383087582886219 + }, + { + "diff_generated": -26.916757583618164, + "epoch": 1.3123784834737524, + "grad_norm": 2.463094256206414, + "learning_rate": 5.531430402482153e-07, + "logits/chosen": -2.3738372325897217, + "logits/rejected": -2.3218994140625, + "logps/chosen": -18.182100296020508, + "logps/rejected": -479.9534606933594, + "logps_avg/chosen": -0.10409387201070786, + "logps_avg/rejected": -2.6916756629943848, + "loss": 0.101, + "losses_ref": -0.0010314477840438485, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4050, + "u": -3.5255794525146484, + "weight": 0.07031405717134476 + }, + { + "diff_generated": -25.09537124633789, + "epoch": 1.3156189241736875, + "grad_norm": 2.394567509090008, + "learning_rate": 5.517486633286299e-07, + "logits/chosen": -2.3621907234191895, + "logits/rejected": -2.3411543369293213, + "logps/chosen": -18.44365119934082, + "logps/rejected": -482.3141174316406, + "logps_avg/chosen": -0.09819479286670685, + "logps_avg/rejected": -2.5095372200012207, + "loss": 0.101, + "losses_ref": -0.0005044209538027644, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4060, + "u": -3.524848222732544, + "weight": 0.06937181949615479 + }, + { + "diff_generated": -27.66803550720215, + "epoch": 1.3188593648736229, + "grad_norm": 2.4050570067723416, + "learning_rate": 5.503521290343384e-07, + "logits/chosen": -2.400895595550537, + "logits/rejected": -2.377678155899048, + "logps/chosen": -19.440929412841797, + "logps/rejected": -528.3328857421875, + "logps_avg/chosen": -0.10858882963657379, + "logps_avg/rejected": -2.7668039798736572, + "loss": 0.1032, + "losses_ref": -0.0008151186630129814, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4070, + "u": -3.669245958328247, + "weight": 0.03257065638899803 + }, + { + "diff_generated": -26.13507652282715, + "epoch": 1.322099805573558, + "grad_norm": 2.756233667724102, + "learning_rate": 5.489534572195373e-07, + "logits/chosen": -2.3385581970214844, + "logits/rejected": -2.392242193222046, + "logps/chosen": -15.599695205688477, + "logps/rejected": -485.8387756347656, + "logps_avg/chosen": -0.09932243078947067, + "logps_avg/rejected": -2.6135077476501465, + "loss": 0.1014, + "losses_ref": -0.0006077963043935597, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4080, + "u": -3.4798247814178467, + "weight": 0.08210951834917068 + }, + { + "diff_generated": -27.595714569091797, + "epoch": 1.3253402462734931, + "grad_norm": 2.6932022364658033, + "learning_rate": 5.47552667768811e-07, + "logits/chosen": -2.347517490386963, + "logits/rejected": -2.3942277431488037, + "logps/chosen": -15.13756275177002, + "logps/rejected": -492.9637145996094, + "logps_avg/chosen": -0.09007562696933746, + "logps_avg/rejected": -2.7595715522766113, + "loss": 0.1035, + "losses_ref": -0.0006332875927910209, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4090, + "u": -3.6677443981170654, + "weight": 0.0321536622941494 + }, + { + "diff_generated": -27.55939292907715, + "epoch": 1.3285806869734285, + "grad_norm": 2.4145917904418317, + "learning_rate": 5.46149780596851e-07, + "logits/chosen": -2.388737201690674, + "logits/rejected": -2.415613889694214, + "logps/chosen": -17.241397857666016, + "logps/rejected": -492.285888671875, + "logps_avg/chosen": -0.1004859209060669, + "logps_avg/rejected": -2.75593900680542, + "loss": 0.1026, + "losses_ref": -0.0004385868087410927, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4100, + "u": -3.6940712928771973, + "weight": 0.025556892156600952 + }, + { + "diff_generated": -28.32535743713379, + "epoch": 1.3318211276733636, + "grad_norm": 2.7341334751798336, + "learning_rate": 5.447448156481708e-07, + "logits/chosen": -2.4162185192108154, + "logits/rejected": -2.3701248168945312, + "logps/chosen": -16.101211547851562, + "logps/rejected": -506.6935119628906, + "logps_avg/chosen": -0.09412642568349838, + "logps_avg/rejected": -2.832535743713379, + "loss": 0.1017, + "losses_ref": -0.000632374722044915, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4110, + "u": -3.6228606700897217, + "weight": 0.044598862528800964 + }, + { + "diff_generated": -26.807104110717773, + "epoch": 1.3350615683732987, + "grad_norm": 2.568298597173405, + "learning_rate": 5.433377928968234e-07, + "logits/chosen": -2.41631817817688, + "logits/rejected": -2.374326705932617, + "logps/chosen": -19.040271759033203, + "logps/rejected": -466.0682678222656, + "logps_avg/chosen": -0.10614614188671112, + "logps_avg/rejected": -2.6807103157043457, + "loss": 0.1031, + "losses_ref": -0.0004517110646702349, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4120, + "u": -3.6942248344421387, + "weight": 0.025584762915968895 + }, + { + "diff_generated": -26.3829345703125, + "epoch": 1.3383020090732338, + "grad_norm": 2.5147286408642318, + "learning_rate": 5.41928732346117e-07, + "logits/chosen": -2.3779680728912354, + "logits/rejected": -2.329960346221924, + "logps/chosen": -18.907283782958984, + "logps/rejected": -471.4302673339844, + "logps_avg/chosen": -0.10900212824344635, + "logps_avg/rejected": -2.638293504714966, + "loss": 0.1025, + "losses_ref": -0.0008699931204319, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4130, + "u": -3.645224094390869, + "weight": 0.038796357810497284 + }, + { + "diff_generated": -26.643444061279297, + "epoch": 1.3415424497731692, + "grad_norm": 2.3922661548115394, + "learning_rate": 5.405176540283311e-07, + "logits/chosen": -2.365628480911255, + "logits/rejected": -2.3224751949310303, + "logps/chosen": -18.312501907348633, + "logps/rejected": -505.20098876953125, + "logps_avg/chosen": -0.09830234944820404, + "logps_avg/rejected": -2.6643447875976562, + "loss": 0.1025, + "losses_ref": -0.0002646732027642429, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4140, + "u": -3.504613161087036, + "weight": 0.07531232386827469 + }, + { + "diff_generated": -26.2756290435791, + "epoch": 1.3447828904731043, + "grad_norm": 2.5777576798673714, + "learning_rate": 5.391045780044308e-07, + "logits/chosen": -2.4077603816986084, + "logits/rejected": -2.394535779953003, + "logps/chosen": -18.91120719909668, + "logps/rejected": -510.40521240234375, + "logps_avg/chosen": -0.10508742183446884, + "logps_avg/rejected": -2.6275627613067627, + "loss": 0.1018, + "losses_ref": -0.00047807503142394125, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4150, + "u": -3.6172034740448, + "weight": 0.044402316212654114 + }, + { + "diff_generated": -28.211669921875, + "epoch": 1.3480233311730396, + "grad_norm": 2.594955434688894, + "learning_rate": 5.376895243637823e-07, + "logits/chosen": -2.370952844619751, + "logits/rejected": -2.290727376937866, + "logps/chosen": -19.83003807067871, + "logps/rejected": -510.25714111328125, + "logps_avg/chosen": -0.10845856368541718, + "logps_avg/rejected": -2.8211669921875, + "loss": 0.1042, + "losses_ref": -0.0005390375154092908, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4160, + "u": -3.504507541656494, + "weight": 0.07578588277101517 + }, + { + "diff_generated": -28.59592056274414, + "epoch": 1.3512637718729748, + "grad_norm": 2.6192143853640877, + "learning_rate": 5.362725132238672e-07, + "logits/chosen": -2.3691391944885254, + "logits/rejected": -2.4109010696411133, + "logps/chosen": -17.159189224243164, + "logps/rejected": -558.8998413085938, + "logps_avg/chosen": -0.10241004079580307, + "logps_avg/rejected": -2.859591484069824, + "loss": 0.104, + "losses_ref": -0.0003753933997359127, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4170, + "u": -3.6483771800994873, + "weight": 0.0379684753715992 + }, + { + "diff_generated": -26.94635581970215, + "epoch": 1.3545042125729099, + "grad_norm": 3.987238518571662, + "learning_rate": 5.348535647299964e-07, + "logits/chosen": -2.3350539207458496, + "logits/rejected": -2.329770565032959, + "logps/chosen": -17.921201705932617, + "logps/rejected": -511.4327697753906, + "logps_avg/chosen": -0.1062774658203125, + "logps_avg/rejected": -2.6946353912353516, + "loss": 0.103, + "losses_ref": -0.0009642991935834289, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4180, + "u": -3.670365571975708, + "weight": 0.032805364578962326 + }, + { + "diff_generated": -27.179916381835938, + "epoch": 1.357744653272845, + "grad_norm": 2.520799402886161, + "learning_rate": 5.334326990550234e-07, + "logits/chosen": -2.3830137252807617, + "logits/rejected": -2.3622913360595703, + "logps/chosen": -17.147418975830078, + "logps/rejected": -505.0753479003906, + "logps_avg/chosen": -0.09717821329832077, + "logps_avg/rejected": -2.717991828918457, + "loss": 0.0988, + "losses_ref": -0.0002480837283656001, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4190, + "u": -3.6003196239471436, + "weight": 0.05031207948923111 + }, + { + "diff_generated": -27.000635147094727, + "epoch": 1.3609850939727803, + "grad_norm": 2.5495151878301865, + "learning_rate": 5.320099363990584e-07, + "logits/chosen": -2.390007495880127, + "logits/rejected": -2.3277785778045654, + "logps/chosen": -18.247713088989258, + "logps/rejected": -478.716796875, + "logps_avg/chosen": -0.10145987570285797, + "logps_avg/rejected": -2.700063467025757, + "loss": 0.0962, + "losses_ref": -0.0004951705923303962, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4200, + "u": -3.649203062057495, + "weight": 0.03820186108350754 + }, + { + "diff_generated": -27.969797134399414, + "epoch": 1.3642255346727155, + "grad_norm": 2.4410988251552315, + "learning_rate": 5.305852969891799e-07, + "logits/chosen": -2.4071946144104004, + "logits/rejected": -2.3240163326263428, + "logps/chosen": -18.725177764892578, + "logps/rejected": -465.49053955078125, + "logps_avg/chosen": -0.10521489381790161, + "logps_avg/rejected": -2.796980142593384, + "loss": 0.0991, + "losses_ref": -0.0006577539024874568, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4210, + "u": -3.5468764305114746, + "weight": 0.0634467676281929 + }, + { + "diff_generated": -25.992996215820312, + "epoch": 1.3674659753726508, + "grad_norm": 2.4988856155190424, + "learning_rate": 5.29158801079148e-07, + "logits/chosen": -2.3208134174346924, + "logits/rejected": -2.2926430702209473, + "logps/chosen": -17.967212677001953, + "logps/rejected": -439.5184020996094, + "logps_avg/chosen": -0.0993885025382042, + "logps_avg/rejected": -2.599299192428589, + "loss": 0.1009, + "losses_ref": -0.0003469690855126828, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4220, + "u": -3.42718505859375, + "weight": 0.0941712036728859 + }, + { + "diff_generated": -26.823211669921875, + "epoch": 1.370706416072586, + "grad_norm": 2.750645300373666, + "learning_rate": 5.277304689491165e-07, + "logits/chosen": -2.3862364292144775, + "logits/rejected": -2.3794219493865967, + "logps/chosen": -18.44593620300293, + "logps/rejected": -465.5531311035156, + "logps_avg/chosen": -0.10832039266824722, + "logps_avg/rejected": -2.682321071624756, + "loss": 0.1032, + "losses_ref": -0.0004914809833280742, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4230, + "u": -3.6443495750427246, + "weight": 0.03814055770635605 + }, + { + "diff_generated": -28.399890899658203, + "epoch": 1.373946856772521, + "grad_norm": 2.4662347731325136, + "learning_rate": 5.26300320905344e-07, + "logits/chosen": -2.3697032928466797, + "logits/rejected": -2.345850706100464, + "logps/chosen": -17.219181060791016, + "logps/rejected": -479.1373596191406, + "logps_avg/chosen": -0.10566030442714691, + "logps_avg/rejected": -2.839988946914673, + "loss": 0.1018, + "losses_ref": -0.00043061329051852226, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4240, + "u": -3.6740641593933105, + "weight": 0.031836919486522675 + }, + { + "diff_generated": -25.946186065673828, + "epoch": 1.3771872974724562, + "grad_norm": 2.5775217864622917, + "learning_rate": 5.248683772799054e-07, + "logits/chosen": -2.359295129776001, + "logits/rejected": -2.305798053741455, + "logps/chosen": -20.096813201904297, + "logps/rejected": -470.4903259277344, + "logps_avg/chosen": -0.10649679601192474, + "logps_avg/rejected": -2.594618320465088, + "loss": 0.1004, + "losses_ref": -0.00032928684959188104, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4250, + "u": -3.618279218673706, + "weight": 0.04415220022201538 + }, + { + "diff_generated": -25.108627319335938, + "epoch": 1.3804277381723915, + "grad_norm": 2.5376430727776853, + "learning_rate": 5.234346584304033e-07, + "logits/chosen": -2.3466343879699707, + "logits/rejected": -2.3309569358825684, + "logps/chosen": -17.8268985748291, + "logps/rejected": -446.1979064941406, + "logps_avg/chosen": -0.09819166362285614, + "logps_avg/rejected": -2.510862350463867, + "loss": 0.1023, + "losses_ref": -0.0008442547405138612, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4260, + "u": -3.5203120708465576, + "weight": 0.07002700120210648 + }, + { + "diff_generated": -25.804737091064453, + "epoch": 1.3836681788723266, + "grad_norm": 2.6515230074033322, + "learning_rate": 5.21999184739678e-07, + "logits/chosen": -2.3517873287200928, + "logits/rejected": -2.3346188068389893, + "logps/chosen": -20.65091323852539, + "logps/rejected": -440.77716064453125, + "logps_avg/chosen": -0.11302739381790161, + "logps_avg/rejected": -2.5804734230041504, + "loss": 0.1037, + "losses_ref": -0.0011006726417690516, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4270, + "u": -3.6943564414978027, + "weight": 0.02656673453748226 + }, + { + "diff_generated": -24.770761489868164, + "epoch": 1.3869086195722617, + "grad_norm": 2.672763114594347, + "learning_rate": 5.205619766155182e-07, + "logits/chosen": -2.378600597381592, + "logits/rejected": -2.378143310546875, + "logps/chosen": -18.211240768432617, + "logps/rejected": -422.57904052734375, + "logps_avg/chosen": -0.1082327589392662, + "logps_avg/rejected": -2.477076292037964, + "loss": 0.1045, + "losses_ref": -0.0007958011701703072, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4280, + "u": -3.5233681201934814, + "weight": 0.069813571870327 + }, + { + "diff_generated": -26.24502182006836, + "epoch": 1.390149060272197, + "grad_norm": 2.6044796369955248, + "learning_rate": 5.191230544903702e-07, + "logits/chosen": -2.34401798248291, + "logits/rejected": -2.33591890335083, + "logps/chosen": -16.507028579711914, + "logps/rejected": -415.982421875, + "logps_avg/chosen": -0.09873421490192413, + "logps_avg/rejected": -2.624502420425415, + "loss": 0.0999, + "losses_ref": -0.001061144983395934, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4290, + "u": -3.596564531326294, + "weight": 0.051502663642168045 + }, + { + "diff_generated": -24.274839401245117, + "epoch": 1.3933895009721322, + "grad_norm": 2.8559559028148884, + "learning_rate": 5.176824388210483e-07, + "logits/chosen": -2.342895030975342, + "logits/rejected": -2.345672130584717, + "logps/chosen": -18.14669418334961, + "logps/rejected": -449.62115478515625, + "logps_avg/chosen": -0.10630394518375397, + "logps_avg/rejected": -2.4274840354919434, + "loss": 0.1019, + "losses_ref": -0.0009579318575561047, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4300, + "u": -3.5042850971221924, + "weight": 0.07662221789360046 + }, + { + "diff_generated": -25.419696807861328, + "epoch": 1.3966299416720673, + "grad_norm": 2.5561510029103247, + "learning_rate": 5.162401500884432e-07, + "logits/chosen": -2.3567914962768555, + "logits/rejected": -2.342838764190674, + "logps/chosen": -18.089414596557617, + "logps/rejected": -439.552490234375, + "logps_avg/chosen": -0.10499085485935211, + "logps_avg/rejected": -2.5419695377349854, + "loss": 0.1006, + "losses_ref": -0.0009210168500430882, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4310, + "u": -3.5712103843688965, + "weight": 0.05744823068380356 + }, + { + "diff_generated": -24.52872657775879, + "epoch": 1.3998703823720027, + "grad_norm": 2.486952067870786, + "learning_rate": 5.147962087972314e-07, + "logits/chosen": -2.3481554985046387, + "logits/rejected": -2.308380603790283, + "logps/chosen": -19.289180755615234, + "logps/rejected": -414.15338134765625, + "logps_avg/chosen": -0.1033637523651123, + "logps_avg/rejected": -2.4528725147247314, + "loss": 0.1008, + "losses_ref": -0.0008901024120859802, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4320, + "u": -3.5753231048583984, + "weight": 0.05747845023870468 + }, + { + "diff_generated": -25.5739688873291, + "epoch": 1.4031108230719378, + "grad_norm": 2.6831985223645582, + "learning_rate": 5.133506354755833e-07, + "logits/chosen": -2.365851640701294, + "logits/rejected": -2.36531138420105, + "logps/chosen": -15.5805025100708, + "logps/rejected": -452.583740234375, + "logps_avg/chosen": -0.09160830080509186, + "logps_avg/rejected": -2.557396650314331, + "loss": 0.1001, + "losses_ref": -0.00025015632854774594, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4330, + "u": -3.5732452869415283, + "weight": 0.056541360914707184 + }, + { + "diff_generated": -25.01066017150879, + "epoch": 1.406351263771873, + "grad_norm": 2.6088447308224008, + "learning_rate": 5.119034506748713e-07, + "logits/chosen": -2.2929553985595703, + "logits/rejected": -2.281355142593384, + "logps/chosen": -16.640697479248047, + "logps/rejected": -421.694091796875, + "logps_avg/chosen": -0.10242215543985367, + "logps_avg/rejected": -2.501065969467163, + "loss": 0.0998, + "losses_ref": -0.001229285029694438, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4340, + "u": -3.5283074378967285, + "weight": 0.07093143463134766 + }, + { + "diff_generated": -25.70849609375, + "epoch": 1.4095917044718083, + "grad_norm": 2.641049340662074, + "learning_rate": 5.104546749693781e-07, + "logits/chosen": -2.3369216918945312, + "logits/rejected": -2.337437391281128, + "logps/chosen": -19.4946231842041, + "logps/rejected": -474.26947021484375, + "logps_avg/chosen": -0.10807840526103973, + "logps_avg/rejected": -2.570849895477295, + "loss": 0.1, + "losses_ref": -0.0015168010722845793, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4350, + "u": -3.618104934692383, + "weight": 0.04644922539591789 + }, + { + "diff_generated": -24.85693359375, + "epoch": 1.4128321451717434, + "grad_norm": 2.5766541848668107, + "learning_rate": 5.09004328956004e-07, + "logits/chosen": -2.3693432807922363, + "logits/rejected": -2.3518214225769043, + "logps/chosen": -18.382400512695312, + "logps/rejected": -447.8953552246094, + "logps_avg/chosen": -0.09862877428531647, + "logps_avg/rejected": -2.4856934547424316, + "loss": 0.0992, + "losses_ref": -0.00043182895751670003, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4360, + "u": -3.5792031288146973, + "weight": 0.056764155626297 + }, + { + "diff_generated": -27.943578720092773, + "epoch": 1.4160725858716785, + "grad_norm": 2.4124447556956183, + "learning_rate": 5.075524332539736e-07, + "logits/chosen": -2.3231358528137207, + "logits/rejected": -2.263293504714966, + "logps/chosen": -17.485557556152344, + "logps/rejected": -466.57318115234375, + "logps_avg/chosen": -0.10093804448843002, + "logps_avg/rejected": -2.7943577766418457, + "loss": 0.1014, + "losses_ref": -0.0006617440958507359, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4370, + "u": -3.670496702194214, + "weight": 0.03214912861585617 + }, + { + "diff_generated": -26.72478675842285, + "epoch": 1.4193130265716136, + "grad_norm": 2.6571784933053384, + "learning_rate": 5.060990085045432e-07, + "logits/chosen": -2.333707332611084, + "logits/rejected": -2.3314852714538574, + "logps/chosen": -17.644012451171875, + "logps/rejected": -456.2490234375, + "logps_avg/chosen": -0.1047801747918129, + "logps_avg/rejected": -2.6724789142608643, + "loss": 0.103, + "losses_ref": -0.0011025893036276102, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4380, + "u": -3.668222427368164, + "weight": 0.032760731875896454 + }, + { + "diff_generated": -25.63039207458496, + "epoch": 1.422553467271549, + "grad_norm": 2.528344636368985, + "learning_rate": 5.046440753707077e-07, + "logits/chosen": -2.4096553325653076, + "logits/rejected": -2.361562728881836, + "logps/chosen": -16.573205947875977, + "logps/rejected": -436.4747009277344, + "logps_avg/chosen": -0.08926139771938324, + "logps_avg/rejected": -2.5630393028259277, + "loss": 0.1012, + "losses_ref": -0.000798130058683455, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4390, + "u": -3.575213670730591, + "weight": 0.05734226852655411 + }, + { + "diff_generated": -25.784008026123047, + "epoch": 1.425793907971484, + "grad_norm": 2.449284288470074, + "learning_rate": 5.031876545369054e-07, + "logits/chosen": -2.3840537071228027, + "logits/rejected": -2.3444948196411133, + "logps/chosen": -18.734729766845703, + "logps/rejected": -443.230712890625, + "logps_avg/chosen": -0.10571378469467163, + "logps_avg/rejected": -2.5784008502960205, + "loss": 0.1035, + "losses_ref": -0.0009342956473119557, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4400, + "u": -3.6010608673095703, + "weight": 0.051356326788663864 + }, + { + "diff_generated": -24.286157608032227, + "epoch": 1.4290343486714194, + "grad_norm": 2.78291150567027, + "learning_rate": 5.017297667087257e-07, + "logits/chosen": -2.376237392425537, + "logits/rejected": -2.360668897628784, + "logps/chosen": -19.048891067504883, + "logps/rejected": -441.42730712890625, + "logps_avg/chosen": -0.10474354028701782, + "logps_avg/rejected": -2.4286153316497803, + "loss": 0.1025, + "losses_ref": -0.001150609226897359, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4410, + "u": -3.6455307006835938, + "weight": 0.03921959549188614 + }, + { + "diff_generated": -25.352760314941406, + "epoch": 1.4322747893713546, + "grad_norm": 3.0038648383145907, + "learning_rate": 5.002704326126135e-07, + "logits/chosen": -2.4247918128967285, + "logits/rejected": -2.381690502166748, + "logps/chosen": -19.50627326965332, + "logps/rejected": -454.374267578125, + "logps_avg/chosen": -0.10728516429662704, + "logps_avg/rejected": -2.535275936126709, + "loss": 0.1041, + "losses_ref": -0.0011060578981414437, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4420, + "u": -3.572949171066284, + "weight": 0.05791778117418289 + }, + { + "diff_generated": -23.273242950439453, + "epoch": 1.4355152300712897, + "grad_norm": 2.8747090374081736, + "learning_rate": 4.988096729955751e-07, + "logits/chosen": -2.4148807525634766, + "logits/rejected": -2.405785083770752, + "logps/chosen": -16.78266143798828, + "logps/rejected": -423.6563415527344, + "logps_avg/chosen": -0.09588910639286041, + "logps_avg/rejected": -2.327324390411377, + "loss": 0.1025, + "losses_ref": -0.000644886982627213, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4430, + "u": -3.52955961227417, + "weight": 0.06961119174957275 + }, + { + "diff_generated": -27.36212158203125, + "epoch": 1.4387556707712248, + "grad_norm": 2.5997624366455425, + "learning_rate": 4.97347508624883e-07, + "logits/chosen": -2.425786256790161, + "logits/rejected": -2.3510758876800537, + "logps/chosen": -16.762413024902344, + "logps/rejected": -475.9835510253906, + "logps_avg/chosen": -0.09029469639062881, + "logps_avg/rejected": -2.7362122535705566, + "loss": 0.0975, + "losses_ref": -0.0004997922806069255, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4440, + "u": -3.481595993041992, + "weight": 0.08191341161727905 + }, + { + "diff_generated": -26.46923828125, + "epoch": 1.4419961114711601, + "grad_norm": 2.4993217583552427, + "learning_rate": 4.958839602877809e-07, + "logits/chosen": -2.3639867305755615, + "logits/rejected": -2.367396116256714, + "logps/chosen": -18.619909286499023, + "logps/rejected": -472.052734375, + "logps_avg/chosen": -0.11092057079076767, + "logps_avg/rejected": -2.646923542022705, + "loss": 0.1025, + "losses_ref": -0.000601834908593446, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4450, + "u": -3.649272918701172, + "weight": 0.03825830668210983 + }, + { + "diff_generated": -26.878698348999023, + "epoch": 1.4452365521710953, + "grad_norm": 2.55277677854579, + "learning_rate": 4.944190487911878e-07, + "logits/chosen": -2.3663926124572754, + "logits/rejected": -2.350630760192871, + "logps/chosen": -17.180818557739258, + "logps/rejected": -497.48370361328125, + "logps_avg/chosen": -0.09979522228240967, + "logps_avg/rejected": -2.6878700256347656, + "loss": 0.1019, + "losses_ref": -0.0002570479118730873, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4460, + "u": -3.5531105995178223, + "weight": 0.06281425058841705 + }, + { + "diff_generated": -27.2506103515625, + "epoch": 1.4484769928710304, + "grad_norm": 2.534264307758894, + "learning_rate": 4.929527949614025e-07, + "logits/chosen": -2.384092330932617, + "logits/rejected": -2.297410249710083, + "logps/chosen": -19.391942977905273, + "logps/rejected": -457.280517578125, + "logps_avg/chosen": -0.11064749956130981, + "logps_avg/rejected": -2.7250614166259766, + "loss": 0.1026, + "losses_ref": -0.0014421206433326006, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4470, + "u": -3.645411729812622, + "weight": 0.03970836102962494 + }, + { + "diff_generated": -25.837894439697266, + "epoch": 1.4517174335709657, + "grad_norm": 2.5956162633184574, + "learning_rate": 4.914852196438077e-07, + "logits/chosen": -2.332778215408325, + "logits/rejected": -2.330631971359253, + "logps/chosen": -17.86171531677246, + "logps/rejected": -464.32635498046875, + "logps_avg/chosen": -0.10497362911701202, + "logps_avg/rejected": -2.583789348602295, + "loss": 0.1014, + "losses_ref": -0.0013370258966460824, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4480, + "u": -3.6205012798309326, + "weight": 0.04628719761967659 + }, + { + "diff_generated": -26.748376846313477, + "epoch": 1.4549578742709008, + "grad_norm": 2.6545092363012954, + "learning_rate": 4.900163437025727e-07, + "logits/chosen": -2.3440780639648438, + "logits/rejected": -2.3486673831939697, + "logps/chosen": -17.950197219848633, + "logps/rejected": -524.4556884765625, + "logps_avg/chosen": -0.10626323521137238, + "logps_avg/rejected": -2.674837589263916, + "loss": 0.1049, + "losses_ref": -0.0006623517838306725, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4490, + "u": -3.6240296363830566, + "weight": 0.04461196810007095 + }, + { + "diff_generated": -27.1101016998291, + "epoch": 1.458198314970836, + "grad_norm": 2.75990280679827, + "learning_rate": 4.885461880203582e-07, + "logits/chosen": -2.3479933738708496, + "logits/rejected": -2.3495919704437256, + "logps/chosen": -17.832767486572266, + "logps/rejected": -487.9822692871094, + "logps_avg/chosen": -0.10218171775341034, + "logps_avg/rejected": -2.711009979248047, + "loss": 0.1004, + "losses_ref": -0.0007707371842116117, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4500, + "u": -3.5700430870056152, + "weight": 0.05738046020269394 + }, + { + "diff_generated": -25.453678131103516, + "epoch": 1.4614387556707713, + "grad_norm": 2.4597411297031435, + "learning_rate": 4.870747734980186e-07, + "logits/chosen": -2.3789215087890625, + "logits/rejected": -2.377337694168091, + "logps/chosen": -16.965253829956055, + "logps/rejected": -483.447265625, + "logps_avg/chosen": -0.09473135322332382, + "logps_avg/rejected": -2.545367956161499, + "loss": 0.1016, + "losses_ref": -0.00035776724689640105, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4510, + "u": -3.625748872756958, + "weight": 0.0441717728972435 + }, + { + "diff_generated": -26.31353187561035, + "epoch": 1.4646791963707064, + "grad_norm": 2.708017657370048, + "learning_rate": 4.856021210543043e-07, + "logits/chosen": -2.306912660598755, + "logits/rejected": -2.295328140258789, + "logps/chosen": -17.95050811767578, + "logps/rejected": -469.47222900390625, + "logps_avg/chosen": -0.10379139333963394, + "logps_avg/rejected": -2.6313533782958984, + "loss": 0.1018, + "losses_ref": -0.0002655963471625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4520, + "u": -3.3879246711730957, + "weight": 0.10659865289926529 + }, + { + "diff_generated": -27.170995712280273, + "epoch": 1.4679196370706415, + "grad_norm": 2.5677323619140227, + "learning_rate": 4.841282516255653e-07, + "logits/chosen": -2.42441987991333, + "logits/rejected": -2.3640689849853516, + "logps/chosen": -19.41526222229004, + "logps/rejected": -499.07623291015625, + "logps_avg/chosen": -0.10374051332473755, + "logps_avg/rejected": -2.717099666595459, + "loss": 0.1033, + "losses_ref": -0.0010943252127617598, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4530, + "u": -3.689488649368286, + "weight": 0.026894647628068924 + }, + { + "diff_generated": -27.157360076904297, + "epoch": 1.471160077770577, + "grad_norm": 2.7424484198068413, + "learning_rate": 4.826531861654537e-07, + "logits/chosen": -2.3322556018829346, + "logits/rejected": -2.292630910873413, + "logps/chosen": -19.540157318115234, + "logps/rejected": -459.62542724609375, + "logps_avg/chosen": -0.10732056945562363, + "logps_avg/rejected": -2.715735912322998, + "loss": 0.1019, + "losses_ref": -0.001661944785155356, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4540, + "u": -3.644774913787842, + "weight": 0.0400865264236927 + }, + { + "diff_generated": -26.436452865600586, + "epoch": 1.474400518470512, + "grad_norm": 2.55585976008871, + "learning_rate": 4.811769456446243e-07, + "logits/chosen": -2.3906633853912354, + "logits/rejected": -2.3819384574890137, + "logps/chosen": -17.60432243347168, + "logps/rejected": -491.45013427734375, + "logps_avg/chosen": -0.09582848846912384, + "logps_avg/rejected": -2.6436455249786377, + "loss": 0.1, + "losses_ref": -0.0004539464716799557, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4550, + "u": -3.646775007247925, + "weight": 0.03811759501695633 + }, + { + "diff_generated": -25.415651321411133, + "epoch": 1.4776409591704471, + "grad_norm": 2.64179745579947, + "learning_rate": 4.796995510504384e-07, + "logits/chosen": -2.331956386566162, + "logits/rejected": -2.3741514682769775, + "logps/chosen": -16.415557861328125, + "logps/rejected": -496.5760803222656, + "logps_avg/chosen": -0.09389199316501617, + "logps_avg/rejected": -2.541565418243408, + "loss": 0.1042, + "losses_ref": -0.0003689295845106244, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4560, + "u": -3.502955198287964, + "weight": 0.07547652721405029 + }, + { + "diff_generated": -25.140544891357422, + "epoch": 1.4808813998703823, + "grad_norm": 2.627265550508191, + "learning_rate": 4.782210233866637e-07, + "logits/chosen": -2.3608133792877197, + "logits/rejected": -2.3634140491485596, + "logps/chosen": -17.05031967163086, + "logps/rejected": -443.302978515625, + "logps_avg/chosen": -0.0978686511516571, + "logps_avg/rejected": -2.514054536819458, + "loss": 0.0987, + "losses_ref": -0.0007118280627764761, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4570, + "u": -3.5265719890594482, + "weight": 0.0697406679391861 + }, + { + "diff_generated": -27.18475914001465, + "epoch": 1.4841218405703176, + "grad_norm": 2.7690753607986145, + "learning_rate": 4.76741383673177e-07, + "logits/chosen": -2.402216911315918, + "logits/rejected": -2.374706745147705, + "logps/chosen": -17.55205726623535, + "logps/rejected": -485.1881408691406, + "logps_avg/chosen": -0.10320468991994858, + "logps_avg/rejected": -2.7184762954711914, + "loss": 0.1031, + "losses_ref": -0.0003614898887462914, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4580, + "u": -3.5736613273620605, + "weight": 0.05670735239982605 + }, + { + "diff_generated": -27.21875, + "epoch": 1.4873622812702527, + "grad_norm": 3.0064926638398144, + "learning_rate": 4.752606529456648e-07, + "logits/chosen": -2.3634047508239746, + "logits/rejected": -2.363259792327881, + "logps/chosen": -15.848172187805176, + "logps/rejected": -457.4466857910156, + "logps_avg/chosen": -0.09651388227939606, + "logps_avg/rejected": -2.721874952316284, + "loss": 0.102, + "losses_ref": -0.0007729289936833084, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4590, + "u": -3.450721025466919, + "weight": 0.08863823860883713 + }, + { + "diff_generated": -26.816186904907227, + "epoch": 1.490602721970188, + "grad_norm": 2.536867119938274, + "learning_rate": 4.7377885225532396e-07, + "logits/chosen": -2.4007298946380615, + "logits/rejected": -2.3915462493896484, + "logps/chosen": -16.55607795715332, + "logps/rejected": -489.6505432128906, + "logps_avg/chosen": -0.09947662055492401, + "logps_avg/rejected": -2.6816184520721436, + "loss": 0.1005, + "losses_ref": -0.00034102320205420256, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4600, + "u": -3.64630389213562, + "weight": 0.037940699607133865 + }, + { + "diff_generated": -26.589834213256836, + "epoch": 1.4938431626701232, + "grad_norm": 2.5753359062753005, + "learning_rate": 4.722960026685633e-07, + "logits/chosen": -2.346069812774658, + "logits/rejected": -2.3274240493774414, + "logps/chosen": -16.945472717285156, + "logps/rejected": -475.73468017578125, + "logps_avg/chosen": -0.0991564393043518, + "logps_avg/rejected": -2.6589832305908203, + "loss": 0.0997, + "losses_ref": -0.0006340649561025202, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4610, + "u": -3.5988471508026123, + "weight": 0.050868142396211624 + }, + { + "diff_generated": -27.050867080688477, + "epoch": 1.4970836033700583, + "grad_norm": 2.6721082376929246, + "learning_rate": 4.7081212526670267e-07, + "logits/chosen": -2.3203649520874023, + "logits/rejected": -2.274970054626465, + "logps/chosen": -20.65582275390625, + "logps/rejected": -442.66790771484375, + "logps_avg/chosen": -0.1143292635679245, + "logps_avg/rejected": -2.7050864696502686, + "loss": 0.1019, + "losses_ref": -0.0003582318313419819, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4620, + "u": -3.595930814743042, + "weight": 0.05042849853634834 + }, + { + "diff_generated": -28.128681182861328, + "epoch": 1.5003240440699934, + "grad_norm": 3.0838171392134517, + "learning_rate": 4.693272411456753e-07, + "logits/chosen": -2.4106602668762207, + "logits/rejected": -2.3465323448181152, + "logps/chosen": -18.15391731262207, + "logps/rejected": -472.01849365234375, + "logps_avg/chosen": -0.09831465780735016, + "logps_avg/rejected": -2.8128678798675537, + "loss": 0.1016, + "losses_ref": -0.0006209076964296401, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4630, + "u": -3.6010029315948486, + "weight": 0.050917524844408035 + }, + { + "diff_generated": -25.924785614013672, + "epoch": 1.5035644847699285, + "grad_norm": 2.777284153501298, + "learning_rate": 4.6784137141572566e-07, + "logits/chosen": -2.376234292984009, + "logits/rejected": -2.3401947021484375, + "logps/chosen": -17.53774070739746, + "logps/rejected": -451.0185546875, + "logps_avg/chosen": -0.09319780766963959, + "logps_avg/rejected": -2.5924789905548096, + "loss": 0.099, + "losses_ref": -0.0008334851590916514, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4640, + "u": -3.57263445854187, + "weight": 0.05742005258798599 + }, + { + "diff_generated": -26.946765899658203, + "epoch": 1.5068049254698639, + "grad_norm": 2.8874402391178284, + "learning_rate": 4.6635453720111096e-07, + "logits/chosen": -2.3638510704040527, + "logits/rejected": -2.33133864402771, + "logps/chosen": -17.957733154296875, + "logps/rejected": -478.43609619140625, + "logps_avg/chosen": -0.10447684675455093, + "logps_avg/rejected": -2.6946768760681152, + "loss": 0.0975, + "losses_ref": -0.0003875306574627757, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4650, + "u": -3.502568006515503, + "weight": 0.07550157606601715 + }, + { + "diff_generated": -24.884929656982422, + "epoch": 1.5100453661697992, + "grad_norm": 2.6861817231220577, + "learning_rate": 4.6486675963980014e-07, + "logits/chosen": -2.404689073562622, + "logits/rejected": -2.452115535736084, + "logps/chosen": -18.500164031982422, + "logps/rejected": -459.638916015625, + "logps_avg/chosen": -0.10741446167230606, + "logps_avg/rejected": -2.4884932041168213, + "loss": 0.101, + "losses_ref": -0.001469132723286748, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4660, + "u": -3.669245481491089, + "weight": 0.033659983426332474 + }, + { + "diff_generated": -27.58524513244629, + "epoch": 1.5132858068697344, + "grad_norm": 2.6204230175484136, + "learning_rate": 4.633780598831733e-07, + "logits/chosen": -2.416443347930908, + "logits/rejected": -2.349015712738037, + "logps/chosen": -20.454132080078125, + "logps/rejected": -504.7102966308594, + "logps_avg/chosen": -0.11305411159992218, + "logps_avg/rejected": -2.7585246562957764, + "loss": 0.0985, + "losses_ref": -0.0005273165879771113, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4670, + "u": -3.5723938941955566, + "weight": 0.056988675147295 + }, + { + "diff_generated": -28.445322036743164, + "epoch": 1.5165262475696695, + "grad_norm": 2.768496657784131, + "learning_rate": 4.6188845909572143e-07, + "logits/chosen": -2.3770623207092285, + "logits/rejected": -2.3327393531799316, + "logps/chosen": -16.653573989868164, + "logps/rejected": -527.7097778320312, + "logps_avg/chosen": -0.09490346163511276, + "logps_avg/rejected": -2.8445322513580322, + "loss": 0.102, + "losses_ref": -0.00033215107396245003, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4680, + "u": -3.5773749351501465, + "weight": 0.056677620857954025 + }, + { + "diff_generated": -26.633886337280273, + "epoch": 1.5197666882696046, + "grad_norm": 2.4800042440884926, + "learning_rate": 4.603979784547451e-07, + "logits/chosen": -2.334624767303467, + "logits/rejected": -2.3243727684020996, + "logps/chosen": -18.37417984008789, + "logps/rejected": -509.787353515625, + "logps_avg/chosen": -0.10755424201488495, + "logps_avg/rejected": -2.663388729095459, + "loss": 0.1024, + "losses_ref": -0.0005498843383975327, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4690, + "u": -3.5532336235046387, + "weight": 0.06324771791696548 + }, + { + "diff_generated": -29.298620223999023, + "epoch": 1.5230071289695397, + "grad_norm": 2.706514444080492, + "learning_rate": 4.5890663915005364e-07, + "logits/chosen": -2.3914692401885986, + "logits/rejected": -2.373861789703369, + "logps/chosen": -16.200611114501953, + "logps/rejected": -524.3837890625, + "logps_avg/chosen": -0.09766872227191925, + "logps_avg/rejected": -2.9298622608184814, + "loss": 0.1018, + "losses_ref": -0.0002807065029628575, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4700, + "u": -3.6486129760742188, + "weight": 0.037851959466934204 + }, + { + "diff_generated": -26.626922607421875, + "epoch": 1.526247569669475, + "grad_norm": 2.7443458990985814, + "learning_rate": 4.574144623836637e-07, + "logits/chosen": -2.3963630199432373, + "logits/rejected": -2.3771467208862305, + "logps/chosen": -16.957136154174805, + "logps/rejected": -482.7359313964844, + "logps_avg/chosen": -0.09588057547807693, + "logps_avg/rejected": -2.6626925468444824, + "loss": 0.1036, + "losses_ref": -0.0004056665929965675, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4710, + "u": -3.524570941925049, + "weight": 0.06927379965782166 + }, + { + "diff_generated": -28.789709091186523, + "epoch": 1.5294880103694104, + "grad_norm": 2.419049898861885, + "learning_rate": 4.5592146936949785e-07, + "logits/chosen": -2.3638224601745605, + "logits/rejected": -2.3208470344543457, + "logps/chosen": -20.11721420288086, + "logps/rejected": -505.06396484375, + "logps_avg/chosen": -0.10543084144592285, + "logps_avg/rejected": -2.8789710998535156, + "loss": 0.0983, + "losses_ref": -0.0007130955345928669, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4720, + "u": -3.5926082134246826, + "weight": 0.050989795476198196 + }, + { + "diff_generated": -26.855182647705078, + "epoch": 1.5327284510693455, + "grad_norm": 2.673613793203479, + "learning_rate": 4.544276813330835e-07, + "logits/chosen": -2.381993055343628, + "logits/rejected": -2.3274385929107666, + "logps/chosen": -17.30748176574707, + "logps/rejected": -472.8213806152344, + "logps_avg/chosen": -0.09686298668384552, + "logps_avg/rejected": -2.6855180263519287, + "loss": 0.1028, + "losses_ref": -0.0003440210421103984, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4730, + "u": -3.602659225463867, + "weight": 0.050429295748472214 + }, + { + "diff_generated": -26.854650497436523, + "epoch": 1.5359688917692806, + "grad_norm": 2.6334580009309163, + "learning_rate": 4.529331195112501e-07, + "logits/chosen": -2.3390755653381348, + "logits/rejected": -2.3454232215881348, + "logps/chosen": -17.794170379638672, + "logps/rejected": -506.09326171875, + "logps_avg/chosen": -0.10273711383342743, + "logps_avg/rejected": -2.6854653358459473, + "loss": 0.1006, + "losses_ref": -0.0007093682652339339, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4740, + "u": -3.6918129920959473, + "weight": 0.02626415155827999 + }, + { + "diff_generated": -28.0623779296875, + "epoch": 1.5392093324692158, + "grad_norm": 2.4259277405372384, + "learning_rate": 4.5143780515182833e-07, + "logits/chosen": -2.377840757369995, + "logits/rejected": -2.3175711631774902, + "logps/chosen": -20.87468910217285, + "logps/rejected": -508.7308654785156, + "logps_avg/chosen": -0.11975955963134766, + "logps_avg/rejected": -2.8062376976013184, + "loss": 0.1011, + "losses_ref": -0.0005875771166756749, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4750, + "u": -3.7144553661346436, + "weight": 0.01955435611307621 + }, + { + "diff_generated": -26.727203369140625, + "epoch": 1.5424497731691509, + "grad_norm": 2.6889369988231215, + "learning_rate": 4.499417595133471e-07, + "logits/chosen": -2.3149609565734863, + "logits/rejected": -2.2860519886016846, + "logps/chosen": -17.685163497924805, + "logps/rejected": -484.66571044921875, + "logps_avg/chosen": -0.10537519305944443, + "logps_avg/rejected": -2.672720432281494, + "loss": 0.1016, + "losses_ref": -0.0006560144247487187, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4760, + "u": -3.641984224319458, + "weight": 0.03838873654603958 + }, + { + "diff_generated": -27.540283203125, + "epoch": 1.5456902138690862, + "grad_norm": 2.6264864150084257, + "learning_rate": 4.4844500386473207e-07, + "logits/chosen": -2.373394012451172, + "logits/rejected": -2.3297057151794434, + "logps/chosen": -19.133085250854492, + "logps/rejected": -499.7777404785156, + "logps_avg/chosen": -0.10392173379659653, + "logps_avg/rejected": -2.754028558731079, + "loss": 0.1024, + "losses_ref": -0.0007838995079509914, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4770, + "u": -3.648332118988037, + "weight": 0.038741402328014374 + }, + { + "diff_generated": -27.40139389038086, + "epoch": 1.5489306545690213, + "grad_norm": 2.8154896252136044, + "learning_rate": 4.4694755948500276e-07, + "logits/chosen": -2.3506534099578857, + "logits/rejected": -2.353341579437256, + "logps/chosen": -14.5455322265625, + "logps/rejected": -509.96673583984375, + "logps_avg/chosen": -0.09225939214229584, + "logps_avg/rejected": -2.7401394844055176, + "loss": 0.0999, + "losses_ref": -0.0001783154293661937, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4780, + "u": -3.5768604278564453, + "weight": 0.056461431086063385 + }, + { + "diff_generated": -26.885112762451172, + "epoch": 1.5521710952689567, + "grad_norm": 2.5855058383683995, + "learning_rate": 4.4544944766297037e-07, + "logits/chosen": -2.395085573196411, + "logits/rejected": -2.425724506378174, + "logps/chosen": -16.19194793701172, + "logps/rejected": -509.50238037109375, + "logps_avg/chosen": -0.09882526099681854, + "logps_avg/rejected": -2.688511371612549, + "loss": 0.0985, + "losses_ref": -0.0004003371577709913, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4790, + "u": -3.7154109477996826, + "weight": 0.019246309995651245 + }, + { + "diff_generated": -26.92526626586914, + "epoch": 1.5554115359688918, + "grad_norm": 2.646503604424476, + "learning_rate": 4.439506896969348e-07, + "logits/chosen": -2.295779228210449, + "logits/rejected": -2.2422423362731934, + "logps/chosen": -16.581674575805664, + "logps/rejected": -458.66778564453125, + "logps_avg/chosen": -0.09982822835445404, + "logps_avg/rejected": -2.692526340484619, + "loss": 0.0987, + "losses_ref": -0.0005184352630749345, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4800, + "u": -3.4579899311065674, + "weight": 0.08816748857498169 + }, + { + "diff_generated": -27.13262939453125, + "epoch": 1.558651976668827, + "grad_norm": 2.857413374977336, + "learning_rate": 4.4245130689438206e-07, + "logits/chosen": -2.313910961151123, + "logits/rejected": -2.255359172821045, + "logps/chosen": -18.43231773376465, + "logps/rejected": -488.8936462402344, + "logps_avg/chosen": -0.1034407764673233, + "logps_avg/rejected": -2.7132630348205566, + "loss": 0.1024, + "losses_ref": -0.0005251936381682754, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4810, + "u": -3.5007331371307373, + "weight": 0.07584112137556076 + }, + { + "diff_generated": -27.614675521850586, + "epoch": 1.561892417368762, + "grad_norm": 2.7006985101228276, + "learning_rate": 4.4095132057168145e-07, + "logits/chosen": -2.3693792819976807, + "logits/rejected": -2.308382511138916, + "logps/chosen": -19.743377685546875, + "logps/rejected": -480.5498962402344, + "logps_avg/chosen": -0.10749039798974991, + "logps_avg/rejected": -2.7614681720733643, + "loss": 0.0994, + "losses_ref": -0.0005830880254507065, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4820, + "u": -3.6227993965148926, + "weight": 0.04462616890668869 + }, + { + "diff_generated": -27.29495620727539, + "epoch": 1.5651328580686974, + "grad_norm": 2.7541811438721946, + "learning_rate": 4.3945075205378215e-07, + "logits/chosen": -2.308394193649292, + "logits/rejected": -2.331428050994873, + "logps/chosen": -16.660200119018555, + "logps/rejected": -496.21783447265625, + "logps_avg/chosen": -0.10463666915893555, + "logps_avg/rejected": -2.7294955253601074, + "loss": 0.103, + "losses_ref": -0.0001776438148226589, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4830, + "u": -3.6182701587677, + "weight": 0.043968915939331055 + }, + { + "diff_generated": -26.718585968017578, + "epoch": 1.5683732987686325, + "grad_norm": 2.619996271538521, + "learning_rate": 4.379496226739104e-07, + "logits/chosen": -2.3933868408203125, + "logits/rejected": -2.354672431945801, + "logps/chosen": -17.114768981933594, + "logps/rejected": -501.43194580078125, + "logps_avg/chosen": -0.09793446958065033, + "logps_avg/rejected": -2.671858310699463, + "loss": 0.1008, + "losses_ref": -0.0003559653414413333, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4840, + "u": -3.5752758979797363, + "weight": 0.056691162288188934 + }, + { + "diff_generated": -25.645092010498047, + "epoch": 1.5716137394685679, + "grad_norm": 2.7091730491201695, + "learning_rate": 4.364479537732663e-07, + "logits/chosen": -2.3813581466674805, + "logits/rejected": -2.3487915992736816, + "logps/chosen": -18.565942764282227, + "logps/rejected": -461.3694763183594, + "logps_avg/chosen": -0.10648205131292343, + "logps_avg/rejected": -2.564509391784668, + "loss": 0.1029, + "losses_ref": -0.00037918920861557126, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4850, + "u": -3.614135265350342, + "weight": 0.04421050101518631 + }, + { + "diff_generated": -26.901065826416016, + "epoch": 1.574854180168503, + "grad_norm": 2.637242307433007, + "learning_rate": 4.349457667007197e-07, + "logits/chosen": -2.3863844871520996, + "logits/rejected": -2.3578927516937256, + "logps/chosen": -19.903520584106445, + "logps/rejected": -504.9957580566406, + "logps_avg/chosen": -0.10746286809444427, + "logps_avg/rejected": -2.6901066303253174, + "loss": 0.0998, + "losses_ref": -0.0007895805174484849, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4860, + "u": -3.7391810417175293, + "weight": 0.01379163283854723 + }, + { + "diff_generated": -28.473886489868164, + "epoch": 1.578094620868438, + "grad_norm": 2.533652732727638, + "learning_rate": 4.334430828125074e-07, + "logits/chosen": -2.3725333213806152, + "logits/rejected": -2.340877056121826, + "logps/chosen": -18.550180435180664, + "logps/rejected": -516.2704467773438, + "logps_avg/chosen": -0.10123230516910553, + "logps_avg/rejected": -2.847388744354248, + "loss": 0.0984, + "losses_ref": -0.00029261037707328796, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4870, + "u": -3.530827283859253, + "weight": 0.06912322342395782 + }, + { + "diff_generated": -25.443593978881836, + "epoch": 1.5813350615683732, + "grad_norm": 2.616320595845464, + "learning_rate": 4.319399234719297e-07, + "logits/chosen": -2.3242273330688477, + "logits/rejected": -2.3418664932250977, + "logps/chosen": -15.317204475402832, + "logps/rejected": -483.1455993652344, + "logps_avg/chosen": -0.09046544134616852, + "logps_avg/rejected": -2.5443596839904785, + "loss": 0.0966, + "losses_ref": -0.00041991929174400866, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4880, + "u": -3.4812328815460205, + "weight": 0.08183874189853668 + }, + { + "diff_generated": -28.15468406677246, + "epoch": 1.5845755022683083, + "grad_norm": 2.8396134964728246, + "learning_rate": 4.3043631004904563e-07, + "logits/chosen": -2.3555305004119873, + "logits/rejected": -2.3145227432250977, + "logps/chosen": -15.45680046081543, + "logps/rejected": -495.0728454589844, + "logps_avg/chosen": -0.08831733465194702, + "logps_avg/rejected": -2.8154685497283936, + "loss": 0.0997, + "losses_ref": -0.00037405334296636283, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4890, + "u": -3.670712947845459, + "weight": 0.031772710382938385 + }, + { + "diff_generated": -27.696598052978516, + "epoch": 1.5878159429682437, + "grad_norm": 2.581755430760835, + "learning_rate": 4.2893226392037024e-07, + "logits/chosen": -2.396336555480957, + "logits/rejected": -2.33992075920105, + "logps/chosen": -19.942581176757812, + "logps/rejected": -473.76385498046875, + "logps_avg/chosen": -0.10936687886714935, + "logps_avg/rejected": -2.7696597576141357, + "loss": 0.1013, + "losses_ref": -0.00029218022245913744, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4900, + "u": -3.7187163829803467, + "weight": 0.01911265216767788 + }, + { + "diff_generated": -26.277267456054688, + "epoch": 1.591056383668179, + "grad_norm": 2.6896412858160237, + "learning_rate": 4.2742780646857015e-07, + "logits/chosen": -2.382976770401001, + "logits/rejected": -2.3719167709350586, + "logps/chosen": -16.212665557861328, + "logps/rejected": -494.34515380859375, + "logps_avg/chosen": -0.09449265152215958, + "logps_avg/rejected": -2.6277265548706055, + "loss": 0.0998, + "losses_ref": -0.00017206800112035125, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4910, + "u": -3.6703362464904785, + "weight": 0.03144761547446251 + }, + { + "diff_generated": -25.981069564819336, + "epoch": 1.5942968243681142, + "grad_norm": 2.6831933594658275, + "learning_rate": 4.2592295908215953e-07, + "logits/chosen": -2.353114128112793, + "logits/rejected": -2.336235284805298, + "logps/chosen": -19.42054557800293, + "logps/rejected": -469.2264709472656, + "logps_avg/chosen": -0.1102244108915329, + "logps_avg/rejected": -2.598106861114502, + "loss": 0.1045, + "losses_ref": -0.0005944965523667634, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4920, + "u": -3.5998847484588623, + "weight": 0.05080182105302811 + }, + { + "diff_generated": -28.747379302978516, + "epoch": 1.5975372650680493, + "grad_norm": 2.625533076941181, + "learning_rate": 4.2441774315519645e-07, + "logits/chosen": -2.389402389526367, + "logits/rejected": -2.3536899089813232, + "logps/chosen": -17.513187408447266, + "logps/rejected": -512.0242919921875, + "logps_avg/chosen": -0.10442149639129639, + "logps_avg/rejected": -2.8747377395629883, + "loss": 0.0989, + "losses_ref": -0.0005745574599131942, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4930, + "u": -3.644648313522339, + "weight": 0.038289912045001984 + }, + { + "diff_generated": -27.752788543701172, + "epoch": 1.6007777057679844, + "grad_norm": 2.492184484923244, + "learning_rate": 4.229121800869781e-07, + "logits/chosen": -2.3942487239837646, + "logits/rejected": -2.342210292816162, + "logps/chosen": -16.346881866455078, + "logps/rejected": -507.65704345703125, + "logps_avg/chosen": -0.09216523170471191, + "logps_avg/rejected": -2.7752792835235596, + "loss": 0.1007, + "losses_ref": -0.0008915389771573246, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4940, + "u": -3.6660046577453613, + "weight": 0.032541196793317795 + }, + { + "diff_generated": -27.836124420166016, + "epoch": 1.6040181464679195, + "grad_norm": 2.6417464893100777, + "learning_rate": 4.2140629128173703e-07, + "logits/chosen": -2.4343490600585938, + "logits/rejected": -2.3680715560913086, + "logps/chosen": -15.843406677246094, + "logps/rejected": -476.01019287109375, + "logps_avg/chosen": -0.0944889634847641, + "logps_avg/rejected": -2.7836124897003174, + "loss": 0.1008, + "losses_ref": -0.0009327017469331622, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4950, + "u": -3.599611282348633, + "weight": 0.05153653770685196 + }, + { + "diff_generated": -26.701419830322266, + "epoch": 1.6072585871678549, + "grad_norm": 2.6680702408063492, + "learning_rate": 4.199000981483368e-07, + "logits/chosen": -2.406346559524536, + "logits/rejected": -2.365767002105713, + "logps/chosen": -21.22658348083496, + "logps/rejected": -499.58746337890625, + "logps_avg/chosen": -0.11067505925893784, + "logps_avg/rejected": -2.6701416969299316, + "loss": 0.1017, + "losses_ref": -0.001109774224460125, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4960, + "u": -3.6634132862091064, + "weight": 0.032880425453186035 + }, + { + "diff_generated": -29.07406234741211, + "epoch": 1.61049902786779, + "grad_norm": 2.2941148581571915, + "learning_rate": 4.183936220999676e-07, + "logits/chosen": -2.3704891204833984, + "logits/rejected": -2.2919983863830566, + "logps/chosen": -18.74502182006836, + "logps/rejected": -533.35595703125, + "logps_avg/chosen": -0.10289647430181503, + "logps_avg/rejected": -2.9074063301086426, + "loss": 0.101, + "losses_ref": -0.0003367816680110991, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4970, + "u": -3.599052906036377, + "weight": 0.05049259588122368 + }, + { + "diff_generated": -27.42887306213379, + "epoch": 1.6137394685677253, + "grad_norm": 2.6430433694036717, + "learning_rate": 4.168868845538414e-07, + "logits/chosen": -2.381389617919922, + "logits/rejected": -2.3356552124023438, + "logps/chosen": -16.56113052368164, + "logps/rejected": -488.47802734375, + "logps_avg/chosen": -0.09255286306142807, + "logps_avg/rejected": -2.742887020111084, + "loss": 0.0985, + "losses_ref": -0.0003578144242055714, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4980, + "u": -3.6470627784729004, + "weight": 0.037930212914943695 + }, + { + "diff_generated": -25.372648239135742, + "epoch": 1.6169799092676604, + "grad_norm": 2.5942333804563944, + "learning_rate": 4.15379906930888e-07, + "logits/chosen": -2.3385634422302246, + "logits/rejected": -2.297896385192871, + "logps/chosen": -16.051712036132812, + "logps/rejected": -463.76702880859375, + "logps_avg/chosen": -0.08722599595785141, + "logps_avg/rejected": -2.537264823913574, + "loss": 0.0976, + "losses_ref": -0.0003934988344553858, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 4990, + "u": -3.364407777786255, + "weight": 0.11301133781671524 + }, + { + "diff_generated": -25.59117889404297, + "epoch": 1.6202203499675956, + "grad_norm": 2.6715693330169716, + "learning_rate": 4.1387271065545074e-07, + "logits/chosen": -2.409432888031006, + "logits/rejected": -2.3450863361358643, + "logps/chosen": -18.909839630126953, + "logps/rejected": -421.7919921875, + "logps_avg/chosen": -0.10501371324062347, + "logps_avg/rejected": -2.5591180324554443, + "loss": 0.1023, + "losses_ref": -0.001767330220900476, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5000, + "u": -3.644735336303711, + "weight": 0.04043537750840187 + }, + { + "diff_generated": -29.76255226135254, + "epoch": 1.6234607906675307, + "grad_norm": 2.588133194181167, + "learning_rate": 4.123653171549807e-07, + "logits/chosen": -2.4070725440979004, + "logits/rejected": -2.345527172088623, + "logps/chosen": -16.648374557495117, + "logps/rejected": -498.03363037109375, + "logps_avg/chosen": -0.09451456367969513, + "logps_avg/rejected": -2.976254940032959, + "loss": 0.101, + "losses_ref": -0.0006677792407572269, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5010, + "u": -3.6961212158203125, + "weight": 0.025899719446897507 + }, + { + "diff_generated": -28.065227508544922, + "epoch": 1.626701231367466, + "grad_norm": 2.5261516177295644, + "learning_rate": 4.108577478597335e-07, + "logits/chosen": -2.3303914070129395, + "logits/rejected": -2.349775791168213, + "logps/chosen": -19.008838653564453, + "logps/rejected": -515.6997680664062, + "logps_avg/chosen": -0.1155603900551796, + "logps_avg/rejected": -2.806522846221924, + "loss": 0.1029, + "losses_ref": -0.0006258910289034247, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5020, + "u": -3.6222190856933594, + "weight": 0.044567983597517014 + }, + { + "diff_generated": -26.53865623474121, + "epoch": 1.6299416720674011, + "grad_norm": 2.84698151865833, + "learning_rate": 4.093500242024637e-07, + "logits/chosen": -2.45613169670105, + "logits/rejected": -2.3126373291015625, + "logps/chosen": -18.254779815673828, + "logps/rejected": -486.0533142089844, + "logps_avg/chosen": -0.0972185954451561, + "logps_avg/rejected": -2.6538655757904053, + "loss": 0.1012, + "losses_ref": -0.0005458712694235146, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5030, + "u": -3.574019193649292, + "weight": 0.05697847157716751 + }, + { + "diff_generated": -28.153987884521484, + "epoch": 1.6331821127673365, + "grad_norm": 2.520512810715581, + "learning_rate": 4.0784216761812044e-07, + "logits/chosen": -2.3809313774108887, + "logits/rejected": -2.272681474685669, + "logps/chosen": -17.655887603759766, + "logps/rejected": -482.1053771972656, + "logps_avg/chosen": -0.09897877275943756, + "logps_avg/rejected": -2.815398931503296, + "loss": 0.0995, + "losses_ref": -0.0003890444932039827, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5040, + "u": -3.5757968425750732, + "weight": 0.05678637698292732 + }, + { + "diff_generated": -27.308612823486328, + "epoch": 1.6364225534672716, + "grad_norm": 2.6515109042801455, + "learning_rate": 4.063341995435427e-07, + "logits/chosen": -2.3556971549987793, + "logits/rejected": -2.326655149459839, + "logps/chosen": -15.506830215454102, + "logps/rejected": -467.6351623535156, + "logps_avg/chosen": -0.09428389370441437, + "logps_avg/rejected": -2.730861186981201, + "loss": 0.0975, + "losses_ref": -0.0002866701106540859, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5050, + "u": -3.6228299140930176, + "weight": 0.044095564633607864 + }, + { + "diff_generated": -28.53106689453125, + "epoch": 1.6396629941672067, + "grad_norm": 2.9572529071312377, + "learning_rate": 4.048261414171544e-07, + "logits/chosen": -2.403313159942627, + "logits/rejected": -2.2974865436553955, + "logps/chosen": -17.453428268432617, + "logps/rejected": -519.951171875, + "logps_avg/chosen": -0.09405800700187683, + "logps_avg/rejected": -2.85310697555542, + "loss": 0.1006, + "losses_ref": -0.0006021251319907606, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5060, + "u": -3.4560837745666504, + "weight": 0.08834328502416611 + }, + { + "diff_generated": -26.335346221923828, + "epoch": 1.6429034348671419, + "grad_norm": 2.5939033940015097, + "learning_rate": 4.0331801467865967e-07, + "logits/chosen": -2.3971405029296875, + "logits/rejected": -2.3532299995422363, + "logps/chosen": -17.34989356994629, + "logps/rejected": -462.61346435546875, + "logps_avg/chosen": -0.09877597540616989, + "logps_avg/rejected": -2.6335346698760986, + "loss": 0.1011, + "losses_ref": -0.0008893858757801354, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5070, + "u": -3.5909934043884277, + "weight": 0.05130688473582268 + }, + { + "diff_generated": -26.316930770874023, + "epoch": 1.646143875567077, + "grad_norm": 2.522885112902731, + "learning_rate": 4.0180984076873833e-07, + "logits/chosen": -2.3249759674072266, + "logits/rejected": -2.3163132667541504, + "logps/chosen": -17.908002853393555, + "logps/rejected": -466.9813537597656, + "logps_avg/chosen": -0.09990663826465607, + "logps_avg/rejected": -2.631693124771118, + "loss": 0.1022, + "losses_ref": -0.0005635431734845042, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5080, + "u": -3.525578260421753, + "weight": 0.06955597549676895 + }, + { + "diff_generated": -28.45648193359375, + "epoch": 1.6493843162670123, + "grad_norm": 2.7553435256115106, + "learning_rate": 4.003016411287407e-07, + "logits/chosen": -2.3902664184570312, + "logits/rejected": -2.4012134075164795, + "logps/chosen": -17.522396087646484, + "logps/rejected": -532.9021606445312, + "logps_avg/chosen": -0.09973875433206558, + "logps_avg/rejected": -2.8456482887268066, + "loss": 0.101, + "losses_ref": -0.00033324985997751355, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5090, + "u": -3.667588472366333, + "weight": 0.03167085349559784 + }, + { + "diff_generated": -27.200836181640625, + "epoch": 1.6526247569669477, + "grad_norm": 2.494211247820919, + "learning_rate": 3.9879343720038276e-07, + "logits/chosen": -2.3775768280029297, + "logits/rejected": -2.329467296600342, + "logps/chosen": -17.555004119873047, + "logps/rejected": -490.5977478027344, + "logps_avg/chosen": -0.09881128370761871, + "logps_avg/rejected": -2.720083713531494, + "loss": 0.1012, + "losses_ref": -0.0001987409486901015, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5100, + "u": -3.624324321746826, + "weight": 0.04399067163467407 + }, + { + "diff_generated": -27.48602294921875, + "epoch": 1.6558651976668828, + "grad_norm": 2.444013794792136, + "learning_rate": 3.972852504254415e-07, + "logits/chosen": -2.327697277069092, + "logits/rejected": -2.3307526111602783, + "logps/chosen": -17.439884185791016, + "logps/rejected": -510.11419677734375, + "logps_avg/chosen": -0.09989776462316513, + "logps_avg/rejected": -2.7486021518707275, + "loss": 0.0982, + "losses_ref": -0.0003228532150387764, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5110, + "u": -3.526263475418091, + "weight": 0.06914493441581726 + }, + { + "diff_generated": -28.441503524780273, + "epoch": 1.659105638366818, + "grad_norm": 2.689868582483409, + "learning_rate": 3.9577710224545033e-07, + "logits/chosen": -2.378460645675659, + "logits/rejected": -2.34100341796875, + "logps/chosen": -19.18258285522461, + "logps/rejected": -546.9826049804688, + "logps_avg/chosen": -0.1077943816781044, + "logps_avg/rejected": -2.8441507816314697, + "loss": 0.1027, + "losses_ref": -0.000511771475430578, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5120, + "u": -3.6669769287109375, + "weight": 0.03194146603345871 + }, + { + "diff_generated": -28.624170303344727, + "epoch": 1.662346079066753, + "grad_norm": 2.4570949091067664, + "learning_rate": 3.9426901410139346e-07, + "logits/chosen": -2.4005510807037354, + "logits/rejected": -2.2961833477020264, + "logps/chosen": -20.022676467895508, + "logps/rejected": -509.7611389160156, + "logps_avg/chosen": -0.10960058122873306, + "logps_avg/rejected": -2.8624167442321777, + "loss": 0.0979, + "losses_ref": -0.000778104062192142, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5130, + "u": -3.73748779296875, + "weight": 0.013706320896744728 + }, + { + "diff_generated": -27.759662628173828, + "epoch": 1.6655865197666881, + "grad_norm": 2.5771408853614264, + "learning_rate": 3.9276100743340217e-07, + "logits/chosen": -2.474146604537964, + "logits/rejected": -2.404331684112549, + "logps/chosen": -18.846654891967773, + "logps/rejected": -517.0947265625, + "logps_avg/chosen": -0.10051999241113663, + "logps_avg/rejected": -2.775965929031372, + "loss": 0.1, + "losses_ref": -0.00044742418685927987, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5140, + "u": -3.7430527210235596, + "weight": 0.013072418980300426 + }, + { + "diff_generated": -29.991321563720703, + "epoch": 1.6688269604666235, + "grad_norm": 2.5266094791464906, + "learning_rate": 3.9125310368044877e-07, + "logits/chosen": -2.351802349090576, + "logits/rejected": -2.3035707473754883, + "logps/chosen": -17.116596221923828, + "logps/rejected": -536.9442138671875, + "logps_avg/chosen": -0.09962300956249237, + "logps_avg/rejected": -2.9991321563720703, + "loss": 0.0987, + "losses_ref": -0.0001824962382670492, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5150, + "u": -3.649186372756958, + "weight": 0.03772037476301193 + }, + { + "diff_generated": -29.356170654296875, + "epoch": 1.6720674011665586, + "grad_norm": 2.70751200307167, + "learning_rate": 3.8974532428004305e-07, + "logits/chosen": -2.336583137512207, + "logits/rejected": -2.2621965408325195, + "logps/chosen": -18.180213928222656, + "logps/rejected": -547.4981689453125, + "logps_avg/chosen": -0.09748770296573639, + "logps_avg/rejected": -2.935617208480835, + "loss": 0.0995, + "losses_ref": -0.0003386014432180673, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5160, + "u": -3.5745251178741455, + "weight": 0.05664762109518051 + }, + { + "diff_generated": -27.39190673828125, + "epoch": 1.675307841866494, + "grad_norm": 2.456643971816381, + "learning_rate": 3.8823769066792643e-07, + "logits/chosen": -2.3570380210876465, + "logits/rejected": -2.2994766235351562, + "logps/chosen": -18.147729873657227, + "logps/rejected": -480.56640625, + "logps_avg/chosen": -0.10367520898580551, + "logps_avg/rejected": -2.7391905784606934, + "loss": 0.1007, + "losses_ref": -0.0006972316186875105, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5170, + "u": -3.551652193069458, + "weight": 0.06359126418828964 + }, + { + "diff_generated": -26.773178100585938, + "epoch": 1.678548282566429, + "grad_norm": 2.6000506565970296, + "learning_rate": 3.867302242777681e-07, + "logits/chosen": -2.4094491004943848, + "logits/rejected": -2.415447235107422, + "logps/chosen": -17.810815811157227, + "logps/rejected": -493.0499572753906, + "logps_avg/chosen": -0.10398928821086884, + "logps_avg/rejected": -2.6773180961608887, + "loss": 0.101, + "losses_ref": -0.00047280610306188464, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5180, + "u": -3.673081874847412, + "weight": 0.03182986378669739 + }, + { + "diff_generated": -26.299346923828125, + "epoch": 1.6817887232663642, + "grad_norm": 2.6802640950141465, + "learning_rate": 3.852229465408597e-07, + "logits/chosen": -2.344144344329834, + "logits/rejected": -2.3785035610198975, + "logps/chosen": -17.98956871032715, + "logps/rejected": -492.83538818359375, + "logps_avg/chosen": -0.10476674139499664, + "logps_avg/rejected": -2.629934787750244, + "loss": 0.1018, + "losses_ref": -0.0006202187505550683, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5190, + "u": -3.5049166679382324, + "weight": 0.07590137422084808 + }, + { + "diff_generated": -28.270172119140625, + "epoch": 1.6850291639662993, + "grad_norm": 2.588015465650208, + "learning_rate": 3.8371587888581067e-07, + "logits/chosen": -2.3521931171417236, + "logits/rejected": -2.301591157913208, + "logps/chosen": -18.41990852355957, + "logps/rejected": -510.8887634277344, + "logps_avg/chosen": -0.09935440123081207, + "logps_avg/rejected": -2.827017307281494, + "loss": 0.0989, + "losses_ref": -0.00043917092261835933, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5200, + "u": -3.6940014362335205, + "weight": 0.02555156871676445 + }, + { + "diff_generated": -28.505340576171875, + "epoch": 1.6882696046662347, + "grad_norm": 2.80389923443133, + "learning_rate": 3.822090427382442e-07, + "logits/chosen": -2.382777690887451, + "logits/rejected": -2.2807302474975586, + "logps/chosen": -17.81949806213379, + "logps/rejected": -509.1504821777344, + "logps_avg/chosen": -0.09310451149940491, + "logps_avg/rejected": -2.850533962249756, + "loss": 0.0999, + "losses_ref": -0.0003349473117850721, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5210, + "u": -3.5480377674102783, + "weight": 0.06295160949230194 + }, + { + "diff_generated": -28.742563247680664, + "epoch": 1.6915100453661698, + "grad_norm": 2.769761655194945, + "learning_rate": 3.807024595204916e-07, + "logits/chosen": -2.367471933364868, + "logits/rejected": -2.283769130706787, + "logps/chosen": -17.1124267578125, + "logps/rejected": -494.98858642578125, + "logps_avg/chosen": -0.09259551763534546, + "logps_avg/rejected": -2.874256134033203, + "loss": 0.1013, + "losses_ref": -7.784694025758654e-05, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5220, + "u": -3.6688854694366455, + "weight": 0.031324684619903564 + }, + { + "diff_generated": -28.937397003173828, + "epoch": 1.6947504860661051, + "grad_norm": 2.512445899650775, + "learning_rate": 3.7919615065128905e-07, + "logits/chosen": -2.437547206878662, + "logits/rejected": -2.3424153327941895, + "logps/chosen": -18.564697265625, + "logps/rejected": -510.82269287109375, + "logps_avg/chosen": -0.1011086255311966, + "logps_avg/rejected": -2.8937392234802246, + "loss": 0.104, + "losses_ref": -0.0008404834079556167, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5230, + "u": -3.6209702491760254, + "weight": 0.045037899166345596 + }, + { + "diff_generated": -26.764385223388672, + "epoch": 1.6979909267660402, + "grad_norm": 2.4946620977284, + "learning_rate": 3.7769013754547155e-07, + "logits/chosen": -2.3728580474853516, + "logits/rejected": -2.299055814743042, + "logps/chosen": -18.191150665283203, + "logps/rejected": -498.5304260253906, + "logps_avg/chosen": -0.09499356895685196, + "logps_avg/rejected": -2.676438808441162, + "loss": 0.0986, + "losses_ref": -0.0007090292638167739, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5240, + "u": -3.5050902366638184, + "weight": 0.07596190273761749 + }, + { + "diff_generated": -26.970651626586914, + "epoch": 1.7012313674659754, + "grad_norm": 2.4838710224921887, + "learning_rate": 3.761844416136701e-07, + "logits/chosen": -2.3771610260009766, + "logits/rejected": -2.3464341163635254, + "logps/chosen": -17.874975204467773, + "logps/rejected": -479.57244873046875, + "logps_avg/chosen": -0.10110364854335785, + "logps_avg/rejected": -2.6970648765563965, + "loss": 0.0975, + "losses_ref": -0.0009680521907284856, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5250, + "u": -3.6237990856170654, + "weight": 0.045234501361846924 + }, + { + "diff_generated": -25.719568252563477, + "epoch": 1.7044718081659105, + "grad_norm": 2.4165842463216545, + "learning_rate": 3.746790842620059e-07, + "logits/chosen": -2.356290817260742, + "logits/rejected": -2.313058614730835, + "logps/chosen": -16.402015686035156, + "logps/rejected": -457.26116943359375, + "logps_avg/chosen": -0.09279557317495346, + "logps_avg/rejected": -2.5719568729400635, + "loss": 0.1017, + "losses_ref": -0.0006630142452195287, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5260, + "u": -3.576869249343872, + "weight": 0.05718719959259033 + }, + { + "diff_generated": -26.614282608032227, + "epoch": 1.7077122488658456, + "grad_norm": 2.5094338378018812, + "learning_rate": 3.731740868917872e-07, + "logits/chosen": -2.309357166290283, + "logits/rejected": -2.2900514602661133, + "logps/chosen": -18.272537231445312, + "logps/rejected": -487.6533203125, + "logps_avg/chosen": -0.10436661541461945, + "logps_avg/rejected": -2.6614279747009277, + "loss": 0.1026, + "losses_ref": -0.0007275083917193115, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5270, + "u": -3.551506519317627, + "weight": 0.06346292048692703 + }, + { + "diff_generated": -28.740280151367188, + "epoch": 1.710952689565781, + "grad_norm": 2.7448608243155945, + "learning_rate": 3.716694708992039e-07, + "logits/chosen": -2.3863444328308105, + "logits/rejected": -2.3174705505371094, + "logps/chosen": -18.374553680419922, + "logps/rejected": -497.02398681640625, + "logps_avg/chosen": -0.0997755229473114, + "logps_avg/rejected": -2.874027967453003, + "loss": 0.1001, + "losses_ref": -0.0012814865913242102, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5280, + "u": -3.6460769176483154, + "weight": 0.040156055241823196 + }, + { + "diff_generated": -28.114795684814453, + "epoch": 1.7141931302657163, + "grad_norm": 2.7845599728937867, + "learning_rate": 3.701652576750242e-07, + "logits/chosen": -2.3712282180786133, + "logits/rejected": -2.336479663848877, + "logps/chosen": -16.633949279785156, + "logps/rejected": -495.52093505859375, + "logps_avg/chosen": -0.10134591907262802, + "logps_avg/rejected": -2.8114798069000244, + "loss": 0.1021, + "losses_ref": -0.00033907522447407246, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5290, + "u": -3.6014404296875, + "weight": 0.05043425410985947 + }, + { + "diff_generated": -27.336872100830078, + "epoch": 1.7174335709656514, + "grad_norm": 2.6502131059024645, + "learning_rate": 3.686614686042906e-07, + "logits/chosen": -2.3674819469451904, + "logits/rejected": -2.3362417221069336, + "logps/chosen": -16.538209915161133, + "logps/rejected": -468.26104736328125, + "logps_avg/chosen": -0.0999147891998291, + "logps_avg/rejected": -2.733687400817871, + "loss": 0.1, + "losses_ref": -0.00046807853505015373, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5300, + "u": -3.6691298484802246, + "weight": 0.031879961490631104 + }, + { + "diff_generated": -28.801959991455078, + "epoch": 1.7206740116655865, + "grad_norm": 2.6100803813734093, + "learning_rate": 3.6715812506601493e-07, + "logits/chosen": -2.367793560028076, + "logits/rejected": -2.2648661136627197, + "logps/chosen": -18.021644592285156, + "logps/rejected": -510.4642028808594, + "logps_avg/chosen": -0.09823264181613922, + "logps_avg/rejected": -2.8801960945129395, + "loss": 0.0977, + "losses_ref": -0.0003415598184801638, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5310, + "u": -3.646897554397583, + "weight": 0.03793282434344292 + }, + { + "diff_generated": -29.1380615234375, + "epoch": 1.7239144523655217, + "grad_norm": 2.7772687087152974, + "learning_rate": 3.6565524843287526e-07, + "logits/chosen": -2.382816791534424, + "logits/rejected": -2.3172459602355957, + "logps/chosen": -15.904184341430664, + "logps/rejected": -503.6893615722656, + "logps_avg/chosen": -0.09637071192264557, + "logps_avg/rejected": -2.913806438446045, + "loss": 0.0989, + "losses_ref": -0.00024001784913707525, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5320, + "u": -3.5073089599609375, + "weight": 0.07528218626976013 + }, + { + "diff_generated": -26.45065689086914, + "epoch": 1.7271548930654568, + "grad_norm": 2.7264672239761234, + "learning_rate": 3.641528600709115e-07, + "logits/chosen": -2.3781332969665527, + "logits/rejected": -2.3246679306030273, + "logps/chosen": -18.484371185302734, + "logps/rejected": -476.2315979003906, + "logps_avg/chosen": -0.10251567512750626, + "logps_avg/rejected": -2.6450653076171875, + "loss": 0.0996, + "losses_ref": -0.0004218421527184546, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5330, + "u": -3.5488994121551514, + "weight": 0.06301628053188324 + }, + { + "diff_generated": -26.43978500366211, + "epoch": 1.7303953337653921, + "grad_norm": 3.075617478340941, + "learning_rate": 3.6265098133922277e-07, + "logits/chosen": -2.417259931564331, + "logits/rejected": -2.3201329708099365, + "logps/chosen": -15.935078620910645, + "logps/rejected": -465.5621032714844, + "logps_avg/chosen": -0.0904788076877594, + "logps_avg/rejected": -2.6439785957336426, + "loss": 0.0966, + "losses_ref": -0.0006274757906794548, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5340, + "u": -3.411005735397339, + "weight": 0.10097329318523407 + }, + { + "diff_generated": -27.395992279052734, + "epoch": 1.7336357744653272, + "grad_norm": 2.5422219006330207, + "learning_rate": 3.611496335896617e-07, + "logits/chosen": -2.378610610961914, + "logits/rejected": -2.3410072326660156, + "logps/chosen": -16.637414932250977, + "logps/rejected": -533.4635009765625, + "logps_avg/chosen": -0.095014788210392, + "logps_avg/rejected": -2.7395992279052734, + "loss": 0.0987, + "losses_ref": -0.0004987435531802475, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5350, + "u": -3.6030936241149902, + "weight": 0.05063430219888687 + }, + { + "diff_generated": -30.47920799255371, + "epoch": 1.7368762151652626, + "grad_norm": 2.678820876041137, + "learning_rate": 3.59648838166533e-07, + "logits/chosen": -2.388047456741333, + "logits/rejected": -2.3407864570617676, + "logps/chosen": -18.07908821105957, + "logps/rejected": -551.9603271484375, + "logps_avg/chosen": -0.10280290991067886, + "logps_avg/rejected": -3.0479209423065186, + "loss": 0.1011, + "losses_ref": -0.00017604381719138473, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5360, + "u": -3.6247005462646484, + "weight": 0.0439545139670372 + }, + { + "diff_generated": -28.07529640197754, + "epoch": 1.7401166558651977, + "grad_norm": 2.6355285338892913, + "learning_rate": 3.5814861640628864e-07, + "logits/chosen": -2.3373141288757324, + "logits/rejected": -2.271507501602173, + "logps/chosen": -19.13628578186035, + "logps/rejected": -518.8572998046875, + "logps_avg/chosen": -0.1072794646024704, + "logps_avg/rejected": -2.807529926300049, + "loss": 0.1, + "losses_ref": -0.0006126166554167867, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5370, + "u": -3.452075242996216, + "weight": 0.08837278187274933 + }, + { + "diff_generated": -27.70608901977539, + "epoch": 1.7433570965651328, + "grad_norm": 2.5643449664389344, + "learning_rate": 3.5664898963722526e-07, + "logits/chosen": -2.325183629989624, + "logits/rejected": -2.30161190032959, + "logps/chosen": -18.24054718017578, + "logps/rejected": -512.8580322265625, + "logps_avg/chosen": -0.10216137021780014, + "logps_avg/rejected": -2.770608901977539, + "loss": 0.0994, + "losses_ref": -0.0006244811811484396, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5380, + "u": -3.600287675857544, + "weight": 0.05084812641143799 + }, + { + "diff_generated": -26.136837005615234, + "epoch": 1.746597537265068, + "grad_norm": 2.768633325019539, + "learning_rate": 3.5514997917918016e-07, + "logits/chosen": -2.353252649307251, + "logits/rejected": -2.330204486846924, + "logps/chosen": -15.285616874694824, + "logps/rejected": -486.61785888671875, + "logps_avg/chosen": -0.09202093631029129, + "logps_avg/rejected": -2.6136839389801025, + "loss": 0.0982, + "losses_ref": -0.0004441851342562586, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5390, + "u": -3.549830675125122, + "weight": 0.06307470798492432 + }, + { + "diff_generated": -27.409582138061523, + "epoch": 1.7498379779650033, + "grad_norm": 2.514774319959278, + "learning_rate": 3.536516063432293e-07, + "logits/chosen": -2.353785991668701, + "logits/rejected": -2.3390090465545654, + "logps/chosen": -16.88019561767578, + "logps/rejected": -487.05462646484375, + "logps_avg/chosen": -0.09791740030050278, + "logps_avg/rejected": -2.7409584522247314, + "loss": 0.0978, + "losses_ref": -0.0005682742339558899, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5400, + "u": -3.6471595764160156, + "weight": 0.038246989250183105 + }, + { + "diff_generated": -25.251800537109375, + "epoch": 1.7530784186649384, + "grad_norm": 2.4130679264226096, + "learning_rate": 3.5215389243138326e-07, + "logits/chosen": -2.342167854309082, + "logits/rejected": -2.2756638526916504, + "logps/chosen": -20.983112335205078, + "logps/rejected": -471.53961181640625, + "logps_avg/chosen": -0.11012951284646988, + "logps_avg/rejected": -2.525179862976074, + "loss": 0.0972, + "losses_ref": -0.0012510241940617561, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5410, + "u": -3.6464996337890625, + "weight": 0.039689868688583374 + }, + { + "diff_generated": -28.358959197998047, + "epoch": 1.7563188593648738, + "grad_norm": 2.6050020331663473, + "learning_rate": 3.50656858736285e-07, + "logits/chosen": -2.373044490814209, + "logits/rejected": -2.2874221801757812, + "logps/chosen": -18.268918991088867, + "logps/rejected": -506.542724609375, + "logps_avg/chosen": -0.09958215057849884, + "logps_avg/rejected": -2.8358960151672363, + "loss": 0.1001, + "losses_ref": -0.0005597332492470741, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5420, + "u": -3.6918044090270996, + "weight": 0.025833910331130028 + }, + { + "diff_generated": -25.419437408447266, + "epoch": 1.7595593000648089, + "grad_norm": 2.5888083689908723, + "learning_rate": 3.491605265409073e-07, + "logits/chosen": -2.3774361610412598, + "logits/rejected": -2.284318447113037, + "logps/chosen": -20.87076187133789, + "logps/rejected": -468.8209533691406, + "logps_avg/chosen": -0.10591413825750351, + "logps_avg/rejected": -2.5419440269470215, + "loss": 0.098, + "losses_ref": -0.00048125721514225006, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5430, + "u": -3.5768661499023438, + "weight": 0.056924331933259964 + }, + { + "diff_generated": -26.61014175415039, + "epoch": 1.762799740764744, + "grad_norm": 2.7238365394521344, + "learning_rate": 3.4766491711824916e-07, + "logits/chosen": -2.3245720863342285, + "logits/rejected": -2.2953591346740723, + "logps/chosen": -17.809200286865234, + "logps/rejected": -473.62841796875, + "logps_avg/chosen": -0.10384336858987808, + "logps_avg/rejected": -2.6610138416290283, + "loss": 0.1013, + "losses_ref": -0.0007365869241766632, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5440, + "u": -3.5971107482910156, + "weight": 0.051128558814525604 + }, + { + "diff_generated": -27.968036651611328, + "epoch": 1.7660401814646791, + "grad_norm": 2.6465858443300543, + "learning_rate": 3.4617005173103497e-07, + "logits/chosen": -2.404662847518921, + "logits/rejected": -2.319225311279297, + "logps/chosen": -17.643014907836914, + "logps/rejected": -494.31884765625, + "logps_avg/chosen": -0.0980905294418335, + "logps_avg/rejected": -2.7968032360076904, + "loss": 0.0965, + "losses_ref": -0.000339856487698853, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5450, + "u": -3.6960651874542236, + "weight": 0.02541317604482174 + }, + { + "diff_generated": -27.50223731994629, + "epoch": 1.7692806221646142, + "grad_norm": 2.439340308045177, + "learning_rate": 3.4467595163141056e-07, + "logits/chosen": -2.356353282928467, + "logits/rejected": -2.3252711296081543, + "logps/chosen": -17.11294937133789, + "logps/rejected": -481.44329833984375, + "logps_avg/chosen": -0.09781317412853241, + "logps_avg/rejected": -2.7502236366271973, + "loss": 0.0985, + "losses_ref": -0.00023348219110630453, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5460, + "u": -3.5757458209991455, + "weight": 0.0565178282558918 + }, + { + "diff_generated": -28.132930755615234, + "epoch": 1.7725210628645496, + "grad_norm": 2.57485074500873, + "learning_rate": 3.4318263806064244e-07, + "logits/chosen": -2.3583288192749023, + "logits/rejected": -2.2734169960021973, + "logps/chosen": -18.482982635498047, + "logps/rejected": -491.0975646972656, + "logps_avg/chosen": -0.09854892641305923, + "logps_avg/rejected": -2.813292980194092, + "loss": 0.1018, + "losses_ref": -0.0007904424564912915, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5470, + "u": -3.6228435039520264, + "weight": 0.04524999111890793 + }, + { + "diff_generated": -28.949413299560547, + "epoch": 1.775761503564485, + "grad_norm": 2.59795459877875, + "learning_rate": 3.4169013224881475e-07, + "logits/chosen": -2.405365467071533, + "logits/rejected": -2.324328660964966, + "logps/chosen": -17.76644515991211, + "logps/rejected": -519.6321411132812, + "logps_avg/chosen": -0.09652705490589142, + "logps_avg/rejected": -2.8949413299560547, + "loss": 0.0988, + "losses_ref": -0.00027256523026153445, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5480, + "u": -3.5291507244110107, + "weight": 0.0690760463476181 + }, + { + "diff_generated": -27.986907958984375, + "epoch": 1.77900194426442, + "grad_norm": 2.6145142118362203, + "learning_rate": 3.4019845541452844e-07, + "logits/chosen": -2.3335089683532715, + "logits/rejected": -2.2344508171081543, + "logps/chosen": -16.983856201171875, + "logps/rejected": -477.7383728027344, + "logps_avg/chosen": -0.09876149892807007, + "logps_avg/rejected": -2.7986905574798584, + "loss": 0.0998, + "losses_ref": -0.00013874072465114295, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5490, + "u": -3.554018020629883, + "weight": 0.06264077126979828 + }, + { + "diff_generated": -28.678543090820312, + "epoch": 1.7822423849643552, + "grad_norm": 2.608081553570393, + "learning_rate": 3.387076287645985e-07, + "logits/chosen": -2.353555917739868, + "logits/rejected": -2.283008098602295, + "logps/chosen": -17.25589942932129, + "logps/rejected": -534.0850830078125, + "logps_avg/chosen": -0.09464980661869049, + "logps_avg/rejected": -2.867854595184326, + "loss": 0.0967, + "losses_ref": -0.00037761160638183355, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5500, + "u": -3.550675868988037, + "weight": 0.06296338140964508 + }, + { + "diff_generated": -27.80132484436035, + "epoch": 1.7854828256642903, + "grad_norm": 2.402528371744228, + "learning_rate": 3.372176734937536e-07, + "logits/chosen": -2.3043529987335205, + "logits/rejected": -2.2722156047821045, + "logps/chosen": -16.18600082397461, + "logps/rejected": -531.1876831054688, + "logps_avg/chosen": -0.09347482025623322, + "logps_avg/rejected": -2.78013277053833, + "loss": 0.0965, + "losses_ref": -0.001090071047656238, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5510, + "u": -3.5538277626037598, + "weight": 0.06445705145597458 + }, + { + "diff_generated": -28.665653228759766, + "epoch": 1.7887232663642254, + "grad_norm": 2.5830371919480215, + "learning_rate": 3.3572861078433376e-07, + "logits/chosen": -2.359614849090576, + "logits/rejected": -2.265977621078491, + "logps/chosen": -16.606525421142578, + "logps/rejected": -502.6050720214844, + "logps_avg/chosen": -0.09027236700057983, + "logps_avg/rejected": -2.866565227508545, + "loss": 0.098, + "losses_ref": -0.00043819882557727396, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5520, + "u": -3.527933120727539, + "weight": 0.06936118751764297 + }, + { + "diff_generated": -26.51803970336914, + "epoch": 1.7919637070641607, + "grad_norm": 2.5432017679630983, + "learning_rate": 3.3424046180599e-07, + "logits/chosen": -2.3557043075561523, + "logits/rejected": -2.2849130630493164, + "logps/chosen": -16.62330436706543, + "logps/rejected": -487.97198486328125, + "logps_avg/chosen": -0.09761399030685425, + "logps_avg/rejected": -2.651803493499756, + "loss": 0.0957, + "losses_ref": -0.0005190398078411818, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5530, + "u": -3.477447986602783, + "weight": 0.08194705098867416 + }, + { + "diff_generated": -28.85833168029785, + "epoch": 1.7952041477640959, + "grad_norm": 2.684314115645967, + "learning_rate": 3.3275324771538273e-07, + "logits/chosen": -2.320209264755249, + "logits/rejected": -2.2477760314941406, + "logps/chosen": -17.556983947753906, + "logps/rejected": -518.7302856445312, + "logps_avg/chosen": -0.09914490580558777, + "logps_avg/rejected": -2.885833263397217, + "loss": 0.0976, + "losses_ref": -0.00039971404476091266, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5540, + "u": -3.5764288902282715, + "weight": 0.056789614260196686 + }, + { + "diff_generated": -28.340740203857422, + "epoch": 1.7984445884640312, + "grad_norm": 2.5980842410538054, + "learning_rate": 3.312669896558816e-07, + "logits/chosen": -2.3921077251434326, + "logits/rejected": -2.2863588333129883, + "logps/chosen": -16.911273956298828, + "logps/rejected": -481.89947509765625, + "logps_avg/chosen": -0.09584518522024155, + "logps_avg/rejected": -2.834073781967163, + "loss": 0.0971, + "losses_ref": -0.0004901793436147273, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5550, + "u": -3.549412965774536, + "weight": 0.06320427358150482 + }, + { + "diff_generated": -26.314910888671875, + "epoch": 1.8016850291639663, + "grad_norm": 2.4257047801717784, + "learning_rate": 3.2978170875726454e-07, + "logits/chosen": -2.371443271636963, + "logits/rejected": -2.3299648761749268, + "logps/chosen": -15.302289009094238, + "logps/rejected": -467.28704833984375, + "logps_avg/chosen": -0.08800629526376724, + "logps_avg/rejected": -2.631491184234619, + "loss": 0.0957, + "losses_ref": -0.000557105871848762, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5560, + "u": -3.551307201385498, + "weight": 0.06321394443511963 + }, + { + "diff_generated": -26.27902603149414, + "epoch": 1.8049254698639015, + "grad_norm": 2.6868694467477146, + "learning_rate": 3.2829742613541704e-07, + "logits/chosen": -2.3159596920013428, + "logits/rejected": -2.309455633163452, + "logps/chosen": -17.738338470458984, + "logps/rejected": -523.1011352539062, + "logps_avg/chosen": -0.10448671877384186, + "logps_avg/rejected": -2.6279029846191406, + "loss": 0.0988, + "losses_ref": -0.0001589446037542075, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5570, + "u": -3.5064120292663574, + "weight": 0.07517381012439728 + }, + { + "diff_generated": -29.1507568359375, + "epoch": 1.8081659105638366, + "grad_norm": 2.6650715531360296, + "learning_rate": 3.26814162892033e-07, + "logits/chosen": -2.3827013969421387, + "logits/rejected": -2.309659719467163, + "logps/chosen": -17.93830108642578, + "logps/rejected": -545.5929565429688, + "logps_avg/chosen": -0.1010148674249649, + "logps_avg/rejected": -2.9150757789611816, + "loss": 0.0962, + "losses_ref": -0.0002275872539030388, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5580, + "u": -3.5754737854003906, + "weight": 0.056514762341976166 + }, + { + "diff_generated": -26.319509506225586, + "epoch": 1.811406351263772, + "grad_norm": 2.507425280586255, + "learning_rate": 3.2533194011431346e-07, + "logits/chosen": -2.357861280441284, + "logits/rejected": -2.3017077445983887, + "logps/chosen": -16.048376083374023, + "logps/rejected": -488.1123962402344, + "logps_avg/chosen": -0.09536460041999817, + "logps_avg/rejected": -2.631950855255127, + "loss": 0.096, + "losses_ref": -0.00048077874816954136, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5590, + "u": -3.5518956184387207, + "weight": 0.06308998167514801 + }, + { + "diff_generated": -29.063892364501953, + "epoch": 1.814646791963707, + "grad_norm": 2.504515850196088, + "learning_rate": 3.2385077887466766e-07, + "logits/chosen": -2.395941734313965, + "logits/rejected": -2.3485054969787598, + "logps/chosen": -17.349185943603516, + "logps/rejected": -541.281494140625, + "logps_avg/chosen": -0.10105355829000473, + "logps_avg/rejected": -2.9063892364501953, + "loss": 0.0962, + "losses_ref": -0.0004301825538277626, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5600, + "u": -3.644620418548584, + "weight": 0.03806838393211365 + }, + { + "diff_generated": -27.6377010345459, + "epoch": 1.8178872326636424, + "grad_norm": 2.4776668300868097, + "learning_rate": 3.223707002304131e-07, + "logits/chosen": -2.3200387954711914, + "logits/rejected": -2.286799192428589, + "logps/chosen": -18.38614273071289, + "logps/rejected": -511.392822265625, + "logps_avg/chosen": -0.10474991798400879, + "logps_avg/rejected": -2.7637698650360107, + "loss": 0.0991, + "losses_ref": -0.00030589301604777575, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5610, + "u": -3.505202531814575, + "weight": 0.07536722719669342 + }, + { + "diff_generated": -27.71368408203125, + "epoch": 1.8211276733635775, + "grad_norm": 2.588751701695597, + "learning_rate": 3.208917252234765e-07, + "logits/chosen": -2.3411502838134766, + "logits/rejected": -2.304018259048462, + "logps/chosen": -15.185452461242676, + "logps/rejected": -518.5407104492188, + "logps_avg/chosen": -0.09042085707187653, + "logps_avg/rejected": -2.7713687419891357, + "loss": 0.0965, + "losses_ref": -0.00028060507611371577, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5620, + "u": -3.595822811126709, + "weight": 0.050334203988313675 + }, + { + "diff_generated": -32.019447326660156, + "epoch": 1.8243681140635126, + "grad_norm": 2.5938523274013296, + "learning_rate": 3.1941387488009396e-07, + "logits/chosen": -2.346749782562256, + "logits/rejected": -2.261305332183838, + "logps/chosen": -17.656230926513672, + "logps/rejected": -577.9088134765625, + "logps_avg/chosen": -0.10112802684307098, + "logps_avg/rejected": -3.201944351196289, + "loss": 0.0979, + "losses_ref": -0.0008206478087231517, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5630, + "u": -3.5457470417022705, + "weight": 0.06364034861326218 + }, + { + "diff_generated": -28.698589324951172, + "epoch": 1.8276085547634477, + "grad_norm": 2.4916285253469663, + "learning_rate": 3.179371702105132e-07, + "logits/chosen": -2.4086880683898926, + "logits/rejected": -2.3404719829559326, + "logps/chosen": -19.253461837768555, + "logps/rejected": -524.8831176757812, + "logps_avg/chosen": -0.10744975507259369, + "logps_avg/rejected": -2.869859218597412, + "loss": 0.0989, + "losses_ref": -0.00035506210406310856, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5640, + "u": -3.6465015411376953, + "weight": 0.037953395396471024 + }, + { + "diff_generated": -29.409387588500977, + "epoch": 1.8308489954633829, + "grad_norm": 2.509263833905603, + "learning_rate": 3.164616322086936e-07, + "logits/chosen": -2.3691985607147217, + "logits/rejected": -2.285266876220703, + "logps/chosen": -16.659765243530273, + "logps/rejected": -507.52618408203125, + "logps_avg/chosen": -0.09539072215557098, + "logps_avg/rejected": -2.940938949584961, + "loss": 0.0985, + "losses_ref": -0.0007100877119228244, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5650, + "u": -3.6012961864471436, + "weight": 0.051039986312389374 + }, + { + "diff_generated": -27.477447509765625, + "epoch": 1.8340894361633182, + "grad_norm": 2.571417507210119, + "learning_rate": 3.1498728185200845e-07, + "logits/chosen": -2.403351068496704, + "logits/rejected": -2.3440890312194824, + "logps/chosen": -19.3876953125, + "logps/rejected": -519.7465209960938, + "logps_avg/chosen": -0.10591503232717514, + "logps_avg/rejected": -2.7477450370788574, + "loss": 0.0995, + "losses_ref": -0.0003261718084104359, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5660, + "u": -3.6660637855529785, + "weight": 0.03169737011194229 + }, + { + "diff_generated": -25.8249454498291, + "epoch": 1.8373298768632536, + "grad_norm": 2.5069798884516152, + "learning_rate": 3.1351414010094683e-07, + "logits/chosen": -2.328765392303467, + "logits/rejected": -2.3035292625427246, + "logps/chosen": -18.959909439086914, + "logps/rejected": -489.73211669921875, + "logps_avg/chosen": -0.1038951650261879, + "logps_avg/rejected": -2.5824942588806152, + "loss": 0.099, + "losses_ref": -0.000567997747566551, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5670, + "u": -3.6191246509552, + "weight": 0.044458914548158646 + }, + { + "diff_generated": -29.907428741455078, + "epoch": 1.8405703175631887, + "grad_norm": 2.501875729145946, + "learning_rate": 3.120422278988149e-07, + "logits/chosen": -2.348712205886841, + "logits/rejected": -2.2924861907958984, + "logps/chosen": -17.87131118774414, + "logps/rejected": -526.6512451171875, + "logps_avg/chosen": -0.10166116058826447, + "logps_avg/rejected": -2.9907429218292236, + "loss": 0.0987, + "losses_ref": -0.00026783597422763705, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5680, + "u": -3.624166488647461, + "weight": 0.04405301436781883 + }, + { + "diff_generated": -27.886672973632812, + "epoch": 1.8438107582631238, + "grad_norm": 2.566034051408787, + "learning_rate": 3.10571566171439e-07, + "logits/chosen": -2.371107578277588, + "logits/rejected": -2.3411777019500732, + "logps/chosen": -18.45060157775879, + "logps/rejected": -516.3961181640625, + "logps_avg/chosen": -0.10506061464548111, + "logps_avg/rejected": -2.7886672019958496, + "loss": 0.1, + "losses_ref": -0.00029390607960522175, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5690, + "u": -3.6481080055236816, + "weight": 0.03786667063832283 + }, + { + "diff_generated": -27.870288848876953, + "epoch": 1.847051198963059, + "grad_norm": 2.809915486187473, + "learning_rate": 3.0910217582686756e-07, + "logits/chosen": -2.3481717109680176, + "logits/rejected": -2.354294776916504, + "logps/chosen": -16.52988052368164, + "logps/rejected": -515.648193359375, + "logps_avg/chosen": -0.09819358587265015, + "logps_avg/rejected": -2.7870285511016846, + "loss": 0.1008, + "losses_ref": -0.0013325114268809557, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5700, + "u": -3.5237045288085938, + "weight": 0.07092602550983429 + }, + { + "diff_generated": -25.086254119873047, + "epoch": 1.850291639662994, + "grad_norm": 2.4862422113018505, + "learning_rate": 3.0763407775507426e-07, + "logits/chosen": -2.4297471046447754, + "logits/rejected": -2.395003080368042, + "logps/chosen": -17.715723037719727, + "logps/rejected": -481.25506591796875, + "logps_avg/chosen": -0.09285169094800949, + "logps_avg/rejected": -2.5086257457733154, + "loss": 0.0971, + "losses_ref": -0.00029824889497831464, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5710, + "u": -3.4351601600646973, + "weight": 0.09410148113965988 + }, + { + "diff_generated": -25.64459800720215, + "epoch": 1.8535320803629294, + "grad_norm": 2.6048627692959703, + "learning_rate": 3.0616729282766037e-07, + "logits/chosen": -2.341219425201416, + "logits/rejected": -2.346372365951538, + "logps/chosen": -16.424030303955078, + "logps/rejected": -451.0950622558594, + "logps_avg/chosen": -0.09924010932445526, + "logps_avg/rejected": -2.564460039138794, + "loss": 0.1003, + "losses_ref": -0.0005212887190282345, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5720, + "u": -3.530282497406006, + "weight": 0.06945204734802246 + }, + { + "diff_generated": -27.70074462890625, + "epoch": 1.8567725210628645, + "grad_norm": 2.6690629016870737, + "learning_rate": 3.047018418975593e-07, + "logits/chosen": -2.3930373191833496, + "logits/rejected": -2.262077808380127, + "logps/chosen": -19.89908218383789, + "logps/rejected": -474.302734375, + "logps_avg/chosen": -0.10280998051166534, + "logps_avg/rejected": -2.7700743675231934, + "loss": 0.1009, + "losses_ref": -0.0002895224606618285, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5730, + "u": -3.597874402999878, + "weight": 0.050351161509752274 + }, + { + "diff_generated": -27.63687515258789, + "epoch": 1.8600129617627998, + "grad_norm": 2.6096737058446564, + "learning_rate": 3.032377457987385e-07, + "logits/chosen": -2.367377758026123, + "logits/rejected": -2.329639196395874, + "logps/chosen": -17.626415252685547, + "logps/rejected": -510.41253662109375, + "logps_avg/chosen": -0.10201933234930038, + "logps_avg/rejected": -2.7636873722076416, + "loss": 0.0959, + "losses_ref": -0.0012955269776284695, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5740, + "u": -3.5941002368927, + "weight": 0.05208515003323555 + }, + { + "diff_generated": -26.940898895263672, + "epoch": 1.863253402462735, + "grad_norm": 2.788615366488837, + "learning_rate": 3.017750253459048e-07, + "logits/chosen": -2.3940701484680176, + "logits/rejected": -2.3186657428741455, + "logps/chosen": -19.841453552246094, + "logps/rejected": -506.88275146484375, + "logps_avg/chosen": -0.10610809177160263, + "logps_avg/rejected": -2.6940901279449463, + "loss": 0.1009, + "losses_ref": -0.0002789639984257519, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5750, + "u": -3.6475138664245605, + "weight": 0.03784211724996567 + }, + { + "diff_generated": -27.238052368164062, + "epoch": 1.86649384316267, + "grad_norm": 2.503504965592765, + "learning_rate": 3.003137013342071e-07, + "logits/chosen": -2.430534601211548, + "logits/rejected": -2.417689323425293, + "logps/chosen": -15.796445846557617, + "logps/rejected": -508.17730712890625, + "logps_avg/chosen": -0.09056727588176727, + "logps_avg/rejected": -2.7238051891326904, + "loss": 0.0965, + "losses_ref": -0.0003995650331489742, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5760, + "u": -3.62666392326355, + "weight": 0.044256873428821564 + }, + { + "diff_generated": -28.615772247314453, + "epoch": 1.8697342838626052, + "grad_norm": 2.614017479610386, + "learning_rate": 2.9885379453894224e-07, + "logits/chosen": -2.4150617122650146, + "logits/rejected": -2.37510347366333, + "logps/chosen": -16.090747833251953, + "logps/rejected": -553.6571044921875, + "logps_avg/chosen": -0.09207513183355331, + "logps_avg/rejected": -2.861577272415161, + "loss": 0.0974, + "losses_ref": -0.00022796406119596213, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5770, + "u": -3.597252607345581, + "weight": 0.05026254802942276 + }, + { + "diff_generated": -27.86861228942871, + "epoch": 1.8729747245625405, + "grad_norm": 2.6691451138289835, + "learning_rate": 2.9739532571525806e-07, + "logits/chosen": -2.4372811317443848, + "logits/rejected": -2.4184963703155518, + "logps/chosen": -16.568439483642578, + "logps/rejected": -506.2725524902344, + "logps_avg/chosen": -0.09399370849132538, + "logps_avg/rejected": -2.7868611812591553, + "loss": 0.1029, + "losses_ref": -0.0002700109616853297, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5780, + "u": -3.6414883136749268, + "weight": 0.03782133013010025 + }, + { + "diff_generated": -29.793752670288086, + "epoch": 1.8762151652624757, + "grad_norm": 2.436557894850836, + "learning_rate": 2.959383155978596e-07, + "logits/chosen": -2.3705596923828125, + "logits/rejected": -2.3041160106658936, + "logps/chosen": -16.600704193115234, + "logps/rejected": -536.4827880859375, + "logps_avg/chosen": -0.09566892683506012, + "logps_avg/rejected": -2.9793753623962402, + "loss": 0.0985, + "losses_ref": -0.00015063644968904555, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5790, + "u": -3.5993595123291016, + "weight": 0.0501655749976635 + }, + { + "diff_generated": -26.078826904296875, + "epoch": 1.879455605962411, + "grad_norm": 2.557904930799931, + "learning_rate": 2.9448278490071373e-07, + "logits/chosen": -2.3876595497131348, + "logits/rejected": -2.3382132053375244, + "logps/chosen": -16.884082794189453, + "logps/rejected": -469.434326171875, + "logps_avg/chosen": -0.09592945873737335, + "logps_avg/rejected": -2.6078827381134033, + "loss": 0.1009, + "losses_ref": -0.0003953514969907701, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5800, + "u": -3.5042953491210938, + "weight": 0.0754755288362503 + }, + { + "diff_generated": -27.1308650970459, + "epoch": 1.8826960466623461, + "grad_norm": 2.410795608150016, + "learning_rate": 2.930287543167544e-07, + "logits/chosen": -2.4403939247131348, + "logits/rejected": -2.338735342025757, + "logps/chosen": -18.41120719909668, + "logps/rejected": -498.0870056152344, + "logps_avg/chosen": -0.0972508043050766, + "logps_avg/rejected": -2.7130866050720215, + "loss": 0.0976, + "losses_ref": -0.0004312940873205662, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5810, + "u": -3.7621655464172363, + "weight": 0.006767953280359507 + }, + { + "diff_generated": -30.3148193359375, + "epoch": 1.8859364873622813, + "grad_norm": 2.53770678947922, + "learning_rate": 2.9157624451758944e-07, + "logits/chosen": -2.3555285930633545, + "logits/rejected": -2.2266926765441895, + "logps/chosen": -18.102619171142578, + "logps/rejected": -496.95428466796875, + "logps_avg/chosen": -0.10375206172466278, + "logps_avg/rejected": -3.031481981277466, + "loss": 0.0996, + "losses_ref": -0.0006220188806764781, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5820, + "u": -3.550969362258911, + "weight": 0.06340692937374115 + }, + { + "diff_generated": -29.127822875976562, + "epoch": 1.8891769280622164, + "grad_norm": 2.5479669591981704, + "learning_rate": 2.901252761532055e-07, + "logits/chosen": -2.4188151359558105, + "logits/rejected": -2.309403896331787, + "logps/chosen": -17.828176498413086, + "logps/rejected": -494.87353515625, + "logps_avg/chosen": -0.0992671474814415, + "logps_avg/rejected": -2.9127821922302246, + "loss": 0.0965, + "losses_ref": -0.0007391165709123015, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5830, + "u": -3.646193742752075, + "weight": 0.038570336997509 + }, + { + "diff_generated": -27.222660064697266, + "epoch": 1.8924173687621515, + "grad_norm": 2.7100293576269094, + "learning_rate": 2.8867586985167523e-07, + "logits/chosen": -2.4463648796081543, + "logits/rejected": -2.3553476333618164, + "logps/chosen": -17.450244903564453, + "logps/rejected": -484.1527404785156, + "logps_avg/chosen": -0.0959417000412941, + "logps_avg/rejected": -2.72226619720459, + "loss": 0.0961, + "losses_ref": -0.00026212536613456905, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5840, + "u": -3.525402784347534, + "weight": 0.06906180083751678 + }, + { + "diff_generated": -29.353763580322266, + "epoch": 1.8956578094620868, + "grad_norm": 2.5836151874470787, + "learning_rate": 2.8722804621886364e-07, + "logits/chosen": -2.404148817062378, + "logits/rejected": -2.3284482955932617, + "logps/chosen": -18.062631607055664, + "logps/rejected": -501.10595703125, + "logps_avg/chosen": -0.09879143536090851, + "logps_avg/rejected": -2.9353766441345215, + "loss": 0.1012, + "losses_ref": -0.0004886888200417161, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5850, + "u": -3.6259491443634033, + "weight": 0.04446321353316307 + }, + { + "diff_generated": -28.762012481689453, + "epoch": 1.8988982501620222, + "grad_norm": 2.5550213842894003, + "learning_rate": 2.857818258381358e-07, + "logits/chosen": -2.3610546588897705, + "logits/rejected": -2.2954795360565186, + "logps/chosen": -18.180984497070312, + "logps/rejected": -513.5320434570312, + "logps_avg/chosen": -0.10295577347278595, + "logps_avg/rejected": -2.8762011528015137, + "loss": 0.101, + "losses_ref": -0.00014926650328561664, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5860, + "u": -3.6503257751464844, + "weight": 0.03767084330320358 + }, + { + "diff_generated": -29.47927474975586, + "epoch": 1.9021386908619573, + "grad_norm": 2.4124126862845583, + "learning_rate": 2.8433722927006314e-07, + "logits/chosen": -2.413966178894043, + "logits/rejected": -2.357156991958618, + "logps/chosen": -19.497705459594727, + "logps/rejected": -542.2984619140625, + "logps_avg/chosen": -0.11012458801269531, + "logps_avg/rejected": -2.947927474975586, + "loss": 0.1002, + "losses_ref": -0.0004483603988774121, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5870, + "u": -3.6454529762268066, + "weight": 0.03807947039604187 + }, + { + "diff_generated": -28.179845809936523, + "epoch": 1.9053791315618924, + "grad_norm": 2.6754838770049605, + "learning_rate": 2.82894277052132e-07, + "logits/chosen": -2.382157564163208, + "logits/rejected": -2.352442502975464, + "logps/chosen": -16.77843475341797, + "logps/rejected": -531.6185913085938, + "logps_avg/chosen": -0.09153401851654053, + "logps_avg/rejected": -2.8179848194122314, + "loss": 0.0968, + "losses_ref": -0.0003780314582400024, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5880, + "u": -3.5757038593292236, + "weight": 0.05669553950428963 + }, + { + "diff_generated": -28.038406372070312, + "epoch": 1.9086195722618275, + "grad_norm": 2.618243243916407, + "learning_rate": 2.814529896984514e-07, + "logits/chosen": -2.344517469406128, + "logits/rejected": -2.2761929035186768, + "logps/chosen": -18.029022216796875, + "logps/rejected": -504.73651123046875, + "logps_avg/chosen": -0.09274528920650482, + "logps_avg/rejected": -2.8038411140441895, + "loss": 0.0987, + "losses_ref": -0.000521031382959336, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5890, + "u": -3.5262343883514404, + "weight": 0.06953036785125732 + }, + { + "diff_generated": -26.303661346435547, + "epoch": 1.9118600129617627, + "grad_norm": 2.780793568277981, + "learning_rate": 2.8001338769946126e-07, + "logits/chosen": -2.3630564212799072, + "logits/rejected": -2.257866621017456, + "logps/chosen": -17.455060958862305, + "logps/rejected": -465.14990234375, + "logps_avg/chosen": -0.09545596688985825, + "logps_avg/rejected": -2.630366563796997, + "loss": 0.1005, + "losses_ref": -0.0006079275044612586, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5900, + "u": -3.531212329864502, + "weight": 0.06961339712142944 + }, + { + "diff_generated": -28.460521697998047, + "epoch": 1.915100453661698, + "grad_norm": 2.420976109120161, + "learning_rate": 2.7857549152164153e-07, + "logits/chosen": -2.4027175903320312, + "logits/rejected": -2.3249917030334473, + "logps/chosen": -14.792211532592773, + "logps/rejected": -504.2300720214844, + "logps_avg/chosen": -0.08837342262268066, + "logps_avg/rejected": -2.846052646636963, + "loss": 0.0971, + "losses_ref": -0.0005176137783564627, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5910, + "u": -3.5528805255889893, + "weight": 0.06331194937229156 + }, + { + "diff_generated": -28.62518310546875, + "epoch": 1.9183408943616331, + "grad_norm": 2.471182929487115, + "learning_rate": 2.7713932160722043e-07, + "logits/chosen": -2.344817876815796, + "logits/rejected": -2.3178930282592773, + "logps/chosen": -17.537582397460938, + "logps/rejected": -524.8299560546875, + "logps_avg/chosen": -0.10338269174098969, + "logps_avg/rejected": -2.862518787384033, + "loss": 0.0984, + "losses_ref": -0.0008483555866405368, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5920, + "u": -3.669346332550049, + "weight": 0.032698508352041245 + }, + { + "diff_generated": -30.585668563842773, + "epoch": 1.9215813350615685, + "grad_norm": 2.4739113244660285, + "learning_rate": 2.757048983738847e-07, + "logits/chosen": -2.399620771408081, + "logits/rejected": -2.3443222045898438, + "logps/chosen": -17.10749626159668, + "logps/rejected": -557.9705810546875, + "logps_avg/chosen": -0.09897866100072861, + "logps_avg/rejected": -3.0585668087005615, + "loss": 0.0957, + "losses_ref": -0.0004411758854985237, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5930, + "u": -3.6455721855163574, + "weight": 0.03806128352880478 + }, + { + "diff_generated": -26.73529624938965, + "epoch": 1.9248217757615036, + "grad_norm": 2.570894429245623, + "learning_rate": 2.742722422144885e-07, + "logits/chosen": -2.394821882247925, + "logits/rejected": -2.3548684120178223, + "logps/chosen": -18.255279541015625, + "logps/rejected": -517.0158081054688, + "logps_avg/chosen": -0.10357420146465302, + "logps_avg/rejected": -2.673529624938965, + "loss": 0.0971, + "losses_ref": -0.0006127547239884734, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5940, + "u": -3.576965808868408, + "weight": 0.057065822184085846 + }, + { + "diff_generated": -28.054443359375, + "epoch": 1.9280622164614387, + "grad_norm": 2.5841664834314546, + "learning_rate": 2.7284137349676466e-07, + "logits/chosen": -2.325408697128296, + "logits/rejected": -2.256453275680542, + "logps/chosen": -16.044095993041992, + "logps/rejected": -539.1276245117188, + "logps_avg/chosen": -0.09099126607179642, + "logps_avg/rejected": -2.8054440021514893, + "loss": 0.0979, + "losses_ref": -0.0003426524926908314, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5950, + "u": -3.50434947013855, + "weight": 0.07543531805276871 + }, + { + "diff_generated": -28.748315811157227, + "epoch": 1.9313026571613738, + "grad_norm": 2.596492156896544, + "learning_rate": 2.7141231256303343e-07, + "logits/chosen": -2.355212688446045, + "logits/rejected": -2.3038063049316406, + "logps/chosen": -19.689380645751953, + "logps/rejected": -532.0252685546875, + "logps_avg/chosen": -0.11005760729312897, + "logps_avg/rejected": -2.8748316764831543, + "loss": 0.102, + "losses_ref": -0.0007306236075237393, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5960, + "u": -3.6479029655456543, + "weight": 0.03859782963991165 + }, + { + "diff_generated": -28.284626007080078, + "epoch": 1.9345430978613092, + "grad_norm": 2.5146143330713686, + "learning_rate": 2.69985079729915e-07, + "logits/chosen": -2.369210720062256, + "logits/rejected": -2.283693790435791, + "logps/chosen": -17.795032501220703, + "logps/rejected": -495.90472412109375, + "logps_avg/chosen": -0.09974905848503113, + "logps_avg/rejected": -2.828463077545166, + "loss": 0.0998, + "losses_ref": -0.0007459482294507325, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5970, + "u": -3.5771775245666504, + "weight": 0.05748031288385391 + }, + { + "diff_generated": -29.488983154296875, + "epoch": 1.9377835385612443, + "grad_norm": 2.8211061840508513, + "learning_rate": 2.6855969528803945e-07, + "logits/chosen": -2.379024028778076, + "logits/rejected": -2.3205015659332275, + "logps/chosen": -18.999738693237305, + "logps/rejected": -525.9266967773438, + "logps_avg/chosen": -0.09963471442461014, + "logps_avg/rejected": -2.9488985538482666, + "loss": 0.1008, + "losses_ref": -0.0004714926762972027, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5980, + "u": -3.623253583908081, + "weight": 0.04440300166606903 + }, + { + "diff_generated": -28.684600830078125, + "epoch": 1.9410239792611796, + "grad_norm": 2.4773623060647436, + "learning_rate": 2.6713617950175903e-07, + "logits/chosen": -2.339857578277588, + "logits/rejected": -2.2977325916290283, + "logps/chosen": -15.516563415527344, + "logps/rejected": -526.1066284179688, + "logps_avg/chosen": -0.0898626446723938, + "logps_avg/rejected": -2.868460178375244, + "loss": 0.097, + "losses_ref": -0.0008933226345106959, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 5990, + "u": -3.5968101024627686, + "weight": 0.051392458379268646 + }, + { + "diff_generated": -25.407421112060547, + "epoch": 1.9442644199611148, + "grad_norm": 2.562426557717335, + "learning_rate": 2.657145526088593e-07, + "logits/chosen": -2.334357500076294, + "logits/rejected": -2.325047016143799, + "logps/chosen": -17.33317756652832, + "logps/rejected": -470.50018310546875, + "logps_avg/chosen": -0.09964416176080704, + "logps_avg/rejected": -2.5407423973083496, + "loss": 0.0967, + "losses_ref": -0.00018075959815178066, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6000, + "u": -3.4317283630371094, + "weight": 0.0939575582742691 + }, + { + "diff_generated": -30.041431427001953, + "epoch": 1.9475048606610499, + "grad_norm": 2.4632218445410867, + "learning_rate": 2.6429483482027243e-07, + "logits/chosen": -2.3687427043914795, + "logits/rejected": -2.3227016925811768, + "logps/chosen": -17.55387306213379, + "logps/rejected": -546.512451171875, + "logps_avg/chosen": -0.10221491754055023, + "logps_avg/rejected": -3.004142999649048, + "loss": 0.0989, + "losses_ref": -0.00030045019229874015, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6010, + "u": -3.62287974357605, + "weight": 0.0441289022564888 + }, + { + "diff_generated": -29.048913955688477, + "epoch": 1.950745301360985, + "grad_norm": 2.551806434254762, + "learning_rate": 2.628770463197889e-07, + "logits/chosen": -2.429622173309326, + "logits/rejected": -2.3124306201934814, + "logps/chosen": -17.99391746520996, + "logps/rejected": -529.9110717773438, + "logps_avg/chosen": -0.09834562987089157, + "logps_avg/rejected": -2.904891014099121, + "loss": 0.0982, + "losses_ref": -0.0004878188483417034, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6020, + "u": -3.5515480041503906, + "weight": 0.0631481185555458 + }, + { + "diff_generated": -27.4799747467041, + "epoch": 1.9539857420609201, + "grad_norm": 2.3886479332440596, + "learning_rate": 2.6146120726377103e-07, + "logits/chosen": -2.291303873062134, + "logits/rejected": -2.2781028747558594, + "logps/chosen": -15.211416244506836, + "logps/rejected": -499.1487731933594, + "logps_avg/chosen": -0.09400470554828644, + "logps_avg/rejected": -2.747997522354126, + "loss": 0.0972, + "losses_ref": -0.00037772621726617217, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6030, + "u": -3.458329677581787, + "weight": 0.08798156678676605 + }, + { + "diff_generated": -30.40256118774414, + "epoch": 1.9572261827608555, + "grad_norm": 2.6555707656206167, + "learning_rate": 2.600473377808667e-07, + "logits/chosen": -2.3612496852874756, + "logits/rejected": -2.2357964515686035, + "logps/chosen": -17.25994110107422, + "logps/rejected": -531.9276123046875, + "logps_avg/chosen": -0.1040547713637352, + "logps_avg/rejected": -3.0402560234069824, + "loss": 0.0965, + "losses_ref": -0.0002499911352060735, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6040, + "u": -3.5274498462677, + "weight": 0.0690465196967125 + }, + { + "diff_generated": -31.004383087158203, + "epoch": 1.9604666234607908, + "grad_norm": 2.400620684294024, + "learning_rate": 2.5863545797172226e-07, + "logits/chosen": -2.3689889907836914, + "logits/rejected": -2.2837939262390137, + "logps/chosen": -18.03851890563965, + "logps/rejected": -553.7256469726562, + "logps_avg/chosen": -0.10278002917766571, + "logps_avg/rejected": -3.100438117980957, + "loss": 0.0994, + "losses_ref": -0.0005681588081642985, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6050, + "u": -3.5772476196289062, + "weight": 0.057164423167705536 + }, + { + "diff_generated": -28.485675811767578, + "epoch": 1.963707064160726, + "grad_norm": 2.4873118516999924, + "learning_rate": 2.5722558790869786e-07, + "logits/chosen": -2.335010528564453, + "logits/rejected": -2.248538017272949, + "logps/chosen": -18.01328468322754, + "logps/rejected": -488.8133239746094, + "logps_avg/chosen": -0.09517219662666321, + "logps_avg/rejected": -2.848567485809326, + "loss": 0.0946, + "losses_ref": -0.00048435464850626886, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6060, + "u": -3.477914333343506, + "weight": 0.08194047212600708 + }, + { + "diff_generated": -30.61606216430664, + "epoch": 1.966947504860661, + "grad_norm": 2.7205134584585933, + "learning_rate": 2.558177476355812e-07, + "logits/chosen": -2.380079507827759, + "logits/rejected": -2.3212497234344482, + "logps/chosen": -19.776500701904297, + "logps/rejected": -590.427978515625, + "logps_avg/chosen": -0.1100943312048912, + "logps_avg/rejected": -3.0616061687469482, + "loss": 0.0968, + "losses_ref": -0.00031202996615320444, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6070, + "u": -3.69507098197937, + "weight": 0.025381267070770264 + }, + { + "diff_generated": -27.755123138427734, + "epoch": 1.9701879455605962, + "grad_norm": 2.558741965970259, + "learning_rate": 2.544119571673031e-07, + "logits/chosen": -2.401541233062744, + "logits/rejected": -2.357109785079956, + "logps/chosen": -17.667585372924805, + "logps/rejected": -539.28515625, + "logps_avg/chosen": -0.09603724628686905, + "logps_avg/rejected": -2.7755126953125, + "loss": 0.0962, + "losses_ref": -0.00046056945575401187, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6080, + "u": -3.648192882537842, + "weight": 0.03804687783122063 + }, + { + "diff_generated": -28.202579498291016, + "epoch": 1.9734283862605313, + "grad_norm": 2.5918846208989015, + "learning_rate": 2.5300823648965267e-07, + "logits/chosen": -2.3541946411132812, + "logits/rejected": -2.293105363845825, + "logps/chosen": -16.276870727539062, + "logps/rejected": -532.5204467773438, + "logps_avg/chosen": -0.0957450419664383, + "logps_avg/rejected": -2.8202579021453857, + "loss": 0.0986, + "losses_ref": -0.0005978021072223783, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6090, + "u": -3.6244893074035645, + "weight": 0.04452654346823692 + }, + { + "diff_generated": -27.33587074279785, + "epoch": 1.9766688269604666, + "grad_norm": 2.5232128616627265, + "learning_rate": 2.516066055589937e-07, + "logits/chosen": -2.3826522827148438, + "logits/rejected": -2.3479719161987305, + "logps/chosen": -15.652276992797852, + "logps/rejected": -512.0379028320312, + "logps_avg/chosen": -0.09057996422052383, + "logps_avg/rejected": -2.7335870265960693, + "loss": 0.0972, + "losses_ref": -0.00041287043131887913, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6100, + "u": -3.5540382862091064, + "weight": 0.0630248486995697 + }, + { + "diff_generated": -29.23862075805664, + "epoch": 1.9799092676604018, + "grad_norm": 2.5030703804011942, + "learning_rate": 2.502070843019799e-07, + "logits/chosen": -2.3702540397644043, + "logits/rejected": -2.3024659156799316, + "logps/chosen": -18.456167221069336, + "logps/rejected": -559.3988037109375, + "logps_avg/chosen": -0.10194142907857895, + "logps_avg/rejected": -2.9238619804382324, + "loss": 0.0962, + "losses_ref": -0.00033079044078476727, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6110, + "u": -3.645400285720825, + "weight": 0.03791101649403572 + }, + { + "diff_generated": -29.87836265563965, + "epoch": 1.983149708360337, + "grad_norm": 2.4895773624046775, + "learning_rate": 2.4880969261527294e-07, + "logits/chosen": -2.3824191093444824, + "logits/rejected": -2.306915044784546, + "logps/chosen": -17.128381729125977, + "logps/rejected": -556.4266357421875, + "logps_avg/chosen": -0.10241828113794327, + "logps_avg/rejected": -2.9878363609313965, + "loss": 0.098, + "losses_ref": -0.0001866271486505866, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6120, + "u": -3.646498918533325, + "weight": 0.03771054744720459 + }, + { + "diff_generated": -27.77744483947754, + "epoch": 1.9863901490602722, + "grad_norm": 2.5199031859632006, + "learning_rate": 2.4741445036525814e-07, + "logits/chosen": -2.3641562461853027, + "logits/rejected": -2.2675235271453857, + "logps/chosen": -15.651080131530762, + "logps/rejected": -518.16455078125, + "logps_avg/chosen": -0.08747779577970505, + "logps_avg/rejected": -2.7777445316314697, + "loss": 0.0958, + "losses_ref": -0.00010507024126127362, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6130, + "u": -3.459134340286255, + "weight": 0.08761118352413177 + }, + { + "diff_generated": -26.81686782836914, + "epoch": 1.9896305897602073, + "grad_norm": 2.64875778356808, + "learning_rate": 2.460213773877635e-07, + "logits/chosen": -2.3199706077575684, + "logits/rejected": -2.2571868896484375, + "logps/chosen": -16.348854064941406, + "logps/rejected": -492.2103576660156, + "logps_avg/chosen": -0.09274972975254059, + "logps_avg/rejected": -2.6816866397857666, + "loss": 0.0976, + "losses_ref": -0.0004981858073733747, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6140, + "u": -3.4069740772247314, + "weight": 0.10068309307098389 + }, + { + "diff_generated": -29.773651123046875, + "epoch": 1.9928710304601425, + "grad_norm": 2.7060223991553105, + "learning_rate": 2.4463049348777666e-07, + "logits/chosen": -2.354555130004883, + "logits/rejected": -2.282043933868408, + "logps/chosen": -16.243932723999023, + "logps/rejected": -546.9548950195312, + "logps_avg/chosen": -0.0912802442908287, + "logps_avg/rejected": -2.977365016937256, + "loss": 0.0966, + "losses_ref": -0.0002333047305000946, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6150, + "u": -3.646449327468872, + "weight": 0.0378076434135437 + }, + { + "diff_generated": -27.507165908813477, + "epoch": 1.9961114711600778, + "grad_norm": 2.655346958804383, + "learning_rate": 2.4324181843916364e-07, + "logits/chosen": -2.380687952041626, + "logits/rejected": -2.311513662338257, + "logps/chosen": -20.26241111755371, + "logps/rejected": -494.68878173828125, + "logps_avg/chosen": -0.10843686014413834, + "logps_avg/rejected": -2.7507166862487793, + "loss": 0.0975, + "losses_ref": -0.0007293138187378645, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6160, + "u": -3.4976515769958496, + "weight": 0.07613696157932281 + }, + { + "diff_generated": -28.97373390197754, + "epoch": 1.999351911860013, + "grad_norm": 2.574664793901018, + "learning_rate": 2.4185537198438777e-07, + "logits/chosen": -2.4231228828430176, + "logits/rejected": -2.3481459617614746, + "logps/chosen": -18.296947479248047, + "logps/rejected": -531.1096801757812, + "logps_avg/chosen": -0.10054130852222443, + "logps_avg/rejected": -2.8973731994628906, + "loss": 0.096, + "losses_ref": -0.0007058627670630813, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6170, + "u": -3.6683974266052246, + "weight": 0.032234203070402145 + }, + { + "diff_generated": -30.53542709350586, + "epoch": 2.0025923525599483, + "grad_norm": 2.5901636980927556, + "learning_rate": 2.40471173834229e-07, + "logits/chosen": -2.397902488708496, + "logits/rejected": -2.3140671253204346, + "logps/chosen": -15.657752990722656, + "logps/rejected": -562.3306884765625, + "logps_avg/chosen": -0.08636067062616348, + "logps_avg/rejected": -3.0535435676574707, + "loss": 0.0829, + "losses_ref": -0.0028026457875967026, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6180, + "u": -5.210684776306152, + "weight": 0.02268834412097931 + }, + { + "diff_generated": -29.695453643798828, + "epoch": 2.0058327932598834, + "grad_norm": 2.571727590975468, + "learning_rate": 2.3908924366750385e-07, + "logits/chosen": -2.336836338043213, + "logits/rejected": -2.2530758380889893, + "logps/chosen": -13.952987670898438, + "logps/rejected": -537.3453369140625, + "logps_avg/chosen": -0.08065802603960037, + "logps_avg/rejected": -2.969545364379883, + "loss": 0.079, + "losses_ref": -0.0006570112309418619, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6190, + "u": -5.256657600402832, + "weight": 0.07569855451583862 + }, + { + "diff_generated": -31.76747703552246, + "epoch": 2.0090732339598185, + "grad_norm": 2.9585220072637743, + "learning_rate": 2.3770960113078505e-07, + "logits/chosen": -2.3309884071350098, + "logits/rejected": -2.2648239135742188, + "logps/chosen": -13.648565292358398, + "logps/rejected": -609.6402587890625, + "logps_avg/chosen": -0.07571685314178467, + "logps_avg/rejected": -3.1767475605010986, + "loss": 0.0789, + "losses_ref": -0.0007576612988486886, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6200, + "u": -5.403063774108887, + "weight": 0.05089284107089043 + }, + { + "diff_generated": -29.608722686767578, + "epoch": 2.0123136746597536, + "grad_norm": 2.567397987321688, + "learning_rate": 2.3633226583812304e-07, + "logits/chosen": -2.3135323524475098, + "logits/rejected": -2.234470844268799, + "logps/chosen": -14.518350601196289, + "logps/rejected": -518.7788696289062, + "logps_avg/chosen": -0.08325570076704025, + "logps_avg/rejected": -2.960872173309326, + "loss": 0.0773, + "losses_ref": -0.0017503865528851748, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6210, + "u": -5.509909629821777, + "weight": 0.033318065106868744 + }, + { + "diff_generated": -30.771066665649414, + "epoch": 2.0155541153596888, + "grad_norm": 2.4664748917173296, + "learning_rate": 2.3495725737076642e-07, + "logits/chosen": -2.3625543117523193, + "logits/rejected": -2.2188992500305176, + "logps/chosen": -14.710256576538086, + "logps/rejected": -569.1174926757812, + "logps_avg/chosen": -0.07622543722391129, + "logps_avg/rejected": -3.077106475830078, + "loss": 0.0782, + "losses_ref": -0.0008851033635437489, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6220, + "u": -5.327384948730469, + "weight": 0.06347521394491196 + }, + { + "diff_generated": -29.374542236328125, + "epoch": 2.0187945560596243, + "grad_norm": 2.5104772335097487, + "learning_rate": 2.3358459527688432e-07, + "logits/chosen": -2.334970235824585, + "logits/rejected": -2.214024066925049, + "logps/chosen": -14.989812850952148, + "logps/rejected": -542.8701171875, + "logps_avg/chosen": -0.08332401514053345, + "logps_avg/rejected": -2.9374542236328125, + "loss": 0.0797, + "losses_ref": -0.0009830545168370008, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6230, + "u": -5.472678184509277, + "weight": 0.03868403658270836 + }, + { + "diff_generated": -31.6467227935791, + "epoch": 2.0220349967595594, + "grad_norm": 2.592959712448732, + "learning_rate": 2.3221429907128734e-07, + "logits/chosen": -2.3337042331695557, + "logits/rejected": -2.2146174907684326, + "logps/chosen": -14.46679973602295, + "logps/rejected": -598.3749389648438, + "logps_avg/chosen": -0.08240757882595062, + "logps_avg/rejected": -3.1646721363067627, + "loss": 0.0776, + "losses_ref": -0.0014417509082704782, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6240, + "u": -5.394257545471191, + "weight": 0.051792167127132416 + }, + { + "diff_generated": -31.339035034179688, + "epoch": 2.0252754374594946, + "grad_norm": 2.70418171358979, + "learning_rate": 2.3084638823515136e-07, + "logits/chosen": -2.305694103240967, + "logits/rejected": -2.170424699783325, + "logps/chosen": -13.748693466186523, + "logps/rejected": -589.9194946289062, + "logps_avg/chosen": -0.07391176372766495, + "logps_avg/rejected": -3.1339030265808105, + "loss": 0.0768, + "losses_ref": -0.0010343308094888926, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6250, + "u": -5.400518417358398, + "weight": 0.0511666014790535 + }, + { + "diff_generated": -32.47716522216797, + "epoch": 2.0285158781594297, + "grad_norm": 2.7810488017581885, + "learning_rate": 2.2948088221573986e-07, + "logits/chosen": -2.3060505390167236, + "logits/rejected": -2.139094352722168, + "logps/chosen": -17.142841339111328, + "logps/rejected": -604.8924560546875, + "logps_avg/chosen": -0.09249739348888397, + "logps_avg/rejected": -3.2477169036865234, + "loss": 0.0789, + "losses_ref": -0.0014247202780097723, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6260, + "u": -5.4678754806518555, + "weight": 0.03933858126401901 + }, + { + "diff_generated": -32.207603454589844, + "epoch": 2.031756318859365, + "grad_norm": 2.6916719515805796, + "learning_rate": 2.2811780042612753e-07, + "logits/chosen": -2.2889606952667236, + "logits/rejected": -2.150083065032959, + "logps/chosen": -14.05456256866455, + "logps/rejected": -600.9343872070312, + "logps_avg/chosen": -0.07899312674999237, + "logps_avg/rejected": -3.2207603454589844, + "loss": 0.0787, + "losses_ref": -0.0009017119882628322, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6270, + "u": -5.3306355476379395, + "weight": 0.06367681920528412 + }, + { + "diff_generated": -32.79245376586914, + "epoch": 2.0349967595593, + "grad_norm": 2.7245432916785584, + "learning_rate": 2.267571622449246e-07, + "logits/chosen": -2.302232265472412, + "logits/rejected": -2.132932662963867, + "logps/chosen": -14.357002258300781, + "logps/rejected": -575.6806030273438, + "logps_avg/chosen": -0.0796835646033287, + "logps_avg/rejected": -3.279245376586914, + "loss": 0.0793, + "losses_ref": -0.00011451655154814944, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6280, + "u": -5.25557279586792, + "weight": 0.07510305941104889 + }, + { + "diff_generated": -31.563732147216797, + "epoch": 2.038237200259235, + "grad_norm": 2.6092658044234533, + "learning_rate": 2.2539898701600082e-07, + "logits/chosen": -2.2864041328430176, + "logits/rejected": -2.1257951259613037, + "logps/chosen": -14.23120403289795, + "logps/rejected": -566.61083984375, + "logps_avg/chosen": -0.07742153108119965, + "logps_avg/rejected": -3.1563732624053955, + "loss": 0.0769, + "losses_ref": -0.0016743981977924705, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6290, + "u": -5.29136323928833, + "weight": 0.0709114819765091 + }, + { + "diff_generated": -33.81085968017578, + "epoch": 2.0414776409591706, + "grad_norm": 2.707014491753683, + "learning_rate": 2.2404329404821086e-07, + "logits/chosen": -2.279618740081787, + "logits/rejected": -2.113089084625244, + "logps/chosen": -15.665722846984863, + "logps/rejected": -586.9880981445312, + "logps_avg/chosen": -0.08939939737319946, + "logps_avg/rejected": -3.3810858726501465, + "loss": 0.08, + "losses_ref": -0.00024769414449110627, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6300, + "u": -5.396568298339844, + "weight": 0.05024506896734238 + }, + { + "diff_generated": -30.7529296875, + "epoch": 2.0447180816591057, + "grad_norm": 2.9484328369759427, + "learning_rate": 2.2269010261511974e-07, + "logits/chosen": -2.31408429145813, + "logits/rejected": -2.150662899017334, + "logps/chosen": -16.079818725585938, + "logps/rejected": -563.0985107421875, + "logps_avg/chosen": -0.07982766628265381, + "logps_avg/rejected": -3.0752930641174316, + "loss": 0.0779, + "losses_ref": -0.0020618215203285217, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6310, + "u": -5.365875720977783, + "weight": 0.05885094404220581 + }, + { + "diff_generated": -31.242420196533203, + "epoch": 2.047958522359041, + "grad_norm": 2.8357529765475378, + "learning_rate": 2.2133943195472874e-07, + "logits/chosen": -2.3051440715789795, + "logits/rejected": -2.2008893489837646, + "logps/chosen": -13.617289543151855, + "logps/rejected": -574.6190185546875, + "logps_avg/chosen": -0.07658557593822479, + "logps_avg/rejected": -3.124242067337036, + "loss": 0.076, + "losses_ref": -0.0007927521946839988, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6320, + "u": -5.217470169067383, + "weight": 0.08214583247900009 + }, + { + "diff_generated": -33.446495056152344, + "epoch": 2.051198963058976, + "grad_norm": 2.792880239658355, + "learning_rate": 2.1999130126920158e-07, + "logits/chosen": -2.334336519241333, + "logits/rejected": -2.1936802864074707, + "logps/chosen": -13.958539962768555, + "logps/rejected": -610.8575439453125, + "logps_avg/chosen": -0.08001341670751572, + "logps_avg/rejected": -3.3446497917175293, + "loss": 0.0772, + "losses_ref": -0.0007364677148871124, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6330, + "u": -5.504420757293701, + "weight": 0.03204908221960068 + }, + { + "diff_generated": -33.31673049926758, + "epoch": 2.054439403758911, + "grad_norm": 2.765456961949322, + "learning_rate": 2.1864572972459228e-07, + "logits/chosen": -2.305427074432373, + "logits/rejected": -2.1972203254699707, + "logps/chosen": -11.824773788452148, + "logps/rejected": -606.9254150390625, + "logps_avg/chosen": -0.06999178230762482, + "logps_avg/rejected": -3.3316726684570312, + "loss": 0.0784, + "losses_ref": -0.00028468476375564933, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6340, + "u": -5.2944512367248535, + "weight": 0.06903310865163803 + }, + { + "diff_generated": -33.31203842163086, + "epoch": 2.057679844458846, + "grad_norm": 2.7025491580315513, + "learning_rate": 2.1730273645057173e-07, + "logits/chosen": -2.2650606632232666, + "logits/rejected": -2.115473508834839, + "logps/chosen": -15.001439094543457, + "logps/rejected": -572.0594482421875, + "logps_avg/chosen": -0.08333877474069595, + "logps_avg/rejected": -3.3312039375305176, + "loss": 0.0755, + "losses_ref": -0.0007068266859278083, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6350, + "u": -5.579641819000244, + "weight": 0.019526129588484764 + }, + { + "diff_generated": -31.222393035888672, + "epoch": 2.060920285158782, + "grad_norm": 2.8444818657075537, + "learning_rate": 2.1596234054015654e-07, + "logits/chosen": -2.2948861122131348, + "logits/rejected": -2.157444715499878, + "logps/chosen": -14.057197570800781, + "logps/rejected": -567.6610107421875, + "logps_avg/chosen": -0.08014727383852005, + "logps_avg/rejected": -3.122239589691162, + "loss": 0.0787, + "losses_ref": -0.000226367570576258, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6360, + "u": -5.466059684753418, + "weight": 0.03770763427019119 + }, + { + "diff_generated": -32.59468078613281, + "epoch": 2.064160725858717, + "grad_norm": 2.7306308341749213, + "learning_rate": 2.1462456104943692e-07, + "logits/chosen": -2.2609167098999023, + "logits/rejected": -2.0981216430664062, + "logps/chosen": -13.265954971313477, + "logps/rejected": -601.2452392578125, + "logps_avg/chosen": -0.07434628903865814, + "logps_avg/rejected": -3.259467601776123, + "loss": 0.0775, + "losses_ref": -0.00033033458748832345, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6370, + "u": -5.3675761222839355, + "weight": 0.05661793425679207 + }, + { + "diff_generated": -31.082448959350586, + "epoch": 2.067401166558652, + "grad_norm": 2.7488718393815987, + "learning_rate": 2.132894169973063e-07, + "logits/chosen": -2.330197811126709, + "logits/rejected": -2.1672775745391846, + "logps/chosen": -14.261502265930176, + "logps/rejected": -547.7105712890625, + "logps_avg/chosen": -0.07912299036979675, + "logps_avg/rejected": -3.1082448959350586, + "loss": 0.0793, + "losses_ref": -0.00029022307717241347, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6380, + "u": -5.365514278411865, + "weight": 0.05652255937457085 + }, + { + "diff_generated": -32.763587951660156, + "epoch": 2.070641607258587, + "grad_norm": 2.7081410301841835, + "learning_rate": 2.1195692736519013e-07, + "logits/chosen": -2.313753128051758, + "logits/rejected": -2.0885937213897705, + "logps/chosen": -14.403570175170898, + "logps/rejected": -634.0228271484375, + "logps_avg/chosen": -0.08032946288585663, + "logps_avg/rejected": -3.2763583660125732, + "loss": 0.0798, + "losses_ref": -0.0003196417819708586, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6390, + "u": -5.469879150390625, + "weight": 0.037800583988428116 + }, + { + "diff_generated": -32.15003967285156, + "epoch": 2.0738820479585223, + "grad_norm": 2.4964061534649042, + "learning_rate": 2.1062711109677757e-07, + "logits/chosen": -2.3299503326416016, + "logits/rejected": -2.1573243141174316, + "logps/chosen": -14.105905532836914, + "logps/rejected": -579.3314208984375, + "logps_avg/chosen": -0.0796867087483406, + "logps_avg/rejected": -3.2150039672851562, + "loss": 0.0768, + "losses_ref": -0.00031392709934152663, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6400, + "u": -5.433914661407471, + "weight": 0.04409373179078102 + }, + { + "diff_generated": -32.85166931152344, + "epoch": 2.0771224886584574, + "grad_norm": 2.6173716641305895, + "learning_rate": 2.0929998709775068e-07, + "logits/chosen": -2.301513433456421, + "logits/rejected": -2.072667121887207, + "logps/chosen": -14.194091796875, + "logps/rejected": -551.7338256835938, + "logps_avg/chosen": -0.07612624764442444, + "logps_avg/rejected": -3.2851669788360596, + "loss": 0.0764, + "losses_ref": -0.0004419487959239632, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6410, + "u": -5.298669815063477, + "weight": 0.06921719759702682 + }, + { + "diff_generated": -33.96429443359375, + "epoch": 2.080362929358393, + "grad_norm": 2.798558040379388, + "learning_rate": 2.0797557423551574e-07, + "logits/chosen": -2.304396152496338, + "logits/rejected": -2.1104958057403564, + "logps/chosen": -14.470054626464844, + "logps/rejected": -656.9310302734375, + "logps_avg/chosen": -0.07742521166801453, + "logps_avg/rejected": -3.3964295387268066, + "loss": 0.0779, + "losses_ref": -0.0004889139672741294, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6420, + "u": -5.361597537994385, + "weight": 0.056789349764585495 + }, + { + "diff_generated": -32.24937057495117, + "epoch": 2.083603370058328, + "grad_norm": 2.6907956388834, + "learning_rate": 2.066538913389361e-07, + "logits/chosen": -2.2897346019744873, + "logits/rejected": -2.14884614944458, + "logps/chosen": -14.133687019348145, + "logps/rejected": -573.21923828125, + "logps_avg/chosen": -0.07917702943086624, + "logps_avg/rejected": -3.2249374389648438, + "loss": 0.0789, + "losses_ref": -0.0005835826741531491, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6430, + "u": -5.470171928405762, + "weight": 0.038123439997434616 + }, + { + "diff_generated": -33.30463790893555, + "epoch": 2.086843810758263, + "grad_norm": 3.1474384018069403, + "learning_rate": 2.053349571980635e-07, + "logits/chosen": -2.3388473987579346, + "logits/rejected": -2.123534679412842, + "logps/chosen": -13.677331924438477, + "logps/rejected": -579.9287719726562, + "logps_avg/chosen": -0.07830803096294403, + "logps_avg/rejected": -3.3304641246795654, + "loss": 0.0768, + "losses_ref": -0.00024560894235037267, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6440, + "u": -5.5442609786987305, + "weight": 0.025242527946829796 + }, + { + "diff_generated": -35.16442108154297, + "epoch": 2.0900842514581983, + "grad_norm": 2.5503238617943627, + "learning_rate": 2.0401879056387155e-07, + "logits/chosen": -2.2443032264709473, + "logits/rejected": -2.050220489501953, + "logps/chosen": -13.342799186706543, + "logps/rejected": -631.4598388671875, + "logps_avg/chosen": -0.07115190476179123, + "logps_avg/rejected": -3.516442060470581, + "loss": 0.0781, + "losses_ref": -0.0018637517932802439, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6450, + "u": -5.328794002532959, + "weight": 0.06499633193016052 + }, + { + "diff_generated": -31.10691261291504, + "epoch": 2.0933246921581334, + "grad_norm": 2.955648628374549, + "learning_rate": 2.0270541014798864e-07, + "logits/chosen": -2.252042531967163, + "logits/rejected": -2.037557601928711, + "logps/chosen": -13.579818725585938, + "logps/rejected": -583.5672607421875, + "logps_avg/chosen": -0.07687920331954956, + "logps_avg/rejected": -3.1106910705566406, + "loss": 0.0786, + "losses_ref": -0.0009676915360614657, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6460, + "u": -5.223965167999268, + "weight": 0.08236159384250641 + }, + { + "diff_generated": -34.064735412597656, + "epoch": 2.0965651328580686, + "grad_norm": 2.6542132953205413, + "learning_rate": 2.0139483462243225e-07, + "logits/chosen": -2.227311372756958, + "logits/rejected": -2.0686700344085693, + "logps/chosen": -12.253379821777344, + "logps/rejected": -615.8561401367188, + "logps_avg/chosen": -0.07306130230426788, + "logps_avg/rejected": -3.4064738750457764, + "loss": 0.079, + "losses_ref": -0.0022037180606275797, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6470, + "u": -5.328154563903809, + "weight": 0.06612833589315414 + }, + { + "diff_generated": -33.95252990722656, + "epoch": 2.0998055735580037, + "grad_norm": 2.9236702346226657, + "learning_rate": 2.00087082619343e-07, + "logits/chosen": -2.2646069526672363, + "logits/rejected": -2.100008487701416, + "logps/chosen": -13.059414863586426, + "logps/rejected": -631.9442138671875, + "logps_avg/chosen": -0.08072508871555328, + "logps_avg/rejected": -3.3952529430389404, + "loss": 0.0782, + "losses_ref": -0.0003588471154216677, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6480, + "u": -5.472105979919434, + "weight": 0.03786256164312363 + }, + { + "diff_generated": -32.90629196166992, + "epoch": 2.1030460142579392, + "grad_norm": 2.673428354153091, + "learning_rate": 1.9878217273072116e-07, + "logits/chosen": -2.2418177127838135, + "logits/rejected": -2.12835431098938, + "logps/chosen": -12.385138511657715, + "logps/rejected": -594.1697387695312, + "logps_avg/chosen": -0.07691224664449692, + "logps_avg/rejected": -3.2906289100646973, + "loss": 0.0781, + "losses_ref": -0.0013034009607508779, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6490, + "u": -5.400871753692627, + "weight": 0.05187705159187317 + }, + { + "diff_generated": -30.338708877563477, + "epoch": 2.1062864549578744, + "grad_norm": 2.824418049212177, + "learning_rate": 1.974801235081602e-07, + "logits/chosen": -2.232614517211914, + "logits/rejected": -2.101982831954956, + "logps/chosen": -13.843696594238281, + "logps/rejected": -548.92724609375, + "logps_avg/chosen": -0.08000337332487106, + "logps_avg/rejected": -3.0338706970214844, + "loss": 0.0799, + "losses_ref": -0.00043142749927937984, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6500, + "u": -5.224959373474121, + "weight": 0.08169388771057129 + }, + { + "diff_generated": -31.706974029541016, + "epoch": 2.1095268956578095, + "grad_norm": 2.9654927434221228, + "learning_rate": 1.9618095346258485e-07, + "logits/chosen": -2.2640607357025146, + "logits/rejected": -2.08363938331604, + "logps/chosen": -13.451878547668457, + "logps/rejected": -562.90966796875, + "logps_avg/chosen": -0.07569324225187302, + "logps_avg/rejected": -3.1706976890563965, + "loss": 0.079, + "losses_ref": -0.0006684382678940892, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6510, + "u": -5.258929252624512, + "weight": 0.07574840635061264 + }, + { + "diff_generated": -30.676361083984375, + "epoch": 2.1127673363577446, + "grad_norm": 2.8414460475953054, + "learning_rate": 1.948846810639871e-07, + "logits/chosen": -2.3207340240478516, + "logits/rejected": -2.166395664215088, + "logps/chosen": -15.973104476928711, + "logps/rejected": -601.5938720703125, + "logps_avg/chosen": -0.08310873061418533, + "logps_avg/rejected": -3.067636251449585, + "loss": 0.0812, + "losses_ref": -0.0016424820059910417, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6520, + "u": -5.4342474937438965, + "weight": 0.04602918028831482 + }, + { + "diff_generated": -33.77393341064453, + "epoch": 2.1160077770576797, + "grad_norm": 2.633707010619091, + "learning_rate": 1.9359132474116374e-07, + "logits/chosen": -2.316953420639038, + "logits/rejected": -2.1647286415100098, + "logps/chosen": -14.174545288085938, + "logps/rejected": -579.6941528320312, + "logps_avg/chosen": -0.07961226999759674, + "logps_avg/rejected": -3.3773937225341797, + "loss": 0.079, + "losses_ref": -0.001250475412234664, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6530, + "u": -5.469130039215088, + "weight": 0.039084821939468384 + }, + { + "diff_generated": -31.735015869140625, + "epoch": 2.119248217757615, + "grad_norm": 2.719005475319987, + "learning_rate": 1.923009028814545e-07, + "logits/chosen": -2.298225164413452, + "logits/rejected": -2.190268039703369, + "logps/chosen": -14.34514331817627, + "logps/rejected": -585.7277221679688, + "logps_avg/chosen": -0.08426286280155182, + "logps_avg/rejected": -3.173501491546631, + "loss": 0.0797, + "losses_ref": -0.0006787871243432164, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6540, + "u": -5.432929515838623, + "weight": 0.044521529227495193 + }, + { + "diff_generated": -32.375831604003906, + "epoch": 2.1224886584575504, + "grad_norm": 2.7209691896498986, + "learning_rate": 1.910134338304804e-07, + "logits/chosen": -2.2743377685546875, + "logits/rejected": -2.1104390621185303, + "logps/chosen": -14.415522575378418, + "logps/rejected": -597.8012084960938, + "logps_avg/chosen": -0.08089585602283478, + "logps_avg/rejected": -3.237583637237549, + "loss": 0.0771, + "losses_ref": -0.001353858271613717, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6550, + "u": -5.330113410949707, + "weight": 0.06419380009174347 + }, + { + "diff_generated": -32.30270767211914, + "epoch": 2.1257290991574855, + "grad_norm": 2.7649372131038845, + "learning_rate": 1.897289358918834e-07, + "logits/chosen": -2.2590532302856445, + "logits/rejected": -2.0955405235290527, + "logps/chosen": -13.316218376159668, + "logps/rejected": -587.7503662109375, + "logps_avg/chosen": -0.07635252177715302, + "logps_avg/rejected": -3.230271100997925, + "loss": 0.0771, + "losses_ref": -0.0015015669632703066, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6560, + "u": -5.469440460205078, + "weight": 0.039542146027088165 + }, + { + "diff_generated": -30.48178482055664, + "epoch": 2.1289695398574207, + "grad_norm": 3.028113436370566, + "learning_rate": 1.8844742732706508e-07, + "logits/chosen": -2.26723051071167, + "logits/rejected": -2.130545139312744, + "logps/chosen": -13.461906433105469, + "logps/rejected": -548.2975463867188, + "logps_avg/chosen": -0.07495727390050888, + "logps_avg/rejected": -3.0481784343719482, + "loss": 0.0752, + "losses_ref": -0.0005471274489536881, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6570, + "u": -5.119858741760254, + "weight": 0.10057337582111359 + }, + { + "diff_generated": -33.4208869934082, + "epoch": 2.1322099805573558, + "grad_norm": 2.774153777490152, + "learning_rate": 1.8716892635492906e-07, + "logits/chosen": -2.309311628341675, + "logits/rejected": -2.1527256965637207, + "logps/chosen": -13.22758674621582, + "logps/rejected": -617.5025634765625, + "logps_avg/chosen": -0.07590781897306442, + "logps_avg/rejected": -3.342088222503662, + "loss": 0.075, + "losses_ref": -0.0011381434742361307, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6580, + "u": -5.4658942222595215, + "weight": 0.0390189066529274 + }, + { + "diff_generated": -32.68239974975586, + "epoch": 2.135450421257291, + "grad_norm": 2.9090982333609197, + "learning_rate": 1.8589345115161948e-07, + "logits/chosen": -2.3101859092712402, + "logits/rejected": -2.113839626312256, + "logps/chosen": -14.117950439453125, + "logps/rejected": -594.4866333007812, + "logps_avg/chosen": -0.07787175476551056, + "logps_avg/rejected": -3.268240451812744, + "loss": 0.0768, + "losses_ref": -0.00029373442521318793, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6590, + "u": -5.369537830352783, + "weight": 0.05655078962445259 + }, + { + "diff_generated": -30.384124755859375, + "epoch": 2.138690861957226, + "grad_norm": 2.5312553909221207, + "learning_rate": 1.846210198502646e-07, + "logits/chosen": -2.303065776824951, + "logits/rejected": -2.1762404441833496, + "logps/chosen": -12.848559379577637, + "logps/rejected": -539.0001220703125, + "logps_avg/chosen": -0.0744701474905014, + "logps_avg/rejected": -3.038412570953369, + "loss": 0.0765, + "losses_ref": -0.0014434943441301584, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6600, + "u": -5.364091873168945, + "weight": 0.05824242904782295 + }, + { + "diff_generated": -31.471532821655273, + "epoch": 2.141931302657161, + "grad_norm": 2.7702712125169553, + "learning_rate": 1.8335165054071795e-07, + "logits/chosen": -2.2743592262268066, + "logits/rejected": -2.2176575660705566, + "logps/chosen": -12.365696907043457, + "logps/rejected": -598.3367309570312, + "logps_avg/chosen": -0.07541408389806747, + "logps_avg/rejected": -3.14715313911438, + "loss": 0.0776, + "losses_ref": -0.0008119974518194795, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6610, + "u": -5.437493801116943, + "weight": 0.044645097106695175 + }, + { + "diff_generated": -35.88068389892578, + "epoch": 2.1451717433570967, + "grad_norm": 2.8827133227774215, + "learning_rate": 1.8208536126930173e-07, + "logits/chosen": -2.31424617767334, + "logits/rejected": -2.1438095569610596, + "logps/chosen": -14.181668281555176, + "logps/rejected": -659.5066528320312, + "logps_avg/chosen": -0.08195123821496964, + "logps_avg/rejected": -3.5880680084228516, + "loss": 0.08, + "losses_ref": -0.00035342806950211525, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6620, + "u": -5.541591644287109, + "weight": 0.02537021040916443 + }, + { + "diff_generated": -32.14451599121094, + "epoch": 2.148412184057032, + "grad_norm": 2.6764558944119945, + "learning_rate": 1.8082217003854933e-07, + "logits/chosen": -2.3081860542297363, + "logits/rejected": -2.1434569358825684, + "logps/chosen": -13.4950590133667, + "logps/rejected": -587.8795776367188, + "logps_avg/chosen": -0.0746789425611496, + "logps_avg/rejected": -3.214451551437378, + "loss": 0.078, + "losses_ref": -0.001070145284757018, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6630, + "u": -5.292096138000488, + "weight": 0.07006745040416718 + }, + { + "diff_generated": -34.20409393310547, + "epoch": 2.151652624756967, + "grad_norm": 2.7582898602691706, + "learning_rate": 1.7956209480695087e-07, + "logits/chosen": -2.2990448474884033, + "logits/rejected": -2.1080923080444336, + "logps/chosen": -13.895889282226562, + "logps/rejected": -613.466552734375, + "logps_avg/chosen": -0.08254117518663406, + "logps_avg/rejected": -3.4204094409942627, + "loss": 0.0778, + "losses_ref": -0.000966493331361562, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6640, + "u": -5.470082759857178, + "weight": 0.038777273148298264 + }, + { + "diff_generated": -29.56864356994629, + "epoch": 2.154893065456902, + "grad_norm": 2.933211681518209, + "learning_rate": 1.7830515348869664e-07, + "logits/chosen": -2.2532479763031006, + "logits/rejected": -2.14927339553833, + "logps/chosen": -13.479177474975586, + "logps/rejected": -545.1463012695312, + "logps_avg/chosen": -0.07876866310834885, + "logps_avg/rejected": -2.956864356994629, + "loss": 0.0791, + "losses_ref": -0.0007795925484970212, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6650, + "u": -5.326550006866455, + "weight": 0.06341485679149628 + }, + { + "diff_generated": -30.022064208984375, + "epoch": 2.158133506156837, + "grad_norm": 2.5585770502772993, + "learning_rate": 1.770513639534225e-07, + "logits/chosen": -2.2722833156585693, + "logits/rejected": -2.1155576705932617, + "logps/chosen": -13.819868087768555, + "logps/rejected": -539.40966796875, + "logps_avg/chosen": -0.075398750603199, + "logps_avg/rejected": -3.002206325531006, + "loss": 0.0774, + "losses_ref": -0.0006787824677303433, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6660, + "u": -5.14738655090332, + "weight": 0.09448657929897308 + }, + { + "diff_generated": -31.8614559173584, + "epoch": 2.1613739468567728, + "grad_norm": 2.840762893548993, + "learning_rate": 1.7580074402595698e-07, + "logits/chosen": -2.2734367847442627, + "logits/rejected": -2.13392972946167, + "logps/chosen": -12.674986839294434, + "logps/rejected": -590.9808349609375, + "logps_avg/chosen": -0.06906630098819733, + "logps_avg/rejected": -3.186145305633545, + "loss": 0.0756, + "losses_ref": -0.0005319962510839105, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6670, + "u": -5.151522159576416, + "weight": 0.09430833160877228 + }, + { + "diff_generated": -34.026451110839844, + "epoch": 2.164614387556708, + "grad_norm": 2.7539821387470984, + "learning_rate": 1.7455331148606618e-07, + "logits/chosen": -2.2525477409362793, + "logits/rejected": -2.076921224594116, + "logps/chosen": -13.29094123840332, + "logps/rejected": -592.8287353515625, + "logps_avg/chosen": -0.07590679079294205, + "logps_avg/rejected": -3.4026455879211426, + "loss": 0.0746, + "losses_ref": -0.00040179031202569604, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6680, + "u": -5.470437526702881, + "weight": 0.03791480511426926 + }, + { + "diff_generated": -30.604236602783203, + "epoch": 2.167854828256643, + "grad_norm": 2.6701230239114015, + "learning_rate": 1.7330908406820237e-07, + "logits/chosen": -2.276629686355591, + "logits/rejected": -2.1089656352996826, + "logps/chosen": -12.869527816772461, + "logps/rejected": -558.6658325195312, + "logps_avg/chosen": -0.07457348704338074, + "logps_avg/rejected": -3.0604236125946045, + "loss": 0.0775, + "losses_ref": -0.00016242492711171508, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6690, + "u": -5.397185325622559, + "weight": 0.05015290901064873 + }, + { + "diff_generated": -31.642303466796875, + "epoch": 2.171095268956578, + "grad_norm": 2.7561051877511447, + "learning_rate": 1.7206807946125123e-07, + "logits/chosen": -2.289897918701172, + "logits/rejected": -2.0907204151153564, + "logps/chosen": -14.516186714172363, + "logps/rejected": -578.0810546875, + "logps_avg/chosen": -0.07717464864253998, + "logps_avg/rejected": -3.1642303466796875, + "loss": 0.0782, + "losses_ref": -0.0008535755914635956, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6700, + "u": -5.4019575119018555, + "weight": 0.05095864459872246 + }, + { + "diff_generated": -32.917266845703125, + "epoch": 2.1743357096565132, + "grad_norm": 2.7672422197678537, + "learning_rate": 1.7083031530828072e-07, + "logits/chosen": -2.2950797080993652, + "logits/rejected": -2.0778796672821045, + "logps/chosen": -15.119161605834961, + "logps/rejected": -578.9310913085938, + "logps_avg/chosen": -0.08268582075834274, + "logps_avg/rejected": -3.291726589202881, + "loss": 0.0775, + "losses_ref": -0.0006864489405415952, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6710, + "u": -5.572969436645508, + "weight": 0.01962578110396862 + }, + { + "diff_generated": -33.86672592163086, + "epoch": 2.1775761503564484, + "grad_norm": 2.7065540236942005, + "learning_rate": 1.6959580920628937e-07, + "logits/chosen": -2.2735941410064697, + "logits/rejected": -2.1080164909362793, + "logps/chosen": -13.885465621948242, + "logps/rejected": -576.6862182617188, + "logps_avg/chosen": -0.08365106582641602, + "logps_avg/rejected": -3.386672258377075, + "loss": 0.0789, + "losses_ref": -0.0004785112105309963, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6720, + "u": -5.434020042419434, + "weight": 0.04426304250955582 + }, + { + "diff_generated": -32.36909866333008, + "epoch": 2.1808165910563835, + "grad_norm": 2.8643963575772133, + "learning_rate": 1.6836457870595783e-07, + "logits/chosen": -2.240960121154785, + "logits/rejected": -2.0428242683410645, + "logps/chosen": -13.639554977416992, + "logps/rejected": -571.4022216796875, + "logps_avg/chosen": -0.07686988264322281, + "logps_avg/rejected": -3.236909866333008, + "loss": 0.0766, + "losses_ref": -0.0023015381302684546, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6730, + "u": -5.292568683624268, + "weight": 0.07232372462749481 + }, + { + "diff_generated": -34.247276306152344, + "epoch": 2.184057031756319, + "grad_norm": 2.896275016681993, + "learning_rate": 1.6713664131139723e-07, + "logits/chosen": -2.252135753631592, + "logits/rejected": -2.0358219146728516, + "logps/chosen": -13.957839965820312, + "logps/rejected": -590.4846801757812, + "logps_avg/chosen": -0.07644806802272797, + "logps_avg/rejected": -3.42472767829895, + "loss": 0.0786, + "losses_ref": -0.00032314620329998434, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6740, + "u": -5.4735212326049805, + "weight": 0.03785574808716774 + }, + { + "diff_generated": -30.658884048461914, + "epoch": 2.187297472456254, + "grad_norm": 3.0652940579384653, + "learning_rate": 1.659120144799019e-07, + "logits/chosen": -2.3066296577453613, + "logits/rejected": -2.1544699668884277, + "logps/chosen": -13.968292236328125, + "logps/rejected": -547.006591796875, + "logps_avg/chosen": -0.08002481609582901, + "logps_avg/rejected": -3.0658886432647705, + "loss": 0.078, + "losses_ref": -0.00048600314767099917, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6750, + "u": -5.3647356033325195, + "weight": 0.0567636601626873 + }, + { + "diff_generated": -31.262588500976562, + "epoch": 2.1905379131561893, + "grad_norm": 2.728896191443824, + "learning_rate": 1.6469071562170114e-07, + "logits/chosen": -2.3213558197021484, + "logits/rejected": -2.1392080783843994, + "logps/chosen": -13.566169738769531, + "logps/rejected": -572.1866455078125, + "logps_avg/chosen": -0.07140545547008514, + "logps_avg/rejected": -3.1262588500976562, + "loss": 0.0769, + "losses_ref": -0.0004521248338278383, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6760, + "u": -5.292697429656982, + "weight": 0.06922182440757751 + }, + { + "diff_generated": -31.948001861572266, + "epoch": 2.1937783538561244, + "grad_norm": 2.7531998569337452, + "learning_rate": 1.6347276209971024e-07, + "logits/chosen": -2.285494804382324, + "logits/rejected": -2.1591503620147705, + "logps/chosen": -12.3690185546875, + "logps/rejected": -594.0634765625, + "logps_avg/chosen": -0.07330231368541718, + "logps_avg/rejected": -3.19480037689209, + "loss": 0.0763, + "losses_ref": -0.0008386679110117257, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6770, + "u": -5.331079959869385, + "weight": 0.06344042718410492 + }, + { + "diff_generated": -33.420738220214844, + "epoch": 2.1970187945560595, + "grad_norm": 2.9338438350002782, + "learning_rate": 1.6225817122928534e-07, + "logits/chosen": -2.2678208351135254, + "logits/rejected": -2.0875954627990723, + "logps/chosen": -12.525259017944336, + "logps/rejected": -606.2054443359375, + "logps_avg/chosen": -0.07486884295940399, + "logps_avg/rejected": -3.342073917388916, + "loss": 0.077, + "losses_ref": -0.000290944502921775, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6780, + "u": -5.362326145172119, + "weight": 0.05652598291635513 + }, + { + "diff_generated": -32.1254997253418, + "epoch": 2.2002592352559946, + "grad_norm": 2.675527870675733, + "learning_rate": 1.6104696027797635e-07, + "logits/chosen": -2.206605911254883, + "logits/rejected": -2.1036322116851807, + "logps/chosen": -12.653177261352539, + "logps/rejected": -598.8486938476562, + "logps_avg/chosen": -0.07566668093204498, + "logps_avg/rejected": -3.2125496864318848, + "loss": 0.076, + "losses_ref": -0.00034772203071042895, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6790, + "u": -5.33026123046875, + "weight": 0.06287422776222229 + }, + { + "diff_generated": -31.460948944091797, + "epoch": 2.20349967595593, + "grad_norm": 2.816219731944842, + "learning_rate": 1.5983914646528193e-07, + "logits/chosen": -2.2932732105255127, + "logits/rejected": -2.10170578956604, + "logps/chosen": -14.501579284667969, + "logps/rejected": -575.257568359375, + "logps_avg/chosen": -0.08044446259737015, + "logps_avg/rejected": -3.146095037460327, + "loss": 0.077, + "losses_ref": -0.0013265017187222838, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6800, + "u": -5.328404903411865, + "weight": 0.06457807123661041 + }, + { + "diff_generated": -31.099594116210938, + "epoch": 2.2067401166558653, + "grad_norm": 2.6429539812751006, + "learning_rate": 1.5863474696240365e-07, + "logits/chosen": -2.329437255859375, + "logits/rejected": -2.1275200843811035, + "logps/chosen": -14.690841674804688, + "logps/rejected": -565.6914672851562, + "logps_avg/chosen": -0.07997091859579086, + "logps_avg/rejected": -3.109959125518799, + "loss": 0.0778, + "losses_ref": -0.003032374195754528, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6810, + "u": -5.390047550201416, + "weight": 0.056095026433467865 + }, + { + "diff_generated": -31.575679779052734, + "epoch": 2.2099805573558005, + "grad_norm": 2.812628607977736, + "learning_rate": 1.5743377889200388e-07, + "logits/chosen": -2.274228572845459, + "logits/rejected": -2.1092209815979004, + "logps/chosen": -14.860143661499023, + "logps/rejected": -590.8905639648438, + "logps_avg/chosen": -0.08572975546121597, + "logps_avg/rejected": -3.1575682163238525, + "loss": 0.0802, + "losses_ref": -0.0005182913737371564, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6820, + "u": -5.398507118225098, + "weight": 0.05052924156188965 + }, + { + "diff_generated": -32.276546478271484, + "epoch": 2.2132209980557356, + "grad_norm": 2.7600270080887004, + "learning_rate": 1.5623625932795994e-07, + "logits/chosen": -2.312800645828247, + "logits/rejected": -2.0970990657806396, + "logps/chosen": -14.993402481079102, + "logps/rejected": -598.1676025390625, + "logps_avg/chosen": -0.08526170253753662, + "logps_avg/rejected": -3.2276546955108643, + "loss": 0.0805, + "losses_ref": -0.0005118830013088882, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6830, + "u": -5.5376434326171875, + "weight": 0.025591537356376648 + }, + { + "diff_generated": -32.459495544433594, + "epoch": 2.2164614387556707, + "grad_norm": 3.2774846208784205, + "learning_rate": 1.5504220529512324e-07, + "logits/chosen": -2.2891764640808105, + "logits/rejected": -2.115154981613159, + "logps/chosen": -13.242490768432617, + "logps/rejected": -590.5769653320312, + "logps_avg/chosen": -0.07609592378139496, + "logps_avg/rejected": -3.2459495067596436, + "loss": 0.0796, + "losses_ref": -0.0008921163389459252, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6840, + "u": -5.542930603027344, + "weight": 0.02612874284386635 + }, + { + "diff_generated": -33.76580047607422, + "epoch": 2.219701879455606, + "grad_norm": 2.747927789084302, + "learning_rate": 1.5385163376907636e-07, + "logits/chosen": -2.3085949420928955, + "logits/rejected": -2.064383029937744, + "logps/chosen": -14.200657844543457, + "logps/rejected": -614.0256958007812, + "logps_avg/chosen": -0.08223428577184677, + "logps_avg/rejected": -3.376579761505127, + "loss": 0.0762, + "losses_ref": -0.0007276682299561799, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6850, + "u": -5.363381862640381, + "weight": 0.05715851113200188 + }, + { + "diff_generated": -32.7970085144043, + "epoch": 2.222942320155541, + "grad_norm": 2.788225867383144, + "learning_rate": 1.526645616758921e-07, + "logits/chosen": -2.235722064971924, + "logits/rejected": -2.0513463020324707, + "logps/chosen": -13.610944747924805, + "logps/rejected": -576.8745727539062, + "logps_avg/chosen": -0.08273743093013763, + "logps_avg/rejected": -3.279700517654419, + "loss": 0.0777, + "losses_ref": -0.0009066167986020446, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6860, + "u": -5.363965034484863, + "weight": 0.05731652304530144 + }, + { + "diff_generated": -33.81819152832031, + "epoch": 2.2261827608554765, + "grad_norm": 2.8476275963836217, + "learning_rate": 1.5148100589189205e-07, + "logits/chosen": -2.322516918182373, + "logits/rejected": -2.0998401641845703, + "logps/chosen": -15.207572937011719, + "logps/rejected": -619.1622924804688, + "logps_avg/chosen": -0.07979702204465866, + "logps_avg/rejected": -3.381819248199463, + "loss": 0.0791, + "losses_ref": -0.00029921572422608733, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6870, + "u": -5.4724297523498535, + "weight": 0.03779328987002373 + }, + { + "diff_generated": -34.35366439819336, + "epoch": 2.2294232015554116, + "grad_norm": 2.843435625809698, + "learning_rate": 1.5030098324340808e-07, + "logits/chosen": -2.3158698081970215, + "logits/rejected": -2.1082563400268555, + "logps/chosen": -13.8373384475708, + "logps/rejected": -618.6031494140625, + "logps_avg/chosen": -0.07845751941204071, + "logps_avg/rejected": -3.43536639213562, + "loss": 0.0762, + "losses_ref": -0.00038599842810072005, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6880, + "u": -5.506179332733154, + "weight": 0.03169042617082596 + }, + { + "diff_generated": -34.03045654296875, + "epoch": 2.2326636422553467, + "grad_norm": 2.8953107407337026, + "learning_rate": 1.491245105065419e-07, + "logits/chosen": -2.3265156745910645, + "logits/rejected": -2.1272525787353516, + "logps/chosen": -13.283559799194336, + "logps/rejected": -629.44775390625, + "logps_avg/chosen": -0.07462642341852188, + "logps_avg/rejected": -3.403046131134033, + "loss": 0.0784, + "losses_ref": -0.0001504389219917357, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6890, + "u": -5.471314430236816, + "weight": 0.03763309493660927 + }, + { + "diff_generated": -30.7342586517334, + "epoch": 2.235904082955282, + "grad_norm": 2.9007177421575587, + "learning_rate": 1.4795160440692672e-07, + "logits/chosen": -2.345430374145508, + "logits/rejected": -2.146914005279541, + "logps/chosen": -14.193719863891602, + "logps/rejected": -549.6990356445312, + "logps_avg/chosen": -0.08124233782291412, + "logps_avg/rejected": -3.0734260082244873, + "loss": 0.0782, + "losses_ref": -0.0012097887229174376, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6900, + "u": -5.5394673347473145, + "weight": 0.02644205465912819 + }, + { + "diff_generated": -32.822933197021484, + "epoch": 2.239144523655217, + "grad_norm": 2.8833451708893, + "learning_rate": 1.467822816194904e-07, + "logits/chosen": -2.3125786781311035, + "logits/rejected": -2.111271381378174, + "logps/chosen": -13.782983779907227, + "logps/rejected": -604.3697509765625, + "logps_avg/chosen": -0.07867380231618881, + "logps_avg/rejected": -3.2822933197021484, + "loss": 0.0761, + "losses_ref": -0.0003412619116716087, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6910, + "u": -5.401597499847412, + "weight": 0.05035555362701416 + }, + { + "diff_generated": -31.626598358154297, + "epoch": 2.242384964355152, + "grad_norm": 2.814659419342255, + "learning_rate": 1.4561655876821694e-07, + "logits/chosen": -2.2330501079559326, + "logits/rejected": -2.104743480682373, + "logps/chosen": -13.798090934753418, + "logps/rejected": -591.9242553710938, + "logps_avg/chosen": -0.07953254878520966, + "logps_avg/rejected": -3.1626598834991455, + "loss": 0.0789, + "losses_ref": -0.00016892193525563926, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6920, + "u": -5.404656410217285, + "weight": 0.05016558617353439 + }, + { + "diff_generated": -32.88931655883789, + "epoch": 2.2456254050550877, + "grad_norm": 2.896216572470763, + "learning_rate": 1.4445445242591138e-07, + "logits/chosen": -2.2789807319641113, + "logits/rejected": -2.1155471801757812, + "logps/chosen": -13.4810209274292, + "logps/rejected": -598.04150390625, + "logps_avg/chosen": -0.0773623138666153, + "logps_avg/rejected": -3.2889316082000732, + "loss": 0.0785, + "losses_ref": -0.0006119104218669236, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6930, + "u": -5.398166656494141, + "weight": 0.05070475861430168 + }, + { + "diff_generated": -32.10064697265625, + "epoch": 2.248865845755023, + "grad_norm": 2.743282441118848, + "learning_rate": 1.4329597911396362e-07, + "logits/chosen": -2.32409930229187, + "logits/rejected": -2.1036975383758545, + "logps/chosen": -15.0281400680542, + "logps/rejected": -578.3131103515625, + "logps_avg/chosen": -0.07574650645256042, + "logps_avg/rejected": -3.210064649581909, + "loss": 0.0765, + "losses_ref": -0.00012708675058092922, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6940, + "u": -5.294000148773193, + "weight": 0.06886385381221771 + }, + { + "diff_generated": -33.120174407958984, + "epoch": 2.252106286454958, + "grad_norm": 2.8897561216334315, + "learning_rate": 1.421411553021137e-07, + "logits/chosen": -2.3341989517211914, + "logits/rejected": -2.1409738063812256, + "logps/chosen": -15.546142578125, + "logps/rejected": -595.2667236328125, + "logps_avg/chosen": -0.08349694311618805, + "logps_avg/rejected": -3.3120174407958984, + "loss": 0.0773, + "losses_ref": -0.0012247232953086495, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6950, + "u": -5.466020584106445, + "weight": 0.039082638919353485 + }, + { + "diff_generated": -30.71538734436035, + "epoch": 2.255346727154893, + "grad_norm": 2.859122418072979, + "learning_rate": 1.4098999740821716e-07, + "logits/chosen": -2.2431702613830566, + "logits/rejected": -2.063222646713257, + "logps/chosen": -13.041366577148438, + "logps/rejected": -560.9396362304688, + "logps_avg/chosen": -0.07067658007144928, + "logps_avg/rejected": -3.0715386867523193, + "loss": 0.0769, + "losses_ref": -0.0007479506894014776, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6960, + "u": -5.294276237487793, + "weight": 0.06961636990308762 + }, + { + "diff_generated": -32.428977966308594, + "epoch": 2.258587167854828, + "grad_norm": 2.877870001625762, + "learning_rate": 1.3984252179801277e-07, + "logits/chosen": -2.2853071689605713, + "logits/rejected": -2.106396198272705, + "logps/chosen": -14.881681442260742, + "logps/rejected": -623.0894775390625, + "logps_avg/chosen": -0.08217870444059372, + "logps_avg/rejected": -3.2428977489471436, + "loss": 0.0791, + "losses_ref": -0.00023555834195576608, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6970, + "u": -5.364887237548828, + "weight": 0.05648752301931381 + }, + { + "diff_generated": -32.93585968017578, + "epoch": 2.2618276085547633, + "grad_norm": 2.717148340358031, + "learning_rate": 1.3869874478488846e-07, + "logits/chosen": -2.26002836227417, + "logits/rejected": -2.120847225189209, + "logps/chosen": -12.688130378723145, + "logps/rejected": -582.8048706054688, + "logps_avg/chosen": -0.07913817465305328, + "logps_avg/rejected": -3.293586015701294, + "loss": 0.0759, + "losses_ref": -0.0017868172144517303, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6980, + "u": -5.436811923980713, + "weight": 0.04643816873431206 + }, + { + "diff_generated": -33.98379135131836, + "epoch": 2.2650680492546984, + "grad_norm": 2.9089997065506754, + "learning_rate": 1.3755868262965047e-07, + "logits/chosen": -2.333571434020996, + "logits/rejected": -2.081078052520752, + "logps/chosen": -13.215835571289062, + "logps/rejected": -569.4586181640625, + "logps_avg/chosen": -0.07337544113397598, + "logps_avg/rejected": -3.39837908744812, + "loss": 0.0772, + "losses_ref": -0.0013759381836280227, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 6990, + "u": -5.292944431304932, + "weight": 0.07080823183059692 + }, + { + "diff_generated": -35.02412033081055, + "epoch": 2.268308489954634, + "grad_norm": 2.8189637995994863, + "learning_rate": 1.3642235154029172e-07, + "logits/chosen": -2.306802272796631, + "logits/rejected": -2.036937713623047, + "logps/chosen": -15.109029769897461, + "logps/rejected": -626.7437744140625, + "logps_avg/chosen": -0.07939799129962921, + "logps_avg/rejected": -3.5024120807647705, + "loss": 0.0762, + "losses_ref": -0.0005451668985188007, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7000, + "u": -5.360957145690918, + "weight": 0.05685426667332649 + }, + { + "diff_generated": -34.771018981933594, + "epoch": 2.271548930654569, + "grad_norm": 2.8519990889263025, + "learning_rate": 1.352897676717614e-07, + "logits/chosen": -2.3075461387634277, + "logits/rejected": -2.081054210662842, + "logps/chosen": -14.556310653686523, + "logps/rejected": -605.6522216796875, + "logps_avg/chosen": -0.08247552812099457, + "logps_avg/rejected": -3.4771018028259277, + "loss": 0.0812, + "losses_ref": -0.0002231432154076174, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7010, + "u": -5.5091047286987305, + "weight": 0.03146742656826973 + }, + { + "diff_generated": -35.00230026245117, + "epoch": 2.274789371354504, + "grad_norm": 2.8569966796193853, + "learning_rate": 1.341609471257354e-07, + "logits/chosen": -2.2501139640808105, + "logits/rejected": -2.0715503692626953, + "logps/chosen": -12.842994689941406, + "logps/rejected": -641.0640869140625, + "logps_avg/chosen": -0.07570213079452515, + "logps_avg/rejected": -3.500230312347412, + "loss": 0.0768, + "losses_ref": -6.805827433709055e-05, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7020, + "u": -5.432784557342529, + "weight": 0.04380672797560692 + }, + { + "diff_generated": -32.28007125854492, + "epoch": 2.2780298120544393, + "grad_norm": 2.807385880123519, + "learning_rate": 1.3303590595038735e-07, + "logits/chosen": -2.2671284675598145, + "logits/rejected": -2.1051955223083496, + "logps/chosen": -14.239480972290039, + "logps/rejected": -614.7636108398438, + "logps_avg/chosen": -0.08152355253696442, + "logps_avg/rejected": -3.2280075550079346, + "loss": 0.0781, + "losses_ref": -0.0008757902542129159, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7030, + "u": -5.508584976196289, + "weight": 0.03236890211701393 + }, + { + "diff_generated": -31.199777603149414, + "epoch": 2.2812702527543745, + "grad_norm": 2.8868259547417314, + "learning_rate": 1.3191466014016049e-07, + "logits/chosen": -2.2917065620422363, + "logits/rejected": -2.060508966445923, + "logps/chosen": -11.897598266601562, + "logps/rejected": -556.2362060546875, + "logps_avg/chosen": -0.06840696185827255, + "logps_avg/rejected": -3.1199779510498047, + "loss": 0.0747, + "losses_ref": -0.00041570625035092235, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7040, + "u": -5.368067741394043, + "weight": 0.05673006922006607 + }, + { + "diff_generated": -32.5605354309082, + "epoch": 2.28451069345431, + "grad_norm": 2.817689113037822, + "learning_rate": 1.3079722563553994e-07, + "logits/chosen": -2.31117582321167, + "logits/rejected": -2.0909972190856934, + "logps/chosen": -13.309144973754883, + "logps/rejected": -568.3492431640625, + "logps_avg/chosen": -0.07407195121049881, + "logps_avg/rejected": -3.2560532093048096, + "loss": 0.0761, + "losses_ref": -0.0011109040351584554, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7050, + "u": -5.291151523590088, + "weight": 0.07028138637542725 + }, + { + "diff_generated": -32.47726058959961, + "epoch": 2.287751134154245, + "grad_norm": 2.7693781817642926, + "learning_rate": 1.2968361832282705e-07, + "logits/chosen": -2.275883674621582, + "logits/rejected": -2.0943799018859863, + "logps/chosen": -13.563992500305176, + "logps/rejected": -600.7876586914062, + "logps_avg/chosen": -0.07618103921413422, + "logps_avg/rejected": -3.2477259635925293, + "loss": 0.0779, + "losses_ref": -0.0003883329627569765, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7060, + "u": -5.403430461883545, + "weight": 0.05040976405143738 + }, + { + "diff_generated": -34.68756866455078, + "epoch": 2.2909915748541803, + "grad_norm": 2.861574037971138, + "learning_rate": 1.2857385403391226e-07, + "logits/chosen": -2.2599575519561768, + "logits/rejected": -2.075666904449463, + "logps/chosen": -14.708311080932617, + "logps/rejected": -625.8695678710938, + "logps_avg/chosen": -0.08552752435207367, + "logps_avg/rejected": -3.4687564373016357, + "loss": 0.0793, + "losses_ref": -0.000556853658054024, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7070, + "u": -5.541098594665527, + "weight": 0.025598809123039246 + }, + { + "diff_generated": -33.260467529296875, + "epoch": 2.2942320155541154, + "grad_norm": 3.047435358439903, + "learning_rate": 1.274679485460509e-07, + "logits/chosen": -2.295015811920166, + "logits/rejected": -2.0713677406311035, + "logps/chosen": -14.201037406921387, + "logps/rejected": -591.999755859375, + "logps_avg/chosen": -0.07558928430080414, + "logps_avg/rejected": -3.32604718208313, + "loss": 0.0772, + "losses_ref": -0.0007015225710347295, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7080, + "u": -5.222864627838135, + "weight": 0.0821448564529419 + }, + { + "diff_generated": -34.55098342895508, + "epoch": 2.2974724562540505, + "grad_norm": 2.849633449181555, + "learning_rate": 1.2636591758163868e-07, + "logits/chosen": -2.266608715057373, + "logits/rejected": -2.1019034385681152, + "logps/chosen": -13.151138305664062, + "logps/rejected": -632.1087036132812, + "logps_avg/chosen": -0.08013930916786194, + "logps_avg/rejected": -3.4550983905792236, + "loss": 0.0773, + "losses_ref": -0.00018653420556802303, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7090, + "u": -5.4691925048828125, + "weight": 0.037679560482501984 + }, + { + "diff_generated": -33.21096420288086, + "epoch": 2.3007128969539856, + "grad_norm": 2.948310681297529, + "learning_rate": 1.2526777680798813e-07, + "logits/chosen": -2.2835330963134766, + "logits/rejected": -2.1416866779327393, + "logps/chosen": -12.66423225402832, + "logps/rejected": -602.2303466796875, + "logps_avg/chosen": -0.07687665522098541, + "logps_avg/rejected": -3.321096420288086, + "loss": 0.077, + "losses_ref": -0.0004062841762788594, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7100, + "u": -5.293711185455322, + "weight": 0.06918938457965851 + }, + { + "diff_generated": -34.442100524902344, + "epoch": 2.3039533376539207, + "grad_norm": 2.6467761062799173, + "learning_rate": 1.241735418371057e-07, + "logits/chosen": -2.2501444816589355, + "logits/rejected": -2.081749200820923, + "logps/chosen": -13.935762405395508, + "logps/rejected": -641.1041870117188, + "logps_avg/chosen": -0.08066975325345993, + "logps_avg/rejected": -3.4442100524902344, + "loss": 0.0777, + "losses_ref": -0.0010616803774610162, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7110, + "u": -5.468209266662598, + "weight": 0.03915365785360336 + }, + { + "diff_generated": -34.00629425048828, + "epoch": 2.3071937783538563, + "grad_norm": 3.0293111653961007, + "learning_rate": 1.2308322822547027e-07, + "logits/chosen": -2.2660059928894043, + "logits/rejected": -2.0726606845855713, + "logps/chosen": -12.898577690124512, + "logps/rejected": -650.5553588867188, + "logps_avg/chosen": -0.07671193778514862, + "logps_avg/rejected": -3.4006295204162598, + "loss": 0.0753, + "losses_ref": -0.0006363748689182103, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7120, + "u": -5.365374565124512, + "weight": 0.05698163062334061 + }, + { + "diff_generated": -34.85182189941406, + "epoch": 2.3104342190537914, + "grad_norm": 2.7605565687874845, + "learning_rate": 1.2199685147381148e-07, + "logits/chosen": -2.3294003009796143, + "logits/rejected": -2.102637767791748, + "logps/chosen": -14.504648208618164, + "logps/rejected": -635.5806884765625, + "logps_avg/chosen": -0.07878074049949646, + "logps_avg/rejected": -3.485182285308838, + "loss": 0.0776, + "losses_ref": -0.0002665507490746677, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7130, + "u": -5.397416114807129, + "weight": 0.05026886612176895 + }, + { + "diff_generated": -34.99089431762695, + "epoch": 2.3136746597537265, + "grad_norm": 2.8144242097826297, + "learning_rate": 1.2091442702688933e-07, + "logits/chosen": -2.2866711616516113, + "logits/rejected": -2.0564143657684326, + "logps/chosen": -13.967157363891602, + "logps/rejected": -695.5399169921875, + "logps_avg/chosen": -0.07526645809412003, + "logps_avg/rejected": -3.499089002609253, + "loss": 0.0783, + "losses_ref": -0.000559016945771873, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7140, + "u": -5.363223075866699, + "weight": 0.05697736144065857 + }, + { + "diff_generated": -32.008792877197266, + "epoch": 2.3169151004536617, + "grad_norm": 2.835996775285066, + "learning_rate": 1.198359702732755e-07, + "logits/chosen": -2.296750545501709, + "logits/rejected": -2.0789742469787598, + "logps/chosen": -14.332858085632324, + "logps/rejected": -583.1030883789062, + "logps_avg/chosen": -0.07787901908159256, + "logps_avg/rejected": -3.2008793354034424, + "loss": 0.0776, + "losses_ref": -0.0011311148991808295, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7150, + "u": -5.330229759216309, + "weight": 0.06403504312038422 + }, + { + "diff_generated": -31.642242431640625, + "epoch": 2.320155541153597, + "grad_norm": 2.763604684901349, + "learning_rate": 1.1876149654513321e-07, + "logits/chosen": -2.2709290981292725, + "logits/rejected": -2.121748208999634, + "logps/chosen": -14.095861434936523, + "logps/rejected": -578.5704345703125, + "logps_avg/chosen": -0.08579359948635101, + "logps_avg/rejected": -3.164224147796631, + "loss": 0.0786, + "losses_ref": -0.0008059808169491589, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7160, + "u": -5.40159797668457, + "weight": 0.050987452268600464 + }, + { + "diff_generated": -34.49913787841797, + "epoch": 2.323395981853532, + "grad_norm": 2.7881090281670287, + "learning_rate": 1.1769102111800036e-07, + "logits/chosen": -2.2897281646728516, + "logits/rejected": -2.1038079261779785, + "logps/chosen": -14.366230964660645, + "logps/rejected": -645.638671875, + "logps_avg/chosen": -0.08264970034360886, + "logps_avg/rejected": -3.449913740158081, + "loss": 0.0783, + "losses_ref": -0.00026110856560990214, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7170, + "u": -5.575833797454834, + "weight": 0.018995631486177444 + }, + { + "diff_generated": -35.24726104736328, + "epoch": 2.3266364225534675, + "grad_norm": 2.7809487105597297, + "learning_rate": 1.166245592105719e-07, + "logits/chosen": -2.2621166706085205, + "logits/rejected": -2.042609453201294, + "logps/chosen": -14.316975593566895, + "logps/rejected": -612.24560546875, + "logps_avg/chosen": -0.07764624059200287, + "logps_avg/rejected": -3.524726152420044, + "loss": 0.0782, + "losses_ref": -0.0006773438071832061, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7180, + "u": -5.435733795166016, + "weight": 0.04453923553228378 + }, + { + "diff_generated": -33.23479461669922, + "epoch": 2.3298768632534026, + "grad_norm": 2.9081941910794766, + "learning_rate": 1.1556212598448349e-07, + "logits/chosen": -2.294276714324951, + "logits/rejected": -2.085162401199341, + "logps/chosen": -16.098182678222656, + "logps/rejected": -591.9588012695312, + "logps_avg/chosen": -0.08865799009799957, + "logps_avg/rejected": -3.3234798908233643, + "loss": 0.0773, + "losses_ref": -0.0010387629736214876, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7190, + "u": -5.470877647399902, + "weight": 0.03885335102677345 + }, + { + "diff_generated": -32.696388244628906, + "epoch": 2.3331173039533377, + "grad_norm": 2.8852609735392485, + "learning_rate": 1.1450373654409591e-07, + "logits/chosen": -2.289748191833496, + "logits/rejected": -2.143613338470459, + "logps/chosen": -13.386698722839355, + "logps/rejected": -610.17041015625, + "logps_avg/chosen": -0.07896269857883453, + "logps_avg/rejected": -3.269639253616333, + "loss": 0.0801, + "losses_ref": -0.00018862645083572716, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7200, + "u": -5.330229759216309, + "weight": 0.0626850426197052 + }, + { + "diff_generated": -32.930179595947266, + "epoch": 2.336357744653273, + "grad_norm": 2.938215867459973, + "learning_rate": 1.1344940593628063e-07, + "logits/chosen": -2.24363112449646, + "logits/rejected": -2.0247769355773926, + "logps/chosen": -13.017687797546387, + "logps/rejected": -571.5632934570312, + "logps_avg/chosen": -0.07627496123313904, + "logps_avg/rejected": -3.293017864227295, + "loss": 0.0784, + "losses_ref": -0.00047731236554682255, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7210, + "u": -5.259675979614258, + "weight": 0.07552912831306458 + }, + { + "diff_generated": -30.685317993164062, + "epoch": 2.339598185353208, + "grad_norm": 2.7653327775749963, + "learning_rate": 1.1239914915020512e-07, + "logits/chosen": -2.26278018951416, + "logits/rejected": -2.109288215637207, + "logps/chosen": -13.016032218933105, + "logps/rejected": -577.0557861328125, + "logps_avg/chosen": -0.07486730068922043, + "logps_avg/rejected": -3.0685317516326904, + "loss": 0.0783, + "losses_ref": -0.0003721018729265779, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7220, + "u": -5.224618911743164, + "weight": 0.08162766695022583 + }, + { + "diff_generated": -34.90629196166992, + "epoch": 2.342838626053143, + "grad_norm": 2.825449442856524, + "learning_rate": 1.1135298111712122e-07, + "logits/chosen": -2.26003098487854, + "logits/rejected": -2.0450892448425293, + "logps/chosen": -13.561320304870605, + "logps/rejected": -630.9426879882812, + "logps_avg/chosen": -0.07841043919324875, + "logps_avg/rejected": -3.490629196166992, + "loss": 0.0767, + "losses_ref": -0.0005720141343772411, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7230, + "u": -5.402764797210693, + "weight": 0.05068415403366089 + }, + { + "diff_generated": -31.86376953125, + "epoch": 2.346079066753078, + "grad_norm": 2.8668736907248458, + "learning_rate": 1.1031091671015094e-07, + "logits/chosen": -2.245361804962158, + "logits/rejected": -2.041193962097168, + "logps/chosen": -12.147971153259277, + "logps/rejected": -587.3404541015625, + "logps_avg/chosen": -0.07041692733764648, + "logps_avg/rejected": -3.1863768100738525, + "loss": 0.076, + "losses_ref": -0.002164191100746393, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7240, + "u": -5.182686805725098, + "weight": 0.09167562425136566 + }, + { + "diff_generated": -33.82771301269531, + "epoch": 2.3493195074530138, + "grad_norm": 2.9888161658925148, + "learning_rate": 1.0927297074407662e-07, + "logits/chosen": -2.2794957160949707, + "logits/rejected": -2.085437297821045, + "logps/chosen": -13.637370109558105, + "logps/rejected": -622.2664794921875, + "logps_avg/chosen": -0.0804823487997055, + "logps_avg/rejected": -3.3827712535858154, + "loss": 0.0764, + "losses_ref": -0.00035343001945875585, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7250, + "u": -5.403036594390869, + "weight": 0.050387442111968994 + }, + { + "diff_generated": -32.11924362182617, + "epoch": 2.352559948152949, + "grad_norm": 2.789941220163072, + "learning_rate": 1.0823915797512952e-07, + "logits/chosen": -2.293781280517578, + "logits/rejected": -2.1002774238586426, + "logps/chosen": -12.634278297424316, + "logps/rejected": -586.3161010742188, + "logps_avg/chosen": -0.07193853706121445, + "logps_avg/rejected": -3.2119243144989014, + "loss": 0.0769, + "losses_ref": -0.0006824486772529781, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7260, + "u": -5.363787651062012, + "weight": 0.05698443576693535 + }, + { + "diff_generated": -34.903221130371094, + "epoch": 2.355800388852884, + "grad_norm": 3.1192025526659153, + "learning_rate": 1.0720949310078032e-07, + "logits/chosen": -2.262953281402588, + "logits/rejected": -2.07498836517334, + "logps/chosen": -13.899110794067383, + "logps/rejected": -624.4613037109375, + "logps_avg/chosen": -0.080237478017807, + "logps_avg/rejected": -3.4903221130371094, + "loss": 0.078, + "losses_ref": -0.00040980131598189473, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7270, + "u": -5.509830951690674, + "weight": 0.03166963905096054 + }, + { + "diff_generated": -30.5557804107666, + "epoch": 2.359040829552819, + "grad_norm": 2.760514648523447, + "learning_rate": 1.0618399075952993e-07, + "logits/chosen": -2.3034751415252686, + "logits/rejected": -2.089022159576416, + "logps/chosen": -13.662466049194336, + "logps/rejected": -608.3770141601562, + "logps_avg/chosen": -0.07312439382076263, + "logps_avg/rejected": -3.0555779933929443, + "loss": 0.0769, + "losses_ref": -0.0008744834922254086, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7280, + "u": -5.36469030380249, + "weight": 0.05743665248155594 + }, + { + "diff_generated": -33.99993133544922, + "epoch": 2.3622812702527543, + "grad_norm": 3.2206744990322393, + "learning_rate": 1.0516266553070159e-07, + "logits/chosen": -2.250345230102539, + "logits/rejected": -2.046128749847412, + "logps/chosen": -14.4988431930542, + "logps/rejected": -647.0737915039062, + "logps_avg/chosen": -0.08133145421743393, + "logps_avg/rejected": -3.3999931812286377, + "loss": 0.0765, + "losses_ref": -0.0007096336339600384, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7290, + "u": -5.402334690093994, + "weight": 0.05076984316110611 + }, + { + "diff_generated": -34.168174743652344, + "epoch": 2.3655217109526894, + "grad_norm": 2.819305164659092, + "learning_rate": 1.041455319342336e-07, + "logits/chosen": -2.2885193824768066, + "logits/rejected": -2.079343318939209, + "logps/chosen": -13.748265266418457, + "logps/rejected": -630.8150024414062, + "logps_avg/chosen": -0.07698606699705124, + "logps_avg/rejected": -3.4168171882629395, + "loss": 0.0788, + "losses_ref": -0.00023140080156736076, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7300, + "u": -5.473116874694824, + "weight": 0.03773132711648941 + }, + { + "diff_generated": -30.992996215820312, + "epoch": 2.368762151652625, + "grad_norm": 2.8064681132146423, + "learning_rate": 1.0313260443047247e-07, + "logits/chosen": -2.202005386352539, + "logits/rejected": -2.107422351837158, + "logps/chosen": -11.852638244628906, + "logps/rejected": -606.5282592773438, + "logps_avg/chosen": -0.07290005683898926, + "logps_avg/rejected": -3.099299907684326, + "loss": 0.0791, + "losses_ref": -0.0019258193206042051, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7310, + "u": -5.29049015045166, + "weight": 0.0717729777097702 + }, + { + "diff_generated": -31.909597396850586, + "epoch": 2.37200259235256, + "grad_norm": 2.8230981127930814, + "learning_rate": 1.0212389741996834e-07, + "logits/chosen": -2.3181042671203613, + "logits/rejected": -2.1704022884368896, + "logps/chosen": -13.069847106933594, + "logps/rejected": -612.2677612304688, + "logps_avg/chosen": -0.07597730308771133, + "logps_avg/rejected": -3.190959930419922, + "loss": 0.0756, + "losses_ref": -0.0003046352358069271, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7320, + "u": -5.331242561340332, + "weight": 0.06280872970819473 + }, + { + "diff_generated": -34.66915512084961, + "epoch": 2.375243033052495, + "grad_norm": 2.7740004009290598, + "learning_rate": 1.0111942524326891e-07, + "logits/chosen": -2.2749009132385254, + "logits/rejected": -2.11348557472229, + "logps/chosen": -12.951632499694824, + "logps/rejected": -642.3538818359375, + "logps_avg/chosen": -0.07797206938266754, + "logps_avg/rejected": -3.4669156074523926, + "loss": 0.0777, + "losses_ref": -0.00030273018637672067, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7330, + "u": -5.437063694000244, + "weight": 0.04405757784843445 + }, + { + "diff_generated": -35.043922424316406, + "epoch": 2.3784834737524303, + "grad_norm": 2.817685977122686, + "learning_rate": 1.0011920218071664e-07, + "logits/chosen": -2.3587987422943115, + "logits/rejected": -2.1169161796569824, + "logps/chosen": -13.914976119995117, + "logps/rejected": -663.571044921875, + "logps_avg/chosen": -0.07692556828260422, + "logps_avg/rejected": -3.5043931007385254, + "loss": 0.0759, + "losses_ref": -0.0008672567782923579, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7340, + "u": -5.3598151206970215, + "weight": 0.05720771476626396 + }, + { + "diff_generated": -33.76276397705078, + "epoch": 2.3817239144523654, + "grad_norm": 2.9478435470621456, + "learning_rate": 9.912324245224524e-08, + "logits/chosen": -2.351743221282959, + "logits/rejected": -2.1735739707946777, + "logps/chosen": -14.532655715942383, + "logps/rejected": -635.3209838867188, + "logps_avg/chosen": -0.07836208492517471, + "logps_avg/rejected": -3.3762760162353516, + "loss": 0.075, + "losses_ref": -0.0017885919660329819, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7350, + "u": -5.608242988586426, + "weight": 0.01502218097448349 + }, + { + "diff_generated": -33.28066635131836, + "epoch": 2.3849643551523005, + "grad_norm": 3.1743935272367123, + "learning_rate": 9.813156021717763e-08, + "logits/chosen": -2.2940306663513184, + "logits/rejected": -2.065108060836792, + "logps/chosen": -13.723846435546875, + "logps/rejected": -577.5977783203125, + "logps_avg/chosen": -0.07425703853368759, + "logps_avg/rejected": -3.3280670642852783, + "loss": 0.0764, + "losses_ref": -0.00109779997728765, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7360, + "u": -5.363638401031494, + "weight": 0.05759688466787338 + }, + { + "diff_generated": -34.619773864746094, + "epoch": 2.3882047958522357, + "grad_norm": 2.7146783287290597, + "learning_rate": 9.714416957402468e-08, + "logits/chosen": -2.283629894256592, + "logits/rejected": -2.141249895095825, + "logps/chosen": -13.449475288391113, + "logps/rejected": -633.1432495117188, + "logps_avg/chosen": -0.07728450745344162, + "logps_avg/rejected": -3.461977481842041, + "loss": 0.078, + "losses_ref": -0.00025040132459253073, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7370, + "u": -5.369337558746338, + "weight": 0.056508757174015045 + }, + { + "diff_generated": -34.74108123779297, + "epoch": 2.3914452365521712, + "grad_norm": 2.760996835929895, + "learning_rate": 9.616108456028462e-08, + "logits/chosen": -2.283205032348633, + "logits/rejected": -2.0816237926483154, + "logps/chosen": -12.97753620147705, + "logps/rejected": -624.5291137695312, + "logps_avg/chosen": -0.07539506256580353, + "logps_avg/rejected": -3.4741077423095703, + "loss": 0.0772, + "losses_ref": -0.00020238272554706782, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7380, + "u": -5.54359245300293, + "weight": 0.025206467136740685 + }, + { + "diff_generated": -34.526756286621094, + "epoch": 2.3946856772521063, + "grad_norm": 2.8408676129849475, + "learning_rate": 9.518231915224371e-08, + "logits/chosen": -2.2826147079467773, + "logits/rejected": -2.0404651165008545, + "logps/chosen": -13.243906021118164, + "logps/rejected": -653.4534912109375, + "logps_avg/chosen": -0.07447250932455063, + "logps_avg/rejected": -3.4526753425598145, + "loss": 0.0762, + "losses_ref": -0.0010482899378985167, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7390, + "u": -5.43655252456665, + "weight": 0.04528099671006203 + }, + { + "diff_generated": -32.078582763671875, + "epoch": 2.3979261179520415, + "grad_norm": 2.88027935851885, + "learning_rate": 9.4207887264777e-08, + "logits/chosen": -2.2226669788360596, + "logits/rejected": -2.1108269691467285, + "logps/chosen": -12.079364776611328, + "logps/rejected": -617.87255859375, + "logps_avg/chosen": -0.07426749914884567, + "logps_avg/rejected": -3.2078583240509033, + "loss": 0.0777, + "losses_ref": -0.0010231093037873507, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7400, + "u": -5.437130928039551, + "weight": 0.04528171569108963 + }, + { + "diff_generated": -36.370052337646484, + "epoch": 2.4011665586519766, + "grad_norm": 2.884539046607622, + "learning_rate": 9.323780275115156e-08, + "logits/chosen": -2.2833988666534424, + "logits/rejected": -2.0623676776885986, + "logps/chosen": -13.492776870727539, + "logps/rejected": -685.26025390625, + "logps_avg/chosen": -0.07685331255197525, + "logps_avg/rejected": -3.63700532913208, + "loss": 0.0771, + "losses_ref": -0.0007987999124452472, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7410, + "u": -5.509256839752197, + "weight": 0.03232438117265701 + }, + { + "diff_generated": -32.873191833496094, + "epoch": 2.4044069993519117, + "grad_norm": 2.6715246297813766, + "learning_rate": 9.22720794028283e-08, + "logits/chosen": -2.301788091659546, + "logits/rejected": -2.076624631881714, + "logps/chosen": -13.993014335632324, + "logps/rejected": -576.7996215820312, + "logps_avg/chosen": -0.0792023241519928, + "logps_avg/rejected": -3.287318706512451, + "loss": 0.0782, + "losses_ref": -0.0004467566031962633, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7420, + "u": -5.295356273651123, + "weight": 0.06927172839641571 + }, + { + "diff_generated": -34.844940185546875, + "epoch": 2.4076474400518473, + "grad_norm": 2.6567574359573825, + "learning_rate": 9.13107309492668e-08, + "logits/chosen": -2.234447717666626, + "logits/rejected": -2.0357723236083984, + "logps/chosen": -13.651655197143555, + "logps/rejected": -662.6868286132812, + "logps_avg/chosen": -0.07822132855653763, + "logps_avg/rejected": -3.4844939708709717, + "loss": 0.0774, + "losses_ref": -0.0003171044809278101, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7430, + "u": -5.438276290893555, + "weight": 0.044068168848752975 + }, + { + "diff_generated": -33.605289459228516, + "epoch": 2.4108878807517824, + "grad_norm": 2.8362513207746827, + "learning_rate": 9.035377105772966e-08, + "logits/chosen": -2.272564172744751, + "logits/rejected": -2.041059970855713, + "logps/chosen": -13.95252799987793, + "logps/rejected": -624.0958251953125, + "logps_avg/chosen": -0.0766502246260643, + "logps_avg/rejected": -3.3605289459228516, + "loss": 0.0787, + "losses_ref": -0.000506135169416666, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7440, + "u": -5.293671607971191, + "weight": 0.06933457404375076 + }, + { + "diff_generated": -32.606605529785156, + "epoch": 2.4141283214517175, + "grad_norm": 2.9002538813054826, + "learning_rate": 8.940121333308849e-08, + "logits/chosen": -2.2347395420074463, + "logits/rejected": -2.111093044281006, + "logps/chosen": -11.986194610595703, + "logps/rejected": -595.4306030273438, + "logps_avg/chosen": -0.07371608167886734, + "logps_avg/rejected": -3.2606606483459473, + "loss": 0.0777, + "losses_ref": -0.0014970863703638315, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7450, + "u": -5.362734794616699, + "weight": 0.058197181671857834 + }, + { + "diff_generated": -35.42874526977539, + "epoch": 2.4173687621516526, + "grad_norm": 2.7272928463051858, + "learning_rate": 8.845307131762991e-08, + "logits/chosen": -2.294037342071533, + "logits/rejected": -2.0677173137664795, + "logps/chosen": -14.819440841674805, + "logps/rejected": -634.6419677734375, + "logps_avg/chosen": -0.07748381793498993, + "logps_avg/rejected": -3.5428740978240967, + "loss": 0.0771, + "losses_ref": -0.0001443286018911749, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7460, + "u": -5.366196155548096, + "weight": 0.05638740211725235 + }, + { + "diff_generated": -33.44629669189453, + "epoch": 2.4206092028515878, + "grad_norm": 2.740936329712308, + "learning_rate": 8.750935849086424e-08, + "logits/chosen": -2.296396255493164, + "logits/rejected": -2.058582305908203, + "logps/chosen": -14.692959785461426, + "logps/rejected": -647.8707275390625, + "logps_avg/chosen": -0.07983608543872833, + "logps_avg/rejected": -3.3446297645568848, + "loss": 0.0768, + "losses_ref": -0.00016290844359900802, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7470, + "u": -5.538523197174072, + "weight": 0.025174766778945923 + }, + { + "diff_generated": -34.08763122558594, + "epoch": 2.423849643551523, + "grad_norm": 2.874307206616938, + "learning_rate": 8.657008826933223e-08, + "logits/chosen": -2.247481107711792, + "logits/rejected": -2.082742691040039, + "logps/chosen": -13.472023010253906, + "logps/rejected": -634.2883911132812, + "logps_avg/chosen": -0.07857387512922287, + "logps_avg/rejected": -3.4087631702423096, + "loss": 0.0793, + "losses_ref": -0.000744626100640744, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7480, + "u": -5.435318470001221, + "weight": 0.044631458818912506 + }, + { + "diff_generated": -32.04920196533203, + "epoch": 2.427090084251458, + "grad_norm": 2.6646272712366614, + "learning_rate": 8.563527400641559e-08, + "logits/chosen": -2.277095317840576, + "logits/rejected": -2.1428980827331543, + "logps/chosen": -13.724957466125488, + "logps/rejected": -603.6646728515625, + "logps_avg/chosen": -0.07841077446937561, + "logps_avg/rejected": -3.2049202919006348, + "loss": 0.0755, + "losses_ref": -0.00028366531478241086, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7490, + "u": -5.261726379394531, + "weight": 0.0752992182970047 + }, + { + "diff_generated": -35.079063415527344, + "epoch": 2.4303305249513936, + "grad_norm": 2.8130066664130466, + "learning_rate": 8.470492899214696e-08, + "logits/chosen": -2.2499706745147705, + "logits/rejected": -2.0175976753234863, + "logps/chosen": -13.57239818572998, + "logps/rejected": -608.1451416015625, + "logps_avg/chosen": -0.07596118748188019, + "logps_avg/rejected": -3.507906675338745, + "loss": 0.0776, + "losses_ref": -0.0003359224647283554, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7500, + "u": -5.288158893585205, + "weight": 0.06913691014051437 + }, + { + "diff_generated": -33.463157653808594, + "epoch": 2.4335709656513287, + "grad_norm": 2.8587796142319215, + "learning_rate": 8.377906645302015e-08, + "logits/chosen": -2.242366313934326, + "logits/rejected": -2.0637431144714355, + "logps/chosen": -14.448846817016602, + "logps/rejected": -635.8395385742188, + "logps_avg/chosen": -0.0818089097738266, + "logps_avg/rejected": -3.346315860748291, + "loss": 0.0777, + "losses_ref": -0.00039851194014772773, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7510, + "u": -5.469162940979004, + "weight": 0.03789610415697098 + }, + { + "diff_generated": -36.01557540893555, + "epoch": 2.436811406351264, + "grad_norm": 2.704493399261753, + "learning_rate": 8.28576995518031e-08, + "logits/chosen": -2.257784128189087, + "logits/rejected": -1.9793716669082642, + "logps/chosen": -15.093899726867676, + "logps/rejected": -609.7103881835938, + "logps_avg/chosen": -0.08029602468013763, + "logps_avg/rejected": -3.601557493209839, + "loss": 0.0775, + "losses_ref": -0.0009452321683056653, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7520, + "u": -5.399094104766846, + "weight": 0.051128365099430084 + }, + { + "diff_generated": -32.34758758544922, + "epoch": 2.440051847051199, + "grad_norm": 2.994888257339022, + "learning_rate": 8.194084138735023e-08, + "logits/chosen": -2.2800023555755615, + "logits/rejected": -2.082465648651123, + "logps/chosen": -14.151315689086914, + "logps/rejected": -611.8934326171875, + "logps_avg/chosen": -0.08025664836168289, + "logps_avg/rejected": -3.2347590923309326, + "loss": 0.0776, + "losses_ref": -0.00029332557460293174, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7530, + "u": -5.439518451690674, + "weight": 0.044044096022844315 + }, + { + "diff_generated": -33.730262756347656, + "epoch": 2.443292287751134, + "grad_norm": 2.709483158923101, + "learning_rate": 8.102850499441638e-08, + "logits/chosen": -2.238938331604004, + "logits/rejected": -2.0246846675872803, + "logps/chosen": -13.48927116394043, + "logps/rejected": -658.44140625, + "logps_avg/chosen": -0.07260926812887192, + "logps_avg/rejected": -3.3730266094207764, + "loss": 0.0768, + "losses_ref": -0.00044730809167958796, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7540, + "u": -5.0764479637146, + "weight": 0.10680226981639862 + }, + { + "diff_generated": -35.54068374633789, + "epoch": 2.446532728451069, + "grad_norm": 2.7716548897069804, + "learning_rate": 8.012070334347103e-08, + "logits/chosen": -2.2876362800598145, + "logits/rejected": -2.0490283966064453, + "logps/chosen": -15.265823364257812, + "logps/rejected": -662.9925537109375, + "logps_avg/chosen": -0.08061473816633224, + "logps_avg/rejected": -3.5540683269500732, + "loss": 0.0763, + "losses_ref": -0.00046191777801141143, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7550, + "u": -5.507439613342285, + "weight": 0.03176378458738327 + }, + { + "diff_generated": -33.28374481201172, + "epoch": 2.4497731691510047, + "grad_norm": 2.6459787187816888, + "learning_rate": 7.921744934051515e-08, + "logits/chosen": -2.255577802658081, + "logits/rejected": -2.0689337253570557, + "logps/chosen": -12.737321853637695, + "logps/rejected": -604.4900512695312, + "logps_avg/chosen": -0.07835245132446289, + "logps_avg/rejected": -3.3283743858337402, + "loss": 0.0753, + "losses_ref": -0.0007992651080712676, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7560, + "u": -5.214974880218506, + "weight": 0.08231941610574722 + }, + { + "diff_generated": -32.80999755859375, + "epoch": 2.45301360985094, + "grad_norm": 2.7422000300867486, + "learning_rate": 7.831875582689598e-08, + "logits/chosen": -2.2509467601776123, + "logits/rejected": -2.058760404586792, + "logps/chosen": -13.229718208312988, + "logps/rejected": -591.0678100585938, + "logps_avg/chosen": -0.07577583193778992, + "logps_avg/rejected": -3.2809996604919434, + "loss": 0.0758, + "losses_ref": -0.0003204692038707435, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7570, + "u": -5.329041957855225, + "weight": 0.06283210217952728 + }, + { + "diff_generated": -32.483097076416016, + "epoch": 2.456254050550875, + "grad_norm": 2.8835914157486426, + "learning_rate": 7.742463557912593e-08, + "logits/chosen": -2.2545275688171387, + "logits/rejected": -2.090634822845459, + "logps/chosen": -13.476033210754395, + "logps/rejected": -588.4400024414062, + "logps_avg/chosen": -0.07627163827419281, + "logps_avg/rejected": -3.248309373855591, + "loss": 0.0771, + "losses_ref": -0.0012753000482916832, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7580, + "u": -5.296724796295166, + "weight": 0.0706983357667923 + }, + { + "diff_generated": -35.58943557739258, + "epoch": 2.45949449125081, + "grad_norm": 2.695827748470073, + "learning_rate": 7.65351013087002e-08, + "logits/chosen": -2.267869472503662, + "logits/rejected": -2.1123318672180176, + "logps/chosen": -11.977426528930664, + "logps/rejected": -692.4275512695312, + "logps_avg/chosen": -0.0733996331691742, + "logps_avg/rejected": -3.558943271636963, + "loss": 0.0782, + "losses_ref": -0.00019766921468544751, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7590, + "u": -5.437938690185547, + "weight": 0.04395180940628052 + }, + { + "diff_generated": -34.45747756958008, + "epoch": 2.462734931950745, + "grad_norm": 3.04541194251132, + "learning_rate": 7.565016566191631e-08, + "logits/chosen": -2.2000534534454346, + "logits/rejected": -2.032714366912842, + "logps/chosen": -13.62053108215332, + "logps/rejected": -630.7037353515625, + "logps_avg/chosen": -0.07926841825246811, + "logps_avg/rejected": -3.4457478523254395, + "loss": 0.0771, + "losses_ref": -0.0006812643841840327, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7600, + "u": -5.399075984954834, + "weight": 0.05084569379687309 + }, + { + "diff_generated": -33.85066604614258, + "epoch": 2.4659753726506803, + "grad_norm": 2.861072627997925, + "learning_rate": 7.47698412196939e-08, + "logits/chosen": -2.2991385459899902, + "logits/rejected": -2.0473597049713135, + "logps/chosen": -13.630948066711426, + "logps/rejected": -632.0382080078125, + "logps_avg/chosen": -0.07360848784446716, + "logps_avg/rejected": -3.385066270828247, + "loss": 0.0781, + "losses_ref": -0.001396728097461164, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7610, + "u": -5.36118745803833, + "weight": 0.05879664421081543 + }, + { + "diff_generated": -34.452186584472656, + "epoch": 2.4692158133506155, + "grad_norm": 2.6760339852260264, + "learning_rate": 7.389414049739682e-08, + "logits/chosen": -2.298701763153076, + "logits/rejected": -2.094611167907715, + "logps/chosen": -13.719430923461914, + "logps/rejected": -635.7684326171875, + "logps_avg/chosen": -0.07470625638961792, + "logps_avg/rejected": -3.445218563079834, + "loss": 0.0759, + "losses_ref": -0.0010666692396625876, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7620, + "u": -5.395898342132568, + "weight": 0.051368117332458496 + }, + { + "diff_generated": -33.900569915771484, + "epoch": 2.472456254050551, + "grad_norm": 2.7214995018689887, + "learning_rate": 7.302307594465422e-08, + "logits/chosen": -2.2754058837890625, + "logits/rejected": -2.1271181106567383, + "logps/chosen": -13.97209644317627, + "logps/rejected": -676.8109130859375, + "logps_avg/chosen": -0.07918776571750641, + "logps_avg/rejected": -3.39005708694458, + "loss": 0.0756, + "losses_ref": -0.00014776972238905728, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7630, + "u": -5.429097652435303, + "weight": 0.04388615861535072 + }, + { + "diff_generated": -35.28455352783203, + "epoch": 2.475696694750486, + "grad_norm": 3.172336067859759, + "learning_rate": 7.215665994518367e-08, + "logits/chosen": -2.266859769821167, + "logits/rejected": -2.0783934593200684, + "logps/chosen": -13.267038345336914, + "logps/rejected": -638.2308349609375, + "logps_avg/chosen": -0.0793921947479248, + "logps_avg/rejected": -3.5284552574157715, + "loss": 0.0777, + "losses_ref": -0.00048756637261249125, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7640, + "u": -5.361421585083008, + "weight": 0.05682788044214249 + }, + { + "diff_generated": -34.95798873901367, + "epoch": 2.4789371354504213, + "grad_norm": 2.958338712244271, + "learning_rate": 7.129490481661605e-08, + "logits/chosen": -2.29850172996521, + "logits/rejected": -2.0712332725524902, + "logps/chosen": -15.60583209991455, + "logps/rejected": -665.9384155273438, + "logps_avg/chosen": -0.08422177284955978, + "logps_avg/rejected": -3.4957988262176514, + "loss": 0.076, + "losses_ref": -0.0006580519257113338, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7650, + "u": -5.613326072692871, + "weight": 0.013244752772152424 + }, + { + "diff_generated": -32.899208068847656, + "epoch": 2.4821775761503564, + "grad_norm": 3.544889417217034, + "learning_rate": 7.043782281031911e-08, + "logits/chosen": -2.2815823554992676, + "logits/rejected": -1.97531259059906, + "logps/chosen": -16.198482513427734, + "logps/rejected": -604.340576171875, + "logps_avg/chosen": -0.08431107550859451, + "logps_avg/rejected": -3.2899203300476074, + "loss": 0.0768, + "losses_ref": -0.0005749968695454299, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7660, + "u": -5.256075859069824, + "weight": 0.07563985884189606 + }, + { + "diff_generated": -35.965213775634766, + "epoch": 2.4854180168502915, + "grad_norm": 2.803402804866771, + "learning_rate": 6.958542611122422e-08, + "logits/chosen": -2.2891929149627686, + "logits/rejected": -2.0426852703094482, + "logps/chosen": -13.56506061553955, + "logps/rejected": -666.633544921875, + "logps_avg/chosen": -0.07456330955028534, + "logps_avg/rejected": -3.5965213775634766, + "loss": 0.0776, + "losses_ref": -0.0004795569402631372, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7670, + "u": -5.431191921234131, + "weight": 0.04434206336736679 + }, + { + "diff_generated": -32.99568176269531, + "epoch": 2.488658457550227, + "grad_norm": 2.657620426284608, + "learning_rate": 6.873772683765283e-08, + "logits/chosen": -2.212829351425171, + "logits/rejected": -2.029841899871826, + "logps/chosen": -13.612431526184082, + "logps/rejected": -614.3855590820312, + "logps_avg/chosen": -0.07634242624044418, + "logps_avg/rejected": -3.2995681762695312, + "loss": 0.0735, + "losses_ref": -0.0002592132077552378, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7680, + "u": -5.2603278160095215, + "weight": 0.07525938004255295 + }, + { + "diff_generated": -35.51023483276367, + "epoch": 2.491898898250162, + "grad_norm": 2.8426490408551204, + "learning_rate": 6.789473704114428e-08, + "logits/chosen": -2.2818164825439453, + "logits/rejected": -2.0289533138275146, + "logps/chosen": -13.806233406066895, + "logps/rejected": -657.150390625, + "logps_avg/chosen": -0.07651212066411972, + "logps_avg/rejected": -3.551023483276367, + "loss": 0.0784, + "losses_ref": -0.0007175356731750071, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7690, + "u": -5.575205326080322, + "weight": 0.01952420175075531 + }, + { + "diff_generated": -33.23609161376953, + "epoch": 2.4951393389500973, + "grad_norm": 2.8013204477737217, + "learning_rate": 6.7056468706284e-08, + "logits/chosen": -2.257474184036255, + "logits/rejected": -2.029222011566162, + "logps/chosen": -14.227697372436523, + "logps/rejected": -584.1106567382812, + "logps_avg/chosen": -0.07747099548578262, + "logps_avg/rejected": -3.3236095905303955, + "loss": 0.0783, + "losses_ref": -0.0004787585639860481, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7700, + "u": -5.363088607788086, + "weight": 0.05679760128259659 + }, + { + "diff_generated": -35.77177047729492, + "epoch": 2.4983797796500324, + "grad_norm": 2.6951188107579145, + "learning_rate": 6.622293375053422e-08, + "logits/chosen": -2.2117533683776855, + "logits/rejected": -1.9806289672851562, + "logps/chosen": -14.14848804473877, + "logps/rejected": -601.2380981445312, + "logps_avg/chosen": -0.07564245164394379, + "logps_avg/rejected": -3.5771775245666504, + "loss": 0.078, + "losses_ref": -0.0010219484101980925, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7710, + "u": -5.331390380859375, + "weight": 0.06371685117483139 + }, + { + "diff_generated": -33.446556091308594, + "epoch": 2.5016202203499676, + "grad_norm": 3.003938973826779, + "learning_rate": 6.539414402406316e-08, + "logits/chosen": -2.236816167831421, + "logits/rejected": -2.0297279357910156, + "logps/chosen": -13.686360359191895, + "logps/rejected": -624.5142822265625, + "logps_avg/chosen": -0.07835282385349274, + "logps_avg/rejected": -3.3446555137634277, + "loss": 0.0774, + "losses_ref": -0.00030313601018860936, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7720, + "u": -5.471747398376465, + "weight": 0.037795376032590866 + }, + { + "diff_generated": -32.8238410949707, + "epoch": 2.5048606610499027, + "grad_norm": 2.867173383688279, + "learning_rate": 6.457011130957747e-08, + "logits/chosen": -2.2398643493652344, + "logits/rejected": -1.9696300029754639, + "logps/chosen": -14.389925956726074, + "logps/rejected": -597.840087890625, + "logps_avg/chosen": -0.07920937240123749, + "logps_avg/rejected": -3.282384157180786, + "loss": 0.0773, + "losses_ref": -0.0005577055853791535, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7730, + "u": -5.359132289886475, + "weight": 0.05690532177686691 + }, + { + "diff_generated": -36.69569396972656, + "epoch": 2.508101101749838, + "grad_norm": 2.839278217911586, + "learning_rate": 6.37508473221549e-08, + "logits/chosen": -2.269911289215088, + "logits/rejected": -2.018099308013916, + "logps/chosen": -14.185853958129883, + "logps/rejected": -667.17041015625, + "logps_avg/chosen": -0.08158121258020401, + "logps_avg/rejected": -3.669569492340088, + "loss": 0.078, + "losses_ref": -0.0003814251977019012, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7740, + "u": -5.434433460235596, + "weight": 0.04420917108654976 + }, + { + "diff_generated": -36.33928298950195, + "epoch": 2.511341542449773, + "grad_norm": 2.8251290016911526, + "learning_rate": 6.293636370907665e-08, + "logits/chosen": -2.288761615753174, + "logits/rejected": -2.0692386627197266, + "logps/chosen": -13.266860961914062, + "logps/rejected": -708.5743408203125, + "logps_avg/chosen": -0.07769857347011566, + "logps_avg/rejected": -3.6339282989501953, + "loss": 0.0775, + "losses_ref": -0.0007504144450649619, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7750, + "u": -5.508566379547119, + "weight": 0.03215345740318298 + }, + { + "diff_generated": -33.397117614746094, + "epoch": 2.5145819831497085, + "grad_norm": 2.830751045089104, + "learning_rate": 6.212667204966293e-08, + "logits/chosen": -2.3138840198516846, + "logits/rejected": -2.0809566974639893, + "logps/chosen": -13.976930618286133, + "logps/rejected": -618.265869140625, + "logps_avg/chosen": -0.07728879898786545, + "logps_avg/rejected": -3.339712142944336, + "loss": 0.0783, + "losses_ref": -0.00014159231795929372, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7760, + "u": -5.511630058288574, + "weight": 0.031377941370010376 + }, + { + "diff_generated": -34.5837516784668, + "epoch": 2.5178224238496436, + "grad_norm": 2.836185839666556, + "learning_rate": 6.132178385510772e-08, + "logits/chosen": -2.270347833633423, + "logits/rejected": -2.0604121685028076, + "logps/chosen": -13.438015937805176, + "logps/rejected": -615.5458984375, + "logps_avg/chosen": -0.07801367342472076, + "logps_avg/rejected": -3.4583754539489746, + "loss": 0.0758, + "losses_ref": -0.0010575618362054229, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7770, + "u": -5.43215274810791, + "weight": 0.045181840658187866 + }, + { + "diff_generated": -34.33323287963867, + "epoch": 2.5210628645495787, + "grad_norm": 2.7319029632555822, + "learning_rate": 6.052171056831547e-08, + "logits/chosen": -2.2712090015411377, + "logits/rejected": -2.0803134441375732, + "logps/chosen": -12.562702178955078, + "logps/rejected": -632.27490234375, + "logps_avg/chosen": -0.07550046592950821, + "logps_avg/rejected": -3.433323383331299, + "loss": 0.075, + "losses_ref": -0.0002697540621738881, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7780, + "u": -5.4767985343933105, + "weight": 0.0377730131149292 + }, + { + "diff_generated": -35.092098236083984, + "epoch": 2.524303305249514, + "grad_norm": 2.770684012831786, + "learning_rate": 5.972646356373779e-08, + "logits/chosen": -2.2754201889038086, + "logits/rejected": -2.0463852882385254, + "logps/chosen": -15.457331657409668, + "logps/rejected": -633.0758056640625, + "logps_avg/chosen": -0.0822768360376358, + "logps_avg/rejected": -3.5092101097106934, + "loss": 0.0754, + "losses_ref": -0.0001960611407412216, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7790, + "u": -5.5086774826049805, + "weight": 0.031437747180461884 + }, + { + "diff_generated": -34.240562438964844, + "epoch": 2.527543745949449, + "grad_norm": 2.796408548522239, + "learning_rate": 5.893605414721277e-08, + "logits/chosen": -2.299086093902588, + "logits/rejected": -2.0886199474334717, + "logps/chosen": -12.549464225769043, + "logps/rejected": -640.3765869140625, + "logps_avg/chosen": -0.07397869974374771, + "logps_avg/rejected": -3.4240562915802, + "loss": 0.0758, + "losses_ref": -0.00014421154628507793, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7800, + "u": -5.437371253967285, + "weight": 0.04388119652867317 + }, + { + "diff_generated": -34.39710235595703, + "epoch": 2.5307841866493845, + "grad_norm": 2.9301569992729046, + "learning_rate": 5.815049355580317e-08, + "logits/chosen": -2.2772209644317627, + "logits/rejected": -2.0563406944274902, + "logps/chosen": -14.136589050292969, + "logps/rejected": -647.86181640625, + "logps_avg/chosen": -0.07758349925279617, + "logps_avg/rejected": -3.4397106170654297, + "loss": 0.076, + "losses_ref": -0.00013244636647868901, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7810, + "u": -5.507662296295166, + "weight": 0.03137155622243881 + }, + { + "diff_generated": -37.23072814941406, + "epoch": 2.5340246273493197, + "grad_norm": 2.9462919405130785, + "learning_rate": 5.736979295763742e-08, + "logits/chosen": -2.2507474422454834, + "logits/rejected": -1.970627784729004, + "logps/chosen": -15.260380744934082, + "logps/rejected": -679.1442260742188, + "logps_avg/chosen": -0.0788363367319107, + "logps_avg/rejected": -3.7230727672576904, + "loss": 0.0782, + "losses_ref": -0.0007362683536484838, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7820, + "u": -5.543445587158203, + "weight": 0.025849919766187668 + }, + { + "diff_generated": -33.65333557128906, + "epoch": 2.537265068049255, + "grad_norm": 2.9489102885078937, + "learning_rate": 5.659396345175049e-08, + "logits/chosen": -2.241163730621338, + "logits/rejected": -2.024839401245117, + "logps/chosen": -14.190361022949219, + "logps/rejected": -610.3209228515625, + "logps_avg/chosen": -0.08231045305728912, + "logps_avg/rejected": -3.3653335571289062, + "loss": 0.0751, + "losses_ref": -0.0008589104982092977, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7830, + "u": -5.471013069152832, + "weight": 0.03844798356294632 + }, + { + "diff_generated": -33.5227165222168, + "epoch": 2.54050550874919, + "grad_norm": 2.729628595505013, + "learning_rate": 5.5823016067926234e-08, + "logits/chosen": -2.2366445064544678, + "logits/rejected": -2.0078482627868652, + "logps/chosen": -14.653826713562012, + "logps/rejected": -614.4781494140625, + "logps_avg/chosen": -0.07879281044006348, + "logps_avg/rejected": -3.352271556854248, + "loss": 0.0763, + "losses_ref": -0.0001931044098455459, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7840, + "u": -5.330508708953857, + "weight": 0.06269881129264832 + }, + { + "diff_generated": -32.063846588134766, + "epoch": 2.543745949449125, + "grad_norm": 2.89176776237173, + "learning_rate": 5.5056961766540444e-08, + "logits/chosen": -2.2384262084960938, + "logits/rejected": -2.0224967002868652, + "logps/chosen": -13.754913330078125, + "logps/rejected": -595.7368774414062, + "logps_avg/chosen": -0.07618410885334015, + "logps_avg/rejected": -3.2063846588134766, + "loss": 0.0765, + "losses_ref": -0.0006604836671613157, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7850, + "u": -5.15533971786499, + "weight": 0.09443579614162445 + }, + { + "diff_generated": -35.68360900878906, + "epoch": 2.54698639014906, + "grad_norm": 2.823496183396589, + "learning_rate": 5.429581143840525e-08, + "logits/chosen": -2.244398355484009, + "logits/rejected": -2.027529001235962, + "logps/chosen": -13.530553817749023, + "logps/rejected": -645.8378295898438, + "logps_avg/chosen": -0.07708597183227539, + "logps_avg/rejected": -3.5683608055114746, + "loss": 0.079, + "losses_ref": -0.0003415651444811374, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7860, + "u": -5.435323238372803, + "weight": 0.044158075004816055 + }, + { + "diff_generated": -35.86713790893555, + "epoch": 2.5502268308489953, + "grad_norm": 2.91644868184901, + "learning_rate": 5.3539575904614176e-08, + "logits/chosen": -2.2696292400360107, + "logits/rejected": -2.0367541313171387, + "logps/chosen": -14.176198959350586, + "logps/rejected": -651.2911987304688, + "logps_avg/chosen": -0.08043045550584793, + "logps_avg/rejected": -3.586714267730713, + "loss": 0.0761, + "losses_ref": -0.0014636798296123743, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7870, + "u": -5.29245662689209, + "weight": 0.07079814374446869 + }, + { + "diff_generated": -35.674190521240234, + "epoch": 2.5534672715489304, + "grad_norm": 2.5833971813648806, + "learning_rate": 5.278826591638794e-08, + "logits/chosen": -2.2695140838623047, + "logits/rejected": -2.061769962310791, + "logps/chosen": -14.17786979675293, + "logps/rejected": -668.9481201171875, + "logps_avg/chosen": -0.08042646944522858, + "logps_avg/rejected": -3.5674190521240234, + "loss": 0.0773, + "losses_ref": -0.0002508517063688487, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7880, + "u": -5.541116237640381, + "weight": 0.02528039552271366 + }, + { + "diff_generated": -35.04924774169922, + "epoch": 2.556707712248866, + "grad_norm": 2.7802142146267546, + "learning_rate": 5.204189215492252e-08, + "logits/chosen": -2.2482147216796875, + "logits/rejected": -2.0366978645324707, + "logps/chosen": -13.320378303527832, + "logps/rejected": -670.7872314453125, + "logps_avg/chosen": -0.07657703012228012, + "logps_avg/rejected": -3.50492525100708, + "loss": 0.0772, + "losses_ref": -0.0008224450284615159, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7890, + "u": -5.401477336883545, + "weight": 0.05123826116323471 + }, + { + "diff_generated": -34.19906997680664, + "epoch": 2.559948152948801, + "grad_norm": 2.864889372141515, + "learning_rate": 5.1300465231236145e-08, + "logits/chosen": -2.263850450515747, + "logits/rejected": -2.032125949859619, + "logps/chosen": -14.5372314453125, + "logps/rejected": -612.401611328125, + "logps_avg/chosen": -0.075019970536232, + "logps_avg/rejected": -3.4199066162109375, + "loss": 0.0782, + "losses_ref": -0.0018288299906998873, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7900, + "u": -5.394659996032715, + "weight": 0.05333171412348747 + }, + { + "diff_generated": -33.32030487060547, + "epoch": 2.563188593648736, + "grad_norm": 2.6476779853936376, + "learning_rate": 5.056399568601946e-08, + "logits/chosen": -2.297189235687256, + "logits/rejected": -2.0654282569885254, + "logps/chosen": -13.988309860229492, + "logps/rejected": -588.9055786132812, + "logps_avg/chosen": -0.07717391848564148, + "logps_avg/rejected": -3.3320305347442627, + "loss": 0.0785, + "losses_ref": -0.0003493439289741218, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7910, + "u": -5.32882022857666, + "weight": 0.06289394199848175 + }, + { + "diff_generated": -36.19215393066406, + "epoch": 2.5664290343486713, + "grad_norm": 2.9522400865572314, + "learning_rate": 4.983249398948502e-08, + "logits/chosen": -2.314213752746582, + "logits/rejected": -2.0248019695281982, + "logps/chosen": -14.250593185424805, + "logps/rejected": -701.8851318359375, + "logps_avg/chosen": -0.07895634323358536, + "logps_avg/rejected": -3.619215488433838, + "loss": 0.0781, + "losses_ref": -0.00045448317541740835, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7920, + "u": -5.472224235534668, + "weight": 0.0380551740527153 + }, + { + "diff_generated": -34.47525405883789, + "epoch": 2.569669475048607, + "grad_norm": 2.809750356358677, + "learning_rate": 4.910597054121877e-08, + "logits/chosen": -2.257275104522705, + "logits/rejected": -2.013333797454834, + "logps/chosen": -15.664281845092773, + "logps/rejected": -614.7738037109375, + "logps_avg/chosen": -0.08708717674016953, + "logps_avg/rejected": -3.4475257396698, + "loss": 0.0772, + "losses_ref": -0.00023688049986958504, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7930, + "u": -5.436405658721924, + "weight": 0.04399479180574417 + }, + { + "diff_generated": -35.79109573364258, + "epoch": 2.572909915748542, + "grad_norm": 2.965292107484825, + "learning_rate": 4.838443567003194e-08, + "logits/chosen": -2.2718281745910645, + "logits/rejected": -2.0739755630493164, + "logps/chosen": -12.573837280273438, + "logps/rejected": -654.598388671875, + "logps_avg/chosen": -0.07561159133911133, + "logps_avg/rejected": -3.5791091918945312, + "loss": 0.0752, + "losses_ref": -0.0008258657762780786, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7940, + "u": -5.506420612335205, + "weight": 0.03219860419631004 + }, + { + "diff_generated": -33.60948181152344, + "epoch": 2.576150356448477, + "grad_norm": 2.9055064732309073, + "learning_rate": 4.766789963381459e-08, + "logits/chosen": -2.279853105545044, + "logits/rejected": -2.0685627460479736, + "logps/chosen": -14.392298698425293, + "logps/rejected": -609.1856689453125, + "logps_avg/chosen": -0.07995419949293137, + "logps_avg/rejected": -3.360948085784912, + "loss": 0.0789, + "losses_ref": -0.0005912907072342932, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7950, + "u": -5.365494251251221, + "weight": 0.05691393464803696 + }, + { + "diff_generated": -34.88692855834961, + "epoch": 2.5793907971484122, + "grad_norm": 2.7422363528394085, + "learning_rate": 4.695637261938912e-08, + "logits/chosen": -2.280608654022217, + "logits/rejected": -2.0431153774261475, + "logps/chosen": -13.370687484741211, + "logps/rejected": -620.4929809570312, + "logps_avg/chosen": -0.07715877145528793, + "logps_avg/rejected": -3.48869252204895, + "loss": 0.0759, + "losses_ref": -0.00044079654617235065, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7960, + "u": -5.467726230621338, + "weight": 0.03798266500234604 + }, + { + "diff_generated": -33.26984405517578, + "epoch": 2.5826312378483474, + "grad_norm": 2.616784269180188, + "learning_rate": 4.624986474236623e-08, + "logits/chosen": -2.30104398727417, + "logits/rejected": -2.053588390350342, + "logps/chosen": -13.474719047546387, + "logps/rejected": -634.5220947265625, + "logps_avg/chosen": -0.07315264642238617, + "logps_avg/rejected": -3.3269848823547363, + "loss": 0.0757, + "losses_ref": -0.0005079508991912007, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7970, + "u": -5.402222633361816, + "weight": 0.05061950162053108 + }, + { + "diff_generated": -34.360939025878906, + "epoch": 2.5858716785482825, + "grad_norm": 2.778302410578972, + "learning_rate": 4.554838604700073e-08, + "logits/chosen": -2.2400848865509033, + "logits/rejected": -2.007932186126709, + "logps/chosen": -13.436427116394043, + "logps/rejected": -611.0859985351562, + "logps_avg/chosen": -0.07698436081409454, + "logps_avg/rejected": -3.43609356880188, + "loss": 0.0771, + "losses_ref": -0.000243752496317029, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7980, + "u": -5.294255256652832, + "weight": 0.06898865848779678 + }, + { + "diff_generated": -35.74687576293945, + "epoch": 2.5891121192482176, + "grad_norm": 3.077691021737918, + "learning_rate": 4.4851946506048445e-08, + "logits/chosen": -2.279158592224121, + "logits/rejected": -2.046879529953003, + "logps/chosen": -12.938554763793945, + "logps/rejected": -642.0476684570312, + "logps_avg/chosen": -0.07505569607019424, + "logps_avg/rejected": -3.5746874809265137, + "loss": 0.0774, + "losses_ref": -0.0007500805077143013, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 7990, + "u": -5.365180492401123, + "weight": 0.05708513781428337 + }, + { + "diff_generated": -33.259788513183594, + "epoch": 2.5923525599481527, + "grad_norm": 2.6148008147179413, + "learning_rate": 4.4160556020625026e-08, + "logits/chosen": -2.260484218597412, + "logits/rejected": -2.0400381088256836, + "logps/chosen": -14.224832534790039, + "logps/rejected": -642.1659545898438, + "logps_avg/chosen": -0.07914839684963226, + "logps_avg/rejected": -3.325979232788086, + "loss": 0.0761, + "losses_ref": -0.0006360385450534523, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8000, + "u": -5.328615665435791, + "weight": 0.06318946927785873 + }, + { + "diff_generated": -35.72733688354492, + "epoch": 2.5955930006480883, + "grad_norm": 2.7878707857614335, + "learning_rate": 4.347422442006476e-08, + "logits/chosen": -2.2746644020080566, + "logits/rejected": -2.0587804317474365, + "logps/chosen": -14.194560050964355, + "logps/rejected": -652.4595947265625, + "logps_avg/chosen": -0.0780181735754013, + "logps_avg/rejected": -3.5727341175079346, + "loss": 0.0751, + "losses_ref": -0.0010231004562228918, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8010, + "u": -5.437048435211182, + "weight": 0.04501374438405037 + }, + { + "diff_generated": -35.51588821411133, + "epoch": 2.5988334413480234, + "grad_norm": 2.7540198565819782, + "learning_rate": 4.2792961461781064e-08, + "logits/chosen": -2.321173667907715, + "logits/rejected": -2.0663743019104004, + "logps/chosen": -14.140558242797852, + "logps/rejected": -648.8035888671875, + "logps_avg/chosen": -0.07980917394161224, + "logps_avg/rejected": -3.551588773727417, + "loss": 0.0773, + "losses_ref": -0.0003094382118433714, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8020, + "u": -5.509263515472412, + "weight": 0.0315590463578701 + }, + { + "diff_generated": -33.125953674316406, + "epoch": 2.6020738820479585, + "grad_norm": 2.755074755828332, + "learning_rate": 4.211677683112751e-08, + "logits/chosen": -2.28379487991333, + "logits/rejected": -2.045283794403076, + "logps/chosen": -12.904928207397461, + "logps/rejected": -624.020263671875, + "logps_avg/chosen": -0.07249008119106293, + "logps_avg/rejected": -3.3125953674316406, + "loss": 0.0759, + "losses_ref": -0.00020831017172895372, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8030, + "u": -5.257404804229736, + "weight": 0.07521463930606842 + }, + { + "diff_generated": -35.30225372314453, + "epoch": 2.6053143227478937, + "grad_norm": 2.6717290920800196, + "learning_rate": 4.1445680141260594e-08, + "logits/chosen": -2.317934036254883, + "logits/rejected": -2.0665221214294434, + "logps/chosen": -15.472373962402344, + "logps/rejected": -655.91748046875, + "logps_avg/chosen": -0.08271267265081406, + "logps_avg/rejected": -3.5302252769470215, + "loss": 0.0786, + "losses_ref": -8.515668741893023e-05, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8040, + "u": -5.471126079559326, + "weight": 0.037577468901872635 + }, + { + "diff_generated": -34.73823547363281, + "epoch": 2.6085547634478288, + "grad_norm": 2.72154979767351, + "learning_rate": 4.077968093300237e-08, + "logits/chosen": -2.2775912284851074, + "logits/rejected": -2.0561890602111816, + "logps/chosen": -13.31470775604248, + "logps/rejected": -645.74072265625, + "logps_avg/chosen": -0.0767243281006813, + "logps_avg/rejected": -3.4738240242004395, + "loss": 0.0758, + "losses_ref": -0.0005114816012792289, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8050, + "u": -5.501688003540039, + "weight": 0.03185393661260605 + }, + { + "diff_generated": -37.13385009765625, + "epoch": 2.6117952041477643, + "grad_norm": 2.855539364129418, + "learning_rate": 4.011878867470542e-08, + "logits/chosen": -2.282379627227783, + "logits/rejected": -2.0570123195648193, + "logps/chosen": -15.232747077941895, + "logps/rejected": -693.3724365234375, + "logps_avg/chosen": -0.08379258215427399, + "logps_avg/rejected": -3.7133851051330566, + "loss": 0.0785, + "losses_ref": -0.00034684973070397973, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8060, + "u": -5.579532623291016, + "weight": 0.01912674866616726 + }, + { + "diff_generated": -34.14933395385742, + "epoch": 2.6150356448476995, + "grad_norm": 2.6408453666827367, + "learning_rate": 3.9463012762118144e-08, + "logits/chosen": -2.21820068359375, + "logits/rejected": -2.0539538860321045, + "logps/chosen": -11.912099838256836, + "logps/rejected": -666.9979248046875, + "logps_avg/chosen": -0.07595182210206985, + "logps_avg/rejected": -3.414933443069458, + "loss": 0.0756, + "losses_ref": -0.0010541939409449697, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8070, + "u": -5.364609241485596, + "weight": 0.05779505521059036 + }, + { + "diff_generated": -34.31537628173828, + "epoch": 2.6182760855476346, + "grad_norm": 2.8379084234493486, + "learning_rate": 3.8812362518250816e-08, + "logits/chosen": -2.2986514568328857, + "logits/rejected": -2.073610305786133, + "logps/chosen": -14.934292793273926, + "logps/rejected": -623.4342651367188, + "logps_avg/chosen": -0.0827370285987854, + "logps_avg/rejected": -3.431537628173828, + "loss": 0.0801, + "losses_ref": -0.00031489311368204653, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8080, + "u": -5.367594242095947, + "weight": 0.05658316612243652 + }, + { + "diff_generated": -35.86017608642578, + "epoch": 2.6215165262475697, + "grad_norm": 2.6788056351776177, + "learning_rate": 3.816684719324352e-08, + "logits/chosen": -2.241401195526123, + "logits/rejected": -2.0460305213928223, + "logps/chosen": -12.66572380065918, + "logps/rejected": -684.3659057617188, + "logps_avg/chosen": -0.07562129199504852, + "logps_avg/rejected": -3.586017608642578, + "loss": 0.0771, + "losses_ref": -0.0008628388168290257, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8090, + "u": -5.4722185134887695, + "weight": 0.03859802335500717 + }, + { + "diff_generated": -34.25069046020508, + "epoch": 2.624756966947505, + "grad_norm": 2.7244222042198363, + "learning_rate": 3.7526475964234286e-08, + "logits/chosen": -2.25890851020813, + "logits/rejected": -2.0612587928771973, + "logps/chosen": -13.041460990905762, + "logps/rejected": -637.5123901367188, + "logps_avg/chosen": -0.07453655451536179, + "logps_avg/rejected": -3.4250690937042236, + "loss": 0.0771, + "losses_ref": -0.0009745571878738701, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8100, + "u": -5.402337551116943, + "weight": 0.0512116476893425 + }, + { + "diff_generated": -34.15746307373047, + "epoch": 2.62799740764744, + "grad_norm": 2.7350343370428125, + "learning_rate": 3.689125793522874e-08, + "logits/chosen": -2.233297109603882, + "logits/rejected": -1.999696969985962, + "logps/chosen": -12.732789993286133, + "logps/rejected": -624.6839599609375, + "logps_avg/chosen": -0.07528980076313019, + "logps_avg/rejected": -3.4157466888427734, + "loss": 0.077, + "losses_ref": -7.013262074906379e-05, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8110, + "u": -5.3310370445251465, + "weight": 0.06256024539470673 + }, + { + "diff_generated": -34.039451599121094, + "epoch": 2.631237848347375, + "grad_norm": 2.891218805017069, + "learning_rate": 3.6261202136970814e-08, + "logits/chosen": -2.2619783878326416, + "logits/rejected": -2.040761709213257, + "logps/chosen": -13.35377025604248, + "logps/rejected": -598.27880859375, + "logps_avg/chosen": -0.07534444332122803, + "logps_avg/rejected": -3.403945207595825, + "loss": 0.0762, + "losses_ref": -0.0006816794048063457, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8120, + "u": -5.329941749572754, + "weight": 0.06335194408893585 + }, + { + "diff_generated": -32.30815887451172, + "epoch": 2.63447828904731, + "grad_norm": 3.2827797507216117, + "learning_rate": 3.563631752681422e-08, + "logits/chosen": -2.2446212768554688, + "logits/rejected": -2.0790886878967285, + "logps/chosen": -13.016275405883789, + "logps/rejected": -612.4979248046875, + "logps_avg/chosen": -0.07507045567035675, + "logps_avg/rejected": -3.2308154106140137, + "loss": 0.0772, + "losses_ref": -0.0002835232298821211, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8130, + "u": -5.437108993530273, + "weight": 0.04403982684016228 + }, + { + "diff_generated": -35.24883270263672, + "epoch": 2.6377187297472457, + "grad_norm": 2.659663241127319, + "learning_rate": 3.501661298859489e-08, + "logits/chosen": -2.2246174812316895, + "logits/rejected": -2.0028109550476074, + "logps/chosen": -15.065336227416992, + "logps/rejected": -649.087646484375, + "logps_avg/chosen": -0.08108867704868317, + "logps_avg/rejected": -3.524883270263672, + "loss": 0.0776, + "losses_ref": -0.0013233883073553443, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8140, + "u": -5.504677772521973, + "weight": 0.03295652195811272 + }, + { + "diff_generated": -32.72626495361328, + "epoch": 2.640959170447181, + "grad_norm": 2.8495814105659174, + "learning_rate": 3.4402097332505074e-08, + "logits/chosen": -2.2631309032440186, + "logits/rejected": -2.032370090484619, + "logps/chosen": -13.472801208496094, + "logps/rejected": -625.4802856445312, + "logps_avg/chosen": -0.07656830549240112, + "logps_avg/rejected": -3.2726263999938965, + "loss": 0.0771, + "losses_ref": -0.0003127239178866148, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8150, + "u": -5.327264785766602, + "weight": 0.06283261626958847 + }, + { + "diff_generated": -34.16876983642578, + "epoch": 2.644199611147116, + "grad_norm": 2.784594617604692, + "learning_rate": 3.379277929496798e-08, + "logits/chosen": -2.2317280769348145, + "logits/rejected": -2.0046327114105225, + "logps/chosen": -13.157957077026367, + "logps/rejected": -641.9828491210938, + "logps_avg/chosen": -0.07431378960609436, + "logps_avg/rejected": -3.416877269744873, + "loss": 0.0771, + "losses_ref": -0.000572945165913552, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8160, + "u": -5.249545574188232, + "weight": 0.07572882622480392 + }, + { + "diff_generated": -32.216583251953125, + "epoch": 2.647440051847051, + "grad_norm": 3.033818763876657, + "learning_rate": 3.3188667538513435e-08, + "logits/chosen": -2.201606273651123, + "logits/rejected": -2.018627643585205, + "logps/chosen": -12.753862380981445, + "logps/rejected": -629.6954345703125, + "logps_avg/chosen": -0.07567889988422394, + "logps_avg/rejected": -3.221658229827881, + "loss": 0.0762, + "losses_ref": -0.0009025133331306279, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8170, + "u": -5.260069847106934, + "weight": 0.07612423598766327 + }, + { + "diff_generated": -35.8400993347168, + "epoch": 2.6506804925469862, + "grad_norm": 2.712181633748497, + "learning_rate": 3.258977065165478e-08, + "logits/chosen": -2.292752742767334, + "logits/rejected": -2.0390093326568604, + "logps/chosen": -13.461270332336426, + "logps/rejected": -626.172119140625, + "logps_avg/chosen": -0.0749615877866745, + "logps_avg/rejected": -3.584010362625122, + "loss": 0.0772, + "losses_ref": -0.000735091685783118, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8180, + "u": -5.398656368255615, + "weight": 0.05097651481628418 + }, + { + "diff_generated": -34.66440963745117, + "epoch": 2.653920933246922, + "grad_norm": 2.907996168398012, + "learning_rate": 3.1996097148766897e-08, + "logits/chosen": -2.2223732471466064, + "logits/rejected": -2.05733060836792, + "logps/chosen": -12.831570625305176, + "logps/rejected": -675.23779296875, + "logps_avg/chosen": -0.07762549072504044, + "logps_avg/rejected": -3.4664406776428223, + "loss": 0.0757, + "losses_ref": -0.0008757191826589406, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8190, + "u": -5.366507530212402, + "weight": 0.05740770697593689 + }, + { + "diff_generated": -37.49953842163086, + "epoch": 2.657161373946857, + "grad_norm": 2.923785207024441, + "learning_rate": 3.1407655469964754e-08, + "logits/chosen": -2.308901071548462, + "logits/rejected": -2.0445334911346436, + "logps/chosen": -13.068899154663086, + "logps/rejected": -683.8781127929688, + "logps_avg/chosen": -0.0730833113193512, + "logps_avg/rejected": -3.7499542236328125, + "loss": 0.0749, + "losses_ref": -0.0002511995262466371, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8200, + "u": -5.397935390472412, + "weight": 0.05026369169354439 + }, + { + "diff_generated": -32.94683074951172, + "epoch": 2.660401814646792, + "grad_norm": 2.7834058244584887, + "learning_rate": 3.0824453980984234e-08, + "logits/chosen": -2.2628774642944336, + "logits/rejected": -2.0234005451202393, + "logps/chosen": -13.097677230834961, + "logps/rejected": -620.1732177734375, + "logps_avg/chosen": -0.07187594473361969, + "logps_avg/rejected": -3.2946829795837402, + "loss": 0.0785, + "losses_ref": -0.0001284389873035252, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8210, + "u": -5.295039176940918, + "weight": 0.0688696950674057 + }, + { + "diff_generated": -32.9188117980957, + "epoch": 2.663642255346727, + "grad_norm": 2.6893460228311223, + "learning_rate": 3.0246500973062184e-08, + "logits/chosen": -2.2631959915161133, + "logits/rejected": -1.998734474182129, + "logps/chosen": -14.383687019348145, + "logps/rejected": -600.3707275390625, + "logps_avg/chosen": -0.07979197800159454, + "logps_avg/rejected": -3.2918810844421387, + "loss": 0.077, + "losses_ref": -0.001167912851087749, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8220, + "u": -5.286744117736816, + "weight": 0.07036517560482025 + }, + { + "diff_generated": -35.13602066040039, + "epoch": 2.6668826960466623, + "grad_norm": 2.7774076969393944, + "learning_rate": 2.9673804662819324e-08, + "logits/chosen": -2.256629467010498, + "logits/rejected": -1.9869369268417358, + "logps/chosen": -13.804142951965332, + "logps/rejected": -632.3235473632812, + "logps_avg/chosen": -0.07562381774187088, + "logps_avg/rejected": -3.513601779937744, + "loss": 0.0774, + "losses_ref": -0.000869055453222245, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8230, + "u": -5.329628944396973, + "weight": 0.06361083686351776 + }, + { + "diff_generated": -33.075477600097656, + "epoch": 2.6701231367465974, + "grad_norm": 2.7856281925799227, + "learning_rate": 2.9106373192143087e-08, + "logits/chosen": -2.2716612815856934, + "logits/rejected": -2.0574049949645996, + "logps/chosen": -12.818155288696289, + "logps/rejected": -628.197998046875, + "logps_avg/chosen": -0.07139711827039719, + "logps_avg/rejected": -3.3075473308563232, + "loss": 0.0758, + "losses_ref": -0.00030460403650067747, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8240, + "u": -5.325705528259277, + "weight": 0.06283704191446304 + }, + { + "diff_generated": -35.3209342956543, + "epoch": 2.6733635774465325, + "grad_norm": 2.905241637475343, + "learning_rate": 2.854421462807193e-08, + "logits/chosen": -2.2535653114318848, + "logits/rejected": -2.0166335105895996, + "logps/chosen": -13.077981948852539, + "logps/rejected": -663.8329467773438, + "logps_avg/chosen": -0.0728570744395256, + "logps_avg/rejected": -3.5320937633514404, + "loss": 0.074, + "losses_ref": -8.225092460634187e-05, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8250, + "u": -5.434994697570801, + "weight": 0.04382305219769478 + }, + { + "diff_generated": -33.154258728027344, + "epoch": 2.6766040181464676, + "grad_norm": 2.9496787017841335, + "learning_rate": 2.798733696268063e-08, + "logits/chosen": -2.2341184616088867, + "logits/rejected": -1.998202919960022, + "logps/chosen": -14.134020805358887, + "logps/rejected": -610.1220092773438, + "logps_avg/chosen": -0.07861624658107758, + "logps_avg/rejected": -3.315425395965576, + "loss": 0.0796, + "losses_ref": -0.00030415848596021533, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8260, + "u": -5.36326789855957, + "weight": 0.056549690663814545 + }, + { + "diff_generated": -34.65415573120117, + "epoch": 2.679844458846403, + "grad_norm": 2.862086996557468, + "learning_rate": 2.7435748112966694e-08, + "logits/chosen": -2.2305984497070312, + "logits/rejected": -2.041346311569214, + "logps/chosen": -12.520244598388672, + "logps/rejected": -678.1482543945312, + "logps_avg/chosen": -0.07185360789299011, + "logps_avg/rejected": -3.4654159545898438, + "loss": 0.0765, + "losses_ref": -7.135640771593899e-05, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8270, + "u": -5.397140979766846, + "weight": 0.05006079748272896 + }, + { + "diff_generated": -35.461692810058594, + "epoch": 2.6830848995463383, + "grad_norm": 3.093628968139604, + "learning_rate": 2.6889455920737903e-08, + "logits/chosen": -2.2679262161254883, + "logits/rejected": -1.9786310195922852, + "logps/chosen": -16.094085693359375, + "logps/rejected": -624.7288818359375, + "logps_avg/chosen": -0.08841854333877563, + "logps_avg/rejected": -3.5461692810058594, + "loss": 0.0782, + "losses_ref": -0.00011158763663843274, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8280, + "u": -5.260346412658691, + "weight": 0.07509959489107132 + }, + { + "diff_generated": -33.50742721557617, + "epoch": 2.6863253402462735, + "grad_norm": 2.9685480466908, + "learning_rate": 2.6348468152500357e-08, + "logits/chosen": -2.2659404277801514, + "logits/rejected": -2.062710762023926, + "logps/chosen": -12.312782287597656, + "logps/rejected": -627.5169677734375, + "logps_avg/chosen": -0.07124846428632736, + "logps_avg/rejected": -3.3507423400878906, + "loss": 0.078, + "losses_ref": -6.677229976048693e-05, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8290, + "u": -5.362844944000244, + "weight": 0.05630839988589287 + }, + { + "diff_generated": -36.79378890991211, + "epoch": 2.6895657809462086, + "grad_norm": 2.809794362122494, + "learning_rate": 2.5812792499348935e-08, + "logits/chosen": -2.2740607261657715, + "logits/rejected": -2.0205042362213135, + "logps/chosen": -14.752975463867188, + "logps/rejected": -658.4273681640625, + "logps_avg/chosen": -0.07980714738368988, + "logps_avg/rejected": -3.6793792247772217, + "loss": 0.0758, + "losses_ref": -4.300654472899623e-05, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8300, + "u": -5.545115947723389, + "weight": 0.02503451332449913 + }, + { + "diff_generated": -34.12705612182617, + "epoch": 2.692806221646144, + "grad_norm": 2.580660517105373, + "learning_rate": 2.5282436576857046e-08, + "logits/chosen": -2.2692408561706543, + "logits/rejected": -2.07869291305542, + "logps/chosen": -13.142602920532227, + "logps/rejected": -611.646728515625, + "logps_avg/chosen": -0.07218606770038605, + "logps_avg/rejected": -3.412705898284912, + "loss": 0.0745, + "losses_ref": -0.0006400069105438888, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8310, + "u": -5.396452903747559, + "weight": 0.050875671207904816 + }, + { + "diff_generated": -34.58293151855469, + "epoch": 2.6960466623460793, + "grad_norm": 2.9235840639131943, + "learning_rate": 2.4757407924968878e-08, + "logits/chosen": -2.2564258575439453, + "logits/rejected": -2.059000015258789, + "logps/chosen": -12.19434642791748, + "logps/rejected": -637.7523803710938, + "logps_avg/chosen": -0.07454869151115417, + "logps_avg/rejected": -3.4582931995391846, + "loss": 0.0768, + "losses_ref": -0.0002925902372226119, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8320, + "u": -5.332849502563477, + "weight": 0.06286852061748505 + }, + { + "diff_generated": -35.07324981689453, + "epoch": 2.6992871030460144, + "grad_norm": 2.8504828315929247, + "learning_rate": 2.4237714007892117e-08, + "logits/chosen": -2.3100998401641846, + "logits/rejected": -2.068859577178955, + "logps/chosen": -15.198066711425781, + "logps/rejected": -663.3590698242188, + "logps_avg/chosen": -0.08022447675466537, + "logps_avg/rejected": -3.5073249340057373, + "loss": 0.0776, + "losses_ref": -0.0007650894112884998, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8330, + "u": -5.469299793243408, + "weight": 0.03833349421620369 + }, + { + "diff_generated": -33.52021408081055, + "epoch": 2.7025275437459495, + "grad_norm": 2.9592373704389416, + "learning_rate": 2.372336221399176e-08, + "logits/chosen": -2.2604236602783203, + "logits/rejected": -2.0275778770446777, + "logps/chosen": -13.657310485839844, + "logps/rejected": -625.4544067382812, + "logps_avg/chosen": -0.07668985426425934, + "logps_avg/rejected": -3.3520214557647705, + "loss": 0.075, + "losses_ref": -0.0005120009882375598, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8340, + "u": -5.224454402923584, + "weight": 0.08183753490447998 + }, + { + "diff_generated": -33.873512268066406, + "epoch": 2.7057679844458846, + "grad_norm": 2.71910932232536, + "learning_rate": 2.3214359855685095e-08, + "logits/chosen": -2.250415563583374, + "logits/rejected": -2.01396107673645, + "logps/chosen": -13.784021377563477, + "logps/rejected": -591.624267578125, + "logps_avg/chosen": -0.07690791040658951, + "logps_avg/rejected": -3.3873507976531982, + "loss": 0.0741, + "losses_ref": -0.0011790532153099775, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8350, + "u": -5.472784042358398, + "weight": 0.03912579268217087 + }, + { + "diff_generated": -36.73569869995117, + "epoch": 2.7090084251458197, + "grad_norm": 2.7945233636437603, + "learning_rate": 2.271071416933772e-08, + "logits/chosen": -2.282506227493286, + "logits/rejected": -2.0563690662384033, + "logps/chosen": -12.65495491027832, + "logps/rejected": -665.0225830078125, + "logps_avg/chosen": -0.07482697814702988, + "logps_avg/rejected": -3.673570156097412, + "loss": 0.0759, + "losses_ref": -4.985265695722774e-05, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8360, + "u": -5.4010467529296875, + "weight": 0.05004154518246651 + }, + { + "diff_generated": -33.571414947509766, + "epoch": 2.712248865845755, + "grad_norm": 2.9122007198187756, + "learning_rate": 2.2212432315160855e-08, + "logits/chosen": -2.2580132484436035, + "logits/rejected": -2.0520682334899902, + "logps/chosen": -13.503583908081055, + "logps/rejected": -601.1464233398438, + "logps_avg/chosen": -0.07640071213245392, + "logps_avg/rejected": -3.3571410179138184, + "loss": 0.0769, + "losses_ref": -0.00020067494187969714, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8370, + "u": -5.330541133880615, + "weight": 0.06268791854381561 + }, + { + "diff_generated": -33.323726654052734, + "epoch": 2.71548930654569, + "grad_norm": 2.8899911784312664, + "learning_rate": 2.171952137710904e-08, + "logits/chosen": -2.313049793243408, + "logits/rejected": -2.0854876041412354, + "logps/chosen": -12.245684623718262, + "logps/rejected": -647.4600830078125, + "logps_avg/chosen": -0.06888893991708755, + "logps_avg/rejected": -3.3323721885681152, + "loss": 0.0768, + "losses_ref": -0.0003442336746957153, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8380, + "u": -5.473523139953613, + "weight": 0.037879910320043564 + }, + { + "diff_generated": -34.752471923828125, + "epoch": 2.7187297472456255, + "grad_norm": 2.624287373775904, + "learning_rate": 2.1231988362780327e-08, + "logits/chosen": -2.252321720123291, + "logits/rejected": -2.008906602859497, + "logps/chosen": -13.585721969604492, + "logps/rejected": -649.451171875, + "logps_avg/chosen": -0.07445680350065231, + "logps_avg/rejected": -3.475247621536255, + "loss": 0.0766, + "losses_ref": -0.0002541754802223295, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8390, + "u": -5.295945644378662, + "weight": 0.06904669106006622 + }, + { + "diff_generated": -36.02851867675781, + "epoch": 2.7219701879455607, + "grad_norm": 2.916752708658845, + "learning_rate": 2.0749840203315584e-08, + "logits/chosen": -2.2832603454589844, + "logits/rejected": -2.0546741485595703, + "logps/chosen": -14.805493354797363, + "logps/rejected": -696.9401245117188, + "logps_avg/chosen": -0.08261923491954803, + "logps_avg/rejected": -3.6028518676757812, + "loss": 0.0751, + "losses_ref": -0.00011947475286433473, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8400, + "u": -5.43557071685791, + "weight": 0.0438610278069973 + }, + { + "diff_generated": -35.815834045410156, + "epoch": 2.725210628645496, + "grad_norm": 2.6124609937812986, + "learning_rate": 2.0273083753300724e-08, + "logits/chosen": -2.291124105453491, + "logits/rejected": -1.9985876083374023, + "logps/chosen": -14.099342346191406, + "logps/rejected": -640.3782348632812, + "logps_avg/chosen": -0.07880159467458725, + "logps_avg/rejected": -3.5815834999084473, + "loss": 0.0777, + "losses_ref": -0.0009120380273088813, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8410, + "u": -5.433448314666748, + "weight": 0.04484781622886658 + }, + { + "diff_generated": -33.46442794799805, + "epoch": 2.728451069345431, + "grad_norm": 3.0081522859711636, + "learning_rate": 1.980172579066899e-08, + "logits/chosen": -2.3004908561706543, + "logits/rejected": -2.056856870651245, + "logps/chosen": -14.426023483276367, + "logps/rejected": -644.3485717773438, + "logps_avg/chosen": -0.08110615611076355, + "logps_avg/rejected": -3.346442699432373, + "loss": 0.0767, + "losses_ref": -0.0003365448210388422, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8420, + "u": -5.472744464874268, + "weight": 0.03788283094763756 + }, + { + "diff_generated": -35.44267272949219, + "epoch": 2.731691510045366, + "grad_norm": 2.665139502680527, + "learning_rate": 1.9335773016604608e-08, + "logits/chosen": -2.2856247425079346, + "logits/rejected": -2.055262804031372, + "logps/chosen": -14.615339279174805, + "logps/rejected": -697.371826171875, + "logps_avg/chosen": -0.07996072620153427, + "logps_avg/rejected": -3.544267177581787, + "loss": 0.0777, + "losses_ref": -0.000313569646095857, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8430, + "u": -5.545459747314453, + "weight": 0.025315571576356888 + }, + { + "diff_generated": -35.521690368652344, + "epoch": 2.7349319507453016, + "grad_norm": 2.7734083827432023, + "learning_rate": 1.887523205544741e-08, + "logits/chosen": -2.2610440254211426, + "logits/rejected": -2.050652027130127, + "logps/chosen": -14.45335865020752, + "logps/rejected": -622.8594970703125, + "logps_avg/chosen": -0.07939153164625168, + "logps_avg/rejected": -3.552168607711792, + "loss": 0.0766, + "losses_ref": -0.0003281990939285606, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8440, + "u": -5.403284072875977, + "weight": 0.05036498233675957 + }, + { + "diff_generated": -36.24019241333008, + "epoch": 2.7381723914452367, + "grad_norm": 2.7395054198494893, + "learning_rate": 1.8420109454598997e-08, + "logits/chosen": -2.304332971572876, + "logits/rejected": -2.057947874069214, + "logps/chosen": -12.37016487121582, + "logps/rejected": -682.6101684570312, + "logps_avg/chosen": -0.07354007661342621, + "logps_avg/rejected": -3.624018907546997, + "loss": 0.0773, + "losses_ref": -0.0003148287651129067, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8450, + "u": -5.472707271575928, + "weight": 0.0378398522734642 + }, + { + "diff_generated": -32.481624603271484, + "epoch": 2.741412832145172, + "grad_norm": 2.8390132929113516, + "learning_rate": 1.797041168442921e-08, + "logits/chosen": -2.293044328689575, + "logits/rejected": -2.065861701965332, + "logps/chosen": -14.110382080078125, + "logps/rejected": -585.2341918945312, + "logps_avg/chosen": -0.0777052789926529, + "logps_avg/rejected": -3.2481625080108643, + "loss": 0.078, + "losses_ref": -0.00031997732003219426, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8460, + "u": -5.43752908706665, + "weight": 0.04411545395851135 + }, + { + "diff_generated": -34.74770736694336, + "epoch": 2.744653272845107, + "grad_norm": 2.7392703290260005, + "learning_rate": 1.7526145138184377e-08, + "logits/chosen": -2.314601182937622, + "logits/rejected": -2.0743463039398193, + "logps/chosen": -13.695550918579102, + "logps/rejected": -634.08740234375, + "logps_avg/chosen": -0.07747956365346909, + "logps_avg/rejected": -3.474771022796631, + "loss": 0.0782, + "losses_ref": -0.0006029107025824487, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8470, + "u": -5.472765922546387, + "weight": 0.0382402203977108 + }, + { + "diff_generated": -34.0060920715332, + "epoch": 2.747893713545042, + "grad_norm": 2.8765591727104396, + "learning_rate": 1.708731613189669e-08, + "logits/chosen": -2.3314247131347656, + "logits/rejected": -2.0638935565948486, + "logps/chosen": -15.631362915039062, + "logps/rejected": -601.2354736328125, + "logps_avg/chosen": -0.0813279002904892, + "logps_avg/rejected": -3.400609254837036, + "loss": 0.0773, + "losses_ref": -0.00018041368457488716, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8480, + "u": -5.435568332672119, + "weight": 0.04391627386212349 + }, + { + "diff_generated": -35.1057243347168, + "epoch": 2.751134154244977, + "grad_norm": 2.781336468432542, + "learning_rate": 1.6653930904293677e-08, + "logits/chosen": -2.2818095684051514, + "logits/rejected": -2.0274269580841064, + "logps/chosen": -14.73913860321045, + "logps/rejected": -648.7092895507812, + "logps_avg/chosen": -0.07969610393047333, + "logps_avg/rejected": -3.5105724334716797, + "loss": 0.0764, + "losses_ref": -0.0015385873848572373, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8490, + "u": -5.467836856842041, + "weight": 0.03979206085205078 + }, + { + "diff_generated": -38.000701904296875, + "epoch": 2.7543745949449123, + "grad_norm": 2.7782768367868798, + "learning_rate": 1.6225995616710297e-08, + "logits/chosen": -2.2996420860290527, + "logits/rejected": -2.0055785179138184, + "logps/chosen": -14.530261039733887, + "logps/rejected": -654.1446533203125, + "logps_avg/chosen": -0.07845546305179596, + "logps_avg/rejected": -3.800069808959961, + "loss": 0.0748, + "losses_ref": -0.0009555866126902401, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8500, + "u": -5.5420684814453125, + "weight": 0.026282688602805138 + }, + { + "diff_generated": -35.94993209838867, + "epoch": 2.7576150356448474, + "grad_norm": 2.7776443099222017, + "learning_rate": 1.58035163530009e-08, + "logits/chosen": -2.367295026779175, + "logits/rejected": -2.0653717517852783, + "logps/chosen": -15.043771743774414, + "logps/rejected": -671.5582275390625, + "logps_avg/chosen": -0.07488191872835159, + "logps_avg/rejected": -3.5949931144714355, + "loss": 0.0771, + "losses_ref": -0.0011525630252435803, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8510, + "u": -5.470739364624023, + "weight": 0.03922674432396889 + }, + { + "diff_generated": -34.60573959350586, + "epoch": 2.760855476344783, + "grad_norm": 2.773851757598069, + "learning_rate": 1.538649911945291e-08, + "logits/chosen": -2.2526590824127197, + "logits/rejected": -2.052077531814575, + "logps/chosen": -14.62193489074707, + "logps/rejected": -670.3350830078125, + "logps_avg/chosen": -0.0839783325791359, + "logps_avg/rejected": -3.4605743885040283, + "loss": 0.079, + "losses_ref": -0.0002155410184059292, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8520, + "u": -5.506975173950195, + "weight": 0.03145802766084671 + }, + { + "diff_generated": -34.427696228027344, + "epoch": 2.764095917044718, + "grad_norm": 2.9143017584146054, + "learning_rate": 1.497494984470107e-08, + "logits/chosen": -2.257265567779541, + "logits/rejected": -2.0002946853637695, + "logps/chosen": -15.755085945129395, + "logps/rejected": -644.8309936523438, + "logps_avg/chosen": -0.08270631730556488, + "logps_avg/rejected": -3.4427692890167236, + "loss": 0.0774, + "losses_ref": -0.001095159212127328, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8530, + "u": -5.358209609985352, + "weight": 0.057795751839876175 + }, + { + "diff_generated": -31.565780639648438, + "epoch": 2.7673363577446533, + "grad_norm": 2.716723351573739, + "learning_rate": 1.4568874379643936e-08, + "logits/chosen": -2.290512800216675, + "logits/rejected": -2.0521178245544434, + "logps/chosen": -12.882516860961914, + "logps/rejected": -598.2491455078125, + "logps_avg/chosen": -0.06945003569126129, + "logps_avg/rejected": -3.1565780639648438, + "loss": 0.075, + "losses_ref": -0.0006087241927161813, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8540, + "u": -5.04725456237793, + "weight": 0.113297238945961 + }, + { + "diff_generated": -33.27885055541992, + "epoch": 2.7705767984445884, + "grad_norm": 2.7208468194715025, + "learning_rate": 1.4168278497359798e-08, + "logits/chosen": -2.32669734954834, + "logits/rejected": -2.113250255584717, + "logps/chosen": -13.624165534973145, + "logps/rejected": -610.7620849609375, + "logps_avg/chosen": -0.07655289769172668, + "logps_avg/rejected": -3.327885150909424, + "loss": 0.0757, + "losses_ref": -0.00013436308654490858, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8550, + "u": -5.400485038757324, + "weight": 0.05012362450361252 + }, + { + "diff_generated": -34.342716217041016, + "epoch": 2.7738172391445235, + "grad_norm": 2.768052375106582, + "learning_rate": 1.3773167893025161e-08, + "logits/chosen": -2.2872402667999268, + "logits/rejected": -2.050503730773926, + "logps/chosen": -15.019485473632812, + "logps/rejected": -664.238525390625, + "logps_avg/chosen": -0.0805942639708519, + "logps_avg/rejected": -3.4342715740203857, + "loss": 0.0769, + "losses_ref": -0.0003937376313842833, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8560, + "u": -5.361881732940674, + "weight": 0.056711576879024506 + }, + { + "diff_generated": -33.52961730957031, + "epoch": 2.777057679844459, + "grad_norm": 2.779576150611529, + "learning_rate": 1.3383548183833715e-08, + "logits/chosen": -2.2941019535064697, + "logits/rejected": -2.067523241043091, + "logps/chosen": -14.290433883666992, + "logps/rejected": -590.431884765625, + "logps_avg/chosen": -0.08122588694095612, + "logps_avg/rejected": -3.352961778640747, + "loss": 0.0768, + "losses_ref": -0.0005186675698496401, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8570, + "u": -5.29172420501709, + "weight": 0.06934425234794617 + }, + { + "diff_generated": -36.81610107421875, + "epoch": 2.780298120544394, + "grad_norm": 2.7797296431792393, + "learning_rate": 1.2999424908916346e-08, + "logits/chosen": -2.2424275875091553, + "logits/rejected": -1.9814599752426147, + "logps/chosen": -14.798460006713867, + "logps/rejected": -660.2183227539062, + "logps_avg/chosen": -0.08434576541185379, + "logps_avg/rejected": -3.681610107421875, + "loss": 0.0764, + "losses_ref": -0.00028714913059957325, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8580, + "u": -5.438215732574463, + "weight": 0.04404758661985397 + }, + { + "diff_generated": -33.61157989501953, + "epoch": 2.7835385612443293, + "grad_norm": 2.783877857928745, + "learning_rate": 1.2620803529262357e-08, + "logits/chosen": -2.2669458389282227, + "logits/rejected": -2.053138256072998, + "logps/chosen": -12.835203170776367, + "logps/rejected": -592.4732055664062, + "logps_avg/chosen": -0.07494383305311203, + "logps_avg/rejected": -3.3611583709716797, + "loss": 0.0747, + "losses_ref": -0.0006721061654388905, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8590, + "u": -5.401367664337158, + "weight": 0.05073683336377144 + }, + { + "diff_generated": -33.477386474609375, + "epoch": 2.7867790019442644, + "grad_norm": 2.7995289130877468, + "learning_rate": 1.2247689427642027e-08, + "logits/chosen": -2.3007161617279053, + "logits/rejected": -2.055169105529785, + "logps/chosen": -14.55711555480957, + "logps/rejected": -620.1129150390625, + "logps_avg/chosen": -0.0780087560415268, + "logps_avg/rejected": -3.347738742828369, + "loss": 0.0764, + "losses_ref": -0.0003297061484772712, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8600, + "u": -5.402604103088379, + "weight": 0.05038810521364212 + }, + { + "diff_generated": -37.25822830200195, + "epoch": 2.7900194426441995, + "grad_norm": 2.8526707944449226, + "learning_rate": 1.1880087908529945e-08, + "logits/chosen": -2.26621675491333, + "logits/rejected": -2.0047826766967773, + "logps/chosen": -14.627006530761719, + "logps/rejected": -665.5721435546875, + "logps_avg/chosen": -0.08128681033849716, + "logps_avg/rejected": -3.725823163986206, + "loss": 0.0769, + "losses_ref": -0.0007085074321366847, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8610, + "u": -5.400392055511475, + "weight": 0.0508403554558754 + }, + { + "diff_generated": -34.120948791503906, + "epoch": 2.7932598833441347, + "grad_norm": 2.535138540278953, + "learning_rate": 1.1518004198029529e-08, + "logits/chosen": -2.300736665725708, + "logits/rejected": -2.0538346767425537, + "logps/chosen": -15.004858016967773, + "logps/rejected": -642.0592041015625, + "logps_avg/chosen": -0.0812058076262474, + "logps_avg/rejected": -3.412094831466675, + "loss": 0.0772, + "losses_ref": -0.00012083786714356393, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8620, + "u": -5.397956371307373, + "weight": 0.05010969564318657 + }, + { + "diff_generated": -32.81501770019531, + "epoch": 2.79650032404407, + "grad_norm": 2.877966483352646, + "learning_rate": 1.1161443443798946e-08, + "logits/chosen": -2.2787914276123047, + "logits/rejected": -2.059709072113037, + "logps/chosen": -13.227984428405762, + "logps/rejected": -628.3048095703125, + "logps_avg/chosen": -0.07205172628164291, + "logps_avg/rejected": -3.281501293182373, + "loss": 0.077, + "losses_ref": -5.9588219301076606e-05, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8630, + "u": -5.293186187744141, + "weight": 0.06879880279302597 + }, + { + "diff_generated": -36.92168426513672, + "epoch": 2.7997407647440054, + "grad_norm": 2.9731936844378373, + "learning_rate": 1.0810410714977747e-08, + "logits/chosen": -2.236611843109131, + "logits/rejected": -1.9774243831634521, + "logps/chosen": -15.089825630187988, + "logps/rejected": -671.8287963867188, + "logps_avg/chosen": -0.08256001025438309, + "logps_avg/rejected": -3.692168712615967, + "loss": 0.0774, + "losses_ref": -0.00027638330357149243, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8640, + "u": -5.357158184051514, + "weight": 0.05651743337512016 + }, + { + "diff_generated": -36.185157775878906, + "epoch": 2.8029812054439405, + "grad_norm": 3.0221447823428393, + "learning_rate": 1.0464911002114885e-08, + "logits/chosen": -2.2851452827453613, + "logits/rejected": -2.084177255630493, + "logps/chosen": -14.222898483276367, + "logps/rejected": -670.3880004882812, + "logps_avg/chosen": -0.08353248238563538, + "logps_avg/rejected": -3.6185154914855957, + "loss": 0.0767, + "losses_ref": -0.0012774534989148378, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8650, + "u": -5.578444004058838, + "weight": 0.020543891936540604 + }, + { + "diff_generated": -34.61139678955078, + "epoch": 2.8062216461438756, + "grad_norm": 2.7265514642437094, + "learning_rate": 1.0124949217097656e-08, + "logits/chosen": -2.317802667617798, + "logits/rejected": -2.0749332904815674, + "logps/chosen": -12.769566535949707, + "logps/rejected": -647.1751708984375, + "logps_avg/chosen": -0.07187042385339737, + "logps_avg/rejected": -3.461139678955078, + "loss": 0.0773, + "losses_ref": -0.0005036048823967576, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8660, + "u": -5.4337568283081055, + "weight": 0.04427201300859451 + }, + { + "diff_generated": -36.47317886352539, + "epoch": 2.8094620868438107, + "grad_norm": 2.8915693908709352, + "learning_rate": 9.790530193082114e-09, + "logits/chosen": -2.285475730895996, + "logits/rejected": -2.0016865730285645, + "logps/chosen": -15.852392196655273, + "logps/rejected": -661.9172973632812, + "logps_avg/chosen": -0.08655592799186707, + "logps_avg/rejected": -3.647317409515381, + "loss": 0.0802, + "losses_ref": -0.0001104946932173334, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8670, + "u": -5.475028038024902, + "weight": 0.03760233893990517 + }, + { + "diff_generated": -34.181583404541016, + "epoch": 2.812702527543746, + "grad_norm": 2.895358001000874, + "learning_rate": 9.461658684423968e-09, + "logits/chosen": -2.2664928436279297, + "logits/rejected": -2.0277068614959717, + "logps/chosen": -15.316492080688477, + "logps/rejected": -675.1531982421875, + "logps_avg/chosen": -0.08478286117315292, + "logps_avg/rejected": -3.4181582927703857, + "loss": 0.0785, + "losses_ref": -0.00024127769574988633, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8680, + "u": -5.331995964050293, + "weight": 0.06275250762701035 + }, + { + "diff_generated": -34.03679656982422, + "epoch": 2.8159429682436814, + "grad_norm": 2.86929217256149, + "learning_rate": 9.138339366611526e-09, + "logits/chosen": -2.335113763809204, + "logits/rejected": -2.091174840927124, + "logps/chosen": -13.370208740234375, + "logps/rejected": -636.9915161132812, + "logps_avg/chosen": -0.07555247843265533, + "logps_avg/rejected": -3.403679609298706, + "loss": 0.0765, + "losses_ref": -0.0005011368775740266, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8690, + "u": -5.543493747711182, + "weight": 0.025575250387191772 + }, + { + "diff_generated": -34.14396286010742, + "epoch": 2.8191834089436165, + "grad_norm": 2.7985689135675913, + "learning_rate": 8.82057683619859e-09, + "logits/chosen": -2.233975887298584, + "logits/rejected": -2.043182611465454, + "logps/chosen": -11.014206886291504, + "logps/rejected": -620.0697631835938, + "logps_avg/chosen": -0.07187635451555252, + "logps_avg/rejected": -3.414396286010742, + "loss": 0.0737, + "losses_ref": -0.0006925543420948088, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8700, + "u": -5.368803977966309, + "weight": 0.05707864090800285 + }, + { + "diff_generated": -35.2857666015625, + "epoch": 2.8224238496435516, + "grad_norm": 2.780204667008307, + "learning_rate": 8.508375610739626e-09, + "logits/chosen": -2.307744264602661, + "logits/rejected": -2.0247180461883545, + "logps/chosen": -14.565858840942383, + "logps/rejected": -652.158935546875, + "logps_avg/chosen": -0.07844166457653046, + "logps_avg/rejected": -3.528576612472534, + "loss": 0.0767, + "losses_ref": -0.0003016654518432915, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8710, + "u": -5.438412666320801, + "weight": 0.04406419396400452 + }, + { + "diff_generated": -35.18162536621094, + "epoch": 2.8256642903434868, + "grad_norm": 2.952787034975414, + "learning_rate": 8.201740128725365e-09, + "logits/chosen": -2.264735221862793, + "logits/rejected": -2.0731773376464844, + "logps/chosen": -12.935811996459961, + "logps/rejected": -644.2762451171875, + "logps_avg/chosen": -0.0769585520029068, + "logps_avg/rejected": -3.518162488937378, + "loss": 0.0749, + "losses_ref": -0.0003010678628925234, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8720, + "u": -5.331357479095459, + "weight": 0.0628284215927124 + }, + { + "diff_generated": -33.50090789794922, + "epoch": 2.828904731043422, + "grad_norm": 2.6378440705183106, + "learning_rate": 7.900674749519564e-09, + "logits/chosen": -2.302125930786133, + "logits/rejected": -2.0807623863220215, + "logps/chosen": -14.297933578491211, + "logps/rejected": -624.4725341796875, + "logps_avg/chosen": -0.07768501341342926, + "logps_avg/rejected": -3.3500912189483643, + "loss": 0.0767, + "losses_ref": -0.00035457880585454404, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8730, + "u": -5.366321563720703, + "weight": 0.05664187669754028 + }, + { + "diff_generated": -36.1597785949707, + "epoch": 2.832145171743357, + "grad_norm": 2.731065373320376, + "learning_rate": 7.605183753297283e-09, + "logits/chosen": -2.332219123840332, + "logits/rejected": -2.050250768661499, + "logps/chosen": -14.659116744995117, + "logps/rejected": -635.1754150390625, + "logps_avg/chosen": -0.07973220199346542, + "logps_avg/rejected": -3.6159775257110596, + "loss": 0.0765, + "losses_ref": -9.081260213861242e-05, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8740, + "u": -5.324737548828125, + "weight": 0.06257982552051544 + }, + { + "diff_generated": -32.69268798828125, + "epoch": 2.835385612443292, + "grad_norm": 2.732954337771039, + "learning_rate": 7.315271340983731e-09, + "logits/chosen": -2.301483154296875, + "logits/rejected": -2.0711069107055664, + "logps/chosen": -13.039764404296875, + "logps/rejected": -624.9254150390625, + "logps_avg/chosen": -0.07299993187189102, + "logps_avg/rejected": -3.269268751144409, + "loss": 0.0752, + "losses_ref": -0.00042771859443746507, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8750, + "u": -5.33248233795166, + "weight": 0.0629906952381134 + }, + { + "diff_generated": -36.09053421020508, + "epoch": 2.8386260531432272, + "grad_norm": 2.672121170797753, + "learning_rate": 7.030941634194932e-09, + "logits/chosen": -2.306715726852417, + "logits/rejected": -2.0734031200408936, + "logps/chosen": -14.156460762023926, + "logps/rejected": -667.5546264648438, + "logps_avg/chosen": -0.07908565551042557, + "logps_avg/rejected": -3.609053373336792, + "loss": 0.079, + "losses_ref": -0.00014179803838487715, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8760, + "u": -5.332102298736572, + "weight": 0.0626293420791626 + }, + { + "diff_generated": -35.369422912597656, + "epoch": 2.841866493843163, + "grad_norm": 3.2762929988972, + "learning_rate": 6.752198675178711e-09, + "logits/chosen": -2.310044765472412, + "logits/rejected": -2.0577964782714844, + "logps/chosen": -12.950109481811523, + "logps/rejected": -658.7679443359375, + "logps_avg/chosen": -0.07082493603229523, + "logps_avg/rejected": -3.53694224357605, + "loss": 0.0751, + "losses_ref": -0.0007642454584129155, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8770, + "u": -5.402093410491943, + "weight": 0.05093027278780937 + }, + { + "diff_generated": -33.57276153564453, + "epoch": 2.845106934543098, + "grad_norm": 2.9315528501466384, + "learning_rate": 6.479046426757584e-09, + "logits/chosen": -2.2507405281066895, + "logits/rejected": -1.9945474863052368, + "logps/chosen": -13.323570251464844, + "logps/rejected": -611.6863403320312, + "logps_avg/chosen": -0.07454784214496613, + "logps_avg/rejected": -3.3572757244110107, + "loss": 0.0749, + "losses_ref": -0.00011777288455050439, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8780, + "u": -5.293331623077393, + "weight": 0.06885615736246109 + }, + { + "diff_generated": -33.651031494140625, + "epoch": 2.848347375243033, + "grad_norm": 2.9451502753913834, + "learning_rate": 6.211488772272133e-09, + "logits/chosen": -2.257718324661255, + "logits/rejected": -2.0689949989318848, + "logps/chosen": -12.400961875915527, + "logps/rejected": -665.5693969726562, + "logps_avg/chosen": -0.07117728888988495, + "logps_avg/rejected": -3.365103244781494, + "loss": 0.0754, + "losses_ref": -0.0011662624310702085, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8790, + "u": -5.36535120010376, + "weight": 0.05799577385187149 + }, + { + "diff_generated": -32.80426788330078, + "epoch": 2.851587815942968, + "grad_norm": 2.7998741601476436, + "learning_rate": 5.9495295155260305e-09, + "logits/chosen": -2.3018977642059326, + "logits/rejected": -2.0845401287078857, + "logps/chosen": -14.197778701782227, + "logps/rejected": -623.5587158203125, + "logps_avg/chosen": -0.07896491140127182, + "logps_avg/rejected": -3.2804272174835205, + "loss": 0.0784, + "losses_ref": -0.00036378385266289115, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8800, + "u": -5.40054178237915, + "weight": 0.050371408462524414 + }, + { + "diff_generated": -35.56111526489258, + "epoch": 2.8548282566429033, + "grad_norm": 2.9920608913232547, + "learning_rate": 5.69317238073177e-09, + "logits/chosen": -2.272759199142456, + "logits/rejected": -2.0058791637420654, + "logps/chosen": -13.803054809570312, + "logps/rejected": -639.3179931640625, + "logps_avg/chosen": -0.07773126661777496, + "logps_avg/rejected": -3.5561110973358154, + "loss": 0.0749, + "losses_ref": -0.00025532610015943646, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8810, + "u": -5.288957118988037, + "weight": 0.06902964413166046 + }, + { + "diff_generated": -34.240325927734375, + "epoch": 2.858068697342839, + "grad_norm": 2.6821027741068324, + "learning_rate": 5.442421012457909e-09, + "logits/chosen": -2.240999698638916, + "logits/rejected": -2.008993148803711, + "logps/chosen": -12.136984825134277, + "logps/rejected": -616.26416015625, + "logps_avg/chosen": -0.07080712169408798, + "logps_avg/rejected": -3.424032688140869, + "loss": 0.0753, + "losses_ref": -9.598202450433746e-05, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8820, + "u": -5.154082775115967, + "weight": 0.09383802115917206 + }, + { + "diff_generated": -36.475528717041016, + "epoch": 2.861309138042774, + "grad_norm": 3.178350218749426, + "learning_rate": 5.197278975577069e-09, + "logits/chosen": -2.230213165283203, + "logits/rejected": -1.9734901189804077, + "logps/chosen": -13.831930160522461, + "logps/rejected": -667.8572998046875, + "logps_avg/chosen": -0.08242715895175934, + "logps_avg/rejected": -3.647552967071533, + "loss": 0.0771, + "losses_ref": -0.0016425810754299164, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8830, + "u": -5.473348617553711, + "weight": 0.039764031767845154 + }, + { + "diff_generated": -33.9207763671875, + "epoch": 2.864549578742709, + "grad_norm": 2.8690068694207493, + "learning_rate": 4.957749755215346e-09, + "logits/chosen": -2.2395389080047607, + "logits/rejected": -2.0926663875579834, + "logps/chosen": -12.328548431396484, + "logps/rejected": -645.32275390625, + "logps_avg/chosen": -0.07605170458555222, + "logps_avg/rejected": -3.392077684402466, + "loss": 0.0791, + "losses_ref": -0.0004766159108839929, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8840, + "u": -5.4698357582092285, + "weight": 0.03803374990820885 + }, + { + "diff_generated": -36.080318450927734, + "epoch": 2.8677900194426442, + "grad_norm": 2.9374770724521193, + "learning_rate": 4.723836756702848e-09, + "logits/chosen": -2.2504429817199707, + "logits/rejected": -1.9984312057495117, + "logps/chosen": -12.243127822875977, + "logps/rejected": -637.8248291015625, + "logps_avg/chosen": -0.07207809388637543, + "logps_avg/rejected": -3.6080322265625, + "loss": 0.0759, + "losses_ref": -0.0004624236316885799, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8850, + "u": -5.293095588684082, + "weight": 0.0692666620016098 + }, + { + "diff_generated": -34.3936653137207, + "epoch": 2.8710304601425793, + "grad_norm": 2.863835701676596, + "learning_rate": 4.495543305524974e-09, + "logits/chosen": -2.269010066986084, + "logits/rejected": -2.0275280475616455, + "logps/chosen": -13.334935188293457, + "logps/rejected": -628.5553588867188, + "logps_avg/chosen": -0.0751592367887497, + "logps_avg/rejected": -3.439366102218628, + "loss": 0.077, + "losses_ref": -0.00043837359407916665, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8860, + "u": -5.3313493728637695, + "weight": 0.06295828521251678 + }, + { + "diff_generated": -35.203651428222656, + "epoch": 2.8742709008425145, + "grad_norm": 2.8246905754348792, + "learning_rate": 4.2728726472756934e-09, + "logits/chosen": -2.278592109680176, + "logits/rejected": -2.0227110385894775, + "logps/chosen": -15.298222541809082, + "logps/rejected": -655.2586669921875, + "logps_avg/chosen": -0.08556310832500458, + "logps_avg/rejected": -3.5203652381896973, + "loss": 0.0781, + "losses_ref": -0.0014250215608626604, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8870, + "u": -5.466298580169678, + "weight": 0.03932160139083862 + }, + { + "diff_generated": -37.24098205566406, + "epoch": 2.8775113415424496, + "grad_norm": 2.9024973540127244, + "learning_rate": 4.055827947610746e-09, + "logits/chosen": -2.273810625076294, + "logits/rejected": -2.0100204944610596, + "logps/chosen": -14.425886154174805, + "logps/rejected": -702.1931762695312, + "logps_avg/chosen": -0.07655589282512665, + "logps_avg/rejected": -3.7240982055664062, + "loss": 0.0751, + "losses_ref": -0.0006063595064915717, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8880, + "u": -5.471449851989746, + "weight": 0.03819319233298302 + }, + { + "diff_generated": -37.44664764404297, + "epoch": 2.8807517822423847, + "grad_norm": 2.9010138738691547, + "learning_rate": 3.844412292203092e-09, + "logits/chosen": -2.254565954208374, + "logits/rejected": -1.9562934637069702, + "logps/chosen": -13.297273635864258, + "logps/rejected": -671.3079223632812, + "logps_avg/chosen": -0.07370129227638245, + "logps_avg/rejected": -3.7446651458740234, + "loss": 0.0759, + "losses_ref": -0.00030630582477897406, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8890, + "u": -5.402109622955322, + "weight": 0.05031546205282211 + }, + { + "diff_generated": -33.91117477416992, + "epoch": 2.8839922229423203, + "grad_norm": 2.78041629343256, + "learning_rate": 3.638628686698908e-09, + "logits/chosen": -2.2411606311798096, + "logits/rejected": -1.9876295328140259, + "logps/chosen": -14.107019424438477, + "logps/rejected": -651.743408203125, + "logps_avg/chosen": -0.07398734986782074, + "logps_avg/rejected": -3.3911170959472656, + "loss": 0.0766, + "losses_ref": -0.0007171139004640281, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8900, + "u": -5.2610578536987305, + "weight": 0.07587677985429764 + }, + { + "diff_generated": -34.82379913330078, + "epoch": 2.8872326636422554, + "grad_norm": 3.1121766646536337, + "learning_rate": 3.438480056674864e-09, + "logits/chosen": -2.277547597885132, + "logits/rejected": -2.014165163040161, + "logps/chosen": -13.730894088745117, + "logps/rejected": -658.84423828125, + "logps_avg/chosen": -0.07599518448114395, + "logps_avg/rejected": -3.482379913330078, + "loss": 0.0769, + "losses_ref": -0.0007300475845113397, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8910, + "u": -5.328692436218262, + "weight": 0.06340476125478745 + }, + { + "diff_generated": -35.029197692871094, + "epoch": 2.8904731043421905, + "grad_norm": 2.96319952309038, + "learning_rate": 3.243969247596423e-09, + "logits/chosen": -2.2526772022247314, + "logits/rejected": -2.013214111328125, + "logps/chosen": -13.636917114257812, + "logps/rejected": -669.37939453125, + "logps_avg/chosen": -0.07388485968112946, + "logps_avg/rejected": -3.5029196739196777, + "loss": 0.0791, + "losses_ref": -0.00036009997711516917, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8920, + "u": -5.543162822723389, + "weight": 0.025403300300240517 + }, + { + "diff_generated": -35.807281494140625, + "epoch": 2.8937135450421256, + "grad_norm": 2.831128371216867, + "learning_rate": 3.0550990247776522e-09, + "logits/chosen": -2.2789690494537354, + "logits/rejected": -2.0398330688476562, + "logps/chosen": -12.486102104187012, + "logps/rejected": -649.1627197265625, + "logps_avg/chosen": -0.07312561571598053, + "logps_avg/rejected": -3.580728530883789, + "loss": 0.0771, + "losses_ref": -0.001107497839257121, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8930, + "u": -5.4004716873168945, + "weight": 0.05142833665013313 + }, + { + "diff_generated": -35.70695495605469, + "epoch": 2.8969539857420608, + "grad_norm": 2.79925996934682, + "learning_rate": 2.871872073341608e-09, + "logits/chosen": -2.2894484996795654, + "logits/rejected": -2.0516390800476074, + "logps/chosen": -13.297342300415039, + "logps/rejected": -691.9161376953125, + "logps_avg/chosen": -0.07701648771762848, + "logps_avg/rejected": -3.570695400238037, + "loss": 0.076, + "losses_ref": -0.0008863200200721622, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8940, + "u": -5.4015045166015625, + "weight": 0.05104244500398636 + }, + { + "diff_generated": -33.015342712402344, + "epoch": 2.9001944264419963, + "grad_norm": 3.05899367637055, + "learning_rate": 2.694290998182325e-09, + "logits/chosen": -2.293231964111328, + "logits/rejected": -2.0784249305725098, + "logps/chosen": -14.234578132629395, + "logps/rejected": -644.7971801757812, + "logps_avg/chosen": -0.0775744691491127, + "logps_avg/rejected": -3.3015339374542236, + "loss": 0.0775, + "losses_ref": -0.0009552057599648833, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8950, + "u": -5.434551239013672, + "weight": 0.04488217085599899 + }, + { + "diff_generated": -34.278602600097656, + "epoch": 2.9034348671419314, + "grad_norm": 2.9237886474738604, + "learning_rate": 2.52235832392782e-09, + "logits/chosen": -2.2705326080322266, + "logits/rejected": -2.050858736038208, + "logps/chosen": -13.253606796264648, + "logps/rejected": -638.6285400390625, + "logps_avg/chosen": -0.07300833612680435, + "logps_avg/rejected": -3.4278602600097656, + "loss": 0.0777, + "losses_ref": -0.0007817854057066143, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8960, + "u": -5.468316555023193, + "weight": 0.03847404569387436 + }, + { + "diff_generated": -35.571598052978516, + "epoch": 2.9066753078418666, + "grad_norm": 2.689594232404768, + "learning_rate": 2.35607649490408e-09, + "logits/chosen": -2.3017730712890625, + "logits/rejected": -2.005580186843872, + "logps/chosen": -15.054954528808594, + "logps/rejected": -660.2969970703125, + "logps_avg/chosen": -0.07933951914310455, + "logps_avg/rejected": -3.557159900665283, + "loss": 0.0779, + "losses_ref": -0.0001488261332269758, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8970, + "u": -5.6155853271484375, + "weight": 0.012644929811358452 + }, + { + "diff_generated": -34.998565673828125, + "epoch": 2.9099157485418017, + "grad_norm": 3.1797481778973307, + "learning_rate": 2.1954478751003313e-09, + "logits/chosen": -2.2666759490966797, + "logits/rejected": -2.0279688835144043, + "logps/chosen": -11.937381744384766, + "logps/rejected": -630.4774780273438, + "logps_avg/chosen": -0.06900829821825027, + "logps_avg/rejected": -3.499856472015381, + "loss": 0.0746, + "losses_ref": -0.0006766252918168902, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8980, + "u": -5.366180896759033, + "weight": 0.05714557692408562 + }, + { + "diff_generated": -35.84283447265625, + "epoch": 2.913156189241737, + "grad_norm": 2.8220235271975103, + "learning_rate": 2.040474748135512e-09, + "logits/chosen": -2.2554118633270264, + "logits/rejected": -2.0103278160095215, + "logps/chosen": -13.508143424987793, + "logps/rejected": -655.4435424804688, + "logps_avg/chosen": -0.07491391152143478, + "logps_avg/rejected": -3.5842833518981934, + "loss": 0.0764, + "losses_ref": -0.00030079128919169307, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 8990, + "u": -5.29534387588501, + "weight": 0.06904434412717819 + }, + { + "diff_generated": -32.51457977294922, + "epoch": 2.916396629941672, + "grad_norm": 2.909222858975261, + "learning_rate": 1.8911593172258544e-09, + "logits/chosen": -2.2562718391418457, + "logits/rejected": -2.0374486446380615, + "logps/chosen": -13.960149765014648, + "logps/rejected": -612.7658081054688, + "logps_avg/chosen": -0.07424553483724594, + "logps_avg/rejected": -3.2514584064483643, + "loss": 0.0773, + "losses_ref": -0.0002548511838540435, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9000, + "u": -5.330750942230225, + "weight": 0.06278066337108612 + }, + { + "diff_generated": -33.9921760559082, + "epoch": 2.919637070641607, + "grad_norm": 2.8440369601590207, + "learning_rate": 1.7475037051532638e-09, + "logits/chosen": -2.2932300567626953, + "logits/rejected": -2.03836727142334, + "logps/chosen": -14.461469650268555, + "logps/rejected": -643.2342529296875, + "logps_avg/chosen": -0.0805402547121048, + "logps_avg/rejected": -3.3992176055908203, + "loss": 0.0777, + "losses_ref": -0.00012567141675390303, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9010, + "u": -5.36099910736084, + "weight": 0.05636243149638176 + }, + { + "diff_generated": -33.33674240112305, + "epoch": 2.9228775113415426, + "grad_norm": 2.7812330472973668, + "learning_rate": 1.609509954235566e-09, + "logits/chosen": -2.2596919536590576, + "logits/rejected": -2.0829384326934814, + "logps/chosen": -13.54491138458252, + "logps/rejected": -645.8247680664062, + "logps_avg/chosen": -0.0812302976846695, + "logps_avg/rejected": -3.333674669265747, + "loss": 0.0753, + "losses_ref": -0.0008088911999948323, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9020, + "u": -5.473405361175537, + "weight": 0.03860088437795639 + }, + { + "diff_generated": -34.202003479003906, + "epoch": 2.9261179520414777, + "grad_norm": 3.025771626101646, + "learning_rate": 1.4771800262970203e-09, + "logits/chosen": -2.244457960128784, + "logits/rejected": -2.0527331829071045, + "logps/chosen": -14.208236694335938, + "logps/rejected": -643.656005859375, + "logps_avg/chosen": -0.0844058021903038, + "logps_avg/rejected": -3.4202003479003906, + "loss": 0.0764, + "losses_ref": -0.0002643067273311317, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9030, + "u": -5.436182975769043, + "weight": 0.044031400233507156 + }, + { + "diff_generated": -34.946044921875, + "epoch": 2.929358392741413, + "grad_norm": 2.7504660685046667, + "learning_rate": 1.3505158026408724e-09, + "logits/chosen": -2.258826494216919, + "logits/rejected": -2.033946990966797, + "logps/chosen": -16.03264808654785, + "logps/rejected": -628.7940673828125, + "logps_avg/chosen": -0.08787710964679718, + "logps_avg/rejected": -3.4946041107177734, + "loss": 0.076, + "losses_ref": -0.000992965535260737, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9040, + "u": -5.508906364440918, + "weight": 0.032481517642736435 + }, + { + "diff_generated": -35.106163024902344, + "epoch": 2.932598833441348, + "grad_norm": 2.646894119734614, + "learning_rate": 1.2295190840223125e-09, + "logits/chosen": -2.2918362617492676, + "logits/rejected": -2.0843429565429688, + "logps/chosen": -13.711763381958008, + "logps/rejected": -652.3299560546875, + "logps_avg/chosen": -0.07778448611497879, + "logps_avg/rejected": -3.5106163024902344, + "loss": 0.074, + "losses_ref": -0.0004062582738697529, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9050, + "u": -5.400134086608887, + "weight": 0.050418026745319366 + }, + { + "diff_generated": -34.945518493652344, + "epoch": 2.935839274141283, + "grad_norm": 2.731671929569239, + "learning_rate": 1.1141915906228928e-09, + "logits/chosen": -2.275437116622925, + "logits/rejected": -2.0286378860473633, + "logps/chosen": -12.794441223144531, + "logps/rejected": -651.5943603515625, + "logps_avg/chosen": -0.07423492521047592, + "logps_avg/rejected": -3.494551420211792, + "loss": 0.0755, + "losses_ref": -0.0004529617144726217, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9060, + "u": -5.432457447052002, + "weight": 0.04429139569401741 + }, + { + "diff_generated": -33.463844299316406, + "epoch": 2.9390797148412187, + "grad_norm": 2.9375110396431, + "learning_rate": 1.0045349620262379e-09, + "logits/chosen": -2.2780842781066895, + "logits/rejected": -2.049278974533081, + "logps/chosen": -13.470865249633789, + "logps/rejected": -626.6282958984375, + "logps_avg/chosen": -0.07648901641368866, + "logps_avg/rejected": -3.3463847637176514, + "loss": 0.0752, + "losses_ref": -0.00029730232199653983, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9070, + "u": -5.330691337585449, + "weight": 0.06280551105737686 + }, + { + "diff_generated": -33.51942443847656, + "epoch": 2.942320155541154, + "grad_norm": 2.8372455817426117, + "learning_rate": 9.005507571945958e-10, + "logits/chosen": -2.2972397804260254, + "logits/rejected": -1.9933170080184937, + "logps/chosen": -14.175498962402344, + "logps/rejected": -617.5062255859375, + "logps_avg/chosen": -0.07958875596523285, + "logps_avg/rejected": -3.351942539215088, + "loss": 0.0744, + "losses_ref": -0.00039287720574066043, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9080, + "u": -5.3291778564453125, + "weight": 0.0629565417766571 + }, + { + "diff_generated": -35.39436340332031, + "epoch": 2.945560596241089, + "grad_norm": 2.7022321365970066, + "learning_rate": 8.022404544466788e-10, + "logits/chosen": -2.286496877670288, + "logits/rejected": -2.0509543418884277, + "logps/chosen": -13.762849807739258, + "logps/rejected": -622.6043701171875, + "logps_avg/chosen": -0.07843898236751556, + "logps_avg/rejected": -3.5394368171691895, + "loss": 0.0746, + "losses_ref": -0.0006384230218827724, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9090, + "u": -5.439492225646973, + "weight": 0.044552553445100784 + }, + { + "diff_generated": -32.19483184814453, + "epoch": 2.948801036941024, + "grad_norm": 3.274540869163379, + "learning_rate": 7.096054514367455e-10, + "logits/chosen": -2.22894024848938, + "logits/rejected": -2.036980390548706, + "logps/chosen": -12.699054718017578, + "logps/rejected": -615.8016357421875, + "logps_avg/chosen": -0.07660754024982452, + "logps_avg/rejected": -3.2194831371307373, + "loss": 0.075, + "losses_ref": -0.000949250883422792, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9100, + "u": -5.186242580413818, + "weight": 0.08869956433773041 + }, + { + "diff_generated": -33.05915451049805, + "epoch": 2.952041477640959, + "grad_norm": 2.8465823055989925, + "learning_rate": 6.226470651346182e-10, + "logits/chosen": -2.2585575580596924, + "logits/rejected": -2.082470655441284, + "logps/chosen": -13.074926376342773, + "logps/rejected": -632.4801025390625, + "logps_avg/chosen": -0.07661643624305725, + "logps_avg/rejected": -3.3059158325195312, + "loss": 0.0765, + "losses_ref": -0.0003152258286718279, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9110, + "u": -5.401144027709961, + "weight": 0.05031600594520569 + }, + { + "diff_generated": -34.64142608642578, + "epoch": 2.9552819183408943, + "grad_norm": 2.778297006969446, + "learning_rate": 5.413665318070304e-10, + "logits/chosen": -2.264369487762451, + "logits/rejected": -2.0411882400512695, + "logps/chosen": -14.223039627075195, + "logps/rejected": -649.0701904296875, + "logps_avg/chosen": -0.08272770047187805, + "logps_avg/rejected": -3.464142322540283, + "loss": 0.0791, + "losses_ref": -0.0005291260313242674, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9120, + "u": -5.506434917449951, + "weight": 0.031941771507263184 + }, + { + "diff_generated": -34.728755950927734, + "epoch": 2.9585223590408294, + "grad_norm": 3.030304877955613, + "learning_rate": 4.657650069999963e-10, + "logits/chosen": -2.2826015949249268, + "logits/rejected": -2.050818920135498, + "logps/chosen": -13.238696098327637, + "logps/rejected": -654.2847900390625, + "logps_avg/chosen": -0.07198430597782135, + "logps_avg/rejected": -3.4728755950927734, + "loss": 0.0777, + "losses_ref": -9.715888882055879e-05, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9130, + "u": -5.3992228507995605, + "weight": 0.05008460208773613 + }, + { + "diff_generated": -37.10472869873047, + "epoch": 2.9617627997407645, + "grad_norm": 2.8495368006170585, + "learning_rate": 3.95843565522469e-10, + "logits/chosen": -2.2772858142852783, + "logits/rejected": -1.9869455099105835, + "logps/chosen": -12.714567184448242, + "logps/rejected": -666.079833984375, + "logps_avg/chosen": -0.07313890010118484, + "logps_avg/rejected": -3.710472583770752, + "loss": 0.0753, + "losses_ref": -0.0004528468125499785, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9140, + "u": -5.504135608673096, + "weight": 0.03178011626005173 + }, + { + "diff_generated": -32.66667175292969, + "epoch": 2.9650032404407, + "grad_norm": 2.6945972204400293, + "learning_rate": 3.3160320143097444e-10, + "logits/chosen": -2.309319496154785, + "logits/rejected": -2.0778040885925293, + "logps/chosen": -13.646771430969238, + "logps/rejected": -614.1995849609375, + "logps_avg/chosen": -0.07696790993213654, + "logps_avg/rejected": -3.266667127609253, + "loss": 0.0764, + "losses_ref": -0.001008645980618894, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9150, + "u": -5.1822099685668945, + "weight": 0.0887695699930191 + }, + { + "diff_generated": -37.78270721435547, + "epoch": 2.968243681140635, + "grad_norm": 3.783364664902114, + "learning_rate": 2.7304482801548957e-10, + "logits/chosen": -2.2790446281433105, + "logits/rejected": -1.9892492294311523, + "logps/chosen": -13.996490478515625, + "logps/rejected": -638.9437255859375, + "logps_avg/chosen": -0.0773763507604599, + "logps_avg/rejected": -3.778270721435547, + "loss": 0.0761, + "losses_ref": -0.0019606896676123142, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9160, + "u": -5.3608598709106445, + "weight": 0.059903018176555634 + }, + { + "diff_generated": -34.349769592285156, + "epoch": 2.9714841218405703, + "grad_norm": 2.763967615865681, + "learning_rate": 2.201692777865194e-10, + "logits/chosen": -2.25236177444458, + "logits/rejected": -2.01774263381958, + "logps/chosen": -12.498361587524414, + "logps/rejected": -638.0401000976562, + "logps_avg/chosen": -0.07391633093357086, + "logps_avg/rejected": -3.4349770545959473, + "loss": 0.0761, + "losses_ref": -0.0002926269080489874, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9170, + "u": -5.401417255401611, + "weight": 0.050305772572755814 + }, + { + "diff_generated": -37.91081237792969, + "epoch": 2.9747245625405054, + "grad_norm": 2.9257328592725975, + "learning_rate": 1.729773024631953e-10, + "logits/chosen": -2.2642340660095215, + "logits/rejected": -2.015472888946533, + "logps/chosen": -13.496482849121094, + "logps/rejected": -673.7174682617188, + "logps_avg/chosen": -0.0796927958726883, + "logps_avg/rejected": -3.7910804748535156, + "loss": 0.079, + "losses_ref": -0.000292365497443825, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9180, + "u": -5.440535545349121, + "weight": 0.04405521973967552 + }, + { + "diff_generated": -32.530948638916016, + "epoch": 2.9779650032404406, + "grad_norm": 2.713846199857598, + "learning_rate": 1.3146957296261696e-10, + "logits/chosen": -2.1936991214752197, + "logits/rejected": -2.0365915298461914, + "logps/chosen": -12.709261894226074, + "logps/rejected": -644.2265014648438, + "logps_avg/chosen": -0.07670806348323822, + "logps_avg/rejected": -3.2530949115753174, + "loss": 0.0778, + "losses_ref": -0.0005275515140965581, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9190, + "u": -5.187443256378174, + "weight": 0.08807355910539627 + }, + { + "diff_generated": -35.345008850097656, + "epoch": 2.981205443940376, + "grad_norm": 2.6283452311065227, + "learning_rate": 9.564667939030435e-11, + "logits/chosen": -2.2979166507720947, + "logits/rejected": -2.068493366241455, + "logps/chosen": -13.378866195678711, + "logps/rejected": -643.0728759765625, + "logps_avg/chosen": -0.07449330389499664, + "logps_avg/rejected": -3.53450083732605, + "loss": 0.0766, + "losses_ref": -0.0003306058351881802, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9200, + "u": -5.58246374130249, + "weight": 0.019121162593364716 + }, + { + "diff_generated": -37.17274856567383, + "epoch": 2.9844458846403112, + "grad_norm": 2.83432801334109, + "learning_rate": 6.550913103189337e-11, + "logits/chosen": -2.266289234161377, + "logits/rejected": -2.0171196460723877, + "logps/chosen": -12.02930736541748, + "logps/rejected": -709.2539672851562, + "logps_avg/chosen": -0.07492565363645554, + "logps_avg/rejected": -3.7172749042510986, + "loss": 0.0745, + "losses_ref": -0.0002527030010242015, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9210, + "u": -5.570784568786621, + "weight": 0.019006643444299698 + }, + { + "diff_generated": -36.656272888183594, + "epoch": 2.9876863253402464, + "grad_norm": 2.9330329822293932, + "learning_rate": 4.1057356345675085e-11, + "logits/chosen": -2.255496025085449, + "logits/rejected": -1.9425241947174072, + "logps/chosen": -15.756494522094727, + "logps/rejected": -619.2554931640625, + "logps_avg/chosen": -0.08364422619342804, + "logps_avg/rejected": -3.6656272411346436, + "loss": 0.0785, + "losses_ref": -0.0012684316607192159, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9220, + "u": -5.468225955963135, + "weight": 0.03942258656024933 + }, + { + "diff_generated": -35.7361946105957, + "epoch": 2.9909267660401815, + "grad_norm": 2.6544514182140717, + "learning_rate": 2.229170295673377e-11, + "logits/chosen": -2.2972233295440674, + "logits/rejected": -2.0817008018493652, + "logps/chosen": -13.582697868347168, + "logps/rejected": -631.3333740234375, + "logps_avg/chosen": -0.07444195449352264, + "logps_avg/rejected": -3.5736191272735596, + "loss": 0.0744, + "losses_ref": -9.415384556632489e-05, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9230, + "u": -5.581204891204834, + "weight": 0.018835904076695442 + }, + { + "diff_generated": -32.17867660522461, + "epoch": 2.9941672067401166, + "grad_norm": 2.7794692518725967, + "learning_rate": 9.212437651973103e-12, + "logits/chosen": -2.300394058227539, + "logits/rejected": -2.0665271282196045, + "logps/chosen": -14.166768074035645, + "logps/rejected": -594.6533813476562, + "logps_avg/chosen": -0.07981442660093307, + "logps_avg/rejected": -3.217867612838745, + "loss": 0.0756, + "losses_ref": -0.0003018935676664114, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9240, + "u": -5.294437408447266, + "weight": 0.06905417144298553 + }, + { + "diff_generated": -33.743309020996094, + "epoch": 2.9974076474400517, + "grad_norm": 2.7191689143451625, + "learning_rate": 1.819746376119369e-12, + "logits/chosen": -2.2722482681274414, + "logits/rejected": -2.033750057220459, + "logps/chosen": -15.440897941589355, + "logps/rejected": -607.9214477539062, + "logps_avg/chosen": -0.08211788535118103, + "logps_avg/rejected": -3.3743317127227783, + "loss": 0.0794, + "losses_ref": -0.0002549213822931051, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 9250, + "u": -5.401520729064941, + "weight": 0.05025525018572807 + } + ], + "logging_steps": 10, + "max_steps": 9258, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}